diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index 21a4a930..356da835 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -44,7 +44,7 @@ export async function crawlPreviewController(req: Request, res: Response) { } const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: true, removeTags: [] }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] }; // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? // try { diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 4e1b696d..20d29f26 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -74,7 +74,15 @@ export async function scrapeHelper( // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") { - delete doc.rawHtml; + if (doc.rawHtml) { + delete doc.rawHtml; + } + } + + if (!pageOptions.includeHtml) { + if (doc.html) { + delete doc.html; + } } return { diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index 34d415a5..79f6d74a 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -132,11 +132,11 @@ export async function searchController(req: Request, res: Response) { } const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { - includeHtml: true, - onlyMainContent: true, - fetchPageContent: true, - removeTags: [], - fallback: false, + includeHtml: req.body.pageOptions?.includeHtml ?? false, + onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false, + fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true, + removeTags: req.body.pageOptions?.removeTags ?? [], + fallback: req.body.pageOptions?.fallback ?? false, }; const origin = req.body.origin ?? "api"; diff --git a/apps/api/src/lib/default-values.ts b/apps/api/src/lib/default-values.ts index cdf4605d..152f47d7 100644 --- a/apps/api/src/lib/default-values.ts +++ b/apps/api/src/lib/default-values.ts @@ -4,7 +4,7 @@ export const defaultTimeout = 45000; // 45 seconds export const defaultPageOptions = { onlyMainContent: false, - includeHtml: true, + includeHtml: false, waitFor: 0, screenshot: false, fullPageScreenshot: false, @@ -17,7 +17,7 @@ export const defaultCrawlerOptions = { export const defaultCrawlPageOptions = { onlyMainContent: false, - includeHtml: true, + includeHtml: false, removeTags: [], parsePDF: true } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 04b861b1..f56f378e 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -296,6 +296,12 @@ export class WebScraperDataProvider { if (this.pageOptions.includeMarkdown) { documents = this.applyPathReplacements(documents); } + + if (!this.pageOptions.includeHtml) { + for (let document of documents) { + delete document.html; + } + } // documents = await this.applyImgAltText(documents); if ( @@ -572,12 +578,19 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { - onlyMainContent: false, - includeHtml: true, - replaceAllPathsWithAbsolutePaths: false, - parsePDF: true, - removeTags: [], + this.pageOptions = { + onlyMainContent: options.pageOptions?.onlyMainContent ?? false, + includeHtml: options.pageOptions?.includeHtml ?? false, + replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false, + parsePDF: options.pageOptions?.parsePDF ?? true, + removeTags: options.pageOptions?.removeTags ?? [], + includeMarkdown: options.pageOptions?.includeMarkdown ?? true, + includeRawHtml: options.pageOptions?.includeRawHtml ?? false, + waitFor: options.pageOptions?.waitFor ?? undefined, + headers: options.pageOptions?.headers ?? undefined, + includeLinks: options.pageOptions?.includeLinks ?? true, + fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false, + screenshot: options.pageOptions?.screenshot ?? false, }; this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; this.replaceAllPathsWithAbsolutePaths = diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 9f8419b6..58e0185e 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -122,23 +122,36 @@ function getScrapingFallbackOrder( export async function scrapSingleUrl( jobId: string, urlToScrap: string, - pageOptions: PageOptions = { - includeMarkdown: true, - onlyMainContent: true, - includeHtml: true, - includeRawHtml: false, - waitFor: 0, - screenshot: false, - fullPageScreenshot: false, - headers: undefined, - includeLinks: true - }, - extractorOptions: ExtractorOptions = { - mode: "llm-extraction-from-markdown", - }, - existingHtml: string = "", + pageOptions: PageOptions, + extractorOptions?: ExtractorOptions, + existingHtml?: string, priority?: number, ): Promise { + pageOptions = { + includeMarkdown: pageOptions.includeMarkdown ?? true, + onlyMainContent: pageOptions.onlyMainContent ?? false, + includeHtml: pageOptions.includeHtml ?? false, + includeRawHtml: pageOptions.includeRawHtml ?? false, + waitFor: pageOptions.waitFor ?? undefined, + screenshot: pageOptions.screenshot ?? false, + fullPageScreenshot: pageOptions.fullPageScreenshot ?? false, + headers: pageOptions.headers ?? undefined, + includeLinks: pageOptions.includeLinks ?? true, + replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? false, + parsePDF: pageOptions.parsePDF ?? true, + removeTags: pageOptions.removeTags ?? [], + } + + if (extractorOptions) { + extractorOptions = { + mode: extractorOptions.mode ?? "llm-extraction-from-markdown", + } + } + + if (!existingHtml) { + existingHtml = ""; + } + urlToScrap = urlToScrap.trim(); const attemptScraping = async ( diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index afd80f42..80d53954 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -130,10 +130,12 @@ async function processJob(job: Job, token: string) { const end = Date.now(); const timeTakenInSeconds = (end - start) / 1000; - const rawHtml = docs[0].rawHtml; + const rawHtml = docs[0] ? docs[0].rawHtml : ""; if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) { - delete docs[0].rawHtml; + if (docs[0] && docs[0].rawHtml) { + delete docs[0].rawHtml; + } } const data = {