From 5818236659a9c3ceef45488e7f439897cb2d8383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 20 Aug 2024 22:51:12 +0200 Subject: [PATCH 1/2] fix: remove rawHtml properly --- apps/api/src/main/runWebScraper.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 84826bdd..b28d039e 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -45,6 +45,9 @@ export async function startWebScraperPipeline({ }, onSuccess: (result, mode) => { Logger.debug(`🐂 Job completed ${job.id}`); + if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) { + delete result[0].rawHtml; + } saveJob(job, result, token, mode); }, onError: (error) => { From fe2e8c0b7a7bc5713e05998eda40363f6e832d43 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 21 Aug 2024 15:54:00 -0300 Subject: [PATCH 2/2] includehtml fix --- apps/api/src/controllers/v0/crawlPreview.ts | 2 +- apps/api/src/controllers/v0/search.ts | 2 +- apps/api/src/lib/default-values.ts | 4 ++-- apps/api/src/scraper/WebScraper/index.ts | 2 +- apps/api/src/scraper/WebScraper/single_url.ts | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index 356da835..21a4a930 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -44,7 +44,7 @@ export async function crawlPreviewController(req: Request, res: Response) { } const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: true, removeTags: [] }; // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? // try { diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index 948e883d..34d415a5 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -132,7 +132,7 @@ export async function searchController(req: Request, res: Response) { } const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { - includeHtml: false, + includeHtml: true, onlyMainContent: true, fetchPageContent: true, removeTags: [], diff --git a/apps/api/src/lib/default-values.ts b/apps/api/src/lib/default-values.ts index 152f47d7..cdf4605d 100644 --- a/apps/api/src/lib/default-values.ts +++ b/apps/api/src/lib/default-values.ts @@ -4,7 +4,7 @@ export const defaultTimeout = 45000; // 45 seconds export const defaultPageOptions = { onlyMainContent: false, - includeHtml: false, + includeHtml: true, waitFor: 0, screenshot: false, fullPageScreenshot: false, @@ -17,7 +17,7 @@ export const defaultCrawlerOptions = { export const defaultCrawlPageOptions = { onlyMainContent: false, - includeHtml: false, + includeHtml: true, removeTags: [], parsePDF: true } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 45a36bb1..04b861b1 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -574,7 +574,7 @@ export class WebScraperDataProvider { options.crawlerOptions?.generateImgAltText ?? false; this.pageOptions = options.pageOptions ?? { onlyMainContent: false, - includeHtml: false, + includeHtml: true, replaceAllPathsWithAbsolutePaths: false, parsePDF: true, removeTags: [], diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 408f9838..9f8419b6 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -125,7 +125,7 @@ export async function scrapSingleUrl( pageOptions: PageOptions = { includeMarkdown: true, onlyMainContent: true, - includeHtml: false, + includeHtml: true, includeRawHtml: false, waitFor: 0, screenshot: false,