fix: html and rawHtml for pdfs

2025-08-12 04:49:05 +08:00 · 2024-08-22 15:15:45 -03:00 · 2024-08-22 15:15:45 -03:00 · 7473b74021
commit 7473b74021
parent b1d61d8557
7 changed files with 68 additions and 32 deletions
--- a/apps/api/src/controllers/v0/crawlPreview.ts
+++ b/apps/api/src/controllers/v0/crawlPreview.ts
@ -44,7 +44,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
    }

    const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: true, removeTags: [] };
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };

    // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
    //   try {
--- a/apps/api/src/controllers/v0/scrape.ts
+++ b/apps/api/src/controllers/v0/scrape.ts
@ -74,7 +74,15 @@ export async function scrapeHelper(

  // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
  if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
-    delete doc.rawHtml;
+    if (doc.rawHtml) {
+      delete doc.rawHtml;
+    }
+  }
+
+  if (!pageOptions.includeHtml) {
+    if (doc.html) {
+      delete doc.html;
+    }
  }

  return {
--- a/apps/api/src/controllers/v0/search.ts
+++ b/apps/api/src/controllers/v0/search.ts
@ -132,11 +132,11 @@ export async function searchController(req: Request, res: Response) {
    }
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = req.body.pageOptions ?? {
-      includeHtml: true,
-      onlyMainContent: true,
-      fetchPageContent: true,
-      removeTags: [],
-      fallback: false,
+      includeHtml: req.body.pageOptions?.includeHtml ?? false,
+      onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
+      fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
+      removeTags: req.body.pageOptions?.removeTags ?? [],
+      fallback: req.body.pageOptions?.fallback ?? false,
    };
    const origin = req.body.origin ?? "api";

--- a/apps/api/src/lib/default-values.ts
+++ b/apps/api/src/lib/default-values.ts
@ -4,7 +4,7 @@ export const defaultTimeout = 45000; // 45 seconds

 export const defaultPageOptions = {
  onlyMainContent: false,
-  includeHtml: true,
+  includeHtml: false,
  waitFor: 0,
  screenshot: false,
  fullPageScreenshot: false,
@ -17,7 +17,7 @@ export const defaultCrawlerOptions = {

 export const defaultCrawlPageOptions = {
  onlyMainContent: false,
-  includeHtml: true,
+  includeHtml: false,
  removeTags: [],
  parsePDF: true
 }
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@ -296,6 +296,12 @@ export class WebScraperDataProvider {
    if (this.pageOptions.includeMarkdown) {
      documents = this.applyPathReplacements(documents);
    }
+
+    if (!this.pageOptions.includeHtml) {
+      for (let document of documents) {
+        delete document.html;
+      }
+    }
    
    // documents = await this.applyImgAltText(documents);
    if (
@ -572,12 +578,19 @@ export class WebScraperDataProvider {
    this.limit = options.crawlerOptions?.limit ?? 10000;
    this.generateImgAltText =
      options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? {
-      onlyMainContent: false,
-      includeHtml: true,
-      replaceAllPathsWithAbsolutePaths: false,
-      parsePDF: true,
-      removeTags: [],
+    this.pageOptions = {
+      onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
+      includeHtml: options.pageOptions?.includeHtml ?? false,
+      replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false,
+      parsePDF: options.pageOptions?.parsePDF ?? true,
+      removeTags: options.pageOptions?.removeTags ?? [],
+      includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
+      includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
+      waitFor: options.pageOptions?.waitFor ?? undefined,
+      headers: options.pageOptions?.headers ?? undefined,
+      includeLinks: options.pageOptions?.includeLinks ?? true,
+      fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
+      screenshot: options.pageOptions?.screenshot ?? false,
    };
    this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
    this.replaceAllPathsWithAbsolutePaths =
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@ -122,23 +122,36 @@ function getScrapingFallbackOrder(
 export async function scrapSingleUrl(
  jobId: string,
  urlToScrap: string,
-  pageOptions: PageOptions = {
-    includeMarkdown: true,
-    onlyMainContent: true,
-    includeHtml: true,
-    includeRawHtml: false,
-    waitFor: 0,
-    screenshot: false,
-    fullPageScreenshot: false,
-    headers: undefined,
-    includeLinks: true
-  },
-  extractorOptions: ExtractorOptions = {
-    mode: "llm-extraction-from-markdown",
-  },
-  existingHtml: string = "",
+  pageOptions: PageOptions,
+  extractorOptions?: ExtractorOptions,
+  existingHtml?: string,
  priority?: number,
 ): Promise<Document> {
+  pageOptions = {
+    includeMarkdown: pageOptions.includeMarkdown ?? true,
+    onlyMainContent: pageOptions.onlyMainContent ?? false,
+    includeHtml: pageOptions.includeHtml ?? false,
+    includeRawHtml: pageOptions.includeRawHtml ?? false,
+    waitFor: pageOptions.waitFor ?? undefined,
+    screenshot: pageOptions.screenshot ?? false,
+    fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
+    headers: pageOptions.headers ?? undefined,
+    includeLinks: pageOptions.includeLinks ?? true,
+    replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? false,
+    parsePDF: pageOptions.parsePDF ?? true,
+    removeTags: pageOptions.removeTags ?? [],
+  }
+
+  if (extractorOptions) {
+    extractorOptions = {
+      mode: extractorOptions.mode ?? "llm-extraction-from-markdown",
+    }
+  }
+
+  if (!existingHtml) {
+    existingHtml = "";
+  }
+
  urlToScrap = urlToScrap.trim();

  const attemptScraping = async (
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@ -130,10 +130,12 @@ async function processJob(job: Job, token: string) {
    const end = Date.now();
    const timeTakenInSeconds = (end - start) / 1000;

-    const rawHtml = docs[0].rawHtml;
+    const rawHtml = docs[0] ? docs[0].rawHtml : "";

    if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
-      delete docs[0].rawHtml;
+      if (docs[0] && docs[0].rawHtml) {
+        delete docs[0].rawHtml;
+      }
    }

    const data = {