diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml
index 32a92745..ff485f28 100644
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@@ -6801,7 +6801,7 @@ packages:
       handlebars: 4.7.8
       openai: 3.3.0
       sbd: 1.0.19
-      typescript: 5.4.5
+      typescript: 5.5.3
       uuid: 9.0.1
       zod: 3.23.8
     transitivePeerDependencies:
@@ -7767,6 +7767,12 @@ packages:
     engines: {node: '>=14.17'}
     hasBin: true
 
+  /typescript@5.5.3:
+    resolution: {integrity: sha512-/hreyEujaB0w76zKo6717l3L0o/qEUtRgdvUBvlkhoWeOVMjMuHNHk0BRBzikzuGDqNmPQbg5ifMEqsHLiIUcQ==}
+    engines: {node: '>=14.17'}
+    hasBin: true
+    dev: false
+
   /typesense@1.8.2(@babel/runtime@7.24.6):
     resolution: {integrity: sha512-aBpePjA99Qvo+OP2pJwMpvga4Jrm1Y2oV5NsrWXBxlqUDNEUCPZBIksPv2Hq0jxQxHhLLyJVbjXjByXsvpCDVA==}
     engines: {node: '>=18'}
diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
index c8281edd..af7fe4a3 100644
--- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
@@ -131,6 +131,28 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.metadata.pageStatusCode).toBe(200);
       expect(response.body.data.metadata.pageError).toBeUndefined();
     }, 30000); // 30 seconds timeout
+
+    it.concurrent("should return a successful response with a valid API key and includeRawHtml set to true", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://roastmywebsite.ai",
+          pageOptions: { includeRawHtml: true },
+        });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("rawHtml");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data.content).toContain("_Roast_");
+      expect(response.body.data.markdown).toContain("_Roast_");
+      expect(response.body.data.rawHtml).toContain("<h1");
+      expect(response.body.data.metadata.pageStatusCode).toBe(200);
+      expect(response.body.data.metadata.pageError).toBeUndefined();
+    }, 30000); // 30 seconds timeout
 
     it.concurrent("should return a successful response with a valid API key", async () => {
       const response = await request(TEST_URL)
@@ -1177,6 +1199,47 @@ describe("E2E Tests for API Routes", () => {
       expect(llmExtraction.is_open_source).toBe(false);
       expect(typeof llmExtraction.is_open_source).toBe("boolean");
     }, 60000); // 60 secs
+
+    it.concurrent("should extract data using LLM extraction mode with RawHtml", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+
+          extractorOptions: {
+            mode: "llm-extraction-from-raw-html",
+            extractionPrompt:
+              "Based on the information on the page, what are the primary and secondary CTA buttons?",
+            extractionSchema: {
+              type: "object",
+              properties: {
+                primary_cta: {
+                  type: "string",
+                },
+                secondary_cta: {
+                  type: "string",
+                },
+              },
+              required: ["primary_cta", "secondary_cta"],
+            },
+          },
+        });
+
+      // Ensure that the job was successfully created before proceeding with LLM extraction
+      expect(response.statusCode).toBe(200);
+
+      // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
+      let llmExtraction = response.body.data.llm_extraction;
+
+      // Check if the llm_extraction object has the required properties with correct types and values
+      expect(llmExtraction).toHaveProperty("primary_cta");
+      expect(typeof llmExtraction.primary_cta).toBe("string");
+      expect(llmExtraction).toHaveProperty("secondary_cta");
+      expect(typeof llmExtraction.secondary_cta).toBe("string");
+
+    }, 60000); // 60 secs
   });
 
   // describe("POST /v0/scrape for Top 100 Companies", () => {
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index d394efe8..f5e2c322 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -58,19 +58,27 @@ export async function scrapeHelper(
   }
 
   // make sure doc.content is not empty
-  const filteredDocs = docs.filter(
+  let filteredDocs = docs.filter(
     (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
   );
 
   if (filteredDocs.length === 0) {
     return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
   }
+
+  // Remove rawHtml if pageOptions.includeRawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
+  if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
+    filteredDocs.forEach(doc => {
+      delete doc.rawHtml;
+    });
+  }
+
   let creditsToBeBilled = filteredDocs.length;
   const creditsPerLLMExtract = 50;
 
-  if (extractorOptions.mode === "llm-extraction") {
+  if (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "llm-extraction-from-raw-html" || extractorOptions.mode === "llm-extraction-from-markdown") {
     creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
   }
diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
index 6614dbdf..2156fb3c 100644
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -8,7 +8,8 @@ import { Document, ExtractorOptions } from "../entities";
 // Generate completion using OpenAI
 export async function generateCompletions(
   documents: Document[],
-  extractionOptions: ExtractorOptions
+  extractionOptions: ExtractorOptions,
+  mode: "markdown" | "raw-html"
 ): Promise<Document[]> {
   // const schema = zodToJsonSchema(options.schema)
 
@@ -28,6 +29,7 @@ export async function generateCompletions(
         document: document,
         schema: schema,
         prompt: prompt,
+        mode: mode,
       });
       // Validate the JSON output against the schema using AJV
       const validate = ajv.compile(schema);
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
index 1434e35e..8de8ee4b 100644
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -13,26 +13,37 @@ const defaultPrompt =
   "You are a professional web scraper. Extract the contents of the webpage";
 
 function prepareOpenAIDoc(
-  document: Document
+  document: Document,
+  mode: "markdown" | "raw-html"
 ): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
+
   let markdown = document.markdown;
-// Check if the markdown content exists in the document
-  if (!markdown) {
+  let extractionTarget = document.markdown;
+
+  if (mode === "raw-html") {
+    extractionTarget = document.rawHtml;
+  }
+
+  // Check if the content to extract from exists in the document
+  if (!extractionTarget) {
     throw new Error(
-      "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
+      `${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
     );
   }
+
+
+
+
   // count number of tokens
-  const numTokens = numTokensFromString(document.markdown, "gpt-4");
+  const numTokens = numTokensFromString(extractionTarget, "gpt-4");
 
   if (numTokens > maxTokens) {
     // trim the document to the maximum number of tokens, tokens != characters
-    markdown = markdown.slice(0, (maxTokens * modifier));
+    extractionTarget = extractionTarget.slice(0, (maxTokens * modifier));
   }
 
-  return [[{ type: "text", text: markdown }], numTokens];
+  return [[{ type: "text", text: extractionTarget }], numTokens];
 }
 
 export async function generateOpenAICompletions({
@@ -42,6 +53,7 @@ export async function generateOpenAICompletions({
   schema, //TODO - add zod dynamic type checking
   prompt = defaultPrompt,
   temperature,
+  mode
 }: {
   client: OpenAI;
   model?: string;
@@ -49,9 +61,10 @@
   schema: any; // This should be replaced with a proper Zod schema type when available
   prompt?: string;
   temperature?: number;
+  mode: "markdown" | "raw-html";
 }): Promise<Document> {
   const openai = client as OpenAI;
-  const [content, numTokens] = prepareOpenAIDoc(document);
+  const [content, numTokens] = prepareOpenAIDoc(document, mode);
 
   const completion = await openai.chat.completions.create({
     model,
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 2f43b9a4..d2b3b002 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -13,6 +13,7 @@ export interface Progress {
 export type PageOptions = {
   onlyMainContent?: boolean;
   includeHtml?: boolean;
+  includeRawHtml?: boolean;
   fallback?: boolean;
   fetchPageContent?: boolean;
   waitFor?: number;
@@ -25,7 +26,7 @@
 };
 
 export type ExtractorOptions = {
-  mode: "markdown" | "llm-extraction";
+  mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
   extractionPrompt?: string;
   extractionSchema?: Record<string, any>;
 }
@@ -73,6 +74,7 @@ export class Document {
   content: string;
   markdown?: string;
   html?: string;
+  rawHtml?: string;
   llm_extraction?: Record<string, any>;
   createdAt?: Date;
   updatedAt?: Date;
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 9e318505..3badfa19 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -66,6 +66,7 @@
       const result = await scrapSingleUrl(
         url,
         this.pageOptions,
+        this.extractorOptions,
         existingHTML
       );
       processedUrls++;
@@ -269,10 +270,16 @@
     // documents = await this.applyImgAltText(documents);
 
     if (
-      this.extractorOptions.mode === "llm-extraction" &&
+      (this.extractorOptions.mode === "llm-extraction" || this.extractorOptions.mode === "llm-extraction-from-markdown") &&
       this.mode === "single_urls"
     ) {
-      documents = await generateCompletions(documents, this.extractorOptions);
+      documents = await generateCompletions(documents, this.extractorOptions, "markdown");
+    }
+    if (
+      (this.extractorOptions.mode === "llm-extraction-from-raw-html") &&
+      this.mode === "single_urls"
+    ) {
+      documents = await generateCompletions(documents, this.extractorOptions, "raw-html");
     }
     return documents.concat(pdfDocuments).concat(docxDocuments);
   }
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 2d66315c..c4496ce0 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -2,7 +2,7 @@
 import * as cheerio from "cheerio";
 import { ScrapingBeeClient } from "scrapingbee";
 import { extractMetadata } from "./utils/metadata";
 import dotenv from "dotenv";
-import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
+import { Document, PageOptions, FireEngineResponse, ExtractorOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { urlSpecificParams } from "./utils/custom/website_params";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
@@ -348,10 +348,14 @@
   pageOptions: PageOptions = {
     onlyMainContent: true,
     includeHtml: false,
+    includeRawHtml: false,
     waitFor: 0,
     screenshot: false,
     headers: undefined,
   },
+  extractorOptions: ExtractorOptions = {
+    mode: "llm-extraction-from-markdown"
+  },
   existingHtml: string = ""
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
@@ -517,8 +521,10 @@
       if (attempt.pageStatusCode) {
         pageStatusCode = attempt.pageStatusCode;
       }
-      if (attempt.pageError) {
+      if (attempt.pageError && attempt.pageStatusCode != 200) {
         pageError = attempt.pageError;
+      } else {
+        pageError = undefined;
       }
 
       if (text && text.trim().length >= 100) break;
@@ -542,6 +548,7 @@
       content: text,
       markdown: text,
       html: pageOptions.includeHtml ? html : undefined,
+      rawHtml: pageOptions.includeRawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
       metadata: {
         ...metadata,
         screenshot: screenshot,
@@ -555,6 +562,7 @@
       content: text,
       markdown: text,
       html: pageOptions.includeHtml ? html : undefined,
+      rawHtml: pageOptions.includeRawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
       metadata: {
         ...metadata,
         sourceURL: urlToScrap,
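
For reviewers, a minimal sketch of how a client could exercise the new mode end to end. The `/v0/scrape` endpoint, the `pageOptions.includeRawHtml` flag, the `extractorOptions` payload shape, and the `data.llm_extraction` response field all come from the diff above; the `FIRECRAWL_API_URL`/`FIRECRAWL_API_KEY` environment variables and the function name are illustrative assumptions, not part of this PR.

```ts
// Hypothetical client call exercising llm-extraction-from-raw-html (Node 18+, global fetch).
async function extractCtasFromRawHtml(): Promise<void> {
  const response = await fetch(`${process.env.FIRECRAWL_API_URL}/v0/scrape`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      url: "https://mendable.ai",
      // Without includeRawHtml, scrapeHelper deletes rawHtml from the
      // response after the raw-html extraction pass runs (see scrape.ts).
      pageOptions: { includeRawHtml: true },
      extractorOptions: {
        mode: "llm-extraction-from-raw-html",
        extractionPrompt:
          "Based on the information on the page, what are the primary and secondary CTA buttons?",
        extractionSchema: {
          type: "object",
          properties: {
            primary_cta: { type: "string" },
            secondary_cta: { type: "string" },
          },
          required: ["primary_cta", "secondary_cta"],
        },
      },
    }),
  });

  const { data } = await response.json();
  // Structured output, validated against the schema by generateCompletions.
  console.log(data.llm_extraction); // e.g. { primary_cta: "...", secondary_cta: "..." }
}
```

Note that, per the billing change in `scrape.ts`, all three llm-extraction modes are billed at `creditsPerLLMExtract` (50) credits per returned document on top of the base scrape.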