Merge pull request #332 from mendableai/feat/rawHtmlExtraction

Adds pageOptions.includeRawHtml and a new extraction mode, "llm-extraction-from-raw-html", which runs LLM extraction over the raw page HTML instead of the markdown.
Nicolas 2024-07-01 18:23:26 -03:00 committed by GitHub
commit 42cd58a679
8 changed files with 126 additions and 17 deletions
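
In practice the two additions are exercised like this; a minimal sketch mirroring the e2e tests in this PR, where TEST_URL and TEST_API_KEY are placeholders for your API host and key:

// Sketch only: request the raw page HTML alongside content/markdown.
const response = await fetch(`${process.env.TEST_URL}/v0/scrape`, {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.TEST_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    url: "https://roastmywebsite.ai",
    pageOptions: { includeRawHtml: true }, // new flag from this PR
    // Or run LLM extraction over the raw HTML instead of the markdown:
    // extractorOptions: { mode: "llm-extraction-from-raw-html", extractionPrompt: "...", extractionSchema: { ... } },
  }),
});
const { data } = await response.json();
console.log(data.rawHtml); // populated when includeRawHtml is true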

View File

@@ -6801,7 +6801,7 @@ packages:
       handlebars: 4.7.8
       openai: 3.3.0
       sbd: 1.0.19
-      typescript: 5.4.5
+      typescript: 5.5.3
       uuid: 9.0.1
       zod: 3.23.8
     transitivePeerDependencies:
@@ -7767,6 +7767,12 @@ packages:
     engines: {node: '>=14.17'}
     hasBin: true

+  /typescript@5.5.3:
+    resolution: {integrity: sha512-/hreyEujaB0w76zKo6717l3L0o/qEUtRgdvUBvlkhoWeOVMjMuHNHk0BRBzikzuGDqNmPQbg5ifMEqsHLiIUcQ==}
+    engines: {node: '>=14.17'}
+    hasBin: true
+    dev: false
+
   /typesense@1.8.2(@babel/runtime@7.24.6):
     resolution: {integrity: sha512-aBpePjA99Qvo+OP2pJwMpvga4Jrm1Y2oV5NsrWXBxlqUDNEUCPZBIksPv2Hq0jxQxHhLLyJVbjXjByXsvpCDVA==}
     engines: {node: '>=18'}

View File

@@ -131,6 +131,28 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.metadata.pageStatusCode).toBe(200);
       expect(response.body.data.metadata.pageError).toBeUndefined();
     }, 30000); // 30 seconds timeout

+    it.concurrent("should return a successful response with a valid API key and includeRawHtml set to true", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://roastmywebsite.ai",
+          pageOptions: { includeRawHtml: true },
+        });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("rawHtml");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data.content).toContain("_Roast_");
+      expect(response.body.data.markdown).toContain("_Roast_");
+      expect(response.body.data.rawHtml).toContain("<h1");
+      expect(response.body.data.metadata.pageStatusCode).toBe(200);
+      expect(response.body.data.metadata.pageError).toBeUndefined();
+    }, 30000); // 30 seconds timeout
+
     it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
       const response = await request(TEST_URL)
@@ -1177,6 +1199,47 @@ describe("E2E Tests for API Routes", () => {
       expect(llmExtraction.is_open_source).toBe(false);
       expect(typeof llmExtraction.is_open_source).toBe("boolean");
     }, 60000); // 60 secs

+    it.concurrent("should extract data using LLM extraction mode with RawHtml", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+          extractorOptions: {
+            mode: "llm-extraction-from-raw-html",
+            extractionPrompt:
+              "Based on the information on the page, what are the primary and secondary CTA buttons?",
+            extractionSchema: {
+              type: "object",
+              properties: {
+                primary_cta: {
+                  type: "string",
+                },
+                secondary_cta: {
+                  type: "string",
+                },
+              },
+              required: ["primary_cta", "secondary_cta"],
+            },
+          },
+        });
+
+      // Ensure that the job was successfully created before proceeding with LLM extraction
+      expect(response.statusCode).toBe(200);
+
+      // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
+      let llmExtraction = response.body.data.llm_extraction;
+
+      // Check if the llm_extraction object has the required properties with correct types and values
+      expect(llmExtraction).toHaveProperty("primary_cta");
+      expect(typeof llmExtraction.primary_cta).toBe("string");
+      expect(llmExtraction).toHaveProperty("secondary_cta");
+      expect(typeof llmExtraction.secondary_cta).toBe("string");
+    }, 60000); // 60 secs
   });

 // describe("POST /v0/scrape for Top 100 Companies", () => {

View File

@@ -58,19 +58,27 @@ export async function scrapeHelper(
   }

   // make sure doc.content is not empty
-  const filteredDocs = docs.filter(
+  let filteredDocs = docs.filter(
     (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
   );

   if (filteredDocs.length === 0) {
     return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
   }

+  // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
+  if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
+    filteredDocs.forEach(doc => {
+      delete doc.rawHtml;
+    });
+  }
+
   let creditsToBeBilled = filteredDocs.length;
   const creditsPerLLMExtract = 50;

-  if (extractorOptions.mode === "llm-extraction") {
+  if (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "llm-extraction-from-raw-html" || extractorOptions.mode === "llm-extraction-from-markdown") {
     creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
   }
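
For reference, the billing rule this hunk encodes: one credit per returned document, plus 50 credits per document when any of the three LLM extraction modes is used. A standalone sketch (the helper name is illustrative, not part of the PR):

// Illustrative helper restating the billing arithmetic above.
function creditsForScrape(docCount: number, mode: string): number {
  const creditsPerLLMExtract = 50;
  const llmModes = [
    "llm-extraction",
    "llm-extraction-from-raw-html",
    "llm-extraction-from-markdown",
  ];
  let credits = docCount; // one credit per document
  if (llmModes.includes(mode)) {
    credits += creditsPerLLMExtract * docCount; // LLM extraction surcharge
  }
  return credits;
}

// e.g. creditsForScrape(1, "llm-extraction-from-raw-html") === 51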

View File

@@ -8,7 +8,8 @@ import { Document, ExtractorOptions } from "../entities";

 // Generate completion using OpenAI
 export async function generateCompletions(
   documents: Document[],
-  extractionOptions: ExtractorOptions
+  extractionOptions: ExtractorOptions,
+  mode: "markdown" | "raw-html"
 ): Promise<Document[]> {
   // const schema = zodToJsonSchema(options.schema)
@@ -28,6 +29,7 @@ export async function generateCompletions(
         document: document,
         schema: schema,
         prompt: prompt,
+        mode: mode,
       });
       // Validate the JSON output against the schema using AJV
       const validate = ajv.compile(schema);

View File

@@ -13,26 +13,37 @@ const defaultPrompt =
   "You are a professional web scraper. Extract the contents of the webpage";

 function prepareOpenAIDoc(
-  document: Document
+  document: Document,
+  mode: "markdown" | "raw-html"
 ): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
   let markdown = document.markdown;

-  // Check if the markdown content exists in the document
-  if (!markdown) {
+  let extractionTarget = document.markdown;
+
+  if (mode === "raw-html") {
+    extractionTarget = document.rawHtml;
+  }
+
+  // Check if the markdown content exists in the document
+  if (!extractionTarget) {
     throw new Error(
-      "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
+      `${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
     );
   }

   // count number of tokens
-  const numTokens = numTokensFromString(document.markdown, "gpt-4");
+  const numTokens = numTokensFromString(extractionTarget, "gpt-4");
   if (numTokens > maxTokens) {
     // trim the document to the maximum number of tokens, tokens != characters
-    markdown = markdown.slice(0, (maxTokens * modifier));
+    extractionTarget = extractionTarget.slice(0, (maxTokens * modifier));
   }

-  return [[{ type: "text", text: markdown }], numTokens];
+  return [[{ type: "text", text: extractionTarget }], numTokens];
 }

 export async function generateOpenAICompletions({
@@ -42,6 +53,7 @@ export async function generateOpenAICompletions({
   schema, //TODO - add zod dynamic type checking
   prompt = defaultPrompt,
   temperature,
+  mode
 }: {
   client: OpenAI;
   model?: string;
@@ -49,9 +61,10 @@ export async function generateOpenAICompletions({
   schema: any; // This should be replaced with a proper Zod schema type when available
   prompt?: string;
   temperature?: number;
+  mode: "markdown" | "raw-html";
 }): Promise<Document> {
   const openai = client as OpenAI;

-  const [content, numTokens] = prepareOpenAIDoc(document);
+  const [content, numTokens] = prepareOpenAIDoc(document, mode);

   const completion = await openai.chat.completions.create({
     model,

View File

@@ -13,6 +13,7 @@ export interface Progress {
 export type PageOptions = {
   onlyMainContent?: boolean;
   includeHtml?: boolean;
+  includeRawHtml?: boolean;
   fallback?: boolean;
   fetchPageContent?: boolean;
   waitFor?: number;
@@ -25,7 +26,7 @@ export type PageOptions = {
 };

 export type ExtractorOptions = {
-  mode: "markdown" | "llm-extraction";
+  mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
   extractionPrompt?: string;
   extractionSchema?: Record<string, any>;
 }
@@ -73,6 +74,7 @@ export class Document {
   content: string;
   markdown?: string;
   html?: string;
+  rawHtml?: string;
   llm_extraction?: Record<string, any>;
   createdAt?: Date;
   updatedAt?: Date;
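
A quick sketch of the widened types in use; the import path and all values are illustrative only:

import { ExtractorOptions, PageOptions } from "./lib/entities"; // path illustrative

const pageOptions: PageOptions = {
  includeRawHtml: true, // new optional flag
};

const extractorOptions: ExtractorOptions = {
  mode: "llm-extraction-from-raw-html", // one of the two new modes
  extractionPrompt: "What are the primary and secondary CTA buttons?",
  extractionSchema: {
    type: "object",
    properties: { primary_cta: { type: "string" } },
    required: ["primary_cta"],
  },
};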

View File

@@ -66,6 +66,7 @@ export class WebScraperDataProvider {
       const result = await scrapSingleUrl(
         url,
         this.pageOptions,
+        this.extractorOptions,
         existingHTML
       );
       processedUrls++;
@@ -269,10 +270,16 @@ export class WebScraperDataProvider {
     // documents = await this.applyImgAltText(documents);

     if (
-      this.extractorOptions.mode === "llm-extraction" &&
+      (this.extractorOptions.mode === "llm-extraction" || this.extractorOptions.mode === "llm-extraction-from-markdown") &&
       this.mode === "single_urls"
     ) {
-      documents = await generateCompletions(documents, this.extractorOptions);
+      documents = await generateCompletions(documents, this.extractorOptions, "markdown");
+    }
+    if (
+      (this.extractorOptions.mode === "llm-extraction-from-raw-html") &&
+      this.mode === "single_urls"
+    ) {
+      documents = await generateCompletions(documents, this.extractorOptions, "raw-html");
     }
     return documents.concat(pdfDocuments).concat(docxDocuments);
   }

View File

@@ -2,7 +2,7 @@ import * as cheerio from "cheerio";
 import { ScrapingBeeClient } from "scrapingbee";
 import { extractMetadata } from "./utils/metadata";
 import dotenv from "dotenv";
-import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
+import { Document, PageOptions, FireEngineResponse, ExtractorOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { urlSpecificParams } from "./utils/custom/website_params";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
@@ -348,10 +348,14 @@ export async function scrapSingleUrl(
   pageOptions: PageOptions = {
     onlyMainContent: true,
     includeHtml: false,
+    includeRawHtml: false,
     waitFor: 0,
     screenshot: false,
     headers: undefined,
   },
+  extractorOptions: ExtractorOptions = {
+    mode: "llm-extraction-from-markdown"
+  },
   existingHtml: string = ""
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
@@ -517,8 +521,10 @@ export async function scrapSingleUrl(
       if (attempt.pageStatusCode) {
         pageStatusCode = attempt.pageStatusCode;
       }
-      if (attempt.pageError) {
+      if (attempt.pageError && attempt.pageStatusCode != 200) {
         pageError = attempt.pageError;
+      } else {
+        pageError = undefined;
       }

       if (text && text.trim().length >= 100) break;
@@ -542,6 +548,7 @@ export async function scrapSingleUrl(
       content: text,
       markdown: text,
       html: pageOptions.includeHtml ? html : undefined,
+      rawHtml: pageOptions.includeRawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
       metadata: {
         ...metadata,
         screenshot: screenshot,
@@ -555,6 +562,7 @@ export async function scrapSingleUrl(
       content: text,
       markdown: text,
       html: pageOptions.includeHtml ? html : undefined,
+      rawHtml: pageOptions.includeRawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
       metadata: {
         ...metadata,
         sourceURL: urlToScrap,