fix: html and rawHtml for pdfs

This commit is contained in:
rafaelsideguide 2024-08-22 15:15:45 -03:00
parent b1d61d8557
commit 7473b74021
7 changed files with 68 additions and 32 deletions

View File

@ -44,7 +44,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
}
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: true, removeTags: [] };
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try {

View File

@ -74,7 +74,15 @@ export async function scrapeHelper(
// Remove rawHtml if pageOptions.includeRawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
delete doc.rawHtml;
if (doc.rawHtml) {
delete doc.rawHtml;
}
}
if (!pageOptions.includeHtml) {
if (doc.html) {
delete doc.html;
}
}
return {

View File

@ -132,11 +132,11 @@ export async function searchController(req: Request, res: Response) {
}
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? {
includeHtml: true,
onlyMainContent: true,
fetchPageContent: true,
removeTags: [],
fallback: false,
includeHtml: req.body.pageOptions?.includeHtml ?? false,
onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
removeTags: req.body.pageOptions?.removeTags ?? [],
fallback: req.body.pageOptions?.fallback ?? false,
};
const origin = req.body.origin ?? "api";

View File

@ -4,7 +4,7 @@ export const defaultTimeout = 45000; // 45 seconds
export const defaultPageOptions = {
onlyMainContent: false,
includeHtml: true,
includeHtml: false,
waitFor: 0,
screenshot: false,
fullPageScreenshot: false,
@ -17,7 +17,7 @@ export const defaultCrawlerOptions = {
export const defaultCrawlPageOptions = {
onlyMainContent: false,
includeHtml: true,
includeHtml: false,
removeTags: [],
parsePDF: true
}

View File

@ -296,6 +296,12 @@ export class WebScraperDataProvider {
if (this.pageOptions.includeMarkdown) {
documents = this.applyPathReplacements(documents);
}
if (!this.pageOptions.includeHtml) {
for (let document of documents) {
delete document.html;
}
}
// documents = await this.applyImgAltText(documents);
if (
@ -572,12 +578,19 @@ export class WebScraperDataProvider {
this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? {
onlyMainContent: false,
includeHtml: true,
replaceAllPathsWithAbsolutePaths: false,
parsePDF: true,
removeTags: [],
this.pageOptions = {
onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
includeHtml: options.pageOptions?.includeHtml ?? false,
replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false,
parsePDF: options.pageOptions?.parsePDF ?? true,
removeTags: options.pageOptions?.removeTags ?? [],
includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
waitFor: options.pageOptions?.waitFor ?? undefined,
headers: options.pageOptions?.headers ?? undefined,
includeLinks: options.pageOptions?.includeLinks ?? true,
fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
screenshot: options.pageOptions?.screenshot ?? false,
};
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
this.replaceAllPathsWithAbsolutePaths =

View File

@ -122,23 +122,36 @@ function getScrapingFallbackOrder(
export async function scrapSingleUrl(
jobId: string,
urlToScrap: string,
pageOptions: PageOptions = {
includeMarkdown: true,
onlyMainContent: true,
includeHtml: true,
includeRawHtml: false,
waitFor: 0,
screenshot: false,
fullPageScreenshot: false,
headers: undefined,
includeLinks: true
},
extractorOptions: ExtractorOptions = {
mode: "llm-extraction-from-markdown",
},
existingHtml: string = "",
pageOptions: PageOptions,
extractorOptions?: ExtractorOptions,
existingHtml?: string,
priority?: number,
): Promise<Document> {
pageOptions = {
includeMarkdown: pageOptions.includeMarkdown ?? true,
onlyMainContent: pageOptions.onlyMainContent ?? false,
includeHtml: pageOptions.includeHtml ?? false,
includeRawHtml: pageOptions.includeRawHtml ?? false,
waitFor: pageOptions.waitFor ?? undefined,
screenshot: pageOptions.screenshot ?? false,
fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
headers: pageOptions.headers ?? undefined,
includeLinks: pageOptions.includeLinks ?? true,
replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? false,
parsePDF: pageOptions.parsePDF ?? true,
removeTags: pageOptions.removeTags ?? [],
}
if (extractorOptions) {
extractorOptions = {
mode: extractorOptions.mode ?? "llm-extraction-from-markdown",
}
}
if (!existingHtml) {
existingHtml = "";
}
urlToScrap = urlToScrap.trim();
const attemptScraping = async (

View File

@ -130,10 +130,12 @@ async function processJob(job: Job, token: string) {
const end = Date.now();
const timeTakenInSeconds = (end - start) / 1000;
const rawHtml = docs[0].rawHtml;
const rawHtml = docs[0] ? docs[0].rawHtml : "";
if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
delete docs[0].rawHtml;
if (docs[0] && docs[0].rawHtml) {
delete docs[0].rawHtml;
}
}
const data = {