mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 04:49:05 +08:00
fix: html and rawHtml for pdfs
This commit is contained in:
parent
b1d61d8557
commit
7473b74021
@ -44,7 +44,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
||||
}
|
||||
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: true, removeTags: [] };
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
|
||||
|
||||
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||
// try {
|
||||
|
@ -74,7 +74,15 @@ export async function scrapeHelper(
|
||||
|
||||
// Remove rawHtml if pageOptions.includeRawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
|
||||
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
|
||||
delete doc.rawHtml;
|
||||
if (doc.rawHtml) {
|
||||
delete doc.rawHtml;
|
||||
}
|
||||
}
|
||||
|
||||
if (!pageOptions.includeHtml) {
|
||||
if (doc.html) {
|
||||
delete doc.html;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
|
@ -132,11 +132,11 @@ export async function searchController(req: Request, res: Response) {
|
||||
}
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? {
|
||||
includeHtml: true,
|
||||
onlyMainContent: true,
|
||||
fetchPageContent: true,
|
||||
removeTags: [],
|
||||
fallback: false,
|
||||
includeHtml: req.body.pageOptions?.includeHtml ?? false,
|
||||
onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
|
||||
fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
|
||||
removeTags: req.body.pageOptions?.removeTags ?? [],
|
||||
fallback: req.body.pageOptions?.fallback ?? false,
|
||||
};
|
||||
const origin = req.body.origin ?? "api";
|
||||
|
||||
|
@ -4,7 +4,7 @@ export const defaultTimeout = 45000; // 45 seconds
|
||||
|
||||
export const defaultPageOptions = {
|
||||
onlyMainContent: false,
|
||||
includeHtml: true,
|
||||
includeHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
fullPageScreenshot: false,
|
||||
@ -17,7 +17,7 @@ export const defaultCrawlerOptions = {
|
||||
|
||||
export const defaultCrawlPageOptions = {
|
||||
onlyMainContent: false,
|
||||
includeHtml: true,
|
||||
includeHtml: false,
|
||||
removeTags: [],
|
||||
parsePDF: true
|
||||
}
|
||||
|
@ -296,6 +296,12 @@ export class WebScraperDataProvider {
|
||||
if (this.pageOptions.includeMarkdown) {
|
||||
documents = this.applyPathReplacements(documents);
|
||||
}
|
||||
|
||||
if (!this.pageOptions.includeHtml) {
|
||||
for (let document of documents) {
|
||||
delete document.html;
|
||||
}
|
||||
}
|
||||
|
||||
// documents = await this.applyImgAltText(documents);
|
||||
if (
|
||||
@ -572,12 +578,19 @@ export class WebScraperDataProvider {
|
||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||
this.generateImgAltText =
|
||||
options.crawlerOptions?.generateImgAltText ?? false;
|
||||
this.pageOptions = options.pageOptions ?? {
|
||||
onlyMainContent: false,
|
||||
includeHtml: true,
|
||||
replaceAllPathsWithAbsolutePaths: false,
|
||||
parsePDF: true,
|
||||
removeTags: [],
|
||||
this.pageOptions = {
|
||||
onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
|
||||
includeHtml: options.pageOptions?.includeHtml ?? false,
|
||||
replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false,
|
||||
parsePDF: options.pageOptions?.parsePDF ?? true,
|
||||
removeTags: options.pageOptions?.removeTags ?? [],
|
||||
includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
|
||||
includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
|
||||
waitFor: options.pageOptions?.waitFor ?? undefined,
|
||||
headers: options.pageOptions?.headers ?? undefined,
|
||||
includeLinks: options.pageOptions?.includeLinks ?? true,
|
||||
fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
|
||||
screenshot: options.pageOptions?.screenshot ?? false,
|
||||
};
|
||||
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
||||
this.replaceAllPathsWithAbsolutePaths =
|
||||
|
@ -122,23 +122,36 @@ function getScrapingFallbackOrder(
|
||||
export async function scrapSingleUrl(
|
||||
jobId: string,
|
||||
urlToScrap: string,
|
||||
pageOptions: PageOptions = {
|
||||
includeMarkdown: true,
|
||||
onlyMainContent: true,
|
||||
includeHtml: true,
|
||||
includeRawHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
fullPageScreenshot: false,
|
||||
headers: undefined,
|
||||
includeLinks: true
|
||||
},
|
||||
extractorOptions: ExtractorOptions = {
|
||||
mode: "llm-extraction-from-markdown",
|
||||
},
|
||||
existingHtml: string = "",
|
||||
pageOptions: PageOptions,
|
||||
extractorOptions?: ExtractorOptions,
|
||||
existingHtml?: string,
|
||||
priority?: number,
|
||||
): Promise<Document> {
|
||||
pageOptions = {
|
||||
includeMarkdown: pageOptions.includeMarkdown ?? true,
|
||||
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
||||
includeHtml: pageOptions.includeHtml ?? false,
|
||||
includeRawHtml: pageOptions.includeRawHtml ?? false,
|
||||
waitFor: pageOptions.waitFor ?? undefined,
|
||||
screenshot: pageOptions.screenshot ?? false,
|
||||
fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
|
||||
headers: pageOptions.headers ?? undefined,
|
||||
includeLinks: pageOptions.includeLinks ?? true,
|
||||
replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? false,
|
||||
parsePDF: pageOptions.parsePDF ?? true,
|
||||
removeTags: pageOptions.removeTags ?? [],
|
||||
}
|
||||
|
||||
if (extractorOptions) {
|
||||
extractorOptions = {
|
||||
mode: extractorOptions.mode ?? "llm-extraction-from-markdown",
|
||||
}
|
||||
}
|
||||
|
||||
if (!existingHtml) {
|
||||
existingHtml = "";
|
||||
}
|
||||
|
||||
urlToScrap = urlToScrap.trim();
|
||||
|
||||
const attemptScraping = async (
|
||||
|
@ -130,10 +130,12 @@ async function processJob(job: Job, token: string) {
|
||||
const end = Date.now();
|
||||
const timeTakenInSeconds = (end - start) / 1000;
|
||||
|
||||
const rawHtml = docs[0].rawHtml;
|
||||
const rawHtml = docs[0] ? docs[0].rawHtml : "";
|
||||
|
||||
if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
|
||||
delete docs[0].rawHtml;
|
||||
if (docs[0] && docs[0].rawHtml) {
|
||||
delete docs[0].rawHtml;
|
||||
}
|
||||
}
|
||||
|
||||
const data = {
|
||||
|
Loading…
x
Reference in New Issue
Block a user