mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 12:39:05 +08:00
fix: html and rawHtml for pdfs
This commit is contained in:
parent
b1d61d8557
commit
7473b74021
@ -44,7 +44,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: true, removeTags: [] };
|
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
|
||||||
|
|
||||||
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||||
// try {
|
// try {
|
||||||
|
@ -74,7 +74,15 @@ export async function scrapeHelper(
|
|||||||
|
|
||||||
// Remove rawHtml if pageOptions.includeRawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
|
// Remove rawHtml if pageOptions.includeRawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
|
||||||
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
|
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
|
||||||
delete doc.rawHtml;
|
if (doc.rawHtml) {
|
||||||
|
delete doc.rawHtml;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!pageOptions.includeHtml) {
|
||||||
|
if (doc.html) {
|
||||||
|
delete doc.html;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
@ -132,11 +132,11 @@ export async function searchController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? {
|
const pageOptions = req.body.pageOptions ?? {
|
||||||
includeHtml: true,
|
includeHtml: req.body.pageOptions?.includeHtml ?? false,
|
||||||
onlyMainContent: true,
|
onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
|
||||||
fetchPageContent: true,
|
fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
|
||||||
removeTags: [],
|
removeTags: req.body.pageOptions?.removeTags ?? [],
|
||||||
fallback: false,
|
fallback: req.body.pageOptions?.fallback ?? false,
|
||||||
};
|
};
|
||||||
const origin = req.body.origin ?? "api";
|
const origin = req.body.origin ?? "api";
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@ export const defaultTimeout = 45000; // 45 seconds
|
|||||||
|
|
||||||
export const defaultPageOptions = {
|
export const defaultPageOptions = {
|
||||||
onlyMainContent: false,
|
onlyMainContent: false,
|
||||||
includeHtml: true,
|
includeHtml: false,
|
||||||
waitFor: 0,
|
waitFor: 0,
|
||||||
screenshot: false,
|
screenshot: false,
|
||||||
fullPageScreenshot: false,
|
fullPageScreenshot: false,
|
||||||
@ -17,7 +17,7 @@ export const defaultCrawlerOptions = {
|
|||||||
|
|
||||||
export const defaultCrawlPageOptions = {
|
export const defaultCrawlPageOptions = {
|
||||||
onlyMainContent: false,
|
onlyMainContent: false,
|
||||||
includeHtml: true,
|
includeHtml: false,
|
||||||
removeTags: [],
|
removeTags: [],
|
||||||
parsePDF: true
|
parsePDF: true
|
||||||
}
|
}
|
||||||
|
@ -296,6 +296,12 @@ export class WebScraperDataProvider {
|
|||||||
if (this.pageOptions.includeMarkdown) {
|
if (this.pageOptions.includeMarkdown) {
|
||||||
documents = this.applyPathReplacements(documents);
|
documents = this.applyPathReplacements(documents);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!this.pageOptions.includeHtml) {
|
||||||
|
for (let document of documents) {
|
||||||
|
delete document.html;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// documents = await this.applyImgAltText(documents);
|
// documents = await this.applyImgAltText(documents);
|
||||||
if (
|
if (
|
||||||
@ -572,12 +578,19 @@ export class WebScraperDataProvider {
|
|||||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||||
this.generateImgAltText =
|
this.generateImgAltText =
|
||||||
options.crawlerOptions?.generateImgAltText ?? false;
|
options.crawlerOptions?.generateImgAltText ?? false;
|
||||||
this.pageOptions = options.pageOptions ?? {
|
this.pageOptions = {
|
||||||
onlyMainContent: false,
|
onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
|
||||||
includeHtml: true,
|
includeHtml: options.pageOptions?.includeHtml ?? false,
|
||||||
replaceAllPathsWithAbsolutePaths: false,
|
replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false,
|
||||||
parsePDF: true,
|
parsePDF: options.pageOptions?.parsePDF ?? true,
|
||||||
removeTags: [],
|
removeTags: options.pageOptions?.removeTags ?? [],
|
||||||
|
includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
|
||||||
|
includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
|
||||||
|
waitFor: options.pageOptions?.waitFor ?? undefined,
|
||||||
|
headers: options.pageOptions?.headers ?? undefined,
|
||||||
|
includeLinks: options.pageOptions?.includeLinks ?? true,
|
||||||
|
fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
|
||||||
|
screenshot: options.pageOptions?.screenshot ?? false,
|
||||||
};
|
};
|
||||||
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
||||||
this.replaceAllPathsWithAbsolutePaths =
|
this.replaceAllPathsWithAbsolutePaths =
|
||||||
|
@ -122,23 +122,36 @@ function getScrapingFallbackOrder(
|
|||||||
export async function scrapSingleUrl(
|
export async function scrapSingleUrl(
|
||||||
jobId: string,
|
jobId: string,
|
||||||
urlToScrap: string,
|
urlToScrap: string,
|
||||||
pageOptions: PageOptions = {
|
pageOptions: PageOptions,
|
||||||
includeMarkdown: true,
|
extractorOptions?: ExtractorOptions,
|
||||||
onlyMainContent: true,
|
existingHtml?: string,
|
||||||
includeHtml: true,
|
|
||||||
includeRawHtml: false,
|
|
||||||
waitFor: 0,
|
|
||||||
screenshot: false,
|
|
||||||
fullPageScreenshot: false,
|
|
||||||
headers: undefined,
|
|
||||||
includeLinks: true
|
|
||||||
},
|
|
||||||
extractorOptions: ExtractorOptions = {
|
|
||||||
mode: "llm-extraction-from-markdown",
|
|
||||||
},
|
|
||||||
existingHtml: string = "",
|
|
||||||
priority?: number,
|
priority?: number,
|
||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
|
pageOptions = {
|
||||||
|
includeMarkdown: pageOptions.includeMarkdown ?? true,
|
||||||
|
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
||||||
|
includeHtml: pageOptions.includeHtml ?? false,
|
||||||
|
includeRawHtml: pageOptions.includeRawHtml ?? false,
|
||||||
|
waitFor: pageOptions.waitFor ?? undefined,
|
||||||
|
screenshot: pageOptions.screenshot ?? false,
|
||||||
|
fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
|
||||||
|
headers: pageOptions.headers ?? undefined,
|
||||||
|
includeLinks: pageOptions.includeLinks ?? true,
|
||||||
|
replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? false,
|
||||||
|
parsePDF: pageOptions.parsePDF ?? true,
|
||||||
|
removeTags: pageOptions.removeTags ?? [],
|
||||||
|
}
|
||||||
|
|
||||||
|
if (extractorOptions) {
|
||||||
|
extractorOptions = {
|
||||||
|
mode: extractorOptions.mode ?? "llm-extraction-from-markdown",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!existingHtml) {
|
||||||
|
existingHtml = "";
|
||||||
|
}
|
||||||
|
|
||||||
urlToScrap = urlToScrap.trim();
|
urlToScrap = urlToScrap.trim();
|
||||||
|
|
||||||
const attemptScraping = async (
|
const attemptScraping = async (
|
||||||
|
@ -130,10 +130,12 @@ async function processJob(job: Job, token: string) {
|
|||||||
const end = Date.now();
|
const end = Date.now();
|
||||||
const timeTakenInSeconds = (end - start) / 1000;
|
const timeTakenInSeconds = (end - start) / 1000;
|
||||||
|
|
||||||
const rawHtml = docs[0].rawHtml;
|
const rawHtml = docs[0] ? docs[0].rawHtml : "";
|
||||||
|
|
||||||
if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
|
if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
|
||||||
delete docs[0].rawHtml;
|
if (docs[0] && docs[0].rawHtml) {
|
||||||
|
delete docs[0].rawHtml;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const data = {
|
const data = {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user