mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-13 20:45:57 +08:00
Merge remote-tracking branch 'origin/v1-webscraper' into v1/python-sdk
This commit is contained in:
commit
b1d61d8557
@ -44,7 +44,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
|
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: true, removeTags: [] };
|
||||||
|
|
||||||
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||||
// try {
|
// try {
|
||||||
|
@ -132,7 +132,7 @@ export async function searchController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? {
|
const pageOptions = req.body.pageOptions ?? {
|
||||||
includeHtml: false,
|
includeHtml: true,
|
||||||
onlyMainContent: true,
|
onlyMainContent: true,
|
||||||
fetchPageContent: true,
|
fetchPageContent: true,
|
||||||
removeTags: [],
|
removeTags: [],
|
||||||
|
@ -4,7 +4,7 @@ export const defaultTimeout = 45000; // 45 seconds
|
|||||||
|
|
||||||
export const defaultPageOptions = {
|
export const defaultPageOptions = {
|
||||||
onlyMainContent: false,
|
onlyMainContent: false,
|
||||||
includeHtml: false,
|
includeHtml: true,
|
||||||
waitFor: 0,
|
waitFor: 0,
|
||||||
screenshot: false,
|
screenshot: false,
|
||||||
fullPageScreenshot: false,
|
fullPageScreenshot: false,
|
||||||
@ -17,7 +17,7 @@ export const defaultCrawlerOptions = {
|
|||||||
|
|
||||||
export const defaultCrawlPageOptions = {
|
export const defaultCrawlPageOptions = {
|
||||||
onlyMainContent: false,
|
onlyMainContent: false,
|
||||||
includeHtml: false,
|
includeHtml: true,
|
||||||
removeTags: [],
|
removeTags: [],
|
||||||
parsePDF: true
|
parsePDF: true
|
||||||
}
|
}
|
||||||
|
@ -45,6 +45,9 @@ export async function startWebScraperPipeline({
|
|||||||
},
|
},
|
||||||
onSuccess: (result, mode) => {
|
onSuccess: (result, mode) => {
|
||||||
Logger.debug(`🐂 Job completed ${job.id}`);
|
Logger.debug(`🐂 Job completed ${job.id}`);
|
||||||
|
if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
|
||||||
|
delete result[0].rawHtml;
|
||||||
|
}
|
||||||
saveJob(job, result, token, mode);
|
saveJob(job, result, token, mode);
|
||||||
},
|
},
|
||||||
onError: (error) => {
|
onError: (error) => {
|
||||||
|
@ -574,7 +574,7 @@ export class WebScraperDataProvider {
|
|||||||
options.crawlerOptions?.generateImgAltText ?? false;
|
options.crawlerOptions?.generateImgAltText ?? false;
|
||||||
this.pageOptions = options.pageOptions ?? {
|
this.pageOptions = options.pageOptions ?? {
|
||||||
onlyMainContent: false,
|
onlyMainContent: false,
|
||||||
includeHtml: false,
|
includeHtml: true,
|
||||||
replaceAllPathsWithAbsolutePaths: false,
|
replaceAllPathsWithAbsolutePaths: false,
|
||||||
parsePDF: true,
|
parsePDF: true,
|
||||||
removeTags: [],
|
removeTags: [],
|
||||||
|
@ -125,7 +125,7 @@ export async function scrapSingleUrl(
|
|||||||
pageOptions: PageOptions = {
|
pageOptions: PageOptions = {
|
||||||
includeMarkdown: true,
|
includeMarkdown: true,
|
||||||
onlyMainContent: true,
|
onlyMainContent: true,
|
||||||
includeHtml: false,
|
includeHtml: true,
|
||||||
includeRawHtml: false,
|
includeRawHtml: false,
|
||||||
waitFor: 0,
|
waitFor: 0,
|
||||||
screenshot: false,
|
screenshot: false,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user