Merge remote-tracking branch 'origin/v1-webscraper' into v1/python-sdk

This commit is contained in:
rafaelsideguide 2024-08-22 13:39:09 -03:00
commit b1d61d8557
6 changed files with 9 additions and 6 deletions

View File

@@ -44,7 +44,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
} }
const crawlerOptions = req.body.crawlerOptions ?? {}; const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] }; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: true, removeTags: [] };
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try { // try {

View File

@@ -132,7 +132,7 @@ export async function searchController(req: Request, res: Response) {
} }
const crawlerOptions = req.body.crawlerOptions ?? {}; const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { const pageOptions = req.body.pageOptions ?? {
includeHtml: false, includeHtml: true,
onlyMainContent: true, onlyMainContent: true,
fetchPageContent: true, fetchPageContent: true,
removeTags: [], removeTags: [],

View File

@@ -4,7 +4,7 @@ export const defaultTimeout = 45000; // 45 seconds
export const defaultPageOptions = { export const defaultPageOptions = {
onlyMainContent: false, onlyMainContent: false,
includeHtml: false, includeHtml: true,
waitFor: 0, waitFor: 0,
screenshot: false, screenshot: false,
fullPageScreenshot: false, fullPageScreenshot: false,
@@ -17,7 +17,7 @@ export const defaultCrawlerOptions = {
export const defaultCrawlPageOptions = { export const defaultCrawlPageOptions = {
onlyMainContent: false, onlyMainContent: false,
includeHtml: false, includeHtml: true,
removeTags: [], removeTags: [],
parsePDF: true parsePDF: true
} }

View File

@@ -45,6 +45,9 @@ export async function startWebScraperPipeline({
}, },
onSuccess: (result, mode) => { onSuccess: (result, mode) => {
Logger.debug(`🐂 Job completed ${job.id}`); Logger.debug(`🐂 Job completed ${job.id}`);
if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
delete result[0].rawHtml;
}
saveJob(job, result, token, mode); saveJob(job, result, token, mode);
}, },
onError: (error) => { onError: (error) => {

View File

@@ -574,7 +574,7 @@ export class WebScraperDataProvider {
options.crawlerOptions?.generateImgAltText ?? false; options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? { this.pageOptions = options.pageOptions ?? {
onlyMainContent: false, onlyMainContent: false,
includeHtml: false, includeHtml: true,
replaceAllPathsWithAbsolutePaths: false, replaceAllPathsWithAbsolutePaths: false,
parsePDF: true, parsePDF: true,
removeTags: [], removeTags: [],

View File

@@ -125,7 +125,7 @@ export async function scrapSingleUrl(
pageOptions: PageOptions = { pageOptions: PageOptions = {
includeMarkdown: true, includeMarkdown: true,
onlyMainContent: true, onlyMainContent: true,
includeHtml: false, includeHtml: true,
includeRawHtml: false, includeRawHtml: false,
waitFor: 0, waitFor: 0,
screenshot: false, screenshot: false,