Nick: metadata fixes and lock duration for bull decreased to 2 hrs
parent e5314ee8e7
commit e7be17db92
@@ -399,12 +399,14 @@ export async function scrapSingleUrl(
     return {
       text: await parseMarkdown(cleanedHtml),
       html: cleanedHtml,
+      rawHtml: scraperResponse.text,
       screenshot: scraperResponse.screenshot,
       pageStatusCode: scraperResponse.metadata.pageStatusCode,
       pageError: scraperResponse.metadata.pageError || undefined
     };
   };
-  let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
+
+  let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = { text: "", html: "", rawHtml: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
   try {
     let urlKey = urlToScrap;
     try {
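
The let statement changed above is just a compact idiom: destructuring an object literal declares all of the per-page result variables in one line and gives each an empty default, which the scraper attempts later overwrite. A tiny illustrative sketch of that idiom (variable names taken from the diff; the sample values and the type widening are mine):

  // One declaration creates every result variable with an empty default;
  // whichever scraper succeeds later simply reassigns the fields it produced.
  let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = {
    text: "",
    html: "",
    rawHtml: "",
    screenshot: "",
    pageStatusCode: 200,
    pageError: undefined as string | undefined, // widen the type so an error message can be assigned later
  };

  rawHtml = "<html><head><title>Example</title></head><body></body></html>";
  pageError = "page took too long to load";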
@@ -432,6 +434,7 @@ export async function scrapSingleUrl(
       const attempt = await attemptScraping(urlToScrap, scraper);
       text = attempt.text ?? '';
       html = attempt.html ?? '';
+      rawHtml = attempt.rawHtml ?? '';
       screenshot = attempt.screenshot ?? '';
       if (attempt.pageStatusCode) {
         pageStatusCode = attempt.pageStatusCode;
@@ -453,7 +456,7 @@ export async function scrapSingleUrl(
     throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
   }

-  const soup = cheerio.load(html);
+  const soup = cheerio.load(rawHtml);
   const metadata = extractMetadata(soup, urlToScrap);

   let document: Document;
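
Read together, the three hunks above thread a new rawHtml value (the scraper's unmodified response body) through scrapSingleUrl: attemptScraping now returns it, the per-scraper retry loop stores it, and cheerio parses it for metadata instead of the cleaned HTML. That appears to be the "metadata fixes" in the commit title, since cleaning can strip the very tags extractMetadata needs. A minimal sketch of the shape this implies; the ScrapeAttempt interface and pickTitle helper are illustrative stand-ins, not the repository's actual types:

  import * as cheerio from "cheerio";

  // Illustrative shape of a single scraping attempt after this commit.
  interface ScrapeAttempt {
    text: string;        // markdown rendered from the cleaned HTML
    html: string;        // cleaned HTML
    rawHtml: string;     // unmodified HTML from the scraper (the new field)
    screenshot: string;
    pageStatusCode?: number;
    pageError?: string;
  }

  // Hypothetical helper showing the metadata path after the change:
  // parse the raw HTML so <head> metadata survives aggressive cleaning.
  function pickTitle(attempt: ScrapeAttempt): string | undefined {
    const $ = cheerio.load(attempt.rawHtml); // was the cleaned html before this commit
    return $("title").first().text() || undefined;
  }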
@@ -7,7 +7,7 @@ export function getWebScraperQueue() {
   if (!webScraperQueue) {
     webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, {
       settings: {
-        lockDuration: 4 * 60 * 60 * 1000, // 4 hours in milliseconds,
+        lockDuration: 2 * 60 * 60 * 1000, // 2 hours in milliseconds,
         lockRenewTime: 30 * 60 * 1000, // 30 minutes in milliseconds
       },
     });
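
For context on the last hunk: lockDuration and lockRenewTime are Bull's stall-detection settings. A worker must keep renewing a job's lock within lockDuration or Bull treats the job as stalled and may hand it to another worker, and lockRenewTime is how often the running worker renews that lock; this commit halves the lock window from 4 hours to 2. A small self-contained sketch with the values from the diff (only the localhost Redis fallback is an assumption here):

  import Queue from "bull";

  // Same queue name and lock settings as the diff; the localhost fallback is illustrative.
  const webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL ?? "redis://localhost:6379", {
    settings: {
      lockDuration: 2 * 60 * 60 * 1000, // lock valid for 2 hours (was 4) before the job counts as stalled
      lockRenewTime: 30 * 60 * 1000,    // worker renews the lock every 30 minutes while the job runs
    },
  });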