Merge pull request #312 from mendableai/rafa/investigating-crawl-bugs

[Bug] Fixed an axios bug that was leaving jobs stuck in the active queue

Commit e5314ee8e7
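
The root cause: several axios.get calls in the crawl pipeline had no timeout, so a request against an unresponsive host could hang indefinitely, and the queue job awaiting it never completed and stayed in the active queue. The fix introduces a shared axiosTimeout constant (3000 ms, added in apps/api/src/lib/timeout.ts below) and passes it to every plain axios request. A minimal sketch of the pattern being applied; fetchWithTimeout is an illustrative helper, not a function in this repo:

import axios from "axios";
import { axiosTimeout } from "./lib/timeout"; // added by this PR: export const axiosTimeout = 3000;

// Illustrative only: the shape every bare axios.get now follows. With a timeout
// set, a dead or stalled host makes the promise reject after ~3 seconds instead
// of hanging the job that awaits it.
async function fetchWithTimeout(url: string): Promise<string | null> {
  try {
    const response = await axios.get(url, { timeout: axiosTimeout });
    return response.data ?? "";
  } catch (error) {
    console.error(`Request to ${url} failed or timed out: ${error}`);
    return null;
  }
}
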
@@ -1,10 +1,10 @@
 ### Crawl Website
 POST http://localhost:3002/v0/scrape HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer fc
 content-type: application/json

 {
-    "url":"https://docs.mendable.ai"
+    "url":"firecrawl.dev"
 }

@@ -14,16 +14,24 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1

 ### Scrape Website
 POST http://localhost:3002/v0/crawl HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer fc-
 content-type: application/json

 {
-    "url":"https://www.mendable.ai",
-    "crawlerOptions": {
-        "returnOnlyUrls": true
-    }
+    "url": "firecrawl.dev"
 }

+## "reoveTags": [],
+# "mode": "crawl",
+# "crawlerOptions": {
+#     "allowBackwardCrawling": false
+# },
+# "pageOptions": {
+#     "onlyMainContent": false,
+#     "includeHtml": false,
+#     "parsePDF": true
+# }

@@ -50,6 +50,5 @@ export function parseMarkdown(html: string) {
     /\[Skip to Content\]\(#[^\)]*\)/gi,
     ""
   );
-
   return markdownContent;
 }

apps/api/src/lib/timeout.ts (new file, 1 line)
@@ -0,0 +1 @@
+export const axiosTimeout = 3000;

@@ -7,6 +7,7 @@ import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
+import { axiosTimeout } from "../../../src/lib/timeout";

 export class WebCrawler {
   private initialUrl: string;

@@ -129,20 +130,16 @@ export class WebCrawler {
   ): Promise<{ url: string, html: string }[]> {
     // Fetch and parse robots.txt
     try {
-      const response = await axios.get(this.robotsTxtUrl);
+      const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
       this.robots = robotsParser(this.robotsTxtUrl, response.data);
     } catch (error) {
       console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
-
     }
-

     if(!crawlerOptions?.ignoreSitemap){
       const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
-
       if (sitemapLinks.length > 0) {
         let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
-
         return filteredLinks.map(link => ({ url: link, html: "" }));
       }
     }

@@ -154,7 +151,6 @@
       inProgress
     );
-

     if (
       urls.length === 0 &&
       this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0

@@ -192,7 +188,6 @@
     // }
     // }
-

     newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));

     if (inProgress && newUrls.length > 0) {

@@ -258,11 +253,12 @@
         pageStatusCode = page.metadata?.pageStatusCode;
         pageError = page.metadata?.pageError || undefined;
       } else {
-        const response = await axios.get(url);
+        const response = await axios.get(url, { timeout: axiosTimeout });
         content = response.data ?? "";
         pageStatusCode = response.status;
         pageError = response.statusText != "OK" ? response.statusText : undefined;
       }

       const $ = load(content);
       let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];

@@ -290,15 +286,15 @@
           !this.matchesExcludes(path) &&
           this.isRobotsAllowed(fullUrl)
         ) {

           links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
         }
       }
     });

     if (this.visited.size === 1) {
       return links;
     }

     // Create a new list to return to avoid modifying the visited list
     return links.filter((link) => !this.visited.has(link.url));
   } catch (error) {

@@ -400,39 +396,32 @@
     let sitemapLinks: string[] = [];

     try {
-      const response = await axios.get(sitemapUrl);
+      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       if (response.status === 200) {
         sitemapLinks = await getLinksFromSitemap(sitemapUrl);
       }
     } catch (error) {
-      // Error handling for failed sitemap fetch
-      // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
+      console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
     }

     if (sitemapLinks.length === 0) {
-      // If the first one doesn't work, try the base URL
       const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
       try {
-        const response = await axios.get(baseUrlSitemap);
+        const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
         if (response.status === 200) {
           sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
         }
       } catch (error) {
-        // Error handling for failed base URL sitemap fetch
-        // console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+        console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
       }
     }

-    // Normalize and check if the URL is present in any of the sitemaps
     const normalizedUrl = normalizeUrl(url);
     const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));

     // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
     if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
-      // do not push the normalized url
       sitemapLinks.push(url);
     }

     return sitemapLinks;
   }
 }

@@ -106,7 +106,6 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
     this.validateInitialUrl();
-
     if (!useCaching) {
       return this.processDocumentsWithoutCache(inProgress);
     }

@@ -264,8 +263,8 @@
       inProgress,
       allHtmls
     );
-    documents = await this.getSitemapData(this.urls[0], documents);

+    documents = await this.getSitemapData(this.urls[0], documents);
     documents = this.applyPathReplacements(documents);
     // documents = await this.applyImgAltText(documents);

@@ -119,7 +119,6 @@ export async function scrapWithScrapingBee(
     wait_browser,
     timeout,
   );
-
   const response = await client.get({
     ...clientParams,
     params: {

@@ -127,7 +126,6 @@
       'transparent_status_code': 'True'
     }
   });
-
   const contentType = response.headers["content-type"];
   if (contentType && contentType.includes("application/pdf")) {
     return await fetchAndProcessPdf(url, pageOptions?.parsePDF);

@@ -398,7 +396,6 @@ export async function scrapSingleUrl(

   //* TODO: add an optional to return markdown or structured/extracted content
   let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
-
   return {
     text: await parseMarkdown(cleanedHtml),
     html: cleanedHtml,

@@ -1,4 +1,5 @@
 import axios from "axios";
+import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";

 export async function getLinksFromSitemap(

@@ -8,7 +9,7 @@ export async function getLinksFromSitemap(
   try {
     let content: string;
     try {
-      const response = await axios.get(sitemapUrl);
+      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       content = response.data;
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);

@@ -42,7 +43,7 @@ export async function getLinksFromSitemap(
 export const fetchSitemapData = async (url: string): Promise<SitemapEntry[] | null> => {
   const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
   try {
-    const response = await axios.get(sitemapUrl);
+    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
     if (response.status === 200) {
       const xml = response.data;
       const parsedXml = await parseStringPromise(xml);

@@ -43,6 +43,10 @@ export function isUrlBlocked(url: string): boolean {
   }

   try {
+    if (!url.startsWith('http://') && !url.startsWith('https://')) {
+      url = 'https://' + url;
+    }
+
     const urlObj = new URL(url);
     const hostname = urlObj.hostname.toLowerCase();

@@ -6,6 +6,7 @@ import dotenv from "dotenv";
 import pdf from "pdf-parse";
 import path from "path";
 import os from "os";
+import { axiosTimeout } from "../../../lib/timeout";

 dotenv.config();

@@ -71,7 +72,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro

   while (attempt < maxAttempts && !resultAvailable) {
     try {
-      resultResponse = await axios.get(resultUrl, { headers });
+      resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
       if (resultResponse.status === 200) {
         resultAvailable = true; // Exit condition met
       } else {

@@ -4,7 +4,7 @@ export async function attemptScrapWithRequests(
   urlToScrap: string
 ): Promise<string | null> {
   try {
-    const response = await axios.get(urlToScrap);
+    const response = await axios.get(urlToScrap, { timeout: 15000 });

     if (!response.data) {
       console.log("Failed normal requests as well");

@@ -14,6 +14,7 @@ if(process.env.ENV === 'production') {
 getWebScraperQueue().process(
   Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
   async function (job, done) {
+
     try {
       job.progress({
         current: 1,

@@ -22,7 +23,6 @@ getWebScraperQueue().process(
         current_url: "",
       });
       const start = Date.now();
-
       const { success, message, docs } = await startWebScraperPipeline({ job });
       const end = Date.now();
       const timeTakenInSeconds = (end - start) / 1000;

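Why a hung request translated into a stuck job: the worker above awaits the whole scrape pipeline before reporting completion. A simplified sketch of that interaction (import paths and the done-callback handling are illustrative, not the repo's exact worker code):

// Simplified sketch: if startWebScraperPipeline awaits an axios.get with no
// timeout against an unresponsive host, the await below never settles, the
// callback never finishes, and the job sits in the active queue indefinitely.
// Import paths are assumed for illustration.
import { getWebScraperQueue } from "./services/queue-service";
import { startWebScraperPipeline } from "./main/runWebScraper";

getWebScraperQueue().process(
  Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
  async function (job, done) {
    try {
      const { success, message, docs } = await startWebScraperPipeline({ job });
      done(null, { success, message, docs }); // reached only once every request settles
    } catch (error) {
      done(error); // with timeouts in place, a dead host ends up here instead of hanging
    }
  }
);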