Merge pull request #312 from mendableai/rafa/investigating-crawl-bugs

[Bug] Fixed axios bug that was making jobs get stuck in the active queue
This commit is contained in:
Nicolas 2024-06-24 16:52:34 -03:00 committed by GitHub
commit e5314ee8e7
11 changed files with 38 additions and 39 deletions
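The change that repeats across these files: axios applies no timeout by default (timeout: 0), so a GET to an unresponsive host could hang indefinitely and the worker job awaiting it never left the active queue. The fix adds a shared axiosTimeout constant and passes it to every bare axios.get call. A minimal sketch of the pattern, assuming the new src/lib/timeout module; the fetchPage wrapper is illustrative, not code from this commit:

import axios from "axios";
import { axiosTimeout } from "../../lib/timeout"; // relative path varies per caller

// Illustrative wrapper: with an explicit timeout, a request that never
// answers rejects after axiosTimeout ms instead of hanging the awaiting job.
async function fetchPage(url: string): Promise<string> {
  const response = await axios.get(url, { timeout: axiosTimeout });
  return response.data ?? "";
}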

View File

@@ -1,10 +1,10 @@
 ### Crawl Website
 POST http://localhost:3002/v0/scrape HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer fc
 content-type: application/json
 
 {
-  "url":"https://docs.mendable.ai"
+  "url":"firecrawl.dev"
 }
@@ -14,16 +14,24 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1
 ### Scrape Website
 POST http://localhost:3002/v0/crawl HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer fc-
 content-type: application/json
 
 {
-  "url":"https://www.mendable.ai",
-  "crawlerOptions": {
-    "returnOnlyUrls": true
-  }
+  "url": "firecrawl.dev"
 }
+## "reoveTags": [],
+# "mode": "crawl",
+# "crawlerOptions": {
+#   "allowBackwardCrawling": false
+# },
+# "pageOptions": {
+#   "onlyMainContent": false,
+#   "includeHtml": false,
+#   "parsePDF": true
+# }

View File

@ -50,6 +50,5 @@ export function parseMarkdown(html: string) {
/\[Skip to Content\]\(#[^\)]*\)/gi, /\[Skip to Content\]\(#[^\)]*\)/gi,
"" ""
); );
return markdownContent; return markdownContent;
} }

View File

@@ -0,0 +1 @@
+export const axiosTimeout = 3000;
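This new file is the whole shared policy: a single 3000 ms default for plain HTTP fetches. Call sites that need more headroom still override per request, as later hunks in this diff do (axiosTimeout * 2 for PDF result polling, a flat 15000 ms for the plain-requests fallback). A small sketch of that override pattern; fetchWithBudget and its slow flag are hypothetical:

import axios from "axios";
import { axiosTimeout } from "./timeout"; // import path illustrative; the constant is the one above

// Hypothetical helper: ordinary fetches take the shared 3 s budget, slower
// endpoints pass a larger explicit timeout, mirroring the overrides below.
async function fetchWithBudget(url: string, slow = false) {
  return axios.get(url, { timeout: slow ? axiosTimeout * 2 : axiosTimeout });
}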

View File

@@ -7,6 +7,7 @@ import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
+import { axiosTimeout } from "../../../src/lib/timeout";
 
 export class WebCrawler {
   private initialUrl: string;
@@ -129,20 +130,16 @@ export class WebCrawler {
   ): Promise<{ url: string, html: string }[]> {
     // Fetch and parse robots.txt
     try {
-      const response = await axios.get(this.robotsTxtUrl);
+      const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
       this.robots = robotsParser(this.robotsTxtUrl, response.data);
     } catch (error) {
       console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
     }
 
     if(!crawlerOptions?.ignoreSitemap){
       const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
       if (sitemapLinks.length > 0) {
         let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
         return filteredLinks.map(link => ({ url: link, html: "" }));
       }
     }
@@ -154,7 +151,6 @@ export class WebCrawler {
       inProgress
     );
 
     if (
       urls.length === 0 &&
       this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
@@ -192,7 +188,6 @@ export class WebCrawler {
     // }
     // }
     newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
 
     if (inProgress && newUrls.length > 0) {
@@ -258,11 +253,12 @@ export class WebCrawler {
         pageStatusCode = page.metadata?.pageStatusCode;
         pageError = page.metadata?.pageError || undefined;
       } else {
-        const response = await axios.get(url);
+        const response = await axios.get(url, { timeout: axiosTimeout });
         content = response.data ?? "";
         pageStatusCode = response.status;
         pageError = response.statusText != "OK" ? response.statusText : undefined;
       }
 
       const $ = load(content);
       let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];
@@ -290,15 +286,15 @@ export class WebCrawler {
           !this.matchesExcludes(path) &&
           this.isRobotsAllowed(fullUrl)
         ) {
           links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
         }
       }
     });
 
     if (this.visited.size === 1) {
       return links;
     }
 
     // Create a new list to return to avoid modifying the visited list
     return links.filter((link) => !this.visited.has(link.url));
   } catch (error) {
@@ -400,39 +396,32 @@ export class WebCrawler {
     let sitemapLinks: string[] = [];
     try {
-      const response = await axios.get(sitemapUrl);
+      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       if (response.status === 200) {
         sitemapLinks = await getLinksFromSitemap(sitemapUrl);
       }
     } catch (error) {
-      // Error handling for failed sitemap fetch
-      // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
+      console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
     }
     if (sitemapLinks.length === 0) {
-      // If the first one doesn't work, try the base URL
       const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
       try {
-        const response = await axios.get(baseUrlSitemap);
+        const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
         if (response.status === 200) {
           sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
         }
       } catch (error) {
-        // Error handling for failed base URL sitemap fetch
-        // console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+        console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
       }
     }
-    // Normalize and check if the URL is present in any of the sitemaps
     const normalizedUrl = normalizeUrl(url);
     const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
     // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
     if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
-      // do not push the normalized url
       sitemapLinks.push(url);
     }
     return sitemapLinks;
   }
 }

View File

@@ -106,7 +106,6 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
     this.validateInitialUrl();
     if (!useCaching) {
       return this.processDocumentsWithoutCache(inProgress);
     }
@@ -264,8 +263,8 @@ export class WebScraperDataProvider {
       inProgress,
       allHtmls
     );
-    documents = await this.getSitemapData(this.urls[0], documents);
     documents = await this.getSitemapData(this.urls[0], documents);
     documents = this.applyPathReplacements(documents);
     // documents = await this.applyImgAltText(documents);

View File

@@ -119,7 +119,6 @@ export async function scrapWithScrapingBee(
     wait_browser,
     timeout,
   );
   const response = await client.get({
     ...clientParams,
     params: {
@@ -127,7 +126,6 @@ export async function scrapWithScrapingBee(
       'transparent_status_code': 'True'
     }
   });
   const contentType = response.headers["content-type"];
   if (contentType && contentType.includes("application/pdf")) {
     return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
@@ -398,7 +396,6 @@ export async function scrapSingleUrl(
   //* TODO: add an optional to return markdown or structured/extracted content
   let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
   return {
     text: await parseMarkdown(cleanedHtml),
     html: cleanedHtml,

View File

@@ -1,4 +1,5 @@
 import axios from "axios";
+import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";
 
 export async function getLinksFromSitemap(
@@ -8,7 +9,7 @@ export async function getLinksFromSitemap(
   try {
     let content: string;
     try {
-      const response = await axios.get(sitemapUrl);
+      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       content = response.data;
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);
@@ -42,7 +43,7 @@ export async function getLinksFromSitemap(
 export const fetchSitemapData = async (url: string): Promise<SitemapEntry[] | null> => {
   const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
   try {
-    const response = await axios.get(sitemapUrl);
+    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
     if (response.status === 200) {
       const xml = response.data;
       const parsedXml = await parseStringPromise(xml);

View File

@@ -43,6 +43,10 @@ export function isUrlBlocked(url: string): boolean {
   }
 
   try {
+    if (!url.startsWith('http://') && !url.startsWith('https://')) {
+      url = 'https://' + url;
+    }
+
     const urlObj = new URL(url);
     const hostname = urlObj.hostname.toLowerCase();
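
Context for the hunk above: the sample requests now send bare hostnames such as "firecrawl.dev", and the WHATWG URL constructor throws on scheme-less input, so without the prefix check isUrlBlocked would fall into its error handling for every such URL. A minimal sketch of the behavior, using Node's global URL:

// Scheme-less input throws; prepending a scheme makes parsing succeed.
try {
  new URL("firecrawl.dev"); // throws TypeError: Invalid URL
} catch {
  const parsed = new URL("https://" + "firecrawl.dev");
  console.log(parsed.hostname); // "firecrawl.dev"
}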

View File

@@ -6,6 +6,7 @@ import dotenv from "dotenv";
 import pdf from "pdf-parse";
 import path from "path";
 import os from "os";
+import { axiosTimeout } from "../../../lib/timeout";
 
 dotenv.config();
@@ -71,7 +72,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
   while (attempt < maxAttempts && !resultAvailable) {
     try {
-      resultResponse = await axios.get(resultUrl, { headers });
+      resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
       if (resultResponse.status === 200) {
         resultAvailable = true; // Exit condition met
       } else {
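
The result-polling loop above is the one call expected to be slow, so each attempt gets double the shared budget (axiosTimeout * 2, i.e. 6000 ms) while the loop stays bounded by maxAttempts. A minimal sketch of that bounded-retry shape; pollForResult and its parameters are illustrative, not the project's code:

import axios from "axios";
import { axiosTimeout } from "../../../lib/timeout"; // path as in the hunk above

// Illustrative polling helper: each attempt is capped at axiosTimeout * 2 ms,
// so a stalled result endpoint cannot hang the worker, and the loop gives up
// after maxAttempts tries.
async function pollForResult(resultUrl: string, headers: Record<string, string>, maxAttempts = 10) {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    try {
      const res = await axios.get(resultUrl, { headers, timeout: axiosTimeout * 2 });
      if (res.status === 200) return res.data;
    } catch {
      // timeout or transient error: retry on the next iteration
    }
  }
  return null;
}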

View File

@@ -4,7 +4,7 @@ export async function attemptScrapWithRequests(
   urlToScrap: string
 ): Promise<string | null> {
   try {
-    const response = await axios.get(urlToScrap);
+    const response = await axios.get(urlToScrap, { timeout: 15000 });
     if (!response.data) {
       console.log("Failed normal requests as well");

View File

@@ -14,6 +14,7 @@ if(process.env.ENV === 'production') {
 getWebScraperQueue().process(
   Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
   async function (job, done) {
     try {
       job.progress({
         current: 1,
@@ -22,7 +23,6 @@ getWebScraperQueue().process(
         current_url: "",
       });
       const start = Date.now();
       const { success, message, docs } = await startWebScraperPipeline({ job });
       const end = Date.now();
       const timeTakenInSeconds = (end - start) / 1000;