mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl, synced 2025-08-11 00:38:58 +08:00
Nick:

This commit is contained in:
parent 21d29de819
commit 56d42d9c9b
apps/api/src/lib/timeout.ts (Normal file, 1 line added)
@@ -0,0 +1 @@
+export const axiosTimeout = 3000;
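The new module exports a single shared constant; the rest of the commit swaps the hardcoded 3000 ms values in the various axios.get calls for it. A minimal sketch of the pattern, assuming a hypothetical helper (fetchWithTimeout) and an illustrative import path, neither of which is part of this commit:

import axios from "axios";
import { axiosTimeout } from "./lib/timeout";

// Hypothetical helper: every request made through it inherits the shared timeout,
// so changing the limit in one place (timeout.ts) affects all callers.
export async function fetchWithTimeout(url: string): Promise<string> {
  // axios rejects the promise if the request takes longer than `timeout` ms.
  const response = await axios.get(url, { timeout: axiosTimeout });
  return response.data ?? "";
}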
@@ -7,6 +7,7 @@ import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
+import { axiosTimeout } from "../../../src/lib/timeout";
 
 export class WebCrawler {
   private initialUrl: string;
@@ -131,7 +132,7 @@ export class WebCrawler {
     try {
       console.log('3.1 here OK')
       console.log('this.robotsTxtUrl:', this.robotsTxtUrl)
-      const response = await axios.get(this.robotsTxtUrl, { timeout: 3000 });
+      const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
       console.log('????', {response})
       console.log('3.2 here OK')
       this.robots = robotsParser(this.robotsTxtUrl, response.data);
@@ -274,7 +275,7 @@ export class WebCrawler {
         pageError = page.metadata?.pageError || undefined;
       } else {
         // console.log('crawl - else')
-        const response = await axios.get(url, { timeout: 3000 });
+        const response = await axios.get(url, { timeout: axiosTimeout });
         console.log('crawl - else - response ok')
         content = response.data ?? "";
         pageStatusCode = response.status;
@@ -312,6 +313,7 @@ export class WebCrawler {
         !this.matchesExcludes(path) &&
         this.isRobotsAllowed(fullUrl)
       ) {
+        console.log(fullUrl)
 
         links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
       }
@@ -428,7 +430,7 @@ export class WebCrawler {
 
       console.log("4.1.3 - Fetching sitemap from constructed URL");
      try {
-        const response = await axios.get(sitemapUrl, { timeout: 3000 });
+        const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
        if (response.status === 200) {
          console.log("4.1.4 - Extracting links from sitemap");
          sitemapLinks = await getLinksFromSitemap(sitemapUrl);
@@ -441,7 +443,7 @@ export class WebCrawler {
      console.log("4.1.5 - Trying base URL sitemap as fallback");
      const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
      try {
-        const response = await axios.get(baseUrlSitemap, { timeout: 3000 });
+        const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
        if (response.status === 200) {
          console.log("4.1.6 - Extracting links from base URL sitemap");
          sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
@@ -1,4 +1,5 @@
 import axios from "axios";
+import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";
 
 export async function getLinksFromSitemap(
@@ -8,7 +9,7 @@ export async function getLinksFromSitemap(
   try {
     let content: string;
     try {
-      const response = await axios.get(sitemapUrl, { timeout: 3000 });
+      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       content = response.data;
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);
@@ -42,7 +43,7 @@ export async function getLinksFromSitemap(
 export const fetchSitemapData = async (url: string): Promise<SitemapEntry[] | null> => {
   const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
   try {
-    const response = await axios.get(sitemapUrl, { timeout: 3000 });
+    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
     if (response.status === 200) {
       const xml = response.data;
       const parsedXml = await parseStringPromise(xml);
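The sitemap helpers above catch any request failure and log a generic message. As a hedged sketch only (not part of this commit), a timeout could be told apart from other failures, assuming axios's usual behavior of rejecting timed-out requests with the code "ECONNABORTED":

import axios from "axios";
import { axiosTimeout } from "../../lib/timeout";

// Illustrative variant of the sitemap fetch with a more specific error message.
export async function fetchSitemapXml(sitemapUrl: string): Promise<string | null> {
  try {
    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
    return response.data;
  } catch (error) {
    if (axios.isAxiosError(error) && error.code === "ECONNABORTED") {
      // The request exceeded the shared axiosTimeout budget.
      console.error(`Request timed out after ${axiosTimeout}ms for ${sitemapUrl}`);
    } else {
      console.error(`Request failed for ${sitemapUrl}: ${error}`);
    }
    return null;
  }
}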
@@ -6,6 +6,7 @@ import dotenv from "dotenv";
 import pdf from "pdf-parse";
 import path from "path";
 import os from "os";
+import { axiosTimeout } from "../../../lib/timeout";
 
 dotenv.config();
 
@@ -71,7 +72,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
 
   while (attempt < maxAttempts && !resultAvailable) {
     try {
-      resultResponse = await axios.get(resultUrl, { headers, timeout: 6000 });
+      resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
       if (resultResponse.status === 200) {
         resultAvailable = true; // Exit condition met
       } else {
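The last hunk gives the slower PDF-result polling endpoint twice the base timeout (axiosTimeout * 2) instead of the previous hardcoded 6000 ms. Another way to centralize the same default, shown purely as a hedged sketch with an illustrative import path (this commit does not create an axios instance):

import axios from "axios";
import { axiosTimeout } from "./lib/timeout";

// Every request made through this instance defaults to the shared timeout.
const http = axios.create({ timeout: axiosTimeout });

// A slower endpoint can still override the default per request, mirroring the
// `axiosTimeout * 2` used for the PDF result polling above.
export async function pollPdfResult(resultUrl: string, headers: Record<string, string>) {
  return http.get(resultUrl, { headers, timeout: axiosTimeout * 2 });
}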