Merge remote-tracking branch 'origin/main' into nsc/new-extract

rafaelmmiller 2024-11-19 09:34:08 -03:00
commit 36cf49c959
25 changed files with 271 additions and 219 deletions

View File

@@ -75,7 +75,7 @@ export async function crawlController(req: Request, res: Response) {
    await checkTeamCredits(chunk, team_id, limitCheck);
    if (!creditsCheckSuccess) {
-     return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at hello@firecrawl.com" });
+     return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" });
    }
    // TODO: need to do this to v1

View File

@@ -209,7 +209,7 @@ export async function scrapeController(req: Request, res: Response) {
      earlyReturn = true;
      return res.status(500).json({
        error:
-         "Error checking team credits. Please contact hello@firecrawl.com for help.",
+         "Error checking team credits. Please contact help@firecrawl.com for help.",
      });
    }

View File

@@ -16,6 +16,7 @@ import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { getJobPriority } from "../../lib/job-priority";
import { addScrapeJobs } from "../../services/queue-jobs";
+ import { callWebhook } from "../../services/webhook";

export async function batchScrapeController(
  req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
@@ -66,6 +67,7 @@ export async function batchScrapeController(
        crawl_id: id,
        sitemapped: true,
        v1: true,
+       webhook: req.body.webhook,
      },
      opts: {
        jobId: uuidv4(),
@@ -85,6 +87,10 @@
  );
  await addScrapeJobs(jobs);

+ if(req.body.webhook) {
+   await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "batch_scrape.started");
+ }

  const protocol = process.env.ENV === "local" ? req.protocol : "https";
  return res.status(200).json({

View File

@@ -175,7 +175,7 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut
    logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
    return close(ws, 1011, {
      type: "error",
-     error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
+     error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id
    });
  }
}

View File

@@ -1,11 +1,6 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
- import {
-   MapDocument,
-   mapRequestSchema,
-   RequestWithAuth,
-   scrapeOptions,
- } from "./types";
+ import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
import { configDotenv } from "dotenv";
@@ -65,11 +60,13 @@ export async function getMapResults({
}): Promise<MapResult> {
  const id = uuidv4();
  let links: string[] = [url];
+ let mapResults: MapDocument[] = [];

  const sc: StoredCrawl = {
    originUrl: url,
    crawlerOptions: {
      ...crawlerOptions,
+     limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
      scrapeOptions: undefined,
    },
    scrapeOptions: scrapeOptions.parse({}),
@@ -81,6 +78,29 @@
  const crawler = crawlToCrawler(id, sc);

+ // If sitemapOnly is true, only get links from sitemap
+ if (crawlerOptions.sitemapOnly) {
+   if (includeMetadata) {
+     throw new Error("includeMetadata is not supported with sitemapOnly");
+   }
+   const sitemap = await crawler.tryGetSitemap(true, true);
+   if (sitemap !== null) {
+     sitemap.forEach((x) => {
+       links.push(x.url);
+     });
+     links = links.slice(1)
+       .map((x) => {
+         try {
+           return checkAndUpdateURLForMap(x).url.trim();
+         } catch (_) {
+           return null;
+         }
+       })
+       .filter((x) => x !== null) as string[];
+     // links = links.slice(1, limit); // don't slice, unnecessary
+   }
+ } else {
    let urlWithoutWww = url.replace("www.", "");
    let mapUrl = search && allowExternalLinks
@@ -107,13 +127,14 @@
        });
      };

-     pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
+     pagePromises = Array.from({ length: maxPages }, (_, i) =>
+       fetchPage(i + 1)
+     );
      allResults = await Promise.all(pagePromises);
      await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
    }
-   console.log("allResults", allResults);

    // Parallelize sitemap fetch with serper search
    const [sitemap, ...searchResults] = await Promise.all([
      ignoreSitemap ? null : crawler.tryGetSitemap(),
@@ -130,7 +151,7 @@
      });
    }

-   let mapResults : MapDocument[] = allResults
+   mapResults = allResults
      .flat()
      .filter((result) => result !== null && result !== undefined);
@@ -180,6 +201,7 @@
    // remove duplicates that could be due to http/https or www
    links = removeDuplicateUrls(links);
+ }

  const linksToReturn = links.slice(0, limit);
@@ -242,51 +264,3 @@ export async function mapController(
  return res.status(200).json(response);
}

- // Subdomain sitemap url checking
- // // For each result, check for subdomains, get their sitemaps and add them to the links
- // const processedUrls = new Set();
- // const processedSubdomains = new Set();
- // for (const result of links) {
- //   let url;
- //   let hostParts;
- //   try {
- //     url = new URL(result);
- //     hostParts = url.hostname.split('.');
- //   } catch (e) {
- //     continue;
- //   }
- //   console.log("hostParts", hostParts);
- //   // Check if it's a subdomain (more than 2 parts, and not 'www')
- //   if (hostParts.length > 2 && hostParts[0] !== 'www') {
- //     const subdomain = hostParts[0];
- //     console.log("subdomain", subdomain);
- //     const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
- //     console.log("subdomainUrl", subdomainUrl);
- //     if (!processedSubdomains.has(subdomainUrl)) {
- //       processedSubdomains.add(subdomainUrl);
- //       const subdomainCrawl = crawlToCrawler(id, {
- //         originUrl: subdomainUrl,
- //         crawlerOptions: legacyCrawlerOptions(req.body),
- //         pageOptions: {},
- //         team_id: req.auth.team_id,
- //         createdAt: Date.now(),
- //         plan: req.auth.plan,
- //       });
- //       const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
- //       if (subdomainSitemap) {
- //         subdomainSitemap.forEach((x) => {
- //           if (!processedUrls.has(x.url)) {
- //             processedUrls.add(x.url);
- //             links.push(x.url);
- //           }
- //         });
- //       }
- //     }
- //   }
- // }
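The sitemapOnly branch added above short-circuits the usual search-and-rank path and returns only what the sitemap yields (with the crawl limit effectively lifted via the 10000000 override). A rough sketch of how a caller might exercise it through getMapResults, assuming placeholder values and showing only the options relevant to the new branch:

// hypothetical invocation of the function changed in this file
const result = await getMapResults({
  url: "https://example.com",
  crawlerOptions: { sitemapOnly: true },  // take links from the sitemap only
  limit: 5000,
  includeMetadata: false,                 // must stay false: the new branch throws otherwise
});
// the returned links are the cleaned sitemap URLs, capped at `limit` by the final slice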

View File

@@ -11,8 +11,12 @@ export async function scrapeStatusController(req: any, res: any) {
  await rateLimiter.consume(iptoken);

  const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
+ const allowedTeams = [
+   "41bdbfe1-0579-4d9b-b6d5-809f16be12f5",
+   "511544f2-2fce-4183-9c59-6c29b02c69b5"
+ ];

- if(job?.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
+ if(!allowedTeams.includes(job?.team_id)){
    return res.status(403).json({
      success: false,
      error: "You are not allowed to access this resource.",

View File

@@ -119,7 +119,7 @@ export const scrapeOptions = z.object({
  includeTags: z.string().array().optional(),
  excludeTags: z.string().array().optional(),
  onlyMainContent: z.boolean().default(true),
- timeout: z.number().int().positive().finite().safe().default(30000),
+ timeout: z.number().int().positive().finite().safe().optional(),
  waitFor: z.number().int().nonnegative().finite().safe().default(0),
  extract: extractOptions.optional(),
  mobile: z.boolean().default(false),
@@ -170,9 +170,10 @@ export type ExtractV1Options = z.infer<typeof extractV1Options>;
export const extractRequestSchema = extractV1Options;
export type ExtractRequest = z.infer<typeof extractRequestSchema>;

- export const scrapeRequestSchema = scrapeOptions.extend({
+ export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({
    url,
    origin: z.string().optional().default("api"),
+   timeout: z.number().int().positive().finite().safe().default(30000),
  }).strict(strictMessage).refine(
    (obj) => {
      const hasExtractFormat = obj.formats?.includes("extract");
@@ -194,9 +195,21 @@ export const scrapeRequestSchema = scrapeOptions.extend({
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;

+ export const webhookSchema = z.preprocess(x => {
+   if (typeof x === "string") {
+     return { url: x };
+   } else {
+     return x;
+   }
+ }, z.object({
+   url: z.string().url(),
+   headers: z.record(z.string(), z.string()).default({}),
+ }).strict(strictMessage))

export const batchScrapeRequestSchema = scrapeOptions.extend({
  urls: url.array(),
  origin: z.string().optional().default("api"),
+ webhook: webhookSchema.optional(),
}).strict(strictMessage).refine(
  (obj) => {
    const hasExtractFormat = obj.formats?.includes("extract");
@@ -206,12 +219,7 @@ export const batchScrapeRequestSchema = scrapeOptions.extend({
  {
    message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
  }
- ).transform((obj) => {
-   if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
-     return { ...obj, timeout: 60000 };
-   }
-   return obj;
- });
+ );

export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
@@ -239,21 +247,10 @@ const crawlerOptions = z.object({
export type CrawlerOptions = z.infer<typeof crawlerOptions>;

- export const webhookSchema = z.preprocess(x => {
-   if (typeof x === "string") {
-     return { url: x };
-   } else {
-     return x;
-   }
- }, z.object({
-   url: z.string().url(),
-   headers: z.record(z.string(), z.string()).default({}),
- }).strict(strictMessage))

export const crawlRequestSchema = crawlerOptions.extend({
  url,
  origin: z.string().optional().default("api"),
- scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
+ scrapeOptions: scrapeOptions.default({}),
  webhook: webhookSchema.optional(),
  limit: z.number().default(10000),
}).strict(strictMessage);
@@ -279,6 +276,7 @@ export const mapRequestSchema = crawlerOptions.extend({
  includeSubdomains: z.boolean().default(true),
  search: z.string().optional(),
  ignoreSitemap: z.boolean().default(false),
+ sitemapOnly: z.boolean().default(false),
  limit: z.number().min(1).max(5000).default(5000),
}).strict(strictMessage);
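Because webhookSchema (now declared before the batch scrape schema so it can be referenced there) preprocesses its input, a webhook may be supplied either as a bare URL string or as an object; both normalize to the same shape. A quick sketch with placeholder values:

// both forms parse to { url, headers } thanks to the z.preprocess step
webhookSchema.parse("https://example.com/hook");
// => { url: "https://example.com/hook", headers: {} }
webhookSchema.parse({ url: "https://example.com/hook", headers: { "X-Secret": "replace-me" } });
// => { url: "https://example.com/hook", headers: { "X-Secret": "replace-me" } }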

View File

@@ -207,7 +207,7 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response
  }

  logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
- res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
+ res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id });
});

logger.info(`Worker ${process.pid} started`);

View File

@@ -52,7 +52,7 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {
export async function addCrawlJobDone(id: string, job_id: string) {
  await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
- await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id);
+ await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
  await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
  await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX");
}
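Switching from lpush to rpush appends each finished job id to the tail of jobs_done_ordered, so the list reads back in completion order. A small sketch of the assumed consumer side (the reader shown here is an assumption, not part of this diff):

// reading the whole list front-to-back
const doneOrdered = await redisConnection.lrange("crawl:" + id + ":jobs_done_ordered", 0, -1);
// with rpush, doneOrdered[0] is the first job that finished; lpush would have reversed the order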

View File

@@ -6,22 +6,28 @@ import * as Sentry from "@sentry/node";
import dotenv from 'dotenv';
import { logger } from './logger';
+ import { stat } from 'fs/promises';

dotenv.config();

// TODO: add a timeout to the Go parser
+ const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so');

class GoMarkdownConverter {
  private static instance: GoMarkdownConverter;
  private convert: any;

  private constructor() {
-   const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so');
    const lib = koffi.load(goExecutablePath);
    this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']);
  }

- public static getInstance(): GoMarkdownConverter {
+ public static async getInstance(): Promise<GoMarkdownConverter> {
    if (!GoMarkdownConverter.instance) {
+     try {
+       await stat(goExecutablePath);
+     } catch (_) {
+       throw Error("Go shared library not found");
+     }
      GoMarkdownConverter.instance = new GoMarkdownConverter();
    }
    return GoMarkdownConverter.instance;
@@ -47,7 +53,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
  try {
    if (process.env.USE_GO_MARKDOWN_PARSER == "true") {
-     const converter = GoMarkdownConverter.getInstance();
+     const converter = await GoMarkdownConverter.getInstance();
      let markdownContent = await converter.convertHTMLToMarkdown(html);

      markdownContent = processMultiLineLinks(markdownContent);
@@ -56,8 +62,12 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
      return markdownContent;
    }
  } catch (error) {
+   if (!(error instanceof Error) || error.message !== "Go shared library not found") {
      Sentry.captureException(error);
      logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
+   } else {
+     logger.warn("Tried to use Go parser, but it doesn't exist in the file system.", { goExecutablePath });
+   }
  }

  // Fallback to TurndownService if Go parser fails or is not enabled
@@ -89,7 +99,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
    return markdownContent;
  } catch (error) {
-   console.error("Error converting HTML to Markdown: ", error);
+   logger.error("Error converting HTML to Markdown", {error});
    return ""; // Optionally return an empty string or handle the error as needed
  }
}
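Taken together, these changes make a missing shared library degrade gracefully instead of surfacing as a Sentry error. A sketch of the expected behavior, assuming USE_GO_MARKDOWN_PARSER is enabled but html-to-markdown.so is absent (the input HTML is illustrative):

process.env.USE_GO_MARKDOWN_PARSER = "true";
// getInstance() now stat()s goExecutablePath before touching koffi and throws "Go shared library not found",
// so parseMarkdown logs a warning rather than capturing an exception, and falls back to TurndownService
const markdown = await parseMarkdown("<h1>Hello</h1><p>world</p>");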

View File

@@ -65,7 +65,12 @@ export class WebCrawler {
    this.allowExternalContentLinks = allowExternalContentLinks ?? false;
  }

- public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
+ public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
+   // If the initial URL is a sitemap.xml, skip filtering
+   if (this.initialUrl.endsWith('sitemap.xml') && fromMap) {
+     return sitemapLinks.slice(0, limit);
+   }
    return sitemapLinks
      .filter((link) => {
        let url: URL;
@@ -159,11 +164,14 @@
    this.robots = robotsParser(this.robotsTxtUrl, txt);
  }

- public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
+ public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> {
    logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
    const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
+   if(fromMap && onlySitemap) {
+     return sitemapLinks.map(link => ({ url: link, html: "" }));
+   }
    if (sitemapLinks.length > 0) {
-     let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
+     let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth, fromMap);
      return filteredLinks.map(link => ({ url: link, html: "" }));
    }
    return null;
@@ -353,7 +361,8 @@
    return url;
  };

- const sitemapUrl = url.endsWith("/sitemap.xml")
+ const sitemapUrl = url.endsWith(".xml")
    ? url
    : `${url}/sitemap.xml`;

View File

@@ -24,7 +24,7 @@ export async function getLinksFromSitemap(
    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
    content = response.data;
  } else if (mode === 'fire-engine') {
-   const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;playwright" });
+   const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true });
    if (!response.success) {
      throw response.error;
    }

View File

@@ -87,6 +87,7 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<Engi
    priority: meta.internalOptions.priority,
    geolocation: meta.options.geolocation,
    mobile: meta.options.mobile,
+   timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
    // TODO: scrollXPaths
  };
@@ -95,7 +96,9 @@
  let response = await performFireEngineScrape(
    meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
    request,
-   defaultTimeout + totalWait,
+   meta.options.timeout !== undefined
+     ? defaultTimeout + totalWait
+     : Infinity, // TODO: better timeout handling
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" }), response.responseHeaders);
@@ -140,12 +143,16 @@ export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<Eng
    fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
    wait: meta.options.waitFor,
    geolocation: meta.options.geolocation,
+   timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
  };

  let response = await performFireEngineScrape(
    meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
    request,
-   defaultTimeout + meta.options.waitFor
+   meta.options.timeout !== undefined
+     ? defaultTimeout + meta.options.waitFor
+     : Infinity, // TODO: better timeout handling
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" }), response.responseHeaders);
@@ -179,11 +186,16 @@ export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<Engi
    atsv: meta.internalOptions.atsv,
    geolocation: meta.options.geolocation,
    disableJsDom: meta.internalOptions.v0DisableJsDom,
+   timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
  };

  let response = await performFireEngineScrape(
    meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
    request,
+   meta.options.timeout !== undefined
+     ? defaultTimeout
+     : Infinity, // TODO: better timeout handling
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" }), response.responseHeaders);

View File

@@ -25,6 +25,8 @@ export type FireEngineScrapeRequestCommon = {
  logRequest?: boolean; // default: true
  instantReturn?: boolean; // default: false
  geolocation?: { country?: string; languages?: string[]; };
+ timeout?: number;
}

export type FireEngineScrapeRequestChromeCDP = {

View File

@@ -13,12 +13,12 @@ export async function scrapeURLWithPlaywright(meta: Meta): Promise<EngineScrapeR
    headers: {
      "Content-Type": "application/json",
    },
-   body: JSON.stringify({
+   body: {
      url: meta.url,
      wait_after_load: meta.options.waitFor,
      timeout,
      headers: meta.options.headers,
-   }),
+   },
    method: "POST",
    logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
    schema: z.object({

View File

@@ -18,7 +18,7 @@ export class NoEnginesLeftError extends Error {
  public results: EngineResultsTracker;

  constructor(fallbackList: Engine[], results: EngineResultsTracker) {
-   super("All scraping engines failed!");
+   super("All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com.");
    this.fallbackList = fallbackList;
    this.results = results;
  }

View File

@@ -5,7 +5,6 @@ import { logger } from "../lib/logger";
dotenv.config();

export async function fireEngineMap(
  q: string,
  options: {
@@ -40,14 +39,13 @@ export async function fireEngineMap(
      method: "POST",
      headers: {
        "Content-Type": "application/json",
-       "X-Disable-Cache": "true"
+       "X-Disable-Cache": "true",
      },
-     body: data
+     body: data,
    });

    if (response.ok) {
      const responseData = await response.json();
-     console.log("response", responseData);
      return responseData;
    } else {
      return [];

View File

@@ -7,7 +7,7 @@ import { logger } from "../../lib/logger";
import { configDotenv } from "dotenv";
configDotenv();

- export async function logJob(job: FirecrawlJob) {
+ export async function logJob(job: FirecrawlJob, force: boolean = false) {
  try {
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
    if (!useDbAuthentication) {
@@ -23,11 +23,7 @@ export async function logJob(job: FirecrawlJob) {
      job.scrapeOptions.headers["Authorization"] = "REDACTED";
      job.docs = [{ content: "REDACTED DUE TO AUTHORIZATION HEADER", html: "REDACTED DUE TO AUTHORIZATION HEADER" }];
    }

-   const { data, error } = await supabase_service
-     .from("firecrawl_jobs")
-     .insert([
-       {
+   const jobColumn = {
      job_id: job.job_id ? job.job_id : null,
      success: job.success,
      message: job.message,
@@ -43,8 +39,36 @@
      num_tokens: job.num_tokens,
      retry: !!job.retry,
      crawl_id: job.crawl_id,
-     },
-   ]);
+   };
+
+   if (force) {
+     while (true) {
+       try {
+         const { error } = await supabase_service
+           .from("firecrawl_jobs")
+           .insert([jobColumn]);
+         if (error) {
+           logger.error("Failed to log job due to Supabase error -- trying again", { error, scrapeId: job.job_id });
+           await new Promise<void>((resolve) => setTimeout(() => resolve(), 75));
+         } else {
+           break;
+         }
+       } catch (error) {
+         logger.error("Failed to log job due to thrown error -- trying again", { error, scrapeId: job.job_id });
+         await new Promise<void>((resolve) => setTimeout(() => resolve(), 75));
+       }
+     }
+     logger.debug("Job logged successfully!", { scrapeId: job.job_id });
+   } else {
+     const { error } = await supabase_service
+       .from("firecrawl_jobs")
+       .insert([jobColumn]);
+     if (error) {
+       logger.error(`Error logging job: ${error.message}`, { error, scrapeId: job.job_id });
+     } else {
+       logger.debug("Job logged successfully!", { scrapeId: job.job_id });
+     }
+   }

    if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
      let phLog = {
@@ -72,9 +96,7 @@
      posthog.capture(phLog);
    }
  }
- if (error) {
-   logger.error(`Error logging job: ${error.message}`);
- }
  } catch (error) {
    logger.error(`Error logging job: ${error.message}`);
  }

View File

@@ -23,7 +23,7 @@ const emailTemplates: Record<
  },
  [NotificationType.RATE_LIMIT_REACHED]: {
    subject: "Rate Limit Reached - Firecrawl",
-   html: "Hey there,<br/><p>You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our <a href='https://firecrawl.dev/pricing'>pricing page</a> for more info.</p><p>If you have any questions, feel free to reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
+   html: "Hey there,<br/><p>You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our <a href='https://firecrawl.dev/pricing'>pricing page</a> for more info.</p><p>If you have any questions, feel free to reach out to us at <a href='mailto:help@firecrawl.com'>help@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
  },
  [NotificationType.AUTO_RECHARGE_SUCCESS]: {
    subject: "Auto recharge successful - Firecrawl",
@@ -31,7 +31,7 @@
  },
  [NotificationType.AUTO_RECHARGE_FAILED]: {
    subject: "Auto recharge failed - Firecrawl",
-   html: "Hey there,<br/><p>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/>",
+   html: "Hey there,<br/><p>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at <a href='mailto:help@firecrawl.com'>help@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/>",
  },
};
@@ -63,7 +63,7 @@ export async function sendEmailNotification(
  const { data, error } = await resend.emails.send({
    from: "Firecrawl <firecrawl@getmendableai.com>",
    to: [email],
-   reply_to: "hello@firecrawl.com",
+   reply_to: "help@firecrawl.com",
    subject: emailTemplates[notificationType].subject,
    html: emailTemplates[notificationType].html,
  });

View File

@@ -262,7 +262,7 @@ async function processJob(job: Job & { id: string }, token: string) {
      document: null,
      project_id: job.data.project_id,
      error:
-       "URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
+       "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.",
    };
    return data;
  }
@@ -346,7 +346,7 @@
      scrapeOptions: job.data.scrapeOptions,
      origin: job.data.origin,
      crawl_id: job.data.crawl_id,
-   });
+   }, true);

    await addCrawlJobDone(job.data.crawl_id, job.id);
@@ -486,7 +486,7 @@
        url: sc?.originUrl ?? (job.data.crawlerOptions === null ? "Batch Scrape" : "Unknown"),
        crawlerOptions: sc.crawlerOptions,
        origin: job.data.origin,
-     });
+     }, true);
      }
    }
  }
@@ -566,7 +566,7 @@
      scrapeOptions: job.data.scrapeOptions,
      origin: job.data.origin,
      crawl_id: job.data.crawl_id,
-   });
+   }, true);

    // await logJob({
    //   job_id: job.data.crawl_id,

View File

@@ -46,6 +46,8 @@ export const callWebhook = async (
    webhookUrl = webhooksData[0].url;
  }

+ logger.debug("Calling webhook...", { webhookUrl, teamId, specified, v1, eventType, awaitWebhook });

  if (!webhookUrl) {
    return null;
  }
@@ -128,7 +130,6 @@
        "Content-Type": "application/json",
        ...webhookUrl.headers,
      },
-     timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1)
    }
  )
  .catch((error) => {

View File

@@ -175,4 +175,4 @@ export type PlanType =
  | "";

- export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";
+ export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "batch_scrape.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";

View File

@@ -221,6 +221,7 @@ export interface MapParams {
  search?: string;
  ignoreSitemap?: boolean;
  includeSubdomains?: boolean;
+ sitemapOnly?: boolean;
  limit?: number;
}
@@ -563,16 +564,18 @@ export default class FirecrawlApp {
   * @param params - Additional parameters for the scrape request.
   * @param pollInterval - Time in seconds for job status checks.
   * @param idempotencyKey - Optional idempotency key for the request.
+  * @param webhook - Optional webhook for the batch scrape.
   * @returns The response from the crawl operation.
   */
  async batchScrapeUrls(
    urls: string[],
    params?: ScrapeParams,
    pollInterval: number = 2,
-   idempotencyKey?: string
+   idempotencyKey?: string,
+   webhook?: CrawlParams["webhook"],
  ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
    const headers = this.prepareHeaders(idempotencyKey);
-   let jsonData: any = { urls, ...(params ?? {}) };
+   let jsonData: any = { urls, ...(params ?? {}), webhook };
    try {
      const response: AxiosResponse = await this.postRequest(
        this.apiUrl + `/v1/batch/scrape`,
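With the extra parameter, SDK callers pass the webhook after the idempotency key and it is merged into the request body. A brief usage sketch, assuming a placeholder API key and URLs (the scrape params shown are illustrative):

// hypothetical SDK usage reflecting the new signature
const app = new FirecrawlApp({ apiKey: "fc-YOUR-KEY" });
const result = await app.batchScrapeUrls(
  ["https://example.com/a", "https://example.com/b"],
  { formats: ["markdown"] },                      // ScrapeParams
  2,                                              // pollInterval in seconds
  undefined,                                      // idempotencyKey
  { url: "https://example.com/firecrawl-hook" },  // webhook; the API also accepts a bare URL string
);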

View File

@@ -0,0 +1,6 @@
# AGI News ✨
AGI News is a daily AI newsletter that's completely sourced by autonomous AI agents. It is live at [https://www.aginews.io/](https://www.aginews.io/)
Here is a link to the repo:
[https://github.com/ericciarla/aginews](https://github.com/ericciarla/aginews)

View File

@@ -0,0 +1,7 @@
# Generate AI podcasts based on real time news 🎙️
This example crawls the web for interesting news stories, then records a podcast with your own voice.
Here is a link to the repo:
[https://github.com/ericciarla/aginews-podcast](https://github.com/ericciarla/aginews-podcast)