Merge remote-tracking branch 'origin/main' into nsc/new-extract

rafaelmmiller 2024-11-19 09:34:08 -03:00
commit 36cf49c959
25 changed files with 271 additions and 219 deletions

View File

@ -75,7 +75,7 @@ export async function crawlController(req: Request, res: Response) {
await checkTeamCredits(chunk, team_id, limitCheck);
if (!creditsCheckSuccess) {
return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at hello@firecrawl.com" });
return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" });
}
// TODO: need to do this to v1

View File

@ -209,7 +209,7 @@ export async function scrapeController(req: Request, res: Response) {
earlyReturn = true;
return res.status(500).json({
error:
"Error checking team credits. Please contact hello@firecrawl.com for help.",
"Error checking team credits. Please contact help@firecrawl.com for help.",
});
}

View File

@ -16,6 +16,7 @@ import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { getJobPriority } from "../../lib/job-priority";
import { addScrapeJobs } from "../../services/queue-jobs";
import { callWebhook } from "../../services/webhook";
export async function batchScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
@ -66,6 +67,7 @@ export async function batchScrapeController(
crawl_id: id,
sitemapped: true,
v1: true,
webhook: req.body.webhook,
},
opts: {
jobId: uuidv4(),
@ -85,6 +87,10 @@ export async function batchScrapeController(
);
await addScrapeJobs(jobs);
if(req.body.webhook) {
await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "batch_scrape.started");
}
const protocol = process.env.ENV === "local" ? req.protocol : "https";
return res.status(200).json({

View File

@ -175,7 +175,7 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut
logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
return close(ws, 1011, {
type: "error",
error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id
});
}
}

View File

@ -1,11 +1,6 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
MapDocument,
mapRequestSchema,
RequestWithAuth,
scrapeOptions,
} from "./types";
import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
import { configDotenv } from "dotenv";
@ -65,11 +60,13 @@ export async function getMapResults({
}): Promise<MapResult> {
const id = uuidv4();
let links: string[] = [url];
let mapResults: MapDocument[] = [];
const sc: StoredCrawl = {
originUrl: url,
crawlerOptions: {
...crawlerOptions,
limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
scrapeOptions: undefined,
},
scrapeOptions: scrapeOptions.parse({}),
@ -81,106 +78,131 @@ export async function getMapResults({
const crawler = crawlToCrawler(id, sc);
let urlWithoutWww = url.replace("www.", "");
// If sitemapOnly is true, only get links from sitemap
if (crawlerOptions.sitemapOnly) {
if (includeMetadata) {
throw new Error("includeMetadata is not supported with sitemapOnly");
}
let mapUrl = search && allowExternalLinks
? `${search} ${urlWithoutWww}`
: search ? `${search} site:${urlWithoutWww}`
: `site:${url}`;
const resultsPerPage = 100;
const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
const cacheKey = `fireEngineMap:${mapUrl}`;
const cachedResult = null;
let allResults: any[] = [];
let pagePromises: Promise<any>[] = [];
if (cachedResult) {
allResults = JSON.parse(cachedResult);
} else {
const fetchPage = async (page: number) => {
return fireEngineMap(mapUrl, {
numResults: resultsPerPage,
page: page,
const sitemap = await crawler.tryGetSitemap(true, true);
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
});
};
links = links.slice(1)
.map((x) => {
try {
return checkAndUpdateURLForMap(x).url.trim();
} catch (_) {
return null;
}
})
.filter((x) => x !== null) as string[];
// links = links.slice(1, limit); // don't slice, unnecessary
}
} else {
let urlWithoutWww = url.replace("www.", "");
pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
allResults = await Promise.all(pagePromises);
let mapUrl = search && allowExternalLinks
? `${search} ${urlWithoutWww}`
: search ? `${search} site:${urlWithoutWww}`
: `site:${url}`;
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
}
const resultsPerPage = 100;
const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
console.log("allResults", allResults);
// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
ignoreSitemap ? null : crawler.tryGetSitemap(),
...(cachedResult ? [] : pagePromises),
]);
const cacheKey = `fireEngineMap:${mapUrl}`;
const cachedResult = null;
if (!cachedResult) {
allResults = searchResults;
}
let allResults: any[] = [];
let pagePromises: Promise<any>[] = [];
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
});
}
let mapResults : MapDocument[] = allResults
.flat()
.filter((result) => result !== null && result !== undefined);
const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit);
if (mapResults.length > minumumCutoff) {
mapResults = mapResults.slice(0, minumumCutoff);
}
if (mapResults.length > 0) {
if (search) {
// Ensure all map results are first, maintaining their order
links = [
mapResults[0].url,
...mapResults.slice(1).map((x) => x.url),
...links,
];
if (cachedResult) {
allResults = JSON.parse(cachedResult);
} else {
mapResults.map((x) => {
const fetchPage = async (page: number) => {
return fireEngineMap(mapUrl, {
numResults: resultsPerPage,
page: page,
});
};
pagePromises = Array.from({ length: maxPages }, (_, i) =>
fetchPage(i + 1)
);
allResults = await Promise.all(pagePromises);
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
}
// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
ignoreSitemap ? null : crawler.tryGetSitemap(),
...(cachedResult ? [] : pagePromises),
]);
if (!cachedResult) {
allResults = searchResults;
}
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
});
}
}
// Perform cosine similarity between the search query and the list of links
if (search) {
const searchQuery = search.toLowerCase();
links = performCosineSimilarity(links, searchQuery);
}
mapResults = allResults
.flat()
.filter((result) => result !== null && result !== undefined);
links = links
.map((x) => {
try {
return checkAndUpdateURLForMap(x).url.trim();
} catch (_) {
return null;
const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit);
if (mapResults.length > minumumCutoff) {
mapResults = mapResults.slice(0, minumumCutoff);
}
if (mapResults.length > 0) {
if (search) {
// Ensure all map results are first, maintaining their order
links = [
mapResults[0].url,
...mapResults.slice(1).map((x) => x.url),
...links,
];
} else {
mapResults.map((x) => {
links.push(x.url);
});
}
})
.filter((x) => x !== null) as string[];
}
// allows for subdomains to be included
links = links.filter((x) => isSameDomain(x, url));
// Perform cosine similarity between the search query and the list of links
if (search) {
const searchQuery = search.toLowerCase();
links = performCosineSimilarity(links, searchQuery);
}
// if includeSubdomains is false, filter out subdomains
if (!includeSubdomains) {
links = links.filter((x) => isSameSubdomain(x, url));
links = links
.map((x) => {
try {
return checkAndUpdateURLForMap(x).url.trim();
} catch (_) {
return null;
}
})
.filter((x) => x !== null) as string[];
// allows for subdomains to be included
links = links.filter((x) => isSameDomain(x, url));
// if includeSubdomains is false, filter out subdomains
if (!includeSubdomains) {
links = links.filter((x) => isSameSubdomain(x, url));
}
// remove duplicates that could be due to http/https or www
links = removeDuplicateUrls(links);
}
// remove duplicates that could be due to http/https or www
links = removeDuplicateUrls(links);
const linksToReturn = links.slice(0, limit);
return {
@ -241,52 +263,4 @@ export async function mapController(
};
return res.status(200).json(response);
}
// Subdomain sitemap url checking
// // For each result, check for subdomains, get their sitemaps and add them to the links
// const processedUrls = new Set();
// const processedSubdomains = new Set();
// for (const result of links) {
// let url;
// let hostParts;
// try {
// url = new URL(result);
// hostParts = url.hostname.split('.');
// } catch (e) {
// continue;
// }
// console.log("hostParts", hostParts);
// // Check if it's a subdomain (more than 2 parts, and not 'www')
// if (hostParts.length > 2 && hostParts[0] !== 'www') {
// const subdomain = hostParts[0];
// console.log("subdomain", subdomain);
// const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
// console.log("subdomainUrl", subdomainUrl);
// if (!processedSubdomains.has(subdomainUrl)) {
// processedSubdomains.add(subdomainUrl);
// const subdomainCrawl = crawlToCrawler(id, {
// originUrl: subdomainUrl,
// crawlerOptions: legacyCrawlerOptions(req.body),
// pageOptions: {},
// team_id: req.auth.team_id,
// createdAt: Date.now(),
// plan: req.auth.plan,
// });
// const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
// if (subdomainSitemap) {
// subdomainSitemap.forEach((x) => {
// if (!processedUrls.has(x.url)) {
// processedUrls.add(x.url);
// links.push(x.url);
// }
// });
// }
// }
// }
// }
}

View File

@ -11,8 +11,12 @@ export async function scrapeStatusController(req: any, res: any) {
await rateLimiter.consume(iptoken);
const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
const allowedTeams = [
"41bdbfe1-0579-4d9b-b6d5-809f16be12f5",
"511544f2-2fce-4183-9c59-6c29b02c69b5"
];
if(job?.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
if(!allowedTeams.includes(job?.team_id)){
return res.status(403).json({
success: false,
error: "You are not allowed to access this resource.",

View File

@ -119,7 +119,7 @@ export const scrapeOptions = z.object({
includeTags: z.string().array().optional(),
excludeTags: z.string().array().optional(),
onlyMainContent: z.boolean().default(true),
timeout: z.number().int().positive().finite().safe().default(30000),
timeout: z.number().int().positive().finite().safe().optional(),
waitFor: z.number().int().nonnegative().finite().safe().default(0),
extract: extractOptions.optional(),
mobile: z.boolean().default(false),
@ -170,9 +170,10 @@ export type ExtractV1Options = z.infer<typeof extractV1Options>;
export const extractRequestSchema = extractV1Options;
export type ExtractRequest = z.infer<typeof extractRequestSchema>;
export const scrapeRequestSchema = scrapeOptions.extend({
export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({
url,
origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(30000),
}).strict(strictMessage).refine(
(obj) => {
const hasExtractFormat = obj.formats?.includes("extract");
@ -194,9 +195,21 @@ export const scrapeRequestSchema = scrapeOptions.extend({
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
export const webhookSchema = z.preprocess(x => {
if (typeof x === "string") {
return { url: x };
} else {
return x;
}
}, z.object({
url: z.string().url(),
headers: z.record(z.string(), z.string()).default({}),
}).strict(strictMessage))
export const batchScrapeRequestSchema = scrapeOptions.extend({
urls: url.array(),
origin: z.string().optional().default("api"),
webhook: webhookSchema.optional(),
}).strict(strictMessage).refine(
(obj) => {
const hasExtractFormat = obj.formats?.includes("extract");
@ -206,12 +219,7 @@ export const batchScrapeRequestSchema = scrapeOptions.extend({
{
message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
}
).transform((obj) => {
if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
return { ...obj, timeout: 60000 };
}
return obj;
});
);
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
@ -239,21 +247,10 @@ const crawlerOptions = z.object({
export type CrawlerOptions = z.infer<typeof crawlerOptions>;
export const webhookSchema = z.preprocess(x => {
if (typeof x === "string") {
return { url: x };
} else {
return x;
}
}, z.object({
url: z.string().url(),
headers: z.record(z.string(), z.string()).default({}),
}).strict(strictMessage))
export const crawlRequestSchema = crawlerOptions.extend({
url,
origin: z.string().optional().default("api"),
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
scrapeOptions: scrapeOptions.default({}),
webhook: webhookSchema.optional(),
limit: z.number().default(10000),
}).strict(strictMessage);
@ -279,6 +276,7 @@ export const mapRequestSchema = crawlerOptions.extend({
includeSubdomains: z.boolean().default(true),
search: z.string().optional(),
ignoreSitemap: z.boolean().default(false),
sitemapOnly: z.boolean().default(false),
limit: z.number().min(1).max(5000).default(5000),
}).strict(strictMessage);
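For illustration, a minimal sketch of how the relocated webhookSchema normalizes its input (zod usage as in the diff; the strictMessage text below is a stand-in for the real constant in types.ts):

```ts
import { z } from "zod";

// Stand-in for the strictMessage constant defined in types.ts.
const strictMessage = "Unrecognized key in request body";

const webhookSchema = z.preprocess(
  (x) => (typeof x === "string" ? { url: x } : x),
  z.object({
    url: z.string().url(),
    headers: z.record(z.string(), z.string()).default({}),
  }).strict(strictMessage),
);

// A bare URL string is normalized to the object form with empty headers.
console.log(webhookSchema.parse("https://example.com/firecrawl-hook"));
// => { url: "https://example.com/firecrawl-hook", headers: {} }

// The object form passes through, with custom headers preserved.
console.log(webhookSchema.parse({
  url: "https://example.com/firecrawl-hook",
  headers: { "X-Signature": "abc123" },
}));
```

Because both batchScrapeRequestSchema and crawlRequestSchema reference webhookSchema, either a plain webhook URL or an object with headers is accepted in request bodies.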

View File

@ -207,7 +207,7 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response
}
logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id });
});
logger.info(`Worker ${process.pid} started`);

View File

@ -52,7 +52,7 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {
export async function addCrawlJobDone(id: string, job_id: string) {
await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id);
await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX");
}

View File

@ -6,22 +6,28 @@ import * as Sentry from "@sentry/node";
import dotenv from 'dotenv';
import { logger } from './logger';
import { stat } from 'fs/promises';
dotenv.config();
// TODO: add a timeout to the Go parser
const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so');
class GoMarkdownConverter {
private static instance: GoMarkdownConverter;
private convert: any;
private constructor() {
const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so');
const lib = koffi.load(goExecutablePath);
this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']);
}
public static getInstance(): GoMarkdownConverter {
public static async getInstance(): Promise<GoMarkdownConverter> {
if (!GoMarkdownConverter.instance) {
try {
await stat(goExecutablePath);
} catch (_) {
throw Error("Go shared library not found");
}
GoMarkdownConverter.instance = new GoMarkdownConverter();
}
return GoMarkdownConverter.instance;
@ -47,7 +53,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
try {
if (process.env.USE_GO_MARKDOWN_PARSER == "true") {
const converter = GoMarkdownConverter.getInstance();
const converter = await GoMarkdownConverter.getInstance();
let markdownContent = await converter.convertHTMLToMarkdown(html);
markdownContent = processMultiLineLinks(markdownContent);
@ -56,8 +62,12 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
return markdownContent;
}
} catch (error) {
Sentry.captureException(error);
logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
if (!(error instanceof Error) || error.message !== "Go shared library not found") {
Sentry.captureException(error);
logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
} else {
logger.warn("Tried to use Go parser, but it doesn't exist in the file system.", { goExecutablePath });
}
}
// Fallback to TurndownService if Go parser fails or is not enabled
@ -89,7 +99,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
return markdownContent;
} catch (error) {
console.error("Error converting HTML to Markdown: ", error);
logger.error("Error converting HTML to Markdown", {error});
return ""; // Optionally return an empty string or handle the error as needed
}
}

View File

@ -65,7 +65,12 @@ export class WebCrawler {
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
}
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
// If the initial URL is a sitemap.xml, skip filtering
if (this.initialUrl.endsWith('sitemap.xml') && fromMap) {
return sitemapLinks.slice(0, limit);
}
return sitemapLinks
.filter((link) => {
let url: URL;
@ -159,11 +164,14 @@ export class WebCrawler {
this.robots = robotsParser(this.robotsTxtUrl, txt);
}
public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> {
logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if(fromMap && onlySitemap) {
return sitemapLinks.map(link => ({ url: link, html: "" }));
}
if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth, fromMap);
return filteredLinks.map(link => ({ url: link, html: "" }));
}
return null;
@ -353,7 +361,8 @@ export class WebCrawler {
return url;
};
const sitemapUrl = url.endsWith("/sitemap.xml")
const sitemapUrl = url.endsWith(".xml")
? url
: `${url}/sitemap.xml`;

View File

@ -24,7 +24,7 @@ export async function getLinksFromSitemap(
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
} else if (mode === 'fire-engine') {
const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;playwright" });;
const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true });
if (!response.success) {
throw response.error;
}

View File

@ -87,6 +87,7 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<Engi
priority: meta.internalOptions.priority,
geolocation: meta.options.geolocation,
mobile: meta.options.mobile,
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
// TODO: scrollXPaths
};
@ -95,7 +96,9 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<Engi
let response = await performFireEngineScrape(
meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
request,
defaultTimeout + totalWait,
meta.options.timeout !== undefined
? defaultTimeout + totalWait
: Infinity, // TODO: better timeout handling
);
specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" }), response.responseHeaders);
@ -140,12 +143,16 @@ export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<Eng
fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
wait: meta.options.waitFor,
geolocation: meta.options.geolocation,
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
};
let response = await performFireEngineScrape(
meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
request,
defaultTimeout + meta.options.waitFor
meta.options.timeout !== undefined
? defaultTimeout + meta.options.waitFor
: Infinity, // TODO: better timeout handling
);
specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" }), response.responseHeaders);
@ -179,11 +186,16 @@ export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<Engi
atsv: meta.internalOptions.atsv,
geolocation: meta.options.geolocation,
disableJsDom: meta.internalOptions.v0DisableJsDom,
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
};
let response = await performFireEngineScrape(
meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
request,
meta.options.timeout !== undefined
? defaultTimeout
: Infinity, // TODO: better timeout handling
);
specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" }), response.responseHeaders);

View File

@ -25,6 +25,8 @@ export type FireEngineScrapeRequestCommon = {
logRequest?: boolean; // default: true
instantReturn?: boolean; // default: false
geolocation?: { country?: string; languages?: string[]; };
timeout?: number;
}
export type FireEngineScrapeRequestChromeCDP = {

View File

@ -13,12 +13,12 @@ export async function scrapeURLWithPlaywright(meta: Meta): Promise<EngineScrapeR
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({
body: {
url: meta.url,
wait_after_load: meta.options.waitFor,
timeout,
headers: meta.options.headers,
}),
},
method: "POST",
logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
schema: z.object({

View File

@ -18,7 +18,7 @@ export class NoEnginesLeftError extends Error {
public results: EngineResultsTracker;
constructor(fallbackList: Engine[], results: EngineResultsTracker) {
super("All scraping engines failed!");
super("All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com.");
this.fallbackList = fallbackList;
this.results = results;
}

View File

@ -5,7 +5,6 @@ import { logger } from "../lib/logger";
dotenv.config();
export async function fireEngineMap(
q: string,
options: {
@ -40,14 +39,13 @@ export async function fireEngineMap(
method: "POST",
headers: {
"Content-Type": "application/json",
"X-Disable-Cache": "true"
"X-Disable-Cache": "true",
},
body: data
body: data,
});
if (response.ok) {
const responseData = await response.json();
console.log("response", responseData);
return responseData;
} else {
return [];

View File

@ -7,7 +7,7 @@ import { logger } from "../../lib/logger";
import { configDotenv } from "dotenv";
configDotenv();
export async function logJob(job: FirecrawlJob) {
export async function logJob(job: FirecrawlJob, force: boolean = false) {
try {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (!useDbAuthentication) {
@ -23,28 +23,52 @@ export async function logJob(job: FirecrawlJob) {
job.scrapeOptions.headers["Authorization"] = "REDACTED";
job.docs = [{ content: "REDACTED DUE TO AUTHORIZATION HEADER", html: "REDACTED DUE TO AUTHORIZATION HEADER" }];
}
const jobColumn = {
job_id: job.job_id ? job.job_id : null,
success: job.success,
message: job.message,
num_docs: job.num_docs,
docs: job.docs,
time_taken: job.time_taken,
team_id: job.team_id === "preview" ? null : job.team_id,
mode: job.mode,
url: job.url,
crawler_options: job.crawlerOptions,
page_options: job.scrapeOptions,
origin: job.origin,
num_tokens: job.num_tokens,
retry: !!job.retry,
crawl_id: job.crawl_id,
};
const { data, error } = await supabase_service
.from("firecrawl_jobs")
.insert([
{
job_id: job.job_id ? job.job_id : null,
success: job.success,
message: job.message,
num_docs: job.num_docs,
docs: job.docs,
time_taken: job.time_taken,
team_id: job.team_id === "preview" ? null : job.team_id,
mode: job.mode,
url: job.url,
crawler_options: job.crawlerOptions,
page_options: job.scrapeOptions,
origin: job.origin,
num_tokens: job.num_tokens,
retry: !!job.retry,
crawl_id: job.crawl_id,
},
]);
if (force) {
while (true) {
try {
const { error } = await supabase_service
.from("firecrawl_jobs")
.insert([jobColumn]);
if (error) {
logger.error("Failed to log job due to Supabase error -- trying again", { error, scrapeId: job.job_id });
await new Promise<void>((resolve) => setTimeout(() => resolve(), 75));
} else {
break;
}
} catch (error) {
logger.error("Failed to log job due to thrown error -- trying again", { error, scrapeId: job.job_id });
await new Promise<void>((resolve) => setTimeout(() => resolve(), 75));
}
}
logger.debug("Job logged successfully!", { scrapeId: job.job_id });
} else {
const { error } = await supabase_service
.from("firecrawl_jobs")
.insert([jobColumn]);
if (error) {
logger.error(`Error logging job: ${error.message}`, { error, scrapeId: job.job_id });
} else {
logger.debug("Job logged successfully!", { scrapeId: job.job_id });
}
}
if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
let phLog = {
@ -72,9 +96,7 @@ export async function logJob(job: FirecrawlJob) {
posthog.capture(phLog);
}
}
if (error) {
logger.error(`Error logging job: ${error.message}`);
}
} catch (error) {
logger.error(`Error logging job: ${error.message}`);
}

View File

@ -23,7 +23,7 @@ const emailTemplates: Record<
},
[NotificationType.RATE_LIMIT_REACHED]: {
subject: "Rate Limit Reached - Firecrawl",
html: "Hey there,<br/><p>You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our <a href='https://firecrawl.dev/pricing'>pricing page</a> for more info.</p><p>If you have any questions, feel free to reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
html: "Hey there,<br/><p>You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our <a href='https://firecrawl.dev/pricing'>pricing page</a> for more info.</p><p>If you have any questions, feel free to reach out to us at <a href='mailto:help@firecrawl.com'>help@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
},
[NotificationType.AUTO_RECHARGE_SUCCESS]: {
subject: "Auto recharge successful - Firecrawl",
@ -31,7 +31,7 @@ const emailTemplates: Record<
},
[NotificationType.AUTO_RECHARGE_FAILED]: {
subject: "Auto recharge failed - Firecrawl",
html: "Hey there,<br/><p>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/>",
html: "Hey there,<br/><p>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at <a href='mailto:help@firecrawl.com'>help@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/>",
},
};
@ -63,7 +63,7 @@ export async function sendEmailNotification(
const { data, error } = await resend.emails.send({
from: "Firecrawl <firecrawl@getmendableai.com>",
to: [email],
reply_to: "hello@firecrawl.com",
reply_to: "help@firecrawl.com",
subject: emailTemplates[notificationType].subject,
html: emailTemplates[notificationType].html,
});

View File

@ -262,7 +262,7 @@ async function processJob(job: Job & { id: string }, token: string) {
document: null,
project_id: job.data.project_id,
error:
"URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
"URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.",
};
return data;
}
@ -346,7 +346,7 @@ async function processJob(job: Job & { id: string }, token: string) {
scrapeOptions: job.data.scrapeOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
});
}, true);
await addCrawlJobDone(job.data.crawl_id, job.id);
@ -486,7 +486,7 @@ async function processJob(job: Job & { id: string }, token: string) {
url: sc?.originUrl ?? (job.data.crawlerOptions === null ? "Batch Scrape" : "Unknown"),
crawlerOptions: sc.crawlerOptions,
origin: job.data.origin,
});
}, true);
}
}
}
@ -566,7 +566,7 @@ async function processJob(job: Job & { id: string }, token: string) {
scrapeOptions: job.data.scrapeOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
});
}, true);
// await logJob({
// job_id: job.data.crawl_id,

View File

@ -46,6 +46,8 @@ export const callWebhook = async (
webhookUrl = webhooksData[0].url;
}
logger.debug("Calling webhook...", { webhookUrl, teamId, specified, v1, eventType, awaitWebhook });
if (!webhookUrl) {
return null;
}
@ -128,7 +130,6 @@ export const callWebhook = async (
"Content-Type": "application/json",
...webhookUrl.headers,
},
timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1)
}
)
.catch((error) => {

View File

@ -175,4 +175,4 @@ export type PlanType =
| "";
export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";
export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "batch_scrape.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";

View File

@ -221,6 +221,7 @@ export interface MapParams {
search?: string;
ignoreSitemap?: boolean;
includeSubdomains?: boolean;
sitemapOnly?: boolean;
limit?: number;
}
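A usage sketch for the new sitemapOnly flag (the mapUrl method name and the API key here are assumptions for illustration; the flag itself comes from this diff):

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

// Placeholder API key; mapUrl is assumed to be the SDK's map entry point.
const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

async function mapFromSitemapOnly() {
  const result = await app.mapUrl("https://example.com", {
    sitemapOnly: true, // only return links discovered via the sitemap
    limit: 1000,
  });
  console.log(result);
}

mapFromSitemapOnly();
```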
@ -563,16 +564,18 @@ export default class FirecrawlApp {
* @param params - Additional parameters for the scrape request.
* @param pollInterval - Time in seconds for job status checks.
* @param idempotencyKey - Optional idempotency key for the request.
* @param webhook - Optional webhook for the batch scrape.
* @returns The response from the crawl operation.
*/
async batchScrapeUrls(
urls: string[],
params?: ScrapeParams,
pollInterval: number = 2,
idempotencyKey?: string
idempotencyKey?: string,
webhook?: CrawlParams["webhook"],
): Promise<BatchScrapeStatusResponse | ErrorResponse> {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { urls, ...(params ?? {}) };
let jsonData: any = { urls, ...(params ?? {}), webhook };
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/batch/scrape`,

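A hedged usage sketch of the extended batchScrapeUrls signature (positional arguments per this diff; the API key, URLs, and webhook endpoint are placeholders):

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

async function runBatchScrape() {
  const result = await app.batchScrapeUrls(
    ["https://example.com/a", "https://example.com/b"],
    { formats: ["markdown"] },             // ScrapeParams
    2,                                     // pollInterval in seconds
    undefined,                             // idempotencyKey
    "https://example.com/hooks/firecrawl", // webhook (string shorthand; the server-side webhookSchema also accepts { url, headers })
  );
  console.log(result);
}

runBatchScrape();
```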
View File

@ -0,0 +1,6 @@
# AGI News ✨
AGI News is a daily AI newsletter sourced entirely by autonomous AI agents. It is live at [https://www.aginews.io/](https://www.aginews.io/)
Here is a link to the repo:
[https://github.com/ericciarla/aginews](https://github.com/ericciarla/aginews)

View File

@ -0,0 +1,7 @@
# Generate AI podcasts based on real-time news 🎙️
This example crawls the web for interesting news stories and then records a podcast in your own voice.
Here is a link to the repo:
[https://github.com/ericciarla/aginews-podcast](https://github.com/ericciarla/aginews-podcast)
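For a rough picture of the news-gathering step, here is a minimal sketch assuming the Firecrawl JS SDK (the source site, API key, and limit are placeholders; the podcast and voice-cloning steps are out of scope here):

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

async function gatherStories() {
  // Crawl a placeholder news site and collect each page as markdown.
  const crawl = await app.crawlUrl("https://example-news-site.com", {
    limit: 10,
    scrapeOptions: { formats: ["markdown"] },
  });

  if ("data" in crawl) {
    for (const doc of crawl.data) {
      // Each document's markdown could then be summarized into a podcast script.
      console.log(doc.metadata?.title, doc.markdown?.slice(0, 200));
    }
  }
}

gatherStories();
```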