diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index fa7627da..5de5eccf 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -75,7 +75,7 @@ export async function crawlController(req: Request, res: Response) { await checkTeamCredits(chunk, team_id, limitCheck); if (!creditsCheckSuccess) { - return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at hello@firecrawl.com" }); + return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" }); } // TODO: need to do this to v1 diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 9bc41fc1..02b9400e 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -209,7 +209,7 @@ export async function scrapeController(req: Request, res: Response) { earlyReturn = true; return res.status(500).json({ error: - "Error checking team credits. Please contact hello@firecrawl.com for help.", + "Error checking team credits. Please contact help@firecrawl.com for help.", }); } diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index b018dc99..3830b1fe 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -16,6 +16,7 @@ import { logCrawl } from "../../services/logging/crawl_log"; import { getScrapeQueue } from "../../services/queue-service"; import { getJobPriority } from "../../lib/job-priority"; import { addScrapeJobs } from "../../services/queue-jobs"; +import { callWebhook } from "../../services/webhook"; export async function batchScrapeController( req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>, @@ -66,6 +67,7 @@ export async function batchScrapeController( crawl_id: id, sitemapped: true, v1: true, + webhook: req.body.webhook, }, opts: { jobId: uuidv4(), @@ -85,6 +87,10 @@ export async function batchScrapeController( ); await addScrapeJobs(jobs); + if(req.body.webhook) { + await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "batch_scrape.started"); + } + const protocol = process.env.ENV === "local" ? req.protocol : "https"; return res.status(200).json({ diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts index 18222edc..f552492f 100644 --- a/apps/api/src/controllers/v1/crawl-status-ws.ts +++ b/apps/api/src/controllers/v1/crawl-status-ws.ts @@ -175,7 +175,7 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose); return close(ws, 1011, { type: "error", - error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id + error: "An unexpected error occurred. Please contact help@firecrawl.com for help. 
Your exception ID is " + id }); } } diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index ba7be01f..8ab5c135 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -1,11 +1,6 @@ import { Response } from "express"; import { v4 as uuidv4 } from "uuid"; -import { - MapDocument, - mapRequestSchema, - RequestWithAuth, - scrapeOptions, -} from "./types"; +import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types"; import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis"; import { MapResponse, MapRequest } from "./types"; import { configDotenv } from "dotenv"; @@ -65,11 +60,13 @@ export async function getMapResults({ }): Promise { const id = uuidv4(); let links: string[] = [url]; + let mapResults: MapDocument[] = []; const sc: StoredCrawl = { originUrl: url, crawlerOptions: { ...crawlerOptions, + limit: crawlerOptions.sitemapOnly ? 10000000 : limit, scrapeOptions: undefined, }, scrapeOptions: scrapeOptions.parse({}), @@ -81,106 +78,131 @@ export async function getMapResults({ const crawler = crawlToCrawler(id, sc); - let urlWithoutWww = url.replace("www.", ""); + // If sitemapOnly is true, only get links from sitemap + if (crawlerOptions.sitemapOnly) { + if (includeMetadata) { + throw new Error("includeMetadata is not supported with sitemapOnly"); + } - let mapUrl = search && allowExternalLinks - ? `${search} ${urlWithoutWww}` - : search ? `${search} site:${urlWithoutWww}` - : `site:${url}`; - - const resultsPerPage = 100; - const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage); - - const cacheKey = `fireEngineMap:${mapUrl}`; - const cachedResult = null; - - let allResults: any[] = []; - let pagePromises: Promise[] = []; - - if (cachedResult) { - allResults = JSON.parse(cachedResult); - } else { - const fetchPage = async (page: number) => { - return fireEngineMap(mapUrl, { - numResults: resultsPerPage, - page: page, + const sitemap = await crawler.tryGetSitemap(true, true); + if (sitemap !== null) { + sitemap.forEach((x) => { + links.push(x.url); }); - }; + links = links.slice(1) + .map((x) => { + try { + return checkAndUpdateURLForMap(x).url.trim(); + } catch (_) { + return null; + } + }) + .filter((x) => x !== null) as string[]; + // links = links.slice(1, limit); // don't slice, unnecessary + } + } else { + let urlWithoutWww = url.replace("www.", ""); - pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1)); - allResults = await Promise.all(pagePromises); + let mapUrl = search && allowExternalLinks + ? `${search} ${urlWithoutWww}` + : search ? `${search} site:${urlWithoutWww}` + : `site:${url}`; - await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours - } + const resultsPerPage = 100; + const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage); - console.log("allResults", allResults); - // Parallelize sitemap fetch with serper search - const [sitemap, ...searchResults] = await Promise.all([ - ignoreSitemap ? null : crawler.tryGetSitemap(), - ...(cachedResult ? 
[] : pagePromises), - ]); + const cacheKey = `fireEngineMap:${mapUrl}`; + const cachedResult = null; - if (!cachedResult) { - allResults = searchResults; - } + let allResults: any[] = []; + let pagePromises: Promise[] = []; - if (sitemap !== null) { - sitemap.forEach((x) => { - links.push(x.url); - }); - } - - let mapResults : MapDocument[] = allResults - .flat() - .filter((result) => result !== null && result !== undefined); - - const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit); - if (mapResults.length > minumumCutoff) { - mapResults = mapResults.slice(0, minumumCutoff); - } - - if (mapResults.length > 0) { - if (search) { - // Ensure all map results are first, maintaining their order - links = [ - mapResults[0].url, - ...mapResults.slice(1).map((x) => x.url), - ...links, - ]; + if (cachedResult) { + allResults = JSON.parse(cachedResult); } else { - mapResults.map((x) => { + const fetchPage = async (page: number) => { + return fireEngineMap(mapUrl, { + numResults: resultsPerPage, + page: page, + }); + }; + + pagePromises = Array.from({ length: maxPages }, (_, i) => + fetchPage(i + 1) + ); + allResults = await Promise.all(pagePromises); + + await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours + } + + // Parallelize sitemap fetch with serper search + const [sitemap, ...searchResults] = await Promise.all([ + ignoreSitemap ? null : crawler.tryGetSitemap(), + ...(cachedResult ? [] : pagePromises), + ]); + + if (!cachedResult) { + allResults = searchResults; + } + + if (sitemap !== null) { + sitemap.forEach((x) => { links.push(x.url); }); } - } - // Perform cosine similarity between the search query and the list of links - if (search) { - const searchQuery = search.toLowerCase(); - links = performCosineSimilarity(links, searchQuery); - } + mapResults = allResults + .flat() + .filter((result) => result !== null && result !== undefined); - links = links - .map((x) => { - try { - return checkAndUpdateURLForMap(x).url.trim(); - } catch (_) { - return null; + const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit); + if (mapResults.length > minumumCutoff) { + mapResults = mapResults.slice(0, minumumCutoff); + } + + if (mapResults.length > 0) { + if (search) { + // Ensure all map results are first, maintaining their order + links = [ + mapResults[0].url, + ...mapResults.slice(1).map((x) => x.url), + ...links, + ]; + } else { + mapResults.map((x) => { + links.push(x.url); + }); } - }) - .filter((x) => x !== null) as string[]; + } - // allows for subdomains to be included - links = links.filter((x) => isSameDomain(x, url)); + // Perform cosine similarity between the search query and the list of links + if (search) { + const searchQuery = search.toLowerCase(); + links = performCosineSimilarity(links, searchQuery); + } - // if includeSubdomains is false, filter out subdomains - if (!includeSubdomains) { - links = links.filter((x) => isSameSubdomain(x, url)); + links = links + .map((x) => { + try { + return checkAndUpdateURLForMap(x).url.trim(); + } catch (_) { + return null; + } + }) + .filter((x) => x !== null) as string[]; + + // allows for subdomains to be included + links = links.filter((x) => isSameDomain(x, url)); + + // if includeSubdomains is false, filter out subdomains + if (!includeSubdomains) { + links = links.filter((x) => isSameSubdomain(x, url)); + } + + // remove duplicates that could be due to http/https or www + links = removeDuplicateUrls(links); } - // remove duplicates that could be due to http/https or www - links = 
removeDuplicateUrls(links); - const linksToReturn = links.slice(0, limit); return { @@ -241,52 +263,4 @@ export async function mapController( }; return res.status(200).json(response); -} - -// Subdomain sitemap url checking - -// // For each result, check for subdomains, get their sitemaps and add them to the links -// const processedUrls = new Set(); -// const processedSubdomains = new Set(); - -// for (const result of links) { -// let url; -// let hostParts; -// try { -// url = new URL(result); -// hostParts = url.hostname.split('.'); -// } catch (e) { -// continue; -// } - -// console.log("hostParts", hostParts); -// // Check if it's a subdomain (more than 2 parts, and not 'www') -// if (hostParts.length > 2 && hostParts[0] !== 'www') { -// const subdomain = hostParts[0]; -// console.log("subdomain", subdomain); -// const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`; -// console.log("subdomainUrl", subdomainUrl); - -// if (!processedSubdomains.has(subdomainUrl)) { -// processedSubdomains.add(subdomainUrl); - -// const subdomainCrawl = crawlToCrawler(id, { -// originUrl: subdomainUrl, -// crawlerOptions: legacyCrawlerOptions(req.body), -// pageOptions: {}, -// team_id: req.auth.team_id, -// createdAt: Date.now(), -// plan: req.auth.plan, -// }); -// const subdomainSitemap = await subdomainCrawl.tryGetSitemap(); -// if (subdomainSitemap) { -// subdomainSitemap.forEach((x) => { -// if (!processedUrls.has(x.url)) { -// processedUrls.add(x.url); -// links.push(x.url); -// } -// }); -// } -// } -// } -// } \ No newline at end of file +} \ No newline at end of file diff --git a/apps/api/src/controllers/v1/scrape-status.ts b/apps/api/src/controllers/v1/scrape-status.ts index db50f7d3..b7f19a3b 100644 --- a/apps/api/src/controllers/v1/scrape-status.ts +++ b/apps/api/src/controllers/v1/scrape-status.ts @@ -11,8 +11,12 @@ export async function scrapeStatusController(req: any, res: any) { await rateLimiter.consume(iptoken); const job = await supabaseGetJobByIdOnlyData(req.params.jobId); + const allowedTeams = [ + "41bdbfe1-0579-4d9b-b6d5-809f16be12f5", + "511544f2-2fce-4183-9c59-6c29b02c69b5" + ]; - if(job?.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){ + if(!allowedTeams.includes(job?.team_id)){ return res.status(403).json({ success: false, error: "You are not allowed to access this resource.", diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 96dea785..377c47f1 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -119,7 +119,7 @@ export const scrapeOptions = z.object({ includeTags: z.string().array().optional(), excludeTags: z.string().array().optional(), onlyMainContent: z.boolean().default(true), - timeout: z.number().int().positive().finite().safe().default(30000), + timeout: z.number().int().positive().finite().safe().optional(), waitFor: z.number().int().nonnegative().finite().safe().default(0), extract: extractOptions.optional(), mobile: z.boolean().default(false), @@ -170,9 +170,10 @@ export type ExtractV1Options = z.infer; export const extractRequestSchema = extractV1Options; export type ExtractRequest = z.infer; -export const scrapeRequestSchema = scrapeOptions.extend({ +export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({ url, origin: z.string().optional().default("api"), + timeout: z.number().int().positive().finite().safe().default(30000), }).strict(strictMessage).refine( (obj) => { const hasExtractFormat = 
obj.formats?.includes("extract"); @@ -194,9 +195,21 @@ export const scrapeRequestSchema = scrapeOptions.extend({ export type ScrapeRequest = z.infer; export type ScrapeRequestInput = z.input; +export const webhookSchema = z.preprocess(x => { + if (typeof x === "string") { + return { url: x }; + } else { + return x; + } +}, z.object({ + url: z.string().url(), + headers: z.record(z.string(), z.string()).default({}), +}).strict(strictMessage)) + export const batchScrapeRequestSchema = scrapeOptions.extend({ urls: url.array(), origin: z.string().optional().default("api"), + webhook: webhookSchema.optional(), }).strict(strictMessage).refine( (obj) => { const hasExtractFormat = obj.formats?.includes("extract"); @@ -206,12 +219,7 @@ export const batchScrapeRequestSchema = scrapeOptions.extend({ { message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa", } -).transform((obj) => { - if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) { - return { ...obj, timeout: 60000 }; - } - return obj; -}); +); export type BatchScrapeRequest = z.infer; @@ -239,21 +247,10 @@ const crawlerOptions = z.object({ export type CrawlerOptions = z.infer; -export const webhookSchema = z.preprocess(x => { - if (typeof x === "string") { - return { url: x }; - } else { - return x; - } -}, z.object({ - url: z.string().url(), - headers: z.record(z.string(), z.string()).default({}), -}).strict(strictMessage)) - export const crawlRequestSchema = crawlerOptions.extend({ url, origin: z.string().optional().default("api"), - scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}), + scrapeOptions: scrapeOptions.default({}), webhook: webhookSchema.optional(), limit: z.number().default(10000), }).strict(strictMessage); @@ -279,6 +276,7 @@ export const mapRequestSchema = crawlerOptions.extend({ includeSubdomains: z.boolean().default(true), search: z.string().optional(), ignoreSitemap: z.boolean().default(false), + sitemapOnly: z.boolean().default(false), limit: z.number().min(1).max(5000).default(5000), }).strict(strictMessage); diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 049b37a9..e32bf97f 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -207,7 +207,7 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response } logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); - res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id }); + res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact help@firecrawl.com for help. 
Your exception ID is " + id }); }); logger.info(`Worker ${process.pid} started`); diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 2b255971..9bce160b 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -52,7 +52,7 @@ export async function addCrawlJobs(id: string, job_ids: string[]) { export async function addCrawlJobDone(id: string, job_id: string) { await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id); - await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id); + await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id); await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX"); await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX"); } diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 8800d916..92bcd4cd 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -6,22 +6,28 @@ import * as Sentry from "@sentry/node"; import dotenv from 'dotenv'; import { logger } from './logger'; +import { stat } from 'fs/promises'; dotenv.config(); // TODO: add a timeout to the Go parser +const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so'); class GoMarkdownConverter { private static instance: GoMarkdownConverter; private convert: any; private constructor() { - const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so'); const lib = koffi.load(goExecutablePath); this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']); } - public static getInstance(): GoMarkdownConverter { + public static async getInstance(): Promise { if (!GoMarkdownConverter.instance) { + try { + await stat(goExecutablePath); + } catch (_) { + throw Error("Go shared library not found"); + } GoMarkdownConverter.instance = new GoMarkdownConverter(); } return GoMarkdownConverter.instance; @@ -47,7 +53,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise { let url: URL; @@ -159,11 +164,14 @@ export class WebCrawler { this.robots = robotsParser(this.robotsTxtUrl, txt); } - public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> { + public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> { logger.debug(`Fetching sitemap links from ${this.initialUrl}`); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); + if(fromMap && onlySitemap) { + return sitemapLinks.map(link => ({ url: link, html: "" })); + } if (sitemapLinks.length > 0) { - let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth); + let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth, fromMap); return filteredLinks.map(link => ({ url: link, html: "" })); } return null; @@ -353,7 +361,8 @@ export class WebCrawler { return url; }; - const sitemapUrl = url.endsWith("/sitemap.xml") + + const sitemapUrl = url.endsWith(".xml") ? 
url : `${url}/sitemap.xml`; diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 05b3d00d..51f90b18 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -24,7 +24,7 @@ export async function getLinksFromSitemap( const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); content = response.data; } else if (mode === 'fire-engine') { - const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;playwright" });; + const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }); if (!response.success) { throw response.error; } diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 0ea54382..17bb40f2 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -87,6 +87,7 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise((resolve) => setTimeout(() => resolve(), 75)); + } else { + break; + } + } catch (error) { + logger.error("Failed to log job due to thrown error -- trying again", { error, scrapeId: job.job_id }); + await new Promise((resolve) => setTimeout(() => resolve(), 75)); + } + } + logger.debug("Job logged successfully!", { scrapeId: job.job_id }); + } else { + const { error } = await supabase_service + .from("firecrawl_jobs") + .insert([jobColumn]); + if (error) { + logger.error(`Error logging job: ${error.message}`, { error, scrapeId: job.job_id }); + } else { + logger.debug("Job logged successfully!", { scrapeId: job.job_id }); + } + } if (process.env.POSTHOG_API_KEY && !job.crawl_id) { let phLog = { @@ -72,9 +96,7 @@ export async function logJob(job: FirecrawlJob) { posthog.capture(phLog); } } - if (error) { - logger.error(`Error logging job: ${error.message}`); - } + } catch (error) { logger.error(`Error logging job: ${error.message}`); } diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index e451e0c0..e4575b32 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -23,7 +23,7 @@ const emailTemplates: Record< }, [NotificationType.RATE_LIMIT_REACHED]: { subject: "Rate Limit Reached - Firecrawl", - html: "Hey there,

You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our pricing page for more info.<br/><br/>If you have any questions, feel free to reach out to us at hello@firecrawl.com<br/><br/><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
+    html: "Hey there,<br/><br/>You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our pricing page for more info.<br/><br/>If you have any questions, feel free to reach out to us at help@firecrawl.com<br/><br/><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
   },
   [NotificationType.AUTO_RECHARGE_SUCCESS]: {
     subject: "Auto recharge successful - Firecrawl",
@@ -31,7 +31,7 @@ const emailTemplates: Record<
   },
   [NotificationType.AUTO_RECHARGE_FAILED]: {
     subject: "Auto recharge failed - Firecrawl",
-    html: "Hey there,<br/><br/>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at hello@firecrawl.com<br/><br/><br/>Thanks,<br/>Firecrawl Team<br/>",
+    html: "Hey there,<br/><br/>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at help@firecrawl.com<br/><br/><br/>Thanks,<br/>Firecrawl Team<br/>
", }, }; @@ -63,7 +63,7 @@ export async function sendEmailNotification( const { data, error } = await resend.emails.send({ from: "Firecrawl ", to: [email], - reply_to: "hello@firecrawl.com", + reply_to: "help@firecrawl.com", subject: emailTemplates[notificationType].subject, html: emailTemplates[notificationType].html, }); diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 33b2ca9a..c25601ca 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -262,7 +262,7 @@ async function processJob(job: Job & { id: string }, token: string) { document: null, project_id: job.data.project_id, error: - "URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.", + "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.", }; return data; } @@ -346,7 +346,7 @@ async function processJob(job: Job & { id: string }, token: string) { scrapeOptions: job.data.scrapeOptions, origin: job.data.origin, crawl_id: job.data.crawl_id, - }); + }, true); await addCrawlJobDone(job.data.crawl_id, job.id); @@ -486,7 +486,7 @@ async function processJob(job: Job & { id: string }, token: string) { url: sc?.originUrl ?? (job.data.crawlerOptions === null ? "Batch Scrape" : "Unknown"), crawlerOptions: sc.crawlerOptions, origin: job.data.origin, - }); + }, true); } } } @@ -566,7 +566,7 @@ async function processJob(job: Job & { id: string }, token: string) { scrapeOptions: job.data.scrapeOptions, origin: job.data.origin, crawl_id: job.data.crawl_id, - }); + }, true); // await logJob({ // job_id: job.data.crawl_id, diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index 1cc4db84..7840484d 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -46,6 +46,8 @@ export const callWebhook = async ( webhookUrl = webhooksData[0].url; } + logger.debug("Calling webhook...", { webhookUrl, teamId, specified, v1, eventType, awaitWebhook }); + if (!webhookUrl) { return null; } @@ -128,7 +130,6 @@ export const callWebhook = async ( "Content-Type": "application/json", ...webhookUrl.headers, }, - timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1) } ) .catch((error) => { diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index ee9aaf00..06ff1d48 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -175,4 +175,4 @@ export type PlanType = | ""; -export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed"; \ No newline at end of file +export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "batch_scrape.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed"; \ No newline at end of file diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index cee054d4..16cf674f 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -221,6 +221,7 @@ export interface MapParams { search?: string; ignoreSitemap?: boolean; includeSubdomains?: boolean; + sitemapOnly?: boolean; limit?: number; } @@ -563,16 +564,18 @@ export default class FirecrawlApp { * @param params - Additional parameters for the scrape request. * @param pollInterval - Time in seconds for job status checks. * @param idempotencyKey - Optional idempotency key for the request. 
+ * @param webhook - Optional webhook for the batch scrape. * @returns The response from the crawl operation. */ async batchScrapeUrls( urls: string[], params?: ScrapeParams, pollInterval: number = 2, - idempotencyKey?: string + idempotencyKey?: string, + webhook?: CrawlParams["webhook"], ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: any = { urls, ...(params ?? {}) }; + let jsonData: any = { urls, ...(params ?? {}), webhook }; try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/v1/batch/scrape`, diff --git a/examples/aginews-ai-newsletter/README.md b/examples/aginews-ai-newsletter/README.md new file mode 100644 index 00000000..12dc6ada --- /dev/null +++ b/examples/aginews-ai-newsletter/README.md @@ -0,0 +1,6 @@ +# AGI News ✨ +AGI News is a daily AI newsletter that's completely sourced by autonomous AI agents. It is live at [https://www.aginews.io/](https://www.aginews.io/) + +Here is a link to the repo: + +[https://github.com/ericciarla/aginews](https://github.com/ericciarla/aginews) \ No newline at end of file diff --git a/examples/ai-podcast-generator/README.md b/examples/ai-podcast-generator/README.md new file mode 100644 index 00000000..f27cf084 --- /dev/null +++ b/examples/ai-podcast-generator/README.md @@ -0,0 +1,7 @@ +# Generate AI podcasts based on real time news 🎙️ + +This example crawls the web for interesting news stories then records a podcast with your own voice. + +Here is a link to the repo: + +[https://github.com/ericciarla/aginews-podcast](https://github.com/ericciarla/aginews-podcast) \ No newline at end of file