Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl, synced 2025-08-14 06:15:53 +08:00.

Commit 36cf49c959: Merge remote-tracking branch 'origin/main' into nsc/new-extract
@@ -75,7 +75,7 @@ export async function crawlController(req: Request, res: Response) {
     await checkTeamCredits(chunk, team_id, limitCheck);

     if (!creditsCheckSuccess) {
-      return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at hello@firecrawl.com" });
+      return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" });
     }

     // TODO: need to do this to v1

@@ -209,7 +209,7 @@ export async function scrapeController(req: Request, res: Response) {
       earlyReturn = true;
       return res.status(500).json({
         error:
-          "Error checking team credits. Please contact hello@firecrawl.com for help.",
+          "Error checking team credits. Please contact help@firecrawl.com for help.",
       });
     }
@@ -16,6 +16,7 @@ import { logCrawl } from "../../services/logging/crawl_log";
 import { getScrapeQueue } from "../../services/queue-service";
 import { getJobPriority } from "../../lib/job-priority";
 import { addScrapeJobs } from "../../services/queue-jobs";
+import { callWebhook } from "../../services/webhook";

 export async function batchScrapeController(
   req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,

@@ -66,6 +67,7 @@ export async function batchScrapeController(
       crawl_id: id,
       sitemapped: true,
       v1: true,
+      webhook: req.body.webhook,
     },
     opts: {
       jobId: uuidv4(),

@@ -85,6 +87,10 @@ export async function batchScrapeController(
   );
   await addScrapeJobs(jobs);

+  if(req.body.webhook) {
+    await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "batch_scrape.started");
+  }
+
   const protocol = process.env.ENV === "local" ? req.protocol : "https";

   return res.status(200).json({
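For illustration, a sketch of how a client could trigger a batch scrape with the new webhook option over the v1 API. The /v1/batch/scrape path and the string-or-object webhook shape come from this change set; the base URL, bearer-token header, and response handling are assumptions.

// Sketch: start a batch scrape and ask Firecrawl to notify a webhook.
// Assumes FIRECRAWL_API_KEY is set and Node 18+ (global fetch).
async function startBatchScrape() {
  const res = await fetch("https://api.firecrawl.dev/v1/batch/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    },
    body: JSON.stringify({
      urls: ["https://example.com", "https://example.org"],
      // webhookSchema (below) accepts either a bare URL string or { url, headers }
      webhook: { url: "https://my-app.example/webhooks/firecrawl", headers: {} },
    }),
  });
  console.log(await res.json());
}

startBatchScrape();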
@@ -175,7 +175,7 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut
     logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
     return close(ws, 1011, {
       type: "error",
-      error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
+      error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id
     });
   }
 }
@@ -1,11 +1,6 @@
 import { Response } from "express";
 import { v4 as uuidv4 } from "uuid";
-import {
-  MapDocument,
-  mapRequestSchema,
-  RequestWithAuth,
-  scrapeOptions,
-} from "./types";
+import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
 import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
 import { MapResponse, MapRequest } from "./types";
 import { configDotenv } from "dotenv";
@@ -65,11 +60,13 @@ export async function getMapResults({
 }): Promise<MapResult> {
   const id = uuidv4();
   let links: string[] = [url];
+  let mapResults: MapDocument[] = [];

   const sc: StoredCrawl = {
     originUrl: url,
     crawlerOptions: {
       ...crawlerOptions,
+      limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
       scrapeOptions: undefined,
     },
     scrapeOptions: scrapeOptions.parse({}),
@@ -81,106 +78,131 @@ export async function getMapResults({
const crawler = crawlToCrawler(id, sc);
let urlWithoutWww = url.replace("www.", "");
// If sitemapOnly is true, only get links from sitemap
if (crawlerOptions.sitemapOnly) {
if (includeMetadata) {
throw new Error("includeMetadata is not supported with sitemapOnly");
}
let mapUrl = search && allowExternalLinks
? `${search} ${urlWithoutWww}`
: search ? `${search} site:${urlWithoutWww}`
: `site:${url}`;
const resultsPerPage = 100;
const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
const cacheKey = `fireEngineMap:${mapUrl}`;
const cachedResult = null;
let allResults: any[] = [];
let pagePromises: Promise<any>[] = [];
if (cachedResult) {
allResults = JSON.parse(cachedResult);
} else {
const fetchPage = async (page: number) => {
return fireEngineMap(mapUrl, {
numResults: resultsPerPage,
page: page,
const sitemap = await crawler.tryGetSitemap(true, true);
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
});
};
links = links.slice(1)
.map((x) => {
try {
return checkAndUpdateURLForMap(x).url.trim();
} catch (_) {
return null;
}
})
.filter((x) => x !== null) as string[];
// links = links.slice(1, limit); // don't slice, unnecessary
}
} else {
let urlWithoutWww = url.replace("www.", "");
pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
allResults = await Promise.all(pagePromises);
let mapUrl = search && allowExternalLinks
? `${search} ${urlWithoutWww}`
: search ? `${search} site:${urlWithoutWww}`
: `site:${url}`;
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
}
const resultsPerPage = 100;
const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
console.log("allResults", allResults);
// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
ignoreSitemap ? null : crawler.tryGetSitemap(),
...(cachedResult ? [] : pagePromises),
]);
const cacheKey = `fireEngineMap:${mapUrl}`;
const cachedResult = null;
if (!cachedResult) {
allResults = searchResults;
}
let allResults: any[] = [];
let pagePromises: Promise<any>[] = [];
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
});
}
let mapResults : MapDocument[] = allResults
.flat()
.filter((result) => result !== null && result !== undefined);
const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit);
if (mapResults.length > minumumCutoff) {
mapResults = mapResults.slice(0, minumumCutoff);
}
if (mapResults.length > 0) {
if (search) {
// Ensure all map results are first, maintaining their order
links = [
mapResults[0].url,
...mapResults.slice(1).map((x) => x.url),
...links,
];
if (cachedResult) {
allResults = JSON.parse(cachedResult);
} else {
mapResults.map((x) => {
const fetchPage = async (page: number) => {
return fireEngineMap(mapUrl, {
numResults: resultsPerPage,
page: page,
});
};
pagePromises = Array.from({ length: maxPages }, (_, i) =>
fetchPage(i + 1)
);
allResults = await Promise.all(pagePromises);
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
}
// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
ignoreSitemap ? null : crawler.tryGetSitemap(),
...(cachedResult ? [] : pagePromises),
]);
if (!cachedResult) {
allResults = searchResults;
}
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
});
}
}
// Perform cosine similarity between the search query and the list of links
if (search) {
const searchQuery = search.toLowerCase();
links = performCosineSimilarity(links, searchQuery);
}
mapResults = allResults
.flat()
.filter((result) => result !== null && result !== undefined);
links = links
.map((x) => {
try {
return checkAndUpdateURLForMap(x).url.trim();
} catch (_) {
return null;
const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit);
if (mapResults.length > minumumCutoff) {
mapResults = mapResults.slice(0, minumumCutoff);
}
if (mapResults.length > 0) {
if (search) {
// Ensure all map results are first, maintaining their order
links = [
mapResults[0].url,
...mapResults.slice(1).map((x) => x.url),
...links,
];
} else {
mapResults.map((x) => {
links.push(x.url);
});
}
})
.filter((x) => x !== null) as string[];
}
// allows for subdomains to be included
links = links.filter((x) => isSameDomain(x, url));
// Perform cosine similarity between the search query and the list of links
if (search) {
const searchQuery = search.toLowerCase();
links = performCosineSimilarity(links, searchQuery);
}
// if includeSubdomains is false, filter out subdomains
if (!includeSubdomains) {
links = links.filter((x) => isSameSubdomain(x, url));
links = links
.map((x) => {
try {
return checkAndUpdateURLForMap(x).url.trim();
} catch (_) {
return null;
}
})
.filter((x) => x !== null) as string[];
// allows for subdomains to be included
links = links.filter((x) => isSameDomain(x, url));
// if includeSubdomains is false, filter out subdomains
if (!includeSubdomains) {
links = links.filter((x) => isSameSubdomain(x, url));
}
// remove duplicates that could be due to http/https or www
links = removeDuplicateUrls(links);
}
// remove duplicates that could be due to http/https or www
links = removeDuplicateUrls(links);
const linksToReturn = links.slice(0, limit);
return {
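For illustration, a sketch of exercising the new sitemap-only mapping path over the v1 API. The sitemapOnly flag comes from this change set; the /v1/map path, base URL, auth header, and response handling are assumptions.

// Sketch: ask /v1/map to return only links discovered via the site's sitemap.
async function mapSiteFromSitemap() {
  const res = await fetch("https://api.firecrawl.dev/v1/map", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    },
    body: JSON.stringify({
      url: "https://example.com",
      sitemapOnly: true, // new flag added in this change set
    }),
  });
  console.log(await res.json());
}

mapSiteFromSitemap();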
@@ -241,52 +263,4 @@ export async function mapController(
   };

   return res.status(200).json(response);
 }

-// Subdomain sitemap url checking
-
-// // For each result, check for subdomains, get their sitemaps and add them to the links
-// const processedUrls = new Set();
-// const processedSubdomains = new Set();
-
-// for (const result of links) {
-//   let url;
-//   let hostParts;
-//   try {
-//     url = new URL(result);
-//     hostParts = url.hostname.split('.');
-//   } catch (e) {
-//     continue;
-//   }
-
-//   console.log("hostParts", hostParts);
-//   // Check if it's a subdomain (more than 2 parts, and not 'www')
-//   if (hostParts.length > 2 && hostParts[0] !== 'www') {
-//     const subdomain = hostParts[0];
-//     console.log("subdomain", subdomain);
-//     const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
-//     console.log("subdomainUrl", subdomainUrl);
-
-//     if (!processedSubdomains.has(subdomainUrl)) {
-//       processedSubdomains.add(subdomainUrl);
-
-//       const subdomainCrawl = crawlToCrawler(id, {
-//         originUrl: subdomainUrl,
-//         crawlerOptions: legacyCrawlerOptions(req.body),
-//         pageOptions: {},
-//         team_id: req.auth.team_id,
-//         createdAt: Date.now(),
-//         plan: req.auth.plan,
-//       });
-//       const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
-//       if (subdomainSitemap) {
-//         subdomainSitemap.forEach((x) => {
-//           if (!processedUrls.has(x.url)) {
-//             processedUrls.add(x.url);
-//             links.push(x.url);
-//           }
-//         });
-//       }
-//     }
-//   }
-// }
-}
@@ -11,8 +11,12 @@ export async function scrapeStatusController(req: any, res: any) {
     await rateLimiter.consume(iptoken);

     const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
+    const allowedTeams = [
+      "41bdbfe1-0579-4d9b-b6d5-809f16be12f5",
+      "511544f2-2fce-4183-9c59-6c29b02c69b5"
+    ];

-    if(job?.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
+    if(!allowedTeams.includes(job?.team_id)){
       return res.status(403).json({
         success: false,
         error: "You are not allowed to access this resource.",
@@ -119,7 +119,7 @@ export const scrapeOptions = z.object({
   includeTags: z.string().array().optional(),
   excludeTags: z.string().array().optional(),
   onlyMainContent: z.boolean().default(true),
-  timeout: z.number().int().positive().finite().safe().default(30000),
+  timeout: z.number().int().positive().finite().safe().optional(),
   waitFor: z.number().int().nonnegative().finite().safe().default(0),
   extract: extractOptions.optional(),
   mobile: z.boolean().default(false),
@@ -170,9 +170,10 @@ export type ExtractV1Options = z.infer<typeof extractV1Options>;
 export const extractRequestSchema = extractV1Options;
 export type ExtractRequest = z.infer<typeof extractRequestSchema>;

-export const scrapeRequestSchema = scrapeOptions.extend({
+export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({
   url,
   origin: z.string().optional().default("api"),
+  timeout: z.number().int().positive().finite().safe().default(30000),
 }).strict(strictMessage).refine(
   (obj) => {
     const hasExtractFormat = obj.formats?.includes("extract");
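A minimal sketch of what the omit/extend change accomplishes, using standalone zod with simplified stand-in schemas (the names and field set here are illustrative, not the real ones): the shared options keep timeout optional, while the scrape request re-adds it with a 30s default.

import { z } from "zod";

// Simplified stand-ins for the schemas above (illustrative only).
const baseOptions = z.object({
  timeout: z.number().int().positive().finite().safe().optional(),
});

const scrapeRequest = baseOptions.omit({ timeout: true }).extend({
  url: z.string().url(),
  timeout: z.number().int().positive().finite().safe().default(30000),
});

console.log(baseOptions.parse({}).timeout);                       // undefined
console.log(scrapeRequest.parse({ url: "https://example.com" })); // { url: "...", timeout: 30000 }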
@@ -194,9 +195,21 @@ export const scrapeRequestSchema = scrapeOptions.extend({
 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;

+export const webhookSchema = z.preprocess(x => {
+  if (typeof x === "string") {
+    return { url: x };
+  } else {
+    return x;
+  }
+}, z.object({
+  url: z.string().url(),
+  headers: z.record(z.string(), z.string()).default({}),
+}).strict(strictMessage))
+
 export const batchScrapeRequestSchema = scrapeOptions.extend({
   urls: url.array(),
   origin: z.string().optional().default("api"),
+  webhook: webhookSchema.optional(),
 }).strict(strictMessage).refine(
   (obj) => {
     const hasExtractFormat = obj.formats?.includes("extract");
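A minimal sketch of the preprocess step above, using standalone zod with the strict message inlined (an illustrative copy, not the real schema): a bare URL string and a full object both normalize to the same shape.

import { z } from "zod";

const webhookSchema = z.preprocess(x => {
  if (typeof x === "string") {
    return { url: x };
  }
  return x;
}, z.object({
  url: z.string().url(),
  headers: z.record(z.string(), z.string()).default({}),
}).strict("Unrecognized key in webhook object"));

// Both forms normalize to the same shape:
console.log(webhookSchema.parse("https://my-app.example/hook"));
// -> { url: "https://my-app.example/hook", headers: {} }
console.log(webhookSchema.parse({ url: "https://my-app.example/hook", headers: { "X-Secret": "abc" } }));
// -> { url: "https://my-app.example/hook", headers: { "X-Secret": "abc" } }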
@@ -206,12 +219,7 @@ export const batchScrapeRequestSchema = scrapeOptions.extend({
   {
     message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
   }
-).transform((obj) => {
-  if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
-    return { ...obj, timeout: 60000 };
-  }
-  return obj;
-});
+);

 export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
@@ -239,21 +247,10 @@ const crawlerOptions = z.object({

 export type CrawlerOptions = z.infer<typeof crawlerOptions>;

-export const webhookSchema = z.preprocess(x => {
-  if (typeof x === "string") {
-    return { url: x };
-  } else {
-    return x;
-  }
-}, z.object({
-  url: z.string().url(),
-  headers: z.record(z.string(), z.string()).default({}),
-}).strict(strictMessage))
-
 export const crawlRequestSchema = crawlerOptions.extend({
   url,
   origin: z.string().optional().default("api"),
-  scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
+  scrapeOptions: scrapeOptions.default({}),
   webhook: webhookSchema.optional(),
   limit: z.number().default(10000),
 }).strict(strictMessage);
@@ -279,6 +276,7 @@ export const mapRequestSchema = crawlerOptions.extend({
   includeSubdomains: z.boolean().default(true),
   search: z.string().optional(),
   ignoreSitemap: z.boolean().default(false),
+  sitemapOnly: z.boolean().default(false),
   limit: z.number().min(1).max(5000).default(5000),
 }).strict(strictMessage);
@@ -207,7 +207,7 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response
   }

   logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
-  res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
+  res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id });
 });

 logger.info(`Worker ${process.pid} started`);
@@ -52,7 +52,7 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {

 export async function addCrawlJobDone(id: string, job_id: string) {
   await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
-  await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id);
+  await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
   await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
   await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX");
 }
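A small sketch of why RPUSH fits here, assuming ioredis and an invented key name: appending at the tail keeps the ordered list in completion order when it is later read front to back, whereas LPUSH would reverse it.

import Redis from "ioredis";

async function demo() {
  const redis = new Redis(); // assumes a local Redis instance
  const key = "demo:jobs_done_ordered";
  await redis.del(key);

  // Jobs finish in the order A, B, C.
  for (const jobId of ["A", "B", "C"]) {
    await redis.rpush(key, jobId); // tail append preserves completion order
  }

  console.log(await redis.lrange(key, 0, -1)); // [ 'A', 'B', 'C' ]
  // With lpush instead, the same read would yield [ 'C', 'B', 'A' ].

  await redis.quit();
}

demo();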
@@ -6,22 +6,28 @@ import * as Sentry from "@sentry/node";

 import dotenv from 'dotenv';
 import { logger } from './logger';
+import { stat } from 'fs/promises';
 dotenv.config();

 // TODO: add a timeout to the Go parser
+const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so');
+
 class GoMarkdownConverter {
   private static instance: GoMarkdownConverter;
   private convert: any;

   private constructor() {
-    const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so');
     const lib = koffi.load(goExecutablePath);
     this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']);
   }

-  public static getInstance(): GoMarkdownConverter {
+  public static async getInstance(): Promise<GoMarkdownConverter> {
     if (!GoMarkdownConverter.instance) {
+      try {
+        await stat(goExecutablePath);
+      } catch (_) {
+        throw Error("Go shared library not found");
+      }
       GoMarkdownConverter.instance = new GoMarkdownConverter();
     }
     return GoMarkdownConverter.instance;
@@ -47,7 +53,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st

   try {
     if (process.env.USE_GO_MARKDOWN_PARSER == "true") {
-      const converter = GoMarkdownConverter.getInstance();
+      const converter = await GoMarkdownConverter.getInstance();
       let markdownContent = await converter.convertHTMLToMarkdown(html);

       markdownContent = processMultiLineLinks(markdownContent);
@@ -56,8 +62,12 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
       return markdownContent;
     }
   } catch (error) {
-    Sentry.captureException(error);
-    logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
+    if (!(error instanceof Error) || error.message !== "Go shared library not found") {
+      Sentry.captureException(error);
+      logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
+    } else {
+      logger.warn("Tried to use Go parser, but it doesn't exist in the file system.", { goExecutablePath });
+    }
   }

   // Fallback to TurndownService if Go parser fails or is not enabled
@@ -89,7 +99,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st

     return markdownContent;
   } catch (error) {
-    console.error("Error converting HTML to Markdown: ", error);
+    logger.error("Error converting HTML to Markdown", {error});
     return ""; // Optionally return an empty string or handle the error as needed
   }
 }
@@ -65,7 +65,12 @@ export class WebCrawler {
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
   }

-  public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
+  public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
+    // If the initial URL is a sitemap.xml, skip filtering
+    if (this.initialUrl.endsWith('sitemap.xml') && fromMap) {
+      return sitemapLinks.slice(0, limit);
+    }
+
     return sitemapLinks
       .filter((link) => {
         let url: URL;
@@ -159,11 +164,14 @@ export class WebCrawler {
     this.robots = robotsParser(this.robotsTxtUrl, txt);
   }

-  public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
+  public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> {
     logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
     const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
+    if(fromMap && onlySitemap) {
+      return sitemapLinks.map(link => ({ url: link, html: "" }));
+    }
     if (sitemapLinks.length > 0) {
-      let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
+      let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth, fromMap);
       return filteredLinks.map(link => ({ url: link, html: "" }));
     }
     return null;
@@ -353,7 +361,8 @@ export class WebCrawler {
       return url;
     };

-    const sitemapUrl = url.endsWith("/sitemap.xml")
+
+    const sitemapUrl = url.endsWith(".xml")
       ? url
       : `${url}/sitemap.xml`;
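For illustration, a standalone sketch of what the relaxed check changes: any URL that already ends in .xml (for example a sitemap index) is now used as-is, instead of only URLs ending exactly in /sitemap.xml.

// Illustrative only; mirrors the ternary above outside the class.
const candidates = [
  "https://example.com",
  "https://example.com/sitemap.xml",
  "https://example.com/sitemap_index.xml",
];

for (const url of candidates) {
  const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`;
  console.log(url, "->", sitemapUrl);
}
// https://example.com                   -> https://example.com/sitemap.xml
// https://example.com/sitemap.xml       -> https://example.com/sitemap.xml
// https://example.com/sitemap_index.xml -> https://example.com/sitemap_index.xml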
@@ -24,7 +24,7 @@ export async function getLinksFromSitemap(
       const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       content = response.data;
     } else if (mode === 'fire-engine') {
-      const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;playwright" });;
+      const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true });
       if (!response.success) {
         throw response.error;
       }
@@ -87,6 +87,7 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<Engi
     priority: meta.internalOptions.priority,
     geolocation: meta.options.geolocation,
     mobile: meta.options.mobile,
+    timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
     // TODO: scrollXPaths
   };

@@ -95,7 +96,9 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<Engi
   let response = await performFireEngineScrape(
     meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
     request,
-    defaultTimeout + totalWait,
+    meta.options.timeout !== undefined
+      ? defaultTimeout + totalWait
+      : Infinity, // TODO: better timeout handling
   );

   specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" }), response.responseHeaders);
@@ -140,12 +143,16 @@ export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<Eng
     fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
     wait: meta.options.waitFor,
     geolocation: meta.options.geolocation,
+
+    timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
   };

   let response = await performFireEngineScrape(
     meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
     request,
-    defaultTimeout + meta.options.waitFor
+    meta.options.timeout !== undefined
+      ? defaultTimeout + meta.options.waitFor
+      : Infinity, // TODO: better timeout handling
   );

   specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" }), response.responseHeaders);
@@ -179,11 +186,16 @@ export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<Engi
     atsv: meta.internalOptions.atsv,
     geolocation: meta.options.geolocation,
     disableJsDom: meta.internalOptions.v0DisableJsDom,
+
+    timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
   };

   let response = await performFireEngineScrape(
     meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
     request,
+    meta.options.timeout !== undefined
+      ? defaultTimeout
+      : Infinity, // TODO: better timeout handling
   );

   specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" }), response.responseHeaders);
@@ -25,6 +25,8 @@ export type FireEngineScrapeRequestCommon = {
   logRequest?: boolean; // default: true
   instantReturn?: boolean; // default: false
   geolocation?: { country?: string; languages?: string[]; };
+
+  timeout?: number;
 }

 export type FireEngineScrapeRequestChromeCDP = {
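A sketch of the timeout split these hunks introduce, with invented helper and constant names: when the caller sets no timeout, the request carries a 300s engine-side cap and the local wait is unbounded; when the caller does set one, the engine-side field is omitted and the local wait is bounded instead.

// Illustrative only; not the actual fire-engine client.
const defaultTimeout = 60_000; // assumed value for the sketch

function buildTimeouts(userTimeout: number | undefined, extraWait: number) {
  return {
    // Sent to the scraping service only when the user did not specify a timeout.
    engineTimeout: userTimeout === undefined ? 300_000 : undefined,
    // How long the local poll loop is willing to wait for the result.
    pollTimeout: userTimeout !== undefined ? defaultTimeout + extraWait : Infinity,
  };
}

console.log(buildTimeouts(undefined, 2_000)); // { engineTimeout: 300000, pollTimeout: Infinity }
console.log(buildTimeouts(30_000, 2_000));    // { engineTimeout: undefined, pollTimeout: 62000 }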
@@ -13,12 +13,12 @@ export async function scrapeURLWithPlaywright(meta: Meta): Promise<EngineScrapeR
     headers: {
       "Content-Type": "application/json",
     },
-    body: JSON.stringify({
+    body: {
       url: meta.url,
       wait_after_load: meta.options.waitFor,
       timeout,
       headers: meta.options.headers,
-    }),
+    },
     method: "POST",
     logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
     schema: z.object({
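For context, a sketch of the convention this change relies on, using an invented helper rather than the repo's robustFetch: when the HTTP helper serializes JSON bodies itself, call sites should hand it a plain object, otherwise the payload would be stringified twice.

// Invented minimal helper for illustration.
async function postJson(url: string, body: unknown): Promise<unknown> {
  const res = await fetch(url, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body), // serialization happens once, here
  });
  return res.json();
}

// Correct: pass the object and let the helper stringify it.
postJson("https://playwright-service.example/scrape", { url: "https://example.com", timeout: 15000 });

// Incorrect with such a helper: JSON.stringify on the caller side would send a double-encoded string.
// postJson("https://playwright-service.example/scrape", JSON.stringify({ url: "https://example.com" }));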
@@ -18,7 +18,7 @@ export class NoEnginesLeftError extends Error {
   public results: EngineResultsTracker;

   constructor(fallbackList: Engine[], results: EngineResultsTracker) {
-    super("All scraping engines failed!");
+    super("All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com.");
     this.fallbackList = fallbackList;
     this.results = results;
   }
@@ -5,7 +5,6 @@ import { logger } from "../lib/logger";

 dotenv.config();

-
 export async function fireEngineMap(
   q: string,
   options: {
@@ -40,14 +39,13 @@ export async function fireEngineMap(
       method: "POST",
       headers: {
         "Content-Type": "application/json",
-        "X-Disable-Cache": "true"
+        "X-Disable-Cache": "true",
       },
-      body: data
+      body: data,
     });

     if (response.ok) {
       const responseData = await response.json();
-      console.log("response", responseData);
       return responseData;
     } else {
       return [];
@@ -7,7 +7,7 @@ import { logger } from "../../lib/logger";
 import { configDotenv } from "dotenv";
 configDotenv();

-export async function logJob(job: FirecrawlJob) {
+export async function logJob(job: FirecrawlJob, force: boolean = false) {
   try {
     const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
     if (!useDbAuthentication) {
@@ -23,28 +23,52 @@ export async function logJob(job: FirecrawlJob) {
       job.scrapeOptions.headers["Authorization"] = "REDACTED";
       job.docs = [{ content: "REDACTED DUE TO AUTHORIZATION HEADER", html: "REDACTED DUE TO AUTHORIZATION HEADER" }];
     }
+    const jobColumn = {
+      job_id: job.job_id ? job.job_id : null,
+      success: job.success,
+      message: job.message,
+      num_docs: job.num_docs,
+      docs: job.docs,
+      time_taken: job.time_taken,
+      team_id: job.team_id === "preview" ? null : job.team_id,
+      mode: job.mode,
+      url: job.url,
+      crawler_options: job.crawlerOptions,
+      page_options: job.scrapeOptions,
+      origin: job.origin,
+      num_tokens: job.num_tokens,
+      retry: !!job.retry,
+      crawl_id: job.crawl_id,
+    };

-    const { data, error } = await supabase_service
-      .from("firecrawl_jobs")
-      .insert([
-        {
-          job_id: job.job_id ? job.job_id : null,
-          success: job.success,
-          message: job.message,
-          num_docs: job.num_docs,
-          docs: job.docs,
-          time_taken: job.time_taken,
-          team_id: job.team_id === "preview" ? null : job.team_id,
-          mode: job.mode,
-          url: job.url,
-          crawler_options: job.crawlerOptions,
-          page_options: job.scrapeOptions,
-          origin: job.origin,
-          num_tokens: job.num_tokens,
-          retry: !!job.retry,
-          crawl_id: job.crawl_id,
-        },
-      ]);
+    if (force) {
+      while (true) {
+        try {
+          const { error } = await supabase_service
+            .from("firecrawl_jobs")
+            .insert([jobColumn]);
+          if (error) {
+            logger.error("Failed to log job due to Supabase error -- trying again", { error, scrapeId: job.job_id });
+            await new Promise<void>((resolve) => setTimeout(() => resolve(), 75));
+          } else {
+            break;
+          }
+        } catch (error) {
+          logger.error("Failed to log job due to thrown error -- trying again", { error, scrapeId: job.job_id });
+          await new Promise<void>((resolve) => setTimeout(() => resolve(), 75));
+        }
+      }
+      logger.debug("Job logged successfully!", { scrapeId: job.job_id });
+    } else {
+      const { error } = await supabase_service
+        .from("firecrawl_jobs")
+        .insert([jobColumn]);
+      if (error) {
+        logger.error(`Error logging job: ${error.message}`, { error, scrapeId: job.job_id });
+      } else {
+        logger.debug("Job logged successfully!", { scrapeId: job.job_id });
+      }
+    }

     if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
       let phLog = {
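A generic sketch of the force-retry pattern introduced here, with an invented insert function and console logging standing in for the real Supabase client and logger: when force is set, keep retrying the write with a short pause until it succeeds; otherwise log the failure and move on.

// Illustrative only; not the actual Supabase client code.
type InsertResult = { error: { message: string } | null };

async function logWithRetry(
  insertJob: () => Promise<InsertResult>,
  force: boolean,
): Promise<void> {
  if (!force) {
    const { error } = await insertJob();
    if (error) console.error("Error logging job:", error.message);
    return;
  }

  // force === true: loop until the insert succeeds.
  while (true) {
    try {
      const { error } = await insertJob();
      if (!error) break;
      console.error("Insert failed, retrying:", error.message);
    } catch (err) {
      console.error("Insert threw, retrying:", err);
    }
    await new Promise<void>((resolve) => setTimeout(resolve, 75));
  }
}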
@@ -72,9 +96,7 @@ export async function logJob(job: FirecrawlJob) {
        posthog.capture(phLog);
      }
    }
-    if (error) {
-      logger.error(`Error logging job: ${error.message}`);
-    }
-
  } catch (error) {
    logger.error(`Error logging job: ${error.message}`);
  }
@@ -23,7 +23,7 @@ const emailTemplates: Record<
   },
   [NotificationType.RATE_LIMIT_REACHED]: {
     subject: "Rate Limit Reached - Firecrawl",
-    html: "Hey there,<br/><p>You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our <a href='https://firecrawl.dev/pricing'>pricing page</a> for more info.</p><p>If you have any questions, feel free to reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
+    html: "Hey there,<br/><p>You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our <a href='https://firecrawl.dev/pricing'>pricing page</a> for more info.</p><p>If you have any questions, feel free to reach out to us at <a href='mailto:help@firecrawl.com'>help@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
   },
   [NotificationType.AUTO_RECHARGE_SUCCESS]: {
     subject: "Auto recharge successful - Firecrawl",

@@ -31,7 +31,7 @@ const emailTemplates: Record<
   },
   [NotificationType.AUTO_RECHARGE_FAILED]: {
     subject: "Auto recharge failed - Firecrawl",
-    html: "Hey there,<br/><p>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/>",
+    html: "Hey there,<br/><p>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at <a href='mailto:help@firecrawl.com'>help@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/>",
   },
 };

@@ -63,7 +63,7 @@ export async function sendEmailNotification(
   const { data, error } = await resend.emails.send({
     from: "Firecrawl <firecrawl@getmendableai.com>",
     to: [email],
-    reply_to: "hello@firecrawl.com",
+    reply_to: "help@firecrawl.com",
     subject: emailTemplates[notificationType].subject,
     html: emailTemplates[notificationType].html,
   });
@@ -262,7 +262,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         document: null,
         project_id: job.data.project_id,
         error:
-          "URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
+          "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.",
       };
       return data;
     }
@@ -346,7 +346,7 @@ async function processJob(job: Job & { id: string }, token: string) {
       scrapeOptions: job.data.scrapeOptions,
       origin: job.data.origin,
       crawl_id: job.data.crawl_id,
-    });
+    }, true);

     await addCrawlJobDone(job.data.crawl_id, job.id);

@@ -486,7 +486,7 @@ async function processJob(job: Job & { id: string }, token: string) {
           url: sc?.originUrl ?? (job.data.crawlerOptions === null ? "Batch Scrape" : "Unknown"),
           crawlerOptions: sc.crawlerOptions,
           origin: job.data.origin,
-        });
+        }, true);
       }
     }
   }

@@ -566,7 +566,7 @@ async function processJob(job: Job & { id: string }, token: string) {
       scrapeOptions: job.data.scrapeOptions,
       origin: job.data.origin,
       crawl_id: job.data.crawl_id,
-    });
+    }, true);

     // await logJob({
     //   job_id: job.data.crawl_id,
@@ -46,6 +46,8 @@ export const callWebhook = async (
     webhookUrl = webhooksData[0].url;
   }

+  logger.debug("Calling webhook...", { webhookUrl, teamId, specified, v1, eventType, awaitWebhook });
+
   if (!webhookUrl) {
     return null;
   }
@@ -128,7 +130,6 @@ export const callWebhook = async (
           "Content-Type": "application/json",
           ...webhookUrl.headers,
         },
-        timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1)
       }
     )
     .catch((error) => {
@@ -175,4 +175,4 @@ export type PlanType =
   | "";


-export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";
+export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "batch_scrape.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";
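A sketch of a receiver that distinguishes the new batch_scrape.started event from the others. The event-type strings come from the union above; the payload field name and everything else about the request body are assumptions.

import express from "express";

type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "batch_scrape.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";

const app = express();
app.use(express.json());

app.post("/webhooks/firecrawl", (req, res) => {
  // Assumed payload shape: { type: WebhookEventType, ... }
  const eventType = req.body?.type as WebhookEventType | undefined;

  if (eventType === "batch_scrape.started") {
    console.log("Batch scrape kicked off");
  } else if (eventType === "batch_scrape.page" || eventType === "crawl.page") {
    console.log("Received a scraped page event");
  } else {
    console.log("Other event:", eventType);
  }

  res.sendStatus(200);
});

app.listen(3000);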
@@ -221,6 +221,7 @@ export interface MapParams {
   search?: string;
   ignoreSitemap?: boolean;
   includeSubdomains?: boolean;
+  sitemapOnly?: boolean;
   limit?: number;
 }
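For illustration, a sketch of using the new option from the JS SDK, assuming the FirecrawlApp constructor and mapUrl method of the published @mendable/firecrawl-js package.

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY ?? "" });

async function mapFromSitemapOnly() {
  // With sitemapOnly, links come from the site's sitemap instead of search results.
  const result = await app.mapUrl("https://example.com", { sitemapOnly: true, limit: 500 });
  console.log(result);
}

mapFromSitemapOnly();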
@@ -563,16 +564,18 @@ export default class FirecrawlApp {
    * @param params - Additional parameters for the scrape request.
    * @param pollInterval - Time in seconds for job status checks.
    * @param idempotencyKey - Optional idempotency key for the request.
+   * @param webhook - Optional webhook for the batch scrape.
    * @returns The response from the crawl operation.
    */
   async batchScrapeUrls(
     urls: string[],
     params?: ScrapeParams,
     pollInterval: number = 2,
-    idempotencyKey?: string
+    idempotencyKey?: string,
+    webhook?: CrawlParams["webhook"],
   ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls, ...(params ?? {}) };
+    let jsonData: any = { urls, ...(params ?? {}), webhook };
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
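A usage sketch of the extended batchScrapeUrls signature, with the argument order taken from the hunk above; the import path and option values are assumptions.

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY ?? "" });

async function runBatchWithWebhook() {
  const result = await app.batchScrapeUrls(
    ["https://example.com", "https://example.org"],
    { formats: ["markdown"] },
    2,          // pollInterval in seconds
    undefined,  // idempotencyKey
    { url: "https://my-app.example/webhooks/firecrawl", headers: {} }, // webhook
  );
  console.log(result);
}

runBatchWithWebhook();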
|
6
examples/aginews-ai-newsletter/README.md
Normal file
6
examples/aginews-ai-newsletter/README.md
Normal file
@ -0,0 +1,6 @@
|
||||
# AGI News ✨
|
||||
AGI News is a daily AI newsletter that's completely sourced by autonomous AI agents. It is live at [https://www.aginews.io/](https://www.aginews.io/)
|
||||
|
||||
Here is a link to the repo:
|
||||
|
||||
[https://github.com/ericciarla/aginews](https://github.com/ericciarla/aginews)
|
7
examples/ai-podcast-generator/README.md
Normal file
7
examples/ai-podcast-generator/README.md
Normal file
@ -0,0 +1,7 @@
|
||||
# Generate AI podcasts based on real time news 🎙️
|
||||
|
||||
This example crawls the web for interesting news stories then records a podcast with your own voice.
|
||||
|
||||
Here is a link to the repo:
|
||||
|
||||
[https://github.com/ericciarla/aginews-podcast](https://github.com/ericciarla/aginews-podcast)
|