Merge branch 'main' into mog/mineru

This commit is contained in:
Nicolas 2024-12-27 19:53:09 -03:00
commit 5fcf3fa97e
25 changed files with 1681 additions and 440 deletions

View File

@@ -116,6 +116,10 @@ If you'd like to test the crawl endpoint, you can run this:

 This section provides solutions to common issues you might encounter while setting up or running your self-hosted instance of Firecrawl.

+### API Keys for SDK Usage
+
+**Note:** When using Firecrawl SDKs with a self-hosted instance, API keys are optional. API keys are only required when connecting to the cloud service (api.firecrawl.dev).
+
 ### Supabase client is not configured

 **Symptom:**
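The note added above can be exercised directly. A minimal sketch, assuming a self-hosted instance listening on http://localhost:3002 (a typical local deployment) and the Python SDK from this repository:

```python
# Minimal sketch: using the Python SDK against a self-hosted instance.
# The URL below is an assumption for a local deployment; the cloud service
# (api.firecrawl.dev) still requires an api_key.
from firecrawl import FirecrawlApp

# No api_key argument is needed when pointing at a self-hosted instance.
app = FirecrawlApp(api_url="http://localhost:3002")

result = app.scrape_url("https://firecrawl.dev")
print(result.get("markdown", "")[:200])
```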

View File

@@ -70,8 +70,8 @@ content-type: application/json
     "urls": ["firecrawl.dev"],
     "prompt": "What is the title, description and main product of the page?",
     "schema": {
-      "title": "string",
-      "description": "string",
-      "mainProduct": "string"
+      "title": { "type": "string" },
+      "description": { "type": "string" },
+      "mainProduct": { "type": "string" }
     }
   }
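The change above switches the example schema from shorthand strings to JSON Schema-style property definitions (each field declared as an object with a `type`). A hedged sketch of sending the corrected body from Python; the base URL and the extract endpoint path are assumptions taken from the self-host guide, so adjust them to your deployment:

```python
# Sketch only: base URL and endpoint path are assumptions, not confirmed
# by this diff. The request body mirrors the corrected example above.
import requests

payload = {
    "urls": ["firecrawl.dev"],
    "prompt": "What is the title, description and main product of the page?",
    "schema": {
        "title": {"type": "string"},
        "description": {"type": "string"},
        "mainProduct": {"type": "string"},
    },
}

resp = requests.post(
    "http://localhost:3002/v1/extract",
    json=payload,
    headers={"Content-Type": "application/json"},
)
print(resp.json())
```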

View File

@@ -177,56 +177,51 @@ export async function crawlController(req: Request, res: Response) {
   await saveCrawl(id, sc);

-  const sitemap = sc.crawlerOptions?.ignoreSitemap
-    ? null
-    : await crawler.tryGetSitemap();
-
-  if (sitemap !== null && sitemap.length > 0) {
-    let jobPriority = 20;
-    // If it is over 1000, we need to get the job priority,
-    // otherwise we can use the default priority of 20
-    if (sitemap.length > 1000) {
-      // set base to 21
-      jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
-    }
-    const jobs = sitemap.map((x) => {
-      const url = x.url;
-      const uuid = uuidv4();
-      return {
-        name: uuid,
-        data: {
-          url,
-          mode: "single_urls",
-          crawlerOptions,
-          scrapeOptions,
-          internalOptions,
-          team_id,
-          plan,
-          origin: req.body.origin ?? defaultOrigin,
-          crawl_id: id,
-          sitemapped: true,
-        },
-        opts: {
-          jobId: uuid,
-          priority: jobPriority,
-        },
-      };
-    });
-
-    await lockURLs(
-      id,
-      sc,
-      jobs.map((x) => x.data.url),
-    );
-    await addCrawlJobs(
-      id,
-      jobs.map((x) => x.opts.jobId),
-    );
-    for (const job of jobs) {
-      // add with sentry instrumentation
-      await addScrapeJob(job.data as any, {}, job.opts.jobId);
-    }
-  } else {
+  const sitemap = sc.crawlerOptions.ignoreSitemap
+    ? 0
+    : await crawler.tryGetSitemap(async urls => {
+        if (urls.length === 0) return;
+
+        let jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
+        const jobs = urls.map(url => {
+          const uuid = uuidv4();
+          return {
+            name: uuid,
+            data: {
+              url,
+              mode: "single_urls",
+              crawlerOptions,
+              scrapeOptions,
+              internalOptions,
+              team_id,
+              plan,
+              origin: req.body.origin ?? defaultOrigin,
+              crawl_id: id,
+              sitemapped: true,
+            },
+            opts: {
+              jobId: uuid,
+              priority: jobPriority,
+            },
+          };
+        });
+
+        await lockURLs(
+          id,
+          sc,
+          jobs.map((x) => x.data.url),
+        );
+        await addCrawlJobs(
+          id,
+          jobs.map((x) => x.opts.jobId),
+        );
+        for (const job of jobs) {
+          // add with sentry instrumentation
+          await addScrapeJob(job.data as any, {}, job.opts.jobId);
+        }
+      });
+
+  if (sitemap === 0) {
     await lockURL(id, sc, url);
     // Not needed, first one should be 15.

View File

@@ -113,32 +113,32 @@ export async function crawlPreviewController(req: Request, res: Response) {
   const crawler = crawlToCrawler(id, sc);

   const sitemap = sc.crawlerOptions?.ignoreSitemap
-    ? null
-    : await crawler.tryGetSitemap();
-
-  if (sitemap !== null) {
-    for (const url of sitemap.map((x) => x.url)) {
-      await lockURL(id, sc, url);
-      const jobId = uuidv4();
-      await addScrapeJob(
-        {
-          url,
-          mode: "single_urls",
-          team_id,
-          plan: plan!,
-          crawlerOptions,
-          scrapeOptions,
-          internalOptions,
-          origin: "website-preview",
-          crawl_id: id,
-          sitemapped: true,
-        },
-        {},
-        jobId,
-      );
-      await addCrawlJob(id, jobId);
-    }
-  } else {
+    ? 0
+    : await crawler.tryGetSitemap(async urls => {
+        for (const url of urls) {
+          await lockURL(id, sc, url);
+          const jobId = uuidv4();
+          await addScrapeJob(
+            {
+              url,
+              mode: "single_urls",
+              team_id,
+              plan: plan!,
+              crawlerOptions,
+              scrapeOptions,
+              internalOptions,
+              origin: "website-preview",
+              crawl_id: id,
+              sitemapped: true,
+            },
+            {},
+            jobId,
+          );
+          await addCrawlJob(id, jobId);
+        }
+      });
+
+  if (sitemap === 0) {
     await lockURL(id, sc, url);
     const jobId = uuidv4();
     await addScrapeJob(

View File

@@ -115,7 +115,7 @@ export async function crawlStatusController(
   const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
     sc.cancelled
       ? "cancelled"
-      : validJobStatuses.every((x) => x[1] === "completed")
+      : (validJobStatuses.every((x) => x[1] === "completed") && validJobStatuses.length > 0)
         ? "completed"
         : "scraping";

View File

@@ -18,7 +18,7 @@ import {
 } from "../../lib/crawl-redis";
 import { logCrawl } from "../../services/logging/crawl_log";
 import { getScrapeQueue } from "../../services/queue-service";
-import { addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
+import { _addScrapeJobToBullMQ, addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
 import { logger as _logger } from "../../lib/logger";
 import { getJobPriority } from "../../lib/job-priority";
 import { callWebhook } from "../../services/webhook";
@@ -111,113 +111,20 @@ export async function crawlController(
   await saveCrawl(id, sc);

-  const sitemap = sc.crawlerOptions.ignoreSitemap
-    ? null
-    : await crawler.tryGetSitemap();
-
-  if (sitemap !== null && sitemap.length > 0) {
-    logger.debug("Using sitemap of length " + sitemap.length, {
-      sitemapLength: sitemap.length,
-    });
-    let jobPriority = 20;
-    // If it is over 1000, we need to get the job priority,
-    // otherwise we can use the default priority of 20
-    if (sitemap.length > 1000) {
-      // set base to 21
-      jobPriority = await getJobPriority({
-        plan: req.auth.plan,
-        team_id: req.auth.team_id,
-        basePriority: 21,
-      });
-    }
-    logger.debug("Using job priority " + jobPriority, { jobPriority });
-
-    const jobs = sitemap.map((x) => {
-      const url = x.url;
-      const uuid = uuidv4();
-      return {
-        name: uuid,
-        data: {
-          url,
-          mode: "single_urls" as const,
-          team_id: req.auth.team_id,
-          plan: req.auth.plan!,
-          crawlerOptions,
-          scrapeOptions,
-          internalOptions: sc.internalOptions,
-          origin: "api",
-          crawl_id: id,
-          sitemapped: true,
-          webhook: req.body.webhook,
-          v1: true,
-        },
-        opts: {
-          jobId: uuid,
-          priority: 20,
-        },
-      };
-    });
-
-    logger.debug("Locking URLs...");
-    await lockURLs(
-      id,
-      sc,
-      jobs.map((x) => x.data.url),
-    );
-    logger.debug("Adding scrape jobs to Redis...");
-    await addCrawlJobs(
-      id,
-      jobs.map((x) => x.opts.jobId),
-    );
-    logger.debug("Adding scrape jobs to BullMQ...");
-    await addScrapeJobs(jobs);
-  } else {
-    logger.debug("Sitemap not found or ignored.", {
-      ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
-    });
-
-    logger.debug("Locking URL...");
-    await lockURL(id, sc, req.body.url);
-    const jobId = uuidv4();
-    logger.debug("Adding scrape job to Redis...", { jobId });
-    await addScrapeJob(
-      {
-        url: req.body.url,
-        mode: "single_urls",
-        team_id: req.auth.team_id,
-        crawlerOptions,
-        scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
-        internalOptions: sc.internalOptions,
-        plan: req.auth.plan!,
-        origin: "api",
-        crawl_id: id,
-        webhook: req.body.webhook,
-        v1: true,
-      },
-      {
-        priority: 15,
-      },
-      jobId,
-    );
-    logger.debug("Adding scrape job to BullMQ...", { jobId });
-    await addCrawlJob(id, jobId);
-  }
-
-  logger.debug("Done queueing jobs!");
-
-  if (req.body.webhook) {
-    logger.debug("Calling webhook with crawl.started...", {
-      webhook: req.body.webhook,
-    });
-    await callWebhook(
-      req.auth.team_id,
-      id,
-      null,
-      req.body.webhook,
-      true,
-      "crawl.started",
-    );
-  }
+  await _addScrapeJobToBullMQ({
+    url: req.body.url,
+    mode: "kickoff" as const,
+    team_id: req.auth.team_id,
+    plan: req.auth.plan,
+    crawlerOptions,
+    scrapeOptions: sc.scrapeOptions,
+    internalOptions: sc.internalOptions,
+    origin: "api",
+    crawl_id: id,
+    webhook: req.body.webhook,
+    v1: true,
+  }, {}, crypto.randomUUID(), 10);

   const protocol = process.env.ENV === "local" ? req.protocol : "https";

   return res.status(200).json({

View File

@@ -86,11 +86,12 @@ export async function getMapResults({
   // If sitemapOnly is true, only get links from sitemap
   if (crawlerOptions.sitemapOnly) {
-    const sitemap = await crawler.tryGetSitemap(true, true);
-    if (sitemap !== null) {
-      sitemap.forEach((x) => {
-        links.push(x.url);
+    const sitemap = await crawler.tryGetSitemap(urls => {
+      urls.forEach((x) => {
+        links.push(x);
       });
+    }, true, true);
+    if (sitemap > 0) {
       links = links
         .slice(1)
         .map((x) => {
@@ -143,8 +144,10 @@ export async function getMapResults({
     }

     // Parallelize sitemap fetch with serper search
-    const [sitemap, ...searchResults] = await Promise.all([
-      ignoreSitemap ? null : crawler.tryGetSitemap(true),
+    const [_, ...searchResults] = await Promise.all([
+      ignoreSitemap ? null : crawler.tryGetSitemap(urls => {
+        links.push(...urls);
+      }, true),
       ...(cachedResult ? [] : pagePromises),
     ]);
@@ -152,12 +155,6 @@ export async function getMapResults({
       allResults = searchResults;
     }

-    if (sitemap !== null) {
-      sitemap.forEach((x) => {
-        links.push(x.url);
-      });
-    }
-
     mapResults = allResults
       .flat()
       .filter((result) => result !== null && result !== undefined);

View File

@@ -17,7 +17,7 @@ export function withAuth<T, U extends any[]>(
       logger.warn("You're bypassing authentication");
       warningCount++;
     }
-    return { success: true } as T;
+    return { success: true, ...(mockSuccess || {}) } as T;
   } else {
     return await originalFunction(...args);
   }

View File

@@ -4,9 +4,10 @@ import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
-import { axiosTimeout } from "../../../src/lib/timeout";
-import { logger as _logger } from "../../../src/lib/logger";
+import { axiosTimeout } from "../../lib/timeout";
+import { logger as _logger } from "../../lib/logger";
 import https from "https";
+import { redisConnection } from "../../services/queue-service";

 export class WebCrawler {
   private jobId: string;
   private initialUrl: string;
@@ -198,26 +199,60 @@ export class WebCrawler {
   }

   public async tryGetSitemap(
+    urlsHandler: (urls: string[]) => unknown,
     fromMap: boolean = false,
     onlySitemap: boolean = false,
-  ): Promise<{ url: string; html: string }[] | null> {
+  ): Promise<number> {
     this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
       method: "tryGetSitemap",
     });
-    const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
-    if (fromMap && onlySitemap) {
-      return sitemapLinks.map((link) => ({ url: link, html: "" }));
-    }
-    if (sitemapLinks.length > 0) {
-      let filteredLinks = this.filterLinks(
-        [...new Set(sitemapLinks)],
-        this.limit,
-        this.maxCrawledDepth,
-        fromMap,
-      );
-      return filteredLinks.map((link) => ({ url: link, html: "" }));
-    }
-    return null;
+
+    let leftOfLimit = this.limit;
+
+    const normalizeUrl = (url: string) => {
+      url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
+      if (url.endsWith("/")) {
+        url = url.slice(0, -1);
+      }
+      return url;
+    };
+
+    const _urlsHandler = async (urls: string[]) => {
+      let uniqueURLs: string[] = [];
+      for (const url of urls) {
+        if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(url))) {
+          uniqueURLs.push(url);
+        }
+      }
+
+      await redisConnection.expire("sitemap:" + this.jobId + ":links", 3600, "NX");
+      if (uniqueURLs.length > 0) {
+        urlsHandler(uniqueURLs);
+      }
+    };
+
+    let count = await this.tryFetchSitemapLinks(this.initialUrl, (urls: string[]) => {
+      if (fromMap && onlySitemap) {
+        return urlsHandler(urls);
+      } else {
+        let filteredLinks = this.filterLinks(
+          [...new Set(urls)],
+          leftOfLimit,
+          this.maxCrawledDepth,
+          fromMap,
+        );
+        leftOfLimit -= filteredLinks.length;
+        return _urlsHandler(filteredLinks);
+      }
+    });
+
+    if (count > 0) {
+      if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(this.initialUrl))) {
+        urlsHandler([this.initialUrl]);
+      }
+      count++;
+    }
+
+    return count;
   }

   public filterURL(href: string, url: string): string | null {
@@ -436,54 +471,74 @@ export class WebCrawler {
     return socialMediaOrEmail.some((ext) => url.includes(ext));
   }

-  private async tryFetchSitemapLinks(url: string): Promise<string[]> {
-    const normalizeUrl = (url: string) => {
-      url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
-      if (url.endsWith("/")) {
-        url = url.slice(0, -1);
-      }
-      return url;
-    };
-
+  private async tryFetchSitemapLinks(url: string, urlsHandler: (urls: string[]) => unknown): Promise<number> {
     const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`;

-    let sitemapLinks: string[] = [];
+    let sitemapCount: number = 0;

+    // Try to get sitemap from the provided URL first
     try {
-      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
-      if (response.status === 200) {
-        sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger);
-      }
+      sitemapCount = await getLinksFromSitemap(
+        { sitemapUrl, urlsHandler, mode: "fire-engine" },
+        this.logger,
+      );
     } catch (error) {
       this.logger.debug(
-        `Failed to fetch sitemap with axios from ${sitemapUrl}`,
+        `Failed to fetch sitemap from ${sitemapUrl}`,
         { method: "tryFetchSitemapLinks", sitemapUrl, error },
       );
-      if (error instanceof AxiosError && error.response?.status === 404) {
-        // ignore 404
-      } else {
-        const response = await getLinksFromSitemap(
-          { sitemapUrl, mode: "fire-engine" },
-          this.logger,
-        );
-        if (response) {
-          sitemapLinks = response;
-        }
-      }
     }

-    if (sitemapLinks.length === 0) {
-      const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
-      try {
-        const response = await axios.get(baseUrlSitemap, {
-          timeout: axiosTimeout,
-        });
-        if (response.status === 200) {
-          sitemapLinks = await getLinksFromSitemap(
-            { sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
-            this.logger,
-          );
-        }
+    // If this is a subdomain, also try to get sitemap from the main domain
+    try {
+      const urlObj = new URL(url);
+      const hostname = urlObj.hostname;
+      const domainParts = hostname.split('.');
+
+      // Check if this is a subdomain (has more than 2 parts and not www)
+      if (domainParts.length > 2 && domainParts[0] !== 'www') {
+        // Get the main domain by taking the last two parts
+        const mainDomain = domainParts.slice(-2).join('.');
+        const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
+        const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;
+
+        try {
+          // Get all links from the main domain's sitemap
+          sitemapCount += await getLinksFromSitemap(
+            { sitemapUrl: mainDomainSitemapUrl, urlsHandler(urls) {
+              urlsHandler(urls.filter(link => {
+                try {
+                  const linkUrl = new URL(link);
+                  return linkUrl.hostname.endsWith(hostname);
+                } catch {
+                }
+              }))
+            }, mode: "fire-engine" },
+            this.logger,
+          );
+        } catch (error) {
+          this.logger.debug(
+            `Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
+            { method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
+          );
+        }
+      }
+    } catch (error) {
+      this.logger.debug(`Error processing main domain sitemap`, {
+        method: "tryFetchSitemapLinks",
+        url,
+        error,
+      });
+    }
+
+    // If no sitemap found yet, try the baseUrl as a last resort
+    if (sitemapCount === 0) {
+      const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
+      try {
+        sitemapCount += await getLinksFromSitemap(
+          { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
+          this.logger,
+        );
       } catch (error) {
         this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
           method: "tryFetchSitemapLinks",
@@ -493,25 +548,14 @@ export class WebCrawler {
         if (error instanceof AxiosError && error.response?.status === 404) {
           // ignore 404
         } else {
-          sitemapLinks = await getLinksFromSitemap(
-            { sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
+          sitemapCount += await getLinksFromSitemap(
+            { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
             this.logger,
           );
         }
       }
     }

-    const normalizedUrl = normalizeUrl(url);
-    const normalizedSitemapLinks = sitemapLinks.map((link) =>
-      normalizeUrl(link),
-    );
-    // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
-    if (
-      !normalizedSitemapLinks.includes(normalizedUrl) &&
-      sitemapLinks.length > 0
-    ) {
-      sitemapLinks.push(url);
-    }
-    return sitemapLinks;
+    return sitemapCount;
   }
 }

View File

@@ -5,26 +5,25 @@ import { WebCrawler } from "./crawler";
 import { scrapeURL } from "../scrapeURL";
 import { scrapeOptions } from "../../controllers/v1/types";
 import type { Logger } from "winston";

+const useFireEngine =
+  process.env.FIRE_ENGINE_BETA_URL !== "" &&
+  process.env.FIRE_ENGINE_BETA_URL !== undefined;
+
 export async function getLinksFromSitemap(
   {
     sitemapUrl,
-    allUrls = [],
+    urlsHandler,
     mode = "axios",
   }: {
     sitemapUrl: string;
-    allUrls?: string[];
+    urlsHandler(urls: string[]): unknown,
     mode?: "axios" | "fire-engine";
   },
   logger: Logger,
-): Promise<string[]> {
+): Promise<number> {
   try {
     let content: string = "";
     try {
-      if (mode === "axios" || process.env.FIRE_ENGINE_BETA_URL === "") {
-        const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
-        content = response.data;
-      } else if (mode === "fire-engine") {
+      if (mode === "fire-engine" && useFireEngine) {
         const response = await scrapeURL(
           "sitemap",
           sitemapUrl,
@@ -32,9 +31,15 @@ export async function getLinksFromSitemap(
           { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
         );
         if (!response.success) {
-          throw response.error;
+          logger.debug("Failed to scrape sitemap via TLSClient, falling back to axios...", { error: response.error })
+          const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout });
+          content = ar.data;
+        } else {
+          content = response.document.rawHtml!;
         }
-        content = response.document.rawHtml!;
+      } else {
+        const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
+        content = response.data;
       }
     } catch (error) {
       logger.error(`Request failed for ${sitemapUrl}`, {
@@ -44,33 +49,64 @@ export async function getLinksFromSitemap(
        error,
      });

-      return allUrls;
+      return 0;
    }

     const parsed = await parseStringPromise(content);
     const root = parsed.urlset || parsed.sitemapindex;

+    let count = 0;
+
     if (root && root.sitemap) {
-      const sitemapPromises = root.sitemap
+      // Handle sitemap index files
+      const sitemapUrls = root.sitemap
         .filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
-        .map((sitemap) =>
-          getLinksFromSitemap(
-            { sitemapUrl: sitemap.loc[0], allUrls, mode },
-            logger,
-          ),
-        );
-      await Promise.all(sitemapPromises);
+        .map((sitemap) => sitemap.loc[0]);
+
+      const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
+        getLinksFromSitemap(
+          { sitemapUrl, urlsHandler, mode },
+          logger,
+        ),
+      );
+
+      const results = await Promise.all(sitemapPromises);
+      count = results.reduce((a,x) => a + x)
     } else if (root && root.url) {
+      // Check if any URLs point to additional sitemaps
+      const xmlSitemaps: string[] = root.url
+        .filter(
+          (url) =>
+            url.loc &&
+            url.loc.length > 0 &&
+            url.loc[0].toLowerCase().endsWith('.xml')
+        )
+        .map((url) => url.loc[0]);
+
+      if (xmlSitemaps.length > 0) {
+        // Recursively fetch links from additional sitemaps
+        const sitemapPromises = xmlSitemaps.map((sitemapUrl) =>
+          getLinksFromSitemap(
+            { sitemapUrl: sitemapUrl, urlsHandler, mode },
+            logger,
+          ),
+        );
+        count += (await Promise.all(sitemapPromises)).reduce((a,x) => a + x, 0);
+      }
+
       const validUrls = root.url
         .filter(
           (url) =>
             url.loc &&
             url.loc.length > 0 &&
+            !url.loc[0].toLowerCase().endsWith('.xml') &&
             !WebCrawler.prototype.isFile(url.loc[0]),
         )
         .map((url) => url.loc[0]);
-      allUrls.push(...validUrls);
+      count += validUrls.length;
+      urlsHandler(validUrls);
     }
+
+    return count;
   } catch (error) {
     logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, {
       method: "getLinksFromSitemap",
@@ -80,7 +116,7 @@ export async function getLinksFromSitemap(
     });
   }

-  return allUrls;
+  return 0;
 }

 export const fetchSitemapData = async (

View File

@@ -17,6 +17,7 @@ import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError
 import * as Sentry from "@sentry/node";
 import { Action } from "../../../../lib/entities";
 import { specialtyScrapeCheck } from "../utils/specialtyHandler";
+import { fireEngineDelete } from "./delete";

 // This function does not take `Meta` on purpose. It may not access any
 // meta values to construct the request -- that must be done by the
@@ -44,6 +45,13 @@ async function performFireEngineScrape<
   while (status === undefined) {
     if (errors.length >= errorLimit) {
       logger.error("Error limit hit.", { errors });
+      fireEngineDelete(
+        logger.child({
+          method: "performFireEngineScrape/fireEngineDelete",
+          afterErrors: errors,
+        }),
+        scrape.jobId,
+      );
       throw new Error("Error limit hit. See e.cause.errors for errors.", {
         cause: { errors },
       });
@@ -74,6 +82,13 @@ async function performFireEngineScrape<
       error instanceof ActionError ||
       error instanceof UnsupportedFileError
     ) {
+      fireEngineDelete(
+        logger.child({
+          method: "performFireEngineScrape/fireEngineDelete",
+          afterError: error,
+        }),
+        scrape.jobId,
+      );
       logger.debug("Fire-engine scrape job failed.", {
         error,
         jobId: scrape.jobId,
@@ -105,6 +120,13 @@ async function performFireEngineScrape<
       status.content = Buffer.from(content, "base64").toString("utf8"); // TODO: handle other encodings via Content-Type tag
     }

+    fireEngineDelete(
+      logger.child({
+        method: "performFireEngineScrape/fireEngineDelete",
+      }),
+      scrape.jobId,
+    );
+
     return status;
   }

View File

@@ -8,6 +8,7 @@ export function extractMetadata(
 ): Partial<Document["metadata"]> {
   let title: string | undefined = undefined;
   let description: string | undefined = undefined;
+  let favicon: string | undefined = undefined;
   let language: string | undefined = undefined;
   let keywords: string | undefined = undefined;
   let robots: string | undefined = undefined;
@@ -42,6 +43,12 @@ export function extractMetadata(
   try {
     title = soup("title").first().text().trim() || undefined;
     description = soup('meta[name="description"]').attr("content") || undefined;

+    const faviconLink = soup('link[rel="icon"]').attr("href") || soup('link[rel*="icon"]').first().attr("href") || undefined;
+    if (faviconLink) {
+      const baseUrl = new URL(meta.url).origin;
+      favicon = faviconLink.startsWith('http') ? faviconLink : `${baseUrl}${faviconLink}`;
+    }
+
     // Assuming the language is part of the URL as per the regex pattern
     language = soup("html").attr("lang") || undefined;
@@ -121,6 +128,7 @@ export function extractMetadata(
   return {
     title,
     description,
+    favicon,
     language,
     keywords,
     robots,

View File

@@ -0,0 +1,33 @@
import { removeDefaultProperty } from "./llmExtract";

describe("removeDefaultProperty", () => {
  it("should remove the default property from a simple object", () => {
    const input = { default: "test", test: "test" };
    const expectedOutput = { test: "test" };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should remove the default property from a nested object", () => {
    const input = { default: "test", nested: { default: "nestedTest", test: "nestedTest" } };
    const expectedOutput = { nested: { test: "nestedTest" } };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should remove the default property from an array of objects", () => {
    const input = { array: [{ default: "test1", test: "test1" }, { default: "test2", test: "test2" }] };
    const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should handle objects without a default property", () => {
    const input = { test: "test" };
    const expectedOutput = { test: "test" };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should handle null and non-object inputs", () => {
    expect(removeDefaultProperty(null)).toBeNull();
    expect(removeDefaultProperty("string")).toBe("string");
    expect(removeDefaultProperty(123)).toBe(123);
  });
});

View File

@@ -121,6 +121,10 @@ export async function generateOpenAICompletions(
   }

   let schema = options.schema;
+  if (schema) {
+    schema = removeDefaultProperty(schema);
+  }
+
   if (schema && schema.type === "array") {
     schema = {
       type: "object",
@@ -134,10 +138,12 @@ export async function generateOpenAICompletions(
     schema = {
       type: "object",
       properties: Object.fromEntries(
-        Object.entries(schema).map(([key, value]) => [key, { type: value }]),
+        Object.entries(schema).map(([key, value]) => {
+          return [key, removeDefaultProperty(value)];
+        })
       ),
       required: Object.keys(schema),
-      additionalProperties: false,
+      additionalProperties: false
     };
   }
@@ -232,3 +238,19 @@ export async function performLLMExtract(
   return document;
 }

+export function removeDefaultProperty(schema: any): any {
+  if (typeof schema !== 'object' || schema === null) return schema;
+
+  const { default: _, ...rest } = schema;
+
+  for (const key in rest) {
+    if (Array.isArray(rest[key])) {
+      rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));
+    } else if (typeof rest[key] === 'object' && rest[key] !== null) {
+      rest[key] = removeDefaultProperty(rest[key]);
+    }
+  }
+
+  return rest;
+}

View File

@@ -29,7 +29,7 @@ async function _addScrapeJobToConcurrencyQueue(
   });
 }

-async function _addScrapeJobToBullMQ(
+export async function _addScrapeJobToBullMQ(
   webScraperOptions: any,
   options: any,
   jobId: string,
@@ -138,7 +138,6 @@ export async function addScrapeJobs(
   if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) {
     const now = Date.now();
     const limit = await getConcurrencyLimitMax(jobs[0].data.plan);
-    console.log("CC limit", limit);
     cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now);
     countCanBeDirectlyAdded = Math.max(

View File

@@ -18,16 +18,18 @@ import { v4 as uuidv4 } from "uuid";
 import {
   addCrawlJob,
   addCrawlJobDone,
+  addCrawlJobs,
   crawlToCrawler,
   finishCrawl,
   generateURLPermutations,
   getCrawl,
   getCrawlJobs,
   lockURL,
+  lockURLs,
   normalizeURL,
 } from "../lib/crawl-redis";
 import { StoredCrawl } from "../lib/crawl-redis";
-import { addScrapeJob } from "./queue-jobs";
+import { addScrapeJob, addScrapeJobs } from "./queue-jobs";
 import {
   addJobPriority,
   deleteJobPriority,
@@ -191,22 +193,34 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
   await addJobPriority(job.data.team_id, job.id);
   let err = null;
   try {
-    const result = await processJob(job, token);
-    if (result.success) {
-      try {
-        if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
-          logger.debug(
-            "Job succeeded -- has crawl associated, putting null in Redis",
-          );
-          await job.moveToCompleted(null, token, false);
-        } else {
-          logger.debug("Job succeeded -- putting result in Redis");
-          await job.moveToCompleted(result.document, token, false);
-        }
-      } catch (e) {}
-    } else {
-      logger.debug("Job failed", { result });
-      await job.moveToFailed((result as any).error, token, false);
+    if (job.data?.mode === "kickoff") {
+      const result = await processKickoffJob(job, token);
+      if (result.success) {
+        try {
+          await job.moveToCompleted(null, token, false);
+        } catch (e) {}
+      } else {
+        logger.debug("Job failed", { result, mode: job.data.mode });
+        await job.moveToFailed((result as any).error, token, false);
+      }
+    } else {
+      const result = await processJob(job, token);
+      if (result.success) {
+        try {
+          if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
+            logger.debug(
+              "Job succeeded -- has crawl associated, putting null in Redis",
+            );
+            await job.moveToCompleted(null, token, false);
+          } else {
+            logger.debug("Job succeeded -- putting result in Redis");
+            await job.moveToCompleted(result.document, token, false);
+          }
+        } catch (e) {}
+      } else {
+        logger.debug("Job failed", { result });
+        await job.moveToFailed((result as any).error, token, false);
+      }
     }
   } catch (error) {
     logger.debug("Job failed", { error });
@@ -379,6 +393,130 @@ const workerFun = async (
 workerFun(getScrapeQueue(), processJobInternal);

+async function processKickoffJob(job: Job & { id: string }, token: string) {
+  const logger = _logger.child({
+    module: "queue-worker",
+    method: "processKickoffJob",
+    jobId: job.id,
+    scrapeId: job.id,
+    crawlId: job.data?.crawl_id ?? undefined,
+    teamId: job.data?.team_id ?? undefined,
+  });
+
+  try {
+    const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
+    const crawler = crawlToCrawler(job.data.crawl_id, sc);
+
+    const sitemap = sc.crawlerOptions.ignoreSitemap
+      ? 0
+      : await crawler.tryGetSitemap(async urls => {
+          if (urls.length === 0) return;
+
+          logger.debug("Using sitemap chunk of length " + urls.length, {
+            sitemapLength: urls.length,
+          });
+
+          let jobPriority = await getJobPriority({
+            plan: job.data.plan,
+            team_id: job.data.team_id,
+            basePriority: 21,
+          });
+          logger.debug("Using job priority " + jobPriority, { jobPriority });
+
+          const jobs = urls.map(url => {
+            const uuid = uuidv4();
+            return {
+              name: uuid,
+              data: {
+                url,
+                mode: "single_urls" as const,
+                team_id: job.data.team_id,
+                plan: job.data.plan!,
+                crawlerOptions: job.data.crawlerOptions,
+                scrapeOptions: job.data.scrapeOptions,
+                internalOptions: sc.internalOptions,
+                origin: job.data.origin,
+                crawl_id: job.data.crawl_id,
+                sitemapped: true,
+                webhook: job.data.webhook,
+                v1: job.data.v1,
+              },
+              opts: {
+                jobId: uuid,
+                priority: 20,
+              },
+            };
+          });
+
+          logger.debug("Locking URLs...");
+          await lockURLs(
+            job.data.crawl_id,
+            sc,
+            jobs.map((x) => x.data.url),
+          );
+          logger.debug("Adding scrape jobs to Redis...");
+          await addCrawlJobs(
+            job.data.crawl_id,
+            jobs.map((x) => x.opts.jobId),
+          );
+          logger.debug("Adding scrape jobs to BullMQ...");
+          await addScrapeJobs(jobs);
+        });
+
+    if (sitemap === 0) {
+      logger.debug("Sitemap not found or ignored.", {
+        ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
+      });
+
+      logger.debug("Locking URL...");
+      await lockURL(job.data.crawl_id, sc, job.data.url);
+      const jobId = uuidv4();
+      logger.debug("Adding scrape job to Redis...", { jobId });
+      await addScrapeJob(
+        {
+          url: job.data.url,
+          mode: "single_urls",
+          team_id: job.data.team_id,
+          crawlerOptions: job.data.crawlerOptions,
+          scrapeOptions: scrapeOptions.parse(job.data.scrapeOptions),
+          internalOptions: sc.internalOptions,
+          plan: job.data.plan!,
+          origin: job.data.origin,
+          crawl_id: job.data.crawl_id,
+          webhook: job.data.webhook,
+          v1: job.data.v1,
+        },
+        {
+          priority: 15,
+        },
+        jobId,
+      );
+      logger.debug("Adding scrape job to BullMQ...", { jobId });
+      await addCrawlJob(job.data.crawl_id, jobId);
+    }
+    logger.debug("Done queueing jobs!");
+
+    if (job.data.webhook) {
+      logger.debug("Calling webhook with crawl.started...", {
+        webhook: job.data.webhook,
+      });
+      await callWebhook(
+        job.data.team_id,
+        job.data.crawl_id,
+        null,
+        job.data.webhook,
+        true,
+        "crawl.started",
+      );
+    }
+
+    return { success: true }
+  } catch (error) {
+    logger.error("An error occurred!", { error })
+    return { success: false, error };
+  }
+}
+
 async function processJob(job: Job & { id: string }, token: string) {
   const logger = _logger.child({
     module: "queue-worker",

File diff suppressed because it is too large

View File

@@ -13,7 +13,7 @@ import os

 from .firecrawl import FirecrawlApp  # noqa

-__version__ = "1.6.8"
+__version__ = "1.7.0"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -8,7 +8,7 @@ from datetime import datetime

 load_dotenv()

-API_URL = "http://127.0.0.1:3002";
+API_URL = os.getenv('API_URL', 'http://127.0.0.1:3002')
 ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
 TEST_API_KEY = os.getenv('TEST_API_KEY')
@@ -20,15 +20,26 @@ spec.loader.exec_module(firecrawl)
 FirecrawlApp = firecrawl.FirecrawlApp

 def test_no_api_key():
-    with pytest.raises(Exception) as excinfo:
-        invalid_app = FirecrawlApp(api_url=API_URL)
-    assert "No API key provided" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        with pytest.raises(Exception) as excinfo:
+            invalid_app = FirecrawlApp(api_url=API_URL)
+        assert "No API key provided" in str(excinfo.value)
+    else:
+        # Should not raise error for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        assert app is not None

 def test_scrape_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Unauthorized: Invalid token" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+        with pytest.raises(Exception) as excinfo:
+            invalid_app.scrape_url('https://firecrawl.dev')
+        assert "Unauthorized: Invalid token" in str(excinfo.value)
+    else:
+        # Should work without API key for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        response = app.scrape_url('https://firecrawl.dev')
+        assert response is not None

 # def test_blocklisted_url():
 #     blocklisted_url = "https://facebook.com/fake-test"
@@ -131,10 +142,16 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext
     assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']

 def test_crawl_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unauthorized: Invalid token" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+        with pytest.raises(Exception) as excinfo:
+            invalid_app.crawl_url('https://firecrawl.dev')
+        assert "Unauthorized: Invalid token" in str(excinfo.value)
+    else:
+        # Should work without API key for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        response = app.crawl_url('https://firecrawl.dev')
+        assert response is not None

 # def test_should_return_error_for_blocklisted_url():
 #     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -291,10 +308,16 @@ def test_check_crawl_status_e2e():
     assert 'error' not in status_response['data'][0]['metadata']

 def test_invalid_api_key_on_map():
-    invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.map_url('https://roastmywebsite.ai')
-    assert "Unauthorized: Invalid token" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
+        with pytest.raises(Exception) as excinfo:
+            invalid_app.map_url('https://roastmywebsite.ai')
+        assert "Unauthorized: Invalid token" in str(excinfo.value)
+    else:
+        # Should work without API key for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        response = app.map_url('https://roastmywebsite.ai')
+        assert response is not None

 # def test_blocklisted_url_on_map():
 #     app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
@@ -349,4 +372,3 @@ def test_search_e2e():
 #     assert isinstance(llm_extraction['is_open_source'], bool)

View File

@@ -40,19 +40,22 @@ class FirecrawlApp:
         error: Optional[str] = None

     def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         """
         Initialize the FirecrawlApp instance with API key, API URL.

         Args:
             api_key (Optional[str]): API key for authenticating with the Firecrawl API.
             api_url (Optional[str]): Base URL for the Firecrawl API.
         """
         self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
         self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
-        if self.api_key is None:
-            logger.warning("No API key provided")
-            raise ValueError('No API key provided')
-        logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key}")
+
+        # Only require API key when using cloud service
+        if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
+            logger.warning("No API key provided for cloud service")
+            raise ValueError('No API key provided')
+
+        logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")

     def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
         """

View File

@@ -12,7 +12,8 @@ dependencies = [
     "requests",
     "python-dotenv",
     "websockets",
-    "nest-asyncio"
+    "nest-asyncio",
+    "pydantic>=2.10.3",
 ]

 authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
 maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}]

View File

@@ -2,4 +2,5 @@ requests
 pytest
 python-dotenv
 websockets
 nest-asyncio
+pydantic

View File

@@ -9,7 +9,7 @@ use crate::crawl::CrawlStatus;
 #[derive(Debug, Deserialize, Serialize, Clone)]
 pub struct FirecrawlAPIError {
     /// Always false.
-    success: bool,
+    pub success: bool,

     /// Error message
     pub error: String,

View File

@@ -9,6 +9,7 @@ pub mod map;
 pub mod scrape;

 pub use error::FirecrawlError;
+use error::FirecrawlAPIError;

 #[derive(Clone, Debug)]
 pub struct FirecrawlApp {
@@ -18,16 +19,30 @@ pub struct FirecrawlApp {
 }

 pub(crate) const API_VERSION: &str = "/v1";
+const CLOUD_API_URL: &str = "https://api.firecrawl.dev";

 impl FirecrawlApp {
     pub fn new(api_key: impl AsRef<str>) -> Result<Self, FirecrawlError> {
-        FirecrawlApp::new_selfhosted("https://api.firecrawl.dev", Some(api_key))
+        FirecrawlApp::new_selfhosted(CLOUD_API_URL, Some(api_key))
     }

     pub fn new_selfhosted(api_url: impl AsRef<str>, api_key: Option<impl AsRef<str>>) -> Result<Self, FirecrawlError> {
+        let url = api_url.as_ref().to_string();
+
+        if url == CLOUD_API_URL && api_key.is_none() {
+            return Err(FirecrawlError::APIError(
+                "Configuration".to_string(),
+                FirecrawlAPIError {
+                    success: false,
+                    error: "API key is required for cloud service".to_string(),
+                    details: None,
+                }
+            ));
+        }
+
         Ok(FirecrawlApp {
             api_key: api_key.map(|x| x.as_ref().to_string()),
-            api_url: api_url.as_ref().to_string(),
+            api_url: url,
             client: Client::new(),
         })
     }

View File

@@ -1,7 +1,7 @@
 use assert_matches::assert_matches;
 use dotenvy::dotenv;
 use firecrawl::scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions};
-use firecrawl::FirecrawlApp;
+use firecrawl::{FirecrawlApp, FirecrawlError};
 use serde_json::json;
 use std::env;
@@ -155,3 +155,29 @@ async fn test_llm_extraction() {
     assert!(llm_extraction["supports_sso"].is_boolean());
     assert!(llm_extraction["is_open_source"].is_boolean());
 }
+
+#[test]
+fn test_api_key_requirements() {
+    dotenv().ok();
+
+    let api_url = env::var("API_URL").unwrap_or("http://localhost:3002".to_string());
+    let api_key = env::var("TEST_API_KEY").ok();
+
+    match (api_url.contains("api.firecrawl.dev"), api_key) {
+        (false, _) => {
+            let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);
+            assert!(result.is_ok(), "Local setup failed: {:?}", result.err().unwrap());
+        }
+        (true, None) => {
+            let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);
+            assert!(matches!(
+                result,
+                Err(FirecrawlError::APIError(msg, _)) if msg == "Configuration"
+            ));
+        }
+        (true, Some(key)) => {
+            let result = FirecrawlApp::new_selfhosted(&api_url, Some(&key));
+            assert!(result.is_ok());
+        }
+    }
+}