mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-14 06:05:51 +08:00)

Merge branch 'main' into mog/mineru

Commit 5fcf3fa97e
@@ -116,6 +116,10 @@ If you’d like to test the crawl endpoint, you can run this:

This section provides solutions to common issues you might encounter while setting up or running your self-hosted instance of Firecrawl.

### API Keys for SDK Usage

**Note:** When using Firecrawl SDKs with a self-hosted instance, API keys are optional. API keys are only required when connecting to the cloud service (api.firecrawl.dev).
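For example, a minimal sketch of pointing the Python SDK at a self-hosted instance (the base URL below is an assumption; replace it with your own deployment):

```python
from firecrawl import FirecrawlApp

# Self-hosted instance: no API key needed, just pass your instance URL
# (http://localhost:3002 is an assumed local deployment).
app = FirecrawlApp(api_url="http://localhost:3002")

# Cloud service (api.firecrawl.dev): an API key is required.
# app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

result = app.scrape_url("https://firecrawl.dev")
```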
### Supabase client is not configured

**Symptom:**
@@ -70,8 +70,8 @@ content-type: application/json
"urls": ["firecrawl.dev"],
"prompt": "What is the title, description and main product of the page?",
"schema": {
"title": "string",
"description": "string",
"mainProduct": "string"
"title": { "type": "string" },
"description": { "type": "string" },
"mainProduct": { "type": "string" }
}
}
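The corrected request above uses JSON Schema type objects for each field. As a hedged sketch (assuming this snippet documents the v1 extract endpoint and a self-hosted base URL), the same body could be sent from Python like this:

```python
import requests

API_URL = "http://localhost:3002"  # assumed self-hosted deployment

payload = {
    "urls": ["firecrawl.dev"],
    "prompt": "What is the title, description and main product of the page?",
    "schema": {
        "title": {"type": "string"},
        "description": {"type": "string"},
        "mainProduct": {"type": "string"},
    },
}

# Against the cloud API, also pass headers={"Authorization": "Bearer fc-YOUR-API-KEY"}.
response = requests.post(f"{API_URL}/v1/extract", json=payload, timeout=60)
print(response.json())
```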
@@ -177,20 +177,13 @@ export async function crawlController(req: Request, res: Response) {

await saveCrawl(id, sc);

const sitemap = sc.crawlerOptions?.ignoreSitemap
? null
: await crawler.tryGetSitemap();
const sitemap = sc.crawlerOptions.ignoreSitemap
? 0
: await crawler.tryGetSitemap(async urls => {
if (urls.length === 0) return;

if (sitemap !== null && sitemap.length > 0) {
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if (sitemap.length > 1000) {
// set base to 21
jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
}
const jobs = sitemap.map((x) => {
const url = x.url;
let jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
const jobs = urls.map(url => {
const uuid = uuidv4();
return {
name: uuid,

@@ -226,7 +219,9 @@ export async function crawlController(req: Request, res: Response) {
// add with sentry instrumentation
await addScrapeJob(job.data as any, {}, job.opts.jobId);
}
} else {
});

if (sitemap === 0) {
await lockURL(id, sc, url);

// Not needed, first one should be 15.
@@ -113,11 +113,9 @@ export async function crawlPreviewController(req: Request, res: Response) {
const crawler = crawlToCrawler(id, sc);

const sitemap = sc.crawlerOptions?.ignoreSitemap
? null
: await crawler.tryGetSitemap();

if (sitemap !== null) {
for (const url of sitemap.map((x) => x.url)) {
? 0
: await crawler.tryGetSitemap(async urls => {
for (const url of urls) {
await lockURL(id, sc, url);
const jobId = uuidv4();
await addScrapeJob(

@@ -138,7 +136,9 @@ export async function crawlPreviewController(req: Request, res: Response) {
);
await addCrawlJob(id, jobId);
}
} else {
});

if (sitemap === 0) {
await lockURL(id, sc, url);
const jobId = uuidv4();
await addScrapeJob(
@@ -115,7 +115,7 @@ export async function crawlStatusController(
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
sc.cancelled
? "cancelled"
: validJobStatuses.every((x) => x[1] === "completed")
: (validJobStatuses.every((x) => x[1] === "completed") && validJobStatuses.length > 0)
? "completed"
: "scraping";
@@ -18,7 +18,7 @@ import {
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
import { _addScrapeJobToBullMQ, addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
import { logger as _logger } from "../../lib/logger";
import { getJobPriority } from "../../lib/job-priority";
import { callWebhook } from "../../services/webhook";

@@ -111,112 +111,19 @@ export async function crawlController(
await saveCrawl(id, sc);

const sitemap = sc.crawlerOptions.ignoreSitemap
? null
: await crawler.tryGetSitemap();

if (sitemap !== null && sitemap.length > 0) {
logger.debug("Using sitemap of length " + sitemap.length, {
sitemapLength: sitemap.length,
});
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if (sitemap.length > 1000) {
// set base to 21
jobPriority = await getJobPriority({
plan: req.auth.plan,
team_id: req.auth.team_id,
basePriority: 21,
});
}
logger.debug("Using job priority " + jobPriority, { jobPriority });

const jobs = sitemap.map((x) => {
const url = x.url;
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls" as const,
team_id: req.auth.team_id,
plan: req.auth.plan!,
crawlerOptions,
scrapeOptions,
internalOptions: sc.internalOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
webhook: req.body.webhook,
v1: true,
},
opts: {
jobId: uuid,
priority: 20,
},
};
});

logger.debug("Locking URLs...");
await lockURLs(
id,
sc,
jobs.map((x) => x.data.url),
);
logger.debug("Adding scrape jobs to Redis...");
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId),
);
logger.debug("Adding scrape jobs to BullMQ...");
await addScrapeJobs(jobs);
} else {
logger.debug("Sitemap not found or ignored.", {
ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
});

logger.debug("Locking URL...");
await lockURL(id, sc, req.body.url);
const jobId = uuidv4();
logger.debug("Adding scrape job to Redis...", { jobId });
await addScrapeJob(
{
await _addScrapeJobToBullMQ({
url: req.body.url,
mode: "single_urls",
mode: "kickoff" as const,
team_id: req.auth.team_id,
plan: req.auth.plan,
crawlerOptions,
scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
scrapeOptions: sc.scrapeOptions,
internalOptions: sc.internalOptions,
plan: req.auth.plan!,
origin: "api",
crawl_id: id,
webhook: req.body.webhook,
v1: true,
},
{
priority: 15,
},
jobId,
);
logger.debug("Adding scrape job to BullMQ...", { jobId });
await addCrawlJob(id, jobId);
}
logger.debug("Done queueing jobs!");

if (req.body.webhook) {
logger.debug("Calling webhook with crawl.started...", {
webhook: req.body.webhook,
});
await callWebhook(
req.auth.team_id,
id,
null,
req.body.webhook,
true,
"crawl.started",
);
}
}, {}, crypto.randomUUID(), 10);

const protocol = process.env.ENV === "local" ? req.protocol : "https";
@@ -86,11 +86,12 @@ export async function getMapResults({

// If sitemapOnly is true, only get links from sitemap
if (crawlerOptions.sitemapOnly) {
const sitemap = await crawler.tryGetSitemap(true, true);
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
const sitemap = await crawler.tryGetSitemap(urls => {
urls.forEach((x) => {
links.push(x);
});
}, true, true);
if (sitemap > 0) {
links = links
.slice(1)
.map((x) => {

@@ -143,8 +144,10 @@ export async function getMapResults({
}

// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
ignoreSitemap ? null : crawler.tryGetSitemap(true),
const [_, ...searchResults] = await Promise.all([
ignoreSitemap ? null : crawler.tryGetSitemap(urls => {
links.push(...urls);
}, true),
...(cachedResult ? [] : pagePromises),
]);

@@ -152,12 +155,6 @@ export async function getMapResults({
allResults = searchResults;
}

if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
});
}

mapResults = allResults
.flat()
.filter((result) => result !== null && result !== undefined);
@@ -17,7 +17,7 @@ export function withAuth<T, U extends any[]>(
logger.warn("You're bypassing authentication");
warningCount++;
}
return { success: true } as T;
return { success: true, ...(mockSuccess || {}) } as T;
} else {
return await originalFunction(...args);
}
@@ -4,9 +4,10 @@ import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout";
import { logger as _logger } from "../../../src/lib/logger";
import { axiosTimeout } from "../../lib/timeout";
import { logger as _logger } from "../../lib/logger";
import https from "https";
import { redisConnection } from "../../services/queue-service";
export class WebCrawler {
private jobId: string;
private initialUrl: string;

@@ -198,26 +199,60 @@ export class WebCrawler {
}

public async tryGetSitemap(
urlsHandler: (urls: string[]) => unknown,
fromMap: boolean = false,
onlySitemap: boolean = false,
): Promise<{ url: string; html: string }[] | null> {
): Promise<number> {
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
method: "tryGetSitemap",
});
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (fromMap && onlySitemap) {
return sitemapLinks.map((link) => ({ url: link, html: "" }));
let leftOfLimit = this.limit;

const normalizeUrl = (url: string) => {
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
if (sitemapLinks.length > 0) {
return url;
};

const _urlsHandler = async (urls: string[]) => {
let uniqueURLs: string[] = [];
for (const url of urls) {
if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(url))) {
uniqueURLs.push(url);
}
}

await redisConnection.expire("sitemap:" + this.jobId + ":links", 3600, "NX");
if (uniqueURLs.length > 0) {
urlsHandler(uniqueURLs);
}
};

let count = await this.tryFetchSitemapLinks(this.initialUrl, (urls: string[]) => {
if (fromMap && onlySitemap) {
return urlsHandler(urls);
} else {
let filteredLinks = this.filterLinks(
[...new Set(sitemapLinks)],
this.limit,
[...new Set(urls)],
leftOfLimit,
this.maxCrawledDepth,
fromMap,
);
return filteredLinks.map((link) => ({ url: link, html: "" }));
leftOfLimit -= filteredLinks.length;
return _urlsHandler(filteredLinks);
}
return null;
});

if (count > 0) {
if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(this.initialUrl))) {
urlsHandler([this.initialUrl]);
}
count++;
}

return count;
}

public filterURL(href: string, url: string): string | null {
@@ -436,54 +471,74 @@ export class WebCrawler {
return socialMediaOrEmail.some((ext) => url.includes(ext));
}

private async tryFetchSitemapLinks(url: string): Promise<string[]> {
const normalizeUrl = (url: string) => {
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
return url;
};

private async tryFetchSitemapLinks(url: string, urlsHandler: (urls: string[]) => unknown): Promise<number> {
const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`;

let sitemapLinks: string[] = [];
let sitemapCount: number = 0;

// Try to get sitemap from the provided URL first
try {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger);
}
sitemapCount = await getLinksFromSitemap(
{ sitemapUrl, urlsHandler, mode: "fire-engine" },
this.logger,
);
} catch (error) {
this.logger.debug(
`Failed to fetch sitemap with axios from ${sitemapUrl}`,
`Failed to fetch sitemap from ${sitemapUrl}`,
{ method: "tryFetchSitemapLinks", sitemapUrl, error },
);
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
const response = await getLinksFromSitemap(
{ sitemapUrl, mode: "fire-engine" },
this.logger,
);
if (response) {
sitemapLinks = response;
}
}
}

if (sitemapLinks.length === 0) {
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
// If this is a subdomain, also try to get sitemap from the main domain
try {
const response = await axios.get(baseUrlSitemap, {
timeout: axiosTimeout,
});
if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
const urlObj = new URL(url);
const hostname = urlObj.hostname;
const domainParts = hostname.split('.');

// Check if this is a subdomain (has more than 2 parts and not www)
if (domainParts.length > 2 && domainParts[0] !== 'www') {
// Get the main domain by taking the last two parts
const mainDomain = domainParts.slice(-2).join('.');
const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;

try {
// Get all links from the main domain's sitemap
sitemapCount += await getLinksFromSitemap(
{ sitemapUrl: mainDomainSitemapUrl, urlsHandler(urls) {
urlsHandler(urls.filter(link => {
try {
const linkUrl = new URL(link);
return linkUrl.hostname.endsWith(hostname);
} catch {
}
}))
}, mode: "fire-engine" },
this.logger,
);
} catch (error) {
this.logger.debug(
`Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
{ method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
);
}
}
} catch (error) {
this.logger.debug(`Error processing main domain sitemap`, {
method: "tryFetchSitemapLinks",
url,
error,
});
}

// If no sitemap found yet, try the baseUrl as a last resort
if (sitemapCount === 0) {
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
try {
sitemapCount += await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
this.logger,
);
} catch (error) {
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
method: "tryFetchSitemapLinks",

@@ -493,25 +548,14 @@ export class WebCrawler {
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
sitemapLinks = await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
sitemapCount += await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
this.logger,
);
}
}
}

const normalizedUrl = normalizeUrl(url);
const normalizedSitemapLinks = sitemapLinks.map((link) =>
normalizeUrl(link),
);
// has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
if (
!normalizedSitemapLinks.includes(normalizedUrl) &&
sitemapLinks.length > 0
) {
sitemapLinks.push(url);
}
return sitemapLinks;
return sitemapCount;
}
}
@@ -5,26 +5,25 @@ import { WebCrawler } from "./crawler";
import { scrapeURL } from "../scrapeURL";
import { scrapeOptions } from "../../controllers/v1/types";
import type { Logger } from "winston";

const useFireEngine =
process.env.FIRE_ENGINE_BETA_URL !== "" &&
process.env.FIRE_ENGINE_BETA_URL !== undefined;
export async function getLinksFromSitemap(
{
sitemapUrl,
allUrls = [],
urlsHandler,
mode = "axios",
}: {
sitemapUrl: string;
allUrls?: string[];
urlsHandler(urls: string[]): unknown,
mode?: "axios" | "fire-engine";
},
logger: Logger,
): Promise<string[]> {
): Promise<number> {
try {
let content: string = "";
try {
if (mode === "axios" || process.env.FIRE_ENGINE_BETA_URL === "") {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
} else if (mode === "fire-engine") {
if (mode === "fire-engine" && useFireEngine) {
const response = await scrapeURL(
"sitemap",
sitemapUrl,

@@ -32,10 +31,16 @@ export async function getLinksFromSitemap(
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
);
if (!response.success) {
throw response.error;
}
logger.debug("Failed to scrape sitemap via TLSClient, falling back to axios...", { error: response.error })
const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = ar.data;
} else {
content = response.document.rawHtml!;
}
} else {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
}
} catch (error) {
logger.error(`Request failed for ${sitemapUrl}`, {
method: "getLinksFromSitemap",

@@ -44,33 +49,64 @@ export async function getLinksFromSitemap(
error,
});

return allUrls;
return 0;
}

const parsed = await parseStringPromise(content);
const root = parsed.urlset || parsed.sitemapindex;
let count = 0;

if (root && root.sitemap) {
const sitemapPromises = root.sitemap
// Handle sitemap index files
const sitemapUrls = root.sitemap
.filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
.map((sitemap) =>
.map((sitemap) => sitemap.loc[0]);

const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
getLinksFromSitemap(
{ sitemapUrl: sitemap.loc[0], allUrls, mode },
{ sitemapUrl, urlsHandler, mode },
logger,
),
);
await Promise.all(sitemapPromises);

const results = await Promise.all(sitemapPromises);
count = results.reduce((a,x) => a + x)
} else if (root && root.url) {
// Check if any URLs point to additional sitemaps
const xmlSitemaps: string[] = root.url
.filter(
(url) =>
url.loc &&
url.loc.length > 0 &&
url.loc[0].toLowerCase().endsWith('.xml')
)
.map((url) => url.loc[0]);

if (xmlSitemaps.length > 0) {
// Recursively fetch links from additional sitemaps
const sitemapPromises = xmlSitemaps.map((sitemapUrl) =>
getLinksFromSitemap(
{ sitemapUrl: sitemapUrl, urlsHandler, mode },
logger,
),
);
count += (await Promise.all(sitemapPromises)).reduce((a,x) => a + x, 0);
}

const validUrls = root.url
.filter(
(url) =>
url.loc &&
url.loc.length > 0 &&
!url.loc[0].toLowerCase().endsWith('.xml') &&
!WebCrawler.prototype.isFile(url.loc[0]),
)
.map((url) => url.loc[0]);
allUrls.push(...validUrls);
count += validUrls.length;
urlsHandler(validUrls);
}

return count;
} catch (error) {
logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, {
method: "getLinksFromSitemap",

@@ -80,7 +116,7 @@ export async function getLinksFromSitemap(
});
}

return allUrls;
return 0;
}

export const fetchSitemapData = async (
@@ -17,6 +17,7 @@ import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError
import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
import { fireEngineDelete } from "./delete";

// This function does not take `Meta` on purpose. It may not access any
// meta values to construct the request -- that must be done by the

@@ -44,6 +45,13 @@ async function performFireEngineScrape<
while (status === undefined) {
if (errors.length >= errorLimit) {
logger.error("Error limit hit.", { errors });
fireEngineDelete(
logger.child({
method: "performFireEngineScrape/fireEngineDelete",
afterErrors: errors,
}),
scrape.jobId,
);
throw new Error("Error limit hit. See e.cause.errors for errors.", {
cause: { errors },
});

@@ -74,6 +82,13 @@ async function performFireEngineScrape<
error instanceof ActionError ||
error instanceof UnsupportedFileError
) {
fireEngineDelete(
logger.child({
method: "performFireEngineScrape/fireEngineDelete",
afterError: error,
}),
scrape.jobId,
);
logger.debug("Fire-engine scrape job failed.", {
error,
jobId: scrape.jobId,

@@ -105,6 +120,13 @@ async function performFireEngineScrape<
status.content = Buffer.from(content, "base64").toString("utf8"); // TODO: handle other encodings via Content-Type tag
}

fireEngineDelete(
logger.child({
method: "performFireEngineScrape/fireEngineDelete",
}),
scrape.jobId,
);

return status;
}
@@ -8,6 +8,7 @@ export function extractMetadata(
): Partial<Document["metadata"]> {
let title: string | undefined = undefined;
let description: string | undefined = undefined;
let favicon: string | undefined = undefined;
let language: string | undefined = undefined;
let keywords: string | undefined = undefined;
let robots: string | undefined = undefined;

@@ -43,6 +44,12 @@ export function extractMetadata(
title = soup("title").first().text().trim() || undefined;
description = soup('meta[name="description"]').attr("content") || undefined;

const faviconLink = soup('link[rel="icon"]').attr("href") || soup('link[rel*="icon"]').first().attr("href") || undefined;
if (faviconLink) {
const baseUrl = new URL(meta.url).origin;
favicon = faviconLink.startsWith('http') ? faviconLink : `${baseUrl}${faviconLink}`;
}

// Assuming the language is part of the URL as per the regex pattern
language = soup("html").attr("lang") || undefined;

@@ -121,6 +128,7 @@ export function extractMetadata(
return {
title,
description,
favicon,
language,
keywords,
robots,
@@ -0,0 +1,33 @@
import { removeDefaultProperty } from "./llmExtract";

describe("removeDefaultProperty", () => {
it("should remove the default property from a simple object", () => {
const input = { default: "test", test: "test" };
const expectedOutput = { test: "test" };
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
});

it("should remove the default property from a nested object", () => {
const input = { default: "test", nested: { default: "nestedTest", test: "nestedTest" } };
const expectedOutput = { nested: { test: "nestedTest" } };
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
});

it("should remove the default property from an array of objects", () => {
const input = { array: [{ default: "test1", test: "test1" }, { default: "test2", test: "test2" }] };
const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
});

it("should handle objects without a default property", () => {
const input = { test: "test" };
const expectedOutput = { test: "test" };
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
});

it("should handle null and non-object inputs", () => {
expect(removeDefaultProperty(null)).toBeNull();
expect(removeDefaultProperty("string")).toBe("string");
expect(removeDefaultProperty(123)).toBe(123);
});
});
@@ -121,6 +121,10 @@ export async function generateOpenAICompletions(
}

let schema = options.schema;
if (schema) {
schema = removeDefaultProperty(schema);
}

if (schema && schema.type === "array") {
schema = {
type: "object",

@@ -134,10 +138,12 @@ export async function generateOpenAICompletions(
schema = {
type: "object",
properties: Object.fromEntries(
Object.entries(schema).map(([key, value]) => [key, { type: value }]),
Object.entries(schema).map(([key, value]) => {
return [key, removeDefaultProperty(value)];
})
),
required: Object.keys(schema),
additionalProperties: false,
additionalProperties: false
};
}

@@ -232,3 +238,19 @@ export async function performLLMExtract(

return document;
}

export function removeDefaultProperty(schema: any): any {
if (typeof schema !== 'object' || schema === null) return schema;

const { default: _, ...rest } = schema;

for (const key in rest) {
if (Array.isArray(rest[key])) {
rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));
} else if (typeof rest[key] === 'object' && rest[key] !== null) {
rest[key] = removeDefaultProperty(rest[key]);
}
}

return rest;
}
@@ -29,7 +29,7 @@ async function _addScrapeJobToConcurrencyQueue(
});
}

async function _addScrapeJobToBullMQ(
export async function _addScrapeJobToBullMQ(
webScraperOptions: any,
options: any,
jobId: string,

@@ -138,7 +138,6 @@ export async function addScrapeJobs(
if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) {
const now = Date.now();
const limit = await getConcurrencyLimitMax(jobs[0].data.plan);
console.log("CC limit", limit);
cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now);

countCanBeDirectlyAdded = Math.max(
@@ -18,16 +18,18 @@ import { v4 as uuidv4 } from "uuid";
import {
addCrawlJob,
addCrawlJobDone,
addCrawlJobs,
crawlToCrawler,
finishCrawl,
generateURLPermutations,
getCrawl,
getCrawlJobs,
lockURL,
lockURLs,
normalizeURL,
} from "../lib/crawl-redis";
import { StoredCrawl } from "../lib/crawl-redis";
import { addScrapeJob } from "./queue-jobs";
import { addScrapeJob, addScrapeJobs } from "./queue-jobs";
import {
addJobPriority,
deleteJobPriority,

@@ -191,6 +193,17 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
await addJobPriority(job.data.team_id, job.id);
let err = null;
try {
if (job.data?.mode === "kickoff") {
const result = await processKickoffJob(job, token);
if (result.success) {
try {
await job.moveToCompleted(null, token, false);
} catch (e) {}
} else {
logger.debug("Job failed", { result, mode: job.data.mode });
await job.moveToFailed((result as any).error, token, false);
}
} else {
const result = await processJob(job, token);
if (result.success) {
try {

@@ -208,6 +221,7 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
logger.debug("Job failed", { result });
await job.moveToFailed((result as any).error, token, false);
}
}
} catch (error) {
logger.debug("Job failed", { error });
Sentry.captureException(error);

@@ -379,6 +393,130 @@ const workerFun = async (

workerFun(getScrapeQueue(), processJobInternal);

async function processKickoffJob(job: Job & { id: string }, token: string) {
const logger = _logger.child({
module: "queue-worker",
method: "processKickoffJob",
jobId: job.id,
scrapeId: job.id,
crawlId: job.data?.crawl_id ?? undefined,
teamId: job.data?.team_id ?? undefined,
});

try {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
const crawler = crawlToCrawler(job.data.crawl_id, sc);

const sitemap = sc.crawlerOptions.ignoreSitemap
? 0
: await crawler.tryGetSitemap(async urls => {
if (urls.length === 0) return;

logger.debug("Using sitemap chunk of length " + urls.length, {
sitemapLength: urls.length,
});

let jobPriority = await getJobPriority({
plan: job.data.plan,
team_id: job.data.team_id,
basePriority: 21,
});
logger.debug("Using job priority " + jobPriority, { jobPriority });

const jobs = urls.map(url => {
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls" as const,
team_id: job.data.team_id,
plan: job.data.plan!,
crawlerOptions: job.data.crawlerOptions,
scrapeOptions: job.data.scrapeOptions,
internalOptions: sc.internalOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
sitemapped: true,
webhook: job.data.webhook,
v1: job.data.v1,
},
opts: {
jobId: uuid,
priority: 20,
},
};
});

logger.debug("Locking URLs...");
await lockURLs(
job.data.crawl_id,
sc,
jobs.map((x) => x.data.url),
);
logger.debug("Adding scrape jobs to Redis...");
await addCrawlJobs(
job.data.crawl_id,
jobs.map((x) => x.opts.jobId),
);
logger.debug("Adding scrape jobs to BullMQ...");
await addScrapeJobs(jobs);
});

if (sitemap === 0) {
logger.debug("Sitemap not found or ignored.", {
ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
});

logger.debug("Locking URL...");
await lockURL(job.data.crawl_id, sc, job.data.url);
const jobId = uuidv4();
logger.debug("Adding scrape job to Redis...", { jobId });
await addScrapeJob(
{
url: job.data.url,
mode: "single_urls",
team_id: job.data.team_id,
crawlerOptions: job.data.crawlerOptions,
scrapeOptions: scrapeOptions.parse(job.data.scrapeOptions),
internalOptions: sc.internalOptions,
plan: job.data.plan!,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
webhook: job.data.webhook,
v1: job.data.v1,
},
{
priority: 15,
},
jobId,
);
logger.debug("Adding scrape job to BullMQ...", { jobId });
await addCrawlJob(job.data.crawl_id, jobId);
}
logger.debug("Done queueing jobs!");

if (job.data.webhook) {
logger.debug("Calling webhook with crawl.started...", {
webhook: job.data.webhook,
});
await callWebhook(
job.data.team_id,
job.data.crawl_id,
null,
job.data.webhook,
true,
"crawl.started",
);
}

return { success: true }
} catch (error) {
logger.error("An error occurred!", { error })
return { success: false, error };
}
}

async function processJob(job: Job & { id: string }, token: string) {
const logger = _logger.child({
module: "queue-worker",
(File diff suppressed because it is too large.)
@@ -13,7 +13,7 @@ import os

from .firecrawl import FirecrawlApp # noqa

__version__ = "1.6.8"
__version__ = "1.7.0"

# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")
@@ -8,7 +8,7 @@ from datetime import datetime

load_dotenv()

API_URL = "http://127.0.0.1:3002";
API_URL = os.getenv('API_URL', 'http://127.0.0.1:3002')
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
TEST_API_KEY = os.getenv('TEST_API_KEY')

@@ -20,15 +20,26 @@ spec.loader.exec_module(firecrawl)
FirecrawlApp = firecrawl.FirecrawlApp

def test_no_api_key():
    if 'api.firecrawl.dev' in API_URL:
        with pytest.raises(Exception) as excinfo:
            invalid_app = FirecrawlApp(api_url=API_URL)
        assert "No API key provided" in str(excinfo.value)
    else:
        # Should not raise error for self-hosted
        app = FirecrawlApp(api_url=API_URL)
        assert app is not None

def test_scrape_url_invalid_api_key():
    if 'api.firecrawl.dev' in API_URL:
        invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
        with pytest.raises(Exception) as excinfo:
            invalid_app.scrape_url('https://firecrawl.dev')
        assert "Unauthorized: Invalid token" in str(excinfo.value)
    else:
        # Should work without API key for self-hosted
        app = FirecrawlApp(api_url=API_URL)
        response = app.scrape_url('https://firecrawl.dev')
        assert response is not None

# def test_blocklisted_url():
#     blocklisted_url = "https://facebook.com/fake-test"

@@ -131,10 +142,16 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext
    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']

def test_crawl_url_invalid_api_key():
    if 'api.firecrawl.dev' in API_URL:
        invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
        with pytest.raises(Exception) as excinfo:
            invalid_app.crawl_url('https://firecrawl.dev')
        assert "Unauthorized: Invalid token" in str(excinfo.value)
    else:
        # Should work without API key for self-hosted
        app = FirecrawlApp(api_url=API_URL)
        response = app.crawl_url('https://firecrawl.dev')
        assert response is not None

# def test_should_return_error_for_blocklisted_url():
#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)

@@ -291,10 +308,16 @@ def test_check_crawl_status_e2e():
    assert 'error' not in status_response['data'][0]['metadata']

def test_invalid_api_key_on_map():
    if 'api.firecrawl.dev' in API_URL:
        invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
        with pytest.raises(Exception) as excinfo:
            invalid_app.map_url('https://roastmywebsite.ai')
        assert "Unauthorized: Invalid token" in str(excinfo.value)
    else:
        # Should work without API key for self-hosted
        app = FirecrawlApp(api_url=API_URL)
        response = app.map_url('https://roastmywebsite.ai')
        assert response is not None

# def test_blocklisted_url_on_map():
#     app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)

@@ -349,4 +372,3 @@ def test_search_e2e():
    # assert isinstance(llm_extraction['is_open_source'], bool)
@@ -49,10 +49,13 @@ class FirecrawlApp:
        """
        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
        if self.api_key is None:
            logger.warning("No API key provided")

        # Only require API key when using cloud service
        if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
            logger.warning("No API key provided for cloud service")
            raise ValueError('No API key provided')
        logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key}")

        logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")

    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
        """
@@ -12,7 +12,8 @@ dependencies = [
"requests",
"python-dotenv",
"websockets",
"nest-asyncio"
"nest-asyncio",
"pydantic>=2.10.3",
]
authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
@@ -3,3 +3,4 @@ pytest
python-dotenv
websockets
nest-asyncio
pydantic
@@ -9,7 +9,7 @@ use crate::crawl::CrawlStatus;
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct FirecrawlAPIError {
    /// Always false.
    success: bool,
    pub success: bool,

    /// Error message
    pub error: String,
@@ -9,6 +9,7 @@ pub mod map;
pub mod scrape;

pub use error::FirecrawlError;
use error::FirecrawlAPIError;

#[derive(Clone, Debug)]
pub struct FirecrawlApp {

@@ -18,16 +19,30 @@ pub struct FirecrawlApp {
}

pub(crate) const API_VERSION: &str = "/v1";
const CLOUD_API_URL: &str = "https://api.firecrawl.dev";

impl FirecrawlApp {
    pub fn new(api_key: impl AsRef<str>) -> Result<Self, FirecrawlError> {
        FirecrawlApp::new_selfhosted("https://api.firecrawl.dev", Some(api_key))
        FirecrawlApp::new_selfhosted(CLOUD_API_URL, Some(api_key))
    }

    pub fn new_selfhosted(api_url: impl AsRef<str>, api_key: Option<impl AsRef<str>>) -> Result<Self, FirecrawlError> {
        let url = api_url.as_ref().to_string();

        if url == CLOUD_API_URL && api_key.is_none() {
            return Err(FirecrawlError::APIError(
                "Configuration".to_string(),
                FirecrawlAPIError {
                    success: false,
                    error: "API key is required for cloud service".to_string(),
                    details: None,
                }
            ));
        }

        Ok(FirecrawlApp {
            api_key: api_key.map(|x| x.as_ref().to_string()),
            api_url: api_url.as_ref().to_string(),
            api_url: url,
            client: Client::new(),
        })
    }
@@ -1,7 +1,7 @@
use assert_matches::assert_matches;
use dotenvy::dotenv;
use firecrawl::scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions};
use firecrawl::FirecrawlApp;
use firecrawl::{FirecrawlApp, FirecrawlError};
use serde_json::json;
use std::env;

@@ -155,3 +155,29 @@ async fn test_llm_extraction() {
    assert!(llm_extraction["supports_sso"].is_boolean());
    assert!(llm_extraction["is_open_source"].is_boolean());
}

#[test]
fn test_api_key_requirements() {
    dotenv().ok();

    let api_url = env::var("API_URL").unwrap_or("http://localhost:3002".to_string());
    let api_key = env::var("TEST_API_KEY").ok();

    match (api_url.contains("api.firecrawl.dev"), api_key) {
        (false, _) => {
            let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);
            assert!(result.is_ok(), "Local setup failed: {:?}", result.err().unwrap());
        }
        (true, None) => {
            let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);
            assert!(matches!(
                result,
                Err(FirecrawlError::APIError(msg, _)) if msg == "Configuration"
            ));
        }
        (true, Some(key)) => {
            let result = FirecrawlApp::new_selfhosted(&api_url, Some(&key));
            assert!(result.is_ok());
        }
    }
}