Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-14 05:56:00 +08:00)

Commit 5fcf3fa97e: Merge branch 'main' into mog/mineru
@@ -116,6 +116,10 @@ If you’d like to test the crawl endpoint, you can run this:
 
 This section provides solutions to common issues you might encounter while setting up or running your self-hosted instance of Firecrawl.
 
+### API Keys for SDK Usage
+
+**Note:** When using Firecrawl SDKs with a self-hosted instance, API keys are optional. API keys are only required when connecting to the cloud service (api.firecrawl.dev).
+
 ### Supabase client is not configured
 
 **Symptom:**
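As the added note above explains, the SDKs only require an API key when talking to the cloud service. A minimal sketch of self-hosted usage with the Python SDK follows; the local URL matches the default used by the SDK tests later in this diff, so adjust it to your deployment:

```python
from firecrawl import FirecrawlApp

# Self-hosted instance: no API key required (only api.firecrawl.dev needs one).
app = FirecrawlApp(api_url="http://127.0.0.1:3002")

# Scrape a single page through the self-hosted API.
result = app.scrape_url("https://firecrawl.dev")
print(result)
```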
@@ -70,8 +70,8 @@ content-type: application/json
     "urls": ["firecrawl.dev"],
     "prompt": "What is the title, description and main product of the page?",
     "schema": {
-      "title": "string",
-      "description": "string",
-      "mainProduct": "string"
+      "title": { "type": "string" },
+      "description": { "type": "string" },
+      "mainProduct": { "type": "string" }
     }
   }
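For reference, here is a sketch of the corrected extract request sent from Python. The body mirrors the fixed schema above; the `/v1/extract` path and the local port are assumptions based on defaults referenced elsewhere in this diff, so adjust them to your setup:

```python
import requests

API_URL = "http://127.0.0.1:3002"  # assumed default self-hosted address

payload = {
    "urls": ["firecrawl.dev"],
    "prompt": "What is the title, description and main product of the page?",
    "schema": {
        "title": {"type": "string"},
        "description": {"type": "string"},
        "mainProduct": {"type": "string"},
    },
}

# POST the request; the /v1/extract path is an assumption, not confirmed by this diff.
response = requests.post(
    f"{API_URL}/v1/extract",
    headers={"content-type": "application/json"},
    json=payload,
)
print(response.json())
```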
@@ -177,56 +177,51 @@ export async function crawlController(req: Request, res: Response) {
 
   await saveCrawl(id, sc);
 
-  const sitemap = sc.crawlerOptions?.ignoreSitemap
-    ? null
-    : await crawler.tryGetSitemap();
-
-  if (sitemap !== null && sitemap.length > 0) {
-    let jobPriority = 20;
-    // If it is over 1000, we need to get the job priority,
-    // otherwise we can use the default priority of 20
-    if (sitemap.length > 1000) {
-      // set base to 21
-      jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
-    }
-    const jobs = sitemap.map((x) => {
-      const url = x.url;
-      const uuid = uuidv4();
-      return {
-        name: uuid,
-        data: {
-          url,
-          mode: "single_urls",
-          crawlerOptions,
-          scrapeOptions,
-          internalOptions,
-          team_id,
-          plan,
-          origin: req.body.origin ?? defaultOrigin,
-          crawl_id: id,
-          sitemapped: true,
-        },
-        opts: {
-          jobId: uuid,
-          priority: jobPriority,
-        },
-      };
-    });
-
-    await lockURLs(
-      id,
-      sc,
-      jobs.map((x) => x.data.url),
-    );
-    await addCrawlJobs(
-      id,
-      jobs.map((x) => x.opts.jobId),
-    );
-    for (const job of jobs) {
-      // add with sentry instrumentation
-      await addScrapeJob(job.data as any, {}, job.opts.jobId);
-    }
-  } else {
+  const sitemap = sc.crawlerOptions.ignoreSitemap
+    ? 0
+    : await crawler.tryGetSitemap(async urls => {
+        if (urls.length === 0) return;
+
+        let jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
+        const jobs = urls.map(url => {
+          const uuid = uuidv4();
+          return {
+            name: uuid,
+            data: {
+              url,
+              mode: "single_urls",
+              crawlerOptions,
+              scrapeOptions,
+              internalOptions,
+              team_id,
+              plan,
+              origin: req.body.origin ?? defaultOrigin,
+              crawl_id: id,
+              sitemapped: true,
+            },
+            opts: {
+              jobId: uuid,
+              priority: jobPriority,
+            },
+          };
+        });
+
+        await lockURLs(
+          id,
+          sc,
+          jobs.map((x) => x.data.url),
+        );
+        await addCrawlJobs(
+          id,
+          jobs.map((x) => x.opts.jobId),
+        );
+        for (const job of jobs) {
+          // add with sentry instrumentation
+          await addScrapeJob(job.data as any, {}, job.opts.jobId);
+        }
+      });
+
+  if (sitemap === 0) {
     await lockURL(id, sc, url);
 
     // Not needed, first one should be 15.
@@ -113,32 +113,32 @@ export async function crawlPreviewController(req: Request, res: Response) {
   const crawler = crawlToCrawler(id, sc);
 
   const sitemap = sc.crawlerOptions?.ignoreSitemap
-    ? null
-    : await crawler.tryGetSitemap();
-
-  if (sitemap !== null) {
-    for (const url of sitemap.map((x) => x.url)) {
-      await lockURL(id, sc, url);
-      const jobId = uuidv4();
-      await addScrapeJob(
-        {
-          url,
-          mode: "single_urls",
-          team_id,
-          plan: plan!,
-          crawlerOptions,
-          scrapeOptions,
-          internalOptions,
-          origin: "website-preview",
-          crawl_id: id,
-          sitemapped: true,
-        },
-        {},
-        jobId,
-      );
-      await addCrawlJob(id, jobId);
-    }
-  } else {
+    ? 0
+    : await crawler.tryGetSitemap(async urls => {
+        for (const url of urls) {
+          await lockURL(id, sc, url);
+          const jobId = uuidv4();
+          await addScrapeJob(
+            {
+              url,
+              mode: "single_urls",
+              team_id,
+              plan: plan!,
+              crawlerOptions,
+              scrapeOptions,
+              internalOptions,
+              origin: "website-preview",
+              crawl_id: id,
+              sitemapped: true,
+            },
+            {},
+            jobId,
+          );
+          await addCrawlJob(id, jobId);
+        }
+      });
+
+  if (sitemap === 0) {
     await lockURL(id, sc, url);
     const jobId = uuidv4();
     await addScrapeJob(
@@ -115,7 +115,7 @@ export async function crawlStatusController(
   const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
     sc.cancelled
       ? "cancelled"
-      : validJobStatuses.every((x) => x[1] === "completed")
+      : (validJobStatuses.every((x) => x[1] === "completed") && validJobStatuses.length > 0)
         ? "completed"
         : "scraping";
 
@@ -18,7 +18,7 @@ import {
 } from "../../lib/crawl-redis";
 import { logCrawl } from "../../services/logging/crawl_log";
 import { getScrapeQueue } from "../../services/queue-service";
-import { addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
+import { _addScrapeJobToBullMQ, addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
 import { logger as _logger } from "../../lib/logger";
 import { getJobPriority } from "../../lib/job-priority";
 import { callWebhook } from "../../services/webhook";
@@ -111,113 +111,20 @@ export async function crawlController(
 
   await saveCrawl(id, sc);
 
-  const sitemap = sc.crawlerOptions.ignoreSitemap
-    ? null
-    : await crawler.tryGetSitemap();
-
-  if (sitemap !== null && sitemap.length > 0) {
-    logger.debug("Using sitemap of length " + sitemap.length, {
-      sitemapLength: sitemap.length,
-    });
-    let jobPriority = 20;
-    // If it is over 1000, we need to get the job priority,
-    // otherwise we can use the default priority of 20
-    if (sitemap.length > 1000) {
-      // set base to 21
-      jobPriority = await getJobPriority({
-        plan: req.auth.plan,
-        team_id: req.auth.team_id,
-        basePriority: 21,
-      });
-    }
-    logger.debug("Using job priority " + jobPriority, { jobPriority });
-
-    const jobs = sitemap.map((x) => {
-      const url = x.url;
-      const uuid = uuidv4();
-      return {
-        name: uuid,
-        data: {
-          url,
-          mode: "single_urls" as const,
-          team_id: req.auth.team_id,
-          plan: req.auth.plan!,
-          crawlerOptions,
-          scrapeOptions,
-          internalOptions: sc.internalOptions,
-          origin: "api",
-          crawl_id: id,
-          sitemapped: true,
-          webhook: req.body.webhook,
-          v1: true,
-        },
-        opts: {
-          jobId: uuid,
-          priority: 20,
-        },
-      };
-    });
-
-    logger.debug("Locking URLs...");
-    await lockURLs(
-      id,
-      sc,
-      jobs.map((x) => x.data.url),
-    );
-    logger.debug("Adding scrape jobs to Redis...");
-    await addCrawlJobs(
-      id,
-      jobs.map((x) => x.opts.jobId),
-    );
-    logger.debug("Adding scrape jobs to BullMQ...");
-    await addScrapeJobs(jobs);
-  } else {
-    logger.debug("Sitemap not found or ignored.", {
-      ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
-    });
-
-    logger.debug("Locking URL...");
-    await lockURL(id, sc, req.body.url);
-    const jobId = uuidv4();
-    logger.debug("Adding scrape job to Redis...", { jobId });
-    await addScrapeJob(
-      {
-        url: req.body.url,
-        mode: "single_urls",
-        team_id: req.auth.team_id,
-        crawlerOptions,
-        scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
-        internalOptions: sc.internalOptions,
-        plan: req.auth.plan!,
-        origin: "api",
-        crawl_id: id,
-        webhook: req.body.webhook,
-        v1: true,
-      },
-      {
-        priority: 15,
-      },
-      jobId,
-    );
-    logger.debug("Adding scrape job to BullMQ...", { jobId });
-    await addCrawlJob(id, jobId);
-  }
-  logger.debug("Done queueing jobs!");
-
-  if (req.body.webhook) {
-    logger.debug("Calling webhook with crawl.started...", {
-      webhook: req.body.webhook,
-    });
-    await callWebhook(
-      req.auth.team_id,
-      id,
-      null,
-      req.body.webhook,
-      true,
-      "crawl.started",
-    );
-  }
+  await _addScrapeJobToBullMQ({
+    url: req.body.url,
+    mode: "kickoff" as const,
+    team_id: req.auth.team_id,
+    plan: req.auth.plan,
+    crawlerOptions,
+    scrapeOptions: sc.scrapeOptions,
+    internalOptions: sc.internalOptions,
+    origin: "api",
+    crawl_id: id,
+    webhook: req.body.webhook,
+    v1: true,
+  }, {}, crypto.randomUUID(), 10);
 
   const protocol = process.env.ENV === "local" ? req.protocol : "https";
 
   return res.status(200).json({
@@ -86,11 +86,12 @@ export async function getMapResults({
 
   // If sitemapOnly is true, only get links from sitemap
   if (crawlerOptions.sitemapOnly) {
-    const sitemap = await crawler.tryGetSitemap(true, true);
-    if (sitemap !== null) {
-      sitemap.forEach((x) => {
-        links.push(x.url);
-      });
+    const sitemap = await crawler.tryGetSitemap(urls => {
+      urls.forEach((x) => {
+        links.push(x);
+      });
+    }, true, true);
+    if (sitemap > 0) {
       links = links
         .slice(1)
         .map((x) => {
@@ -143,8 +144,10 @@ export async function getMapResults({
   }
 
   // Parallelize sitemap fetch with serper search
-  const [sitemap, ...searchResults] = await Promise.all([
-    ignoreSitemap ? null : crawler.tryGetSitemap(true),
+  const [_, ...searchResults] = await Promise.all([
+    ignoreSitemap ? null : crawler.tryGetSitemap(urls => {
+      links.push(...urls);
+    }, true),
     ...(cachedResult ? [] : pagePromises),
   ]);
 
@@ -152,12 +155,6 @@ export async function getMapResults({
     allResults = searchResults;
   }
 
-  if (sitemap !== null) {
-    sitemap.forEach((x) => {
-      links.push(x.url);
-    });
-  }
-
   mapResults = allResults
     .flat()
     .filter((result) => result !== null && result !== undefined);
@@ -17,7 +17,7 @@ export function withAuth<T, U extends any[]>(
       logger.warn("You're bypassing authentication");
       warningCount++;
     }
-    return { success: true } as T;
+    return { success: true, ...(mockSuccess || {}) } as T;
   } else {
     return await originalFunction(...args);
   }
@@ -4,9 +4,10 @@ import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
-import { axiosTimeout } from "../../../src/lib/timeout";
-import { logger as _logger } from "../../../src/lib/logger";
+import { axiosTimeout } from "../../lib/timeout";
+import { logger as _logger } from "../../lib/logger";
 import https from "https";
+import { redisConnection } from "../../services/queue-service";
 
 export class WebCrawler {
   private jobId: string;
   private initialUrl: string;
@@ -198,26 +199,60 @@ export class WebCrawler {
   }
 
   public async tryGetSitemap(
+    urlsHandler: (urls: string[]) => unknown,
     fromMap: boolean = false,
     onlySitemap: boolean = false,
-  ): Promise<{ url: string; html: string }[] | null> {
+  ): Promise<number> {
     this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
       method: "tryGetSitemap",
     });
-    const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
-    if (fromMap && onlySitemap) {
-      return sitemapLinks.map((link) => ({ url: link, html: "" }));
-    }
-    if (sitemapLinks.length > 0) {
-      let filteredLinks = this.filterLinks(
-        [...new Set(sitemapLinks)],
-        this.limit,
-        this.maxCrawledDepth,
-        fromMap,
-      );
-      return filteredLinks.map((link) => ({ url: link, html: "" }));
-    }
-    return null;
+    let leftOfLimit = this.limit;
+
+    const normalizeUrl = (url: string) => {
+      url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
+      if (url.endsWith("/")) {
+        url = url.slice(0, -1);
+      }
+      return url;
+    };
+
+    const _urlsHandler = async (urls: string[]) => {
+      let uniqueURLs: string[] = [];
+      for (const url of urls) {
+        if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(url))) {
+          uniqueURLs.push(url);
+        }
+      }
+
+      await redisConnection.expire("sitemap:" + this.jobId + ":links", 3600, "NX");
+      if (uniqueURLs.length > 0) {
+        urlsHandler(uniqueURLs);
+      }
+    };
+
+    let count = await this.tryFetchSitemapLinks(this.initialUrl, (urls: string[]) => {
+      if (fromMap && onlySitemap) {
+        return urlsHandler(urls);
+      } else {
+        let filteredLinks = this.filterLinks(
+          [...new Set(urls)],
+          leftOfLimit,
+          this.maxCrawledDepth,
+          fromMap,
+        );
+        leftOfLimit -= filteredLinks.length;
+        return _urlsHandler(filteredLinks);
+      }
+    });
+
+    if (count > 0) {
+      if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(this.initialUrl))) {
+        urlsHandler([this.initialUrl]);
+      }
+      count++;
+    }
+
+    return count;
   }
 
   public filterURL(href: string, url: string): string | null {
@@ -436,54 +471,74 @@ export class WebCrawler {
     return socialMediaOrEmail.some((ext) => url.includes(ext));
   }
 
-  private async tryFetchSitemapLinks(url: string): Promise<string[]> {
-    const normalizeUrl = (url: string) => {
-      url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
-      if (url.endsWith("/")) {
-        url = url.slice(0, -1);
-      }
-      return url;
-    };
-
-    const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`;
-
-    let sitemapLinks: string[] = [];
-
-    try {
-      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
-      if (response.status === 200) {
-        sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger);
-      }
-    } catch (error) {
-      this.logger.debug(
-        `Failed to fetch sitemap with axios from ${sitemapUrl}`,
-        { method: "tryFetchSitemapLinks", sitemapUrl, error },
-      );
-      if (error instanceof AxiosError && error.response?.status === 404) {
-        // ignore 404
-      } else {
-        const response = await getLinksFromSitemap(
-          { sitemapUrl, mode: "fire-engine" },
-          this.logger,
-        );
-        if (response) {
-          sitemapLinks = response;
-        }
-      }
-    }
-
-    if (sitemapLinks.length === 0) {
-      const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
-      try {
-        const response = await axios.get(baseUrlSitemap, {
-          timeout: axiosTimeout,
-        });
-        if (response.status === 200) {
-          sitemapLinks = await getLinksFromSitemap(
-            { sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
-            this.logger,
-          );
-        }
+  private async tryFetchSitemapLinks(url: string, urlsHandler: (urls: string[]) => unknown): Promise<number> {
+    const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`;
+
+    let sitemapCount: number = 0;
+
+    // Try to get sitemap from the provided URL first
+    try {
+      sitemapCount = await getLinksFromSitemap(
+        { sitemapUrl, urlsHandler, mode: "fire-engine" },
+        this.logger,
+      );
+    } catch (error) {
+      this.logger.debug(
+        `Failed to fetch sitemap from ${sitemapUrl}`,
+        { method: "tryFetchSitemapLinks", sitemapUrl, error },
+      );
+    }
+
+    // If this is a subdomain, also try to get sitemap from the main domain
+    try {
+      const urlObj = new URL(url);
+      const hostname = urlObj.hostname;
+      const domainParts = hostname.split('.');
+
+      // Check if this is a subdomain (has more than 2 parts and not www)
+      if (domainParts.length > 2 && domainParts[0] !== 'www') {
+        // Get the main domain by taking the last two parts
+        const mainDomain = domainParts.slice(-2).join('.');
+        const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
+        const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;
+
+        try {
+          // Get all links from the main domain's sitemap
+          sitemapCount += await getLinksFromSitemap(
+            { sitemapUrl: mainDomainSitemapUrl, urlsHandler(urls) {
+              urlsHandler(urls.filter(link => {
+                try {
+                  const linkUrl = new URL(link);
+                  return linkUrl.hostname.endsWith(hostname);
+                } catch {
+                }
+              }))
+            }, mode: "fire-engine" },
+            this.logger,
+          );
+        } catch (error) {
+          this.logger.debug(
+            `Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
+            { method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
+          );
+        }
+      }
+    } catch (error) {
+      this.logger.debug(`Error processing main domain sitemap`, {
+        method: "tryFetchSitemapLinks",
+        url,
+        error,
+      });
+    }
+
+    // If no sitemap found yet, try the baseUrl as a last resort
+    if (sitemapCount === 0) {
+      const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
+      try {
+        sitemapCount += await getLinksFromSitemap(
+          { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
+          this.logger,
+        );
       } catch (error) {
         this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
           method: "tryFetchSitemapLinks",
@@ -493,25 +548,14 @@ export class WebCrawler {
         if (error instanceof AxiosError && error.response?.status === 404) {
           // ignore 404
         } else {
-          sitemapLinks = await getLinksFromSitemap(
-            { sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
+          sitemapCount += await getLinksFromSitemap(
+            { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
             this.logger,
           );
         }
       }
     }
 
-    const normalizedUrl = normalizeUrl(url);
-    const normalizedSitemapLinks = sitemapLinks.map((link) =>
-      normalizeUrl(link),
-    );
-    // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
-    if (
-      !normalizedSitemapLinks.includes(normalizedUrl) &&
-      sitemapLinks.length > 0
-    ) {
-      sitemapLinks.push(url);
-    }
-    return sitemapLinks;
+    return sitemapCount;
   }
 }
@@ -5,26 +5,25 @@ import { WebCrawler } from "./crawler";
 import { scrapeURL } from "../scrapeURL";
 import { scrapeOptions } from "../../controllers/v1/types";
 import type { Logger } from "winston";
 
+const useFireEngine =
+  process.env.FIRE_ENGINE_BETA_URL !== "" &&
+  process.env.FIRE_ENGINE_BETA_URL !== undefined;
+
 export async function getLinksFromSitemap(
   {
     sitemapUrl,
-    allUrls = [],
+    urlsHandler,
     mode = "axios",
   }: {
     sitemapUrl: string;
-    allUrls?: string[];
+    urlsHandler(urls: string[]): unknown,
    mode?: "axios" | "fire-engine";
   },
   logger: Logger,
-): Promise<string[]> {
+): Promise<number> {
   try {
     let content: string = "";
     try {
-      if (mode === "axios" || process.env.FIRE_ENGINE_BETA_URL === "") {
-        const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
-        content = response.data;
-      } else if (mode === "fire-engine") {
+      if (mode === "fire-engine" && useFireEngine) {
         const response = await scrapeURL(
           "sitemap",
           sitemapUrl,
|
|||||||
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
|
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
|
||||||
);
|
);
|
||||||
if (!response.success) {
|
if (!response.success) {
|
||||||
throw response.error;
|
logger.debug("Failed to scrape sitemap via TLSClient, falling back to axios...", { error: response.error })
|
||||||
|
const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||||
|
content = ar.data;
|
||||||
|
} else {
|
||||||
|
content = response.document.rawHtml!;
|
||||||
}
|
}
|
||||||
content = response.document.rawHtml!;
|
} else {
|
||||||
|
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||||
|
content = response.data;
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Request failed for ${sitemapUrl}`, {
|
logger.error(`Request failed for ${sitemapUrl}`, {
|
||||||
@@ -44,33 +49,64 @@ export async function getLinksFromSitemap(
         error,
       });
 
-      return allUrls;
+      return 0;
     }
 
     const parsed = await parseStringPromise(content);
     const root = parsed.urlset || parsed.sitemapindex;
+    let count = 0;
 
     if (root && root.sitemap) {
-      const sitemapPromises = root.sitemap
+      // Handle sitemap index files
+      const sitemapUrls = root.sitemap
         .filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
-        .map((sitemap) =>
+        .map((sitemap) => sitemap.loc[0]);
+
+      const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
+        getLinksFromSitemap(
+          { sitemapUrl, urlsHandler, mode },
+          logger,
+        ),
+      );
+
+      const results = await Promise.all(sitemapPromises);
+      count = results.reduce((a,x) => a + x)
+    } else if (root && root.url) {
+      // Check if any URLs point to additional sitemaps
+      const xmlSitemaps: string[] = root.url
+        .filter(
+          (url) =>
+            url.loc &&
+            url.loc.length > 0 &&
+            url.loc[0].toLowerCase().endsWith('.xml')
+        )
+        .map((url) => url.loc[0]);
+
+      if (xmlSitemaps.length > 0) {
+        // Recursively fetch links from additional sitemaps
+        const sitemapPromises = xmlSitemaps.map((sitemapUrl) =>
           getLinksFromSitemap(
-            { sitemapUrl: sitemap.loc[0], allUrls, mode },
+            { sitemapUrl: sitemapUrl, urlsHandler, mode },
             logger,
           ),
         );
-      await Promise.all(sitemapPromises);
-    } else if (root && root.url) {
+        count += (await Promise.all(sitemapPromises)).reduce((a,x) => a + x, 0);
+      }
+
       const validUrls = root.url
         .filter(
           (url) =>
            url.loc &&
            url.loc.length > 0 &&
+            !url.loc[0].toLowerCase().endsWith('.xml') &&
            !WebCrawler.prototype.isFile(url.loc[0]),
         )
         .map((url) => url.loc[0]);
-      allUrls.push(...validUrls);
+      count += validUrls.length;
+      urlsHandler(validUrls);
     }
 
+    return count;
   } catch (error) {
     logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, {
       method: "getLinksFromSitemap",
@@ -80,7 +116,7 @@ export async function getLinksFromSitemap(
     });
   }
 
-  return allUrls;
+  return 0;
 }
 
 export const fetchSitemapData = async (
@@ -17,6 +17,7 @@ import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError
 import * as Sentry from "@sentry/node";
 import { Action } from "../../../../lib/entities";
 import { specialtyScrapeCheck } from "../utils/specialtyHandler";
+import { fireEngineDelete } from "./delete";
 
 // This function does not take `Meta` on purpose. It may not access any
 // meta values to construct the request -- that must be done by the
@@ -44,6 +45,13 @@ async function performFireEngineScrape<
   while (status === undefined) {
     if (errors.length >= errorLimit) {
       logger.error("Error limit hit.", { errors });
+      fireEngineDelete(
+        logger.child({
+          method: "performFireEngineScrape/fireEngineDelete",
+          afterErrors: errors,
+        }),
+        scrape.jobId,
+      );
       throw new Error("Error limit hit. See e.cause.errors for errors.", {
         cause: { errors },
       });
@@ -74,6 +82,13 @@ async function performFireEngineScrape<
       error instanceof ActionError ||
       error instanceof UnsupportedFileError
     ) {
+      fireEngineDelete(
+        logger.child({
+          method: "performFireEngineScrape/fireEngineDelete",
+          afterError: error,
+        }),
+        scrape.jobId,
+      );
       logger.debug("Fire-engine scrape job failed.", {
         error,
         jobId: scrape.jobId,
@@ -105,6 +120,13 @@ async function performFireEngineScrape<
     status.content = Buffer.from(content, "base64").toString("utf8"); // TODO: handle other encodings via Content-Type tag
   }
 
+  fireEngineDelete(
+    logger.child({
+      method: "performFireEngineScrape/fireEngineDelete",
+    }),
+    scrape.jobId,
+  );
+
   return status;
 }
 
@@ -8,6 +8,7 @@ export function extractMetadata(
 ): Partial<Document["metadata"]> {
   let title: string | undefined = undefined;
   let description: string | undefined = undefined;
+  let favicon: string | undefined = undefined;
   let language: string | undefined = undefined;
   let keywords: string | undefined = undefined;
   let robots: string | undefined = undefined;
@@ -42,6 +43,12 @@ export function extractMetadata(
   try {
     title = soup("title").first().text().trim() || undefined;
     description = soup('meta[name="description"]').attr("content") || undefined;
 
+    const faviconLink = soup('link[rel="icon"]').attr("href") || soup('link[rel*="icon"]').first().attr("href") || undefined;
+    if (faviconLink) {
+      const baseUrl = new URL(meta.url).origin;
+      favicon = faviconLink.startsWith('http') ? faviconLink : `${baseUrl}${faviconLink}`;
+    }
+
     // Assuming the language is part of the URL as per the regex pattern
     language = soup("html").attr("lang") || undefined;
@@ -121,6 +128,7 @@ export function extractMetadata(
   return {
     title,
     description,
+    favicon,
     language,
     keywords,
     robots,
@@ -0,0 +1,33 @@
+import { removeDefaultProperty } from "./llmExtract";
+
+describe("removeDefaultProperty", () => {
+  it("should remove the default property from a simple object", () => {
+    const input = { default: "test", test: "test" };
+    const expectedOutput = { test: "test" };
+    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
+  });
+
+  it("should remove the default property from a nested object", () => {
+    const input = { default: "test", nested: { default: "nestedTest", test: "nestedTest" } };
+    const expectedOutput = { nested: { test: "nestedTest" } };
+    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
+  });
+
+  it("should remove the default property from an array of objects", () => {
+    const input = { array: [{ default: "test1", test: "test1" }, { default: "test2", test: "test2" }] };
+    const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
+    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
+  });
+
+  it("should handle objects without a default property", () => {
+    const input = { test: "test" };
+    const expectedOutput = { test: "test" };
+    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
+  });
+
+  it("should handle null and non-object inputs", () => {
+    expect(removeDefaultProperty(null)).toBeNull();
+    expect(removeDefaultProperty("string")).toBe("string");
+    expect(removeDefaultProperty(123)).toBe(123);
+  });
+});
@@ -121,6 +121,10 @@ export async function generateOpenAICompletions(
   }
 
   let schema = options.schema;
+  if (schema) {
+    schema = removeDefaultProperty(schema);
+  }
+
   if (schema && schema.type === "array") {
     schema = {
       type: "object",
@@ -134,10 +138,12 @@ export async function generateOpenAICompletions(
     schema = {
       type: "object",
       properties: Object.fromEntries(
-        Object.entries(schema).map(([key, value]) => [key, { type: value }]),
+        Object.entries(schema).map(([key, value]) => {
+          return [key, removeDefaultProperty(value)];
+        })
       ),
       required: Object.keys(schema),
-      additionalProperties: false,
+      additionalProperties: false
     };
   }
 
@@ -232,3 +238,19 @@ export async function performLLMExtract(
 
   return document;
 }
+
+export function removeDefaultProperty(schema: any): any {
+  if (typeof schema !== 'object' || schema === null) return schema;
+
+  const { default: _, ...rest } = schema;
+
+  for (const key in rest) {
+    if (Array.isArray(rest[key])) {
+      rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));
+    } else if (typeof rest[key] === 'object' && rest[key] !== null) {
+      rest[key] = removeDefaultProperty(rest[key]);
+    }
+  }
+
+  return rest;
+}
@@ -29,7 +29,7 @@ async function _addScrapeJobToConcurrencyQueue(
   });
 }
 
-async function _addScrapeJobToBullMQ(
+export async function _addScrapeJobToBullMQ(
   webScraperOptions: any,
   options: any,
   jobId: string,
@@ -138,7 +138,6 @@ export async function addScrapeJobs(
   if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) {
     const now = Date.now();
     const limit = await getConcurrencyLimitMax(jobs[0].data.plan);
-    console.log("CC limit", limit);
     cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now);
 
     countCanBeDirectlyAdded = Math.max(
@@ -18,16 +18,18 @@ import { v4 as uuidv4 } from "uuid";
 import {
   addCrawlJob,
   addCrawlJobDone,
+  addCrawlJobs,
   crawlToCrawler,
   finishCrawl,
   generateURLPermutations,
   getCrawl,
   getCrawlJobs,
   lockURL,
+  lockURLs,
   normalizeURL,
 } from "../lib/crawl-redis";
 import { StoredCrawl } from "../lib/crawl-redis";
-import { addScrapeJob } from "./queue-jobs";
+import { addScrapeJob, addScrapeJobs } from "./queue-jobs";
 import {
   addJobPriority,
   deleteJobPriority,
@@ -191,22 +193,34 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
   await addJobPriority(job.data.team_id, job.id);
   let err = null;
   try {
-    const result = await processJob(job, token);
-    if (result.success) {
-      try {
-        if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
-          logger.debug(
-            "Job succeeded -- has crawl associated, putting null in Redis",
-          );
-          await job.moveToCompleted(null, token, false);
-        } else {
-          logger.debug("Job succeeded -- putting result in Redis");
-          await job.moveToCompleted(result.document, token, false);
-        }
-      } catch (e) {}
-    } else {
-      logger.debug("Job failed", { result });
-      await job.moveToFailed((result as any).error, token, false);
+    if (job.data?.mode === "kickoff") {
+      const result = await processKickoffJob(job, token);
+      if (result.success) {
+        try {
+          await job.moveToCompleted(null, token, false);
+        } catch (e) {}
+      } else {
+        logger.debug("Job failed", { result, mode: job.data.mode });
+        await job.moveToFailed((result as any).error, token, false);
+      }
+    } else {
+      const result = await processJob(job, token);
+      if (result.success) {
+        try {
+          if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
+            logger.debug(
+              "Job succeeded -- has crawl associated, putting null in Redis",
+            );
+            await job.moveToCompleted(null, token, false);
+          } else {
+            logger.debug("Job succeeded -- putting result in Redis");
+            await job.moveToCompleted(result.document, token, false);
+          }
+        } catch (e) {}
+      } else {
+        logger.debug("Job failed", { result });
+        await job.moveToFailed((result as any).error, token, false);
+      }
     }
   } catch (error) {
     logger.debug("Job failed", { error });
@@ -379,6 +393,130 @@ const workerFun = async (
 
 workerFun(getScrapeQueue(), processJobInternal);
 
+async function processKickoffJob(job: Job & { id: string }, token: string) {
+  const logger = _logger.child({
+    module: "queue-worker",
+    method: "processKickoffJob",
+    jobId: job.id,
+    scrapeId: job.id,
+    crawlId: job.data?.crawl_id ?? undefined,
+    teamId: job.data?.team_id ?? undefined,
+  });
+
+  try {
+    const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
+    const crawler = crawlToCrawler(job.data.crawl_id, sc);
+
+    const sitemap = sc.crawlerOptions.ignoreSitemap
+      ? 0
+      : await crawler.tryGetSitemap(async urls => {
+          if (urls.length === 0) return;
+
+          logger.debug("Using sitemap chunk of length " + urls.length, {
+            sitemapLength: urls.length,
+          });
+
+          let jobPriority = await getJobPriority({
+            plan: job.data.plan,
+            team_id: job.data.team_id,
+            basePriority: 21,
+          });
+          logger.debug("Using job priority " + jobPriority, { jobPriority });
+
+          const jobs = urls.map(url => {
+            const uuid = uuidv4();
+            return {
+              name: uuid,
+              data: {
+                url,
+                mode: "single_urls" as const,
+                team_id: job.data.team_id,
+                plan: job.data.plan!,
+                crawlerOptions: job.data.crawlerOptions,
+                scrapeOptions: job.data.scrapeOptions,
+                internalOptions: sc.internalOptions,
+                origin: job.data.origin,
+                crawl_id: job.data.crawl_id,
+                sitemapped: true,
+                webhook: job.data.webhook,
+                v1: job.data.v1,
+              },
+              opts: {
+                jobId: uuid,
+                priority: 20,
+              },
+            };
+          });
+
+          logger.debug("Locking URLs...");
+          await lockURLs(
+            job.data.crawl_id,
+            sc,
+            jobs.map((x) => x.data.url),
+          );
+          logger.debug("Adding scrape jobs to Redis...");
+          await addCrawlJobs(
+            job.data.crawl_id,
+            jobs.map((x) => x.opts.jobId),
+          );
+          logger.debug("Adding scrape jobs to BullMQ...");
+          await addScrapeJobs(jobs);
+        });
+
+    if (sitemap === 0) {
+      logger.debug("Sitemap not found or ignored.", {
+        ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
+      });
+
+      logger.debug("Locking URL...");
+      await lockURL(job.data.crawl_id, sc, job.data.url);
+      const jobId = uuidv4();
+      logger.debug("Adding scrape job to Redis...", { jobId });
+      await addScrapeJob(
+        {
+          url: job.data.url,
+          mode: "single_urls",
+          team_id: job.data.team_id,
+          crawlerOptions: job.data.crawlerOptions,
+          scrapeOptions: scrapeOptions.parse(job.data.scrapeOptions),
+          internalOptions: sc.internalOptions,
+          plan: job.data.plan!,
+          origin: job.data.origin,
+          crawl_id: job.data.crawl_id,
+          webhook: job.data.webhook,
+          v1: job.data.v1,
+        },
+        {
+          priority: 15,
+        },
+        jobId,
+      );
+      logger.debug("Adding scrape job to BullMQ...", { jobId });
+      await addCrawlJob(job.data.crawl_id, jobId);
+    }
+    logger.debug("Done queueing jobs!");
+
+    if (job.data.webhook) {
+      logger.debug("Calling webhook with crawl.started...", {
+        webhook: job.data.webhook,
+      });
+      await callWebhook(
+        job.data.team_id,
+        job.data.crawl_id,
+        null,
+        job.data.webhook,
+        true,
+        "crawl.started",
+      );
+    }
+
+    return { success: true }
+  } catch (error) {
+    logger.error("An error occurred!", { error })
+    return { success: false, error };
+  }
+}
+
 async function processJob(job: Job & { id: string }, token: string) {
   const logger = _logger.child({
     module: "queue-worker",
(One file's diff is suppressed because it is too large.)
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp # noqa
 
-__version__ = "1.6.8"
+__version__ = "1.7.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -8,7 +8,7 @@ from datetime import datetime
 
 load_dotenv()
 
-API_URL = "http://127.0.0.1:3002";
+API_URL = os.getenv('API_URL', 'http://127.0.0.1:3002')
 ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
 TEST_API_KEY = os.getenv('TEST_API_KEY')
 
@@ -20,15 +20,26 @@ spec.loader.exec_module(firecrawl)
 FirecrawlApp = firecrawl.FirecrawlApp
 
 def test_no_api_key():
-    with pytest.raises(Exception) as excinfo:
-        invalid_app = FirecrawlApp(api_url=API_URL)
-    assert "No API key provided" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        with pytest.raises(Exception) as excinfo:
+            invalid_app = FirecrawlApp(api_url=API_URL)
+        assert "No API key provided" in str(excinfo.value)
+    else:
+        # Should not raise error for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        assert app is not None
 
 def test_scrape_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Unauthorized: Invalid token" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+        with pytest.raises(Exception) as excinfo:
+            invalid_app.scrape_url('https://firecrawl.dev')
+        assert "Unauthorized: Invalid token" in str(excinfo.value)
+    else:
+        # Should work without API key for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        response = app.scrape_url('https://firecrawl.dev')
+        assert response is not None
 
 # def test_blocklisted_url():
 #     blocklisted_url = "https://facebook.com/fake-test"
@@ -131,10 +142,16 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext
     assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
 
 def test_crawl_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unauthorized: Invalid token" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+        with pytest.raises(Exception) as excinfo:
+            invalid_app.crawl_url('https://firecrawl.dev')
+        assert "Unauthorized: Invalid token" in str(excinfo.value)
+    else:
+        # Should work without API key for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        response = app.crawl_url('https://firecrawl.dev')
+        assert response is not None
 
 # def test_should_return_error_for_blocklisted_url():
 #     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -291,10 +308,16 @@ def test_check_crawl_status_e2e():
     assert 'error' not in status_response['data'][0]['metadata']
 
 def test_invalid_api_key_on_map():
-    invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.map_url('https://roastmywebsite.ai')
-    assert "Unauthorized: Invalid token" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
+        with pytest.raises(Exception) as excinfo:
+            invalid_app.map_url('https://roastmywebsite.ai')
+        assert "Unauthorized: Invalid token" in str(excinfo.value)
+    else:
+        # Should work without API key for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        response = app.map_url('https://roastmywebsite.ai')
+        assert response is not None
 
 # def test_blocklisted_url_on_map():
 #     app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
@@ -349,4 +372,3 @@ def test_search_e2e():
 #     assert isinstance(llm_extraction['is_open_source'], bool)
 
 
-
@@ -40,19 +40,22 @@ class FirecrawlApp:
         error: Optional[str] = None
 
     def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         """
         Initialize the FirecrawlApp instance with API key, API URL.
 
         Args:
             api_key (Optional[str]): API key for authenticating with the Firecrawl API.
             api_url (Optional[str]): Base URL for the Firecrawl API.
         """
         self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
         self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
-        if self.api_key is None:
-            logger.warning("No API key provided")
-            raise ValueError('No API key provided')
-        logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key}")
+
+        # Only require API key when using cloud service
+        if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
+            logger.warning("No API key provided for cloud service")
+            raise ValueError('No API key provided')
+
+        logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
 
     def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
         """
@@ -12,7 +12,8 @@ dependencies = [
     "requests",
     "python-dotenv",
     "websockets",
-    "nest-asyncio"
+    "nest-asyncio",
+    "pydantic>=2.10.3",
 ]
 authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
 maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
@@ -2,4 +2,5 @@ requests
 pytest
 python-dotenv
 websockets
 nest-asyncio
+pydantic
@@ -9,7 +9,7 @@ use crate::crawl::CrawlStatus;
 #[derive(Debug, Deserialize, Serialize, Clone)]
 pub struct FirecrawlAPIError {
     /// Always false.
-    success: bool,
+    pub success: bool,
 
     /// Error message
     pub error: String,
@@ -9,6 +9,7 @@ pub mod map;
 pub mod scrape;
 
 pub use error::FirecrawlError;
+use error::FirecrawlAPIError;
 
 #[derive(Clone, Debug)]
 pub struct FirecrawlApp {
@@ -18,16 +19,30 @@ pub struct FirecrawlApp {
 }
 
 pub(crate) const API_VERSION: &str = "/v1";
+const CLOUD_API_URL: &str = "https://api.firecrawl.dev";
 
 impl FirecrawlApp {
     pub fn new(api_key: impl AsRef<str>) -> Result<Self, FirecrawlError> {
-        FirecrawlApp::new_selfhosted("https://api.firecrawl.dev", Some(api_key))
+        FirecrawlApp::new_selfhosted(CLOUD_API_URL, Some(api_key))
     }
 
     pub fn new_selfhosted(api_url: impl AsRef<str>, api_key: Option<impl AsRef<str>>) -> Result<Self, FirecrawlError> {
+        let url = api_url.as_ref().to_string();
+
+        if url == CLOUD_API_URL && api_key.is_none() {
+            return Err(FirecrawlError::APIError(
+                "Configuration".to_string(),
+                FirecrawlAPIError {
+                    success: false,
+                    error: "API key is required for cloud service".to_string(),
+                    details: None,
+                }
+            ));
+        }
+
         Ok(FirecrawlApp {
             api_key: api_key.map(|x| x.as_ref().to_string()),
-            api_url: api_url.as_ref().to_string(),
+            api_url: url,
             client: Client::new(),
         })
     }
@@ -1,7 +1,7 @@
 use assert_matches::assert_matches;
 use dotenvy::dotenv;
 use firecrawl::scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions};
-use firecrawl::FirecrawlApp;
+use firecrawl::{FirecrawlApp, FirecrawlError};
 use serde_json::json;
 use std::env;
 
@@ -155,3 +155,29 @@ async fn test_llm_extraction() {
     assert!(llm_extraction["supports_sso"].is_boolean());
     assert!(llm_extraction["is_open_source"].is_boolean());
 }
+
+#[test]
+fn test_api_key_requirements() {
+    dotenv().ok();
+
+    let api_url = env::var("API_URL").unwrap_or("http://localhost:3002".to_string());
+    let api_key = env::var("TEST_API_KEY").ok();
+
+    match (api_url.contains("api.firecrawl.dev"), api_key) {
+        (false, _) => {
+            let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);
+            assert!(result.is_ok(), "Local setup failed: {:?}", result.err().unwrap());
+        }
+        (true, None) => {
+            let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);
+            assert!(matches!(
+                result,
+                Err(FirecrawlError::APIError(msg, _)) if msg == "Configuration"
+            ));
+        }
+        (true, Some(key)) => {
+            let result = FirecrawlApp::new_selfhosted(&api_url, Some(&key));
+            assert!(result.is_ok());
+        }
+    }
+}