Merge branch 'main' into mog/mineru

Nicolas 2024-12-27 19:53:09 -03:00
commit 5fcf3fa97e
25 changed files with 1681 additions and 440 deletions

@ -116,6 +116,10 @@ If you'd like to test the crawl endpoint, you can run this:
This section provides solutions to common issues you might encounter while setting up or running your self-hosted instance of Firecrawl.
### API Keys for SDK Usage
**Note:** When using the Firecrawl SDKs with a self-hosted instance, an API key is optional; it is only required when connecting to the cloud service (api.firecrawl.dev).
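A minimal sketch of what this looks like with the Python SDK, assuming a self-hosted instance listening on http://localhost:3002 (adjust the URL to your deployment):

```python
from firecrawl import FirecrawlApp

# Self-hosted instance: no API key needed, just point the client at your deployment
app = FirecrawlApp(api_url="http://localhost:3002")

# Cloud service (api.firecrawl.dev): an API key is still required
# app = FirecrawlApp(api_key="your-api-key")

result = app.scrape_url("https://firecrawl.dev")
print(result["markdown"])
```

The same applies to the other SDKs: pass your instance's URL and omit the key when self-hosting.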
### Supabase client is not configured
**Symptom:**

@ -70,8 +70,8 @@ content-type: application/json
"urls": ["firecrawl.dev"],
"prompt": "What is the title, description and main product of the page?",
"schema": {
"title": "string",
"description": "string",
"mainProduct": "string"
"title": { "type": "string" },
"description": { "type": "string" },
"mainProduct": { "type": "string" }
}
}
}

@ -177,56 +177,51 @@ export async function crawlController(req: Request, res: Response) {
await saveCrawl(id, sc);
const sitemap = sc.crawlerOptions?.ignoreSitemap
? null
: await crawler.tryGetSitemap();
const sitemap = sc.crawlerOptions.ignoreSitemap
? 0
: await crawler.tryGetSitemap(async urls => {
if (urls.length === 0) return;
let jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
const jobs = urls.map(url => {
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls",
crawlerOptions,
scrapeOptions,
internalOptions,
team_id,
plan,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id,
sitemapped: true,
},
opts: {
jobId: uuid,
priority: jobPriority,
},
};
});
if (sitemap !== null && sitemap.length > 0) {
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if (sitemap.length > 1000) {
// set base to 21
jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
}
const jobs = sitemap.map((x) => {
const url = x.url;
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls",
crawlerOptions,
scrapeOptions,
internalOptions,
team_id,
plan,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id,
sitemapped: true,
},
opts: {
jobId: uuid,
priority: jobPriority,
},
};
});
await lockURLs(
id,
sc,
jobs.map((x) => x.data.url),
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId),
);
for (const job of jobs) {
// add with sentry instrumentation
await addScrapeJob(job.data as any, {}, job.opts.jobId);
}
});
await lockURLs(
id,
sc,
jobs.map((x) => x.data.url),
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId),
);
for (const job of jobs) {
// add with sentry instrumentation
await addScrapeJob(job.data as any, {}, job.opts.jobId);
}
} else {
if (sitemap === 0) {
await lockURL(id, sc, url);
// Not needed, first one should be 15.

@ -113,32 +113,32 @@ export async function crawlPreviewController(req: Request, res: Response) {
const crawler = crawlToCrawler(id, sc);
const sitemap = sc.crawlerOptions?.ignoreSitemap
? null
: await crawler.tryGetSitemap();
? 0
: await crawler.tryGetSitemap(async urls => {
for (const url of urls) {
await lockURL(id, sc, url);
const jobId = uuidv4();
await addScrapeJob(
{
url,
mode: "single_urls",
team_id,
plan: plan!,
crawlerOptions,
scrapeOptions,
internalOptions,
origin: "website-preview",
crawl_id: id,
sitemapped: true,
},
{},
jobId,
);
await addCrawlJob(id, jobId);
}
});
if (sitemap !== null) {
for (const url of sitemap.map((x) => x.url)) {
await lockURL(id, sc, url);
const jobId = uuidv4();
await addScrapeJob(
{
url,
mode: "single_urls",
team_id,
plan: plan!,
crawlerOptions,
scrapeOptions,
internalOptions,
origin: "website-preview",
crawl_id: id,
sitemapped: true,
},
{},
jobId,
);
await addCrawlJob(id, jobId);
}
} else {
if (sitemap === 0) {
await lockURL(id, sc, url);
const jobId = uuidv4();
await addScrapeJob(

@ -115,7 +115,7 @@ export async function crawlStatusController(
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
sc.cancelled
? "cancelled"
: validJobStatuses.every((x) => x[1] === "completed")
: (validJobStatuses.every((x) => x[1] === "completed") && validJobStatuses.length > 0)
? "completed"
: "scraping";

@ -18,7 +18,7 @@ import {
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
import { _addScrapeJobToBullMQ, addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
import { logger as _logger } from "../../lib/logger";
import { getJobPriority } from "../../lib/job-priority";
import { callWebhook } from "../../services/webhook";
@ -111,113 +111,20 @@ export async function crawlController(
await saveCrawl(id, sc);
const sitemap = sc.crawlerOptions.ignoreSitemap
? null
: await crawler.tryGetSitemap();
if (sitemap !== null && sitemap.length > 0) {
logger.debug("Using sitemap of length " + sitemap.length, {
sitemapLength: sitemap.length,
});
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if (sitemap.length > 1000) {
// set base to 21
jobPriority = await getJobPriority({
plan: req.auth.plan,
team_id: req.auth.team_id,
basePriority: 21,
});
}
logger.debug("Using job priority " + jobPriority, { jobPriority });
const jobs = sitemap.map((x) => {
const url = x.url;
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls" as const,
team_id: req.auth.team_id,
plan: req.auth.plan!,
crawlerOptions,
scrapeOptions,
internalOptions: sc.internalOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
webhook: req.body.webhook,
v1: true,
},
opts: {
jobId: uuid,
priority: 20,
},
};
});
logger.debug("Locking URLs...");
await lockURLs(
id,
sc,
jobs.map((x) => x.data.url),
);
logger.debug("Adding scrape jobs to Redis...");
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId),
);
logger.debug("Adding scrape jobs to BullMQ...");
await addScrapeJobs(jobs);
} else {
logger.debug("Sitemap not found or ignored.", {
ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
});
logger.debug("Locking URL...");
await lockURL(id, sc, req.body.url);
const jobId = uuidv4();
logger.debug("Adding scrape job to Redis...", { jobId });
await addScrapeJob(
{
url: req.body.url,
mode: "single_urls",
team_id: req.auth.team_id,
crawlerOptions,
scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
internalOptions: sc.internalOptions,
plan: req.auth.plan!,
origin: "api",
crawl_id: id,
webhook: req.body.webhook,
v1: true,
},
{
priority: 15,
},
jobId,
);
logger.debug("Adding scrape job to BullMQ...", { jobId });
await addCrawlJob(id, jobId);
}
logger.debug("Done queueing jobs!");
if (req.body.webhook) {
logger.debug("Calling webhook with crawl.started...", {
webhook: req.body.webhook,
});
await callWebhook(
req.auth.team_id,
id,
null,
req.body.webhook,
true,
"crawl.started",
);
}
await _addScrapeJobToBullMQ({
url: req.body.url,
mode: "kickoff" as const,
team_id: req.auth.team_id,
plan: req.auth.plan,
crawlerOptions,
scrapeOptions: sc.scrapeOptions,
internalOptions: sc.internalOptions,
origin: "api",
crawl_id: id,
webhook: req.body.webhook,
v1: true,
}, {}, crypto.randomUUID(), 10);
const protocol = process.env.ENV === "local" ? req.protocol : "https";
return res.status(200).json({

@ -86,11 +86,12 @@ export async function getMapResults({
// If sitemapOnly is true, only get links from sitemap
if (crawlerOptions.sitemapOnly) {
const sitemap = await crawler.tryGetSitemap(true, true);
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
const sitemap = await crawler.tryGetSitemap(urls => {
urls.forEach((x) => {
links.push(x);
});
}, true, true);
if (sitemap > 0) {
links = links
.slice(1)
.map((x) => {
@ -143,8 +144,10 @@ export async function getMapResults({
}
// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
ignoreSitemap ? null : crawler.tryGetSitemap(true),
const [_, ...searchResults] = await Promise.all([
ignoreSitemap ? null : crawler.tryGetSitemap(urls => {
links.push(...urls);
}, true),
...(cachedResult ? [] : pagePromises),
]);
@ -152,12 +155,6 @@ export async function getMapResults({
allResults = searchResults;
}
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
});
}
mapResults = allResults
.flat()
.filter((result) => result !== null && result !== undefined);

@ -17,7 +17,7 @@ export function withAuth<T, U extends any[]>(
logger.warn("You're bypassing authentication");
warningCount++;
}
return { success: true } as T;
return { success: true, ...(mockSuccess || {}) } as T;
} else {
return await originalFunction(...args);
}

@ -4,9 +4,10 @@ import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout";
import { logger as _logger } from "../../../src/lib/logger";
import { axiosTimeout } from "../../lib/timeout";
import { logger as _logger } from "../../lib/logger";
import https from "https";
import { redisConnection } from "../../services/queue-service";
export class WebCrawler {
private jobId: string;
private initialUrl: string;
@ -198,26 +199,60 @@ export class WebCrawler {
}
public async tryGetSitemap(
urlsHandler: (urls: string[]) => unknown,
fromMap: boolean = false,
onlySitemap: boolean = false,
): Promise<{ url: string; html: string }[] | null> {
): Promise<number> {
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
method: "tryGetSitemap",
});
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (fromMap && onlySitemap) {
return sitemapLinks.map((link) => ({ url: link, html: "" }));
let leftOfLimit = this.limit;
const normalizeUrl = (url: string) => {
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
return url;
};
const _urlsHandler = async (urls: string[]) => {
let uniqueURLs: string[] = [];
for (const url of urls) {
if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(url))) {
uniqueURLs.push(url);
}
}
await redisConnection.expire("sitemap:" + this.jobId + ":links", 3600, "NX");
if (uniqueURLs.length > 0) {
urlsHandler(uniqueURLs);
}
};
let count = await this.tryFetchSitemapLinks(this.initialUrl, (urls: string[]) => {
if (fromMap && onlySitemap) {
return urlsHandler(urls);
} else {
let filteredLinks = this.filterLinks(
[...new Set(urls)],
leftOfLimit,
this.maxCrawledDepth,
fromMap,
);
leftOfLimit -= filteredLinks.length;
return _urlsHandler(filteredLinks);
}
});
if (count > 0) {
if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(this.initialUrl))) {
urlsHandler([this.initialUrl]);
}
count++;
}
if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks(
[...new Set(sitemapLinks)],
this.limit,
this.maxCrawledDepth,
fromMap,
);
return filteredLinks.map((link) => ({ url: link, html: "" }));
}
return null;
return count;
}
public filterURL(href: string, url: string): string | null {
@ -436,54 +471,74 @@ export class WebCrawler {
return socialMediaOrEmail.some((ext) => url.includes(ext));
}
private async tryFetchSitemapLinks(url: string): Promise<string[]> {
const normalizeUrl = (url: string) => {
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
return url;
};
private async tryFetchSitemapLinks(url: string, urlsHandler: (urls: string[]) => unknown): Promise<number> {
const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`;
let sitemapLinks: string[] = [];
let sitemapCount: number = 0;
// Try to get sitemap from the provided URL first
try {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger);
}
sitemapCount = await getLinksFromSitemap(
{ sitemapUrl, urlsHandler, mode: "fire-engine" },
this.logger,
);
} catch (error) {
this.logger.debug(
`Failed to fetch sitemap with axios from ${sitemapUrl}`,
`Failed to fetch sitemap from ${sitemapUrl}`,
{ method: "tryFetchSitemapLinks", sitemapUrl, error },
);
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
const response = await getLinksFromSitemap(
{ sitemapUrl, mode: "fire-engine" },
this.logger,
);
if (response) {
sitemapLinks = response;
}
}
}
if (sitemapLinks.length === 0) {
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
try {
const response = await axios.get(baseUrlSitemap, {
timeout: axiosTimeout,
});
if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
// If this is a subdomain, also try to get sitemap from the main domain
try {
const urlObj = new URL(url);
const hostname = urlObj.hostname;
const domainParts = hostname.split('.');
// Check if this is a subdomain (has more than 2 parts and not www)
if (domainParts.length > 2 && domainParts[0] !== 'www') {
// Get the main domain by taking the last two parts
const mainDomain = domainParts.slice(-2).join('.');
const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;
try {
// Get all links from the main domain's sitemap
sitemapCount += await getLinksFromSitemap(
{ sitemapUrl: mainDomainSitemapUrl, urlsHandler(urls) {
urlsHandler(urls.filter(link => {
try {
const linkUrl = new URL(link);
return linkUrl.hostname.endsWith(hostname);
} catch {
}
}))
}, mode: "fire-engine" },
this.logger,
);
} catch (error) {
this.logger.debug(
`Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
{ method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
);
}
}
} catch (error) {
this.logger.debug(`Error processing main domain sitemap`, {
method: "tryFetchSitemapLinks",
url,
error,
});
}
// If no sitemap found yet, try the baseUrl as a last resort
if (sitemapCount === 0) {
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
try {
sitemapCount += await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
this.logger,
);
} catch (error) {
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
method: "tryFetchSitemapLinks",
@ -493,25 +548,14 @@ export class WebCrawler {
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
sitemapLinks = await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
sitemapCount += await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
this.logger,
);
}
}
}
const normalizedUrl = normalizeUrl(url);
const normalizedSitemapLinks = sitemapLinks.map((link) =>
normalizeUrl(link),
);
// has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
if (
!normalizedSitemapLinks.includes(normalizedUrl) &&
sitemapLinks.length > 0
) {
sitemapLinks.push(url);
}
return sitemapLinks;
return sitemapCount;
}
}

@ -5,26 +5,25 @@ import { WebCrawler } from "./crawler";
import { scrapeURL } from "../scrapeURL";
import { scrapeOptions } from "../../controllers/v1/types";
import type { Logger } from "winston";
const useFireEngine =
process.env.FIRE_ENGINE_BETA_URL !== "" &&
process.env.FIRE_ENGINE_BETA_URL !== undefined;
export async function getLinksFromSitemap(
{
sitemapUrl,
allUrls = [],
urlsHandler,
mode = "axios",
}: {
sitemapUrl: string;
allUrls?: string[];
urlsHandler(urls: string[]): unknown,
mode?: "axios" | "fire-engine";
},
logger: Logger,
): Promise<string[]> {
): Promise<number> {
try {
let content: string = "";
try {
if (mode === "axios" || process.env.FIRE_ENGINE_BETA_URL === "") {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
} else if (mode === "fire-engine") {
if (mode === "fire-engine" && useFireEngine) {
const response = await scrapeURL(
"sitemap",
sitemapUrl,
@ -32,9 +31,15 @@ export async function getLinksFromSitemap(
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
);
if (!response.success) {
throw response.error;
logger.debug("Failed to scrape sitemap via TLSClient, falling back to axios...", { error: response.error })
const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = ar.data;
} else {
content = response.document.rawHtml!;
}
content = response.document.rawHtml!;
} else {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
}
} catch (error) {
logger.error(`Request failed for ${sitemapUrl}`, {
@ -44,33 +49,64 @@ export async function getLinksFromSitemap(
error,
});
return allUrls;
return 0;
}
const parsed = await parseStringPromise(content);
const root = parsed.urlset || parsed.sitemapindex;
let count = 0;
if (root && root.sitemap) {
const sitemapPromises = root.sitemap
// Handle sitemap index files
const sitemapUrls = root.sitemap
.filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
.map((sitemap) =>
.map((sitemap) => sitemap.loc[0]);
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
getLinksFromSitemap(
{ sitemapUrl, urlsHandler, mode },
logger,
),
);
const results = await Promise.all(sitemapPromises);
count = results.reduce((a,x) => a + x)
} else if (root && root.url) {
// Check if any URLs point to additional sitemaps
const xmlSitemaps: string[] = root.url
.filter(
(url) =>
url.loc &&
url.loc.length > 0 &&
url.loc[0].toLowerCase().endsWith('.xml')
)
.map((url) => url.loc[0]);
if (xmlSitemaps.length > 0) {
// Recursively fetch links from additional sitemaps
const sitemapPromises = xmlSitemaps.map((sitemapUrl) =>
getLinksFromSitemap(
{ sitemapUrl: sitemap.loc[0], allUrls, mode },
{ sitemapUrl: sitemapUrl, urlsHandler, mode },
logger,
),
);
await Promise.all(sitemapPromises);
} else if (root && root.url) {
count += (await Promise.all(sitemapPromises)).reduce((a,x) => a + x, 0);
}
const validUrls = root.url
.filter(
(url) =>
url.loc &&
url.loc.length > 0 &&
!url.loc[0].toLowerCase().endsWith('.xml') &&
!WebCrawler.prototype.isFile(url.loc[0]),
)
.map((url) => url.loc[0]);
allUrls.push(...validUrls);
count += validUrls.length;
urlsHandler(validUrls);
}
return count;
} catch (error) {
logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, {
method: "getLinksFromSitemap",
@ -80,7 +116,7 @@ export async function getLinksFromSitemap(
});
}
return allUrls;
return 0;
}
export const fetchSitemapData = async (

@ -17,6 +17,7 @@ import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError
import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
import { fireEngineDelete } from "./delete";
// This function does not take `Meta` on purpose. It may not access any
// meta values to construct the request -- that must be done by the
@ -44,6 +45,13 @@ async function performFireEngineScrape<
while (status === undefined) {
if (errors.length >= errorLimit) {
logger.error("Error limit hit.", { errors });
fireEngineDelete(
logger.child({
method: "performFireEngineScrape/fireEngineDelete",
afterErrors: errors,
}),
scrape.jobId,
);
throw new Error("Error limit hit. See e.cause.errors for errors.", {
cause: { errors },
});
@ -74,6 +82,13 @@ async function performFireEngineScrape<
error instanceof ActionError ||
error instanceof UnsupportedFileError
) {
fireEngineDelete(
logger.child({
method: "performFireEngineScrape/fireEngineDelete",
afterError: error,
}),
scrape.jobId,
);
logger.debug("Fire-engine scrape job failed.", {
error,
jobId: scrape.jobId,
@ -105,6 +120,13 @@ async function performFireEngineScrape<
status.content = Buffer.from(content, "base64").toString("utf8"); // TODO: handle other encodings via Content-Type tag
}
fireEngineDelete(
logger.child({
method: "performFireEngineScrape/fireEngineDelete",
}),
scrape.jobId,
);
return status;
}

@ -8,6 +8,7 @@ export function extractMetadata(
): Partial<Document["metadata"]> {
let title: string | undefined = undefined;
let description: string | undefined = undefined;
let favicon: string | undefined = undefined;
let language: string | undefined = undefined;
let keywords: string | undefined = undefined;
let robots: string | undefined = undefined;
@ -42,6 +43,12 @@ export function extractMetadata(
try {
title = soup("title").first().text().trim() || undefined;
description = soup('meta[name="description"]').attr("content") || undefined;
const faviconLink = soup('link[rel="icon"]').attr("href") || soup('link[rel*="icon"]').first().attr("href") || undefined;
if (faviconLink) {
const baseUrl = new URL(meta.url).origin;
favicon = faviconLink.startsWith('http') ? faviconLink : `${baseUrl}${faviconLink}`;
}
// Assuming the language is part of the URL as per the regex pattern
language = soup("html").attr("lang") || undefined;
@ -121,6 +128,7 @@ export function extractMetadata(
return {
title,
description,
favicon,
language,
keywords,
robots,

@ -0,0 +1,33 @@
import { removeDefaultProperty } from "./llmExtract";
describe("removeDefaultProperty", () => {
it("should remove the default property from a simple object", () => {
const input = { default: "test", test: "test" };
const expectedOutput = { test: "test" };
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
});
it("should remove the default property from a nested object", () => {
const input = { default: "test", nested: { default: "nestedTest", test: "nestedTest" } };
const expectedOutput = { nested: { test: "nestedTest" } };
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
});
it("should remove the default property from an array of objects", () => {
const input = { array: [{ default: "test1", test: "test1" }, { default: "test2", test: "test2" }] };
const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
});
it("should handle objects without a default property", () => {
const input = { test: "test" };
const expectedOutput = { test: "test" };
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
});
it("should handle null and non-object inputs", () => {
expect(removeDefaultProperty(null)).toBeNull();
expect(removeDefaultProperty("string")).toBe("string");
expect(removeDefaultProperty(123)).toBe(123);
});
});

@ -121,6 +121,10 @@ export async function generateOpenAICompletions(
}
let schema = options.schema;
if (schema) {
schema = removeDefaultProperty(schema);
}
if (schema && schema.type === "array") {
schema = {
type: "object",
@ -134,10 +138,12 @@ export async function generateOpenAICompletions(
schema = {
type: "object",
properties: Object.fromEntries(
Object.entries(schema).map(([key, value]) => [key, { type: value }]),
Object.entries(schema).map(([key, value]) => {
return [key, removeDefaultProperty(value)];
})
),
required: Object.keys(schema),
additionalProperties: false,
additionalProperties: false
};
}
@ -232,3 +238,19 @@ export async function performLLMExtract(
return document;
}
export function removeDefaultProperty(schema: any): any {
if (typeof schema !== 'object' || schema === null) return schema;
const { default: _, ...rest } = schema;
for (const key in rest) {
if (Array.isArray(rest[key])) {
rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));
} else if (typeof rest[key] === 'object' && rest[key] !== null) {
rest[key] = removeDefaultProperty(rest[key]);
}
}
return rest;
}

@ -29,7 +29,7 @@ async function _addScrapeJobToConcurrencyQueue(
});
}
async function _addScrapeJobToBullMQ(
export async function _addScrapeJobToBullMQ(
webScraperOptions: any,
options: any,
jobId: string,
@ -138,7 +138,6 @@ export async function addScrapeJobs(
if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) {
const now = Date.now();
const limit = await getConcurrencyLimitMax(jobs[0].data.plan);
console.log("CC limit", limit);
cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now);
countCanBeDirectlyAdded = Math.max(

@ -18,16 +18,18 @@ import { v4 as uuidv4 } from "uuid";
import {
addCrawlJob,
addCrawlJobDone,
addCrawlJobs,
crawlToCrawler,
finishCrawl,
generateURLPermutations,
getCrawl,
getCrawlJobs,
lockURL,
lockURLs,
normalizeURL,
} from "../lib/crawl-redis";
import { StoredCrawl } from "../lib/crawl-redis";
import { addScrapeJob } from "./queue-jobs";
import { addScrapeJob, addScrapeJobs } from "./queue-jobs";
import {
addJobPriority,
deleteJobPriority,
@ -191,22 +193,34 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
await addJobPriority(job.data.team_id, job.id);
let err = null;
try {
const result = await processJob(job, token);
if (result.success) {
try {
if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
logger.debug(
"Job succeeded -- has crawl associated, putting null in Redis",
);
if (job.data?.mode === "kickoff") {
const result = await processKickoffJob(job, token);
if (result.success) {
try {
await job.moveToCompleted(null, token, false);
} else {
logger.debug("Job succeeded -- putting result in Redis");
await job.moveToCompleted(result.document, token, false);
}
} catch (e) {}
} catch (e) {}
} else {
logger.debug("Job failed", { result, mode: job.data.mode });
await job.moveToFailed((result as any).error, token, false);
}
} else {
logger.debug("Job failed", { result });
await job.moveToFailed((result as any).error, token, false);
const result = await processJob(job, token);
if (result.success) {
try {
if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
logger.debug(
"Job succeeded -- has crawl associated, putting null in Redis",
);
await job.moveToCompleted(null, token, false);
} else {
logger.debug("Job succeeded -- putting result in Redis");
await job.moveToCompleted(result.document, token, false);
}
} catch (e) {}
} else {
logger.debug("Job failed", { result });
await job.moveToFailed((result as any).error, token, false);
}
}
} catch (error) {
logger.debug("Job failed", { error });
@ -379,6 +393,130 @@ const workerFun = async (
workerFun(getScrapeQueue(), processJobInternal);
async function processKickoffJob(job: Job & { id: string }, token: string) {
const logger = _logger.child({
module: "queue-worker",
method: "processKickoffJob",
jobId: job.id,
scrapeId: job.id,
crawlId: job.data?.crawl_id ?? undefined,
teamId: job.data?.team_id ?? undefined,
});
try {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
const crawler = crawlToCrawler(job.data.crawl_id, sc);
const sitemap = sc.crawlerOptions.ignoreSitemap
? 0
: await crawler.tryGetSitemap(async urls => {
if (urls.length === 0) return;
logger.debug("Using sitemap chunk of length " + urls.length, {
sitemapLength: urls.length,
});
let jobPriority = await getJobPriority({
plan: job.data.plan,
team_id: job.data.team_id,
basePriority: 21,
});
logger.debug("Using job priority " + jobPriority, { jobPriority });
const jobs = urls.map(url => {
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls" as const,
team_id: job.data.team_id,
plan: job.data.plan!,
crawlerOptions: job.data.crawlerOptions,
scrapeOptions: job.data.scrapeOptions,
internalOptions: sc.internalOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
sitemapped: true,
webhook: job.data.webhook,
v1: job.data.v1,
},
opts: {
jobId: uuid,
priority: 20,
},
};
});
logger.debug("Locking URLs...");
await lockURLs(
job.data.crawl_id,
sc,
jobs.map((x) => x.data.url),
);
logger.debug("Adding scrape jobs to Redis...");
await addCrawlJobs(
job.data.crawl_id,
jobs.map((x) => x.opts.jobId),
);
logger.debug("Adding scrape jobs to BullMQ...");
await addScrapeJobs(jobs);
});
if (sitemap === 0) {
logger.debug("Sitemap not found or ignored.", {
ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
});
logger.debug("Locking URL...");
await lockURL(job.data.crawl_id, sc, job.data.url);
const jobId = uuidv4();
logger.debug("Adding scrape job to Redis...", { jobId });
await addScrapeJob(
{
url: job.data.url,
mode: "single_urls",
team_id: job.data.team_id,
crawlerOptions: job.data.crawlerOptions,
scrapeOptions: scrapeOptions.parse(job.data.scrapeOptions),
internalOptions: sc.internalOptions,
plan: job.data.plan!,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
webhook: job.data.webhook,
v1: job.data.v1,
},
{
priority: 15,
},
jobId,
);
logger.debug("Adding scrape job to BullMQ...", { jobId });
await addCrawlJob(job.data.crawl_id, jobId);
}
logger.debug("Done queueing jobs!");
if (job.data.webhook) {
logger.debug("Calling webhook with crawl.started...", {
webhook: job.data.webhook,
});
await callWebhook(
job.data.team_id,
job.data.crawl_id,
null,
job.data.webhook,
true,
"crawl.started",
);
}
return { success: true }
} catch (error) {
logger.error("An error occurred!", { error })
return { success: false, error };
}
}
async function processJob(job: Job & { id: string }, token: string) {
const logger = _logger.child({
module: "queue-worker",

File diff suppressed because it is too large.

@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp # noqa
__version__ = "1.6.8"
__version__ = "1.7.0"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

@ -8,7 +8,7 @@ from datetime import datetime
load_dotenv()
API_URL = "http://127.0.0.1:3002";
API_URL = os.getenv('API_URL', 'http://127.0.0.1:3002')
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
TEST_API_KEY = os.getenv('TEST_API_KEY')
@ -20,15 +20,26 @@ spec.loader.exec_module(firecrawl)
FirecrawlApp = firecrawl.FirecrawlApp
def test_no_api_key():
with pytest.raises(Exception) as excinfo:
invalid_app = FirecrawlApp(api_url=API_URL)
assert "No API key provided" in str(excinfo.value)
if 'api.firecrawl.dev' in API_URL:
with pytest.raises(Exception) as excinfo:
invalid_app = FirecrawlApp(api_url=API_URL)
assert "No API key provided" in str(excinfo.value)
else:
# Should not raise error for self-hosted
app = FirecrawlApp(api_url=API_URL)
assert app is not None
def test_scrape_url_invalid_api_key():
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
with pytest.raises(Exception) as excinfo:
invalid_app.scrape_url('https://firecrawl.dev')
assert "Unauthorized: Invalid token" in str(excinfo.value)
if 'api.firecrawl.dev' in API_URL:
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
with pytest.raises(Exception) as excinfo:
invalid_app.scrape_url('https://firecrawl.dev')
assert "Unauthorized: Invalid token" in str(excinfo.value)
else:
# Should work without API key for self-hosted
app = FirecrawlApp(api_url=API_URL)
response = app.scrape_url('https://firecrawl.dev')
assert response is not None
# def test_blocklisted_url():
# blocklisted_url = "https://facebook.com/fake-test"
@ -131,10 +142,16 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
def test_crawl_url_invalid_api_key():
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
with pytest.raises(Exception) as excinfo:
invalid_app.crawl_url('https://firecrawl.dev')
assert "Unauthorized: Invalid token" in str(excinfo.value)
if 'api.firecrawl.dev' in API_URL:
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
with pytest.raises(Exception) as excinfo:
invalid_app.crawl_url('https://firecrawl.dev')
assert "Unauthorized: Invalid token" in str(excinfo.value)
else:
# Should work without API key for self-hosted
app = FirecrawlApp(api_url=API_URL)
response = app.crawl_url('https://firecrawl.dev')
assert response is not None
# def test_should_return_error_for_blocklisted_url():
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@ -291,10 +308,16 @@ def test_check_crawl_status_e2e():
assert 'error' not in status_response['data'][0]['metadata']
def test_invalid_api_key_on_map():
invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
with pytest.raises(Exception) as excinfo:
invalid_app.map_url('https://roastmywebsite.ai')
assert "Unauthorized: Invalid token" in str(excinfo.value)
if 'api.firecrawl.dev' in API_URL:
invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
with pytest.raises(Exception) as excinfo:
invalid_app.map_url('https://roastmywebsite.ai')
assert "Unauthorized: Invalid token" in str(excinfo.value)
else:
# Should work without API key for self-hosted
app = FirecrawlApp(api_url=API_URL)
response = app.map_url('https://roastmywebsite.ai')
assert response is not None
# def test_blocklisted_url_on_map():
# app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
@ -349,4 +372,3 @@ def test_search_e2e():
# assert isinstance(llm_extraction['is_open_source'], bool)

@ -40,19 +40,22 @@ class FirecrawlApp:
error: Optional[str] = None
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
"""
Initialize the FirecrawlApp instance with API key, API URL.
"""
Initialize the FirecrawlApp instance with API key, API URL.
Args:
api_key (Optional[str]): API key for authenticating with the Firecrawl API.
api_url (Optional[str]): Base URL for the Firecrawl API.
"""
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
if self.api_key is None:
logger.warning("No API key provided")
raise ValueError('No API key provided')
logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key}")
Args:
api_key (Optional[str]): API key for authenticating with the Firecrawl API.
api_url (Optional[str]): Base URL for the Firecrawl API.
"""
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
# Only require API key when using cloud service
if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
logger.warning("No API key provided for cloud service")
raise ValueError('No API key provided')
logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
"""

@ -12,7 +12,8 @@ dependencies = [
"requests",
"python-dotenv",
"websockets",
"nest-asyncio"
"nest-asyncio",
"pydantic>=2.10.3",
]
authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}]

@ -2,4 +2,5 @@ requests
pytest
python-dotenv
websockets
nest-asyncio
nest-asyncio
pydantic

@ -9,7 +9,7 @@ use crate::crawl::CrawlStatus;
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct FirecrawlAPIError {
/// Always false.
success: bool,
pub success: bool,
/// Error message
pub error: String,

@ -9,6 +9,7 @@ pub mod map;
pub mod scrape;
pub use error::FirecrawlError;
use error::FirecrawlAPIError;
#[derive(Clone, Debug)]
pub struct FirecrawlApp {
@ -18,16 +19,30 @@ pub struct FirecrawlApp {
}
pub(crate) const API_VERSION: &str = "/v1";
const CLOUD_API_URL: &str = "https://api.firecrawl.dev";
impl FirecrawlApp {
pub fn new(api_key: impl AsRef<str>) -> Result<Self, FirecrawlError> {
FirecrawlApp::new_selfhosted("https://api.firecrawl.dev", Some(api_key))
FirecrawlApp::new_selfhosted(CLOUD_API_URL, Some(api_key))
}
pub fn new_selfhosted(api_url: impl AsRef<str>, api_key: Option<impl AsRef<str>>) -> Result<Self, FirecrawlError> {
let url = api_url.as_ref().to_string();
if url == CLOUD_API_URL && api_key.is_none() {
return Err(FirecrawlError::APIError(
"Configuration".to_string(),
FirecrawlAPIError {
success: false,
error: "API key is required for cloud service".to_string(),
details: None,
}
));
}
Ok(FirecrawlApp {
api_key: api_key.map(|x| x.as_ref().to_string()),
api_url: api_url.as_ref().to_string(),
api_url: url,
client: Client::new(),
})
}

@ -1,7 +1,7 @@
use assert_matches::assert_matches;
use dotenvy::dotenv;
use firecrawl::scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions};
use firecrawl::FirecrawlApp;
use firecrawl::{FirecrawlApp, FirecrawlError};
use serde_json::json;
use std::env;
@ -155,3 +155,29 @@ async fn test_llm_extraction() {
assert!(llm_extraction["supports_sso"].is_boolean());
assert!(llm_extraction["is_open_source"].is_boolean());
}
#[test]
fn test_api_key_requirements() {
dotenv().ok();
let api_url = env::var("API_URL").unwrap_or("http://localhost:3002".to_string());
let api_key = env::var("TEST_API_KEY").ok();
match (api_url.contains("api.firecrawl.dev"), api_key) {
(false, _) => {
let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);
assert!(result.is_ok(), "Local setup failed: {:?}", result.err().unwrap());
}
(true, None) => {
let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);
assert!(matches!(
result,
Err(FirecrawlError::APIError(msg, _)) if msg == "Configuration"
));
}
(true, Some(key)) => {
let result = FirecrawlApp::new_selfhosted(&api_url, Some(&key));
assert!(result.is_ok());
}
}
}