diff --git a/SELF_HOST.md b/SELF_HOST.md index 46e08db9..e8a3444f 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -116,6 +116,10 @@ If you’d like to test the crawl endpoint, you can run this: This section provides solutions to common issues you might encounter while setting up or running your self-hosted instance of Firecrawl. +### API Keys for SDK Usage + +**Note:** When using Firecrawl SDKs with a self-hosted instance, API keys are optional. API keys are only required when connecting to the cloud service (api.firecrawl.dev). + ### Supabase client is not configured **Symptom:** diff --git a/apps/api/requests.http b/apps/api/requests.http index 0e3b9206..8aa3788d 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -70,8 +70,8 @@ content-type: application/json "urls": ["firecrawl.dev"], "prompt": "What is the title, description and main product of the page?", "schema": { - "title": "string", - "description": "string", - "mainProduct": "string" + "title": { "type": "string" }, + "description": { "type": "string" }, + "mainProduct": { "type": "string" } } -} \ No newline at end of file +} diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index ceeaa436..9659c218 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -177,56 +177,51 @@ export async function crawlController(req: Request, res: Response) { await saveCrawl(id, sc); - const sitemap = sc.crawlerOptions?.ignoreSitemap - ? null - : await crawler.tryGetSitemap(); + const sitemap = sc.crawlerOptions.ignoreSitemap + ? 0 + : await crawler.tryGetSitemap(async urls => { + if (urls.length === 0) return; + + let jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 }); + const jobs = urls.map(url => { + const uuid = uuidv4(); + return { + name: uuid, + data: { + url, + mode: "single_urls", + crawlerOptions, + scrapeOptions, + internalOptions, + team_id, + plan, + origin: req.body.origin ?? defaultOrigin, + crawl_id: id, + sitemapped: true, + }, + opts: { + jobId: uuid, + priority: jobPriority, + }, + }; + }); - if (sitemap !== null && sitemap.length > 0) { - let jobPriority = 20; - // If it is over 1000, we need to get the job priority, - // otherwise we can use the default priority of 20 - if (sitemap.length > 1000) { - // set base to 21 - jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 }); - } - const jobs = sitemap.map((x) => { - const url = x.url; - const uuid = uuidv4(); - return { - name: uuid, - data: { - url, - mode: "single_urls", - crawlerOptions, - scrapeOptions, - internalOptions, - team_id, - plan, - origin: req.body.origin ?? defaultOrigin, - crawl_id: id, - sitemapped: true, - }, - opts: { - jobId: uuid, - priority: jobPriority, - }, - }; - }); + await lockURLs( + id, + sc, + jobs.map((x) => x.data.url), + ); + await addCrawlJobs( + id, + jobs.map((x) => x.opts.jobId), + ); + for (const job of jobs) { + // add with sentry instrumentation + await addScrapeJob(job.data as any, {}, job.opts.jobId); + } + }); - await lockURLs( - id, - sc, - jobs.map((x) => x.data.url), - ); - await addCrawlJobs( - id, - jobs.map((x) => x.opts.jobId), - ); - for (const job of jobs) { - // add with sentry instrumentation - await addScrapeJob(job.data as any, {}, job.opts.jobId); - } - } else { + if (sitemap === 0) { await lockURL(id, sc, url); // Not needed, first one should be 15. 
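To illustrate the SELF_HOST.md note above (API keys are optional when running against a self-hosted instance), here is a minimal sketch of pointing the SDK at a local deployment instead of api.firecrawl.dev. It assumes the Node SDK `@mendable/firecrawl-js`, its `apiUrl` constructor option, and the default self-hosted port 3002 — adjust these to match your deployment; the key is a placeholder, since auth is bypassed on self-host.

```typescript
// Minimal sketch: using the Firecrawl SDK against a self-hosted instance.
// Assumptions: @mendable/firecrawl-js, the `apiUrl` constructor option, and the
// default self-hosted port 3002 -- change these for your own setup.
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({
  apiKey: "fc-self-hosted",         // placeholder; not validated when auth is disabled
  apiUrl: "http://localhost:3002",  // your self-hosted API instead of https://api.firecrawl.dev
});

async function main() {
  // Same call shape as against the cloud API; only the base URL differs.
  const result = await app.scrapeUrl("https://firecrawl.dev", { formats: ["markdown"] });
  console.log(result);
}

main().catch(console.error);
```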
diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index f9462c3d..9ba9bd46 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -113,32 +113,32 @@ export async function crawlPreviewController(req: Request, res: Response) { const crawler = crawlToCrawler(id, sc); const sitemap = sc.crawlerOptions?.ignoreSitemap - ? null - : await crawler.tryGetSitemap(); + ? 0 + : await crawler.tryGetSitemap(async urls => { + for (const url of urls) { + await lockURL(id, sc, url); + const jobId = uuidv4(); + await addScrapeJob( + { + url, + mode: "single_urls", + team_id, + plan: plan!, + crawlerOptions, + scrapeOptions, + internalOptions, + origin: "website-preview", + crawl_id: id, + sitemapped: true, + }, + {}, + jobId, + ); + await addCrawlJob(id, jobId); + } + }); - if (sitemap !== null) { - for (const url of sitemap.map((x) => x.url)) { - await lockURL(id, sc, url); - const jobId = uuidv4(); - await addScrapeJob( - { - url, - mode: "single_urls", - team_id, - plan: plan!, - crawlerOptions, - scrapeOptions, - internalOptions, - origin: "website-preview", - crawl_id: id, - sitemapped: true, - }, - {}, - jobId, - ); - await addCrawlJob(id, jobId); - } - } else { + if (sitemap === 0) { await lockURL(id, sc, url); const jobId = uuidv4(); await addScrapeJob( diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 59db16d8..1aec86c8 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -115,7 +115,7 @@ export async function crawlStatusController( const status: Exclude["status"] = sc.cancelled ? "cancelled" - : validJobStatuses.every((x) => x[1] === "completed") + : (validJobStatuses.every((x) => x[1] === "completed") && validJobStatuses.length > 0) ? "completed" : "scraping"; diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index c2e3369f..a759f448 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -18,7 +18,7 @@ import { } from "../../lib/crawl-redis"; import { logCrawl } from "../../services/logging/crawl_log"; import { getScrapeQueue } from "../../services/queue-service"; -import { addScrapeJob, addScrapeJobs } from "../../services/queue-jobs"; +import { _addScrapeJobToBullMQ, addScrapeJob, addScrapeJobs } from "../../services/queue-jobs"; import { logger as _logger } from "../../lib/logger"; import { getJobPriority } from "../../lib/job-priority"; import { callWebhook } from "../../services/webhook"; @@ -111,113 +111,20 @@ export async function crawlController( await saveCrawl(id, sc); - const sitemap = sc.crawlerOptions.ignoreSitemap - ? 
null - : await crawler.tryGetSitemap(); - - if (sitemap !== null && sitemap.length > 0) { - logger.debug("Using sitemap of length " + sitemap.length, { - sitemapLength: sitemap.length, - }); - let jobPriority = 20; - // If it is over 1000, we need to get the job priority, - // otherwise we can use the default priority of 20 - if (sitemap.length > 1000) { - // set base to 21 - jobPriority = await getJobPriority({ - plan: req.auth.plan, - team_id: req.auth.team_id, - basePriority: 21, - }); - } - logger.debug("Using job priority " + jobPriority, { jobPriority }); - - const jobs = sitemap.map((x) => { - const url = x.url; - const uuid = uuidv4(); - return { - name: uuid, - data: { - url, - mode: "single_urls" as const, - team_id: req.auth.team_id, - plan: req.auth.plan!, - crawlerOptions, - scrapeOptions, - internalOptions: sc.internalOptions, - origin: "api", - crawl_id: id, - sitemapped: true, - webhook: req.body.webhook, - v1: true, - }, - opts: { - jobId: uuid, - priority: 20, - }, - }; - }); - - logger.debug("Locking URLs..."); - await lockURLs( - id, - sc, - jobs.map((x) => x.data.url), - ); - logger.debug("Adding scrape jobs to Redis..."); - await addCrawlJobs( - id, - jobs.map((x) => x.opts.jobId), - ); - logger.debug("Adding scrape jobs to BullMQ..."); - await addScrapeJobs(jobs); - } else { - logger.debug("Sitemap not found or ignored.", { - ignoreSitemap: sc.crawlerOptions.ignoreSitemap, - }); - - logger.debug("Locking URL..."); - await lockURL(id, sc, req.body.url); - const jobId = uuidv4(); - logger.debug("Adding scrape job to Redis...", { jobId }); - await addScrapeJob( - { - url: req.body.url, - mode: "single_urls", - team_id: req.auth.team_id, - crawlerOptions, - scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions), - internalOptions: sc.internalOptions, - plan: req.auth.plan!, - origin: "api", - crawl_id: id, - webhook: req.body.webhook, - v1: true, - }, - { - priority: 15, - }, - jobId, - ); - logger.debug("Adding scrape job to BullMQ...", { jobId }); - await addCrawlJob(id, jobId); - } - logger.debug("Done queueing jobs!"); - - if (req.body.webhook) { - logger.debug("Calling webhook with crawl.started...", { - webhook: req.body.webhook, - }); - await callWebhook( - req.auth.team_id, - id, - null, - req.body.webhook, - true, - "crawl.started", - ); - } - + await _addScrapeJobToBullMQ({ + url: req.body.url, + mode: "kickoff" as const, + team_id: req.auth.team_id, + plan: req.auth.plan, + crawlerOptions, + scrapeOptions: sc.scrapeOptions, + internalOptions: sc.internalOptions, + origin: "api", + crawl_id: id, + webhook: req.body.webhook, + v1: true, + }, {}, crypto.randomUUID(), 10); + const protocol = process.env.ENV === "local" ? 
req.protocol : "https"; return res.status(200).json({ diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 27a926fc..3274dd93 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -86,11 +86,12 @@ export async function getMapResults({ // If sitemapOnly is true, only get links from sitemap if (crawlerOptions.sitemapOnly) { - const sitemap = await crawler.tryGetSitemap(true, true); - if (sitemap !== null) { - sitemap.forEach((x) => { - links.push(x.url); + const sitemap = await crawler.tryGetSitemap(urls => { + urls.forEach((x) => { + links.push(x); }); + }, true, true); + if (sitemap > 0) { links = links .slice(1) .map((x) => { @@ -143,8 +144,10 @@ export async function getMapResults({ } // Parallelize sitemap fetch with serper search - const [sitemap, ...searchResults] = await Promise.all([ - ignoreSitemap ? null : crawler.tryGetSitemap(true), + const [_, ...searchResults] = await Promise.all([ + ignoreSitemap ? null : crawler.tryGetSitemap(urls => { + links.push(...urls); + }, true), ...(cachedResult ? [] : pagePromises), ]); @@ -152,12 +155,6 @@ export async function getMapResults({ allResults = searchResults; } - if (sitemap !== null) { - sitemap.forEach((x) => { - links.push(x.url); - }); - } - mapResults = allResults .flat() .filter((result) => result !== null && result !== undefined); diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts index a585fe0a..bec3d4d1 100644 --- a/apps/api/src/lib/withAuth.ts +++ b/apps/api/src/lib/withAuth.ts @@ -17,7 +17,7 @@ export function withAuth( logger.warn("You're bypassing authentication"); warningCount++; } - return { success: true } as T; + return { success: true, ...(mockSuccess || {}) } as T; } else { return await originalFunction(...args); } diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 2e47d352..52bea9e5 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -4,9 +4,10 @@ import { URL } from "url"; import { getLinksFromSitemap } from "./sitemap"; import robotsParser from "robots-parser"; import { getURLDepth } from "./utils/maxDepthUtils"; -import { axiosTimeout } from "../../../src/lib/timeout"; -import { logger as _logger } from "../../../src/lib/logger"; +import { axiosTimeout } from "../../lib/timeout"; +import { logger as _logger } from "../../lib/logger"; import https from "https"; +import { redisConnection } from "../../services/queue-service"; export class WebCrawler { private jobId: string; private initialUrl: string; @@ -198,26 +199,60 @@ export class WebCrawler { } public async tryGetSitemap( + urlsHandler: (urls: string[]) => unknown, fromMap: boolean = false, onlySitemap: boolean = false, - ): Promise<{ url: string; html: string }[] | null> { + ): Promise { this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, { method: "tryGetSitemap", }); - const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); - if (fromMap && onlySitemap) { - return sitemapLinks.map((link) => ({ url: link, html: "" })); + let leftOfLimit = this.limit; + + const normalizeUrl = (url: string) => { + url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); + if (url.endsWith("/")) { + url = url.slice(0, -1); + } + return url; + }; + + const _urlsHandler = async (urls: string[]) => { + let uniqueURLs: string[] = []; + for (const url of urls) { + if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", 
normalizeUrl(url))) { + uniqueURLs.push(url); + } + } + + await redisConnection.expire("sitemap:" + this.jobId + ":links", 3600, "NX"); + if (uniqueURLs.length > 0) { + urlsHandler(uniqueURLs); + } + }; + + let count = await this.tryFetchSitemapLinks(this.initialUrl, (urls: string[]) => { + if (fromMap && onlySitemap) { + return urlsHandler(urls); + } else { + let filteredLinks = this.filterLinks( + [...new Set(urls)], + leftOfLimit, + this.maxCrawledDepth, + fromMap, + ); + leftOfLimit -= filteredLinks.length; + return _urlsHandler(filteredLinks); + } + }); + + if (count > 0) { + if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(this.initialUrl))) { + urlsHandler([this.initialUrl]); + } + count++; } - if (sitemapLinks.length > 0) { - let filteredLinks = this.filterLinks( - [...new Set(sitemapLinks)], - this.limit, - this.maxCrawledDepth, - fromMap, - ); - return filteredLinks.map((link) => ({ url: link, html: "" })); - } - return null; + + return count; } public filterURL(href: string, url: string): string | null { @@ -436,54 +471,74 @@ export class WebCrawler { return socialMediaOrEmail.some((ext) => url.includes(ext)); } - private async tryFetchSitemapLinks(url: string): Promise { - const normalizeUrl = (url: string) => { - url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); - if (url.endsWith("/")) { - url = url.slice(0, -1); - } - return url; - }; - + private async tryFetchSitemapLinks(url: string, urlsHandler: (urls: string[]) => unknown): Promise { const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`; - let sitemapLinks: string[] = []; + let sitemapCount: number = 0; + // Try to get sitemap from the provided URL first try { - const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); - if (response.status === 200) { - sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger); - } + sitemapCount = await getLinksFromSitemap( + { sitemapUrl, urlsHandler, mode: "fire-engine" }, + this.logger, + ); } catch (error) { this.logger.debug( - `Failed to fetch sitemap with axios from ${sitemapUrl}`, + `Failed to fetch sitemap from ${sitemapUrl}`, { method: "tryFetchSitemapLinks", sitemapUrl, error }, ); - if (error instanceof AxiosError && error.response?.status === 404) { - // ignore 404 - } else { - const response = await getLinksFromSitemap( - { sitemapUrl, mode: "fire-engine" }, - this.logger, - ); - if (response) { - sitemapLinks = response; - } - } } - if (sitemapLinks.length === 0) { - const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; - try { - const response = await axios.get(baseUrlSitemap, { - timeout: axiosTimeout, - }); - if (response.status === 200) { - sitemapLinks = await getLinksFromSitemap( - { sitemapUrl: baseUrlSitemap, mode: "fire-engine" }, + // If this is a subdomain, also try to get sitemap from the main domain + try { + const urlObj = new URL(url); + const hostname = urlObj.hostname; + const domainParts = hostname.split('.'); + + // Check if this is a subdomain (has more than 2 parts and not www) + if (domainParts.length > 2 && domainParts[0] !== 'www') { + // Get the main domain by taking the last two parts + const mainDomain = domainParts.slice(-2).join('.'); + const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`; + const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`; + + try { + // Get all links from the main domain's sitemap + sitemapCount += await getLinksFromSitemap( + { sitemapUrl: mainDomainSitemapUrl, urlsHandler(urls) { + urlsHandler(urls.filter(link => { + try { + 
const linkUrl = new URL(link); + return linkUrl.hostname.endsWith(hostname); + } catch { + } + })) + }, mode: "fire-engine" }, this.logger, ); + } catch (error) { + this.logger.debug( + `Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`, + { method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error }, + ); } + } + } catch (error) { + this.logger.debug(`Error processing main domain sitemap`, { + method: "tryFetchSitemapLinks", + url, + error, + }); + } + + // If no sitemap found yet, try the baseUrl as a last resort + if (sitemapCount === 0) { + const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; + try { + sitemapCount += await getLinksFromSitemap( + { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" }, + this.logger, + ); } catch (error) { this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, { method: "tryFetchSitemapLinks", @@ -493,25 +548,14 @@ export class WebCrawler { if (error instanceof AxiosError && error.response?.status === 404) { // ignore 404 } else { - sitemapLinks = await getLinksFromSitemap( - { sitemapUrl: baseUrlSitemap, mode: "fire-engine" }, + sitemapCount += await getLinksFromSitemap( + { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" }, this.logger, ); } } } - const normalizedUrl = normalizeUrl(url); - const normalizedSitemapLinks = sitemapLinks.map((link) => - normalizeUrl(link), - ); - // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl - if ( - !normalizedSitemapLinks.includes(normalizedUrl) && - sitemapLinks.length > 0 - ) { - sitemapLinks.push(url); - } - return sitemapLinks; + return sitemapCount; } } diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index c080373e..8028d225 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -5,26 +5,25 @@ import { WebCrawler } from "./crawler"; import { scrapeURL } from "../scrapeURL"; import { scrapeOptions } from "../../controllers/v1/types"; import type { Logger } from "winston"; - +const useFireEngine = + process.env.FIRE_ENGINE_BETA_URL !== "" && + process.env.FIRE_ENGINE_BETA_URL !== undefined; export async function getLinksFromSitemap( { sitemapUrl, - allUrls = [], + urlsHandler, mode = "axios", }: { sitemapUrl: string; - allUrls?: string[]; + urlsHandler(urls: string[]): unknown, mode?: "axios" | "fire-engine"; }, logger: Logger, -): Promise { +): Promise { try { let content: string = ""; try { - if (mode === "axios" || process.env.FIRE_ENGINE_BETA_URL === "") { - const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); - content = response.data; - } else if (mode === "fire-engine") { + if (mode === "fire-engine" && useFireEngine) { const response = await scrapeURL( "sitemap", sitemapUrl, @@ -32,9 +31,15 @@ export async function getLinksFromSitemap( { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }, ); if (!response.success) { - throw response.error; + logger.debug("Failed to scrape sitemap via TLSClient, falling back to axios...", { error: response.error }) + const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout }); + content = ar.data; + } else { + content = response.document.rawHtml!; } - content = response.document.rawHtml!; + } else { + const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); + content = response.data; } } catch (error) { logger.error(`Request failed for ${sitemapUrl}`, { @@ -44,33 +49,64 @@ export async function 
getLinksFromSitemap( error, }); - return allUrls; + return 0; } const parsed = await parseStringPromise(content); const root = parsed.urlset || parsed.sitemapindex; + let count = 0; if (root && root.sitemap) { - const sitemapPromises = root.sitemap + // Handle sitemap index files + const sitemapUrls = root.sitemap .filter((sitemap) => sitemap.loc && sitemap.loc.length > 0) - .map((sitemap) => + .map((sitemap) => sitemap.loc[0]); + + const sitemapPromises: Promise[] = sitemapUrls.map((sitemapUrl) => + getLinksFromSitemap( + { sitemapUrl, urlsHandler, mode }, + logger, + ), + ); + + const results = await Promise.all(sitemapPromises); + count = results.reduce((a,x) => a + x) + } else if (root && root.url) { + // Check if any URLs point to additional sitemaps + const xmlSitemaps: string[] = root.url + .filter( + (url) => + url.loc && + url.loc.length > 0 && + url.loc[0].toLowerCase().endsWith('.xml') + ) + .map((url) => url.loc[0]); + + if (xmlSitemaps.length > 0) { + // Recursively fetch links from additional sitemaps + const sitemapPromises = xmlSitemaps.map((sitemapUrl) => getLinksFromSitemap( - { sitemapUrl: sitemap.loc[0], allUrls, mode }, + { sitemapUrl: sitemapUrl, urlsHandler, mode }, logger, ), ); - await Promise.all(sitemapPromises); - } else if (root && root.url) { + count += (await Promise.all(sitemapPromises)).reduce((a,x) => a + x, 0); + } + const validUrls = root.url .filter( (url) => url.loc && url.loc.length > 0 && + !url.loc[0].toLowerCase().endsWith('.xml') && !WebCrawler.prototype.isFile(url.loc[0]), ) .map((url) => url.loc[0]); - allUrls.push(...validUrls); + count += validUrls.length; + urlsHandler(validUrls); } + + return count; } catch (error) { logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, { method: "getLinksFromSitemap", @@ -80,7 +116,7 @@ export async function getLinksFromSitemap( }); } - return allUrls; + return 0; } export const fetchSitemapData = async ( diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index aa869836..aeafebea 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -17,6 +17,7 @@ import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError import * as Sentry from "@sentry/node"; import { Action } from "../../../../lib/entities"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; +import { fireEngineDelete } from "./delete"; // This function does not take `Meta` on purpose. It may not access any // meta values to construct the request -- that must be done by the @@ -44,6 +45,13 @@ async function performFireEngineScrape< while (status === undefined) { if (errors.length >= errorLimit) { logger.error("Error limit hit.", { errors }); + fireEngineDelete( + logger.child({ + method: "performFireEngineScrape/fireEngineDelete", + afterErrors: errors, + }), + scrape.jobId, + ); throw new Error("Error limit hit. 
See e.cause.errors for errors.", { cause: { errors }, }); @@ -74,6 +82,13 @@ async function performFireEngineScrape< error instanceof ActionError || error instanceof UnsupportedFileError ) { + fireEngineDelete( + logger.child({ + method: "performFireEngineScrape/fireEngineDelete", + afterError: error, + }), + scrape.jobId, + ); logger.debug("Fire-engine scrape job failed.", { error, jobId: scrape.jobId, @@ -105,6 +120,13 @@ async function performFireEngineScrape< status.content = Buffer.from(content, "base64").toString("utf8"); // TODO: handle other encodings via Content-Type tag } + fireEngineDelete( + logger.child({ + method: "performFireEngineScrape/fireEngineDelete", + }), + scrape.jobId, + ); + return status; } diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts index 66cf30cc..1f494893 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts @@ -8,6 +8,7 @@ export function extractMetadata( ): Partial { let title: string | undefined = undefined; let description: string | undefined = undefined; + let favicon: string | undefined = undefined; let language: string | undefined = undefined; let keywords: string | undefined = undefined; let robots: string | undefined = undefined; @@ -42,6 +43,12 @@ export function extractMetadata( try { title = soup("title").first().text().trim() || undefined; description = soup('meta[name="description"]').attr("content") || undefined; + + const faviconLink = soup('link[rel="icon"]').attr("href") || soup('link[rel*="icon"]').first().attr("href") || undefined; + if (faviconLink) { + const baseUrl = new URL(meta.url).origin; + favicon = faviconLink.startsWith('http') ? faviconLink : `${baseUrl}${faviconLink}`; + } // Assuming the language is part of the URL as per the regex pattern language = soup("html").attr("lang") || undefined; @@ -121,6 +128,7 @@ export function extractMetadata( return { title, description, + favicon, language, keywords, robots, diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts new file mode 100644 index 00000000..f23f506f --- /dev/null +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts @@ -0,0 +1,33 @@ +import { removeDefaultProperty } from "./llmExtract"; + +describe("removeDefaultProperty", () => { + it("should remove the default property from a simple object", () => { + const input = { default: "test", test: "test" }; + const expectedOutput = { test: "test" }; + expect(removeDefaultProperty(input)).toEqual(expectedOutput); + }); + + it("should remove the default property from a nested object", () => { + const input = { default: "test", nested: { default: "nestedTest", test: "nestedTest" } }; + const expectedOutput = { nested: { test: "nestedTest" } }; + expect(removeDefaultProperty(input)).toEqual(expectedOutput); + }); + + it("should remove the default property from an array of objects", () => { + const input = { array: [{ default: "test1", test: "test1" }, { default: "test2", test: "test2" }] }; + const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] }; + expect(removeDefaultProperty(input)).toEqual(expectedOutput); + }); + + it("should handle objects without a default property", () => { + const input = { test: "test" }; + const expectedOutput = { test: "test" }; + expect(removeDefaultProperty(input)).toEqual(expectedOutput); + }); + + it("should handle null and 
non-object inputs", () => { + expect(removeDefaultProperty(null)).toBeNull(); + expect(removeDefaultProperty("string")).toBe("string"); + expect(removeDefaultProperty(123)).toBe(123); + }); +}); \ No newline at end of file diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 759f87e2..0b4d6e1e 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -121,6 +121,10 @@ export async function generateOpenAICompletions( } let schema = options.schema; + if (schema) { + schema = removeDefaultProperty(schema); +} + if (schema && schema.type === "array") { schema = { type: "object", @@ -134,10 +138,12 @@ export async function generateOpenAICompletions( schema = { type: "object", properties: Object.fromEntries( - Object.entries(schema).map(([key, value]) => [key, { type: value }]), + Object.entries(schema).map(([key, value]) => { + return [key, removeDefaultProperty(value)]; + }) ), required: Object.keys(schema), - additionalProperties: false, + additionalProperties: false }; } @@ -232,3 +238,19 @@ export async function performLLMExtract( return document; } + +export function removeDefaultProperty(schema: any): any { + if (typeof schema !== 'object' || schema === null) return schema; + + const { default: _, ...rest } = schema; + + for (const key in rest) { + if (Array.isArray(rest[key])) { + rest[key] = rest[key].map((item: any) => removeDefaultProperty(item)); + } else if (typeof rest[key] === 'object' && rest[key] !== null) { + rest[key] = removeDefaultProperty(rest[key]); + } + } + + return rest; +} \ No newline at end of file diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 654f6cda..f59babe4 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -29,7 +29,7 @@ async function _addScrapeJobToConcurrencyQueue( }); } -async function _addScrapeJobToBullMQ( +export async function _addScrapeJobToBullMQ( webScraperOptions: any, options: any, jobId: string, @@ -138,7 +138,6 @@ export async function addScrapeJobs( if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) { const now = Date.now(); const limit = await getConcurrencyLimitMax(jobs[0].data.plan); - console.log("CC limit", limit); cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now); countCanBeDirectlyAdded = Math.max( diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 4ef9610d..e8c8bdf3 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -18,16 +18,18 @@ import { v4 as uuidv4 } from "uuid"; import { addCrawlJob, addCrawlJobDone, + addCrawlJobs, crawlToCrawler, finishCrawl, generateURLPermutations, getCrawl, getCrawlJobs, lockURL, + lockURLs, normalizeURL, } from "../lib/crawl-redis"; import { StoredCrawl } from "../lib/crawl-redis"; -import { addScrapeJob } from "./queue-jobs"; +import { addScrapeJob, addScrapeJobs } from "./queue-jobs"; import { addJobPriority, deleteJobPriority, @@ -191,22 +193,34 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => { await addJobPriority(job.data.team_id, job.id); let err = null; try { - const result = await processJob(job, token); - if (result.success) { - try { - if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") { - logger.debug( - "Job succeeded -- has crawl associated, putting null in Redis", - ); + if 
(job.data?.mode === "kickoff") { + const result = await processKickoffJob(job, token); + if (result.success) { + try { await job.moveToCompleted(null, token, false); - } else { - logger.debug("Job succeeded -- putting result in Redis"); - await job.moveToCompleted(result.document, token, false); - } - } catch (e) {} + } catch (e) {} + } else { + logger.debug("Job failed", { result, mode: job.data.mode }); + await job.moveToFailed((result as any).error, token, false); + } } else { - logger.debug("Job failed", { result }); - await job.moveToFailed((result as any).error, token, false); + const result = await processJob(job, token); + if (result.success) { + try { + if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") { + logger.debug( + "Job succeeded -- has crawl associated, putting null in Redis", + ); + await job.moveToCompleted(null, token, false); + } else { + logger.debug("Job succeeded -- putting result in Redis"); + await job.moveToCompleted(result.document, token, false); + } + } catch (e) {} + } else { + logger.debug("Job failed", { result }); + await job.moveToFailed((result as any).error, token, false); + } } } catch (error) { logger.debug("Job failed", { error }); @@ -379,6 +393,130 @@ const workerFun = async ( workerFun(getScrapeQueue(), processJobInternal); +async function processKickoffJob(job: Job & { id: string }, token: string) { + const logger = _logger.child({ + module: "queue-worker", + method: "processKickoffJob", + jobId: job.id, + scrapeId: job.id, + crawlId: job.data?.crawl_id ?? undefined, + teamId: job.data?.team_id ?? undefined, + }); + + try { + const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; + const crawler = crawlToCrawler(job.data.crawl_id, sc); + + const sitemap = sc.crawlerOptions.ignoreSitemap + ? 
0 + : await crawler.tryGetSitemap(async urls => { + if (urls.length === 0) return; + + logger.debug("Using sitemap chunk of length " + urls.length, { + sitemapLength: urls.length, + }); + + let jobPriority = await getJobPriority({ + plan: job.data.plan, + team_id: job.data.team_id, + basePriority: 21, + }); + logger.debug("Using job priority " + jobPriority, { jobPriority }); + + const jobs = urls.map(url => { + const uuid = uuidv4(); + return { + name: uuid, + data: { + url, + mode: "single_urls" as const, + team_id: job.data.team_id, + plan: job.data.plan!, + crawlerOptions: job.data.crawlerOptions, + scrapeOptions: job.data.scrapeOptions, + internalOptions: sc.internalOptions, + origin: job.data.origin, + crawl_id: job.data.crawl_id, + sitemapped: true, + webhook: job.data.webhook, + v1: job.data.v1, + }, + opts: { + jobId: uuid, + priority: 20, + }, + }; + }); + + logger.debug("Locking URLs..."); + await lockURLs( + job.data.crawl_id, + sc, + jobs.map((x) => x.data.url), + ); + logger.debug("Adding scrape jobs to Redis..."); + await addCrawlJobs( + job.data.crawl_id, + jobs.map((x) => x.opts.jobId), + ); + logger.debug("Adding scrape jobs to BullMQ..."); + await addScrapeJobs(jobs); + }); + + if (sitemap === 0) { + logger.debug("Sitemap not found or ignored.", { + ignoreSitemap: sc.crawlerOptions.ignoreSitemap, + }); + + logger.debug("Locking URL..."); + await lockURL(job.data.crawl_id, sc, job.data.url); + const jobId = uuidv4(); + logger.debug("Adding scrape job to Redis...", { jobId }); + await addScrapeJob( + { + url: job.data.url, + mode: "single_urls", + team_id: job.data.team_id, + crawlerOptions: job.data.crawlerOptions, + scrapeOptions: scrapeOptions.parse(job.data.scrapeOptions), + internalOptions: sc.internalOptions, + plan: job.data.plan!, + origin: job.data.origin, + crawl_id: job.data.crawl_id, + webhook: job.data.webhook, + v1: job.data.v1, + }, + { + priority: 15, + }, + jobId, + ); + logger.debug("Adding scrape job to BullMQ...", { jobId }); + await addCrawlJob(job.data.crawl_id, jobId); + } + logger.debug("Done queueing jobs!"); + + if (job.data.webhook) { + logger.debug("Calling webhook with crawl.started...", { + webhook: job.data.webhook, + }); + await callWebhook( + job.data.team_id, + job.data.crawl_id, + null, + job.data.webhook, + true, + "crawl.started", + ); + } + + return { success: true } + } catch (error) { + logger.error("An error occurred!", { error }) + return { success: false, error }; + } +} + async function processJob(job: Job & { id: string }, token: string) { const logger = _logger.child({ module: "queue-worker", diff --git a/apps/api/v1-openapi.json b/apps/api/v1-openapi.json index 6cd2b3da..9aab05c9 100644 --- a/apps/api/v1-openapi.json +++ b/apps/api/v1-openapi.json @@ -42,7 +42,15 @@ "type": "array", "items": { "type": "string", - "enum": ["markdown", "html", "rawHtml", "links", "screenshot", "extract", "screenshot@fullPage"] + "enum": [ + "markdown", + "html", + "rawHtml", + "links", + "screenshot", + "extract", + "screenshot@fullPage" + ] }, "description": "Formats to include in the output.", "default": ["markdown"] @@ -75,6 +83,16 @@ "description": "Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.", "default": 0 }, + "mobile": { + "type": "boolean", + "description": "Set to true if you want to emulate scraping from a mobile device. 
Useful for testing responsive pages and taking mobile screenshots.", + "default": false + }, + "skipTlsVerification": { + "type": "boolean", + "description": "Skip TLS certificate verification when making requests", + "default": false + }, "timeout": { "type": "integer", "description": "Timeout in milliseconds for the request", @@ -116,9 +134,391 @@ "type": "integer", "minimum": 1, "description": "Number of milliseconds to wait" + }, + "selector": { + "type": "string", + "description": "Query selector to find the element by", + "example": "#my-element" } }, - "required": ["type", "milliseconds"] + "required": ["type"] + }, + { + "type": "object", + "title": "Screenshot", + "properties": { + "type": { + "type": "string", + "enum": ["screenshot"], + "description": "Take a screenshot" + }, + "fullPage": { + "type": "boolean", + "description": "Should the screenshot be full-page or viewport sized?", + "default": false + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Click", + "properties": { + "type": { + "type": "string", + "enum": ["click"], + "description": "Click on an element" + }, + "selector": { + "type": "string", + "description": "Query selector to find the element by", + "example": "#load-more-button" + } + }, + "required": ["type", "selector"] + }, + { + "type": "object", + "title": "Write text", + "properties": { + "type": { + "type": "string", + "enum": ["write"], + "description": "Write text into an input field, text area, or contenteditable element. Note: You must first focus the element using a 'click' action before writing. The text will be typed character by character to simulate keyboard input." + }, + "text": { + "type": "string", + "description": "Text to type", + "example": "Hello, world!" + } + }, + "required": ["type", "text"] + }, + { + "type": "object", + "title": "Press a key", + "description": "Press a key on the page. See https://asawicki.info/nosense/doc/devices/keyboard/key_codes.html for key codes.", + "properties": { + "type": { + "type": "string", + "enum": ["press"], + "description": "Press a key on the page" + }, + "key": { + "type": "string", + "description": "Key to press", + "example": "Enter" + } + }, + "required": ["type", "key"] + }, + { + "type": "object", + "title": "Scroll", + "properties": { + "type": { + "type": "string", + "enum": ["scroll"], + "description": "Scroll the page or a specific element" + }, + "direction": { + "type": "string", + "enum": ["up", "down"], + "description": "Direction to scroll", + "default": "down" + }, + "selector": { + "type": "string", + "description": "Query selector for the element to scroll", + "example": "#my-element" + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Scrape", + "properties": { + "type": { + "type": "string", + "enum": ["scrape"], + "description": "Scrape the current page content, returns the url and the html." + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Execute JavaScript", + "properties": { + "type": { + "type": "string", + "enum": ["executeJavascript"], + "description": "Execute JavaScript code on the page" + }, + "script": { + "type": "string", + "description": "JavaScript code to execute", + "example": "document.querySelector('.button').click();" + } + }, + "required": ["type", "script"] + } + ] + } + }, + "location": { + "type": "object", + "description": "Location settings for the request. When specified, this will use an appropriate proxy if available and emulate the corresponding language and timezone settings. 
Defaults to 'US' if not specified.", + "properties": { + "country": { + "type": "string", + "description": "ISO 3166-1 alpha-2 country code (e.g., 'US', 'AU', 'DE', 'JP')", + "pattern": "^[A-Z]{2}$", + "default": "US" + }, + "languages": { + "type": "array", + "description": "Preferred languages and locales for the request in order of priority. Defaults to the language of the specified location. See https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language", + "items": { + "type": "string", + "example": "en-US" + } + } + } + }, + "removeBase64Images": { + "type": "boolean", + "description": "Removes all base 64 images from the output, which may be overwhelmingly long. The image's alt text remains in the output, but the URL is replaced with a placeholder." + } + }, + "required": ["url"] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScrapeResponse" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, + "/batch/scrape": { + "post": { + "summary": "Scrape multiple URLs and optionally extract information using an LLM", + "operationId": "scrapeAndExtractFromUrls", + "tags": ["Scraping"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "urls": { + "type": "array", + "items": { + "type": "string", + "format": "uri", + "description": "The URL to scrape" + } + }, + "webhook": { + "oneOf": [ + { + "type": "string", + "description": "The URL to send the webhook to. This will trigger for batch scrape started (batch_scrape.started), every page scraped (batch_scrape.page) and when the batch scrape is completed (batch_scrape.completed or batch_scrape.failed). The response will be the same as the `/scrape` endpoint." + }, + { + "type": "object", + "description": "A complex webhook specification object.", + "properties": { + "url": { + "type": "string", + "description": "The URL to send the webhook to. This will trigger for batch scrape started (batch_scrape.started), every page scraped (batch_scrape.page) and when the batch scrape is completed (batch_scrape.completed or batch_scrape.failed). The response will be the same as the `/scrape` endpoint." 
+ }, + "headers": { + "type": "object", + "description": "Headers to send to the webhook URL.", + "additionalProperties": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "description": "Custom metadata that will be included in all webhook payloads for this crawl", + "additionalProperties": true + } + }, + "required": ["url"] + } + ] + }, + "formats": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "markdown", + "html", + "rawHtml", + "links", + "screenshot", + "extract", + "screenshot@fullPage" + ] + }, + "description": "Formats to include in the output.", + "default": ["markdown"] + }, + "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": true + }, + "includeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags to include in the output." + }, + "excludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags to exclude from the output." + }, + "headers": { + "type": "object", + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." + }, + "waitFor": { + "type": "integer", + "description": "Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.", + "default": 0 + }, + "mobile": { + "type": "boolean", + "description": "Set to true if you want to emulate scraping from a mobile device. Useful for testing responsive pages and taking mobile screenshots.", + "default": false + }, + "skipTlsVerification": { + "type": "boolean", + "description": "Skip TLS certificate verification when making requests", + "default": false + }, + "timeout": { + "type": "integer", + "description": "Timeout in milliseconds for the request", + "default": 30000 + }, + "extract": { + "type": "object", + "description": "Extract object", + "properties": { + "schema": { + "type": "object", + "description": "The schema to use for the extraction (Optional)" + }, + "systemPrompt": { + "type": "string", + "description": "The system prompt to use for the extraction (Optional)" + }, + "prompt": { + "type": "string", + "description": "The prompt to use for the extraction without a schema (Optional)" + } + } + }, + "actions": { + "type": "array", + "description": "Actions to perform on the page before grabbing the content", + "items": { + "oneOf": [ + { + "type": "object", + "title": "Wait", + "properties": { + "type": { + "type": "string", + "enum": ["wait"], + "description": "Wait for a specified amount of milliseconds" + }, + "milliseconds": { + "type": "integer", + "minimum": 1, + "description": "Number of milliseconds to wait" + }, + "selector": { + "type": "string", + "description": "Query selector to find the element by", + "example": "#my-element" + } + }, + "required": ["type"] }, { "type": "object", @@ -201,23 +601,82 @@ "type": { "type": "string", "enum": ["scroll"], - "description": "Scroll the page" + "description": "Scroll the page or a specific element" }, "direction": { "type": "string", "enum": ["up", "down"], - "description": "Direction to scroll" + "description": "Direction to scroll", + "default": "down" }, - "amount": { - "type": "integer", - "description": "Amount to scroll in pixels", - "minimum": 1 + "selector": { + "type": "string", + "description": "Query selector for the element to scroll", + "example": "#my-element" } }, - "required": ["type", "direction"] - } + "required": ["type"] + }, + { + "type": "object", 
+ "title": "Scrape", + "properties": { + "type": { + "type": "string", + "enum": ["scrape"], + "description": "Scrape the current page content, returns the url and the html." + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Execute JavaScript", + "properties": { + "type": { + "type": "string", + "enum": ["executeJavascript"], + "description": "Execute JavaScript code on the page" + }, + "script": { + "type": "string", + "description": "JavaScript code to execute", + "example": "document.querySelector('.button').click();" + } + }, + "required": ["type", "script"] + } ] } + }, + "location": { + "type": "object", + "description": "Location settings for the request. When specified, this will use an appropriate proxy if available and emulate the corresponding language and timezone settings. Defaults to 'US' if not specified.", + "properties": { + "country": { + "type": "string", + "description": "ISO 3166-1 alpha-2 country code (e.g., 'US', 'AU', 'DE', 'JP')", + "pattern": "^[A-Z]{2}$", + "default": "US" + }, + "languages": { + "type": "array", + "description": "Preferred languages and locales for the request in order of priority. Defaults to the language of the specified location. See https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language", + "items": { + "type": "string", + "example": "en-US" + } + } + } + }, + "removeBase64Images": { + "type": "boolean", + "description": "Removes all base 64 images from the output, which may be overwhelmingly long. The image's alt text remains in the output, but the URL is replaced with a placeholder." + }, + "ignoreInvalidURLs": { + "type": "boolean", + "default": false, + "description": "If invalid URLs are specified in the urls array, they will be ignored. Instead of them failing the entire request, a batch scrape using the remaining valid URLs will be created, and the invalid URLs will be returned in the invalidURLs field of the response." } }, "required": ["url"] @@ -231,7 +690,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ScrapeResponse" + "$ref": "#/components/schemas/BatchScrapeResponseObj" } } } @@ -287,6 +746,154 @@ } } }, + "/batch/scrape/{id}": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the batch scrape job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the status of a batch scrape job", + "operationId": "getBatchScrapeStatus", + "tags": ["Scraping"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BatchScrapeStatusResponseObj" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." 
+ } + } + } + } + } + } + } + }, + "delete": { + "summary": "Cancel a crawl job", + "operationId": "cancelCrawl", + "tags": ["Crawling"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful cancellation", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": true + }, + "message": { + "type": "string", + "example": "Crawl job successfully cancelled." + } + } + } + } + } + }, + "404": { + "description": "Crawl job not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Crawl job not found." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, "/crawl/{id}": { "parameters": [ { @@ -479,12 +1086,12 @@ "ignoreSitemap": { "type": "boolean", "description": "Ignore the website sitemap when crawling", - "default": true + "default": false }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl. Default limit is 10000.", - "default": 10 + "default": 10000 }, "allowBackwardLinks": { "type": "boolean", @@ -497,8 +1104,35 @@ "default": false }, "webhook": { - "type": "string", - "description": "The URL to send the webhook to. This will trigger for crawl started (crawl.started) ,every page crawled (crawl.page) and when the crawl is completed (crawl.completed or crawl.failed). The response will be the same as the `/scrape` endpoint." + "oneOf": [ + { + "type": "string", + "description": "The URL to send the webhook to. This will trigger for crawl started (crawl.started) ,every page crawled (crawl.page) and when the crawl is completed (crawl.completed or crawl.failed). The response will be the same as the `/scrape` endpoint." + }, + { + "type": "object", + "description": "A complex webhook specification object.", + "properties": { + "url": { + "type": "string", + "description": "The URL to send the webhook to. This will trigger for crawl started (crawl.started), every page crawled (crawl.page) and when the crawl is completed (crawl.completed or crawl.failed). The response will be the same as the `/scrape` endpoint." + }, + "headers": { + "type": "object", + "description": "Headers to send to the webhook URL.", + "additionalProperties": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "description": "Custom metadata that will be included in all webhook payloads for this crawl", + "additionalProperties": true + } + }, + "required": ["url"] + } + ] }, "scrapeOptions": { "type": "object", @@ -507,7 +1141,13 @@ "type": "array", "items": { "type": "string", - "enum": ["markdown", "html", "rawHtml", "links", "screenshot"] + "enum": [ + "markdown", + "html", + "rawHtml", + "links", + "screenshot" + ] }, "description": "Formats to include in the output.", "default": ["markdown"] @@ -535,6 +1175,16 @@ "description": "Only return the main content of the page excluding headers, navs, footers, etc.", "default": true }, + "removeBase64Images": { + "type": "boolean", + "description": "Remove base64 encoded images from the output", + "default": true + }, + "mobile": { + "type": "boolean", + "description": "Set to true if you want to emulate scraping from a mobile device. 
Useful for testing responsive pages and taking mobile screenshots.", + "default": false + }, "waitFor": { "type": "integer", "description": "Wait x amount of milliseconds for the page to load to fetch content", @@ -612,106 +1262,110 @@ }, "/map": { "post": { - "summary": "Map multiple URLs based on options", - "operationId": "mapUrls", - "tags": ["Mapping"], - "security": [ - { - "bearerAuth": [] + "summary": "Map multiple URLs based on options", + "operationId": "mapUrls", + "tags": ["Mapping"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The base URL to start crawling from" + }, + "search": { + "type": "string", + "description": "Search query to use for mapping. During the Alpha phase, the 'smart' part of the search functionality is limited to 1000 search results. However, if map finds more results, there is no limit applied." + }, + "ignoreSitemap": { + "type": "boolean", + "description": "Ignore the website sitemap when crawling.", + "default": true + }, + "sitemapOnly": { + "type": "boolean", + "description": "Only return links found in the website sitemap", + "default": false + }, + "includeSubdomains": { + "type": "boolean", + "description": "Include subdomains of the website", + "default": false + }, + "limit": { + "type": "integer", + "description": "Maximum number of links to return", + "default": 5000, + "maximum": 5000 + } + }, + "required": ["url"] + } } - ], - "requestBody": { - "required": true, + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MapResponse" + } + } + } + }, + "402": { + "description": "Payment required", "content": { "application/json": { "schema": { "type": "object", "properties": { - "url": { + "error": { "type": "string", - "format": "uri", - "description": "The base URL to start crawling from" - }, - "search": { - "type": "string", - "description": "Search query to use for mapping. During the Alpha phase, the 'smart' part of the search functionality is limited to 1000 search results. However, if map finds more results, there is no limit applied." - }, - "ignoreSitemap": { - "type": "boolean", - "description": "Ignore the website sitemap when crawling", - "default": true - }, - "includeSubdomains": { - "type": "boolean", - "description": "Include subdomains of the website", - "default": false - }, - "limit": { - "type": "integer", - "description": "Maximum number of links to return", - "default": 5000, - "maximum": 5000 + "example": "Payment required to access this resource." } - }, - "required": ["url"] + } } } } }, - "responses": { - "200": { - "description": "Successful response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/MapResponse" - } - } - } - }, - "402": { - "description": "Payment required", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "error": { - "type": "string", - "example": "Payment required to access this resource." - } + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." 
} } } } - }, - "429": { - "description": "Too many requests", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "error": { - "type": "string", - "example": "Request rate limit exceeded. Please wait and try again later." - } - } - } - } - } - }, - "500": { - "description": "Server error", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "error": { - "type": "string", - "example": "An unexpected error occurred on the server." - } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." } } } @@ -720,6 +1374,183 @@ } } } + }, + "/extract": { + "post": { + "summary": "Extract structured data from pages using LLMs", + "operationId": "extractData", + "tags": ["Extraction"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "urls": { + "type": "array", + "items": { + "type": "string", + "format": "uri", + "description": "The URLs to extract data from. URLs should be in glob format." + } + }, + "prompt": { + "type": "string", + "description": "Prompt to guide the extraction process" + }, + "schema": { + "type": "object", + "description": "Schema to define the structure of the extracted data", + "properties": { + "property1": { + "type": "string", + "description": "Description of property1" + }, + "property2": { + "type": "integer", + "description": "Description of property2" + } + }, + "required": ["property1", "property2"] + } + }, + "required": ["urls", "prompt"] + } + } + } + }, + "responses": { + "200": { + "description": "Successful extraction", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExtractResponse" + } + } + } + }, + "400": { + "description": "Invalid request", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Invalid input data." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." 
+ } + } + } + } + } + } + } + } + }, + "/team/credit-usage": { + "get": { + "summary": "Get remaining credits for the authenticated team", + "operationId": "getCreditUsage", + "tags": ["Billing"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": true + }, + "data": { + "type": "object", + "properties": { + "remaining_credits": { + "type": "number", + "description": "Number of credits remaining for the team", + "example": 1000 + } + } + } + } + } + } + } + }, + "404": { + "description": "Credit usage information not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Could not find credit usage information" + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Internal server error while fetching credit usage" + } + } + } + } + } + } + } + } + } }, "components": { "securitySchemes": { @@ -777,7 +1608,7 @@ } } } - }, + }, "metadata": { "type": "object", "properties": { @@ -807,7 +1638,6 @@ "nullable": true, "description": "The error message of the page" } - } }, "llm_extraction": { @@ -920,6 +1750,102 @@ } } }, + "BatchScrapeStatusResponseObj": { + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "The current status of the batch scrape. Can be `scraping`, `completed`, or `failed`." + }, + "total": { + "type": "integer", + "description": "The total number of pages that were attempted to be scraped." + }, + "completed": { + "type": "integer", + "description": "The number of pages that have been successfully scraped." + }, + "creditsUsed": { + "type": "integer", + "description": "The number of credits used for the batch scrape." + }, + "expiresAt": { + "type": "string", + "format": "date-time", + "description": "The date and time when the batch scrape will expire." + }, + "next": { + "type": "string", + "nullable": true, + "description": "The URL to retrieve the next 10MB of data. Returned if the batch scrape is not completed or if the response is larger than 10MB." 
+ }, + "data": { + "type": "array", + "description": "The data of the batch scrape.", + "items": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML version of the content on page if `includeHtml` is true" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeRawHtml` is true" + }, + "links": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of links on the page if `includeLinks` is true" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "Screenshot of the page if `includeScreenshot` is true" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + }, + " ": { + "type": "string" + }, + "statusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "error": { + "type": "string", + "nullable": true, + "description": "The error message of the page" + } + } + } + } + } + } + } + }, "CrawlResponse": { "type": "object", "properties": { @@ -935,6 +1861,29 @@ } } }, + "BatchScrapeResponseObj": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "id": { + "type": "string" + }, + "url": { + "type": "string", + "format": "uri" + }, + "invalidURLs": { + "type": "array", + "nullable": true, + "items": { + "type": "string" + }, + "description": "If ignoreInvalidURLs is true, this is an array containing the invalid URLs that were specified in the request. If there were no invalid URLs, this will be an empty array. If ignoreInvalidURLs is false, this field will be undefined." 
+ } + } + }, "MapResponse": { "type": "object", "properties": { @@ -948,6 +1897,25 @@ } } } + }, + "ExtractResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "object", + "properties": { + "": { + "type": "string" + }, + "": { + "type": "number" + } + } + } + } } } }, @@ -956,4 +1924,4 @@ "bearerAuth": [] } ] -} \ No newline at end of file +} diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 5f592c2c..352305a4 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.6.8" +__version__ = "1.7.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py index 0ada6c1d..d25d43f3 100644 --- a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py +++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py @@ -8,7 +8,7 @@ from datetime import datetime load_dotenv() -API_URL = "http://127.0.0.1:3002"; +API_URL = os.getenv('API_URL', 'http://127.0.0.1:3002') ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py" TEST_API_KEY = os.getenv('TEST_API_KEY') @@ -20,15 +20,26 @@ spec.loader.exec_module(firecrawl) FirecrawlApp = firecrawl.FirecrawlApp def test_no_api_key(): - with pytest.raises(Exception) as excinfo: - invalid_app = FirecrawlApp(api_url=API_URL) - assert "No API key provided" in str(excinfo.value) + if 'api.firecrawl.dev' in API_URL: + with pytest.raises(Exception) as excinfo: + invalid_app = FirecrawlApp(api_url=API_URL) + assert "No API key provided" in str(excinfo.value) + else: + # Should not raise error for self-hosted + app = FirecrawlApp(api_url=API_URL) + assert app is not None def test_scrape_url_invalid_api_key(): - invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") - with pytest.raises(Exception) as excinfo: - invalid_app.scrape_url('https://firecrawl.dev') - assert "Unauthorized: Invalid token" in str(excinfo.value) + if 'api.firecrawl.dev' in API_URL: + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + with pytest.raises(Exception) as excinfo: + invalid_app.scrape_url('https://firecrawl.dev') + assert "Unauthorized: Invalid token" in str(excinfo.value) + else: + # Should work without API key for self-hosted + app = FirecrawlApp(api_url=API_URL) + response = app.scrape_url('https://firecrawl.dev') + assert response is not None # def test_blocklisted_url(): # blocklisted_url = "https://facebook.com/fake-test" @@ -131,10 +142,16 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown'] def test_crawl_url_invalid_api_key(): - invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") - with pytest.raises(Exception) as excinfo: - invalid_app.crawl_url('https://firecrawl.dev') - assert "Unauthorized: Invalid token" in str(excinfo.value) + if 'api.firecrawl.dev' in API_URL: + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + with pytest.raises(Exception) as excinfo: + invalid_app.crawl_url('https://firecrawl.dev') + assert "Unauthorized: Invalid token" in str(excinfo.value) + else: + # Should work without API key for self-hosted + app = 
FirecrawlApp(api_url=API_URL) + response = app.crawl_url('https://firecrawl.dev') + assert response is not None # def test_should_return_error_for_blocklisted_url(): # app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) @@ -291,10 +308,16 @@ def test_check_crawl_status_e2e(): assert 'error' not in status_response['data'][0]['metadata'] def test_invalid_api_key_on_map(): - invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL) - with pytest.raises(Exception) as excinfo: - invalid_app.map_url('https://roastmywebsite.ai') - assert "Unauthorized: Invalid token" in str(excinfo.value) + if 'api.firecrawl.dev' in API_URL: + invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL) + with pytest.raises(Exception) as excinfo: + invalid_app.map_url('https://roastmywebsite.ai') + assert "Unauthorized: Invalid token" in str(excinfo.value) + else: + # Should work without API key for self-hosted + app = FirecrawlApp(api_url=API_URL) + response = app.map_url('https://roastmywebsite.ai') + assert response is not None # def test_blocklisted_url_on_map(): # app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) @@ -349,4 +372,3 @@ def test_search_e2e(): # assert isinstance(llm_extraction['is_open_source'], bool) - \ No newline at end of file diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index e4ac2726..0181db90 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -40,19 +40,22 @@ class FirecrawlApp: error: Optional[str] = None def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: - """ - Initialize the FirecrawlApp instance with API key, API URL. + """ + Initialize the FirecrawlApp instance with API key, API URL. - Args: - api_key (Optional[str]): API key for authenticating with the Firecrawl API. - api_url (Optional[str]): Base URL for the Firecrawl API. - """ - self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') - self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') - if self.api_key is None: - logger.warning("No API key provided") - raise ValueError('No API key provided') - logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key}") + Args: + api_key (Optional[str]): API key for authenticating with the Firecrawl API. + api_url (Optional[str]): Base URL for the Firecrawl API. 
+ """ + self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') + + # Only require API key when using cloud service + if 'api.firecrawl.dev' in self.api_url and self.api_key is None: + logger.warning("No API key provided for cloud service") + raise ValueError('No API key provided') + + logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}") def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: """ diff --git a/apps/python-sdk/pyproject.toml b/apps/python-sdk/pyproject.toml index 87cb91f1..67082d5e 100644 --- a/apps/python-sdk/pyproject.toml +++ b/apps/python-sdk/pyproject.toml @@ -12,7 +12,8 @@ dependencies = [ "requests", "python-dotenv", "websockets", - "nest-asyncio" + "nest-asyncio", + "pydantic>=2.10.3", ] authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}] maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}] diff --git a/apps/python-sdk/requirements.txt b/apps/python-sdk/requirements.txt index db67ceeb..5dcd8f6c 100644 --- a/apps/python-sdk/requirements.txt +++ b/apps/python-sdk/requirements.txt @@ -2,4 +2,5 @@ requests pytest python-dotenv websockets -nest-asyncio \ No newline at end of file +nest-asyncio +pydantic \ No newline at end of file diff --git a/apps/rust-sdk/src/error.rs b/apps/rust-sdk/src/error.rs index f04a286a..33e4edc6 100644 --- a/apps/rust-sdk/src/error.rs +++ b/apps/rust-sdk/src/error.rs @@ -9,7 +9,7 @@ use crate::crawl::CrawlStatus; #[derive(Debug, Deserialize, Serialize, Clone)] pub struct FirecrawlAPIError { /// Always false. - success: bool, + pub success: bool, /// Error message pub error: String, diff --git a/apps/rust-sdk/src/lib.rs b/apps/rust-sdk/src/lib.rs index 38c2dc11..5d95cc7d 100644 --- a/apps/rust-sdk/src/lib.rs +++ b/apps/rust-sdk/src/lib.rs @@ -9,6 +9,7 @@ pub mod map; pub mod scrape; pub use error::FirecrawlError; +use error::FirecrawlAPIError; #[derive(Clone, Debug)] pub struct FirecrawlApp { @@ -18,16 +19,30 @@ pub struct FirecrawlApp { } pub(crate) const API_VERSION: &str = "/v1"; +const CLOUD_API_URL: &str = "https://api.firecrawl.dev"; impl FirecrawlApp { pub fn new(api_key: impl AsRef) -> Result { - FirecrawlApp::new_selfhosted("https://api.firecrawl.dev", Some(api_key)) + FirecrawlApp::new_selfhosted(CLOUD_API_URL, Some(api_key)) } pub fn new_selfhosted(api_url: impl AsRef, api_key: Option>) -> Result { + let url = api_url.as_ref().to_string(); + + if url == CLOUD_API_URL && api_key.is_none() { + return Err(FirecrawlError::APIError( + "Configuration".to_string(), + FirecrawlAPIError { + success: false, + error: "API key is required for cloud service".to_string(), + details: None, + } + )); + } + Ok(FirecrawlApp { api_key: api_key.map(|x| x.as_ref().to_string()), - api_url: api_url.as_ref().to_string(), + api_url: url, client: Client::new(), }) } diff --git a/apps/rust-sdk/tests/e2e_with_auth.rs b/apps/rust-sdk/tests/e2e_with_auth.rs index 92b202cb..00f3e26c 100644 --- a/apps/rust-sdk/tests/e2e_with_auth.rs +++ b/apps/rust-sdk/tests/e2e_with_auth.rs @@ -1,7 +1,7 @@ use assert_matches::assert_matches; use dotenvy::dotenv; use firecrawl::scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions}; -use firecrawl::FirecrawlApp; +use firecrawl::{FirecrawlApp, FirecrawlError}; use serde_json::json; use std::env; @@ -155,3 +155,29 @@ async fn test_llm_extraction() { assert!(llm_extraction["supports_sso"].is_boolean()); assert!(llm_extraction["is_open_source"].is_boolean()); } + +#[test] +fn 
test_api_key_requirements() {
+    dotenv().ok();
+
+    let api_url = env::var("API_URL").unwrap_or("http://localhost:3002".to_string());
+    let api_key = env::var("TEST_API_KEY").ok();
+
+    match (api_url.contains("api.firecrawl.dev"), api_key) {
+        (false, _) => {
+            let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);
+            assert!(result.is_ok(), "Local setup failed: {:?}", result.err().unwrap());
+        }
+        (true, None) => {
+            let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);
+            assert!(matches!(
+                result,
+                Err(FirecrawlError::APIError(msg, _)) if msg == "Configuration"
+            ));
+        }
+        (true, Some(key)) => {
+            let result = FirecrawlApp::new_selfhosted(&api_url, Some(&key));
+            assert!(result.is_ok());
+        }
+    }
+}
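
The /map changes above add a sitemapOnly flag alongside ignoreSitemap. As a rough illustration of how the endpoint could be exercised (not part of the diff; the base URL, limit value, and use of the requests library are assumptions), a sitemap-only map request against a self-hosted instance might look like this:

import requests

# Hypothetical call to POST /v1/map following the request schema documented above.
# Self-hosted instances do not require an API key, per the SDK changes in this diff.
resp = requests.post(
    "http://localhost:3002/v1/map",
    json={
        "url": "https://firecrawl.dev",
        "ignoreSitemap": False,  # explicitly allow sitemap use (spec default is true)
        "sitemapOnly": True,     # new flag: only return links found in the website sitemap
        "limit": 100,
    },
)
resp.raise_for_status()
print(resp.json())  # expected to match the MapResponse schema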
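
For the new /extract path, the request body combines urls, a prompt, and an optional JSON-schema-style schema. A minimal sketch, again assuming a self-hosted instance; the schema fields title and description are placeholders, not values from the diff:

import requests

# Hypothetical call to POST /v1/extract as described by the spec hunk above.
resp = requests.post(
    "http://localhost:3002/v1/extract",
    json={
        "urls": ["https://firecrawl.dev"],
        "prompt": "What is the title and description of the page?",
        "schema": {
            "type": "object",
            "properties": {
                "title": {"type": "string"},
                "description": {"type": "string"},
            },
            "required": ["title", "description"],
        },
    },
)
print(resp.json())  # expected to match the ExtractResponse schema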
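
The /team/credit-usage path is a simple authenticated GET. A sketch against the cloud API, with a placeholder bearer token:

import requests

# Hypothetical call to GET /v1/team/credit-usage as documented above.
resp = requests.get(
    "https://api.firecrawl.dev/v1/team/credit-usage",
    headers={"Authorization": "Bearer fc-YOUR-API-KEY"},
)
body = resp.json()
if body.get("success"):
    print("Remaining credits:", body["data"]["remaining_credits"])
else:
    print("Error:", body.get("error"))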
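
Finally, the Python and Rust SDK changes make the API key optional for self-hosted instances while still requiring it for api.firecrawl.dev. A minimal Python sketch of that behaviour, assuming FIRECRAWL_API_KEY is not set in the environment:

from firecrawl import FirecrawlApp

# Self-hosted: constructing the client without a key now succeeds.
app = FirecrawlApp(api_url="http://localhost:3002")

# Cloud: omitting the key still raises ValueError("No API key provided").
try:
    FirecrawlApp(api_url="https://api.firecrawl.dev")
except ValueError as err:
    print(err)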