Merge branch 'main' into fix-sdk/next-in-when-502

Nicolas 2024-12-26 12:53:10 -03:00
commit bcc18e1c07
60 changed files with 1195 additions and 425 deletions


@ -111,6 +111,20 @@ curl -X POST http://localhost:3002/v1/crawl \
}' }'
``` ```
### Alternative: Using Docker Compose
For a simpler setup, you can use Docker Compose to run all services:
1. Prerequisites: Make sure you have Docker and Docker Compose installed
2. Copy the `.env.example` file to `.env` in the `/apps/api/` directory and configure as needed
3. From the root directory, run:
```bash
docker compose up
```
This will start Redis, the API server, and workers automatically in the correct configuration.
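Once the stack is up, a quick way to confirm the API is reachable is a small Node script; this is a minimal sketch, assuming the default port 3002 and that authentication is disabled in your `.env`:
```typescript
// Smoke test against a locally running Firecrawl instance (Node 18+, global fetch).
async function smokeTest() {
  const response = await fetch("http://localhost:3002/v1/crawl", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ url: "https://example.com" }),
  });
  console.log(response.status, await response.json());
}

smokeTest().catch(console.error);
```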
## Tests: ## Tests:
The best way to do this is run the test with `npm run test:local-no-auth` if you'd like to run the tests without authentication. The best way to do this is run the test with `npm run test:local-no-auth` if you'd like to run the tests without authentication.

apps/api/.gitignore vendored

@ -9,3 +9,5 @@ dump.rdb
.rdb .rdb
.sentryclirc .sentryclirc
.env.*


@ -7478,7 +7478,7 @@ snapshots:
extract-zip@2.0.1: extract-zip@2.0.1:
dependencies: dependencies:
debug: 4.3.4 debug: 4.3.5
get-stream: 5.2.0 get-stream: 5.2.0
yauzl: 2.10.0 yauzl: 2.10.0
optionalDependencies: optionalDependencies:
@ -7622,7 +7622,7 @@ snapshots:
dependencies: dependencies:
basic-ftp: 5.0.5 basic-ftp: 5.0.5
data-uri-to-buffer: 6.0.2 data-uri-to-buffer: 6.0.2
debug: 4.3.4 debug: 4.3.5
fs-extra: 11.2.0 fs-extra: 11.2.0
transitivePeerDependencies: transitivePeerDependencies:
- supports-color - supports-color
@ -7723,7 +7723,7 @@ snapshots:
http-proxy-agent@7.0.2: http-proxy-agent@7.0.2:
dependencies: dependencies:
agent-base: 7.1.1 agent-base: 7.1.1
debug: 4.3.4 debug: 4.3.5
transitivePeerDependencies: transitivePeerDependencies:
- supports-color - supports-color
@ -7771,7 +7771,7 @@ snapshots:
https-proxy-agent@7.0.5: https-proxy-agent@7.0.5:
dependencies: dependencies:
agent-base: 7.1.1 agent-base: 7.1.1
debug: 4.3.4 debug: 4.3.5
transitivePeerDependencies: transitivePeerDependencies:
- supports-color - supports-color
@ -8836,7 +8836,7 @@ snapshots:
dependencies: dependencies:
'@tootallnate/quickjs-emscripten': 0.23.0 '@tootallnate/quickjs-emscripten': 0.23.0
agent-base: 7.1.1 agent-base: 7.1.1
debug: 4.3.4 debug: 4.3.5
get-uri: 6.0.3 get-uri: 6.0.3
http-proxy-agent: 7.0.2 http-proxy-agent: 7.0.2
https-proxy-agent: 7.0.5 https-proxy-agent: 7.0.5
@ -9031,7 +9031,7 @@ snapshots:
proxy-agent@6.4.0: proxy-agent@6.4.0:
dependencies: dependencies:
agent-base: 7.1.1 agent-base: 7.1.1
debug: 4.3.4 debug: 4.3.5
http-proxy-agent: 7.0.2 http-proxy-agent: 7.0.2
https-proxy-agent: 7.0.5 https-proxy-agent: 7.0.5
lru-cache: 7.18.3 lru-cache: 7.18.3
@ -9338,7 +9338,7 @@ snapshots:
socks-proxy-agent@8.0.4: socks-proxy-agent@8.0.4:
dependencies: dependencies:
agent-base: 7.1.1 agent-base: 7.1.1
debug: 4.3.4 debug: 4.3.5
socks: 2.8.3 socks: 2.8.3
transitivePeerDependencies: transitivePeerDependencies:
- supports-color - supports-color


@ -1,6 +1,7 @@
import request from "supertest"; import request from "supertest";
import dotenv from "dotenv"; import dotenv from "dotenv";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
dotenv.config(); dotenv.config();
@ -58,9 +59,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send({ url: blocklistedUrl }); .send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403); expect(response.statusCode).toBe(403);
expect(response.body.error).toContain( expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
}); });
// tested on rate limit test // tested on rate limit test
@ -480,9 +479,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send({ url: blocklistedUrl }); .send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403); expect(response.statusCode).toBe(403);
expect(response.body.error).toContain( expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
}); });
it.concurrent( it.concurrent(


@ -1,5 +1,6 @@
import request from "supertest"; import request from "supertest";
import dotenv from "dotenv"; import dotenv from "dotenv";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
const fs = require("fs"); const fs = require("fs");
const path = require("path"); const path = require("path");
@ -61,9 +62,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send({ url: blocklistedUrl }); .send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403); expect(response.statusCode).toBe(403);
expect(response.body.error).toContain( expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
}); });
it("should return a successful response", async () => { it("should return a successful response", async () => {
@ -88,9 +87,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send({ url: blocklistedUrl }); .send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403); expect(response.statusCode).toBe(403);
expect(response.body.error).toContain( expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
}); });
it("should return a successful response", async () => { it("should return a successful response", async () => {
@ -119,9 +116,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send({ url: blocklistedUrl }); .send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403); expect(response.statusCode).toBe(403);
expect(response.body.error).toContain( expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
}); });
it("should return a successful response", async () => { it("should return a successful response", async () => {


@ -4,6 +4,7 @@ import {
ScrapeRequestInput, ScrapeRequestInput,
ScrapeResponseRequestTest, ScrapeResponseRequestTest,
} from "../../controllers/v1/types"; } from "../../controllers/v1/types";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
configDotenv(); configDotenv();
const TEST_URL = "http://127.0.0.1:3002"; const TEST_URL = "http://127.0.0.1:3002";
@ -57,9 +58,7 @@ describe("E2E Tests for v1 API Routes", () => {
.send(scrapeRequest); .send(scrapeRequest);
expect(response.statusCode).toBe(403); expect(response.statusCode).toBe(403);
expect(response.body.error).toBe( expect(response.body.error).toBe(BLOCKLISTED_URL_MESSAGE);
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.",
);
}); });
it.concurrent( it.concurrent(
@ -756,9 +755,7 @@ describe("E2E Tests for v1 API Routes", () => {
.send(scrapeRequest); .send(scrapeRequest);
expect(response.statusCode).toBe(403); expect(response.statusCode).toBe(403);
expect(response.body.error).toBe( expect(response.body.error).toBe(BLOCKLISTED_URL_MESSAGE);
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.",
);
}); });
it.concurrent( it.concurrent(


@ -351,6 +351,7 @@ function getPlanByPriceId(price_id: string | null): PlanType {
case process.env.STRIPE_PRICE_ID_ETIER1A_MONTHLY: //ocqh case process.env.STRIPE_PRICE_ID_ETIER1A_MONTHLY: //ocqh
return "etier1a"; return "etier1a";
case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_MONTHLY: case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_MONTHLY:
case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_YEARLY:
return "etierscale1"; return "etierscale1";
default: default:
return "free"; return "free";


@ -0,0 +1,64 @@
import { logger } from "../../../lib/logger";
import * as Sentry from "@sentry/node";
import { Request, Response } from "express";
export async function checkFireEngine(req: Request, res: Response) {
try {
if (!process.env.FIRE_ENGINE_BETA_URL) {
logger.warn("Fire engine beta URL not configured");
return res.status(500).json({
success: false,
error: "Fire engine beta URL not configured",
});
}
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), 30000);
try {
const response = await fetch(
`${process.env.FIRE_ENGINE_BETA_URL}/scrape`,
{
method: "POST",
headers: {
"Content-Type": "application/json",
"X-Disable-Cache": "true",
},
body: JSON.stringify({
url: "https://example.com",
}),
signal: controller.signal,
},
);
clearTimeout(timeout);
if (response.ok) {
const responseData = await response.json();
return res.status(200).json({
data: responseData,
});
} else {
return res.status(response.status).json({
success: false,
error: `Fire engine returned status ${response.status}`,
});
}
} catch (error) {
if (error.name === "AbortError") {
return res.status(504).json({
success: false,
error: "Request timed out after 30 seconds",
});
}
throw error;
}
} catch (error) {
logger.error(error);
Sentry.captureException(error);
return res.status(500).json({
success: false,
error: "Internal server error",
});
}
}


@ -29,6 +29,7 @@ import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority"; import { getJobPriority } from "../../lib/job-priority";
import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types"; import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types";
import { ZodError } from "zod"; import { ZodError } from "zod";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
export async function crawlController(req: Request, res: Response) { export async function crawlController(req: Request, res: Response) {
try { try {
@ -112,8 +113,7 @@ export async function crawlController(req: Request, res: Response) {
if (isUrlBlocked(url)) { if (isUrlBlocked(url)) {
return res.status(403).json({ return res.status(403).json({
error: error: BLOCKLISTED_URL_MESSAGE,
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
}); });
} }


@ -15,6 +15,7 @@ import { addScrapeJob } from "../../../src/services/queue-jobs";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { fromLegacyScrapeOptions } from "../v1/types"; import { fromLegacyScrapeOptions } from "../v1/types";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
export async function crawlPreviewController(req: Request, res: Response) { export async function crawlPreviewController(req: Request, res: Response) {
try { try {
@ -42,8 +43,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
if (isUrlBlocked(url)) { if (isUrlBlocked(url)) {
return res.status(403).json({ return res.status(403).json({
error: error: BLOCKLISTED_URL_MESSAGE,
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
}); });
} }


@ -8,7 +8,6 @@ import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types"; import { PlanType, RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job"; import { logJob } from "../../services/logging/log_job";
import { import {
Document,
fromLegacyCombo, fromLegacyCombo,
toLegacyDocument, toLegacyDocument,
url as urlSchema, url as urlSchema,
@ -29,6 +28,8 @@ import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority"; import { getJobPriority } from "../../lib/job-priority";
import { fromLegacyScrapeOptions } from "../v1/types"; import { fromLegacyScrapeOptions } from "../v1/types";
import { ZodError } from "zod"; import { ZodError } from "zod";
import { Document as V0Document } from "./../../lib/entities";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
export async function scrapeHelper( export async function scrapeHelper(
jobId: string, jobId: string,
@ -42,7 +43,7 @@ export async function scrapeHelper(
): Promise<{ ): Promise<{
success: boolean; success: boolean;
error?: string; error?: string;
data?: Document | { url: string }; data?: V0Document | { url: string };
returnCode: number; returnCode: number;
}> { }> {
const url = urlSchema.parse(req.body.url); const url = urlSchema.parse(req.body.url);
@ -53,8 +54,7 @@ export async function scrapeHelper(
if (isUrlBlocked(url)) { if (isUrlBlocked(url)) {
return { return {
success: false, success: false,
error: error: BLOCKLISTED_URL_MESSAGE,
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
returnCode: 403, returnCode: 403,
}; };
} }
@ -241,9 +241,9 @@ export async function scrapeController(req: Request, res: Response) {
const endTime = new Date().getTime(); const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000; const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens = const numTokens =
result.data && (result.data as Document).markdown result.data && (result.data as V0Document).markdown
? numTokensFromString( ? numTokensFromString(
(result.data as Document).markdown!, (result.data as V0Document).markdown!,
"gpt-3.5-turbo", "gpt-3.5-turbo",
) )
: 0; : 0;
@ -265,25 +265,28 @@ export async function scrapeController(req: Request, res: Response) {
} }
if (creditsToBeBilled > 0) { if (creditsToBeBilled > 0) {
// billing for doc done on queue end, bill only for llm extraction // billing for doc done on queue end, bill only for llm extraction
billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch((error) => { billTeam(team_id, chunk?.sub_id, creditsToBeBilled, logger).catch(
logger.error( (error) => {
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`, logger.error(
); `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
// Optionally, you could notify an admin or add to a retry queue here { error },
}); );
// Optionally, you could notify an admin or add to a retry queue here
},
);
} }
} }
let doc = result.data; let doc = result.data;
if (!pageOptions || !pageOptions.includeRawHtml) { if (!pageOptions || !pageOptions.includeRawHtml) {
if (doc && (doc as Document).rawHtml) { if (doc && (doc as V0Document).rawHtml) {
delete (doc as Document).rawHtml; delete (doc as V0Document).rawHtml;
} }
} }
if (pageOptions && pageOptions.includeExtract) { if (pageOptions && pageOptions.includeExtract) {
if (!pageOptions.includeMarkdown && doc && (doc as Document).markdown) { if (!pageOptions.includeMarkdown && doc && (doc as V0Document).markdown) {
delete (doc as Document).markdown; delete (doc as V0Document).markdown;
} }
} }
@ -312,7 +315,7 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(result.returnCode).json(result); return res.status(result.returnCode).json(result);
} catch (error) { } catch (error) {
Sentry.captureException(error); Sentry.captureException(error);
logger.error(error); logger.error("Scrape error occurred", { error });
return res.status(500).json({ return res.status(500).json({
error: error:
error instanceof ZodError error instanceof ZodError


@ -1,4 +1,5 @@
import { url } from "../types"; import { url } from "../types";
import { BLOCKLISTED_URL_MESSAGE } from "../../../lib/strings";
describe("URL Schema Validation", () => { describe("URL Schema Validation", () => {
beforeEach(() => { beforeEach(() => {
@ -31,7 +32,7 @@ describe("URL Schema Validation", () => {
it("should reject blocked URLs", () => { it("should reject blocked URLs", () => {
expect(() => url.parse("https://facebook.com")).toThrow( expect(() => url.parse("https://facebook.com")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", BLOCKLISTED_URL_MESSAGE,
); );
}); });
@ -47,16 +48,16 @@ describe("URL Schema Validation", () => {
it("should handle URLs with subdomains that are blocked", () => { it("should handle URLs with subdomains that are blocked", () => {
expect(() => url.parse("https://sub.facebook.com")).toThrow( expect(() => url.parse("https://sub.facebook.com")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", BLOCKLISTED_URL_MESSAGE,
); );
}); });
it("should handle URLs with paths that are blocked", () => { it("should handle URLs with paths that are blocked", () => {
expect(() => url.parse("http://facebook.com/path")).toThrow( expect(() => url.parse("http://facebook.com/path")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", BLOCKLISTED_URL_MESSAGE,
); );
expect(() => url.parse("https://facebook.com/another/path")).toThrow( expect(() => url.parse("https://facebook.com/another/path")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", BLOCKLISTED_URL_MESSAGE,
); );
}); });


@ -3,9 +3,11 @@ import { v4 as uuidv4 } from "uuid";
import { import {
BatchScrapeRequest, BatchScrapeRequest,
batchScrapeRequestSchema, batchScrapeRequestSchema,
CrawlResponse, batchScrapeRequestSchemaNoURLValidation,
url as urlSchema,
RequestWithAuth, RequestWithAuth,
ScrapeOptions, ScrapeOptions,
BatchScrapeResponse,
} from "./types"; } from "./types";
import { import {
addCrawlJobs, addCrawlJobs,
@ -21,10 +23,14 @@ import { callWebhook } from "../../services/webhook";
import { logger as _logger } from "../../lib/logger"; import { logger as _logger } from "../../lib/logger";
export async function batchScrapeController( export async function batchScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>, req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
res: Response<CrawlResponse>, res: Response<BatchScrapeResponse>,
) { ) {
req.body = batchScrapeRequestSchema.parse(req.body); if (req.body?.ignoreInvalidURLs === true) {
req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body);
} else {
req.body = batchScrapeRequestSchema.parse(req.body);
}
const id = req.body.appendToId ?? uuidv4(); const id = req.body.appendToId ?? uuidv4();
const logger = _logger.child({ const logger = _logger.child({
@ -35,8 +41,27 @@ export async function batchScrapeController(
teamId: req.auth.team_id, teamId: req.auth.team_id,
plan: req.auth.plan, plan: req.auth.plan,
}); });
let urls = req.body.urls;
let invalidURLs: string[] | undefined = undefined;
if (req.body.ignoreInvalidURLs) {
invalidURLs = [];
let pendingURLs = urls;
urls = [];
for (const u of pendingURLs) {
try {
const nu = urlSchema.parse(u);
urls.push(nu);
} catch (_) {
invalidURLs.push(u);
}
}
}
logger.debug("Batch scrape " + id + " starting", { logger.debug("Batch scrape " + id + " starting", {
urlsLength: req.body.urls, urlsLength: urls,
appendToId: req.body.appendToId, appendToId: req.body.appendToId,
account: req.account, account: req.account,
}); });
@ -70,7 +95,7 @@ export async function batchScrapeController(
// If it is over 1000, we need to get the job priority, // If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20 // otherwise we can use the default priority of 20
if (req.body.urls.length > 1000) { if (urls.length > 1000) {
// set base to 21 // set base to 21
jobPriority = await getJobPriority({ jobPriority = await getJobPriority({
plan: req.auth.plan, plan: req.auth.plan,
@ -84,7 +109,7 @@ export async function batchScrapeController(
delete (scrapeOptions as any).urls; delete (scrapeOptions as any).urls;
delete (scrapeOptions as any).appendToId; delete (scrapeOptions as any).appendToId;
const jobs = req.body.urls.map((x) => { const jobs = urls.map((x) => {
return { return {
data: { data: {
url: x, url: x,
@ -140,5 +165,6 @@ export async function batchScrapeController(
success: true, success: true,
id, id,
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`, url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
invalidURLs,
}); });
} }
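As a usage sketch (not part of the diff), a client could opt into the new `ignoreInvalidURLs` behavior as follows; the hosted base URL and bearer auth are assumptions:
```typescript
// Hypothetical client call using the new ignoreInvalidURLs flag on /v1/batch/scrape.
// Invalid entries are skipped and echoed back in `invalidURLs` instead of failing the request.
async function startBatchScrape(apiKey: string) {
  const res = await fetch("https://api.firecrawl.dev/v1/batch/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify({
      urls: ["https://example.com", "not a real url"],
      formats: ["markdown"],
      ignoreInvalidURLs: true,
    }),
  });
  const body = await res.json();
  console.log(body.id, body.invalidURLs); // invalidURLs should contain "not a real url"
}
```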


@ -18,7 +18,7 @@ import {
} from "../../lib/crawl-redis"; } from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log"; import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service"; import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob } from "../../services/queue-jobs"; import { addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
import { logger as _logger } from "../../lib/logger"; import { logger as _logger } from "../../lib/logger";
import { getJobPriority } from "../../lib/job-priority"; import { getJobPriority } from "../../lib/job-priority";
import { callWebhook } from "../../services/webhook"; import { callWebhook } from "../../services/webhook";
@ -139,9 +139,9 @@ export async function crawlController(
name: uuid, name: uuid,
data: { data: {
url, url,
mode: "single_urls", mode: "single_urls" as const,
team_id: req.auth.team_id, team_id: req.auth.team_id,
plan: req.auth.plan, plan: req.auth.plan!,
crawlerOptions, crawlerOptions,
scrapeOptions, scrapeOptions,
internalOptions: sc.internalOptions, internalOptions: sc.internalOptions,
@ -170,7 +170,7 @@ export async function crawlController(
jobs.map((x) => x.opts.jobId), jobs.map((x) => x.opts.jobId),
); );
logger.debug("Adding scrape jobs to BullMQ..."); logger.debug("Adding scrape jobs to BullMQ...");
await getScrapeQueue().addBulk(jobs); await addScrapeJobs(jobs);
} else { } else {
logger.debug("Sitemap not found or ignored.", { logger.debug("Sitemap not found or ignored.", {
ignoreSitemap: sc.crawlerOptions.ignoreSitemap, ignoreSitemap: sc.crawlerOptions.ignoreSitemap,


@ -0,0 +1,45 @@
import { Request, Response } from "express";
import { RequestWithAuth } from "./types";
import { getACUC } from "../auth";
import { logger } from "../../lib/logger";
export async function creditUsageController(
req: RequestWithAuth,
res: Response,
): Promise<void> {
try {
// If we already have the credit usage info from auth, use it
if (req.acuc) {
res.json({
success: true,
data: {
remaining_credits: req.acuc.remaining_credits,
},
});
return;
}
// Otherwise fetch fresh data
const chunk = await getACUC(req.auth.team_id);
if (!chunk) {
res.status(404).json({
success: false,
error: "Could not find credit usage information",
});
return;
}
res.json({
success: true,
data: {
remaining_credits: chunk.remaining_credits,
},
});
} catch (error) {
logger.error("Error in credit usage controller:", error);
res.status(500).json({
success: false,
error: "Internal server error while fetching credit usage",
});
}
}


@ -1,6 +1,6 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { import {
// Document, Document,
RequestWithAuth, RequestWithAuth,
ExtractRequest, ExtractRequest,
extractRequestSchema, extractRequestSchema,
@ -8,7 +8,7 @@ import {
MapDocument, MapDocument,
scrapeOptions, scrapeOptions,
} from "./types"; } from "./types";
import { Document } from "../../lib/entities"; // import { Document } from "../../lib/entities";
import Redis from "ioredis"; import Redis from "ioredis";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import { performRanking } from "../../lib/ranker"; import { performRanking } from "../../lib/ranker";
@ -24,6 +24,9 @@ import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { getMapResults } from "./map"; import { getMapResults } from "./map";
import { buildDocument } from "../../lib/extract/build-document"; import { buildDocument } from "../../lib/extract/build-document";
import { generateBasicCompletion } from "../../lib/LLM-extraction";
import { buildRefrasedPrompt } from "../../lib/extract/build-prompts";
import { removeDuplicateUrls } from "../../lib/validateUrl";
configDotenv(); configDotenv();
const redis = new Redis(process.env.REDIS_URL!); const redis = new Redis(process.env.REDIS_URL!);
@ -61,30 +64,50 @@ export async function extractController(
const baseUrl = url.replace("/*", ""); const baseUrl = url.replace("/*", "");
// const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any // const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
const allowExternalLinks = req.body.allowExternalLinks ?? true; const allowExternalLinks = req.body.allowExternalLinks;
let urlWithoutWww = baseUrl.replace("www.", ""); let urlWithoutWww = baseUrl.replace("www.", "");
let mapUrl =
req.body.prompt && allowExternalLinks let rephrasedPrompt = req.body.prompt;
? `${req.body.prompt} ${urlWithoutWww}` if (req.body.prompt) {
: req.body.prompt rephrasedPrompt =
? `${req.body.prompt} site:${urlWithoutWww}` (await generateBasicCompletion(
: `site:${urlWithoutWww}`; buildRefrasedPrompt(req.body.prompt, baseUrl),
)) ?? req.body.prompt;
}
const mapResults = await getMapResults({ const mapResults = await getMapResults({
url: baseUrl, url: baseUrl,
search: req.body.prompt, search: rephrasedPrompt,
teamId: req.auth.team_id, teamId: req.auth.team_id,
plan: req.auth.plan, plan: req.auth.plan,
allowExternalLinks, allowExternalLinks,
origin: req.body.origin, origin: req.body.origin,
limit: req.body.limit, limit: req.body.limit,
// If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping // If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
ignoreSitemap: !selfHosted ? true : false, ignoreSitemap: false,
includeMetadata: true, includeMetadata: true,
includeSubdomains: req.body.includeSubdomains, includeSubdomains: req.body.includeSubdomains,
}); });
let mappedLinks = mapResults.links as MapDocument[]; let mappedLinks = mapResults.mapResults as MapDocument[];
// Remove duplicates between mapResults.links and mappedLinks
const allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
const uniqueUrls = removeDuplicateUrls(allUrls);
// Only add URLs from mapResults.links that aren't already in mappedLinks
const existingUrls = new Set(mappedLinks.map((m) => m.url));
const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url));
mappedLinks = [
...mappedLinks,
...newUrls.map((url) => ({ url, title: "", description: "" })),
];
if (mappedLinks.length === 0) {
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
}
// Limit number of links to MAX_EXTRACT_LIMIT // Limit number of links to MAX_EXTRACT_LIMIT
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT); mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
@ -93,18 +116,18 @@ export async function extractController(
`url: ${x.url}, title: ${x.title}, description: ${x.description}`, `url: ${x.url}, title: ${x.title}, description: ${x.description}`,
); );
// Filter by path prefix if present
// wrong
// if (pathPrefix) {
// mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
// }
if (req.body.prompt) { if (req.body.prompt) {
let searchQuery =
req.body.prompt && allowExternalLinks
? `${req.body.prompt} ${urlWithoutWww}`
: req.body.prompt
? `${req.body.prompt} site:${urlWithoutWww}`
: `site:${urlWithoutWww}`;
// Get similarity scores between the search query and each link's context // Get similarity scores between the search query and each link's context
const linksAndScores = await performRanking( const linksAndScores = await performRanking(
mappedLinksRerank, mappedLinksRerank,
mappedLinks.map((l) => l.url), mappedLinks.map((l) => l.url),
mapUrl, searchQuery,
); );
// First try with high threshold // First try with high threshold
@ -158,7 +181,8 @@ export async function extractController(
// Wait for all URL processing to complete and flatten results // Wait for all URL processing to complete and flatten results
const processedUrls = await Promise.all(urlPromises); const processedUrls = await Promise.all(urlPromises);
links.push(...processedUrls.flat()); const flattenedUrls = processedUrls.flat().filter((url) => url); // Filter out any null/undefined values
links.push(...flattenedUrls);
if (links.length === 0) { if (links.length === 0) {
return res.status(400).json({ return res.status(400).json({
@ -204,21 +228,8 @@ export async function extractController(
} }
return doc; return doc;
} catch (e) { } catch (e) {
logger.error(`Error in scrapeController: ${e}`); logger.error(`Error in extractController: ${e}`);
if ( return null;
e instanceof Error &&
(e.message.startsWith("Job wait") || e.message === "timeout")
) {
throw {
status: 408,
error: "Request timed out",
};
} else {
throw {
status: 500,
error: `(Internal server error) - ${e && e.message ? e.message : e}`,
};
}
} }
}); });
@ -237,7 +248,8 @@ export async function extractController(
{ {
mode: "llm", mode: "llm",
systemPrompt: systemPrompt:
"Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided. Here are the urls the user provided of which he wants to extract information from: " + (req.body.systemPrompt ? `${req.body.systemPrompt}\n` : "") +
"Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
links.join(", "), links.join(", "),
prompt: req.body.prompt, prompt: req.body.prompt,
schema: req.body.schema, schema: req.body.schema,


@ -28,14 +28,15 @@ const redis = new Redis(process.env.REDIS_URL!);
// Max Links that /map can return // Max Links that /map can return
const MAX_MAP_LIMIT = 5000; const MAX_MAP_LIMIT = 5000;
// Max Links that "Smart /map" can return // Max Links that "Smart /map" can return
const MAX_FIRE_ENGINE_RESULTS = 1000; const MAX_FIRE_ENGINE_RESULTS = 500;
interface MapResult { interface MapResult {
success: boolean; success: boolean;
links: string[] | any[]; links: string[];
scrape_id?: string; scrape_id?: string;
job_id: string; job_id: string;
time_taken: number; time_taken: number;
mapResults: MapDocument[];
} }
export async function getMapResults({ export async function getMapResults({
@ -215,7 +216,8 @@ export async function getMapResults({
return { return {
success: true, success: true,
links: includeMetadata ? mapResults : linksToReturn, links: linksToReturn,
mapResults: mapResults,
scrape_id: origin?.includes("website") ? id : undefined, scrape_id: origin?.includes("website") ? id : undefined,
job_id: id, job_id: id,
time_taken: (new Date().getTime() - Date.now()) / 1000, time_taken: (new Date().getTime() - Date.now()) / 1000,


@ -60,7 +60,11 @@ export async function scrapeController(
try { try {
doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
} catch (e) { } catch (e) {
logger.error(`Error in scrapeController: ${e}`); logger.error(`Error in scrapeController: ${e}`, {
jobId,
scrapeId: jobId,
startTime,
});
if ( if (
e instanceof Error && e instanceof Error &&
(e.message.startsWith("Job wait") || e.message === "timeout") (e.message.startsWith("Job wait") || e.message === "timeout")


@ -11,6 +11,7 @@ import {
Document as V0Document, Document as V0Document,
} from "../../lib/entities"; } from "../../lib/entities";
import { InternalOptions } from "../../scraper/scrapeURL"; import { InternalOptions } from "../../scraper/scrapeURL";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
export type Format = export type Format =
| "markdown" | "markdown"
@ -44,10 +45,7 @@ export const url = z.preprocess(
return false; return false;
} }
}, "Invalid URL") }, "Invalid URL")
.refine( .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
(x) => !isUrlBlocked(x as string),
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
),
); );
const strictMessage = const strictMessage =
@ -182,6 +180,7 @@ export const scrapeOptions = z
.optional(), .optional(),
skipTlsVerification: z.boolean().default(false), skipTlsVerification: z.boolean().default(false),
removeBase64Images: z.boolean().default(true), removeBase64Images: z.boolean().default(true),
fastMode: z.boolean().default(false),
}) })
.strict(strictMessage); .strict(strictMessage);
@ -193,11 +192,12 @@ export const extractV1Options = z
.array() .array()
.max(10, "Maximum of 10 URLs allowed per request while in beta."), .max(10, "Maximum of 10 URLs allowed per request while in beta."),
prompt: z.string().optional(), prompt: z.string().optional(),
systemPrompt: z.string().optional(),
schema: z.any().optional(), schema: z.any().optional(),
limit: z.number().int().positive().finite().safe().optional(), limit: z.number().int().positive().finite().safe().optional(),
ignoreSitemap: z.boolean().default(false), ignoreSitemap: z.boolean().default(false),
includeSubdomains: z.boolean().default(true), includeSubdomains: z.boolean().default(true),
allowExternalLinks: z.boolean().default(true), allowExternalLinks: z.boolean().default(false),
origin: z.string().optional().default("api"), origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000), timeout: z.number().int().positive().finite().safe().default(60000),
}) })
@ -262,6 +262,31 @@ export const batchScrapeRequestSchema = scrapeOptions
origin: z.string().optional().default("api"), origin: z.string().optional().default("api"),
webhook: webhookSchema.optional(), webhook: webhookSchema.optional(),
appendToId: z.string().uuid().optional(), appendToId: z.string().uuid().optional(),
ignoreInvalidURLs: z.boolean().default(false),
})
.strict(strictMessage)
.refine(
(obj) => {
const hasExtractFormat = obj.formats?.includes("extract");
const hasExtractOptions = obj.extract !== undefined;
return (
(hasExtractFormat && hasExtractOptions) ||
(!hasExtractFormat && !hasExtractOptions)
);
},
{
message:
"When 'extract' format is specified, 'extract' options must be provided, and vice versa",
},
);
export const batchScrapeRequestSchemaNoURLValidation = scrapeOptions
.extend({
urls: z.string().array(),
origin: z.string().optional().default("api"),
webhook: webhookSchema.optional(),
appendToId: z.string().uuid().optional(),
ignoreInvalidURLs: z.boolean().default(false),
}) })
.strict(strictMessage) .strict(strictMessage)
.refine( .refine(
@ -396,7 +421,7 @@ export type Document = {
articleSection?: string; articleSection?: string;
url?: string; url?: string;
sourceURL?: string; sourceURL?: string;
statusCode?: number; statusCode: number;
error?: string; error?: string;
[key: string]: string | string[] | number | undefined; [key: string]: string | string[] | number | undefined;
}; };
@ -446,6 +471,15 @@ export type CrawlResponse =
url: string; url: string;
}; };
export type BatchScrapeResponse =
| ErrorResponse
| {
success: true;
id: string;
url: string;
invalidURLs?: string[];
};
export type MapResponse = export type MapResponse =
| ErrorResponse | ErrorResponse
| { | {
@ -651,11 +685,11 @@ export function fromLegacyScrapeOptions(
} }
: undefined, : undefined,
mobile: pageOptions.mobile, mobile: pageOptions.mobile,
fastMode: pageOptions.useFastMode,
}), }),
internalOptions: { internalOptions: {
atsv: pageOptions.atsv, atsv: pageOptions.atsv,
v0DisableJsDom: pageOptions.disableJsDom, v0DisableJsDom: pageOptions.disableJsDom,
v0UseFastMode: pageOptions.useFastMode,
}, },
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
}; };
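For reference, a hedged sketch of an extract request body exercising the new `systemPrompt` field; field names come from `extractV1Options` above, while the example values are assumptions:
```typescript
// Hypothetical request body for the v1 extract endpoint.
// allowExternalLinks now defaults to false, so it must be set explicitly to opt in.
const extractRequest = {
  urls: ["https://example.com/*"], // maximum of 10 URLs while in beta
  prompt: "List the product names and prices on this site",
  systemPrompt: "You are extracting structured e-commerce data.", // new optional field
  schema: {
    type: "object",
    properties: { products: { type: "array", items: { type: "string" } } },
  },
};
```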


@ -62,3 +62,16 @@ export async function generateCompletions(
return completions; return completions;
} }
// generate basic completion
export async function generateBasicCompletion(prompt: string) {
const openai = new OpenAI();
const model = "gpt-4o";
const completion = await openai.chat.completions.create({
model,
messages: [{ role: "user", content: prompt }],
});
return completion.choices[0].message.content;
}


@ -21,7 +21,7 @@ export function cacheKey(
if ( if (
internalOptions.v0CrawlOnlyUrls || internalOptions.v0CrawlOnlyUrls ||
internalOptions.forceEngine || internalOptions.forceEngine ||
internalOptions.v0UseFastMode || scrapeOptions.fastMode ||
internalOptions.atsv || internalOptions.atsv ||
(scrapeOptions.actions && scrapeOptions.actions.length > 0) (scrapeOptions.actions && scrapeOptions.actions.length > 0)
) { ) {


@ -60,6 +60,8 @@ export async function addCrawlJob(id: string, job_id: string) {
} }
export async function addCrawlJobs(id: string, job_ids: string[]) { export async function addCrawlJobs(id: string, job_ids: string[]) {
if (job_ids.length === 0) return true;
_logger.debug("Adding crawl jobs to Redis...", { _logger.debug("Adding crawl jobs to Redis...", {
jobIds: job_ids, jobIds: job_ids,
module: "crawl-redis", module: "crawl-redis",
@ -90,12 +92,20 @@ export async function addCrawlJobDone(
if (success) { if (success) {
await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id); await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
await redisConnection.expire( } else {
// in case it's already been pushed, make sure it's removed
await redisConnection.lrem(
"crawl:" + id + ":jobs_done_ordered", "crawl:" + id + ":jobs_done_ordered",
24 * 60 * 60, -1,
"NX", job_id,
); );
} }
await redisConnection.expire(
"crawl:" + id + ":jobs_done_ordered",
24 * 60 * 60,
"NX",
);
} }
export async function getDoneJobsOrderedLength(id: string): Promise<number> { export async function getDoneJobsOrderedLength(id: string): Promise<number> {
@ -227,13 +237,6 @@ export async function lockURL(
url = normalizeURL(url, sc); url = normalizeURL(url, sc);
logger = logger.child({ url }); logger = logger.child({ url });
await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
await redisConnection.expire(
"crawl:" + id + ":visited_unique",
24 * 60 * 60,
"NX",
);
let res: boolean; let res: boolean;
if (!sc.crawlerOptions?.deduplicateSimilarURLs) { if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0; res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0;
@ -249,6 +252,15 @@ export async function lockURL(
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
if (res) {
await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
await redisConnection.expire(
"crawl:" + id + ":visited_unique",
24 * 60 * 60,
"NX",
);
}
logger.debug("Locking URL " + JSON.stringify(url) + "... result: " + res, { logger.debug("Locking URL " + JSON.stringify(url) + "... result: " + res, {
res, res,
}); });
@ -261,6 +273,8 @@ export async function lockURLs(
sc: StoredCrawl, sc: StoredCrawl,
urls: string[], urls: string[],
): Promise<boolean> { ): Promise<boolean> {
if (urls.length === 0) return true;
urls = urls.map((url) => normalizeURL(url, sc)); urls = urls.map((url) => normalizeURL(url, sc));
const logger = _logger.child({ const logger = _logger.child({
crawlId: id, crawlId: id,


@ -0,0 +1,16 @@
export function buildRefrasedPrompt(prompt: string, url: string): string {
return `You are a search query optimizer. Your task is to rephrase the following prompt into an effective search query that will find relevant results about this topic on ${url}.
Original prompt: "${prompt}"
Provide a rephrased search query that:
1. Maintains the core intent of the original prompt with ONLY the keywords
2. Uses relevant keywords
3. Is optimized for search engine results
4. Is concise and focused
5. Short is better than long
6. It is a search engine, not a chatbot
7. Concise
Return only the rephrased search query, without any explanation or additional text.`;
}
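Taken together with `generateBasicCompletion` above, this is roughly how the extract controller rephrases a user prompt before mapping; a minimal sketch with illustrative import paths:
```typescript
// Sketch of the rephrasing step used by the extract controller (import paths are illustrative).
import { generateBasicCompletion } from "../LLM-extraction";
import { buildRefrasedPrompt } from "./build-prompts";

async function rephraseForSearch(prompt: string, baseUrl: string): Promise<string> {
  // Fall back to the original prompt if the completion returns nothing.
  return (await generateBasicCompletion(buildRefrasedPrompt(prompt, baseUrl))) ?? prompt;
}
```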


@ -0,0 +1,2 @@
export const BLOCKLISTED_URL_MESSAGE =
"This website is no longer supported, please reach out to help@firecrawl.com for more info on how to activate it on your account.";


@ -7,7 +7,7 @@ import {
import { billTeam } from "../services/billing/credit_billing"; import { billTeam } from "../services/billing/credit_billing";
import { Document } from "../controllers/v1/types"; import { Document } from "../controllers/v1/types";
import { supabase_service } from "../services/supabase"; import { supabase_service } from "../services/supabase";
import { logger } from "../lib/logger"; import { logger as _logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events"; import { ScrapeEvents } from "../lib/scrape-events";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import { import {
@ -49,6 +49,7 @@ export async function startWebScraperPipeline({
bull_job_id: job.id.toString(), bull_job_id: job.id.toString(),
priority: job.opts.priority, priority: job.opts.priority,
is_scrape: job.data.is_scrape ?? false, is_scrape: job.data.is_scrape ?? false,
is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null),
}); });
} }
@ -63,56 +64,126 @@ export async function runWebScraper({
bull_job_id, bull_job_id,
priority, priority,
is_scrape = false, is_scrape = false,
is_crawl = false,
}: RunWebScraperParams): Promise<ScrapeUrlResponse> { }: RunWebScraperParams): Promise<ScrapeUrlResponse> {
const logger = _logger.child({
method: "runWebScraper",
module: "runWebscraper",
scrapeId: bull_job_id,
jobId: bull_job_id,
});
const tries = is_crawl ? 3 : 1;
let response: ScrapeUrlResponse | undefined = undefined; let response: ScrapeUrlResponse | undefined = undefined;
let engines: EngineResultsTracker = {}; let engines: EngineResultsTracker = {};
try { let error: any = undefined;
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
priority, for (let i = 0; i < tries; i++) {
...internalOptions, if (i > 0) {
}); logger.debug("Retrying scrape...", {
if (!response.success) { tries,
if (response.error instanceof Error) { i,
throw response.error; previousStatusCode: (response as any)?.document?.metadata?.statusCode,
} else { previousError: error,
throw new Error( });
"scrapeURL error: " +
(Array.isArray(response.error)
? JSON.stringify(response.error)
: typeof response.error === "object"
? JSON.stringify({ ...response.error })
: response.error),
);
}
} }
response = undefined;
engines = {};
error = undefined;
try {
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
priority,
...internalOptions,
});
if (!response.success) {
if (response.error instanceof Error) {
throw response.error;
} else {
throw new Error(
"scrapeURL error: " +
(Array.isArray(response.error)
? JSON.stringify(response.error)
: typeof response.error === "object"
? JSON.stringify({ ...response.error })
: response.error),
);
}
}
// This is where the returnvalue from the job is set
// onSuccess(response.document, mode);
engines = response.engines;
if (
(response.document.metadata.statusCode >= 200 &&
response.document.metadata.statusCode < 300) ||
response.document.metadata.statusCode === 304
) {
// status code is good -- do not attempt retry
break;
}
} catch (_error) {
error = _error;
engines =
response !== undefined
? response.engines
: typeof error === "object" && error !== null
? ((error as any).results ?? {})
: {};
}
}
const engineOrder = Object.entries(engines)
.sort((a, b) => a[1].startedAt - b[1].startedAt)
.map((x) => x[0]) as Engine[];
for (const engine of engineOrder) {
const result = engines[engine] as Exclude<
EngineResultsTracker[Engine],
undefined
>;
ScrapeEvents.insert(bull_job_id, {
type: "scrape",
url,
method: engine,
result: {
success: result.state === "success",
response_code:
result.state === "success" ? result.result.statusCode : undefined,
response_size:
result.state === "success" ? result.result.html.length : undefined,
error:
result.state === "error"
? result.error
: result.state === "timeout"
? "Timed out"
: undefined,
time_taken: result.finishedAt - result.startedAt,
},
});
}
if (error === undefined && response?.success) {
if (is_scrape === false) { if (is_scrape === false) {
let creditsToBeBilled = 1; // Assuming 1 credit per document let creditsToBeBilled = 1; // Assuming 1 credit per document
if (scrapeOptions.extract) { if (scrapeOptions.extract) {
creditsToBeBilled = 5; creditsToBeBilled = 5;
} }
billTeam(team_id, undefined, creditsToBeBilled).catch((error) => { billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => {
logger.error( logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`, `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
{ error },
); );
// Optionally, you could notify an admin or add to a retry queue here // Optionally, you could notify an admin or add to a retry queue here
}); });
} }
// This is where the returnvalue from the job is set
// onSuccess(response.document, mode);
engines = response.engines;
return response; return response;
} catch (error) { } else {
engines =
response !== undefined
? response.engines
: typeof error === "object" && error !== null
? ((error as any).results ?? {})
: {};
if (response !== undefined) { if (response !== undefined) {
return { return {
...response, ...response,
@ -127,37 +198,6 @@ export async function runWebScraper({
engines, engines,
}; };
} }
// onError(error);
} finally {
const engineOrder = Object.entries(engines)
.sort((a, b) => a[1].startedAt - b[1].startedAt)
.map((x) => x[0]) as Engine[];
for (const engine of engineOrder) {
const result = engines[engine] as Exclude<
EngineResultsTracker[Engine],
undefined
>;
ScrapeEvents.insert(bull_job_id, {
type: "scrape",
url,
method: engine,
result: {
success: result.state === "success",
response_code:
result.state === "success" ? result.result.statusCode : undefined,
response_size:
result.state === "success" ? result.result.html.length : undefined,
error:
result.state === "error"
? result.error
: result.state === "timeout"
? "Timed out"
: undefined,
time_taken: result.finishedAt - result.startedAt,
},
});
}
} }
} }
@ -195,6 +235,11 @@ const saveJob = async (
} }
ScrapeEvents.logJobEvent(job, "completed"); ScrapeEvents.logJobEvent(job, "completed");
} catch (error) { } catch (error) {
logger.error(`🐂 Failed to update job status: ${error}`); _logger.error(`🐂 Failed to update job status`, {
module: "runWebScraper",
method: "saveJob",
jobId: job.id,
scrapeId: job.id,
error,
});
} }
}; };


@ -8,6 +8,7 @@ import {
} from "../controllers/v0/admin/queue"; } from "../controllers/v0/admin/queue";
import { wrap } from "./v1"; import { wrap } from "./v1";
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear"; import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
import { checkFireEngine } from "../controllers/v0/admin/check-fire-engine";
export const adminRouter = express.Router(); export const adminRouter = express.Router();
@ -37,3 +38,8 @@ adminRouter.post(
`/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`, `/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
wrap(acucCacheClearController), wrap(acucCacheClearController),
); );
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/feng-check`,
wrap(checkFireEngine),
);
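A minimal sketch of calling the new health-check route, assuming `BULL_AUTH_KEY` is set and the API runs locally on port 3002:
```typescript
// Hypothetical check of the new fire-engine health endpoint.
async function checkFireEngineHealth() {
  const res = await fetch(
    `http://localhost:3002/admin/${process.env.BULL_AUTH_KEY}/feng-check`,
  );
  console.log(res.status, await res.json()); // 200 with scrape data, 500/504 on failure
}
```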


@ -31,6 +31,8 @@ import { extractController } from "../controllers/v1/extract";
// import { keyAuthController } from "../../src/controllers/v1/keyAuth"; // import { keyAuthController } from "../../src/controllers/v1/keyAuth";
// import { livenessController } from "../controllers/v1/liveness"; // import { livenessController } from "../controllers/v1/liveness";
// import { readinessController } from "../controllers/v1/readiness"; // import { readinessController } from "../controllers/v1/readiness";
import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
function checkCreditsMiddleware( function checkCreditsMiddleware(
minimum?: number, minimum?: number,
@ -122,8 +124,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
if (!res.headersSent) { if (!res.headersSent) {
return res.status(403).json({ return res.status(403).json({
success: false, success: false,
error: error: BLOCKLISTED_URL_MESSAGE,
"URL is blocked intentionally. Firecrawl currently does not support scraping this site due to policy restrictions.",
}); });
} }
} }
@ -224,3 +225,9 @@ v1Router.delete(
// Health/Probe routes // Health/Probe routes
// v1Router.get("/health/liveness", livenessController); // v1Router.get("/health/liveness", livenessController);
// v1Router.get("/health/readiness", readinessController); // v1Router.get("/health/readiness", readinessController);
v1Router.get(
"/team/credit-usage",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(creditUsageController),
);
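A hedged sketch of consuming the new endpoint from a client, assuming the v1 router is mounted at `/v1` and a bearer API key:
```typescript
// Hypothetical call to the new credit usage endpoint.
async function getRemainingCredits(apiKey: string): Promise<number> {
  const res = await fetch("https://api.firecrawl.dev/v1/team/credit-usage", {
    headers: { Authorization: `Bearer ${apiKey}` },
  });
  const body = await res.json();
  if (!body.success) throw new Error(body.error);
  return body.data.remaining_credits;
}
```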


@ -210,7 +210,7 @@ export class WebCrawler {
} }
if (sitemapLinks.length > 0) { if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks( let filteredLinks = this.filterLinks(
sitemapLinks, [...new Set(sitemapLinks)],
this.limit, this.limit,
this.maxCrawledDepth, this.maxCrawledDepth,
fromMap, fromMap,


@ -6,6 +6,15 @@ configDotenv();
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8"); const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
const algorithm = "aes-256-ecb"; const algorithm = "aes-256-ecb";
function encryptAES(plaintext: string, key: Buffer): string {
const cipher = crypto.createCipheriv(algorithm, key, null);
const encrypted = Buffer.concat([
cipher.update(plaintext, "utf-8"),
cipher.final(),
]);
return encrypted.toString("base64");
}
function decryptAES(ciphertext: string, key: Buffer): string { function decryptAES(ciphertext: string, key: Buffer): string {
const decipher = crypto.createDecipheriv(algorithm, key, null); const decipher = crypto.createDecipheriv(algorithm, key, null);
const decrypted = Buffer.concat([ const decrypted = Buffer.concat([
@ -42,9 +51,27 @@ const urlBlocklist = [
"PTbGg8PK/h0Seyw4HEpK4Q==", "PTbGg8PK/h0Seyw4HEpK4Q==",
"lZdQMknjHb7+4+sjF3qNTw==", "lZdQMknjHb7+4+sjF3qNTw==",
"LsgSq54q5oDysbva29JxnQ==", "LsgSq54q5oDysbva29JxnQ==",
"KZfBtpwjOpdSoqacRbz7og==",
"Indtl4yxJMHCKBGF4KABCQ==",
"e3HFXLVgxhaVoadYpwb2BA==",
"b+asgLayXQ5Jq+se+q56jA==",
"86ZDUI7vmp4MvNq3fvZrGQ==",
"sEGFoYZ6GEg4Zocd+TiyfQ==",
"6OOL72eXthgnJ1Hj4PfOQQ==",
"g/ME+Sh1CAFboKrwkVb+5Q==",
"Pw+xawUoX8xBYbX2yqqGWQ==",
"k6vBalxYFhAvkPsF19t9gQ==",
"e3HFXLVgxhaVoadYpwb2BA==",
"b+asgLayXQ5Jq+se+q56jA==",
"KKttwRz4w+AMJrZcB828WQ==",
"vMdzZ33BXoyWVZnAPOBcrg==",
"l8GDVI8w/ueHnNzdN1ODuQ==",
]; ];
const decryptedBlocklist = hashKey.length > 0 ? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey)) : []; const decryptedBlocklist =
hashKey.length > 0
? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey))
: [];
const allowedKeywords = [ const allowedKeywords = [
"pulse", "pulse",

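For maintainers, a hedged sketch of generating a new `urlBlocklist` entry with the `encryptAES` helper added above; the key length and the plaintext format (a bare hostname) are assumptions:
```typescript
import crypto from "crypto";

// Mirrors the encryptAES helper above; assumes HASH_KEY is the same 32-byte key the API uses.
function encryptAES(plaintext: string, key: Buffer): string {
  const cipher = crypto.createCipheriv("aes-256-ecb", key, null);
  return Buffer.concat([cipher.update(plaintext, "utf-8"), cipher.final()]).toString("base64");
}

const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
console.log(encryptAES("blocked-site.example", hashKey)); // base64 ciphertext to add to urlBlocklist
```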

@ -5,8 +5,9 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler";
export async function scrapeURLWithFetch( export async function scrapeURLWithFetch(
meta: Meta, meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const timeout = 20000; const timeout = timeToRun ?? 300000;
const response = await Promise.race([ const response = await Promise.race([
fetch(meta.url, { fetch(meta.url, {


@ -3,7 +3,7 @@ import * as Sentry from "@sentry/node";
import { z } from "zod"; import { z } from "zod";
import { robustFetch } from "../../lib/fetch"; import { robustFetch } from "../../lib/fetch";
import { EngineError, SiteError } from "../../error"; import { ActionError, EngineError, SiteError } from "../../error";
const successSchema = z.object({ const successSchema = z.object({
jobId: z.string(), jobId: z.string(),
@ -111,6 +111,12 @@ export async function fireEngineCheckStatus(
status.error.includes("Chrome error: ") status.error.includes("Chrome error: ")
) { ) {
throw new SiteError(status.error.split("Chrome error: ")[1]); throw new SiteError(status.error.split("Chrome error: ")[1]);
} else if (
typeof status.error === "string" &&
// TODO: improve this later
status.error.includes("Element")
) {
throw new ActionError(status.error.split("Error: ")[1]);
} else { } else {
throw new EngineError("Scrape job failed", { throw new EngineError("Scrape job failed", {
cause: { cause: {


@ -13,13 +13,11 @@ import {
FireEngineCheckStatusSuccess, FireEngineCheckStatusSuccess,
StillProcessingError, StillProcessingError,
} from "./checkStatus"; } from "./checkStatus";
import { EngineError, SiteError, TimeoutError } from "../../error"; import { ActionError, EngineError, SiteError, TimeoutError } from "../../error";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities"; import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler"; import { specialtyScrapeCheck } from "../utils/specialtyHandler";
export const defaultTimeout = 10000;
// This function does not take `Meta` on purpose. It may not access any // This function does not take `Meta` on purpose. It may not access any
// meta values to construct the request -- that must be done by the // meta values to construct the request -- that must be done by the
// `scrapeURLWithFireEngine*` functions. // `scrapeURLWithFireEngine*` functions.
@ -31,7 +29,7 @@ async function performFireEngineScrape<
>( >(
logger: Logger, logger: Logger,
request: FireEngineScrapeRequestCommon & Engine, request: FireEngineScrapeRequestCommon & Engine,
timeout = defaultTimeout, timeout: number,
): Promise<FireEngineCheckStatusSuccess> { ): Promise<FireEngineCheckStatusSuccess> {
const scrape = await fireEngineScrape( const scrape = await fireEngineScrape(
logger.child({ method: "fireEngineScrape" }), logger.child({ method: "fireEngineScrape" }),
@ -70,7 +68,11 @@ async function performFireEngineScrape<
} catch (error) { } catch (error) {
if (error instanceof StillProcessingError) { if (error instanceof StillProcessingError) {
// nop // nop
} else if (error instanceof EngineError || error instanceof SiteError) { } else if (
error instanceof EngineError ||
error instanceof SiteError ||
error instanceof ActionError
) {
logger.debug("Fire-engine scrape job failed.", { logger.debug("Fire-engine scrape job failed.", {
error, error,
jobId: scrape.jobId, jobId: scrape.jobId,
@ -94,6 +96,7 @@ async function performFireEngineScrape<
export async function scrapeURLWithFireEngineChromeCDP( export async function scrapeURLWithFireEngineChromeCDP(
meta: Meta, meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const actions: Action[] = [ const actions: Action[] = [
// Transform waitFor option into an action (unsupported by chrome-cdp) // Transform waitFor option into an action (unsupported by chrome-cdp)
@ -121,6 +124,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
...(meta.options.actions ?? []), ...(meta.options.actions ?? []),
]; ];
const totalWait = actions.reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
0,
);
const timeout = (timeToRun ?? 300000) + totalWait;
const request: FireEngineScrapeRequestCommon & const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestChromeCDP = { FireEngineScrapeRequestChromeCDP = {
url: meta.url, url: meta.url,
@ -134,25 +144,20 @@ export async function scrapeURLWithFireEngineChromeCDP(
} }
: {}), : {}),
priority: meta.internalOptions.priority, priority: meta.internalOptions.priority,
-    geolocation: meta.options.geolocation,
+    geolocation: meta.options.geolocation ?? meta.options.location,
    mobile: meta.options.mobile,
-    timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
+    timeout, // TODO: better timeout logic
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache, disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
// TODO: scrollXPaths // TODO: scrollXPaths
}; };
const totalWait = actions.reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
0,
);
let response = await performFireEngineScrape( let response = await performFireEngineScrape(
meta.logger.child({ meta.logger.child({
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
request, request,
}), }),
request, request,
-    meta.options.timeout !== undefined ? defaultTimeout + totalWait : Infinity, // TODO: better timeout handling
+    timeout,
); );
specialtyScrapeCheck( specialtyScrapeCheck(
@ -206,7 +211,11 @@ export async function scrapeURLWithFireEngineChromeCDP(
export async function scrapeURLWithFireEnginePlaywright( export async function scrapeURLWithFireEnginePlaywright(
meta: Meta, meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const totalWait = meta.options.waitFor;
const timeout = (timeToRun ?? 300000) + totalWait;
const request: FireEngineScrapeRequestCommon & const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestPlaywright = { FireEngineScrapeRequestPlaywright = {
url: meta.url, url: meta.url,
@ -218,9 +227,9 @@ export async function scrapeURLWithFireEnginePlaywright(
screenshot: meta.options.formats.includes("screenshot"), screenshot: meta.options.formats.includes("screenshot"),
fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"), fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
wait: meta.options.waitFor, wait: meta.options.waitFor,
-    geolocation: meta.options.geolocation,
+    geolocation: meta.options.geolocation ?? meta.options.location,
-    timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
+    timeout,
}; };
let response = await performFireEngineScrape( let response = await performFireEngineScrape(
@ -229,9 +238,7 @@ export async function scrapeURLWithFireEnginePlaywright(
request, request,
}), }),
request, request,
-    meta.options.timeout !== undefined
-      ? defaultTimeout + meta.options.waitFor
-      : Infinity, // TODO: better timeout handling
+    timeout,
); );
specialtyScrapeCheck( specialtyScrapeCheck(
@ -265,7 +272,10 @@ export async function scrapeURLWithFireEnginePlaywright(
export async function scrapeURLWithFireEngineTLSClient( export async function scrapeURLWithFireEngineTLSClient(
meta: Meta, meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const timeout = timeToRun ?? 30000;
const request: FireEngineScrapeRequestCommon & const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestTLSClient = { FireEngineScrapeRequestTLSClient = {
url: meta.url, url: meta.url,
@ -276,10 +286,10 @@ export async function scrapeURLWithFireEngineTLSClient(
priority: meta.internalOptions.priority, priority: meta.internalOptions.priority,
atsv: meta.internalOptions.atsv, atsv: meta.internalOptions.atsv,
-    geolocation: meta.options.geolocation,
+    geolocation: meta.options.geolocation ?? meta.options.location,
    disableJsDom: meta.internalOptions.v0DisableJsDom,
-    timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
+    timeout,
}; };
let response = await performFireEngineScrape( let response = await performFireEngineScrape(
@ -288,7 +298,7 @@ export async function scrapeURLWithFireEngineTLSClient(
request, request,
}), }),
request, request,
-    meta.options.timeout !== undefined ? defaultTimeout : Infinity, // TODO: better timeout handling
+    timeout,
); );
specialtyScrapeCheck( specialtyScrapeCheck(

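The hunks above replace the old hard-coded `defaultTimeout` with a `timeToRun` budget handed to each fire-engine variant, plus the total wait implied by any `wait` actions. A minimal sketch of that computation (the `Action` type and function name here are illustrative stand-ins, not part of the diff):

```typescript
// Sketch: request timeout = caller-supplied budget (default 300 000 ms)
// plus the sum of explicit "wait" actions, mirroring the reducer in the diff.
type Action =
  | { type: "wait"; milliseconds?: number }
  | { type: "click"; selector: string };

function computeRequestTimeout(
  timeToRun: number | undefined,
  actions: Action[],
): number {
  const totalWait = actions.reduce(
    (acc, a) => (a.type === "wait" ? (a.milliseconds ?? 1000) + acc : acc),
    0,
  );
  return (timeToRun ?? 300000) + totalWait;
}

// Example: a 30 s budget with two 2 s waits yields a 34 s fire-engine timeout.
computeRequestTimeout(30000, [
  { type: "wait", milliseconds: 2000 },
  { type: "wait", milliseconds: 2000 },
]);
```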
View File

@ -105,7 +105,10 @@ export type EngineScrapeResult = {
}; };
const engineHandlers: { const engineHandlers: {
-  [E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>;
+  [E in Engine]: (
+    meta: Meta,
+    timeToRun: number | undefined,
+  ) => Promise<EngineScrapeResult>;
} = { } = {
cache: scrapeCache, cache: scrapeCache,
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP, "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
@ -372,6 +375,7 @@ export function buildFallbackList(meta: Meta): {
export async function scrapeURLWithEngine( export async function scrapeURLWithEngine(
meta: Meta, meta: Meta,
engine: Engine, engine: Engine,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const fn = engineHandlers[engine]; const fn = engineHandlers[engine];
const logger = meta.logger.child({ const logger = meta.logger.child({
@ -383,5 +387,5 @@ export async function scrapeURLWithEngine(
logger, logger,
}; };
-  return await fn(_meta);
+  return await fn(_meta, timeToRun);
} }

View File

@ -15,6 +15,7 @@ type PDFProcessorResult = { html: string; markdown?: string };
async function scrapePDFWithLlamaParse( async function scrapePDFWithLlamaParse(
meta: Meta, meta: Meta,
tempFilePath: string, tempFilePath: string,
timeToRun: number | undefined,
): Promise<PDFProcessorResult> { ): Promise<PDFProcessorResult> {
meta.logger.debug("Processing PDF document with LlamaIndex", { meta.logger.debug("Processing PDF document with LlamaIndex", {
tempFilePath, tempFilePath,
@ -63,8 +64,9 @@ async function scrapePDFWithLlamaParse(
// TODO: timeout, retries // TODO: timeout, retries
const startedAt = Date.now(); const startedAt = Date.now();
const timeout = timeToRun ?? 300000;
-  while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) {
+  while (Date.now() <= startedAt + timeout) {
try { try {
const result = await robustFetch({ const result = await robustFetch({
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`, url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
@ -122,7 +124,10 @@ async function scrapePDFWithParsePDF(
}; };
} }
-export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
+export async function scrapePDF(
+  meta: Meta,
+  timeToRun: number | undefined,
+): Promise<EngineScrapeResult> {
if (!meta.options.parsePDF) { if (!meta.options.parsePDF) {
const file = await fetchFileToBuffer(meta.url); const file = await fetchFileToBuffer(meta.url);
const content = file.buffer.toString("base64"); const content = file.buffer.toString("base64");
@ -138,9 +143,26 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
const { response, tempFilePath } = await downloadFile(meta.id, meta.url); const { response, tempFilePath } = await downloadFile(meta.id, meta.url);
let result: PDFProcessorResult | null = null; let result: PDFProcessorResult | null = null;
if (process.env.LLAMAPARSE_API_KEY) {
// First, try parsing with PdfParse
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
// If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
if (
result.markdown &&
result.markdown.length < 500 &&
process.env.LLAMAPARSE_API_KEY
) {
try { try {
-      result = await scrapePDFWithLlamaParse(
+      const llamaResult = await scrapePDFWithLlamaParse(
{ {
...meta, ...meta,
logger: meta.logger.child({ logger: meta.logger.child({
@ -148,17 +170,19 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
}), }),
}, },
tempFilePath, tempFilePath,
timeToRun,
); );
result = llamaResult; // Use LlamaParse result if successful
} catch (error) { } catch (error) {
if (error instanceof Error && error.message === "LlamaParse timed out") { if (error instanceof Error && error.message === "LlamaParse timed out") {
-        meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
+        meta.logger.warn("LlamaParse timed out -- using parse-pdf result", {
error, error,
}); });
} else if (error instanceof RemoveFeatureError) { } else if (error instanceof RemoveFeatureError) {
throw error; throw error;
} else { } else {
meta.logger.warn( meta.logger.warn(
"LlamaParse failed to parse PDF -- falling back to parse-pdf", "LlamaParse failed to parse PDF -- using parse-pdf result",
{ error }, { error },
); );
Sentry.captureException(error); Sentry.captureException(error);
@ -166,18 +190,6 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
} }
} }
if (result === null) {
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
}
await fs.unlink(tempFilePath); await fs.unlink(tempFilePath);
return { return {

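The reworked flow above inverts the old order: parse-pdf always runs first, and LlamaParse is only attempted when the local result is under 500 characters of markdown and `LLAMAPARSE_API_KEY` is set. A simplified sketch of that decision, with placeholder parser callbacks standing in for the real functions:

```typescript
type PDFProcessorResult = { html: string; markdown?: string };

async function parsePdfWithFallback(
  parseLocally: () => Promise<PDFProcessorResult>,   // stand-in for scrapePDFWithParsePDF
  parseWithLlama: () => Promise<PDFProcessorResult>, // stand-in for scrapePDFWithLlamaParse
): Promise<PDFProcessorResult> {
  // Always start with the cheap local parser.
  let result = await parseLocally();

  // Escalate only when the local markdown looks too thin and a key exists.
  const tooThin = (result.markdown ?? "").length < 500;
  if (tooThin && process.env.LLAMAPARSE_API_KEY) {
    try {
      result = await parseWithLlama();
    } catch {
      // Keep the parse-pdf result on timeout or failure.
      // (The real code rethrows RemoveFeatureError and logs other errors.)
    }
  }
  return result;
}
```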
View File

@ -6,8 +6,9 @@ import { robustFetch } from "../../lib/fetch";
export async function scrapeURLWithPlaywright( export async function scrapeURLWithPlaywright(
meta: Meta, meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
-  const timeout = 20000 + meta.options.waitFor;
+  const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
const response = await Promise.race([ const response = await Promise.race([
await robustFetch({ await robustFetch({
@ -30,7 +31,7 @@ export async function scrapeURLWithPlaywright(
}), }),
}), }),
(async () => { (async () => {
-      await new Promise((resolve) => setTimeout(() => resolve(null), 20000));
+      await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
throw new TimeoutError( throw new TimeoutError(
"Playwright was unable to scrape the page before timing out", "Playwright was unable to scrape the page before timing out",
{ cause: { timeout } }, { cause: { timeout } },

View File

@ -9,16 +9,20 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
export function scrapeURLWithScrapingBee( export function scrapeURLWithScrapingBee(
wait_browser: "domcontentloaded" | "networkidle2", wait_browser: "domcontentloaded" | "networkidle2",
-): (meta: Meta) => Promise<EngineScrapeResult> {
-  return async (meta: Meta): Promise<EngineScrapeResult> => {
+): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> {
+  return async (
+    meta: Meta,
+    timeToRun: number | undefined,
+  ): Promise<EngineScrapeResult> => {
let response: AxiosResponse<any>; let response: AxiosResponse<any>;
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
try { try {
response = await client.get({ response = await client.get({
url: meta.url, url: meta.url,
params: { params: {
-          timeout: 15000, // TODO: dynamic timeout based on request timeout
+          timeout,
          wait_browser: wait_browser,
-          wait: Math.min(meta.options.waitFor, 35000),
+          wait: meta.options.waitFor,
transparent_status_code: true, transparent_status_code: true,
json_response: true, json_response: true,
screenshot: meta.options.formats.includes("screenshot"), screenshot: meta.options.formats.includes("screenshot"),

View File

@ -56,3 +56,11 @@ export class SiteError extends Error {
this.code = code; this.code = code;
} }
} }
export class ActionError extends Error {
public code: string;
constructor(code: string) {
super("Action(s) failed to complete. Error code: " + code);
this.code = code;
}
}

View File

@ -12,6 +12,7 @@ import {
} from "./engines"; } from "./engines";
import { parseMarkdown } from "../../lib/html-to-markdown"; import { parseMarkdown } from "../../lib/html-to-markdown";
import { import {
ActionError,
AddFeatureError, AddFeatureError,
EngineError, EngineError,
NoEnginesLeftError, NoEnginesLeftError,
@ -86,7 +87,7 @@ function buildFeatureFlags(
flags.add("skipTlsVerification"); flags.add("skipTlsVerification");
} }
-  if (internalOptions.v0UseFastMode) {
+  if (options.fastMode) {
flags.add("useFastMode"); flags.add("useFastMode");
} }
@ -148,7 +149,6 @@ export type InternalOptions = {
atsv?: boolean; // anti-bot solver, beta atsv?: boolean; // anti-bot solver, beta
v0CrawlOnlyUrls?: boolean; v0CrawlOnlyUrls?: boolean;
v0UseFastMode?: boolean;
v0DisableJsDom?: boolean; v0DisableJsDom?: boolean;
disableSmartWaitCache?: boolean; // Passed along to fire-engine disableSmartWaitCache?: boolean; // Passed along to fire-engine
@ -203,11 +203,16 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
const results: EngineResultsTracker = {}; const results: EngineResultsTracker = {};
let result: EngineScrapeResultWithContext | null = null; let result: EngineScrapeResultWithContext | null = null;
const timeToRun =
meta.options.timeout !== undefined
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
: undefined;
for (const { engine, unsupportedFeatures } of fallbackList) { for (const { engine, unsupportedFeatures } of fallbackList) {
const startedAt = Date.now(); const startedAt = Date.now();
try { try {
meta.logger.info("Scraping via " + engine + "..."); meta.logger.info("Scraping via " + engine + "...");
-      const _engineResult = await scrapeURLWithEngine(meta, engine);
+      const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun);
if (_engineResult.markdown === undefined) { if (_engineResult.markdown === undefined) {
// Some engines emit Markdown directly. // Some engines emit Markdown directly.
_engineResult.markdown = await parseMarkdown(_engineResult.html); _engineResult.markdown = await parseMarkdown(_engineResult.html);
@ -285,6 +290,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
throw error; throw error;
} else if (error instanceof SiteError) { } else if (error instanceof SiteError) {
throw error; throw error;
} else if (error instanceof ActionError) {
throw error;
} else { } else {
Sentry.captureException(error); Sentry.captureException(error);
meta.logger.info( meta.logger.info(
@ -405,6 +412,8 @@ export async function scrapeURL(
// TODO: results? // TODO: results?
} else if (error instanceof SiteError) { } else if (error instanceof SiteError) {
meta.logger.warn("scrapeURL: Site failed to load in browser", { error }); meta.logger.warn("scrapeURL: Site failed to load in browser", { error });
} else if (error instanceof ActionError) {
meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
} else { } else {
Sentry.captureException(error); Sentry.captureException(error);
meta.logger.error("scrapeURL: Unexpected error happened", { error }); meta.logger.error("scrapeURL: Unexpected error happened", { error });

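The new `timeToRun` above divides the request-level timeout across at most two fallback engines, so a single slow engine cannot burn the whole budget. A small sketch of the same arithmetic (function name illustrative):

```typescript
function perEngineBudget(
  requestTimeout: number | undefined,
  fallbackEngines: number,
): number | undefined {
  // No request timeout -> let each engine fall back to its own default ceiling.
  if (requestTimeout === undefined) return undefined;
  // Split across at most two engines, as in the diff.
  return Math.round(requestTimeout / Math.min(fallbackEngines, 2));
}

perEngineBudget(60000, 3); // 30000 ms for each engine tried
perEngineBudget(60000, 1); // 60000 ms, a single engine keeps the full budget
```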
View File

@ -5,7 +5,7 @@ import { Meta } from "..";
export function extractMetadata( export function extractMetadata(
meta: Meta, meta: Meta,
html: string, html: string,
-): Document["metadata"] {
+): Partial<Document["metadata"]> {
let title: string | undefined = undefined; let title: string | undefined = undefined;
let description: string | undefined = undefined; let description: string | undefined = undefined;
let language: string | undefined = undefined; let language: string | undefined = undefined;
@ -40,7 +40,7 @@ export function extractMetadata(
const soup = load(html); const soup = load(html);
try { try {
-    title = soup("title").text() || undefined;
+    title = soup("title").first().text().trim() || undefined;
description = soup('meta[name="description"]').attr("content") || undefined; description = soup('meta[name="description"]').attr("content") || undefined;
// Assuming the language is part of the URL as per the regex pattern // Assuming the language is part of the URL as per the regex pattern

View File

@ -159,8 +159,8 @@ export async function generateOpenAICompletions(
role: "user", role: "user",
content: content:
options.prompt !== undefined options.prompt !== undefined
-            ? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}`
-            : "Transform the above content into structured JSON output.",
+            ? `Transform the above content into structured JSON output based on the provided schema if any and the following user request: ${options.prompt}. If schema is provided, strictly follow it.`
+            : "Transform the above content into structured JSON output based on the provided schema if any.",
}, },
], ],
response_format: options.schema response_format: options.schema

View File

@ -10,6 +10,7 @@ import { issueCredits } from "./issue_credits";
import { redlock } from "../redlock"; import { redlock } from "../redlock";
import { autoCharge } from "./auto_charge"; import { autoCharge } from "./auto_charge";
import { getValue, setValue } from "../redis"; import { getValue, setValue } from "../redis";
import type { Logger } from "winston";
const FREE_CREDITS = 500; const FREE_CREDITS = 500;
@ -20,22 +21,33 @@ export async function billTeam(
team_id: string, team_id: string,
subscription_id: string | null | undefined, subscription_id: string | null | undefined,
credits: number, credits: number,
logger?: Logger,
) { ) {
return withAuth(supaBillTeam, { success: true, message: "No DB, bypassed." })( return withAuth(supaBillTeam, { success: true, message: "No DB, bypassed." })(
team_id, team_id,
subscription_id, subscription_id,
credits, credits,
logger,
); );
} }
export async function supaBillTeam( export async function supaBillTeam(
team_id: string, team_id: string,
subscription_id: string | null | undefined, subscription_id: string | null | undefined,
credits: number, credits: number,
__logger?: Logger,
) { ) {
const _logger = (__logger ?? logger).child({
module: "credit_billing",
method: "supaBillTeam",
});
if (team_id === "preview") { if (team_id === "preview") {
return { success: true, message: "Preview team, no credits used" }; return { success: true, message: "Preview team, no credits used" };
} }
-  logger.info(`Billing team ${team_id} for ${credits} credits`);
+  _logger.info(`Billing team ${team_id} for ${credits} credits`, {
+    team_id,
+    credits,
+  });
const { data, error } = await supabase_service.rpc("bill_team", { const { data, error } = await supabase_service.rpc("bill_team", {
_team_id: team_id, _team_id: team_id,
@ -46,7 +58,7 @@ export async function supaBillTeam(
if (error) { if (error) {
Sentry.captureException(error); Sentry.captureException(error);
-    logger.error("Failed to bill team: " + JSON.stringify(error));
+    _logger.error("Failed to bill team.", { error });
return; return;
} }

View File

@ -9,6 +9,7 @@ export async function issueCredits(team_id: string, credits: number) {
status: "active", status: "active",
// indicates that this coupon was issued from auto recharge // indicates that this coupon was issued from auto recharge
from_auto_recharge: true, from_auto_recharge: true,
initial_credits: credits,
}); });
if (error) { if (error) {

View File

@ -11,11 +11,50 @@ import {
pushConcurrencyLimitedJob, pushConcurrencyLimitedJob,
} from "../lib/concurrency-limit"; } from "../lib/concurrency-limit";
async function _addScrapeJobToConcurrencyQueue(
webScraperOptions: any,
options: any,
jobId: string,
jobPriority: number,
) {
await pushConcurrencyLimitedJob(webScraperOptions.team_id, {
id: jobId,
data: webScraperOptions,
opts: {
...options,
priority: jobPriority,
jobId: jobId,
},
priority: jobPriority,
});
}
async function _addScrapeJobToBullMQ(
webScraperOptions: any,
options: any,
jobId: string,
jobPriority: number,
) {
if (
webScraperOptions &&
webScraperOptions.team_id &&
webScraperOptions.plan
) {
await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId);
}
await getScrapeQueue().add(jobId, webScraperOptions, {
...options,
priority: jobPriority,
jobId,
});
}
async function addScrapeJobRaw( async function addScrapeJobRaw(
webScraperOptions: any, webScraperOptions: any,
options: any, options: any,
jobId: string, jobId: string,
-  jobPriority: number = 10,
+  jobPriority: number,
) { ) {
let concurrencyLimited = false; let concurrencyLimited = false;
@ -33,30 +72,14 @@ async function addScrapeJobRaw(
} }
if (concurrencyLimited) { if (concurrencyLimited) {
-    await pushConcurrencyLimitedJob(webScraperOptions.team_id, {
-      id: jobId,
-      data: webScraperOptions,
-      opts: {
-        ...options,
-        priority: jobPriority,
-        jobId: jobId,
-      },
-      priority: jobPriority,
-    });
-  } else {
-    if (
-      webScraperOptions &&
-      webScraperOptions.team_id &&
-      webScraperOptions.plan
-    ) {
-      await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId);
-    }
-    await getScrapeQueue().add(jobId, webScraperOptions, {
-      ...options,
-      priority: jobPriority,
-      jobId,
-    });
+    await _addScrapeJobToConcurrencyQueue(
+      webScraperOptions,
+      options,
+      jobId,
+      jobPriority,
+    );
+  } else {
+    await _addScrapeJobToBullMQ(webScraperOptions, options, jobId, jobPriority);
} }
} }
@ -108,11 +131,88 @@ export async function addScrapeJobs(
}; };
}[], }[],
) { ) {
-  // TODO: better
+  if (jobs.length === 0) return true;
let countCanBeDirectlyAdded = Infinity;
if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) {
const now = Date.now();
const limit = await getConcurrencyLimitMax(jobs[0].data.plan);
console.log("CC limit", limit);
cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now);
countCanBeDirectlyAdded = Math.max(
limit -
(await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length,
0,
);
}
const addToBull = jobs.slice(0, countCanBeDirectlyAdded);
const addToCQ = jobs.slice(countCanBeDirectlyAdded);
await Promise.all( await Promise.all(
-    jobs.map((job) =>
-      addScrapeJob(job.data, job.opts, job.opts.jobId, job.opts.priority),
-    ),
+    addToBull.map(async (job) => {
+      const size = JSON.stringify(job.data).length;
+      return await Sentry.startSpan(
{
name: "Add scrape job",
op: "queue.publish",
attributes: {
"messaging.message.id": job.opts.jobId,
"messaging.destination.name": getScrapeQueue().name,
"messaging.message.body.size": size,
},
},
async (span) => {
await _addScrapeJobToBullMQ(
{
...job.data,
sentry: {
trace: Sentry.spanToTraceHeader(span),
baggage: Sentry.spanToBaggageHeader(span),
size,
},
},
job.opts,
job.opts.jobId,
job.opts.priority,
);
},
);
}),
);
await Promise.all(
addToCQ.map(async (job) => {
const size = JSON.stringify(job.data).length;
return await Sentry.startSpan(
{
name: "Add scrape job",
op: "queue.publish",
attributes: {
"messaging.message.id": job.opts.jobId,
"messaging.destination.name": getScrapeQueue().name,
"messaging.message.body.size": size,
},
},
async (span) => {
await _addScrapeJobToConcurrencyQueue(
{
...job.data,
sentry: {
trace: Sentry.spanToTraceHeader(span),
baggage: Sentry.spanToBaggageHeader(span),
size,
},
},
job.opts,
job.opts.jobId,
job.opts.priority,
);
},
);
}),
); );
} }

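`addScrapeJobs` above now splits a batch by the team's remaining concurrency capacity: the first slice goes straight to BullMQ, the remainder is parked in the concurrency-limited queue. A reduced sketch of the split (types simplified, function name illustrative):

```typescript
type QueuedJob = { data: unknown; opts: { jobId: string; priority: number } };

function splitByConcurrency(
  jobs: QueuedJob[],
  concurrencyLimit: number,
  activeJobs: number,
): { addToBull: QueuedJob[]; addToCQ: QueuedJob[] } {
  // Free slots under the plan's concurrency limit, never negative.
  const countCanBeDirectlyAdded = Math.max(concurrencyLimit - activeJobs, 0);
  return {
    addToBull: jobs.slice(0, countCanBeDirectlyAdded),
    addToCQ: jobs.slice(countCanBeDirectlyAdded),
  };
}

// Example: limit 10 with 7 active -> 3 jobs enqueue now, the rest wait.
```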
View File

@ -386,27 +386,27 @@ async function processJob(job: Job & { id: string }, token: string) {
jobId: job.id, jobId: job.id,
scrapeId: job.id, scrapeId: job.id,
crawlId: job.data?.crawl_id ?? undefined, crawlId: job.data?.crawl_id ?? undefined,
teamId: job.data?.team_id ?? undefined,
}); });
logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url }); logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url });
// Check if the job URL is researchhub and block it immediately // Check if the job URL is researchhub and block it immediately
// TODO: remove this once solve the root issue // TODO: remove this once solve the root issue
if ( // if (
job.data.url && // job.data.url &&
(job.data.url.includes("researchhub.com") || // (job.data.url.includes("researchhub.com") ||
job.data.url.includes("ebay.com") || // job.data.url.includes("ebay.com"))
job.data.url.includes("youtube.com")) // ) {
) { // logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`);
logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`); // const data = {
const data = { // success: false,
success: false, // document: null,
document: null, // project_id: job.data.project_id,
project_id: job.data.project_id, // error:
error: // "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.",
"URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.", // };
}; // return data;
return data; // }
}
try { try {
job.updateProgress({ job.updateProgress({
@ -482,32 +482,28 @@ async function processJob(job: Job & { id: string }, token: string) {
normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.url, sc) !==
normalizeURL(doc.metadata.sourceURL, sc) normalizeURL(doc.metadata.sourceURL, sc)
) { ) {
logger.debug(
"Was redirected, removing old URL and locking new URL...",
{ oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
);
// Remove the old URL from visited unique due to checking for limit
// Do not remove from :visited otherwise it will keep crawling the original URL (sourceURL)
await redisConnection.srem(
"crawl:" + job.data.crawl_id + ":visited_unique",
normalizeURL(doc.metadata.sourceURL, sc),
);
const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc)); const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc));
const p2 = generateURLPermutations( const p2 = generateURLPermutations(
normalizeURL(doc.metadata.sourceURL, sc), normalizeURL(doc.metadata.sourceURL, sc),
); );
// In crawls, we should only crawl a redirected page once, no matter how many; times it is redirected to, or if it's been discovered by the crawler before. if (JSON.stringify(p1) !== JSON.stringify(p2)) {
// This can prevent flakiness with race conditions. logger.debug(
// Lock the new URL "Was redirected, removing old URL and locking new URL...",
const lockRes = await lockURL(job.data.crawl_id, sc, doc.metadata.url); { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
if ( );
job.data.crawlerOptions !== null &&
!lockRes && // Prevent redirect target from being visited in the crawl again
JSON.stringify(p1) !== JSON.stringify(p2) // See lockURL
) { const x = await redisConnection.sadd(
throw new RacedRedirectError(); "crawl:" + job.data.crawl_id + ":visited",
...p1.map((x) => x.href),
);
const lockRes = x === p1.length;
if (job.data.crawlerOptions !== null && !lockRes) {
throw new RacedRedirectError();
}
} }
} }
@ -679,6 +675,10 @@ async function processJob(job: Job & { id: string }, token: string) {
logger.debug("Declaring job as done..."); logger.debug("Declaring job as done...");
await addCrawlJobDone(job.data.crawl_id, job.id, false); await addCrawlJobDone(job.data.crawl_id, job.id, false);
await redisConnection.srem(
"crawl:" + job.data.crawl_id + ":visited_unique",
normalizeURL(job.data.url, sc),
);
logger.debug("Logging job to DB..."); logger.debug("Logging job to DB...");
await logJob( await logJob(

View File

@ -80,8 +80,8 @@ const RATE_LIMITS = {
default: 100, default: 100,
}, },
crawlStatus: { crawlStatus: {
-    free: 300,
-    default: 500,
+    free: 500,
+    default: 5000,
}, },
testSuite: { testSuite: {
free: 10000, free: 10000,

View File

@ -55,6 +55,7 @@ export interface RunWebScraperParams {
bull_job_id: string; bull_job_id: string;
priority?: number; priority?: number;
is_scrape?: boolean; is_scrape?: boolean;
is_crawl?: boolean;
} }
export type RunWebScraperResult = export type RunWebScraperResult =

View File

@ -3,6 +3,7 @@
"rootDir": "./src", "rootDir": "./src",
"lib": ["ES2022", "DOM"], "lib": ["ES2022", "DOM"],
// or higher // or higher
"target": "ES2022", "target": "ES2022",
@ -18,7 +19,7 @@
"*": ["node_modules/*", "src/types/*"], "*": ["node_modules/*", "src/types/*"],
}, },
"inlineSources": true "inlineSources": true,
}, },
"include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"] "include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"]
} }

View File

@ -1,7 +1,19 @@
const fs = require("fs"); const fs = require("fs");
const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8") // METHOD: Winston log file
.split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x)); // const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
// .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));
// METHOD: GCloud export
const logs = [
"downloaded-logs-20241213-225607.json",
"downloaded-logs-20241213-225654.json",
"downloaded-logs-20241213-225720.json",
"downloaded-logs-20241213-225758.json",
"downloaded-logs-20241213-225825.json",
"downloaded-logs-20241213-225843.json",
].flatMap(x => JSON.parse(fs.readFileSync(x, "utf8"))).map(x => x.jsonPayload);
const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))]; const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))];

View File

@ -0,0 +1,14 @@
require("dotenv").config();
const Redis = require("ioredis");
const crawlId = process.argv[2];
const redisConnection = new Redis(process.env.REDIS_URL, {
maxRetriesPerRequest: null,
});
(async () => {
const res = await redisConnection.sscan("crawl:" + crawlId + ":visited_unique", 0, "COUNT", 999);
await require("fs/promises").writeFile(crawlId + "-visited.txt", res[1].map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
process.exit(0);
})();

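One caveat with the dump script above: `SSCAN` is cursor-based, so a single call is not guaranteed to return the whole set even with a large `COUNT` hint. A sketch that walks the full cursor under the same key layout, using ioredis as the script does (written in TypeScript for consistency with the rest of the repo):

```typescript
import Redis from "ioredis";

async function dumpVisitedUnique(redis: Redis, crawlId: string): Promise<string[]> {
  const members: string[] = [];
  let cursor = "0";
  do {
    // Each iteration returns the next cursor and a batch of set members.
    const [nextCursor, batch] = await redis.sscan(
      "crawl:" + crawlId + ":visited_unique",
      cursor,
      "COUNT",
      1000,
    );
    members.push(...batch);
    cursor = nextCursor;
  } while (cursor !== "0");
  return members;
}
```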
43
apps/api/utils/urldump.js Normal file
View File

@ -0,0 +1,43 @@
require("dotenv").config();
//const baseUrl = "https://api.firecrawl.dev";
const baseUrl = "http://localhost:3002";
const crawlId = process.argv[2];
(async () => {
let url = baseUrl + "/v1/crawl/" + crawlId;
let urls = [];
while (url) {
let res;
while (true) {
try {
res = (await (await fetch(url, {
headers: {
"Authorization": "Bearer " + process.env.TEST_API_KEY
}
})).json());
break;
} catch (e) {
console.error(e);
}
}
console.log(res.data.length);
if (res.data.length === 0) {
break;
}
urls.push(...res.data.map(x => x.metadata.url ?? x.metadata.sourceURL));
url = res.next;
if (url !== undefined) {
const o = new URL(url)
o.protocol = new URL(baseUrl).protocol;
url = o.href;
}
}
await require("fs/promises").writeFile(crawlId + "-urls.txt", urls.map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
})();

View File

@ -1,9 +1,9 @@
import { describe, test, expect, jest } from '@jest/globals'; import { describe, expect, jest, test } from '@jest/globals';
import axios from 'axios';
import FirecrawlApp from '../index';
import { readFile } from 'fs/promises'; import FirecrawlApp from '../index';
import axios from 'axios';
import { join } from 'path'; import { join } from 'path';
import { readFile } from 'fs/promises';
// Mock jest and set the type // Mock jest and set the type
jest.mock('axios'); jest.mock('axios');
@ -14,13 +14,22 @@ async function loadFixture(name: string): Promise<string> {
return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8') return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8')
} }
const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";
describe('the firecrawl JS SDK', () => { describe('the firecrawl JS SDK', () => {
test('Should require an API key to instantiate FirecrawlApp', async () => { test('Should require an API key only for cloud service', async () => {
const fn = () => { if (API_URL.includes('api.firecrawl.dev')) {
new FirecrawlApp({ apiKey: undefined }); // Should throw for cloud service
}; expect(() => {
expect(fn).toThrow('No API key provided'); new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL });
}).toThrow('No API key provided');
} else {
// Should not throw for self-hosted
expect(() => {
new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL });
}).not.toThrow();
}
}); });
test('Should return scraped data from a /scrape API call', async () => { test('Should return scraped data from a /scrape API call', async () => {

View File

@ -9,15 +9,28 @@ const TEST_API_KEY = process.env.TEST_API_KEY;
const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev"; const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";
describe('FirecrawlApp E2E Tests', () => { describe('FirecrawlApp E2E Tests', () => {
test.concurrent('should throw error for no API key', async () => { test.concurrent('should throw error for no API key only for cloud service', async () => {
expect(() => { if (API_URL.includes('api.firecrawl.dev')) {
new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); // Should throw for cloud service
}).toThrow("No API key provided"); expect(() => {
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
}).toThrow("No API key provided");
} else {
// Should not throw for self-hosted
expect(() => {
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
}).not.toThrow();
}
}); });
test.concurrent('should throw error for invalid API key on scrape', async () => { test.concurrent('should throw error for invalid API key on scrape', async () => {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); if (API_URL.includes('api.firecrawl.dev')) {
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 404");
} else {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
}
}); });
test.concurrent('should throw error for blocklisted URL on scrape', async () => { test.concurrent('should throw error for blocklisted URL on scrape', async () => {
@ -155,14 +168,13 @@ describe('FirecrawlApp E2E Tests', () => {
}, 30000); // 30 seconds timeout }, 30000); // 30 seconds timeout
test.concurrent('should throw error for invalid API key on crawl', async () => { test.concurrent('should throw error for invalid API key on crawl', async () => {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); if (API_URL.includes('api.firecrawl.dev')) {
await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
}); await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404");
} else {
test.concurrent('should throw error for blocklisted URL on crawl', async () => { const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
const blocklistedUrl = "https://twitter.com/fake-test"; }
await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
}); });
test.concurrent('should return successful response for crawl and wait for completion', async () => { test.concurrent('should return successful response for crawl and wait for completion', async () => {
@ -337,8 +349,13 @@ describe('FirecrawlApp E2E Tests', () => {
}, 60000); // 60 seconds timeout }, 60000); // 60 seconds timeout
test.concurrent('should throw error for invalid API key on map', async () => { test.concurrent('should throw error for invalid API key on map', async () => {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); if (API_URL.includes('api.firecrawl.dev')) {
await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404");
} else {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
}
}); });
test.concurrent('should throw error for blocklisted URL on map', async () => { test.concurrent('should throw error for blocklisted URL on map', async () => {
@ -355,8 +372,7 @@ describe('FirecrawlApp E2E Tests', () => {
}, 30000); // 30 seconds timeout }, 30000); // 30 seconds timeout
test.concurrent('should return successful response for valid map', async () => { test.concurrent('should return successful response for valid map', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
expect(response).not.toBeNull(); expect(response).not.toBeNull();
expect(response.links?.length).toBeGreaterThan(0); expect(response.links?.length).toBeGreaterThan(0);

View File

@ -183,6 +183,7 @@ export interface BatchScrapeResponse {
url?: string; url?: string;
success: true; success: true;
error?: string; error?: string;
invalidURLs?: string[];
} }
/** /**
@ -242,10 +243,11 @@ export interface MapResponse {
* Defines options for extracting information from URLs. * Defines options for extracting information from URLs.
*/ */
export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> { export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
-  prompt: string;
+  prompt?: string;
schema?: LLMSchema; schema?: LLMSchema;
systemPrompt?: string; systemPrompt?: string;
allowExternalLinks?: boolean; allowExternalLinks?: boolean;
includeSubdomains?: boolean;
} }
/** /**
@ -288,17 +290,23 @@ export default class FirecrawlApp {
public apiKey: string; public apiKey: string;
public apiUrl: string; public apiUrl: string;
private isCloudService(url: string): boolean {
return url.includes('api.firecrawl.dev');
}
/** /**
* Initializes a new instance of the FirecrawlApp class. * Initializes a new instance of the FirecrawlApp class.
* @param config - Configuration options for the FirecrawlApp instance. * @param config - Configuration options for the FirecrawlApp instance.
*/ */
constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) { constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
-    if (typeof apiKey !== "string") {
+    const baseUrl = apiUrl || "https://api.firecrawl.dev";
+    if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
      throw new FirecrawlError("No API key provided", 401);
    }
-    this.apiKey = apiKey;
-    this.apiUrl = apiUrl || "https://api.firecrawl.dev";
+    this.apiKey = apiKey || '';
+    this.apiUrl = baseUrl;
} }
/** /**
@ -576,9 +584,10 @@ export default class FirecrawlApp {
pollInterval: number = 2, pollInterval: number = 2,
idempotencyKey?: string, idempotencyKey?: string,
webhook?: CrawlParams["webhook"], webhook?: CrawlParams["webhook"],
ignoreInvalidURLs?: boolean,
): Promise<BatchScrapeStatusResponse | ErrorResponse> { ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
const headers = this.prepareHeaders(idempotencyKey); const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { urls, ...params }; let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params };
if (jsonData?.extract?.schema) { if (jsonData?.extract?.schema) {
let schema = jsonData.extract.schema; let schema = jsonData.extract.schema;
@ -621,10 +630,12 @@ export default class FirecrawlApp {
async asyncBatchScrapeUrls( async asyncBatchScrapeUrls(
urls: string[], urls: string[],
params?: ScrapeParams, params?: ScrapeParams,
idempotencyKey?: string idempotencyKey?: string,
webhook?: CrawlParams["webhook"],
ignoreInvalidURLs?: boolean,
): Promise<BatchScrapeResponse | ErrorResponse> { ): Promise<BatchScrapeResponse | ErrorResponse> {
const headers = this.prepareHeaders(idempotencyKey); const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { urls, ...(params ?? {}) }; let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...(params ?? {}) };
try { try {
const response: AxiosResponse = await this.postRequest( const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/batch/scrape`, this.apiUrl + `/v1/batch/scrape`,
@ -657,8 +668,10 @@ export default class FirecrawlApp {
urls: string[], urls: string[],
params?: ScrapeParams, params?: ScrapeParams,
idempotencyKey?: string, idempotencyKey?: string,
webhook?: CrawlParams["webhook"],
ignoreInvalidURLs?: boolean,
) { ) {
const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey); const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);
if (crawl.success && crawl.id) { if (crawl.success && crawl.id) {
const id = crawl.id; const id = crawl.id;
@ -932,9 +945,11 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
private ws: WebSocket; private ws: WebSocket;
public data: FirecrawlDocument<undefined>[]; public data: FirecrawlDocument<undefined>[];
public status: CrawlStatusResponse["status"]; public status: CrawlStatusResponse["status"];
public id: string;
constructor(id: string, app: FirecrawlApp) { constructor(id: string, app: FirecrawlApp) {
super(); super();
this.id = id;
this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey); this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
this.status = "scraping"; this.status = "scraping";
this.data = []; this.data = [];
@ -965,6 +980,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
detail: { detail: {
status: this.status, status: this.status,
data: this.data, data: this.data,
id: this.id,
}, },
})); }));
} else if (msg.type === "error") { } else if (msg.type === "error") {
@ -974,6 +990,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
status: this.status, status: this.status,
data: this.data, data: this.data,
error: msg.error, error: msg.error,
id: this.id,
}, },
})); }));
} else if (msg.type === "catchup") { } else if (msg.type === "catchup") {
@ -981,12 +998,18 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
this.data.push(...(msg.data.data ?? [])); this.data.push(...(msg.data.data ?? []));
for (const doc of this.data) { for (const doc of this.data) {
this.dispatchTypedEvent("document", new CustomEvent("document", { this.dispatchTypedEvent("document", new CustomEvent("document", {
detail: doc, detail: {
...doc,
id: this.id,
},
})); }));
} }
} else if (msg.type === "document") { } else if (msg.type === "document") {
this.dispatchTypedEvent("document", new CustomEvent("document", { this.dispatchTypedEvent("document", new CustomEvent("document", {
detail: msg.data, detail: {
...msg.data,
id: this.id,
},
})); }));
} }
} }
@ -996,14 +1019,21 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
this.ws.close(); this.ws.close();
return; return;
} }
try {
const msg = JSON.parse(ev.data) as Message; const msg = JSON.parse(ev.data) as Message;
messageHandler(msg); messageHandler(msg);
} catch (error) {
console.error("Error on message", error);
}
}).bind(this); }).bind(this);
this.ws.onclose = ((ev: CloseEvent) => { this.ws.onclose = ((ev: CloseEvent) => {
const msg = JSON.parse(ev.reason) as Message; try {
messageHandler(msg); const msg = JSON.parse(ev.reason) as Message;
messageHandler(msg);
} catch (error) {
console.error("Error on close", error);
}
}).bind(this); }).bind(this);
this.ws.onerror = ((_: Event) => { this.ws.onerror = ((_: Event) => {
@ -1013,6 +1043,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
status: this.status, status: this.status,
data: this.data, data: this.data,
error: "WebSocket error", error: "WebSocket error",
id: this.id,
}, },
})); }));
}).bind(this); }).bind(this);

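A short usage sketch of the relaxed constructor above (assuming the published package name `@mendable/firecrawl-js`; the self-hosted URL is an example value):

```typescript
import FirecrawlApp from "@mendable/firecrawl-js";

// Cloud: an API key is still required, and the constructor throws without one.
const cloud = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY ?? null });

// Self-hosted: the API key check only applies to api.firecrawl.dev, so a
// local instance can be used without a key.
const selfHosted = new FirecrawlApp({
  apiKey: null,
  apiUrl: "http://localhost:3002",
});
```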
View File

@ -196,7 +196,7 @@ app.post('/scrape', async (req: Request, res: Response) => {
} }
} }
-  const pageError = pageStatusCode !== 200 ? getError(pageStatusCode) : false;
+  const pageError = pageStatusCode !== 200 ? getError(pageStatusCode) : undefined;
if (!pageError) { if (!pageError) {
console.log(`✅ Scrape successful!`); console.log(`✅ Scrape successful!`);
@ -209,7 +209,7 @@ app.post('/scrape', async (req: Request, res: Response) => {
res.json({ res.json({
content: pageContent, content: pageContent,
pageStatusCode, pageStatusCode,
-    pageError
+    ...(pageError && { pageError })
}); });
}); });

View File

@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp # noqa from .firecrawl import FirecrawlApp # noqa
-__version__ = "1.6.4"
+__version__ = "1.6.8"
# Define the logger for the Firecrawl project # Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl") logger: logging.Logger = logging.getLogger("firecrawl")

View File

@ -29,12 +29,12 @@ def test_scrape_url_invalid_api_key():
invalid_app.scrape_url('https://firecrawl.dev') invalid_app.scrape_url('https://firecrawl.dev')
assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
def test_blocklisted_url(): # def test_blocklisted_url():
blocklisted_url = "https://facebook.com/fake-test" # blocklisted_url = "https://facebook.com/fake-test"
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') # app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
with pytest.raises(Exception) as excinfo: # with pytest.raises(Exception) as excinfo:
app.scrape_url(blocklisted_url) # app.scrape_url(blocklisted_url)
assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) # assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
def test_successful_response_with_valid_preview_token(): def test_successful_response_with_valid_preview_token():
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0') app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0')
@ -90,12 +90,12 @@ def test_crawl_url_invalid_api_key():
invalid_app.crawl_url('https://firecrawl.dev') invalid_app.crawl_url('https://firecrawl.dev')
assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
def test_should_return_error_for_blocklisted_url(): # def test_should_return_error_for_blocklisted_url():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') # app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
blocklisted_url = "https://twitter.com/fake-test" # blocklisted_url = "https://twitter.com/fake-test"
with pytest.raises(Exception) as excinfo: # with pytest.raises(Exception) as excinfo:
app.crawl_url(blocklisted_url) # app.crawl_url(blocklisted_url)
assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) # assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
def test_crawl_url_wait_for_completion_e2e(): def test_crawl_url_wait_for_completion_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')

View File

@ -30,12 +30,12 @@ def test_scrape_url_invalid_api_key():
invalid_app.scrape_url('https://firecrawl.dev') invalid_app.scrape_url('https://firecrawl.dev')
assert "Unauthorized: Invalid token" in str(excinfo.value) assert "Unauthorized: Invalid token" in str(excinfo.value)
def test_blocklisted_url(): # def test_blocklisted_url():
blocklisted_url = "https://facebook.com/fake-test" # blocklisted_url = "https://facebook.com/fake-test"
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) # app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
with pytest.raises(Exception) as excinfo: # with pytest.raises(Exception) as excinfo:
app.scrape_url(blocklisted_url) # app.scrape_url(blocklisted_url)
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) # assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
def test_successful_response_with_valid_preview_token(): def test_successful_response_with_valid_preview_token():
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
@ -136,12 +136,12 @@ def test_crawl_url_invalid_api_key():
invalid_app.crawl_url('https://firecrawl.dev') invalid_app.crawl_url('https://firecrawl.dev')
assert "Unauthorized: Invalid token" in str(excinfo.value) assert "Unauthorized: Invalid token" in str(excinfo.value)
def test_should_return_error_for_blocklisted_url(): # def test_should_return_error_for_blocklisted_url():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) # app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
blocklisted_url = "https://twitter.com/fake-test" # blocklisted_url = "https://twitter.com/fake-test"
with pytest.raises(Exception) as excinfo: # with pytest.raises(Exception) as excinfo:
app.crawl_url(blocklisted_url) # app.crawl_url(blocklisted_url)
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) # assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
def test_crawl_url_wait_for_completion_e2e(): def test_crawl_url_wait_for_completion_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@ -296,12 +296,12 @@ def test_invalid_api_key_on_map():
invalid_app.map_url('https://roastmywebsite.ai') invalid_app.map_url('https://roastmywebsite.ai')
assert "Unauthorized: Invalid token" in str(excinfo.value) assert "Unauthorized: Invalid token" in str(excinfo.value)
def test_blocklisted_url_on_map(): # def test_blocklisted_url_on_map():
app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) # app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
blocklisted_url = "https://facebook.com/fake-test" # blocklisted_url = "https://facebook.com/fake-test"
with pytest.raises(Exception) as excinfo: # with pytest.raises(Exception) as excinfo:
app.map_url(blocklisted_url) # app.map_url(blocklisted_url)
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) # assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
def test_successful_response_with_valid_preview_token_on_map(): def test_successful_response_with_valid_preview_token_on_map():
app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL) app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL)

View File

@ -26,7 +26,7 @@ class FirecrawlApp:
""" """
Parameters for the extract operation. Parameters for the extract operation.
""" """
-    prompt: str
+    prompt: Optional[str] = None
schema_: Optional[Any] = pydantic.Field(None, alias='schema') schema_: Optional[Any] = pydantic.Field(None, alias='schema')
system_prompt: Optional[str] = None system_prompt: Optional[str] = None
allow_external_links: Optional[bool] = False allow_external_links: Optional[bool] = False
@ -704,15 +704,15 @@ class CrawlWatcher:
async def _handle_message(self, msg: Dict[str, Any]): async def _handle_message(self, msg: Dict[str, Any]):
if msg['type'] == 'done': if msg['type'] == 'done':
self.status = 'completed' self.status = 'completed'
self.dispatch_event('done', {'status': self.status, 'data': self.data}) self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
elif msg['type'] == 'error': elif msg['type'] == 'error':
self.status = 'failed' self.status = 'failed'
self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error']}) self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
elif msg['type'] == 'catchup': elif msg['type'] == 'catchup':
self.status = msg['data']['status'] self.status = msg['data']['status']
self.data.extend(msg['data'].get('data', [])) self.data.extend(msg['data'].get('data', []))
for doc in self.data: for doc in self.data:
self.dispatch_event('document', doc) self.dispatch_event('document', {'data': doc, 'id': self.id})
elif msg['type'] == 'document': elif msg['type'] == 'document':
self.data.append(msg['data']) self.data.append(msg['data'])
self.dispatch_event('document', msg['data']) self.dispatch_event('document', {'data': msg['data'], 'id': self.id})

View File

@ -5,20 +5,20 @@ use firecrawl::FirecrawlApp;
use serde_json::json; use serde_json::json;
use std::env; use std::env;
#[tokio::test] // #[tokio::test]
async fn test_blocklisted_url() { // async fn test_blocklisted_url() {
dotenv().ok(); // dotenv().ok();
let api_url = env::var("API_URL").unwrap(); // let api_url = env::var("API_URL").unwrap();
let api_key = env::var("TEST_API_KEY").ok(); // let api_key = env::var("TEST_API_KEY").ok();
let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap(); // let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
let blocklisted_url = "https://facebook.com/fake-test"; // let blocklisted_url = "https://facebook.com/fake-test";
let result = app.scrape_url(blocklisted_url, None).await; // let result = app.scrape_url(blocklisted_url, None).await;
assert_matches!( // assert_matches!(
result, // result,
Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions") // Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions")
); // );
} // }
#[tokio::test] #[tokio::test]
async fn test_successful_response_with_valid_preview_token() { async fn test_successful_response_with_valid_preview_token() {
@ -103,20 +103,21 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explici
.contains("We present spectrophotometric observations of the Broad Line Radio Galaxy")); .contains("We present spectrophotometric observations of the Broad Line Radio Galaxy"));
} }
#[tokio::test]
async fn test_should_return_error_for_blocklisted_url() {
dotenv().ok();
let api_url = env::var("API_URL").unwrap();
let api_key = env::var("TEST_API_KEY").ok();
let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
let blocklisted_url = "https://twitter.com/fake-test";
let result = app.crawl_url(blocklisted_url, None).await;
assert_matches!( // #[tokio::test]
result, // async fn test_should_return_error_for_blocklisted_url() {
Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions.") // dotenv().ok();
); // let api_url = env::var("API_URL").unwrap();
} // let api_key = env::var("TEST_API_KEY").ok();
// let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
// let blocklisted_url = "https://twitter.com/fake-test";
// let result = app.crawl_url(blocklisted_url, None).await;
// assert_matches!(
// result,
// Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions.")
// );
// }
#[tokio::test] #[tokio::test]
async fn test_llm_extraction() { async fn test_llm_extraction() {

View File

@ -0,0 +1,147 @@
import os
import json
import requests
from dotenv import load_dotenv
from openai import OpenAI
from serpapi import GoogleSearch
# ANSI color codes
class Colors:
CYAN = '\033[96m'
YELLOW = '\033[93m'
GREEN = '\033[92m'
RED = '\033[91m'
MAGENTA = '\033[95m'
BLUE = '\033[94m'
RESET = '\033[0m'
# Load environment variables
load_dotenv()
# Initialize clients
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
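# Requires OPENAI_API_KEY and FIRECRAWL_API_KEY in the environment (e.g. via .env);
# SERP_API_KEY is read later, inside search_google().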
def search_google(query):
"""Search Google using SerpAPI and return top results."""
print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}")
search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
return search.get_dict().get("organic_results", [])
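# Example: search_google("Firecrawl") returns SerpAPI's organic results, a list of
# dicts with keys such as "title", "link" and "snippet".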
def select_urls_with_o1(company, objective, serp_results):
"""
Use O1 to select the most relevant URLs from SERP results for the given company and objective.
Returns a JSON object with a "selected_urls" property that is an array of strings.
"""
try:
# Prepare the data for O1
serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")}
for r in serp_results if r.get("link")]
response = client.chat.completions.create(
model="o1-2024-12-17",
messages=[
{
"role": "developer",
"content": "You select URLs from the SERP results relevant to the company and objective."
},
{
"role": "user",
"content": (
f"Company: {company}\n"
f"Objective: {objective}\n"
f"SERP Results: {json.dumps(serp_data)}\n\n"
"Return a JSON object with a property 'selected_urls' that contains an array "
"of URLs most likely to help meet the objective. Add a /* to the end of the URL if you think it should search all of the pages in the site. Do not return any social media links. For example: {\"selected_urls\": [\"https://example.com\", \"https://example2.com\"]}"
)
}
],
response_format={
"type": "json_schema",
"json_schema": {
"name": "selected_urls_object",
"schema": {
"type": "object",
"properties": {
"selected_urls": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": ["selected_urls"],
"additionalProperties": False
}
}
}
)
# The response is guaranteed to follow the specified JSON schema
result = json.loads(response.choices[0].message.content)
urls = result.get("selected_urls", [])
return urls
except Exception as e:
print(f"{Colors.RED}Error selecting URLs with O1: {e}{Colors.RESET}")
return []
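# Returns a plain list of URL strings, e.g. ["https://example.com", "https://example.com/*"];
# an empty list means selection failed and the caller should stop.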
def extract_company_info(urls, prompt, company, api_key):
"""Use requests to call Firecrawl's extract endpoint with selected URLs."""
print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}")
headers = {
'Content-Type': 'application/json',
'Authorization': f'Bearer {api_key}'
}
payload = {
"urls": urls,
"prompt": prompt + " for " + company
}
try:
response = requests.post(
"https://api.firecrawl.dev/v1/extract",
headers=headers,
json=payload
)
response.raise_for_status()
data = response.json()
return data
except Exception as e:
print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}")
return None
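# Returns Firecrawl's parsed JSON response (expected to carry 'success' and 'data'
# keys, as checked in main() below), or None if the request failed.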
def main():
company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}")
objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}")
serp_results = search_google(f"{company}")
if not serp_results:
print(f"{Colors.RED}No search results found.{Colors.RESET}")
return
# Ask O1 to select URLs
selected_urls = select_urls_with_o1(company, objective, serp_results)
if not selected_urls:
print(f"{Colors.RED}O1 did not return any URLs.{Colors.RESET}")
return
print(f"{Colors.CYAN}Selected URLs for extraction by O1:{Colors.RESET}")
for url in selected_urls:
print(f"- {url}")
data = extract_company_info(selected_urls, objective, company, firecrawl_api_key)
if data and data.get('success') and data.get('data'):
print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}")
print(json.dumps(data['data'], indent=2))
else:
print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}")
if __name__ == "__main__":
main()
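The script above is interactive (it prompts for a company and an objective). For scripted use, the same functions can be called directly; a minimal non-interactive sketch, assuming the file is saved as `o1_web_extractor.py` (the filename is not given in this diff) and the three API keys are set in `.env`:

```python
# Sketch only: drive the pipeline without the interactive prompts.
from o1_web_extractor import (
    search_google,
    select_urls_with_o1,
    extract_company_info,
    firecrawl_api_key,
)

company = "Example Corp"
objective = "Find pricing and key product features"

serp_results = search_google(company)
urls = select_urls_with_o1(company, objective, serp_results)
if urls:
    result = extract_company_info(urls, objective, company, firecrawl_api_key)
    if result and result.get("success"):
        print(result["data"])
```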