Merge branch 'main' into fix-sdk/next-in-when-502

Nicolas 2024-12-26 12:53:10 -03:00
commit bcc18e1c07
60 changed files with 1195 additions and 425 deletions

View File

@ -111,6 +111,20 @@ curl -X POST http://localhost:3002/v1/crawl \
}'
```
### Alternative: Using Docker Compose
For a simpler setup, you can use Docker Compose to run all services:
1. Prerequisites: Make sure you have Docker and Docker Compose installed
2. Copy the `.env.example` file to `.env` in the `/apps/api/` directory and configure as needed
3. From the root directory, run:
```bash
docker compose up
```
This will start Redis, the API server, and workers automatically in the correct configuration.
## Tests:
The best way to run the tests without authentication is with `npm run test:local-no-auth`.

apps/api/.gitignore (vendored), 2 changes
View File

@ -9,3 +9,5 @@ dump.rdb
.rdb
.sentryclirc
.env.*

View File

@ -7478,7 +7478,7 @@ snapshots:
extract-zip@2.0.1:
dependencies:
debug: 4.3.4
debug: 4.3.5
get-stream: 5.2.0
yauzl: 2.10.0
optionalDependencies:
@ -7622,7 +7622,7 @@ snapshots:
dependencies:
basic-ftp: 5.0.5
data-uri-to-buffer: 6.0.2
debug: 4.3.4
debug: 4.3.5
fs-extra: 11.2.0
transitivePeerDependencies:
- supports-color
@ -7723,7 +7723,7 @@ snapshots:
http-proxy-agent@7.0.2:
dependencies:
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
transitivePeerDependencies:
- supports-color
@ -7771,7 +7771,7 @@ snapshots:
https-proxy-agent@7.0.5:
dependencies:
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
transitivePeerDependencies:
- supports-color
@ -8836,7 +8836,7 @@ snapshots:
dependencies:
'@tootallnate/quickjs-emscripten': 0.23.0
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
get-uri: 6.0.3
http-proxy-agent: 7.0.2
https-proxy-agent: 7.0.5
@ -9031,7 +9031,7 @@ snapshots:
proxy-agent@6.4.0:
dependencies:
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
http-proxy-agent: 7.0.2
https-proxy-agent: 7.0.5
lru-cache: 7.18.3
@ -9338,7 +9338,7 @@ snapshots:
socks-proxy-agent@8.0.4:
dependencies:
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
socks: 2.8.3
transitivePeerDependencies:
- supports-color

View File

@ -1,6 +1,7 @@
import request from "supertest";
import dotenv from "dotenv";
import { v4 as uuidv4 } from "uuid";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
dotenv.config();
@ -58,9 +59,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
});
// tested on rate limit test
@ -480,9 +479,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
});
it.concurrent(

View File

@ -1,5 +1,6 @@
import request from "supertest";
import dotenv from "dotenv";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
const fs = require("fs");
const path = require("path");
@ -61,9 +62,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
.set("Content-Type", "application/json")
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
});
it("should return a successful response", async () => {
@ -88,9 +87,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
.set("Content-Type", "application/json")
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
});
it("should return a successful response", async () => {
@ -119,9 +116,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
.set("Content-Type", "application/json")
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
});
it("should return a successful response", async () => {

View File

@ -4,6 +4,7 @@ import {
ScrapeRequestInput,
ScrapeResponseRequestTest,
} from "../../controllers/v1/types";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
configDotenv();
const TEST_URL = "http://127.0.0.1:3002";
@ -57,9 +58,7 @@ describe("E2E Tests for v1 API Routes", () => {
.send(scrapeRequest);
expect(response.statusCode).toBe(403);
expect(response.body.error).toBe(
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.",
);
expect(response.body.error).toBe(BLOCKLISTED_URL_MESSAGE);
});
it.concurrent(
@ -756,9 +755,7 @@ describe("E2E Tests for v1 API Routes", () => {
.send(scrapeRequest);
expect(response.statusCode).toBe(403);
expect(response.body.error).toBe(
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.",
);
expect(response.body.error).toBe(BLOCKLISTED_URL_MESSAGE);
});
it.concurrent(

View File

@ -351,6 +351,7 @@ function getPlanByPriceId(price_id: string | null): PlanType {
case process.env.STRIPE_PRICE_ID_ETIER1A_MONTHLY: //ocqh
return "etier1a";
case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_MONTHLY:
case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_YEARLY:
return "etierscale1";
default:
return "free";

View File

@ -0,0 +1,64 @@
import { logger } from "../../../lib/logger";
import * as Sentry from "@sentry/node";
import { Request, Response } from "express";
export async function checkFireEngine(req: Request, res: Response) {
try {
if (!process.env.FIRE_ENGINE_BETA_URL) {
logger.warn("Fire engine beta URL not configured");
return res.status(500).json({
success: false,
error: "Fire engine beta URL not configured",
});
}
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), 30000);
try {
const response = await fetch(
`${process.env.FIRE_ENGINE_BETA_URL}/scrape`,
{
method: "POST",
headers: {
"Content-Type": "application/json",
"X-Disable-Cache": "true",
},
body: JSON.stringify({
url: "https://example.com",
}),
signal: controller.signal,
},
);
clearTimeout(timeout);
if (response.ok) {
const responseData = await response.json();
return res.status(200).json({
data: responseData,
});
} else {
return res.status(response.status).json({
success: false,
error: `Fire engine returned status ${response.status}`,
});
}
} catch (error) {
if (error.name === "AbortError") {
return res.status(504).json({
success: false,
error: "Request timed out after 30 seconds",
});
}
throw error;
}
} catch (error) {
logger.error(error);
Sentry.captureException(error);
return res.status(500).json({
success: false,
error: "Internal server error",
});
}
}

View File

@ -29,6 +29,7 @@ import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types";
import { ZodError } from "zod";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
export async function crawlController(req: Request, res: Response) {
try {
@ -112,8 +113,7 @@ export async function crawlController(req: Request, res: Response) {
if (isUrlBlocked(url)) {
return res.status(403).json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
error: BLOCKLISTED_URL_MESSAGE,
});
}

View File

@ -15,6 +15,7 @@ import { addScrapeJob } from "../../../src/services/queue-jobs";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node";
import { fromLegacyScrapeOptions } from "../v1/types";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
export async function crawlPreviewController(req: Request, res: Response) {
try {
@ -42,8 +43,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
if (isUrlBlocked(url)) {
return res.status(403).json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
error: BLOCKLISTED_URL_MESSAGE,
});
}

View File

@ -8,7 +8,6 @@ import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job";
import {
Document,
fromLegacyCombo,
toLegacyDocument,
url as urlSchema,
@ -29,6 +28,8 @@ import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
import { fromLegacyScrapeOptions } from "../v1/types";
import { ZodError } from "zod";
import { Document as V0Document } from "./../../lib/entities";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
export async function scrapeHelper(
jobId: string,
@ -42,7 +43,7 @@ export async function scrapeHelper(
): Promise<{
success: boolean;
error?: string;
data?: Document | { url: string };
data?: V0Document | { url: string };
returnCode: number;
}> {
const url = urlSchema.parse(req.body.url);
@ -53,8 +54,7 @@ export async function scrapeHelper(
if (isUrlBlocked(url)) {
return {
success: false,
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
error: BLOCKLISTED_URL_MESSAGE,
returnCode: 403,
};
}
@ -241,9 +241,9 @@ export async function scrapeController(req: Request, res: Response) {
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens =
result.data && (result.data as Document).markdown
result.data && (result.data as V0Document).markdown
? numTokensFromString(
(result.data as Document).markdown!,
(result.data as V0Document).markdown!,
"gpt-3.5-turbo",
)
: 0;
@ -265,25 +265,28 @@ export async function scrapeController(req: Request, res: Response) {
}
if (creditsToBeBilled > 0) {
// billing for doc done on queue end, bill only for llm extraction
billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch((error) => {
logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
);
// Optionally, you could notify an admin or add to a retry queue here
});
billTeam(team_id, chunk?.sub_id, creditsToBeBilled, logger).catch(
(error) => {
logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
{ error },
);
// Optionally, you could notify an admin or add to a retry queue here
},
);
}
}
let doc = result.data;
if (!pageOptions || !pageOptions.includeRawHtml) {
if (doc && (doc as Document).rawHtml) {
delete (doc as Document).rawHtml;
if (doc && (doc as V0Document).rawHtml) {
delete (doc as V0Document).rawHtml;
}
}
if (pageOptions && pageOptions.includeExtract) {
if (!pageOptions.includeMarkdown && doc && (doc as Document).markdown) {
delete (doc as Document).markdown;
if (!pageOptions.includeMarkdown && doc && (doc as V0Document).markdown) {
delete (doc as V0Document).markdown;
}
}
@ -312,7 +315,7 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(result.returnCode).json(result);
} catch (error) {
Sentry.captureException(error);
logger.error(error);
logger.error("Scrape error occcurred", { error });
return res.status(500).json({
error:
error instanceof ZodError

View File

@ -1,4 +1,5 @@
import { url } from "../types";
import { BLOCKLISTED_URL_MESSAGE } from "../../../lib/strings";
describe("URL Schema Validation", () => {
beforeEach(() => {
@ -31,7 +32,7 @@ describe("URL Schema Validation", () => {
it("should reject blocked URLs", () => {
expect(() => url.parse("https://facebook.com")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
BLOCKLISTED_URL_MESSAGE,
);
});
@ -47,16 +48,16 @@ describe("URL Schema Validation", () => {
it("should handle URLs with subdomains that are blocked", () => {
expect(() => url.parse("https://sub.facebook.com")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
BLOCKLISTED_URL_MESSAGE,
);
});
it("should handle URLs with paths that are blocked", () => {
expect(() => url.parse("http://facebook.com/path")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
BLOCKLISTED_URL_MESSAGE,
);
expect(() => url.parse("https://facebook.com/another/path")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
BLOCKLISTED_URL_MESSAGE,
);
});

View File

@ -3,9 +3,11 @@ import { v4 as uuidv4 } from "uuid";
import {
BatchScrapeRequest,
batchScrapeRequestSchema,
CrawlResponse,
batchScrapeRequestSchemaNoURLValidation,
url as urlSchema,
RequestWithAuth,
ScrapeOptions,
BatchScrapeResponse,
} from "./types";
import {
addCrawlJobs,
@ -21,10 +23,14 @@ import { callWebhook } from "../../services/webhook";
import { logger as _logger } from "../../lib/logger";
export async function batchScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
res: Response<CrawlResponse>,
req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
res: Response<BatchScrapeResponse>,
) {
req.body = batchScrapeRequestSchema.parse(req.body);
if (req.body?.ignoreInvalidURLs === true) {
req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body);
} else {
req.body = batchScrapeRequestSchema.parse(req.body);
}
const id = req.body.appendToId ?? uuidv4();
const logger = _logger.child({
@ -35,8 +41,27 @@ export async function batchScrapeController(
teamId: req.auth.team_id,
plan: req.auth.plan,
});
let urls = req.body.urls;
let invalidURLs: string[] | undefined = undefined;
if (req.body.ignoreInvalidURLs) {
invalidURLs = [];
let pendingURLs = urls;
urls = [];
for (const u of pendingURLs) {
try {
const nu = urlSchema.parse(u);
urls.push(nu);
} catch (_) {
invalidURLs.push(u);
}
}
}
logger.debug("Batch scrape " + id + " starting", {
urlsLength: req.body.urls,
urlsLength: urls,
appendToId: req.body.appendToId,
account: req.account,
});
@ -70,7 +95,7 @@ export async function batchScrapeController(
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if (req.body.urls.length > 1000) {
if (urls.length > 1000) {
// set base to 21
jobPriority = await getJobPriority({
plan: req.auth.plan,
@ -84,7 +109,7 @@ export async function batchScrapeController(
delete (scrapeOptions as any).urls;
delete (scrapeOptions as any).appendToId;
const jobs = req.body.urls.map((x) => {
const jobs = urls.map((x) => {
return {
data: {
url: x,
@ -140,5 +165,6 @@ export async function batchScrapeController(
success: true,
id,
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
invalidURLs,
});
}
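
The `ignoreInvalidURLs` flag introduced above changes how batch scrape input is validated: invalid entries are filtered out and echoed back rather than failing the whole request. A minimal client sketch, assuming a local API at port 3002 and Bearer-token auth (both assumptions):

```typescript
// Sketch: submit a batch where one entry is not a valid URL.
// With ignoreInvalidURLs: true, the bad entry is reported in `invalidURLs`
// instead of rejecting the entire request.
async function submitBatch(apiKey: string) {
  const res = await fetch("http://localhost:3002/v1/batch/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`, // assumed auth scheme
    },
    body: JSON.stringify({
      urls: ["https://example.com", "not a url"],
      ignoreInvalidURLs: true,
      formats: ["markdown"],
    }),
  });
  // BatchScrapeResponse: { success: true, id, url, invalidURLs?: string[] }
  const body = await res.json();
  console.log(body.invalidURLs); // e.g. ["not a url"]
}
```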

View File

@ -18,7 +18,7 @@ import {
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob } from "../../services/queue-jobs";
import { addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
import { logger as _logger } from "../../lib/logger";
import { getJobPriority } from "../../lib/job-priority";
import { callWebhook } from "../../services/webhook";
@ -139,9 +139,9 @@ export async function crawlController(
name: uuid,
data: {
url,
mode: "single_urls",
mode: "single_urls" as const,
team_id: req.auth.team_id,
plan: req.auth.plan,
plan: req.auth.plan!,
crawlerOptions,
scrapeOptions,
internalOptions: sc.internalOptions,
@ -170,7 +170,7 @@ export async function crawlController(
jobs.map((x) => x.opts.jobId),
);
logger.debug("Adding scrape jobs to BullMQ...");
await getScrapeQueue().addBulk(jobs);
await addScrapeJobs(jobs);
} else {
logger.debug("Sitemap not found or ignored.", {
ignoreSitemap: sc.crawlerOptions.ignoreSitemap,

View File

@ -0,0 +1,45 @@
import { Request, Response } from "express";
import { RequestWithAuth } from "./types";
import { getACUC } from "../auth";
import { logger } from "../../lib/logger";
export async function creditUsageController(
req: RequestWithAuth,
res: Response,
): Promise<void> {
try {
// If we already have the credit usage info from auth, use it
if (req.acuc) {
res.json({
success: true,
data: {
remaining_credits: req.acuc.remaining_credits,
},
});
return;
}
// Otherwise fetch fresh data
const chunk = await getACUC(req.auth.team_id);
if (!chunk) {
res.status(404).json({
success: false,
error: "Could not find credit usage information",
});
return;
}
res.json({
success: true,
data: {
remaining_credits: chunk.remaining_credits,
},
});
} catch (error) {
logger.error("Error in credit usage controller:", error);
res.status(500).json({
success: false,
error: "Internal server error while fetching credit usage",
});
}
}

View File

@ -1,6 +1,6 @@
import { Request, Response } from "express";
import {
// Document,
Document,
RequestWithAuth,
ExtractRequest,
extractRequestSchema,
@ -8,7 +8,7 @@ import {
MapDocument,
scrapeOptions,
} from "./types";
import { Document } from "../../lib/entities";
// import { Document } from "../../lib/entities";
import Redis from "ioredis";
import { configDotenv } from "dotenv";
import { performRanking } from "../../lib/ranker";
@ -24,6 +24,9 @@ import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { getMapResults } from "./map";
import { buildDocument } from "../../lib/extract/build-document";
import { generateBasicCompletion } from "../../lib/LLM-extraction";
import { buildRefrasedPrompt } from "../../lib/extract/build-prompts";
import { removeDuplicateUrls } from "../../lib/validateUrl";
configDotenv();
const redis = new Redis(process.env.REDIS_URL!);
@ -61,30 +64,50 @@ export async function extractController(
const baseUrl = url.replace("/*", "");
// const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
const allowExternalLinks = req.body.allowExternalLinks ?? true;
const allowExternalLinks = req.body.allowExternalLinks;
let urlWithoutWww = baseUrl.replace("www.", "");
let mapUrl =
req.body.prompt && allowExternalLinks
? `${req.body.prompt} ${urlWithoutWww}`
: req.body.prompt
? `${req.body.prompt} site:${urlWithoutWww}`
: `site:${urlWithoutWww}`;
let rephrasedPrompt = req.body.prompt;
if (req.body.prompt) {
rephrasedPrompt =
(await generateBasicCompletion(
buildRefrasedPrompt(req.body.prompt, baseUrl),
)) ?? req.body.prompt;
}
const mapResults = await getMapResults({
url: baseUrl,
search: req.body.prompt,
search: rephrasedPrompt,
teamId: req.auth.team_id,
plan: req.auth.plan,
allowExternalLinks,
origin: req.body.origin,
limit: req.body.limit,
// If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
ignoreSitemap: !selfHosted ? true : false,
ignoreSitemap: false,
includeMetadata: true,
includeSubdomains: req.body.includeSubdomains,
});
let mappedLinks = mapResults.links as MapDocument[];
let mappedLinks = mapResults.mapResults as MapDocument[];
// Remove duplicates between mapResults.links and mappedLinks
const allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
const uniqueUrls = removeDuplicateUrls(allUrls);
// Only add URLs from mapResults.links that aren't already in mappedLinks
const existingUrls = new Set(mappedLinks.map((m) => m.url));
const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url));
mappedLinks = [
...mappedLinks,
...newUrls.map((url) => ({ url, title: "", description: "" })),
];
if (mappedLinks.length === 0) {
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
}
// Limit number of links to MAX_EXTRACT_LIMIT
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
@ -93,18 +116,18 @@ export async function extractController(
`url: ${x.url}, title: ${x.title}, description: ${x.description}`,
);
// Filter by path prefix if present
// wrong
// if (pathPrefix) {
// mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
// }
if (req.body.prompt) {
let searchQuery =
req.body.prompt && allowExternalLinks
? `${req.body.prompt} ${urlWithoutWww}`
: req.body.prompt
? `${req.body.prompt} site:${urlWithoutWww}`
: `site:${urlWithoutWww}`;
// Get similarity scores between the search query and each link's context
const linksAndScores = await performRanking(
mappedLinksRerank,
mappedLinks.map((l) => l.url),
mapUrl,
searchQuery,
);
// First try with high threshold
@ -158,7 +181,8 @@ export async function extractController(
// Wait for all URL processing to complete and flatten results
const processedUrls = await Promise.all(urlPromises);
links.push(...processedUrls.flat());
const flattenedUrls = processedUrls.flat().filter((url) => url); // Filter out any null/undefined values
links.push(...flattenedUrls);
if (links.length === 0) {
return res.status(400).json({
@ -204,21 +228,8 @@ export async function extractController(
}
return doc;
} catch (e) {
logger.error(`Error in scrapeController: ${e}`);
if (
e instanceof Error &&
(e.message.startsWith("Job wait") || e.message === "timeout")
) {
throw {
status: 408,
error: "Request timed out",
};
} else {
throw {
status: 500,
error: `(Internal server error) - ${e && e.message ? e.message : e}`,
};
}
logger.error(`Error in extractController: ${e}`);
return null;
}
});
@ -237,7 +248,8 @@ export async function extractController(
{
mode: "llm",
systemPrompt:
"Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided. Here are the urls the user provided of which he wants to extract information from: " +
(req.body.systemPrompt ? `${req.body.systemPrompt}\n` : "") +
"Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
links.join(", "),
prompt: req.body.prompt,
schema: req.body.schema,

View File

@ -28,14 +28,15 @@ const redis = new Redis(process.env.REDIS_URL!);
// Max Links that /map can return
const MAX_MAP_LIMIT = 5000;
// Max Links that "Smart /map" can return
const MAX_FIRE_ENGINE_RESULTS = 1000;
const MAX_FIRE_ENGINE_RESULTS = 500;
interface MapResult {
success: boolean;
links: string[] | any[];
links: string[];
scrape_id?: string;
job_id: string;
time_taken: number;
mapResults: MapDocument[];
}
export async function getMapResults({
@ -215,7 +216,8 @@ export async function getMapResults({
return {
success: true,
links: includeMetadata ? mapResults : linksToReturn,
links: linksToReturn,
mapResults: mapResults,
scrape_id: origin?.includes("website") ? id : undefined,
job_id: id,
time_taken: (new Date().getTime() - Date.now()) / 1000,

View File

@ -60,7 +60,11 @@ export async function scrapeController(
try {
doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
} catch (e) {
logger.error(`Error in scrapeController: ${e}`);
logger.error(`Error in scrapeController: ${e}`, {
jobId,
scrapeId: jobId,
startTime,
});
if (
e instanceof Error &&
(e.message.startsWith("Job wait") || e.message === "timeout")

View File

@ -11,6 +11,7 @@ import {
Document as V0Document,
} from "../../lib/entities";
import { InternalOptions } from "../../scraper/scrapeURL";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
export type Format =
| "markdown"
@ -44,10 +45,7 @@ export const url = z.preprocess(
return false;
}
}, "Invalid URL")
.refine(
(x) => !isUrlBlocked(x as string),
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
),
.refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
);
const strictMessage =
@ -182,6 +180,7 @@ export const scrapeOptions = z
.optional(),
skipTlsVerification: z.boolean().default(false),
removeBase64Images: z.boolean().default(true),
fastMode: z.boolean().default(false),
})
.strict(strictMessage);
@ -193,11 +192,12 @@ export const extractV1Options = z
.array()
.max(10, "Maximum of 10 URLs allowed per request while in beta."),
prompt: z.string().optional(),
systemPrompt: z.string().optional(),
schema: z.any().optional(),
limit: z.number().int().positive().finite().safe().optional(),
ignoreSitemap: z.boolean().default(false),
includeSubdomains: z.boolean().default(true),
allowExternalLinks: z.boolean().default(true),
allowExternalLinks: z.boolean().default(false),
origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000),
})
@ -262,6 +262,31 @@ export const batchScrapeRequestSchema = scrapeOptions
origin: z.string().optional().default("api"),
webhook: webhookSchema.optional(),
appendToId: z.string().uuid().optional(),
ignoreInvalidURLs: z.boolean().default(false),
})
.strict(strictMessage)
.refine(
(obj) => {
const hasExtractFormat = obj.formats?.includes("extract");
const hasExtractOptions = obj.extract !== undefined;
return (
(hasExtractFormat && hasExtractOptions) ||
(!hasExtractFormat && !hasExtractOptions)
);
},
{
message:
"When 'extract' format is specified, 'extract' options must be provided, and vice versa",
},
);
export const batchScrapeRequestSchemaNoURLValidation = scrapeOptions
.extend({
urls: z.string().array(),
origin: z.string().optional().default("api"),
webhook: webhookSchema.optional(),
appendToId: z.string().uuid().optional(),
ignoreInvalidURLs: z.boolean().default(false),
})
.strict(strictMessage)
.refine(
@ -396,7 +421,7 @@ export type Document = {
articleSection?: string;
url?: string;
sourceURL?: string;
statusCode?: number;
statusCode: number;
error?: string;
[key: string]: string | string[] | number | undefined;
};
@ -446,6 +471,15 @@ export type CrawlResponse =
url: string;
};
export type BatchScrapeResponse =
| ErrorResponse
| {
success: true;
id: string;
url: string;
invalidURLs?: string[];
};
export type MapResponse =
| ErrorResponse
| {
@ -651,11 +685,11 @@ export function fromLegacyScrapeOptions(
}
: undefined,
mobile: pageOptions.mobile,
fastMode: pageOptions.useFastMode,
}),
internalOptions: {
atsv: pageOptions.atsv,
v0DisableJsDom: pageOptions.disableJsDom,
v0UseFastMode: pageOptions.useFastMode,
},
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
};

View File

@ -62,3 +62,16 @@ export async function generateCompletions(
return completions;
}
// generate basic completion
export async function generateBasicCompletion(prompt: string) {
const openai = new OpenAI();
const model = "gpt-4o";
const completion = await openai.chat.completions.create({
model,
messages: [{ role: "user", content: prompt }],
});
return completion.choices[0].message.content;
}

View File

@ -21,7 +21,7 @@ export function cacheKey(
if (
internalOptions.v0CrawlOnlyUrls ||
internalOptions.forceEngine ||
internalOptions.v0UseFastMode ||
scrapeOptions.fastMode ||
internalOptions.atsv ||
(scrapeOptions.actions && scrapeOptions.actions.length > 0)
) {

View File

@ -60,6 +60,8 @@ export async function addCrawlJob(id: string, job_id: string) {
}
export async function addCrawlJobs(id: string, job_ids: string[]) {
if (job_ids.length === 0) return true;
_logger.debug("Adding crawl jobs to Redis...", {
jobIds: job_ids,
module: "crawl-redis",
@ -90,12 +92,20 @@ export async function addCrawlJobDone(
if (success) {
await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
} else {
// in case it's already been pushed, make sure it's removed
await redisConnection.lrem(
"crawl:" + id + ":jobs_done_ordered",
-1,
job_id,
);
}
await redisConnection.expire(
"crawl:" + id + ":jobs_done_ordered",
24 * 60 * 60,
"NX",
);
}
export async function getDoneJobsOrderedLength(id: string): Promise<number> {
@ -227,13 +237,6 @@ export async function lockURL(
url = normalizeURL(url, sc);
logger = logger.child({ url });
await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
await redisConnection.expire(
"crawl:" + id + ":visited_unique",
24 * 60 * 60,
"NX",
);
let res: boolean;
if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0;
@ -249,6 +252,15 @@ export async function lockURL(
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
if (res) {
await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
await redisConnection.expire(
"crawl:" + id + ":visited_unique",
24 * 60 * 60,
"NX",
);
}
logger.debug("Locking URL " + JSON.stringify(url) + "... result: " + res, {
res,
});
@ -261,6 +273,8 @@ export async function lockURLs(
sc: StoredCrawl,
urls: string[],
): Promise<boolean> {
if (urls.length === 0) return true;
urls = urls.map((url) => normalizeURL(url, sc));
const logger = _logger.child({
crawlId: id,

View File

@ -0,0 +1,16 @@
export function buildRefrasedPrompt(prompt: string, url: string): string {
return `You are a search query optimizer. Your task is to rephrase the following prompt into an effective search query that will find relevant results about this topic on ${url}.
Original prompt: "${prompt}"
Provide a rephrased search query that:
1. Maintains the core intent of the original prompt with ONLY the keywords
2. Uses relevant keywords
3. Is optimized for search engine results
4. Is concise and focused
5. Short is better than long
6. It is a search engine, not a chatbot
7. Concise
Return only the rephrased search query, without any explanation or additional text.`;
}
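
Paired with `generateBasicCompletion` earlier in this commit, the extract controller uses this template to turn the user's prompt into a search query before mapping. A condensed sketch of that flow (import paths are illustrative):

```typescript
import { generateBasicCompletion } from "../LLM-extraction"; // illustrative path
import { buildRefrasedPrompt } from "./build-prompts"; // illustrative path

// Rephrase the extraction prompt into a search-engine-friendly query,
// falling back to the original prompt if the completion returns nothing.
async function rephraseForSearch(prompt: string, baseUrl: string): Promise<string> {
  const rephrased = await generateBasicCompletion(
    buildRefrasedPrompt(prompt, baseUrl),
  );
  return rephrased ?? prompt;
}
```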

View File

@ -0,0 +1,2 @@
export const BLOCKLISTED_URL_MESSAGE =
"This website is no longer supported, please reach out to help@firecrawl.com for more info on how to activate it on your account.";

View File

@ -7,7 +7,7 @@ import {
import { billTeam } from "../services/billing/credit_billing";
import { Document } from "../controllers/v1/types";
import { supabase_service } from "../services/supabase";
import { logger } from "../lib/logger";
import { logger as _logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
import { configDotenv } from "dotenv";
import {
@ -49,6 +49,7 @@ export async function startWebScraperPipeline({
bull_job_id: job.id.toString(),
priority: job.opts.priority,
is_scrape: job.data.is_scrape ?? false,
is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null),
});
}
@ -63,56 +64,126 @@ export async function runWebScraper({
bull_job_id,
priority,
is_scrape = false,
is_crawl = false,
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
const logger = _logger.child({
method: "runWebScraper",
module: "runWebscraper",
scrapeId: bull_job_id,
jobId: bull_job_id,
});
const tries = is_crawl ? 3 : 1;
let response: ScrapeUrlResponse | undefined = undefined;
let engines: EngineResultsTracker = {};
try {
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
priority,
...internalOptions,
});
if (!response.success) {
if (response.error instanceof Error) {
throw response.error;
} else {
throw new Error(
"scrapeURL error: " +
(Array.isArray(response.error)
? JSON.stringify(response.error)
: typeof response.error === "object"
? JSON.stringify({ ...response.error })
: response.error),
);
}
let error: any = undefined;
for (let i = 0; i < tries; i++) {
if (i > 0) {
logger.debug("Retrying scrape...", {
tries,
i,
previousStatusCode: (response as any)?.document?.metadata?.statusCode,
previousError: error,
});
}
response = undefined;
engines = {};
error = undefined;
try {
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
priority,
...internalOptions,
});
if (!response.success) {
if (response.error instanceof Error) {
throw response.error;
} else {
throw new Error(
"scrapeURL error: " +
(Array.isArray(response.error)
? JSON.stringify(response.error)
: typeof response.error === "object"
? JSON.stringify({ ...response.error })
: response.error),
);
}
}
// This is where the returnvalue from the job is set
// onSuccess(response.document, mode);
engines = response.engines;
if (
(response.document.metadata.statusCode >= 200 &&
response.document.metadata.statusCode < 300) ||
response.document.metadata.statusCode === 304
) {
// status code is good -- do not attempt retry
break;
}
} catch (_error) {
error = _error;
engines =
response !== undefined
? response.engines
: typeof error === "object" && error !== null
? ((error as any).results ?? {})
: {};
}
}
const engineOrder = Object.entries(engines)
.sort((a, b) => a[1].startedAt - b[1].startedAt)
.map((x) => x[0]) as Engine[];
for (const engine of engineOrder) {
const result = engines[engine] as Exclude<
EngineResultsTracker[Engine],
undefined
>;
ScrapeEvents.insert(bull_job_id, {
type: "scrape",
url,
method: engine,
result: {
success: result.state === "success",
response_code:
result.state === "success" ? result.result.statusCode : undefined,
response_size:
result.state === "success" ? result.result.html.length : undefined,
error:
result.state === "error"
? result.error
: result.state === "timeout"
? "Timed out"
: undefined,
time_taken: result.finishedAt - result.startedAt,
},
});
}
if (error === undefined && response?.success) {
if (is_scrape === false) {
let creditsToBeBilled = 1; // Assuming 1 credit per document
if (scrapeOptions.extract) {
creditsToBeBilled = 5;
}
billTeam(team_id, undefined, creditsToBeBilled).catch((error) => {
billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => {
logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
{ error },
);
// Optionally, you could notify an admin or add to a retry queue here
});
}
// This is where the returnvalue from the job is set
// onSuccess(response.document, mode);
engines = response.engines;
return response;
} catch (error) {
engines =
response !== undefined
? response.engines
: typeof error === "object" && error !== null
? ((error as any).results ?? {})
: {};
} else {
if (response !== undefined) {
return {
...response,
@ -127,37 +198,6 @@ export async function runWebScraper({
engines,
};
}
// onError(error);
} finally {
const engineOrder = Object.entries(engines)
.sort((a, b) => a[1].startedAt - b[1].startedAt)
.map((x) => x[0]) as Engine[];
for (const engine of engineOrder) {
const result = engines[engine] as Exclude<
EngineResultsTracker[Engine],
undefined
>;
ScrapeEvents.insert(bull_job_id, {
type: "scrape",
url,
method: engine,
result: {
success: result.state === "success",
response_code:
result.state === "success" ? result.result.statusCode : undefined,
response_size:
result.state === "success" ? result.result.html.length : undefined,
error:
result.state === "error"
? result.error
: result.state === "timeout"
? "Timed out"
: undefined,
time_taken: result.finishedAt - result.startedAt,
},
});
}
}
}
@ -195,6 +235,11 @@ const saveJob = async (
}
ScrapeEvents.logJobEvent(job, "completed");
} catch (error) {
logger.error(`🐂 Failed to update job status: ${error}`);
_logger.error(`🐂 Failed to update job status`, {
module: "runWebScraper",
method: "saveJob",
jobId: job.id,
scrapeId: job.id,
});
}
};

View File

@ -8,6 +8,7 @@ import {
} from "../controllers/v0/admin/queue";
import { wrap } from "./v1";
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
import { checkFireEngine } from "../controllers/v0/admin/check-fire-engine";
export const adminRouter = express.Router();
@ -37,3 +38,8 @@ adminRouter.post(
`/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
wrap(acucCacheClearController),
);
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/feng-check`,
wrap(checkFireEngine),
);
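
A quick way to exercise the new fire-engine health check, assuming the API runs locally on port 3002 and `BULL_AUTH_KEY` is exported (both assumptions):

```typescript
// Calls the feng-check route registered above and prints the result.
async function checkFireEngineHealth(): Promise<void> {
  const res = await fetch(
    `http://localhost:3002/admin/${process.env.BULL_AUTH_KEY}/feng-check`,
  );
  const body = await res.json();
  // 200 with { data } when fire-engine answers; 500/504 on config or timeout errors.
  console.log(res.status, body);
}

checkFireEngineHealth().catch(console.error);
```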

View File

@ -31,6 +31,8 @@ import { extractController } from "../controllers/v1/extract";
// import { keyAuthController } from "../../src/controllers/v1/keyAuth";
// import { livenessController } from "../controllers/v1/liveness";
// import { readinessController } from "../controllers/v1/readiness";
import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
function checkCreditsMiddleware(
minimum?: number,
@ -122,8 +124,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
if (!res.headersSent) {
return res.status(403).json({
success: false,
error:
"URL is blocked intentionally. Firecrawl currently does not support scraping this site due to policy restrictions.",
error: BLOCKLISTED_URL_MESSAGE,
});
}
}
@ -224,3 +225,9 @@ v1Router.delete(
// Health/Probe routes
// v1Router.get("/health/liveness", livenessController);
// v1Router.get("/health/readiness", readinessController);
v1Router.get(
"/team/credit-usage",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(creditUsageController),
);
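
And a minimal client sketch for the new credit-usage route, again assuming a local server and Bearer-token auth:

```typescript
// Returns the team's remaining credits via the new /v1/team/credit-usage route.
async function getRemainingCredits(apiKey: string): Promise<number> {
  const res = await fetch("http://localhost:3002/v1/team/credit-usage", {
    headers: { Authorization: `Bearer ${apiKey}` },
  });
  if (!res.ok) {
    throw new Error(`credit-usage request failed with status ${res.status}`);
  }
  // creditUsageController responds with { success: true, data: { remaining_credits } }.
  const body = await res.json();
  return body.data.remaining_credits;
}
```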

View File

@ -210,7 +210,7 @@ export class WebCrawler {
}
if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks(
sitemapLinks,
[...new Set(sitemapLinks)],
this.limit,
this.maxCrawledDepth,
fromMap,

View File

@ -6,6 +6,15 @@ configDotenv();
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
const algorithm = "aes-256-ecb";
function encryptAES(plaintext: string, key: Buffer): string {
const cipher = crypto.createCipheriv(algorithm, key, null);
const encrypted = Buffer.concat([
cipher.update(plaintext, "utf-8"),
cipher.final(),
]);
return encrypted.toString("base64");
}
function decryptAES(ciphertext: string, key: Buffer): string {
const decipher = crypto.createDecipheriv(algorithm, key, null);
const decrypted = Buffer.concat([
@ -42,9 +51,27 @@ const urlBlocklist = [
"PTbGg8PK/h0Seyw4HEpK4Q==",
"lZdQMknjHb7+4+sjF3qNTw==",
"LsgSq54q5oDysbva29JxnQ==",
"KZfBtpwjOpdSoqacRbz7og==",
"Indtl4yxJMHCKBGF4KABCQ==",
"e3HFXLVgxhaVoadYpwb2BA==",
"b+asgLayXQ5Jq+se+q56jA==",
"86ZDUI7vmp4MvNq3fvZrGQ==",
"sEGFoYZ6GEg4Zocd+TiyfQ==",
"6OOL72eXthgnJ1Hj4PfOQQ==",
"g/ME+Sh1CAFboKrwkVb+5Q==",
"Pw+xawUoX8xBYbX2yqqGWQ==",
"k6vBalxYFhAvkPsF19t9gQ==",
"e3HFXLVgxhaVoadYpwb2BA==",
"b+asgLayXQ5Jq+se+q56jA==",
"KKttwRz4w+AMJrZcB828WQ==",
"vMdzZ33BXoyWVZnAPOBcrg==",
"l8GDVI8w/ueHnNzdN1ODuQ==",
];
const decryptedBlocklist = hashKey.length > 0 ? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey)) : [];
const decryptedBlocklist =
hashKey.length > 0
? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey))
: [];
const allowedKeywords = [
"pulse",

View File

@ -5,8 +5,9 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler";
export async function scrapeURLWithFetch(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const timeout = 20000;
const timeout = timeToRun ?? 300000;
const response = await Promise.race([
fetch(meta.url, {

View File

@ -3,7 +3,7 @@ import * as Sentry from "@sentry/node";
import { z } from "zod";
import { robustFetch } from "../../lib/fetch";
import { EngineError, SiteError } from "../../error";
import { ActionError, EngineError, SiteError } from "../../error";
const successSchema = z.object({
jobId: z.string(),
@ -111,6 +111,12 @@ export async function fireEngineCheckStatus(
status.error.includes("Chrome error: ")
) {
throw new SiteError(status.error.split("Chrome error: ")[1]);
} else if (
typeof status.error === "string" &&
// TODO: improve this later
status.error.includes("Element")
) {
throw new ActionError(status.error.split("Error: ")[1]);
} else {
throw new EngineError("Scrape job failed", {
cause: {

View File

@ -13,13 +13,11 @@ import {
FireEngineCheckStatusSuccess,
StillProcessingError,
} from "./checkStatus";
import { EngineError, SiteError, TimeoutError } from "../../error";
import { ActionError, EngineError, SiteError, TimeoutError } from "../../error";
import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
export const defaultTimeout = 10000;
// This function does not take `Meta` on purpose. It may not access any
// meta values to construct the request -- that must be done by the
// `scrapeURLWithFireEngine*` functions.
@ -31,7 +29,7 @@ async function performFireEngineScrape<
>(
logger: Logger,
request: FireEngineScrapeRequestCommon & Engine,
timeout = defaultTimeout,
timeout: number,
): Promise<FireEngineCheckStatusSuccess> {
const scrape = await fireEngineScrape(
logger.child({ method: "fireEngineScrape" }),
@ -70,7 +68,11 @@ async function performFireEngineScrape<
} catch (error) {
if (error instanceof StillProcessingError) {
// nop
} else if (error instanceof EngineError || error instanceof SiteError) {
} else if (
error instanceof EngineError ||
error instanceof SiteError ||
error instanceof ActionError
) {
logger.debug("Fire-engine scrape job failed.", {
error,
jobId: scrape.jobId,
@ -94,6 +96,7 @@ async function performFireEngineScrape<
export async function scrapeURLWithFireEngineChromeCDP(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const actions: Action[] = [
// Transform waitFor option into an action (unsupported by chrome-cdp)
@ -121,6 +124,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
...(meta.options.actions ?? []),
];
const totalWait = actions.reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
0,
);
const timeout = (timeToRun ?? 300000) + totalWait;
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestChromeCDP = {
url: meta.url,
@ -134,25 +144,20 @@ export async function scrapeURLWithFireEngineChromeCDP(
}
: {}),
priority: meta.internalOptions.priority,
geolocation: meta.options.geolocation,
geolocation: meta.options.geolocation ?? meta.options.location,
mobile: meta.options.mobile,
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
timeout, // TODO: better timeout logic
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
// TODO: scrollXPaths
};
const totalWait = actions.reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
0,
);
let response = await performFireEngineScrape(
meta.logger.child({
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
request,
}),
request,
meta.options.timeout !== undefined ? defaultTimeout + totalWait : Infinity, // TODO: better timeout handling
timeout,
);
specialtyScrapeCheck(
@ -206,7 +211,11 @@ export async function scrapeURLWithFireEngineChromeCDP(
export async function scrapeURLWithFireEnginePlaywright(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const totalWait = meta.options.waitFor;
const timeout = (timeToRun ?? 300000) + totalWait;
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestPlaywright = {
url: meta.url,
@ -218,9 +227,9 @@ export async function scrapeURLWithFireEnginePlaywright(
screenshot: meta.options.formats.includes("screenshot"),
fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
wait: meta.options.waitFor,
geolocation: meta.options.geolocation,
geolocation: meta.options.geolocation ?? meta.options.location,
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
timeout,
};
let response = await performFireEngineScrape(
@ -229,9 +238,7 @@ export async function scrapeURLWithFireEnginePlaywright(
request,
}),
request,
meta.options.timeout !== undefined
? defaultTimeout + meta.options.waitFor
: Infinity, // TODO: better timeout handling
timeout,
);
specialtyScrapeCheck(
@ -265,7 +272,10 @@ export async function scrapeURLWithFireEnginePlaywright(
export async function scrapeURLWithFireEngineTLSClient(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const timeout = timeToRun ?? 30000;
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestTLSClient = {
url: meta.url,
@ -276,10 +286,10 @@ export async function scrapeURLWithFireEngineTLSClient(
priority: meta.internalOptions.priority,
atsv: meta.internalOptions.atsv,
geolocation: meta.options.geolocation,
geolocation: meta.options.geolocation ?? meta.options.location,
disableJsDom: meta.internalOptions.v0DisableJsDom,
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
timeout,
};
let response = await performFireEngineScrape(
@ -288,7 +298,7 @@ export async function scrapeURLWithFireEngineTLSClient(
request,
}),
request,
meta.options.timeout !== undefined ? defaultTimeout : Infinity, // TODO: better timeout handling
timeout,
);
specialtyScrapeCheck(

View File

@ -105,7 +105,10 @@ export type EngineScrapeResult = {
};
const engineHandlers: {
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>;
[E in Engine]: (
meta: Meta,
timeToRun: number | undefined,
) => Promise<EngineScrapeResult>;
} = {
cache: scrapeCache,
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
@ -372,6 +375,7 @@ export function buildFallbackList(meta: Meta): {
export async function scrapeURLWithEngine(
meta: Meta,
engine: Engine,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const fn = engineHandlers[engine];
const logger = meta.logger.child({
@ -383,5 +387,5 @@ export async function scrapeURLWithEngine(
logger,
};
return await fn(_meta);
return await fn(_meta, timeToRun);
}

View File

@ -15,6 +15,7 @@ type PDFProcessorResult = { html: string; markdown?: string };
async function scrapePDFWithLlamaParse(
meta: Meta,
tempFilePath: string,
timeToRun: number | undefined,
): Promise<PDFProcessorResult> {
meta.logger.debug("Processing PDF document with LlamaIndex", {
tempFilePath,
@ -63,8 +64,9 @@ async function scrapePDFWithLlamaParse(
// TODO: timeout, retries
const startedAt = Date.now();
const timeout = timeToRun ?? 300000;
while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) {
while (Date.now() <= startedAt + timeout) {
try {
const result = await robustFetch({
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
@ -122,7 +124,10 @@ async function scrapePDFWithParsePDF(
};
}
export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
export async function scrapePDF(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
if (!meta.options.parsePDF) {
const file = await fetchFileToBuffer(meta.url);
const content = file.buffer.toString("base64");
@ -138,9 +143,26 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
const { response, tempFilePath } = await downloadFile(meta.id, meta.url);
let result: PDFProcessorResult | null = null;
if (process.env.LLAMAPARSE_API_KEY) {
// First, try parsing with PdfParse
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
// If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
if (
result.markdown &&
result.markdown.length < 500 &&
process.env.LLAMAPARSE_API_KEY
) {
try {
result = await scrapePDFWithLlamaParse(
const llamaResult = await scrapePDFWithLlamaParse(
{
...meta,
logger: meta.logger.child({
@ -148,17 +170,19 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
}),
},
tempFilePath,
timeToRun,
);
result = llamaResult; // Use LlamaParse result if successful
} catch (error) {
if (error instanceof Error && error.message === "LlamaParse timed out") {
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
meta.logger.warn("LlamaParse timed out -- using parse-pdf result", {
error,
});
} else if (error instanceof RemoveFeatureError) {
throw error;
} else {
meta.logger.warn(
"LlamaParse failed to parse PDF -- falling back to parse-pdf",
"LlamaParse failed to parse PDF -- using parse-pdf result",
{ error },
);
Sentry.captureException(error);
@ -166,18 +190,6 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
}
}
if (result === null) {
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
}
await fs.unlink(tempFilePath);
return {

View File

@ -6,8 +6,9 @@ import { robustFetch } from "../../lib/fetch";
export async function scrapeURLWithPlaywright(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const timeout = 20000 + meta.options.waitFor;
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
const response = await Promise.race([
await robustFetch({
@ -30,7 +31,7 @@ export async function scrapeURLWithPlaywright(
}),
}),
(async () => {
await new Promise((resolve) => setTimeout(() => resolve(null), 20000));
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
throw new TimeoutError(
"Playwright was unable to scrape the page before timing out",
{ cause: { timeout } },

View File

@ -9,16 +9,20 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
export function scrapeURLWithScrapingBee(
wait_browser: "domcontentloaded" | "networkidle2",
): (meta: Meta) => Promise<EngineScrapeResult> {
return async (meta: Meta): Promise<EngineScrapeResult> => {
): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> {
return async (
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> => {
let response: AxiosResponse<any>;
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
try {
response = await client.get({
url: meta.url,
params: {
timeout: 15000, // TODO: dynamic timeout based on request timeout
timeout,
wait_browser: wait_browser,
wait: Math.min(meta.options.waitFor, 35000),
wait: meta.options.waitFor,
transparent_status_code: true,
json_response: true,
screenshot: meta.options.formats.includes("screenshot"),

View File

@ -56,3 +56,11 @@ export class SiteError extends Error {
this.code = code;
}
}
export class ActionError extends Error {
public code: string;
constructor(code: string) {
super("Action(s) failed to complete. Error code: " + code);
this.code = code;
}
}

View File

@ -12,6 +12,7 @@ import {
} from "./engines";
import { parseMarkdown } from "../../lib/html-to-markdown";
import {
ActionError,
AddFeatureError,
EngineError,
NoEnginesLeftError,
@ -86,7 +87,7 @@ function buildFeatureFlags(
flags.add("skipTlsVerification");
}
if (internalOptions.v0UseFastMode) {
if (options.fastMode) {
flags.add("useFastMode");
}
@ -148,7 +149,6 @@ export type InternalOptions = {
atsv?: boolean; // anti-bot solver, beta
v0CrawlOnlyUrls?: boolean;
v0UseFastMode?: boolean;
v0DisableJsDom?: boolean;
disableSmartWaitCache?: boolean; // Passed along to fire-engine
@ -203,11 +203,16 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
const results: EngineResultsTracker = {};
let result: EngineScrapeResultWithContext | null = null;
const timeToRun =
meta.options.timeout !== undefined
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
: undefined;
for (const { engine, unsupportedFeatures } of fallbackList) {
const startedAt = Date.now();
try {
meta.logger.info("Scraping via " + engine + "...");
const _engineResult = await scrapeURLWithEngine(meta, engine);
const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun);
if (_engineResult.markdown === undefined) {
// Some engines emit Markdown directly.
_engineResult.markdown = await parseMarkdown(_engineResult.html);
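
The new `timeToRun` value splits the request timeout across at most two engine attempts, instead of hard-coding per-engine defaults. A small worked sketch of that arithmetic (values are illustrative):

```typescript
// timeToRun = round(timeout / min(fallbackList.length, 2)), or undefined when no timeout is set.
function perEngineBudget(
  timeout: number | undefined,
  fallbackCount: number,
): number | undefined {
  return timeout !== undefined
    ? Math.round(timeout / Math.min(fallbackCount, 2))
    : undefined;
}

console.log(perEngineBudget(60000, 3)); // 30000 -- the budget is split across at most 2 attempts
console.log(perEngineBudget(60000, 1)); // 60000 -- a single engine gets the full timeout
console.log(perEngineBudget(undefined, 3)); // undefined -- engines fall back to their own defaults (e.g. 300000)
```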
@ -285,6 +290,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
throw error;
} else if (error instanceof SiteError) {
throw error;
} else if (error instanceof ActionError) {
throw error;
} else {
Sentry.captureException(error);
meta.logger.info(
@ -405,6 +412,8 @@ export async function scrapeURL(
// TODO: results?
} else if (error instanceof SiteError) {
meta.logger.warn("scrapeURL: Site failed to load in browser", { error });
} else if (error instanceof ActionError) {
meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
} else {
Sentry.captureException(error);
meta.logger.error("scrapeURL: Unexpected error happened", { error });

View File

@ -5,7 +5,7 @@ import { Meta } from "..";
export function extractMetadata(
meta: Meta,
html: string,
): Document["metadata"] {
): Partial<Document["metadata"]> {
let title: string | undefined = undefined;
let description: string | undefined = undefined;
let language: string | undefined = undefined;
@ -40,7 +40,7 @@ export function extractMetadata(
const soup = load(html);
try {
title = soup("title").text() || undefined;
title = soup("title").first().text().trim() || undefined;
description = soup('meta[name="description"]').attr("content") || undefined;
// Assuming the language is part of the URL as per the regex pattern

View File

@ -159,8 +159,8 @@ export async function generateOpenAICompletions(
role: "user",
content:
options.prompt !== undefined
? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}`
: "Transform the above content into structured JSON output.",
? `Transform the above content into structured JSON output based on the provided schema if any and the following user request: ${options.prompt}. If schema is provided, strictly follow it.`
: "Transform the above content into structured JSON output based on the provided schema if any.",
},
],
response_format: options.schema

View File

@ -10,6 +10,7 @@ import { issueCredits } from "./issue_credits";
import { redlock } from "../redlock";
import { autoCharge } from "./auto_charge";
import { getValue, setValue } from "../redis";
import type { Logger } from "winston";
const FREE_CREDITS = 500;
@ -20,22 +21,33 @@ export async function billTeam(
team_id: string,
subscription_id: string | null | undefined,
credits: number,
logger?: Logger,
) {
return withAuth(supaBillTeam, { success: true, message: "No DB, bypassed." })(
team_id,
subscription_id,
credits,
logger,
);
}
export async function supaBillTeam(
team_id: string,
subscription_id: string | null | undefined,
credits: number,
__logger?: Logger,
) {
const _logger = (__logger ?? logger).child({
module: "credit_billing",
method: "supaBillTeam",
});
if (team_id === "preview") {
return { success: true, message: "Preview team, no credits used" };
}
logger.info(`Billing team ${team_id} for ${credits} credits`);
_logger.info(`Billing team ${team_id} for ${credits} credits`, {
team_id,
credits,
});
const { data, error } = await supabase_service.rpc("bill_team", {
_team_id: team_id,
@ -46,7 +58,7 @@ export async function supaBillTeam(
if (error) {
Sentry.captureException(error);
logger.error("Failed to bill team: " + JSON.stringify(error));
_logger.error("Failed to bill team.", { error });
return;
}

View File

@ -9,6 +9,7 @@ export async function issueCredits(team_id: string, credits: number) {
status: "active",
// indicates that this coupon was issued from auto recharge
from_auto_recharge: true,
initial_credits: credits,
});
if (error) {

View File

@ -11,11 +11,50 @@ import {
pushConcurrencyLimitedJob,
} from "../lib/concurrency-limit";
async function _addScrapeJobToConcurrencyQueue(
webScraperOptions: any,
options: any,
jobId: string,
jobPriority: number,
) {
await pushConcurrencyLimitedJob(webScraperOptions.team_id, {
id: jobId,
data: webScraperOptions,
opts: {
...options,
priority: jobPriority,
jobId: jobId,
},
priority: jobPriority,
});
}
async function _addScrapeJobToBullMQ(
webScraperOptions: any,
options: any,
jobId: string,
jobPriority: number,
) {
if (
webScraperOptions &&
webScraperOptions.team_id &&
webScraperOptions.plan
) {
await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId);
}
await getScrapeQueue().add(jobId, webScraperOptions, {
...options,
priority: jobPriority,
jobId,
});
}
async function addScrapeJobRaw(
webScraperOptions: any,
options: any,
jobId: string,
jobPriority: number = 10,
jobPriority: number,
) {
let concurrencyLimited = false;
@ -33,30 +72,14 @@ async function addScrapeJobRaw(
}
if (concurrencyLimited) {
await _addScrapeJobToConcurrencyQueue(
webScraperOptions,
options,
jobId,
jobPriority,
);
} else {
await _addScrapeJobToBullMQ(webScraperOptions, options, jobId, jobPriority);
}
}
@ -108,11 +131,88 @@ export async function addScrapeJobs(
};
}[],
) {
// TODO: better
if (jobs.length === 0) return true;
let countCanBeDirectlyAdded = Infinity;
if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) {
const now = Date.now();
const limit = await getConcurrencyLimitMax(jobs[0].data.plan);
console.log("CC limit", limit);
cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now);
countCanBeDirectlyAdded = Math.max(
limit -
(await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length,
0,
);
}
const addToBull = jobs.slice(0, countCanBeDirectlyAdded);
const addToCQ = jobs.slice(countCanBeDirectlyAdded);
await Promise.all(
jobs.map((job) =>
addScrapeJob(job.data, job.opts, job.opts.jobId, job.opts.priority),
),
addToBull.map(async (job) => {
const size = JSON.stringify(job.data).length;
return await Sentry.startSpan(
{
name: "Add scrape job",
op: "queue.publish",
attributes: {
"messaging.message.id": job.opts.jobId,
"messaging.destination.name": getScrapeQueue().name,
"messaging.message.body.size": size,
},
},
async (span) => {
await _addScrapeJobToBullMQ(
{
...job.data,
sentry: {
trace: Sentry.spanToTraceHeader(span),
baggage: Sentry.spanToBaggageHeader(span),
size,
},
},
job.opts,
job.opts.jobId,
job.opts.priority,
);
},
);
}),
);
await Promise.all(
addToCQ.map(async (job) => {
const size = JSON.stringify(job.data).length;
return await Sentry.startSpan(
{
name: "Add scrape job",
op: "queue.publish",
attributes: {
"messaging.message.id": job.opts.jobId,
"messaging.destination.name": getScrapeQueue().name,
"messaging.message.body.size": size,
},
},
async (span) => {
await _addScrapeJobToConcurrencyQueue(
{
...job.data,
sentry: {
trace: Sentry.spanToTraceHeader(span),
baggage: Sentry.spanToBaggageHeader(span),
size,
},
},
job.opts,
job.opts.jobId,
job.opts.priority,
);
},
);
}),
);
}
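
The refactor above extracts `_addScrapeJobToConcurrencyQueue` and `_addScrapeJobToBullMQ`, and `addScrapeJobs` now sends only as many jobs straight to BullMQ as the team's remaining concurrency capacity allows, queueing the rest. A simplified, self-contained sketch of that capacity split (the types and helper name here are illustrative, not the project's):

```typescript
type QueuedJob = { id: string };

// Jobs within the remaining concurrency capacity go straight to BullMQ;
// the overflow waits in the team's concurrency-limited queue.
function splitByCapacity(
  jobs: QueuedJob[],
  concurrencyLimit: number,
  activeJobs: number,
): { addToBull: QueuedJob[]; addToCQ: QueuedJob[] } {
  const countCanBeDirectlyAdded = Math.max(concurrencyLimit - activeJobs, 0);
  return {
    addToBull: jobs.slice(0, countCanBeDirectlyAdded),
    addToCQ: jobs.slice(countCanBeDirectlyAdded),
  };
}

// Example: a limit of 10 with 7 active jobs lets 3 of 5 new jobs go directly to BullMQ.
const { addToBull, addToCQ } = splitByCapacity(
  [{ id: "a" }, { id: "b" }, { id: "c" }, { id: "d" }, { id: "e" }],
  10,
  7,
);
console.log(addToBull.length, addToCQ.length); // 3 2
```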

View File

@ -386,27 +386,27 @@ async function processJob(job: Job & { id: string }, token: string) {
jobId: job.id,
scrapeId: job.id,
crawlId: job.data?.crawl_id ?? undefined,
teamId: job.data?.team_id ?? undefined,
});
logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url });
// Check if the job URL is researchhub and block it immediately
// TODO: remove this once we solve the root issue
if (
job.data.url &&
(job.data.url.includes("researchhub.com") ||
job.data.url.includes("ebay.com") ||
job.data.url.includes("youtube.com"))
) {
logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`);
const data = {
success: false,
document: null,
project_id: job.data.project_id,
error:
"URL is blocked. Suspicious activity detected. Please contact help@firecrawl.com if you believe this is an error.",
};
return data;
}
// if (
// job.data.url &&
// (job.data.url.includes("researchhub.com") ||
// job.data.url.includes("ebay.com"))
// ) {
// logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`);
// const data = {
// success: false,
// document: null,
// project_id: job.data.project_id,
// error:
// "URL is blocked. Suspicious activity detected. Please contact help@firecrawl.com if you believe this is an error.",
// };
// return data;
// }
try {
job.updateProgress({
@ -482,32 +482,28 @@ async function processJob(job: Job & { id: string }, token: string) {
normalizeURL(doc.metadata.url, sc) !==
normalizeURL(doc.metadata.sourceURL, sc)
) {
logger.debug(
"Was redirected, removing old URL and locking new URL...",
{ oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
);
// Remove the old URL from visited unique due to checking for limit
// Do not remove from :visited otherwise it will keep crawling the original URL (sourceURL)
await redisConnection.srem(
"crawl:" + job.data.crawl_id + ":visited_unique",
normalizeURL(doc.metadata.sourceURL, sc),
);
const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc));
const p2 = generateURLPermutations(
normalizeURL(doc.metadata.sourceURL, sc),
);
// In crawls, we should only crawl a redirected page once, no matter how many times it is redirected to, or if it's been discovered by the crawler before.
// This can prevent flakiness with race conditions.
// Lock the new URL
const lockRes = await lockURL(job.data.crawl_id, sc, doc.metadata.url);
if (
job.data.crawlerOptions !== null &&
!lockRes &&
JSON.stringify(p1) !== JSON.stringify(p2)
) {
throw new RacedRedirectError();
if (JSON.stringify(p1) !== JSON.stringify(p2)) {
logger.debug(
"Was redirected, removing old URL and locking new URL...",
{ oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
);
// Prevent redirect target from being visited in the crawl again
// See lockURL
const x = await redisConnection.sadd(
"crawl:" + job.data.crawl_id + ":visited",
...p1.map((x) => x.href),
);
const lockRes = x === p1.length;
if (job.data.crawlerOptions !== null && !lockRes) {
throw new RacedRedirectError();
}
}
}
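
The redirect handling above now relies on the return value of `SADD`: every permutation of the redirect target is added to the crawl's `:visited` set, and a count lower than the number of permutations means some permutation was already visited, so the lock is not acquired and the worker throws `RacedRedirectError`. A simplified sketch of that idea (the key layout follows the hunk above; the standalone helper is illustrative):

```typescript
import Redis from "ioredis";

const redis = new Redis(process.env.REDIS_URL ?? "redis://localhost:6379");

// SADD returns how many members were newly inserted. If any permutation of the
// redirect target was already in the visited set, the count comes back short and
// the lock is considered not acquired.
async function lockRedirectTarget(
  crawlId: string,
  permutations: URL[],
): Promise<boolean> {
  const added = await redis.sadd(
    "crawl:" + crawlId + ":visited",
    ...permutations.map((u) => u.href),
  );
  return added === permutations.length;
}
```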
@ -679,6 +675,10 @@ async function processJob(job: Job & { id: string }, token: string) {
logger.debug("Declaring job as done...");
await addCrawlJobDone(job.data.crawl_id, job.id, false);
await redisConnection.srem(
"crawl:" + job.data.crawl_id + ":visited_unique",
normalizeURL(job.data.url, sc),
);
logger.debug("Logging job to DB...");
await logJob(

View File

@ -80,8 +80,8 @@ const RATE_LIMITS = {
default: 100,
},
crawlStatus: {
free: 300,
default: 500,
free: 500,
default: 5000,
},
testSuite: {
free: 10000,

View File

@ -55,6 +55,7 @@ export interface RunWebScraperParams {
bull_job_id: string;
priority?: number;
is_scrape?: boolean;
is_crawl?: boolean;
}
export type RunWebScraperResult =

View File

@ -3,6 +3,7 @@
"rootDir": "./src",
"lib": ["ES2022", "DOM"],
// or higher
"target": "ES2022",
@ -18,7 +19,7 @@
"*": ["node_modules/*", "src/types/*"],
},
"inlineSources": true
"inlineSources": true,
},
"include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"]
}

View File

@ -1,7 +1,19 @@
const fs = require("fs");
const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
.split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));
// METHOD: Winston log file
// const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
// .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));
// METHOD: GCloud export
const logs = [
"downloaded-logs-20241213-225607.json",
"downloaded-logs-20241213-225654.json",
"downloaded-logs-20241213-225720.json",
"downloaded-logs-20241213-225758.json",
"downloaded-logs-20241213-225825.json",
"downloaded-logs-20241213-225843.json",
].flatMap(x => JSON.parse(fs.readFileSync(x, "utf8"))).map(x => x.jsonPayload);
const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))];

View File

@ -0,0 +1,14 @@
require("dotenv").config();
const Redis = require("ioredis");
const crawlId = process.argv[2];
const redisConnection = new Redis(process.env.REDIS_URL, {
maxRetriesPerRequest: null,
});
(async () => {
const res = await redisConnection.sscan("crawl:" + crawlId + ":visited_unique", 0, "COUNT", 999);
await require("fs/promises").writeFile(crawlId + "-visited.txt", res[1].map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
process.exit(0);
})();
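
The dump script above issues a single `SSCAN` with `COUNT 999`; Redis treats `COUNT` only as a hint, so very large sets may require the cursor loop below to be read completely. A sketch of the full loop, not a change to the script itself:

```typescript
import Redis from "ioredis";

const redis = new Redis(process.env.REDIS_URL ?? "redis://localhost:6379");

// Full SSCAN loop: keep scanning until the cursor comes back as "0".
async function readVisitedUnique(crawlId: string): Promise<string[]> {
  const members: string[] = [];
  let cursor = "0";
  do {
    const [next, batch] = await redis.sscan(
      "crawl:" + crawlId + ":visited_unique",
      cursor,
      "COUNT",
      999,
    );
    cursor = next;
    members.push(...batch);
  } while (cursor !== "0");
  return members;
}
```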

43
apps/api/utils/urldump.js Normal file
View File

@ -0,0 +1,43 @@
require("dotenv").config();
//const baseUrl = "https://api.firecrawl.dev";
const baseUrl = "http://localhost:3002";
const crawlId = process.argv[2];
(async () => {
let url = baseUrl + "/v1/crawl/" + crawlId;
let urls = [];
while (url) {
let res;
while (true) {
try {
res = (await (await fetch(url, {
headers: {
"Authorization": "Bearer " + process.env.TEST_API_KEY
}
})).json());
break;
} catch (e) {
console.error(e);
}
}
console.log(res.data.length);
if (res.data.length === 0) {
break;
}
urls.push(...res.data.map(x => x.metadata.url ?? x.metadata.sourceURL));
url = res.next;
if (url !== undefined) {
const o = new URL(url)
o.protocol = new URL(baseUrl).protocol;
url = o.href;
}
}
await require("fs/promises").writeFile(crawlId + "-urls.txt", urls.map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
})();

View File

@ -1,9 +1,9 @@
import { describe, test, expect, jest } from '@jest/globals';
import axios from 'axios';
import FirecrawlApp from '../index';
import { describe, expect, jest, test } from '@jest/globals';
import { readFile } from 'fs/promises';
import FirecrawlApp from '../index';
import axios from 'axios';
import { join } from 'path';
import { readFile } from 'fs/promises';
// Mock jest and set the type
jest.mock('axios');
@ -14,13 +14,22 @@ async function loadFixture(name: string): Promise<string> {
return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8')
}
const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";
describe('the firecrawl JS SDK', () => {
test('Should require an API key to instantiate FirecrawlApp', async () => {
const fn = () => {
new FirecrawlApp({ apiKey: undefined });
};
expect(fn).toThrow('No API key provided');
test('Should require an API key only for cloud service', async () => {
if (API_URL.includes('api.firecrawl.dev')) {
// Should throw for cloud service
expect(() => {
new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL });
}).toThrow('No API key provided');
} else {
// Should not throw for self-hosted
expect(() => {
new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL });
}).not.toThrow();
}
});
test('Should return scraped data from a /scrape API call', async () => {

View File

@ -9,15 +9,28 @@ const TEST_API_KEY = process.env.TEST_API_KEY;
const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";
describe('FirecrawlApp E2E Tests', () => {
test.concurrent('should throw error for no API key', async () => {
expect(() => {
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
}).toThrow("No API key provided");
test.concurrent('should throw error for no API key only for cloud service', async () => {
if (API_URL.includes('api.firecrawl.dev')) {
// Should throw for cloud service
expect(() => {
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
}).toThrow("No API key provided");
} else {
// Should not throw for self-hosted
expect(() => {
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
}).not.toThrow();
}
});
test.concurrent('should throw error for invalid API key on scrape', async () => {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
if (API_URL.includes('api.firecrawl.dev')) {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 404");
} else {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
}
});
test.concurrent('should throw error for blocklisted URL on scrape', async () => {
@ -155,14 +168,13 @@ describe('FirecrawlApp E2E Tests', () => {
}, 30000); // 30 seconds timeout
test.concurrent('should throw error for invalid API key on crawl', async () => {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
});
test.concurrent('should throw error for blocklisted URL on crawl', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const blocklistedUrl = "https://twitter.com/fake-test";
await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
if (API_URL.includes('api.firecrawl.dev')) {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404");
} else {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
}
});
test.concurrent('should return successful response for crawl and wait for completion', async () => {
@ -337,8 +349,13 @@ describe('FirecrawlApp E2E Tests', () => {
}, 60000); // 60 seconds timeout
test.concurrent('should throw error for invalid API key on map', async () => {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
if (API_URL.includes('api.firecrawl.dev')) {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404");
} else {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
}
});
test.concurrent('should throw error for blocklisted URL on map', async () => {
@ -355,8 +372,7 @@ describe('FirecrawlApp E2E Tests', () => {
}, 30000); // 30 seconds timeout
test.concurrent('should return successful response for valid map', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
expect(response).not.toBeNull();
expect(response.links?.length).toBeGreaterThan(0);

View File

@ -183,6 +183,7 @@ export interface BatchScrapeResponse {
url?: string;
success: true;
error?: string;
invalidURLs?: string[];
}
/**
@ -242,10 +243,11 @@ export interface MapResponse {
* Defines options for extracting information from URLs.
*/
export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
prompt: string;
prompt?: string;
schema?: LLMSchema;
systemPrompt?: string;
allowExternalLinks?: boolean;
includeSubdomains?: boolean;
}
/**
@ -288,17 +290,23 @@ export default class FirecrawlApp {
public apiKey: string;
public apiUrl: string;
private isCloudService(url: string): boolean {
return url.includes('api.firecrawl.dev');
}
/**
* Initializes a new instance of the FirecrawlApp class.
* @param config - Configuration options for the FirecrawlApp instance.
*/
constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
if (typeof apiKey !== "string") {
const baseUrl = apiUrl || "https://api.firecrawl.dev";
if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
throw new FirecrawlError("No API key provided", 401);
}
this.apiKey = apiKey;
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
this.apiKey = apiKey || '';
this.apiUrl = baseUrl;
}
/**
@ -576,9 +584,10 @@ export default class FirecrawlApp {
pollInterval: number = 2,
idempotencyKey?: string,
webhook?: CrawlParams["webhook"],
ignoreInvalidURLs?: boolean,
): Promise<BatchScrapeStatusResponse | ErrorResponse> {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { urls, ...params };
let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params };
if (jsonData?.extract?.schema) {
let schema = jsonData.extract.schema;
@ -621,10 +630,12 @@ export default class FirecrawlApp {
async asyncBatchScrapeUrls(
urls: string[],
params?: ScrapeParams,
idempotencyKey?: string
idempotencyKey?: string,
webhook?: CrawlParams["webhook"],
ignoreInvalidURLs?: boolean,
): Promise<BatchScrapeResponse | ErrorResponse> {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { urls, ...(params ?? {}) };
let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...(params ?? {}) };
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/batch/scrape`,
@ -657,8 +668,10 @@ export default class FirecrawlApp {
urls: string[],
params?: ScrapeParams,
idempotencyKey?: string,
webhook?: CrawlParams["webhook"],
ignoreInvalidURLs?: boolean,
) {
const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);
if (crawl.success && crawl.id) {
const id = crawl.id;
@ -932,9 +945,11 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
private ws: WebSocket;
public data: FirecrawlDocument<undefined>[];
public status: CrawlStatusResponse["status"];
public id: string;
constructor(id: string, app: FirecrawlApp) {
super();
this.id = id;
this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
this.status = "scraping";
this.data = [];
@ -965,6 +980,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
detail: {
status: this.status,
data: this.data,
id: this.id,
},
}));
} else if (msg.type === "error") {
@ -974,6 +990,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
status: this.status,
data: this.data,
error: msg.error,
id: this.id,
},
}));
} else if (msg.type === "catchup") {
@ -981,12 +998,18 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
this.data.push(...(msg.data.data ?? []));
for (const doc of this.data) {
this.dispatchTypedEvent("document", new CustomEvent("document", {
detail: doc,
detail: {
...doc,
id: this.id,
},
}));
}
} else if (msg.type === "document") {
this.dispatchTypedEvent("document", new CustomEvent("document", {
detail: msg.data,
detail: {
...msg.data,
id: this.id,
},
}));
}
}
@ -996,14 +1019,21 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
this.ws.close();
return;
}
const msg = JSON.parse(ev.data) as Message;
messageHandler(msg);
try {
const msg = JSON.parse(ev.data) as Message;
messageHandler(msg);
} catch (error) {
console.error("Error on message", error);
}
}).bind(this);
this.ws.onclose = ((ev: CloseEvent) => {
const msg = JSON.parse(ev.reason) as Message;
messageHandler(msg);
try {
const msg = JSON.parse(ev.reason) as Message;
messageHandler(msg);
} catch (error) {
console.error("Error on close", error);
}
}).bind(this);
this.ws.onerror = ((_: Event) => {
@ -1013,6 +1043,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
status: this.status,
data: this.data,
error: "WebSocket error",
id: this.id,
},
}));
}).bind(this);
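
Taken together, the SDK changes above mean the API key check only applies to the cloud service, and batch scraping can forward a webhook plus `ignoreInvalidURLs` to the API. A minimal usage sketch, assuming the published package import path, a self-hosted instance at http://localhost:3002, and the string form of the webhook parameter:

```typescript
import FirecrawlApp from "@mendable/firecrawl-js"; // import path assumed

(async () => {
  // Against a self-hosted instance the constructor no longer throws without an API key.
  const app = new FirecrawlApp({ apiKey: null, apiUrl: "http://localhost:3002" });

  // webhook and ignoreInvalidURLs ride along in the batch scrape request body.
  const res = await app.asyncBatchScrapeUrls(
    ["https://firecrawl.dev", "https://example.com"],
    undefined, // params
    undefined, // idempotencyKey
    "https://example.com/firecrawl-webhook", // webhook (string form assumed)
    true, // ignoreInvalidURLs
  );

  if (res.success) {
    console.log("Batch started:", res.url, "skipped:", res.invalidURLs);
  }
})();
```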

View File

@ -196,7 +196,7 @@ app.post('/scrape', async (req: Request, res: Response) => {
}
}
const pageError = pageStatusCode !== 200 ? getError(pageStatusCode) : false;
const pageError = pageStatusCode !== 200 ? getError(pageStatusCode) : undefined;
if (!pageError) {
console.log(`✅ Scrape successful!`);
@ -209,7 +209,7 @@ app.post('/scrape', async (req: Request, res: Response) => {
res.json({
content: pageContent,
pageStatusCode,
pageError
...(pageError && { pageError })
});
});
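
The playwright-service change above switches `pageError` from `false` to `undefined` and only includes the key in the JSON response when it is truthy, via a conditional spread. A tiny illustration of that idiom:

```typescript
const pageStatusCode: number = 200;
const pageError: string | undefined =
  pageStatusCode !== 200 ? "Some HTTP error" : undefined;

// Spreading `undefined` is a no-op, so pageError only appears in the body when set.
const body = {
  content: "<html>...</html>",
  pageStatusCode,
  ...(pageError && { pageError }),
};

console.log(Object.keys(body)); // ["content", "pageStatusCode"]
```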

View File

@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp # noqa
__version__ = "1.6.4"
__version__ = "1.6.8"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

View File

@ -29,12 +29,12 @@ def test_scrape_url_invalid_api_key():
invalid_app.scrape_url('https://firecrawl.dev')
assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
def test_blocklisted_url():
blocklisted_url = "https://facebook.com/fake-test"
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
with pytest.raises(Exception) as excinfo:
app.scrape_url(blocklisted_url)
assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
# def test_blocklisted_url():
# blocklisted_url = "https://facebook.com/fake-test"
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
# with pytest.raises(Exception) as excinfo:
# app.scrape_url(blocklisted_url)
# assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
def test_successful_response_with_valid_preview_token():
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0')
@ -90,12 +90,12 @@ def test_crawl_url_invalid_api_key():
invalid_app.crawl_url('https://firecrawl.dev')
assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
def test_should_return_error_for_blocklisted_url():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
blocklisted_url = "https://twitter.com/fake-test"
with pytest.raises(Exception) as excinfo:
app.crawl_url(blocklisted_url)
assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
# def test_should_return_error_for_blocklisted_url():
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
# blocklisted_url = "https://twitter.com/fake-test"
# with pytest.raises(Exception) as excinfo:
# app.crawl_url(blocklisted_url)
# assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
def test_crawl_url_wait_for_completion_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')

View File

@ -30,12 +30,12 @@ def test_scrape_url_invalid_api_key():
invalid_app.scrape_url('https://firecrawl.dev')
assert "Unauthorized: Invalid token" in str(excinfo.value)
def test_blocklisted_url():
blocklisted_url = "https://facebook.com/fake-test"
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
with pytest.raises(Exception) as excinfo:
app.scrape_url(blocklisted_url)
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
# def test_blocklisted_url():
# blocklisted_url = "https://facebook.com/fake-test"
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
# with pytest.raises(Exception) as excinfo:
# app.scrape_url(blocklisted_url)
# assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
def test_successful_response_with_valid_preview_token():
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
@ -136,12 +136,12 @@ def test_crawl_url_invalid_api_key():
invalid_app.crawl_url('https://firecrawl.dev')
assert "Unauthorized: Invalid token" in str(excinfo.value)
def test_should_return_error_for_blocklisted_url():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
blocklisted_url = "https://twitter.com/fake-test"
with pytest.raises(Exception) as excinfo:
app.crawl_url(blocklisted_url)
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
# def test_should_return_error_for_blocklisted_url():
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
# blocklisted_url = "https://twitter.com/fake-test"
# with pytest.raises(Exception) as excinfo:
# app.crawl_url(blocklisted_url)
# assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
def test_crawl_url_wait_for_completion_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@ -296,12 +296,12 @@ def test_invalid_api_key_on_map():
invalid_app.map_url('https://roastmywebsite.ai')
assert "Unauthorized: Invalid token" in str(excinfo.value)
def test_blocklisted_url_on_map():
app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
blocklisted_url = "https://facebook.com/fake-test"
with pytest.raises(Exception) as excinfo:
app.map_url(blocklisted_url)
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
# def test_blocklisted_url_on_map():
# app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
# blocklisted_url = "https://facebook.com/fake-test"
# with pytest.raises(Exception) as excinfo:
# app.map_url(blocklisted_url)
# assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
def test_successful_response_with_valid_preview_token_on_map():
app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL)

View File

@ -26,7 +26,7 @@ class FirecrawlApp:
"""
Parameters for the extract operation.
"""
prompt: str
prompt: Optional[str] = None
schema_: Optional[Any] = pydantic.Field(None, alias='schema')
system_prompt: Optional[str] = None
allow_external_links: Optional[bool] = False
@ -704,15 +704,15 @@ class CrawlWatcher:
async def _handle_message(self, msg: Dict[str, Any]):
if msg['type'] == 'done':
self.status = 'completed'
self.dispatch_event('done', {'status': self.status, 'data': self.data})
self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
elif msg['type'] == 'error':
self.status = 'failed'
self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error']})
self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
elif msg['type'] == 'catchup':
self.status = msg['data']['status']
self.data.extend(msg['data'].get('data', []))
for doc in self.data:
self.dispatch_event('document', doc)
self.dispatch_event('document', {'data': doc, 'id': self.id})
elif msg['type'] == 'document':
self.data.append(msg['data'])
self.dispatch_event('document', msg['data'])
self.dispatch_event('document', {'data': msg['data'], 'id': self.id})

View File

@ -5,20 +5,20 @@ use firecrawl::FirecrawlApp;
use serde_json::json;
use std::env;
#[tokio::test]
async fn test_blocklisted_url() {
dotenv().ok();
let api_url = env::var("API_URL").unwrap();
let api_key = env::var("TEST_API_KEY").ok();
let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
let blocklisted_url = "https://facebook.com/fake-test";
let result = app.scrape_url(blocklisted_url, None).await;
// #[tokio::test]
// async fn test_blocklisted_url() {
// dotenv().ok();
// let api_url = env::var("API_URL").unwrap();
// let api_key = env::var("TEST_API_KEY").ok();
// let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
// let blocklisted_url = "https://facebook.com/fake-test";
// let result = app.scrape_url(blocklisted_url, None).await;
assert_matches!(
result,
Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions")
);
}
// assert_matches!(
// result,
// Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions")
// );
// }
#[tokio::test]
async fn test_successful_response_with_valid_preview_token() {
@ -103,20 +103,21 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explici
.contains("We present spectrophotometric observations of the Broad Line Radio Galaxy"));
}
#[tokio::test]
async fn test_should_return_error_for_blocklisted_url() {
dotenv().ok();
let api_url = env::var("API_URL").unwrap();
let api_key = env::var("TEST_API_KEY").ok();
let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
let blocklisted_url = "https://twitter.com/fake-test";
let result = app.crawl_url(blocklisted_url, None).await;
assert_matches!(
result,
Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions.")
);
}
// #[tokio::test]
// async fn test_should_return_error_for_blocklisted_url() {
// dotenv().ok();
// let api_url = env::var("API_URL").unwrap();
// let api_key = env::var("TEST_API_KEY").ok();
// let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
// let blocklisted_url = "https://twitter.com/fake-test";
// let result = app.crawl_url(blocklisted_url, None).await;
// assert_matches!(
// result,
// Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions.")
// );
// }
#[tokio::test]
async fn test_llm_extraction() {

View File

@ -0,0 +1,147 @@
import os
import json
import requests
from dotenv import load_dotenv
from openai import OpenAI
from serpapi import GoogleSearch
# ANSI color codes
class Colors:
CYAN = '\033[96m'
YELLOW = '\033[93m'
GREEN = '\033[92m'
RED = '\033[91m'
MAGENTA = '\033[95m'
BLUE = '\033[94m'
RESET = '\033[0m'
# Load environment variables
load_dotenv()
# Initialize clients
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
def search_google(query):
"""Search Google using SerpAPI and return top results."""
print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}")
search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
return search.get_dict().get("organic_results", [])
def select_urls_with_o1(company, objective, serp_results):
"""
Use O1 to select the most relevant URLs from SERP results for the given company and objective.
Returns a JSON object with a "selected_urls" property that is an array of strings.
"""
try:
# Prepare the data for O1
serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")}
for r in serp_results if r.get("link")]
response = client.chat.completions.create(
model="o1-2024-12-17",
messages=[
{
"role": "developer",
"content": "You select URLs from the SERP results relevant to the company and objective."
},
{
"role": "user",
"content": (
f"Company: {company}\n"
f"Objective: {objective}\n"
f"SERP Results: {json.dumps(serp_data)}\n\n"
"Return a JSON object with a property 'selected_urls' that contains an array "
"of URLs most likely to help meet the objective. Add a /* to the end of the URL if you think it should search all of the pages in the site. Do not return any social media links. For example: {\"selected_urls\": [\"https://example.com\", \"https://example2.com\"]}"
)
}
],
response_format={
"type": "json_schema",
"json_schema": {
"name": "selected_urls_object",
"schema": {
"type": "object",
"properties": {
"selected_urls": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": ["selected_urls"],
"additionalProperties": False
}
}
}
)
# The response is guaranteed to follow the specified JSON schema
result = json.loads(response.choices[0].message.content)
urls = result.get("selected_urls", [])
return urls
except Exception as e:
print(f"{Colors.RED}Error selecting URLs with O1: {e}{Colors.RESET}")
return []
def extract_company_info(urls, prompt, company, api_key):
"""Use requests to call Firecrawl's extract endpoint with selected URLs."""
print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}")
headers = {
'Content-Type': 'application/json',
'Authorization': f'Bearer {api_key}'
}
payload = {
"urls": urls,
"prompt": prompt + " for " + company
}
try:
response = requests.post(
"https://api.firecrawl.dev/v1/extract",
headers=headers,
json=payload
)
response.raise_for_status()
data = response.json()
return data
except Exception as e:
print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}")
return None
def main():
company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}")
objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}")
serp_results = search_google(f"{company}")
if not serp_results:
print(f"{Colors.RED}No search results found.{Colors.RESET}")
return
# Ask O1 to select URLs
selected_urls = select_urls_with_o1(company, objective, serp_results)
if not selected_urls:
print(f"{Colors.RED}O1 did not return any URLs.{Colors.RESET}")
return
print(f"{Colors.CYAN}Selected URLs for extraction by O1:{Colors.RESET}")
for url in selected_urls:
print(f"- {url}")
data = extract_company_info(selected_urls, objective, company, firecrawl_api_key)
if data and data.get('success') and data.get('data'):
print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}")
print(json.dumps(data['data'], indent=2))
else:
print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}")
if __name__ == "__main__":
main()