Merge branch 'main' into fix-sdk/next-in-when-502
commit bcc18e1c07
@@ -111,6 +111,20 @@ curl -X POST http://localhost:3002/v1/crawl \
}'
```

### Alternative: Using Docker Compose

For a simpler setup, you can use Docker Compose to run all services:

1. Prerequisites: Make sure you have Docker and Docker Compose installed
2. Copy the `.env.example` file to `.env` in the `/apps/api/` directory and configure as needed
3. From the root directory, run:

```bash
docker compose up
```

This will start Redis, the API server, and workers automatically in the correct configuration.
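
As a quick sanity check (a minimal sketch, not part of the repo — it assumes the Compose stack exposes the API on `http://localhost:3002`, and that `TEST_API_KEY` is only needed if auth is enabled in your `.env`), you can hit the crawl endpoint from Node:

```ts
// verify-stack.ts — rough sketch; port, route, and auth header are assumptions.
async function verifyStack(): Promise<void> {
  const res = await fetch("http://localhost:3002/v1/crawl", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      ...(process.env.TEST_API_KEY
        ? { Authorization: `Bearer ${process.env.TEST_API_KEY}` }
        : {}),
    },
    body: JSON.stringify({ url: "https://example.com" }),
  });
  // Any HTTP response at all means the API container and Redis came up.
  console.log("API responded with status", res.status);
}

verifyStack().catch((err) => {
  console.error("Stack does not appear to be up:", err);
  process.exit(1);
});
```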

## Tests:

If you'd like to run the tests without authentication, the best way is to run them with `npm run test:local-no-auth`.
apps/api/.gitignore (vendored, 2 changes)

@@ -9,3 +9,5 @@ dump.rdb
.rdb
.sentryclirc
.env.*
apps/api/pnpm-lock.yaml (generated, 14 changes)

@@ -7478,7 +7478,7 @@ snapshots:
extract-zip@2.0.1:
dependencies:
debug: 4.3.4
debug: 4.3.5
get-stream: 5.2.0
yauzl: 2.10.0
optionalDependencies:
@@ -7622,7 +7622,7 @@ snapshots:
dependencies:
basic-ftp: 5.0.5
data-uri-to-buffer: 6.0.2
debug: 4.3.4
debug: 4.3.5
fs-extra: 11.2.0
transitivePeerDependencies:
- supports-color
@@ -7723,7 +7723,7 @@ snapshots:
http-proxy-agent@7.0.2:
dependencies:
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
transitivePeerDependencies:
- supports-color

@@ -7771,7 +7771,7 @@ snapshots:
https-proxy-agent@7.0.5:
dependencies:
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
transitivePeerDependencies:
- supports-color

@@ -8836,7 +8836,7 @@ snapshots:
dependencies:
'@tootallnate/quickjs-emscripten': 0.23.0
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
get-uri: 6.0.3
http-proxy-agent: 7.0.2
https-proxy-agent: 7.0.5
@@ -9031,7 +9031,7 @@ snapshots:
proxy-agent@6.4.0:
dependencies:
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
http-proxy-agent: 7.0.2
https-proxy-agent: 7.0.5
lru-cache: 7.18.3
@@ -9338,7 +9338,7 @@ snapshots:
socks-proxy-agent@8.0.4:
dependencies:
agent-base: 7.1.1
debug: 4.3.4
debug: 4.3.5
socks: 2.8.3
transitivePeerDependencies:
- supports-color
@@ -1,6 +1,7 @@
import request from "supertest";
import dotenv from "dotenv";
import { v4 as uuidv4 } from "uuid";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";

dotenv.config();

@@ -58,9 +59,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
});

// tested on rate limit test
@@ -480,9 +479,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
});

it.concurrent(
@@ -1,5 +1,6 @@
import request from "supertest";
import dotenv from "dotenv";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
const fs = require("fs");
const path = require("path");

@@ -61,9 +62,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
.set("Content-Type", "application/json")
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
});

it("should return a successful response", async () => {
@@ -88,9 +87,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
.set("Content-Type", "application/json")
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
});

it("should return a successful response", async () => {
@@ -119,9 +116,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
.set("Content-Type", "application/json")
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
});

it("should return a successful response", async () => {
@@ -4,6 +4,7 @@ import {
ScrapeRequestInput,
ScrapeResponseRequestTest,
} from "../../controllers/v1/types";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";

configDotenv();
const TEST_URL = "http://127.0.0.1:3002";
@@ -57,9 +58,7 @@ describe("E2E Tests for v1 API Routes", () => {
.send(scrapeRequest);

expect(response.statusCode).toBe(403);
expect(response.body.error).toBe(
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.",
);
expect(response.body.error).toBe(BLOCKLISTED_URL_MESSAGE);
});

it.concurrent(
@@ -756,9 +755,7 @@ describe("E2E Tests for v1 API Routes", () => {
.send(scrapeRequest);

expect(response.statusCode).toBe(403);
expect(response.body.error).toBe(
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.",
);
expect(response.body.error).toBe(BLOCKLISTED_URL_MESSAGE);
});

it.concurrent(
@@ -351,6 +351,7 @@ function getPlanByPriceId(price_id: string | null): PlanType {
case process.env.STRIPE_PRICE_ID_ETIER1A_MONTHLY: //ocqh
return "etier1a";
case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_MONTHLY:
case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_YEARLY:
return "etierscale1";
default:
return "free";
apps/api/src/controllers/v0/admin/check-fire-engine.ts (new file, 64 lines)

@@ -0,0 +1,64 @@
import { logger } from "../../../lib/logger";
import * as Sentry from "@sentry/node";
import { Request, Response } from "express";

export async function checkFireEngine(req: Request, res: Response) {
  try {
    if (!process.env.FIRE_ENGINE_BETA_URL) {
      logger.warn("Fire engine beta URL not configured");
      return res.status(500).json({
        success: false,
        error: "Fire engine beta URL not configured",
      });
    }

    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 30000);

    try {
      const response = await fetch(
        `${process.env.FIRE_ENGINE_BETA_URL}/scrape`,
        {
          method: "POST",
          headers: {
            "Content-Type": "application/json",
            "X-Disable-Cache": "true",
          },
          body: JSON.stringify({
            url: "https://example.com",
          }),
          signal: controller.signal,
        },
      );

      clearTimeout(timeout);

      if (response.ok) {
        const responseData = await response.json();
        return res.status(200).json({
          data: responseData,
        });
      } else {
        return res.status(response.status).json({
          success: false,
          error: `Fire engine returned status ${response.status}`,
        });
      }
    } catch (error) {
      if (error.name === "AbortError") {
        return res.status(504).json({
          success: false,
          error: "Request timed out after 30 seconds",
        });
      }
      throw error;
    }
  } catch (error) {
    logger.error(error);
    Sentry.captureException(error);
    return res.status(500).json({
      success: false,
      error: "Internal server error",
    });
  }
}
@@ -29,6 +29,7 @@ import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types";
import { ZodError } from "zod";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";

export async function crawlController(req: Request, res: Response) {
try {
@@ -112,8 +113,7 @@ export async function crawlController(req: Request, res: Response) {

if (isUrlBlocked(url)) {
return res.status(403).json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
error: BLOCKLISTED_URL_MESSAGE,
});
}
@@ -15,6 +15,7 @@ import { addScrapeJob } from "../../../src/services/queue-jobs";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node";
import { fromLegacyScrapeOptions } from "../v1/types";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";

export async function crawlPreviewController(req: Request, res: Response) {
try {
@@ -42,8 +43,7 @@ export async function crawlPreviewController(req: Request, res: Response) {

if (isUrlBlocked(url)) {
return res.status(403).json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
error: BLOCKLISTED_URL_MESSAGE,
});
}
@@ -8,7 +8,6 @@ import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job";
import {
Document,
fromLegacyCombo,
toLegacyDocument,
url as urlSchema,
@@ -29,6 +28,8 @@ import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
import { fromLegacyScrapeOptions } from "../v1/types";
import { ZodError } from "zod";
import { Document as V0Document } from "./../../lib/entities";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";

export async function scrapeHelper(
jobId: string,
@@ -42,7 +43,7 @@ export async function scrapeHelper(
): Promise<{
success: boolean;
error?: string;
data?: Document | { url: string };
data?: V0Document | { url: string };
returnCode: number;
}> {
const url = urlSchema.parse(req.body.url);
@@ -53,8 +54,7 @@ export async function scrapeHelper(
if (isUrlBlocked(url)) {
return {
success: false,
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
error: BLOCKLISTED_URL_MESSAGE,
returnCode: 403,
};
}
@@ -241,9 +241,9 @@ export async function scrapeController(req: Request, res: Response) {
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens =
result.data && (result.data as Document).markdown
result.data && (result.data as V0Document).markdown
? numTokensFromString(
(result.data as Document).markdown!,
(result.data as V0Document).markdown!,
"gpt-3.5-turbo",
)
: 0;
@@ -265,25 +265,28 @@ export async function scrapeController(req: Request, res: Response) {
}
if (creditsToBeBilled > 0) {
// billing for doc done on queue end, bill only for llm extraction
billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch((error) => {
logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
);
// Optionally, you could notify an admin or add to a retry queue here
});
billTeam(team_id, chunk?.sub_id, creditsToBeBilled, logger).catch(
(error) => {
logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
{ error },
);
// Optionally, you could notify an admin or add to a retry queue here
},
);
}
}

let doc = result.data;
if (!pageOptions || !pageOptions.includeRawHtml) {
if (doc && (doc as Document).rawHtml) {
delete (doc as Document).rawHtml;
if (doc && (doc as V0Document).rawHtml) {
delete (doc as V0Document).rawHtml;
}
}

if (pageOptions && pageOptions.includeExtract) {
if (!pageOptions.includeMarkdown && doc && (doc as Document).markdown) {
delete (doc as Document).markdown;
if (!pageOptions.includeMarkdown && doc && (doc as V0Document).markdown) {
delete (doc as V0Document).markdown;
}
}

@@ -312,7 +315,7 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(result.returnCode).json(result);
} catch (error) {
Sentry.captureException(error);
logger.error(error);
logger.error("Scrape error occcurred", { error });
return res.status(500).json({
error:
error instanceof ZodError
@@ -1,4 +1,5 @@
import { url } from "../types";
import { BLOCKLISTED_URL_MESSAGE } from "../../../lib/strings";

describe("URL Schema Validation", () => {
beforeEach(() => {
@@ -31,7 +32,7 @@ describe("URL Schema Validation", () => {

it("should reject blocked URLs", () => {
expect(() => url.parse("https://facebook.com")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
BLOCKLISTED_URL_MESSAGE,
);
});

@@ -47,16 +48,16 @@ describe("URL Schema Validation", () => {

it("should handle URLs with subdomains that are blocked", () => {
expect(() => url.parse("https://sub.facebook.com")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
BLOCKLISTED_URL_MESSAGE,
);
});

it("should handle URLs with paths that are blocked", () => {
expect(() => url.parse("http://facebook.com/path")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
BLOCKLISTED_URL_MESSAGE,
);
expect(() => url.parse("https://facebook.com/another/path")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
BLOCKLISTED_URL_MESSAGE,
);
});
@@ -3,9 +3,11 @@ import { v4 as uuidv4 } from "uuid";
import {
BatchScrapeRequest,
batchScrapeRequestSchema,
CrawlResponse,
batchScrapeRequestSchemaNoURLValidation,
url as urlSchema,
RequestWithAuth,
ScrapeOptions,
BatchScrapeResponse,
} from "./types";
import {
addCrawlJobs,
@@ -21,10 +23,14 @@ import { callWebhook } from "../../services/webhook";
import { logger as _logger } from "../../lib/logger";

export async function batchScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
res: Response<CrawlResponse>,
req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
res: Response<BatchScrapeResponse>,
) {
req.body = batchScrapeRequestSchema.parse(req.body);
if (req.body?.ignoreInvalidURLs === true) {
req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body);
} else {
req.body = batchScrapeRequestSchema.parse(req.body);
}

const id = req.body.appendToId ?? uuidv4();
const logger = _logger.child({
@@ -35,8 +41,27 @@ export async function batchScrapeController(
teamId: req.auth.team_id,
plan: req.auth.plan,
});

let urls = req.body.urls;
let invalidURLs: string[] | undefined = undefined;

if (req.body.ignoreInvalidURLs) {
invalidURLs = [];

let pendingURLs = urls;
urls = [];
for (const u of pendingURLs) {
try {
const nu = urlSchema.parse(u);
urls.push(nu);
} catch (_) {
invalidURLs.push(u);
}
}
}

logger.debug("Batch scrape " + id + " starting", {
urlsLength: req.body.urls,
urlsLength: urls,
appendToId: req.body.appendToId,
account: req.account,
});
@@ -70,7 +95,7 @@ export async function batchScrapeController(

// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if (req.body.urls.length > 1000) {
if (urls.length > 1000) {
// set base to 21
jobPriority = await getJobPriority({
plan: req.auth.plan,
@@ -84,7 +109,7 @@ export async function batchScrapeController(
delete (scrapeOptions as any).urls;
delete (scrapeOptions as any).appendToId;

const jobs = req.body.urls.map((x) => {
const jobs = urls.map((x) => {
return {
data: {
url: x,
@@ -140,5 +165,6 @@ export async function batchScrapeController(
success: true,
id,
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
invalidURLs,
});
}
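
For reference, a hypothetical request against the batch scrape endpoint showing the new `ignoreInvalidURLs` flag and the `invalidURLs` field on `BatchScrapeResponse` (host, port, auth header, and the exact payload beyond the fields shown in the schema changes are assumptions):

```ts
// batch-scrape-example.ts — illustrative sketch, not part of this commit.
async function startBatchScrape(apiKey: string): Promise<void> {
  const res = await fetch("http://localhost:3002/v1/batch/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify({
      urls: ["https://example.com", "not-a-valid-url"],
      formats: ["markdown"],
      // With this flag, malformed entries are skipped instead of rejecting the whole request.
      ignoreInvalidURLs: true,
    }),
  });
  const body = await res.json();
  // Rejected entries come back in invalidURLs when ignoreInvalidURLs is set.
  console.log(body.id, body.url, body.invalidURLs);
}
```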
@@ -18,7 +18,7 @@ import {
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob } from "../../services/queue-jobs";
import { addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
import { logger as _logger } from "../../lib/logger";
import { getJobPriority } from "../../lib/job-priority";
import { callWebhook } from "../../services/webhook";
@@ -139,9 +139,9 @@ export async function crawlController(
name: uuid,
data: {
url,
mode: "single_urls",
mode: "single_urls" as const,
team_id: req.auth.team_id,
plan: req.auth.plan,
plan: req.auth.plan!,
crawlerOptions,
scrapeOptions,
internalOptions: sc.internalOptions,
@@ -170,7 +170,7 @@ export async function crawlController(
jobs.map((x) => x.opts.jobId),
);
logger.debug("Adding scrape jobs to BullMQ...");
await getScrapeQueue().addBulk(jobs);
await addScrapeJobs(jobs);
} else {
logger.debug("Sitemap not found or ignored.", {
ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
apps/api/src/controllers/v1/credit-usage.ts (new file, 45 lines)

@@ -0,0 +1,45 @@
import { Request, Response } from "express";
import { RequestWithAuth } from "./types";
import { getACUC } from "../auth";
import { logger } from "../../lib/logger";

export async function creditUsageController(
  req: RequestWithAuth,
  res: Response,
): Promise<void> {
  try {
    // If we already have the credit usage info from auth, use it
    if (req.acuc) {
      res.json({
        success: true,
        data: {
          remaining_credits: req.acuc.remaining_credits,
        },
      });
      return;
    }

    // Otherwise fetch fresh data
    const chunk = await getACUC(req.auth.team_id);
    if (!chunk) {
      res.status(404).json({
        success: false,
        error: "Could not find credit usage information",
      });
      return;
    }

    res.json({
      success: true,
      data: {
        remaining_credits: chunk.remaining_credits,
      },
    });
  } catch (error) {
    logger.error("Error in credit usage controller:", error);
    res.status(500).json({
      success: false,
      error: "Internal server error while fetching credit usage",
    });
  }
}
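
A rough sketch of how this controller might be exercised once wired up under the v1 router — the `/v1/team/credit-usage` path, port, and Bearer auth are assumptions based on the route registration later in this commit:

```ts
// credit-usage-example.ts — illustrative sketch; response shape taken from the controller above.
async function getRemainingCredits(apiKey: string): Promise<number> {
  const res = await fetch("http://localhost:3002/v1/team/credit-usage", {
    headers: { Authorization: `Bearer ${apiKey}` },
  });
  if (!res.ok) {
    throw new Error(`credit usage request failed: ${res.status}`);
  }
  const body = (await res.json()) as {
    success: boolean;
    data: { remaining_credits: number };
  };
  return body.data.remaining_credits;
}
```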
@@ -1,6 +1,6 @@
import { Request, Response } from "express";
import {
// Document,
Document,
RequestWithAuth,
ExtractRequest,
extractRequestSchema,
@@ -8,7 +8,7 @@ import {
MapDocument,
scrapeOptions,
} from "./types";
import { Document } from "../../lib/entities";
// import { Document } from "../../lib/entities";
import Redis from "ioredis";
import { configDotenv } from "dotenv";
import { performRanking } from "../../lib/ranker";
@@ -24,6 +24,9 @@ import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { getMapResults } from "./map";
import { buildDocument } from "../../lib/extract/build-document";
import { generateBasicCompletion } from "../../lib/LLM-extraction";
import { buildRefrasedPrompt } from "../../lib/extract/build-prompts";
import { removeDuplicateUrls } from "../../lib/validateUrl";

configDotenv();
const redis = new Redis(process.env.REDIS_URL!);
@@ -61,30 +64,50 @@ export async function extractController(
const baseUrl = url.replace("/*", "");
// const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any

const allowExternalLinks = req.body.allowExternalLinks ?? true;
const allowExternalLinks = req.body.allowExternalLinks;
let urlWithoutWww = baseUrl.replace("www.", "");
let mapUrl =
req.body.prompt && allowExternalLinks
? `${req.body.prompt} ${urlWithoutWww}`
: req.body.prompt
? `${req.body.prompt} site:${urlWithoutWww}`
: `site:${urlWithoutWww}`;

let rephrasedPrompt = req.body.prompt;
if (req.body.prompt) {
rephrasedPrompt =
(await generateBasicCompletion(
buildRefrasedPrompt(req.body.prompt, baseUrl),
)) ?? req.body.prompt;
}

const mapResults = await getMapResults({
url: baseUrl,
search: req.body.prompt,
search: rephrasedPrompt,
teamId: req.auth.team_id,
plan: req.auth.plan,
allowExternalLinks,
origin: req.body.origin,
limit: req.body.limit,
// If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
ignoreSitemap: !selfHosted ? true : false,
ignoreSitemap: false,
includeMetadata: true,
includeSubdomains: req.body.includeSubdomains,
});

let mappedLinks = mapResults.links as MapDocument[];
let mappedLinks = mapResults.mapResults as MapDocument[];

// Remove duplicates between mapResults.links and mappedLinks
const allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
const uniqueUrls = removeDuplicateUrls(allUrls);

// Only add URLs from mapResults.links that aren't already in mappedLinks
const existingUrls = new Set(mappedLinks.map((m) => m.url));
const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url));

mappedLinks = [
...mappedLinks,
...newUrls.map((url) => ({ url, title: "", description: "" })),
];

if (mappedLinks.length === 0) {
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
}

// Limit number of links to MAX_EXTRACT_LIMIT
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);

@@ -93,18 +116,18 @@ export async function extractController(
`url: ${x.url}, title: ${x.title}, description: ${x.description}`,
);

// Filter by path prefix if present
// wrong
// if (pathPrefix) {
// mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
// }

if (req.body.prompt) {
let searchQuery =
req.body.prompt && allowExternalLinks
? `${req.body.prompt} ${urlWithoutWww}`
: req.body.prompt
? `${req.body.prompt} site:${urlWithoutWww}`
: `site:${urlWithoutWww}`;
// Get similarity scores between the search query and each link's context
const linksAndScores = await performRanking(
mappedLinksRerank,
mappedLinks.map((l) => l.url),
mapUrl,
searchQuery,
);

// First try with high threshold
@@ -158,7 +181,8 @@ export async function extractController(

// Wait for all URL processing to complete and flatten results
const processedUrls = await Promise.all(urlPromises);
links.push(...processedUrls.flat());
const flattenedUrls = processedUrls.flat().filter((url) => url); // Filter out any null/undefined values
links.push(...flattenedUrls);

if (links.length === 0) {
return res.status(400).json({
@@ -204,21 +228,8 @@ export async function extractController(
}
return doc;
} catch (e) {
logger.error(`Error in scrapeController: ${e}`);
if (
e instanceof Error &&
(e.message.startsWith("Job wait") || e.message === "timeout")
) {
throw {
status: 408,
error: "Request timed out",
};
} else {
throw {
status: 500,
error: `(Internal server error) - ${e && e.message ? e.message : e}`,
};
}
logger.error(`Error in extractController: ${e}`);
return null;
}
});

@@ -237,7 +248,8 @@ export async function extractController(
{
mode: "llm",
systemPrompt:
"Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided. Here are the urls the user provided of which he wants to extract information from: " +
(req.body.systemPrompt ? `${req.body.systemPrompt}\n` : "") +
"Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
links.join(", "),
prompt: req.body.prompt,
schema: req.body.schema,

@@ -28,14 +28,15 @@ const redis = new Redis(process.env.REDIS_URL!);
// Max Links that /map can return
const MAX_MAP_LIMIT = 5000;
// Max Links that "Smart /map" can return
const MAX_FIRE_ENGINE_RESULTS = 1000;
const MAX_FIRE_ENGINE_RESULTS = 500;

interface MapResult {
success: boolean;
links: string[] | any[];
links: string[];
scrape_id?: string;
job_id: string;
time_taken: number;
mapResults: MapDocument[];
}

export async function getMapResults({
@@ -215,7 +216,8 @@ export async function getMapResults({

return {
success: true,
links: includeMetadata ? mapResults : linksToReturn,
links: linksToReturn,
mapResults: mapResults,
scrape_id: origin?.includes("website") ? id : undefined,
job_id: id,
time_taken: (new Date().getTime() - Date.now()) / 1000,

@@ -60,7 +60,11 @@ export async function scrapeController(
try {
doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
} catch (e) {
logger.error(`Error in scrapeController: ${e}`);
logger.error(`Error in scrapeController: ${e}`, {
jobId,
scrapeId: jobId,
startTime,
});
if (
e instanceof Error &&
(e.message.startsWith("Job wait") || e.message === "timeout")

@@ -11,6 +11,7 @@ import {
Document as V0Document,
} from "../../lib/entities";
import { InternalOptions } from "../../scraper/scrapeURL";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";

export type Format =
| "markdown"
@@ -44,10 +45,7 @@ export const url = z.preprocess(
return false;
}
}, "Invalid URL")
.refine(
(x) => !isUrlBlocked(x as string),
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
),
.refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
);

const strictMessage =
@@ -182,6 +180,7 @@ export const scrapeOptions = z
.optional(),
skipTlsVerification: z.boolean().default(false),
removeBase64Images: z.boolean().default(true),
fastMode: z.boolean().default(false),
})
.strict(strictMessage);

@@ -193,11 +192,12 @@ export const extractV1Options = z
.array()
.max(10, "Maximum of 10 URLs allowed per request while in beta."),
prompt: z.string().optional(),
systemPrompt: z.string().optional(),
schema: z.any().optional(),
limit: z.number().int().positive().finite().safe().optional(),
ignoreSitemap: z.boolean().default(false),
includeSubdomains: z.boolean().default(true),
allowExternalLinks: z.boolean().default(true),
allowExternalLinks: z.boolean().default(false),
origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000),
})
@@ -262,6 +262,31 @@ export const batchScrapeRequestSchema = scrapeOptions
origin: z.string().optional().default("api"),
webhook: webhookSchema.optional(),
appendToId: z.string().uuid().optional(),
ignoreInvalidURLs: z.boolean().default(false),
})
.strict(strictMessage)
.refine(
(obj) => {
const hasExtractFormat = obj.formats?.includes("extract");
const hasExtractOptions = obj.extract !== undefined;
return (
(hasExtractFormat && hasExtractOptions) ||
(!hasExtractFormat && !hasExtractOptions)
);
},
{
message:
"When 'extract' format is specified, 'extract' options must be provided, and vice versa",
},
);

export const batchScrapeRequestSchemaNoURLValidation = scrapeOptions
.extend({
urls: z.string().array(),
origin: z.string().optional().default("api"),
webhook: webhookSchema.optional(),
appendToId: z.string().uuid().optional(),
ignoreInvalidURLs: z.boolean().default(false),
})
.strict(strictMessage)
.refine(
@@ -396,7 +421,7 @@ export type Document = {
articleSection?: string;
url?: string;
sourceURL?: string;
statusCode?: number;
statusCode: number;
error?: string;
[key: string]: string | string[] | number | undefined;
};
@@ -446,6 +471,15 @@ export type CrawlResponse =
url: string;
};

export type BatchScrapeResponse =
| ErrorResponse
| {
success: true;
id: string;
url: string;
invalidURLs?: string[];
};

export type MapResponse =
| ErrorResponse
| {
@@ -651,11 +685,11 @@ export function fromLegacyScrapeOptions(
}
: undefined,
mobile: pageOptions.mobile,
fastMode: pageOptions.useFastMode,
}),
internalOptions: {
atsv: pageOptions.atsv,
v0DisableJsDom: pageOptions.disableJsDom,
v0UseFastMode: pageOptions.useFastMode,
},
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
};

@@ -62,3 +62,16 @@ export async function generateCompletions(

return completions;
}

// generate basic completion

export async function generateBasicCompletion(prompt: string) {
const openai = new OpenAI();
const model = "gpt-4o";

const completion = await openai.chat.completions.create({
model,
messages: [{ role: "user", content: prompt }],
});
return completion.choices[0].message.content;
}

@@ -21,7 +21,7 @@ export function cacheKey(
if (
internalOptions.v0CrawlOnlyUrls ||
internalOptions.forceEngine ||
internalOptions.v0UseFastMode ||
scrapeOptions.fastMode ||
internalOptions.atsv ||
(scrapeOptions.actions && scrapeOptions.actions.length > 0)
) {

@@ -60,6 +60,8 @@ export async function addCrawlJob(id: string, job_id: string) {
}

export async function addCrawlJobs(id: string, job_ids: string[]) {
if (job_ids.length === 0) return true;

_logger.debug("Adding crawl jobs to Redis...", {
jobIds: job_ids,
module: "crawl-redis",
@@ -90,12 +92,20 @@ export async function addCrawlJobDone(

if (success) {
await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
await redisConnection.expire(
} else {
// in case it's already been pushed, make sure it's removed
await redisConnection.lrem(
"crawl:" + id + ":jobs_done_ordered",
24 * 60 * 60,
"NX",
-1,
job_id,
);
}

await redisConnection.expire(
"crawl:" + id + ":jobs_done_ordered",
24 * 60 * 60,
"NX",
);
}

export async function getDoneJobsOrderedLength(id: string): Promise<number> {
@@ -227,13 +237,6 @@ export async function lockURL(
url = normalizeURL(url, sc);
logger = logger.child({ url });

await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
await redisConnection.expire(
"crawl:" + id + ":visited_unique",
24 * 60 * 60,
"NX",
);

let res: boolean;
if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0;
@@ -249,6 +252,15 @@ export async function lockURL(

await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");

if (res) {
await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
await redisConnection.expire(
"crawl:" + id + ":visited_unique",
24 * 60 * 60,
"NX",
);
}

logger.debug("Locking URL " + JSON.stringify(url) + "... result: " + res, {
res,
});
@@ -261,6 +273,8 @@ export async function lockURLs(
sc: StoredCrawl,
urls: string[],
): Promise<boolean> {
if (urls.length === 0) return true;

urls = urls.map((url) => normalizeURL(url, sc));
const logger = _logger.child({
crawlId: id,
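
Since `extractV1Options` in the types changes above now defaults `allowExternalLinks` to `false`, a caller that relied on the old default would need to opt in explicitly. A hypothetical request illustrating this (host, port, auth, and exact payload beyond the schema fields are assumptions):

```ts
// extract-example.ts — illustrative sketch, not part of this commit.
async function extractWithExternalLinks(apiKey: string): Promise<unknown> {
  const res = await fetch("http://localhost:3002/v1/extract", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify({
      urls: ["https://example.com/*"],
      prompt: "List the product names and prices",
      allowExternalLinks: true, // opt back in; the new default is false
    }),
  });
  return res.json();
}
```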
apps/api/src/lib/extract/build-prompts.ts (new file, 16 lines)

@@ -0,0 +1,16 @@
export function buildRefrasedPrompt(prompt: string, url: string): string {
  return `You are a search query optimizer. Your task is to rephrase the following prompt into an effective search query that will find relevant results about this topic on ${url}.

Original prompt: "${prompt}"

Provide a rephrased search query that:
1. Maintains the core intent of the original prompt with ONLY the keywords
2. Uses relevant keywords
3. Is optimized for search engine results
4. Is concise and focused
5. Short is better than long
6. It is a search engine, not a chatbot
7. Concise

Return only the rephrased search query, without any explanation or additional text.`;
}
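
A small usage sketch tying this helper to `generateBasicCompletion`, mirroring how the extract controller calls it in this commit (the relative import paths and the fallback to the raw prompt follow that controller; treat the wrapper name as hypothetical):

```ts
import { generateBasicCompletion } from "../LLM-extraction";
import { buildRefrasedPrompt } from "./build-prompts";

// Rephrase a user prompt into a search-engine-friendly query for a given site,
// falling back to the original prompt if the completion returns nothing.
export async function rephraseForSearch(
  prompt: string,
  baseUrl: string,
): Promise<string> {
  const rephrased = await generateBasicCompletion(
    buildRefrasedPrompt(prompt, baseUrl),
  );
  return rephrased ?? prompt;
}
```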
apps/api/src/lib/strings.ts (new file, 2 lines)

@@ -0,0 +1,2 @@
export const BLOCKLISTED_URL_MESSAGE =
  "This website is no longer supported, please reach out to help@firecrawl.com for more info on how to activate it on your account.";
@@ -7,7 +7,7 @@ import {
import { billTeam } from "../services/billing/credit_billing";
import { Document } from "../controllers/v1/types";
import { supabase_service } from "../services/supabase";
import { logger } from "../lib/logger";
import { logger as _logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
import { configDotenv } from "dotenv";
import {
@@ -49,6 +49,7 @@ export async function startWebScraperPipeline({
bull_job_id: job.id.toString(),
priority: job.opts.priority,
is_scrape: job.data.is_scrape ?? false,
is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null),
});
}

@@ -63,56 +64,126 @@ export async function runWebScraper({
bull_job_id,
priority,
is_scrape = false,
is_crawl = false,
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
const logger = _logger.child({
method: "runWebScraper",
module: "runWebscraper",
scrapeId: bull_job_id,
jobId: bull_job_id,
});
const tries = is_crawl ? 3 : 1;

let response: ScrapeUrlResponse | undefined = undefined;
let engines: EngineResultsTracker = {};
try {
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
priority,
...internalOptions,
});
if (!response.success) {
if (response.error instanceof Error) {
throw response.error;
} else {
throw new Error(
"scrapeURL error: " +
(Array.isArray(response.error)
? JSON.stringify(response.error)
: typeof response.error === "object"
? JSON.stringify({ ...response.error })
: response.error),
);
}
let error: any = undefined;

for (let i = 0; i < tries; i++) {
if (i > 0) {
logger.debug("Retrying scrape...", {
tries,
i,
previousStatusCode: (response as any)?.document?.metadata?.statusCode,
previousError: error,
});
}

response = undefined;
engines = {};
error = undefined;

try {
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
priority,
...internalOptions,
});
if (!response.success) {
if (response.error instanceof Error) {
throw response.error;
} else {
throw new Error(
"scrapeURL error: " +
(Array.isArray(response.error)
? JSON.stringify(response.error)
: typeof response.error === "object"
? JSON.stringify({ ...response.error })
: response.error),
);
}
}

// This is where the returnvalue from the job is set
// onSuccess(response.document, mode);

engines = response.engines;

if (
(response.document.metadata.statusCode >= 200 &&
response.document.metadata.statusCode < 300) ||
response.document.metadata.statusCode === 304
) {
// status code is good -- do not attempt retry
break;
}
} catch (_error) {
error = _error;
engines =
response !== undefined
? response.engines
: typeof error === "object" && error !== null
? ((error as any).results ?? {})
: {};
}
}

const engineOrder = Object.entries(engines)
.sort((a, b) => a[1].startedAt - b[1].startedAt)
.map((x) => x[0]) as Engine[];

for (const engine of engineOrder) {
const result = engines[engine] as Exclude<
EngineResultsTracker[Engine],
undefined
>;
ScrapeEvents.insert(bull_job_id, {
type: "scrape",
url,
method: engine,
result: {
success: result.state === "success",
response_code:
result.state === "success" ? result.result.statusCode : undefined,
response_size:
result.state === "success" ? result.result.html.length : undefined,
error:
result.state === "error"
? result.error
: result.state === "timeout"
? "Timed out"
: undefined,
time_taken: result.finishedAt - result.startedAt,
},
});
}

if (error === undefined && response?.success) {
if (is_scrape === false) {
let creditsToBeBilled = 1; // Assuming 1 credit per document
if (scrapeOptions.extract) {
creditsToBeBilled = 5;
}

billTeam(team_id, undefined, creditsToBeBilled).catch((error) => {
billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => {
logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
{ error },
);
// Optionally, you could notify an admin or add to a retry queue here
});
}

// This is where the returnvalue from the job is set
// onSuccess(response.document, mode);

engines = response.engines;
return response;
} catch (error) {
engines =
response !== undefined
? response.engines
: typeof error === "object" && error !== null
? ((error as any).results ?? {})
: {};

} else {
if (response !== undefined) {
return {
...response,
@@ -127,37 +198,6 @@ export async function runWebScraper({
engines,
};
}
// onError(error);
} finally {
const engineOrder = Object.entries(engines)
.sort((a, b) => a[1].startedAt - b[1].startedAt)
.map((x) => x[0]) as Engine[];

for (const engine of engineOrder) {
const result = engines[engine] as Exclude<
EngineResultsTracker[Engine],
undefined
>;
ScrapeEvents.insert(bull_job_id, {
type: "scrape",
url,
method: engine,
result: {
success: result.state === "success",
response_code:
result.state === "success" ? result.result.statusCode : undefined,
response_size:
result.state === "success" ? result.result.html.length : undefined,
error:
result.state === "error"
? result.error
: result.state === "timeout"
? "Timed out"
: undefined,
time_taken: result.finishedAt - result.startedAt,
},
});
}
}
}

@@ -195,6 +235,11 @@ const saveJob = async (
}
ScrapeEvents.logJobEvent(job, "completed");
} catch (error) {
logger.error(`🐂 Failed to update job status: ${error}`);
_logger.error(`🐂 Failed to update job status`, {
module: "runWebScraper",
method: "saveJob",
jobId: job.id,
scrapeId: job.id,
});
}
};
@@ -8,6 +8,7 @@ import {
} from "../controllers/v0/admin/queue";
import { wrap } from "./v1";
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
import { checkFireEngine } from "../controllers/v0/admin/check-fire-engine";

export const adminRouter = express.Router();

@@ -37,3 +38,8 @@ adminRouter.post(
`/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
wrap(acucCacheClearController),
);

adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/feng-check`,
wrap(checkFireEngine),
);
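
A hedged way to exercise the new health-check route, assuming the service runs locally on port 3002 and `BULL_AUTH_KEY` is set in the environment (the status codes follow the `checkFireEngine` controller added earlier in this commit):

```ts
// feng-check-example.ts — illustrative only; the port and response shape are assumptions.
const base = "http://localhost:3002";

async function checkFireEngineRoute(): Promise<void> {
  const res = await fetch(`${base}/admin/${process.env.BULL_AUTH_KEY}/feng-check`);
  if (!res.ok) {
    // 500 = not configured or internal error, 504 = fire-engine timed out
    throw new Error(`feng-check failed with status ${res.status}`);
  }
  console.log("fire-engine reachable:", await res.json());
}

checkFireEngineRoute().catch(console.error);
```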
@@ -31,6 +31,8 @@ import { extractController } from "../controllers/v1/extract";
// import { keyAuthController } from "../../src/controllers/v1/keyAuth";
// import { livenessController } from "../controllers/v1/liveness";
// import { readinessController } from "../controllers/v1/readiness";
import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";

function checkCreditsMiddleware(
minimum?: number,
@@ -122,8 +124,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
if (!res.headersSent) {
return res.status(403).json({
success: false,
error:
"URL is blocked intentionally. Firecrawl currently does not support scraping this site due to policy restrictions.",
error: BLOCKLISTED_URL_MESSAGE,
});
}
}
@@ -224,3 +225,9 @@ v1Router.delete(
// Health/Probe routes
// v1Router.get("/health/liveness", livenessController);
// v1Router.get("/health/readiness", readinessController);

v1Router.get(
"/team/credit-usage",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(creditUsageController),
);
@@ -210,7 +210,7 @@ export class WebCrawler {
}
if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks(
sitemapLinks,
[...new Set(sitemapLinks)],
this.limit,
this.maxCrawledDepth,
fromMap,
@@ -6,6 +6,15 @@ configDotenv();
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
const algorithm = "aes-256-ecb";

function encryptAES(plaintext: string, key: Buffer): string {
const cipher = crypto.createCipheriv(algorithm, key, null);
const encrypted = Buffer.concat([
cipher.update(plaintext, "utf-8"),
cipher.final(),
]);
return encrypted.toString("base64");
}

function decryptAES(ciphertext: string, key: Buffer): string {
const decipher = crypto.createDecipheriv(algorithm, key, null);
const decrypted = Buffer.concat([
@@ -42,9 +51,27 @@ const urlBlocklist = [
"PTbGg8PK/h0Seyw4HEpK4Q==",
"lZdQMknjHb7+4+sjF3qNTw==",
"LsgSq54q5oDysbva29JxnQ==",
"KZfBtpwjOpdSoqacRbz7og==",
"Indtl4yxJMHCKBGF4KABCQ==",
"e3HFXLVgxhaVoadYpwb2BA==",
"b+asgLayXQ5Jq+se+q56jA==",
"86ZDUI7vmp4MvNq3fvZrGQ==",
"sEGFoYZ6GEg4Zocd+TiyfQ==",
"6OOL72eXthgnJ1Hj4PfOQQ==",
"g/ME+Sh1CAFboKrwkVb+5Q==",
"Pw+xawUoX8xBYbX2yqqGWQ==",
"k6vBalxYFhAvkPsF19t9gQ==",
"e3HFXLVgxhaVoadYpwb2BA==",
"b+asgLayXQ5Jq+se+q56jA==",
"KKttwRz4w+AMJrZcB828WQ==",
"vMdzZ33BXoyWVZnAPOBcrg==",
"l8GDVI8w/ueHnNzdN1ODuQ==",
];

const decryptedBlocklist = hashKey.length > 0 ? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey)) : [];
const decryptedBlocklist =
hashKey.length > 0
? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey))
: [];

const allowedKeywords = [
"pulse",
@@ -5,8 +5,9 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler";

export async function scrapeURLWithFetch(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const timeout = 20000;
const timeout = timeToRun ?? 300000;

const response = await Promise.race([
fetch(meta.url, {
@@ -3,7 +3,7 @@ import * as Sentry from "@sentry/node";
import { z } from "zod";

import { robustFetch } from "../../lib/fetch";
import { EngineError, SiteError } from "../../error";
import { ActionError, EngineError, SiteError } from "../../error";

const successSchema = z.object({
jobId: z.string(),
@@ -111,6 +111,12 @@ export async function fireEngineCheckStatus(
status.error.includes("Chrome error: ")
) {
throw new SiteError(status.error.split("Chrome error: ")[1]);
} else if (
typeof status.error === "string" &&
// TODO: improve this later
status.error.includes("Element")
) {
throw new ActionError(status.error.split("Error: ")[1]);
} else {
throw new EngineError("Scrape job failed", {
cause: {
@@ -13,13 +13,11 @@ import {
FireEngineCheckStatusSuccess,
StillProcessingError,
} from "./checkStatus";
import { EngineError, SiteError, TimeoutError } from "../../error";
import { ActionError, EngineError, SiteError, TimeoutError } from "../../error";
import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";

export const defaultTimeout = 10000;

// This function does not take `Meta` on purpose. It may not access any
// meta values to construct the request -- that must be done by the
// `scrapeURLWithFireEngine*` functions.
@@ -31,7 +29,7 @@ async function performFireEngineScrape<
>(
logger: Logger,
request: FireEngineScrapeRequestCommon & Engine,
timeout = defaultTimeout,
timeout: number,
): Promise<FireEngineCheckStatusSuccess> {
const scrape = await fireEngineScrape(
logger.child({ method: "fireEngineScrape" }),
@@ -70,7 +68,11 @@ async function performFireEngineScrape<
} catch (error) {
if (error instanceof StillProcessingError) {
// nop
} else if (error instanceof EngineError || error instanceof SiteError) {
} else if (
error instanceof EngineError ||
error instanceof SiteError ||
error instanceof ActionError
) {
logger.debug("Fire-engine scrape job failed.", {
error,
jobId: scrape.jobId,
@@ -94,6 +96,7 @@ async function performFireEngineScrape<

export async function scrapeURLWithFireEngineChromeCDP(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const actions: Action[] = [
// Transform waitFor option into an action (unsupported by chrome-cdp)
@@ -121,6 +124,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
...(meta.options.actions ?? []),
];

const totalWait = actions.reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
0,
);

const timeout = (timeToRun ?? 300000) + totalWait;

const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestChromeCDP = {
url: meta.url,
@@ -134,25 +144,20 @@ export async function scrapeURLWithFireEngineChromeCDP(
}
: {}),
priority: meta.internalOptions.priority,
geolocation: meta.options.geolocation,
geolocation: meta.options.geolocation ?? meta.options.location,
mobile: meta.options.mobile,
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
timeout, // TODO: better timeout logic
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
// TODO: scrollXPaths
};

const totalWait = actions.reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
0,
);

let response = await performFireEngineScrape(
meta.logger.child({
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
request,
}),
request,
meta.options.timeout !== undefined ? defaultTimeout + totalWait : Infinity, // TODO: better timeout handling
timeout,
);

specialtyScrapeCheck(
@@ -206,7 +211,11 @@ export async function scrapeURLWithFireEngineChromeCDP(

export async function scrapeURLWithFireEnginePlaywright(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const totalWait = meta.options.waitFor;
const timeout = (timeToRun ?? 300000) + totalWait;

const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestPlaywright = {
url: meta.url,
@@ -218,9 +227,9 @@ export async function scrapeURLWithFireEnginePlaywright(
screenshot: meta.options.formats.includes("screenshot"),
fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
wait: meta.options.waitFor,
geolocation: meta.options.geolocation,
geolocation: meta.options.geolocation ?? meta.options.location,

timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
timeout,
};

let response = await performFireEngineScrape(
@@ -229,9 +238,7 @@ export async function scrapeURLWithFireEnginePlaywright(
request,
}),
request,
meta.options.timeout !== undefined
? defaultTimeout + meta.options.waitFor
: Infinity, // TODO: better timeout handling
timeout,
);

specialtyScrapeCheck(
@@ -265,7 +272,10 @@ export async function scrapeURLWithFireEnginePlaywright(

export async function scrapeURLWithFireEngineTLSClient(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const timeout = timeToRun ?? 30000;

const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestTLSClient = {
url: meta.url,
@@ -276,10 +286,10 @@ export async function scrapeURLWithFireEngineTLSClient(
priority: meta.internalOptions.priority,

atsv: meta.internalOptions.atsv,
geolocation: meta.options.geolocation,
geolocation: meta.options.geolocation ?? meta.options.location,
disableJsDom: meta.internalOptions.v0DisableJsDom,

timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
timeout,
};

let response = await performFireEngineScrape(
@@ -288,7 +298,7 @@ export async function scrapeURLWithFireEngineTLSClient(
request,
}),
request,
meta.options.timeout !== undefined ? defaultTimeout : Infinity, // TODO: better timeout handling
timeout,
);

specialtyScrapeCheck(

@@ -105,7 +105,10 @@ export type EngineScrapeResult = {
};

const engineHandlers: {
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>;
[E in Engine]: (
meta: Meta,
timeToRun: number | undefined,
) => Promise<EngineScrapeResult>;
} = {
cache: scrapeCache,
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
@@ -372,6 +375,7 @@ export function buildFallbackList(meta: Meta): {
export async function scrapeURLWithEngine(
meta: Meta,
engine: Engine,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const fn = engineHandlers[engine];
const logger = meta.logger.child({
@@ -383,5 +387,5 @@ export async function scrapeURLWithEngine(
logger,
};

return await fn(_meta);
return await fn(_meta, timeToRun);
}
@ -15,6 +15,7 @@ type PDFProcessorResult = { html: string; markdown?: string };
|
||||
async function scrapePDFWithLlamaParse(
|
||||
meta: Meta,
|
||||
tempFilePath: string,
|
||||
timeToRun: number | undefined,
|
||||
): Promise<PDFProcessorResult> {
|
||||
meta.logger.debug("Processing PDF document with LlamaIndex", {
|
||||
tempFilePath,
|
||||
@ -63,8 +64,9 @@ async function scrapePDFWithLlamaParse(
|
||||
|
||||
// TODO: timeout, retries
|
||||
const startedAt = Date.now();
|
||||
const timeout = timeToRun ?? 300000;
|
||||
|
||||
while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) {
|
||||
while (Date.now() <= startedAt + timeout) {
|
||||
try {
|
||||
const result = await robustFetch({
|
||||
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
|
||||
@ -122,7 +124,10 @@ async function scrapePDFWithParsePDF(
|
||||
};
|
||||
}
|
||||
|
||||
export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
||||
export async function scrapePDF(
|
||||
meta: Meta,
|
||||
timeToRun: number | undefined,
|
||||
): Promise<EngineScrapeResult> {
|
||||
if (!meta.options.parsePDF) {
|
||||
const file = await fetchFileToBuffer(meta.url);
|
||||
const content = file.buffer.toString("base64");
|
||||
@ -138,9 +143,26 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
||||
const { response, tempFilePath } = await downloadFile(meta.id, meta.url);
|
||||
|
||||
let result: PDFProcessorResult | null = null;
|
||||
if (process.env.LLAMAPARSE_API_KEY) {
|
||||
|
||||
// First, try parsing with PdfParse
|
||||
result = await scrapePDFWithParsePDF(
|
||||
{
|
||||
...meta,
|
||||
logger: meta.logger.child({
|
||||
method: "scrapePDF/scrapePDFWithParsePDF",
|
||||
}),
|
||||
},
|
||||
tempFilePath,
|
||||
);
|
||||
|
||||
// If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
|
||||
if (
|
||||
result.markdown &&
|
||||
result.markdown.length < 500 &&
|
||||
process.env.LLAMAPARSE_API_KEY
|
||||
) {
|
||||
try {
|
||||
result = await scrapePDFWithLlamaParse(
|
||||
const llamaResult = await scrapePDFWithLlamaParse(
|
||||
{
|
||||
...meta,
|
||||
logger: meta.logger.child({
|
||||
@ -148,17 +170,19 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
||||
}),
|
||||
},
|
||||
tempFilePath,
|
||||
timeToRun,
|
||||
);
|
||||
result = llamaResult; // Use LlamaParse result if successful
|
||||
} catch (error) {
|
||||
if (error instanceof Error && error.message === "LlamaParse timed out") {
|
||||
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
|
||||
meta.logger.warn("LlamaParse timed out -- using parse-pdf result", {
|
||||
error,
|
||||
});
|
||||
} else if (error instanceof RemoveFeatureError) {
|
||||
throw error;
|
||||
} else {
|
||||
meta.logger.warn(
|
||||
"LlamaParse failed to parse PDF -- falling back to parse-pdf",
|
||||
"LlamaParse failed to parse PDF -- using parse-pdf result",
|
||||
{ error },
|
||||
);
|
||||
Sentry.captureException(error);
|
||||
@ -166,18 +190,6 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
||||
}
|
||||
}
|
||||
|
||||
if (result === null) {
|
||||
result = await scrapePDFWithParsePDF(
|
||||
{
|
||||
...meta,
|
||||
logger: meta.logger.child({
|
||||
method: "scrapePDF/scrapePDFWithParsePDF",
|
||||
}),
|
||||
},
|
||||
tempFilePath,
|
||||
);
|
||||
}
|
||||
|
||||
await fs.unlink(tempFilePath);
|
||||
|
||||
return {
|
||||
|
@ -6,8 +6,9 @@ import { robustFetch } from "../../lib/fetch";
|
||||
|
||||
export async function scrapeURLWithPlaywright(
|
||||
meta: Meta,
|
||||
timeToRun: number | undefined,
|
||||
): Promise<EngineScrapeResult> {
|
||||
const timeout = 20000 + meta.options.waitFor;
|
||||
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
|
||||
|
||||
const response = await Promise.race([
|
||||
await robustFetch({
|
||||
@ -30,7 +31,7 @@ export async function scrapeURLWithPlaywright(
|
||||
}),
|
||||
}),
|
||||
(async () => {
|
||||
await new Promise((resolve) => setTimeout(() => resolve(null), 20000));
|
||||
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
|
||||
throw new TimeoutError(
|
||||
"Playwright was unable to scrape the page before timing out",
|
||||
{ cause: { timeout } },
|
||||
|
@ -9,16 +9,20 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
|
||||
|
||||
export function scrapeURLWithScrapingBee(
|
||||
wait_browser: "domcontentloaded" | "networkidle2",
|
||||
): (meta: Meta) => Promise<EngineScrapeResult> {
|
||||
return async (meta: Meta): Promise<EngineScrapeResult> => {
|
||||
): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> {
|
||||
return async (
|
||||
meta: Meta,
|
||||
timeToRun: number | undefined,
|
||||
): Promise<EngineScrapeResult> => {
|
||||
let response: AxiosResponse<any>;
|
||||
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
|
||||
try {
|
||||
response = await client.get({
|
||||
url: meta.url,
|
||||
params: {
|
||||
timeout: 15000, // TODO: dynamic timeout based on request timeout
|
||||
timeout,
|
||||
wait_browser: wait_browser,
|
||||
wait: Math.min(meta.options.waitFor, 35000),
|
||||
wait: meta.options.waitFor,
|
||||
transparent_status_code: true,
|
||||
json_response: true,
|
||||
screenshot: meta.options.formats.includes("screenshot"),
|
||||
|
@ -56,3 +56,11 @@ export class SiteError extends Error {
|
||||
this.code = code;
|
||||
}
|
||||
}
|
||||
|
||||
export class ActionError extends Error {
|
||||
public code: string;
|
||||
constructor(code: string) {
|
||||
super("Action(s) failed to complete. Error code: " + code);
|
||||
this.code = code;
|
||||
}
|
||||
}
|
||||
|
@ -12,6 +12,7 @@ import {
|
||||
} from "./engines";
|
||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||
import {
|
||||
ActionError,
|
||||
AddFeatureError,
|
||||
EngineError,
|
||||
NoEnginesLeftError,
|
||||
@ -86,7 +87,7 @@ function buildFeatureFlags(
|
||||
flags.add("skipTlsVerification");
|
||||
}
|
||||
|
||||
if (internalOptions.v0UseFastMode) {
|
||||
if (options.fastMode) {
|
||||
flags.add("useFastMode");
|
||||
}
|
||||
|
||||
@ -148,7 +149,6 @@ export type InternalOptions = {
|
||||
atsv?: boolean; // anti-bot solver, beta
|
||||
|
||||
v0CrawlOnlyUrls?: boolean;
|
||||
v0UseFastMode?: boolean;
|
||||
v0DisableJsDom?: boolean;
|
||||
|
||||
disableSmartWaitCache?: boolean; // Passed along to fire-engine
|
||||
@ -203,11 +203,16 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
||||
const results: EngineResultsTracker = {};
|
||||
let result: EngineScrapeResultWithContext | null = null;
|
||||
|
||||
const timeToRun =
|
||||
meta.options.timeout !== undefined
|
||||
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
|
||||
: undefined;
|
||||
|
||||
for (const { engine, unsupportedFeatures } of fallbackList) {
|
||||
const startedAt = Date.now();
|
||||
try {
|
||||
meta.logger.info("Scraping via " + engine + "...");
|
||||
const _engineResult = await scrapeURLWithEngine(meta, engine);
|
||||
const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun);
|
||||
if (_engineResult.markdown === undefined) {
|
||||
// Some engines emit Markdown directly.
|
||||
_engineResult.markdown = await parseMarkdown(_engineResult.html);
|
||||
@ -285,6 +290,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
||||
throw error;
|
||||
} else if (error instanceof SiteError) {
|
||||
throw error;
|
||||
} else if (error instanceof ActionError) {
|
||||
throw error;
|
||||
} else {
|
||||
Sentry.captureException(error);
|
||||
meta.logger.info(
|
||||
@ -405,6 +412,8 @@ export async function scrapeURL(
|
||||
// TODO: results?
|
||||
} else if (error instanceof SiteError) {
|
||||
meta.logger.warn("scrapeURL: Site failed to load in browser", { error });
|
||||
} else if (error instanceof ActionError) {
|
||||
meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
|
||||
} else {
|
||||
Sentry.captureException(error);
|
||||
meta.logger.error("scrapeURL: Unexpected error happened", { error });
|
||||
|
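Note on the scrapeURLLoop change above: rather than each engine applying its own hard-coded timeout, the request-level timeout (when one is set) is now divided across at most two fallback attempts and passed down to every engine as timeToRun. A minimal sketch of that arithmetic, with hypothetical values (requestTimeout and fallbackListLength are illustrative, not taken from the diff):

```ts
// Hypothetical numbers to illustrate the split performed in scrapeURLLoop.
const requestTimeout = 30000;   // e.g. meta.options.timeout
const fallbackListLength = 4;   // engines eligible for this request
const timeToRun = Math.round(requestTimeout / Math.min(fallbackListLength, 2));
// timeToRun === 15000: each engine attempt gets half the overall budget.
// With a single eligible engine, Math.min(1, 2) === 1, so it keeps the full 30000 ms.
```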
@ -5,7 +5,7 @@ import { Meta } from "..";
export function extractMetadata(
meta: Meta,
html: string,
): Document["metadata"] {
): Partial<Document["metadata"]> {
let title: string | undefined = undefined;
let description: string | undefined = undefined;
let language: string | undefined = undefined;
@ -40,7 +40,7 @@ export function extractMetadata(
const soup = load(html);

try {
title = soup("title").text() || undefined;
title = soup("title").first().text().trim() || undefined;
description = soup('meta[name="description"]').attr("content") || undefined;

// Assuming the language is part of the URL as per the regex pattern

@ -159,8 +159,8 @@ export async function generateOpenAICompletions(
role: "user",
content:
options.prompt !== undefined
? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}`
: "Transform the above content into structured JSON output.",
? `Transform the above content into structured JSON output based on the provided schema if any and the following user request: ${options.prompt}. If schema is provided, strictly follow it.`
: "Transform the above content into structured JSON output based on the provided schema if any.",
},
],
response_format: options.schema

@ -10,6 +10,7 @@ import { issueCredits } from "./issue_credits";
import { redlock } from "../redlock";
import { autoCharge } from "./auto_charge";
import { getValue, setValue } from "../redis";
import type { Logger } from "winston";

const FREE_CREDITS = 500;

@ -20,22 +21,33 @@ export async function billTeam(
team_id: string,
subscription_id: string | null | undefined,
credits: number,
logger?: Logger,
) {
return withAuth(supaBillTeam, { success: true, message: "No DB, bypassed." })(
team_id,
subscription_id,
credits,
logger,
);
}
export async function supaBillTeam(
team_id: string,
subscription_id: string | null | undefined,
credits: number,
__logger?: Logger,
) {
const _logger = (__logger ?? logger).child({
module: "credit_billing",
method: "supaBillTeam",
});

if (team_id === "preview") {
return { success: true, message: "Preview team, no credits used" };
}
logger.info(`Billing team ${team_id} for ${credits} credits`);
_logger.info(`Billing team ${team_id} for ${credits} credits`, {
team_id,
credits,
});

const { data, error } = await supabase_service.rpc("bill_team", {
_team_id: team_id,
@ -46,7 +58,7 @@ export async function supaBillTeam(

if (error) {
Sentry.captureException(error);
logger.error("Failed to bill team: " + JSON.stringify(error));
_logger.error("Failed to bill team.", { error });
return;
}

@ -9,6 +9,7 @@ export async function issueCredits(team_id: string, credits: number) {
status: "active",
// indicates that this coupon was issued from auto recharge
from_auto_recharge: true,
initial_credits: credits,
});

if (error) {

@ -11,11 +11,50 @@ import {
pushConcurrencyLimitedJob,
} from "../lib/concurrency-limit";

async function _addScrapeJobToConcurrencyQueue(
webScraperOptions: any,
options: any,
jobId: string,
jobPriority: number,
) {
await pushConcurrencyLimitedJob(webScraperOptions.team_id, {
id: jobId,
data: webScraperOptions,
opts: {
...options,
priority: jobPriority,
jobId: jobId,
},
priority: jobPriority,
});
}

async function _addScrapeJobToBullMQ(
webScraperOptions: any,
options: any,
jobId: string,
jobPriority: number,
) {
if (
webScraperOptions &&
webScraperOptions.team_id &&
webScraperOptions.plan
) {
await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId);
}

await getScrapeQueue().add(jobId, webScraperOptions, {
...options,
priority: jobPriority,
jobId,
});
}

async function addScrapeJobRaw(
webScraperOptions: any,
options: any,
jobId: string,
jobPriority: number = 10,
jobPriority: number,
) {
let concurrencyLimited = false;

@ -33,30 +72,14 @@ async function addScrapeJobRaw(
}

if (concurrencyLimited) {
await pushConcurrencyLimitedJob(webScraperOptions.team_id, {
id: jobId,
data: webScraperOptions,
opts: {
...options,
priority: jobPriority,
jobId: jobId,
},
priority: jobPriority,
});
} else {
if (
webScraperOptions &&
webScraperOptions.team_id &&
webScraperOptions.plan
) {
await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId);
}

await getScrapeQueue().add(jobId, webScraperOptions, {
...options,
priority: jobPriority,
await _addScrapeJobToConcurrencyQueue(
webScraperOptions,
options,
jobId,
});
jobPriority,
);
} else {
await _addScrapeJobToBullMQ(webScraperOptions, options, jobId, jobPriority);
}
}

@ -108,11 +131,88 @@ export async function addScrapeJobs(
};
}[],
) {
// TODO: better
if (jobs.length === 0) return true;

let countCanBeDirectlyAdded = Infinity;

if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) {
const now = Date.now();
const limit = await getConcurrencyLimitMax(jobs[0].data.plan);
console.log("CC limit", limit);
cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now);

countCanBeDirectlyAdded = Math.max(
limit -
(await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length,
0,
);
}

const addToBull = jobs.slice(0, countCanBeDirectlyAdded);
const addToCQ = jobs.slice(countCanBeDirectlyAdded);

await Promise.all(
jobs.map((job) =>
addScrapeJob(job.data, job.opts, job.opts.jobId, job.opts.priority),
),
addToBull.map(async (job) => {
const size = JSON.stringify(job.data).length;
return await Sentry.startSpan(
{
name: "Add scrape job",
op: "queue.publish",
attributes: {
"messaging.message.id": job.opts.jobId,
"messaging.destination.name": getScrapeQueue().name,
"messaging.message.body.size": size,
},
},
async (span) => {
await _addScrapeJobToBullMQ(
{
...job.data,
sentry: {
trace: Sentry.spanToTraceHeader(span),
baggage: Sentry.spanToBaggageHeader(span),
size,
},
},
job.opts,
job.opts.jobId,
job.opts.priority,
);
},
);
}),
);

await Promise.all(
addToCQ.map(async (job) => {
const size = JSON.stringify(job.data).length;
return await Sentry.startSpan(
{
name: "Add scrape job",
op: "queue.publish",
attributes: {
"messaging.message.id": job.opts.jobId,
"messaging.destination.name": getScrapeQueue().name,
"messaging.message.body.size": size,
},
},
async (span) => {
await _addScrapeJobToConcurrencyQueue(
{
...job.data,
sentry: {
trace: Sentry.spanToTraceHeader(span),
baggage: Sentry.spanToBaggageHeader(span),
size,
},
},
job.opts,
job.opts.jobId,
job.opts.priority,
);
},
);
}),
);
}
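Note on the reworked addScrapeJobs above: the batch is now split up front. The team's concurrency limit minus the number of currently active jobs decides how many jobs go straight to BullMQ; the remainder is pushed to the concurrency-limited queue. A small sketch of the slicing with made-up numbers (limit and activeJobs are illustrative; the real values come from getConcurrencyLimitMax and getConcurrencyLimitActiveJobs in the diff):

```ts
// Illustrative values only.
const limit = 10;       // max concurrent jobs for the team's plan
const activeJobs = 7;   // jobs currently counted as active
const countCanBeDirectlyAdded = Math.max(limit - activeJobs, 0); // 3

const jobs = ["job-1", "job-2", "job-3", "job-4", "job-5"];
const addToBull = jobs.slice(0, countCanBeDirectlyAdded); // first 3 go to BullMQ immediately
const addToCQ = jobs.slice(countCanBeDirectlyAdded);      // remaining 2 wait in the concurrency queue
```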
@ -386,27 +386,27 @@ async function processJob(job: Job & { id: string }, token: string) {
jobId: job.id,
scrapeId: job.id,
crawlId: job.data?.crawl_id ?? undefined,
teamId: job.data?.team_id ?? undefined,
});
logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url });

// Check if the job URL is researchhub and block it immediately
// TODO: remove this once solve the root issue
if (
job.data.url &&
(job.data.url.includes("researchhub.com") ||
job.data.url.includes("ebay.com") ||
job.data.url.includes("youtube.com"))
) {
logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`);
const data = {
success: false,
document: null,
project_id: job.data.project_id,
error:
"URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.",
};
return data;
}
// if (
// job.data.url &&
// (job.data.url.includes("researchhub.com") ||
// job.data.url.includes("ebay.com"))
// ) {
// logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`);
// const data = {
// success: false,
// document: null,
// project_id: job.data.project_id,
// error:
// "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.",
// };
// return data;
// }

try {
job.updateProgress({
@ -482,32 +482,28 @@ async function processJob(job: Job & { id: string }, token: string) {
normalizeURL(doc.metadata.url, sc) !==
normalizeURL(doc.metadata.sourceURL, sc)
) {
logger.debug(
"Was redirected, removing old URL and locking new URL...",
{ oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
);
// Remove the old URL from visited unique due to checking for limit
// Do not remove from :visited otherwise it will keep crawling the original URL (sourceURL)
await redisConnection.srem(
"crawl:" + job.data.crawl_id + ":visited_unique",
normalizeURL(doc.metadata.sourceURL, sc),
);

const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc));
const p2 = generateURLPermutations(
normalizeURL(doc.metadata.sourceURL, sc),
);

// In crawls, we should only crawl a redirected page once, no matter how many; times it is redirected to, or if it's been discovered by the crawler before.
// This can prevent flakiness with race conditions.
// Lock the new URL
const lockRes = await lockURL(job.data.crawl_id, sc, doc.metadata.url);
if (
job.data.crawlerOptions !== null &&
!lockRes &&
JSON.stringify(p1) !== JSON.stringify(p2)
) {
throw new RacedRedirectError();
if (JSON.stringify(p1) !== JSON.stringify(p2)) {
logger.debug(
"Was redirected, removing old URL and locking new URL...",
{ oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
);

// Prevent redirect target from being visited in the crawl again
// See lockURL
const x = await redisConnection.sadd(
"crawl:" + job.data.crawl_id + ":visited",
...p1.map((x) => x.href),
);
const lockRes = x === p1.length;

if (job.data.crawlerOptions !== null && !lockRes) {
throw new RacedRedirectError();
}
}
}

@ -679,6 +675,10 @@ async function processJob(job: Job & { id: string }, token: string) {

logger.debug("Declaring job as done...");
await addCrawlJobDone(job.data.crawl_id, job.id, false);
await redisConnection.srem(
"crawl:" + job.data.crawl_id + ":visited_unique",
normalizeURL(job.data.url, sc),
);

logger.debug("Logging job to DB...");
await logJob(

@ -80,8 +80,8 @@ const RATE_LIMITS = {
default: 100,
},
crawlStatus: {
free: 300,
default: 500,
free: 500,
default: 5000,
},
testSuite: {
free: 10000,

@ -55,6 +55,7 @@ export interface RunWebScraperParams {
bull_job_id: string;
priority?: number;
is_scrape?: boolean;
is_crawl?: boolean;
}

export type RunWebScraperResult =

@ -3,6 +3,7 @@
"rootDir": "./src",
"lib": ["ES2022", "DOM"],


// or higher
"target": "ES2022",

@ -18,7 +19,7 @@
"*": ["node_modules/*", "src/types/*"],
},

"inlineSources": true
"inlineSources": true,
},
"include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"]
}

@ -1,7 +1,19 @@
const fs = require("fs");

const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
.split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));
// METHOD: Winston log file
// const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
// .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));

// METHOD: GCloud export
const logs = [
"downloaded-logs-20241213-225607.json",
"downloaded-logs-20241213-225654.json",
"downloaded-logs-20241213-225720.json",
"downloaded-logs-20241213-225758.json",
"downloaded-logs-20241213-225825.json",
"downloaded-logs-20241213-225843.json",
].flatMap(x => JSON.parse(fs.readFileSync(x, "utf8"))).map(x => x.jsonPayload);


const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))];
14
apps/api/utils/urldump-redis.js
Normal file
14
apps/api/utils/urldump-redis.js
Normal file
@ -0,0 +1,14 @@
require("dotenv").config();
const Redis = require("ioredis");

const crawlId = process.argv[2];

const redisConnection = new Redis(process.env.REDIS_URL, {
maxRetriesPerRequest: null,
});

(async () => {
const res = await redisConnection.sscan("crawl:" + crawlId + ":visited_unique", 0, "COUNT", 999);
await require("fs/promises").writeFile(crawlId + "-visited.txt", res[1].map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
process.exit(0);
})();

43
apps/api/utils/urldump.js
Normal file
43
apps/api/utils/urldump.js
Normal file
@ -0,0 +1,43 @@
require("dotenv").config();

//const baseUrl = "https://api.firecrawl.dev";
const baseUrl = "http://localhost:3002";
const crawlId = process.argv[2];

(async () => {
let url = baseUrl + "/v1/crawl/" + crawlId;
let urls = [];

while (url) {
let res;

while (true) {
try {
res = (await (await fetch(url, {
headers: {
"Authorization": "Bearer " + process.env.TEST_API_KEY
}
})).json());
break;
} catch (e) {
console.error(e);
}
}

console.log(res.data.length);
if (res.data.length === 0) {
break;
}

urls.push(...res.data.map(x => x.metadata.url ?? x.metadata.sourceURL));

url = res.next;
if (url !== undefined) {
const o = new URL(url)
o.protocol = new URL(baseUrl).protocol;
url = o.href;
}
}

await require("fs/promises").writeFile(crawlId + "-urls.txt", urls.map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
})();
@ -1,9 +1,9 @@
import { describe, test, expect, jest } from '@jest/globals';
import axios from 'axios';
import FirecrawlApp from '../index';
import { describe, expect, jest, test } from '@jest/globals';

import { readFile } from 'fs/promises';
import FirecrawlApp from '../index';
import axios from 'axios';
import { join } from 'path';
import { readFile } from 'fs/promises';

// Mock jest and set the type
jest.mock('axios');
@ -14,13 +14,22 @@ async function loadFixture(name: string): Promise<string> {
return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8')
}

const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";

describe('the firecrawl JS SDK', () => {

test('Should require an API key to instantiate FirecrawlApp', async () => {
const fn = () => {
new FirecrawlApp({ apiKey: undefined });
};
expect(fn).toThrow('No API key provided');
test('Should require an API key only for cloud service', async () => {
if (API_URL.includes('api.firecrawl.dev')) {
// Should throw for cloud service
expect(() => {
new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL });
}).toThrow('No API key provided');
} else {
// Should not throw for self-hosted
expect(() => {
new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL });
}).not.toThrow();
}
});

test('Should return scraped data from a /scrape API call', async () => {

@ -9,15 +9,28 @@ const TEST_API_KEY = process.env.TEST_API_KEY;
const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";

describe('FirecrawlApp E2E Tests', () => {
test.concurrent('should throw error for no API key', async () => {
expect(() => {
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
}).toThrow("No API key provided");
test.concurrent('should throw error for no API key only for cloud service', async () => {
if (API_URL.includes('api.firecrawl.dev')) {
// Should throw for cloud service
expect(() => {
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
}).toThrow("No API key provided");
} else {
// Should not throw for self-hosted
expect(() => {
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
}).not.toThrow();
}
});

test.concurrent('should throw error for invalid API key on scrape', async () => {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
if (API_URL.includes('api.firecrawl.dev')) {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 404");
} else {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
}
});

test.concurrent('should throw error for blocklisted URL on scrape', async () => {
@ -155,14 +168,13 @@ describe('FirecrawlApp E2E Tests', () => {
}, 30000); // 30 seconds timeout

test.concurrent('should throw error for invalid API key on crawl', async () => {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
});

test.concurrent('should throw error for blocklisted URL on crawl', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const blocklistedUrl = "https://twitter.com/fake-test";
await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
if (API_URL.includes('api.firecrawl.dev')) {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404");
} else {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
}
});

test.concurrent('should return successful response for crawl and wait for completion', async () => {
@ -337,8 +349,13 @@ describe('FirecrawlApp E2E Tests', () => {
}, 60000); // 60 seconds timeout

test.concurrent('should throw error for invalid API key on map', async () => {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
if (API_URL.includes('api.firecrawl.dev')) {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404");
} else {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
}
});

test.concurrent('should throw error for blocklisted URL on map', async () => {
@ -355,8 +372,7 @@ describe('FirecrawlApp E2E Tests', () => {
}, 30000); // 30 seconds timeout

test.concurrent('should return successful response for valid map', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
expect(response).not.toBeNull();

expect(response.links?.length).toBeGreaterThan(0);

@ -183,6 +183,7 @@ export interface BatchScrapeResponse {
url?: string;
success: true;
error?: string;
invalidURLs?: string[];
}

/**
@ -242,10 +243,11 @@ export interface MapResponse {
* Defines options for extracting information from URLs.
*/
export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
prompt: string;
prompt?: string;
schema?: LLMSchema;
systemPrompt?: string;
allowExternalLinks?: boolean;
includeSubdomains?: boolean;
}

/**
@ -288,17 +290,23 @@ export default class FirecrawlApp {
public apiKey: string;
public apiUrl: string;

private isCloudService(url: string): boolean {
return url.includes('api.firecrawl.dev');
}

/**
* Initializes a new instance of the FirecrawlApp class.
* @param config - Configuration options for the FirecrawlApp instance.
*/
constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
if (typeof apiKey !== "string") {
const baseUrl = apiUrl || "https://api.firecrawl.dev";

if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
throw new FirecrawlError("No API key provided", 401);
}

this.apiKey = apiKey;
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
this.apiKey = apiKey || '';
this.apiUrl = baseUrl;
}
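Note on the constructor change above: the JS SDK now only insists on an API key when the configured apiUrl points at the cloud service. A hedged usage sketch (the package name @mendable/firecrawl-js, the placeholder key, and the localhost URL are assumptions, not part of the diff):

```ts
import FirecrawlApp from "@mendable/firecrawl-js"; // assumed package name

// Cloud service: an API key is still required.
const cloud = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

// Self-hosted instance: no API key needed, since apiUrl does not include api.firecrawl.dev.
const selfHosted = new FirecrawlApp({ apiKey: null, apiUrl: "http://localhost:3002" });
```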

/**
@ -576,9 +584,10 @@ export default class FirecrawlApp {
pollInterval: number = 2,
idempotencyKey?: string,
webhook?: CrawlParams["webhook"],
ignoreInvalidURLs?: boolean,
): Promise<BatchScrapeStatusResponse | ErrorResponse> {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { urls, ...params };
let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params };
if (jsonData?.extract?.schema) {
let schema = jsonData.extract.schema;

@ -621,10 +630,12 @@ export default class FirecrawlApp {
async asyncBatchScrapeUrls(
urls: string[],
params?: ScrapeParams,
idempotencyKey?: string
idempotencyKey?: string,
webhook?: CrawlParams["webhook"],
ignoreInvalidURLs?: boolean,
): Promise<BatchScrapeResponse | ErrorResponse> {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { urls, ...(params ?? {}) };
let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...(params ?? {}) };
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/batch/scrape`,
@ -657,8 +668,10 @@ export default class FirecrawlApp {
urls: string[],
params?: ScrapeParams,
idempotencyKey?: string,
webhook?: CrawlParams["webhook"],
ignoreInvalidURLs?: boolean,
) {
const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);

if (crawl.success && crawl.id) {
const id = crawl.id;
@ -932,9 +945,11 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
private ws: WebSocket;
public data: FirecrawlDocument<undefined>[];
public status: CrawlStatusResponse["status"];
public id: string;

constructor(id: string, app: FirecrawlApp) {
super();
this.id = id;
this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
this.status = "scraping";
this.data = [];
@ -965,6 +980,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
detail: {
status: this.status,
data: this.data,
id: this.id,
},
}));
} else if (msg.type === "error") {
@ -974,6 +990,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
status: this.status,
data: this.data,
error: msg.error,
id: this.id,
},
}));
} else if (msg.type === "catchup") {
@ -981,12 +998,18 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
this.data.push(...(msg.data.data ?? []));
for (const doc of this.data) {
this.dispatchTypedEvent("document", new CustomEvent("document", {
detail: doc,
detail: {
...doc,
id: this.id,
},
}));
}
} else if (msg.type === "document") {
this.dispatchTypedEvent("document", new CustomEvent("document", {
detail: msg.data,
detail: {
...msg.data,
id: this.id,
},
}));
}
}
@ -996,14 +1019,21 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
this.ws.close();
return;
}

const msg = JSON.parse(ev.data) as Message;
messageHandler(msg);
try {
const msg = JSON.parse(ev.data) as Message;
messageHandler(msg);
} catch (error) {
console.error("Error on message", error);
}
}).bind(this);

this.ws.onclose = ((ev: CloseEvent) => {
const msg = JSON.parse(ev.reason) as Message;
messageHandler(msg);
try {
const msg = JSON.parse(ev.reason) as Message;
messageHandler(msg);
} catch (error) {
console.error("Error on close", error);
}
}).bind(this);

this.ws.onerror = ((_: Event) => {
@ -1013,6 +1043,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
status: this.status,
data: this.data,
error: "WebSocket error",
id: this.id,
},
}));
}).bind(this);
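Note on the batch-scrape changes above: the methods now accept webhook and ignoreInvalidURLs as trailing positional arguments, and BatchScrapeResponse gains invalidURLs. A hedged sketch of calling asyncBatchScrapeUrls with the new arguments (the package name, the webhook object shape, and the example URLs are assumptions):

```ts
import FirecrawlApp from "@mendable/firecrawl-js"; // assumed package name

async function startBatch(app: FirecrawlApp) {
  const res = await app.asyncBatchScrapeUrls(
    ["https://firecrawl.dev", "not a valid url"],
    { formats: ["markdown"] },                        // ScrapeParams
    undefined,                                        // idempotencyKey
    { url: "https://example.com/firecrawl-webhook" }, // webhook (assumed shape)
    true,                                             // ignoreInvalidURLs
  );
  if (res.success) {
    // invalidURLs lists entries the API skipped when ignoreInvalidURLs is true.
    console.log(res.id, res.invalidURLs);
  }
}
```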
@ -196,7 +196,7 @@ app.post('/scrape', async (req: Request, res: Response) => {
}
}

const pageError = pageStatusCode !== 200 ? getError(pageStatusCode) : false;
const pageError = pageStatusCode !== 200 ? getError(pageStatusCode) : undefined;

if (!pageError) {
console.log(`✅ Scrape successful!`);
@ -209,7 +209,7 @@ app.post('/scrape', async (req: Request, res: Response) => {
res.json({
content: pageContent,
pageStatusCode,
pageError
...(pageError && { pageError })
});
});


@ -13,7 +13,7 @@ import os

from .firecrawl import FirecrawlApp # noqa

__version__ = "1.6.4"
__version__ = "1.6.8"

# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

@ -29,12 +29,12 @@ def test_scrape_url_invalid_api_key():
invalid_app.scrape_url('https://firecrawl.dev')
assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

def test_blocklisted_url():
blocklisted_url = "https://facebook.com/fake-test"
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
with pytest.raises(Exception) as excinfo:
app.scrape_url(blocklisted_url)
assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
# def test_blocklisted_url():
# blocklisted_url = "https://facebook.com/fake-test"
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
# with pytest.raises(Exception) as excinfo:
# app.scrape_url(blocklisted_url)
# assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)

def test_successful_response_with_valid_preview_token():
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0')
@ -90,12 +90,12 @@ def test_crawl_url_invalid_api_key():
invalid_app.crawl_url('https://firecrawl.dev')
assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

def test_should_return_error_for_blocklisted_url():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
blocklisted_url = "https://twitter.com/fake-test"
with pytest.raises(Exception) as excinfo:
app.crawl_url(blocklisted_url)
assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
# def test_should_return_error_for_blocklisted_url():
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
# blocklisted_url = "https://twitter.com/fake-test"
# with pytest.raises(Exception) as excinfo:
# app.crawl_url(blocklisted_url)
# assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)

def test_crawl_url_wait_for_completion_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')

@ -30,12 +30,12 @@ def test_scrape_url_invalid_api_key():
invalid_app.scrape_url('https://firecrawl.dev')
assert "Unauthorized: Invalid token" in str(excinfo.value)

def test_blocklisted_url():
blocklisted_url = "https://facebook.com/fake-test"
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
with pytest.raises(Exception) as excinfo:
app.scrape_url(blocklisted_url)
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
# def test_blocklisted_url():
# blocklisted_url = "https://facebook.com/fake-test"
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
# with pytest.raises(Exception) as excinfo:
# app.scrape_url(blocklisted_url)
# assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)

def test_successful_response_with_valid_preview_token():
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
@ -136,12 +136,12 @@ def test_crawl_url_invalid_api_key():
invalid_app.crawl_url('https://firecrawl.dev')
assert "Unauthorized: Invalid token" in str(excinfo.value)

def test_should_return_error_for_blocklisted_url():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
blocklisted_url = "https://twitter.com/fake-test"
with pytest.raises(Exception) as excinfo:
app.crawl_url(blocklisted_url)
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
# def test_should_return_error_for_blocklisted_url():
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
# blocklisted_url = "https://twitter.com/fake-test"
# with pytest.raises(Exception) as excinfo:
# app.crawl_url(blocklisted_url)
# assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)

def test_crawl_url_wait_for_completion_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@ -296,12 +296,12 @@ def test_invalid_api_key_on_map():
invalid_app.map_url('https://roastmywebsite.ai')
assert "Unauthorized: Invalid token" in str(excinfo.value)

def test_blocklisted_url_on_map():
app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
blocklisted_url = "https://facebook.com/fake-test"
with pytest.raises(Exception) as excinfo:
app.map_url(blocklisted_url)
assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
# def test_blocklisted_url_on_map():
# app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
# blocklisted_url = "https://facebook.com/fake-test"
# with pytest.raises(Exception) as excinfo:
# app.map_url(blocklisted_url)
# assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)

def test_successful_response_with_valid_preview_token_on_map():
app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL)

@ -26,7 +26,7 @@ class FirecrawlApp:
"""
Parameters for the extract operation.
"""
prompt: str
prompt: Optional[str] = None
schema_: Optional[Any] = pydantic.Field(None, alias='schema')
system_prompt: Optional[str] = None
allow_external_links: Optional[bool] = False
@ -704,15 +704,15 @@ class CrawlWatcher:
async def _handle_message(self, msg: Dict[str, Any]):
if msg['type'] == 'done':
self.status = 'completed'
self.dispatch_event('done', {'status': self.status, 'data': self.data})
self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
elif msg['type'] == 'error':
self.status = 'failed'
self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error']})
self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
elif msg['type'] == 'catchup':
self.status = msg['data']['status']
self.data.extend(msg['data'].get('data', []))
for doc in self.data:
self.dispatch_event('document', doc)
self.dispatch_event('document', {'data': doc, 'id': self.id})
elif msg['type'] == 'document':
self.data.append(msg['data'])
self.dispatch_event('document', msg['data'])
self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
@ -5,20 +5,20 @@ use firecrawl::FirecrawlApp;
use serde_json::json;
use std::env;

#[tokio::test]
async fn test_blocklisted_url() {
dotenv().ok();
let api_url = env::var("API_URL").unwrap();
let api_key = env::var("TEST_API_KEY").ok();
let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
let blocklisted_url = "https://facebook.com/fake-test";
let result = app.scrape_url(blocklisted_url, None).await;
// #[tokio::test]
// async fn test_blocklisted_url() {
// dotenv().ok();
// let api_url = env::var("API_URL").unwrap();
// let api_key = env::var("TEST_API_KEY").ok();
// let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
// let blocklisted_url = "https://facebook.com/fake-test";
// let result = app.scrape_url(blocklisted_url, None).await;

assert_matches!(
result,
Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions")
);
}
// assert_matches!(
// result,
// Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions")
// );
// }

#[tokio::test]
async fn test_successful_response_with_valid_preview_token() {
@ -103,20 +103,21 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explici
.contains("We present spectrophotometric observations of the Broad Line Radio Galaxy"));
}

#[tokio::test]
async fn test_should_return_error_for_blocklisted_url() {
dotenv().ok();
let api_url = env::var("API_URL").unwrap();
let api_key = env::var("TEST_API_KEY").ok();
let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
let blocklisted_url = "https://twitter.com/fake-test";
let result = app.crawl_url(blocklisted_url, None).await;

assert_matches!(
result,
Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions.")
);
}
// #[tokio::test]
// async fn test_should_return_error_for_blocklisted_url() {
// dotenv().ok();
// let api_url = env::var("API_URL").unwrap();
// let api_key = env::var("TEST_API_KEY").ok();
// let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
// let blocklisted_url = "https://twitter.com/fake-test";
// let result = app.crawl_url(blocklisted_url, None).await;

// assert_matches!(
// result,
// Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions.")
// );
// }

#[tokio::test]
async fn test_llm_extraction() {
147
examples/o1_web_extractor/o1_web_extractor.py
Normal file
147
examples/o1_web_extractor/o1_web_extractor.py
Normal file
@ -0,0 +1,147 @@
import os
import json
import requests
from dotenv import load_dotenv
from openai import OpenAI
from serpapi import GoogleSearch

# ANSI color codes
class Colors:
CYAN = '\033[96m'
YELLOW = '\033[93m'
GREEN = '\033[92m'
RED = '\033[91m'
MAGENTA = '\033[95m'
BLUE = '\033[94m'
RESET = '\033[0m'

# Load environment variables
load_dotenv()

# Initialize clients
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")

def search_google(query):
"""Search Google using SerpAPI and return top results."""
print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}")
search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
return search.get_dict().get("organic_results", [])

def select_urls_with_o1(company, objective, serp_results):
"""
Use O1 to select the most relevant URLs from SERP results for the given company and objective.
Returns a JSON object with a "selected_urls" property that is an array of strings.
"""
try:
# Prepare the data for O1
serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")}
for r in serp_results if r.get("link")]

response = client.chat.completions.create(
model="o1-2024-12-17",
messages=[
{
"role": "developer",
"content": "You select URLs from the SERP results relevant to the company and objective."
},
{
"role": "user",
"content": (
f"Company: {company}\n"
f"Objective: {objective}\n"
f"SERP Results: {json.dumps(serp_data)}\n\n"
"Return a JSON object with a property 'selected_urls' that contains an array "
"of URLs most likely to help meet the objective. Add a /* to the end of the URL if you think it should search all of the pages in the site. Do not return any social media links. For example: {\"selected_urls\": [\"https://example.com\", \"https://example2.com\"]}"
)
}
],
response_format={
"type": "json_schema",
"json_schema": {
"name": "selected_urls_object",
"schema": {
"type": "object",
"properties": {
"selected_urls": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": ["selected_urls"],
"additionalProperties": False
}
}
}
)

# The response is guaranteed to follow the specified JSON schema
result = json.loads(response.choices[0].message.content)
urls = result.get("selected_urls", [])
return urls

except Exception as e:
print(f"{Colors.RED}Error selecting URLs with O1: {e}{Colors.RESET}")
return []



def extract_company_info(urls, prompt, company, api_key):
"""Use requests to call Firecrawl's extract endpoint with selected URLs."""
print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}")

headers = {
'Content-Type': 'application/json',
'Authorization': f'Bearer {api_key}'
}

payload = {
"urls": urls,
"prompt": prompt + " for " + company
}

try:
response = requests.post(
"https://api.firecrawl.dev/v1/extract",
headers=headers,
json=payload
)
response.raise_for_status()
data = response.json()
return data
except Exception as e:
print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}")
return None

def main():
company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}")
objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}")

serp_results = search_google(f"{company}")
if not serp_results:
print(f"{Colors.RED}No search results found.{Colors.RESET}")
return

# Ask O1 to select URLs
selected_urls = select_urls_with_o1(company, objective, serp_results)

if not selected_urls:
print(f"{Colors.RED}O1 did not return any URLs.{Colors.RESET}")
return

print(f"{Colors.CYAN}Selected URLs for extraction by O1:{Colors.RESET}")
for url in selected_urls:
print(f"- {url}")

data = extract_company_info(selected_urls, objective, company, firecrawl_api_key)

if data and data.get('success') and data.get('data'):
print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}")
print(json.dumps(data['data'], indent=2))
else:
print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}")

if __name__ == "__main__":
main()