diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 773454e5..ce82236d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -111,6 +111,20 @@ curl -X POST http://localhost:3002/v1/crawl \ }' ``` +### Alternative: Using Docker Compose + +For a simpler setup, you can use Docker Compose to run all services: + +1. Prerequisites: Make sure you have Docker and Docker Compose installed +2. Copy the `.env.example` file to `.env` in the `/apps/api/` directory and configure as needed +3. From the root directory, run: + +```bash +docker compose up +``` + +This will start Redis, the API server, and workers automatically in the correct configuration. + ## Tests: The best way to do this is run the test with `npm run test:local-no-auth` if you'd like to run the tests without authentication. diff --git a/apps/api/.gitignore b/apps/api/.gitignore index d9639687..52345155 100644 --- a/apps/api/.gitignore +++ b/apps/api/.gitignore @@ -9,3 +9,5 @@ dump.rdb .rdb .sentryclirc + +.env.* \ No newline at end of file diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 6d971708..569eafd9 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -7478,7 +7478,7 @@ snapshots: extract-zip@2.0.1: dependencies: - debug: 4.3.4 + debug: 4.3.5 get-stream: 5.2.0 yauzl: 2.10.0 optionalDependencies: @@ -7622,7 +7622,7 @@ snapshots: dependencies: basic-ftp: 5.0.5 data-uri-to-buffer: 6.0.2 - debug: 4.3.4 + debug: 4.3.5 fs-extra: 11.2.0 transitivePeerDependencies: - supports-color @@ -7723,7 +7723,7 @@ snapshots: http-proxy-agent@7.0.2: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 transitivePeerDependencies: - supports-color @@ -7771,7 +7771,7 @@ snapshots: https-proxy-agent@7.0.5: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 transitivePeerDependencies: - supports-color @@ -8836,7 +8836,7 @@ snapshots: dependencies: '@tootallnate/quickjs-emscripten': 0.23.0 agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 get-uri: 6.0.3 http-proxy-agent: 7.0.2 https-proxy-agent: 7.0.5 @@ -9031,7 +9031,7 @@ snapshots: proxy-agent@6.4.0: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 http-proxy-agent: 7.0.2 https-proxy-agent: 7.0.5 lru-cache: 7.18.3 @@ -9338,7 +9338,7 @@ snapshots: socks-proxy-agent@8.0.4: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 socks: 2.8.3 transitivePeerDependencies: - supports-color diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts index 45b3c31e..40686c45 100644 --- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts @@ -1,6 +1,7 @@ import request from "supertest"; import dotenv from "dotenv"; import { v4 as uuidv4 } from "uuid"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; dotenv.config(); @@ -58,9 +59,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it.", - ); + expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE); }); // tested on rate limit test @@ -480,9 +479,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - ); + expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE); }); it.concurrent( diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index e30352a5..9d5dc554 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -1,5 +1,6 @@ import request from "supertest"; import dotenv from "dotenv"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; const fs = require("fs"); const path = require("path"); @@ -61,9 +62,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - ); + expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE); }); it("should return a successful response", async () => { @@ -88,9 +87,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - ); + expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE); }); it("should return a successful response", async () => { @@ -119,9 +116,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - ); + expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE); }); it("should return a successful response", async () => { diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 35ee2d89..39e0aa85 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -4,6 +4,7 @@ import { ScrapeRequestInput, ScrapeResponseRequestTest, } from "../../controllers/v1/types"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; configDotenv(); const TEST_URL = "http://127.0.0.1:3002"; @@ -57,9 +58,7 @@ describe("E2E Tests for v1 API Routes", () => { .send(scrapeRequest); expect(response.statusCode).toBe(403); - expect(response.body.error).toBe( - "URL is blocked. 
Firecrawl currently does not support social media scraping due to policy restrictions.", - ); + expect(response.body.error).toBe(BLOCKLISTED_URL_MESSAGE); }); it.concurrent( @@ -756,9 +755,7 @@ describe("E2E Tests for v1 API Routes", () => { .send(scrapeRequest); expect(response.statusCode).toBe(403); - expect(response.body.error).toBe( - "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.", - ); + expect(response.body.error).toBe(BLOCKLISTED_URL_MESSAGE); }); it.concurrent( diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index f865984a..d344625d 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -351,6 +351,7 @@ function getPlanByPriceId(price_id: string | null): PlanType { case process.env.STRIPE_PRICE_ID_ETIER1A_MONTHLY: //ocqh return "etier1a"; case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_MONTHLY: + case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_YEARLY: return "etierscale1"; default: return "free"; diff --git a/apps/api/src/controllers/v0/admin/check-fire-engine.ts b/apps/api/src/controllers/v0/admin/check-fire-engine.ts new file mode 100644 index 00000000..0671f7a9 --- /dev/null +++ b/apps/api/src/controllers/v0/admin/check-fire-engine.ts @@ -0,0 +1,64 @@ +import { logger } from "../../../lib/logger"; +import * as Sentry from "@sentry/node"; +import { Request, Response } from "express"; + +export async function checkFireEngine(req: Request, res: Response) { + try { + if (!process.env.FIRE_ENGINE_BETA_URL) { + logger.warn("Fire engine beta URL not configured"); + return res.status(500).json({ + success: false, + error: "Fire engine beta URL not configured", + }); + } + + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 30000); + + try { + const response = await fetch( + `${process.env.FIRE_ENGINE_BETA_URL}/scrape`, + { + method: "POST", + headers: { + "Content-Type": "application/json", + "X-Disable-Cache": "true", + }, + body: JSON.stringify({ + url: "https://example.com", + }), + signal: controller.signal, + }, + ); + + clearTimeout(timeout); + + if (response.ok) { + const responseData = await response.json(); + return res.status(200).json({ + data: responseData, + }); + } else { + return res.status(response.status).json({ + success: false, + error: `Fire engine returned status ${response.status}`, + }); + } + } catch (error) { + if (error.name === "AbortError") { + return res.status(504).json({ + success: false, + error: "Request timed out after 30 seconds", + }); + } + throw error; + } + } catch (error) { + logger.error(error); + Sentry.captureException(error); + return res.status(500).json({ + success: false, + error: "Internal server error", + }); + } +} diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index 36b8309f..ceeaa436 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -29,6 +29,7 @@ import * as Sentry from "@sentry/node"; import { getJobPriority } from "../../lib/job-priority"; import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types"; import { ZodError } from "zod"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; export async function crawlController(req: Request, res: Response) { try { @@ -112,8 +113,7 @@ export async function crawlController(req: Request, res: Response) { if (isUrlBlocked(url)) { return res.status(403).json({ - error: - "Firecrawl currently does not support social media 
scraping due to policy restrictions. We're actively working on building support for it.", + error: BLOCKLISTED_URL_MESSAGE, }); } diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index 405e49c2..f9462c3d 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -15,6 +15,7 @@ import { addScrapeJob } from "../../../src/services/queue-jobs"; import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import * as Sentry from "@sentry/node"; import { fromLegacyScrapeOptions } from "../v1/types"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; export async function crawlPreviewController(req: Request, res: Response) { try { @@ -42,8 +43,7 @@ export async function crawlPreviewController(req: Request, res: Response) { if (isUrlBlocked(url)) { return res.status(403).json({ - error: - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", + error: BLOCKLISTED_URL_MESSAGE, }); } diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 8501e502..05bf364b 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -8,7 +8,6 @@ import { authenticateUser } from "../auth"; import { PlanType, RateLimiterMode } from "../../types"; import { logJob } from "../../services/logging/log_job"; import { - Document, fromLegacyCombo, toLegacyDocument, url as urlSchema, @@ -29,6 +28,8 @@ import * as Sentry from "@sentry/node"; import { getJobPriority } from "../../lib/job-priority"; import { fromLegacyScrapeOptions } from "../v1/types"; import { ZodError } from "zod"; +import { Document as V0Document } from "./../../lib/entities"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; export async function scrapeHelper( jobId: string, @@ -42,7 +43,7 @@ export async function scrapeHelper( ): Promise<{ success: boolean; error?: string; - data?: Document | { url: string }; + data?: V0Document | { url: string }; returnCode: number; }> { const url = urlSchema.parse(req.body.url); @@ -53,8 +54,7 @@ export async function scrapeHelper( if (isUrlBlocked(url)) { return { success: false, - error: - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", + error: BLOCKLISTED_URL_MESSAGE, returnCode: 403, }; } @@ -241,9 +241,9 @@ export async function scrapeController(req: Request, res: Response) { const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; const numTokens = - result.data && (result.data as Document).markdown + result.data && (result.data as V0Document).markdown ? 
numTokensFromString( - (result.data as Document).markdown!, + (result.data as V0Document).markdown!, "gpt-3.5-turbo", ) : 0; @@ -265,25 +265,28 @@ export async function scrapeController(req: Request, res: Response) { } if (creditsToBeBilled > 0) { // billing for doc done on queue end, bill only for llm extraction - billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch((error) => { - logger.error( - `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`, - ); - // Optionally, you could notify an admin or add to a retry queue here - }); + billTeam(team_id, chunk?.sub_id, creditsToBeBilled, logger).catch( + (error) => { + logger.error( + `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`, + { error }, + ); + // Optionally, you could notify an admin or add to a retry queue here + }, + ); } } let doc = result.data; if (!pageOptions || !pageOptions.includeRawHtml) { - if (doc && (doc as Document).rawHtml) { - delete (doc as Document).rawHtml; + if (doc && (doc as V0Document).rawHtml) { + delete (doc as V0Document).rawHtml; } } if (pageOptions && pageOptions.includeExtract) { - if (!pageOptions.includeMarkdown && doc && (doc as Document).markdown) { - delete (doc as Document).markdown; + if (!pageOptions.includeMarkdown && doc && (doc as V0Document).markdown) { + delete (doc as V0Document).markdown; } } @@ -312,7 +315,7 @@ export async function scrapeController(req: Request, res: Response) { return res.status(result.returnCode).json(result); } catch (error) { Sentry.captureException(error); - logger.error(error); + logger.error("Scrape error occcurred", { error }); return res.status(500).json({ error: error instanceof ZodError diff --git a/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts index b455e5ab..afa44e58 100644 --- a/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts +++ b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts @@ -1,4 +1,5 @@ import { url } from "../types"; +import { BLOCKLISTED_URL_MESSAGE } from "../../../lib/strings"; describe("URL Schema Validation", () => { beforeEach(() => { @@ -31,7 +32,7 @@ describe("URL Schema Validation", () => { it("should reject blocked URLs", () => { expect(() => url.parse("https://facebook.com")).toThrow( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", + BLOCKLISTED_URL_MESSAGE, ); }); @@ -47,16 +48,16 @@ describe("URL Schema Validation", () => { it("should handle URLs with subdomains that are blocked", () => { expect(() => url.parse("https://sub.facebook.com")).toThrow( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", + BLOCKLISTED_URL_MESSAGE, ); }); it("should handle URLs with paths that are blocked", () => { expect(() => url.parse("http://facebook.com/path")).toThrow( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", + BLOCKLISTED_URL_MESSAGE, ); expect(() => url.parse("https://facebook.com/another/path")).toThrow( - "Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it.", + BLOCKLISTED_URL_MESSAGE, ); }); diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index 89fa6741..19ce3ba0 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -3,9 +3,11 @@ import { v4 as uuidv4 } from "uuid"; import { BatchScrapeRequest, batchScrapeRequestSchema, - CrawlResponse, + batchScrapeRequestSchemaNoURLValidation, + url as urlSchema, RequestWithAuth, ScrapeOptions, + BatchScrapeResponse, } from "./types"; import { addCrawlJobs, @@ -21,10 +23,14 @@ import { callWebhook } from "../../services/webhook"; import { logger as _logger } from "../../lib/logger"; export async function batchScrapeController( - req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>, - res: Response, + req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>, + res: Response, ) { - req.body = batchScrapeRequestSchema.parse(req.body); + if (req.body?.ignoreInvalidURLs === true) { + req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body); + } else { + req.body = batchScrapeRequestSchema.parse(req.body); + } const id = req.body.appendToId ?? uuidv4(); const logger = _logger.child({ @@ -35,8 +41,27 @@ export async function batchScrapeController( teamId: req.auth.team_id, plan: req.auth.plan, }); + + let urls = req.body.urls; + let invalidURLs: string[] | undefined = undefined; + + if (req.body.ignoreInvalidURLs) { + invalidURLs = []; + + let pendingURLs = urls; + urls = []; + for (const u of pendingURLs) { + try { + const nu = urlSchema.parse(u); + urls.push(nu); + } catch (_) { + invalidURLs.push(u); + } + } + } + logger.debug("Batch scrape " + id + " starting", { - urlsLength: req.body.urls, + urlsLength: urls, appendToId: req.body.appendToId, account: req.account, }); @@ -70,7 +95,7 @@ export async function batchScrapeController( // If it is over 1000, we need to get the job priority, // otherwise we can use the default priority of 20 - if (req.body.urls.length > 1000) { + if (urls.length > 1000) { // set base to 21 jobPriority = await getJobPriority({ plan: req.auth.plan, @@ -84,7 +109,7 @@ export async function batchScrapeController( delete (scrapeOptions as any).urls; delete (scrapeOptions as any).appendToId; - const jobs = req.body.urls.map((x) => { + const jobs = urls.map((x) => { return { data: { url: x, @@ -140,5 +165,6 @@ export async function batchScrapeController( success: true, id, url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`, + invalidURLs, }); } diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index 1fb470f9..c2e3369f 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -18,7 +18,7 @@ import { } from "../../lib/crawl-redis"; import { logCrawl } from "../../services/logging/crawl_log"; import { getScrapeQueue } from "../../services/queue-service"; -import { addScrapeJob } from "../../services/queue-jobs"; +import { addScrapeJob, addScrapeJobs } from "../../services/queue-jobs"; import { logger as _logger } from "../../lib/logger"; import { getJobPriority } from "../../lib/job-priority"; import { callWebhook } from "../../services/webhook"; @@ -139,9 +139,9 @@ export async function crawlController( name: uuid, data: { url, - mode: "single_urls", + mode: "single_urls" as const, team_id: req.auth.team_id, - plan: req.auth.plan, + plan: req.auth.plan!, crawlerOptions, scrapeOptions, internalOptions: 
sc.internalOptions, @@ -170,7 +170,7 @@ export async function crawlController( jobs.map((x) => x.opts.jobId), ); logger.debug("Adding scrape jobs to BullMQ..."); - await getScrapeQueue().addBulk(jobs); + await addScrapeJobs(jobs); } else { logger.debug("Sitemap not found or ignored.", { ignoreSitemap: sc.crawlerOptions.ignoreSitemap, diff --git a/apps/api/src/controllers/v1/credit-usage.ts b/apps/api/src/controllers/v1/credit-usage.ts new file mode 100644 index 00000000..da522c13 --- /dev/null +++ b/apps/api/src/controllers/v1/credit-usage.ts @@ -0,0 +1,45 @@ +import { Request, Response } from "express"; +import { RequestWithAuth } from "./types"; +import { getACUC } from "../auth"; +import { logger } from "../../lib/logger"; + +export async function creditUsageController( + req: RequestWithAuth, + res: Response, +): Promise { + try { + // If we already have the credit usage info from auth, use it + if (req.acuc) { + res.json({ + success: true, + data: { + remaining_credits: req.acuc.remaining_credits, + }, + }); + return; + } + + // Otherwise fetch fresh data + const chunk = await getACUC(req.auth.team_id); + if (!chunk) { + res.status(404).json({ + success: false, + error: "Could not find credit usage information", + }); + return; + } + + res.json({ + success: true, + data: { + remaining_credits: chunk.remaining_credits, + }, + }); + } catch (error) { + logger.error("Error in credit usage controller:", error); + res.status(500).json({ + success: false, + error: "Internal server error while fetching credit usage", + }); + } +} diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index 0c286253..58e75751 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -1,6 +1,6 @@ import { Request, Response } from "express"; import { - // Document, + Document, RequestWithAuth, ExtractRequest, extractRequestSchema, @@ -8,7 +8,7 @@ import { MapDocument, scrapeOptions, } from "./types"; -import { Document } from "../../lib/entities"; +// import { Document } from "../../lib/entities"; import Redis from "ioredis"; import { configDotenv } from "dotenv"; import { performRanking } from "../../lib/ranker"; @@ -24,6 +24,9 @@ import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { getMapResults } from "./map"; import { buildDocument } from "../../lib/extract/build-document"; +import { generateBasicCompletion } from "../../lib/LLM-extraction"; +import { buildRefrasedPrompt } from "../../lib/extract/build-prompts"; +import { removeDuplicateUrls } from "../../lib/validateUrl"; configDotenv(); const redis = new Redis(process.env.REDIS_URL!); @@ -61,30 +64,50 @@ export async function extractController( const baseUrl = url.replace("/*", ""); // const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any - const allowExternalLinks = req.body.allowExternalLinks ?? true; + const allowExternalLinks = req.body.allowExternalLinks; let urlWithoutWww = baseUrl.replace("www.", ""); - let mapUrl = - req.body.prompt && allowExternalLinks - ? `${req.body.prompt} ${urlWithoutWww}` - : req.body.prompt - ? `${req.body.prompt} site:${urlWithoutWww}` - : `site:${urlWithoutWww}`; + + let rephrasedPrompt = req.body.prompt; + if (req.body.prompt) { + rephrasedPrompt = + (await generateBasicCompletion( + buildRefrasedPrompt(req.body.prompt, baseUrl), + )) ?? 
req.body.prompt; + } const mapResults = await getMapResults({ url: baseUrl, - search: req.body.prompt, + search: rephrasedPrompt, teamId: req.auth.team_id, plan: req.auth.plan, allowExternalLinks, origin: req.body.origin, limit: req.body.limit, // If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping - ignoreSitemap: !selfHosted ? true : false, + ignoreSitemap: false, includeMetadata: true, includeSubdomains: req.body.includeSubdomains, }); - let mappedLinks = mapResults.links as MapDocument[]; + let mappedLinks = mapResults.mapResults as MapDocument[]; + + // Remove duplicates between mapResults.links and mappedLinks + const allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; + const uniqueUrls = removeDuplicateUrls(allUrls); + + // Only add URLs from mapResults.links that aren't already in mappedLinks + const existingUrls = new Set(mappedLinks.map((m) => m.url)); + const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url)); + + mappedLinks = [ + ...mappedLinks, + ...newUrls.map((url) => ({ url, title: "", description: "" })), + ]; + + if (mappedLinks.length === 0) { + mappedLinks = [{ url: baseUrl, title: "", description: "" }]; + } + // Limit number of links to MAX_EXTRACT_LIMIT mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT); @@ -93,18 +116,18 @@ export async function extractController( `url: ${x.url}, title: ${x.title}, description: ${x.description}`, ); - // Filter by path prefix if present - // wrong - // if (pathPrefix) { - // mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`)); - // } - if (req.body.prompt) { + let searchQuery = + req.body.prompt && allowExternalLinks + ? `${req.body.prompt} ${urlWithoutWww}` + : req.body.prompt + ? `${req.body.prompt} site:${urlWithoutWww}` + : `site:${urlWithoutWww}`; // Get similarity scores between the search query and each link's context const linksAndScores = await performRanking( mappedLinksRerank, mappedLinks.map((l) => l.url), - mapUrl, + searchQuery, ); // First try with high threshold @@ -158,7 +181,8 @@ export async function extractController( // Wait for all URL processing to complete and flatten results const processedUrls = await Promise.all(urlPromises); - links.push(...processedUrls.flat()); + const flattenedUrls = processedUrls.flat().filter((url) => url); // Filter out any null/undefined values + links.push(...flattenedUrls); if (links.length === 0) { return res.status(400).json({ @@ -204,21 +228,8 @@ export async function extractController( } return doc; } catch (e) { - logger.error(`Error in scrapeController: ${e}`); - if ( - e instanceof Error && - (e.message.startsWith("Job wait") || e.message === "timeout") - ) { - throw { - status: 408, - error: "Request timed out", - }; - } else { - throw { - status: 500, - error: `(Internal server error) - ${e && e.message ? e.message : e}`, - }; - } + logger.error(`Error in extractController: ${e}`); + return null; } }); @@ -237,7 +248,8 @@ export async function extractController( { mode: "llm", systemPrompt: - "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided. Here are the urls the user provided of which he wants to extract information from: " + + (req.body.systemPrompt ? `${req.body.systemPrompt}\n` : "") + + "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema always if provided. 
Here are the urls the user provided of which he wants to extract information from: " + links.join(", "), prompt: req.body.prompt, schema: req.body.schema, diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index cd302708..27a926fc 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -28,14 +28,15 @@ const redis = new Redis(process.env.REDIS_URL!); // Max Links that /map can return const MAX_MAP_LIMIT = 5000; // Max Links that "Smart /map" can return -const MAX_FIRE_ENGINE_RESULTS = 1000; +const MAX_FIRE_ENGINE_RESULTS = 500; interface MapResult { success: boolean; - links: string[] | any[]; + links: string[]; scrape_id?: string; job_id: string; time_taken: number; + mapResults: MapDocument[]; } export async function getMapResults({ @@ -215,7 +216,8 @@ export async function getMapResults({ return { success: true, - links: includeMetadata ? mapResults : linksToReturn, + links: linksToReturn, + mapResults: mapResults, scrape_id: origin?.includes("website") ? id : undefined, job_id: id, time_taken: (new Date().getTime() - Date.now()) / 1000, diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index ddd5da74..1ea28995 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -60,7 +60,11 @@ export async function scrapeController( try { doc = await waitForJob(jobId, timeout + totalWait); // TODO: better types for this } catch (e) { - logger.error(`Error in scrapeController: ${e}`); + logger.error(`Error in scrapeController: ${e}`, { + jobId, + scrapeId: jobId, + startTime, + }); if ( e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout") diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 076d8b0b..114c115e 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -11,6 +11,7 @@ import { Document as V0Document, } from "../../lib/entities"; import { InternalOptions } from "../../scraper/scrapeURL"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; export type Format = | "markdown" @@ -44,10 +45,7 @@ export const url = z.preprocess( return false; } }, "Invalid URL") - .refine( - (x) => !isUrlBlocked(x as string), - "Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it.", - ), + .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE), ); const strictMessage = @@ -182,6 +180,7 @@ export const scrapeOptions = z .optional(), skipTlsVerification: z.boolean().default(false), removeBase64Images: z.boolean().default(true), + fastMode: z.boolean().default(false), }) .strict(strictMessage); @@ -193,11 +192,12 @@ export const extractV1Options = z .array() .max(10, "Maximum of 10 URLs allowed per request while in beta."), prompt: z.string().optional(), + systemPrompt: z.string().optional(), schema: z.any().optional(), limit: z.number().int().positive().finite().safe().optional(), ignoreSitemap: z.boolean().default(false), includeSubdomains: z.boolean().default(true), - allowExternalLinks: z.boolean().default(true), + allowExternalLinks: z.boolean().default(false), origin: z.string().optional().default("api"), timeout: z.number().int().positive().finite().safe().default(60000), }) @@ -262,6 +262,31 @@ export const batchScrapeRequestSchema = scrapeOptions origin: z.string().optional().default("api"), webhook: webhookSchema.optional(), appendToId: z.string().uuid().optional(), + ignoreInvalidURLs: z.boolean().default(false), + }) + .strict(strictMessage) + .refine( + (obj) => { + const hasExtractFormat = obj.formats?.includes("extract"); + const hasExtractOptions = obj.extract !== undefined; + return ( + (hasExtractFormat && hasExtractOptions) || + (!hasExtractFormat && !hasExtractOptions) + ); + }, + { + message: + "When 'extract' format is specified, 'extract' options must be provided, and vice versa", + }, + ); + +export const batchScrapeRequestSchemaNoURLValidation = scrapeOptions + .extend({ + urls: z.string().array(), + origin: z.string().optional().default("api"), + webhook: webhookSchema.optional(), + appendToId: z.string().uuid().optional(), + ignoreInvalidURLs: z.boolean().default(false), }) .strict(strictMessage) .refine( @@ -396,7 +421,7 @@ export type Document = { articleSection?: string; url?: string; sourceURL?: string; - statusCode?: number; + statusCode: number; error?: string; [key: string]: string | string[] | number | undefined; }; @@ -446,6 +471,15 @@ export type CrawlResponse = url: string; }; +export type BatchScrapeResponse = + | ErrorResponse + | { + success: true; + id: string; + url: string; + invalidURLs?: string[]; + }; + export type MapResponse = | ErrorResponse | { @@ -651,11 +685,11 @@ export function fromLegacyScrapeOptions( } : undefined, mobile: pageOptions.mobile, + fastMode: pageOptions.useFastMode, }), internalOptions: { atsv: pageOptions.atsv, v0DisableJsDom: pageOptions.disableJsDom, - v0UseFastMode: pageOptions.useFastMode, }, // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks }; diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index de7017ea..3a98ffc9 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -62,3 +62,16 @@ export async function generateCompletions( return completions; } + +// generate basic completion + +export async function generateBasicCompletion(prompt: string) { + const openai = new OpenAI(); + const model = "gpt-4o"; + + const completion = await openai.chat.completions.create({ + model, + messages: [{ role: "user", content: prompt }], + }); + return completion.choices[0].message.content; +} diff --git a/apps/api/src/lib/cache.ts b/apps/api/src/lib/cache.ts index 7dcbf88b..cbab4e05 100644 --- 
a/apps/api/src/lib/cache.ts +++ b/apps/api/src/lib/cache.ts @@ -21,7 +21,7 @@ export function cacheKey( if ( internalOptions.v0CrawlOnlyUrls || internalOptions.forceEngine || - internalOptions.v0UseFastMode || + scrapeOptions.fastMode || internalOptions.atsv || (scrapeOptions.actions && scrapeOptions.actions.length > 0) ) { diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 6ccb9436..6ecb0b8f 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -60,6 +60,8 @@ export async function addCrawlJob(id: string, job_id: string) { } export async function addCrawlJobs(id: string, job_ids: string[]) { + if (job_ids.length === 0) return true; + _logger.debug("Adding crawl jobs to Redis...", { jobIds: job_ids, module: "crawl-redis", @@ -90,12 +92,20 @@ export async function addCrawlJobDone( if (success) { await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id); - await redisConnection.expire( + } else { + // in case it's already been pushed, make sure it's removed + await redisConnection.lrem( "crawl:" + id + ":jobs_done_ordered", - 24 * 60 * 60, - "NX", + -1, + job_id, ); } + + await redisConnection.expire( + "crawl:" + id + ":jobs_done_ordered", + 24 * 60 * 60, + "NX", + ); } export async function getDoneJobsOrderedLength(id: string): Promise { @@ -227,13 +237,6 @@ export async function lockURL( url = normalizeURL(url, sc); logger = logger.child({ url }); - await redisConnection.sadd("crawl:" + id + ":visited_unique", url); - await redisConnection.expire( - "crawl:" + id + ":visited_unique", - 24 * 60 * 60, - "NX", - ); - let res: boolean; if (!sc.crawlerOptions?.deduplicateSimilarURLs) { res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0; @@ -249,6 +252,15 @@ export async function lockURL( await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); + if (res) { + await redisConnection.sadd("crawl:" + id + ":visited_unique", url); + await redisConnection.expire( + "crawl:" + id + ":visited_unique", + 24 * 60 * 60, + "NX", + ); + } + logger.debug("Locking URL " + JSON.stringify(url) + "... result: " + res, { res, }); @@ -261,6 +273,8 @@ export async function lockURLs( sc: StoredCrawl, urls: string[], ): Promise { + if (urls.length === 0) return true; + urls = urls.map((url) => normalizeURL(url, sc)); const logger = _logger.child({ crawlId: id, diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts new file mode 100644 index 00000000..f554eadc --- /dev/null +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -0,0 +1,16 @@ +export function buildRefrasedPrompt(prompt: string, url: string): string { + return `You are a search query optimizer. Your task is to rephrase the following prompt into an effective search query that will find relevant results about this topic on ${url}. + +Original prompt: "${prompt}" + +Provide a rephrased search query that: +1. Maintains the core intent of the original prompt with ONLY the keywords +2. Uses relevant keywords +3. Is optimized for search engine results +4. Is concise and focused +5. Short is better than long +6. It is a search engine, not a chatbot +7. 
Concise + +Return only the rephrased search query, without any explanation or additional text.`; +} diff --git a/apps/api/src/lib/strings.ts b/apps/api/src/lib/strings.ts new file mode 100644 index 00000000..4e278d2b --- /dev/null +++ b/apps/api/src/lib/strings.ts @@ -0,0 +1,2 @@ +export const BLOCKLISTED_URL_MESSAGE = + "This website is no longer supported, please reach out to help@firecrawl.com for more info on how to activate it on your account."; diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index dc907371..0f3b8524 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -7,7 +7,7 @@ import { import { billTeam } from "../services/billing/credit_billing"; import { Document } from "../controllers/v1/types"; import { supabase_service } from "../services/supabase"; -import { logger } from "../lib/logger"; +import { logger as _logger } from "../lib/logger"; import { ScrapeEvents } from "../lib/scrape-events"; import { configDotenv } from "dotenv"; import { @@ -49,6 +49,7 @@ export async function startWebScraperPipeline({ bull_job_id: job.id.toString(), priority: job.opts.priority, is_scrape: job.data.is_scrape ?? false, + is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null), }); } @@ -63,56 +64,126 @@ export async function runWebScraper({ bull_job_id, priority, is_scrape = false, + is_crawl = false, }: RunWebScraperParams): Promise { + const logger = _logger.child({ + method: "runWebScraper", + module: "runWebscraper", + scrapeId: bull_job_id, + jobId: bull_job_id, + }); + const tries = is_crawl ? 3 : 1; + let response: ScrapeUrlResponse | undefined = undefined; let engines: EngineResultsTracker = {}; - try { - response = await scrapeURL(bull_job_id, url, scrapeOptions, { - priority, - ...internalOptions, - }); - if (!response.success) { - if (response.error instanceof Error) { - throw response.error; - } else { - throw new Error( - "scrapeURL error: " + - (Array.isArray(response.error) - ? JSON.stringify(response.error) - : typeof response.error === "object" - ? JSON.stringify({ ...response.error }) - : response.error), - ); - } + let error: any = undefined; + + for (let i = 0; i < tries; i++) { + if (i > 0) { + logger.debug("Retrying scrape...", { + tries, + i, + previousStatusCode: (response as any)?.document?.metadata?.statusCode, + previousError: error, + }); } + response = undefined; + engines = {}; + error = undefined; + + try { + response = await scrapeURL(bull_job_id, url, scrapeOptions, { + priority, + ...internalOptions, + }); + if (!response.success) { + if (response.error instanceof Error) { + throw response.error; + } else { + throw new Error( + "scrapeURL error: " + + (Array.isArray(response.error) + ? JSON.stringify(response.error) + : typeof response.error === "object" + ? JSON.stringify({ ...response.error }) + : response.error), + ); + } + } + + // This is where the returnvalue from the job is set + // onSuccess(response.document, mode); + + engines = response.engines; + + if ( + (response.document.metadata.statusCode >= 200 && + response.document.metadata.statusCode < 300) || + response.document.metadata.statusCode === 304 + ) { + // status code is good -- do not attempt retry + break; + } + } catch (_error) { + error = _error; + engines = + response !== undefined + ? response.engines + : typeof error === "object" && error !== null + ? ((error as any).results ?? 
{}) + : {}; + } + } + + const engineOrder = Object.entries(engines) + .sort((a, b) => a[1].startedAt - b[1].startedAt) + .map((x) => x[0]) as Engine[]; + + for (const engine of engineOrder) { + const result = engines[engine] as Exclude< + EngineResultsTracker[Engine], + undefined + >; + ScrapeEvents.insert(bull_job_id, { + type: "scrape", + url, + method: engine, + result: { + success: result.state === "success", + response_code: + result.state === "success" ? result.result.statusCode : undefined, + response_size: + result.state === "success" ? result.result.html.length : undefined, + error: + result.state === "error" + ? result.error + : result.state === "timeout" + ? "Timed out" + : undefined, + time_taken: result.finishedAt - result.startedAt, + }, + }); + } + + if (error === undefined && response?.success) { if (is_scrape === false) { let creditsToBeBilled = 1; // Assuming 1 credit per document if (scrapeOptions.extract) { creditsToBeBilled = 5; } - billTeam(team_id, undefined, creditsToBeBilled).catch((error) => { + billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => { logger.error( - `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`, + `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`, + { error }, ); // Optionally, you could notify an admin or add to a retry queue here }); } - // This is where the returnvalue from the job is set - // onSuccess(response.document, mode); - - engines = response.engines; return response; - } catch (error) { - engines = - response !== undefined - ? response.engines - : typeof error === "object" && error !== null - ? ((error as any).results ?? {}) - : {}; - + } else { if (response !== undefined) { return { ...response, @@ -127,37 +198,6 @@ export async function runWebScraper({ engines, }; } - // onError(error); - } finally { - const engineOrder = Object.entries(engines) - .sort((a, b) => a[1].startedAt - b[1].startedAt) - .map((x) => x[0]) as Engine[]; - - for (const engine of engineOrder) { - const result = engines[engine] as Exclude< - EngineResultsTracker[Engine], - undefined - >; - ScrapeEvents.insert(bull_job_id, { - type: "scrape", - url, - method: engine, - result: { - success: result.state === "success", - response_code: - result.state === "success" ? result.result.statusCode : undefined, - response_size: - result.state === "success" ? result.result.html.length : undefined, - error: - result.state === "error" - ? result.error - : result.state === "timeout" - ? 
"Timed out" - : undefined, - time_taken: result.finishedAt - result.startedAt, - }, - }); - } } } @@ -195,6 +235,11 @@ const saveJob = async ( } ScrapeEvents.logJobEvent(job, "completed"); } catch (error) { - logger.error(`🐂 Failed to update job status: ${error}`); + _logger.error(`🐂 Failed to update job status`, { + module: "runWebScraper", + method: "saveJob", + jobId: job.id, + scrapeId: job.id, + }); } }; diff --git a/apps/api/src/routes/admin.ts b/apps/api/src/routes/admin.ts index ec9967b8..1901c6f2 100644 --- a/apps/api/src/routes/admin.ts +++ b/apps/api/src/routes/admin.ts @@ -8,6 +8,7 @@ import { } from "../controllers/v0/admin/queue"; import { wrap } from "./v1"; import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear"; +import { checkFireEngine } from "../controllers/v0/admin/check-fire-engine"; export const adminRouter = express.Router(); @@ -37,3 +38,8 @@ adminRouter.post( `/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`, wrap(acucCacheClearController), ); + +adminRouter.get( + `/admin/${process.env.BULL_AUTH_KEY}/feng-check`, + wrap(checkFireEngine), +); diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index f09573d9..1ee191ef 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -31,6 +31,8 @@ import { extractController } from "../controllers/v1/extract"; // import { keyAuthController } from "../../src/controllers/v1/keyAuth"; // import { livenessController } from "../controllers/v1/liveness"; // import { readinessController } from "../controllers/v1/readiness"; +import { creditUsageController } from "../controllers/v1/credit-usage"; +import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; function checkCreditsMiddleware( minimum?: number, @@ -122,8 +124,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { if (!res.headersSent) { return res.status(403).json({ success: false, - error: - "URL is blocked intentionally. 
Firecrawl currently does not support scraping this site due to policy restrictions.", + error: BLOCKLISTED_URL_MESSAGE, }); } } @@ -224,3 +225,9 @@ v1Router.delete( // Health/Probe routes // v1Router.get("/health/liveness", livenessController); // v1Router.get("/health/readiness", readinessController); + +v1Router.get( + "/team/credit-usage", + authMiddleware(RateLimiterMode.CrawlStatus), + wrap(creditUsageController), +); diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 19b0b5b4..2e47d352 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -210,7 +210,7 @@ export class WebCrawler { } if (sitemapLinks.length > 0) { let filteredLinks = this.filterLinks( - sitemapLinks, + [...new Set(sitemapLinks)], this.limit, this.maxCrawledDepth, fromMap, diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index 0a3ef705..16e9e45f 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -6,6 +6,15 @@ configDotenv(); const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8"); const algorithm = "aes-256-ecb"; +function encryptAES(plaintext: string, key: Buffer): string { + const cipher = crypto.createCipheriv(algorithm, key, null); + const encrypted = Buffer.concat([ + cipher.update(plaintext, "utf-8"), + cipher.final(), + ]); + return encrypted.toString("base64"); +} + function decryptAES(ciphertext: string, key: Buffer): string { const decipher = crypto.createDecipheriv(algorithm, key, null); const decrypted = Buffer.concat([ @@ -42,9 +51,27 @@ const urlBlocklist = [ "PTbGg8PK/h0Seyw4HEpK4Q==", "lZdQMknjHb7+4+sjF3qNTw==", "LsgSq54q5oDysbva29JxnQ==", + "KZfBtpwjOpdSoqacRbz7og==", + "Indtl4yxJMHCKBGF4KABCQ==", + "e3HFXLVgxhaVoadYpwb2BA==", + "b+asgLayXQ5Jq+se+q56jA==", + "86ZDUI7vmp4MvNq3fvZrGQ==", + "sEGFoYZ6GEg4Zocd+TiyfQ==", + "6OOL72eXthgnJ1Hj4PfOQQ==", + "g/ME+Sh1CAFboKrwkVb+5Q==", + "Pw+xawUoX8xBYbX2yqqGWQ==", + "k6vBalxYFhAvkPsF19t9gQ==", + "e3HFXLVgxhaVoadYpwb2BA==", + "b+asgLayXQ5Jq+se+q56jA==", + "KKttwRz4w+AMJrZcB828WQ==", + "vMdzZ33BXoyWVZnAPOBcrg==", + "l8GDVI8w/ueHnNzdN1ODuQ==", ]; -const decryptedBlocklist = hashKey.length > 0 ? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey)) : []; +const decryptedBlocklist = + hashKey.length > 0 + ? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey)) + : []; const allowedKeywords = [ "pulse", diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts index af6f57c0..a0c8eaba 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts @@ -5,8 +5,9 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler"; export async function scrapeURLWithFetch( meta: Meta, + timeToRun: number | undefined, ): Promise { - const timeout = 20000; + const timeout = timeToRun ?? 
300000; const response = await Promise.race([ fetch(meta.url, { diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts index 328931ba..6f65db98 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts @@ -3,7 +3,7 @@ import * as Sentry from "@sentry/node"; import { z } from "zod"; import { robustFetch } from "../../lib/fetch"; -import { EngineError, SiteError } from "../../error"; +import { ActionError, EngineError, SiteError } from "../../error"; const successSchema = z.object({ jobId: z.string(), @@ -111,6 +111,12 @@ export async function fireEngineCheckStatus( status.error.includes("Chrome error: ") ) { throw new SiteError(status.error.split("Chrome error: ")[1]); + } else if ( + typeof status.error === "string" && + // TODO: improve this later + status.error.includes("Element") + ) { + throw new ActionError(status.error.split("Error: ")[1]); } else { throw new EngineError("Scrape job failed", { cause: { diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 3fc32835..d753465d 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -13,13 +13,11 @@ import { FireEngineCheckStatusSuccess, StillProcessingError, } from "./checkStatus"; -import { EngineError, SiteError, TimeoutError } from "../../error"; +import { ActionError, EngineError, SiteError, TimeoutError } from "../../error"; import * as Sentry from "@sentry/node"; import { Action } from "../../../../lib/entities"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; -export const defaultTimeout = 10000; - // This function does not take `Meta` on purpose. It may not access any // meta values to construct the request -- that must be done by the // `scrapeURLWithFireEngine*` functions. @@ -31,7 +29,7 @@ async function performFireEngineScrape< >( logger: Logger, request: FireEngineScrapeRequestCommon & Engine, - timeout = defaultTimeout, + timeout: number, ): Promise { const scrape = await fireEngineScrape( logger.child({ method: "fireEngineScrape" }), @@ -70,7 +68,11 @@ async function performFireEngineScrape< } catch (error) { if (error instanceof StillProcessingError) { // nop - } else if (error instanceof EngineError || error instanceof SiteError) { + } else if ( + error instanceof EngineError || + error instanceof SiteError || + error instanceof ActionError + ) { logger.debug("Fire-engine scrape job failed.", { error, jobId: scrape.jobId, @@ -94,6 +96,7 @@ async function performFireEngineScrape< export async function scrapeURLWithFireEngineChromeCDP( meta: Meta, + timeToRun: number | undefined, ): Promise { const actions: Action[] = [ // Transform waitFor option into an action (unsupported by chrome-cdp) @@ -121,6 +124,13 @@ export async function scrapeURLWithFireEngineChromeCDP( ...(meta.options.actions ?? []), ]; + const totalWait = actions.reduce( + (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a), + 0, + ); + + const timeout = (timeToRun ?? 
300000) + totalWait; + const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestChromeCDP = { url: meta.url, @@ -134,25 +144,20 @@ export async function scrapeURLWithFireEngineChromeCDP( } : {}), priority: meta.internalOptions.priority, - geolocation: meta.options.geolocation, + geolocation: meta.options.geolocation ?? meta.options.location, mobile: meta.options.mobile, - timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic + timeout, // TODO: better timeout logic disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache, // TODO: scrollXPaths }; - const totalWait = actions.reduce( - (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a), - 0, - ); - let response = await performFireEngineScrape( meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request, }), request, - meta.options.timeout !== undefined ? defaultTimeout + totalWait : Infinity, // TODO: better timeout handling + timeout, ); specialtyScrapeCheck( @@ -206,7 +211,11 @@ export async function scrapeURLWithFireEngineChromeCDP( export async function scrapeURLWithFireEnginePlaywright( meta: Meta, + timeToRun: number | undefined, ): Promise { + const totalWait = meta.options.waitFor; + const timeout = (timeToRun ?? 300000) + totalWait; + const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestPlaywright = { url: meta.url, @@ -218,9 +227,9 @@ export async function scrapeURLWithFireEnginePlaywright( screenshot: meta.options.formats.includes("screenshot"), fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"), wait: meta.options.waitFor, - geolocation: meta.options.geolocation, + geolocation: meta.options.geolocation ?? meta.options.location, - timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic + timeout, }; let response = await performFireEngineScrape( @@ -229,9 +238,7 @@ export async function scrapeURLWithFireEnginePlaywright( request, }), request, - meta.options.timeout !== undefined - ? defaultTimeout + meta.options.waitFor - : Infinity, // TODO: better timeout handling + timeout, ); specialtyScrapeCheck( @@ -265,7 +272,10 @@ export async function scrapeURLWithFireEnginePlaywright( export async function scrapeURLWithFireEngineTLSClient( meta: Meta, + timeToRun: number | undefined, ): Promise { + const timeout = timeToRun ?? 30000; + const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestTLSClient = { url: meta.url, @@ -276,10 +286,10 @@ export async function scrapeURLWithFireEngineTLSClient( priority: meta.internalOptions.priority, atsv: meta.internalOptions.atsv, - geolocation: meta.options.geolocation, + geolocation: meta.options.geolocation ?? meta.options.location, disableJsDom: meta.internalOptions.v0DisableJsDom, - timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic + timeout, }; let response = await performFireEngineScrape( @@ -288,7 +298,7 @@ export async function scrapeURLWithFireEngineTLSClient( request, }), request, - meta.options.timeout !== undefined ? 
defaultTimeout : Infinity, // TODO: better timeout handling + timeout, ); specialtyScrapeCheck( diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 01ac0be9..bb0c485c 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -105,7 +105,10 @@ export type EngineScrapeResult = { }; const engineHandlers: { - [E in Engine]: (meta: Meta) => Promise; + [E in Engine]: ( + meta: Meta, + timeToRun: number | undefined, + ) => Promise; } = { cache: scrapeCache, "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP, @@ -372,6 +375,7 @@ export function buildFallbackList(meta: Meta): { export async function scrapeURLWithEngine( meta: Meta, engine: Engine, + timeToRun: number | undefined, ): Promise { const fn = engineHandlers[engine]; const logger = meta.logger.child({ @@ -383,5 +387,5 @@ export async function scrapeURLWithEngine( logger, }; - return await fn(_meta); + return await fn(_meta, timeToRun); } diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 341a4f1a..9d2f11b1 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -15,6 +15,7 @@ type PDFProcessorResult = { html: string; markdown?: string }; async function scrapePDFWithLlamaParse( meta: Meta, tempFilePath: string, + timeToRun: number | undefined, ): Promise { meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath, @@ -63,8 +64,9 @@ async function scrapePDFWithLlamaParse( // TODO: timeout, retries const startedAt = Date.now(); + const timeout = timeToRun ?? 300000; - while (Date.now() <= startedAt + (meta.options.timeout ?? 
300000)) { + while (Date.now() <= startedAt + timeout) { try { const result = await robustFetch({ url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`, @@ -122,7 +124,10 @@ async function scrapePDFWithParsePDF( }; } -export async function scrapePDF(meta: Meta): Promise { +export async function scrapePDF( + meta: Meta, + timeToRun: number | undefined, +): Promise { if (!meta.options.parsePDF) { const file = await fetchFileToBuffer(meta.url); const content = file.buffer.toString("base64"); @@ -138,9 +143,26 @@ export async function scrapePDF(meta: Meta): Promise { const { response, tempFilePath } = await downloadFile(meta.id, meta.url); let result: PDFProcessorResult | null = null; - if (process.env.LLAMAPARSE_API_KEY) { + + // First, try parsing with PdfParse + result = await scrapePDFWithParsePDF( + { + ...meta, + logger: meta.logger.child({ + method: "scrapePDF/scrapePDFWithParsePDF", + }), + }, + tempFilePath, + ); + + // If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse + if ( + result.markdown && + result.markdown.length < 500 && + process.env.LLAMAPARSE_API_KEY + ) { try { - result = await scrapePDFWithLlamaParse( + const llamaResult = await scrapePDFWithLlamaParse( { ...meta, logger: meta.logger.child({ @@ -148,17 +170,19 @@ export async function scrapePDF(meta: Meta): Promise { }), }, tempFilePath, + timeToRun, ); + result = llamaResult; // Use LlamaParse result if successful } catch (error) { if (error instanceof Error && error.message === "LlamaParse timed out") { - meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { + meta.logger.warn("LlamaParse timed out -- using parse-pdf result", { error, }); } else if (error instanceof RemoveFeatureError) { throw error; } else { meta.logger.warn( - "LlamaParse failed to parse PDF -- falling back to parse-pdf", + "LlamaParse failed to parse PDF -- using parse-pdf result", { error }, ); Sentry.captureException(error); @@ -166,18 +190,6 @@ export async function scrapePDF(meta: Meta): Promise { } } - if (result === null) { - result = await scrapePDFWithParsePDF( - { - ...meta, - logger: meta.logger.child({ - method: "scrapePDF/scrapePDFWithParsePDF", - }), - }, - tempFilePath, - ); - } - await fs.unlink(tempFilePath); return { diff --git a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts index c92b1d90..edcd50c0 100644 --- a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts @@ -6,8 +6,9 @@ import { robustFetch } from "../../lib/fetch"; export async function scrapeURLWithPlaywright( meta: Meta, + timeToRun: number | undefined, ): Promise { - const timeout = 20000 + meta.options.waitFor; + const timeout = (timeToRun ?? 
300000) + meta.options.waitFor; const response = await Promise.race([ await robustFetch({ @@ -30,7 +31,7 @@ export async function scrapeURLWithPlaywright( }), }), (async () => { - await new Promise((resolve) => setTimeout(() => resolve(null), 20000)); + await new Promise((resolve) => setTimeout(() => resolve(null), timeout)); throw new TimeoutError( "Playwright was unable to scrape the page before timing out", { cause: { timeout } }, diff --git a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts index 50ac502b..38c43878 100644 --- a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts @@ -9,16 +9,20 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!); export function scrapeURLWithScrapingBee( wait_browser: "domcontentloaded" | "networkidle2", -): (meta: Meta) => Promise { - return async (meta: Meta): Promise => { +): (meta: Meta, timeToRun: number | undefined) => Promise { + return async ( + meta: Meta, + timeToRun: number | undefined, + ): Promise => { let response: AxiosResponse; + const timeout = (timeToRun ?? 300000) + meta.options.waitFor; try { response = await client.get({ url: meta.url, params: { - timeout: 15000, // TODO: dynamic timeout based on request timeout + timeout, wait_browser: wait_browser, - wait: Math.min(meta.options.waitFor, 35000), + wait: meta.options.waitFor, transparent_status_code: true, json_response: true, screenshot: meta.options.formats.includes("screenshot"), diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts index ec044745..689f90c8 100644 --- a/apps/api/src/scraper/scrapeURL/error.ts +++ b/apps/api/src/scraper/scrapeURL/error.ts @@ -56,3 +56,11 @@ export class SiteError extends Error { this.code = code; } } + +export class ActionError extends Error { + public code: string; + constructor(code: string) { + super("Action(s) failed to complete. Error code: " + code); + this.code = code; + } +} diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index a3eb6f1e..1df812bd 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -12,6 +12,7 @@ import { } from "./engines"; import { parseMarkdown } from "../../lib/html-to-markdown"; import { + ActionError, AddFeatureError, EngineError, NoEnginesLeftError, @@ -86,7 +87,7 @@ function buildFeatureFlags( flags.add("skipTlsVerification"); } - if (internalOptions.v0UseFastMode) { + if (options.fastMode) { flags.add("useFastMode"); } @@ -148,7 +149,6 @@ export type InternalOptions = { atsv?: boolean; // anti-bot solver, beta v0CrawlOnlyUrls?: boolean; - v0UseFastMode?: boolean; v0DisableJsDom?: boolean; disableSmartWaitCache?: boolean; // Passed along to fire-engine @@ -203,11 +203,16 @@ async function scrapeURLLoop(meta: Meta): Promise { const results: EngineResultsTracker = {}; let result: EngineScrapeResultWithContext | null = null; + const timeToRun = + meta.options.timeout !== undefined + ? 
Math.round(meta.options.timeout / Math.min(fallbackList.length, 2)) + : undefined; + for (const { engine, unsupportedFeatures } of fallbackList) { const startedAt = Date.now(); try { meta.logger.info("Scraping via " + engine + "..."); - const _engineResult = await scrapeURLWithEngine(meta, engine); + const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun); if (_engineResult.markdown === undefined) { // Some engines emit Markdown directly. _engineResult.markdown = await parseMarkdown(_engineResult.html); @@ -285,6 +290,8 @@ async function scrapeURLLoop(meta: Meta): Promise { throw error; } else if (error instanceof SiteError) { throw error; + } else if (error instanceof ActionError) { + throw error; } else { Sentry.captureException(error); meta.logger.info( @@ -405,6 +412,8 @@ export async function scrapeURL( // TODO: results? } else if (error instanceof SiteError) { meta.logger.warn("scrapeURL: Site failed to load in browser", { error }); + } else if (error instanceof ActionError) { + meta.logger.warn("scrapeURL: Action(s) failed to complete", { error }); } else { Sentry.captureException(error); meta.logger.error("scrapeURL: Unexpected error happened", { error }); diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts index 040bf0ee..66cf30cc 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts @@ -5,7 +5,7 @@ import { Meta } from ".."; export function extractMetadata( meta: Meta, html: string, -): Document["metadata"] { +): Partial { let title: string | undefined = undefined; let description: string | undefined = undefined; let language: string | undefined = undefined; @@ -40,7 +40,7 @@ export function extractMetadata( const soup = load(html); try { - title = soup("title").text() || undefined; + title = soup("title").first().text().trim() || undefined; description = soup('meta[name="description"]').attr("content") || undefined; // Assuming the language is part of the URL as per the regex pattern diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 6380edb8..759f87e2 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -159,8 +159,8 @@ export async function generateOpenAICompletions( role: "user", content: options.prompt !== undefined - ? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}` - : "Transform the above content into structured JSON output.", + ? `Transform the above content into structured JSON output based on the provided schema if any and the following user request: ${options.prompt}. 
If schema is provided, strictly follow it.` + : "Transform the above content into structured JSON output based on the provided schema if any.", }, ], response_format: options.schema diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index bbd04cc0..5eb541fd 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -10,6 +10,7 @@ import { issueCredits } from "./issue_credits"; import { redlock } from "../redlock"; import { autoCharge } from "./auto_charge"; import { getValue, setValue } from "../redis"; +import type { Logger } from "winston"; const FREE_CREDITS = 500; @@ -20,22 +21,33 @@ export async function billTeam( team_id: string, subscription_id: string | null | undefined, credits: number, + logger?: Logger, ) { return withAuth(supaBillTeam, { success: true, message: "No DB, bypassed." })( team_id, subscription_id, credits, + logger, ); } export async function supaBillTeam( team_id: string, subscription_id: string | null | undefined, credits: number, + __logger?: Logger, ) { + const _logger = (__logger ?? logger).child({ + module: "credit_billing", + method: "supaBillTeam", + }); + if (team_id === "preview") { return { success: true, message: "Preview team, no credits used" }; } - logger.info(`Billing team ${team_id} for ${credits} credits`); + _logger.info(`Billing team ${team_id} for ${credits} credits`, { + team_id, + credits, + }); const { data, error } = await supabase_service.rpc("bill_team", { _team_id: team_id, @@ -46,7 +58,7 @@ export async function supaBillTeam( if (error) { Sentry.captureException(error); - logger.error("Failed to bill team: " + JSON.stringify(error)); + _logger.error("Failed to bill team.", { error }); return; } diff --git a/apps/api/src/services/billing/issue_credits.ts b/apps/api/src/services/billing/issue_credits.ts index ce84db1b..2ca057dd 100644 --- a/apps/api/src/services/billing/issue_credits.ts +++ b/apps/api/src/services/billing/issue_credits.ts @@ -9,6 +9,7 @@ export async function issueCredits(team_id: string, credits: number) { status: "active", // indicates that this coupon was issued from auto recharge from_auto_recharge: true, + initial_credits: credits, }); if (error) { diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index bd2b9121..654f6cda 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -11,11 +11,50 @@ import { pushConcurrencyLimitedJob, } from "../lib/concurrency-limit"; +async function _addScrapeJobToConcurrencyQueue( + webScraperOptions: any, + options: any, + jobId: string, + jobPriority: number, +) { + await pushConcurrencyLimitedJob(webScraperOptions.team_id, { + id: jobId, + data: webScraperOptions, + opts: { + ...options, + priority: jobPriority, + jobId: jobId, + }, + priority: jobPriority, + }); +} + +async function _addScrapeJobToBullMQ( + webScraperOptions: any, + options: any, + jobId: string, + jobPriority: number, +) { + if ( + webScraperOptions && + webScraperOptions.team_id && + webScraperOptions.plan + ) { + await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId); + } + + await getScrapeQueue().add(jobId, webScraperOptions, { + ...options, + priority: jobPriority, + jobId, + }); +} + async function addScrapeJobRaw( webScraperOptions: any, options: any, jobId: string, - jobPriority: number = 10, + jobPriority: number, ) { let concurrencyLimited = false; @@ -33,30 +72,14 @@ async function addScrapeJobRaw( } 
if (concurrencyLimited) { - await pushConcurrencyLimitedJob(webScraperOptions.team_id, { - id: jobId, - data: webScraperOptions, - opts: { - ...options, - priority: jobPriority, - jobId: jobId, - }, - priority: jobPriority, - }); - } else { - if ( - webScraperOptions && - webScraperOptions.team_id && - webScraperOptions.plan - ) { - await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId); - } - - await getScrapeQueue().add(jobId, webScraperOptions, { - ...options, - priority: jobPriority, + await _addScrapeJobToConcurrencyQueue( + webScraperOptions, + options, jobId, - }); + jobPriority, + ); + } else { + await _addScrapeJobToBullMQ(webScraperOptions, options, jobId, jobPriority); } } @@ -108,11 +131,88 @@ export async function addScrapeJobs( }; }[], ) { - // TODO: better + if (jobs.length === 0) return true; + + let countCanBeDirectlyAdded = Infinity; + + if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) { + const now = Date.now(); + const limit = await getConcurrencyLimitMax(jobs[0].data.plan); + console.log("CC limit", limit); + cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now); + + countCanBeDirectlyAdded = Math.max( + limit - + (await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length, + 0, + ); + } + + const addToBull = jobs.slice(0, countCanBeDirectlyAdded); + const addToCQ = jobs.slice(countCanBeDirectlyAdded); + await Promise.all( - jobs.map((job) => - addScrapeJob(job.data, job.opts, job.opts.jobId, job.opts.priority), - ), + addToBull.map(async (job) => { + const size = JSON.stringify(job.data).length; + return await Sentry.startSpan( + { + name: "Add scrape job", + op: "queue.publish", + attributes: { + "messaging.message.id": job.opts.jobId, + "messaging.destination.name": getScrapeQueue().name, + "messaging.message.body.size": size, + }, + }, + async (span) => { + await _addScrapeJobToBullMQ( + { + ...job.data, + sentry: { + trace: Sentry.spanToTraceHeader(span), + baggage: Sentry.spanToBaggageHeader(span), + size, + }, + }, + job.opts, + job.opts.jobId, + job.opts.priority, + ); + }, + ); + }), + ); + + await Promise.all( + addToCQ.map(async (job) => { + const size = JSON.stringify(job.data).length; + return await Sentry.startSpan( + { + name: "Add scrape job", + op: "queue.publish", + attributes: { + "messaging.message.id": job.opts.jobId, + "messaging.destination.name": getScrapeQueue().name, + "messaging.message.body.size": size, + }, + }, + async (span) => { + await _addScrapeJobToConcurrencyQueue( + { + ...job.data, + sentry: { + trace: Sentry.spanToTraceHeader(span), + baggage: Sentry.spanToBaggageHeader(span), + size, + }, + }, + job.opts, + job.opts.jobId, + job.opts.priority, + ); + }, + ); + }), ); } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 29f4b84f..4ef9610d 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -386,27 +386,27 @@ async function processJob(job: Job & { id: string }, token: string) { jobId: job.id, scrapeId: job.id, crawlId: job.data?.crawl_id ?? undefined, + teamId: job.data?.team_id ?? 
undefined, }); logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url }); // Check if the job URL is researchhub and block it immediately // TODO: remove this once solve the root issue - if ( - job.data.url && - (job.data.url.includes("researchhub.com") || - job.data.url.includes("ebay.com") || - job.data.url.includes("youtube.com")) - ) { - logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`); - const data = { - success: false, - document: null, - project_id: job.data.project_id, - error: - "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.", - }; - return data; - } + // if ( + // job.data.url && + // (job.data.url.includes("researchhub.com") || + // job.data.url.includes("ebay.com")) + // ) { + // logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`); + // const data = { + // success: false, + // document: null, + // project_id: job.data.project_id, + // error: + // "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.", + // }; + // return data; + // } try { job.updateProgress({ @@ -482,32 +482,28 @@ async function processJob(job: Job & { id: string }, token: string) { normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc) ) { - logger.debug( - "Was redirected, removing old URL and locking new URL...", - { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url }, - ); - // Remove the old URL from visited unique due to checking for limit - // Do not remove from :visited otherwise it will keep crawling the original URL (sourceURL) - await redisConnection.srem( - "crawl:" + job.data.crawl_id + ":visited_unique", - normalizeURL(doc.metadata.sourceURL, sc), - ); - const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc)); const p2 = generateURLPermutations( normalizeURL(doc.metadata.sourceURL, sc), ); - // In crawls, we should only crawl a redirected page once, no matter how many; times it is redirected to, or if it's been discovered by the crawler before. - // This can prevent flakiness with race conditions. 
- // Lock the new URL - const lockRes = await lockURL(job.data.crawl_id, sc, doc.metadata.url); - if ( - job.data.crawlerOptions !== null && - !lockRes && - JSON.stringify(p1) !== JSON.stringify(p2) - ) { - throw new RacedRedirectError(); + if (JSON.stringify(p1) !== JSON.stringify(p2)) { + logger.debug( + "Was redirected, removing old URL and locking new URL...", + { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url }, + ); + + // Prevent redirect target from being visited in the crawl again + // See lockURL + const x = await redisConnection.sadd( + "crawl:" + job.data.crawl_id + ":visited", + ...p1.map((x) => x.href), + ); + const lockRes = x === p1.length; + + if (job.data.crawlerOptions !== null && !lockRes) { + throw new RacedRedirectError(); + } } } @@ -679,6 +675,10 @@ async function processJob(job: Job & { id: string }, token: string) { logger.debug("Declaring job as done..."); await addCrawlJobDone(job.data.crawl_id, job.id, false); + await redisConnection.srem( + "crawl:" + job.data.crawl_id + ":visited_unique", + normalizeURL(job.data.url, sc), + ); logger.debug("Logging job to DB..."); await logJob( diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 5b8e39ca..21025589 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -80,8 +80,8 @@ const RATE_LIMITS = { default: 100, }, crawlStatus: { - free: 300, - default: 500, + free: 500, + default: 5000, }, testSuite: { free: 10000, diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 5325a0ad..9db79bc5 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -55,6 +55,7 @@ export interface RunWebScraperParams { bull_job_id: string; priority?: number; is_scrape?: boolean; + is_crawl?: boolean; } export type RunWebScraperResult = diff --git a/apps/api/tsconfig.json b/apps/api/tsconfig.json index 29093be6..ab2a9546 100644 --- a/apps/api/tsconfig.json +++ b/apps/api/tsconfig.json @@ -3,6 +3,7 @@ "rootDir": "./src", "lib": ["ES2022", "DOM"], + // or higher "target": "ES2022", @@ -18,7 +19,7 @@ "*": ["node_modules/*", "src/types/*"], }, - "inlineSources": true + "inlineSources": true, }, "include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"] } diff --git a/apps/api/logview.js b/apps/api/utils/logview.js similarity index 71% rename from apps/api/logview.js rename to apps/api/utils/logview.js index 232d2cda..3c0db523 100644 --- a/apps/api/logview.js +++ b/apps/api/utils/logview.js @@ -1,7 +1,19 @@ const fs = require("fs"); -const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8") - .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x)); +// METHOD: Winston log file +// const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8") +// .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x)); + +// METHOD: GCloud export +const logs = [ + "downloaded-logs-20241213-225607.json", + "downloaded-logs-20241213-225654.json", + "downloaded-logs-20241213-225720.json", + "downloaded-logs-20241213-225758.json", + "downloaded-logs-20241213-225825.json", + "downloaded-logs-20241213-225843.json", +].flatMap(x => JSON.parse(fs.readFileSync(x, "utf8"))).map(x => x.jsonPayload); + const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))]; diff --git a/apps/api/utils/urldump-redis.js b/apps/api/utils/urldump-redis.js new file mode 100644 index 
00000000..fdd6090c --- /dev/null +++ b/apps/api/utils/urldump-redis.js @@ -0,0 +1,14 @@ +require("dotenv").config(); +const Redis = require("ioredis"); + +const crawlId = process.argv[2]; + +const redisConnection = new Redis(process.env.REDIS_URL, { + maxRetriesPerRequest: null, +}); + +(async () => { + const res = await redisConnection.sscan("crawl:" + crawlId + ":visited_unique", 0, "COUNT", 999); + await require("fs/promises").writeFile(crawlId + "-visited.txt", res[1].map(x => x.split("://").slice(1).join("://")).sort().join("\n")); + process.exit(0); +})(); \ No newline at end of file diff --git a/apps/api/utils/urldump.js b/apps/api/utils/urldump.js new file mode 100644 index 00000000..3583f7c6 --- /dev/null +++ b/apps/api/utils/urldump.js @@ -0,0 +1,43 @@ +require("dotenv").config(); + +//const baseUrl = "https://api.firecrawl.dev"; +const baseUrl = "http://localhost:3002"; +const crawlId = process.argv[2]; + +(async () => { + let url = baseUrl + "/v1/crawl/" + crawlId; + let urls = []; + + while (url) { + let res; + + while (true) { + try { + res = (await (await fetch(url, { + headers: { + "Authorization": "Bearer " + process.env.TEST_API_KEY + } + })).json()); + break; + } catch (e) { + console.error(e); + } + } + + console.log(res.data.length); + if (res.data.length === 0) { + break; + } + + urls.push(...res.data.map(x => x.metadata.url ?? x.metadata.sourceURL)); + + url = res.next; + if (url !== undefined) { + const o = new URL(url) + o.protocol = new URL(baseUrl).protocol; + url = o.href; + } + } + + await require("fs/promises").writeFile(crawlId + "-urls.txt", urls.map(x => x.split("://").slice(1).join("://")).sort().join("\n")); +})(); \ No newline at end of file diff --git a/apps/js-sdk/firecrawl/src/__tests__/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/index.test.ts index 92951237..6958abf8 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/index.test.ts @@ -1,9 +1,9 @@ -import { describe, test, expect, jest } from '@jest/globals'; -import axios from 'axios'; -import FirecrawlApp from '../index'; +import { describe, expect, jest, test } from '@jest/globals'; -import { readFile } from 'fs/promises'; +import FirecrawlApp from '../index'; +import axios from 'axios'; import { join } from 'path'; +import { readFile } from 'fs/promises'; // Mock jest and set the type jest.mock('axios'); @@ -14,13 +14,22 @@ async function loadFixture(name: string): Promise { return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8') } +const API_URL = process.env.API_URL ?? 
"https://api.firecrawl.dev"; + describe('the firecrawl JS SDK', () => { - test('Should require an API key to instantiate FirecrawlApp', async () => { - const fn = () => { - new FirecrawlApp({ apiKey: undefined }); - }; - expect(fn).toThrow('No API key provided'); + test('Should require an API key only for cloud service', async () => { + if (API_URL.includes('api.firecrawl.dev')) { + // Should throw for cloud service + expect(() => { + new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL }); + }).toThrow('No API key provided'); + } else { + // Should not throw for self-hosted + expect(() => { + new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL }); + }).not.toThrow(); + } }); test('Should return scraped data from a /scrape API call', async () => { diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index dea55846..e5c04209 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -9,15 +9,28 @@ const TEST_API_KEY = process.env.TEST_API_KEY; const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev"; describe('FirecrawlApp E2E Tests', () => { - test.concurrent('should throw error for no API key', async () => { - expect(() => { - new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); - }).toThrow("No API key provided"); + test.concurrent('should throw error for no API key only for cloud service', async () => { + if (API_URL.includes('api.firecrawl.dev')) { + // Should throw for cloud service + expect(() => { + new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); + }).toThrow("No API key provided"); + } else { + // Should not throw for self-hosted + expect(() => { + new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); + }).not.toThrow(); + } }); test.concurrent('should throw error for invalid API key on scrape', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + if (API_URL.includes('api.firecrawl.dev')) { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 404"); + } else { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).resolves.not.toThrow(); + } }); test.concurrent('should throw error for blocklisted URL on scrape', async () => { @@ -155,14 +168,13 @@ describe('FirecrawlApp E2E Tests', () => { }, 30000); // 30 seconds timeout test.concurrent('should throw error for invalid API key on crawl', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); - }); - - test.concurrent('should throw error for blocklisted URL on crawl', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const blocklistedUrl = "https://twitter.com/fake-test"; - await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. 
Firecrawl currently does not support social media scraping due to policy restrictions."); + if (API_URL.includes('api.firecrawl.dev')) { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404"); + } else { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).resolves.not.toThrow(); + } }); test.concurrent('should return successful response for crawl and wait for completion', async () => { @@ -337,8 +349,13 @@ describe('FirecrawlApp E2E Tests', () => { }, 60000); // 60 seconds timeout test.concurrent('should throw error for invalid API key on map', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + if (API_URL.includes('api.firecrawl.dev')) { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404"); + } else { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).resolves.not.toThrow(); + } }); test.concurrent('should throw error for blocklisted URL on map', async () => { @@ -355,8 +372,7 @@ describe('FirecrawlApp E2E Tests', () => { }, 30000); // 30 seconds timeout test.concurrent('should return successful response for valid map', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse; + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse; expect(response).not.toBeNull(); expect(response.links?.length).toBeGreaterThan(0); diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index a54506ad..29fabf5d 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -183,6 +183,7 @@ export interface BatchScrapeResponse { url?: string; success: true; error?: string; + invalidURLs?: string[]; } /** @@ -242,10 +243,11 @@ export interface MapResponse { * Defines options for extracting information from URLs. */ export interface ExtractParams { - prompt: string; + prompt?: string; schema?: LLMSchema; systemPrompt?: string; allowExternalLinks?: boolean; + includeSubdomains?: boolean; } /** @@ -288,17 +290,23 @@ export default class FirecrawlApp { public apiKey: string; public apiUrl: string; + private isCloudService(url: string): boolean { + return url.includes('api.firecrawl.dev'); + } + /** * Initializes a new instance of the FirecrawlApp class. * @param config - Configuration options for the FirecrawlApp instance. 
*/ constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) { - if (typeof apiKey !== "string") { + const baseUrl = apiUrl || "https://api.firecrawl.dev"; + + if (this.isCloudService(baseUrl) && typeof apiKey !== "string") { throw new FirecrawlError("No API key provided", 401); } - this.apiKey = apiKey; - this.apiUrl = apiUrl || "https://api.firecrawl.dev"; + this.apiKey = apiKey || ''; + this.apiUrl = baseUrl; } /** @@ -576,9 +584,10 @@ export default class FirecrawlApp { pollInterval: number = 2, idempotencyKey?: string, webhook?: CrawlParams["webhook"], + ignoreInvalidURLs?: boolean, ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: any = { urls, ...params }; + let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params }; if (jsonData?.extract?.schema) { let schema = jsonData.extract.schema; @@ -621,10 +630,12 @@ export default class FirecrawlApp { async asyncBatchScrapeUrls( urls: string[], params?: ScrapeParams, - idempotencyKey?: string + idempotencyKey?: string, + webhook?: CrawlParams["webhook"], + ignoreInvalidURLs?: boolean, ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: any = { urls, ...(params ?? {}) }; + let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...(params ?? {}) }; try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/v1/batch/scrape`, @@ -657,8 +668,10 @@ export default class FirecrawlApp { urls: string[], params?: ScrapeParams, idempotencyKey?: string, + webhook?: CrawlParams["webhook"], + ignoreInvalidURLs?: boolean, ) { - const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey); + const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs); if (crawl.success && crawl.id) { const id = crawl.id; @@ -932,9 +945,11 @@ export class CrawlWatcher extends TypedEventTarget { private ws: WebSocket; public data: FirecrawlDocument[]; public status: CrawlStatusResponse["status"]; + public id: string; constructor(id: string, app: FirecrawlApp) { super(); + this.id = id; this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey); this.status = "scraping"; this.data = []; @@ -965,6 +980,7 @@ export class CrawlWatcher extends TypedEventTarget { detail: { status: this.status, data: this.data, + id: this.id, }, })); } else if (msg.type === "error") { @@ -974,6 +990,7 @@ export class CrawlWatcher extends TypedEventTarget { status: this.status, data: this.data, error: msg.error, + id: this.id, }, })); } else if (msg.type === "catchup") { @@ -981,12 +998,18 @@ export class CrawlWatcher extends TypedEventTarget { this.data.push(...(msg.data.data ?? 
[])); for (const doc of this.data) { this.dispatchTypedEvent("document", new CustomEvent("document", { - detail: doc, + detail: { + ...doc, + id: this.id, + }, })); } } else if (msg.type === "document") { this.dispatchTypedEvent("document", new CustomEvent("document", { - detail: msg.data, + detail: { + ...msg.data, + id: this.id, + }, })); } } @@ -996,14 +1019,21 @@ export class CrawlWatcher extends TypedEventTarget { this.ws.close(); return; } - - const msg = JSON.parse(ev.data) as Message; - messageHandler(msg); + try { + const msg = JSON.parse(ev.data) as Message; + messageHandler(msg); + } catch (error) { + console.error("Error on message", error); + } }).bind(this); this.ws.onclose = ((ev: CloseEvent) => { - const msg = JSON.parse(ev.reason) as Message; - messageHandler(msg); + try { + const msg = JSON.parse(ev.reason) as Message; + messageHandler(msg); + } catch (error) { + console.error("Error on close", error); + } }).bind(this); this.ws.onerror = ((_: Event) => { @@ -1013,6 +1043,7 @@ export class CrawlWatcher extends TypedEventTarget { status: this.status, data: this.data, error: "WebSocket error", + id: this.id, }, })); }).bind(this); diff --git a/apps/playwright-service-ts/api.ts b/apps/playwright-service-ts/api.ts index 90a4eb87..eacb35ff 100644 --- a/apps/playwright-service-ts/api.ts +++ b/apps/playwright-service-ts/api.ts @@ -196,7 +196,7 @@ app.post('/scrape', async (req: Request, res: Response) => { } } - const pageError = pageStatusCode !== 200 ? getError(pageStatusCode) : false; + const pageError = pageStatusCode !== 200 ? getError(pageStatusCode) : undefined; if (!pageError) { console.log(`✅ Scrape successful!`); @@ -209,7 +209,7 @@ app.post('/scrape', async (req: Request, res: Response) => { res.json({ content: pageContent, pageStatusCode, - pageError + ...(pageError && { pageError }) }); }); diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 31d68095..5f592c2c 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.6.4" +__version__ = "1.6.8" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py index 8945d74d..50d5306f 100644 --- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py +++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py @@ -29,12 +29,12 @@ def test_scrape_url_invalid_api_key(): invalid_app.scrape_url('https://firecrawl.dev') assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) -def test_blocklisted_url(): - blocklisted_url = "https://facebook.com/fake-test" - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') - with pytest.raises(Exception) as excinfo: - app.scrape_url(blocklisted_url) - assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) +# def test_blocklisted_url(): +# blocklisted_url = "https://facebook.com/fake-test" +# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') +# with pytest.raises(Exception) as excinfo: +# app.scrape_url(blocklisted_url) +# assert "Unexpected error during scrape URL: Status code 403. 
Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) def test_successful_response_with_valid_preview_token(): app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0') @@ -90,12 +90,12 @@ def test_crawl_url_invalid_api_key(): invalid_app.crawl_url('https://firecrawl.dev') assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) -def test_should_return_error_for_blocklisted_url(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') - blocklisted_url = "https://twitter.com/fake-test" - with pytest.raises(Exception) as excinfo: - app.crawl_url(blocklisted_url) - assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) +# def test_should_return_error_for_blocklisted_url(): +# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') +# blocklisted_url = "https://twitter.com/fake-test" +# with pytest.raises(Exception) as excinfo: +# app.crawl_url(blocklisted_url) +# assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) def test_crawl_url_wait_for_completion_e2e(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py index 12fa10ce..0ada6c1d 100644 --- a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py +++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py @@ -30,12 +30,12 @@ def test_scrape_url_invalid_api_key(): invalid_app.scrape_url('https://firecrawl.dev') assert "Unauthorized: Invalid token" in str(excinfo.value) -def test_blocklisted_url(): - blocklisted_url = "https://facebook.com/fake-test" - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) - with pytest.raises(Exception) as excinfo: - app.scrape_url(blocklisted_url) - assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) +# def test_blocklisted_url(): +# blocklisted_url = "https://facebook.com/fake-test" +# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) +# with pytest.raises(Exception) as excinfo: +# app.scrape_url(blocklisted_url) +# assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) def test_successful_response_with_valid_preview_token(): app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") @@ -136,12 +136,12 @@ def test_crawl_url_invalid_api_key(): invalid_app.crawl_url('https://firecrawl.dev') assert "Unauthorized: Invalid token" in str(excinfo.value) -def test_should_return_error_for_blocklisted_url(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) - blocklisted_url = "https://twitter.com/fake-test" - with pytest.raises(Exception) as excinfo: - app.crawl_url(blocklisted_url) - assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." 
in str(excinfo.value) +# def test_should_return_error_for_blocklisted_url(): +# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) +# blocklisted_url = "https://twitter.com/fake-test" +# with pytest.raises(Exception) as excinfo: +# app.crawl_url(blocklisted_url) +# assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) def test_crawl_url_wait_for_completion_e2e(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) @@ -296,12 +296,12 @@ def test_invalid_api_key_on_map(): invalid_app.map_url('https://roastmywebsite.ai') assert "Unauthorized: Invalid token" in str(excinfo.value) -def test_blocklisted_url_on_map(): - app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) - blocklisted_url = "https://facebook.com/fake-test" - with pytest.raises(Exception) as excinfo: - app.map_url(blocklisted_url) - assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) +# def test_blocklisted_url_on_map(): +# app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) +# blocklisted_url = "https://facebook.com/fake-test" +# with pytest.raises(Exception) as excinfo: +# app.map_url(blocklisted_url) +# assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) def test_successful_response_with_valid_preview_token_on_map(): app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 45ed27d8..e4ac2726 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -26,7 +26,7 @@ class FirecrawlApp: """ Parameters for the extract operation. 
""" - prompt: str + prompt: Optional[str] = None schema_: Optional[Any] = pydantic.Field(None, alias='schema') system_prompt: Optional[str] = None allow_external_links: Optional[bool] = False @@ -704,15 +704,15 @@ class CrawlWatcher: async def _handle_message(self, msg: Dict[str, Any]): if msg['type'] == 'done': self.status = 'completed' - self.dispatch_event('done', {'status': self.status, 'data': self.data}) + self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id}) elif msg['type'] == 'error': self.status = 'failed' - self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error']}) + self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id}) elif msg['type'] == 'catchup': self.status = msg['data']['status'] self.data.extend(msg['data'].get('data', [])) for doc in self.data: - self.dispatch_event('document', doc) + self.dispatch_event('document', {'data': doc, 'id': self.id}) elif msg['type'] == 'document': self.data.append(msg['data']) - self.dispatch_event('document', msg['data']) + self.dispatch_event('document', {'data': msg['data'], 'id': self.id}) diff --git a/apps/rust-sdk/tests/e2e_with_auth.rs b/apps/rust-sdk/tests/e2e_with_auth.rs index 75568f92..92b202cb 100644 --- a/apps/rust-sdk/tests/e2e_with_auth.rs +++ b/apps/rust-sdk/tests/e2e_with_auth.rs @@ -5,20 +5,20 @@ use firecrawl::FirecrawlApp; use serde_json::json; use std::env; -#[tokio::test] -async fn test_blocklisted_url() { - dotenv().ok(); - let api_url = env::var("API_URL").unwrap(); - let api_key = env::var("TEST_API_KEY").ok(); - let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap(); - let blocklisted_url = "https://facebook.com/fake-test"; - let result = app.scrape_url(blocklisted_url, None).await; +// #[tokio::test] +// async fn test_blocklisted_url() { +// dotenv().ok(); +// let api_url = env::var("API_URL").unwrap(); +// let api_key = env::var("TEST_API_KEY").ok(); +// let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap(); +// let blocklisted_url = "https://facebook.com/fake-test"; +// let result = app.scrape_url(blocklisted_url, None).await; - assert_matches!( - result, - Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions") - ); -} +// assert_matches!( +// result, +// Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions") +// ); +// } #[tokio::test] async fn test_successful_response_with_valid_preview_token() { @@ -103,20 +103,21 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explici .contains("We present spectrophotometric observations of the Broad Line Radio Galaxy")); } -#[tokio::test] -async fn test_should_return_error_for_blocklisted_url() { - dotenv().ok(); - let api_url = env::var("API_URL").unwrap(); - let api_key = env::var("TEST_API_KEY").ok(); - let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap(); - let blocklisted_url = "https://twitter.com/fake-test"; - let result = app.crawl_url(blocklisted_url, None).await; - assert_matches!( - result, - Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions.") - ); -} +// #[tokio::test] +// async fn test_should_return_error_for_blocklisted_url() { +// dotenv().ok(); +// let api_url = env::var("API_URL").unwrap(); +// let api_key = env::var("TEST_API_KEY").ok(); +// let app = 
FirecrawlApp::new_selfhosted(api_url, api_key).unwrap(); +// let blocklisted_url = "https://twitter.com/fake-test"; +// let result = app.crawl_url(blocklisted_url, None).await; + +// assert_matches!( +// result, +// Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions.") +// ); +// } #[tokio::test] async fn test_llm_extraction() { diff --git a/examples/o1_web_extractor/o1_web_extractor.py b/examples/o1_web_extractor/o1_web_extractor.py new file mode 100644 index 00000000..34857ff9 --- /dev/null +++ b/examples/o1_web_extractor/o1_web_extractor.py @@ -0,0 +1,147 @@ +import os +import json +import requests +from dotenv import load_dotenv +from openai import OpenAI +from serpapi import GoogleSearch + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +# Initialize clients +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") + +def search_google(query): + """Search Google using SerpAPI and return top results.""" + print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}") + search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")}) + return search.get_dict().get("organic_results", []) + +def select_urls_with_o1(company, objective, serp_results): + """ + Use O1 to select the most relevant URLs from SERP results for the given company and objective. + Returns a JSON object with a "selected_urls" property that is an array of strings. + """ + try: + # Prepare the data for O1 + serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")} + for r in serp_results if r.get("link")] + + response = client.chat.completions.create( + model="o1-2024-12-17", + messages=[ + { + "role": "developer", + "content": "You select URLs from the SERP results relevant to the company and objective." + }, + { + "role": "user", + "content": ( + f"Company: {company}\n" + f"Objective: {objective}\n" + f"SERP Results: {json.dumps(serp_data)}\n\n" + "Return a JSON object with a property 'selected_urls' that contains an array " + "of URLs most likely to help meet the objective. Add a /* to the end of the URL if you think it should search all of the pages in the site. Do not return any social media links. 
For example: {\"selected_urls\": [\"https://example.com\", \"https://example2.com\"]}" + ) + } + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "selected_urls_object", + "schema": { + "type": "object", + "properties": { + "selected_urls": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["selected_urls"], + "additionalProperties": False + } + } + } + ) + + # The response is guaranteed to follow the specified JSON schema + result = json.loads(response.choices[0].message.content) + urls = result.get("selected_urls", []) + return urls + + except Exception as e: + print(f"{Colors.RED}Error selecting URLs with O1: {e}{Colors.RESET}") + return [] + + + +def extract_company_info(urls, prompt, company, api_key): + """Use requests to call Firecrawl's extract endpoint with selected URLs.""" + print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}") + + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {api_key}' + } + + payload = { + "urls": urls, + "prompt": prompt + " for " + company + } + + try: + response = requests.post( + "https://api.firecrawl.dev/v1/extract", + headers=headers, + json=payload + ) + response.raise_for_status() + data = response.json() + return data + except Exception as e: + print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}") + return None + +def main(): + company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}") + objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}") + + serp_results = search_google(f"{company}") + if not serp_results: + print(f"{Colors.RED}No search results found.{Colors.RESET}") + return + + # Ask O1 to select URLs + selected_urls = select_urls_with_o1(company, objective, serp_results) + + if not selected_urls: + print(f"{Colors.RED}O1 did not return any URLs.{Colors.RESET}") + return + + print(f"{Colors.CYAN}Selected URLs for extraction by O1:{Colors.RESET}") + for url in selected_urls: + print(f"- {url}") + + data = extract_company_info(selected_urls, objective, company, firecrawl_api_key) + + if data and data.get('success') and data.get('data'): + print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}") + print(json.dumps(data['data'], indent=2)) + else: + print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}") + +if __name__ == "__main__": + main()