From ac5c88bffb84d3b874d150726d1d34bc51660c97 Mon Sep 17 00:00:00 2001
From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Fri, 7 Feb 2025 13:38:08 -0300
Subject: [PATCH] added scrapeOptions to extract (#1133)

---
 .../src/__tests__/e2e_extract/index.test.ts   | 31 +++++++++++++++++++
 apps/api/src/controllers/v1/types.ts          |  1 +
 .../api/src/lib/extract/extraction-service.ts |  3 ++
 3 files changed, 35 insertions(+)

diff --git a/apps/api/src/__tests__/e2e_extract/index.test.ts b/apps/api/src/__tests__/e2e_extract/index.test.ts
index dcb6bb4f..293415ef 100644
--- a/apps/api/src/__tests__/e2e_extract/index.test.ts
+++ b/apps/api/src/__tests__/e2e_extract/index.test.ts
@@ -306,4 +306,35 @@ describe("E2E Tests for Extract API Routes", () => {
     },
     60000,
   );
+
+  it.concurrent(
+    "should extract information with scrapeOptions.waitFor",
+    async () => {
+      const response = await request(TEST_URL)
+        .post("/v1/extract")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          urls: ["https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/"],
+          prompt: "What is the content right after the #content-1 id?",
+          schema: {
+            type: "object",
+            properties: {
+              content: { type: "string" },
+            },
+            required: ["content"],
+          },
+          scrapeOptions: {
+            waitFor: 6000,
+          }
+        });
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(typeof response.body.data).toBe("object");
+      expect(response.body.data?.content).toBeDefined();
+      expect(response.body.data?.content).toBe("Content loaded after 5 seconds!");
+    },
+    60000,
+  );
 });
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index ae58fb5a..80dceb78 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -226,6 +226,7 @@ export const extractV1Options = z
     includeSubdomains: z.boolean().default(true),
     allowExternalLinks: z.boolean().default(false),
     enableWebSearch: z.boolean().default(false),
+    scrapeOptions: scrapeOptions.default({ onlyMainContent: false }).optional(),
     origin: z.string().optional().default("api"),
     urlTrace: z.boolean().default(false),
     timeout: z.number().int().positive().finite().safe().default(60000),
diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts
index 68213c2c..ab4b1aeb 100644
--- a/apps/api/src/lib/extract/extraction-service.ts
+++ b/apps/api/src/lib/extract/extraction-service.ts
@@ -294,6 +294,8 @@ export async function performExtraction(
                 isMultiEntity: true,
               }),
               {
+                ...request.scrapeOptions,
+
                 // Needs to be true for multi-entity to work properly
                 onlyMainContent: true,
               }
@@ -554,6 +556,7 @@ export async function performExtraction(
               url,
               isMultiEntity: false,
             }),
+            request.scrapeOptions
           );
         }
         return docsMap.get(normalizeUrl(url));