From d09e0603f8afc118f34ceb5ace8124968d7a1c88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 29 Jan 2025 15:03:37 +0100 Subject: [PATCH] feat(scrapeUrl/fire-engine): add blockAds flag (FIR-692) (#1106) * feat(scrapeUrl/fire-engine): add blockAds flag * feat(v1/scrape): blockAds test --- apps/api/src/__tests__/snips/scrape.test.ts | 25 +++++++++++++++++-- apps/api/src/controllers/v1/types.ts | 1 + .../scrapeURL/engines/fire-engine/index.ts | 2 ++ .../scrapeURL/engines/fire-engine/scrape.ts | 1 + apps/js-sdk/firecrawl/src/index.ts | 1 + 5 files changed, 28 insertions(+), 2 deletions(-) diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index 8a570ec6..771a6cc3 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -35,8 +35,29 @@ describe("Scrape tests", () => { "this is fake data coming from the mocking system!", ); }); + + describe("Ad blocking (f-e dependant)", () => { + it.concurrent("blocks ads by default", async () => { + const response = await scrape({ + url: "https://canyoublockit.com/testing/", + }); + + expectScrapeToSucceed(response); + expect(response.body.data.markdown).not.toContain(".g.doubleclick.net/"); + }, 10000); + + it.concurrent("doesn't block ads if explicitly disabled", async () => { + const response = await scrape({ + url: "https://canyoublockit.com/testing/", + blockAds: false, + }); + + expectScrapeToSucceed(response); + expect(response.body.data.markdown).toContain(".g.doubleclick.net/"); + }, 10000); + }); - describe("Location API", () => { + describe("Location API (f-e dependant)", () => { it.concurrent("works without specifying an explicit location", async () => { const response = await scrape({ url: "https://iplocation.com", @@ -54,5 +75,5 @@ describe("Scrape tests", () => { expectScrapeToSucceed(response); expect(response.body.data.markdown).toContain("| Country | United States |"); }); - }) + }); }); diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 6ad9288b..b67acf9c 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -186,6 +186,7 @@ export const scrapeOptions = z removeBase64Images: z.boolean().default(true), fastMode: z.boolean().default(false), useMock: z.string().optional(), + blockAds: z.boolean().default(true), }) .strict(strictMessage); diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 62a50f60..50198ca7 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -197,6 +197,7 @@ export async function scrapeURLWithFireEngineChromeCDP( mobile: meta.options.mobile, timeout, // TODO: better timeout logic disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache, + blockAds: meta.options.blockAds, // TODO: scrollXPaths }; @@ -271,6 +272,7 @@ export async function scrapeURLWithFireEnginePlaywright( fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"), wait: meta.options.waitFor, geolocation: meta.options.geolocation ?? meta.options.location, + blockAds: meta.options.blockAds, timeout, }; diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts index 4248024a..f3eedde4 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts @@ -37,6 +37,7 @@ export type FireEngineScrapeRequestChromeCDP = { blockMedia?: true; // cannot be false mobile?: boolean; disableSmartWaitCache?: boolean; + blockAds?: boolean; // default: true }; export type FireEngineScrapeRequestPlaywright = { diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 3fe7a8b9..603591ac 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -93,6 +93,7 @@ export interface CrawlScrapeOptions { mobile?: boolean; skipTlsVerification?: boolean; removeBase64Images?: boolean; + blockAds?: boolean; } export type Action = {