From 5733b82e9ddef79ebc1d4176b02416ef64e23cd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 29 Jan 2025 08:23:36 +0100 Subject: [PATCH] fix(scrapeURL/fire-engine): default to separate US-generic proxy list if no location is specified (FIR-728) (#1104) * feat(location/country): default to us-generic * add tests + fix mock --- .../snips/mocks/mocking-works-properly.json | 12 +++++------ apps/api/src/__tests__/snips/scrape.test.ts | 20 +++++++++++++++++++ apps/api/src/controllers/v1/types.ts | 6 +++--- apps/api/src/scraper/scrapeURL/lib/fetch.ts | 8 ++++++-- apps/api/src/scraper/scrapeURL/lib/mock.ts | 2 +- 5 files changed, 36 insertions(+), 12 deletions(-) diff --git a/apps/api/src/__tests__/snips/mocks/mocking-works-properly.json b/apps/api/src/__tests__/snips/mocks/mocking-works-properly.json index 5609e6c2..6cba173f 100644 --- a/apps/api/src/__tests__/snips/mocks/mocking-works-properly.json +++ b/apps/api/src/__tests__/snips/mocks/mocking-works-properly.json @@ -2,7 +2,7 @@ { "time": 1735911273239, "options": { - "url": "http://default-fire-engine-api-service:8080/scrape", + "url": "/scrape", "method": "POST", "body": { "url": "http://firecrawl.dev", @@ -27,7 +27,7 @@ { "time": 1735911273354, "options": { - "url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", + "url": "/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", "method": "GET", "headers": {}, "ignoreResponse": false, @@ -43,7 +43,7 @@ { "time": 1735911273720, "options": { - "url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", + "url": "/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", "method": "GET", "headers": {}, "ignoreResponse": false, @@ -59,7 +59,7 @@ { "time": 1735911274092, "options": { - "url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", + "url": "/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", "method": "GET", "headers": {}, "ignoreResponse": false, @@ -75,7 +75,7 @@ { "time": 1735911274467, "options": { - "url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", + "url": "/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", "method": "GET", "headers": {}, "ignoreResponse": false, @@ -91,7 +91,7 @@ { "time": 1735911274947, "options": { - "url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", + "url": "/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", "method": "GET", "headers": {}, "ignoreResponse": false, diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index fd28c25d..8a570ec6 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -35,4 +35,24 @@ describe("Scrape tests", () => { "this is fake data coming from the mocking system!", ); }); + + describe("Location API", () => { + it.concurrent("works without specifying an explicit location", async () => { + const response = await scrape({ + url: "https://iplocation.com", + }); + + expectScrapeToSucceed(response); + }); + + it.concurrent("works with country US", async () => { + const response = await scrape({ + url: "https://iplocation.com", + location: { country: "US" }, + }); + + expectScrapeToSucceed(response); + expect(response.body.data.markdown).toContain("| Country | United States |"); + }); + }) }); diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index cdf2a2d4..6ad9288b 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -154,13 +154,13 @@ export const scrapeOptions = z .string() .optional() .refine( - (val) => !val || Object.keys(countries).includes(val.toUpperCase()), + (val) => !val || Object.keys(countries).includes(val.toUpperCase()) || val === "US-generic", { message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.", }, ) - .transform((val) => (val ? val.toUpperCase() : "US")), + .transform((val) => (val ? val.toUpperCase() : "US-generic")), languages: z.string().array().optional(), }) .optional(), @@ -178,7 +178,7 @@ export const scrapeOptions = z "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.", }, ) - .transform((val) => (val ? val.toUpperCase() : "US")), + .transform((val) => (val ? val.toUpperCase() : "US-generic")), languages: z.string().array().optional(), }) .optional(), diff --git a/apps/api/src/scraper/scrapeURL/lib/fetch.ts b/apps/api/src/scraper/scrapeURL/lib/fetch.ts index 73b8f9be..c9513bfe 100644 --- a/apps/api/src/scraper/scrapeURL/lib/fetch.ts +++ b/apps/api/src/scraper/scrapeURL/lib/fetch.ts @@ -126,10 +126,14 @@ export async function robustFetch< const makeRequestTypeId = ( request: (typeof mock)["requests"][number]["options"], ) => { - let out = request.url + ";" + request.method; + let trueUrl = (process.env.FIRE_ENGINE_BETA_URL && request.url.startsWith(process.env.FIRE_ENGINE_BETA_URL)) + ? request.url.replace(process.env.FIRE_ENGINE_BETA_URL, "") + : request.url; + + let out = trueUrl + ";" + request.method; if ( process.env.FIRE_ENGINE_BETA_URL && - url.startsWith(process.env.FIRE_ENGINE_BETA_URL) && + (trueUrl.startsWith("")) && request.method === "POST" ) { out += "f-e;" + request.body?.engine + ";" + request.body?.url; diff --git a/apps/api/src/scraper/scrapeURL/lib/mock.ts b/apps/api/src/scraper/scrapeURL/lib/mock.ts index e57256d6..853e3a4f 100644 --- a/apps/api/src/scraper/scrapeURL/lib/mock.ts +++ b/apps/api/src/scraper/scrapeURL/lib/mock.ts @@ -3,7 +3,7 @@ import * as path from "path"; import { logger as _logger } from "../../../lib/logger"; import { Logger } from "winston"; const saveMocksDirPath = path.join(__dirname, "../mocks/").replace("dist/", ""); -const loadMocksDirPath = path.join(__dirname, "../../../__tests__/snips/mocks"); +const loadMocksDirPath = path.join(__dirname, "../../../__tests__/snips/mocks").replace("dist/", ""); export async function saveMock(options: unknown, result: unknown) { if (process.env.FIRECRAWL_SAVE_MOCKS !== "true") return;