From 16c305775e34ae632e7ecc95f954d024d3fcc6da Mon Sep 17 00:00:00 2001
From: Gergő Móricz
Date: Thu, 20 Feb 2025 19:06:02 +0100
Subject: [PATCH] fix(crawl-redis): ignore empty includes/excludes (#1223)

* fix(crawl-redis): ignore empty includes/excludes

* fix(snips/scrape): bump timeouts
---
 apps/api/src/__tests__/snips/scrape.test.ts | 26 +++++++++++++-------------
 apps/api/src/lib/crawl-redis.ts             |  4 ++--
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts
index 5d9ca5c9..18f98330 100644
--- a/apps/api/src/__tests__/snips/scrape.test.ts
+++ b/apps/api/src/__tests__/snips/scrape.test.ts
@@ -39,7 +39,7 @@ describe("Scrape tests", () => {
     expect(response.markdown).toBe(
       "this is fake data coming from the mocking system!",
     );
-  }, 10000);
+  }, 30000);

   it.concurrent("works", async () => {
     const response = await scrape({
@@ -47,7 +47,7 @@ describe("Scrape tests", () => {
     });

     expect(response.markdown).toContain("Firecrawl");
-  }, 10000);
+  }, 30000);

   it.concurrent("handles non-UTF-8 encodings", async () => {
     const response = await scrape({
@@ -55,7 +55,7 @@ describe("Scrape tests", () => {
     });

     expect(response.markdown).toContain("ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た");
-  }, 15000);
+  }, 30000);

   if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
     it.concurrent("self-hosted proxy works", async () => {
@@ -64,7 +64,7 @@ describe("Scrape tests", () => {
       });

       expect(response.markdown?.trim()).toBe(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]);
-    });
+    }, 30000);
   }

   if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.PLAYWRIGHT_MICROSERVICE_URL) {
@@ -75,7 +75,7 @@ describe("Scrape tests", () => {
       });

       expect(response.markdown).toContain("Firecrawl");
-    }, 15000);
+    }, 30000);
   }

   describe("JSON scrape support", () => {
@@ -87,7 +87,7 @@ describe("Scrape tests", () => {

       const obj = JSON.parse(response.rawHtml!);
       expect(obj.id).toBe(1);
-    }, 25000); // TODO: mock and shorten
+    }, 30000);
   });

   if (!process.env.TEST_SUITE_SELF_HOSTED) {
@@ -98,7 +98,7 @@ describe("Scrape tests", () => {
       });

       expect(response.markdown).not.toContain(".g.doubleclick.net/");
-    }, 10000);
+    }, 30000);

     it.concurrent("doesn't block ads if explicitly disabled", async () => {
       const response = await scrape({
@@ -107,15 +107,15 @@ describe("Scrape tests", () => {
       });

       expect(response.markdown).toContain(".g.doubleclick.net/");
-    }, 10000);
+    }, 30000);
   });

   describe("Location API (f-e dependant)", () => {
     it.concurrent("works without specifying an explicit location", async () => {
-      const response = await scrape({
+      await scrape({
         url: "https://iplocation.com",
       });
-    }, 10000);
+    }, 30000);

     it.concurrent("works with country US", async () => {
       const response = await scrape({
@@ -124,7 +124,7 @@ describe("Scrape tests", () => {
       });

       expect(response.markdown).toContain("| Country | United States |");
-    }, 10000);
+    }, 30000);
   });

   describe("Screenshot (f-e/sb dependant)", () => {
@@ -152,14 +152,14 @@ describe("Scrape tests", () => {
       await scrape({
         url: "http://firecrawl.dev",
       });
-    }, 15000);
+    }, 30000);

     it.concurrent("basic works", async () => {
       await scrape({
         url: "http://firecrawl.dev",
         proxy: "basic",
       });
-    }, 15000);
+    }, 30000);

     it.concurrent("stealth works", async () => {
       await scrape({
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index eaee3491..526ba235 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -384,8 +384,8 @@ export function crawlToCrawler(
     jobId: id,
     initialUrl: sc.originUrl!,
     baseUrl: newBase ? new URL(newBase).origin : undefined,
-    includes: sc.crawlerOptions?.includes ?? [],
-    excludes: sc.crawlerOptions?.excludes ?? [],
+    includes: (sc.crawlerOptions?.includes ?? []).filter(x => x.trim().length > 0),
+    excludes: (sc.crawlerOptions?.excludes ?? []).filter(x => x.trim().length > 0),
     maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
     maxCrawledDepth: getAdjustedMaxDepth(
       sc.originUrl!,
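
Note on why the crawl-redis change matters (this commentary is not part of the
patch): user-supplied includes/excludes arrays can contain empty strings, and
if the crawler compiles each entry with JavaScript's RegExp, as URL
include/exclude matching commonly does, an empty pattern matches every URL.
Below is a minimal TypeScript sketch of the failure mode and of the filter the
patch applies; the variable names are illustrative, not Firecrawl's actual code.

    // An empty pattern compiles to a regex that matches ANY string.
    const url = "https://firecrawl.dev/blog/post";
    console.log(new RegExp("").test(url)); // true

    // A stray "" in excludes would therefore block every URL of the crawl,
    // and a stray "" in includes would make include filtering a no-op.
    const excludes = ["", "/admin/.*"];
    const blocked = excludes.some((p) => new RegExp(p).test(url));
    console.log(blocked); // true, even though no real pattern matches

    // The patched crawlToCrawler drops empty or whitespace-only entries
    // before they reach the crawler, so only real patterns participate.
    const cleaned = excludes.filter((x) => x.trim().length > 0);
    const blockedAfterFix = cleaned.some((p) => new RegExp(p).test(url));
    console.log(blockedAfterFix); // false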