fix(crawl-redis): ignore empty includes/excludes (#1223)

* fix(crawl-redis): ignore empty includes/excludes

* fix(snips/scrape): bump timeouts
This commit is contained in:
Gergő Móricz 2025-02-20 19:06:02 +01:00 committed by GitHub
parent 283a3bfef3
commit 16c305775e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 15 additions and 15 deletions

View File

@@ -39,7 +39,7 @@ describe("Scrape tests", () => {
     expect(response.markdown).toBe(
       "this is fake data coming from the mocking system!",
     );
-  }, 10000);
+  }, 30000);

  it.concurrent("works", async () => {
    const response = await scrape({
@@ -47,7 +47,7 @@ describe("Scrape tests", () => {
    });
    expect(response.markdown).toContain("Firecrawl");
-  }, 10000);
+  }, 30000);

  it.concurrent("handles non-UTF-8 encodings", async () => {
    const response = await scrape({
@@ -55,7 +55,7 @@ describe("Scrape tests", () => {
    });
    expect(response.markdown).toContain("ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た");
-  }, 15000);
+  }, 30000);

  if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
    it.concurrent("self-hosted proxy works", async () => {
@@ -64,7 +64,7 @@ describe("Scrape tests", () => {
      });
      expect(response.markdown?.trim()).toBe(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]);
-    });
+    }, 30000);
  }

  if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.PLAYWRIGHT_MICROSERVICE_URL) {
@@ -75,7 +75,7 @@ describe("Scrape tests", () => {
      });
      expect(response.markdown).toContain("Firecrawl");
-    }, 15000);
+    }, 30000);
  }

  describe("JSON scrape support", () => {
@@ -87,7 +87,7 @@ describe("Scrape tests", () => {
      const obj = JSON.parse(response.rawHtml!);
      expect(obj.id).toBe(1);
-    }, 25000); // TODO: mock and shorten
+    }, 30000);
  });

  if (!process.env.TEST_SUITE_SELF_HOSTED) {
@@ -98,7 +98,7 @@ describe("Scrape tests", () => {
      });
      expect(response.markdown).not.toContain(".g.doubleclick.net/");
-    }, 10000);
+    }, 30000);

    it.concurrent("doesn't block ads if explicitly disabled", async () => {
      const response = await scrape({
@@ -107,15 +107,15 @@ describe("Scrape tests", () => {
      });
      expect(response.markdown).toContain(".g.doubleclick.net/");
-    }, 10000);
+    }, 30000);
  });

  describe("Location API (f-e dependant)", () => {
    it.concurrent("works without specifying an explicit location", async () => {
-      const response = await scrape({
+      await scrape({
        url: "https://iplocation.com",
      });
-    }, 10000);
+    }, 30000);

    it.concurrent("works with country US", async () => {
      const response = await scrape({
@@ -124,7 +124,7 @@ describe("Scrape tests", () => {
      });
      expect(response.markdown).toContain("| Country | United States |");
-    }, 10000);
+    }, 30000);
  });

  describe("Screenshot (f-e/sb dependant)", () => {
@@ -152,14 +152,14 @@ describe("Scrape tests", () => {
      await scrape({
        url: "http://firecrawl.dev",
      });
-    }, 15000);
+    }, 30000);

    it.concurrent("basic works", async () => {
      await scrape({
        url: "http://firecrawl.dev",
        proxy: "basic",
      });
-    }, 15000);
+    }, 30000);

    it.concurrent("stealth works", async () => {
      await scrape({

View File

@@ -384,8 +384,8 @@ export function crawlToCrawler(
    jobId: id,
    initialUrl: sc.originUrl!,
    baseUrl: newBase ? new URL(newBase).origin : undefined,
-   includes: sc.crawlerOptions?.includes ?? [],
-   excludes: sc.crawlerOptions?.excludes ?? [],
+   includes: (sc.crawlerOptions?.includes ?? []).filter(x => x.trim().length > 0),
+   excludes: (sc.crawlerOptions?.excludes ?? []).filter(x => x.trim().length > 0),
    maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
    maxCrawledDepth: getAdjustedMaxDepth(
      sc.originUrl!,