feat(map): mock support (FIR-1109) (#1213)

* feat(map,fetch): mock support

* feat(snips/map): mock out long-running test

* fix(snips/scrape): use more reliable site for adblock testing
This commit is contained in:
Gergő Móricz 2025-02-20 10:41:43 +01:00 committed by GitHub
parent bc5a16d048
commit da1670b78c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 156 additions and 40 deletions

View File

@ -21,7 +21,7 @@ function expectMapToSucceed(response: Awaited<ReturnType<typeof map>>) {
} }
describe("Map tests", () => { describe("Map tests", () => {
it("basic map succeeds", async () => { it.concurrent("basic map succeeds", async () => {
const response = await map({ const response = await map({
url: "http://firecrawl.dev", url: "http://firecrawl.dev",
}); });
@ -29,7 +29,7 @@ describe("Map tests", () => {
expectMapToSucceed(response); expectMapToSucceed(response);
}, 10000); }, 10000);
it("times out properly", async () => { it.concurrent("times out properly", async () => {
const response = await map({ const response = await map({
url: "http://firecrawl.dev", url: "http://firecrawl.dev",
timeout: 1 timeout: 1
@ -40,14 +40,15 @@ describe("Map tests", () => {
expect(response.body.error).toBe("Request timed out"); expect(response.body.error).toBe("Request timed out");
}, 10000); }, 10000);
it("handles query parameters correctly", async () => { it.concurrent("handles query parameters correctly", async () => {
let response = await map({ let response = await map({
url: "https://www.hfea.gov.uk", url: "https://www.hfea.gov.uk",
sitemapOnly: true, sitemapOnly: true,
useMock: "map-query-params",
}); });
expect(response.statusCode).toBe(200); expect(response.statusCode).toBe(200);
expect(response.body.success).toBe(true); expect(response.body.success).toBe(true);
expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true); expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
}, 300000); }, 60000);
}); });

File diff suppressed because one or more lines are too long

View File

@ -76,7 +76,7 @@ describe("Scrape tests", () => {
describe("Ad blocking (f-e dependant)", () => { describe("Ad blocking (f-e dependant)", () => {
it.concurrent("blocks ads by default", async () => { it.concurrent("blocks ads by default", async () => {
const response = await scrape({ const response = await scrape({
url: "https://canyoublockit.com/testing/", url: "https://www.allrecipes.com/recipe/18185/yum/",
}); });
expect(response.markdown).not.toContain(".g.doubleclick.net/"); expect(response.markdown).not.toContain(".g.doubleclick.net/");
@ -84,7 +84,7 @@ describe("Scrape tests", () => {
it.concurrent("doesn't block ads if explicitly disabled", async () => { it.concurrent("doesn't block ads if explicitly disabled", async () => {
const response = await scrape({ const response = await scrape({
url: "https://canyoublockit.com/testing/", url: "https://www.allrecipes.com/recipe/18185/yum/",
blockAds: false, blockAds: false,
}); });

View File

@ -32,5 +32,5 @@ describe("Search tests", () => {
await search({ await search({
query: "firecrawl" query: "firecrawl"
}); });
}, 15000); }, 60000);
}); });

View File

@ -55,6 +55,7 @@ export async function getMapResults({
includeMetadata = false, includeMetadata = false,
allowExternalLinks, allowExternalLinks,
abort = new AbortController().signal, // noop abort = new AbortController().signal, // noop
mock,
}: { }: {
url: string; url: string;
search?: string; search?: string;
@ -68,6 +69,7 @@ export async function getMapResults({
includeMetadata?: boolean; includeMetadata?: boolean;
allowExternalLinks?: boolean; allowExternalLinks?: boolean;
abort?: AbortSignal; abort?: AbortSignal;
mock?: string;
}): Promise<MapResult> { }): Promise<MapResult> {
const id = uuidv4(); const id = uuidv4();
let links: string[] = [url]; let links: string[] = [url];
@ -106,6 +108,7 @@ export async function getMapResults({
true, true,
30000, 30000,
abort, abort,
mock,
); );
if (sitemap > 0) { if (sitemap > 0) {
links = links links = links
@ -296,6 +299,7 @@ export async function mapController(
teamId: req.auth.team_id, teamId: req.auth.team_id,
plan: req.auth.plan, plan: req.auth.plan,
abort: abort.signal, abort: abort.signal,
mock: req.body.useMock,
}), }),
...(req.body.timeout !== undefined ? [ ...(req.body.timeout !== undefined ? [
new Promise((resolve, reject) => setTimeout(() => { new Promise((resolve, reject) => setTimeout(() => {

View File

@ -501,6 +501,7 @@ export const mapRequestSchema = crawlerOptions
sitemapOnly: z.boolean().default(false), sitemapOnly: z.boolean().default(false),
limit: z.number().min(1).max(5000).default(5000), limit: z.number().min(1).max(5000).default(5000),
timeout: z.number().positive().finite().optional(), timeout: z.number().positive().finite().optional(),
useMock: z.string().optional(),
}) })
.strict(strictMessage); .strict(strictMessage);

View File

@ -207,7 +207,8 @@ export class WebCrawler {
fromMap: boolean = false, fromMap: boolean = false,
onlySitemap: boolean = false, onlySitemap: boolean = false,
timeout: number = 120000, timeout: number = 120000,
abort?: AbortSignal abort?: AbortSignal,
mock?: string,
): Promise<number> { ): Promise<number> {
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, { this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
method: "tryGetSitemap", method: "tryGetSitemap",
@ -263,10 +264,10 @@ export class WebCrawler {
try { try {
let count = (await Promise.race([ let count = (await Promise.race([
Promise.all([ Promise.all([
this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler, abort), this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler, abort, mock),
...this.robots ...this.robots
.getSitemaps() .getSitemaps()
.map((x) => this.tryFetchSitemapLinks(x, _urlsHandler, abort)), .map((x) => this.tryFetchSitemapLinks(x, _urlsHandler, abort, mock)),
]).then((results) => results.reduce((a, x) => a + x, 0)), ]).then((results) => results.reduce((a, x) => a + x, 0)),
timeoutPromise, timeoutPromise,
])) as number; ])) as number;
@ -559,6 +560,7 @@ export class WebCrawler {
url: string, url: string,
urlsHandler: (urls: string[]) => unknown, urlsHandler: (urls: string[]) => unknown,
abort?: AbortSignal, abort?: AbortSignal,
mock?: string,
): Promise<number> { ): Promise<number> {
const sitemapUrl = url.endsWith(".xml") const sitemapUrl = url.endsWith(".xml")
? url ? url
@ -574,6 +576,7 @@ export class WebCrawler {
this.jobId, this.jobId,
this.sitemapsHit, this.sitemapsHit,
abort, abort,
mock,
); );
} catch (error) { } catch (error) {
if (error instanceof TimeoutSignal) { if (error instanceof TimeoutSignal) {
@ -621,6 +624,7 @@ export class WebCrawler {
this.jobId, this.jobId,
this.sitemapsHit, this.sitemapsHit,
abort, abort,
mock,
); );
} catch (error) { } catch (error) {
if (error instanceof TimeoutSignal) { if (error instanceof TimeoutSignal) {
@ -655,6 +659,7 @@ export class WebCrawler {
this.jobId, this.jobId,
this.sitemapsHit, this.sitemapsHit,
abort, abort,
mock,
); );
} catch (error) { } catch (error) {
if (error instanceof TimeoutSignal) { if (error instanceof TimeoutSignal) {
@ -674,6 +679,7 @@ export class WebCrawler {
this.jobId, this.jobId,
this.sitemapsHit, this.sitemapsHit,
abort, abort,
mock,
); );
} }
} }

View File

@ -20,6 +20,7 @@ export async function getLinksFromSitemap(
crawlId: string, crawlId: string,
sitemapsHit: Set<string>, sitemapsHit: Set<string>,
abort?: AbortSignal, abort?: AbortSignal,
mock?: string,
): Promise<number> { ): Promise<number> {
if (sitemapsHit.size >= 20) { if (sitemapsHit.size >= 20) {
return 0; return 0;
@ -38,7 +39,7 @@ export async function getLinksFromSitemap(
const response = await scrapeURL( const response = await scrapeURL(
"sitemap;" + crawlId, "sitemap;" + crawlId,
sitemapUrl, sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }), scrapeOptions.parse({ formats: ["rawHtml"], useMock: mock }),
{ {
forceEngine: [ forceEngine: [
"fetch", "fetch",
@ -95,7 +96,7 @@ export async function getLinksFromSitemap(
.map((sitemap) => sitemap.loc[0].trim()); .map((sitemap) => sitemap.loc[0].trim());
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) => const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit, abort), getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit, abort, mock),
); );
const results = await Promise.all(sitemapPromises); const results = await Promise.all(sitemapPromises);
@ -120,6 +121,7 @@ export async function getLinksFromSitemap(
crawlId, crawlId,
sitemapsHit, sitemapsHit,
abort, abort,
mock,
), ),
); );
count += (await Promise.all(sitemapPromises)).reduce( count += (await Promise.all(sitemapPromises)).reduce(

View File

@ -7,6 +7,7 @@ import {
InsecureConnectionError, InsecureConnectionError,
makeSecureDispatcher, makeSecureDispatcher,
} from "../utils/safeFetch"; } from "../utils/safeFetch";
import { MockState, saveMock } from "../../lib/mock";
export async function scrapeURLWithFetch( export async function scrapeURLWithFetch(
meta: Meta, meta: Meta,
@ -14,33 +15,84 @@ export async function scrapeURLWithFetch(
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const timeout = timeToRun ?? 300000; const timeout = timeToRun ?? 300000;
let response: undici.Response; const mockOptions = {
try { url: meta.url,
response = await Promise.race([
undici.fetch(meta.url, { // irrelevant
dispatcher: await makeSecureDispatcher(meta.url), method: "GET",
redirect: "follow", ignoreResponse: false,
headers: meta.options.headers, ignoreFailure: false,
signal: meta.internalOptions.abort, tryCount: 1,
}), };
(async () => {
await new Promise((resolve) => let response: {
setTimeout(() => resolve(null), timeout), url: string;
body: string,
status: number;
headers: any;
};
if (meta.mock !== null) {
const makeRequestTypeId = (
request: MockState["requests"][number]["options"],
) => request.url + ";" + request.method;
const thisId = makeRequestTypeId(mockOptions);
const matchingMocks = meta.mock.requests
.filter((x) => makeRequestTypeId(x.options) === thisId)
.sort((a, b) => a.time - b.time);
const nextI = meta.mock.tracker[thisId] ?? 0;
meta.mock.tracker[thisId] = nextI + 1;
if (!matchingMocks[nextI]) {
throw new Error("Failed to mock request -- no mock targets found.");
}
response = {
...matchingMocks[nextI].result,
};
} else {
try {
const x = await Promise.race([
undici.fetch(meta.url, {
dispatcher: await makeSecureDispatcher(meta.url),
redirect: "follow",
headers: meta.options.headers,
signal: meta.internalOptions.abort,
}),
(async () => {
await new Promise((resolve) =>
setTimeout(() => resolve(null), timeout),
);
throw new TimeoutError(
"Fetch was unable to scrape the page before timing out",
{ cause: { timeout } },
);
})(),
]);
response = {
url: x.url,
body: await x.text(),
status: x.status,
headers: [...x.headers],
};
if (meta.mock === null) {
await saveMock(
mockOptions,
response,
); );
throw new TimeoutError( }
"Fetch was unable to scrape the page before timing out", } catch (error) {
{ cause: { timeout } }, if (
); error instanceof TypeError &&
})(), error.cause instanceof InsecureConnectionError
]); ) {
} catch (error) { throw error.cause;
if ( } else {
error instanceof TypeError && throw error;
error.cause instanceof InsecureConnectionError }
) {
throw error.cause;
} else {
throw error;
} }
} }
@ -51,8 +103,7 @@ export async function scrapeURLWithFetch(
return { return {
url: response.url, url: response.url,
html: await response.text(), html: response.body,
statusCode: response.status, statusCode: response.status,
// TODO: error?
}; };
} }