feat(map): mock support (FIR-1109) (#1213)

* feat(map,fetch): mock support

* feat(snips/map): mock out long-running test

* fix(snips/scrape): use more reliable site for adblock testing
This commit is contained in:
Gergő Móricz 2025-02-20 10:41:43 +01:00 committed by GitHub
parent bc5a16d048
commit da1670b78c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 156 additions and 40 deletions

View File

@@ -21,7 +21,7 @@ function expectMapToSucceed(response: Awaited<ReturnType<typeof map>>) {
}
describe("Map tests", () => {
it("basic map succeeds", async () => {
it.concurrent("basic map succeeds", async () => {
const response = await map({
url: "http://firecrawl.dev",
});
@@ -29,7 +29,7 @@ describe("Map tests", () => {
expectMapToSucceed(response);
}, 10000);
it("times out properly", async () => {
it.concurrent("times out properly", async () => {
const response = await map({
url: "http://firecrawl.dev",
timeout: 1
@@ -40,14 +40,15 @@ describe("Map tests", () => {
expect(response.body.error).toBe("Request timed out");
}, 10000);
it("handles query parameters correctly", async () => {
it.concurrent("handles query parameters correctly", async () => {
let response = await map({
url: "https://www.hfea.gov.uk",
sitemapOnly: true,
useMock: "map-query-params",
});
expect(response.statusCode).toBe(200);
expect(response.body.success).toBe(true);
expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
}, 300000);
}, 60000);
});

File diff suppressed because one or more lines are too long

View File

@@ -76,7 +76,7 @@ describe("Scrape tests", () => {
describe("Ad blocking (f-e dependant)", () => {
it.concurrent("blocks ads by default", async () => {
const response = await scrape({
url: "https://canyoublockit.com/testing/",
url: "https://www.allrecipes.com/recipe/18185/yum/",
});
expect(response.markdown).not.toContain(".g.doubleclick.net/");
@@ -84,7 +84,7 @@ describe("Scrape tests", () => {
it.concurrent("doesn't block ads if explicitly disabled", async () => {
const response = await scrape({
url: "https://canyoublockit.com/testing/",
url: "https://www.allrecipes.com/recipe/18185/yum/",
blockAds: false,
});

View File

@@ -32,5 +32,5 @@ describe("Search tests", () => {
await search({
query: "firecrawl"
});
}, 15000);
}, 60000);
});

View File

@@ -55,6 +55,7 @@ export async function getMapResults({
includeMetadata = false,
allowExternalLinks,
abort = new AbortController().signal, // noop
mock,
}: {
url: string;
search?: string;
@@ -68,6 +69,7 @@ export async function getMapResults({
includeMetadata?: boolean;
allowExternalLinks?: boolean;
abort?: AbortSignal;
mock?: string;
}): Promise<MapResult> {
const id = uuidv4();
let links: string[] = [url];
@@ -106,6 +108,7 @@ export async function getMapResults({
true,
30000,
abort,
mock,
);
if (sitemap > 0) {
links = links
@@ -296,6 +299,7 @@ export async function mapController(
teamId: req.auth.team_id,
plan: req.auth.plan,
abort: abort.signal,
mock: req.body.useMock,
}),
...(req.body.timeout !== undefined ? [
new Promise((resolve, reject) => setTimeout(() => {

View File

@@ -501,6 +501,7 @@ export const mapRequestSchema = crawlerOptions
sitemapOnly: z.boolean().default(false),
limit: z.number().min(1).max(5000).default(5000),
timeout: z.number().positive().finite().optional(),
useMock: z.string().optional(),
})
.strict(strictMessage);

View File

@@ -207,7 +207,8 @@ export class WebCrawler {
fromMap: boolean = false,
onlySitemap: boolean = false,
timeout: number = 120000,
abort?: AbortSignal
abort?: AbortSignal,
mock?: string,
): Promise<number> {
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
method: "tryGetSitemap",
@@ -263,10 +264,10 @@
try {
let count = (await Promise.race([
Promise.all([
this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler, abort),
this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler, abort, mock),
...this.robots
.getSitemaps()
.map((x) => this.tryFetchSitemapLinks(x, _urlsHandler, abort)),
.map((x) => this.tryFetchSitemapLinks(x, _urlsHandler, abort, mock)),
]).then((results) => results.reduce((a, x) => a + x, 0)),
timeoutPromise,
])) as number;
@@ -559,6 +560,7 @@
url: string,
urlsHandler: (urls: string[]) => unknown,
abort?: AbortSignal,
mock?: string,
): Promise<number> {
const sitemapUrl = url.endsWith(".xml")
? url
@@ -574,6 +576,7 @@
this.jobId,
this.sitemapsHit,
abort,
mock,
);
} catch (error) {
if (error instanceof TimeoutSignal) {
@@ -621,6 +624,7 @@
this.jobId,
this.sitemapsHit,
abort,
mock,
);
} catch (error) {
if (error instanceof TimeoutSignal) {
@@ -655,6 +659,7 @@
this.jobId,
this.sitemapsHit,
abort,
mock,
);
} catch (error) {
if (error instanceof TimeoutSignal) {
@@ -674,6 +679,7 @@
this.jobId,
this.sitemapsHit,
abort,
mock,
);
}
}

View File

@@ -20,6 +20,7 @@ export async function getLinksFromSitemap(
crawlId: string,
sitemapsHit: Set<string>,
abort?: AbortSignal,
mock?: string,
): Promise<number> {
if (sitemapsHit.size >= 20) {
return 0;
@@ -38,7 +39,7 @@ export async function getLinksFromSitemap(
const response = await scrapeURL(
"sitemap;" + crawlId,
sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }),
scrapeOptions.parse({ formats: ["rawHtml"], useMock: mock }),
{
forceEngine: [
"fetch",
@@ -95,7 +96,7 @@ export async function getLinksFromSitemap(
.map((sitemap) => sitemap.loc[0].trim());
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit, abort),
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit, abort, mock),
);
const results = await Promise.all(sitemapPromises);
@@ -120,6 +121,7 @@ export async function getLinksFromSitemap(
crawlId,
sitemapsHit,
abort,
mock,
),
);
count += (await Promise.all(sitemapPromises)).reduce(

View File

@@ -7,6 +7,7 @@ import {
InsecureConnectionError,
makeSecureDispatcher,
} from "../utils/safeFetch";
import { MockState, saveMock } from "../../lib/mock";
export async function scrapeURLWithFetch(
meta: Meta,
@@ -14,33 +15,84 @@
): Promise<EngineScrapeResult> {
const timeout = timeToRun ?? 300000;
let response: undici.Response;
try {
response = await Promise.race([
undici.fetch(meta.url, {
dispatcher: await makeSecureDispatcher(meta.url),
redirect: "follow",
headers: meta.options.headers,
signal: meta.internalOptions.abort,
}),
(async () => {
await new Promise((resolve) =>
setTimeout(() => resolve(null), timeout),
const mockOptions = {
url: meta.url,
// irrelevant
method: "GET",
ignoreResponse: false,
ignoreFailure: false,
tryCount: 1,
};
let response: {
url: string;
body: string,
status: number;
headers: any;
};
if (meta.mock !== null) {
const makeRequestTypeId = (
request: MockState["requests"][number]["options"],
) => request.url + ";" + request.method;
const thisId = makeRequestTypeId(mockOptions);
const matchingMocks = meta.mock.requests
.filter((x) => makeRequestTypeId(x.options) === thisId)
.sort((a, b) => a.time - b.time);
const nextI = meta.mock.tracker[thisId] ?? 0;
meta.mock.tracker[thisId] = nextI + 1;
if (!matchingMocks[nextI]) {
throw new Error("Failed to mock request -- no mock targets found.");
}
response = {
...matchingMocks[nextI].result,
};
} else {
try {
const x = await Promise.race([
undici.fetch(meta.url, {
dispatcher: await makeSecureDispatcher(meta.url),
redirect: "follow",
headers: meta.options.headers,
signal: meta.internalOptions.abort,
}),
(async () => {
await new Promise((resolve) =>
setTimeout(() => resolve(null), timeout),
);
throw new TimeoutError(
"Fetch was unable to scrape the page before timing out",
{ cause: { timeout } },
);
})(),
]);
response = {
url: x.url,
body: await x.text(),
status: x.status,
headers: [...x.headers],
};
if (meta.mock === null) {
await saveMock(
mockOptions,
response,
);
throw new TimeoutError(
"Fetch was unable to scrape the page before timing out",
{ cause: { timeout } },
);
})(),
]);
} catch (error) {
if (
error instanceof TypeError &&
error.cause instanceof InsecureConnectionError
) {
throw error.cause;
} else {
throw error;
}
} catch (error) {
if (
error instanceof TypeError &&
error.cause instanceof InsecureConnectionError
) {
throw error.cause;
} else {
throw error;
}
}
}
@@ -51,8 +103,7 @@ export async function scrapeURLWithFetch(
return {
url: response.url,
html: await response.text(),
html: response.body,
statusCode: response.status,
// TODO: error?
};
}