mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 01:05:52 +08:00
feat(map): mock support (FIR-1109) (#1213)
* feat(map,fetch): mock support
* feat(snips/map): mock out long-running test
* fix(snips/scrape): use more reliable site for adblock testing
This commit is contained in:
parent
bc5a16d048
commit
da1670b78c
@ -21,7 +21,7 @@ function expectMapToSucceed(response: Awaited<ReturnType<typeof map>>) {
|
||||
}
|
||||
|
||||
describe("Map tests", () => {
|
||||
it("basic map succeeds", async () => {
|
||||
it.concurrent("basic map succeeds", async () => {
|
||||
const response = await map({
|
||||
url: "http://firecrawl.dev",
|
||||
});
|
||||
@ -29,7 +29,7 @@ describe("Map tests", () => {
|
||||
expectMapToSucceed(response);
|
||||
}, 10000);
|
||||
|
||||
it("times out properly", async () => {
|
||||
it.concurrent("times out properly", async () => {
|
||||
const response = await map({
|
||||
url: "http://firecrawl.dev",
|
||||
timeout: 1
|
||||
@ -40,14 +40,15 @@ describe("Map tests", () => {
|
||||
expect(response.body.error).toBe("Request timed out");
|
||||
}, 10000);
|
||||
|
||||
it("handles query parameters correctly", async () => {
|
||||
it.concurrent("handles query parameters correctly", async () => {
|
||||
let response = await map({
|
||||
url: "https://www.hfea.gov.uk",
|
||||
sitemapOnly: true,
|
||||
useMock: "map-query-params",
|
||||
});
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body.success).toBe(true);
|
||||
expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
|
||||
}, 300000);
|
||||
}, 60000);
|
||||
});
|
||||
|
51
apps/api/src/__tests__/snips/mocks/map-query-params.json
Normal file
51
apps/api/src/__tests__/snips/mocks/map-query-params.json
Normal file
File diff suppressed because one or more lines are too long
@ -76,7 +76,7 @@ describe("Scrape tests", () => {
|
||||
describe("Ad blocking (f-e dependant)", () => {
|
||||
it.concurrent("blocks ads by default", async () => {
|
||||
const response = await scrape({
|
||||
url: "https://canyoublockit.com/testing/",
|
||||
url: "https://www.allrecipes.com/recipe/18185/yum/",
|
||||
});
|
||||
|
||||
expect(response.markdown).not.toContain(".g.doubleclick.net/");
|
||||
@ -84,7 +84,7 @@ describe("Scrape tests", () => {
|
||||
|
||||
it.concurrent("doesn't block ads if explicitly disabled", async () => {
|
||||
const response = await scrape({
|
||||
url: "https://canyoublockit.com/testing/",
|
||||
url: "https://www.allrecipes.com/recipe/18185/yum/",
|
||||
blockAds: false,
|
||||
});
|
||||
|
||||
|
@ -32,5 +32,5 @@ describe("Search tests", () => {
|
||||
await search({
|
||||
query: "firecrawl"
|
||||
});
|
||||
}, 15000);
|
||||
}, 60000);
|
||||
});
|
||||
|
@ -55,6 +55,7 @@ export async function getMapResults({
|
||||
includeMetadata = false,
|
||||
allowExternalLinks,
|
||||
abort = new AbortController().signal, // noop
|
||||
mock,
|
||||
}: {
|
||||
url: string;
|
||||
search?: string;
|
||||
@ -68,6 +69,7 @@ export async function getMapResults({
|
||||
includeMetadata?: boolean;
|
||||
allowExternalLinks?: boolean;
|
||||
abort?: AbortSignal;
|
||||
mock?: string;
|
||||
}): Promise<MapResult> {
|
||||
const id = uuidv4();
|
||||
let links: string[] = [url];
|
||||
@ -106,6 +108,7 @@ export async function getMapResults({
|
||||
true,
|
||||
30000,
|
||||
abort,
|
||||
mock,
|
||||
);
|
||||
if (sitemap > 0) {
|
||||
links = links
|
||||
@ -296,6 +299,7 @@ export async function mapController(
|
||||
teamId: req.auth.team_id,
|
||||
plan: req.auth.plan,
|
||||
abort: abort.signal,
|
||||
mock: req.body.useMock,
|
||||
}),
|
||||
...(req.body.timeout !== undefined ? [
|
||||
new Promise((resolve, reject) => setTimeout(() => {
|
||||
|
@ -501,6 +501,7 @@ export const mapRequestSchema = crawlerOptions
|
||||
sitemapOnly: z.boolean().default(false),
|
||||
limit: z.number().min(1).max(5000).default(5000),
|
||||
timeout: z.number().positive().finite().optional(),
|
||||
useMock: z.string().optional(),
|
||||
})
|
||||
.strict(strictMessage);
|
||||
|
||||
|
@ -207,7 +207,8 @@ export class WebCrawler {
|
||||
fromMap: boolean = false,
|
||||
onlySitemap: boolean = false,
|
||||
timeout: number = 120000,
|
||||
abort?: AbortSignal
|
||||
abort?: AbortSignal,
|
||||
mock?: string,
|
||||
): Promise<number> {
|
||||
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
|
||||
method: "tryGetSitemap",
|
||||
@ -263,10 +264,10 @@ export class WebCrawler {
|
||||
try {
|
||||
let count = (await Promise.race([
|
||||
Promise.all([
|
||||
this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler, abort),
|
||||
this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler, abort, mock),
|
||||
...this.robots
|
||||
.getSitemaps()
|
||||
.map((x) => this.tryFetchSitemapLinks(x, _urlsHandler, abort)),
|
||||
.map((x) => this.tryFetchSitemapLinks(x, _urlsHandler, abort, mock)),
|
||||
]).then((results) => results.reduce((a, x) => a + x, 0)),
|
||||
timeoutPromise,
|
||||
])) as number;
|
||||
@ -559,6 +560,7 @@ export class WebCrawler {
|
||||
url: string,
|
||||
urlsHandler: (urls: string[]) => unknown,
|
||||
abort?: AbortSignal,
|
||||
mock?: string,
|
||||
): Promise<number> {
|
||||
const sitemapUrl = url.endsWith(".xml")
|
||||
? url
|
||||
@ -574,6 +576,7 @@ export class WebCrawler {
|
||||
this.jobId,
|
||||
this.sitemapsHit,
|
||||
abort,
|
||||
mock,
|
||||
);
|
||||
} catch (error) {
|
||||
if (error instanceof TimeoutSignal) {
|
||||
@ -621,6 +624,7 @@ export class WebCrawler {
|
||||
this.jobId,
|
||||
this.sitemapsHit,
|
||||
abort,
|
||||
mock,
|
||||
);
|
||||
} catch (error) {
|
||||
if (error instanceof TimeoutSignal) {
|
||||
@ -655,6 +659,7 @@ export class WebCrawler {
|
||||
this.jobId,
|
||||
this.sitemapsHit,
|
||||
abort,
|
||||
mock,
|
||||
);
|
||||
} catch (error) {
|
||||
if (error instanceof TimeoutSignal) {
|
||||
@ -674,6 +679,7 @@ export class WebCrawler {
|
||||
this.jobId,
|
||||
this.sitemapsHit,
|
||||
abort,
|
||||
mock,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -20,6 +20,7 @@ export async function getLinksFromSitemap(
|
||||
crawlId: string,
|
||||
sitemapsHit: Set<string>,
|
||||
abort?: AbortSignal,
|
||||
mock?: string,
|
||||
): Promise<number> {
|
||||
if (sitemapsHit.size >= 20) {
|
||||
return 0;
|
||||
@ -38,7 +39,7 @@ export async function getLinksFromSitemap(
|
||||
const response = await scrapeURL(
|
||||
"sitemap;" + crawlId,
|
||||
sitemapUrl,
|
||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||
scrapeOptions.parse({ formats: ["rawHtml"], useMock: mock }),
|
||||
{
|
||||
forceEngine: [
|
||||
"fetch",
|
||||
@ -95,7 +96,7 @@ export async function getLinksFromSitemap(
|
||||
.map((sitemap) => sitemap.loc[0].trim());
|
||||
|
||||
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
|
||||
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit, abort),
|
||||
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit, abort, mock),
|
||||
);
|
||||
|
||||
const results = await Promise.all(sitemapPromises);
|
||||
@ -120,6 +121,7 @@ export async function getLinksFromSitemap(
|
||||
crawlId,
|
||||
sitemapsHit,
|
||||
abort,
|
||||
mock,
|
||||
),
|
||||
);
|
||||
count += (await Promise.all(sitemapPromises)).reduce(
|
||||
|
@ -7,6 +7,7 @@ import {
|
||||
InsecureConnectionError,
|
||||
makeSecureDispatcher,
|
||||
} from "../utils/safeFetch";
|
||||
import { MockState, saveMock } from "../../lib/mock";
|
||||
|
||||
export async function scrapeURLWithFetch(
|
||||
meta: Meta,
|
||||
@ -14,33 +15,84 @@ export async function scrapeURLWithFetch(
|
||||
): Promise<EngineScrapeResult> {
|
||||
const timeout = timeToRun ?? 300000;
|
||||
|
||||
let response: undici.Response;
|
||||
try {
|
||||
response = await Promise.race([
|
||||
undici.fetch(meta.url, {
|
||||
dispatcher: await makeSecureDispatcher(meta.url),
|
||||
redirect: "follow",
|
||||
headers: meta.options.headers,
|
||||
signal: meta.internalOptions.abort,
|
||||
}),
|
||||
(async () => {
|
||||
await new Promise((resolve) =>
|
||||
setTimeout(() => resolve(null), timeout),
|
||||
const mockOptions = {
|
||||
url: meta.url,
|
||||
|
||||
// irrelevant
|
||||
method: "GET",
|
||||
ignoreResponse: false,
|
||||
ignoreFailure: false,
|
||||
tryCount: 1,
|
||||
};
|
||||
|
||||
let response: {
|
||||
url: string;
|
||||
body: string,
|
||||
status: number;
|
||||
headers: any;
|
||||
};
|
||||
|
||||
if (meta.mock !== null) {
|
||||
const makeRequestTypeId = (
|
||||
request: MockState["requests"][number]["options"],
|
||||
) => request.url + ";" + request.method;
|
||||
|
||||
const thisId = makeRequestTypeId(mockOptions);
|
||||
const matchingMocks = meta.mock.requests
|
||||
.filter((x) => makeRequestTypeId(x.options) === thisId)
|
||||
.sort((a, b) => a.time - b.time);
|
||||
const nextI = meta.mock.tracker[thisId] ?? 0;
|
||||
meta.mock.tracker[thisId] = nextI + 1;
|
||||
|
||||
if (!matchingMocks[nextI]) {
|
||||
throw new Error("Failed to mock request -- no mock targets found.");
|
||||
}
|
||||
|
||||
response = {
|
||||
...matchingMocks[nextI].result,
|
||||
};
|
||||
} else {
|
||||
try {
|
||||
const x = await Promise.race([
|
||||
undici.fetch(meta.url, {
|
||||
dispatcher: await makeSecureDispatcher(meta.url),
|
||||
redirect: "follow",
|
||||
headers: meta.options.headers,
|
||||
signal: meta.internalOptions.abort,
|
||||
}),
|
||||
(async () => {
|
||||
await new Promise((resolve) =>
|
||||
setTimeout(() => resolve(null), timeout),
|
||||
);
|
||||
throw new TimeoutError(
|
||||
"Fetch was unable to scrape the page before timing out",
|
||||
{ cause: { timeout } },
|
||||
);
|
||||
})(),
|
||||
]);
|
||||
|
||||
response = {
|
||||
url: x.url,
|
||||
body: await x.text(),
|
||||
status: x.status,
|
||||
headers: [...x.headers],
|
||||
};
|
||||
|
||||
if (meta.mock === null) {
|
||||
await saveMock(
|
||||
mockOptions,
|
||||
response,
|
||||
);
|
||||
throw new TimeoutError(
|
||||
"Fetch was unable to scrape the page before timing out",
|
||||
{ cause: { timeout } },
|
||||
);
|
||||
})(),
|
||||
]);
|
||||
} catch (error) {
|
||||
if (
|
||||
error instanceof TypeError &&
|
||||
error.cause instanceof InsecureConnectionError
|
||||
) {
|
||||
throw error.cause;
|
||||
} else {
|
||||
throw error;
|
||||
}
|
||||
} catch (error) {
|
||||
if (
|
||||
error instanceof TypeError &&
|
||||
error.cause instanceof InsecureConnectionError
|
||||
) {
|
||||
throw error.cause;
|
||||
} else {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -51,8 +103,7 @@ export async function scrapeURLWithFetch(
|
||||
|
||||
return {
|
||||
url: response.url,
|
||||
html: await response.text(),
|
||||
html: response.body,
|
||||
statusCode: response.status,
|
||||
// TODO: error?
|
||||
};
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user