mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-15 00:15:52 +08:00
feat(map): mock support (FIR-1109) (#1213)
* feat(map,fetch): mock support
* feat(snips/map): mock out long-running test
* fix(snips/scrape): use more reliable site for adblock testing
This commit is contained in:
parent
bc5a16d048
commit
da1670b78c
@ -21,7 +21,7 @@ function expectMapToSucceed(response: Awaited<ReturnType<typeof map>>) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
describe("Map tests", () => {
|
describe("Map tests", () => {
|
||||||
it("basic map succeeds", async () => {
|
it.concurrent("basic map succeeds", async () => {
|
||||||
const response = await map({
|
const response = await map({
|
||||||
url: "http://firecrawl.dev",
|
url: "http://firecrawl.dev",
|
||||||
});
|
});
|
||||||
@ -29,7 +29,7 @@ describe("Map tests", () => {
|
|||||||
expectMapToSucceed(response);
|
expectMapToSucceed(response);
|
||||||
}, 10000);
|
}, 10000);
|
||||||
|
|
||||||
it("times out properly", async () => {
|
it.concurrent("times out properly", async () => {
|
||||||
const response = await map({
|
const response = await map({
|
||||||
url: "http://firecrawl.dev",
|
url: "http://firecrawl.dev",
|
||||||
timeout: 1
|
timeout: 1
|
||||||
@ -40,14 +40,15 @@ describe("Map tests", () => {
|
|||||||
expect(response.body.error).toBe("Request timed out");
|
expect(response.body.error).toBe("Request timed out");
|
||||||
}, 10000);
|
}, 10000);
|
||||||
|
|
||||||
it("handles query parameters correctly", async () => {
|
it.concurrent("handles query parameters correctly", async () => {
|
||||||
let response = await map({
|
let response = await map({
|
||||||
url: "https://www.hfea.gov.uk",
|
url: "https://www.hfea.gov.uk",
|
||||||
sitemapOnly: true,
|
sitemapOnly: true,
|
||||||
|
useMock: "map-query-params",
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body.success).toBe(true);
|
expect(response.body.success).toBe(true);
|
||||||
expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
|
expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
|
||||||
}, 300000);
|
}, 60000);
|
||||||
});
|
});
|
||||||
|
51
apps/api/src/__tests__/snips/mocks/map-query-params.json
Normal file
51
apps/api/src/__tests__/snips/mocks/map-query-params.json
Normal file
File diff suppressed because one or more lines are too long
@ -76,7 +76,7 @@ describe("Scrape tests", () => {
|
|||||||
describe("Ad blocking (f-e dependant)", () => {
|
describe("Ad blocking (f-e dependant)", () => {
|
||||||
it.concurrent("blocks ads by default", async () => {
|
it.concurrent("blocks ads by default", async () => {
|
||||||
const response = await scrape({
|
const response = await scrape({
|
||||||
url: "https://canyoublockit.com/testing/",
|
url: "https://www.allrecipes.com/recipe/18185/yum/",
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(response.markdown).not.toContain(".g.doubleclick.net/");
|
expect(response.markdown).not.toContain(".g.doubleclick.net/");
|
||||||
@ -84,7 +84,7 @@ describe("Scrape tests", () => {
|
|||||||
|
|
||||||
it.concurrent("doesn't block ads if explicitly disabled", async () => {
|
it.concurrent("doesn't block ads if explicitly disabled", async () => {
|
||||||
const response = await scrape({
|
const response = await scrape({
|
||||||
url: "https://canyoublockit.com/testing/",
|
url: "https://www.allrecipes.com/recipe/18185/yum/",
|
||||||
blockAds: false,
|
blockAds: false,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -32,5 +32,5 @@ describe("Search tests", () => {
|
|||||||
await search({
|
await search({
|
||||||
query: "firecrawl"
|
query: "firecrawl"
|
||||||
});
|
});
|
||||||
}, 15000);
|
}, 60000);
|
||||||
});
|
});
|
||||||
|
@ -55,6 +55,7 @@ export async function getMapResults({
|
|||||||
includeMetadata = false,
|
includeMetadata = false,
|
||||||
allowExternalLinks,
|
allowExternalLinks,
|
||||||
abort = new AbortController().signal, // noop
|
abort = new AbortController().signal, // noop
|
||||||
|
mock,
|
||||||
}: {
|
}: {
|
||||||
url: string;
|
url: string;
|
||||||
search?: string;
|
search?: string;
|
||||||
@ -68,6 +69,7 @@ export async function getMapResults({
|
|||||||
includeMetadata?: boolean;
|
includeMetadata?: boolean;
|
||||||
allowExternalLinks?: boolean;
|
allowExternalLinks?: boolean;
|
||||||
abort?: AbortSignal;
|
abort?: AbortSignal;
|
||||||
|
mock?: string;
|
||||||
}): Promise<MapResult> {
|
}): Promise<MapResult> {
|
||||||
const id = uuidv4();
|
const id = uuidv4();
|
||||||
let links: string[] = [url];
|
let links: string[] = [url];
|
||||||
@ -106,6 +108,7 @@ export async function getMapResults({
|
|||||||
true,
|
true,
|
||||||
30000,
|
30000,
|
||||||
abort,
|
abort,
|
||||||
|
mock,
|
||||||
);
|
);
|
||||||
if (sitemap > 0) {
|
if (sitemap > 0) {
|
||||||
links = links
|
links = links
|
||||||
@ -296,6 +299,7 @@ export async function mapController(
|
|||||||
teamId: req.auth.team_id,
|
teamId: req.auth.team_id,
|
||||||
plan: req.auth.plan,
|
plan: req.auth.plan,
|
||||||
abort: abort.signal,
|
abort: abort.signal,
|
||||||
|
mock: req.body.useMock,
|
||||||
}),
|
}),
|
||||||
...(req.body.timeout !== undefined ? [
|
...(req.body.timeout !== undefined ? [
|
||||||
new Promise((resolve, reject) => setTimeout(() => {
|
new Promise((resolve, reject) => setTimeout(() => {
|
||||||
|
@ -501,6 +501,7 @@ export const mapRequestSchema = crawlerOptions
|
|||||||
sitemapOnly: z.boolean().default(false),
|
sitemapOnly: z.boolean().default(false),
|
||||||
limit: z.number().min(1).max(5000).default(5000),
|
limit: z.number().min(1).max(5000).default(5000),
|
||||||
timeout: z.number().positive().finite().optional(),
|
timeout: z.number().positive().finite().optional(),
|
||||||
|
useMock: z.string().optional(),
|
||||||
})
|
})
|
||||||
.strict(strictMessage);
|
.strict(strictMessage);
|
||||||
|
|
||||||
|
@ -207,7 +207,8 @@ export class WebCrawler {
|
|||||||
fromMap: boolean = false,
|
fromMap: boolean = false,
|
||||||
onlySitemap: boolean = false,
|
onlySitemap: boolean = false,
|
||||||
timeout: number = 120000,
|
timeout: number = 120000,
|
||||||
abort?: AbortSignal
|
abort?: AbortSignal,
|
||||||
|
mock?: string,
|
||||||
): Promise<number> {
|
): Promise<number> {
|
||||||
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
|
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
|
||||||
method: "tryGetSitemap",
|
method: "tryGetSitemap",
|
||||||
@ -263,10 +264,10 @@ export class WebCrawler {
|
|||||||
try {
|
try {
|
||||||
let count = (await Promise.race([
|
let count = (await Promise.race([
|
||||||
Promise.all([
|
Promise.all([
|
||||||
this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler, abort),
|
this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler, abort, mock),
|
||||||
...this.robots
|
...this.robots
|
||||||
.getSitemaps()
|
.getSitemaps()
|
||||||
.map((x) => this.tryFetchSitemapLinks(x, _urlsHandler, abort)),
|
.map((x) => this.tryFetchSitemapLinks(x, _urlsHandler, abort, mock)),
|
||||||
]).then((results) => results.reduce((a, x) => a + x, 0)),
|
]).then((results) => results.reduce((a, x) => a + x, 0)),
|
||||||
timeoutPromise,
|
timeoutPromise,
|
||||||
])) as number;
|
])) as number;
|
||||||
@ -559,6 +560,7 @@ export class WebCrawler {
|
|||||||
url: string,
|
url: string,
|
||||||
urlsHandler: (urls: string[]) => unknown,
|
urlsHandler: (urls: string[]) => unknown,
|
||||||
abort?: AbortSignal,
|
abort?: AbortSignal,
|
||||||
|
mock?: string,
|
||||||
): Promise<number> {
|
): Promise<number> {
|
||||||
const sitemapUrl = url.endsWith(".xml")
|
const sitemapUrl = url.endsWith(".xml")
|
||||||
? url
|
? url
|
||||||
@ -574,6 +576,7 @@ export class WebCrawler {
|
|||||||
this.jobId,
|
this.jobId,
|
||||||
this.sitemapsHit,
|
this.sitemapsHit,
|
||||||
abort,
|
abort,
|
||||||
|
mock,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof TimeoutSignal) {
|
if (error instanceof TimeoutSignal) {
|
||||||
@ -621,6 +624,7 @@ export class WebCrawler {
|
|||||||
this.jobId,
|
this.jobId,
|
||||||
this.sitemapsHit,
|
this.sitemapsHit,
|
||||||
abort,
|
abort,
|
||||||
|
mock,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof TimeoutSignal) {
|
if (error instanceof TimeoutSignal) {
|
||||||
@ -655,6 +659,7 @@ export class WebCrawler {
|
|||||||
this.jobId,
|
this.jobId,
|
||||||
this.sitemapsHit,
|
this.sitemapsHit,
|
||||||
abort,
|
abort,
|
||||||
|
mock,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof TimeoutSignal) {
|
if (error instanceof TimeoutSignal) {
|
||||||
@ -674,6 +679,7 @@ export class WebCrawler {
|
|||||||
this.jobId,
|
this.jobId,
|
||||||
this.sitemapsHit,
|
this.sitemapsHit,
|
||||||
abort,
|
abort,
|
||||||
|
mock,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -20,6 +20,7 @@ export async function getLinksFromSitemap(
|
|||||||
crawlId: string,
|
crawlId: string,
|
||||||
sitemapsHit: Set<string>,
|
sitemapsHit: Set<string>,
|
||||||
abort?: AbortSignal,
|
abort?: AbortSignal,
|
||||||
|
mock?: string,
|
||||||
): Promise<number> {
|
): Promise<number> {
|
||||||
if (sitemapsHit.size >= 20) {
|
if (sitemapsHit.size >= 20) {
|
||||||
return 0;
|
return 0;
|
||||||
@ -38,7 +39,7 @@ export async function getLinksFromSitemap(
|
|||||||
const response = await scrapeURL(
|
const response = await scrapeURL(
|
||||||
"sitemap;" + crawlId,
|
"sitemap;" + crawlId,
|
||||||
sitemapUrl,
|
sitemapUrl,
|
||||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
scrapeOptions.parse({ formats: ["rawHtml"], useMock: mock }),
|
||||||
{
|
{
|
||||||
forceEngine: [
|
forceEngine: [
|
||||||
"fetch",
|
"fetch",
|
||||||
@ -95,7 +96,7 @@ export async function getLinksFromSitemap(
|
|||||||
.map((sitemap) => sitemap.loc[0].trim());
|
.map((sitemap) => sitemap.loc[0].trim());
|
||||||
|
|
||||||
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
|
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
|
||||||
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit, abort),
|
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit, abort, mock),
|
||||||
);
|
);
|
||||||
|
|
||||||
const results = await Promise.all(sitemapPromises);
|
const results = await Promise.all(sitemapPromises);
|
||||||
@ -120,6 +121,7 @@ export async function getLinksFromSitemap(
|
|||||||
crawlId,
|
crawlId,
|
||||||
sitemapsHit,
|
sitemapsHit,
|
||||||
abort,
|
abort,
|
||||||
|
mock,
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
count += (await Promise.all(sitemapPromises)).reduce(
|
count += (await Promise.all(sitemapPromises)).reduce(
|
||||||
|
@ -7,6 +7,7 @@ import {
|
|||||||
InsecureConnectionError,
|
InsecureConnectionError,
|
||||||
makeSecureDispatcher,
|
makeSecureDispatcher,
|
||||||
} from "../utils/safeFetch";
|
} from "../utils/safeFetch";
|
||||||
|
import { MockState, saveMock } from "../../lib/mock";
|
||||||
|
|
||||||
export async function scrapeURLWithFetch(
|
export async function scrapeURLWithFetch(
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
@ -14,33 +15,84 @@ export async function scrapeURLWithFetch(
|
|||||||
): Promise<EngineScrapeResult> {
|
): Promise<EngineScrapeResult> {
|
||||||
const timeout = timeToRun ?? 300000;
|
const timeout = timeToRun ?? 300000;
|
||||||
|
|
||||||
let response: undici.Response;
|
const mockOptions = {
|
||||||
try {
|
url: meta.url,
|
||||||
response = await Promise.race([
|
|
||||||
undici.fetch(meta.url, {
|
// irrelevant
|
||||||
dispatcher: await makeSecureDispatcher(meta.url),
|
method: "GET",
|
||||||
redirect: "follow",
|
ignoreResponse: false,
|
||||||
headers: meta.options.headers,
|
ignoreFailure: false,
|
||||||
signal: meta.internalOptions.abort,
|
tryCount: 1,
|
||||||
}),
|
};
|
||||||
(async () => {
|
|
||||||
await new Promise((resolve) =>
|
let response: {
|
||||||
setTimeout(() => resolve(null), timeout),
|
url: string;
|
||||||
|
body: string,
|
||||||
|
status: number;
|
||||||
|
headers: any;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (meta.mock !== null) {
|
||||||
|
const makeRequestTypeId = (
|
||||||
|
request: MockState["requests"][number]["options"],
|
||||||
|
) => request.url + ";" + request.method;
|
||||||
|
|
||||||
|
const thisId = makeRequestTypeId(mockOptions);
|
||||||
|
const matchingMocks = meta.mock.requests
|
||||||
|
.filter((x) => makeRequestTypeId(x.options) === thisId)
|
||||||
|
.sort((a, b) => a.time - b.time);
|
||||||
|
const nextI = meta.mock.tracker[thisId] ?? 0;
|
||||||
|
meta.mock.tracker[thisId] = nextI + 1;
|
||||||
|
|
||||||
|
if (!matchingMocks[nextI]) {
|
||||||
|
throw new Error("Failed to mock request -- no mock targets found.");
|
||||||
|
}
|
||||||
|
|
||||||
|
response = {
|
||||||
|
...matchingMocks[nextI].result,
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
try {
|
||||||
|
const x = await Promise.race([
|
||||||
|
undici.fetch(meta.url, {
|
||||||
|
dispatcher: await makeSecureDispatcher(meta.url),
|
||||||
|
redirect: "follow",
|
||||||
|
headers: meta.options.headers,
|
||||||
|
signal: meta.internalOptions.abort,
|
||||||
|
}),
|
||||||
|
(async () => {
|
||||||
|
await new Promise((resolve) =>
|
||||||
|
setTimeout(() => resolve(null), timeout),
|
||||||
|
);
|
||||||
|
throw new TimeoutError(
|
||||||
|
"Fetch was unable to scrape the page before timing out",
|
||||||
|
{ cause: { timeout } },
|
||||||
|
);
|
||||||
|
})(),
|
||||||
|
]);
|
||||||
|
|
||||||
|
response = {
|
||||||
|
url: x.url,
|
||||||
|
body: await x.text(),
|
||||||
|
status: x.status,
|
||||||
|
headers: [...x.headers],
|
||||||
|
};
|
||||||
|
|
||||||
|
if (meta.mock === null) {
|
||||||
|
await saveMock(
|
||||||
|
mockOptions,
|
||||||
|
response,
|
||||||
);
|
);
|
||||||
throw new TimeoutError(
|
}
|
||||||
"Fetch was unable to scrape the page before timing out",
|
} catch (error) {
|
||||||
{ cause: { timeout } },
|
if (
|
||||||
);
|
error instanceof TypeError &&
|
||||||
})(),
|
error.cause instanceof InsecureConnectionError
|
||||||
]);
|
) {
|
||||||
} catch (error) {
|
throw error.cause;
|
||||||
if (
|
} else {
|
||||||
error instanceof TypeError &&
|
throw error;
|
||||||
error.cause instanceof InsecureConnectionError
|
}
|
||||||
) {
|
|
||||||
throw error.cause;
|
|
||||||
} else {
|
|
||||||
throw error;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -51,8 +103,7 @@ export async function scrapeURLWithFetch(
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
url: response.url,
|
url: response.url,
|
||||||
html: await response.text(),
|
html: response.body,
|
||||||
statusCode: response.status,
|
statusCode: response.status,
|
||||||
// TODO: error?
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user