commit b0534d0767
Merge branch 'main' of https://github.com/mendableai/firecrawl
@@ -39,4 +39,24 @@ describe("Map tests", () => {
     expect(response.body.success).toBe(false);
     expect(response.body.error).toBe("Request timed out");
   }, 10000);
+
+  it("handles query parameters correctly", async () => {
+    let response = await map({
+      url: "https://www.hfea.gov.uk",
+      sitemapOnly: true,
+    });
+
+    expect(response.statusCode).toBe(200);
+    expect(response.body.success).toBe(true);
+    expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
+
+    response = await map({
+      url: "https://www.hfea.gov.uk",
+      ignoreSitemap: false,
+    });
+
+    expect(response.statusCode).toBe(200);
+    expect(response.body.success).toBe(true);
+    expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
+  }, 300000); // TODO: mocks
 });
@@ -34,7 +34,7 @@ describe("Scrape tests", () => {
     expect(response.body.data.markdown).toBe(
       "this is fake data coming from the mocking system!",
     );
-  });
+  }, 10000);

   describe("Ad blocking (f-e dependant)", () => {
     it.concurrent("blocks ads by default", async () => {
@@ -88,5 +88,27 @@ describe("Scrape tests", () => {
       const obj = JSON.parse(response.body.data.rawHtml);
       expect(obj.id).toBe(1);
     }, 25000); // TODO: mock and shorten
+  });
+
+  describe("Screenshot", () => {
+    it.concurrent("screenshot format works", async () => {
+      const response = await scrape({
+        url: "http://firecrawl.dev",
+        formats: ["screenshot"]
+      });
+
+      expectScrapeToSucceed(response);
+      expect(response.body.data.screenshot).toBeTruthy();
+    }, 15000);
+
+    it.concurrent("screenshot@fullPage format works", async () => {
+      const response = await scrape({
+        url: "http://firecrawl.dev",
+        formats: ["screenshot@fullPage"]
+      });
+
+      expectScrapeToSucceed(response);
+      expect(response.body.data.screenshot).toBeTruthy();
+    }, 15000);
   })
 });
@@ -2,27 +2,15 @@ import { Response } from "express";
 import {
   CrawlErrorsResponse,
   CrawlStatusParams,
-  CrawlStatusResponse,
-  ErrorResponse,
   RequestWithAuth,
 } from "./types";
 import {
   getCrawl,
-  getCrawlExpiry,
   getCrawlJobs,
-  getDoneJobsOrdered,
-  getDoneJobsOrderedLength,
-  getThrottledJobs,
-  isCrawlFinished,
 } from "../../lib/crawl-redis";
 import { getScrapeQueue, redisConnection } from "../../services/queue-service";
-import {
-  supabaseGetJobById,
-  supabaseGetJobsById,
-} from "../../lib/supabase-jobs";
 import { configDotenv } from "dotenv";
-import { Job, JobState } from "bullmq";
-import { logger } from "../../lib/logger";
+import { Job } from "bullmq";
 configDotenv();

 export async function getJob(id: string) {
@@ -17,7 +17,6 @@ import {
   getCrawlJobs,
   getDoneJobsOrdered,
   getDoneJobsOrderedLength,
-  getThrottledJobs,
   isCrawlFinished,
   isCrawlFinishedLocked,
 } from "../../lib/crawl-redis";
@@ -25,6 +24,7 @@ import { getScrapeQueue } from "../../services/queue-service";
 import { getJob, getJobs } from "./crawl-status";
 import * as Sentry from "@sentry/node";
 import { Job, JobState } from "bullmq";
+import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit";

 type ErrorMessage = {
   type: "error";
@@ -127,16 +127,16 @@ async function crawlStatusWS(
       async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
     ),
   );
-  const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
-
-  const throttledJobsSet = new Set(throttledJobs);
+  const throttledJobsSet = await getConcurrencyLimitedJobs(req.auth.team_id);

   const validJobStatuses: [string, JobState | "unknown"][] = [];
   const validJobIDs: string[] = [];

   for (const [id, status] of jobStatuses) {
-    if (
-      !throttledJobsSet.has(id) &&
+    if (throttledJobsSet.has(id)) {
+      validJobStatuses.push([id, "prioritized"]);
+      validJobIDs.push(id);
+    } else if (
       status !== "failed" &&
       status !== "unknown"
     ) {
@@ -11,7 +11,6 @@ import {
   getCrawlJobs,
   getDoneJobsOrdered,
   getDoneJobsOrderedLength,
-  getThrottledJobs,
   isCrawlKickoffFinished,
 } from "../../lib/crawl-redis";
 import { getScrapeQueue } from "../../services/queue-service";
@@ -23,6 +22,7 @@ import { configDotenv } from "dotenv";
 import type { Job, JobState } from "bullmq";
 import { logger } from "../../lib/logger";
 import { supabase_service } from "../../services/supabase";
+import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit";
 configDotenv();

 export type PseudoJob<T> = {
@@ -137,16 +137,17 @@ export async function crawlStatusController(
       async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
     ),
   );
-  const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));

-  const throttledJobsSet = new Set(throttledJobs);
+  const throttledJobsSet = await getConcurrencyLimitedJobs(req.auth.team_id);

   const validJobStatuses: [string, JobState | "unknown"][] = [];
   const validJobIDs: string[] = [];

   for (const [id, status] of jobStatuses) {
-    if (
-      !throttledJobsSet.has(id) &&
+    if (throttledJobsSet.has(id)) {
+      validJobStatuses.push([id, "prioritized"]);
+      validJobIDs.push(id);
+    } else if (
       status !== "failed" &&
       status !== "unknown"
     ) {
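Both status controllers now classify concurrency-limited jobs as "prioritized" instead of dropping them from the response. Below is a minimal sketch of that classification step, with the queued-ID set and the [id, state] pairs passed in as plain values; the names and signature are illustrative, not the controllers' actual API.

    // Sketch: classify crawl job IDs for a status response.
    // `queuedIds` stands in for the Set returned by getConcurrencyLimitedJobs;
    // `jobStatuses` mirrors the [id, state] pairs built from getJobState.
    function classifyJobs(
      jobStatuses: [string, string][],
      queuedIds: Set<string>,
    ): { statuses: [string, string][]; ids: string[] } {
      const statuses: [string, string][] = [];
      const ids: string[] = [];

      for (const [id, status] of jobStatuses) {
        if (queuedIds.has(id)) {
          // Still waiting in the team's concurrency queue: report as "prioritized".
          statuses.push([id, "prioritized"]);
          ids.push(id);
        } else if (status !== "failed" && status !== "unknown") {
          // Regular BullMQ states pass through unchanged.
          statuses.push([id, status]);
          ids.push(id);
        }
      }

      return { statuses, ids };
    }

    classifyJobs([["a", "completed"], ["b", "unknown"]], new Set(["b"]));
    // => { statuses: [["a", "completed"], ["b", "prioritized"]], ids: ["a", "b"] }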
@@ -100,6 +100,11 @@ export async function pushConcurrencyLimitedJob(
   );
 }

+export async function getConcurrencyLimitedJobs(
+  team_id: string,
+) {
+  return new Set((await redisConnection.zrange(constructQueueKey(team_id), 0, -1)).map(x => JSON.parse(x).id));
+}

 export async function getConcurrencyQueueJobsCount(team_id: string): Promise<number> {
   const count = await redisConnection.zcard(constructQueueKey(team_id));
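The new getConcurrencyLimitedJobs helper assumes each entry in the team's concurrency queue is a JSON string carrying an id field, stored in a Redis sorted set. A rough standalone sketch of that read path against ioredis follows, using a hypothetical key shape in place of constructQueueKey.

    // Sketch only: local ioredis client and an illustrative key shape.
    import Redis from "ioredis";

    const redis = new Redis(process.env.REDIS_URL ?? "redis://localhost:6379");
    const queueKey = (teamId: string) => `concurrency-limit-queue:${teamId}`; // hypothetical key

    async function demo() {
      // Entries are JSON strings in a sorted set, scored (e.g. by priority or timestamp).
      await redis.zadd(
        queueKey("team-1"),
        Date.now(),
        JSON.stringify({ id: "job-123", url: "https://example.com" }),
      );

      const entries = await redis.zrange(queueKey("team-1"), 0, -1);
      const queuedIds = new Set(entries.map((e) => JSON.parse(e).id));
      console.log(queuedIds.has("job-123")); // true
    }

    demo().finally(() => redis.quit());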
@@ -184,14 +184,6 @@ export async function getCrawlJobCount(id: string): Promise<number> {
   return await redisConnection.scard("crawl:" + id + ":jobs");
 }

-export async function getThrottledJobs(teamId: string): Promise<string[]> {
-  return await redisConnection.zrangebyscore(
-    "concurrency-limiter:" + teamId + ":throttled",
-    Date.now(),
-    Infinity,
-  );
-}
-
 export function normalizeURL(url: string, sc: StoredCrawl): string {
   const urlO = new URL(url);
   if (!sc.crawlerOptions || sc.crawlerOptions.ignoreQueryParameters) {
@@ -147,7 +147,7 @@ export const checkAndUpdateURLForMap = (url: string) => {
   }

   // remove any query params
-  url = url.split("?")[0].trim();
+  // url = url.split("?")[0].trim();

   return { urlObj: typedUrlObj, url: url };
 };
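With the split commented out, checkAndUpdateURLForMap keeps query strings on mapped URLs, which is exactly what the new map test asserts (results/?options=\d+). A small sketch of the behavioral difference, with an illustrative options value:

    // Old vs. new handling of a mapped URL's query string (illustrative values).
    const raw = "https://www.hfea.gov.uk/choose-a-clinic/clinic-search/results/?options=105";

    const stripped = raw.split("?")[0].trim(); // previous behavior: query params removed
    const kept = new URL(raw).toString();      // current behavior: query params preserved

    console.log(stripped); // https://www.hfea.gov.uk/choose-a-clinic/clinic-search/results/
    console.log(kept);     // https://www.hfea.gov.uk/choose-a-clinic/clinic-search/results/?options=105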
@@ -167,6 +167,9 @@ export async function scrapeURLWithFireEngineChromeCDP(
         ]
       : []),

+    // Include specified actions
+    ...(meta.options.actions ?? []),
+
     // Transform screenshot format into an action (unsupported by chrome-cdp)
     ...(meta.options.formats.includes("screenshot") ||
     meta.options.formats.includes("screenshot@fullPage")
@@ -177,9 +180,6 @@ export async function scrapeURLWithFireEngineChromeCDP(
           },
         ]
       : []),
-
-    // Include specified actions
-    ...(meta.options.actions ?? []),
   ];

   const totalWait = actions.reduce(
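Taken together, these two hunks move the ...(meta.options.actions ?? []) spread ahead of the screenshot transform, so user-supplied actions run first and the injected screenshot action is always the last entry of the actions array. A simplified sketch of the resulting composition; the action shapes below are made up for illustration only.

    // Simplified, hypothetical action shapes; only the ordering matters here.
    type Action =
      | { type: "wait"; milliseconds: number }
      | { type: "screenshot"; fullPage: boolean };

    function buildActions(userActions: Action[], formats: string[]): Action[] {
      const wantsScreenshot =
        formats.includes("screenshot") || formats.includes("screenshot@fullPage");
      return [
        ...userActions, // user actions come first
        ...(wantsScreenshot
          ? [{ type: "screenshot", fullPage: formats.includes("screenshot@fullPage") } as Action]
          : []), // injected screenshot action is always last
      ];
    }

    buildActions([{ type: "wait", milliseconds: 500 }], ["screenshot"]);
    // => [{ type: "wait", ... }, { type: "screenshot", fullPage: false }]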
@@ -228,8 +228,10 @@ export async function scrapeURLWithFireEngineChromeCDP(
       "Transforming screenshots from actions into screenshot field",
       { screenshots: response.screenshots },
     );
-    response.screenshot = (response.screenshots ?? [])[0];
-    (response.screenshots ?? []).splice(0, 1);
+    if (response.screenshots) {
+      response.screenshot = response.screenshots.slice(-1, 0)[0];
+      response.screenshots = response.screenshots.slice(0, -1);
+    }
     meta.logger.debug("Screenshot transformation done", {
       screenshots: response.screenshots,
       screenshot: response.screenshot,
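With the screenshot action now guaranteed to be the last one, the intent of the guarded block is to lift the final screenshot into response.screenshot and keep the earlier, action-produced screenshots in response.screenshots. A minimal sketch of that split on a plain array:

    // Sketch: separate the trailing screenshot (from the injected action) from the rest.
    function splitScreenshots(
      screenshots: string[] | undefined,
    ): { screenshot?: string; screenshots?: string[] } {
      if (!screenshots || screenshots.length === 0) {
        return { screenshots };
      }
      return {
        screenshot: screenshots[screenshots.length - 1], // last entry = injected screenshot action
        screenshots: screenshots.slice(0, -1),           // earlier action screenshots are kept
      };
    }

    splitScreenshots(["a.png", "b.png", "c.png"]);
    // => { screenshot: "c.png", screenshots: ["a.png", "b.png"] }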
@@ -3,7 +3,7 @@ import { Meta } from "../..";
 import { EngineScrapeResult } from "..";
 import { specialtyScrapeCheck } from "../utils/specialtyHandler";
 import { AxiosError, type AxiosResponse } from "axios";
-import { EngineError } from "../../error";
+import { EngineError, TimeoutError } from "../../error";

 const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);

@@ -17,7 +17,8 @@ export function scrapeURLWithScrapingBee(
   let response: AxiosResponse<any>;
   const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
   try {
-    response = await client.get({
+    response = await Promise.race<AxiosResponse<any>>([
+      client.get({
       url: meta.url,
       params: {
         timeout,
@@ -33,7 +34,9 @@ export function scrapeURLWithScrapingBee(
       headers: {
         "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
       },
-    });
+      }),
+      new Promise((_, reject) => setTimeout(() => reject(new TimeoutError("ScrapingBee timed out")), timeout + 5000)),
+    ]);
   } catch (error) {
     if (error instanceof AxiosError && error.response !== undefined) {
       response = error.response;
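The ScrapingBee request is now raced against a rejection timer, so a hung request surfaces as a TimeoutError (with a 5-second grace period on top of the provider timeout) instead of blocking indefinitely. A generic sketch of the same pattern follows; the TimeoutError class below is a stand-in for the one imported from ../../error, and the helper is illustrative rather than the module's actual code.

    // Generic Promise.race timeout wrapper (illustrative; not the module's actual helper).
    class TimeoutError extends Error {}

    async function withTimeout<T>(work: Promise<T>, ms: number, label: string): Promise<T> {
      let timer: ReturnType<typeof setTimeout> | undefined;
      const timeout = new Promise<never>((_, reject) => {
        timer = setTimeout(() => reject(new TimeoutError(`${label} timed out`)), ms);
      });
      try {
        return await Promise.race([work, timeout]);
      } finally {
        if (timer) clearTimeout(timer); // don't leave the timer pending once the race settles
      }
    }

    // Usage mirroring the diff: provider timeout plus a small grace period.
    // withTimeout(client.get({ url, params: { timeout } }), timeout + 5000, "ScrapingBee");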