Mirror of https://github.com/mendableai/firecrawl (via https://git.mirrors.martin98.com, synced 2025-06-04 11:24:40 +08:00)
Merge branch 'main' of https://github.com/mendableai/firecrawl

Commit b0534d0767
@@ -39,4 +39,24 @@ describe("Map tests", () => {
    expect(response.body.success).toBe(false);
    expect(response.body.error).toBe("Request timed out");
  }, 10000);

  it("handles query parameters correctly", async () => {
    let response = await map({
      url: "https://www.hfea.gov.uk",
      sitemapOnly: true,
    });

    expect(response.statusCode).toBe(200);
    expect(response.body.success).toBe(true);
    expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);

    response = await map({
      url: "https://www.hfea.gov.uk",
      ignoreSitemap: false,
    });

    expect(response.statusCode).toBe(200);
    expect(response.body.success).toBe(true);
    expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
  }, 300000); // TODO: mocks
});
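The new test pins down that mapped links keep their query strings. A minimal standalone sketch of the same check, with the sample link invented for illustration:

    // Hypothetical link; the real test matches whatever the live sitemap returns.
    const link = "https://www.hfea.gov.uk/choose-a-clinic/clinic-search/results/?options=105";
    const pattern = /^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/;
    console.log(pattern.test(link)); // true - the "?options=..." suffix must survive mapping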
@@ -34,7 +34,7 @@ describe("Scrape tests", () => {
      expect(response.body.data.markdown).toBe(
        "this is fake data coming from the mocking system!",
      );
    });
  }, 10000);

  describe("Ad blocking (f-e dependant)", () => {
    it.concurrent("blocks ads by default", async () => {
@@ -88,5 +88,27 @@ describe("Scrape tests", () => {
      const obj = JSON.parse(response.body.data.rawHtml);
      expect(obj.id).toBe(1);
    }, 25000); // TODO: mock and shorten
  });

  describe("Screenshot", () => {
    it.concurrent("screenshot format works", async () => {
      const response = await scrape({
        url: "http://firecrawl.dev",
        formats: ["screenshot"]
      });

      expectScrapeToSucceed(response);
      expect(response.body.data.screenshot).toBeTruthy();
    }, 15000);

    it.concurrent("screenshot@fullPage format works", async () => {
      const response = await scrape({
        url: "http://firecrawl.dev",
        formats: ["screenshot@fullPage"]
      });

      expectScrapeToSucceed(response);
      expect(response.body.data.screenshot).toBeTruthy();
    }, 15000);
  })
});
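The two new cases exercise the screenshot output formats end to end. A minimal sketch of the same request against the HTTP API, assuming a locally running instance; host, port, and key are placeholders:

    // Sketch only: v1 scrape request asking for a full-page screenshot.
    const res = await fetch("http://localhost:3002/v1/scrape", {
      method: "POST",
      headers: { "Content-Type": "application/json", Authorization: "Bearer fc-YOUR-KEY" },
      body: JSON.stringify({ url: "http://firecrawl.dev", formats: ["screenshot@fullPage"] }),
    });
    const json = await res.json();
    console.log(Boolean(json.data?.screenshot)); // expected truthy, mirroring the test's assertion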
@@ -2,27 +2,15 @@ import { Response } from "express";
import {
  CrawlErrorsResponse,
  CrawlStatusParams,
  CrawlStatusResponse,
  ErrorResponse,
  RequestWithAuth,
} from "./types";
import {
  getCrawl,
  getCrawlExpiry,
  getCrawlJobs,
  getDoneJobsOrdered,
  getDoneJobsOrderedLength,
  getThrottledJobs,
  isCrawlFinished,
} from "../../lib/crawl-redis";
import { getScrapeQueue, redisConnection } from "../../services/queue-service";
import {
  supabaseGetJobById,
  supabaseGetJobsById,
} from "../../lib/supabase-jobs";
import { configDotenv } from "dotenv";
import { Job, JobState } from "bullmq";
import { logger } from "../../lib/logger";
import { Job } from "bullmq";
configDotenv();

export async function getJob(id: string) {
@@ -17,7 +17,6 @@ import {
  getCrawlJobs,
  getDoneJobsOrdered,
  getDoneJobsOrderedLength,
  getThrottledJobs,
  isCrawlFinished,
  isCrawlFinishedLocked,
} from "../../lib/crawl-redis";
@@ -25,6 +24,7 @@ import { getScrapeQueue } from "../../services/queue-service";
import { getJob, getJobs } from "./crawl-status";
import * as Sentry from "@sentry/node";
import { Job, JobState } from "bullmq";
import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit";

type ErrorMessage = {
  type: "error";
@@ -127,16 +127,16 @@ async function crawlStatusWS(
      async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
    ),
  );
  const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));

  const throttledJobsSet = new Set(throttledJobs);

  const throttledJobsSet = await getConcurrencyLimitedJobs(req.auth.team_id);

  const validJobStatuses: [string, JobState | "unknown"][] = [];
  const validJobIDs: string[] = [];

  for (const [id, status] of jobStatuses) {
    if (
      !throttledJobsSet.has(id) &&
    if (throttledJobsSet.has(id)) {
      validJobStatuses.push([id, "prioritized"]);
      validJobIDs.push(id);
    } else if (
      status !== "failed" &&
      status !== "unknown"
    ) {
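In the WebSocket status handler, jobs held back by the per-team concurrency limit are no longer filtered out of the status payload; they are looked up via getConcurrencyLimitedJobs and reported with the synthetic state "prioritized". A minimal sketch of the resulting branching (the body of the else-if is inferred from context and abbreviated):

    // Sketch; jobStatuses, validJobStatuses, validJobIDs and team_id come from the surrounding handler.
    const concurrencyLimited = await getConcurrencyLimitedJobs(team_id); // Set of queued job ids
    for (const [id, status] of jobStatuses) {
      if (concurrencyLimited.has(id)) {
        validJobStatuses.push([id, "prioritized"]); // waiting on the team's concurrency budget
        validJobIDs.push(id);
      } else if (status !== "failed" && status !== "unknown") {
        validJobStatuses.push([id, status]); // inferred: live jobs are kept as before
        validJobIDs.push(id);
      }
    }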
@@ -11,7 +11,6 @@ import {
  getCrawlJobs,
  getDoneJobsOrdered,
  getDoneJobsOrderedLength,
  getThrottledJobs,
  isCrawlKickoffFinished,
} from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
@@ -23,6 +22,7 @@ import { configDotenv } from "dotenv";
import type { Job, JobState } from "bullmq";
import { logger } from "../../lib/logger";
import { supabase_service } from "../../services/supabase";
import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit";
configDotenv();

export type PseudoJob<T> = {
@@ -137,16 +137,17 @@ export async function crawlStatusController(
      async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
    ),
  );
  const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));

  const throttledJobsSet = new Set(throttledJobs);
  const throttledJobsSet = await getConcurrencyLimitedJobs(req.auth.team_id);

  const validJobStatuses: [string, JobState | "unknown"][] = [];
  const validJobIDs: string[] = [];

  for (const [id, status] of jobStatuses) {
    if (
      !throttledJobsSet.has(id) &&
    if (throttledJobsSet.has(id)) {
      validJobStatuses.push([id, "prioritized"]);
      validJobIDs.push(id);
    } else if (
      status !== "failed" &&
      status !== "unknown"
    ) {
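This hunk mirrors the WebSocket change above in the HTTP crawlStatusController, so both transports report concurrency-limited jobs as "prioritized" instead of dropping them.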
@@ -100,6 +100,11 @@ export async function pushConcurrencyLimitedJob(
  );
}

export async function getConcurrencyLimitedJobs(
  team_id: string,
) {
  return new Set((await redisConnection.zrange(constructQueueKey(team_id), 0, -1)).map(x => JSON.parse(x).id));
}

export async function getConcurrencyQueueJobsCount(team_id: string): Promise<number> {
  const count = await redisConnection.zcard(constructQueueKey(team_id));
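getConcurrencyLimitedJobs reads the whole concurrency queue for a team (zrange 0 through -1 over the sorted set) and collects the id field of each JSON-encoded member into a Set, which is what the status controllers probe with .has(id). A minimal usage sketch, with the team id invented for illustration:

    // Hypothetical team id; real callers pass req.auth.team_id.
    const limited = await getConcurrencyLimitedJobs("team_123");
    if (limited.has(jobId)) {
      // The job is parked in the team's concurrency queue, not failed - report it as "prioritized".
    }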
@@ -184,14 +184,6 @@ export async function getCrawlJobCount(id: string): Promise<number> {
  return await redisConnection.scard("crawl:" + id + ":jobs");
}

export async function getThrottledJobs(teamId: string): Promise<string[]> {
  return await redisConnection.zrangebyscore(
    "concurrency-limiter:" + teamId + ":throttled",
    Date.now(),
    Infinity,
  );
}

export function normalizeURL(url: string, sc: StoredCrawl): string {
  const urlO = new URL(url);
  if (!sc.crawlerOptions || sc.crawlerOptions.ignoreQueryParameters) {
@@ -147,7 +147,7 @@ export const checkAndUpdateURLForMap = (url: string) => {
  }

  // remove any query params
  url = url.split("?")[0].trim();
  // url = url.split("?")[0].trim();

  return { urlObj: typedUrlObj, url: url };
};
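With the stripping line commented out, checkAndUpdateURLForMap now leaves query strings intact, which is exactly what the HFEA map test above asserts. A small before/after sketch on an invented input:

    const input = "https://www.hfea.gov.uk/choose-a-clinic/clinic-search/results/?options=105";
    // Old behaviour: everything after "?" was dropped before returning.
    const stripped = input.split("?")[0].trim(); // "https://www.hfea.gov.uk/choose-a-clinic/clinic-search/results/"
    // New behaviour: the URL passes through unchanged, so "?options=105" survives into the map output.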
@@ -167,6 +167,9 @@ export async function scrapeURLWithFireEngineChromeCDP(
        ]
      : []),

    // Include specified actions
    ...(meta.options.actions ?? []),

    // Transform screenshot format into an action (unsupported by chrome-cdp)
    ...(meta.options.formats.includes("screenshot") ||
    meta.options.formats.includes("screenshot@fullPage")
@@ -177,9 +180,6 @@ export async function scrapeURLWithFireEngineChromeCDP(
          },
        ]
      : []),

    // Include specified actions
    ...(meta.options.actions ?? []),
  ];

  const totalWait = actions.reduce(
@@ -228,8 +228,10 @@ export async function scrapeURLWithFireEngineChromeCDP(
      "Transforming screenshots from actions into screenshot field",
      { screenshots: response.screenshots },
    );
    response.screenshot = (response.screenshots ?? [])[0];
    (response.screenshots ?? []).splice(0, 1);
    if (response.screenshots) {
      response.screenshot = response.screenshots.slice(-1)[0];
      response.screenshots = response.screenshots.slice(0, -1);
    }
    meta.logger.debug("Screenshot transformation done", {
      screenshots: response.screenshots,
      screenshot: response.screenshot,
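Together with the reordering above, user-supplied actions now run before the synthetic screenshot action, so the screenshot requested via formats is the last entry of response.screenshots: it is moved into response.screenshot and the earlier entries stay attached to the actions that produced them. A minimal sketch of that split on a plain array (values are stand-ins for screenshot payloads):

    const screenshots = ["action-1.png", "action-2.png", "format-screenshot.png"];
    const screenshot = screenshots.slice(-1)[0];  // "format-screenshot.png" - the format-level screenshot
    const actionShots = screenshots.slice(0, -1); // ["action-1.png", "action-2.png"] - kept with the action results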
@@ -3,7 +3,7 @@ import { Meta } from "../..";
import { EngineScrapeResult } from "..";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
import { AxiosError, type AxiosResponse } from "axios";
import { EngineError } from "../../error";
import { EngineError, TimeoutError } from "../../error";

const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
@@ -17,23 +17,26 @@ export function scrapeURLWithScrapingBee(
  let response: AxiosResponse<any>;
  const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
  try {
    response = await client.get({
      url: meta.url,
      params: {
        timeout,
        wait_browser: wait_browser,
        wait: meta.options.waitFor,
        transparent_status_code: true,
        json_response: true,
        screenshot: meta.options.formats.includes("screenshot"),
        screenshot_full_page: meta.options.formats.includes(
          "screenshot@fullPage",
        ),
      },
      headers: {
        "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
      },
    });
    response = await Promise.race<AxiosResponse<any>>([
      client.get({
        url: meta.url,
        params: {
          timeout,
          wait_browser: wait_browser,
          wait: meta.options.waitFor,
          transparent_status_code: true,
          json_response: true,
          screenshot: meta.options.formats.includes("screenshot"),
          screenshot_full_page: meta.options.formats.includes(
            "screenshot@fullPage",
          ),
        },
        headers: {
          "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
        },
      }),
      new Promise((_, reject) => setTimeout(() => reject(new TimeoutError("ScrapingBee timed out")), timeout + 5000)),
    ]);
  } catch (error) {
    if (error instanceof AxiosError && error.response !== undefined) {
      response = error.response;
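The ScrapingBee request is now raced against a rejection timer set to the request timeout plus a five-second grace period, so a call that never returns surfaces as a TimeoutError instead of hanging the scrape. A generic sketch of the same pattern, with the error type simplified:

    // Race a promise against a timer; the real code rejects with TimeoutError("ScrapingBee timed out").
    async function withTimeout<T>(work: Promise<T>, ms: number): Promise<T> {
      return Promise.race<T>([
        work,
        new Promise<T>((_, reject) => setTimeout(() => reject(new Error("timed out")), ms)),
      ]);
    }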