commit b0534d0767 by Nicolas, 2025-02-17 12:42:59 -03:00
11 changed files with 91 additions and 58 deletions

View File

@ -39,4 +39,24 @@ describe("Map tests", () => {
expect(response.body.success).toBe(false);
expect(response.body.error).toBe("Request timed out");
}, 10000);
it("handles query parameters correctly", async () => {
let response = await map({
url: "https://www.hfea.gov.uk",
sitemapOnly: true,
});
expect(response.statusCode).toBe(200);
expect(response.body.success).toBe(true);
expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
response = await map({
url: "https://www.hfea.gov.uk",
ignoreSitemap: false,
});
expect(response.statusCode).toBe(200);
expect(response.body.success).toBe(true);
expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
}, 300000); // TODO: mocks
});
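This new test pins down the behavioral change made further down in this commit, where checkAndUpdateURLForMap stops stripping query strings: links returned by map must keep their query parameters. As a standalone illustration of what the assertion's regex accepts (the options value 105 is an arbitrary example):

```ts
// The regex from the assertions above, checked in isolation.
const pattern = /^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/;
console.log(pattern.test("https://www.hfea.gov.uk/choose-a-clinic/clinic-search/results?options=105")); // true
console.log(pattern.test("https://www.hfea.gov.uk/choose-a-clinic/clinic-search/results")); // false: the query string is gone
```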

View File

@ -34,7 +34,7 @@ describe("Scrape tests", () => {
expect(response.body.data.markdown).toBe(
"this is fake data coming from the mocking system!",
);
- });
+ }, 10000);
describe("Ad blocking (f-e dependant)", () => {
it.concurrent("blocks ads by default", async () => {
@ -88,5 +88,27 @@ describe("Scrape tests", () => {
const obj = JSON.parse(response.body.data.rawHtml);
expect(obj.id).toBe(1);
}, 25000); // TODO: mock and shorten
});
describe("Screenshot", () => {
it.concurrent("screenshot format works", async () => {
const response = await scrape({
url: "http://firecrawl.dev",
formats: ["screenshot"]
});
expectScrapeToSucceed(response);
expect(response.body.data.screenshot).toBeTruthy();
}, 15000);
it.concurrent("screenshot@fullPage format works", async () => {
const response = await scrape({
url: "http://firecrawl.dev",
formats: ["screenshot@fullPage"]
});
expectScrapeToSucceed(response);
expect(response.body.data.screenshot).toBeTruthy();
}, 15000);
})
});
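Both tests assert that requesting a screenshot format populates data.screenshot. For reference, a minimal sketch of how a client might exercise the same formats against the v1 scrape endpoint (the host, auth header, and response shape are assumed from Firecrawl's public v1 API, not from this diff):

```ts
const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://firecrawl.dev",
    formats: ["screenshot@fullPage"], // or ["screenshot"] for a viewport-sized capture
  }),
});
const { data } = await res.json();
console.log(data.screenshot); // set whenever a screenshot format was requested
```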

View File

@ -2,27 +2,15 @@ import { Response } from "express";
import {
CrawlErrorsResponse,
CrawlStatusParams,
- CrawlStatusResponse,
- ErrorResponse,
RequestWithAuth,
} from "./types";
import {
getCrawl,
- getCrawlExpiry,
getCrawlJobs,
- getDoneJobsOrdered,
- getDoneJobsOrderedLength,
- getThrottledJobs,
- isCrawlFinished,
} from "../../lib/crawl-redis";
import { getScrapeQueue, redisConnection } from "../../services/queue-service";
- import {
- supabaseGetJobById,
- supabaseGetJobsById,
- } from "../../lib/supabase-jobs";
import { configDotenv } from "dotenv";
- import { Job, JobState } from "bullmq";
- import { logger } from "../../lib/logger";
+ import { Job } from "bullmq";
configDotenv();
export async function getJob(id: string) {

View File

@ -17,7 +17,6 @@ import {
getCrawlJobs,
getDoneJobsOrdered,
getDoneJobsOrderedLength,
- getThrottledJobs,
isCrawlFinished,
isCrawlFinishedLocked,
} from "../../lib/crawl-redis";
@ -25,6 +24,7 @@ import { getScrapeQueue } from "../../services/queue-service";
import { getJob, getJobs } from "./crawl-status";
import * as Sentry from "@sentry/node";
import { Job, JobState } from "bullmq";
+ import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit";
type ErrorMessage = {
type: "error";
@ -127,16 +127,16 @@ async function crawlStatusWS(
async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
),
);
- const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
- const throttledJobsSet = new Set(throttledJobs);
+ const throttledJobsSet = await getConcurrencyLimitedJobs(req.auth.team_id);
const validJobStatuses: [string, JobState | "unknown"][] = [];
const validJobIDs: string[] = [];
for (const [id, status] of jobStatuses) {
- if (
-   !throttledJobsSet.has(id) &&
+ if (throttledJobsSet.has(id)) {
+   validJobStatuses.push([id, "prioritized"]);
+   validJobIDs.push(id);
+ } else if (
status !== "failed" &&
status !== "unknown"
) {
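Spelled out, the rewritten loop changes how concurrency-limited jobs are reported: previously they were filtered out of the status payload entirely; now they surface with a synthetic "prioritized" state. A sketch of the full loop (the else branch is cut off by the hunk above, so its body here is an assumption based on the surrounding code; the same rewrite appears in the HTTP controller below):

```ts
for (const [id, status] of jobStatuses) {
  if (throttledJobsSet.has(id)) {
    // Waiting behind the team's concurrency limit: report rather than hide.
    validJobStatuses.push([id, "prioritized"]);
    validJobIDs.push(id);
  } else if (status !== "failed" && status !== "unknown") {
    // Assumed continuation of the truncated else-if branch.
    validJobStatuses.push([id, status]);
    validJobIDs.push(id);
  }
}
```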

View File

@ -11,7 +11,6 @@ import {
getCrawlJobs,
getDoneJobsOrdered,
getDoneJobsOrderedLength,
- getThrottledJobs,
isCrawlKickoffFinished,
} from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
@ -23,6 +22,7 @@ import { configDotenv } from "dotenv";
import type { Job, JobState } from "bullmq";
import { logger } from "../../lib/logger";
import { supabase_service } from "../../services/supabase";
+ import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit";
configDotenv();
export type PseudoJob<T> = {
@ -137,16 +137,17 @@ export async function crawlStatusController(
async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
),
);
- const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
- const throttledJobsSet = new Set(throttledJobs);
+ const throttledJobsSet = await getConcurrencyLimitedJobs(req.auth.team_id);
const validJobStatuses: [string, JobState | "unknown"][] = [];
const validJobIDs: string[] = [];
for (const [id, status] of jobStatuses) {
- if (
-   !throttledJobsSet.has(id) &&
+ if (throttledJobsSet.has(id)) {
+   validJobStatuses.push([id, "prioritized"]);
+   validJobIDs.push(id);
+ } else if (
status !== "failed" &&
status !== "unknown"
) {

View File

@ -100,6 +100,11 @@ export async function pushConcurrencyLimitedJob(
);
}
+ export async function getConcurrencyLimitedJobs(
+   team_id: string,
+ ) {
+   return new Set((await redisConnection.zrange(constructQueueKey(team_id), 0, -1)).map(x => JSON.parse(x).id));
+ }
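The new helper replaces getThrottledJobs (deleted below): instead of reading a separate throttled zset, it lists every member of the team's concurrency queue and pulls out the queued job ids. An unrolled equivalent, with a return type annotation added here for clarity (the annotation is not in the diff):

```ts
export async function getConcurrencyLimitedJobs(team_id: string): Promise<Set<string>> {
  // Each zset member is a JSON-serialized job; 0 to -1 spans the whole set.
  const members = await redisConnection.zrange(constructQueueKey(team_id), 0, -1);
  return new Set(members.map((raw) => JSON.parse(raw).id));
}
```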
export async function getConcurrencyQueueJobsCount(team_id: string): Promise<number> {
const count = await redisConnection.zcard(constructQueueKey(team_id));

View File

@ -184,14 +184,6 @@ export async function getCrawlJobCount(id: string): Promise<number> {
return await redisConnection.scard("crawl:" + id + ":jobs");
}
- export async function getThrottledJobs(teamId: string): Promise<string[]> {
-   return await redisConnection.zrangebyscore(
-     "concurrency-limiter:" + teamId + ":throttled",
-     Date.now(),
-     Infinity,
-   );
- }
export function normalizeURL(url: string, sc: StoredCrawl): string {
const urlO = new URL(url);
if (!sc.crawlerOptions || sc.crawlerOptions.ignoreQueryParameters) {

View File

@ -147,7 +147,7 @@ export const checkAndUpdateURLForMap = (url: string) => {
}
// remove any query params
- url = url.split("?")[0].trim();
+ // url = url.split("?")[0].trim();
return { urlObj: typedUrlObj, url: url };
};
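Disabling that line is what the new map test exercises: checkAndUpdateURLForMap now returns URLs with their query strings intact. What the removed line did, in isolation (example.com used for illustration):

```ts
const url = "https://example.com/results?options=105";
const stripped = url.split("?")[0].trim(); // "https://example.com/results"
// With the split commented out, "?options=105" survives into the map output,
// which is exactly what the new map test asserts.
```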

View File

@ -167,6 +167,9 @@ export async function scrapeURLWithFireEngineChromeCDP(
]
: []),
+ // Include specified actions
+ ...(meta.options.actions ?? []),
// Transform screenshot format into an action (unsupported by chrome-cdp)
...(meta.options.formats.includes("screenshot") ||
meta.options.formats.includes("screenshot@fullPage")
@ -177,9 +180,6 @@ export async function scrapeURLWithFireEngineChromeCDP(
},
]
: []),
- // Include specified actions
- ...(meta.options.actions ?? []),
];
const totalWait = actions.reduce(
@ -228,8 +228,10 @@ export async function scrapeURLWithFireEngineChromeCDP(
"Transforming screenshots from actions into screenshot field",
{ screenshots: response.screenshots },
);
- response.screenshot = (response.screenshots ?? [])[0];
- (response.screenshots ?? []).splice(0, 1);
+ if (response.screenshots) {
+   response.screenshot = response.screenshots.slice(-1)[0];
+   response.screenshots = response.screenshots.slice(0, -1);
+ }
meta.logger.debug("Screenshot transformation done", {
screenshots: response.screenshots,
screenshot: response.screenshot,
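Since the format-driven screenshot action is now appended after the user's actions, its capture is the last element of response.screenshots; the new code takes it off the end and leaves the action-driven captures in the array. The slice semantics, on illustrative data:

```ts
const shots = ["action-shot-1", "action-shot-2", "format-shot"]; // hypothetical capture order
const screenshot = shots.slice(-1)[0];   // "format-shot", the final format-driven capture
const screenshots = shots.slice(0, -1);  // ["action-shot-1", "action-shot-2"] stay with their actions
```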

View File

@ -3,7 +3,7 @@ import { Meta } from "../..";
import { EngineScrapeResult } from "..";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
import { AxiosError, type AxiosResponse } from "axios";
- import { EngineError } from "../../error";
+ import { EngineError, TimeoutError } from "../../error";
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
@ -17,23 +17,26 @@ export function scrapeURLWithScrapingBee(
let response: AxiosResponse<any>;
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
try {
-   response = await client.get({
-     url: meta.url,
-     params: {
-       timeout,
-       wait_browser: wait_browser,
-       wait: meta.options.waitFor,
-       transparent_status_code: true,
-       json_response: true,
-       screenshot: meta.options.formats.includes("screenshot"),
-       screenshot_full_page: meta.options.formats.includes(
-         "screenshot@fullPage",
-       ),
-     },
-     headers: {
-       "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
-     },
-   });
+   response = await Promise.race<AxiosResponse<any>>([
+     client.get({
+       url: meta.url,
+       params: {
+         timeout,
+         wait_browser: wait_browser,
+         wait: meta.options.waitFor,
+         transparent_status_code: true,
+         json_response: true,
+         screenshot: meta.options.formats.includes("screenshot"),
+         screenshot_full_page: meta.options.formats.includes(
+           "screenshot@fullPage",
+         ),
+       },
+       headers: {
+         "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
+       },
+     }),
+     new Promise((_, reject) => setTimeout(() => reject(new TimeoutError("ScrapingBee timed out")), timeout + 5000)),
+   ]);
} catch (error) {
if (error instanceof AxiosError && error.response !== undefined) {
response = error.response;
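The race gives the ScrapingBee request a hard deadline 5 seconds past the timeout already passed to the API, so a hung connection now rejects with TimeoutError instead of stalling the engine. The pattern generalizes; a minimal sketch (the helper name and signature are illustrative, not part of this diff):

```ts
// TimeoutError is the class imported from "../../error" above.
function raceWithTimeout<T>(work: Promise<T>, ms: number, message: string) {
  const deadline = new Promise<never>((_, reject) =>
    setTimeout(() => reject(new TimeoutError(message)), ms),
  );
  return Promise.race([work, deadline]); // settles with whichever finishes first
}

// Mirroring the code above:
// response = await raceWithTimeout(client.get({ ... }), timeout + 5000, "ScrapingBee timed out");
```

One caveat carried over from the original: the losing timer is never cleared, so a pending setTimeout lingers until it fires; callers who care would need to keep the timer handle and clear it once the race settles.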