diff --git a/apps/api/src/__tests__/snips/map.test.ts b/apps/api/src/__tests__/snips/map.test.ts
index be1469ff..e156a3f3 100644
--- a/apps/api/src/__tests__/snips/map.test.ts
+++ b/apps/api/src/__tests__/snips/map.test.ts
@@ -39,4 +39,24 @@ describe("Map tests", () => {
     expect(response.body.success).toBe(false);
     expect(response.body.error).toBe("Request timed out");
   }, 10000);
+
+  it("handles query parameters correctly", async () => {
+    let response = await map({
+      url: "https://www.hfea.gov.uk",
+      sitemapOnly: true,
+    });
+
+    expect(response.statusCode).toBe(200);
+    expect(response.body.success).toBe(true);
+    expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
+
+    response = await map({
+      url: "https://www.hfea.gov.uk",
+      ignoreSitemap: false,
+    });
+
+    expect(response.statusCode).toBe(200);
+    expect(response.body.success).toBe(true);
+    expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
+  }, 300000); // TODO: mocks
 });
diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts
index 0c53edb8..8463e098 100644
--- a/apps/api/src/__tests__/snips/scrape.test.ts
+++ b/apps/api/src/__tests__/snips/scrape.test.ts
@@ -34,7 +34,7 @@ describe("Scrape tests", () => {
     expect(response.body.data.markdown).toBe(
       "this is fake data coming from the mocking system!",
     );
-  });
+  }, 10000);
 
   describe("Ad blocking (f-e dependant)", () => {
     it.concurrent("blocks ads by default", async () => {
@@ -88,5 +88,27 @@ describe("Scrape tests", () => {
     const obj = JSON.parse(response.body.data.rawHtml);
     expect(obj.id).toBe(1);
   }, 25000); // TODO: mock and shorten
+  });
+
+  describe("Screenshot", () => {
+    it.concurrent("screenshot format works", async () => {
+      const response = await scrape({
+        url: "http://firecrawl.dev",
+        formats: ["screenshot"]
+      });
+
+      expectScrapeToSucceed(response);
+      expect(response.body.data.screenshot).toBeTruthy();
+    }, 15000);
+
+    it.concurrent("screenshot@fullPage format works", async () => {
+      const response = await scrape({
+        url: "http://firecrawl.dev",
+        formats: ["screenshot@fullPage"]
+      });
+
+      expectScrapeToSucceed(response);
+      expect(response.body.data.screenshot).toBeTruthy();
+    }, 15000);
 })
 });
diff --git a/apps/api/src/controllers/v1/crawl-errors.ts b/apps/api/src/controllers/v1/crawl-errors.ts
index defdda01..979a6d7a 100644
--- a/apps/api/src/controllers/v1/crawl-errors.ts
+++ b/apps/api/src/controllers/v1/crawl-errors.ts
@@ -2,27 +2,15 @@ import { Response } from "express";
 import {
   CrawlErrorsResponse,
   CrawlStatusParams,
-  CrawlStatusResponse,
-  ErrorResponse,
   RequestWithAuth,
 } from "./types";
 import {
   getCrawl,
-  getCrawlExpiry,
   getCrawlJobs,
-  getDoneJobsOrdered,
-  getDoneJobsOrderedLength,
-  getThrottledJobs,
-  isCrawlFinished,
 } from "../../lib/crawl-redis";
 import { getScrapeQueue, redisConnection } from "../../services/queue-service";
-import {
-  supabaseGetJobById,
-  supabaseGetJobsById,
-} from "../../lib/supabase-jobs";
 import { configDotenv } from "dotenv";
-import { Job, JobState } from "bullmq";
-import { logger } from "../../lib/logger";
+import { Job } from "bullmq";
 configDotenv();
 
 export async function getJob(id: string) {
diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts
index 9ed1d1c6..41f99094 100644
--- a/apps/api/src/controllers/v1/crawl-status-ws.ts
+++ b/apps/api/src/controllers/v1/crawl-status-ws.ts
@@ -17,7 +17,6 @@ import {
   getCrawlJobs,
   getDoneJobsOrdered,
   getDoneJobsOrderedLength,
-  getThrottledJobs,
   isCrawlFinished,
   isCrawlFinishedLocked,
 } from "../../lib/crawl-redis";
@@ -25,6 +24,7 @@ import { getScrapeQueue } from "../../services/queue-service";
 import { getJob, getJobs } from "./crawl-status";
 import * as Sentry from "@sentry/node";
 import { Job, JobState } from "bullmq";
+import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit";
 
 type ErrorMessage = {
   type: "error";
@@ -127,16 +127,17 @@ async function crawlStatusWS(
       async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
     ),
   );
-  const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
-
-  const throttledJobsSet = new Set(throttledJobs);
-
+  const throttledJobsSet = await getConcurrencyLimitedJobs(req.auth.team_id);
+
   const validJobStatuses: [string, JobState | "unknown"][] = [];
   const validJobIDs: string[] = [];
 
   for (const [id, status] of jobStatuses) {
-    if (
-      !throttledJobsSet.has(id) &&
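+    // Job held back by the concurrency limiter: it is still pending, so surface it as "prioritized" instead of dropping it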
+    if (throttledJobsSet.has(id)) {
+      validJobStatuses.push([id, "prioritized"]);
+      validJobIDs.push(id);
+    } else if (
       status !== "failed" &&
       status !== "unknown"
     ) {
diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts
index d43562b6..8d0ea1b7 100644
--- a/apps/api/src/controllers/v1/crawl-status.ts
+++ b/apps/api/src/controllers/v1/crawl-status.ts
@@ -11,7 +11,6 @@ import {
   getCrawlJobs,
   getDoneJobsOrdered,
   getDoneJobsOrderedLength,
-  getThrottledJobs,
   isCrawlKickoffFinished,
 } from "../../lib/crawl-redis";
 import { getScrapeQueue } from "../../services/queue-service";
@@ -23,6 +22,7 @@ import { configDotenv } from "dotenv";
 import type { Job, JobState } from "bullmq";
 import { logger } from "../../lib/logger";
 import { supabase_service } from "../../services/supabase";
+import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit";
 configDotenv();
 
 export type PseudoJob<T> = {
@@ -137,16 +137,17 @@ export async function crawlStatusController(
       async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
     ),
   );
-  const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
-  const throttledJobsSet = new Set(throttledJobs);
+  const throttledJobsSet = await getConcurrencyLimitedJobs(req.auth.team_id);
 
   const validJobStatuses: [string, JobState | "unknown"][] = [];
   const validJobIDs: string[] = [];
 
   for (const [id, status] of jobStatuses) {
-    if (
-      !throttledJobsSet.has(id) &&
+    if (throttledJobsSet.has(id)) {
+      validJobStatuses.push([id, "prioritized"]);
+      validJobIDs.push(id);
+    } else if (
       status !== "failed" &&
       status !== "unknown"
     ) {
diff --git a/apps/api/src/lib/concurrency-limit.ts b/apps/api/src/lib/concurrency-limit.ts
index 8fa87cb2..5a1578ed 100644
--- a/apps/api/src/lib/concurrency-limit.ts
+++ b/apps/api/src/lib/concurrency-limit.ts
@@ -100,6 +100,12 @@ export async function pushConcurrencyLimitedJob(
   );
 }
 
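+// Returns the set of job IDs currently waiting in this team's concurrency-limit queue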
+export async function getConcurrencyLimitedJobs(
+  team_id: string,
+) {
+  return new Set((await redisConnection.zrange(constructQueueKey(team_id), 0, -1)).map(x => JSON.parse(x).id));
+}
 
 export async function getConcurrencyQueueJobsCount(team_id: string): Promise<number> {
   const count = await redisConnection.zcard(constructQueueKey(team_id));
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index c0870586..eaee3491 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -184,14 +184,6 @@ export async function getCrawlJobCount(id: string): Promise<number> {
   return await redisConnection.scard("crawl:" + id + ":jobs");
 }
 
-export async function getThrottledJobs(teamId: string): Promise<string[]> {
-  return await redisConnection.zrangebyscore(
-    "concurrency-limiter:" + teamId + ":throttled",
-    Date.now(),
-    Infinity,
-  );
-}
-
 export function normalizeURL(url: string, sc: StoredCrawl): string {
   const urlO = new URL(url);
   if (!sc.crawlerOptions || sc.crawlerOptions.ignoreQueryParameters) {
diff --git a/apps/api/src/lib/validateUrl.ts b/apps/api/src/lib/validateUrl.ts
index dc27c136..d7672bb9 100644
--- a/apps/api/src/lib/validateUrl.ts
+++ b/apps/api/src/lib/validateUrl.ts
@@ -147,7 +147,7 @@ export const checkAndUpdateURLForMap = (url: string) => {
   }
 
   // remove any query params
-  url = url.split("?")[0].trim();
+  // url = url.split("?")[0].trim();
 
   return { urlObj: typedUrlObj, url: url };
 };
diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
index 04601da6..988dfeb8 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
@@ -167,19 +167,19 @@ export async function scrapeURLWithFireEngineChromeCDP(
         ]
       : []),
 
+    // Include specified actions
+    ...(meta.options.actions ?? []),
+
     // Transform screenshot format into an action (unsupported by chrome-cdp)
     ...(meta.options.formats.includes("screenshot") ||
     meta.options.formats.includes("screenshot@fullPage")
       ? [
           {
            type: "screenshot" as const,
            fullPage: meta.options.formats.includes("screenshot@fullPage"),
          },
         ]
       : []),
-
-    // Include specified actions
-    ...(meta.options.actions ?? []),
   ];
 
   const totalWait = actions.reduce(
@@ -228,8 +228,10 @@ export async function scrapeURLWithFireEngineChromeCDP(
       "Transforming screenshots from actions into screenshot field",
       { screenshots: response.screenshots },
     );
-    response.screenshot = (response.screenshots ?? [])[0];
-    (response.screenshots ?? []).splice(0, 1);
+    if (response.screenshots) {
+      // The screenshot action now runs last, so the format screenshot is the final array entry
+      response.screenshot = response.screenshots.slice(-1)[0];
+      response.screenshots = response.screenshots.slice(0, -1);
+    }
     meta.logger.debug("Screenshot transformation done", {
       screenshots: response.screenshots,
       screenshot: response.screenshot,
diff --git a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
index 38c43878..39805a2f 100644
--- a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
@@ -3,7 +3,7 @@ import { Meta } from "../..";
 import { EngineScrapeResult } from "..";
 import { specialtyScrapeCheck } from "../utils/specialtyHandler";
 import { AxiosError, type AxiosResponse } from "axios";
-import { EngineError } from "../../error";
+import { EngineError, TimeoutError } from "../../error";
 
 const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
 
@@ -17,23 +17,27 @@ export function scrapeURLWithScrapingBee(
     let response: AxiosResponse<any>;
     const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
     try {
-      response = await client.get({
-        url: meta.url,
-        params: {
-          timeout,
-          wait_browser: wait_browser,
-          wait: meta.options.waitFor,
-          transparent_status_code: true,
-          json_response: true,
-          screenshot: meta.options.formats.includes("screenshot"),
-          screenshot_full_page: meta.options.formats.includes(
-            "screenshot@fullPage",
-          ),
-        },
-        headers: {
-          "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
-        },
-      });
+      response = await Promise.race<AxiosResponse<any>>([
+        client.get({
+          url: meta.url,
+          params: {
+            timeout,
+            wait_browser: wait_browser,
+            wait: meta.options.waitFor,
+            transparent_status_code: true,
+            json_response: true,
+            screenshot: meta.options.formats.includes("screenshot"),
+            screenshot_full_page: meta.options.formats.includes(
+              "screenshot@fullPage",
+            ),
+          },
+          headers: {
+            "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
+          },
+        }),
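+        // Watchdog timer: rejects 5s after ScrapingBee's own timeout so a hung request cannot stall the scrape forever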
+        new Promise((_, reject) => setTimeout(() => reject(new TimeoutError("ScrapingBee timed out")), timeout + 5000)),
+      ]);
     } catch (error) {
       if (error instanceof AxiosError && error.response !== undefined) {
         response = error.response;
diff --git a/examples/o3-mini-Product-Reviews-summarizer/o3-mini-product-reviews-summarizer.py b/examples/o3-mini-deal-finder/o3-mini-deal-finder.py
similarity index 100%
rename from examples/o3-mini-Product-Reviews-summarizer/o3-mini-product-reviews-summarizer.py
rename to examples/o3-mini-deal-finder/o3-mini-deal-finder.py