diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 3089ae23..6bda3039 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -204,7 +204,7 @@ export async function supaAuthenticateUser( } chunk = await getACUC(normalizedApi); - + if (chunk === null) { return { success: false, diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index 9659c218..67584178 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -178,48 +178,52 @@ export async function crawlController(req: Request, res: Response) { await saveCrawl(id, sc); const sitemap = sc.crawlerOptions.ignoreSitemap - ? 0 - : await crawler.tryGetSitemap(async urls => { - if (urls.length === 0) return; - - let jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 }); - const jobs = urls.map(url => { - const uuid = uuidv4(); - return { - name: uuid, - data: { - url, - mode: "single_urls", - crawlerOptions, - scrapeOptions, - internalOptions, - team_id, - plan, - origin: req.body.origin ?? defaultOrigin, - crawl_id: id, - sitemapped: true, - }, - opts: { - jobId: uuid, - priority: jobPriority, - }, - }; - }); + ? 0 + : await crawler.tryGetSitemap(async (urls) => { + if (urls.length === 0) return; - await lockURLs( - id, - sc, - jobs.map((x) => x.data.url), - ); - await addCrawlJobs( - id, - jobs.map((x) => x.opts.jobId), - ); - for (const job of jobs) { - // add with sentry instrumentation - await addScrapeJob(job.data as any, {}, job.opts.jobId); - } + let jobPriority = await getJobPriority({ + plan, + team_id, + basePriority: 21, }); + const jobs = urls.map((url) => { + const uuid = uuidv4(); + return { + name: uuid, + data: { + url, + mode: "single_urls", + crawlerOptions, + scrapeOptions, + internalOptions, + team_id, + plan, + origin: req.body.origin ?? defaultOrigin, + crawl_id: id, + sitemapped: true, + }, + opts: { + jobId: uuid, + priority: jobPriority, + }, + }; + }); + + await lockURLs( + id, + sc, + jobs.map((x) => x.data.url), + ); + await addCrawlJobs( + id, + jobs.map((x) => x.opts.jobId), + ); + for (const job of jobs) { + // add with sentry instrumentation + await addScrapeJob(job.data as any, {}, job.opts.jobId); + } + }); if (sitemap === 0) { await lockURL(id, sc, url); diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index 9ba9bd46..3f1a9dd9 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -114,29 +114,29 @@ export async function crawlPreviewController(req: Request, res: Response) { const sitemap = sc.crawlerOptions?.ignoreSitemap ? 
0 - : await crawler.tryGetSitemap(async urls => { - for (const url of urls) { - await lockURL(id, sc, url); - const jobId = uuidv4(); - await addScrapeJob( - { - url, - mode: "single_urls", - team_id, - plan: plan!, - crawlerOptions, - scrapeOptions, - internalOptions, - origin: "website-preview", - crawl_id: id, - sitemapped: true, - }, - {}, - jobId, - ); - await addCrawlJob(id, jobId); - } - }); + : await crawler.tryGetSitemap(async (urls) => { + for (const url of urls) { + await lockURL(id, sc, url); + const jobId = uuidv4(); + await addScrapeJob( + { + url, + mode: "single_urls", + team_id, + plan: plan!, + crawlerOptions, + scrapeOptions, + internalOptions, + origin: "website-preview", + crawl_id: id, + sitemapped: true, + }, + {}, + jobId, + ); + await addCrawlJob(id, jobId); + } + }); if (sitemap === 0) { await lockURL(id, sc, url); diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index f5fcf6b6..a2593ced 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -115,7 +115,8 @@ export async function crawlStatusController( const status: Exclude["status"] = sc.cancelled ? "cancelled" - : (validJobStatuses.every((x) => x[1] === "completed") && validJobStatuses.length > 0) + : validJobStatuses.every((x) => x[1] === "completed") && + validJobStatuses.length > 0 ? "completed" : "scraping"; diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index a01106d0..9f05e73c 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -7,11 +7,7 @@ import { RequestWithAuth, toLegacyCrawlerOptions, } from "./types"; -import { - crawlToCrawler, - saveCrawl, - StoredCrawl, -} from "../../lib/crawl-redis"; +import { crawlToCrawler, saveCrawl, StoredCrawl } from "../../lib/crawl-redis"; import { logCrawl } from "../../services/logging/crawl_log"; import { _addScrapeJobToBullMQ } from "../../services/queue-jobs"; import { logger as _logger } from "../../lib/logger"; @@ -103,20 +99,25 @@ export async function crawlController( await saveCrawl(id, sc); - await _addScrapeJobToBullMQ({ - url: req.body.url, - mode: "kickoff" as const, - team_id: req.auth.team_id, - plan: req.auth.plan, - crawlerOptions, - scrapeOptions: sc.scrapeOptions, - internalOptions: sc.internalOptions, - origin: "api", - crawl_id: id, - webhook: req.body.webhook, - v1: true, - }, {}, crypto.randomUUID(), 10); - + await _addScrapeJobToBullMQ( + { + url: req.body.url, + mode: "kickoff" as const, + team_id: req.auth.team_id, + plan: req.auth.plan, + crawlerOptions, + scrapeOptions: sc.scrapeOptions, + internalOptions: sc.internalOptions, + origin: "api", + crawl_id: id, + webhook: req.body.webhook, + v1: true, + }, + {}, + crypto.randomUUID(), + 10, + ); + const protocol = process.env.ENV === "local" ? 
req.protocol : "https"; return res.status(200).json({ diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index ab69ca93..4e0ef877 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -11,25 +11,29 @@ import { saveExtract } from "../../lib/extract/extract-redis"; import { getTeamIdSyncB } from "../../lib/extract/team-id-sync"; import { performExtraction } from "../../lib/extract/extraction-service"; -export async function oldExtract(req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>, res: Response, extractId: string){ +export async function oldExtract( + req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>, + res: Response, + extractId: string, +) { // Means that are in the non-queue system // TODO: Remove this once all teams have transitioned to the new system - try { - const result = await performExtraction(extractId, { - request: req.body, - teamId: req.auth.team_id, - plan: req.auth.plan ?? "free", - subId: req.acuc?.sub_id ?? undefined, + try { + const result = await performExtraction(extractId, { + request: req.body, + teamId: req.auth.team_id, + plan: req.auth.plan ?? "free", + subId: req.acuc?.sub_id ?? undefined, }); - return res.status(200).json(result); - } catch (error) { - return res.status(500).json({ - success: false, - error: "Internal server error", - }); - } + return res.status(200).json(result); + } catch (error) { + return res.status(500).json({ + success: false, + error: "Internal server error", + }); } +} /** * Extracts data from the provided URLs based on the request parameters. * Currently in beta. @@ -53,7 +57,10 @@ export async function extractController( extractId, }; - if(await getTeamIdSyncB(req.auth.team_id) && req.body.origin !== "api-sdk") { + if ( + (await getTeamIdSyncB(req.auth.team_id)) && + req.body.origin !== "api-sdk" + ) { return await oldExtract(req, res, extractId); } diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 3274dd93..29c82e72 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -86,11 +86,15 @@ export async function getMapResults({ // If sitemapOnly is true, only get links from sitemap if (crawlerOptions.sitemapOnly) { - const sitemap = await crawler.tryGetSitemap(urls => { - urls.forEach((x) => { - links.push(x); - }); - }, true, true); + const sitemap = await crawler.tryGetSitemap( + (urls) => { + urls.forEach((x) => { + links.push(x); + }); + }, + true, + true, + ); if (sitemap > 0) { links = links .slice(1) @@ -145,9 +149,11 @@ export async function getMapResults({ // Parallelize sitemap fetch with serper search const [_, ...searchResults] = await Promise.all([ - ignoreSitemap ? null : crawler.tryGetSitemap(urls => { - links.push(...urls); - }, true), + ignoreSitemap + ? null + : crawler.tryGetSitemap((urls) => { + links.push(...urls); + }, true), ...(cachedResult ? 
[] : pagePromises), ]); diff --git a/apps/api/src/controllers/v1/scrape-status.ts b/apps/api/src/controllers/v1/scrape-status.ts index 5b9f9b46..7d074d42 100644 --- a/apps/api/src/controllers/v1/scrape-status.ts +++ b/apps/api/src/controllers/v1/scrape-status.ts @@ -8,7 +8,7 @@ export async function scrapeStatusController(req: any, res: any) { "511544f2-2fce-4183-9c59-6c29b02c69b5", "1ec9a0b3-6e7d-49a9-ad6c-9c598ba824c8", ]; - + if (!allowedTeams.includes(req.auth.team_id)) { return res.status(403).json({ success: false, @@ -18,7 +18,10 @@ export async function scrapeStatusController(req: any, res: any) { const job = await supabaseGetJobByIdOnlyData(req.params.jobId); - if (!allowedTeams.includes(job?.team_id) || job?.team_id !== req.auth.team_id) { + if ( + !allowedTeams.includes(job?.team_id) || + job?.team_id !== req.auth.team_id + ) { return res.status(403).json({ success: false, error: "You are not allowed to access this resource.", diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 86f2f3c0..7aebd560 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -200,17 +200,20 @@ export const extractV1Options = z schema: z .any() .optional() - .refine((val) => { - if (!val) return true; // Allow undefined schema - try { - const validate = ajv.compile(val); - return typeof validate === "function"; - } catch (e) { - return false; - } - }, { - message: "Invalid JSON schema.", - }), + .refine( + (val) => { + if (!val) return true; // Allow undefined schema + try { + const validate = ajv.compile(val); + return typeof validate === "function"; + } catch (e) { + return false; + } + }, + { + message: "Invalid JSON schema.", + }, + ), limit: z.number().int().positive().finite().safe().optional(), ignoreSitemap: z.boolean().default(false), includeSubdomains: z.boolean().default(true), @@ -452,7 +455,7 @@ export type Document = { description: string; url: string; }; -} +}; export type ErrorResponse = { success: false; @@ -477,7 +480,7 @@ export interface ScrapeResponseRequestTest { export interface URLTrace { url: string; - status: 'mapped' | 'scraped' | 'error'; + status: "mapped" | "scraped" | "error"; timing: { discoveredAt: string; scrapedAt?: string; @@ -785,28 +788,46 @@ export function toLegacyDocument( }; } -export const searchRequestSchema = z.object({ - query: z.string(), - limit: z.number().int().positive().finite().safe().max(10).optional().default(5), - tbs: z.string().optional(), - filter: z.string().optional(), - lang: z.string().optional().default("en"), - country: z.string().optional().default("us"), - location: z.string().optional(), - origin: z.string().optional().default("api"), - timeout: z.number().int().positive().finite().safe().default(60000), - scrapeOptions: scrapeOptions.extend({ - formats: z.array(z.enum([ - "markdown", - "html", - "rawHtml", - "links", - "screenshot", - "screenshot@fullPage", - "extract" - ])).default([]) - }).default({}), -}).strict("Unrecognized key in body -- please review the v1 API documentation for request body changes"); +export const searchRequestSchema = z + .object({ + query: z.string(), + limit: z + .number() + .int() + .positive() + .finite() + .safe() + .max(10) + .optional() + .default(5), + tbs: z.string().optional(), + filter: z.string().optional(), + lang: z.string().optional().default("en"), + country: z.string().optional().default("us"), + location: z.string().optional(), + origin: z.string().optional().default("api"), + timeout: 
z.number().int().positive().finite().safe().default(60000),
+    scrapeOptions: scrapeOptions
+      .extend({
+        formats: z
+          .array(
+            z.enum([
+              "markdown",
+              "html",
+              "rawHtml",
+              "links",
+              "screenshot",
+              "screenshot@fullPage",
+              "extract",
+            ]),
+          )
+          .default([]),
+      })
+      .default({}),
+  })
+  .strict(
+    "Unrecognized key in body -- please review the v1 API documentation for request body changes",
+  );
 
 export type SearchRequest = z.infer<typeof searchRequestSchema>;
 
diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts
index cc2d41b8..5b5208f3 100644
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@@ -45,7 +45,10 @@ const serverAdapter = new ExpressAdapter();
 serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
 
 const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
-  queues: [new BullAdapter(getScrapeQueue()), new BullAdapter(getExtractQueue())],
+  queues: [
+    new BullAdapter(getScrapeQueue()),
+    new BullAdapter(getExtractQueue()),
+  ],
   serverAdapter: serverAdapter,
 });
 
@@ -254,4 +257,4 @@ logger.info(`Worker ${process.pid} started`);
 // sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
 // sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
 // sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
-// 
+//
diff --git a/apps/api/src/lib/canonical-url.test.ts b/apps/api/src/lib/canonical-url.test.ts
index 65171642..980cec8e 100644
--- a/apps/api/src/lib/canonical-url.test.ts
+++ b/apps/api/src/lib/canonical-url.test.ts
@@ -1,91 +1,89 @@
-import { normalizeUrl, normalizeUrlOnlyHostname } from './canonical-url';
+import { normalizeUrl, normalizeUrlOnlyHostname } from "./canonical-url";
 
-describe('normalizeUrlOnlyHostname', () => {
-  it('should remove protocol and www from URL', () => {
-    const url = 'https://www.example.com';
-    const expected = 'example.com';
+describe("normalizeUrlOnlyHostname", () => {
+  it("should remove protocol and www from URL", () => {
+    const url = "https://www.example.com";
+    const expected = "example.com";
     expect(normalizeUrlOnlyHostname(url)).toBe(expected);
   });
 
-  it('should remove only protocol if www is not present', () => {
-    const url = 'https://example.com';
-    const expected = 'example.com';
+  it("should remove only protocol if www is not present", () => {
+    const url = "https://example.com";
+    const expected = "example.com";
     expect(normalizeUrlOnlyHostname(url)).toBe(expected);
   });
 
-  it('should handle URLs without protocol', () => {
-    const url = 'www.example.com';
-    const expected = 'example.com';
+  it("should handle URLs without protocol", () => {
+    const url = "www.example.com";
+    const expected = "example.com";
     expect(normalizeUrlOnlyHostname(url)).toBe(expected);
   });
 
-  it('should handle URLs without protocol and www', () => {
-    const url = 'example.com';
-    const expected = 'example.com';
+  it("should handle URLs without protocol and www", () => {
+    const url = "example.com";
+    const expected = "example.com";
     expect(normalizeUrlOnlyHostname(url)).toBe(expected);
   });
 
-  it('should handle URLs with paths', () => {
-    const url = 'https://www.example.com/path/to/resource';
-    const expected = 'example.com';
+  it("should handle URLs with paths", () => {
+    const url = "https://www.example.com/path/to/resource";
+    const expected = "example.com";
     expect(normalizeUrlOnlyHostname(url)).toBe(expected);
   });
 
-  it('should handle invalid URLs gracefully', () => {
-    const url = 'not a valid url';
-    const expected = 'not a valid url';
+  it("should handle invalid URLs gracefully", () => {
+    const url = "not a valid url";
+ const expected = "not a valid url"; expect(normalizeUrlOnlyHostname(url)).toBe(expected); }); }); - - -describe('normalizeUrl', () => { - it('should remove protocol and www from URL', () => { - const url = 'https://www.example.com'; - const expected = 'example.com'; +describe("normalizeUrl", () => { + it("should remove protocol and www from URL", () => { + const url = "https://www.example.com"; + const expected = "example.com"; expect(normalizeUrl(url)).toBe(expected); }); - it('should remove only protocol if www is not present', () => { - const url = 'https://example.com'; - const expected = 'example.com'; + it("should remove only protocol if www is not present", () => { + const url = "https://example.com"; + const expected = "example.com"; expect(normalizeUrl(url)).toBe(expected); }); - it('should handle URLs without protocol', () => { - const url = 'www.example.com'; - const expected = 'example.com'; + it("should handle URLs without protocol", () => { + const url = "www.example.com"; + const expected = "example.com"; expect(normalizeUrl(url)).toBe(expected); }); - it('should handle URLs without protocol and www', () => { - const url = 'example.com'; - const expected = 'example.com'; + it("should handle URLs without protocol and www", () => { + const url = "example.com"; + const expected = "example.com"; expect(normalizeUrl(url)).toBe(expected); }); - it('should handle URLs with paths', () => { - const url = 'https://www.example.com/path/to/resource'; - const expected = 'example.com/path/to/resource'; + it("should handle URLs with paths", () => { + const url = "https://www.example.com/path/to/resource"; + const expected = "example.com/path/to/resource"; expect(normalizeUrl(url)).toBe(expected); }); - it('should handle URLs with trailing slash', () => { - const url = 'https://www.example.com/'; - const expected = 'example.com'; + it("should handle URLs with trailing slash", () => { + const url = "https://www.example.com/"; + const expected = "example.com"; expect(normalizeUrl(url)).toBe(expected); }); - it('should handle URLs with trailing slash and path', () => { - const url = 'https://www.example.com/path/'; - const expected = 'example.com/path'; + it("should handle URLs with trailing slash and path", () => { + const url = "https://www.example.com/path/"; + const expected = "example.com/path"; expect(normalizeUrl(url)).toBe(expected); }); - it('should handle invalid URLs gracefully', () => { - const url = 'not a valid url'; - const expected = 'not a valid url'; + it("should handle invalid URLs gracefully", () => { + const url = "not a valid url"; + const expected = "not a valid url"; expect(normalizeUrl(url)).toBe(expected); }); }); diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index a200b9b5..00fb9831 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -322,13 +322,13 @@ export async function lockURLs( export async function lockURLsIndividually( id: string, sc: StoredCrawl, - jobs: { id: string; url: string; }[], + jobs: { id: string; url: string }[], ) { const out: typeof jobs = []; for (const job of jobs) { if (await lockURL(id, sc, job.url)) { - out.push(job); + out.push(job); } } diff --git a/apps/api/src/lib/extract/config.ts b/apps/api/src/lib/extract/config.ts index bb88804a..f92c1fd9 100644 --- a/apps/api/src/lib/extract/config.ts +++ b/apps/api/src/lib/extract/config.ts @@ -6,6 +6,4 @@ export const extractConfig = { MIN_REQUIRED_LINKS: 1, }; -export const CUSTOM_U_TEAMS = [ - 
"874d40cc-a5c0-4e93-b661-9ddfbad5e51e" -] \ No newline at end of file +export const CUSTOM_U_TEAMS = ["874d40cc-a5c0-4e93-b661-9ddfbad5e51e"]; diff --git a/apps/api/src/lib/extract/extract-redis.ts b/apps/api/src/lib/extract/extract-redis.ts index f4ed0369..658f734a 100644 --- a/apps/api/src/lib/extract/extract-redis.ts +++ b/apps/api/src/lib/extract/extract-redis.ts @@ -21,14 +21,19 @@ export async function getExtract(id: string): Promise { return x ? JSON.parse(x) : null; } -export async function updateExtract(id: string, extract: Partial) { +export async function updateExtract( + id: string, + extract: Partial, +) { const current = await getExtract(id); if (!current) return; - await redisConnection.set("extract:" + id, JSON.stringify({ ...current, ...extract })); + await redisConnection.set( + "extract:" + id, + JSON.stringify({ ...current, ...extract }), + ); await redisConnection.expire("extract:" + id, 24 * 60 * 60, "NX"); } - export async function getExtractExpiry(id: string): Promise { const d = new Date(); const ttl = await redisConnection.pttl("extract:" + id); diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 589620cb..2c909c52 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -30,7 +30,7 @@ interface ExtractResult { function getRootDomain(url: string): string { try { - if(url.endsWith("/*")) { + if (url.endsWith("/*")) { url = url.slice(0, -2); } const urlObj = new URL(url); @@ -40,32 +40,39 @@ function getRootDomain(url: string): string { } } -export async function performExtraction(extractId: string, options: ExtractServiceOptions): Promise { +export async function performExtraction( + extractId: string, + options: ExtractServiceOptions, +): Promise { const { request, teamId, plan, subId } = options; const urlTraces: URLTrace[] = []; let docs: Document[] = []; // Process URLs - const urlPromises = request.urls.map(url => - processUrl({ - url, - prompt: request.prompt, - teamId, - plan, - allowExternalLinks: request.allowExternalLinks, - origin: request.origin, - limit: request.limit, - includeSubdomains: request.includeSubdomains, - }, urlTraces) + const urlPromises = request.urls.map((url) => + processUrl( + { + url, + prompt: request.prompt, + teamId, + plan, + allowExternalLinks: request.allowExternalLinks, + origin: request.origin, + limit: request.limit, + includeSubdomains: request.includeSubdomains, + }, + urlTraces, + ), ); const processedUrls = await Promise.all(urlPromises); - const links = processedUrls.flat().filter(url => url); + const links = processedUrls.flat().filter((url) => url); if (links.length === 0) { return { success: false, - error: "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.", + error: + "No valid URLs found to scrape. 
Try adjusting your search criteria or including more URLs.", extractId, urlTrace: urlTraces, }; @@ -73,14 +80,17 @@ export async function performExtraction(extractId: string, options: ExtractServi // Scrape documents const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000; - const scrapePromises = links.map(url => - scrapeDocument({ - url, - teamId, - plan, - origin: request.origin || "api", - timeout, - }, urlTraces) + const scrapePromises = links.map((url) => + scrapeDocument( + { + url, + teamId, + plan, + origin: request.origin || "api", + timeout, + }, + urlTraces, + ), ); try { @@ -114,13 +124,16 @@ export async function performExtraction(extractId: string, options: ExtractServi // Update token usage in traces if (completions.numTokens) { - const totalLength = docs.reduce((sum, doc) => sum + (doc.markdown?.length || 0), 0); + const totalLength = docs.reduce( + (sum, doc) => sum + (doc.markdown?.length || 0), + 0, + ); docs.forEach((doc) => { if (doc.metadata?.sourceURL) { const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL); if (trace && trace.contentStats) { trace.contentStats.tokensUsed = Math.floor( - ((doc.markdown?.length || 0) / totalLength) * completions.numTokens + ((doc.markdown?.length || 0) / totalLength) * completions.numTokens, ); } } @@ -131,7 +144,7 @@ export async function performExtraction(extractId: string, options: ExtractServi // const rootDomains = new Set(request.urls.map(getRootDomain)); // rootDomains.forEach(async url => { // const crawlId = crypto.randomUUID(); - + // // Create and save crawl configuration first // const sc: StoredCrawl = { // originUrl: url, @@ -155,7 +168,7 @@ export async function performExtraction(extractId: string, options: ExtractServi // parsePDF: true, // skipTlsVerification: false, // }, - // internalOptions: { + // internalOptions: { // disableSmartWaitCache: true, // isBackgroundIndex: true // }, @@ -185,7 +198,7 @@ export async function performExtraction(extractId: string, options: ExtractServi let linksBilled = links.length * 5; - if(CUSTOM_U_TEAMS.includes(teamId)){ + if (CUSTOM_U_TEAMS.includes(teamId)) { linksBilled = 1; } // Bill team for usage @@ -213,12 +226,12 @@ export async function performExtraction(extractId: string, options: ExtractServi updateExtract(extractId, { status: "completed", }).catch((error) => { - logger.error(`Failed to update extract ${extractId} status to completed: ${error}`); + logger.error( + `Failed to update extract ${extractId} status to completed: ${error}`, + ); }); }); - - return { success: true, data: completions.extract ?? {}, @@ -226,4 +239,4 @@ export async function performExtraction(extractId: string, options: ExtractServi warning: completions.warning, urlTrace: request.urlTrace ? 
urlTraces : undefined, }; -} \ No newline at end of file +} diff --git a/apps/api/src/lib/extract/index/pinecone.ts b/apps/api/src/lib/extract/index/pinecone.ts index 14c3bea4..50799d4d 100644 --- a/apps/api/src/lib/extract/index/pinecone.ts +++ b/apps/api/src/lib/extract/index/pinecone.ts @@ -1,6 +1,6 @@ -import { Pinecone } from '@pinecone-database/pinecone'; -import { Document } from '../../../controllers/v1/types'; -import { logger } from '../../logger'; +import { Pinecone } from "@pinecone-database/pinecone"; +import { Document } from "../../../controllers/v1/types"; +import { logger } from "../../logger"; import OpenAI from "openai"; const openai = new OpenAI({ @@ -48,34 +48,43 @@ export async function indexPage({ document, originUrl, crawlId, - teamId + teamId, }: { document: Document; originUrl: string; crawlId?: string; teamId?: string; -} -) { +}) { try { const index = pinecone.index(INDEX_NAME); // Trim markdown if it's too long let trimmedMarkdown = document.markdown; - if (trimmedMarkdown && Buffer.byteLength(trimmedMarkdown, 'utf-8') > MAX_METADATA_SIZE) { - trimmedMarkdown = trimmedMarkdown.slice(0, Math.floor(MAX_METADATA_SIZE / 2)); // Using half the size to be safe with UTF-8 encoding + if ( + trimmedMarkdown && + Buffer.byteLength(trimmedMarkdown, "utf-8") > MAX_METADATA_SIZE + ) { + trimmedMarkdown = trimmedMarkdown.slice( + 0, + Math.floor(MAX_METADATA_SIZE / 2), + ); // Using half the size to be safe with UTF-8 encoding } // Create text to embed const textToEmbed = [ document.metadata.title, document.metadata.description, - trimmedMarkdown - ].filter(Boolean).join('\n\n'); + trimmedMarkdown, + ] + .filter(Boolean) + .join("\n\n"); // Get embedding from OpenAI const embedding = await getEmbedding(textToEmbed); - const normalizedUrl = normalizeUrl(document.metadata.sourceURL || document.metadata.url!); + const normalizedUrl = normalizeUrl( + document.metadata.sourceURL || document.metadata.url!, + ); // Prepare metadata const metadata: PageMetadata = { @@ -86,29 +95,30 @@ export async function indexPage({ crawlId, teamId, markdown: trimmedMarkdown, - timestamp: Date.now() + timestamp: Date.now(), }; // Upsert to Pinecone - await index.upsert([{ - id: normalizedUrl, - values: embedding, - metadata: { - ...metadata, - [document.metadata.sourceURL || document.metadata.url!]: true - } - }]); + await index.upsert([ + { + id: normalizedUrl, + values: embedding, + metadata: { + ...metadata, + [document.metadata.sourceURL || document.metadata.url!]: true, + }, + }, + ]); - logger.debug('Successfully indexed page in Pinecone', { + logger.debug("Successfully indexed page in Pinecone", { url: metadata.url, - crawlId + crawlId, }); - } catch (error) { - logger.error('Failed to index page in Pinecone', { + logger.error("Failed to index page in Pinecone", { error, url: document.metadata.sourceURL || document.metadata.url, - crawlId + crawlId, }); } } @@ -116,7 +126,7 @@ export async function indexPage({ export async function searchSimilarPages( query: string, originUrl?: string, - limit: number = 10 + limit: number = 10, ) { try { const index = pinecone.index(INDEX_NAME); @@ -127,31 +137,30 @@ export async function searchSimilarPages( const queryParams: any = { vector: queryEmbedding, topK: limit, - includeMetadata: true + includeMetadata: true, }; const normalizedOriginUrl = originUrl ? 
normalizeUrl(originUrl) : undefined; // Add filter if originUrl is provided if (normalizedOriginUrl) { queryParams.filter = { - originUrl: { $eq: normalizedOriginUrl } + originUrl: { $eq: normalizedOriginUrl }, }; } const results = await index.query(queryParams); - return results.matches.map(match => ({ + return results.matches.map((match) => ({ url: match.metadata?.url, - title: match.metadata?.title, + title: match.metadata?.title, description: match.metadata?.description, score: match.score, - markdown: match.metadata?.markdown + markdown: match.metadata?.markdown, })); - } catch (error) { - logger.error('Failed to search similar pages in Pinecone', { + logger.error("Failed to search similar pages in Pinecone", { error, query, - originUrl + originUrl, }); return []; } diff --git a/apps/api/src/lib/extract/reranker.ts b/apps/api/src/lib/extract/reranker.ts index e5b61741..439400c6 100644 --- a/apps/api/src/lib/extract/reranker.ts +++ b/apps/api/src/lib/extract/reranker.ts @@ -9,8 +9,6 @@ const cohere = new CohereClient({ token: process.env.COHERE_API_KEY, }); - - interface RankingResult { mappedLinks: MapDocument[]; linksAndScores: { @@ -59,7 +57,6 @@ export async function rerankLinks( searchQuery, ); - // First try with high threshold let filteredLinks = filterAndProcessLinks( mappedLinks, @@ -67,8 +64,6 @@ export async function rerankLinks( extractConfig.INITIAL_SCORE_THRESHOLD, ); - - // If we don't have enough high-quality links, try with lower threshold if (filteredLinks.length < extractConfig.MIN_REQUIRED_LINKS) { logger.info( @@ -102,7 +97,7 @@ export async function rerankLinks( if (trace) { trace.relevanceScore = score.score; // If URL didn't make it through filtering, mark it as filtered out - if (!filteredLinks.some(link => link.url === score.link)) { + if (!filteredLinks.some((link) => link.url === score.link)) { trace.warning = `Relevance score ${score.score} below threshold`; trace.usedInCompletion = false; } @@ -110,20 +105,20 @@ export async function rerankLinks( }); const rankedLinks = filteredLinks.slice(0, extractConfig.MAX_RANKING_LIMIT); - + // Mark URLs that will be used in completion - rankedLinks.forEach(link => { - const trace = urlTraces.find(t => t.url === link.url); + rankedLinks.forEach((link) => { + const trace = urlTraces.find((t) => t.url === link.url); if (trace) { trace.usedInCompletion = true; } }); // Mark URLs that were dropped due to ranking limit - filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach(link => { - const trace = urlTraces.find(t => t.url === link.url); + filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach((link) => { + const trace = urlTraces.find((t) => t.url === link.url); if (trace) { - trace.warning = 'Excluded due to ranking limit'; + trace.warning = "Excluded due to ranking limit"; trace.usedInCompletion = false; } }); diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index a5027fa9..858a780b 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -20,10 +20,13 @@ interface ProcessUrlOptions { includeSubdomains?: boolean; } -export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace[]): Promise { +export async function processUrl( + options: ProcessUrlOptions, + urlTraces: URLTrace[], +): Promise { const trace: URLTrace = { url: options.url, - status: 'mapped', + status: "mapped", timing: { discoveredAt: new Date().toISOString(), }, @@ -35,8 +38,8 @@ export async function processUrl(options: 
ProcessUrlOptions, urlTraces: URLTrace trace.usedInCompletion = true; return [options.url]; } - trace.status = 'error'; - trace.error = 'URL is blocked'; + trace.status = "error"; + trace.error = "URL is blocked"; trace.usedInCompletion = false; return []; } @@ -46,9 +49,10 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace let rephrasedPrompt = options.prompt; if (options.prompt) { - rephrasedPrompt = await generateBasicCompletion( - buildRefrasedPrompt(options.prompt, baseUrl) - ) ?? options.prompt; + rephrasedPrompt = + (await generateBasicCompletion( + buildRefrasedPrompt(options.prompt, baseUrl), + )) ?? options.prompt; } try { @@ -70,11 +74,11 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace let uniqueUrls = removeDuplicateUrls(allUrls); // Track all discovered URLs - uniqueUrls.forEach(discoveredUrl => { - if (!urlTraces.some(t => t.url === discoveredUrl)) { + uniqueUrls.forEach((discoveredUrl) => { + if (!urlTraces.some((t) => t.url === discoveredUrl)) { urlTraces.push({ url: discoveredUrl, - status: 'mapped', + status: "mapped", timing: { discoveredAt: new Date().toISOString(), }, @@ -84,7 +88,7 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace }); // retry if only one url is returned - if (uniqueUrls.length <= 1) { + if (uniqueUrls.length <= 1) { const retryMapResults = await getMapResults({ url: baseUrl, teamId: options.teamId, @@ -96,18 +100,18 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace includeMetadata: true, includeSubdomains: options.includeSubdomains, }); - + mappedLinks = retryMapResults.mapResults as MapDocument[]; allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; uniqueUrls = removeDuplicateUrls(allUrls); // Track all discovered URLs - uniqueUrls.forEach(discoveredUrl => { - if (!urlTraces.some(t => t.url === discoveredUrl)) { + uniqueUrls.forEach((discoveredUrl) => { + if (!urlTraces.some((t) => t.url === discoveredUrl)) { urlTraces.push({ url: discoveredUrl, - status: 'mapped', - warning: 'Broader search. Not limiting map results to prompt.', + status: "mapped", + warning: "Broader search. Not limiting map results to prompt.", timing: { discoveredAt: new Date().toISOString(), }, @@ -118,11 +122,11 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace } // Track all discovered URLs - uniqueUrls.forEach(discoveredUrl => { - if (!urlTraces.some(t => t.url === discoveredUrl)) { + uniqueUrls.forEach((discoveredUrl) => { + if (!urlTraces.some((t) => t.url === discoveredUrl)) { urlTraces.push({ url: discoveredUrl, - status: 'mapped', + status: "mapped", timing: { discoveredAt: new Date().toISOString(), }, @@ -155,11 +159,11 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace mappedLinks = await rerankLinks(mappedLinks, searchQuery, urlTraces); } - return mappedLinks.map(x => x.url); + return mappedLinks.map((x) => x.url); } catch (error) { - trace.status = 'error'; + trace.status = "error"; trace.error = error.message; trace.usedInCompletion = false; return []; } -} \ No newline at end of file +} diff --git a/apps/api/src/lib/logger.ts b/apps/api/src/lib/logger.ts index bf57f63f..76b66180 100644 --- a/apps/api/src/lib/logger.ts +++ b/apps/api/src/lib/logger.ts @@ -42,11 +42,18 @@ export const logger = winston.createLogger({ }, }), transports: [ - ...(process.env.FIRECRAWL_LOG_TO_FILE ? 
[ - new winston.transports.File({ - filename: "firecrawl-" + (process.argv[1].includes("worker") ? "worker" : "app") + "-" + crypto.randomUUID() + ".log", - }) - ] : []), + ...(process.env.FIRECRAWL_LOG_TO_FILE + ? [ + new winston.transports.File({ + filename: + "firecrawl-" + + (process.argv[1].includes("worker") ? "worker" : "app") + + "-" + + crypto.randomUUID() + + ".log", + }), + ] + : []), new winston.transports.Console({ format: winston.format.combine( winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }), diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 6bb8b04e..1d10fb17 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -175,11 +175,10 @@ export async function runWebScraper({ } // If the team is the background index team, return the response - if(team_id === process.env.BACKGROUND_INDEX_TEAM_ID!) { + if (team_id === process.env.BACKGROUND_INDEX_TEAM_ID!) { return response; } - billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => { logger.error( `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`, diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 656d076c..a916dd40 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -192,7 +192,8 @@ v1Router.get( wrap((req: any, res): any => crawlStatusController(req, res, true)), ); -v1Router.get("/scrape/:jobId", +v1Router.get( + "/scrape/:jobId", authMiddleware(RateLimiterMode.CrawlStatus), wrap(scrapeStatusController), ); @@ -242,6 +243,3 @@ v1Router.get( authMiddleware(RateLimiterMode.CrawlStatus), wrap(creditUsageController), ); - - - diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index c958306d..5662fff9 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -219,34 +219,51 @@ export class WebCrawler { const _urlsHandler = async (urls: string[]) => { let uniqueURLs: string[] = []; for (const url of urls) { - if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(url))) { + if ( + await redisConnection.sadd( + "sitemap:" + this.jobId + ":links", + normalizeUrl(url), + ) + ) { uniqueURLs.push(url); } } - await redisConnection.expire("sitemap:" + this.jobId + ":links", 3600, "NX"); + await redisConnection.expire( + "sitemap:" + this.jobId + ":links", + 3600, + "NX", + ); if (uniqueURLs.length > 0) { urlsHandler(uniqueURLs); } }; - let count = await this.tryFetchSitemapLinks(this.initialUrl, (urls: string[]) => { - if (fromMap && onlySitemap) { - return urlsHandler(urls); - } else { - let filteredLinks = this.filterLinks( - [...new Set(urls)], - leftOfLimit, - this.maxCrawledDepth, - fromMap, - ); - leftOfLimit -= filteredLinks.length; - return _urlsHandler(filteredLinks); - } - }); + let count = await this.tryFetchSitemapLinks( + this.initialUrl, + (urls: string[]) => { + if (fromMap && onlySitemap) { + return urlsHandler(urls); + } else { + let filteredLinks = this.filterLinks( + [...new Set(urls)], + leftOfLimit, + this.maxCrawledDepth, + fromMap, + ); + leftOfLimit -= filteredLinks.length; + return _urlsHandler(filteredLinks); + } + }, + ); if (count > 0) { - if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(this.initialUrl))) { + if ( + await redisConnection.sadd( + "sitemap:" + this.jobId + ":links", + normalizeUrl(this.initialUrl), + ) + ) { urlsHandler([this.initialUrl]); } count++; @@ -470,8 +487,13 @@ export 
class WebCrawler { return socialMediaOrEmail.some((ext) => url.includes(ext)); } - private async tryFetchSitemapLinks(url: string, urlsHandler: (urls: string[]) => unknown): Promise { - const sitemapUrl = url.endsWith(".xml") ? url : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`; + private async tryFetchSitemapLinks( + url: string, + urlsHandler: (urls: string[]) => unknown, + ): Promise { + const sitemapUrl = url.endsWith(".xml") + ? url + : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`; let sitemapCount: number = 0; @@ -482,37 +504,43 @@ export class WebCrawler { this.logger, ); } catch (error) { - this.logger.debug( - `Failed to fetch sitemap from ${sitemapUrl}`, - { method: "tryFetchSitemapLinks", sitemapUrl, error }, - ); + this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, { + method: "tryFetchSitemapLinks", + sitemapUrl, + error, + }); } // If this is a subdomain, also try to get sitemap from the main domain try { const urlObj = new URL(url); const hostname = urlObj.hostname; - const domainParts = hostname.split('.'); - + const domainParts = hostname.split("."); + // Check if this is a subdomain (has more than 2 parts and not www) - if (domainParts.length > 2 && domainParts[0] !== 'www') { + if (domainParts.length > 2 && domainParts[0] !== "www") { // Get the main domain by taking the last two parts - const mainDomain = domainParts.slice(-2).join('.'); + const mainDomain = domainParts.slice(-2).join("."); const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`; const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`; try { // Get all links from the main domain's sitemap sitemapCount += await getLinksFromSitemap( - { sitemapUrl: mainDomainSitemapUrl, urlsHandler(urls) { - return urlsHandler(urls.filter(link => { - try { - const linkUrl = new URL(link); - return linkUrl.hostname.endsWith(hostname); - } catch { - } - })) - }, mode: "fire-engine" }, + { + sitemapUrl: mainDomainSitemapUrl, + urlsHandler(urls) { + return urlsHandler( + urls.filter((link) => { + try { + const linkUrl = new URL(link); + return linkUrl.hostname.endsWith(hostname); + } catch {} + }), + ); + }, + mode: "fire-engine", + }, this.logger, ); } catch (error) { diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 29f821d7..679ac5b9 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -15,7 +15,7 @@ export async function getLinksFromSitemap( mode = "axios", }: { sitemapUrl: string; - urlsHandler(urls: string[]): unknown, + urlsHandler(urls: string[]): unknown; mode?: "axios" | "fire-engine"; }, logger: Logger, @@ -31,7 +31,10 @@ export async function getLinksFromSitemap( { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }, ); if (!response.success) { - logger.debug("Failed to scrape sitemap via TLSClient, falling back to axios...", { error: response.error }) + logger.debug( + "Failed to scrape sitemap via TLSClient, falling back to axios...", + { error: response.error }, + ); const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout }); content = ar.data; } else { @@ -63,14 +66,11 @@ export async function getLinksFromSitemap( .map((sitemap) => sitemap.loc[0].trim()); const sitemapPromises: Promise[] = sitemapUrls.map((sitemapUrl) => - getLinksFromSitemap( - { sitemapUrl, urlsHandler, mode }, - logger, - ), + getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger), ); - + const results = await Promise.all(sitemapPromises); - count = results.reduce((a,x) => a + 
x) + count = results.reduce((a, x) => a + x); } else if (root && root.url) { // Check if any URLs point to additional sitemaps const xmlSitemaps: string[] = root.url @@ -78,7 +78,7 @@ export async function getLinksFromSitemap( (url) => url.loc && url.loc.length > 0 && - url.loc[0].trim().toLowerCase().endsWith('.xml') + url.loc[0].trim().toLowerCase().endsWith(".xml"), ) .map((url) => url.loc[0].trim()); @@ -90,7 +90,10 @@ export async function getLinksFromSitemap( logger, ), ); - count += (await Promise.all(sitemapPromises)).reduce((a,x) => a + x, 0); + count += (await Promise.all(sitemapPromises)).reduce( + (a, x) => a + x, + 0, + ); } const validUrls = root.url @@ -98,7 +101,7 @@ export async function getLinksFromSitemap( (url) => url.loc && url.loc.length > 0 && - !url.loc[0].trim().toLowerCase().endsWith('.xml') && + !url.loc[0].trim().toLowerCase().endsWith(".xml") && !WebCrawler.prototype.isFile(url.loc[0].trim()), ) .map((url) => url.loc[0].trim()); diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts index eefd4c0f..a13ff971 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts @@ -3,7 +3,10 @@ import { EngineScrapeResult } from ".."; import { Meta } from "../.."; import { TimeoutError } from "../../error"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; -import { InsecureConnectionError, makeSecureDispatcher } from "../utils/safeFetch"; +import { + InsecureConnectionError, + makeSecureDispatcher, +} from "../utils/safeFetch"; export async function scrapeURLWithFetch( meta: Meta, @@ -20,7 +23,9 @@ export async function scrapeURLWithFetch( headers: meta.options.headers, }), (async () => { - await new Promise((resolve) => setTimeout(() => resolve(null), timeout)); + await new Promise((resolve) => + setTimeout(() => resolve(null), timeout), + ); throw new TimeoutError( "Fetch was unable to scrape the page before timing out", { cause: { timeout } }, @@ -28,7 +33,10 @@ export async function scrapeURLWithFetch( })(), ]); } catch (error) { - if (error instanceof TypeError && error.cause instanceof InsecureConnectionError) { + if ( + error instanceof TypeError && + error.cause instanceof InsecureConnectionError + ) { throw error.cause; } else { throw error; diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts index e02e9dbb..8b7b86fb 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts @@ -3,7 +3,12 @@ import * as Sentry from "@sentry/node"; import { z } from "zod"; import { robustFetch } from "../../lib/fetch"; -import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error"; +import { + ActionError, + EngineError, + SiteError, + UnsupportedFileError, +} from "../../error"; const successSchema = z.object({ jobId: z.string(), @@ -35,12 +40,15 @@ const successSchema = z.object({ }) .array() .optional(), - + // chrome-cdp only -- file download handler - file: z.object({ - name: z.string(), - content: z.string(), - }).optional().or(z.null()), + file: z + .object({ + name: z.string(), + content: z.string(), + }) + .optional() + .or(z.null()), }); export type FireEngineCheckStatusSuccess = z.infer; @@ -121,7 +129,9 @@ export async function fireEngineCheckStatus( typeof status.error === "string" && 
status.error.includes("File size exceeds") ) { - throw new UnsupportedFileError("File size exceeds " + status.error.split("File size exceeds ")[1]); + throw new UnsupportedFileError( + "File size exceeds " + status.error.split("File size exceeds ")[1], + ); } else if ( typeof status.error === "string" && // TODO: improve this later diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index aeafebea..43da361c 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -13,7 +13,13 @@ import { FireEngineCheckStatusSuccess, StillProcessingError, } from "./checkStatus"; -import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError } from "../../error"; +import { + ActionError, + EngineError, + SiteError, + TimeoutError, + UnsupportedFileError, +} from "../../error"; import * as Sentry from "@sentry/node"; import { Action } from "../../../../lib/entities"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index e452f7fa..12a5e6e4 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -38,7 +38,7 @@ const useCache = process.env.CACHE_REDIS_URL !== undefined; export const engines: Engine[] = [ - ...(useCache ? [ "cache" as const ] : []), + ...(useCache ? ["cache" as const] : []), ...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, @@ -298,7 +298,6 @@ export function buildFallbackList(meta: Meta): { engine: Engine; unsupportedFeatures: Set; }[] { - if (meta.internalOptions.useCache !== true) { const cacheIndex = engines.indexOf("cache"); if (cacheIndex !== -1) { diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts index 55be08c3..e83cf791 100644 --- a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts @@ -7,11 +7,18 @@ import { v4 as uuid } from "uuid"; import * as undici from "undici"; import { makeSecureDispatcher } from "./safeFetch"; -export async function fetchFileToBuffer(url: string, init?: undici.RequestInit): Promise<{ +export async function fetchFileToBuffer( + url: string, + init?: undici.RequestInit, +): Promise<{ response: undici.Response; buffer: Buffer; }> { - const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) }); + const response = await undici.fetch(url, { + ...init, + redirect: "follow", + dispatcher: await makeSecureDispatcher(url), + }); return { response, buffer: Buffer.from(await response.arrayBuffer()), @@ -30,7 +37,11 @@ export async function downloadFile( const tempFileWrite = createWriteStream(tempFilePath); // TODO: maybe we could use tlsclient for this? for proxying - const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) }); + const response = await undici.fetch(url, { + ...init, + redirect: "follow", + dispatcher: await makeSecureDispatcher(url), + }); // This should never happen in the current state of JS/Undici (2024), but let's check anyways. 
if (response.body === null) { diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts b/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts index eb6f2459..6cae9bd8 100644 --- a/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts @@ -4,57 +4,71 @@ import * as undici from "undici"; import { Address6 } from "ip-address"; export class InsecureConnectionError extends Error { - constructor() { - super("Connection violated security rules.") - } + constructor() { + super("Connection violated security rules."); + } } function isIPv4Private(address: string): boolean { - const parts = address.split(".").map(x => parseInt(x, 10)); - return parts[0] === 0 // Current (local, "this") network - || parts[0] === 10 // Used for local communications within a private network - || (parts[0] === 100 && parts[1] >= 64 && parts[1] < 128) // Shared address space for communications between a service provider and its subscribers when using a carrier-grade NAT - || parts[0] === 127 // Used for loopback addresses to the local host - || (parts[0] === 169 && parts[1] === 254) // Used for link-local addresses between two hosts on a single link when no IP address is otherwise specified, such as would have normally been retrieved from a DHCP server - || (parts[0] === 127 && parts[1] >= 16 && parts[2] < 32) // Used for local communications within a private network - || (parts[0] === 192 && parts[1] === 0 && parts[2] === 0) // IETF Porotocol Assignments, DS-Lite (/29) - || (parts[0] === 192 && parts[1] === 0 && parts[2] === 2) // Assigned as TEST-NET-1, documentation and examples - || (parts[0] === 192 && parts[1] === 88 && parts[2] === 99) // Reserved. Formerly used for IPv6 to IPv4 relay (included IPv6 address block 2002::/16). - || (parts[0] === 192 && parts[1] === 168) // Used for local communications within a private network - || (parts[0] === 192 && parts[1] >= 18 && parts[1] < 20) // Used for benchmark testing of inter-network communications between two separate subnets - || (parts[0] === 198 && parts[1] === 51 && parts[2] === 100) // Assigned as TEST-NET-2, documentation and examples - || (parts[0] === 203 && parts[1] === 0 && parts[2] === 113) // Assigned as TEST-NET-3, documentation and examples - || (parts[0] >= 224 && parts[0] < 240) // In use for multicast (former Class D network) - || (parts[0] === 233 && parts[1] === 252 && parts[2] === 0) // Assigned as MCAST-TEST-NET, documentation and examples (Note that this is part of the above multicast space.) 
- || parts[0] >= 240 // Reserved for future use (former class E network) - || (parts[0] === 255 && parts[1] === 255 && parts[2] === 255 && parts[3] === 255) // Reserved for the "limited broadcast" destination address + const parts = address.split(".").map((x) => parseInt(x, 10)); + return ( + parts[0] === 0 || // Current (local, "this") network + parts[0] === 10 || // Used for local communications within a private network + (parts[0] === 100 && parts[1] >= 64 && parts[1] < 128) || // Shared address space for communications between a service provider and its subscribers when using a carrier-grade NAT + parts[0] === 127 || // Used for loopback addresses to the local host + (parts[0] === 169 && parts[1] === 254) || // Used for link-local addresses between two hosts on a single link when no IP address is otherwise specified, such as would have normally been retrieved from a DHCP server + (parts[0] === 127 && parts[1] >= 16 && parts[2] < 32) || // Used for local communications within a private network + (parts[0] === 192 && parts[1] === 0 && parts[2] === 0) || // IETF Porotocol Assignments, DS-Lite (/29) + (parts[0] === 192 && parts[1] === 0 && parts[2] === 2) || // Assigned as TEST-NET-1, documentation and examples + (parts[0] === 192 && parts[1] === 88 && parts[2] === 99) || // Reserved. Formerly used for IPv6 to IPv4 relay (included IPv6 address block 2002::/16). + (parts[0] === 192 && parts[1] === 168) || // Used for local communications within a private network + (parts[0] === 192 && parts[1] >= 18 && parts[1] < 20) || // Used for benchmark testing of inter-network communications between two separate subnets + (parts[0] === 198 && parts[1] === 51 && parts[2] === 100) || // Assigned as TEST-NET-2, documentation and examples + (parts[0] === 203 && parts[1] === 0 && parts[2] === 113) || // Assigned as TEST-NET-3, documentation and examples + (parts[0] >= 224 && parts[0] < 240) || // In use for multicast (former Class D network) + (parts[0] === 233 && parts[1] === 252 && parts[2] === 0) || // Assigned as MCAST-TEST-NET, documentation and examples (Note that this is part of the above multicast space.) 
+    parts[0] >= 240 || // Reserved for future use (former class E network)
+    (parts[0] === 255 &&
+      parts[1] === 255 &&
+      parts[2] === 255 &&
+      parts[3] === 255)
+  ); // Reserved for the "limited broadcast" destination address
 }
 
 function isIPv6Private(ipv6) {
-    return new Address6(ipv6).getScope() !== "Global";
+  return new Address6(ipv6).getScope() !== "Global";
 }
 
-export function makeSecureDispatcher(url: string, options?: undici.Agent.Options) {
-    const agent = new undici.Agent({
-        connect: {
-            rejectUnauthorized: false, // bypass SSL failures -- this is fine
-            // lookup: secureLookup,
-        },
-        maxRedirections: 5000,
-        ...options,
-    });
+export function makeSecureDispatcher(
+  url: string,
+  options?: undici.Agent.Options,
+) {
+  const agent = new undici.Agent({
+    connect: {
+      rejectUnauthorized: false, // bypass SSL failures -- this is fine
+      // lookup: secureLookup,
+    },
+    maxRedirections: 5000,
+    ...options,
+  });
 
-    agent.on("connect", (_, targets) => {
-        const client: undici.Client = targets.slice(-1)[0] as undici.Client;
-        const socketSymbol = Object.getOwnPropertySymbols(client).find(x => x.description === "socket")!;
-        const socket: Socket | TLSSocket = (client as any)[socketSymbol];
+  agent.on("connect", (_, targets) => {
+    const client: undici.Client = targets.slice(-1)[0] as undici.Client;
+    const socketSymbol = Object.getOwnPropertySymbols(client).find(
+      (x) => x.description === "socket",
+    )!;
+    const socket: Socket | TLSSocket = (client as any)[socketSymbol];
 
-        if (socket.remoteAddress) {
-            if (socket.remoteFamily === "IPv4" ? isIPv4Private(socket.remoteAddress!) : isIPv6Private(socket.remoteAddress!)) {
-                socket.destroy(new InsecureConnectionError())
-            }
-        }
-    });
+    if (socket.remoteAddress) {
+      if (
+        socket.remoteFamily === "IPv4"
+          ? isIPv4Private(socket.remoteAddress!)
+          : isIPv6Private(socket.remoteAddress!)
+      ) {
+        socket.destroy(new InsecureConnectionError());
+      }
+    }
+  });
 
-    return agent;
-}
\ No newline at end of file
+  return agent;
+}
diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index cae049c4..26577675 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -420,7 +420,9 @@ export async function scrapeURL(
   } else if (error instanceof ActionError) {
     meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
   } else if (error instanceof UnsupportedFileError) {
-    meta.logger.warn("scrapeURL: Tried to scrape unsupported file", { error });
+    meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
+      error,
+    });
   } else {
     Sentry.captureException(error);
     meta.logger.error("scrapeURL: Unexpected error happened", { error });
diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
index 5c489a70..a82dff79 100644
--- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
@@ -43,11 +43,16 @@ export function extractMetadata(
   try {
     title = soup("title").first().text().trim() || undefined;
     description = soup('meta[name="description"]').attr("content") || undefined;
-
-    const faviconLink = soup('link[rel="icon"]').attr("href") || soup('link[rel*="icon"]').first().attr("href") || undefined;
+
+    const faviconLink =
+      soup('link[rel="icon"]').attr("href") ||
+      soup('link[rel*="icon"]').first().attr("href") ||
+      undefined;
     if (faviconLink) {
       const baseUrl = new URL(meta.url).origin;
-      favicon = faviconLink.startsWith('http') ? faviconLink : `${baseUrl}${faviconLink}`;
+      favicon = faviconLink.startsWith("http")
+        ? faviconLink
+        : `${baseUrl}${faviconLink}`;
     }
 
     // Assuming the language is part of the URL as per the regex pattern
diff --git a/apps/api/src/scraper/scrapeURL/transformers/cache.ts b/apps/api/src/scraper/scrapeURL/transformers/cache.ts
index f2d7bcf4..96c3d437 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/cache.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/cache.ts
@@ -24,7 +24,6 @@ export function saveToCache(meta: Meta, document: Document): Document {
     return document;
   }
 
-
   const key = cacheKey(meta.url, meta.options, meta.internalOptions);
 
   if (key !== null) {
diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts
index f23f506f..844a7fbc 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts
@@ -1,33 +1,41 @@
 import { removeDefaultProperty } from "./llmExtract";
 
 describe("removeDefaultProperty", () => {
-    it("should remove the default property from a simple object", () => {
-        const input = { default: "test", test: "test" };
-        const expectedOutput = { test: "test" };
-        expect(removeDefaultProperty(input)).toEqual(expectedOutput);
-    });
+  it("should remove the default property from a simple object", () => {
+    const input = { default: "test", test: "test" };
+    const expectedOutput = { test: "test" };
+    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
+  });
 
-    it("should remove the default property from a nested object", () => {
-        const input = { default: "test", nested: { default: "nestedTest", test: "nestedTest" } };
-        const expectedOutput = { nested: { test: "nestedTest" } };
-        expect(removeDefaultProperty(input)).toEqual(expectedOutput);
-    });
+  it("should remove the default property from a nested object", () => {
+    const input = {
+      default: "test",
+      nested: { default: "nestedTest", test: "nestedTest" },
+    };
+    const expectedOutput = { nested: { test: "nestedTest" } };
+    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
+  });
 
-    it("should remove the default property from an array of objects", () => {
-        const input = { array: [{ default: "test1", test: "test1" }, { default: "test2", test: "test2" }] };
-        const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
-        expect(removeDefaultProperty(input)).toEqual(expectedOutput);
-    });
+  it("should remove the default property from an array of objects", () => {
+    const input = {
+      array: [
+        { default: "test1", test: "test1" },
+        { default: "test2", test: "test2" },
+      ],
+    };
+    const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
+    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
+  });
 
-    it("should handle objects without a default property", () => {
-        const input = { test: "test" };
-        const expectedOutput = { test: "test" };
-        expect(removeDefaultProperty(input)).toEqual(expectedOutput);
-    });
+  it("should handle objects without a default property", () => {
+    const input = { test: "test" };
+    const expectedOutput = { test: "test" };
+    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
+  });
 
-    it("should handle null and non-object inputs", () => {
-        expect(removeDefaultProperty(null)).toBeNull();
-        expect(removeDefaultProperty("string")).toBe("string");
-        expect(removeDefaultProperty(123)).toBe(123);
-    });
-});
\ No newline at end of file
+  it("should handle null and non-object inputs", () => {
+    expect(removeDefaultProperty(null)).toBeNull();
+    expect(removeDefaultProperty("string")).toBe("string");
+    expect(removeDefaultProperty(123)).toBe(123);
+  });
+});
diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
index 0b4d6e1e..e9945963 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
@@ -123,7 +123,7 @@ export async function generateOpenAICompletions(
   let schema = options.schema;
   if (schema) {
     schema = removeDefaultProperty(schema);
-}
+  }
 
   if (schema && schema.type === "array") {
     schema = {
@@ -140,10 +140,10 @@ export async function generateOpenAICompletions(
       properties: Object.fromEntries(
         Object.entries(schema).map(([key, value]) => {
           return [key, removeDefaultProperty(value)];
-        })
+        }),
       ),
       required: Object.keys(schema),
-      additionalProperties: false
+      additionalProperties: false,
     };
   }
 
@@ -240,17 +240,17 @@ export async function performLLMExtract(
 }
 
 export function removeDefaultProperty(schema: any): any {
-  if (typeof schema !== 'object' || schema === null) return schema;
+  if (typeof schema !== "object" || schema === null) return schema;
 
   const { default: _, ...rest } = schema;
 
   for (const key in rest) {
-      if (Array.isArray(rest[key])) {
-          rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));
-      } else if (typeof rest[key] === 'object' && rest[key] !== null) {
-          rest[key] = removeDefaultProperty(rest[key]);
-      }
+    if (Array.isArray(rest[key])) {
+      rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));
+    } else if (typeof rest[key] === "object" && rest[key] !== null) {
+      rest[key] = removeDefaultProperty(rest[key]);
+    }
   }
 
   return rest;
-}
\ No newline at end of file
+}
diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts
index b367e0b6..2ee07292 100644
--- a/apps/api/src/services/logging/log_job.ts
+++ b/apps/api/src/services/logging/log_job.ts
@@ -9,9 +9,11 @@ configDotenv();
 
 function cleanOfNull<T>(x: T): T {
   if (Array.isArray(x)) {
-    return x.map(x => cleanOfNull(x)) as T;
+    return x.map((x) => cleanOfNull(x)) as T;
   } else if (typeof x === "object" && x !== null) {
-    return Object.fromEntries(Object.entries(x).map(([k,v]) => [k,cleanOfNull(v)])) as T
+    return Object.fromEntries(
+      Object.entries(x).map(([k, v]) => [k, cleanOfNull(v)]),
+    ) as T;
   } else if (typeof x === "string") {
     return x.replaceAll("\u0000", "") as T;
   } else {
diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts
index d3d8a4e5..db222814 100644
--- a/apps/api/src/services/queue-service.ts
+++ b/apps/api/src/services/queue-service.ts
@@ -16,20 +16,17 @@ export const loggingQueueName = "{loggingQueue}";
 export function getScrapeQueue() {
   if (!scrapeQueue) {
-    scrapeQueue = new Queue(
-      scrapeQueueName,
-      {
-        connection: redisConnection,
-        defaultJobOptions: {
-          removeOnComplete: {
-            age: 90000, // 25 hours
-          },
-          removeOnFail: {
-            age: 90000, // 25 hours
-          },
+    scrapeQueue = new Queue(scrapeQueueName, {
+      connection: redisConnection,
+      defaultJobOptions: {
+        removeOnComplete: {
+          age: 90000, // 25 hours
         },
-      }
-    );
+        removeOnFail: {
+          age: 90000, // 25 hours
+        },
+      },
+    });
     logger.info("Web scraper queue created");
   }
   return scrapeQueue;
 }
@@ -37,26 +34,22 @@ export function getExtractQueue() {
   if (!extractQueue) {
-    extractQueue = new Queue(
-      extractQueueName,
-      {
-        connection: redisConnection,
-        defaultJobOptions: {
-          removeOnComplete: {
-            age: 90000, // 25 hours
-          },
-          removeOnFail: {
-            age: 90000, // 25 hours
-          },
+    extractQueue = new Queue(extractQueueName, {
+      connection: redisConnection,
+      defaultJobOptions: {
+        removeOnComplete: {
+          age: 90000, // 25 hours
         },
-      }
-    );
+        removeOnFail: {
+          age: 90000, // 25 hours
+        },
+      },
+    });
     logger.info("Extraction queue created");
   }
   return extractQueue;
 }
 
-
 // === REMOVED IN FAVOR OF POLLING -- NOT RELIABLE
 // import { QueueEvents } from 'bullmq';
 // export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection.duplicate() });
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 450393b2..ac1376e8 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -89,13 +89,19 @@ const runningJobs: Set<string> = new Set();
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
     (async () => {
-      const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
+      const originUrl = sc.originUrl
+        ? normalizeUrlOnlyHostname(sc.originUrl)
+        : undefined;
       // Get all visited URLs from Redis
       const visitedUrls = await redisConnection.smembers(
         "crawl:" + job.data.crawl_id + ":visited",
       );
       // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
-      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
+      if (
+        visitedUrls.length > 0 &&
+        job.data.crawlerOptions !== null &&
+        originUrl
+      ) {
         // Fire and forget the upload to Supabase
         try {
           // Standardize URLs to canonical form (https, no www)
@@ -317,7 +323,10 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
   return err;
 };
 
-const processExtractJobInternal = async (token: string, job: Job & { id: string }) => {
+const processExtractJobInternal = async (
+  token: string,
+  job: Job & { id: string },
+) => {
   const logger = _logger.child({
     module: "extract-worker",
     method: "processJobInternal",
@@ -348,23 +357,26 @@ const processExtractJobInternal = async (token: string, job: Job & { id: string
     }
   } catch (error) {
     logger.error(`🚫 Job errored ${job.id} - ${error}`, { error });
-
+
    Sentry.captureException(error, {
      data: {
        job: job.id,
      },
    });
-
+
    // Move job to failed state in Redis
    await job.moveToFailed(error, token, false);

    await updateExtract(job.data.extractId, {
      status: "failed",
-      error: error.error ?? error ?? "Unknown error, please contact help@firecrawl.dev. Extract id: " + job.data.extractId,
+      error:
+        error.error ??
+        error ??
+        "Unknown error, please contact help@firecrawl.dev. Extract id: " +
+          job.data.extractId,
    });
    // throw error;
  } finally {
-
    clearInterval(extendLockInterval);
  }
};
@@ -635,7 +647,9 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
      sc,
      jobs.map((x) => ({ id: x.opts.jobId, url: x.data.url })),
    );
-    const lockedJobs = jobs.filter(x => lockedIds.find(y => y.id === x.opts.jobId));
+    const lockedJobs = jobs.filter((x) =>
+      lockedIds.find((y) => y.id === x.opts.jobId),
+    );
    logger.debug("Adding scrape jobs to Redis...");
    await addCrawlJobs(
      job.data.crawl_id,
@@ -790,7 +804,8 @@ async function processJob(job: Job & { id: string }, token: string) {
        ) {
          const crawler = crawlToCrawler(job.data.crawl_id, sc);
          if (
-            crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null &&
+            crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) ===
+              null &&
            !job.data.isCrawlSourceScrape
          ) {
            throw new Error(
@@ -1073,8 +1088,8 @@ async function processJob(job: Job & { id: string }, token: string) {
  console.log("All workers exited. Waiting for all jobs to finish...");

  while (runningJobs.size > 0) {
-    await new Promise(resolve => setTimeout(resolve, 500));
+    await new Promise((resolve) => setTimeout(resolve, 500));
  }

  process.exit(0);
-})();
\ No newline at end of file
+})();