diff --git a/apps/api/package.json b/apps/api/package.json index 1dc26a05..eae70214 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -90,6 +90,7 @@ "express": "^4.18.2", "express-rate-limit": "^7.3.1", "express-ws": "^5.0.2", + "git-diff": "^2.0.6", "glob": "^10.4.2", "gpt3-tokenizer": "^1.1.5", "ioredis": "^5.4.1", @@ -110,6 +111,7 @@ "mongoose": "^8.4.4", "natural": "^7.0.7", "ollama-ai-provider": "^1.2.0", + "parse-diff": "^0.11.1", "pdf-parse": "^1.1.1", "pos": "^0.4.2", "posthog-node": "^4.0.1", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 8644e7bd..028423d3 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -125,6 +125,9 @@ importers: express-ws: specifier: ^5.0.2 version: 5.0.2(express@4.19.2) + git-diff: + specifier: ^2.0.6 + version: 2.0.6 glob: specifier: ^10.4.2 version: 10.4.2 @@ -185,6 +188,9 @@ importers: ollama-ai-provider: specifier: ^1.2.0 version: 1.2.0(zod@3.24.2) + parse-diff: + specifier: ^0.11.1 + version: 0.11.1 pdf-parse: specifier: ^1.1.1 version: 1.1.1 @@ -2384,6 +2390,10 @@ packages: resolution: {integrity: sha512-EjePK1srD3P08o2j4f0ExnylqRs5B9tJjcp9t1krH2qRi8CCdsYfwe9JgSLurFBWwq4uOlipzfk5fHNvwFKr8Q==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + diff@3.5.0: + resolution: {integrity: sha512-A46qtFgd+g7pDZinpnwiRJtxbC1hpgf0uzP3iG89scHk0AUC7A1TGxf5OiiOUv/JMZR8GOt8hL900hV0bOy5xA==} + engines: {node: '>=0.3.1'} + diff@4.0.2: resolution: {integrity: sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==} engines: {node: '>=0.3.1'} @@ -2762,6 +2772,10 @@ packages: resolution: {integrity: sha512-BzUrJBS9EcUb4cFol8r4W3v1cPsSyajLSthNkz5BxbpDcHN5tIrM10E2eNvfnvBn3DaT3DUgx0OpsBKkaOpanw==} engines: {node: '>= 14'} + git-diff@2.0.6: + resolution: {integrity: sha512-/Iu4prUrydE3Pb3lCBMbcSNIf81tgGt0W1ZwknnyF62t3tHmtiJTRj0f+1ZIhp3+Rh0ktz1pJVoa7ZXUCskivA==} + engines: {node: '>= 4.8.0'} + glob-parent@5.1.2: resolution: {integrity: sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==} engines: {node: '>= 6'} @@ -2955,6 +2969,10 @@ packages: ini@1.3.8: resolution: {integrity: sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==} + interpret@1.4.0: + resolution: {integrity: sha512-agE4QfB2Lkp9uICn7BAqoscw4SZP9kTE2hxiFI3jBPmXJfdqiahTbUuKGsMoN2GtqL9AxhYioAcVvgsb1HvRbA==} + engines: {node: '>= 0.10'} + ioredis@5.4.1: resolution: {integrity: sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA==} engines: {node: '>=12.22.0'} @@ -3721,6 +3739,9 @@ packages: resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==} engines: {node: '>=6'} + parse-diff@0.11.1: + resolution: {integrity: sha512-Oq4j8LAOPOcssanQkIjxosjATBIEJhCxMCxPhMu+Ci4wdNmAEdx0O+a7gzbR2PyKXgKPvRLIN5g224+dJAsKHA==} + parse-json@5.2.0: resolution: {integrity: sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==} engines: {node: '>=8'} @@ -3982,6 +4003,10 @@ packages: resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==} engines: {node: '>=8.10.0'} + rechoir@0.6.2: + resolution: {integrity: sha512-HFM8rkZ+i3zrV+4LQjwQ0W+ez98pApMGM3HUrN04j3CqzPOzl9nmP15Y8YXNm8QHGv/eacOVEjqhmWpkRV0NAw==} + engines: {node: '>= 0.10'} + redis-errors@1.2.0: resolution: {integrity: sha512-1qny3OExCf0UvUV/5wpYKf2YwPcOqXzkwKKSmKHiE6ZMQs5heeE/c8eXK+PNllPvmjgAbfnsbpkGZWy8cBpn9w==} engines: {node: '>=4'} @@ -4148,6 +4173,15 @@ packages: resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==} engines: {node: '>=8'} + shelljs.exec@1.1.8: + resolution: {integrity: sha512-vFILCw+lzUtiwBAHV8/Ex8JsFjelFMdhONIsgKNLgTzeRckp2AOYRQtHJE/9LhNvdMmE27AGtzWx0+DHpwIwSw==} + engines: {node: '>= 4.0.0'} + + shelljs@0.8.5: + resolution: {integrity: sha512-TiwcRcrkhHvbrZbnRcFYMLl30Dfov3HKqzp5tO5b4pt6G/SezKcYhmDg15zXVBswHmctSAQKznqNW2LO5tTDow==} + engines: {node: '>=4'} + hasBin: true + shimmer@1.2.1: resolution: {integrity: sha512-sQTKC1Re/rM6XyFM6fIAGHRPVGvyXfgzIDvzoq608vM+jeyVD0Tu1E6Np0Kc2zAIFWIj963V2800iF/9LPieQw==} @@ -7603,6 +7637,8 @@ snapshots: diff-sequences@29.6.3: {} + diff@3.5.0: {} + diff@4.0.2: {} dingbat-to-unicode@1.0.1: {} @@ -8026,6 +8062,14 @@ snapshots: transitivePeerDependencies: - supports-color + git-diff@2.0.6: + dependencies: + chalk: 2.4.2 + diff: 3.5.0 + loglevel: 1.9.1 + shelljs: 0.8.5 + shelljs.exec: 1.1.8 + glob-parent@5.1.2: dependencies: is-glob: 4.0.3 @@ -8271,6 +8315,8 @@ snapshots: ini@1.3.8: {} + interpret@1.4.0: {} + ioredis@5.4.1: dependencies: '@ioredis/commands': 1.2.0 @@ -9249,6 +9295,8 @@ snapshots: dependencies: callsites: 3.1.0 + parse-diff@0.11.1: {} + parse-json@5.2.0: dependencies: '@babel/code-frame': 7.24.7 @@ -9546,6 +9594,10 @@ snapshots: dependencies: picomatch: 2.3.1 + rechoir@0.6.2: + dependencies: + resolve: 1.22.8 + redis-errors@1.2.0: {} redis-info@3.1.0: @@ -9718,6 +9770,14 @@ snapshots: shebang-regex@3.0.0: {} + shelljs.exec@1.1.8: {} + + shelljs@0.8.5: + dependencies: + glob: 7.2.3 + interpret: 1.4.0 + rechoir: 0.6.2 + shimmer@1.2.1: {} side-channel@1.0.6: diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index 097f5504..d9d97330 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -95,6 +95,116 @@ describe("Scrape tests", () => { expect(response.changeTracking).toBeDefined(); expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); }, 30000); + + it.concurrent("includes git diff when requested", async () => { + const response = await scrape({ + url: "https://example.com", + formats: ["markdown", "changeTracking"], + changeTrackingOptions: { + modes: ["git-diff"] + } + }); + + expect(response.changeTracking).toBeDefined(); + expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); + + if (response.changeTracking?.changeStatus === "changed") { + expect(response.changeTracking?.diff).toBeDefined(); + expect(response.changeTracking?.diff?.text).toBeDefined(); + expect(response.changeTracking?.diff?.json).toBeDefined(); + expect(response.changeTracking?.diff?.json.files).toBeInstanceOf(Array); + } + }, 30000); + + it.concurrent("includes structured output when requested", async () => { + const response = await scrape({ + url: "https://example.com", + formats: ["markdown", "changeTracking"], + changeTrackingOptions: { + modes: ["json"], + prompt: "Summarize the changes between the previous and current content", + systemPrompt: "You are a helpful assistant that summarizes changes between document versions." + } + }); + + expect(response.changeTracking).toBeDefined(); + expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); + + if (response.changeTracking?.changeStatus === "changed") { + expect(response.changeTracking?.json).toBeDefined(); + } + }, 30000); + + it.concurrent("supports schema-based extraction for change tracking", async () => { + const response = await scrape({ + url: "https://example.com", + formats: ["markdown", "changeTracking"], + changeTrackingOptions: { + modes: ["json"], + schema: { + type: "object", + properties: { + pricing: { + type: "object", + properties: { + amount: { type: "number" }, + currency: { type: "string" } + } + }, + features: { + type: "array", + items: { type: "string" } + } + } + } + } + }); + + expect(response.changeTracking).toBeDefined(); + expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); + + if (response.changeTracking?.changeStatus === "changed") { + expect(response.changeTracking?.json).toBeDefined(); + if (response.changeTracking?.json.pricing) { + expect(response.changeTracking?.json.pricing).toHaveProperty("old"); + expect(response.changeTracking?.json.pricing).toHaveProperty("new"); + } + if (response.changeTracking?.json.features) { + expect(response.changeTracking?.json.features).toHaveProperty("old"); + expect(response.changeTracking?.json.features).toHaveProperty("new"); + } + } + }, 30000); + + it.concurrent("supports both git-diff and structured modes together", async () => { + const response = await scrape({ + url: "https://example.com", + formats: ["markdown", "changeTracking"], + changeTrackingOptions: { + modes: ["git-diff", "json"], + schema: { + type: "object", + properties: { + summary: { type: "string" }, + changes: { type: "array", items: { type: "string" } } + } + } + } + }); + + expect(response.changeTracking).toBeDefined(); + expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); + + if (response.changeTracking?.changeStatus === "changed") { + expect(response.changeTracking?.diff).toBeDefined(); + expect(response.changeTracking?.diff?.text).toBeDefined(); + expect(response.changeTracking?.diff?.json).toBeDefined(); + + expect(response.changeTracking?.json).toBeDefined(); + expect(response.changeTracking?.json).toHaveProperty("summary"); + expect(response.changeTracking?.json).toHaveProperty("changes"); + } + }, 30000); }); describe("Location API (f-e dependant)", () => { diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index ccf81cda..6259981d 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -104,7 +104,7 @@ export async function scrapeController( // Don't bill if we're early returning return; } - if (req.body.extract && req.body.formats.includes("extract")) { + if ((req.body.extract && req.body.formats?.includes("extract")) || (req.body.formats?.includes("changeTracking") && req.body.changeTrackingOptions?.modes?.includes("json"))) { creditsToBeBilled = 5; } diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index fa059e6f..56a93072 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -20,6 +20,7 @@ export type Format = | "screenshot" | "screenshot@fullPage" | "extract" + | "json" | "changeTracking"; export const url = z.preprocess( @@ -195,6 +196,13 @@ const baseScrapeOptions = z extract: extractOptions.optional(), // New jsonOptions: extractOptions.optional(), + changeTrackingOptions: z + .object({ + prompt: z.string().optional(), + schema: z.any().optional(), + modes: z.enum(["json", "git-diff"]).array().optional().default([]), + }) + .optional(), mobile: z.boolean().default(false), parsePDF: z.boolean().default(true), actions: actionsSchema.optional(), @@ -555,6 +563,27 @@ export type Document = { previousScrapeAt: string | null; changeStatus: "new" | "same" | "changed" | "removed"; visibility: "visible" | "hidden"; + diff?: { + text: string; + json: { + files: Array<{ + from: string | null; + to: string | null; + chunks: Array<{ + content: string; + changes: Array<{ + type: string; + normal?: boolean; + ln?: number; + ln1?: number; + ln2?: number; + content: string; + }>; + }>; + }>; + }; + }; + json?: any; } metadata: { title?: string; diff --git a/apps/api/src/scraper/scrapeURL/transformers/diff.ts b/apps/api/src/scraper/scrapeURL/transformers/diff.ts index ea98fa6a..f1c2d1fc 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/diff.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/diff.ts @@ -2,6 +2,53 @@ import { supabase_service } from "../../../services/supabase"; import { Document } from "../../../controllers/v1/types"; import { Meta } from "../index"; import { getJob } from "../../../controllers/v1/crawl-status"; +import gitDiff from 'git-diff'; +import parseDiff from 'parse-diff'; +import { generateCompletions } from "./llmExtract"; + +async function extractDataWithSchema(content: string, meta: Meta): Promise { + try { + const { extract } = await generateCompletions({ + logger: meta.logger.child({ + method: "extractDataWithSchema/generateCompletions", + }), + options: { + mode: "llm", + schema: meta.options.changeTrackingOptions?.schema, + systemPrompt: "Extract the requested information from the content based on the provided schema.", + temperature: 0 + }, + markdown: content + }); + return extract; + } catch (error) { + meta.logger.error("Error extracting data with schema", { error }); + return null; + } +} + +function compareExtractedData(previousData: any, currentData: any): any { + const result: Record = {}; + + const allKeys = new Set([ + ...Object.keys(previousData || {}), + ...Object.keys(currentData || {}) + ]); + + for (const key of allKeys) { + const oldValue = previousData?.[key]; + const newValue = currentData?.[key]; + + if (JSON.stringify(oldValue) !== JSON.stringify(newValue)) { + result[key] = { + previous: oldValue, + current: newValue + }; + } + } + + return result; +} export async function deriveDiff(meta: Meta, document: Document): Promise { if (meta.options.formats.includes("changeTracking")) { @@ -20,19 +67,106 @@ export async function deriveDiff(meta: Meta, document: Document): Promise [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join(""); + const isChanged = transformer(previousMarkdown) !== transformer(currentMarkdown); + const changeStatus = document.metadata.statusCode === 404 ? "removed" : isChanged ? "changed" : "same"; document.changeTracking = { previousScrapeAt: data.o_date_added, - changeStatus: document.metadata.statusCode === 404 ? "removed" : transformer(previousMarkdown) === transformer(currentMarkdown) ? "same" : "changed", + changeStatus, visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible", } + + if (meta.options.changeTrackingOptions?.modes?.includes("git-diff") && changeStatus === "changed") { + const diffText = gitDiff(previousMarkdown, currentMarkdown, { + color: false, + wordDiff: false + }); + + if (diffText) { + const diffStructured = parseDiff(diffText); + document.changeTracking.diff = { + text: diffText, + json: { + files: diffStructured.map(file => ({ + from: file.from || null, + to: file.to || null, + chunks: file.chunks.map(chunk => ({ + content: chunk.content, + changes: chunk.changes.map(change => { + const baseChange = { + type: change.type, + content: change.content + }; + + if (change.type === 'normal' && 'ln1' in change && 'ln2' in change) { + return { + ...baseChange, + normal: true, + ln1: change.ln1, + ln2: change.ln2 + }; + } else if (change.type === 'add' && 'ln' in change) { + return { + ...baseChange, + add: true, + ln: change.ln + }; + } else if (change.type === 'del' && 'ln' in change) { + return { + ...baseChange, + del: true, + ln: change.ln + }; + } + + return baseChange; + }) + })) + })) + } + }; + } + } + + if (meta.options.changeTrackingOptions?.modes?.includes("json") && + meta.options.changeTrackingOptions && changeStatus === "changed") { + try { + const previousData = meta.options.changeTrackingOptions.schema ? + await extractDataWithSchema(previousMarkdown, meta) : null; + + const currentData = meta.options.changeTrackingOptions.schema ? + await extractDataWithSchema(currentMarkdown, meta) : null; + + if (previousData && currentData) { + document.changeTracking.json = compareExtractedData(previousData, currentData); + } else { + const { extract } = await generateCompletions({ + logger: meta.logger.child({ + method: "deriveDiff/generateCompletions", + }), + options: { + mode: "llm", + systemPrompt: "Analyze the differences between the previous and current content and provide a structured summary of the changes.", + schema: meta.options.changeTrackingOptions.schema, + prompt: meta.options.changeTrackingOptions.prompt, + temperature: 0 + }, + markdown: `Previous Content:\n${previousMarkdown}\n\nCurrent Content:\n${currentMarkdown}`, + previousWarning: document.warning + }); + + document.changeTracking.json = extract; + } + } catch (error) { + meta.logger.error("Error generating structured diff with LLM", { error }); + document.warning = "Structured diff generation failed." + (document.warning ? ` ${document.warning}` : ""); + } + } } else if (!res.error) { document.changeTracking = { previousScrapeAt: null, diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts index 5b16ae99..2544995d 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/index.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts @@ -159,6 +159,24 @@ export function coerceFieldsToFormats( ); } + if (document.changeTracking && + (!meta.options.changeTrackingOptions?.modes?.includes("git-diff")) && + document.changeTracking.diff !== undefined) { + meta.logger.warn( + "Removed diff from changeTracking because git-diff mode wasn't specified in changeTrackingOptions.modes.", + ); + delete document.changeTracking.diff; + } + + if (document.changeTracking && + (!meta.options.changeTrackingOptions?.modes?.includes("json")) && + document.changeTracking.json !== undefined) { + meta.logger.warn( + "Removed structured from changeTracking because structured mode wasn't specified in changeTrackingOptions.modes.", + ); + delete document.changeTracking.json; + } + if (meta.options.actions === undefined || meta.options.actions.length === 0) { delete document.actions; } diff --git a/apps/api/src/types/parse-diff.d.ts b/apps/api/src/types/parse-diff.d.ts new file mode 100644 index 00000000..8eca3b2b --- /dev/null +++ b/apps/api/src/types/parse-diff.d.ts @@ -0,0 +1,49 @@ +declare module 'parse-diff' { + interface NormalChange { + type: 'normal'; + normal: true; + ln1: number; + ln2: number; + content: string; + } + + interface AddChange { + type: 'add'; + add: true; + ln: number; + content: string; + } + + interface DeleteChange { + type: 'del'; + del: true; + ln: number; + content: string; + } + + type Change = NormalChange | AddChange | DeleteChange; + + interface Chunk { + content: string; + changes: Change[]; + oldStart: number; + oldLines: number; + newStart: number; + newLines: number; + } + + interface File { + chunks: Chunk[]; + deletions: number; + additions: number; + from: string | null; + to: string | null; + index?: string[]; + newMode?: string; + oldMode?: string; + binary?: boolean; + } + + function parseDiff(diff: string): File[]; + export = parseDiff; +} diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 6f76220f..7ba44b70 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -72,6 +72,26 @@ export interface FirecrawlDocument; + }>; + }>; + }; + }; }; // v1 search only title?: string;