(feat/change-tracking) Change Tracking Modes (#1445)

* Add git-diff support to change tracking format

Co-Authored-By: Nicolas Camara <nick@sideguide.dev>

* Fix type issues with parse-diff library

Co-Authored-By: Nicolas Camara <nick@sideguide.dev>

* Fix parse-diff type definitions to match actual library structure

Co-Authored-By: Nicolas Camara <nick@sideguide.dev>

* Add structured output/prompt support to change tracking

Co-Authored-By: Nicolas Camara <nick@sideguide.dev>

* (feat/change-tracking) Change Tracking Modes (#1447)

* Refactor change tracking to use modes array instead of separate formats

Co-Authored-By: Nicolas Camara <nick@sideguide.dev>

* Implement schema-based change tracking with old/new value comparison

Co-Authored-By: Nicolas Camara <nick@sideguide.dev>

* Nick:

* Nick: .json

* Update diff.ts

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Nicolas Camara <nick@sideguide.dev>
Co-authored-by: Nicolas <nicolascamara29@gmail.com>

* Update index.ts

* Update types.ts

* Update diff.ts

* Update scrape.ts

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Nicolas Camara <nick@sideguide.dev>
Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
devin-ai-integration[bot] 2025-04-12 16:38:56 -07:00 committed by GitHub
parent f18a6b20ff
commit 138a9757ae
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 426 additions and 4 deletions

View File

@ -90,6 +90,7 @@
"express": "^4.18.2",
"express-rate-limit": "^7.3.1",
"express-ws": "^5.0.2",
"git-diff": "^2.0.6",
"glob": "^10.4.2",
"gpt3-tokenizer": "^1.1.5",
"ioredis": "^5.4.1",
@ -110,6 +111,7 @@
"mongoose": "^8.4.4",
"natural": "^7.0.7",
"ollama-ai-provider": "^1.2.0",
"parse-diff": "^0.11.1",
"pdf-parse": "^1.1.1",
"pos": "^0.4.2",
"posthog-node": "^4.0.1",

View File

@ -125,6 +125,9 @@ importers:
express-ws:
specifier: ^5.0.2
version: 5.0.2(express@4.19.2)
git-diff:
specifier: ^2.0.6
version: 2.0.6
glob:
specifier: ^10.4.2
version: 10.4.2
@ -185,6 +188,9 @@ importers:
ollama-ai-provider:
specifier: ^1.2.0
version: 1.2.0(zod@3.24.2)
parse-diff:
specifier: ^0.11.1
version: 0.11.1
pdf-parse:
specifier: ^1.1.1
version: 1.1.1
@ -2384,6 +2390,10 @@ packages:
resolution: {integrity: sha512-EjePK1srD3P08o2j4f0ExnylqRs5B9tJjcp9t1krH2qRi8CCdsYfwe9JgSLurFBWwq4uOlipzfk5fHNvwFKr8Q==}
engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
diff@3.5.0:
resolution: {integrity: sha512-A46qtFgd+g7pDZinpnwiRJtxbC1hpgf0uzP3iG89scHk0AUC7A1TGxf5OiiOUv/JMZR8GOt8hL900hV0bOy5xA==}
engines: {node: '>=0.3.1'}
diff@4.0.2:
resolution: {integrity: sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==}
engines: {node: '>=0.3.1'}
@ -2762,6 +2772,10 @@ packages:
resolution: {integrity: sha512-BzUrJBS9EcUb4cFol8r4W3v1cPsSyajLSthNkz5BxbpDcHN5tIrM10E2eNvfnvBn3DaT3DUgx0OpsBKkaOpanw==}
engines: {node: '>= 14'}
git-diff@2.0.6:
resolution: {integrity: sha512-/Iu4prUrydE3Pb3lCBMbcSNIf81tgGt0W1ZwknnyF62t3tHmtiJTRj0f+1ZIhp3+Rh0ktz1pJVoa7ZXUCskivA==}
engines: {node: '>= 4.8.0'}
glob-parent@5.1.2:
resolution: {integrity: sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==}
engines: {node: '>= 6'}
@ -2955,6 +2969,10 @@ packages:
ini@1.3.8:
resolution: {integrity: sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==}
interpret@1.4.0:
resolution: {integrity: sha512-agE4QfB2Lkp9uICn7BAqoscw4SZP9kTE2hxiFI3jBPmXJfdqiahTbUuKGsMoN2GtqL9AxhYioAcVvgsb1HvRbA==}
engines: {node: '>= 0.10'}
ioredis@5.4.1:
resolution: {integrity: sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA==}
engines: {node: '>=12.22.0'}
@ -3721,6 +3739,9 @@ packages:
resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==}
engines: {node: '>=6'}
parse-diff@0.11.1:
resolution: {integrity: sha512-Oq4j8LAOPOcssanQkIjxosjATBIEJhCxMCxPhMu+Ci4wdNmAEdx0O+a7gzbR2PyKXgKPvRLIN5g224+dJAsKHA==}
parse-json@5.2.0:
resolution: {integrity: sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==}
engines: {node: '>=8'}
@ -3982,6 +4003,10 @@ packages:
resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==}
engines: {node: '>=8.10.0'}
rechoir@0.6.2:
resolution: {integrity: sha512-HFM8rkZ+i3zrV+4LQjwQ0W+ez98pApMGM3HUrN04j3CqzPOzl9nmP15Y8YXNm8QHGv/eacOVEjqhmWpkRV0NAw==}
engines: {node: '>= 0.10'}
redis-errors@1.2.0:
resolution: {integrity: sha512-1qny3OExCf0UvUV/5wpYKf2YwPcOqXzkwKKSmKHiE6ZMQs5heeE/c8eXK+PNllPvmjgAbfnsbpkGZWy8cBpn9w==}
engines: {node: '>=4'}
@ -4148,6 +4173,15 @@ packages:
resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==}
engines: {node: '>=8'}
shelljs.exec@1.1.8:
resolution: {integrity: sha512-vFILCw+lzUtiwBAHV8/Ex8JsFjelFMdhONIsgKNLgTzeRckp2AOYRQtHJE/9LhNvdMmE27AGtzWx0+DHpwIwSw==}
engines: {node: '>= 4.0.0'}
shelljs@0.8.5:
resolution: {integrity: sha512-TiwcRcrkhHvbrZbnRcFYMLl30Dfov3HKqzp5tO5b4pt6G/SezKcYhmDg15zXVBswHmctSAQKznqNW2LO5tTDow==}
engines: {node: '>=4'}
hasBin: true
shimmer@1.2.1:
resolution: {integrity: sha512-sQTKC1Re/rM6XyFM6fIAGHRPVGvyXfgzIDvzoq608vM+jeyVD0Tu1E6Np0Kc2zAIFWIj963V2800iF/9LPieQw==}
@ -7603,6 +7637,8 @@ snapshots:
diff-sequences@29.6.3: {}
diff@3.5.0: {}
diff@4.0.2: {}
dingbat-to-unicode@1.0.1: {}
@ -8026,6 +8062,14 @@ snapshots:
transitivePeerDependencies:
- supports-color
git-diff@2.0.6:
dependencies:
chalk: 2.4.2
diff: 3.5.0
loglevel: 1.9.1
shelljs: 0.8.5
shelljs.exec: 1.1.8
glob-parent@5.1.2:
dependencies:
is-glob: 4.0.3
@ -8271,6 +8315,8 @@ snapshots:
ini@1.3.8: {}
interpret@1.4.0: {}
ioredis@5.4.1:
dependencies:
'@ioredis/commands': 1.2.0
@ -9249,6 +9295,8 @@ snapshots:
dependencies:
callsites: 3.1.0
parse-diff@0.11.1: {}
parse-json@5.2.0:
dependencies:
'@babel/code-frame': 7.24.7
@ -9546,6 +9594,10 @@ snapshots:
dependencies:
picomatch: 2.3.1
rechoir@0.6.2:
dependencies:
resolve: 1.22.8
redis-errors@1.2.0: {}
redis-info@3.1.0:
@ -9718,6 +9770,14 @@ snapshots:
shebang-regex@3.0.0: {}
shelljs.exec@1.1.8: {}
shelljs@0.8.5:
dependencies:
glob: 7.2.3
interpret: 1.4.0
rechoir: 0.6.2
shimmer@1.2.1: {}
side-channel@1.0.6:

View File

@ -95,6 +95,116 @@ describe("Scrape tests", () => {
expect(response.changeTracking).toBeDefined();
expect(response.changeTracking?.previousScrapeAt).not.toBeNull();
}, 30000);
// Requests change tracking in "git-diff" mode and, when the page is reported
// as changed, checks that both the raw diff text and its parsed form are present.
it.concurrent("includes git diff when requested", async () => {
  const { changeTracking } = await scrape({
    url: "https://example.com",
    formats: ["markdown", "changeTracking"],
    changeTrackingOptions: {
      modes: ["git-diff"]
    }
  });

  expect(changeTracking).toBeDefined();
  expect(changeTracking?.previousScrapeAt).not.toBeNull();

  // The diff is only produced for pages whose status is "changed".
  if (changeTracking?.changeStatus !== "changed") {
    return;
  }

  const diff = changeTracking?.diff;
  expect(diff).toBeDefined();
  expect(diff?.text).toBeDefined();
  expect(diff?.json).toBeDefined();
  expect(diff?.json.files).toBeInstanceOf(Array);
}, 30000);
// Requests change tracking in "json" mode driven by a prompt (no schema) and,
// when the page is reported as changed, checks that a structured result exists.
it.concurrent("includes structured output when requested", async () => {
  const { changeTracking } = await scrape({
    url: "https://example.com",
    formats: ["markdown", "changeTracking"],
    changeTrackingOptions: {
      modes: ["json"],
      prompt: "Summarize the changes between the previous and current content",
      systemPrompt: "You are a helpful assistant that summarizes changes between document versions."
    }
  });

  expect(changeTracking).toBeDefined();
  expect(changeTracking?.previousScrapeAt).not.toBeNull();

  // Structured output is only generated for pages whose status is "changed".
  if (changeTracking?.changeStatus !== "changed") {
    return;
  }

  expect(changeTracking?.json).toBeDefined();
}, 30000);
// Requests change tracking in "json" mode with a schema. In that mode the
// server extracts the schema from the previous and the current content and
// reports, per top-level field that differs, a `{ previous, current }` pair.
it.concurrent("supports schema-based extraction for change tracking", async () => {
  const response = await scrape({
    url: "https://example.com",
    formats: ["markdown", "changeTracking"],
    changeTrackingOptions: {
      modes: ["json"],
      schema: {
        type: "object",
        properties: {
          pricing: {
            type: "object",
            properties: {
              amount: { type: "number" },
              currency: { type: "string" }
            }
          },
          features: {
            type: "array",
            items: { type: "string" }
          }
        }
      }
    }
  });

  expect(response.changeTracking).toBeDefined();
  expect(response.changeTracking?.previousScrapeAt).not.toBeNull();

  if (response.changeTracking?.changeStatus === "changed") {
    const json = response.changeTracking?.json;
    expect(json).toBeDefined();

    // Only fields whose extracted value changed are included, so each one is
    // checked conditionally. The entry keys are `previous` / `current`
    // (not `old` / `new`), matching what the comparison step emits.
    for (const field of ["pricing", "features"]) {
      if (json?.[field]) {
        expect(json[field]).toHaveProperty("previous");
        expect(json[field]).toHaveProperty("current");
      }
    }
  }
}, 30000);
// Requests both change-tracking modes at once and, when the page is reported
// as changed, checks that the git diff and the structured comparison are both
// attached to the result.
it.concurrent("supports both git-diff and structured modes together", async () => {
  const response = await scrape({
    url: "https://example.com",
    formats: ["markdown", "changeTracking"],
    changeTrackingOptions: {
      modes: ["git-diff", "json"],
      schema: {
        type: "object",
        properties: {
          summary: { type: "string" },
          changes: { type: "array", items: { type: "string" } }
        }
      }
    }
  });

  expect(response.changeTracking).toBeDefined();
  expect(response.changeTracking?.previousScrapeAt).not.toBeNull();

  if (response.changeTracking?.changeStatus === "changed") {
    expect(response.changeTracking?.diff).toBeDefined();
    expect(response.changeTracking?.diff?.text).toBeDefined();
    expect(response.changeTracking?.diff?.json).toBeDefined();

    const json = response.changeTracking?.json;
    expect(json).toBeDefined();

    // With a schema, the structured result is a per-field comparison that
    // only lists fields whose extracted value differs between the two
    // versions. A schema field may therefore be legitimately absent; when it
    // is present it must carry the `previous` / `current` pair.
    for (const field of ["summary", "changes"]) {
      if (json?.[field] !== undefined) {
        expect(json[field]).toHaveProperty("previous");
        expect(json[field]).toHaveProperty("current");
      }
    }
  }
}, 30000);
});
describe("Location API (f-e dependant)", () => {

View File

@ -104,7 +104,7 @@ export async function scrapeController(
// Don't bill if we're early returning
return;
}
if (req.body.extract && req.body.formats.includes("extract")) {
if ((req.body.extract && req.body.formats?.includes("extract")) || (req.body.formats?.includes("changeTracking") && req.body.changeTrackingOptions?.modes?.includes("json"))) {
creditsToBeBilled = 5;
}

View File

@ -20,6 +20,7 @@ export type Format =
| "screenshot"
| "screenshot@fullPage"
| "extract"
| "json"
| "changeTracking";
export const url = z.preprocess(
@ -195,6 +196,13 @@ const baseScrapeOptions = z
extract: extractOptions.optional(),
// New
jsonOptions: extractOptions.optional(),
changeTrackingOptions: z
.object({
prompt: z.string().optional(),
schema: z.any().optional(),
modes: z.enum(["json", "git-diff"]).array().optional().default([]),
})
.optional(),
mobile: z.boolean().default(false),
parsePDF: z.boolean().default(true),
actions: actionsSchema.optional(),
@ -555,6 +563,27 @@ export type Document = {
previousScrapeAt: string | null;
changeStatus: "new" | "same" | "changed" | "removed";
visibility: "visible" | "hidden";
diff?: {
text: string;
json: {
files: Array<{
from: string | null;
to: string | null;
chunks: Array<{
content: string;
changes: Array<{
type: string;
normal?: boolean;
ln?: number;
ln1?: number;
ln2?: number;
content: string;
}>;
}>;
}>;
};
};
json?: any;
}
metadata: {
title?: string;

View File

@ -2,6 +2,53 @@ import { supabase_service } from "../../../services/supabase";
import { Document } from "../../../controllers/v1/types";
import { Meta } from "../index";
import { getJob } from "../../../controllers/v1/crawl-status";
import gitDiff from 'git-diff';
import parseDiff from 'parse-diff';
import { generateCompletions } from "./llmExtract";
/**
 * Runs a schema-driven LLM extraction over a single piece of content.
 *
 * The schema is read from `meta.options.changeTrackingOptions.schema`.
 *
 * @param content - Text handed to `generateCompletions` as its `markdown` input.
 * @param meta - Scrape context supplying the logger and the scrape options.
 * @returns The `extract` value produced by `generateCompletions`, or `null`
 *          when the call throws (the error is logged, not rethrown).
 */
async function extractDataWithSchema(content: string, meta: Meta): Promise<any> {
  let extracted: any;

  try {
    const completion = await generateCompletions({
      logger: meta.logger.child({
        method: "extractDataWithSchema/generateCompletions",
      }),
      options: {
        mode: "llm",
        schema: meta.options.changeTrackingOptions?.schema,
        systemPrompt: "Extract the requested information from the content based on the provided schema.",
        temperature: 0
      },
      markdown: content
    });
    extracted = completion.extract;
  } catch (error) {
    // Best-effort: the caller treats null as "no extraction available".
    meta.logger.error("Error extracting data with schema", { error });
    return null;
  }

  return extracted;
}
/**
 * Produces a copy of `value` in which every plain object has its keys in
 * sorted order, recursively, so that serialising two structurally equal
 * values yields the same string regardless of key insertion order.
 *
 * Arrays keep their element order (order is significant for arrays).
 * Non-plain objects (anything whose prototype is not `Object.prototype` or
 * `null`) and primitives are returned untouched so `JSON.stringify` handles
 * them exactly as it would have without this step.
 */
function canonicalizeForComparison(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value.map((item) => canonicalizeForComparison(item));
  }

  if (value !== null && typeof value === "object") {
    const proto = Object.getPrototypeOf(value);
    if (proto === Object.prototype || proto === null) {
      const source = value as Record<string, unknown>;
      // Null-prototype target so a literal "__proto__" key is stored as data.
      const sorted: Record<string, unknown> = Object.create(null);
      for (const key of Object.keys(source).sort()) {
        sorted[key] = canonicalizeForComparison(source[key]);
      }
      return sorted;
    }
  }

  return value;
}

/**
 * Compares two extracted data objects field by field (top level only) and
 * reports the fields whose values differ.
 *
 * Values are compared by their JSON serialisation after key-order
 * normalisation, so nested objects that differ only in property order are
 * treated as equal. Array element order still matters.
 *
 * @param previousData - Data extracted from the previous content; `null` or
 *                       `undefined` is treated as an empty object.
 * @param currentData - Data extracted from the current content; `null` or
 *                      `undefined` is treated as an empty object.
 * @returns An object keyed by each differing field, mapping to
 *          `{ previous, current }`. Fields that are equal are omitted; a
 *          field missing on one side has `undefined` for that side.
 */
function compareExtractedData(previousData: any, currentData: any): any {
  const result: Record<string, { previous: any, current: any }> = {};

  const allKeys = new Set([
    ...Object.keys(previousData || {}),
    ...Object.keys(currentData || {})
  ]);

  for (const key of allKeys) {
    const oldValue = previousData?.[key];
    const newValue = currentData?.[key];

    const oldSerialized = JSON.stringify(canonicalizeForComparison(oldValue));
    const newSerialized = JSON.stringify(canonicalizeForComparison(newValue));

    if (oldSerialized !== newSerialized) {
      result[key] = {
        previous: oldValue,
        current: newValue
      };
    }
  }

  return result;
}
export async function deriveDiff(meta: Meta, document: Document): Promise<Document> {
if (meta.options.formats.includes("changeTracking")) {
@ -20,19 +67,106 @@ export async function deriveDiff(meta: Meta, document: Document): Promise<Docume
returnvalue: Document,
} | null = data?.o_job_id ? await getJob(data.o_job_id) : null;
console.log(data, job);
if (data && job && job?.returnvalue) {
const previousMarkdown = job.returnvalue.markdown!;
const currentMarkdown = document.markdown!;
const transformer = (x: string) => [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join("");
const isChanged = transformer(previousMarkdown) !== transformer(currentMarkdown);
const changeStatus = document.metadata.statusCode === 404 ? "removed" : isChanged ? "changed" : "same";
document.changeTracking = {
previousScrapeAt: data.o_date_added,
changeStatus: document.metadata.statusCode === 404 ? "removed" : transformer(previousMarkdown) === transformer(currentMarkdown) ? "same" : "changed",
changeStatus,
visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible",
}
if (meta.options.changeTrackingOptions?.modes?.includes("git-diff") && changeStatus === "changed") {
const diffText = gitDiff(previousMarkdown, currentMarkdown, {
color: false,
wordDiff: false
});
if (diffText) {
const diffStructured = parseDiff(diffText);
document.changeTracking.diff = {
text: diffText,
json: {
files: diffStructured.map(file => ({
from: file.from || null,
to: file.to || null,
chunks: file.chunks.map(chunk => ({
content: chunk.content,
changes: chunk.changes.map(change => {
const baseChange = {
type: change.type,
content: change.content
};
if (change.type === 'normal' && 'ln1' in change && 'ln2' in change) {
return {
...baseChange,
normal: true,
ln1: change.ln1,
ln2: change.ln2
};
} else if (change.type === 'add' && 'ln' in change) {
return {
...baseChange,
add: true,
ln: change.ln
};
} else if (change.type === 'del' && 'ln' in change) {
return {
...baseChange,
del: true,
ln: change.ln
};
}
return baseChange;
})
}))
}))
}
};
}
}
if (meta.options.changeTrackingOptions?.modes?.includes("json") &&
meta.options.changeTrackingOptions && changeStatus === "changed") {
try {
const previousData = meta.options.changeTrackingOptions.schema ?
await extractDataWithSchema(previousMarkdown, meta) : null;
const currentData = meta.options.changeTrackingOptions.schema ?
await extractDataWithSchema(currentMarkdown, meta) : null;
if (previousData && currentData) {
document.changeTracking.json = compareExtractedData(previousData, currentData);
} else {
const { extract } = await generateCompletions({
logger: meta.logger.child({
method: "deriveDiff/generateCompletions",
}),
options: {
mode: "llm",
systemPrompt: "Analyze the differences between the previous and current content and provide a structured summary of the changes.",
schema: meta.options.changeTrackingOptions.schema,
prompt: meta.options.changeTrackingOptions.prompt,
temperature: 0
},
markdown: `Previous Content:\n${previousMarkdown}\n\nCurrent Content:\n${currentMarkdown}`,
previousWarning: document.warning
});
document.changeTracking.json = extract;
}
} catch (error) {
meta.logger.error("Error generating structured diff with LLM", { error });
document.warning = "Structured diff generation failed." + (document.warning ? ` ${document.warning}` : "");
}
}
} else if (!res.error) {
document.changeTracking = {
previousScrapeAt: null,

View File

@ -159,6 +159,24 @@ export function coerceFieldsToFormats(
);
}
if (document.changeTracking &&
(!meta.options.changeTrackingOptions?.modes?.includes("git-diff")) &&
document.changeTracking.diff !== undefined) {
meta.logger.warn(
"Removed diff from changeTracking because git-diff mode wasn't specified in changeTrackingOptions.modes.",
);
delete document.changeTracking.diff;
}
if (document.changeTracking &&
(!meta.options.changeTrackingOptions?.modes?.includes("json")) &&
document.changeTracking.json !== undefined) {
meta.logger.warn(
"Removed structured from changeTracking because structured mode wasn't specified in changeTrackingOptions.modes.",
);
delete document.changeTracking.json;
}
if (meta.options.actions === undefined || meta.options.actions.length === 0) {
delete document.actions;
}

49
apps/api/src/types/parse-diff.d.ts vendored Normal file
View File

@ -0,0 +1,49 @@
/**
 * Ambient type declarations for the `parse-diff` package.
 *
 * The module's export is the `parseDiff` function itself (`export =`), so a
 * default-style import (`import parseDiff from 'parse-diff'`) relies on the
 * compiler's `esModuleInterop` / `allowSyntheticDefaultImports` setting.
 * The interfaces below are local to this declaration and are not exported.
 */
declare module 'parse-diff' {
/**
 * A change entry tagged `type: 'normal'`. It carries two line numbers.
 * NOTE(review): presumably `ln1` is the line in the old file and `ln2` the
 * line in the new file for an unchanged context line — confirm against the
 * parse-diff documentation.
 */
interface NormalChange {
type: 'normal';
normal: true;
ln1: number;
ln2: number;
content: string;
}
/**
 * A change entry tagged `type: 'add'`, with a single line number `ln`.
 * NOTE(review): presumably the line number in the new file — confirm.
 */
interface AddChange {
type: 'add';
add: true;
ln: number;
content: string;
}
/**
 * A change entry tagged `type: 'del'`, with a single line number `ln`.
 * NOTE(review): presumably the line number in the old file — confirm.
 */
interface DeleteChange {
type: 'del';
del: true;
ln: number;
content: string;
}
/** Union of all change entries; discriminate on the `type` field. */
type Change = NormalChange | AddChange | DeleteChange;
/**
 * One chunk of a file's diff: its `content` string, its list of changes, and
 * start/line-count pairs for the old and new sides.
 * NOTE(review): `content` is presumably the hunk header line — confirm.
 */
interface Chunk {
content: string;
changes: Change[];
oldStart: number;
oldLines: number;
newStart: number;
newLines: number;
}
/**
 * One file entry in a parsed diff: its chunks, addition/deletion counts, and
 * the `from` / `to` paths (either may be `null`). The remaining fields are
 * optional metadata.
 */
interface File {
chunks: Chunk[];
deletions: number;
additions: number;
from: string | null;
to: string | null;
index?: string[];
newMode?: string;
oldMode?: string;
binary?: boolean;
}
/**
 * Parses diff text into a list of file entries.
 *
 * @param diff - The diff text to parse.
 * @returns One `File` per file found in the diff.
 */
function parseDiff(diff: string): File[];
export = parseDiff;
}

View File

@ -72,6 +72,26 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
previousScrapeAt: string | null;
changeStatus: "new" | "same" | "changed" | "removed";
visibility: "visible" | "hidden";
diff?: {
text: string;
structured: {
files: Array<{
from: string | null;
to: string | null;
chunks: Array<{
content: string;
changes: Array<{
type: string;
normal?: boolean;
ln?: number;
ln1?: number;
ln2?: number;
content: string;
}>;
}>;
}>;
};
};
};
// v1 search only
title?: string;