mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 06:55:57 +08:00
(feat/change-tracking) Change Tracking Modes (#1445)
* Add git-diff support to change tracking format Co-Authored-By: Nicolas Camara <nick@sideguide.dev> * Fix type issues with parse-diff library Co-Authored-By: Nicolas Camara <nick@sideguide.dev> * Fix parse-diff type definitions to match actual library structure Co-Authored-By: Nicolas Camara <nick@sideguide.dev> * Add structured output/prompt support to change tracking Co-Authored-By: Nicolas Camara <nick@sideguide.dev> * (feat/change-tracking) Change Tracking Modes (#1447) * Refactor change tracking to use modes array instead of separate formats Co-Authored-By: Nicolas Camara <nick@sideguide.dev> * Implement schema-based change tracking with old/new value comparison Co-Authored-By: Nicolas Camara <nick@sideguide.dev> * Nick: * Nick: .json * Update diff.ts --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Nicolas Camara <nick@sideguide.dev> Co-authored-by: Nicolas <nicolascamara29@gmail.com> * Update index.ts * Update types.ts * Update diff.ts * Update scrape.ts --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Nicolas Camara <nick@sideguide.dev> Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
parent
f18a6b20ff
commit
138a9757ae
@ -90,6 +90,7 @@
|
||||
"express": "^4.18.2",
|
||||
"express-rate-limit": "^7.3.1",
|
||||
"express-ws": "^5.0.2",
|
||||
"git-diff": "^2.0.6",
|
||||
"glob": "^10.4.2",
|
||||
"gpt3-tokenizer": "^1.1.5",
|
||||
"ioredis": "^5.4.1",
|
||||
@ -110,6 +111,7 @@
|
||||
"mongoose": "^8.4.4",
|
||||
"natural": "^7.0.7",
|
||||
"ollama-ai-provider": "^1.2.0",
|
||||
"parse-diff": "^0.11.1",
|
||||
"pdf-parse": "^1.1.1",
|
||||
"pos": "^0.4.2",
|
||||
"posthog-node": "^4.0.1",
|
||||
|
60
apps/api/pnpm-lock.yaml
generated
60
apps/api/pnpm-lock.yaml
generated
@ -125,6 +125,9 @@ importers:
|
||||
express-ws:
|
||||
specifier: ^5.0.2
|
||||
version: 5.0.2(express@4.19.2)
|
||||
git-diff:
|
||||
specifier: ^2.0.6
|
||||
version: 2.0.6
|
||||
glob:
|
||||
specifier: ^10.4.2
|
||||
version: 10.4.2
|
||||
@ -185,6 +188,9 @@ importers:
|
||||
ollama-ai-provider:
|
||||
specifier: ^1.2.0
|
||||
version: 1.2.0(zod@3.24.2)
|
||||
parse-diff:
|
||||
specifier: ^0.11.1
|
||||
version: 0.11.1
|
||||
pdf-parse:
|
||||
specifier: ^1.1.1
|
||||
version: 1.1.1
|
||||
@ -2384,6 +2390,10 @@ packages:
|
||||
resolution: {integrity: sha512-EjePK1srD3P08o2j4f0ExnylqRs5B9tJjcp9t1krH2qRi8CCdsYfwe9JgSLurFBWwq4uOlipzfk5fHNvwFKr8Q==}
|
||||
engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
|
||||
|
||||
diff@3.5.0:
|
||||
resolution: {integrity: sha512-A46qtFgd+g7pDZinpnwiRJtxbC1hpgf0uzP3iG89scHk0AUC7A1TGxf5OiiOUv/JMZR8GOt8hL900hV0bOy5xA==}
|
||||
engines: {node: '>=0.3.1'}
|
||||
|
||||
diff@4.0.2:
|
||||
resolution: {integrity: sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==}
|
||||
engines: {node: '>=0.3.1'}
|
||||
@ -2762,6 +2772,10 @@ packages:
|
||||
resolution: {integrity: sha512-BzUrJBS9EcUb4cFol8r4W3v1cPsSyajLSthNkz5BxbpDcHN5tIrM10E2eNvfnvBn3DaT3DUgx0OpsBKkaOpanw==}
|
||||
engines: {node: '>= 14'}
|
||||
|
||||
git-diff@2.0.6:
|
||||
resolution: {integrity: sha512-/Iu4prUrydE3Pb3lCBMbcSNIf81tgGt0W1ZwknnyF62t3tHmtiJTRj0f+1ZIhp3+Rh0ktz1pJVoa7ZXUCskivA==}
|
||||
engines: {node: '>= 4.8.0'}
|
||||
|
||||
glob-parent@5.1.2:
|
||||
resolution: {integrity: sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==}
|
||||
engines: {node: '>= 6'}
|
||||
@ -2955,6 +2969,10 @@ packages:
|
||||
ini@1.3.8:
|
||||
resolution: {integrity: sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==}
|
||||
|
||||
interpret@1.4.0:
|
||||
resolution: {integrity: sha512-agE4QfB2Lkp9uICn7BAqoscw4SZP9kTE2hxiFI3jBPmXJfdqiahTbUuKGsMoN2GtqL9AxhYioAcVvgsb1HvRbA==}
|
||||
engines: {node: '>= 0.10'}
|
||||
|
||||
ioredis@5.4.1:
|
||||
resolution: {integrity: sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA==}
|
||||
engines: {node: '>=12.22.0'}
|
||||
@ -3721,6 +3739,9 @@ packages:
|
||||
resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==}
|
||||
engines: {node: '>=6'}
|
||||
|
||||
parse-diff@0.11.1:
|
||||
resolution: {integrity: sha512-Oq4j8LAOPOcssanQkIjxosjATBIEJhCxMCxPhMu+Ci4wdNmAEdx0O+a7gzbR2PyKXgKPvRLIN5g224+dJAsKHA==}
|
||||
|
||||
parse-json@5.2.0:
|
||||
resolution: {integrity: sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==}
|
||||
engines: {node: '>=8'}
|
||||
@ -3982,6 +4003,10 @@ packages:
|
||||
resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==}
|
||||
engines: {node: '>=8.10.0'}
|
||||
|
||||
rechoir@0.6.2:
|
||||
resolution: {integrity: sha512-HFM8rkZ+i3zrV+4LQjwQ0W+ez98pApMGM3HUrN04j3CqzPOzl9nmP15Y8YXNm8QHGv/eacOVEjqhmWpkRV0NAw==}
|
||||
engines: {node: '>= 0.10'}
|
||||
|
||||
redis-errors@1.2.0:
|
||||
resolution: {integrity: sha512-1qny3OExCf0UvUV/5wpYKf2YwPcOqXzkwKKSmKHiE6ZMQs5heeE/c8eXK+PNllPvmjgAbfnsbpkGZWy8cBpn9w==}
|
||||
engines: {node: '>=4'}
|
||||
@ -4148,6 +4173,15 @@ packages:
|
||||
resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==}
|
||||
engines: {node: '>=8'}
|
||||
|
||||
shelljs.exec@1.1.8:
|
||||
resolution: {integrity: sha512-vFILCw+lzUtiwBAHV8/Ex8JsFjelFMdhONIsgKNLgTzeRckp2AOYRQtHJE/9LhNvdMmE27AGtzWx0+DHpwIwSw==}
|
||||
engines: {node: '>= 4.0.0'}
|
||||
|
||||
shelljs@0.8.5:
|
||||
resolution: {integrity: sha512-TiwcRcrkhHvbrZbnRcFYMLl30Dfov3HKqzp5tO5b4pt6G/SezKcYhmDg15zXVBswHmctSAQKznqNW2LO5tTDow==}
|
||||
engines: {node: '>=4'}
|
||||
hasBin: true
|
||||
|
||||
shimmer@1.2.1:
|
||||
resolution: {integrity: sha512-sQTKC1Re/rM6XyFM6fIAGHRPVGvyXfgzIDvzoq608vM+jeyVD0Tu1E6Np0Kc2zAIFWIj963V2800iF/9LPieQw==}
|
||||
|
||||
@ -7603,6 +7637,8 @@ snapshots:
|
||||
|
||||
diff-sequences@29.6.3: {}
|
||||
|
||||
diff@3.5.0: {}
|
||||
|
||||
diff@4.0.2: {}
|
||||
|
||||
dingbat-to-unicode@1.0.1: {}
|
||||
@ -8026,6 +8062,14 @@ snapshots:
|
||||
transitivePeerDependencies:
|
||||
- supports-color
|
||||
|
||||
git-diff@2.0.6:
|
||||
dependencies:
|
||||
chalk: 2.4.2
|
||||
diff: 3.5.0
|
||||
loglevel: 1.9.1
|
||||
shelljs: 0.8.5
|
||||
shelljs.exec: 1.1.8
|
||||
|
||||
glob-parent@5.1.2:
|
||||
dependencies:
|
||||
is-glob: 4.0.3
|
||||
@ -8271,6 +8315,8 @@ snapshots:
|
||||
|
||||
ini@1.3.8: {}
|
||||
|
||||
interpret@1.4.0: {}
|
||||
|
||||
ioredis@5.4.1:
|
||||
dependencies:
|
||||
'@ioredis/commands': 1.2.0
|
||||
@ -9249,6 +9295,8 @@ snapshots:
|
||||
dependencies:
|
||||
callsites: 3.1.0
|
||||
|
||||
parse-diff@0.11.1: {}
|
||||
|
||||
parse-json@5.2.0:
|
||||
dependencies:
|
||||
'@babel/code-frame': 7.24.7
|
||||
@ -9546,6 +9594,10 @@ snapshots:
|
||||
dependencies:
|
||||
picomatch: 2.3.1
|
||||
|
||||
rechoir@0.6.2:
|
||||
dependencies:
|
||||
resolve: 1.22.8
|
||||
|
||||
redis-errors@1.2.0: {}
|
||||
|
||||
redis-info@3.1.0:
|
||||
@ -9718,6 +9770,14 @@ snapshots:
|
||||
|
||||
shebang-regex@3.0.0: {}
|
||||
|
||||
shelljs.exec@1.1.8: {}
|
||||
|
||||
shelljs@0.8.5:
|
||||
dependencies:
|
||||
glob: 7.2.3
|
||||
interpret: 1.4.0
|
||||
rechoir: 0.6.2
|
||||
|
||||
shimmer@1.2.1: {}
|
||||
|
||||
side-channel@1.0.6:
|
||||
|
@ -95,6 +95,116 @@ describe("Scrape tests", () => {
|
||||
expect(response.changeTracking).toBeDefined();
|
||||
expect(response.changeTracking?.previousScrapeAt).not.toBeNull();
|
||||
}, 30000);
|
||||
|
||||
// Requests change tracking in "git-diff" mode. The diff payload is only
// asserted when the API reports the page as "changed"; for any other status
// the test stops after the basic changeTracking checks.
it.concurrent("includes git diff when requested", async () => {
  const response = await scrape({
    url: "https://example.com",
    formats: ["markdown", "changeTracking"],
    changeTrackingOptions: { modes: ["git-diff"] },
  });

  const tracking = response.changeTracking;
  expect(tracking).toBeDefined();
  expect(tracking?.previousScrapeAt).not.toBeNull();

  if (tracking?.changeStatus !== "changed") {
    return;
  }

  // Both the raw diff text and its parsed form must be present.
  expect(tracking?.diff).toBeDefined();
  expect(tracking?.diff?.text).toBeDefined();
  expect(tracking?.diff?.json).toBeDefined();
  expect(tracking?.diff?.json.files).toBeInstanceOf(Array);
}, 30000);
|
||||
|
||||
// Requests change tracking in "json" mode with a free-form prompt (no schema).
// The structured payload is only asserted when the status is "changed".
it.concurrent("includes structured output when requested", async () => {
  const trackingOptions = {
    modes: ["json"],
    prompt: "Summarize the changes between the previous and current content",
    systemPrompt: "You are a helpful assistant that summarizes changes between document versions.",
  };

  const response = await scrape({
    url: "https://example.com",
    formats: ["markdown", "changeTracking"],
    changeTrackingOptions: trackingOptions,
  });

  const tracking = response.changeTracking;
  expect(tracking).toBeDefined();
  expect(tracking?.previousScrapeAt).not.toBeNull();

  if (tracking?.changeStatus !== "changed") {
    return;
  }

  expect(tracking?.json).toBeDefined();
}, 30000);
|
||||
|
||||
// Requests change tracking in "json" mode with an explicit schema. With a
// schema, the per-field result is built by compareExtractedData, which reports
// each differing top-level field as { previous, current }. The assertions
// below therefore check those two keys (the earlier "old"/"new" names never
// matched what the implementation emits).
it.concurrent("supports schema-based extraction for change tracking", async () => {
  const response = await scrape({
    url: "https://example.com",
    formats: ["markdown", "changeTracking"],
    changeTrackingOptions: {
      modes: ["json"],
      schema: {
        type: "object",
        properties: {
          pricing: {
            type: "object",
            properties: {
              amount: { type: "number" },
              currency: { type: "string" }
            }
          },
          features: {
            type: "array",
            items: { type: "string" }
          }
        }
      }
    }
  });

  expect(response.changeTracking).toBeDefined();
  expect(response.changeTracking?.previousScrapeAt).not.toBeNull();

  // Structured output is only generated when the content actually changed.
  if (response.changeTracking?.changeStatus === "changed") {
    expect(response.changeTracking?.json).toBeDefined();

    // A field appears in the result only if its extracted value differs
    // between the two scrapes, so each one is checked conditionally.
    if (response.changeTracking?.json.pricing) {
      expect(response.changeTracking?.json.pricing).toHaveProperty("previous");
      expect(response.changeTracking?.json.pricing).toHaveProperty("current");
    }
    if (response.changeTracking?.json.features) {
      expect(response.changeTracking?.json.features).toHaveProperty("previous");
      expect(response.changeTracking?.json.features).toHaveProperty("current");
    }
  }
}, 30000);
|
||||
|
||||
// Requests the "git-diff" and "json" modes in a single scrape. When the status
// is "changed", both the diff payload and the structured payload are asserted.
it.concurrent("supports both git-diff and structured modes together", async () => {
  const changeSchema = {
    type: "object",
    properties: {
      summary: { type: "string" },
      changes: { type: "array", items: { type: "string" } },
    },
  };

  const response = await scrape({
    url: "https://example.com",
    formats: ["markdown", "changeTracking"],
    changeTrackingOptions: {
      modes: ["git-diff", "json"],
      schema: changeSchema,
    },
  });

  const tracking = response.changeTracking;
  expect(tracking).toBeDefined();
  expect(tracking?.previousScrapeAt).not.toBeNull();

  if (tracking?.changeStatus !== "changed") {
    return;
  }

  // git-diff mode output
  expect(tracking?.diff).toBeDefined();
  expect(tracking?.diff?.text).toBeDefined();
  expect(tracking?.diff?.json).toBeDefined();

  // json mode output
  expect(tracking?.json).toBeDefined();
  expect(tracking?.json).toHaveProperty("summary");
  expect(tracking?.json).toHaveProperty("changes");
}, 30000);
|
||||
});
|
||||
|
||||
describe("Location API (f-e dependant)", () => {
|
||||
|
@ -104,7 +104,7 @@ export async function scrapeController(
|
||||
// Don't bill if we're early returning
|
||||
return;
|
||||
}
|
||||
if (req.body.extract && req.body.formats.includes("extract")) {
|
||||
if ((req.body.extract && req.body.formats?.includes("extract")) || (req.body.formats?.includes("changeTracking") && req.body.changeTrackingOptions?.modes?.includes("json"))) {
|
||||
creditsToBeBilled = 5;
|
||||
}
|
||||
|
||||
|
@ -20,6 +20,7 @@ export type Format =
|
||||
| "screenshot"
|
||||
| "screenshot@fullPage"
|
||||
| "extract"
|
||||
| "json"
|
||||
| "changeTracking";
|
||||
|
||||
export const url = z.preprocess(
|
||||
@ -195,6 +196,13 @@ const baseScrapeOptions = z
|
||||
extract: extractOptions.optional(),
|
||||
// New
|
||||
jsonOptions: extractOptions.optional(),
|
||||
changeTrackingOptions: z
|
||||
.object({
|
||||
prompt: z.string().optional(),
|
||||
schema: z.any().optional(),
|
||||
modes: z.enum(["json", "git-diff"]).array().optional().default([]),
|
||||
})
|
||||
.optional(),
|
||||
mobile: z.boolean().default(false),
|
||||
parsePDF: z.boolean().default(true),
|
||||
actions: actionsSchema.optional(),
|
||||
@ -555,6 +563,27 @@ export type Document = {
|
||||
previousScrapeAt: string | null;
|
||||
changeStatus: "new" | "same" | "changed" | "removed";
|
||||
visibility: "visible" | "hidden";
|
||||
diff?: {
|
||||
text: string;
|
||||
json: {
|
||||
files: Array<{
|
||||
from: string | null;
|
||||
to: string | null;
|
||||
chunks: Array<{
|
||||
content: string;
|
||||
changes: Array<{
|
||||
type: string;
|
||||
normal?: boolean;
|
||||
ln?: number;
|
||||
ln1?: number;
|
||||
ln2?: number;
|
||||
content: string;
|
||||
}>;
|
||||
}>;
|
||||
}>;
|
||||
};
|
||||
};
|
||||
json?: any;
|
||||
}
|
||||
metadata: {
|
||||
title?: string;
|
||||
|
@ -2,6 +2,53 @@ import { supabase_service } from "../../../services/supabase";
|
||||
import { Document } from "../../../controllers/v1/types";
|
||||
import { Meta } from "../index";
|
||||
import { getJob } from "../../../controllers/v1/crawl-status";
|
||||
import gitDiff from 'git-diff';
|
||||
import parseDiff from 'parse-diff';
|
||||
import { generateCompletions } from "./llmExtract";
|
||||
|
||||
/**
 * Runs an LLM extraction over `content` using the schema configured in
 * `meta.options.changeTrackingOptions`.
 *
 * @param content - Markdown text passed to `generateCompletions`.
 * @param meta - Scrape metadata; supplies the logger and the schema.
 * @returns The `extract` value returned by `generateCompletions`, or `null`
 *          when anything in the call throws (the error is logged, not rethrown).
 */
async function extractDataWithSchema(content: string, meta: Meta): Promise<any> {
  try {
    const childLogger = meta.logger.child({
      method: "extractDataWithSchema/generateCompletions",
    });

    const completion = await generateCompletions({
      logger: childLogger,
      options: {
        mode: "llm",
        schema: meta.options.changeTrackingOptions?.schema,
        systemPrompt: "Extract the requested information from the content based on the provided schema.",
        temperature: 0,
      },
      markdown: content,
    });

    return completion.extract;
  } catch (error) {
    // Best-effort: callers treat a null result as "no extraction available".
    meta.logger.error("Error extracting data with schema", { error });
    return null;
  }
}
|
||||
|
||||
/**
 * Serializes a value to JSON with object keys emitted in sorted order at every
 * nesting level, so two structurally equal values always produce the same
 * string regardless of property insertion order. Array order is preserved.
 *
 * Returns `undefined` for inputs JSON cannot represent at the top level
 * (e.g. `undefined`), mirroring plain `JSON.stringify`.
 */
function stableStringify(value: unknown): string | undefined {
  return JSON.stringify(value, (_key, nested) => {
    if (nested === null || typeof nested !== "object" || Array.isArray(nested)) {
      return nested;
    }
    // Null-prototype target so an own "__proto__" key is copied as data.
    const sorted: Record<string, unknown> = Object.create(null);
    for (const k of Object.keys(nested).sort()) {
      sorted[k] = nested[k];
    }
    return sorted;
  });
}

/**
 * Compares two extracted data objects field by field (top-level keys only).
 *
 * @param previousData - Data extracted from the previous scrape; `null`/`undefined` is treated as empty.
 * @param currentData - Data extracted from the current scrape; `null`/`undefined` is treated as empty.
 * @returns An object containing one entry per top-level key whose value
 *          differs, shaped as `{ previous, current }`. Keys with equal values
 *          are omitted, so an empty object means "no differences".
 *
 * Values are compared by their JSON form with keys sorted, so nested objects
 * that differ only in property order are NOT reported as changed.
 */
function compareExtractedData(previousData: any, currentData: any): any {
  const result: Record<string, { previous: any, current: any }> = {};

  const allKeys = new Set([
    ...Object.keys(previousData || {}),
    ...Object.keys(currentData || {})
  ]);

  for (const key of allKeys) {
    const oldValue = previousData?.[key];
    const newValue = currentData?.[key];

    if (stableStringify(oldValue) !== stableStringify(newValue)) {
      result[key] = {
        previous: oldValue,
        current: newValue
      };
    }
  }

  return result;
}
|
||||
|
||||
export async function deriveDiff(meta: Meta, document: Document): Promise<Document> {
|
||||
if (meta.options.formats.includes("changeTracking")) {
|
||||
@ -20,19 +67,106 @@ export async function deriveDiff(meta: Meta, document: Document): Promise<Docume
|
||||
returnvalue: Document,
|
||||
} | null = data?.o_job_id ? await getJob(data.o_job_id) : null;
|
||||
|
||||
console.log(data, job);
|
||||
|
||||
if (data && job && job?.returnvalue) {
|
||||
const previousMarkdown = job.returnvalue.markdown!;
|
||||
const currentMarkdown = document.markdown!;
|
||||
|
||||
const transformer = (x: string) => [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join("");
|
||||
const isChanged = transformer(previousMarkdown) !== transformer(currentMarkdown);
|
||||
const changeStatus = document.metadata.statusCode === 404 ? "removed" : isChanged ? "changed" : "same";
|
||||
|
||||
document.changeTracking = {
|
||||
previousScrapeAt: data.o_date_added,
|
||||
changeStatus: document.metadata.statusCode === 404 ? "removed" : transformer(previousMarkdown) === transformer(currentMarkdown) ? "same" : "changed",
|
||||
changeStatus,
|
||||
visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible",
|
||||
}
|
||||
|
||||
if (meta.options.changeTrackingOptions?.modes?.includes("git-diff") && changeStatus === "changed") {
|
||||
const diffText = gitDiff(previousMarkdown, currentMarkdown, {
|
||||
color: false,
|
||||
wordDiff: false
|
||||
});
|
||||
|
||||
if (diffText) {
|
||||
const diffStructured = parseDiff(diffText);
|
||||
document.changeTracking.diff = {
|
||||
text: diffText,
|
||||
json: {
|
||||
files: diffStructured.map(file => ({
|
||||
from: file.from || null,
|
||||
to: file.to || null,
|
||||
chunks: file.chunks.map(chunk => ({
|
||||
content: chunk.content,
|
||||
changes: chunk.changes.map(change => {
|
||||
const baseChange = {
|
||||
type: change.type,
|
||||
content: change.content
|
||||
};
|
||||
|
||||
if (change.type === 'normal' && 'ln1' in change && 'ln2' in change) {
|
||||
return {
|
||||
...baseChange,
|
||||
normal: true,
|
||||
ln1: change.ln1,
|
||||
ln2: change.ln2
|
||||
};
|
||||
} else if (change.type === 'add' && 'ln' in change) {
|
||||
return {
|
||||
...baseChange,
|
||||
add: true,
|
||||
ln: change.ln
|
||||
};
|
||||
} else if (change.type === 'del' && 'ln' in change) {
|
||||
return {
|
||||
...baseChange,
|
||||
del: true,
|
||||
ln: change.ln
|
||||
};
|
||||
}
|
||||
|
||||
return baseChange;
|
||||
})
|
||||
}))
|
||||
}))
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
if (meta.options.changeTrackingOptions?.modes?.includes("json") &&
|
||||
meta.options.changeTrackingOptions && changeStatus === "changed") {
|
||||
try {
|
||||
const previousData = meta.options.changeTrackingOptions.schema ?
|
||||
await extractDataWithSchema(previousMarkdown, meta) : null;
|
||||
|
||||
const currentData = meta.options.changeTrackingOptions.schema ?
|
||||
await extractDataWithSchema(currentMarkdown, meta) : null;
|
||||
|
||||
if (previousData && currentData) {
|
||||
document.changeTracking.json = compareExtractedData(previousData, currentData);
|
||||
} else {
|
||||
const { extract } = await generateCompletions({
|
||||
logger: meta.logger.child({
|
||||
method: "deriveDiff/generateCompletions",
|
||||
}),
|
||||
options: {
|
||||
mode: "llm",
|
||||
systemPrompt: "Analyze the differences between the previous and current content and provide a structured summary of the changes.",
|
||||
schema: meta.options.changeTrackingOptions.schema,
|
||||
prompt: meta.options.changeTrackingOptions.prompt,
|
||||
temperature: 0
|
||||
},
|
||||
markdown: `Previous Content:\n${previousMarkdown}\n\nCurrent Content:\n${currentMarkdown}`,
|
||||
previousWarning: document.warning
|
||||
});
|
||||
|
||||
document.changeTracking.json = extract;
|
||||
}
|
||||
} catch (error) {
|
||||
meta.logger.error("Error generating structured diff with LLM", { error });
|
||||
document.warning = "Structured diff generation failed." + (document.warning ? ` ${document.warning}` : "");
|
||||
}
|
||||
}
|
||||
} else if (!res.error) {
|
||||
document.changeTracking = {
|
||||
previousScrapeAt: null,
|
||||
|
@ -159,6 +159,24 @@ export function coerceFieldsToFormats(
|
||||
);
|
||||
}
|
||||
|
||||
if (document.changeTracking &&
|
||||
(!meta.options.changeTrackingOptions?.modes?.includes("git-diff")) &&
|
||||
document.changeTracking.diff !== undefined) {
|
||||
meta.logger.warn(
|
||||
"Removed diff from changeTracking because git-diff mode wasn't specified in changeTrackingOptions.modes.",
|
||||
);
|
||||
delete document.changeTracking.diff;
|
||||
}
|
||||
|
||||
if (document.changeTracking &&
|
||||
(!meta.options.changeTrackingOptions?.modes?.includes("json")) &&
|
||||
document.changeTracking.json !== undefined) {
|
||||
meta.logger.warn(
|
||||
"Removed structured from changeTracking because structured mode wasn't specified in changeTrackingOptions.modes.",
|
||||
);
|
||||
delete document.changeTracking.json;
|
||||
}
|
||||
|
||||
if (meta.options.actions === undefined || meta.options.actions.length === 0) {
|
||||
delete document.actions;
|
||||
}
|
||||
|
49
apps/api/src/types/parse-diff.d.ts
vendored
Normal file
49
apps/api/src/types/parse-diff.d.ts
vendored
Normal file
@ -0,0 +1,49 @@
|
||||
/**
 * Ambient type declarations for the `parse-diff` package: a single exported
 * function that turns unified-diff text into a list of per-file records.
 */
declare module "parse-diff" {
  /** A context line. NOTE(review): ln1/ln2 presumably are the old/new line numbers; confirm against the library docs. */
  interface NormalChange {
    content: string;
    type: "normal";
    normal: true;
    ln1: number;
    ln2: number;
  }

  /** An added line. */
  interface AddChange {
    content: string;
    type: "add";
    add: true;
    ln: number;
  }

  /** A deleted line. */
  interface DeleteChange {
    content: string;
    type: "del";
    del: true;
    ln: number;
  }

  /** Discriminated on `type`. */
  type Change = NormalChange | AddChange | DeleteChange;

  /** One hunk of a file diff together with its line-level changes. */
  interface Chunk {
    content: string;
    oldStart: number;
    oldLines: number;
    newStart: number;
    newLines: number;
    changes: Change[];
  }

  /** One file entry of a parsed diff. */
  interface File {
    from: string | null;
    to: string | null;
    additions: number;
    deletions: number;
    chunks: Chunk[];
    binary?: boolean;
    index?: string[];
    oldMode?: string;
    newMode?: string;
  }

  function parseDiff(diff: string): File[];

  export = parseDiff;
}
|
@ -72,6 +72,26 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
|
||||
previousScrapeAt: string | null;
|
||||
changeStatus: "new" | "same" | "changed" | "removed";
|
||||
visibility: "visible" | "hidden";
|
||||
diff?: {
|
||||
text: string;
|
||||
structured: {
|
||||
files: Array<{
|
||||
from: string | null;
|
||||
to: string | null;
|
||||
chunks: Array<{
|
||||
content: string;
|
||||
changes: Array<{
|
||||
type: string;
|
||||
normal?: boolean;
|
||||
ln?: number;
|
||||
ln1?: number;
|
||||
ln2?: number;
|
||||
content: string;
|
||||
}>;
|
||||
}>;
|
||||
}>;
|
||||
};
|
||||
};
|
||||
};
|
||||
// v1 search only
|
||||
title?: string;
|
||||
|
Loading…
x
Reference in New Issue
Block a user