mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 10:35:56 +08:00
(feat/change-tracking) Change Tracking Modes (#1445)
* Add git-diff support to change tracking format Co-Authored-By: Nicolas Camara <nick@sideguide.dev> * Fix type issues with parse-diff library Co-Authored-By: Nicolas Camara <nick@sideguide.dev> * Fix parse-diff type definitions to match actual library structure Co-Authored-By: Nicolas Camara <nick@sideguide.dev> * Add structured output/prompt support to change tracking Co-Authored-By: Nicolas Camara <nick@sideguide.dev> * (feat/change-tracking) Change Tracking Modes (#1447) * Refactor change tracking to use modes array instead of separate formats Co-Authored-By: Nicolas Camara <nick@sideguide.dev> * Implement schema-based change tracking with old/new value comparison Co-Authored-By: Nicolas Camara <nick@sideguide.dev> * Nick: * Nick: .json * Update diff.ts --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Nicolas Camara <nick@sideguide.dev> Co-authored-by: Nicolas <nicolascamara29@gmail.com> * Update index.ts * Update types.ts * Update diff.ts * Update scrape.ts --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Nicolas Camara <nick@sideguide.dev> Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
parent
f18a6b20ff
commit
138a9757ae
@ -90,6 +90,7 @@
|
|||||||
"express": "^4.18.2",
|
"express": "^4.18.2",
|
||||||
"express-rate-limit": "^7.3.1",
|
"express-rate-limit": "^7.3.1",
|
||||||
"express-ws": "^5.0.2",
|
"express-ws": "^5.0.2",
|
||||||
|
"git-diff": "^2.0.6",
|
||||||
"glob": "^10.4.2",
|
"glob": "^10.4.2",
|
||||||
"gpt3-tokenizer": "^1.1.5",
|
"gpt3-tokenizer": "^1.1.5",
|
||||||
"ioredis": "^5.4.1",
|
"ioredis": "^5.4.1",
|
||||||
@ -110,6 +111,7 @@
|
|||||||
"mongoose": "^8.4.4",
|
"mongoose": "^8.4.4",
|
||||||
"natural": "^7.0.7",
|
"natural": "^7.0.7",
|
||||||
"ollama-ai-provider": "^1.2.0",
|
"ollama-ai-provider": "^1.2.0",
|
||||||
|
"parse-diff": "^0.11.1",
|
||||||
"pdf-parse": "^1.1.1",
|
"pdf-parse": "^1.1.1",
|
||||||
"pos": "^0.4.2",
|
"pos": "^0.4.2",
|
||||||
"posthog-node": "^4.0.1",
|
"posthog-node": "^4.0.1",
|
||||||
|
60
apps/api/pnpm-lock.yaml
generated
60
apps/api/pnpm-lock.yaml
generated
@ -125,6 +125,9 @@ importers:
|
|||||||
express-ws:
|
express-ws:
|
||||||
specifier: ^5.0.2
|
specifier: ^5.0.2
|
||||||
version: 5.0.2(express@4.19.2)
|
version: 5.0.2(express@4.19.2)
|
||||||
|
git-diff:
|
||||||
|
specifier: ^2.0.6
|
||||||
|
version: 2.0.6
|
||||||
glob:
|
glob:
|
||||||
specifier: ^10.4.2
|
specifier: ^10.4.2
|
||||||
version: 10.4.2
|
version: 10.4.2
|
||||||
@ -185,6 +188,9 @@ importers:
|
|||||||
ollama-ai-provider:
|
ollama-ai-provider:
|
||||||
specifier: ^1.2.0
|
specifier: ^1.2.0
|
||||||
version: 1.2.0(zod@3.24.2)
|
version: 1.2.0(zod@3.24.2)
|
||||||
|
parse-diff:
|
||||||
|
specifier: ^0.11.1
|
||||||
|
version: 0.11.1
|
||||||
pdf-parse:
|
pdf-parse:
|
||||||
specifier: ^1.1.1
|
specifier: ^1.1.1
|
||||||
version: 1.1.1
|
version: 1.1.1
|
||||||
@ -2384,6 +2390,10 @@ packages:
|
|||||||
resolution: {integrity: sha512-EjePK1srD3P08o2j4f0ExnylqRs5B9tJjcp9t1krH2qRi8CCdsYfwe9JgSLurFBWwq4uOlipzfk5fHNvwFKr8Q==}
|
resolution: {integrity: sha512-EjePK1srD3P08o2j4f0ExnylqRs5B9tJjcp9t1krH2qRi8CCdsYfwe9JgSLurFBWwq4uOlipzfk5fHNvwFKr8Q==}
|
||||||
engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
|
engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
|
||||||
|
|
||||||
|
diff@3.5.0:
|
||||||
|
resolution: {integrity: sha512-A46qtFgd+g7pDZinpnwiRJtxbC1hpgf0uzP3iG89scHk0AUC7A1TGxf5OiiOUv/JMZR8GOt8hL900hV0bOy5xA==}
|
||||||
|
engines: {node: '>=0.3.1'}
|
||||||
|
|
||||||
diff@4.0.2:
|
diff@4.0.2:
|
||||||
resolution: {integrity: sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==}
|
resolution: {integrity: sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==}
|
||||||
engines: {node: '>=0.3.1'}
|
engines: {node: '>=0.3.1'}
|
||||||
@ -2762,6 +2772,10 @@ packages:
|
|||||||
resolution: {integrity: sha512-BzUrJBS9EcUb4cFol8r4W3v1cPsSyajLSthNkz5BxbpDcHN5tIrM10E2eNvfnvBn3DaT3DUgx0OpsBKkaOpanw==}
|
resolution: {integrity: sha512-BzUrJBS9EcUb4cFol8r4W3v1cPsSyajLSthNkz5BxbpDcHN5tIrM10E2eNvfnvBn3DaT3DUgx0OpsBKkaOpanw==}
|
||||||
engines: {node: '>= 14'}
|
engines: {node: '>= 14'}
|
||||||
|
|
||||||
|
git-diff@2.0.6:
|
||||||
|
resolution: {integrity: sha512-/Iu4prUrydE3Pb3lCBMbcSNIf81tgGt0W1ZwknnyF62t3tHmtiJTRj0f+1ZIhp3+Rh0ktz1pJVoa7ZXUCskivA==}
|
||||||
|
engines: {node: '>= 4.8.0'}
|
||||||
|
|
||||||
glob-parent@5.1.2:
|
glob-parent@5.1.2:
|
||||||
resolution: {integrity: sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==}
|
resolution: {integrity: sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==}
|
||||||
engines: {node: '>= 6'}
|
engines: {node: '>= 6'}
|
||||||
@ -2955,6 +2969,10 @@ packages:
|
|||||||
ini@1.3.8:
|
ini@1.3.8:
|
||||||
resolution: {integrity: sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==}
|
resolution: {integrity: sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==}
|
||||||
|
|
||||||
|
interpret@1.4.0:
|
||||||
|
resolution: {integrity: sha512-agE4QfB2Lkp9uICn7BAqoscw4SZP9kTE2hxiFI3jBPmXJfdqiahTbUuKGsMoN2GtqL9AxhYioAcVvgsb1HvRbA==}
|
||||||
|
engines: {node: '>= 0.10'}
|
||||||
|
|
||||||
ioredis@5.4.1:
|
ioredis@5.4.1:
|
||||||
resolution: {integrity: sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA==}
|
resolution: {integrity: sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA==}
|
||||||
engines: {node: '>=12.22.0'}
|
engines: {node: '>=12.22.0'}
|
||||||
@ -3721,6 +3739,9 @@ packages:
|
|||||||
resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==}
|
resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==}
|
||||||
engines: {node: '>=6'}
|
engines: {node: '>=6'}
|
||||||
|
|
||||||
|
parse-diff@0.11.1:
|
||||||
|
resolution: {integrity: sha512-Oq4j8LAOPOcssanQkIjxosjATBIEJhCxMCxPhMu+Ci4wdNmAEdx0O+a7gzbR2PyKXgKPvRLIN5g224+dJAsKHA==}
|
||||||
|
|
||||||
parse-json@5.2.0:
|
parse-json@5.2.0:
|
||||||
resolution: {integrity: sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==}
|
resolution: {integrity: sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==}
|
||||||
engines: {node: '>=8'}
|
engines: {node: '>=8'}
|
||||||
@ -3982,6 +4003,10 @@ packages:
|
|||||||
resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==}
|
resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==}
|
||||||
engines: {node: '>=8.10.0'}
|
engines: {node: '>=8.10.0'}
|
||||||
|
|
||||||
|
rechoir@0.6.2:
|
||||||
|
resolution: {integrity: sha512-HFM8rkZ+i3zrV+4LQjwQ0W+ez98pApMGM3HUrN04j3CqzPOzl9nmP15Y8YXNm8QHGv/eacOVEjqhmWpkRV0NAw==}
|
||||||
|
engines: {node: '>= 0.10'}
|
||||||
|
|
||||||
redis-errors@1.2.0:
|
redis-errors@1.2.0:
|
||||||
resolution: {integrity: sha512-1qny3OExCf0UvUV/5wpYKf2YwPcOqXzkwKKSmKHiE6ZMQs5heeE/c8eXK+PNllPvmjgAbfnsbpkGZWy8cBpn9w==}
|
resolution: {integrity: sha512-1qny3OExCf0UvUV/5wpYKf2YwPcOqXzkwKKSmKHiE6ZMQs5heeE/c8eXK+PNllPvmjgAbfnsbpkGZWy8cBpn9w==}
|
||||||
engines: {node: '>=4'}
|
engines: {node: '>=4'}
|
||||||
@ -4148,6 +4173,15 @@ packages:
|
|||||||
resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==}
|
resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==}
|
||||||
engines: {node: '>=8'}
|
engines: {node: '>=8'}
|
||||||
|
|
||||||
|
shelljs.exec@1.1.8:
|
||||||
|
resolution: {integrity: sha512-vFILCw+lzUtiwBAHV8/Ex8JsFjelFMdhONIsgKNLgTzeRckp2AOYRQtHJE/9LhNvdMmE27AGtzWx0+DHpwIwSw==}
|
||||||
|
engines: {node: '>= 4.0.0'}
|
||||||
|
|
||||||
|
shelljs@0.8.5:
|
||||||
|
resolution: {integrity: sha512-TiwcRcrkhHvbrZbnRcFYMLl30Dfov3HKqzp5tO5b4pt6G/SezKcYhmDg15zXVBswHmctSAQKznqNW2LO5tTDow==}
|
||||||
|
engines: {node: '>=4'}
|
||||||
|
hasBin: true
|
||||||
|
|
||||||
shimmer@1.2.1:
|
shimmer@1.2.1:
|
||||||
resolution: {integrity: sha512-sQTKC1Re/rM6XyFM6fIAGHRPVGvyXfgzIDvzoq608vM+jeyVD0Tu1E6Np0Kc2zAIFWIj963V2800iF/9LPieQw==}
|
resolution: {integrity: sha512-sQTKC1Re/rM6XyFM6fIAGHRPVGvyXfgzIDvzoq608vM+jeyVD0Tu1E6Np0Kc2zAIFWIj963V2800iF/9LPieQw==}
|
||||||
|
|
||||||
@ -7603,6 +7637,8 @@ snapshots:
|
|||||||
|
|
||||||
diff-sequences@29.6.3: {}
|
diff-sequences@29.6.3: {}
|
||||||
|
|
||||||
|
diff@3.5.0: {}
|
||||||
|
|
||||||
diff@4.0.2: {}
|
diff@4.0.2: {}
|
||||||
|
|
||||||
dingbat-to-unicode@1.0.1: {}
|
dingbat-to-unicode@1.0.1: {}
|
||||||
@ -8026,6 +8062,14 @@ snapshots:
|
|||||||
transitivePeerDependencies:
|
transitivePeerDependencies:
|
||||||
- supports-color
|
- supports-color
|
||||||
|
|
||||||
|
git-diff@2.0.6:
|
||||||
|
dependencies:
|
||||||
|
chalk: 2.4.2
|
||||||
|
diff: 3.5.0
|
||||||
|
loglevel: 1.9.1
|
||||||
|
shelljs: 0.8.5
|
||||||
|
shelljs.exec: 1.1.8
|
||||||
|
|
||||||
glob-parent@5.1.2:
|
glob-parent@5.1.2:
|
||||||
dependencies:
|
dependencies:
|
||||||
is-glob: 4.0.3
|
is-glob: 4.0.3
|
||||||
@ -8271,6 +8315,8 @@ snapshots:
|
|||||||
|
|
||||||
ini@1.3.8: {}
|
ini@1.3.8: {}
|
||||||
|
|
||||||
|
interpret@1.4.0: {}
|
||||||
|
|
||||||
ioredis@5.4.1:
|
ioredis@5.4.1:
|
||||||
dependencies:
|
dependencies:
|
||||||
'@ioredis/commands': 1.2.0
|
'@ioredis/commands': 1.2.0
|
||||||
@ -9249,6 +9295,8 @@ snapshots:
|
|||||||
dependencies:
|
dependencies:
|
||||||
callsites: 3.1.0
|
callsites: 3.1.0
|
||||||
|
|
||||||
|
parse-diff@0.11.1: {}
|
||||||
|
|
||||||
parse-json@5.2.0:
|
parse-json@5.2.0:
|
||||||
dependencies:
|
dependencies:
|
||||||
'@babel/code-frame': 7.24.7
|
'@babel/code-frame': 7.24.7
|
||||||
@ -9546,6 +9594,10 @@ snapshots:
|
|||||||
dependencies:
|
dependencies:
|
||||||
picomatch: 2.3.1
|
picomatch: 2.3.1
|
||||||
|
|
||||||
|
rechoir@0.6.2:
|
||||||
|
dependencies:
|
||||||
|
resolve: 1.22.8
|
||||||
|
|
||||||
redis-errors@1.2.0: {}
|
redis-errors@1.2.0: {}
|
||||||
|
|
||||||
redis-info@3.1.0:
|
redis-info@3.1.0:
|
||||||
@ -9718,6 +9770,14 @@ snapshots:
|
|||||||
|
|
||||||
shebang-regex@3.0.0: {}
|
shebang-regex@3.0.0: {}
|
||||||
|
|
||||||
|
shelljs.exec@1.1.8: {}
|
||||||
|
|
||||||
|
shelljs@0.8.5:
|
||||||
|
dependencies:
|
||||||
|
glob: 7.2.3
|
||||||
|
interpret: 1.4.0
|
||||||
|
rechoir: 0.6.2
|
||||||
|
|
||||||
shimmer@1.2.1: {}
|
shimmer@1.2.1: {}
|
||||||
|
|
||||||
side-channel@1.0.6:
|
side-channel@1.0.6:
|
||||||
|
@ -95,6 +95,116 @@ describe("Scrape tests", () => {
|
|||||||
expect(response.changeTracking).toBeDefined();
|
expect(response.changeTracking).toBeDefined();
|
||||||
expect(response.changeTracking?.previousScrapeAt).not.toBeNull();
|
expect(response.changeTracking?.previousScrapeAt).not.toBeNull();
|
||||||
}, 30000);
|
}, 30000);
|
||||||
|
|
||||||
|
it.concurrent("includes git diff when requested", async () => {
|
||||||
|
const response = await scrape({
|
||||||
|
url: "https://example.com",
|
||||||
|
formats: ["markdown", "changeTracking"],
|
||||||
|
changeTrackingOptions: {
|
||||||
|
modes: ["git-diff"]
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(response.changeTracking).toBeDefined();
|
||||||
|
expect(response.changeTracking?.previousScrapeAt).not.toBeNull();
|
||||||
|
|
||||||
|
if (response.changeTracking?.changeStatus === "changed") {
|
||||||
|
expect(response.changeTracking?.diff).toBeDefined();
|
||||||
|
expect(response.changeTracking?.diff?.text).toBeDefined();
|
||||||
|
expect(response.changeTracking?.diff?.json).toBeDefined();
|
||||||
|
expect(response.changeTracking?.diff?.json.files).toBeInstanceOf(Array);
|
||||||
|
}
|
||||||
|
}, 30000);
|
||||||
|
|
||||||
|
it.concurrent("includes structured output when requested", async () => {
|
||||||
|
const response = await scrape({
|
||||||
|
url: "https://example.com",
|
||||||
|
formats: ["markdown", "changeTracking"],
|
||||||
|
changeTrackingOptions: {
|
||||||
|
modes: ["json"],
|
||||||
|
prompt: "Summarize the changes between the previous and current content",
|
||||||
|
systemPrompt: "You are a helpful assistant that summarizes changes between document versions."
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(response.changeTracking).toBeDefined();
|
||||||
|
expect(response.changeTracking?.previousScrapeAt).not.toBeNull();
|
||||||
|
|
||||||
|
if (response.changeTracking?.changeStatus === "changed") {
|
||||||
|
expect(response.changeTracking?.json).toBeDefined();
|
||||||
|
}
|
||||||
|
}, 30000);
|
||||||
|
|
||||||
|
// Schema mode of change tracking: the same JSON schema is extracted from the
// previous and the current scrape, and every top-level field whose value
// differs is reported as `{ previous, current }` (the shape built by the
// change-tracking transformer's compareExtractedData). Fields that did not
// change are omitted, which is why each field is only checked when present.
it.concurrent("supports schema-based extraction for change tracking", async () => {
  const response = await scrape({
    url: "https://example.com",
    formats: ["markdown", "changeTracking"],
    changeTrackingOptions: {
      modes: ["json"],
      schema: {
        type: "object",
        properties: {
          pricing: {
            type: "object",
            properties: {
              amount: { type: "number" },
              currency: { type: "string" }
            }
          },
          features: {
            type: "array",
            items: { type: "string" }
          }
        }
      }
    }
  });

  expect(response.changeTracking).toBeDefined();
  expect(response.changeTracking?.previousScrapeAt).not.toBeNull();

  // The structured comparison is only produced when the page actually changed
  // since the previous scrape.
  if (response.changeTracking?.changeStatus === "changed") {
    expect(response.changeTracking?.json).toBeDefined();

    // Assert the property names the implementation really emits
    // (`previous` / `current`), not `old` / `new`.
    if (response.changeTracking?.json.pricing) {
      expect(response.changeTracking?.json.pricing).toHaveProperty("previous");
      expect(response.changeTracking?.json.pricing).toHaveProperty("current");
    }

    if (response.changeTracking?.json.features) {
      expect(response.changeTracking?.json.features).toHaveProperty("previous");
      expect(response.changeTracking?.json.features).toHaveProperty("current");
    }
  }
}, 30000);
|
||||||
|
|
||||||
|
it.concurrent("supports both git-diff and structured modes together", async () => {
|
||||||
|
const response = await scrape({
|
||||||
|
url: "https://example.com",
|
||||||
|
formats: ["markdown", "changeTracking"],
|
||||||
|
changeTrackingOptions: {
|
||||||
|
modes: ["git-diff", "json"],
|
||||||
|
schema: {
|
||||||
|
type: "object",
|
||||||
|
properties: {
|
||||||
|
summary: { type: "string" },
|
||||||
|
changes: { type: "array", items: { type: "string" } }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(response.changeTracking).toBeDefined();
|
||||||
|
expect(response.changeTracking?.previousScrapeAt).not.toBeNull();
|
||||||
|
|
||||||
|
if (response.changeTracking?.changeStatus === "changed") {
|
||||||
|
expect(response.changeTracking?.diff).toBeDefined();
|
||||||
|
expect(response.changeTracking?.diff?.text).toBeDefined();
|
||||||
|
expect(response.changeTracking?.diff?.json).toBeDefined();
|
||||||
|
|
||||||
|
expect(response.changeTracking?.json).toBeDefined();
|
||||||
|
expect(response.changeTracking?.json).toHaveProperty("summary");
|
||||||
|
expect(response.changeTracking?.json).toHaveProperty("changes");
|
||||||
|
}
|
||||||
|
}, 30000);
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("Location API (f-e dependant)", () => {
|
describe("Location API (f-e dependant)", () => {
|
||||||
|
@ -104,7 +104,7 @@ export async function scrapeController(
|
|||||||
// Don't bill if we're early returning
|
// Don't bill if we're early returning
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (req.body.extract && req.body.formats.includes("extract")) {
|
if ((req.body.extract && req.body.formats?.includes("extract")) || (req.body.formats?.includes("changeTracking") && req.body.changeTrackingOptions?.modes?.includes("json"))) {
|
||||||
creditsToBeBilled = 5;
|
creditsToBeBilled = 5;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -20,6 +20,7 @@ export type Format =
|
|||||||
| "screenshot"
|
| "screenshot"
|
||||||
| "screenshot@fullPage"
|
| "screenshot@fullPage"
|
||||||
| "extract"
|
| "extract"
|
||||||
|
| "json"
|
||||||
| "changeTracking";
|
| "changeTracking";
|
||||||
|
|
||||||
export const url = z.preprocess(
|
export const url = z.preprocess(
|
||||||
@ -195,6 +196,13 @@ const baseScrapeOptions = z
|
|||||||
extract: extractOptions.optional(),
|
extract: extractOptions.optional(),
|
||||||
// New
|
// New
|
||||||
jsonOptions: extractOptions.optional(),
|
jsonOptions: extractOptions.optional(),
|
||||||
|
changeTrackingOptions: z
|
||||||
|
.object({
|
||||||
|
prompt: z.string().optional(),
|
||||||
|
schema: z.any().optional(),
|
||||||
|
modes: z.enum(["json", "git-diff"]).array().optional().default([]),
|
||||||
|
})
|
||||||
|
.optional(),
|
||||||
mobile: z.boolean().default(false),
|
mobile: z.boolean().default(false),
|
||||||
parsePDF: z.boolean().default(true),
|
parsePDF: z.boolean().default(true),
|
||||||
actions: actionsSchema.optional(),
|
actions: actionsSchema.optional(),
|
||||||
@ -555,6 +563,27 @@ export type Document = {
|
|||||||
previousScrapeAt: string | null;
|
previousScrapeAt: string | null;
|
||||||
changeStatus: "new" | "same" | "changed" | "removed";
|
changeStatus: "new" | "same" | "changed" | "removed";
|
||||||
visibility: "visible" | "hidden";
|
visibility: "visible" | "hidden";
|
||||||
|
diff?: {
|
||||||
|
text: string;
|
||||||
|
json: {
|
||||||
|
files: Array<{
|
||||||
|
from: string | null;
|
||||||
|
to: string | null;
|
||||||
|
chunks: Array<{
|
||||||
|
content: string;
|
||||||
|
changes: Array<{
|
||||||
|
type: string;
|
||||||
|
normal?: boolean;
|
||||||
|
ln?: number;
|
||||||
|
ln1?: number;
|
||||||
|
ln2?: number;
|
||||||
|
content: string;
|
||||||
|
}>;
|
||||||
|
}>;
|
||||||
|
}>;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
json?: any;
|
||||||
}
|
}
|
||||||
metadata: {
|
metadata: {
|
||||||
title?: string;
|
title?: string;
|
||||||
|
@ -2,6 +2,53 @@ import { supabase_service } from "../../../services/supabase";
|
|||||||
import { Document } from "../../../controllers/v1/types";
|
import { Document } from "../../../controllers/v1/types";
|
||||||
import { Meta } from "../index";
|
import { Meta } from "../index";
|
||||||
import { getJob } from "../../../controllers/v1/crawl-status";
|
import { getJob } from "../../../controllers/v1/crawl-status";
|
||||||
|
import gitDiff from 'git-diff';
|
||||||
|
import parseDiff from 'parse-diff';
|
||||||
|
import { generateCompletions } from "./llmExtract";
|
||||||
|
|
||||||
|
/**
 * Runs the LLM extraction over one markdown snapshot using the schema supplied
 * in `meta.options.changeTrackingOptions`.
 *
 * @param content - Markdown text to extract structured data from.
 * @param meta - Scrape context; provides the logger and the change-tracking options.
 * @returns The `extract` value produced by `generateCompletions`, or `null`
 *          when the call throws (the error is logged, never re-thrown).
 */
async function extractDataWithSchema(content: string, meta: Meta): Promise<any> {
  try {
    const completion = await generateCompletions({
      logger: meta.logger.child({
        method: "extractDataWithSchema/generateCompletions",
      }),
      options: {
        mode: "llm",
        schema: meta.options.changeTrackingOptions?.schema,
        systemPrompt: "Extract the requested information from the content based on the provided schema.",
        temperature: 0,
      },
      markdown: content,
    });

    return completion.extract;
  } catch (err) {
    // Best effort: callers treat `null` as "no extraction available".
    meta.logger.error("Error extracting data with schema", { error: err });
    return null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Serialises a value to JSON with the keys of every plain object sorted, so
 * that two structurally equal values always yield the same string regardless
 * of property insertion order. Array order is preserved (it is significant).
 * Returns `undefined` for values JSON cannot represent at the top level
 * (e.g. `undefined`), exactly like `JSON.stringify`.
 */
function canonicalJson(value: unknown): string | undefined {
  return JSON.stringify(value, (_key, nested) => {
    if (nested !== null && typeof nested === "object" && !Array.isArray(nested)) {
      const ordered: Record<string, unknown> = {};
      for (const name of Object.keys(nested).sort()) {
        ordered[name] = nested[name];
      }
      return ordered;
    }
    return nested;
  });
}

/**
 * Compares two extracted data objects field by field (top-level keys only).
 *
 * @param previousData - Data extracted from the previous scrape; may be null/undefined.
 * @param currentData - Data extracted from the current scrape; may be null/undefined.
 * @returns A map containing only the keys whose values differ, each as
 *          `{ previous, current }`. A key present on one side only is
 *          reported with `undefined` on the other side. Unchanged keys are omitted.
 *
 * Values are compared by their canonical JSON form, so nested objects that
 * differ only in key order are NOT reported as changed.
 */
function compareExtractedData(previousData: any, currentData: any): any {
  const result: Record<string, { previous: any, current: any }> = {};

  // Union of keys from both sides so additions and removals are detected too.
  const allKeys = new Set<string>([
    ...Object.keys(previousData || {}),
    ...Object.keys(currentData || {})
  ]);

  for (const key of allKeys) {
    const oldValue = previousData?.[key];
    const newValue = currentData?.[key];

    if (canonicalJson(oldValue) !== canonicalJson(newValue)) {
      result[key] = {
        previous: oldValue,
        current: newValue
      };
    }
  }

  return result;
}
|
||||||
|
|
||||||
export async function deriveDiff(meta: Meta, document: Document): Promise<Document> {
|
export async function deriveDiff(meta: Meta, document: Document): Promise<Document> {
|
||||||
if (meta.options.formats.includes("changeTracking")) {
|
if (meta.options.formats.includes("changeTracking")) {
|
||||||
@ -20,19 +67,106 @@ export async function deriveDiff(meta: Meta, document: Document): Promise<Docume
|
|||||||
returnvalue: Document,
|
returnvalue: Document,
|
||||||
} | null = data?.o_job_id ? await getJob(data.o_job_id) : null;
|
} | null = data?.o_job_id ? await getJob(data.o_job_id) : null;
|
||||||
|
|
||||||
console.log(data, job);
|
|
||||||
|
|
||||||
if (data && job && job?.returnvalue) {
|
if (data && job && job?.returnvalue) {
|
||||||
const previousMarkdown = job.returnvalue.markdown!;
|
const previousMarkdown = job.returnvalue.markdown!;
|
||||||
const currentMarkdown = document.markdown!;
|
const currentMarkdown = document.markdown!;
|
||||||
|
|
||||||
const transformer = (x: string) => [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join("");
|
const transformer = (x: string) => [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join("");
|
||||||
|
const isChanged = transformer(previousMarkdown) !== transformer(currentMarkdown);
|
||||||
|
const changeStatus = document.metadata.statusCode === 404 ? "removed" : isChanged ? "changed" : "same";
|
||||||
|
|
||||||
document.changeTracking = {
|
document.changeTracking = {
|
||||||
previousScrapeAt: data.o_date_added,
|
previousScrapeAt: data.o_date_added,
|
||||||
changeStatus: document.metadata.statusCode === 404 ? "removed" : transformer(previousMarkdown) === transformer(currentMarkdown) ? "same" : "changed",
|
changeStatus,
|
||||||
visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible",
|
visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (meta.options.changeTrackingOptions?.modes?.includes("git-diff") && changeStatus === "changed") {
|
||||||
|
const diffText = gitDiff(previousMarkdown, currentMarkdown, {
|
||||||
|
color: false,
|
||||||
|
wordDiff: false
|
||||||
|
});
|
||||||
|
|
||||||
|
if (diffText) {
|
||||||
|
const diffStructured = parseDiff(diffText);
|
||||||
|
document.changeTracking.diff = {
|
||||||
|
text: diffText,
|
||||||
|
json: {
|
||||||
|
files: diffStructured.map(file => ({
|
||||||
|
from: file.from || null,
|
||||||
|
to: file.to || null,
|
||||||
|
chunks: file.chunks.map(chunk => ({
|
||||||
|
content: chunk.content,
|
||||||
|
changes: chunk.changes.map(change => {
|
||||||
|
const baseChange = {
|
||||||
|
type: change.type,
|
||||||
|
content: change.content
|
||||||
|
};
|
||||||
|
|
||||||
|
if (change.type === 'normal' && 'ln1' in change && 'ln2' in change) {
|
||||||
|
return {
|
||||||
|
...baseChange,
|
||||||
|
normal: true,
|
||||||
|
ln1: change.ln1,
|
||||||
|
ln2: change.ln2
|
||||||
|
};
|
||||||
|
} else if (change.type === 'add' && 'ln' in change) {
|
||||||
|
return {
|
||||||
|
...baseChange,
|
||||||
|
add: true,
|
||||||
|
ln: change.ln
|
||||||
|
};
|
||||||
|
} else if (change.type === 'del' && 'ln' in change) {
|
||||||
|
return {
|
||||||
|
...baseChange,
|
||||||
|
del: true,
|
||||||
|
ln: change.ln
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
return baseChange;
|
||||||
|
})
|
||||||
|
}))
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (meta.options.changeTrackingOptions?.modes?.includes("json") &&
|
||||||
|
meta.options.changeTrackingOptions && changeStatus === "changed") {
|
||||||
|
try {
|
||||||
|
const previousData = meta.options.changeTrackingOptions.schema ?
|
||||||
|
await extractDataWithSchema(previousMarkdown, meta) : null;
|
||||||
|
|
||||||
|
const currentData = meta.options.changeTrackingOptions.schema ?
|
||||||
|
await extractDataWithSchema(currentMarkdown, meta) : null;
|
||||||
|
|
||||||
|
if (previousData && currentData) {
|
||||||
|
document.changeTracking.json = compareExtractedData(previousData, currentData);
|
||||||
|
} else {
|
||||||
|
const { extract } = await generateCompletions({
|
||||||
|
logger: meta.logger.child({
|
||||||
|
method: "deriveDiff/generateCompletions",
|
||||||
|
}),
|
||||||
|
options: {
|
||||||
|
mode: "llm",
|
||||||
|
systemPrompt: "Analyze the differences between the previous and current content and provide a structured summary of the changes.",
|
||||||
|
schema: meta.options.changeTrackingOptions.schema,
|
||||||
|
prompt: meta.options.changeTrackingOptions.prompt,
|
||||||
|
temperature: 0
|
||||||
|
},
|
||||||
|
markdown: `Previous Content:\n${previousMarkdown}\n\nCurrent Content:\n${currentMarkdown}`,
|
||||||
|
previousWarning: document.warning
|
||||||
|
});
|
||||||
|
|
||||||
|
document.changeTracking.json = extract;
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
meta.logger.error("Error generating structured diff with LLM", { error });
|
||||||
|
document.warning = "Structured diff generation failed." + (document.warning ? ` ${document.warning}` : "");
|
||||||
|
}
|
||||||
|
}
|
||||||
} else if (!res.error) {
|
} else if (!res.error) {
|
||||||
document.changeTracking = {
|
document.changeTracking = {
|
||||||
previousScrapeAt: null,
|
previousScrapeAt: null,
|
||||||
|
@ -159,6 +159,24 @@ export function coerceFieldsToFormats(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (document.changeTracking &&
|
||||||
|
(!meta.options.changeTrackingOptions?.modes?.includes("git-diff")) &&
|
||||||
|
document.changeTracking.diff !== undefined) {
|
||||||
|
meta.logger.warn(
|
||||||
|
"Removed diff from changeTracking because git-diff mode wasn't specified in changeTrackingOptions.modes.",
|
||||||
|
);
|
||||||
|
delete document.changeTracking.diff;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (document.changeTracking &&
|
||||||
|
(!meta.options.changeTrackingOptions?.modes?.includes("json")) &&
|
||||||
|
document.changeTracking.json !== undefined) {
|
||||||
|
meta.logger.warn(
|
||||||
|
"Removed structured from changeTracking because structured mode wasn't specified in changeTrackingOptions.modes.",
|
||||||
|
);
|
||||||
|
delete document.changeTracking.json;
|
||||||
|
}
|
||||||
|
|
||||||
if (meta.options.actions === undefined || meta.options.actions.length === 0) {
|
if (meta.options.actions === undefined || meta.options.actions.length === 0) {
|
||||||
delete document.actions;
|
delete document.actions;
|
||||||
}
|
}
|
||||||
|
49
apps/api/src/types/parse-diff.d.ts
vendored
Normal file
49
apps/api/src/types/parse-diff.d.ts
vendored
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
/**
 * Ambient typings for the `parse-diff` package as consumed by this project.
 * The default export is a function turning unified diff text into a list of
 * per-file records.
 */
declare module 'parse-diff' {
  /** Fields shared by every kind of change line. */
  interface ChangeBase {
    content: string;
  }

  /** A line tagged `normal`; carries two line numbers (`ln1`, `ln2`). */
  interface NormalChange extends ChangeBase {
    type: 'normal';
    normal: true;
    ln1: number;
    ln2: number;
  }

  /** A line tagged `add`; carries a single line number (`ln`). */
  interface AddChange extends ChangeBase {
    type: 'add';
    add: true;
    ln: number;
  }

  /** A line tagged `del`; carries a single line number (`ln`). */
  interface DeleteChange extends ChangeBase {
    type: 'del';
    del: true;
    ln: number;
  }

  /** Discriminated on `type`. */
  type Change = NormalChange | AddChange | DeleteChange;

  /** One chunk of a file diff together with its old/new start and line counts. */
  interface Chunk {
    content: string;
    changes: Change[];
    oldStart: number;
    oldLines: number;
    newStart: number;
    newLines: number;
  }

  /** One file entry of a parsed diff. */
  interface File {
    chunks: Chunk[];
    deletions: number;
    additions: number;
    from: string | null;
    to: string | null;
    index?: string[];
    newMode?: string;
    oldMode?: string;
    binary?: boolean;
  }

  /** Parses diff text into one `File` record per file. */
  function parseDiff(diff: string): File[];
  export = parseDiff;
}
|
@ -72,6 +72,26 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
|
|||||||
previousScrapeAt: string | null;
|
previousScrapeAt: string | null;
|
||||||
changeStatus: "new" | "same" | "changed" | "removed";
|
changeStatus: "new" | "same" | "changed" | "removed";
|
||||||
visibility: "visible" | "hidden";
|
visibility: "visible" | "hidden";
|
||||||
|
diff?: {
|
||||||
|
text: string;
|
||||||
|
structured: {
|
||||||
|
files: Array<{
|
||||||
|
from: string | null;
|
||||||
|
to: string | null;
|
||||||
|
chunks: Array<{
|
||||||
|
content: string;
|
||||||
|
changes: Array<{
|
||||||
|
type: string;
|
||||||
|
normal?: boolean;
|
||||||
|
ln?: number;
|
||||||
|
ln1?: number;
|
||||||
|
ln2?: number;
|
||||||
|
content: string;
|
||||||
|
}>;
|
||||||
|
}>;
|
||||||
|
}>;
|
||||||
|
};
|
||||||
|
};
|
||||||
};
|
};
|
||||||
// v1 search only
|
// v1 search only
|
||||||
title?: string;
|
title?: string;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user