Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-04-21 21:49:44 +08:00)
Rename 'changes' format and property to 'changeTracking'
Co-Authored-By: mogery@sideguide.dev <mogery@sideguide.dev>
This commit is contained in:
Parent: 0a9b7e96f7
Commit: 89ffac85ab
@ -85,15 +85,15 @@ describe("Scrape tests", () => {
|
|||||||
// }, 30000);
|
// }, 30000);
|
||||||
// });
|
// });
|
||||||
|
|
||||||
describe("Changes format", () => {
|
describe("Change Tracking format", () => {
|
||||||
it.concurrent("works", async () => {
|
it.concurrent("works", async () => {
|
||||||
const response = await scrape({
|
const response = await scrape({
|
||||||
url: "https://example.com",
|
url: "https://example.com",
|
||||||
formats: ["markdown", "changes"],
|
formats: ["markdown", "changeTracking"],
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(response.changes).toBeDefined();
|
expect(response.changeTracking).toBeDefined();
|
||||||
expect(response.changes?.previousScrapeAt).not.toBeNull();
|
expect(response.changeTracking?.previousScrapeAt).not.toBeNull();
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -21,7 +21,7 @@ export type Format =
|
|||||||
| "screenshot"
|
| "screenshot"
|
||||||
| "screenshot@fullPage"
|
| "screenshot@fullPage"
|
||||||
| "extract"
|
| "extract"
|
||||||
| "changes";
|
| "changeTracking";
|
||||||
|
|
||||||
export const url = z.preprocess(
|
export const url = z.preprocess(
|
||||||
(x) => {
|
(x) => {
|
||||||
@ -166,7 +166,7 @@ const baseScrapeOptions = z
|
|||||||
"screenshot@fullPage",
|
"screenshot@fullPage",
|
||||||
"extract",
|
"extract",
|
||||||
"json",
|
"json",
|
||||||
"changes",
|
"changeTracking",
|
||||||
])
|
])
|
||||||
.array()
|
.array()
|
||||||
.optional()
|
.optional()
|
||||||
@ -176,8 +176,8 @@ const baseScrapeOptions = z
|
|||||||
"You may only specify either screenshot or screenshot@fullPage",
|
"You may only specify either screenshot or screenshot@fullPage",
|
||||||
)
|
)
|
||||||
.refine(
|
.refine(
|
||||||
(x) => !x.includes("changes") || x.includes("markdown"),
|
(x) => !x.includes("changeTracking") || x.includes("markdown"),
|
||||||
"The changes format requires the markdown format to be specified as well",
|
"The changeTracking format requires the markdown format to be specified as well",
|
||||||
),
|
),
|
||||||
headers: z.record(z.string(), z.string()).optional(),
|
headers: z.record(z.string(), z.string()).optional(),
|
||||||
includeTags: z.string().array().optional(),
|
includeTags: z.string().array().optional(),
|
||||||
@ -552,7 +552,7 @@ export type Document = {
|
|||||||
value: unknown
|
value: unknown
|
||||||
}[];
|
}[];
|
||||||
};
|
};
|
||||||
changes?: {
|
changeTracking?: {
|
||||||
previousScrapeAt: string | null;
|
previousScrapeAt: string | null;
|
||||||
changeStatus: "new" | "same" | "changed" | "removed";
|
changeStatus: "new" | "same" | "changed" | "removed";
|
||||||
visibility: "visible" | "hidden";
|
visibility: "visible" | "hidden";
|
||||||
|
@ -3,7 +3,7 @@ import { Document } from "../../../controllers/v1/types";
|
|||||||
import { Meta } from "../index";
|
import { Meta } from "../index";
|
||||||
|
|
||||||
export async function deriveDiff(meta: Meta, document: Document): Promise<Document> {
|
export async function deriveDiff(meta: Meta, document: Document): Promise<Document> {
|
||||||
if (meta.options.formats.includes("changes")) {
|
if (meta.options.formats.includes("changeTracking")) {
|
||||||
const res = await supabase_service
|
const res = await supabase_service
|
||||||
.rpc("diff_get_last_scrape_1", {
|
.rpc("diff_get_last_scrape_1", {
|
||||||
i_team_id: meta.internalOptions.teamId,
|
i_team_id: meta.internalOptions.teamId,
|
||||||
@ -21,13 +21,13 @@ export async function deriveDiff(meta: Meta, document: Document): Promise<Docume
|
|||||||
|
|
||||||
const transformer = (x: string) => [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join("");
|
const transformer = (x: string) => [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join("");
|
||||||
|
|
||||||
document.changes = {
|
document.changeTracking = {
|
||||||
previousScrapeAt: data.o_date_added,
|
previousScrapeAt: data.o_date_added,
|
||||||
changeStatus: document.metadata.statusCode === 404 ? "removed" : transformer(previousMarkdown) === transformer(currentMarkdown) ? "same" : "changed",
|
changeStatus: document.metadata.statusCode === 404 ? "removed" : transformer(previousMarkdown) === transformer(currentMarkdown) ? "same" : "changed",
|
||||||
visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible",
|
visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible",
|
||||||
}
|
}
|
||||||
} else if (!res.error) {
|
} else if (!res.error) {
|
||||||
document.changes = {
|
document.changeTracking = {
|
||||||
previousScrapeAt: null,
|
previousScrapeAt: null,
|
||||||
changeStatus: document.metadata.statusCode === 404 ? "removed" : "new",
|
changeStatus: document.metadata.statusCode === 404 ? "removed" : "new",
|
||||||
visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible",
|
visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible",
|
||||||
|
@ -148,14 +148,14 @@ export function coerceFieldsToFormats(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!formats.has("changes") && document.changes !== undefined) {
|
if (!formats.has("changeTracking") && document.changeTracking !== undefined) {
|
||||||
meta.logger.warn(
|
meta.logger.warn(
|
||||||
"Removed changes from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.",
|
"Removed changeTracking from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.",
|
||||||
);
|
);
|
||||||
delete document.changes;
|
delete document.changeTracking;
|
||||||
} else if (formats.has("changes") && document.changes === undefined) {
|
} else if (formats.has("changeTracking") && document.changeTracking === undefined) {
|
||||||
meta.logger.warn(
|
meta.logger.warn(
|
||||||
"Request had format changes, but there was no changes field in the result.",
|
"Request had format changeTracking, but there was no changeTracking field in the result.",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -68,7 +68,7 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
|
|||||||
screenshot?: string;
|
screenshot?: string;
|
||||||
metadata?: FirecrawlDocumentMetadata;
|
metadata?: FirecrawlDocumentMetadata;
|
||||||
actions: ActionsSchema;
|
actions: ActionsSchema;
|
||||||
changes?: {
|
changeTracking?: {
|
||||||
previousScrapeAt: string | null;
|
previousScrapeAt: string | null;
|
||||||
changeStatus: "new" | "same" | "changed" | "removed";
|
changeStatus: "new" | "same" | "changed" | "removed";
|
||||||
visibility: "visible" | "hidden";
|
visibility: "visible" | "hidden";
|
||||||
@ -83,7 +83,7 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
|
|||||||
* Defines the options and configurations available for scraping web content.
|
* Defines the options and configurations available for scraping web content.
|
||||||
*/
|
*/
|
||||||
export interface CrawlScrapeOptions {
|
export interface CrawlScrapeOptions {
|
||||||
formats?: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json" | "changes")[];
|
formats?: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract" | "json" | "changeTracking")[];
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
includeTags?: string[];
|
includeTags?: string[];
|
||||||
excludeTags?: string[];
|
excludeTags?: string[];
|
||||||
|
Loading…
x
Reference in New Issue
Block a user