mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-18 20:55:56 +08:00
monitoring v0
This commit is contained in:
parent
b3b63486f1
commit
50db3e9e8a
@ -158,6 +158,7 @@ export async function crawlController(req: Request, res: Response) {
|
||||
pageOptions,
|
||||
undefined,
|
||||
undefined,
|
||||
team_id
|
||||
);
|
||||
internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
|
||||
|
||||
|
@ -99,6 +99,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
||||
pageOptions,
|
||||
undefined,
|
||||
undefined,
|
||||
team_id
|
||||
);
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
|
@ -66,6 +66,7 @@ export async function scrapeHelper(
|
||||
extractorOptions,
|
||||
timeout,
|
||||
crawlerOptions,
|
||||
team_id,
|
||||
);
|
||||
|
||||
await addScrapeJob(
|
||||
@ -297,6 +298,7 @@ export async function scrapeController(req: Request, res: Response) {
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
timeout,
|
||||
team_id,
|
||||
);
|
||||
|
||||
logJob({
|
||||
|
@ -72,6 +72,7 @@ export async function searchHelper(
|
||||
undefined,
|
||||
60000,
|
||||
crawlerOptions,
|
||||
team_id,
|
||||
);
|
||||
|
||||
if (justSearch) {
|
||||
|
@ -82,7 +82,7 @@ export async function batchScrapeController(
|
||||
: {
|
||||
crawlerOptions: null,
|
||||
scrapeOptions: req.body,
|
||||
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
|
||||
internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
plan: req.auth.plan,
|
||||
|
@ -81,7 +81,7 @@ export async function crawlController(
|
||||
originUrl: req.body.url,
|
||||
crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
|
||||
scrapeOptions,
|
||||
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
|
||||
internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
plan: req.auth.plan,
|
||||
|
@ -85,7 +85,7 @@ export async function getMapResults({
|
||||
scrapeOptions: undefined,
|
||||
},
|
||||
scrapeOptions: scrapeOptions.parse({}),
|
||||
internalOptions: {},
|
||||
internalOptions: { teamId },
|
||||
team_id: teamId,
|
||||
createdAt: Date.now(),
|
||||
plan: plan,
|
||||
|
@ -50,7 +50,7 @@ export async function scrapeController(
|
||||
mode: "single_urls",
|
||||
team_id: req.auth.team_id,
|
||||
scrapeOptions: req.body,
|
||||
internalOptions: {},
|
||||
internalOptions: { teamId: req.auth.team_id },
|
||||
plan: req.auth.plan!,
|
||||
origin: req.body.origin,
|
||||
is_scrape: true,
|
||||
|
@ -83,7 +83,7 @@ async function scrapeSearchResult(
|
||||
mode: "single_urls" as Mode,
|
||||
team_id: options.teamId,
|
||||
scrapeOptions: options.scrapeOptions,
|
||||
internalOptions: {},
|
||||
internalOptions: { teamId: options.teamId },
|
||||
plan: options.plan || "free",
|
||||
origin: options.origin,
|
||||
is_scrape: true,
|
||||
|
@ -20,7 +20,8 @@ export type Format =
|
||||
| "links"
|
||||
| "screenshot"
|
||||
| "screenshot@fullPage"
|
||||
| "extract";
|
||||
| "extract"
|
||||
| "diff";
|
||||
|
||||
export const url = z.preprocess(
|
||||
(x) => {
|
||||
@ -165,6 +166,7 @@ const baseScrapeOptions = z
|
||||
"screenshot@fullPage",
|
||||
"extract",
|
||||
"json",
|
||||
"diff",
|
||||
])
|
||||
.array()
|
||||
.optional()
|
||||
@ -172,6 +174,10 @@ const baseScrapeOptions = z
|
||||
.refine(
|
||||
(x) => !(x.includes("screenshot") && x.includes("screenshot@fullPage")),
|
||||
"You may only specify either screenshot or screenshot@fullPage",
|
||||
)
|
||||
.refine(
|
||||
(x) => !x.includes("diff") || x.includes("markdown"),
|
||||
"The diff format requires the markdown format to be specified as well",
|
||||
),
|
||||
headers: z.record(z.string(), z.string()).optional(),
|
||||
includeTags: z.string().array().optional(),
|
||||
@ -546,6 +552,11 @@ export type Document = {
|
||||
value: unknown
|
||||
}[];
|
||||
};
|
||||
diff?: {
|
||||
previousScrapeAt: string | null;
|
||||
changeStatus: "new" | "same" | "changed" | "removed";
|
||||
visibility: "visible" | "hidden";
|
||||
}
|
||||
metadata: {
|
||||
title?: string;
|
||||
description?: string;
|
||||
@ -812,7 +823,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
|
||||
};
|
||||
}
|
||||
|
||||
export function fromLegacyCrawlerOptions(x: any): {
|
||||
export function fromLegacyCrawlerOptions(x: any, teamId: string): {
|
||||
crawlOptions: CrawlerOptions;
|
||||
internalOptions: InternalOptions;
|
||||
} {
|
||||
@ -834,6 +845,7 @@ export function fromLegacyCrawlerOptions(x: any): {
|
||||
}),
|
||||
internalOptions: {
|
||||
v0CrawlOnlyUrls: x.returnOnlyUrls,
|
||||
teamId,
|
||||
},
|
||||
};
|
||||
}
|
||||
@ -847,6 +859,7 @@ export function fromLegacyScrapeOptions(
|
||||
pageOptions: PageOptions,
|
||||
extractorOptions: ExtractorOptions | undefined,
|
||||
timeout: number | undefined,
|
||||
teamId: string,
|
||||
): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
|
||||
return {
|
||||
scrapeOptions: scrapeOptions.parse({
|
||||
@ -896,6 +909,7 @@ export function fromLegacyScrapeOptions(
|
||||
internalOptions: {
|
||||
atsv: pageOptions.atsv,
|
||||
v0DisableJsDom: pageOptions.disableJsDom,
|
||||
teamId,
|
||||
},
|
||||
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
|
||||
};
|
||||
@ -906,13 +920,15 @@ export function fromLegacyCombo(
|
||||
extractorOptions: ExtractorOptions | undefined,
|
||||
timeout: number | undefined,
|
||||
crawlerOptions: any,
|
||||
teamId: string,
|
||||
): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
|
||||
const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
timeout,
|
||||
teamId,
|
||||
);
|
||||
const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
|
||||
const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions, teamId);
|
||||
return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
|
||||
}
|
||||
|
||||
|
@ -44,6 +44,7 @@ export async function scrapeDocument(
|
||||
scrapeOptions: scrapeOptions.parse({ ...internalScrapeOptions }),
|
||||
internalOptions: {
|
||||
useCache: true,
|
||||
teamId: options.teamId,
|
||||
},
|
||||
plan: options.plan,
|
||||
origin: options.origin,
|
||||
|
@ -97,6 +97,7 @@ export async function runWebScraper({
|
||||
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
|
||||
priority,
|
||||
...internalOptions,
|
||||
teamId: internalOptions?.teamId ?? team_id,
|
||||
});
|
||||
if (!response.success) {
|
||||
if (response.error instanceof Error) {
|
||||
|
@ -47,6 +47,7 @@ export async function getLinksFromSitemap(
|
||||
],
|
||||
v0DisableJsDom: true,
|
||||
abort,
|
||||
teamId: "sitemap",
|
||||
},
|
||||
);
|
||||
|
||||
|
@ -173,6 +173,7 @@ export type InternalOptions = {
|
||||
isBackgroundIndex?: boolean;
|
||||
fromCache?: boolean; // Indicates if the document was retrieved from cache
|
||||
abort?: AbortSignal;
|
||||
teamId: string;
|
||||
};
|
||||
|
||||
export type EngineResultsTracker = {
|
||||
@ -383,7 +384,7 @@ export async function scrapeURL(
|
||||
id: string,
|
||||
url: string,
|
||||
options: ScrapeOptions,
|
||||
internalOptions: InternalOptions = {},
|
||||
internalOptions: InternalOptions,
|
||||
): Promise<ScrapeUrlResponse> {
|
||||
const meta = await buildMetaObject(id, url, options, internalOptions);
|
||||
try {
|
||||
|
@ -31,7 +31,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
"test:scrape-basic",
|
||||
"https://www.roastmywebsite.ai/",
|
||||
scrapeOptions.parse({}),
|
||||
{ forceEngine },
|
||||
{ forceEngine, teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -78,7 +78,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
scrapeOptions.parse({
|
||||
formats: ["markdown", "html"],
|
||||
}),
|
||||
{ forceEngine },
|
||||
{ forceEngine, teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -102,7 +102,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
scrapeOptions.parse({
|
||||
onlyMainContent: false,
|
||||
}),
|
||||
{ forceEngine },
|
||||
{ forceEngine, teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -125,7 +125,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
onlyMainContent: false,
|
||||
excludeTags: [".nav", "#footer", "strong"],
|
||||
}),
|
||||
{ forceEngine },
|
||||
{ forceEngine, teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -145,7 +145,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
"test:scrape-400",
|
||||
"https://httpstat.us/400",
|
||||
scrapeOptions.parse({}),
|
||||
{ forceEngine },
|
||||
{ forceEngine, teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -163,7 +163,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
"test:scrape-401",
|
||||
"https://httpstat.us/401",
|
||||
scrapeOptions.parse({}),
|
||||
{ forceEngine },
|
||||
{ forceEngine, teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -181,7 +181,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
"test:scrape-403",
|
||||
"https://httpstat.us/403",
|
||||
scrapeOptions.parse({}),
|
||||
{ forceEngine },
|
||||
{ forceEngine, teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -199,7 +199,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
"test:scrape-404",
|
||||
"https://httpstat.us/404",
|
||||
scrapeOptions.parse({}),
|
||||
{ forceEngine },
|
||||
{ forceEngine, teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -217,7 +217,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
"test:scrape-405",
|
||||
"https://httpstat.us/405",
|
||||
scrapeOptions.parse({}),
|
||||
{ forceEngine },
|
||||
{ forceEngine, teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -235,7 +235,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
"test:scrape-500",
|
||||
"https://httpstat.us/500",
|
||||
scrapeOptions.parse({}),
|
||||
{ forceEngine },
|
||||
{ forceEngine, teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -253,7 +253,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
"test:scrape-redirect",
|
||||
"https://scrapethissite.com/",
|
||||
scrapeOptions.parse({}),
|
||||
{ forceEngine },
|
||||
{ forceEngine, teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -285,7 +285,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
scrapeOptions.parse({
|
||||
formats: ["screenshot"],
|
||||
}),
|
||||
{ forceEngine },
|
||||
{ forceEngine, teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -313,7 +313,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
scrapeOptions.parse({
|
||||
formats: ["screenshot@fullPage"],
|
||||
}),
|
||||
{ forceEngine },
|
||||
{ forceEngine, teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -341,6 +341,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
"test:scrape-pdf",
|
||||
"https://arxiv.org/pdf/astro-ph/9301001.pdf",
|
||||
scrapeOptions.parse({}),
|
||||
{ teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -359,6 +360,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
"test:scrape-docx",
|
||||
"https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx",
|
||||
scrapeOptions.parse({}),
|
||||
{ teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -395,6 +397,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
},
|
||||
},
|
||||
}),
|
||||
{ teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -430,6 +433,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
},
|
||||
},
|
||||
}),
|
||||
{ teamId: "test" },
|
||||
);
|
||||
|
||||
// expect(out.logs.length).toBeGreaterThan(0);
|
||||
@ -451,7 +455,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||
async (i) => {
|
||||
const url = "https://www.scrapethissite.com/?i=" + i;
|
||||
const id = "test:concurrent:" + url;
|
||||
const out = await scrapeURL(id, url, scrapeOptions.parse({}));
|
||||
const out = await scrapeURL(id, url, scrapeOptions.parse({}), { teamId: "test" });
|
||||
|
||||
const replacer = (key: string, value: any) => {
|
||||
if (value instanceof Error) {
|
||||
|
39
apps/api/src/scraper/scrapeURL/transformers/diff.ts
Normal file
39
apps/api/src/scraper/scrapeURL/transformers/diff.ts
Normal file
@ -0,0 +1,39 @@
|
||||
import { supabase_rr_service } from "../../../services/supabase";
|
||||
import { Document } from "../../../controllers/v1/types";
|
||||
import { Meta } from "../index";
|
||||
|
||||
/**
 * Transformer that attaches change-tracking information to a scraped document
 * when the "diff" format was requested.
 *
 * Looks up the most recent row in `firecrawl_jobs` for the same team and URL
 * and compares its first document's markdown with the current markdown,
 * ignoring all whitespace.
 *
 * Outcomes:
 * - no previous row: `diff.changeStatus` is "new" and `previousScrapeAt` is null;
 * - previous row found: `diff.changeStatus` is "same" or "changed";
 * - lookup error, or previous row has no usable markdown: `document.diff` is
 *   left unset and a warning is prepended to `document.warning`.
 *
 * This function only ever sets `visibility` to "visible" and never produces
 * the "removed" status.
 *
 * @param meta - Scrape context; reads `options.formats`, `internalOptions.teamId`,
 *   `url` and `logger`.
 * @param document - Document being transformed; mutated in place.
 * @returns The same `document` instance.
 */
export async function deriveDiff(meta: Meta, document: Document): Promise<Document> {
  if (!meta.options.formats.includes("diff")) {
    return document;
  }

  // Prepends the failure notice while keeping any warning that was already set.
  const addDiffFailedWarning = (): void => {
    document.warning =
      "Diffing failed, please try again later." +
      (document.warning ? ` ${document.warning}` : "");
  };

  // Whitespace-only differences are not treated as a change.
  const stripWhitespace = (text: string): string => text.replace(/\s+/g, "");

  const { data, error } = await supabase_rr_service
    .from("firecrawl_jobs")
    .select()
    .eq("team_id", meta.internalOptions.teamId)
    .eq("url", document.metadata.url ?? document.metadata.sourceURL ?? meta.url)
    // NOTE(review): `->>` yields text, so a containment filter on it may not
    // behave as intended -- confirm against the column type of `page_options`.
    .contains("page_options->>'formats'", "markdown")
    .order("date_added", { ascending: false })
    .limit(1)
    // maybeSingle() yields `data: null` without an error when no row matches.
    // single() reports "no rows" as an error, which made the "new" outcome
    // below unreachable.
    .maybeSingle();

  if (error) {
    meta.logger.error("Error fetching previous scrape", { error });
    addDiffFailedWarning();
    return document;
  }

  if (!data) {
    // No earlier scrape of this URL for this team.
    document.diff = {
      previousScrapeAt: null,
      changeStatus: "new",
      visibility: "visible",
    };
    return document;
  }

  // Guard the stored shape: a row with no docs or no markdown must not crash
  // the whole scrape, it just means no comparison can be made.
  const previousMarkdown: unknown = data.docs?.[0]?.markdown;
  if (typeof previousMarkdown !== "string") {
    meta.logger.warn("Previous scrape has no markdown to diff against", {
      previousScrapeAt: data.date_added,
    });
    addDiffFailedWarning();
    return document;
  }

  // The request schema requires "markdown" alongside "diff", so this is
  // normally set; default to empty rather than asserting non-null.
  const currentMarkdown = document.markdown ?? "";

  document.diff = {
    previousScrapeAt: data.date_added,
    changeStatus:
      stripWhitespace(previousMarkdown) === stripWhitespace(currentMarkdown)
        ? "same"
        : "changed",
    visibility: "visible",
  };

  return document;
}
|
@ -8,7 +8,7 @@ import { performLLMExtract } from "./llmExtract";
|
||||
import { uploadScreenshot } from "./uploadScreenshot";
|
||||
import { removeBase64Images } from "./removeBase64Images";
|
||||
import { saveToCache } from "./cache";
|
||||
|
||||
import { deriveDiff } from "./diff";
|
||||
export type Transformer = (
|
||||
meta: Meta,
|
||||
document: Document,
|
||||
@ -148,6 +148,17 @@ export function coerceFieldsToFormats(
|
||||
);
|
||||
}
|
||||
|
||||
if (!formats.has("diff") && document.diff !== undefined) {
|
||||
meta.logger.warn(
|
||||
"Removed diff from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.",
|
||||
);
|
||||
delete document.diff;
|
||||
} else if (formats.has("diff") && document.diff === undefined) {
|
||||
meta.logger.warn(
|
||||
"Request had format diff, but there was no diff field in the result.",
|
||||
);
|
||||
}
|
||||
|
||||
if (meta.options.actions === undefined || meta.options.actions.length === 0) {
|
||||
delete document.actions;
|
||||
}
|
||||
@ -164,6 +175,7 @@ export const transformerStack: Transformer[] = [
|
||||
deriveMetadataFromRawHTML,
|
||||
uploadScreenshot,
|
||||
performLLMExtract,
|
||||
deriveDiff,
|
||||
coerceFieldsToFormats,
|
||||
removeBase64Images,
|
||||
];
|
||||
|
Loading…
x
Reference in New Issue
Block a user