mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-04-18 12:09:42 +08:00)

* agent that decides if splits schema or not
* split and merge properties done
* wip
* wip
* changes
* ch
* array merge working!
* comment
* wip
* dereferentiate schema
* dereference schemas
* Nick: new re-ranker
* Create llm-links.txt
* Nick: format
* Update extraction-service.ts
* wip: cooking schema mix and spread functions
* wip
* wip getting there!!!
* nick:
* moved functions to helpers
* nick:
* cant reproduce the error anymore
* error handling all scrapes failed
* fix
* Nick: added the sitemap index
* Update sitemap-index.ts
* Update map.ts
* deduplicate and merge arrays
* added error handler for object transformations
* Update url-processor.ts
* Nick:
* Nick: fixes
* Nick: big improvements to rerank of multi-entity
* Nick: working
* Update reranker.ts
* fixed transformations for nested objs
* fix merge nulls
* Nick: fixed error piping
* Update queue-worker.ts
* Update extraction-service.ts
* Nick: format
* Update queue-worker.ts
* Update pnpm-lock.yaml
* Update queue-worker.ts

---------

Co-authored-by: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Co-authored-by: Thomas Kosmas <thomas510111@gmail.com>
81 lines · 2.6 KiB · TypeScript
// const id = crypto.randomUUID();

// const sc: StoredCrawl = {
//   originUrl: request.urls[0].replace("/*",""),
//   crawlerOptions: toLegacyCrawlerOptions({
//     maxDepth: 15,
//     limit: 5000,
//     includePaths: [],
//     excludePaths: [],
//     ignoreSitemap: false,
//     allowExternalLinks: false,
//     allowBackwardLinks: true,
//     allowSubdomains: false,
//     ignoreRobotsTxt: false,
//     deduplicateSimilarURLs: false,
//     ignoreQueryParameters: false
//   }),
//   scrapeOptions: {
//     formats: ["markdown"],
//     onlyMainContent: true,
//     waitFor: 0,
//     mobile: false,
//     removeBase64Images: true,
//     fastMode: false,
//     parsePDF: true,
//     skipTlsVerification: false,
//   },
//   internalOptions: {
//     disableSmartWaitCache: true,
//     isBackgroundIndex: true
//   },
//   team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
//   createdAt: Date.now(),
//   plan: "hobby", // make it a low concurrency
// };

// // Save the crawl configuration
// await saveCrawl(id, sc);

// // Then kick off the job
// await _addScrapeJobToBullMQ({
//   url: request.urls[0].replace("/*",""),
//   mode: "kickoff" as const,
//   team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
//   plan: "hobby", // make it a low concurrency
//   crawlerOptions: sc.crawlerOptions,
//   scrapeOptions: sc.scrapeOptions,
//   internalOptions: sc.internalOptions,
//   origin: "index",
//   crawl_id: id,
//   webhook: null,
//   v1: true,
// }, {}, crypto.randomUUID(), 50);
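
The commented-out block above sketches how a background-index crawl would be configured (a StoredCrawl built with toLegacyCrawlerOptions), saved, and then enqueued through the internal _addScrapeJobToBullMQ helper. As a rough illustration of the enqueue step only, here is a minimal BullMQ sketch, assuming that helper ultimately wraps Queue.add and that the trailing 50 is a job priority; the queue name, Redis connection, and kickoffBackgroundIndexCrawl helper are hypothetical, not firecrawl's actual wiring.

import { Queue } from "bullmq";
import crypto from "node:crypto";

// Hypothetical queue name and Redis connection; firecrawl's real queue setup may differ.
const scrapeQueue = new Queue("scrapeQueue", {
  connection: { host: "localhost", port: 6379 },
});

async function kickoffBackgroundIndexCrawl(originUrl: string, crawlId: string) {
  // Payload shape mirrors the commented-out call above; "kickoff" mode and
  // origin: "index" come from the snippet, the rest is illustrative.
  await scrapeQueue.add(
    "kickoff",
    {
      url: originUrl.replace("/*", ""),
      mode: "kickoff" as const,
      origin: "index",
      crawl_id: crawlId,
      webhook: null,
      v1: true,
    },
    {
      jobId: crypto.randomUUID(),
      priority: 50, // assuming the snippet's trailing 50 is a BullMQ priority
    },
  );
}
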
// we restructure and make all of the arrays we need to fill into objects,
// adding them to a single object so the llm can fill them one at a time
// TODO: make this work for more complex schemas where arrays are not first level

// let schemasForLLM: {} = {};
// for (const key in largeArraysSchema) {
//   const originalSchema = structuredClone(largeArraysSchema[key].items);
//   console.log(
//     "key",
//     key,
//     "\noriginalSchema",
//     JSON.stringify(largeArraysSchema[key], null, 2),
//   );
//   let clonedObj = {
//     type: "object",
//     properties: {
//       informationFilled: {
//         type: "boolean",
//       },
//       data: {
//         type: "object",
//         properties: originalSchema.properties,
//       },
//     },
//   };
//   schemasForLLM[key] = clonedObj;
// }
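
The loop above wraps each first-level array in the extraction schema into an object carrying an informationFilled flag plus a data object with the original item properties, so the LLM can fill one array at a time. Below is a self-contained sketch of that transformation; the helper name wrapArraySchemasForLLM and the sample schema are hypothetical, and only the output shape follows the snippet.

type JsonSchema = Record<string, any>;

function wrapArraySchemasForLLM(largeArraysSchema: JsonSchema): JsonSchema {
  const schemasForLLM: JsonSchema = {};
  for (const key in largeArraysSchema) {
    // Take the item schema of each top-level array property...
    const originalSchema = structuredClone(largeArraysSchema[key].items);
    // ...and wrap it so the LLM can report whether it filled anything in.
    schemasForLLM[key] = {
      type: "object",
      properties: {
        informationFilled: { type: "boolean" },
        data: { type: "object", properties: originalSchema.properties },
      },
    };
  }
  return schemasForLLM;
}

// Example: a schema whose only array sits at the first level.
const sample: JsonSchema = {
  products: {
    type: "array",
    items: {
      type: "object",
      properties: { name: { type: "string" }, price: { type: "number" } },
    },
  },
};

console.log(JSON.stringify(wrapArraySchemasForLLM(sample), null, 2));
// => { "products": { "type": "object", "properties": { "informationFilled": ..., "data": ... } } }

The informationFilled boolean presumably lets the caller detect an empty extraction without inspecting the data object, which fits the snippet's goal of filling large arrays one entity at a time.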