firecrawl/apps/api/src/lib/extract/archive/crawling-index.ts

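// Archived, all commented out: this block kicked off a background-index crawl
// for the first requested URL (with any trailing "/*" stripped), saved the
// crawl configuration with saveCrawl, and enqueued a "kickoff" job on BullMQ
// via _addScrapeJobToBullMQ under the background-index team with the
// low-concurrency "hobby" plan.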
// const id = crypto.randomUUID();
// const sc: StoredCrawl = {
//   originUrl: request.urls[0].replace("/*",""),
//   crawlerOptions: toLegacyCrawlerOptions({
//     maxDepth: 15,
//     limit: 5000,
//     includePaths: [],
//     excludePaths: [],
//     ignoreSitemap: false,
//     allowExternalLinks: false,
//     allowBackwardLinks: true,
//     allowSubdomains: false,
//     ignoreRobotsTxt: false,
//     deduplicateSimilarURLs: false,
//     ignoreQueryParameters: false
//   }),
//   scrapeOptions: {
//     formats: ["markdown"],
//     onlyMainContent: true,
//     waitFor: 0,
//     mobile: false,
//     removeBase64Images: true,
//     fastMode: false,
//     parsePDF: true,
//     skipTlsVerification: false,
//   },
//   internalOptions: {
//     disableSmartWaitCache: true,
//     isBackgroundIndex: true
//   },
//   team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
//   createdAt: Date.now(),
//   plan: "hobby", // make it a low concurrency
// };
// // Save the crawl configuration
// await saveCrawl(id, sc);
// // Then kick off the job
// await _addScrapeJobToBullMQ({
//   url: request.urls[0].replace("/*",""),
//   mode: "kickoff" as const,
//   team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
//   plan: "hobby", // make it a low concurrency
//   crawlerOptions: sc.crawlerOptions,
//   scrapeOptions: sc.scrapeOptions,
//   internalOptions: sc.internalOptions,
//   origin: "index",
//   crawl_id: id,
//   webhook: null,
//   v1: true,
// }, {}, crypto.randomUUID(), 50);

// We restructure the schema: every top-level array that needs to be filled is
// turned into an object, and all of them are collected into a single object so
// the LLM can fill them one at a time.
// TODO: make this work for more complex schemas where the arrays are not at the top level
// let schemasForLLM: {} = {};
// for (const key in largeArraysSchema) {
//   const originalSchema = structuredClone(largeArraysSchema[key].items);
//   console.log(
//     "key",
//     key,
//     "\noriginalSchema",
//     JSON.stringify(largeArraysSchema[key], null, 2),
//   );
//   let clonedObj = {
//     type: "object",
//     properties: {
//       informationFilled: {
//         type: "boolean",
//       },
//       data: {
//         type: "object",
//         properties: originalSchema.properties,
//       },
//     },
//   };
//   schemasForLLM[key] = clonedObj;
// }
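
// Illustrative example (not part of the archived code; "products" and its
// fields are hypothetical): given a top-level array property in
// largeArraysSchema such as
//
//   products: {
//     type: "array",
//     items: {
//       type: "object",
//       properties: { name: { type: "string" }, price: { type: "number" } },
//     },
//   }
//
// the loop above would produce the following entry in schemasForLLM, so the
// LLM fills a single object (plus an informationFilled flag) rather than the
// whole array at once:
//
//   products: {
//     type: "object",
//     properties: {
//       informationFilled: { type: "boolean" },
//       data: {
//         type: "object",
//         properties: { name: { type: "string" }, price: { type: "number" } },
//       },
//     },
//   }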