mirror of
https://git-proxy.hk.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-04-10 13:52:42 +08:00
(feat/extract) New re-ranker + multi entity extraction (#1061)
* agent that decides if splits schema or not * split and merge properties done * wip * wip * changes * ch * array merge working! * comment * wip * dereferentiate schema * dereference schemas * Nick: new re-ranker * Create llm-links.txt * Nick: format * Update extraction-service.ts * wip: cooking schema mix and spread functions * wip * wip getting there!!! * nick: * moved functions to helpers * nick: * cant reproduce the error anymore * error handling all scrapes failed * fix * Nick: added the sitemap index * Update sitemap-index.ts * Update map.ts * deduplicate and merge arrays * added error handler for object transformations * Update url-processor.ts * Nick: * Nick: fixes * Nick: big improvements to rerank of multi-entity * Nick: working * Update reranker.ts * fixed transformations for nested objs * fix merge nulls * Nick: fixed error piping * Update queue-worker.ts * Update extraction-service.ts * Nick: format * Update queue-worker.ts * Update pnpm-lock.yaml * Update queue-worker.ts --------- Co-authored-by: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Co-authored-by: Thomas Kosmas <thomas510111@gmail.com>
This commit is contained in:
parent
5c62bb1195
commit
5e5b5ee0e2
2
.gitignore
vendored
2
.gitignore
vendored
@ -38,3 +38,5 @@ apps/js-sdk/firecrawl/dist
|
||||
/apps/api/debug/*
|
||||
|
||||
.vscode
|
||||
llm-links.txt
|
||||
mapped-links.txt
|
@ -36,6 +36,7 @@
|
||||
"@types/escape-html": "^1.0.4",
|
||||
"@types/express": "^4.17.17",
|
||||
"@types/jest": "^29.5.12",
|
||||
"@types/lodash": "^4.17.14",
|
||||
"@types/node": "^20.14.1",
|
||||
"@types/pdf-parse": "^1.1.4",
|
||||
"@types/supertest": "^6.0.2",
|
||||
@ -54,6 +55,7 @@
|
||||
},
|
||||
"dependencies": {
|
||||
"@anthropic-ai/sdk": "^0.24.3",
|
||||
"@apidevtools/json-schema-ref-parser": "^11.7.3",
|
||||
"@brillout/import": "^0.2.2",
|
||||
"@bull-board/api": "^5.20.5",
|
||||
"@bull-board/express": "^5.20.5",
|
||||
@ -96,6 +98,7 @@
|
||||
"koffi": "^2.9.0",
|
||||
"langchain": "^0.2.8",
|
||||
"languagedetect": "^2.0.0",
|
||||
"lodash": "^4.17.21",
|
||||
"logsnag": "^1.0.0",
|
||||
"luxon": "^3.4.3",
|
||||
"marked": "^14.1.2",
|
||||
|
42
apps/api/pnpm-lock.yaml
generated
42
apps/api/pnpm-lock.yaml
generated
@ -11,6 +11,9 @@ importers:
|
||||
'@anthropic-ai/sdk':
|
||||
specifier: ^0.24.3
|
||||
version: 0.24.3(encoding@0.1.13)
|
||||
'@apidevtools/json-schema-ref-parser':
|
||||
specifier: ^11.7.3
|
||||
version: 11.7.3
|
||||
'@brillout/import':
|
||||
specifier: ^0.2.2
|
||||
version: 0.2.3
|
||||
@ -137,6 +140,9 @@ importers:
|
||||
languagedetect:
|
||||
specifier: ^2.0.0
|
||||
version: 2.0.0
|
||||
lodash:
|
||||
specifier: ^4.17.21
|
||||
version: 4.17.21
|
||||
logsnag:
|
||||
specifier: ^1.0.0
|
||||
version: 1.0.0(encoding@0.1.13)
|
||||
@ -261,6 +267,9 @@ importers:
|
||||
'@types/jest':
|
||||
specifier: ^29.5.12
|
||||
version: 29.5.12
|
||||
'@types/lodash':
|
||||
specifier: ^4.17.14
|
||||
version: 4.17.14
|
||||
'@types/node':
|
||||
specifier: ^20.14.1
|
||||
version: 20.14.1
|
||||
@ -316,6 +325,10 @@ packages:
|
||||
'@anthropic-ai/sdk@0.24.3':
|
||||
resolution: {integrity: sha512-916wJXO6T6k8R6BAAcLhLPv/pnLGy7YSEBZXZ1XTFbLcTZE8oTy3oDW9WJf9KKZwMvVcePIfoTSvzXHRcGxkQQ==}
|
||||
|
||||
'@apidevtools/json-schema-ref-parser@11.7.3':
|
||||
resolution: {integrity: sha512-WApSdLdXEBb/1FUPca2lteASewEfpjEYJ8oXZP+0gExK5qSfsEKBKcA+WjY6Q4wvXwyv0+W6Kvc372pSceib9w==}
|
||||
engines: {node: '>= 16'}
|
||||
|
||||
'@aws-crypto/crc32@3.0.0':
|
||||
resolution: {integrity: sha512-IzSgsrxUcsrejQbPVilIKy16kAT52EwB6zSaI+M3xxIhKh5+aldEyvI+z6erM7TCLB2BJsFrtHjp6/4/sr+3dA==}
|
||||
|
||||
@ -791,6 +804,9 @@ packages:
|
||||
'@jridgewell/trace-mapping@0.3.9':
|
||||
resolution: {integrity: sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==}
|
||||
|
||||
'@jsdevtools/ono@7.1.3':
|
||||
resolution: {integrity: sha512-4JQNk+3mVzK3xh2rqd6RB4J46qUR19azEHBneZyTZM+c456qOrbbM/5xcR8huNCCcbVt7+UmizG6GuUvPvKUYg==}
|
||||
|
||||
'@langchain/core@0.2.12':
|
||||
resolution: {integrity: sha512-zaKvUcWU1Cxcpd/fxklygY6iUrxls10KTRzyHZGBAIKJq1JD/B10vX59YlFgBs7nqqVTEvaChfIE0O0e2qBttA==}
|
||||
engines: {node: '>=18'}
|
||||
@ -1555,6 +1571,12 @@ packages:
|
||||
'@types/jest@29.5.12':
|
||||
resolution: {integrity: sha512-eDC8bTvT/QhYdxJAulQikueigY5AsdBRH2yDKW3yveW7svY3+DzN84/2NUgkw10RTiJbWqZrTtoGVdYlvFJdLw==}
|
||||
|
||||
'@types/json-schema@7.0.15':
|
||||
resolution: {integrity: sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==}
|
||||
|
||||
'@types/lodash@4.17.14':
|
||||
resolution: {integrity: sha512-jsxagdikDiDBeIRaPYtArcT8my4tN1og7MtMRquFT3XNA6axxyHDRUemqDz/taRDdOUn0GnGHRCuff4q48sW9A==}
|
||||
|
||||
'@types/methods@1.1.4':
|
||||
resolution: {integrity: sha512-ymXWVrDiCxTBE3+RIrrP533E70eA+9qu7zdWoHuOmGujkYtzf4HQF96b8nwHLqhuf4ykX61IGRIB38CC6/sImQ==}
|
||||
|
||||
@ -4361,8 +4383,8 @@ packages:
|
||||
engines: {node: '>=14.17'}
|
||||
hasBin: true
|
||||
|
||||
typescript@5.7.2:
|
||||
resolution: {integrity: sha512-i5t66RHxDvVN40HfDd1PsEThGNnlMCMT3jMUuoh9/0TaqWevNontacunWyN02LA9/fIbEWlcHZcgTKb9QoaLfg==}
|
||||
typescript@5.7.3:
|
||||
resolution: {integrity: sha512-84MVSjMEHP+FQRPy3pX9sTVV/INIex71s9TL2Gm5FG/WG1SqXeKyZ0k7/blY/4FdOzI12CBy1vGc4og/eus0fw==}
|
||||
engines: {node: '>=14.17'}
|
||||
hasBin: true
|
||||
|
||||
@ -4637,6 +4659,12 @@ snapshots:
|
||||
transitivePeerDependencies:
|
||||
- encoding
|
||||
|
||||
'@apidevtools/json-schema-ref-parser@11.7.3':
|
||||
dependencies:
|
||||
'@jsdevtools/ono': 7.1.3
|
||||
'@types/json-schema': 7.0.15
|
||||
js-yaml: 4.1.0
|
||||
|
||||
'@aws-crypto/crc32@3.0.0':
|
||||
dependencies:
|
||||
'@aws-crypto/util': 3.0.0
|
||||
@ -5602,6 +5630,8 @@ snapshots:
|
||||
'@jridgewell/resolve-uri': 3.1.2
|
||||
'@jridgewell/sourcemap-codec': 1.4.15
|
||||
|
||||
'@jsdevtools/ono@7.1.3': {}
|
||||
|
||||
'@langchain/core@0.2.12(langchain@0.2.8(@aws-sdk/credential-provider-node@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0))(@aws-sdk/client-sts@3.679.0))(@pinecone-database/pinecone@4.0.0)(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(encoding@0.1.13)(fast-xml-parser@4.4.1)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(encoding@0.1.13)(zod@3.23.8))':
|
||||
dependencies:
|
||||
ansi-styles: 5.2.0
|
||||
@ -6630,6 +6660,10 @@ snapshots:
|
||||
expect: 29.7.0
|
||||
pretty-format: 29.7.0
|
||||
|
||||
'@types/json-schema@7.0.15': {}
|
||||
|
||||
'@types/lodash@4.17.14': {}
|
||||
|
||||
'@types/methods@1.1.4': {}
|
||||
|
||||
'@types/mime@1.3.5': {}
|
||||
@ -9071,7 +9105,7 @@ snapshots:
|
||||
csv-parse: 5.5.6
|
||||
gpt3-tokenizer: 1.1.5
|
||||
openai: 3.3.0
|
||||
typescript: 5.7.2
|
||||
typescript: 5.7.3
|
||||
uuid: 9.0.1
|
||||
zod: 3.23.8
|
||||
transitivePeerDependencies:
|
||||
@ -9655,7 +9689,7 @@ snapshots:
|
||||
|
||||
typescript@5.4.5: {}
|
||||
|
||||
typescript@5.7.2: {}
|
||||
typescript@5.7.3: {}
|
||||
|
||||
typesense@1.8.2(@babel/runtime@7.24.6):
|
||||
dependencies:
|
||||
|
@ -9,17 +9,19 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||
content-type: application/json
|
||||
|
||||
{
|
||||
"url":"firecrawl.dev"
|
||||
"url":"https://opencorporates.com/companies/us_tn/001260776/"
|
||||
}
|
||||
|
||||
### http://ibtikar.net.sa is buggy (redirect)
|
||||
### https://bansko.bg/bg -> webhooks
|
||||
### Crawl Website
|
||||
# @name crawl
|
||||
POST {{baseUrl}}/v1/crawl HTTP/1.1
|
||||
POST {{baseUrl}}/v1/map HTTP/1.1
|
||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||
content-type: application/json
|
||||
|
||||
{
|
||||
"url": "firecrawl.dev"
|
||||
{
|
||||
"url": "https://emelitastes.lausd.org"
|
||||
}
|
||||
|
||||
### Check Crawl Status
|
||||
@ -60,6 +62,35 @@ content-type: application/json
|
||||
"sitemapOnly": true
|
||||
}
|
||||
|
||||
###
|
||||
POST {{baseUrl}}/v1/extract HTTP/1.1
|
||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||
content-type: application/json
|
||||
|
||||
{
|
||||
"urls": [
|
||||
"justcall.io/*"
|
||||
],
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": { "type": "string" },
|
||||
"description": { "type": "string" },
|
||||
"pricing": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"plan": { "type": "string" },
|
||||
"description": { "type": "string" },
|
||||
"price": { "type": "string" },
|
||||
"currency": { "type": "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"prompt": "Extract the title and description from the website",
|
||||
"urlTrace": true
|
||||
}
|
||||
|
||||
### Extract
|
||||
# @name extract
|
||||
POST {{baseUrl}}/v1/extract HTTP/1.1
|
||||
@ -67,11 +98,414 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||
content-type: application/json
|
||||
|
||||
{
|
||||
"urls": ["firecrawl.dev"],
|
||||
"prompt": "What is the title, description and main product of the page?",
|
||||
"urls": [
|
||||
"https://benscreeknursery.com/*",
|
||||
"https://www.bbb.org/us/nc/littleton/profile/landscape-contractors/bens-creek-nursery-0593-90077956",
|
||||
"https://benscreeknursery.com/testimonial/",
|
||||
"https://www.dnb.com/business-directory/company-profiles.bens_creek_nursery.af816431450a54ace44050c5dbc4cb30.html"
|
||||
],
|
||||
"prompt": "Extract the following information from the website: business overview, management team, customer feedback, operational details, reputation, financial information, online presence, additional notes",
|
||||
"schema": {
|
||||
"title": { "type": "string" },
|
||||
"description": { "type": "string" },
|
||||
"mainProduct": { "type": "string" }
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"business_overview": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": { "type": "string" },
|
||||
"location": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"address": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"years_in_operation": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"value": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"services_offered": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"list": {
|
||||
"type": "array",
|
||||
"items": { "type": "string" }
|
||||
},
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"business_structure": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"type": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"licensing": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"agency": { "type": "string" },
|
||||
"license_number": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"management_team": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": { "type": "string" },
|
||||
"role": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
}
|
||||
},
|
||||
"customer_feedback": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"birdeye_reviews": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"rating": { "type": "number" },
|
||||
"total_reviews": { "type": "integer" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"testimonials": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"author": { "type": "string" },
|
||||
"content": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"operational_details": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"office_hours": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"monday_to_friday": { "type": "string" },
|
||||
"saturday": { "type": "string" },
|
||||
"sunday": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"contact_information": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"phone": { "type": "string" },
|
||||
"email": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"reputation": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"bbb_rating": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"value": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"customer_complaints": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"value": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"financial_information": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"dun_and_bradstreet_profile": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"financial_statements": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"online_presence": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"website": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"social_media": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"platform": { "type": "string" },
|
||||
"details": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"additional_notes": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"note": { "type": "string" },
|
||||
"source": { "type": "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
###
|
||||
### Batch Scrape Websites
|
||||
# @name batchScrape
|
||||
POST {{baseUrl}}/v1/batch/scrape HTTP/1.1
|
||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||
content-type: application/json
|
||||
|
||||
{
|
||||
"urls":[
|
||||
"https://mommypoppins.com/new-york-city-kids/event/events/gobble-gobble-give-at-the-apollo",
|
||||
"https://mommypoppins.com/new-york-city-kids/event/events/sold-out-an-irish-halloweenoiche-shamhna-celtic-magic-at-the-irish",
|
||||
"https://mommypoppins.com/new-york-city-kids/event/events/gingerbread-decorating-at-queens-county-farm-museum",
|
||||
"https://mommypoppins.com/new-york-city-kids/event/events/the-yorkville-nutcracker-at-john-jay-college",
|
||||
"https://mommypoppins.com/new-york-city-kids/event/events/day-of-the-dead-2024-at-the-museum-of-the-moving-image",
|
||||
"https://mommypoppins.com/new-york-city-kids/event/events/watson-adventures-ghosts-of-greenwich-village-scavenger-hunt-for-1",
|
||||
"https://mommypoppins.com/new-york-city-kids/event/events/luna-luna-forgotten-fantasy-at-the-shed",
|
||||
"https://mommypoppins.com/new-york-city-kids/event/events/first-friday-art-music-dancing-at-the-bronx-museum",
|
||||
|
||||
],
|
||||
"extract":{
|
||||
"prompt":"Use the schema for extracting from the main event in the page.\nDates should be extracted in YYYY-MM-DD format.\nExtracted times have to be separated by date of week when explicit in the event description, otherwise load all 7 days of the week. Times should be in the HH:MM format. Day sub-property is 0-6 for Sunday-Saturday\nIf event is free set price description to \"FREE\" and minmax both to 0. Otherwise description is the full text price cohort\nSome data could be better explained in the description of the event, extract as necessary.\n",
|
||||
"schema":{
|
||||
"type":"object",
|
||||
"$defs":{
|
||||
"MinMax":{
|
||||
"type":"object",
|
||||
"title":"MinMax",
|
||||
"required":["min","max"],
|
||||
"properties":{
|
||||
"max":{
|
||||
"type":"number",
|
||||
"title":"Max"
|
||||
},
|
||||
"min":{
|
||||
"type":"number",
|
||||
"title":"Min"
|
||||
}
|
||||
}
|
||||
},
|
||||
"AgeRangeModel":{
|
||||
"type":"object",
|
||||
"title":"AgeRangeModel",
|
||||
"required":["range","description"],
|
||||
"properties":{
|
||||
"range":{
|
||||
"$ref":"#/$defs/MinMax"
|
||||
},
|
||||
"description":{
|
||||
"type":"string",
|
||||
"title":"Description"
|
||||
}
|
||||
}
|
||||
},
|
||||
"LocationModel":{
|
||||
"type":"object",
|
||||
"title":"LocationModel",
|
||||
"required":["name","address","phone"],
|
||||
"properties":{
|
||||
"name":{
|
||||
"type":"string",
|
||||
"title":"Name"
|
||||
},
|
||||
"phone":{
|
||||
"type":"string",
|
||||
"title":"Phone"
|
||||
},
|
||||
"address":{
|
||||
"type":"string",
|
||||
"title":"Address"
|
||||
}
|
||||
}
|
||||
},
|
||||
"TimeSlotModel":{
|
||||
"type":"object",
|
||||
"title":"TimeSlotModel",
|
||||
"required":["day","start","end"],
|
||||
"properties":{
|
||||
"day":{
|
||||
"type":"integer",
|
||||
"title":"Day"
|
||||
},
|
||||
"end":{
|
||||
"type":"string",
|
||||
"title":"End"
|
||||
},
|
||||
"start":{
|
||||
"type":"string",
|
||||
"title":"Start"
|
||||
}
|
||||
}
|
||||
},
|
||||
"DateRangeModel":{
|
||||
"type":"object",
|
||||
"title":"DateRangeModel",
|
||||
"required":["start","end"],
|
||||
"properties":{
|
||||
"end":{
|
||||
"type":"string",
|
||||
"title":"End"
|
||||
},
|
||||
"start":{
|
||||
"type":"string",
|
||||
"title":"Start"
|
||||
}
|
||||
}
|
||||
},
|
||||
"PriceInfoModel":{
|
||||
"type":"object",
|
||||
"title":"PriceInfoModel",
|
||||
"required":["range","bracket"],
|
||||
"properties":{
|
||||
"range":{
|
||||
"$ref":"#/$defs/MinMax"
|
||||
},
|
||||
"bracket":{
|
||||
"type":"string",
|
||||
"title":"Bracket"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"title":"MommyPoppingsExtractionSchema",
|
||||
"required":["name","date","time","description","website_url","age_range","price","location"],
|
||||
"properties":{
|
||||
"date":{
|
||||
"$ref":"#/$defs/DateRangeModel"
|
||||
},
|
||||
"name":{
|
||||
"type":"string",
|
||||
"title":"Name"
|
||||
},
|
||||
"time":{
|
||||
"type":"array",
|
||||
"items":{
|
||||
"$ref":"#/$defs/TimeSlotModel"
|
||||
},
|
||||
"title":"Time"
|
||||
},
|
||||
"price":{
|
||||
"$ref":"#/$defs/PriceInfoModel"
|
||||
},
|
||||
"location":{
|
||||
"$ref":"#/$defs/LocationModel"
|
||||
},
|
||||
"age_range":{
|
||||
"$ref":"#/$defs/AgeRangeModel"
|
||||
},
|
||||
"description":{
|
||||
"type":"string",
|
||||
"title":"Description"
|
||||
},
|
||||
"website_url":{
|
||||
"type":"string",
|
||||
"title":"Website Url"
|
||||
}
|
||||
}
|
||||
},
|
||||
"systemPrompt":"Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required."
|
||||
},
|
||||
"formats":["extract"]
|
||||
}
|
||||
|
||||
### Check Batch Scrape Status
|
||||
@batchScrapeId = {{batchScrape.response.body.$.id}}
|
||||
# @name batchScrapeStatus
|
||||
GET {{baseUrl}}/v1/crawl/4cb890c8-3fbe-4e02-94a4-15aee60446e8?skip=943 HTTP/1.1
|
||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||
|
||||
###
|
||||
# @name extract
|
||||
POST {{baseUrl}}/v1/extract HTTP/1.1
|
||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"urls": [
|
||||
"https://www.naamanp.co.il/catalog/product/view/id/16453/s/11314179/category/167/",
|
||||
"https://www.naamanp.co.il/11290046",
|
||||
"https://www.naamanp.co.il/11314193"
|
||||
],
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string",
|
||||
"title": "Title"
|
||||
},
|
||||
"description": {
|
||||
"type": "string",
|
||||
"title": "Description"
|
||||
},
|
||||
"categories": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"products": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"title": "Name"
|
||||
},
|
||||
"description": {
|
||||
"type": "string",
|
||||
"title": "Description"
|
||||
},
|
||||
"price": {
|
||||
"type": "string",
|
||||
"title": "Price"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"systemPrompt": "Extract all the products from the page. Consider it's a huge store with thousands of products and categories."
|
||||
}
|
||||
|
||||
###
|
||||
@extractId = {{extract.response.body.$.id}}
|
||||
# @name extractStatus
|
||||
GET {{baseUrl}}/v1/extract/{{extractId}} HTTP/1.1
|
||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
@ -31,7 +31,7 @@ export async function extractStatusController(
|
||||
}
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
success: extract.status === "failed" ? false : true,
|
||||
data: data,
|
||||
status: extract.status,
|
||||
error: extract?.error ?? undefined,
|
||||
|
@ -21,6 +21,7 @@ import { logJob } from "../../services/logging/log_job";
|
||||
import { performCosineSimilarity } from "../../lib/map-cosine";
|
||||
import { logger } from "../../lib/logger";
|
||||
import Redis from "ioredis";
|
||||
import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index";
|
||||
|
||||
configDotenv();
|
||||
const redis = new Redis(process.env.REDIS_URL!);
|
||||
@ -144,16 +145,15 @@ export async function getMapResults({
|
||||
);
|
||||
allResults = await Promise.all(pagePromises);
|
||||
|
||||
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
|
||||
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 48 * 60 * 60); // Cache for 48 hours
|
||||
}
|
||||
|
||||
// Parallelize sitemap fetch with serper search
|
||||
const [_, ...searchResults] = await Promise.all([
|
||||
ignoreSitemap
|
||||
? null
|
||||
: crawler.tryGetSitemap((urls) => {
|
||||
links.push(...urls);
|
||||
}, true),
|
||||
// Parallelize sitemap fetch with serper search and sitemap-index
|
||||
const [_, sitemapIndexUrls, ...searchResults] = await Promise.all([
|
||||
ignoreSitemap ? null : crawler.tryGetSitemap(urls => {
|
||||
links.push(...urls);
|
||||
}, true),
|
||||
querySitemapIndex(url),
|
||||
...(cachedResult ? [] : pagePromises),
|
||||
]);
|
||||
|
||||
@ -185,6 +185,9 @@ export async function getMapResults({
|
||||
}
|
||||
}
|
||||
|
||||
// Add sitemap-index URLs
|
||||
links.push(...sitemapIndexUrls);
|
||||
|
||||
// Perform cosine similarity between the search query and the list of links
|
||||
if (search) {
|
||||
const searchQuery = search.toLowerCase();
|
||||
|
226
apps/api/src/lib/__tests__/deduplicate-obs-array.test.ts
Normal file
226
apps/api/src/lib/__tests__/deduplicate-obs-array.test.ts
Normal file
@ -0,0 +1,226 @@
|
||||
import { deduplicateObjectsArray } from "../extract/helpers/deduplicate-objs-array";
|
||||
|
||||
/**
 * Test suite for `deduplicateObjectsArray`.
 *
 * Each case passes an object whose `lawyers` key holds an array of records and
 * checks the awaited result with `toEqual`. Fixtures come from factory
 * functions so that every record (and every nested array) is a fresh object;
 * no two entries share a reference.
 */
describe("deduplicateObjectsArray", () => {
  /** Fresh single-entry practice-areas list. */
  const injuryAreas = () => [{ "area": "Personal Injury" }];

  /** Lawyer record with null e-mail and phone number plus one practice area. */
  const lawyerWithNullContacts = (fullName: string) => ({
    "name": fullName,
    "email": null,
    "title": "Personal Injury Attorney",
    "phone-number": null,
    "practice-areas": injuryAreas(),
  });

  /** Minimal lawyer record (name, e-mail, title) without nested data. */
  const plainSchull = (emailAddress: string) => ({
    "name": "James D. Schull",
    "email": emailAddress,
    "title": "Personal Injury Attorney",
  });

  /** Lawyer record with a null e-mail, no phone key, and one practice area. */
  const schullWithoutPhone = () => ({
    "name": "James D. Schull",
    "email": null,
    "title": "Personal Injury Attorney",
    "practice-areas": injuryAreas(),
  });

  it("should deduplicate the array", async () => {
    // Three structurally identical records should collapse into one.
    const input = {
      "lawyers": Array.from({ length: 3 }, () =>
        lawyerWithNullContacts("James D. Schull"),
      ),
    };
    const wanted = {
      "lawyers": [lawyerWithNullContacts("James D. Schull")],
    };

    const actual = await deduplicateObjectsArray(input);

    expect(actual).toEqual(wanted);
  });

  it("should not deduplicate if not necessary", async () => {
    // Records differ by name, so the result is compared against the input itself.
    const input = {
      "lawyers": [
        lawyerWithNullContacts("James D. Schull"),
        lawyerWithNullContacts("John Doe"),
      ],
    };

    const actual = await deduplicateObjectsArray(input);

    expect(actual).toEqual(input);
  });

  it("should handle an empty array", async () => {
    const input = { "lawyers": [] };
    const wanted = { "lawyers": [] };

    const actual = await deduplicateObjectsArray(input);

    expect(actual).toEqual(wanted);
  });

  it("should handle objects with different properties", async () => {
    // The second record carries an extra "phone-number" key, so both are kept.
    const buildPair = () => [
      plainSchull("james@example.com"),
      { ...plainSchull("james@example.com"), "phone-number": "123-456-7890" },
    ];
    const input = { "lawyers": buildPair() };
    const wanted = { "lawyers": buildPair() };

    const actual = await deduplicateObjectsArray(input);

    expect(actual).toEqual(wanted);
  });

  it("should handle objects with same properties but different values", async () => {
    // Same keys, different e-mail values: both records are expected back.
    const buildPair = () => [
      plainSchull("james1@example.com"),
      plainSchull("james2@example.com"),
    ];
    const input = { "lawyers": buildPair() };
    const wanted = { "lawyers": buildPair() };

    const actual = await deduplicateObjectsArray(input);

    expect(actual).toEqual(wanted);
  });

  it("should handle nested identical objects", async () => {
    // Two records that are identical including their nested arrays collapse to one.
    const input = {
      "lawyers": [schullWithoutPhone(), schullWithoutPhone()],
    };
    const wanted = {
      "lawyers": [schullWithoutPhone()],
    };

    const actual = await deduplicateObjectsArray(input);

    expect(actual).toEqual(wanted);
  });
});
|
474
apps/api/src/lib/__tests__/merge-null-val-objs.test.ts
Normal file
474
apps/api/src/lib/__tests__/merge-null-val-objs.test.ts
Normal file
@ -0,0 +1,474 @@
|
||||
import { mergeNullValObjs } from "../extract/helpers/merge-null-val-objs";
|
||||
|
||||
describe("mergeNullValObjs", () => {
|
||||
it("should merge the objects with null values", async () => {
|
||||
const objArray = {
|
||||
"lawyers": [
|
||||
{
|
||||
"name": "Frank Giunta",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Frank Giunta",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const expected = {
|
||||
"lawyers": [
|
||||
{
|
||||
"name": "Frank Giunta",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
})
|
||||
|
||||
it("should handle empty object array", async () => {
|
||||
const objArray = {
|
||||
"lawyers": []
|
||||
}
|
||||
|
||||
const expected = {
|
||||
"lawyers": []
|
||||
}
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
})
|
||||
|
||||
it("should handle object array with no null values", async () => {
|
||||
const objArray = {
|
||||
"lawyers": [
|
||||
{
|
||||
"name": "John Doe",
|
||||
"email": "john.doe@example.com",
|
||||
"title": "Attorney",
|
||||
"phone-number": "123.456.7890",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Corporate Law"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const expected = {
|
||||
"lawyers": [
|
||||
{
|
||||
"name": "John Doe",
|
||||
"email": "john.doe@example.com",
|
||||
"title": "Attorney",
|
||||
"phone-number": "123.456.7890",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Corporate Law"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
})
|
||||
|
||||
it("should merge objects with different null values", async () => {
|
||||
const objArray = {
|
||||
"lawyers": [
|
||||
{
|
||||
"name": "Jane Smith",
|
||||
"email": "null",
|
||||
"title": "Attorney",
|
||||
"description": null,
|
||||
"phone-number": "987.654.3210",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Family Law"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Jane Smith",
|
||||
"email": "jane.smith@example.com",
|
||||
"title": null,
|
||||
"description": "Jane Smith is an attorney specializing in Family Law.",
|
||||
"phone-number": "987.654.3210",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Family Law"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const expected = {
|
||||
"lawyers": [
|
||||
{
|
||||
"name": "Jane Smith",
|
||||
"email": "jane.smith@example.com",
|
||||
"title": "Attorney",
|
||||
"description": "Jane Smith is an attorney specializing in Family Law.",
|
||||
"phone-number": "987.654.3210",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Family Law"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
})
|
||||
|
||||
it("should merge null-valued duplicates and keep distinct objects", async () => {
  // Scenario: the same lawyer appears twice (once with an email, once with a
  // null email) next to an unrelated lawyer whose email is also null.
  // The duplicate must collapse into the populated record while the
  // unrelated lawyer is kept untouched, null email included.
  const personalInjury = () => [{ area: "Personal Injury" }];

  const objArray = {
    lawyers: [
      {
        name: "Frank Giunta",
        email: "frank.giunta@example.com",
        title: "Personal Injury Attorney",
        "phone-number": "214.370.5200",
        "practice-areas": personalInjury(),
      },
      {
        name: "Frank Giunta",
        email: null,
        title: "Personal Injury Attorney",
        "phone-number": "214.370.5200",
        "practice-areas": personalInjury(),
      },
      {
        name: "Dale R. Rose",
        email: null,
        title: "Personal Injury Attorney",
        "phone-number": "972.562.0266",
        "practice-areas": personalInjury(),
      },
    ],
  };

  const expected = {
    lawyers: [
      {
        name: "Frank Giunta",
        email: "frank.giunta@example.com",
        title: "Personal Injury Attorney",
        "phone-number": "214.370.5200",
        "practice-areas": personalInjury(),
      },
      {
        name: "Dale R. Rose",
        email: null,
        title: "Personal Injury Attorney",
        "phone-number": "972.562.0266",
        "practice-areas": personalInjury(),
      },
    ],
  };

  const result = mergeNullValObjs(objArray);

  expect(result).toEqual(expected);
});
|
||||
|
||||
it("should correctly merge and deduplicate objects", async () => {
  // Builds one lawyer record; every record in this scenario has a null email
  // and the same title / practice area, only name and phone differ.
  const lawyer = (name: string, phone: string) => ({
    name,
    email: null,
    title: "Personal Injury Attorney",
    "phone-number": phone,
    "practice-areas": [{ area: "Personal Injury" }],
  });

  // "Frank Giunta" is listed twice with identical data.
  const input = {
    lawyers: [
      lawyer("Frank Giunta", "214.370.5200"),
      lawyer("Frank Giunta", "214.370.5200"),
      lawyer("Dale R. Rose", "972.562.0266"),
    ],
  };

  const deduplicated = mergeNullValObjs(input);

  // Expectation: the repeated record appears only once.
  expect(deduplicated).toEqual({
    lawyers: [
      lawyer("Frank Giunta", "214.370.5200"),
      lawyer("Dale R. Rose", "972.562.0266"),
    ],
  });
});
|
||||
|
||||
it("should merge arrays of similar objects", async () => {
  // Wraps plain area names into the `{ area }` objects used by the records.
  const toAreas = (names: string[]) => names.map((area) => ({ area }));

  const accidentAreas = [
    "Automobile accidents",
    "Truck accidents",
    "Amusement park injury",
    "Bus accident",
    "Industrial accidents",
    "Product defects",
    "Food poisoning",
    "Workplace accidents",
    "Wrongful death",
    "Swimming pool accidents",
    "Premises accidents",
    "Aircraft accidents",
    "Animal and dog bites",
  ];

  // Same person twice: the records complement each other's null fields and
  // list different practice areas.
  const input = {
    lawyers: [
      {
        name: "Allen Cox",
        email: null,
        title: "Personal Injury Lawyer",
        "phone-number": "972.606.9000",
        "practice-areas": toAreas(["Personal Injury"]),
      },
      {
        name: "Allen Cox",
        email: "allen.cox@example.com",
        title: "Personal Injury Lawyer",
        "phone-number": null,
        "practice-areas": toAreas(accidentAreas),
      },
    ],
  };

  const merged = mergeNullValObjs(input);

  // Expectation: one record whose practice areas are the first record's
  // areas followed by the second record's areas.
  expect(merged).toEqual({
    lawyers: [
      {
        name: "Allen Cox",
        email: "allen.cox@example.com",
        title: "Personal Injury Lawyer",
        "phone-number": "972.606.9000",
        "practice-areas": toAreas(["Personal Injury", ...accidentAreas]),
      },
    ],
  });
});
|
||||
|
||||
it("should merge arrays of similar objects with different key names", async () => {
  // Wraps plain field names into the `{ field }` objects used by the records.
  const toFields = (names: string[]) => names.map((field) => ({ field }));

  const accidentFields = [
    "Automobile accidents",
    "Truck accidents",
    "Amusement park injury",
    "Bus accident",
    "Industrial accidents",
    "Product defects",
    "Food poisoning",
    "Workplace accidents",
    "Wrongful death",
    "Swimming pool accidents",
    "Premises accidents",
    "Aircraft accidents",
    "Animal and dog bites",
  ];

  // Same scenario as the "lawyers" variant, but with a different top-level
  // key and different property names.
  const input = {
    attorneys: [
      {
        fullName: "Allen Cox",
        contactEmail: null,
        position: "Personal Injury Lawyer",
        contactNumber: "972.606.9000",
        specializations: toFields(["Personal Injury"]),
      },
      {
        fullName: "Allen Cox",
        contactEmail: "allen.cox@example.com",
        position: "Personal Injury Lawyer",
        contactNumber: null,
        specializations: toFields(accidentFields),
      },
    ],
  };

  const merged = mergeNullValObjs(input);

  expect(merged).toEqual({
    attorneys: [
      {
        fullName: "Allen Cox",
        contactEmail: "allen.cox@example.com",
        position: "Personal Injury Lawyer",
        contactNumber: "972.606.9000",
        specializations: toFields(["Personal Injury", ...accidentFields]),
      },
    ],
  });
});
|
||||
|
||||
it("should deal with not array values", async () => {
  // Both top-level values are plain objects instead of arrays.
  const notAnArray = () => ({ name: "not an array" });

  const input = {
    lawyers: notAnArray(),
    attorneys: notAnArray(),
  };

  // @ts-expect-error
  const output = mergeNullValObjs(input);

  // Expectation: the non-array values come back unchanged.
  expect(output).toEqual({
    lawyers: notAnArray(),
    attorneys: notAnArray(),
  });
});
|
||||
|
||||
it("should deal with arrays of strings", async () => {
  // The array holds primitives rather than objects.
  const input = { lawyers: ["res1", "res2", "res3"] };

  const output = mergeNullValObjs(input);

  // Expectation: the string array comes back unchanged.
  expect(output).toEqual({ lawyers: ["res1", "res2", "res3"] });
});
|
||||
|
||||
})
|
996
apps/api/src/lib/__tests__/mix-schemas.test.ts
Normal file
996
apps/api/src/lib/__tests__/mix-schemas.test.ts
Normal file
@ -0,0 +1,996 @@
|
||||
import { mixSchemaObjects } from "../extract/helpers/mix-schema-objs";
|
||||
import { transformArrayToObject } from "../extract/helpers/transform-array-to-obj";
|
||||
|
||||
describe("mixSchemaObjects function", () => {
|
||||
it("should mix kyb schema (id: 1)", async () => {
  // All fixtures come from factories that return fresh objects, so no object
  // instance is shared between the schema, the inputs and the expectation.

  // `{ type: "string" }`, optionally with a `format`.
  const stringField = (format?: string) =>
    format === undefined ? { type: "string" } : { type: "string", format };

  // Address sub-schema used by both `business` and each `owners` item.
  const addressSchema = () => ({
    type: "object",
    properties: {
      street: stringField(),
      city: stringField(),
      state: stringField(),
      country: stringField(),
      postal_code: stringField(),
    },
  });

  const originalSchema = {
    type: "object",
    properties: {
      business: {
        type: "object",
        properties: {
          name: stringField(),
          registration_number: stringField(),
          tax_id: stringField(),
          type: stringField(),
          industry: stringField(),
          address: addressSchema(),
          incorporation_date: stringField("date"),
          phone: stringField(),
          email: stringField("email"),
        },
      },
      owners: {
        type: "array",
        items: {
          type: "object",
          properties: {
            full_name: stringField(),
            role: stringField(),
            address: addressSchema(),
            phone: stringField(),
            email: stringField("email"),
          },
        },
      },
    },
  };

  // The single-entity part of the answer.
  const buildBusiness = () => ({
    name: "Revolut Ltd",
    registration_number: "08804411",
    tax_id: "",
    type: "Private limited company",
    industry: "Other information technology service activities",
    address: {
      street: "7 Westferry Circus",
      city: "London",
      state: "",
      country: "England",
      postal_code: "E14 4HD",
    },
    incorporation_date: "2013-12-06",
    phone: "",
    email: "",
  });

  // One owner record. Some records additionally carry an empty
  // `ownership_percentage`, a key the schema does not declare.
  const buildOwner = (
    full_name: string,
    role: string,
    withOwnershipField: boolean,
  ) => ({
    full_name,
    role,
    ...(withOwnershipField ? { ownership_percentage: "" } : {}),
    address: {
      street: "7 Westferry Circus",
      city: "Canary Wharf",
      state: "London",
      country: "England",
      postal_code: "E14 4HD",
    },
    phone: "",
    email: "",
  });

  // The multi-entity part of the answer.
  const buildOwners = () => [
    buildOwner("Thomas Bruce Hambrett", "Secretary", false),
    buildOwner("Caroline Louise Britton", "Director", false),
    buildOwner("Martin James Gilbert", "Director", false),
    buildOwner("Michael Sidney Sherwood", "Director", false),
    buildOwner("John Phimister Sievwright", "Director", true),
    buildOwner("Nikolay Storonsky", "Director", true),
    buildOwner("Dan Teodosiu", "Director", false),
    buildOwner("Vladyslav Yatsenko", "Director", true),
  ];

  const singleAnswerResult = { business: buildBusiness() };
  const multiEntityResult = { owners: buildOwners() };

  const finalResult = await mixSchemaObjects(
    originalSchema,
    singleAnswerResult,
    multiEntityResult,
  );

  // Expectation: both partial results combined into one object.
  expect(finalResult).toEqual({
    business: buildBusiness(),
    owners: buildOwners(),
  });
});
|
||||
|
||||
it("should mix lawyers schema (id: 29)", async () => {
  const originalSchema = {
    type: "object",
    properties: {
      lawyers: {
        type: "array",
        items: {
          type: "object",
          properties: {
            name: { type: "string" },
            email: { type: ["string", "null"], format: "email" },
            title: { type: ["string", "null"] },
            phone_number: { type: ["string", "null"], alias: "phone-number" },
            practice_areas: {
              type: "array",
              items: {
                type: "object",
                properties: {
                  area: { type: "string" },
                },
              },
              alias: "practice-areas",
            },
          },
        },
      },
    },
  };

  // Builds one lawyer record in the aliased (dash-separated) key format.
  const lawyer = (
    name: string,
    email: string,
    title: string,
    phone: string,
    areas: string[],
  ) => ({
    name,
    email,
    title,
    "phone-number": phone,
    "practice-areas": areas.map((area) => ({ area })),
  });

  // Phone numbers that occur repeatedly in the fixture.
  const PHONE_0233 = "(844) 698-0233";
  const PHONE_2992 = "(844) 402-2992";
  const PHONE_4530 = "(844) 402-4530";

  // "Michael Raymond Cramer" is listed twice in the fixture; a factory keeps
  // the two entries as separate object instances.
  const cramer = () =>
    lawyer("Michael Raymond Cramer", "mcramer@galyen.com", "Of Counsel", PHONE_0233, [
      "Business Law",
      "Civil and Commercial Litigation",
      "Employment Law",
      "Corporate Law",
      "Construction Law",
      "Real Estate",
      "Civil Defense",
      "Estate Planning",
    ]);

  const multiEntityResult = {
    lawyers: [
      lawyer("Phillip Galyen", "pgalyen@galyen.com", "President and CEO", PHONE_0233, ["Personal Injury"]),
      lawyer("James Bridge", "jbridge@galyen.com", "COO & Firm Managing Attorney", PHONE_0233, ["Personal Injury"]),
      lawyer("Stephen C. Maxwell", "smaxwell@galyen.com", "Personal Injury Trial Attorney", PHONE_0233, ["Personal Injury"]),
      lawyer("Scott Robelen", "srobelen@galyen.com", "Personal Injury Attorney", PHONE_2992, ["Personal Injury"]),
      lawyer("Kern A. Lewis", "klewis@galyen.com", "Personal Injury Attorney", PHONE_2992, ["Personal Injury"]),
      lawyer("Steven Pierret", "spierret@galyen.com", "Personal Injury Attorney", PHONE_2992, ["Personal Injury"]),
      lawyer("Michael Galyen", "mgalyen@galyen.com", "Executive Vice President - Litigation Attorney", PHONE_2992, ["Personal Injury"]),
      lawyer("H. John Gutierrez", "jgutierrez@galyen.com", "Personal Injury Lawyer", PHONE_2992, ["Personal Injury"]),
      lawyer("Daniel P. Sullivan", "dsullivan@galyen.com", "Personal Injury Attorney", PHONE_2992, ["Personal Injury"]),
      lawyer("Ana Lee", "alee@galyen.com", "Personal Injury Attorney", PHONE_4530, ["Personal Injury"]),
      cramer(),
      lawyer("Benton Gann", "bgann@galyen.com", "Personal Injury Attorney", PHONE_2992, ["Personal Injury"]),
      lawyer("Shane F. Langston", "slangston@galyen.com", "Personal Injury Litigation", PHONE_4530, ["Personal Injury Litigation"]),
      lawyer("Rebecca M. Langston", "rlangston@galyen.com", "Personal Injury Litigation", PHONE_4530, ["Personal Injury Litigation"]),
      lawyer("David Klemm", "dklemm@galyen.com", "Personal Injury Trial Lawyer", PHONE_2992, ["Personal Injury Trial Lawyer"]),
      lawyer("Tyler D. Baker", "tbaker@galyen.com", "Personal Injury Attorney", PHONE_2992, ["Personal Injury"]),
      lawyer("Clint Lee", "clee@galyen.com", "Catastrophic Injury Attorney", PHONE_2992, ["Catastrophic Injury"]),
      lawyer("R. Keith Spencer", "rkspencer@galyen.com", "Family Law Attorney", PHONE_0233, ["Family Law"]),
      lawyer("Gene Leposki", "gleposki@galyen.com", "Family Law Attorney", PHONE_0233, ["Family Law"]),
      lawyer("Teresa Sanchez", "tsanchez@galyen.com", "Managing Attorney of the Family Law Department", PHONE_0233, ["Family Law"]),
      lawyer("Paul Kennedy", "pkennedy@galyen.com", "Family Law Attorney", PHONE_2992, ["Family Law"]),
      lawyer("Danielle Cortez-Harper", "dharper@galyen.com", "Family Law Attorney", PHONE_2992, ["Family Law"]),
      lawyer("Jane Mapes", "jmapes@galyen.com", "Family Law Attorney", PHONE_2992, ["Family Law"]),
      lawyer("Juliette Steffe", "jsteffe@galyen.com", "Family Law Attorney", "(817) 263-3000", ["Family Law"]),
      lawyer("Anna Nika", "anika@galyen.com", "Family Law Attorney", PHONE_2992, ["Family Law"]),
      lawyer("Lori Shannon", "lshannon@galyen.com", "Family Law Attorney", PHONE_2992, ["Family Law"]),
      lawyer("Michael Livens", "mlivens@galyen.com", "Family Law Attorney", PHONE_2992, ["Family Law"]),
      lawyer("Jennifer Scherf", "jscherf@galyen.com", "Family Law Attorney", PHONE_2992, ["Family Law"]),
      lawyer("Allen Griffin", "agriffin@galyen.com", "Family Law Attorney", PHONE_2992, ["Family Law"]),
      lawyer("Ian Croall", "icroall@galyen.com", "Vice President & Managing Attorney, Social Security Disability", PHONE_0233, ["Social Security Disability"]),
      lawyer("Kim C. Smith", "ksmith@galyen.com", "Managing Attorney, Workers’ Compensation", PHONE_0233, ["Workers’ Compensation"]),
      lawyer("J. C. Bailey III", "jcbailey@galyen.com", "Estate Planning, Probate, Wills & Business Law", PHONE_0233, [
        "Estate Planning",
        "Probate",
        "Wills",
        "Business Law",
      ]),
      lawyer("John Robinson", "jrobinson@galyen.com", "Criminal Law Attorney", PHONE_0233, ["Criminal Law"]),
      cramer(),
      lawyer("Paul F. Wieneskie", "pwieneskie@galyen.com", "Civil Appellate Attorney", PHONE_0233, ["Civil Appellate Law"]),
      lawyer("Claudia Cubias", "ccubias@galyen.com", "Immigration Attorney", PHONE_2992, ["Immigration Law"]),
      lawyer("Katherine Hawkins", "khawkins@galyen.com", "Immigration Attorney", "", ["Immigration Law"]),
    ],
  };

  // No single-entity data in this scenario.
  const singleAnswerResult = {};

  const finalResult = await mixSchemaObjects(
    originalSchema,
    singleAnswerResult,
    multiEntityResult,
  );

  // Expectation: the result equals the multi-entity input (compared against
  // the very object that was passed in).
  expect(finalResult).toEqual(multiEntityResult);
});
|
||||
|
||||
it("shoud spread (id: 26)", async () => {
|
||||
const res1 = {
|
||||
"products": [
|
||||
{
|
||||
"name": "סיר Neon",
|
||||
"price": "99.90 ₪",
|
||||
"description": "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const res2 = {
|
||||
"products": [
|
||||
{
|
||||
"name": "סיר Neon",
|
||||
"price": "99.90 ₪",
|
||||
"description": "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const res3 = {
|
||||
"products": [
|
||||
{
|
||||
"name": "סיר Neon",
|
||||
"price": "99.90 ₪",
|
||||
"description": "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const res4 = {
|
||||
"products": [
|
||||
{
|
||||
"name": "סיר Neon",
|
||||
"price": "99.90 ₪",
|
||||
"description": "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const res5 = {
|
||||
"products": [
|
||||
{
|
||||
"name": "סיר Neon",
|
||||
"price": "99.90 ₪",
|
||||
"description": "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const res6 = {
|
||||
"products": [
|
||||
{
|
||||
"name": "סיר Neon",
|
||||
"price": "99.90 ₪",
|
||||
"description": "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const res7 = {
|
||||
"products": [
|
||||
{
|
||||
"name": "סיר Neon",
|
||||
"price": "99.90 ₪",
|
||||
"description": "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const results = [res1, res2, res3, res4, res5, res6, res7]
|
||||
|
||||
const originalSchema = {
|
||||
type: "object",
|
||||
properties: {
|
||||
products: {
|
||||
type: "array",
|
||||
items: {
|
||||
type: "object",
|
||||
properties: {
|
||||
name: { type: "string" },
|
||||
price: { type: "string" },
|
||||
description: { type: "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.log(await transformArrayToObject(originalSchema, results))
|
||||
|
||||
const singleAnswerResult = {}
|
||||
const multiEntityResult = {
|
||||
"products": [
|
||||
{
|
||||
"name": "סיר Neon",
|
||||
"price": "99.90 ₪",
|
||||
"description": "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות"
|
||||
},
|
||||
{
|
||||
"name": "סיר Neon",
|
||||
"price": "99.90 ₪",
|
||||
"description": "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות"
|
||||
},
|
||||
{
|
||||
"name": "סיר Neon",
|
||||
"price": "99.90 ₪",
|
||||
"description": "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות"
|
||||
},
|
||||
{
|
||||
"name": "סיר Neon",
|
||||
"price": "99.90 ₪",
|
||||
"description": "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות"
|
||||
},
|
||||
{
|
||||
"name": "סיר Neon",
|
||||
"price": "99.90 ₪",
|
||||
"description": "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות"
|
||||
},
|
||||
{
|
||||
"name": "סיר Neon",
|
||||
"price": "99.90 ₪",
|
||||
"description": "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות"
|
||||
},
|
||||
{
|
||||
"name": "סיר Neon",
|
||||
"price": "99.90 ₪",
|
||||
"description": "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const finalResult = await mixSchemaObjects(originalSchema, singleAnswerResult, multiEntityResult)
|
||||
|
||||
expect(finalResult).toEqual(multiEntityResult)
|
||||
})
|
||||
|
||||
it("should spread (id: 29)", async () => {
  // Schema made only of boolean flags; no property here is an array.
  const booleanFlag = () => ({ type: "boolean" });
  const flagsSchema = {
    type: "object",
    properties: {
      is_active: booleanFlag(),
      is_partner: booleanFlag(),
      is_msp: booleanFlag(),
      is_auditor: booleanFlag(),
      is_vciso: booleanFlag(),
      offers_soc_2: booleanFlag(),
      offers_iso_27001: booleanFlag(),
      offers_cmmc: booleanFlag(),
      has_soc_2_cert: booleanFlag(),
      offers_office365: booleanFlag(),
      offers_endpoint_security: booleanFlag(),
    },
  };

  const singleAnswer = {
    is_active: true,
    is_partner: true,
    is_msp: true,
    is_auditor: false,
    is_vciso: false,
    offers_soc_2: true,
    offers_iso_27001: false,
    offers_cmmc: false,
    has_soc_2_cert: false,
    offers_office365: true,
    offers_endpoint_security: false,
  };

  // The multi-entity side is empty, so the mix must equal the single-answer result.
  const mixed = await mixSchemaObjects(flagsSchema, singleAnswer, {});

  expect(mixed).toEqual(singleAnswer);
});
|
||||
|
||||
})
|
269
apps/api/src/lib/__tests__/spread-schema-objects.test.ts
Normal file
269
apps/api/src/lib/__tests__/spread-schema-objects.test.ts
Normal file
@ -0,0 +1,269 @@
|
||||
import { spreadSchemas } from "../extract/helpers/spread-schemas";
|
||||
|
||||
describe("spreadSchemas", () => {
|
||||
it("should spread kyb schema (id: 1)", async () => {
  // Each factory returns a fresh object, so input and expected schemas never
  // share references.
  const addressSchema = () => ({
    type: "object",
    properties: {
      street: { type: "string" },
      city: { type: "string" },
      state: { type: "string" },
      country: { type: "string" },
      postal_code: { type: "string" },
    },
  });

  const businessSchema = () => ({
    type: "object",
    properties: {
      name: { type: "string" },
      registration_number: { type: "string" },
      tax_id: { type: "string" },
      type: { type: "string" },
      industry: { type: "string" },
      address: addressSchema(),
      incorporation_date: { type: "string", format: "date" },
      phone: { type: "string" },
      email: { type: "string", format: "email" },
    },
  });

  const ownersSchema = () => ({
    type: "array",
    items: {
      type: "object",
      properties: {
        full_name: { type: "string" },
        role: { type: "string" },
        address: addressSchema(),
        phone: { type: "string" },
        email: { type: "string", format: "email" },
      },
    },
  });

  const fullSchema = {
    type: "object",
    properties: {
      business: businessSchema(),
      owners: ownersSchema(),
    },
  };

  const spread = await spreadSchemas(fullSchema, ["owners"]);

  // `business` is not a listed key, so it stays on the single-answer side.
  expect(spread.singleAnswerSchema).toEqual({
    type: "object",
    properties: {
      business: businessSchema(),
    },
  });

  // `owners` is the listed key, so it moves to the multi-entity side.
  expect(spread.multiEntitySchema).toEqual({
    type: "object",
    properties: {
      owners: ownersSchema(),
    },
  });
});
|
||||
|
||||
it("should spread lawyers schema (id: 9)", async () => {
  // Factory so every nullable field gets its own type array.
  const stringOrNull = () => ["string", "null"];

  const practiceAreas = {
    type: "array",
    items: {
      type: "object",
      properties: {
        area: { type: "string" },
      },
    },
    alias: "practice-areas",
  };

  const lawyer = {
    type: "object",
    properties: {
      name: { type: "string" },
      email: { type: stringOrNull(), format: "email" },
      title: { type: stringOrNull() },
      phone_number: { type: stringOrNull(), alias: "phone-number" },
      practice_areas: practiceAreas,
    },
  };

  const lawyersSchema = {
    type: "object",
    properties: {
      lawyers: { type: "array", items: lawyer },
    },
  };

  const spread = await spreadSchemas(lawyersSchema, ["lawyers"]);

  // The only top-level property is the listed key, so the single-answer side is empty.
  expect(spread.singleAnswerSchema).toEqual({});
  expect(spread.multiEntitySchema).toEqual(lawyersSchema);
});
|
||||
|
||||
it("should spread (id: 26)", async () => {
  // The schema's only top-level property is the `products` array, and it is
  // listed as a multi-entity key.
  const schema = {
    type: "object",
    properties: {
      products: {
        type: "array",
        items: {
          type: "object",
          properties: {
            name: { type: "string" },
            price: { type: "string" },
            description: { type: "string" },
          },
        },
      },
    },
  };

  const keys = ["products"];
  const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys);

  // Nothing is left for the single-answer side; the whole schema is multi-entity.
  expect(singleAnswerSchema).toEqual({});
  expect(multiEntitySchema).toEqual(schema);
});
|
||||
|
||||
it("should spread categories and products", async () => {
  // Two top-level arrays: one of plain strings, one of objects.
  const schema = {
    type: "object",
    properties: {
      categories: {
        type: "array",
        items: {
          type: "string",
        },
      },
      products: {
        type: "array",
        items: {
          type: "object",
          properties: {
            name: { type: "string" },
            price: { type: "string" },
            description: { type: "string" },
          },
        },
      },
    },
  };

  // Keys are given in a different order than they appear in the schema.
  const keys = ["products", "categories"];
  const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys);

  // Both properties are listed, so the single-answer side is empty.
  expect(singleAnswerSchema).toEqual({});
  expect(multiEntitySchema).toEqual(schema);
});
|
||||
|
||||
it("should spread (id: 29)", async () => {
  // Schema made only of boolean flags.
  const booleanFlag = () => ({ type: "boolean" });
  const flagsSchema = {
    type: "object",
    properties: {
      is_active: booleanFlag(),
      is_partner: booleanFlag(),
      is_msp: booleanFlag(),
      is_auditor: booleanFlag(),
      is_vciso: booleanFlag(),
      offers_soc_2: booleanFlag(),
      offers_iso_27001: booleanFlag(),
      offers_cmmc: booleanFlag(),
      has_soc_2_cert: booleanFlag(),
      offers_office365: booleanFlag(),
      offers_endpoint_security: booleanFlag(),
    },
  };

  // No multi-entity keys are passed.
  const spread = await spreadSchemas(flagsSchema, []);

  // Everything stays on the single-answer side.
  expect(spread.singleAnswerSchema).toEqual(flagsSchema);
  expect(spread.multiEntitySchema).toEqual({});
});
|
||||
|
||||
it("should spread lawyers schema with hyphenated keys", async () => {
  // Lawyers schema whose nested property names contain hyphens
  // ("phone-number", "practice-areas").
  const schema = {
    "type": "object",
    "properties": {
      "lawyers": {
        "type": "array",
        "items": {
          "type": "object",
          "properties": {
            "name": { "type": "string" },
            "email": { "type": ["string", "null"] },
            "phone-number": { "type": "string" },
            "practice-areas": {
              "type": "array",
              "items": {
                "type": "object",
                "properties": {
                  "area": { "type": "string" },
                },
              },
            },
            "title": { "type": ["string", "null"] },
          },
        },
      },
    },
  };

  const keys = ["lawyers"];
  const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys);

  // `lawyers` is the only top-level property and it is listed as multi-entity.
  expect(singleAnswerSchema).toEqual({});
  expect(multiEntitySchema).toEqual(schema);
});
|
||||
})
|
674
apps/api/src/lib/__tests__/transform-array-to-obj.test.ts
Normal file
674
apps/api/src/lib/__tests__/transform-array-to-obj.test.ts
Normal file
@ -0,0 +1,674 @@
|
||||
import { transformArrayToObject } from "../extract/helpers/transform-array-to-obj";
|
||||
|
||||
// Module-level schema: an object with one `products` array whose items are
// { name, price, description } string records.
const originalSchema = {
  "type": "object",
  "properties": {
    "products": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": { "type": "string" },
          "price": { "type": "string" },
          "description": { "type": "string" },
        },
      },
    },
  },
};
|
||||
|
||||
describe("transformArrayToObject function", () => {
|
||||
it("should transform array to object (id: 26)", async () => {
  // Every fixture product uses the same description text.
  const description = "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות";

  // Builds the n-th product; every call returns a fresh object, so inputs and
  // expectations never share references.
  const makeProduct = (n: number) => ({
    name: `סיר Neon ${n}`,
    price: "99.90 ₪",
    description,
  });

  const productNumbers = [1, 2, 3, 4, 5, 6, 7];

  // Seven partial results, each holding exactly one distinct product.
  const results = productNumbers.map((n) => ({ products: [makeProduct(n)] }));

  // Expected: one `products` array with all seven entries in input order.
  const multiEntityResult = {
    products: productNumbers.map((n) => makeProduct(n)),
  };

  expect(await transformArrayToObject(originalSchema, results)).toEqual(multiEntityResult);
});
|
||||
|
||||
it("should transform array to object (id: 27)", async () => {
  // Fresh copy per call, so input and expectation are separate objects.
  const firstProduct = () => ({
    name: "סיר Neon 1",
    price: "99.90 ₪",
    description: "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות",
  });

  const withOneProduct = { products: [firstProduct()] };
  const withEmptyList = { products: [] };
  const withNullList = { products: null };

  const merged = await transformArrayToObject(originalSchema, [
    withOneProduct,
    withEmptyList,
    withNullList,
  ]);

  // The empty and null `products` inputs add nothing to the expected output.
  expect(merged).toEqual({ products: [firstProduct()] });
});
|
||||
|
||||
it("should transform array to object, skipping empty arrays (id: 27)", async () => {
  const description = "סיר מסדרת Neon גוף הכלי עשוי אלומיניום להולכת חום מהירה ואחידה ולחיסכון בזמן ואנרגיה סיר בציפוי נון סטיק למניעת הדבקות המזון, לשימוש מופחת בשמן ולניקוי קל ונוח. מתאים לכל סוגי הכיריים, מתאים לאינדוקציה מתאים לשטיפה במדיח. מתאים לשימוש כסיר אורז, סיר פסטה, סיר מרק, סיר למגוון תבשילים. סיר 28 ס”מ | 7.1 ליטר התמונה להמחשה בלבד. הצבע בתמונה עשוי להיות שונה מהמציאות";

  // Builds the n-th product; every call returns a fresh object.
  const makeProduct = (n: number) => ({
    name: `סיר Neon ${n}`,
    price: "99.90 ₪",
    description,
  });

  const res1 = { products: [makeProduct(1)] };
  // An empty result sits between two populated ones.
  const res3 = { products: [] };
  const res4 = { products: [makeProduct(4)] };

  const results = [res1, res3, res4];

  // Expected: the populated entries in input order; the empty one adds nothing.
  const multiEntityResult = {
    products: [makeProduct(1), makeProduct(4)],
  };

  expect(await transformArrayToObject(originalSchema, results)).toEqual(multiEntityResult);
});
|
||||
|
||||
it("more complex schema", async () => {
  // Local schema (shadows the module-level one): `products` is nested under
  // `ecommerce`, and each product has its own `categories` string array.
  const originalSchema = {
    type: "object",
    properties: {
      ecommerce: {
        type: "object",
        properties: {
          name: { type: "string" },
          products: {
            type: "array",
            items: {
              type: "object",
              properties: {
                name: { type: "string" },
                price: { type: "string" },
                description: { type: "string" },
                categories: {
                  type: "array",
                  items: {
                    type: "string",
                  },
                },
              },
            },
          },
        },
      },
    },
  };

  const res1 = {
    ecommerce: {
      name: "1",
      products: [
        {
          name: "סיר Neon 1",
          price: "99.90 ₪",
          description: "",
          categories: ["סירים", "something", "else"],
        },
      ],
    },
  };

  const res2 = {
    ecommerce: {
      name: "keep the first",
      products: [
        {
          name: "סיר Neon 2",
          price: "99.90 ₪",
          description: "",
          categories: ["סירים", "ajkshda", "something", "else"],
        },
      ],
    },
  };

  // Results with an empty and a null `products` list.
  const res3 = { ecommerce: { products: [] } };
  const res4 = { ecommerce: { products: null } };

  const results = [res1, res2, res3, res4];

  // Expected: `name` comes from the first result, and products are listed in
  // input order with their per-product categories unchanged.
  const multiEntityResult = {
    ecommerce: {
      name: "1",
      products: [
        {
          name: "סיר Neon 1",
          price: "99.90 ₪",
          description: "",
          categories: ["סירים", "something", "else"],
        },
        {
          name: "סיר Neon 2",
          price: "99.90 ₪",
          description: "",
          categories: ["סירים", "ajkshda", "something", "else"],
        },
      ],
    },
  };

  expect(await transformArrayToObject(originalSchema, results)).toEqual(multiEntityResult);
});
|
||||
|
||||
it("even more complex schema", async () => {
  // Top-level scalars (`name`, `description`) next to an array of objects
  // (`products`) and an array of strings (`categories`).
  const moreComplexSchema = {
    type: "object",
    properties: {
      name: { type: "string" },
      description: { type: "string" },
      products: {
        type: "array",
        items: {
          type: "object",
          properties: {
            name: { type: "string" },
            price: { type: "string" },
            description: { type: "string" },
          },
        },
      },
      categories: {
        type: "array",
        items: {
          type: "string",
        },
      },
    },
  };

  const res1 = {
    name: "1",
    description: "description",
    products: [
      {
        name: "Neon 1",
        price: "99.90 ₪",
        description: "neon 1 product",
      },
    ],
    categories: ["something", "else"],
  };

  const res2 = {
    name: "keep first",
    description: "description",
    products: [
      {
        name: "Neon 2",
        price: "99.90 ₪",
        description: "neon 2 product",
      },
    ],
    categories: ["something"],
  };

  // This result has no top-level `description`.
  const res3 = {
    name: "keep the first",
    products: [
      {
        name: "Neon 3",
        price: "555.90 ₪",
        description: "neon 3 product",
      },
    ],
    categories: ["hey", "something", "other one"],
  };

  // Empty/null `products` inputs are not exercised by this case.
  const results = [res1, res2, res3];

  // Expected: scalars come from the first result, products are listed in input
  // order, and each category string appears once in first-seen order.
  const multiEntityResult = {
    name: "1",
    description: "description",
    products: [
      {
        name: "Neon 1",
        price: "99.90 ₪",
        description: "neon 1 product",
      },
      {
        name: "Neon 2",
        price: "99.90 ₪",
        description: "neon 2 product",
      },
      {
        name: "Neon 3",
        price: "555.90 ₪",
        description: "neon 3 product",
      },
    ],
    categories: ["something", "else", "hey", "other one"],
  };

  expect(await transformArrayToObject(moreComplexSchema, results)).toEqual(multiEntityResult);
});
|
||||
|
||||
it("should transform array to object (id: 7)", async () => {
  // Builders for the three node shapes this schema repeats. Each call returns
  // a fresh object.
  const titledString = (title: string) => ({ title, type: "string" });
  const nullable = (type: string, title: string) => ({
    anyOf: [{ type }, { type: "null" }],
    title,
  });
  // Every section marks all of its properties as required, in declaration order.
  const section = (title: string, properties: Record<string, unknown>) => ({
    properties,
    required: Object.keys(properties),
    title,
    type: "object",
  });

  const listingSchema = {
    type: "object",
    properties: {
      property_details: section("PropertyDetails", {
        title: titledString("Title"),
        location: titledString("Location"),
        property_type: titledString("Property Type"),
        size: titledString("Size"),
        rooms: titledString("Rooms"),
        floor: nullable("string", "Floor"),
        furnished: nullable("string", "Furnished"),
        energy_rating: nullable("string", "Energy Rating"),
      }),
      features: section("FeaturesAmenities", {
        pets_allowed: nullable("string", "Pets Allowed"),
        senior_friendly: nullable("string", "Senior Friendly"),
        balcony: nullable("string", "Balcony"),
        dishwasher: nullable("string", "Dishwasher"),
        parking: nullable("string", "Parking"),
        electric_charging: nullable("string", "Electric Charging"),
        elevator: nullable("string", "Elevator"),
        washer_dryer: nullable("string", "Washer Dryer"),
      }),
      rental_details: section("RentalDetails", {
        monthly_net_rent: titledString("Monthly Net Rent"),
        utilities: nullable("string", "Utilities"),
        move_in_price: nullable("string", "Move In Price"),
        deposit: nullable("string", "Deposit"),
        prepaid_rent: nullable("string", "Prepaid Rent"),
        rental_period: nullable("string", "Rental Period"),
        available_from: nullable("string", "Available From"),
        listing_id: titledString("Listing Id"),
      }),
      landlord_status: section("LandlordStatus", {
        boligportal_approved: nullable("boolean", "Boligportal Approved"),
        number_of_ads: nullable("integer", "Number Of Ads"),
        last_active: nullable("string", "Last Active"),
        profile_created: nullable("string", "Profile Created"),
      }),
    },
  };

  // A single extraction result in which most optional fields are null.
  const listing = {
    property_details: {
      title: "3 room apartment on 70 m²",
      location: "Odense",
      property_type: "Apartment",
      size: "70 m²",
      rooms: "3",
      floor: null,
      furnished: null,
      energy_rating: null,
    },
    features: {
      pets_allowed: null,
      senior_friendly: null,
      balcony: null,
      dishwasher: null,
      parking: null,
      electric_charging: null,
      elevator: null,
      washer_dryer: null,
    },
    rental_details: {
      monthly_net_rent: "7,000 kr.",
      utilities: null,
      move_in_price: null,
      deposit: null,
      prepaid_rent: null,
      rental_period: null,
      available_from: null,
      listing_id: "4937446",
    },
    landlord_status: {
      boligportal_approved: null,
      number_of_ads: null,
      last_active: null,
      profile_created: null,
    },
  };

  const results = [listing];

  // With one array-free input, the output must equal that input.
  expect(await transformArrayToObject(listingSchema, results)).toEqual(results[0]);
});
|
||||
})
|
@ -20,7 +20,7 @@ export function cacheKey(
|
||||
// these options disqualify a cache
|
||||
if (
|
||||
internalOptions.v0CrawlOnlyUrls ||
|
||||
internalOptions.forceEngine ||
|
||||
internalOptions.forceEngine?.includes("cache") ||
|
||||
scrapeOptions.fastMode ||
|
||||
internalOptions.atsv ||
|
||||
(scrapeOptions.actions && scrapeOptions.actions.length > 0)
|
||||
|
81
apps/api/src/lib/extract/archive/crawling-index.ts
Normal file
81
apps/api/src/lib/extract/archive/crawling-index.ts
Normal file
@ -0,0 +1,81 @@
|
||||
// const id = crypto.randomUUID();
|
||||
|
||||
// const sc: StoredCrawl = {
|
||||
// originUrl: request.urls[0].replace("/*",""),
|
||||
// crawlerOptions: toLegacyCrawlerOptions({
|
||||
// maxDepth: 15,
|
||||
// limit: 5000,
|
||||
// includePaths: [],
|
||||
// excludePaths: [],
|
||||
// ignoreSitemap: false,
|
||||
// allowExternalLinks: false,
|
||||
// allowBackwardLinks: true,
|
||||
// allowSubdomains: false,
|
||||
// ignoreRobotsTxt: false,
|
||||
// deduplicateSimilarURLs: false,
|
||||
// ignoreQueryParameters: false
|
||||
// }),
|
||||
// scrapeOptions: {
|
||||
// formats: ["markdown"],
|
||||
// onlyMainContent: true,
|
||||
// waitFor: 0,
|
||||
// mobile: false,
|
||||
// removeBase64Images: true,
|
||||
// fastMode: false,
|
||||
// parsePDF: true,
|
||||
// skipTlsVerification: false,
|
||||
// },
|
||||
// internalOptions: {
|
||||
// disableSmartWaitCache: true,
|
||||
// isBackgroundIndex: true
|
||||
// },
|
||||
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
||||
// createdAt: Date.now(),
|
||||
// plan: "hobby", // make it a low concurrency
|
||||
// };
|
||||
|
||||
// // Save the crawl configuration
|
||||
// await saveCrawl(id, sc);
|
||||
|
||||
// // Then kick off the job
|
||||
// await _addScrapeJobToBullMQ({
|
||||
// url: request.urls[0].replace("/*",""),
|
||||
// mode: "kickoff" as const,
|
||||
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
||||
// plan: "hobby", // make it a low concurrency
|
||||
// crawlerOptions: sc.crawlerOptions,
|
||||
// scrapeOptions: sc.scrapeOptions,
|
||||
// internalOptions: sc.internalOptions,
|
||||
// origin: "index",
|
||||
// crawl_id: id,
|
||||
// webhook: null,
|
||||
// v1: true,
|
||||
// }, {}, crypto.randomUUID(), 50);
|
||||
|
||||
// we restructure and make all of the arrays we need to fill into objects,
|
||||
// adding them to a single object so the llm can fill them one at a time
|
||||
// TODO: make this work for more complex schemas where arrays are not first level
|
||||
|
||||
// let schemasForLLM: {} = {};
|
||||
// for (const key in largeArraysSchema) {
|
||||
// const originalSchema = structuredClone(largeArraysSchema[key].items);
|
||||
// console.log(
|
||||
// "key",
|
||||
// key,
|
||||
// "\noriginalSchema",
|
||||
// JSON.stringify(largeArraysSchema[key], null, 2),
|
||||
// );
|
||||
// let clonedObj = {
|
||||
// type: "object",
|
||||
// properties: {
|
||||
// informationFilled: {
|
||||
// type: "boolean",
|
||||
// },
|
||||
// data: {
|
||||
// type: "object",
|
||||
// properties: originalSchema.properties,
|
||||
// },
|
||||
// },
|
||||
// };
|
||||
// schemasForLLM[key] = clonedObj;
|
||||
// }
|
@ -1,9 +1,10 @@
|
||||
export const extractConfig = {
|
||||
MAX_INITIAL_RANKING_LIMIT: 1000,
|
||||
MAX_RANKING_LIMIT: 20,
|
||||
INITIAL_SCORE_THRESHOLD: 0.75,
|
||||
FALLBACK_SCORE_THRESHOLD: 0.5,
|
||||
MIN_REQUIRED_LINKS: 1,
|
||||
RERANKING:{
|
||||
MAX_INITIAL_RANKING_LIMIT: 1000,
|
||||
MAX_RANKING_LIMIT_FOR_RELEVANCE: 100,
|
||||
INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE: 0.75,
|
||||
FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE: 0.5,
|
||||
MIN_REQUIRED_LINKS: 1,
|
||||
}
|
||||
};
|
||||
|
||||
export const CUSTOM_U_TEAMS = ["874d40cc-a5c0-4e93-b661-9ddfbad5e51e"];
|
||||
|
@ -1,4 +1,9 @@
|
||||
import { Document, ExtractRequest, URLTrace } from "../../controllers/v1/types";
|
||||
import {
|
||||
Document,
|
||||
ExtractRequest,
|
||||
toLegacyCrawlerOptions,
|
||||
URLTrace,
|
||||
} from "../../controllers/v1/types";
|
||||
import { PlanType } from "../../types";
|
||||
import { logger } from "../logger";
|
||||
import { processUrl } from "./url-processor";
|
||||
@ -9,7 +14,19 @@ import { billTeam } from "../../services/billing/credit_billing";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
|
||||
import { saveCrawl, StoredCrawl } from "../crawl-redis";
|
||||
import { dereferenceSchema } from "./helpers/dereference-schema";
|
||||
import { z } from "zod";
|
||||
import OpenAI from "openai";
|
||||
import { spreadSchemas } from "./helpers/spread-schemas";
|
||||
import { transformArrayToObject } from "./helpers/transform-array-to-obj";
|
||||
import { mixSchemaObjects } from "./helpers/mix-schema-objs";
|
||||
import Ajv from "ajv";
|
||||
const ajv = new Ajv();
|
||||
|
||||
const openai = new OpenAI();
|
||||
import { updateExtract } from "./extract-redis";
|
||||
import { deduplicateObjectsArray } from "./helpers/deduplicate-objs-array";
|
||||
import { mergeNullValObjs } from "./helpers/merge-null-val-objs";
|
||||
import { CUSTOM_U_TEAMS } from "./config";
|
||||
|
||||
interface ExtractServiceOptions {
|
||||
@ -28,6 +45,95 @@ interface ExtractResult {
|
||||
error?: string;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Asks the LLM to classify an extraction request as "Single-Answer" or
 * "Multi-Entity".
 *
 * The stringified schema, the user prompt and the candidate URLs are sent to
 * `gpt-4o` with a JSON-schema `response_format`; the parsed reply is then
 * validated with zod before being returned.
 *
 * NOTE(review): the system prompt asks the model for "Method" and
 * "Confidence", but neither field exists in the response schema — only the
 * four fields below can come back.
 *
 * @param urls   URLs considered relevant for the request (interpolated into the user message).
 * @param schema The user's JSON schema; serialised with `JSON.stringify`.
 * @param prompt The user's extraction prompt.
 * @returns The classification flag, the keys judged to hold large arrays, and
 *          the model's reasoning / key indicators.
 * @throws Whatever the OpenAI client throws, or a zod error when the parsed
 *         reply does not match the expected shape.
 */
async function analyzeSchemaAndPrompt(
  urls: string[],
  schema: any,
  prompt: string,
): Promise<{
  isMultiEntity: boolean;
  multiEntityKeys: string[];
  reasoning?: string;
  keyIndicators?: string[];
}> {
  const schemaString = JSON.stringify(schema);

  // Runtime validator for the reply; mirrors the JSON schema sent to the model.
  const classificationShape = z.object({
    isMultiEntity: z.boolean(),
    multiEntityKeys: z.array(z.string()),
    reasoning: z.string(),
    keyIndicators: z.array(z.string()),
  });

  const classifierInstructions = `
You are a query classifier for a web scraping system. Classify the data extraction query as either:
A) Single-Answer: One answer across a few pages, possibly containing small arrays.
B) Multi-Entity: Many items across many pages, often involving large arrays.

Consider:
1. Answer Cardinality: Single or multiple items?
2. Page Distribution: Found on 1-3 pages or many?
3. Verification Needs: Cross-page verification or independent extraction?

Provide:
- Method: [Single-Answer/Multi-Entity]
- Confidence: [0-100%]
- Reasoning: Why this classification?
- Key Indicators: Specific aspects leading to this decision.

Examples:
- "Is this company a non-profit?" -> Single-Answer
- "Extract all product prices" -> Multi-Entity

For Single-Answer, arrays may be present but are typically small. For Multi-Entity, if arrays have multiple items not from a single page, return keys with large arrays. If nested, return the full key (e.g., 'ecommerce.products').
`;

  const classifierQuestion = `Classify the query as Single-Answer or Multi-Entity. For Multi-Entity, return keys with large arrays; otherwise, return none:
Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`;

  const reply = await openai.beta.chat.completions.parse({
    model: "gpt-4o",
    messages: [
      { role: "system", content: classifierInstructions },
      { role: "user", content: classifierQuestion },
    ],
    response_format: {
      type: "json_schema",
      json_schema: {
        schema: {
          type: "object",
          properties: {
            isMultiEntity: { type: "boolean" },
            multiEntityKeys: { type: "array", items: { type: "string" } },
            reasoning: { type: "string" },
            keyIndicators: { type: "array", items: { type: "string" } },
          },
          required: [
            "isMultiEntity",
            "multiEntityKeys",
            "reasoning",
            "keyIndicators",
          ],
          additionalProperties: false,
        },
        name: "checkSchema",
      },
    },
  });

  // zod strips unknown keys, so the validated object holds exactly the four
  // fields callers destructure.
  const verdict = classificationShape.parse(reply.choices[0].message.parsed);
  return {
    isMultiEntity: verdict.isMultiEntity,
    multiEntityKeys: verdict.multiEntityKeys,
    reasoning: verdict.reasoning,
    keyIndicators: verdict.keyIndicators,
  };
}
|
||||
|
||||
/**
 * Shape of one LLM completion result as held by `performExtraction`.
 */
type completions = {
  /** The structured data returned for the requested schema. */
  extract: Record<string, any>;
  /** Token count reported for the completion. */
  numTokens: number;
  /** Optional warning attached to the completion. */
  warning?: string;
};
|
||||
|
||||
function getRootDomain(url: string): string {
|
||||
try {
|
||||
if (url.endsWith("/*")) {
|
||||
@ -46,8 +152,11 @@ export async function performExtraction(
|
||||
): Promise<ExtractResult> {
|
||||
const { request, teamId, plan, subId } = options;
|
||||
const urlTraces: URLTrace[] = [];
|
||||
let docs: Document[] = [];
|
||||
|
||||
let docsMap: Map<string, Document> = new Map();
|
||||
let singleAnswerCompletions: completions | null = null;
|
||||
let multiEntityCompletions: completions[] = [];
|
||||
let multiEntityResult: any = {};
|
||||
let singleAnswerResult: any = {};
|
||||
// Process URLs
|
||||
const urlPromises = request.urls.map((url) =>
|
||||
processUrl(
|
||||
@ -60,6 +169,7 @@ export async function performExtraction(
|
||||
origin: request.origin,
|
||||
limit: request.limit,
|
||||
includeSubdomains: request.includeSubdomains,
|
||||
schema: request.schema,
|
||||
},
|
||||
urlTraces,
|
||||
),
|
||||
@ -78,123 +188,289 @@ export async function performExtraction(
|
||||
};
|
||||
}
|
||||
|
||||
// Scrape documents
|
||||
const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
|
||||
const scrapePromises = links.map((url) =>
|
||||
scrapeDocument(
|
||||
{
|
||||
url,
|
||||
teamId,
|
||||
plan,
|
||||
origin: request.origin || "api",
|
||||
timeout,
|
||||
},
|
||||
urlTraces,
|
||||
),
|
||||
);
|
||||
let reqSchema = request.schema;
|
||||
reqSchema = await dereferenceSchema(reqSchema);
|
||||
|
||||
try {
|
||||
const results = await Promise.all(scrapePromises);
|
||||
docs.push(...results.filter((doc): doc is Document => doc !== null));
|
||||
} catch (error) {
|
||||
return {
|
||||
success: false,
|
||||
error: error.message,
|
||||
extractId,
|
||||
urlTrace: urlTraces,
|
||||
};
|
||||
}
|
||||
// agent evaluates if the schema or the prompt has an array with big amount of items
|
||||
// it also checks whether the schema has any other properties that are not arrays
|
||||
// if so, it splits the results into 2 types of completions:
|
||||
// 1. the first one is a completion that will extract the array of items
|
||||
// 2. the second one is multiple completions that will extract the items from the array
|
||||
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } =
|
||||
await analyzeSchemaAndPrompt(links, request.schema, request.prompt ?? "");
|
||||
|
||||
// Generate completions
|
||||
const completions = await generateOpenAICompletions(
|
||||
logger.child({ method: "extractService/generateOpenAICompletions" }),
|
||||
{
|
||||
mode: "llm",
|
||||
systemPrompt:
|
||||
(request.systemPrompt ? `${request.systemPrompt}\n` : "") +
|
||||
"Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
|
||||
links.join(", "),
|
||||
prompt: request.prompt,
|
||||
schema: request.schema,
|
||||
},
|
||||
docs.map((x) => buildDocument(x)).join("\n"),
|
||||
undefined,
|
||||
true,
|
||||
);
|
||||
// console.log("\nIs Multi Entity:", isMultiEntity);
|
||||
// console.log("\nMulti Entity Keys:", multiEntityKeys);
|
||||
// console.log("\nReasoning:", reasoning);
|
||||
// console.log("\nKey Indicators:", keyIndicators);
|
||||
|
||||
// Update token usage in traces
|
||||
if (completions.numTokens) {
|
||||
const totalLength = docs.reduce(
|
||||
(sum, doc) => sum + (doc.markdown?.length || 0),
|
||||
0,
|
||||
let rSchema = reqSchema;
|
||||
if (isMultiEntity) {
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(reqSchema, multiEntityKeys)
|
||||
rSchema = singleAnswerSchema;
|
||||
|
||||
|
||||
const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
|
||||
const scrapePromises = links.map((url) => {
|
||||
if (!docsMap.has(url)) {
|
||||
return scrapeDocument(
|
||||
{
|
||||
url,
|
||||
teamId,
|
||||
plan,
|
||||
origin: request.origin || "api",
|
||||
timeout,
|
||||
},
|
||||
urlTraces,
|
||||
)
|
||||
}
|
||||
return docsMap.get(url);
|
||||
})
|
||||
|
||||
let multyEntityDocs = (await Promise.all(scrapePromises)).filter(
|
||||
(doc): doc is Document => doc !== null,
|
||||
);
|
||||
docs.forEach((doc) => {
|
||||
if (doc.metadata?.sourceURL) {
|
||||
const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL);
|
||||
if (trace && trace.contentStats) {
|
||||
trace.contentStats.tokensUsed = Math.floor(
|
||||
((doc.markdown?.length || 0) / totalLength) * completions.numTokens,
|
||||
|
||||
for (const doc of multyEntityDocs) {
|
||||
if (doc?.metadata?.url) {
|
||||
docsMap.set(doc.metadata.url, doc);
|
||||
}
|
||||
}
|
||||
|
||||
// Process docs in chunks with queue style processing
|
||||
const chunkSize = 50;
|
||||
const timeoutCompletion = 45000; // 45 second timeout
|
||||
const chunks: Document[][] = [];
|
||||
|
||||
// Split into chunks
|
||||
for (let i = 0; i < multyEntityDocs.length; i += chunkSize) {
|
||||
chunks.push(multyEntityDocs.slice(i, i + chunkSize));
|
||||
}
|
||||
|
||||
// Process chunks sequentially with timeout
|
||||
for (const chunk of chunks) {
|
||||
const chunkPromises = chunk.map(async (doc) => {
|
||||
try {
|
||||
ajv.compile(multiEntitySchema);
|
||||
|
||||
// Wrap in timeout promise
|
||||
const timeoutPromise = new Promise((resolve) => {
|
||||
setTimeout(() => resolve(null), timeoutCompletion);
|
||||
});
|
||||
|
||||
// // Check if page should be extracted before proceeding
|
||||
const shouldExtractCheck = await generateOpenAICompletions(
|
||||
logger.child({ method: "extractService/checkShouldExtract" }),
|
||||
{
|
||||
mode: "llm",
|
||||
systemPrompt: "You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt.",
|
||||
prompt: `Should the following content be used to extract information for this prompt: "${request.prompt}" User schema is: ${JSON.stringify(multiEntitySchema)}\nReturn only true or false.`,
|
||||
schema: {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"extract": {
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": ["extract"]
|
||||
}
|
||||
},
|
||||
buildDocument(doc),
|
||||
undefined,
|
||||
true
|
||||
);
|
||||
|
||||
if (!shouldExtractCheck.extract["extract"]) {
|
||||
console.log(`Skipping extraction for ${doc.metadata.url} as content is irrelevant`);
|
||||
return null;
|
||||
}
|
||||
// Add a boolean relevance flag (is_content_relevant) to the schema
|
||||
const schemaWithConfidence = {
|
||||
...multiEntitySchema,
|
||||
properties: {
|
||||
...multiEntitySchema.properties,
|
||||
is_content_relevant: {
|
||||
type: "boolean",
|
||||
description: "Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information."
|
||||
}
|
||||
},
|
||||
required: [...(multiEntitySchema.required || []), "is_content_relevant"]
|
||||
};
|
||||
// console.log("schemaWithConfidence", schemaWithConfidence);
|
||||
|
||||
const completionPromise = generateOpenAICompletions(
|
||||
logger.child({ method: "extractService/generateOpenAICompletions" }),
|
||||
{
|
||||
mode: "llm",
|
||||
systemPrompt:
|
||||
(request.systemPrompt ? `${request.systemPrompt}\n` : "") +
|
||||
`Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null. Here are the urls the user provided of which he wants to extract information from: ` +
|
||||
links.join(", "),
|
||||
prompt: request.prompt,
|
||||
schema: multiEntitySchema,
|
||||
},
|
||||
buildDocument(doc),
|
||||
undefined,
|
||||
true,
|
||||
);
|
||||
|
||||
// Race between timeout and completion
|
||||
const multiEntityCompletion = await Promise.race([
|
||||
completionPromise,
|
||||
timeoutPromise
|
||||
]) as Awaited<ReturnType<typeof generateOpenAICompletions>>;
|
||||
|
||||
// console.log(multiEntityCompletion.extract)
|
||||
// if (!multiEntityCompletion.extract?.is_content_relevant) {
|
||||
// console.log(`Skipping extraction for ${doc.metadata.url} as content is not relevant`);
|
||||
// return null;
|
||||
// }
|
||||
|
||||
// Update token usage in traces
|
||||
// if (multiEntityCompletion && multiEntityCompletion.numTokens) {
|
||||
// const totalLength = docs.reduce(
|
||||
// (sum, doc) => sum + (doc.markdown?.length || 0),
|
||||
// 0,
|
||||
// );
|
||||
// docs.forEach((doc) => {
|
||||
// if (doc.metadata?.sourceURL) {
|
||||
// const trace = urlTraces.find(
|
||||
// (t) => t.url === doc.metadata.sourceURL,
|
||||
// );
|
||||
// if (trace && trace.contentStats) {
|
||||
// trace.contentStats.tokensUsed = Math.floor(
|
||||
// ((doc.markdown?.length || 0) / totalLength) *
|
||||
// (multiEntityCompletion?.numTokens || 0),
|
||||
// );
|
||||
// }
|
||||
// }
|
||||
// });
|
||||
// }
|
||||
|
||||
// if (multiEntityCompletion.extract && multiEntityCompletion.extract.extraction_confidence < 3) {
|
||||
// console.log(`Skipping extraction for ${doc.metadata.url} as confidence is too low (${multiEntityCompletion.extract.extraction_confidence})`);
|
||||
// return null;
|
||||
// }
|
||||
|
||||
return multiEntityCompletion.extract;
|
||||
} catch (error) {
|
||||
logger.error(`Failed to process document: ${error}`);
|
||||
return null;
|
||||
}
|
||||
});
|
||||
|
||||
// Wait for current chunk to complete before processing next chunk
|
||||
const chunkResults = await Promise.all(chunkPromises);
|
||||
multiEntityCompletions.push(...chunkResults.filter(result => result !== null));
|
||||
}
|
||||
|
||||
try {
|
||||
multiEntityResult = transformArrayToObject(multiEntitySchema, multiEntityCompletions);
|
||||
multiEntityResult = deduplicateObjectsArray(multiEntityResult);
|
||||
multiEntityResult = mergeNullValObjs(multiEntityResult);
|
||||
// @nick: maybe we can add here a llm that checks if the array probably has a primary key?
|
||||
} catch (error) {
|
||||
logger.error(`Failed to transform array to object: ${error}`);
|
||||
return {
|
||||
success: false,
|
||||
error: "An unexpected error occurred. Please contact help@firecrawl.com for help.",
|
||||
extractId,
|
||||
urlTrace: urlTraces,
|
||||
};
|
||||
}
|
||||
}
|
||||
if (rSchema && Object.keys(rSchema).length > 0 && rSchema.properties && Object.keys(rSchema.properties).length > 0) {
|
||||
// Scrape documents
|
||||
const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
|
||||
let singleAnswerDocs: Document[] = [];
|
||||
|
||||
// let rerank = await rerankLinks(links.map((url) => ({ url })), request.prompt ?? JSON.stringify(request.schema), urlTraces);
|
||||
|
||||
const scrapePromises = links.map((url) => {
|
||||
if (!docsMap.has(url)) {
|
||||
return scrapeDocument(
|
||||
{
|
||||
url,
|
||||
teamId,
|
||||
plan,
|
||||
origin: request.origin || "api",
|
||||
timeout,
|
||||
},
|
||||
urlTraces,
|
||||
);
|
||||
}
|
||||
return docsMap.get(url);
|
||||
});
|
||||
|
||||
try {
|
||||
const results = await Promise.all(scrapePromises);
|
||||
|
||||
for (const doc of results) {
|
||||
if (doc?.metadata?.url) {
|
||||
docsMap.set(doc.metadata.url, doc);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
singleAnswerDocs.push(...results.filter((doc): doc is Document => doc !== null));
|
||||
} catch (error) {
|
||||
return {
|
||||
success: false,
|
||||
error: error.message,
|
||||
extractId,
|
||||
urlTrace: urlTraces,
|
||||
};
|
||||
}
|
||||
|
||||
if (docsMap.size == 0) {
|
||||
// All urls are invalid
|
||||
return {
|
||||
success: false,
|
||||
error: "All provided URLs are invalid. Please check your input and try again.",
|
||||
extractId,
|
||||
urlTrace: request.urlTrace ? urlTraces : undefined,
|
||||
};
|
||||
}
|
||||
|
||||
// Generate completions
|
||||
singleAnswerCompletions = await generateOpenAICompletions(
|
||||
logger.child({ method: "extractService/generateOpenAICompletions" }),
|
||||
{
|
||||
mode: "llm",
|
||||
systemPrompt:
|
||||
(request.systemPrompt ? `${request.systemPrompt}\n` : "") +
|
||||
"Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Return 'null' the property that you don't find the information. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
|
||||
links.join(", "),
|
||||
prompt: request.prompt,
|
||||
schema: rSchema,
|
||||
},
|
||||
singleAnswerDocs.map((x) => buildDocument(x)).join("\n"),
|
||||
undefined,
|
||||
true,
|
||||
);
|
||||
|
||||
singleAnswerResult = singleAnswerCompletions.extract;
|
||||
|
||||
// Update token usage in traces
|
||||
// if (completions && completions.numTokens) {
|
||||
// const totalLength = docs.reduce(
|
||||
// (sum, doc) => sum + (doc.markdown?.length || 0),
|
||||
// 0,
|
||||
// );
|
||||
// docs.forEach((doc) => {
|
||||
// if (doc.metadata?.sourceURL) {
|
||||
// const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL);
|
||||
// if (trace && trace.contentStats) {
|
||||
// trace.contentStats.tokensUsed = Math.floor(
|
||||
// ((doc.markdown?.length || 0) / totalLength) *
|
||||
// (completions?.numTokens || 0),
|
||||
// );
|
||||
// }
|
||||
// }
|
||||
// });
|
||||
// }
|
||||
}
|
||||
|
||||
// Kickoff background crawl for indexing root domains
|
||||
// const rootDomains = new Set(request.urls.map(getRootDomain));
|
||||
// rootDomains.forEach(async url => {
|
||||
// const crawlId = crypto.randomUUID();
|
||||
|
||||
// // Create and save crawl configuration first
|
||||
// const sc: StoredCrawl = {
|
||||
// originUrl: url,
|
||||
// crawlerOptions: {
|
||||
// maxDepth: 15,
|
||||
// limit: 5000,
|
||||
// includePaths: [],
|
||||
// excludePaths: [],
|
||||
// ignoreSitemap: false,
|
||||
// includeSubdomains: true,
|
||||
// allowExternalLinks: false,
|
||||
// allowBackwardLinks: true
|
||||
// },
|
||||
// scrapeOptions: {
|
||||
// formats: ["markdown"],
|
||||
// onlyMainContent: true,
|
||||
// waitFor: 0,
|
||||
// mobile: false,
|
||||
// removeBase64Images: true,
|
||||
// fastMode: false,
|
||||
// parsePDF: true,
|
||||
// skipTlsVerification: false,
|
||||
// },
|
||||
// internalOptions: {
|
||||
// disableSmartWaitCache: true,
|
||||
// isBackgroundIndex: true
|
||||
// },
|
||||
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
||||
// createdAt: Date.now(),
|
||||
// plan: "hobby", // make it a low concurrency
|
||||
// };
|
||||
|
||||
// // Save the crawl configuration
|
||||
// await saveCrawl(crawlId, sc);
|
||||
|
||||
// // Then kick off the job
|
||||
// await _addScrapeJobToBullMQ({
|
||||
// url,
|
||||
// mode: "kickoff" as const,
|
||||
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
||||
// plan: "hobby", // make it a low concurrency
|
||||
// crawlerOptions: sc.crawlerOptions,
|
||||
// scrapeOptions: sc.scrapeOptions,
|
||||
// internalOptions: sc.internalOptions,
|
||||
// origin: "index",
|
||||
// crawl_id: crawlId,
|
||||
// webhook: null,
|
||||
// v1: true,
|
||||
// }, {}, crypto.randomUUID(), 50);
|
||||
// });
|
||||
const finalResult = await mixSchemaObjects(reqSchema, singleAnswerResult, multiEntityResult);
|
||||
|
||||
let linksBilled = links.length * 5;
|
||||
|
||||
@ -214,14 +490,14 @@ export async function performExtraction(
|
||||
success: true,
|
||||
message: "Extract completed",
|
||||
num_docs: 1,
|
||||
docs: completions.extract ?? {},
|
||||
docs: finalResult ?? {},
|
||||
time_taken: (new Date().getTime() - Date.now()) / 1000,
|
||||
team_id: teamId,
|
||||
mode: "extract",
|
||||
url: request.urls.join(", "),
|
||||
scrapeOptions: request,
|
||||
origin: request.origin ?? "api",
|
||||
num_tokens: completions.numTokens ?? 0,
|
||||
num_tokens: 0, // completions?.numTokens ?? 0,
|
||||
}).then(() => {
|
||||
updateExtract(extractId, {
|
||||
status: "completed",
|
||||
@ -234,9 +510,9 @@ export async function performExtraction(
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: completions.extract ?? {},
|
||||
data: finalResult ?? {},
|
||||
extractId,
|
||||
warning: completions.warning,
|
||||
warning: undefined, // TODO FIX
|
||||
urlTrace: request.urlTrace ? urlTraces : undefined,
|
||||
};
|
||||
}
|
||||
|
27
apps/api/src/lib/extract/helpers/deduplicate-objs-array.ts
Normal file
27
apps/api/src/lib/extract/helpers/deduplicate-objs-array.ts
Normal file
@ -0,0 +1,27 @@
|
||||
/**
 * Builds a fingerprint for `value` that does not depend on object key order,
 * so `{a: 1, b: 2}` and `{b: 2, a: 1}` produce the same string.
 */
function canonicalFingerprint(value: unknown): string | undefined {
  return JSON.stringify(value, (_key: string, nested: any) => {
    if (nested === null || typeof nested !== "object" || Array.isArray(nested)) {
      return nested;
    }
    // Null-prototype target so a literal "__proto__" key stays an own property.
    const sorted: Record<string, unknown> = Object.create(null);
    for (const name of Object.keys(nested).sort()) {
      sorted[name] = nested[name];
    }
    return sorted;
  });
}

/**
 * Removes duplicate entries from every array-valued property of `objArray`.
 *
 * Two entries are duplicates when their JSON serialisation is identical after
 * object keys are sorted, so key order does not matter. The first occurrence
 * is kept and the original order is preserved. Non-array values are copied
 * through unchanged. The input is not mutated.
 *
 * @param objArray Object whose values are the arrays to deduplicate.
 * @returns A new object with the same keys and deduplicated arrays.
 */
export function deduplicateObjectsArray(objArray: { [key: string]: any[] }): { [key: string]: any[] } {
  const deduplicatedObjArray: { [key: string]: any[] } = {};

  for (const key in objArray) {
    const value = objArray[key];

    if (!Array.isArray(value)) {
      // Not an array: nothing to deduplicate, copy as is.
      deduplicatedObjArray[key] = value;
      continue;
    }

    const seen = new Set<string | undefined>();
    deduplicatedObjArray[key] = value.filter((item) => {
      const fingerprint = canonicalFingerprint(item);
      if (seen.has(fingerprint)) {
        return false;
      }
      seen.add(fingerprint);
      return true;
    });
  }

  return deduplicatedObjArray;
}
|
10
apps/api/src/lib/extract/helpers/dereference-schema.ts
Normal file
10
apps/api/src/lib/extract/helpers/dereference-schema.ts
Normal file
@ -0,0 +1,10 @@
|
||||
import { dereference } from "@apidevtools/json-schema-ref-parser";
|
||||
|
||||
/**
 * Resolves the `$ref` pointers of a JSON schema by delegating to
 * `dereference` from `@apidevtools/json-schema-ref-parser`.
 *
 * NOTE(review): the library may resolve references in place on the object it
 * is given — confirm before relying on `schema` being left untouched.
 *
 * @param schema The JSON schema to dereference.
 * @returns The dereferenced schema.
 * @throws Re-throws any error raised by `dereference` after logging it.
 */
export async function dereferenceSchema(schema: any): Promise<any> {
  let resolved: any;

  try {
    resolved = await dereference(schema);
  } catch (failure) {
    console.error("Failed to dereference schema:", failure);
    throw failure;
  }

  return resolved;
}
|
26
apps/api/src/lib/extract/helpers/dump-to-file.ts
Normal file
26
apps/api/src/lib/extract/helpers/dump-to-file.ts
Normal file
@ -0,0 +1,26 @@
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
/**
 * Debug helper: writes one line per element of `data` to a file located next
 * to this module (`__dirname`), overwriting any existing file, then logs the
 * file name to the console.
 *
 * @param filename  Name of the file to create inside `__dirname`.
 * @param data      Items to write, one per line.
 * @param formatter Optional renderer for a single item; when omitted each line
 *                  is `"<1-based index>. <JSON of the item>"`.
 */
export function dumpToFile<T>(
  filename: string,
  data: T[],
  formatter?: (item: T, index: number) => string
) {
  const renderLine = formatter
    ? (item: T, index: number) => formatter(item, index)
    : (item: T, index: number) => `${index + 1}. ${JSON.stringify(item)}`;

  const lines = data.map((item, index) => renderLine(item, index));
  const target = path.join(__dirname, filename);

  fs.writeFileSync(target, lines.join('\n'), 'utf8');
  console.log(`Dumped data to ${filename}`);
}
|
143
apps/api/src/lib/extract/helpers/merge-null-val-objs.ts
Normal file
143
apps/api/src/lib/extract/helpers/merge-null-val-objs.ts
Normal file
@ -0,0 +1,143 @@
|
||||
import { deduplicateObjectsArray } from './deduplicate-objs-array';
|
||||
|
||||
/**
 * Normalises the literal string "null" to a real `null`; every other value
 * (including `undefined` and differently-cased strings) is returned untouched.
 */
function unifyValue(val: any): any {
  if (val === "null") {
    return null;
  }
  return val;
}
|
||||
|
||||
/**
 * Returns a shallow copy of `item` in which every top-level value equal to
 * the string "null" has been replaced by a real `null`.
 *
 * Only own enumerable string keys are copied; nested objects are not
 * traversed.
 */
function unifyItemValues<T extends object>(item: T): T {
  const normalized: any = {};

  for (const [field, raw] of Object.entries(item)) {
    normalized[field] = unifyValue(raw);
  }

  return normalized;
}
|
||||
|
||||
/**
 * Decides whether two objects describe the same entity and can be merged.
 *
 * Over the union of both objects' keys, only keys where BOTH sides hold a
 * value (neither `null` nor `undefined`) are compared. The objects are
 * mergeable when at least one such key exists and every one of them is
 * strictly equal (`===`).
 *
 * - Keys where either side is an array are ignored; arrays are merged
 *   separately by the caller.
 * - A key that is absent from one object is treated like `null`, so it does
 *   not block a merge.
 * - Nested objects are compared by reference, so two distinct nested objects
 *   under the same key count as a mismatch.
 *
 * @param obj1 First candidate object.
 * @param obj2 Second candidate object.
 * @returns `true` when the objects share at least one comparable value and
 *          none of the comparable values differ.
 */
function areMergeable(obj1: any, obj2: any): boolean {
  const allKeys = new Set([...Object.keys(obj1), ...Object.keys(obj2)]);
  let comparedValues = 0;

  for (const key of allKeys) {
    const val1 = obj1[key];
    const val2 = obj2[key];

    // Arrays never decide mergeability - they are merged separately.
    if (Array.isArray(val1) || Array.isArray(val2)) {
      continue;
    }

    // A missing (undefined) or null value on either side carries no
    // information to compare against.
    if (val1 == null || val2 == null) {
      continue;
    }

    // One conflicting value is enough to rule the merge out.
    if (val1 !== val2) {
      return false;
    }

    comparedValues++;
  }

  // Require at least one shared, matching value.
  return comparedValues > 0;
}
|
||||
|
||||
/**
 * Concatenates two arrays and drops duplicates, keeping the first occurrence
 * of each element and preserving order.
 *
 * Elements are considered equal when their `JSON.stringify` output is
 * identical. Each element is serialised once and looked up in a `Set`, so the
 * merge is linear in the combined length.
 *
 * @param arr1 Elements that come first in the result.
 * @param arr2 Elements appended after `arr1`.
 * @returns A new array; neither input is mutated.
 */
function mergeArrays(arr1: any[], arr2: any[]): any[] {
  const seen = new Set<string | undefined>();
  const merged: any[] = [];

  for (const item of [...arr1, ...arr2]) {
    const fingerprint = JSON.stringify(item);
    if (seen.has(fingerprint)) {
      continue;
    }
    seen.add(fingerprint);
    merged.push(item);
  }

  return merged;
}
|
||||
|
||||
/**
 * Merges `obj2` into a shallow copy of `obj1`, letting real values win over
 * missing ones.
 *
 * For every own enumerable key of `obj2`:
 * - `null` / `undefined` values are ignored, so they never overwrite a value
 *   already present in `obj1`;
 * - arrays are combined with `mergeArrays` when `obj1` also holds an array,
 *   otherwise a copy of `obj2`'s array is used;
 * - plain objects are merged recursively when `obj1` also holds a non-array
 *   object, otherwise a shallow copy of `obj2`'s object is used;
 * - any other value from `obj2` replaces the one from `obj1`.
 *
 * @param obj1 Base object (not mutated).
 * @param obj2 Object whose non-nullish values take precedence (not mutated).
 * @returns A new merged object.
 */
function mergeObjects(obj1: any, obj2: any): any {
  const result = { ...obj1 };

  for (const key in obj2) {
    // Go through Object.prototype so objects without a usable own
    // `hasOwnProperty` (null prototype, shadowed member) still work.
    if (!Object.prototype.hasOwnProperty.call(obj2, key)) {
      continue;
    }

    const incoming = obj2[key];

    // Nullish values carry no information and must not clobber obj1's value.
    if (incoming == null) {
      continue;
    }

    if (Array.isArray(incoming)) {
      result[key] = Array.isArray(result[key])
        ? mergeArrays(result[key], incoming)
        : [...incoming];
    } else if (typeof incoming === 'object') {
      const current = result[key];
      result[key] =
        typeof current === 'object' && !Array.isArray(current)
          ? mergeObjects(current, incoming)
          : { ...incoming };
    } else {
      // Primitive: obj2's value always wins.
      result[key] = incoming;
    }
  }

  return result;
}
|
||||
|
||||
/**
 * For every array in `objArray`, collapse entries that describe the same
 * record: entries judged compatible by `areMergeable` are folded together with
 * `mergeObjects`, so gaps in one entry are filled from another.
 *
 * Arrays that hold no objects at all (only primitives and/or nulls) are copied
 * through unchanged. If any value in `objArray` is not an array, a warning is
 * logged and the original input is returned untouched.
 */
export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: string]: any[] } {
  const output: { [key: string]: any[] } = {};

  for (const key in objArray) {
    const entries = objArray[key];

    // Bail out entirely on malformed input.
    if (!Array.isArray(entries)) {
      console.warn(`Expected an array at objArray[${key}], but found:`, entries);
      return objArray;
    }

    // Nothing to merge when the array holds no objects.
    const holdsNoObjects = entries.every(item => item === null || typeof item !== 'object');
    if (holdsNoObjects) {
      output[key] = [...entries];
      continue;
    }

    const accumulated: any[] = [];
    for (const candidate of entries.map(unifyItemValues)) {
      // First already-collected record this candidate is compatible with.
      const targetIndex = accumulated.findIndex(existing => areMergeable(existing, candidate));

      if (targetIndex === -1) {
        accumulated.push({ ...candidate });
      } else {
        accumulated[targetIndex] = mergeObjects(accumulated[targetIndex], candidate);
      }
    }

    // Final deduplication pass
    output[key] = deduplicateObjectsArray({ [key]: accumulated })[key];
  }

  return output;
}
|
35
apps/api/src/lib/extract/helpers/mix-schema-objs.ts
Normal file
35
apps/api/src/lib/extract/helpers/mix-schema-objs.ts
Normal file
@ -0,0 +1,35 @@
|
||||
/**
 * Combine the single-answer result and the multi-entity result into one object
 * that follows `finalSchema`.
 *
 * For each property declared in the schema, in order of precedence:
 *  1. an `object` property with nested `properties` is merged recursively;
 *  2. an `array` property whose multi-entity value is an array takes that
 *     value, flattened by one level;
 *  3. otherwise the single-answer value is used if that result owns the key;
 *  4. otherwise the multi-entity value is used if that result owns the key.
 * Properties found in neither result are omitted.
 *
 * A `null` / `undefined` result on either side is treated as an empty object,
 * so one missing extraction pass does not throw.
 *
 * @param finalSchema JSON schema describing the desired output shape.
 * @param singleAnswerResult Result of the single-answer extraction (may be nullish).
 * @param multiEntityResult Result of the multi-entity extraction (may be nullish).
 * @returns A new object containing the merged values.
 */
export async function mixSchemaObjects(
  finalSchema: any,
  singleAnswerResult: any,
  multiEntityResult: any
) {
  // Safe own-property check that tolerates nullish targets and data objects
  // that shadow or lack `hasOwnProperty`.
  const hasOwn = (target: any, key: string): boolean =>
    target != null && Object.prototype.hasOwnProperty.call(target, key);

  // Recursive helper function to merge results based on schema
  function mergeResults(schema: any, singleResult: any, multiResult: any) {
    const single = singleResult ?? {};
    const multi = multiResult ?? {};
    const result: any = {};

    for (const key in schema?.properties) {
      const propertySchema = schema.properties[key];

      if (propertySchema?.type === 'object' && propertySchema.properties) {
        // If the property is an object, recursively merge its properties
        result[key] = mergeResults(
          propertySchema,
          single[key] || {},
          multi[key] || {}
        );
      } else if (propertySchema?.type === 'array' && Array.isArray(multi[key])) {
        // If the property is an array, flatten the arrays from multiResult
        result[key] = multi[key].flat();
      } else if (hasOwn(single, key)) {
        result[key] = single[key];
      } else if (hasOwn(multi, key)) {
        result[key] = multi[key];
      }
    }

    return result;
  }

  return mergeResults(finalSchema, singleAnswerResult, multiEntityResult);
}
|
44
apps/api/src/lib/extract/helpers/spread-schemas.ts
Normal file
44
apps/api/src/lib/extract/helpers/spread-schemas.ts
Normal file
@ -0,0 +1,44 @@
|
||||
/**
 * Split a JSON schema into two schemas by top-level property name.
 *
 * Properties named in `keys` are moved into `multiEntitySchema`; everything
 * else stays in `singleAnswerSchema` (a shallow copy of `schema`). When the
 * input declares a `required` array, it is kept in sync with the split: moved
 * keys are removed from `singleAnswerSchema.required` and the ones that were
 * required are listed in `multiEntitySchema.required`, so neither schema
 * requires a property it does not define.
 *
 * Object-typed properties whose own `properties` map is empty are removed
 * (and dropped from the parent's `required` list). A schema left with no
 * properties at all is returned as `{}`.
 *
 * Note: the top-level `schema` object and its `required` array are not
 * modified, but nested property schemas are shared with the input, so pruning
 * inside nested objects is visible to the caller.
 *
 * @param schema Source JSON schema with a `properties` map.
 * @param keys Top-level property names to move into the multi-entity schema.
 */
export async function spreadSchemas(schema: any, keys: string[]): Promise<{
  singleAnswerSchema: any;
  multiEntitySchema: any;
}> {
  const originalRequired: string[] | undefined = Array.isArray(schema.required)
    ? schema.required
    : undefined;

  let singleAnswerSchema = { ...schema, properties: { ...schema.properties } };
  let multiEntitySchema: any = { type: "object", properties: {} };
  if (originalRequired) {
    multiEntitySchema.required = [];
  }

  keys.forEach((key) => {
    if (singleAnswerSchema.properties[key]) {
      multiEntitySchema.properties[key] = singleAnswerSchema.properties[key];
      delete singleAnswerSchema.properties[key];

      // Carry the "required" flag along with the property.
      if (originalRequired && originalRequired.includes(key)) {
        multiEntitySchema.required.push(key);
      }
    }
  });

  if (originalRequired) {
    // Fresh array: never mutate the caller's `required` list.
    singleAnswerSchema.required = originalRequired.filter(
      (name) => !Object.prototype.hasOwnProperty.call(multiEntitySchema.properties, name),
    );
  }

  // Delete object properties that have an empty `properties` map, descending
  // into non-empty nested objects.
  const deleteEmptyProperties = (node: any) => {
    for (const key in node.properties) {
      const child = node.properties[key];
      if (child.properties && Object.keys(child.properties).length === 0) {
        delete node.properties[key];
        if (Array.isArray(node.required)) {
          node.required = node.required.filter((name: string) => name !== key);
        }
      } else if (child.properties) {
        deleteEmptyProperties(child);
      }
    }
  };

  deleteEmptyProperties(singleAnswerSchema);
  deleteEmptyProperties(multiEntitySchema);

  // If singleAnswerSchema has no properties left, return an empty object
  if (Object.keys(singleAnswerSchema.properties).length === 0) {
    singleAnswerSchema = {};
  }

  if (Object.keys(multiEntitySchema.properties).length === 0) {
    multiEntitySchema = {};
  }

  return {
    singleAnswerSchema,
    multiEntitySchema,
  };
}
|
132
apps/api/src/lib/extract/helpers/transform-array-to-obj.ts
Normal file
132
apps/api/src/lib/extract/helpers/transform-array-to-obj.ts
Normal file
@ -0,0 +1,132 @@
|
||||
import isEqual from 'lodash/isEqual';
|
||||
|
||||
/**
 * Collapse a list of per-source extraction results into a single object shaped
 * like `originalSchema`.
 *
 * - An empty schema yields `{}`.
 * - If the schema contains no array-typed property (searched depth-first
 *   through object-typed properties), the items are folded key by key: a key
 *   is taken from an item while the accumulated value is still falsy, and two
 *   object values are shallow-merged.
 * - Otherwise the first array-typed property found becomes the "array key".
 *   Its entries are gathered from every item into one de-duplicated array,
 *   sibling properties are copied from the first item that has them, and
 *   sibling array properties are unioned as well.
 *
 * Null / undefined items are skipped. Arrays taken from the input are copied
 * before being extended, so `arrayData` itself is never mutated.
 *
 * @param originalSchema JSON schema describing the target object.
 * @param arrayData One result object per source.
 * @returns The merged object.
 * @throws Error if the array property has no `items` schema.
 */
export function transformArrayToObject(
  originalSchema: any,
  arrayData: any[]
): any {
  if (Object.keys(originalSchema).length === 0) {
    return {};
  }

  const transformedResult: any = {};

  const hasOwn = (target: any, key: string): boolean =>
    target != null && Object.prototype.hasOwnProperty.call(target, key);

  // Function to find the array key in a nested schema; returns a dotted path
  // such as "parent.child.items", or null when no array property exists.
  function findArrayKey(schema: any): string | null {
    for (const key in schema.properties) {
      if (schema.properties[key].type === 'array') {
        return key;
      } else if (schema.properties[key].type === 'object') {
        const nestedKey = findArrayKey(schema.properties[key]);
        if (nestedKey) {
          return `${key}.${nestedKey}`;
        }
      }
    }
    return null;
  }

  const arrayKeyPath = findArrayKey(originalSchema);
  if (!arrayKeyPath) {
    // No arrays anywhere: fold the items into one object.
    return arrayData.reduce((acc, item) => {
      for (const key in item) {
        if (!acc[key]) {
          acc[key] = item[key];
        } else if (typeof acc[key] === 'object' && typeof item[key] === 'object') {
          acc[key] = { ...acc[key], ...item[key] };
        }
      }
      return acc;
    }, {});
  }

  const arrayKeyParts = arrayKeyPath.split('.');
  const arrayKey = arrayKeyParts.pop();
  if (!arrayKey) {
    throw new Error("Array key not found in schema");
  }

  const parentSchema = arrayKeyParts.reduce((schema, key) => schema.properties[key], originalSchema);
  const itemSchema = parentSchema.properties[arrayKey].items;
  if (!itemSchema) {
    throw new Error("Item schema not found for array key");
  }

  // Initialize the array in the transformed result, creating the chain of
  // parent objects that leads to it.
  let currentLevel = transformedResult;
  arrayKeyParts.forEach(part => {
    if (!currentLevel[part]) {
      currentLevel[part] = {};
    }
    currentLevel = currentLevel[part];
  });
  currentLevel[arrayKey] = [];

  // Helper function to check if an object is already in the array
  function isDuplicateObject(array: any[], obj: any): boolean {
    return array.some(existingItem => isEqual(existingItem, obj));
  }

  // Helper function to validate if an object follows the schema.
  // NOTE(review): this compares `typeof value` with the schema's `type`
  // string, so it can only succeed for properties whose declared type is a
  // valid `typeof` result ('string', 'number', 'boolean', 'object'); e.g.
  // 'integer' or 'array' properties never validate. Such entries are still
  // picked up unmodified by the array-merging loop below.
  function isValidObject(obj: any, schema: any): boolean {
    return Object.keys(schema.properties).every(key => {
      return hasOwn(obj, key) && typeof obj[key] === schema.properties[key].type;
    });
  }

  // Iterate over each item in the arrayData
  arrayData.forEach(item => {
    // A missing result (e.g. a source that produced nothing) has nothing to contribute.
    if (item == null) {
      return;
    }

    // Walk down to the object that directly holds the array.
    let currentItem = item;
    arrayKeyParts.forEach(part => {
      if (currentItem[part]) {
        currentItem = currentItem[part];
      }
    });

    // Copy sibling properties from the first item that provides them. Arrays
    // are cloned so later pushes extend our copy, not the caller's data.
    for (const key in parentSchema.properties) {
      if (key !== arrayKey && hasOwn(currentItem, key) && !hasOwn(currentLevel, key)) {
        const value = currentItem[key];
        currentLevel[key] = Array.isArray(value) ? [...value] : value;
      }
    }

    // Ensure that the currentItem[arrayKey] is an array before mapping
    if (Array.isArray(currentItem[arrayKey])) {
      currentItem[arrayKey].forEach((subItem: any) => {
        if (typeof subItem === 'object' && subItem !== null && isValidObject(subItem, itemSchema)) {
          // For arrays of objects, add only unique objects, restricted to the
          // keys declared in the item schema.
          const transformedItem: any = {};
          let hasValidData = false;

          for (const key in itemSchema.properties) {
            if (hasOwn(subItem, key) && subItem[key] !== undefined) {
              transformedItem[key] = subItem[key];
              hasValidData = true;
            }
          }

          if (hasValidData && !isDuplicateObject(currentLevel[arrayKey], transformedItem)) {
            currentLevel[arrayKey].push(transformedItem);
          }
        }
      });
    } else {
      console.warn(`Expected an array at ${arrayKey}, but found:`, currentItem[arrayKey]);
    }

    // Handle merging of array properties (this includes `arrayKey` itself,
    // which is how primitive entries end up in the result).
    for (const key in parentSchema.properties) {
      if (parentSchema.properties[key].type === 'array' && Array.isArray(currentItem[key])) {
        if (!currentLevel[key]) {
          currentLevel[key] = [];
        }
        currentItem[key].forEach((value: any) => {
          // De-duplicate against the array being filled, not the main array.
          if (!currentLevel[key].includes(value) && !isDuplicateObject(currentLevel[key], value)) {
            currentLevel[key].push(value);
          }
        });
      }
    }
  });

  return transformedResult;
}
|
@ -90,8 +90,8 @@ export async function indexPage({
|
||||
const metadata: PageMetadata = {
|
||||
url: normalizedUrl,
|
||||
originUrl: normalizeUrl(originUrl),
|
||||
title: document.metadata.title,
|
||||
description: document.metadata.description,
|
||||
title: document.metadata.title ?? document.metadata.ogTitle ?? "",
|
||||
description: document.metadata.description ?? document.metadata.ogDescription ?? "",
|
||||
crawlId,
|
||||
teamId,
|
||||
markdown: trimmedMarkdown,
|
||||
@ -126,8 +126,8 @@ export async function indexPage({
|
||||
export async function searchSimilarPages(
|
||||
query: string,
|
||||
originUrl?: string,
|
||||
limit: number = 10,
|
||||
) {
|
||||
limit: number = 1000
|
||||
): Promise<any[]> {
|
||||
try {
|
||||
const index = pinecone.index(INDEX_NAME);
|
||||
|
||||
|
@ -4,6 +4,8 @@ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { logger } from "../logger";
|
||||
import { CohereClient } from "cohere-ai";
|
||||
import { extractConfig } from "./config";
|
||||
import { searchSimilarPages } from "./index/pinecone";
|
||||
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
|
||||
|
||||
const cohere = new CohereClient({
|
||||
token: process.env.COHERE_API_KEY,
|
||||
@ -47,6 +49,7 @@ export async function rerankLinks(
|
||||
searchQuery: string,
|
||||
urlTraces: URLTrace[],
|
||||
): Promise<MapDocument[]> {
|
||||
// console.log("Going to rerank links");
|
||||
const mappedLinksRerank = mappedLinks.map(
|
||||
(x) => `url: ${x.url}, title: ${x.title}, description: ${x.description}`,
|
||||
);
|
||||
@ -54,35 +57,35 @@ export async function rerankLinks(
|
||||
const linksAndScores = await performRanking(
|
||||
mappedLinksRerank,
|
||||
mappedLinks.map((l) => l.url),
|
||||
searchQuery,
|
||||
searchQuery
|
||||
);
|
||||
|
||||
// First try with high threshold
|
||||
let filteredLinks = filterAndProcessLinks(
|
||||
mappedLinks,
|
||||
linksAndScores,
|
||||
extractConfig.INITIAL_SCORE_THRESHOLD,
|
||||
extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
|
||||
);
|
||||
|
||||
// If we don't have enough high-quality links, try with lower threshold
|
||||
if (filteredLinks.length < extractConfig.MIN_REQUIRED_LINKS) {
|
||||
if (filteredLinks.length < extractConfig.RERANKING.MIN_REQUIRED_LINKS) {
|
||||
logger.info(
|
||||
`Only found ${filteredLinks.length} links with score > ${extractConfig.INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`,
|
||||
`Only found ${filteredLinks.length} links with score > ${extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE}. Trying lower threshold...`,
|
||||
);
|
||||
filteredLinks = filterAndProcessLinks(
|
||||
mappedLinks,
|
||||
linksAndScores,
|
||||
extractConfig.FALLBACK_SCORE_THRESHOLD,
|
||||
extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE,
|
||||
);
|
||||
|
||||
if (filteredLinks.length === 0) {
|
||||
// If still no results, take top N results regardless of score
|
||||
logger.warn(
|
||||
`No links found with score > ${extractConfig.FALLBACK_SCORE_THRESHOLD}. Taking top ${extractConfig.MIN_REQUIRED_LINKS} results.`,
|
||||
`No links found with score > ${extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE}. Taking top ${extractConfig.RERANKING.MIN_REQUIRED_LINKS} results.`,
|
||||
);
|
||||
filteredLinks = linksAndScores
|
||||
.sort((a, b) => b.score - a.score)
|
||||
.slice(0, extractConfig.MIN_REQUIRED_LINKS)
|
||||
.slice(0, extractConfig.RERANKING.MIN_REQUIRED_LINKS)
|
||||
.map((x) => mappedLinks.find((link) => link.url === x.link))
|
||||
.filter(
|
||||
(x): x is MapDocument =>
|
||||
@ -104,8 +107,8 @@ export async function rerankLinks(
|
||||
}
|
||||
});
|
||||
|
||||
const rankedLinks = filteredLinks.slice(0, extractConfig.MAX_RANKING_LIMIT);
|
||||
|
||||
const rankedLinks = filteredLinks.slice(0, extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE);
|
||||
|
||||
// Mark URLs that will be used in completion
|
||||
rankedLinks.forEach((link) => {
|
||||
const trace = urlTraces.find((t) => t.url === link.url);
|
||||
@ -115,14 +118,16 @@ export async function rerankLinks(
|
||||
});
|
||||
|
||||
// Mark URLs that were dropped due to ranking limit
|
||||
filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach((link) => {
|
||||
const trace = urlTraces.find((t) => t.url === link.url);
|
||||
filteredLinks.slice(extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE).forEach(link => {
|
||||
const trace = urlTraces.find(t => t.url === link.url);
|
||||
if (trace) {
|
||||
trace.warning = "Excluded due to ranking limit";
|
||||
trace.usedInCompletion = false;
|
||||
}
|
||||
});
|
||||
|
||||
// console.log("Reranked links: ", rankedLinks.length);
|
||||
|
||||
return rankedLinks;
|
||||
}
|
||||
|
||||
@ -144,3 +149,108 @@ function filterAndProcessLinks(
|
||||
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Ask the LLM which of `mappedLinks` are relevant to `searchQuery`.
 *
 * The links are split into chunks of `chunkSize` and all chunks are sent in
 * parallel. Each chunk request races against a `TIMEOUT_MS` timer and is
 * attempted up to `MAX_RETRIES + 1` times; a chunk that keeps timing out or
 * failing contributes no links. A response without `extract.relevantLinks`
 * is treated as "no relevant links" and is not retried.
 *
 * The returned links are ordered by descending relevance score and mapped
 * back to the original `MapDocument` objects; URLs the model returned that are
 * not in `mappedLinks` are dropped, and each link appears at most once.
 *
 * @param mappedLinks Candidate links.
 * @param searchQuery Query the links are scored against.
 * @param urlTraces Currently unused by this function.
 * @returns The relevant subset of `mappedLinks`, most relevant first.
 */
export async function rerankLinksWithLLM(
  mappedLinks: MapDocument[],
  searchQuery: string,
  urlTraces: URLTrace[],
): Promise<MapDocument[]> {
  const chunkSize = 100;
  const chunks: MapDocument[][] = [];
  const TIMEOUT_MS = 20000;
  const MAX_RETRIES = 2;

  // Split mappedLinks into chunks of `chunkSize`
  for (let i = 0; i < mappedLinks.length; i += chunkSize) {
    chunks.push(mappedLinks.slice(i, i + chunkSize));
  }

  const schema = {
    type: "object",
    properties: {
      relevantLinks: {
        type: "array",
        items: {
          type: "object",
          properties: {
            url: { type: "string" },
            relevanceScore: { type: "number" }
          },
          required: ["url", "relevanceScore"]
        }
      }
    },
    required: ["relevantLinks"]
  };

  const results = await Promise.all(
    chunks.map(async (chunk, chunkIndex) => {
      const linksContent = chunk.map(link =>
        `URL: ${link.url}${link.title ? `\nTitle: ${link.title}` : ''}${link.description ? `\nDescription: ${link.description}` : ''}`
      ).join("\n\n");

      for (let retry = 0; retry <= MAX_RETRIES; retry++) {
        let timeoutHandle: ReturnType<typeof setTimeout> | undefined;

        try {
          // Resolves to null after TIMEOUT_MS so the race below can detect a timeout.
          const timeoutPromise = new Promise<null>((resolve) => {
            timeoutHandle = setTimeout(() => resolve(null), TIMEOUT_MS);
          });

          const completionPromise = generateOpenAICompletions(
            logger.child({ method: "rerankLinksWithLLM", chunk: chunkIndex + 1, retry }),
            {
              mode: "llm",
              systemPrompt: "You are a search relevance expert. Analyze the provided URLs and their content to determine their relevance to the search query. For each URL, assign a relevance score between 0 and 1, where 1 means highly relevant and 0 means not relevant at all. Only include URLs that are actually relevant to the query.",
              prompt: `Given these URLs and their content, identify which ones are relevant to the search query: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the search query. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relvancy score of 0.8+.`,
              schema: schema
            },
            linksContent,
            undefined,
            true
          );

          const completion = await Promise.race([completionPromise, timeoutPromise]);

          if (!completion) {
            // Timed out: try again (if attempts remain).
            continue;
          }

          if (!completion.extract?.relevantLinks) {
            return [];
          }

          return completion.extract.relevantLinks;
        } catch (error) {
          logger.warn(`Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`, { error });
          if (retry === MAX_RETRIES) {
            return [];
          }
        } finally {
          // Always release the timer so finished attempts do not leave a
          // pending 20s timeout behind.
          clearTimeout(timeoutHandle);
        }
      }
      return [];
    })
  );

  // Flatten results and sort by relevance score
  const flattenedResults = results.flat().sort((a, b) => b.relevanceScore - a.relevanceScore);

  // Index the candidates by URL once; the first document for a URL wins,
  // matching what a linear `find` would return.
  const linksByUrl = new Map<MapDocument["url"], MapDocument>();
  for (const link of mappedLinks) {
    if (!linksByUrl.has(link.url)) {
      linksByUrl.set(link.url, link);
    }
  }

  // Map back to MapDocument format, keeping only known links and emitting each
  // one a single time even if the model repeated its URL.
  const emitted = new Set<MapDocument["url"]>();
  const relevantLinks: MapDocument[] = [];
  for (const result of flattenedResults) {
    const link = linksByUrl.get(result.url);
    if (link === undefined || emitted.has(link.url)) {
      continue;
    }
    emitted.add(link.url);
    relevantLinks.push(link);
  }

  return relevantLinks;
}
|
@ -5,13 +5,13 @@ import { removeDuplicateUrls } from "../validateUrl";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { generateBasicCompletion } from "../LLM-extraction";
|
||||
import { buildRefrasedPrompt } from "./build-prompts";
|
||||
import { logger } from "../logger";
|
||||
import { rerankLinks } from "./reranker";
|
||||
import { rerankLinksWithLLM } from "./reranker";
|
||||
import { extractConfig } from "./config";
|
||||
|
||||
interface ProcessUrlOptions {
|
||||
url: string;
|
||||
prompt?: string;
|
||||
schema?: any;
|
||||
teamId: string;
|
||||
plan: PlanType;
|
||||
allowExternalLinks?: boolean;
|
||||
@ -50,9 +50,13 @@ export async function processUrl(
|
||||
let rephrasedPrompt = options.prompt;
|
||||
if (options.prompt) {
|
||||
rephrasedPrompt =
|
||||
(await generateBasicCompletion(
|
||||
buildRefrasedPrompt(options.prompt, baseUrl),
|
||||
))?.replace('"', '').replace("/", "") ?? options.prompt;
|
||||
(
|
||||
await generateBasicCompletion(
|
||||
buildRefrasedPrompt(options.prompt, baseUrl),
|
||||
)
|
||||
)
|
||||
?.replace('"', "")
|
||||
.replace("/", "") ?? options.prompt;
|
||||
}
|
||||
|
||||
try {
|
||||
@ -148,17 +152,64 @@ export async function processUrl(
|
||||
}
|
||||
|
||||
// Limit initial set of links (1000)
|
||||
mappedLinks = mappedLinks.slice(0, extractConfig.MAX_INITIAL_RANKING_LIMIT);
|
||||
mappedLinks = mappedLinks.slice(
|
||||
0,
|
||||
extractConfig.RERANKING.MAX_INITIAL_RANKING_LIMIT,
|
||||
);
|
||||
|
||||
// Perform reranking if prompt is provided
|
||||
// Perform reranking using either prompt or schema
|
||||
let searchQuery = "";
|
||||
if (options.prompt) {
|
||||
const searchQuery = options.allowExternalLinks
|
||||
searchQuery = options.allowExternalLinks
|
||||
? `${options.prompt} ${urlWithoutWww}`
|
||||
: `${options.prompt} site:${urlWithoutWww}`;
|
||||
} else if (options.schema) {
|
||||
// Generate search query from schema using basic completion
|
||||
try {
|
||||
const schemaString = JSON.stringify(options.schema, null, 2);
|
||||
const prompt = `Given this JSON schema, generate a natural language search query that would help find relevant pages containing this type of data. Focus on the key properties and their descriptions and keep it very concise. Schema: ${schemaString}`;
|
||||
|
||||
mappedLinks = await rerankLinks(mappedLinks, searchQuery, urlTraces);
|
||||
searchQuery =
|
||||
(await generateBasicCompletion(prompt)) ??
|
||||
"Extract the data according to the schema: " + schemaString;
|
||||
|
||||
if (options.allowExternalLinks) {
|
||||
searchQuery = `${searchQuery} ${urlWithoutWww}`;
|
||||
} else {
|
||||
searchQuery = `${searchQuery} site:${urlWithoutWww}`;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error generating search query from schema:", error);
|
||||
searchQuery = urlWithoutWww; // Fallback to just the domain
|
||||
}
|
||||
} else {
|
||||
searchQuery = urlWithoutWww;
|
||||
}
|
||||
|
||||
// dumpToFile(
|
||||
// "mapped-links.txt",
|
||||
// mappedLinks,
|
||||
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
|
||||
// );
|
||||
|
||||
mappedLinks = await rerankLinksWithLLM(mappedLinks, searchQuery, urlTraces);
|
||||
|
||||
// 2nd Pass, useful for when the first pass returns too many links
|
||||
if (mappedLinks.length > 100) {
|
||||
mappedLinks = await rerankLinksWithLLM(
|
||||
mappedLinks,
|
||||
searchQuery,
|
||||
urlTraces,
|
||||
);
|
||||
}
|
||||
|
||||
// dumpToFile(
|
||||
// "llm-links.txt",
|
||||
// mappedLinks,
|
||||
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
|
||||
// );
|
||||
// Remove title and description from mappedLinks
|
||||
mappedLinks = mappedLinks.map((link) => ({ url: link.url }));
|
||||
return mappedLinks.map((x) => x.url);
|
||||
} catch (error) {
|
||||
trace.status = "error";
|
||||
|
@ -53,29 +53,26 @@ async function performRanking(
|
||||
// Generate embeddings for the search query
|
||||
const queryEmbedding = await getEmbedding(sanitizedQuery);
|
||||
|
||||
// Generate embeddings for each link and calculate similarity
|
||||
// Generate embeddings for each link and calculate similarity in parallel
|
||||
const linksAndScores = await Promise.all(
|
||||
linksWithContext.map(async (linkWithContext, index) => {
|
||||
try {
|
||||
const linkEmbedding = await getEmbedding(linkWithContext);
|
||||
const score = cosineSimilarity(queryEmbedding, linkEmbedding);
|
||||
|
||||
return {
|
||||
link: links[index],
|
||||
linkWithContext,
|
||||
score,
|
||||
originalIndex: index,
|
||||
};
|
||||
} catch (err) {
|
||||
// If embedding fails for a link, return with score 0
|
||||
return {
|
||||
linksWithContext.map((linkWithContext, index) =>
|
||||
getEmbedding(linkWithContext)
|
||||
.then(linkEmbedding => {
|
||||
const score = cosineSimilarity(queryEmbedding, linkEmbedding);
|
||||
return {
|
||||
link: links[index],
|
||||
linkWithContext,
|
||||
score,
|
||||
originalIndex: index,
|
||||
};
|
||||
})
|
||||
.catch(() => ({
|
||||
link: links[index],
|
||||
linkWithContext,
|
||||
score: 0,
|
||||
originalIndex: index,
|
||||
};
|
||||
}
|
||||
}),
|
||||
}))
|
||||
)
|
||||
);
|
||||
|
||||
// Sort links based on similarity scores while preserving original order for equal scores
|
||||
|
30
apps/api/src/scraper/WebScraper/sitemap-index.ts
Normal file
30
apps/api/src/scraper/WebScraper/sitemap-index.ts
Normal file
@ -0,0 +1,30 @@
|
||||
import { logger } from "../../lib/logger";
|
||||
import { normalizeUrlOnlyHostname } from "../../lib/canonical-url";
|
||||
import { supabase_service } from "../../services/supabase";
|
||||
|
||||
/**
|
||||
* Query the sitemap index for a given URL
|
||||
* @param url The URL to query
|
||||
* @returns A list of URLs found in the sitemap index aggregated from all sitemaps
|
||||
*/
|
||||
import { withAuth } from "../../lib/withAuth";
|
||||
|
||||
/**
 * Query the sitemap index (`crawl_maps` table) for the hostname of `url`.
 *
 * All rows whose `origin_url` equals the normalized hostname are read and
 * their `urls` lists are combined into one list. Rows without a `urls` value
 * are ignored and each URL is returned once, in first-seen order.
 *
 * @param url The URL to query.
 * @returns The unique URLs stored for that origin, or `[]` if the query fails
 *          (the error is logged).
 */
async function querySitemapIndexFunction(url: string) {
  const originUrl = normalizeUrlOnlyHostname(url);

  const { data, error } = await supabase_service
    .from("crawl_maps")
    .select("urls")
    .eq("origin_url", originUrl);

  if (error) {
    logger.error("(sitemap-index) Error querying the index", { error });
    return [];
  }

  // Flatten one level, skipping rows whose `urls` is null/undefined.
  const allUrls = (data ?? []).flatMap((entry) => entry.urls ?? []);

  // Several rows can list the same URL; a Set keeps the first occurrence.
  return [...new Set(allUrls)];
}
|
||||
|
||||
// Public entry point: `querySitemapIndexFunction` wrapped by `withAuth`, with `[]` as the second argument.
// NOTE(review): presumably `[]` is the fallback returned when the wrapped call is not allowed to run
// (e.g. DB access disabled) — confirm against lib/withAuth.
export const querySitemapIndex = withAuth(querySitemapIndexFunction, []);
|
@ -173,7 +173,7 @@ export async function generateOpenAICompletions(
|
||||
? {
|
||||
type: "json_schema",
|
||||
json_schema: {
|
||||
name: "websiteContent",
|
||||
name: "schema",
|
||||
schema: schema,
|
||||
strict: true,
|
||||
},
|
||||
|
@ -89,12 +89,10 @@ const runningJobs: Set<string> = new Set();
|
||||
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
|
||||
if (await finishCrawl(job.data.crawl_id)) {
|
||||
(async () => {
|
||||
const originUrl = sc.originUrl
|
||||
? normalizeUrlOnlyHostname(sc.originUrl)
|
||||
: undefined;
|
||||
// Get all visited URLs from Redis
|
||||
const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
|
||||
// Get all visited unique URLs from Redis
|
||||
const visitedUrls = await redisConnection.smembers(
|
||||
"crawl:" + job.data.crawl_id + ":visited",
|
||||
"crawl:" + job.data.crawl_id + ":visited_unique",
|
||||
);
|
||||
// Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
|
||||
if (
|
||||
@ -353,7 +351,15 @@ const processExtractJobInternal = async (
|
||||
await job.moveToCompleted(result, token, false);
|
||||
return result;
|
||||
} else {
|
||||
throw new Error(result.error || "Unknown error during extraction");
|
||||
// throw new Error(result.error || "Unknown error during extraction");
|
||||
|
||||
await job.moveToCompleted(result, token, false);
|
||||
await updateExtract(job.data.extractId, {
|
||||
status: "failed",
|
||||
error: result.error ?? "Unknown error, please contact help@firecrawl.com. Extract id: " + job.data.extractId,
|
||||
});
|
||||
|
||||
return result;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`🚫 Job errored ${job.id} - ${error}`, { error });
|
||||
@ -372,9 +378,10 @@ const processExtractJobInternal = async (
|
||||
error:
|
||||
error.error ??
|
||||
error ??
|
||||
"Unknown error, please contact help@firecrawl.dev. Extract id: " +
|
||||
"Unknown error, please contact help@firecrawl.com. Extract id: " +
|
||||
job.data.extractId,
|
||||
});
|
||||
return { success: false, error: error.error ?? error ?? "Unknown error, please contact help@firecrawl.com. Extract id: " + job.data.extractId };
|
||||
// throw error;
|
||||
} finally {
|
||||
clearInterval(extendLockInterval);
|
||||
|
Loading…
x
Reference in New Issue
Block a user