Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
Synced 2025-08-05 15:40:37 +08:00
Commit 6c33b978f3
@@ -55,7 +55,7 @@
     "@bull-board/api": "^5.20.5",
     "@bull-board/express": "^5.20.5",
     "@devil7softwares/pos": "^1.0.2",
-    "@dqbd/tiktoken": "^1.0.16",
+    "@dqbd/tiktoken": "^1.0.17",
     "@nangohq/node": "^0.40.8",
     "@sentry/cli": "^2.33.1",
     "@sentry/node": "^8.26.0",
@@ -73,6 +73,7 @@
     "cacheable-lookup": "^6.1.0",
     "cheerio": "^1.0.0-rc.12",
     "cohere": "^1.1.1",
+    "cohere-ai": "^7.14.0",
     "cors": "^2.8.5",
     "cron-parser": "^4.9.0",
     "date-fns": "^3.6.0",
apps/api/pnpm-lock.yaml (generated, 1316 changed lines): file diff suppressed because it is too large.
@@ -46,4 +46,32 @@ content-type: application/json

@batchScrapeId = {{batchScrape.response.body.$.id}}

# @name batchScrapeStatus
GET {{baseUrl}}/v1/crawl/{{batchScrapeId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}


### Map Website
# @name map
POST {{baseUrl}}/v1/map HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
  "url": "firecrawl.dev",
  "sitemapOnly": true
}

### Extract
# @name extract
POST {{baseUrl}}/v1/extract HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
  "urls": ["firecrawl.dev"],
  "prompt": "What is the title, description and main product of the page?",
  "schema": {
    "title": "string",
    "description": "string",
    "mainProduct": "string"
  }
}
apps/api/src/__tests__/e2e_extract/index.test.ts (new file, 249 lines)
@@ -0,0 +1,249 @@
import request from "supertest";
import dotenv from "dotenv";
import {
  FirecrawlCrawlResponse,
  FirecrawlCrawlStatusResponse,
  FirecrawlScrapeResponse,
} from "../../types";

dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";

describe("E2E Tests for Extract API Routes", () => {
  it.concurrent("should return authors of blog posts on firecrawl.dev", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["https://firecrawl.dev/*"],
        prompt: "Who are the authors of the blog posts?",
        schema: {
          type: "object",
          properties: { authors: { type: "array", items: { type: "string" } } },
        },
      });

    console.log(response.body);
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    expect(response.body.data).toHaveProperty("authors");

    let gotItRight = 0;
    for (const author of response.body.data?.authors) {
      if (author.includes("Caleb Peffer")) gotItRight++;
      if (author.includes("Gergő Móricz")) gotItRight++;
      if (author.includes("Eric Ciarla")) gotItRight++;
      if (author.includes("Nicolas Camara")) gotItRight++;
      if (author.includes("Jon")) gotItRight++;
      if (author.includes("Wendong")) gotItRight++;
    }

    expect(gotItRight).toBeGreaterThan(1);
  }, 60000);

  it.concurrent("should return founders of firecrawl.dev (allowExternalLinks = true)", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["firecrawl.dev/*"],
        prompt: "Who are the founders of the company?",
        allowExternalLinks: true,
        schema: {
          type: "object",
          properties: { founders: { type: "array", items: { type: "string" } } },
        },
      });

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    expect(response.body.data).toHaveProperty("founders");

    console.log(response.body.data?.founders);
    let gotItRight = 0;
    for (const founder of response.body.data?.founders) {
      if (founder.includes("Caleb")) gotItRight++;
      if (founder.includes("Eric")) gotItRight++;
      if (founder.includes("Nicolas")) gotItRight++;
      if (founder.includes("nick")) gotItRight++;
      if (founder.includes("eric")) gotItRight++;
      if (founder.includes("jon-noronha")) gotItRight++;
    }

    expect(gotItRight).toBeGreaterThanOrEqual(2);
  }, 60000);

  it.concurrent("should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["https://firecrawl.dev/*"],
        prompt: "What are they hiring for?",
        allowExternalLinks: true,
        schema: {
          type: "array",
          items: {
            type: "string"
          },
          required: ["items"]
        },
      });

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    console.log(response.body.data);

    let gotItRight = 0;
    for (const hiring of response.body.data?.items) {
      if (hiring.includes("Developer Support Engineer")) gotItRight++;
      if (hiring.includes("Dev Ops Engineer")) gotItRight++;
      if (hiring.includes("Founding Web Automation Engineer")) gotItRight++;
    }

    expect(gotItRight).toBeGreaterThan(2);
  }, 60000);

  it.concurrent("should return PCI DSS compliance for Fivetran", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["fivetran.com/*"],
        prompt: "Does Fivetran have PCI DSS compliance?",
        allowExternalLinks: true,
        schema: {
          type: "object",
          properties: {
            pciDssCompliance: { type: "boolean" }
          }
        },
      });

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    expect(response.body.data?.pciDssCompliance).toBe(true);
  }, 60000);

  it.concurrent("should return Azure Data Connectors for Fivetran", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["fivetran.com/*"],
        prompt: "What are the Azure Data Connectors they offer?",
        schema: {
          type: "array",
          items: {
            type: "object",
            properties: {
              connector: { type: "string" },
              description: { type: "string" },
              supportsCaptureDelete: { type: "boolean" }
            }
          }
        }
      })

    console.log(response.body);
    // expect(response.statusCode).toBe(200);
    // expect(response.body).toHaveProperty("data");
    // expect(response.body.data?.pciDssCompliance).toBe(true);
  }, 60000);

  it.concurrent("should return Greenhouse Applicant Tracking System for Abnormal Security", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003"],
        prompt: "what applicant tracking system is this company using?",
        schema: {
          type: "object",
          properties: {
            isGreenhouseATS: { type: "boolean" },
            answer: { type: "string" }
          }
        },
        allowExternalLinks: true
      })

    console.log(response.body);
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    expect(response.body.data?.isGreenhouseATS).toBe(true);
  }, 60000);

  it.concurrent("should return mintlify api components", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["https://mintlify.com/docs/*"],
        prompt: "what are the 4 API components?",
        schema: {
          type: "array",
          items: {
            type: "object",
            properties: {
              component: { type: "string" }
            }
          },
          required: ["items"]
        },
        allowExternalLinks: true
      })

    console.log(response.body.data?.items);
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    expect(response.body.data?.items.length).toBe(4);
    let gotItRight = 0;
    for (const component of response.body.data?.items) {
      if (component.component.toLowerCase().includes("parameter")) gotItRight++;
      if (component.component.toLowerCase().includes("response")) gotItRight++;
      if (component.component.toLowerCase().includes("expandable")) gotItRight++;
      if (component.component.toLowerCase().includes("sticky")) gotItRight++;
      if (component.component.toLowerCase().includes("examples")) gotItRight++;
    }
    expect(gotItRight).toBeGreaterThan(2);
  }, 60000);

  it.concurrent("should return information about Eric Ciarla", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["https://ericciarla.com/"],
        prompt: "Who is Eric Ciarla? Where does he work? Where did he go to school?",
        schema: {
          type: "object",
          properties: {
            name: { type: "string" },
            work: { type: "string" },
            education: { type: "string" }
          },
          required: ["name", "work", "education"]
        },
        allowExternalLinks: true
      })

    console.log(response.body.data);
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    expect(response.body.data?.name).toBe("Eric Ciarla");
    expect(response.body.data?.work).toBeDefined();
    expect(response.body.data?.education).toBeDefined();
  }, 60000);
});
apps/api/src/__tests__/e2e_map/index.test.ts (new file, 117 lines)
@@ -0,0 +1,117 @@
import request from "supertest";
import dotenv from "dotenv";

dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";

describe("E2E Tests for Map API Routes", () => {
  it.concurrent(
    "(feat-search)should return links containing 'smart-crawl'",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/map")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          url: "https://firecrawl.dev",
          sitemapOnly: false,
          search: "smart-crawl",
        });

      console.log(response.body);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("links");
      expect(response.body.links.length).toBeGreaterThan(0);
      expect(response.body.links[0]).toContain("firecrawl.dev/smart-crawl");
    },
    60000
  );

  it.concurrent(
    "(feat-subdomains) should return mapped links for firecrawl.dev with subdomains included",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/map")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          url: "https://firecrawl.dev",
          sitemapOnly: false,
          includeSubdomains: true,
        });

      console.log(response.body);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("links");
      expect(response.body.links.length).toBeGreaterThan(0);
      expect(response.body.links[response.body.links.length - 1]).toContain(
        "docs.firecrawl.dev"
      );
    },
    60000
  );

  it.concurrent(
    "(feat-sitemap-only) should return mapped links for firecrawl.dev with sitemap only",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/map")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          url: "https://firecrawl.dev",
          sitemapOnly: true,
        });

      console.log(response.body);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("links");
      expect(response.body.links.length).toBeGreaterThan(0);
      expect(response.body.links[response.body.links.length - 1]).not.toContain(
        "docs.firecrawl.dev"
      );
    },
    60000
  );

  it.concurrent(
    "(feat-limit) should return mapped links for firecrawl.dev with a limit",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/map")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          url: "https://firecrawl.dev",
          sitemapOnly: false,
          limit: 10,
        });

      console.log(response.body);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("links");
      expect(response.body.links.length).toBeLessThanOrEqual(10);
    },
    60000
  );

  it.concurrent(
    "(feat-sitemap-large) should return more than 1900 links for geekflare sitemap",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/map")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          url: "https://geekflare.com/sitemap_index.xml",
          sitemapOnly: true,
        });

      console.log(response.body);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("links");
      expect(response.body.links.length).toBeGreaterThan(1900);
    },
    60000
  );
});
apps/api/src/controllers/v1/extract.ts (new file, 265 lines)
@@ -0,0 +1,265 @@
import { Request, Response } from "express";
import {
  // Document,
  RequestWithAuth,
  ExtractRequest,
  extractRequestSchema,
  ExtractResponse,
  MapDocument,
  scrapeOptions,
} from "./types";
import { Document } from "../../lib/entities";
import Redis from "ioredis";
import { configDotenv } from "dotenv";
import { performRanking } from "../../lib/ranker";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { logger } from "../../lib/logger";
import { getScrapeQueue } from "../../services/queue-service";
import { waitForJob } from "../../services/queue-jobs";
import { addScrapeJob } from "../../services/queue-jobs";
import { PlanType } from "../../types";
import { getJobPriority } from "../../lib/job-priority";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { getMapResults } from "./map";
import { buildDocument } from "../../lib/extract/build-document";

configDotenv();
const redis = new Redis(process.env.REDIS_URL!);

const MAX_EXTRACT_LIMIT = 100;
const MAX_RANKING_LIMIT = 10;
const INITIAL_SCORE_THRESHOLD = 0.75;
const FALLBACK_SCORE_THRESHOLD = 0.5;
const MIN_REQUIRED_LINKS = 1;

/**
 * Extracts data from the provided URLs based on the request parameters.
 * Currently in beta.
 * @param req - The request object containing authentication and extraction details.
 * @param res - The response object to send the extraction results.
 * @returns A promise that resolves when the extraction process is complete.
 */
export async function extractController(
  req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
  res: Response<ExtractResponse>
) {
  const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true";

  req.body = extractRequestSchema.parse(req.body);

  const id = crypto.randomUUID();
  let links: string[] = [];
  let docs: Document[] = [];
  const earlyReturn = false;

  // Process all URLs in parallel
  const urlPromises = req.body.urls.map(async (url) => {
    if (url.includes('/*') || req.body.allowExternalLinks) {
      // Handle glob pattern URLs
      const baseUrl = url.replace('/*', '');
      // const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any

      const allowExternalLinks = req.body.allowExternalLinks ?? true;
      let urlWithoutWww = baseUrl.replace("www.", "");
      let mapUrl = req.body.prompt && allowExternalLinks
        ? `${req.body.prompt} ${urlWithoutWww}`
        : req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
        : `site:${urlWithoutWww}`;

      const mapResults = await getMapResults({
        url: baseUrl,
        search: req.body.prompt,
        teamId: req.auth.team_id,
        plan: req.auth.plan,
        allowExternalLinks,
        origin: req.body.origin,
        limit: req.body.limit,
        // If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
        ignoreSitemap: !selfHosted ? true : false,
        includeMetadata: true,
        includeSubdomains: req.body.includeSubdomains,
      });

      let mappedLinks = mapResults.links as MapDocument[];
      // Limit number of links to MAX_EXTRACT_LIMIT
      mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);

      let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);

      // Filter by path prefix if present
      // wrong
      // if (pathPrefix) {
      //   mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
      // }

      if (req.body.prompt) {
        // Get similarity scores between the search query and each link's context
        const linksAndScores = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);

        // First try with high threshold
        let filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, INITIAL_SCORE_THRESHOLD);

        // If we don't have enough high-quality links, try with lower threshold
        if (filteredLinks.length < MIN_REQUIRED_LINKS) {
          logger.info(`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`);
          filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, FALLBACK_SCORE_THRESHOLD);

          if (filteredLinks.length === 0) {
            // If still no results, take top N results regardless of score
            logger.warn(`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`);
            filteredLinks = linksAndScores
              .sort((a, b) => b.score - a.score)
              .slice(0, MIN_REQUIRED_LINKS)
              .map(x => mappedLinks.find(link => link.url === x.link))
              .filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url));
          }
        }

        mappedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
      }

      return mappedLinks.map(x => x.url) as string[];

    } else {
      // Handle direct URLs without glob pattern
      if (!isUrlBlocked(url)) {
        return [url];
      }
      return [];
    }
  });

  // Wait for all URL processing to complete and flatten results
  const processedUrls = await Promise.all(urlPromises);
  links.push(...processedUrls.flat());

  if (links.length === 0) {
    return res.status(400).json({
      success: false,
      error: "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs."
    });
  }

  // Scrape all links in parallel with retries
  const scrapePromises = links.map(async (url) => {
    const origin = req.body.origin || "api";
    const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000; // Use 70% of total timeout for individual scrapes
    const jobId = crypto.randomUUID();

    const jobPriority = await getJobPriority({
      plan: req.auth.plan as PlanType,
      team_id: req.auth.team_id,
      basePriority: 10,
    });

    await addScrapeJob(
      {
        url,
        mode: "single_urls",
        team_id: req.auth.team_id,
        scrapeOptions: scrapeOptions.parse({}),
        internalOptions: {},
        plan: req.auth.plan!,
        origin,
        is_scrape: true,
      },
      {},
      jobId,
      jobPriority
    );

    try {
      const doc = await waitForJob<Document>(jobId, timeout);
      await getScrapeQueue().remove(jobId);
      if (earlyReturn) {
        return null;
      }
      return doc;
    } catch (e) {
      logger.error(`Error in scrapeController: ${e}`);
      if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
        throw {
          status: 408,
          error: "Request timed out"
        };
      } else {
        throw {
          status: 500,
          error: `(Internal server error) - ${(e && e.message) ? e.message : e}`
        };
      }
    }
  });

  try {
    const results = await Promise.all(scrapePromises);
    docs.push(...results.filter(doc => doc !== null).map(x => x!));
  } catch (e) {
    return res.status(e.status).json({
      success: false,
      error: e.error
    });
  }

  const completions = await generateOpenAICompletions(
    logger.child({ method: "extractController/generateOpenAICompletions" }),
    {
      mode: "llm",
      systemPrompt: "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided.",
      prompt: req.body.prompt,
      schema: req.body.schema,
    },
    docs.map(x => buildDocument(x)).join('\n')
  );

  // TODO: change this later
  // While on beta, we're billing 5 credits per link discovered/scraped.
  billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch(error => {
    logger.error(`Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}`);
  });

  let data = completions.extract ?? {};
  let warning = completions.warning;

  logJob({
    job_id: id,
    success: true,
    message: "Extract completed",
    num_docs: 1,
    docs: data,
    time_taken: (new Date().getTime() - Date.now()) / 1000,
    team_id: req.auth.team_id,
    mode: "extract",
    url: req.body.urls.join(", "),
    scrapeOptions: req.body,
    origin: req.body.origin ?? "api",
    num_tokens: completions.numTokens ?? 0
  });

  return res.status(200).json({
    success: true,
    data: data,
    scrape_id: id,
    warning: warning
  });
}

/**
 * Filters links based on their similarity score to the search query.
 * @param mappedLinks - The list of mapped links to filter.
 * @param linksAndScores - The list of links and their similarity scores.
 * @param threshold - The score threshold to filter by.
 * @returns The filtered list of links.
 */
function filterAndProcessLinks(
  mappedLinks: MapDocument[],
  linksAndScores: { link: string, linkWithContext: string, score: number, originalIndex: number }[],
  threshold: number
): MapDocument[] {
  return linksAndScores
    .filter(x => x.score > threshold)
    .map(x => mappedLinks.find(link => link.url === x.link))
    .filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url));
}
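For reference, a minimal sketch of exercising the new beta endpoint from a standalone TypeScript script. It assumes the API is running locally on port 3002 with TEST_API_KEY set in the environment (the same assumptions the e2e tests above make) and Node 18+ for the global fetch; the prompt and schema values are illustrative only, not part of this commit.

// Sketch: call the beta /v1/extract endpoint directly (assumes local API + TEST_API_KEY).
async function runExtractExample() {
  const response = await fetch("http://127.0.0.1:3002/v1/extract", {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${process.env.TEST_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      urls: ["https://firecrawl.dev/*"],
      prompt: "Who are the founders of the company?",
      allowExternalLinks: true,
      schema: {
        type: "object",
        properties: { founders: { type: "array", items: { type: "string" } } },
      },
    }),
  });

  // On success the controller above responds with { success: true, data, scrape_id, warning }.
  const body = await response.json();
  console.log(body.data);
}

runExtractExample().catch(console.error);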
@@ -1,6 +1,6 @@
 import { Response } from "express";
 import { v4 as uuidv4 } from "uuid";
-import { mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
+import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
 import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
 import { MapResponse, MapRequest } from "./types";
 import { configDotenv } from "dotenv";
@@ -25,37 +25,61 @@ const MAX_MAP_LIMIT = 5000;
 // Max Links that "Smart /map" can return
 const MAX_FIRE_ENGINE_RESULTS = 1000;

-export async function mapController(
-  req: RequestWithAuth<{}, MapResponse, MapRequest>,
-  res: Response<MapResponse>
-) {
-  const startTime = new Date().getTime();
-
-  req.body = mapRequestSchema.parse(req.body);
-
-  const limit: number = req.body.limit ?? MAX_MAP_LIMIT;
+interface MapResult {
+  success: boolean;
+  links: string[] | any[];
+  scrape_id?: string;
+  job_id: string;
+  time_taken: number;
+}
+
+export async function getMapResults({
+  url,
+  search,
+  limit = MAX_MAP_LIMIT,
+  ignoreSitemap = false,
+  includeSubdomains = true,
+  crawlerOptions = {},
+  teamId,
+  plan,
+  origin,
+  includeMetadata = false,
+  allowExternalLinks
+}: {
+  url: string;
+  search?: string;
+  limit?: number;
+  ignoreSitemap?: boolean;
+  includeSubdomains?: boolean;
+  crawlerOptions?: any;
+  teamId: string;
+  plan?: string;
+  origin?: string;
+  includeMetadata?: boolean;
+  allowExternalLinks?: boolean;
+}): Promise<MapResult> {
   const id = uuidv4();
-  let links: string[] = [req.body.url];
+  let links: string[] = [url];
+  let mapResults: MapDocument[] = [];

   const sc: StoredCrawl = {
-    originUrl: req.body.url,
+    originUrl: url,
     crawlerOptions: {
-      ...req.body,
-      limit: req.body.sitemapOnly ? 10000000 : limit,
+      ...crawlerOptions,
+      limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
       scrapeOptions: undefined,
     },
     scrapeOptions: scrapeOptions.parse({}),
     internalOptions: {},
-    team_id: req.auth.team_id,
+    team_id: teamId,
     createdAt: Date.now(),
-    plan: req.auth.plan,
+    plan: plan,
   };

   const crawler = crawlToCrawler(id, sc);

   // If sitemapOnly is true, only get links from sitemap
-  if (req.body.sitemapOnly) {
+  if (crawlerOptions.sitemapOnly) {
     const sitemap = await crawler.tryGetSitemap(true, true);
     if (sitemap !== null) {
       sitemap.forEach((x) => {
@@ -73,19 +97,18 @@ export async function mapController(
     // links = links.slice(1, limit); // don't slice, unnecessary
     }
   } else {
-    let urlWithoutWww = req.body.url.replace("www.", "");
+    let urlWithoutWww = url.replace("www.", "");

-    let mapUrl = req.body.search
-      ? `"${req.body.search}" site:${urlWithoutWww}`
-      : `site:${req.body.url}`;
+    let mapUrl = search && allowExternalLinks
+      ? `${search} ${urlWithoutWww}`
+      : search ? `${search} site:${urlWithoutWww}`
+      : `site:${url}`;

     const resultsPerPage = 100;
-    const maxPages = Math.ceil(
-      Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage
-    );
+    const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);

     const cacheKey = `fireEngineMap:${mapUrl}`;
-    const cachedResult = null;
+    const cachedResult = await redis.get(cacheKey);

     let allResults: any[] = [];
     let pagePromises: Promise<any>[] = [];
@@ -110,7 +133,7 @@

     // Parallelize sitemap fetch with serper search
     const [sitemap, ...searchResults] = await Promise.all([
-      req.body.ignoreSitemap ? null : crawler.tryGetSitemap(true),
+      ignoreSitemap ? null : crawler.tryGetSitemap(true),
       ...(cachedResult ? [] : pagePromises),
     ]);

@@ -124,7 +147,7 @@
       });
     }

-    let mapResults = allResults
+    mapResults = allResults
       .flat()
       .filter((result) => result !== null && result !== undefined);

@@ -134,7 +157,7 @@
     }

     if (mapResults.length > 0) {
-      if (req.body.search) {
+      if (search) {
         // Ensure all map results are first, maintaining their order
         links = [
           mapResults[0].url,
@@ -149,9 +172,8 @@
     }

     // Perform cosine similarity between the search query and the list of links
-    if (req.body.search) {
-      const searchQuery = req.body.search.toLowerCase();
-
+    if (search) {
+      const searchQuery = search.toLowerCase();
       links = performCosineSimilarity(links, searchQuery);
     }

@@ -166,95 +188,75 @@
       .filter((x) => x !== null) as string[];

     // allows for subdomains to be included
-    links = links.filter((x) => isSameDomain(x, req.body.url));
+    links = links.filter((x) => isSameDomain(x, url));

     // if includeSubdomains is false, filter out subdomains
-    if (!req.body.includeSubdomains) {
-      links = links.filter((x) => isSameSubdomain(x, req.body.url));
+    if (!includeSubdomains) {
+      links = links.filter((x) => isSameSubdomain(x, url));
     }

     // remove duplicates that could be due to http/https or www
     links = removeDuplicateUrls(links);
-    links.slice(0, limit);
   }

+  const linksToReturn = crawlerOptions.sitemapOnly ? links : links.slice(0, limit);
+
+  return {
+    success: true,
+    links: includeMetadata ? mapResults : linksToReturn,
+    scrape_id: origin?.includes("website") ? id : undefined,
+    job_id: id,
+    time_taken: (new Date().getTime() - Date.now()) / 1000,
+  };
+}
+
+export async function mapController(
+  req: RequestWithAuth<{}, MapResponse, MapRequest>,
+  res: Response<MapResponse>
+) {
+  req.body = mapRequestSchema.parse(req.body);
+
+  const result = await getMapResults({
+    url: req.body.url,
+    search: req.body.search,
+    limit: req.body.limit,
+    ignoreSitemap: req.body.ignoreSitemap,
+    includeSubdomains: req.body.includeSubdomains,
+    crawlerOptions: req.body,
+    origin: req.body.origin,
+    teamId: req.auth.team_id,
+    plan: req.auth.plan,
+  });
+
   // Bill the team
   billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
     logger.error(
       `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
     );
     // Optionally, you could notify an admin or add to a retry queue here
   });

-  const endTime = new Date().getTime();
-  const timeTakenInSeconds = (endTime - startTime) / 1000;
-
   // Log the job
   logJob({
-    job_id: id,
-    success: links.length > 0,
+    job_id: result.job_id,
+    success: result.links.length > 0,
     message: "Map completed",
-    num_docs: links.length,
-    docs: links,
-    time_taken: timeTakenInSeconds,
+    num_docs: result.links.length,
+    docs: result.links,
+    time_taken: result.time_taken,
     team_id: req.auth.team_id,
     mode: "map",
     url: req.body.url,
     crawlerOptions: {},
     scrapeOptions: {},
-    origin: req.body.origin,
+    origin: req.body.origin ?? "api",
     num_tokens: 0,
   });

-  return res.status(200).json({
-    success: true,
-    links: links,
-    scrape_id: req.body.origin?.includes("website") ? id : undefined,
-  });
-}
+  const response = {
+    success: true as const,
+    links: result.links,
+    scrape_id: result.scrape_id
+  };
+
-  // Subdomain sitemap url checking
-
-  // // For each result, check for subdomains, get their sitemaps and add them to the links
-  // const processedUrls = new Set();
-  // const processedSubdomains = new Set();
-
-  // for (const result of links) {
-  //   let url;
-  //   let hostParts;
-  //   try {
-  //     url = new URL(result);
-  //     hostParts = url.hostname.split('.');
-  //   } catch (e) {
-  //     continue;
-  //   }
-
-  //   console.log("hostParts", hostParts);
-  //   // Check if it's a subdomain (more than 2 parts, and not 'www')
-  //   if (hostParts.length > 2 && hostParts[0] !== 'www') {
-  //     const subdomain = hostParts[0];
-  //     console.log("subdomain", subdomain);
-  //     const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
-  //     console.log("subdomainUrl", subdomainUrl);
-
-  //     if (!processedSubdomains.has(subdomainUrl)) {
-  //       processedSubdomains.add(subdomainUrl);
-
-  //       const subdomainCrawl = crawlToCrawler(id, {
-  //         originUrl: subdomainUrl,
-  //         crawlerOptions: legacyCrawlerOptions(req.body),
-  //         pageOptions: {},
-  //         team_id: req.auth.team_id,
-  //         createdAt: Date.now(),
-  //         plan: req.auth.plan,
-  //       });
-  //       const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
-  //       if (subdomainSitemap) {
-  //         subdomainSitemap.forEach((x) => {
-  //           if (!processedUrls.has(x.url)) {
-  //             processedUrls.add(x.url);
-  //             links.push(x.url);
-  //           }
-  //         });
-  //       }
-  //     }
-  //   }
-  // }
+  return res.status(200).json(response);
+}
@@ -151,8 +151,25 @@ export const scrapeOptions = z.object({
 }).strict(strictMessage)


 export type ScrapeOptions = z.infer<typeof scrapeOptions>;

+export const extractV1Options = z.object({
+  urls: url.array(),
+  prompt: z.string().optional(),
+  schema: z.any().optional(),
+  limit: z.number().int().positive().finite().safe().optional(),
+  ignoreSitemap: z.boolean().default(false),
+  includeSubdomains: z.boolean().default(true),
+  allowExternalLinks: z.boolean().default(false),
+  origin: z.string().optional().default("api"),
+  timeout: z.number().int().positive().finite().safe().default(60000)
+}).strict(strictMessage)
+
+export type ExtractV1Options = z.infer<typeof extractV1Options>;
+export const extractRequestSchema = extractV1Options;
+export type ExtractRequest = z.infer<typeof extractRequestSchema>;
+
 export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({
   url,
   origin: z.string().optional().default("api"),
@@ -173,6 +190,8 @@ export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend(
   return obj;
 });
+
+

 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;

@@ -340,6 +359,21 @@ export interface ScrapeResponseRequestTest {
   error?: string;
 }

+export type ExtractResponse =
+  | ErrorResponse
+  | {
+      success: true;
+      warning?: string;
+      data: z.infer<typeof extractRequestSchema>;
+      scrape_id?: string;
+    };
+
+export interface ExtractResponseRequestTest {
+  statusCode: number;
+  body: ExtractResponse;
+  error?: string;
+}
+
 export type CrawlResponse =
   | ErrorResponse
   | {
@@ -496,6 +530,13 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
   };
 }

+
+export interface MapDocument {
+  url: string;
+  title?: string;
+  description?: string;
+}
 export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions } {
   return {
     scrapeOptions: scrapeOptions.parse({
apps/api/src/lib/cache.ts (new file, 50 lines)
@@ -0,0 +1,50 @@
import IORedis from "ioredis";
import { ScrapeOptions } from "../controllers/v1/types";
import { InternalOptions } from "../scraper/scrapeURL";
import { logger as _logger } from "./logger";
const logger = _logger.child({module: "cache"});

export const cacheRedis = process.env.CACHE_REDIS_URL ? new IORedis(process.env.CACHE_REDIS_URL, {
  maxRetriesPerRequest: null,
}) : null;

export function cacheKey(url: string, scrapeOptions: ScrapeOptions, internalOptions: InternalOptions): string | null {
  if (!cacheRedis) return null;

  // these options disqualify a cache
  if (internalOptions.v0CrawlOnlyUrls || internalOptions.forceEngine || internalOptions.v0UseFastMode || internalOptions.atsv
    || (scrapeOptions.actions && scrapeOptions.actions.length > 0)
  ) {
    return null;
  }

  return "cache:" + url + ":waitFor:" + scrapeOptions.waitFor;
}

export type CacheEntry = {
  url: string;
  html: string;
  statusCode: number;
  error?: string;
};

export async function saveEntryToCache(key: string, entry: CacheEntry) {
  if (!cacheRedis) return;

  try {
    await cacheRedis.set(key, JSON.stringify(entry));
  } catch (error) {
    logger.warn("Failed to save to cache", { key, error });
  }
}

export async function getEntryFromCache(key: string): Promise<CacheEntry | null> {
  if (!cacheRedis) return null;

  try {
    return JSON.parse(await cacheRedis.get(key) ?? "null");
  } catch (error) {
    logger.warn("Failed to get from cache", { key, error });
    return null;
  }
}
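A small usage sketch of these helpers, for illustration only: it assumes CACHE_REDIS_URL is configured so cacheRedis is non-null, that default scrape options (scrapeOptions.parse({})) are acceptable, and that an empty object is a valid InternalOptions value; the example URL and HTML are made up.

// Sketch: store and re-read a cache entry for a URL (assumes CACHE_REDIS_URL is set).
import { cacheKey, saveEntryToCache, getEntryFromCache, CacheEntry } from "./cache";
import { scrapeOptions } from "../controllers/v1/types";

async function cacheRoundTripExample() {
  // Assumption: {} is an acceptable InternalOptions value (no disqualifying flags set).
  const key = cacheKey("https://example.com", scrapeOptions.parse({}), {});
  if (key === null) return; // caching disabled or the options disqualify it

  const entry: CacheEntry = {
    url: "https://example.com",
    html: "<html><body>Hello</body></html>",
    statusCode: 200,
  };

  await saveEntryToCache(key, entry);
  const cached = await getEntryFromCache(key); // the entry above, or null on a miss
  console.log(cached?.statusCode);
}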
apps/api/src/lib/extract/build-document.ts (new file, 15 lines)
@@ -0,0 +1,15 @@
import { Document } from "../../controllers/v1/types";

export function buildDocument(document: Document): string {
  const metadata = document.metadata;
  const markdown = document.markdown;

  // for each key in the metadata allow up to 250 characters
  const metadataString = Object.entries(metadata).map(([key, value]) => {
    return `${key}: ${value?.toString().slice(0, 250)}`;
  }).join('\n');

  const documentMetadataString = `\n- - - - - Page metadata - - - - -\n${metadataString}`;
  const documentString = `${markdown}${documentMetadataString}`;
  return documentString;
}
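For illustration, a worked example of what buildDocument produces for a minimal document; the markdown and metadata values here are invented, and the cast sidesteps the other optional Document fields.

// Sketch: output of buildDocument for a minimal document (illustrative values).
const example = buildDocument({
  markdown: "# Hello",
  metadata: { title: "Example", sourceURL: "https://example.com" },
} as Document);
// example ===
// "# Hello\n- - - - - Page metadata - - - - -\ntitle: Example\nsourceURL: https://example.com"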
apps/api/src/lib/extract/completions.ts (new file, 124 lines)
@@ -0,0 +1,124 @@
// use llmExtract.ts instead

// import OpenAI from "openai";
// import { encoding_for_model } from "@dqbd/tiktoken";
// import { TiktokenModel } from "@dqbd/tiktoken";
// import { ExtractOptions } from "../../controllers/v1/types";
// import { Document } from "../entities";
// import { z } from "zod";

// const maxTokens = 32000;
// const modifier = 4;

// export class LLMRefusalError extends Error {
//   constructor(refusal: string) {
//     super("LLM refused to extract the website's content");
//     this.name = "LLMRefusalError";
//   }
// }

// interface GenerateCompletionsParams {
//   systemPrompt?: string;
//   prompt?: string;
//   schema?: any;
//   pagesContent: string;
// }

// export async function generateBasicCompletion(prompt: string) {
//   const openai = new OpenAI();
//   const model: TiktokenModel =
//     (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

//   const completion = await openai.chat.completions.create({
//     model,
//     messages: [{ role: "user", content: prompt }],
//   });

//   return completion.choices[0].message.content;
// }

// export async function generateFinalExtraction({
//   pagesContent,
//   systemPrompt,
//   prompt,
//   schema,
// }: GenerateCompletionsParams): Promise<{
//   content: string;
//   metadata: { numTokens: number; warning: string };
// }> {
//   const openai = new OpenAI();
//   const model: TiktokenModel =
//     (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

//   let extractionContent = pagesContent;
//   let numTokens = 0;
//   let warning = "";

//   const encoder = encoding_for_model(model);
//   try {
//     const tokens = encoder.encode(extractionContent);
//     numTokens = tokens.length;
//   } catch (error) {
//     extractionContent = extractionContent.slice(0, maxTokens * modifier);
//     warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
//   } finally {
//     encoder.free();
//   }

//   if (numTokens > maxTokens) {
//     extractionContent = extractionContent.slice(0, maxTokens * modifier);
//     warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
//   }

//   if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
//     schema = {
//       type: "object",
//       properties: {
//         items: schema,
//       },
//       required: ["items"],
//       additionalProperties: false,
//     };
//   } else if (schema) {
//     schema.additionalProperties = false;
//     schema.required = Object.keys(schema.properties);
//   }

//   const jsonCompletion = await openai.beta.chat.completions.parse({
//     temperature: 0,
//     model,
//     messages: [
//       { role: "system", content: systemPrompt ?? "" },
//       { role: "user", content: [{ type: "text", text: extractionContent }] },
//       {
//         role: "user",
//         content: prompt
//           ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
//           : "Transform the above content into structured JSON output.",
//       },
//     ],
//     response_format: schema
//       ? {
//           type: "json_schema",
//           json_schema: {
//             name: "websiteContent",
//             schema: schema,
//             strict: true,
//           },
//         }
//       : { type: "json_object" },
//   });

//   if (jsonCompletion.choices[0].message.refusal !== null) {
//     throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
//   }

//   const extraction = jsonCompletion.choices[0].message.parsed;
//   return {
//     content: extraction ?? "",
//     metadata: {
//       numTokens,
//       warning,
//     },
//   };
// }
apps/api/src/lib/extract/reranker.ts (new file, 22 lines)
@@ -0,0 +1,22 @@
import { CohereClient } from "cohere-ai";
import { MapDocument } from "../../controllers/v1/types";
const cohere = new CohereClient({
  token: process.env.COHERE_API_KEY,
});

export async function rerankDocuments(
  documents: (string | Record<string, string>)[],
  query: string,
  topN = 3,
  model = "rerank-english-v3.0"
) {
  const rerank = await cohere.v2.rerank({
    documents,
    query,
    topN,
    model,
    returnDocuments: true,
  });

  return rerank.results.sort((a, b) => b.relevanceScore - a.relevanceScore).map(x => ({ document: x.document, index: x.index, relevanceScore: x.relevanceScore }));
}
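A hedged usage sketch of rerankDocuments: it assumes COHERE_API_KEY is set and the rerank-english-v3.0 model is available on that account; the documents and query are invented for illustration.

// Sketch: rerank a few page snippets against a query (assumes COHERE_API_KEY is set).
import { rerankDocuments } from "./reranker";

async function rerankExample() {
  const results = await rerankDocuments(
    [
      "Firecrawl turns websites into LLM-ready markdown.",
      "A guide to caring for houseplants.",
      "Careers at Firecrawl: we are hiring engineers.",
    ],
    "Who is Firecrawl hiring?",
    2 // topN
  );

  // Each result carries the original index, the document, and a relevanceScore (higher is more relevant).
  for (const r of results) {
    console.log(r.index, r.relevanceScore);
  }
}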
apps/api/src/lib/ranker.test.ts (new file, 68 lines)
@@ -0,0 +1,68 @@
import { performRanking } from './ranker';

describe('performRanking', () => {
  it('should rank links based on similarity to search query', async () => {
    const linksWithContext = [
      'url: https://example.com/dogs, title: All about dogs, description: Learn about different dog breeds',
      'url: https://example.com/cats, title: Cat care guide, description: Everything about cats',
      'url: https://example.com/pets, title: General pet care, description: Care for all types of pets'
    ];

    const links = [
      'https://example.com/dogs',
      'https://example.com/cats',
      'https://example.com/pets'
    ];

    const searchQuery = 'cats training';

    const result = await performRanking(linksWithContext, links, searchQuery);

    // Should return array of objects with link, linkWithContext, score, originalIndex
    expect(result).toBeInstanceOf(Array);
    expect(result.length).toBe(3);

    // First result should be the cats page since the query is about cats
    expect(result[0].link).toBe('https://example.com/cats');

    // Each result should have required properties
    result.forEach(item => {
      expect(item).toHaveProperty('link');
      expect(item).toHaveProperty('linkWithContext');
      expect(item).toHaveProperty('score');
      expect(item).toHaveProperty('originalIndex');
      expect(typeof item.score).toBe('number');
      expect(item.score).toBeGreaterThanOrEqual(0);
      expect(item.score).toBeLessThanOrEqual(1);
    });

    // Scores should be in descending order
    for (let i = 1; i < result.length; i++) {
      expect(result[i].score).toBeLessThanOrEqual(result[i-1].score);
    }
  });

  it('should handle empty inputs', async () => {
    const result = await performRanking([], [], '');
    expect(result).toEqual([]);
  });

  it('should maintain original order for equal scores', async () => {
    const linksWithContext = [
      'url: https://example.com/1, title: Similar content A, description: test',
      'url: https://example.com/2, title: Similar content B, description: test'
    ];

    const links = [
      'https://example.com/1',
      'https://example.com/2'
    ];

    const searchQuery = 'test';

    const result = await performRanking(linksWithContext, links, searchQuery);

    // If scores are equal, original order should be maintained
    expect(result[0].originalIndex).toBeLessThan(result[1].originalIndex);
  });
});
apps/api/src/lib/ranker.ts (new file, 92 lines)
@@ -0,0 +1,92 @@
import axios from 'axios';
import { configDotenv } from 'dotenv';
import OpenAI from "openai";

configDotenv();

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
});

async function getEmbedding(text: string) {
  const embedding = await openai.embeddings.create({
    model: "text-embedding-ada-002",
    input: text,
    encoding_format: "float",
  });

  return embedding.data[0].embedding;
}

const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
  const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
  const magnitude1 = Math.sqrt(
    vec1.reduce((sum, val) => sum + val * val, 0)
  );
  const magnitude2 = Math.sqrt(
    vec2.reduce((sum, val) => sum + val * val, 0)
  );
  if (magnitude1 === 0 || magnitude2 === 0) return 0;
  return dotProduct / (magnitude1 * magnitude2);
};

// Function to convert text to vector
const textToVector = (searchQuery: string, text: string): number[] => {
  const words = searchQuery.toLowerCase().split(/\W+/);
  return words.map((word) => {
    const count = (text.toLowerCase().match(new RegExp(word, "g")) || [])
      .length;
    return count / text.length;
  });
};

async function performRanking(linksWithContext: string[], links: string[], searchQuery: string) {
  try {
    // Handle invalid inputs
    if (!searchQuery || !linksWithContext.length || !links.length) {
      return [];
    }

    // Use the search query as-is (no sanitization is currently applied)
    const sanitizedQuery = searchQuery;

    // Generate embeddings for the search query
    const queryEmbedding = await getEmbedding(sanitizedQuery);

    // Generate embeddings for each link and calculate similarity
    const linksAndScores = await Promise.all(linksWithContext.map(async (linkWithContext, index) => {
      try {
        const linkEmbedding = await getEmbedding(linkWithContext);
        const score = cosineSimilarity(queryEmbedding, linkEmbedding);

        return {
          link: links[index],
          linkWithContext,
          score,
          originalIndex: index
        };
      } catch (err) {
        // If embedding fails for a link, return with score 0
        return {
          link: links[index],
          linkWithContext,
          score: 0,
          originalIndex: index
        };
      }
    }));

    // Sort links based on similarity scores while preserving original order for equal scores
    linksAndScores.sort((a, b) => {
      const scoreDiff = b.score - a.score;
      return scoreDiff === 0 ? a.originalIndex - b.originalIndex : scoreDiff;
    });

    return linksAndScores;
  } catch (error) {
    console.error(`Error performing semantic search: ${error}`);
    return [];
  }
}

export { performRanking };
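As a quick sanity check of the cosine-similarity computation used above, a worked toy example (vector values chosen purely for illustration):

// Toy example of the cosineSimilarity helper defined above:
// vec1 = [1, 2, 0], vec2 = [2, 1, 0]
// dot product = 1*2 + 2*1 + 0*0 = 4
// |vec1| = sqrt(1 + 4 + 0) = sqrt(5)
// |vec2| = sqrt(4 + 1 + 0) = sqrt(5)
// similarity = 4 / (sqrt(5) * sqrt(5)) = 4 / 5 = 0.8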
@@ -1 +1 @@
-export const axiosTimeout = 3000;
+export const axiosTimeout = 5000;
@@ -27,4 +27,4 @@ v0Router.post("/v0/search", searchController);

 // Health/Probe routes
 v0Router.get("/v0/health/liveness", livenessController);
-v0Router.get("/v0/health/readiness", readinessController);
+v0Router.get("/v0/health/readiness", readinessController);
@@ -18,6 +18,7 @@ import { logger } from "../lib/logger";
 import { scrapeStatusController } from "../controllers/v1/scrape-status";
 import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
 import { batchScrapeController } from "../controllers/v1/batch-scrape";
+import { extractController } from "../controllers/v1/extract";
 // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
 // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
 // import { searchController } from "../../src/controllers/v1/search";
@@ -98,7 +99,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
 function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
   if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) {
     if (!res.headersSent) {
-      return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
+      return res.status(403).json({ success: false, error: "URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions." });
     }
   }
   next();
@@ -178,6 +179,13 @@ v1Router.ws(
   crawlStatusWSController
 );

+v1Router.post(
+  "/extract",
+  authMiddleware(RateLimiterMode.Scrape),
+  checkCreditsMiddleware(1),
+  wrap(extractController)
+);
+

 // v1Router.post("/crawlWebsitePreview", crawlPreviewController);
@@ -199,3 +207,4 @@ v1Router.delete(
 // Health/Probe routes
 // v1Router.get("/health/liveness", livenessController);
 // v1Router.get("/health/readiness", readinessController);
+
apps/api/src/scraper/scrapeURL/engines/cache/index.ts (new file, vendored, 19 lines)
@@ -0,0 +1,19 @@
import { cacheKey, getEntryFromCache } from "../../../../lib/cache";
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { EngineError } from "../../error";

export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
  const key = cacheKey(meta.url, meta.options, meta.internalOptions);
  if (key === null) throw new EngineError("Scrape not eligible for caching");

  const entry = await getEntryFromCache(key);
  if (entry === null) throw new EngineError("Cache missed");

  return {
    url: entry.url,
    html: entry.html,
    statusCode: entry.statusCode,
    error: entry.error,
  };
}
@@ -103,4 +103,4 @@ export async function fireEngineCheckStatus(logger: Logger, jobId: string): Prom
     }
   });
 }
-}
+}
@ -6,14 +6,17 @@ import { scrapePDF } from "./pdf";
import { scrapeURLWithScrapingBee } from "./scrapingbee";
import { scrapeURLWithFetch } from "./fetch";
import { scrapeURLWithPlaywright } from "./playwright";
import { scrapeCache } from "./cache";

export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx";
export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx" | "cache";

const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
const usePlaywright = process.env.PLAYWRIGHT_MICROSERVICE_URL !== '' && process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined;
const useCache = process.env.CACHE_REDIS_URL !== '' && process.env.CACHE_REDIS_URL !== undefined;

export const engines: Engine[] = [
  // ...(useCache ? [ "cache" as const ] : []),
  ...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, "fire-engine;playwright" as const, "fire-engine;tlsclient" as const ] : []),
  ...(useScrapingBee ? [ "scrapingbee" as const, "scrapingbeeLoad" as const ] : []),
  ...(usePlaywright ? [ "playwright" as const ] : []),
@ -74,6 +77,7 @@ export type EngineScrapeResult = {
const engineHandlers: {
  [E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>
} = {
  "cache": scrapeCache,
  "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
  "fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
  "fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
@ -95,6 +99,22 @@ export const engineOptions: {
    quality: number,
  }
} = {
  "cache": {
    features: {
      "actions": false,
      "waitFor": true,
      "screenshot": false,
      "screenshot@fullScreen": false,
      "pdf": false, // TODO: figure this out
      "docx": false, // TODO: figure this out
      "atsv": false,
      "location": false,
      "mobile": false,
      "skipTlsVerification": false,
      "useFastMode": false,
    },
    quality: 1000, // cache should always be tried first
  },
  "fire-engine;chrome-cdp": {
    features: {
      "actions": true,
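How this table is consumed is outside the hunk, but its shape suggests the idea: filter engines by the feature flags a request needs, then try them in descending quality, which is why the cache entry is given quality 1000. A rough illustration of that idea only; the names and signature below are illustrative, not the repo's actual selection code.

// Illustrative only: rank engines that cover the required feature flags,
// highest quality first, so "cache" would be attempted before the others.
type FeatureFlags = Record<string, boolean>;

function rankEngines(
  engines: string[],
  engineOptions: Record<string, { features: FeatureFlags; quality: number }>,
  requiredFeatures: string[],
): string[] {
  return engines
    .filter((engine) =>
      requiredFeatures.every((flag) => engineOptions[engine].features[flag]),
    )
    .sort((a, b) => engineOptions[b].quality - engineOptions[a].quality);
}

// e.g. rankEngines(["cache", "fetch"], engineOptions, ["waitFor"]) would put
// "cache" ahead of any lower-quality engine that also supports waitFor.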
26
apps/api/src/scraper/scrapeURL/transformers/cache.ts
Normal file
@ -0,0 +1,26 @@
import { Document } from "../../../controllers/v1/types";
import { Meta } from "..";
import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";

export function saveToCache(meta: Meta, document: Document): Document {
  if (document.metadata.statusCode! < 200 || document.metadata.statusCode! >= 300) return document;

  if (document.rawHtml === undefined) {
    throw new Error("rawHtml is undefined -- this transformer is being called out of order");
  }

  const key = cacheKey(meta.url, meta.options, meta.internalOptions);

  if (key !== null) {
    const entry: CacheEntry = {
      html: document.rawHtml!,
      statusCode: document.metadata.statusCode!,
      url: document.metadata.url ?? document.metadata.sourceURL!,
      error: document.metadata.error ?? undefined,
    };

    saveEntryToCache(key, entry);
  }

  return document;
}
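Both new files import cacheKey, getEntryFromCache, saveEntryToCache, and CacheEntry from lib/cache, which is not part of this diff. A hedged sketch of what such a module could look like, inferred only from the call sites above and the CACHE_REDIS_URL flag; the hashing, TTL, and eligibility rules here are assumptions, not the actual implementation.

// Sketch of a Redis-backed lib/cache. Entry shape mirrors what scrapeCache
// reads and saveToCache writes in this commit; everything else is assumed.
import { createHash } from "crypto";
import IORedis from "ioredis";

export type CacheEntry = {
  url: string;
  html: string;
  statusCode: number;
  error?: string;
};

const redis = new IORedis(process.env.CACHE_REDIS_URL ?? "redis://localhost:6379");

export function cacheKey(url: string, options: unknown, internalOptions: unknown): string | null {
  // The real helper returns null when a scrape is not eligible for caching;
  // those checks are omitted in this sketch.
  const digest = createHash("sha256")
    .update(JSON.stringify({ url, options, internalOptions }))
    .digest("hex");
  return "cache:" + digest;
}

export async function getEntryFromCache(key: string): Promise<CacheEntry | null> {
  const raw = await redis.get(key);
  return raw === null ? null : (JSON.parse(raw) as CacheEntry);
}

export async function saveEntryToCache(key: string, entry: CacheEntry): Promise<void> {
  await redis.set(key, JSON.stringify(entry), "EX", 60 * 60); // 1h TTL, assumed
}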
@ -7,6 +7,7 @@ import { extractMetadata } from "../lib/extractMetadata";
import { performLLMExtract } from "./llmExtract";
import { uploadScreenshot } from "./uploadScreenshot";
import { removeBase64Images } from "./removeBase64Images";
import { saveToCache } from "./cache";

export type Transformer = (meta: Meta, document: Document) => Document | Promise<Document>;

@ -104,6 +105,7 @@ export function coerceFieldsToFormats(meta: Meta, document: Document): Document

// TODO: allow some of these to run in parallel
export const transformerStack: Transformer[] = [
  saveToCache,
  deriveHTMLFromRawHTML,
  deriveMarkdownFromHTML,
  deriveLinksFromHTML,
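Placing saveToCache first means the raw HTML is persisted before any derived formats are computed. The stack itself is applied in order (per the TODO, not yet in parallel); the runner below is a minimal illustration of that loop, not the file's actual executor.

// Illustrative runner: each transformer receives the document produced by the
// previous one, matching the shape of the Transformer type declared above.
type TransformerFn<M, D> = (meta: M, document: D) => D | Promise<D>;

async function runTransformers<M, D>(
  meta: M,
  document: D,
  stack: TransformerFn<M, D>[],
): Promise<D> {
  for (const transform of stack) {
    document = await transform(meta, document); // sequential by design today
  }
  return document;
}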
@ -58,32 +58,33 @@ function normalizeSchema(x: any): any {
  }
}

async function generateOpenAICompletions(logger: Logger, document: Document, options: ExtractOptions): Promise<Document> {
export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string): Promise<{ extract: any, numTokens: number, warning: string | undefined }> {
  let extract: any;
  let warning: string | undefined;

  const openai = new OpenAI();
  const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  if (document.markdown === undefined) {
  if (markdown === undefined) {
    throw new Error("document.markdown is undefined -- this is unexpected");
  }

  let extractionContent = document.markdown;

  // count number of tokens
  let numTokens = 0;
  const encoder = encoding_for_model(model as TiktokenModel);
  try {
    // Encode the message into tokens
    const tokens = encoder.encode(extractionContent);
    const tokens = encoder.encode(markdown);

    // Return the number of tokens
    numTokens = tokens.length;
  } catch (error) {
    logger.warn("Calculating num tokens of string failed", { error, extractionContent });
    logger.warn("Calculating num tokens of string failed", { error, markdown });

    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    markdown = markdown.slice(0, maxTokens * modifier);

    const warning = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
    document.warning = document.warning === undefined ? warning : " " + warning;
    let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  } finally {
    // Free the encoder resources after use
    encoder.free();
@ -91,10 +92,10 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt

  if (numTokens > maxTokens) {
    // trim the document to the maximum number of tokens, tokens != characters
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    markdown = markdown.slice(0, maxTokens * modifier);

    const warning = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
    document.warning = document.warning === undefined ? warning : " " + warning;
    const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  }

  let schema = options.schema;
@ -107,12 +108,22 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
      required: ["items"],
      additionalProperties: false,
    };
  } else if (schema && typeof schema === 'object' && !schema.type) {
    schema = {
      type: "object",
      properties: Object.fromEntries(
        Object.entries(schema).map(([key, value]) => [key, { type: value }])
      ),
      required: Object.keys(schema),
      additionalProperties: false
    };
  }

  schema = normalizeSchema(schema);

  const jsonCompletion = await openai.beta.chat.completions.parse({
    model,
    temperature: 0,
    messages: [
      {
        role: "system",
@ -120,7 +131,7 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
      },
      {
        role: "user",
        content: [{ type: "text", text: extractionContent }],
        content: [{ type: "text", text: markdown }],
      },
      {
        role: "user",
@ -143,26 +154,35 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
    throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
  }

  document.extract = jsonCompletion.choices[0].message.parsed;
  extract = jsonCompletion.choices[0].message.parsed;

  if (document.extract === null && jsonCompletion.choices[0].message.content !== null) {
  if (extract === null && jsonCompletion.choices[0].message.content !== null) {
    try {
      document.extract = JSON.parse(jsonCompletion.choices[0].message.content);
      extract = JSON.parse(jsonCompletion.choices[0].message.content);
    } catch (e) {
      logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
      throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");
    }
  }

  if (options.schema && options.schema.type === "array") {
    document.extract = document.extract?.items;
  // If the users actually wants the items object, they can specify it as 'required' in the schema
  // otherwise, we just return the items array
  if (options.schema && options.schema.type === "array" && !schema?.required?.includes("items")) {
    extract = extract?.items;
  }
  return document;
  return { extract, warning, numTokens };
}

export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
  if (meta.options.formats.includes("extract")) {
    document = await generateOpenAICompletions(meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }), document, meta.options.extract!);
    const { extract, warning } = await generateOpenAICompletions(
      meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
      meta.options.extract!,
      document.markdown,
      document.warning,
    );
    document.extract = extract;
    document.warning = warning;
  }

  return document;
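The new else-if branch lets callers pass a flat map of field names to type strings instead of a full JSON schema. A worked example of what that branch produces before normalizeSchema runs, derived directly from the code above; the field names are illustrative.

// Shorthand accepted by the new branch:
const shorthand = { title: "string", summary: "string" };

// Equivalent expansion produced by the branch above:
const expanded = {
  type: "object",
  properties: {
    title: { type: "string" },
    summary: { type: "string" },
  },
  required: ["title", "summary"],
  additionalProperties: false,
};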
@ -109,6 +109,6 @@ export function waitForJob<T = unknown>(jobId: string, timeout: number): Promise
        }
      }
    }
  }, 500);
  }, 250);
  })
}
@ -106,6 +106,15 @@ export interface FirecrawlCrawlStatusResponse {
  error?: string;
}

export interface FirecrawlExtractResponse {
  statusCode: number;
  body: {
    success: boolean;
    data: any[];
  };
  error?: string;
}

export enum RateLimiterMode {
  Crawl = "crawl",
  CrawlStatus = "crawlStatus",
@ -1,4 +1,5 @@
import FirecrawlApp from 'firecrawl';
import { z } from 'zod';

const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});

@ -42,6 +43,18 @@ const main = async () => {
  const mapResult = await app.mapUrl('https://firecrawl.dev');
  console.log(mapResult)

  // Extract information from a website using LLM:
  const extractSchema = z.object({
    title: z.string(),
    description: z.string(),
    links: z.array(z.string())
  });

  const extractResult = await app.extract(['https://firecrawl.dev'], {
    prompt: "Extract the title, description, and links from the website",
    schema: extractSchema
  });
  console.log(extractResult);

  // Crawl a website with WebSockets:
  const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
@ -42,6 +42,19 @@ const main = async () => {
  const mapResult = await app.mapUrl('https://firecrawl.dev');
  console.log(mapResult)

  // // Extract information from a website using LLM:
  // const extractSchema = z.object({
  //   title: z.string(),
  //   description: z.string(),
  //   links: z.array(z.string())
  // });

  // const extractResult = await app.extractUrls(['https://firecrawl.dev'], {
  //   prompt: "Extract the title, description, and links from the website",
  //   schema: extractSchema
  // });
  // console.log(extractResult);

  // Crawl a website with WebSockets:
  const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
@ -1,6 +1,6 @@
{
  "name": "@mendable/firecrawl-js",
  "version": "1.8.5",
  "version": "1.9.0",
  "description": "JavaScript SDK for Firecrawl API",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
@ -236,6 +236,27 @@ export interface MapResponse {
  error?: string;
}

/**
 * Parameters for extracting information from URLs.
 * Defines options for extracting information from URLs.
 */
export interface ExtractParams {
  prompt: string;
  schema?: zt.ZodSchema;
  systemPrompt?: string;
  allowExternalLinks?: boolean;
}

/**
 * Response interface for extracting information from URLs.
 * Defines the structure of the response received after extracting information from URLs.
 */
export interface ExtractResponse {
  success: true;
  data: zt.infer<zt.ZodSchema>;
  error?: string;
}

/**
 * Error response interface.
 * Defines the structure of the response received when an error occurs.
@ -245,7 +266,6 @@ export interface ErrorResponse {
  error: string;
}


/**
 * Custom error class for Firecrawl.
 * Extends the built-in Error class to include a status code.
@ -679,6 +699,44 @@ export default class FirecrawlApp {
    return { success: false, error: "Internal server error." };
  }

  /**
   * Extracts information from URLs using the Firecrawl API.
   * @param url - The URL to extract information from.
   * @param params - Additional parameters for the extract request.
   * @returns The response from the extract operation.
   */
  async extract(urls: string[], params?: ExtractParams): Promise<ExtractResponse | ErrorResponse> {
    const headers = this.prepareHeaders();

    if (!params?.prompt) {
      throw new FirecrawlError("Prompt is required", 400);
    }

    let jsonData: { urls: string[] } & ExtractParams = { urls, ...params };
    let jsonSchema: any;
    try {
      jsonSchema = params?.schema ? zodToJsonSchema(params.schema) : undefined;
    } catch (error: any) {
      throw new FirecrawlError("Invalid schema. Use a valid Zod schema.", 400);
    }

    try {
      const response: AxiosResponse = await this.postRequest(
        this.apiUrl + `/v1/extract`,
        { ...jsonData, schema: jsonSchema },
        headers
      );
      if (response.status === 200) {
        return response.data as ExtractResponse;
      } else {
        this.handleError(response, "extract");
      }
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500);
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * Prepares the headers for an API request.
   * @param idempotencyKey - Optional key to ensure idempotency.
@ -2,6 +2,8 @@ import time
import nest_asyncio
import uuid
from firecrawl.firecrawl import FirecrawlApp
from pydantic import BaseModel, Field
from typing import List

app = FirecrawlApp(api_key="fc-")

@ -50,9 +52,6 @@ print(crawl_status)

# LLM Extraction:
# Define schema to extract contents into using pydantic
from pydantic import BaseModel, Field
from typing import List

class ArticleSchema(BaseModel):
    title: str
    points: int
@ -115,6 +114,22 @@ llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
print(map_result)

# Extract URLs:
class ExtractSchema(BaseModel):
    title: str
    description: str
    links: List[str]

# Define the schema using Pydantic
extract_schema = ExtractSchema.schema()

# Perform the extraction
extract_result = app.extract(['https://firecrawl.dev'], {
    'prompt': "Extract the title, description, and links from the website",
    'schema': extract_schema
})
print(extract_result)

# Crawl a website with WebSockets:
# inside an async function...
nest_asyncio.apply()
@ -13,7 +13,7 @@ import os

from .firecrawl import FirecrawlApp # noqa

__version__ = "1.5.0"
__version__ = "1.6.0"

# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")
@ -12,15 +12,40 @@ Classes:
import logging
import os
import time
from typing import Any, Dict, Optional, List
from typing import Any, Dict, Optional, List, Union
import json

import requests
import pydantic
import websockets

logger : logging.Logger = logging.getLogger("firecrawl")

class FirecrawlApp:
    class ExtractParams(pydantic.BaseModel):
        """
        Parameters for the extract operation.
        """
        prompt: str
        schema: Optional[Any] = None
        system_prompt: Optional[str] = None
        allow_external_links: Optional[bool] = False

    class ExtractResponse(pydantic.BaseModel):
        """
        Response from the extract operation.
        """
        success: bool
        data: Optional[Any] = None
        error: Optional[str] = None

    class ErrorResponse(pydantic.BaseModel):
        """
        Error response.
        """
        success: bool
        error: str

    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
        """
        Initialize the FirecrawlApp instance with API key, API URL.
@ -434,6 +459,48 @@ class FirecrawlApp:
        else:
            self._handle_error(response, 'check batch scrape status')


    def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Union[ExtractResponse, ErrorResponse]:
        """
        Extracts information from a URL using the Firecrawl API.

        Args:
            urls (List[str]): The URLs to extract information from.
            params (Optional[ExtractParams]): Additional parameters for the extract request.

        Returns:
            Union[ExtractResponse, ErrorResponse]: The response from the extract operation.
        """
        headers = self._prepare_headers()

        if not params or not params.get('prompt'):
            raise ValueError("Prompt is required")

        if not params.get('schema'):
            raise ValueError("Schema is required for extraction")

        jsonData = {'urls': urls, **params}
        jsonSchema = params['schema'].schema() if hasattr(params['schema'], 'schema') else None

        try:
            response = self._post_request(
                f'{self.api_url}/v1/extract',
                {
                    **jsonData,
                    'allowExternalLinks': params.get('allow_external_links', False),
                    'schema': jsonSchema
                },
                headers
            )
            if response.status_code == 200:
                return response.json()
            else:
                self._handle_error(response, "extract")
        except Exception as e:
            raise ValueError(str(e), 500)

        return {'success': False, 'error': "Internal server error."}

    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """
        Prepare the headers for API requests.