Nick: formatting fixes

Nicolas 2025-01-10 18:35:10 -03:00
parent d1f3b96388
commit f4d10c5031
37 changed files with 674 additions and 499 deletions
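The hunks below are consistent with running an opinionated formatter (Prettier-style) over the codebase: single quotes become double quotes, trailing commas are added, long calls are wrapped at roughly 80 columns, and arrow-function parameters gain parentheses. The commit itself does not ship a formatter config; a minimal prettier.config.js that would produce this style (an illustrative assumption, not part of the diff) could look like:

module.exports = {
  // Prettier defaults, spelled out to match the style visible in the hunks
  printWidth: 80,        // wrap long call chains and argument lists
  semi: true,            // terminate statements with semicolons
  singleQuote: false,    // prefer double-quoted strings
  trailingComma: "all",  // trailing commas in multiline literals and call arguments
  arrowParens: "always", // (x) => ... instead of x => ...
};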


@@ -179,11 +179,15 @@ export async function crawlController(req: Request, res: Response) {
 const sitemap = sc.crawlerOptions.ignoreSitemap
 ? 0
-: await crawler.tryGetSitemap(async urls => {
+: await crawler.tryGetSitemap(async (urls) => {
 if (urls.length === 0) return;
-let jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
-const jobs = urls.map(url => {
+let jobPriority = await getJobPriority({
+plan,
+team_id,
+basePriority: 21,
+});
+const jobs = urls.map((url) => {
 const uuid = uuidv4();
 return {
 name: uuid,


@@ -114,7 +114,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
 const sitemap = sc.crawlerOptions?.ignoreSitemap
 ? 0
-: await crawler.tryGetSitemap(async urls => {
+: await crawler.tryGetSitemap(async (urls) => {
 for (const url of urls) {
 await lockURL(id, sc, url);
 const jobId = uuidv4();


@@ -115,7 +115,8 @@ export async function crawlStatusController(
 const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
 sc.cancelled
 ? "cancelled"
-: (validJobStatuses.every((x) => x[1] === "completed") && validJobStatuses.length > 0)
+: validJobStatuses.every((x) => x[1] === "completed") &&
+validJobStatuses.length > 0
 ? "completed"
 : "scraping";


@@ -7,11 +7,7 @@ import {
 RequestWithAuth,
 toLegacyCrawlerOptions,
 } from "./types";
-import {
-crawlToCrawler,
-saveCrawl,
-StoredCrawl,
-} from "../../lib/crawl-redis";
+import { crawlToCrawler, saveCrawl, StoredCrawl } from "../../lib/crawl-redis";
 import { logCrawl } from "../../services/logging/crawl_log";
 import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
 import { logger as _logger } from "../../lib/logger";
@@ -103,7 +99,8 @@ export async function crawlController(
 await saveCrawl(id, sc);
-await _addScrapeJobToBullMQ({
+await _addScrapeJobToBullMQ(
+{
 url: req.body.url,
 mode: "kickoff" as const,
 team_id: req.auth.team_id,
@@ -115,7 +112,11 @@ export async function crawlController(
 crawl_id: id,
 webhook: req.body.webhook,
 v1: true,
-}, {}, crypto.randomUUID(), 10);
+},
+{},
+crypto.randomUUID(),
+10,
+);
 const protocol = process.env.ENV === "local" ? req.protocol : "https";


@@ -11,7 +11,11 @@ import { saveExtract } from "../../lib/extract/extract-redis";
 import { getTeamIdSyncB } from "../../lib/extract/team-id-sync";
 import { performExtraction } from "../../lib/extract/extraction-service";
-export async function oldExtract(req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>, res: Response<ExtractResponse>, extractId: string){
+export async function oldExtract(
+req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
+res: Response<ExtractResponse>,
+extractId: string,
+) {
 // Means that are in the non-queue system
 // TODO: Remove this once all teams have transitioned to the new system
 try {
@@ -53,7 +57,10 @@ export async function extractController(
 extractId,
 };
-if(await getTeamIdSyncB(req.auth.team_id) && req.body.origin !== "api-sdk") {
+if (
+(await getTeamIdSyncB(req.auth.team_id)) &&
+req.body.origin !== "api-sdk"
+) {
 return await oldExtract(req, res, extractId);
 }


@@ -86,11 +86,15 @@ export async function getMapResults({
 // If sitemapOnly is true, only get links from sitemap
 if (crawlerOptions.sitemapOnly) {
-const sitemap = await crawler.tryGetSitemap(urls => {
+const sitemap = await crawler.tryGetSitemap(
+(urls) => {
 urls.forEach((x) => {
 links.push(x);
 });
-}, true, true);
+},
+true,
+true,
+);
 if (sitemap > 0) {
 links = links
 .slice(1)
@@ -145,7 +149,9 @@ export async function getMapResults({
 // Parallelize sitemap fetch with serper search
 const [_, ...searchResults] = await Promise.all([
-ignoreSitemap ? null : crawler.tryGetSitemap(urls => {
+ignoreSitemap
+? null
+: crawler.tryGetSitemap((urls) => {
 links.push(...urls);
 }, true),
 ...(cachedResult ? [] : pagePromises),


@@ -18,7 +18,10 @@ export async function scrapeStatusController(req: any, res: any) {
 const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
-if (!allowedTeams.includes(job?.team_id) || job?.team_id !== req.auth.team_id) {
+if (
+!allowedTeams.includes(job?.team_id) ||
+job?.team_id !== req.auth.team_id
+) {
 return res.status(403).json({
 success: false,
 error: "You are not allowed to access this resource.",


@@ -200,7 +200,8 @@ export const extractV1Options = z
 schema: z
 .any()
 .optional()
-.refine((val) => {
+.refine(
+(val) => {
 if (!val) return true; // Allow undefined schema
 try {
 const validate = ajv.compile(val);
@@ -208,9 +209,11 @@ export const extractV1Options = z
 } catch (e) {
 return false;
 }
-}, {
+},
+{
 message: "Invalid JSON schema.",
-}),
+},
+),
 limit: z.number().int().positive().finite().safe().optional(),
 ignoreSitemap: z.boolean().default(false),
 includeSubdomains: z.boolean().default(true),
@@ -452,7 +455,7 @@ export type Document = {
 description: string;
 url: string;
 };
-}
+};
 export type ErrorResponse = {
 success: false;
@@ -477,7 +480,7 @@ export interface ScrapeResponseRequestTest {
 export interface URLTrace {
 url: string;
-status: 'mapped' | 'scraped' | 'error';
+status: "mapped" | "scraped" | "error";
 timing: {
 discoveredAt: string;
 scrapedAt?: string;
@@ -785,9 +788,18 @@ export function toLegacyDocument(
 };
 }
-export const searchRequestSchema = z.object({
+export const searchRequestSchema = z
+.object({
 query: z.string(),
-limit: z.number().int().positive().finite().safe().max(10).optional().default(5),
+limit: z
+.number()
+.int()
+.positive()
+.finite()
+.safe()
+.max(10)
+.optional()
+.default(5),
 tbs: z.string().optional(),
 filter: z.string().optional(),
 lang: z.string().optional().default("en"),
@@ -795,18 +807,27 @@ export const searchRequestSchema = z.object({
 location: z.string().optional(),
 origin: z.string().optional().default("api"),
 timeout: z.number().int().positive().finite().safe().default(60000),
-scrapeOptions: scrapeOptions.extend({
-formats: z.array(z.enum([
+scrapeOptions: scrapeOptions
+.extend({
+formats: z
+.array(
+z.enum([
 "markdown",
 "html",
 "rawHtml",
 "links",
 "screenshot",
 "screenshot@fullPage",
-"extract"
-])).default([])
-}).default({}),
-}).strict("Unrecognized key in body -- please review the v1 API documentation for request body changes");
+"extract",
+]),
+)
+.default([]),
+})
+.default({}),
+})
+.strict(
+"Unrecognized key in body -- please review the v1 API documentation for request body changes",
+);
 export type SearchRequest = z.infer<typeof searchRequestSchema>;


@@ -45,7 +45,10 @@ const serverAdapter = new ExpressAdapter();
 serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
 const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
-queues: [new BullAdapter(getScrapeQueue()), new BullAdapter(getExtractQueue())],
+queues: [
+new BullAdapter(getScrapeQueue()),
+new BullAdapter(getExtractQueue()),
+],
 serverAdapter: serverAdapter,
 });


@@ -1,91 +1,89 @@
-import { normalizeUrl, normalizeUrlOnlyHostname } from './canonical-url';
+import { normalizeUrl, normalizeUrlOnlyHostname } from "./canonical-url";
-describe('normalizeUrlOnlyHostname', () => {
-it('should remove protocol and www from URL', () => {
-const url = 'https://www.example.com';
-const expected = 'example.com';
+describe("normalizeUrlOnlyHostname", () => {
+it("should remove protocol and www from URL", () => {
+const url = "https://www.example.com";
+const expected = "example.com";
 expect(normalizeUrlOnlyHostname(url)).toBe(expected);
 });
-it('should remove only protocol if www is not present', () => {
-const url = 'https://example.com';
-const expected = 'example.com';
+it("should remove only protocol if www is not present", () => {
+const url = "https://example.com";
+const expected = "example.com";
 expect(normalizeUrlOnlyHostname(url)).toBe(expected);
 });
-it('should handle URLs without protocol', () => {
-const url = 'www.example.com';
-const expected = 'example.com';
+it("should handle URLs without protocol", () => {
+const url = "www.example.com";
+const expected = "example.com";
 expect(normalizeUrlOnlyHostname(url)).toBe(expected);
 });
-it('should handle URLs without protocol and www', () => {
-const url = 'example.com';
-const expected = 'example.com';
+it("should handle URLs without protocol and www", () => {
+const url = "example.com";
+const expected = "example.com";
 expect(normalizeUrlOnlyHostname(url)).toBe(expected);
 });
-it('should handle URLs with paths', () => {
-const url = 'https://www.example.com/path/to/resource';
-const expected = 'example.com';
+it("should handle URLs with paths", () => {
+const url = "https://www.example.com/path/to/resource";
+const expected = "example.com";
 expect(normalizeUrlOnlyHostname(url)).toBe(expected);
 });
-it('should handle invalid URLs gracefully', () => {
-const url = 'not a valid url';
-const expected = 'not a valid url';
+it("should handle invalid URLs gracefully", () => {
+const url = "not a valid url";
+const expected = "not a valid url";
 expect(normalizeUrlOnlyHostname(url)).toBe(expected);
 });
 });
-describe('normalizeUrl', () => {
-it('should remove protocol and www from URL', () => {
-const url = 'https://www.example.com';
-const expected = 'example.com';
+describe("normalizeUrl", () => {
+it("should remove protocol and www from URL", () => {
+const url = "https://www.example.com";
+const expected = "example.com";
 expect(normalizeUrl(url)).toBe(expected);
 });
-it('should remove only protocol if www is not present', () => {
-const url = 'https://example.com';
-const expected = 'example.com';
+it("should remove only protocol if www is not present", () => {
+const url = "https://example.com";
+const expected = "example.com";
 expect(normalizeUrl(url)).toBe(expected);
 });
-it('should handle URLs without protocol', () => {
-const url = 'www.example.com';
-const expected = 'example.com';
+it("should handle URLs without protocol", () => {
+const url = "www.example.com";
+const expected = "example.com";
 expect(normalizeUrl(url)).toBe(expected);
 });
-it('should handle URLs without protocol and www', () => {
-const url = 'example.com';
-const expected = 'example.com';
+it("should handle URLs without protocol and www", () => {
+const url = "example.com";
+const expected = "example.com";
 expect(normalizeUrl(url)).toBe(expected);
 });
-it('should handle URLs with paths', () => {
-const url = 'https://www.example.com/path/to/resource';
-const expected = 'example.com/path/to/resource';
+it("should handle URLs with paths", () => {
+const url = "https://www.example.com/path/to/resource";
+const expected = "example.com/path/to/resource";
 expect(normalizeUrl(url)).toBe(expected);
 });
-it('should handle URLs with trailing slash', () => {
-const url = 'https://www.example.com/';
-const expected = 'example.com';
+it("should handle URLs with trailing slash", () => {
+const url = "https://www.example.com/";
+const expected = "example.com";
 expect(normalizeUrl(url)).toBe(expected);
 });
-it('should handle URLs with trailing slash and path', () => {
-const url = 'https://www.example.com/path/';
-const expected = 'example.com/path';
+it("should handle URLs with trailing slash and path", () => {
+const url = "https://www.example.com/path/";
+const expected = "example.com/path";
 expect(normalizeUrl(url)).toBe(expected);
 });
-it('should handle invalid URLs gracefully', () => {
-const url = 'not a valid url';
-const expected = 'not a valid url';
+it("should handle invalid URLs gracefully", () => {
+const url = "not a valid url";
+const expected = "not a valid url";
 expect(normalizeUrl(url)).toBe(expected);
 });
 });


@@ -322,7 +322,7 @@ export async function lockURLs(
 export async function lockURLsIndividually(
 id: string,
 sc: StoredCrawl,
-jobs: { id: string; url: string; }[],
+jobs: { id: string; url: string }[],
 ) {
 const out: typeof jobs = [];


@@ -6,6 +6,4 @@ export const extractConfig = {
 MIN_REQUIRED_LINKS: 1,
 };
-export const CUSTOM_U_TEAMS = [
-"874d40cc-a5c0-4e93-b661-9ddfbad5e51e"
-]
+export const CUSTOM_U_TEAMS = ["874d40cc-a5c0-4e93-b661-9ddfbad5e51e"];


@@ -21,14 +21,19 @@ export async function getExtract(id: string): Promise<StoredExtract | null> {
 return x ? JSON.parse(x) : null;
 }
-export async function updateExtract(id: string, extract: Partial<StoredExtract>) {
+export async function updateExtract(
+id: string,
+extract: Partial<StoredExtract>,
+) {
 const current = await getExtract(id);
 if (!current) return;
-await redisConnection.set("extract:" + id, JSON.stringify({ ...current, ...extract }));
+await redisConnection.set(
+"extract:" + id,
+JSON.stringify({ ...current, ...extract }),
+);
 await redisConnection.expire("extract:" + id, 24 * 60 * 60, "NX");
 }
 export async function getExtractExpiry(id: string): Promise<Date> {
 const d = new Date();
 const ttl = await redisConnection.pttl("extract:" + id);


@@ -40,14 +40,18 @@ function getRootDomain(url: string): string {
 }
 }
-export async function performExtraction(extractId: string, options: ExtractServiceOptions): Promise<ExtractResult> {
+export async function performExtraction(
+extractId: string,
+options: ExtractServiceOptions,
+): Promise<ExtractResult> {
 const { request, teamId, plan, subId } = options;
 const urlTraces: URLTrace[] = [];
 let docs: Document[] = [];
 // Process URLs
-const urlPromises = request.urls.map(url =>
-processUrl({
+const urlPromises = request.urls.map((url) =>
+processUrl(
+{
 url,
 prompt: request.prompt,
 teamId,
@@ -56,16 +60,19 @@ export async function performExtraction(extractId: string, options: ExtractServi
 origin: request.origin,
 limit: request.limit,
 includeSubdomains: request.includeSubdomains,
-}, urlTraces)
+},
+urlTraces,
+),
 );
 const processedUrls = await Promise.all(urlPromises);
-const links = processedUrls.flat().filter(url => url);
+const links = processedUrls.flat().filter((url) => url);
 if (links.length === 0) {
 return {
 success: false,
-error: "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
+error:
+"No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
 extractId,
 urlTrace: urlTraces,
 };
@@ -73,14 +80,17 @@ export async function performExtraction(extractId: string, options: ExtractServi
 // Scrape documents
 const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
-const scrapePromises = links.map(url =>
-scrapeDocument({
+const scrapePromises = links.map((url) =>
+scrapeDocument(
+{
 url,
 teamId,
 plan,
 origin: request.origin || "api",
 timeout,
-}, urlTraces)
+},
+urlTraces,
+),
 );
 try {
@@ -114,13 +124,16 @@ export async function performExtraction(extractId: string, options: ExtractServi
 // Update token usage in traces
 if (completions.numTokens) {
-const totalLength = docs.reduce((sum, doc) => sum + (doc.markdown?.length || 0), 0);
+const totalLength = docs.reduce(
+(sum, doc) => sum + (doc.markdown?.length || 0),
+0,
+);
 docs.forEach((doc) => {
 if (doc.metadata?.sourceURL) {
 const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL);
 if (trace && trace.contentStats) {
 trace.contentStats.tokensUsed = Math.floor(
-((doc.markdown?.length || 0) / totalLength) * completions.numTokens
+((doc.markdown?.length || 0) / totalLength) * completions.numTokens,
 );
 }
 }
@@ -213,12 +226,12 @@ export async function performExtraction(extractId: string, options: ExtractServi
 updateExtract(extractId, {
 status: "completed",
 }).catch((error) => {
-logger.error(`Failed to update extract ${extractId} status to completed: ${error}`);
+logger.error(
+`Failed to update extract ${extractId} status to completed: ${error}`,
+);
 });
 });
 return {
 success: true,
 data: completions.extract ?? {},


@@ -1,6 +1,6 @@
-import { Pinecone } from '@pinecone-database/pinecone';
-import { Document } from '../../../controllers/v1/types';
-import { logger } from '../../logger';
+import { Pinecone } from "@pinecone-database/pinecone";
+import { Document } from "../../../controllers/v1/types";
+import { logger } from "../../logger";
 import OpenAI from "openai";
 const openai = new OpenAI({
@@ -48,34 +48,43 @@ export async function indexPage({
 document,
 originUrl,
 crawlId,
-teamId
+teamId,
 }: {
 document: Document;
 originUrl: string;
 crawlId?: string;
 teamId?: string;
-}
-) {
+}) {
 try {
 const index = pinecone.index(INDEX_NAME);
 // Trim markdown if it's too long
 let trimmedMarkdown = document.markdown;
-if (trimmedMarkdown && Buffer.byteLength(trimmedMarkdown, 'utf-8') > MAX_METADATA_SIZE) {
-trimmedMarkdown = trimmedMarkdown.slice(0, Math.floor(MAX_METADATA_SIZE / 2)); // Using half the size to be safe with UTF-8 encoding
+if (
+trimmedMarkdown &&
+Buffer.byteLength(trimmedMarkdown, "utf-8") > MAX_METADATA_SIZE
+) {
+trimmedMarkdown = trimmedMarkdown.slice(
+0,
+Math.floor(MAX_METADATA_SIZE / 2),
+); // Using half the size to be safe with UTF-8 encoding
 }
 // Create text to embed
 const textToEmbed = [
 document.metadata.title,
 document.metadata.description,
-trimmedMarkdown
-].filter(Boolean).join('\n\n');
+trimmedMarkdown,
+]
+.filter(Boolean)
+.join("\n\n");
 // Get embedding from OpenAI
 const embedding = await getEmbedding(textToEmbed);
-const normalizedUrl = normalizeUrl(document.metadata.sourceURL || document.metadata.url!);
+const normalizedUrl = normalizeUrl(
+document.metadata.sourceURL || document.metadata.url!,
+);
 // Prepare metadata
 const metadata: PageMetadata = {
@@ -86,29 +95,30 @@ export async function indexPage({
 crawlId,
 teamId,
 markdown: trimmedMarkdown,
-timestamp: Date.now()
+timestamp: Date.now(),
 };
 // Upsert to Pinecone
-await index.upsert([{
+await index.upsert([
+{
 id: normalizedUrl,
 values: embedding,
 metadata: {
 ...metadata,
-[document.metadata.sourceURL || document.metadata.url!]: true
-}
-}]);
+[document.metadata.sourceURL || document.metadata.url!]: true,
+},
+},
+]);
-logger.debug('Successfully indexed page in Pinecone', {
+logger.debug("Successfully indexed page in Pinecone", {
 url: metadata.url,
-crawlId
+crawlId,
 });
 } catch (error) {
-logger.error('Failed to index page in Pinecone', {
+logger.error("Failed to index page in Pinecone", {
 error,
 url: document.metadata.sourceURL || document.metadata.url,
-crawlId
+crawlId,
 });
 }
 }
@@ -116,7 +126,7 @@ export async function indexPage({
 export async function searchSimilarPages(
 query: string,
 originUrl?: string,
-limit: number = 10
+limit: number = 10,
 ) {
 try {
 const index = pinecone.index(INDEX_NAME);
@@ -127,31 +137,30 @@ export async function searchSimilarPages(
 const queryParams: any = {
 vector: queryEmbedding,
 topK: limit,
-includeMetadata: true
+includeMetadata: true,
 };
 const normalizedOriginUrl = originUrl ? normalizeUrl(originUrl) : undefined;
 // Add filter if originUrl is provided
 if (normalizedOriginUrl) {
 queryParams.filter = {
-originUrl: { $eq: normalizedOriginUrl }
+originUrl: { $eq: normalizedOriginUrl },
 };
 }
 const results = await index.query(queryParams);
-return results.matches.map(match => ({
+return results.matches.map((match) => ({
 url: match.metadata?.url,
 title: match.metadata?.title,
 description: match.metadata?.description,
 score: match.score,
-markdown: match.metadata?.markdown
+markdown: match.metadata?.markdown,
 }));
 } catch (error) {
-logger.error('Failed to search similar pages in Pinecone', {
+logger.error("Failed to search similar pages in Pinecone", {
 error,
 query,
-originUrl
+originUrl,
 });
 return [];
 }


@@ -9,8 +9,6 @@ const cohere = new CohereClient({
 token: process.env.COHERE_API_KEY,
 });
 interface RankingResult {
 mappedLinks: MapDocument[];
 linksAndScores: {
@@ -59,7 +57,6 @@ export async function rerankLinks(
 searchQuery,
 );
 // First try with high threshold
 let filteredLinks = filterAndProcessLinks(
 mappedLinks,
@@ -67,8 +64,6 @@ export async function rerankLinks(
 extractConfig.INITIAL_SCORE_THRESHOLD,
 );
 // If we don't have enough high-quality links, try with lower threshold
 if (filteredLinks.length < extractConfig.MIN_REQUIRED_LINKS) {
 logger.info(
@@ -102,7 +97,7 @@ export async function rerankLinks(
 if (trace) {
 trace.relevanceScore = score.score;
 // If URL didn't make it through filtering, mark it as filtered out
-if (!filteredLinks.some(link => link.url === score.link)) {
+if (!filteredLinks.some((link) => link.url === score.link)) {
 trace.warning = `Relevance score ${score.score} below threshold`;
 trace.usedInCompletion = false;
 }
@@ -112,18 +107,18 @@ export async function rerankLinks(
 const rankedLinks = filteredLinks.slice(0, extractConfig.MAX_RANKING_LIMIT);
 // Mark URLs that will be used in completion
-rankedLinks.forEach(link => {
-const trace = urlTraces.find(t => t.url === link.url);
+rankedLinks.forEach((link) => {
+const trace = urlTraces.find((t) => t.url === link.url);
 if (trace) {
 trace.usedInCompletion = true;
 }
 });
 // Mark URLs that were dropped due to ranking limit
-filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach(link => {
-const trace = urlTraces.find(t => t.url === link.url);
+filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach((link) => {
+const trace = urlTraces.find((t) => t.url === link.url);
 if (trace) {
-trace.warning = 'Excluded due to ranking limit';
+trace.warning = "Excluded due to ranking limit";
 trace.usedInCompletion = false;
 }
 });


@@ -20,10 +20,13 @@ interface ProcessUrlOptions {
 includeSubdomains?: boolean;
 }
-export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace[]): Promise<string[]> {
+export async function processUrl(
+options: ProcessUrlOptions,
+urlTraces: URLTrace[],
+): Promise<string[]> {
 const trace: URLTrace = {
 url: options.url,
-status: 'mapped',
+status: "mapped",
 timing: {
 discoveredAt: new Date().toISOString(),
 },
@@ -35,8 +38,8 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
 trace.usedInCompletion = true;
 return [options.url];
 }
-trace.status = 'error';
-trace.error = 'URL is blocked';
+trace.status = "error";
+trace.error = "URL is blocked";
 trace.usedInCompletion = false;
 return [];
 }
@@ -46,9 +49,10 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
 let rephrasedPrompt = options.prompt;
 if (options.prompt) {
-rephrasedPrompt = await generateBasicCompletion(
-buildRefrasedPrompt(options.prompt, baseUrl)
-) ?? options.prompt;
+rephrasedPrompt =
+(await generateBasicCompletion(
+buildRefrasedPrompt(options.prompt, baseUrl),
+)) ?? options.prompt;
 }
 try {
@@ -70,11 +74,11 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
 let uniqueUrls = removeDuplicateUrls(allUrls);
 // Track all discovered URLs
-uniqueUrls.forEach(discoveredUrl => {
-if (!urlTraces.some(t => t.url === discoveredUrl)) {
+uniqueUrls.forEach((discoveredUrl) => {
+if (!urlTraces.some((t) => t.url === discoveredUrl)) {
 urlTraces.push({
 url: discoveredUrl,
-status: 'mapped',
+status: "mapped",
 timing: {
 discoveredAt: new Date().toISOString(),
 },
@@ -102,12 +106,12 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
 uniqueUrls = removeDuplicateUrls(allUrls);
 // Track all discovered URLs
-uniqueUrls.forEach(discoveredUrl => {
-if (!urlTraces.some(t => t.url === discoveredUrl)) {
+uniqueUrls.forEach((discoveredUrl) => {
+if (!urlTraces.some((t) => t.url === discoveredUrl)) {
 urlTraces.push({
 url: discoveredUrl,
-status: 'mapped',
-warning: 'Broader search. Not limiting map results to prompt.',
+status: "mapped",
+warning: "Broader search. Not limiting map results to prompt.",
 timing: {
 discoveredAt: new Date().toISOString(),
 },
@@ -118,11 +122,11 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
 }
 // Track all discovered URLs
-uniqueUrls.forEach(discoveredUrl => {
-if (!urlTraces.some(t => t.url === discoveredUrl)) {
+uniqueUrls.forEach((discoveredUrl) => {
+if (!urlTraces.some((t) => t.url === discoveredUrl)) {
 urlTraces.push({
 url: discoveredUrl,
-status: 'mapped',
+status: "mapped",
 timing: {
 discoveredAt: new Date().toISOString(),
 },
@@ -155,9 +159,9 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
 mappedLinks = await rerankLinks(mappedLinks, searchQuery, urlTraces);
 }
-return mappedLinks.map(x => x.url);
+return mappedLinks.map((x) => x.url);
 } catch (error) {
-trace.status = 'error';
+trace.status = "error";
 trace.error = error.message;
 trace.usedInCompletion = false;
 return [];


@@ -42,11 +42,18 @@ export const logger = winston.createLogger({
 },
 }),
 transports: [
-...(process.env.FIRECRAWL_LOG_TO_FILE ? [
+...(process.env.FIRECRAWL_LOG_TO_FILE
+? [
 new winston.transports.File({
-filename: "firecrawl-" + (process.argv[1].includes("worker") ? "worker" : "app") + "-" + crypto.randomUUID() + ".log",
-})
-] : []),
+filename:
+"firecrawl-" +
+(process.argv[1].includes("worker") ? "worker" : "app") +
+"-" +
+crypto.randomUUID() +
+".log",
+}),
+]
+: []),
 new winston.transports.Console({
 format: winston.format.combine(
 winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),


@@ -179,7 +179,6 @@ export async function runWebScraper({
 return response;
 }
 billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => {
 logger.error(
 `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,


@@ -192,7 +192,8 @@ v1Router.get(
 wrap((req: any, res): any => crawlStatusController(req, res, true)),
 );
-v1Router.get("/scrape/:jobId",
+v1Router.get(
+"/scrape/:jobId",
 authMiddleware(RateLimiterMode.CrawlStatus),
 wrap(scrapeStatusController),
 );
@@ -242,6 +243,3 @@ v1Router.get(
 authMiddleware(RateLimiterMode.CrawlStatus),
 wrap(creditUsageController),
 );


@@ -219,18 +219,29 @@ export class WebCrawler {
 const _urlsHandler = async (urls: string[]) => {
 let uniqueURLs: string[] = [];
 for (const url of urls) {
-if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(url))) {
+if (
+await redisConnection.sadd(
+"sitemap:" + this.jobId + ":links",
+normalizeUrl(url),
+)
+) {
 uniqueURLs.push(url);
 }
 }
-await redisConnection.expire("sitemap:" + this.jobId + ":links", 3600, "NX");
+await redisConnection.expire(
+"sitemap:" + this.jobId + ":links",
+3600,
+"NX",
+);
 if (uniqueURLs.length > 0) {
 urlsHandler(uniqueURLs);
 }
 };
-let count = await this.tryFetchSitemapLinks(this.initialUrl, (urls: string[]) => {
+let count = await this.tryFetchSitemapLinks(
+this.initialUrl,
+(urls: string[]) => {
 if (fromMap && onlySitemap) {
 return urlsHandler(urls);
 } else {
@@ -243,10 +254,16 @@ export class WebCrawler {
 leftOfLimit -= filteredLinks.length;
 return _urlsHandler(filteredLinks);
 }
-});
+},
+);
 if (count > 0) {
-if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(this.initialUrl))) {
+if (
+await redisConnection.sadd(
+"sitemap:" + this.jobId + ":links",
+normalizeUrl(this.initialUrl),
+)
+) {
 urlsHandler([this.initialUrl]);
 }
 count++;
@@ -470,8 +487,13 @@ export class WebCrawler {
 return socialMediaOrEmail.some((ext) => url.includes(ext));
 }
-private async tryFetchSitemapLinks(url: string, urlsHandler: (urls: string[]) => unknown): Promise<number> {
-const sitemapUrl = url.endsWith(".xml") ? url : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
+private async tryFetchSitemapLinks(
+url: string,
+urlsHandler: (urls: string[]) => unknown,
+): Promise<number> {
+const sitemapUrl = url.endsWith(".xml")
+? url
+: `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
 let sitemapCount: number = 0;
@@ -482,37 +504,43 @@ export class WebCrawler {
 this.logger,
 );
 } catch (error) {
-this.logger.debug(
-`Failed to fetch sitemap from ${sitemapUrl}`,
-{ method: "tryFetchSitemapLinks", sitemapUrl, error },
-);
+this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
+method: "tryFetchSitemapLinks",
+sitemapUrl,
+error,
+});
 }
 // If this is a subdomain, also try to get sitemap from the main domain
 try {
 const urlObj = new URL(url);
 const hostname = urlObj.hostname;
-const domainParts = hostname.split('.');
+const domainParts = hostname.split(".");
 // Check if this is a subdomain (has more than 2 parts and not www)
-if (domainParts.length > 2 && domainParts[0] !== 'www') {
+if (domainParts.length > 2 && domainParts[0] !== "www") {
 // Get the main domain by taking the last two parts
-const mainDomain = domainParts.slice(-2).join('.');
+const mainDomain = domainParts.slice(-2).join(".");
 const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
 const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;
 try {
 // Get all links from the main domain's sitemap
 sitemapCount += await getLinksFromSitemap(
-{ sitemapUrl: mainDomainSitemapUrl, urlsHandler(urls) {
-return urlsHandler(urls.filter(link => {
+{
+sitemapUrl: mainDomainSitemapUrl,
+urlsHandler(urls) {
+return urlsHandler(
+urls.filter((link) => {
 try {
 const linkUrl = new URL(link);
 return linkUrl.hostname.endsWith(hostname);
-} catch {
-}
-}))
-}, mode: "fire-engine" },
+} catch {}
+}),
+);
+},
+mode: "fire-engine",
+},
 this.logger,
 );
 } catch (error) {


@@ -15,7 +15,7 @@ export async function getLinksFromSitemap(
 mode = "axios",
 }: {
 sitemapUrl: string;
-urlsHandler(urls: string[]): unknown,
+urlsHandler(urls: string[]): unknown;
 mode?: "axios" | "fire-engine";
 },
 logger: Logger,
@@ -31,7 +31,10 @@ export async function getLinksFromSitemap(
 { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
 );
 if (!response.success) {
-logger.debug("Failed to scrape sitemap via TLSClient, falling back to axios...", { error: response.error })
+logger.debug(
+"Failed to scrape sitemap via TLSClient, falling back to axios...",
+{ error: response.error },
+);
 const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout });
 content = ar.data;
 } else {
@@ -63,14 +66,11 @@ export async function getLinksFromSitemap(
 .map((sitemap) => sitemap.loc[0].trim());
 const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
-getLinksFromSitemap(
-{ sitemapUrl, urlsHandler, mode },
-logger,
-),
+getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger),
 );
 const results = await Promise.all(sitemapPromises);
-count = results.reduce((a,x) => a + x)
+count = results.reduce((a, x) => a + x);
 } else if (root && root.url) {
 // Check if any URLs point to additional sitemaps
 const xmlSitemaps: string[] = root.url
@@ -78,7 +78,7 @@ export async function getLinksFromSitemap(
 (url) =>
 url.loc &&
 url.loc.length > 0 &&
-url.loc[0].trim().toLowerCase().endsWith('.xml')
+url.loc[0].trim().toLowerCase().endsWith(".xml"),
 )
 .map((url) => url.loc[0].trim());
@@ -90,7 +90,10 @@ export async function getLinksFromSitemap(
 logger,
 ),
 );
-count += (await Promise.all(sitemapPromises)).reduce((a,x) => a + x, 0);
+count += (await Promise.all(sitemapPromises)).reduce(
+(a, x) => a + x,
+0,
+);
 }
 const validUrls = root.url
@@ -98,7 +101,7 @@ export async function getLinksFromSitemap(
 (url) =>
 url.loc &&
 url.loc.length > 0 &&
-!url.loc[0].trim().toLowerCase().endsWith('.xml') &&
+!url.loc[0].trim().toLowerCase().endsWith(".xml") &&
 !WebCrawler.prototype.isFile(url.loc[0].trim()),
 )
 .map((url) => url.loc[0].trim());


@@ -3,7 +3,10 @@ import { EngineScrapeResult } from "..";
 import { Meta } from "../..";
 import { TimeoutError } from "../../error";
 import { specialtyScrapeCheck } from "../utils/specialtyHandler";
-import { InsecureConnectionError, makeSecureDispatcher } from "../utils/safeFetch";
+import {
+InsecureConnectionError,
+makeSecureDispatcher,
+} from "../utils/safeFetch";
 export async function scrapeURLWithFetch(
 meta: Meta,
@@ -20,7 +23,9 @@ export async function scrapeURLWithFetch(
 headers: meta.options.headers,
 }),
 (async () => {
-await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
+await new Promise((resolve) =>
+setTimeout(() => resolve(null), timeout),
+);
 throw new TimeoutError(
 "Fetch was unable to scrape the page before timing out",
 { cause: { timeout } },
@@ -28,7 +33,10 @@ export async function scrapeURLWithFetch(
 })(),
 ]);
 } catch (error) {
-if (error instanceof TypeError && error.cause instanceof InsecureConnectionError) {
+if (
+error instanceof TypeError &&
+error.cause instanceof InsecureConnectionError
+) {
 throw error.cause;
 } else {
 throw error;


@@ -3,7 +3,12 @@ import * as Sentry from "@sentry/node";
 import { z } from "zod";
 import { robustFetch } from "../../lib/fetch";
-import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error";
+import {
+ActionError,
+EngineError,
+SiteError,
+UnsupportedFileError,
+} from "../../error";
 const successSchema = z.object({
 jobId: z.string(),
@@ -37,10 +42,13 @@ const successSchema = z.object({
 .optional(),
 // chrome-cdp only -- file download handler
-file: z.object({
+file: z
+.object({
 name: z.string(),
 content: z.string(),
-}).optional().or(z.null()),
+})
+.optional()
+.or(z.null()),
 });
 export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
@@ -121,7 +129,9 @@ export async function fireEngineCheckStatus(
 typeof status.error === "string" &&
 status.error.includes("File size exceeds")
 ) {
-throw new UnsupportedFileError("File size exceeds " + status.error.split("File size exceeds ")[1]);
+throw new UnsupportedFileError(
+"File size exceeds " + status.error.split("File size exceeds ")[1],
+);
 } else if (
 typeof status.error === "string" &&
 // TODO: improve this later


@@ -13,7 +13,13 @@ import {
 FireEngineCheckStatusSuccess,
 StillProcessingError,
 } from "./checkStatus";
-import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError } from "../../error";
+import {
+ActionError,
+EngineError,
+SiteError,
+TimeoutError,
+UnsupportedFileError,
+} from "../../error";
 import * as Sentry from "@sentry/node";
 import { Action } from "../../../../lib/entities";
 import { specialtyScrapeCheck } from "../utils/specialtyHandler";


@@ -298,7 +298,6 @@ export function buildFallbackList(meta: Meta): {
 engine: Engine;
 unsupportedFeatures: Set<FeatureFlag>;
 }[] {
 if (meta.internalOptions.useCache !== true) {
 const cacheIndex = engines.indexOf("cache");
 if (cacheIndex !== -1) {


@@ -7,11 +7,18 @@ import { v4 as uuid } from "uuid";
 import * as undici from "undici";
 import { makeSecureDispatcher } from "./safeFetch";
-export async function fetchFileToBuffer(url: string, init?: undici.RequestInit): Promise<{
+export async function fetchFileToBuffer(
+url: string,
+init?: undici.RequestInit,
+): Promise<{
 response: undici.Response;
 buffer: Buffer;
 }> {
-const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) });
+const response = await undici.fetch(url, {
+...init,
+redirect: "follow",
+dispatcher: await makeSecureDispatcher(url),
+});
 return {
 response,
 buffer: Buffer.from(await response.arrayBuffer()),
@@ -30,7 +37,11 @@ export async function downloadFile(
 const tempFileWrite = createWriteStream(tempFilePath);
 // TODO: maybe we could use tlsclient for this? for proxying
-const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) });
+const response = await undici.fetch(url, {
+...init,
+redirect: "follow",
+dispatcher: await makeSecureDispatcher(url),
+});
 // This should never happen in the current state of JS/Undici (2024), but let's check anyways.
 if (response.body === null) {


@@ -5,36 +5,44 @@ import { Address6 } from "ip-address";
 export class InsecureConnectionError extends Error {
 constructor() {
-super("Connection violated security rules.")
+super("Connection violated security rules.");
 }
 }
 function isIPv4Private(address: string): boolean {
-const parts = address.split(".").map(x => parseInt(x, 10));
-return parts[0] === 0 // Current (local, "this") network
-|| parts[0] === 10 // Used for local communications within a private network
-|| (parts[0] === 100 && parts[1] >= 64 && parts[1] < 128) // Shared address space for communications between a service provider and its subscribers when using a carrier-grade NAT
-|| parts[0] === 127 // Used for loopback addresses to the local host
-|| (parts[0] === 169 && parts[1] === 254) // Used for link-local addresses between two hosts on a single link when no IP address is otherwise specified, such as would have normally been retrieved from a DHCP server
-|| (parts[0] === 127 && parts[1] >= 16 && parts[2] < 32) // Used for local communications within a private network
-|| (parts[0] === 192 && parts[1] === 0 && parts[2] === 0) // IETF Porotocol Assignments, DS-Lite (/29)
-|| (parts[0] === 192 && parts[1] === 0 && parts[2] === 2) // Assigned as TEST-NET-1, documentation and examples
-|| (parts[0] === 192 && parts[1] === 88 && parts[2] === 99) // Reserved. Formerly used for IPv6 to IPv4 relay (included IPv6 address block 2002::/16).
-|| (parts[0] === 192 && parts[1] === 168) // Used for local communications within a private network
-|| (parts[0] === 192 && parts[1] >= 18 && parts[1] < 20) // Used for benchmark testing of inter-network communications between two separate subnets
-|| (parts[0] === 198 && parts[1] === 51 && parts[2] === 100) // Assigned as TEST-NET-2, documentation and examples
-|| (parts[0] === 203 && parts[1] === 0 && parts[2] === 113) // Assigned as TEST-NET-3, documentation and examples
-|| (parts[0] >= 224 && parts[0] < 240) // In use for multicast (former Class D network)
-|| (parts[0] === 233 && parts[1] === 252 && parts[2] === 0) // Assigned as MCAST-TEST-NET, documentation and examples (Note that this is part of the above multicast space.)
-|| parts[0] >= 240 // Reserved for future use (former class E network)
-|| (parts[0] === 255 && parts[1] === 255 && parts[2] === 255 && parts[3] === 255) // Reserved for the "limited broadcast" destination address
+const parts = address.split(".").map((x) => parseInt(x, 10));
+return (
+parts[0] === 0 || // Current (local, "this") network
+parts[0] === 10 || // Used for local communications within a private network
+(parts[0] === 100 && parts[1] >= 64 && parts[1] < 128) || // Shared address space for communications between a service provider and its subscribers when using a carrier-grade NAT
+parts[0] === 127 || // Used for loopback addresses to the local host
+(parts[0] === 169 && parts[1] === 254) || // Used for link-local addresses between two hosts on a single link when no IP address is otherwise specified, such as would have normally been retrieved from a DHCP server
+(parts[0] === 127 && parts[1] >= 16 && parts[2] < 32) || // Used for local communications within a private network
+(parts[0] === 192 && parts[1] === 0 && parts[2] === 0) || // IETF Porotocol Assignments, DS-Lite (/29)
+(parts[0] === 192 && parts[1] === 0 && parts[2] === 2) || // Assigned as TEST-NET-1, documentation and examples
+(parts[0] === 192 && parts[1] === 88 && parts[2] === 99) || // Reserved. Formerly used for IPv6 to IPv4 relay (included IPv6 address block 2002::/16).
+(parts[0] === 192 && parts[1] === 168) || // Used for local communications within a private network
+(parts[0] === 192 && parts[1] >= 18 && parts[1] < 20) || // Used for benchmark testing of inter-network communications between two separate subnets
+(parts[0] === 198 && parts[1] === 51 && parts[2] === 100) || // Assigned as TEST-NET-2, documentation and examples
+(parts[0] === 203 && parts[1] === 0 && parts[2] === 113) || // Assigned as TEST-NET-3, documentation and examples
+(parts[0] >= 224 && parts[0] < 240) || // In use for multicast (former Class D network)
+(parts[0] === 233 && parts[1] === 252 && parts[2] === 0) || // Assigned as MCAST-TEST-NET, documentation and examples (Note that this is part of the above multicast space.)
+parts[0] >= 240 || // Reserved for future use (former class E network)
+(parts[0] === 255 &&
+parts[1] === 255 &&
+parts[2] === 255 &&
+parts[3] === 255)
+); // Reserved for the "limited broadcast" destination address
 }
 function isIPv6Private(ipv6) {
 return new Address6(ipv6).getScope() !== "Global";
 }
-export function makeSecureDispatcher(url: string, options?: undici.Agent.Options) {
+export function makeSecureDispatcher(
+url: string,
+options?: undici.Agent.Options,
+) {
 const agent = new undici.Agent({
 connect: {
 rejectUnauthorized: false, // bypass SSL failures -- this is fine
@@ -46,12 +54,18 @@ export function makeSecureDispatcher(url: string, options?: undici.Agent.Options
 agent.on("connect", (_, targets) => {
 const client: undici.Client = targets.slice(-1)[0] as undici.Client;
-const socketSymbol = Object.getOwnPropertySymbols(client).find(x => x.description === "socket")!;
+const socketSymbol = Object.getOwnPropertySymbols(client).find(
+(x) => x.description === "socket",
+)!;
 const socket: Socket | TLSSocket = (client as any)[socketSymbol];
 if (socket.remoteAddress) {
-if (socket.remoteFamily === "IPv4" ? isIPv4Private(socket.remoteAddress!) : isIPv6Private(socket.remoteAddress!)) {
-socket.destroy(new InsecureConnectionError())
+if (
+socket.remoteFamily === "IPv4"
+? isIPv4Private(socket.remoteAddress!)
+: isIPv6Private(socket.remoteAddress!)
+) {
+socket.destroy(new InsecureConnectionError());
 }
 }
 });


@@ -420,7 +420,9 @@ export async function scrapeURL(
 } else if (error instanceof ActionError) {
 meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
 } else if (error instanceof UnsupportedFileError) {
-meta.logger.warn("scrapeURL: Tried to scrape unsupported file", { error });
+meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
+error,
+});
 } else {
 Sentry.captureException(error);
 meta.logger.error("scrapeURL: Unexpected error happened", { error });


@@ -44,10 +44,15 @@ export function extractMetadata(
 title = soup("title").first().text().trim() || undefined;
 description = soup('meta[name="description"]').attr("content") || undefined;
-const faviconLink = soup('link[rel="icon"]').attr("href") || soup('link[rel*="icon"]').first().attr("href") || undefined;
+const faviconLink =
+soup('link[rel="icon"]').attr("href") ||
+soup('link[rel*="icon"]').first().attr("href") ||
+undefined;
 if (faviconLink) {
 const baseUrl = new URL(meta.url).origin;
-favicon = faviconLink.startsWith('http') ? faviconLink : `${baseUrl}${faviconLink}`;
+favicon = faviconLink.startsWith("http")
+? faviconLink
+: `${baseUrl}${faviconLink}`;
 }
 // Assuming the language is part of the URL as per the regex pattern


@@ -24,7 +24,6 @@ export function saveToCache(meta: Meta, document: Document): Document {
 return document;
 }
 const key = cacheKey(meta.url, meta.options, meta.internalOptions);
 if (key !== null) {


@@ -8,13 +8,21 @@ describe("removeDefaultProperty", () => {
 });
 it("should remove the default property from a nested object", () => {
-const input = { default: "test", nested: { default: "nestedTest", test: "nestedTest" } };
+const input = {
+default: "test",
+nested: { default: "nestedTest", test: "nestedTest" },
+};
 const expectedOutput = { nested: { test: "nestedTest" } };
 expect(removeDefaultProperty(input)).toEqual(expectedOutput);
 });
 it("should remove the default property from an array of objects", () => {
-const input = { array: [{ default: "test1", test: "test1" }, { default: "test2", test: "test2" }] };
+const input = {
+array: [
+{ default: "test1", test: "test1" },
+{ default: "test2", test: "test2" },
+],
+};
 const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
 expect(removeDefaultProperty(input)).toEqual(expectedOutput);
 });


@@ -140,10 +140,10 @@ export async function generateOpenAICompletions(
           properties: Object.fromEntries(
             Object.entries(schema).map(([key, value]) => {
               return [key, removeDefaultProperty(value)];
-            })
+            }),
           ),
           required: Object.keys(schema),
-          additionalProperties: false
+          additionalProperties: false,
         };
       }
@@ -240,14 +240,14 @@ export async function performLLMExtract(
 }

 export function removeDefaultProperty(schema: any): any {
-  if (typeof schema !== 'object' || schema === null) return schema;
+  if (typeof schema !== "object" || schema === null) return schema;
   const { default: _, ...rest } = schema;
   for (const key in rest) {
     if (Array.isArray(rest[key])) {
       rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));
-    } else if (typeof rest[key] === 'object' && rest[key] !== null) {
+    } else if (typeof rest[key] === "object" && rest[key] !== null) {
       rest[key] = removeDefaultProperty(rest[key]);
     }
   }
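
As context for the removeDefaultProperty helper touched above, here is a minimal usage sketch. It is illustrative only: the sample schema is invented, and the motivation (stripping "default" keywords from a user-supplied JSON schema before it is sent in a strict structured-output request) is inferred from the surrounding generateOpenAICompletions code, not stated in this commit.

// Hypothetical input: a user-supplied JSON schema containing "default" values.
const userSchema = {
  type: "object",
  default: {},
  properties: {
    name: { type: "string", default: "n/a" },
    tags: { type: "array", items: { type: "string", default: "" } },
  },
};

// removeDefaultProperty walks the object recursively and drops every
// "default" key, leaving the rest of the schema untouched.
const cleaned = removeDefaultProperty(userSchema);
// cleaned === {
//   type: "object",
//   properties: {
//     name: { type: "string" },
//     tags: { type: "array", items: { type: "string" } },
//   },
// }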

View File

@@ -9,9 +9,11 @@ configDotenv();
 function cleanOfNull<T>(x: T): T {
   if (Array.isArray(x)) {
-    return x.map(x => cleanOfNull(x)) as T;
+    return x.map((x) => cleanOfNull(x)) as T;
   } else if (typeof x === "object" && x !== null) {
-    return Object.fromEntries(Object.entries(x).map(([k,v]) => [k,cleanOfNull(v)])) as T
+    return Object.fromEntries(
+      Object.entries(x).map(([k, v]) => [k, cleanOfNull(v)]),
+    ) as T;
   } else if (typeof x === "string") {
     return x.replaceAll("\u0000", "") as T;
   } else {
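
For reference, a sketch of how the cleanOfNull helper reformatted above behaves on nested data. The example payload is invented for illustration, and the motivation (databases such as Postgres rejecting NUL bytes in text columns) is an assumption rather than something stated in this diff.

// Hypothetical payload containing NUL bytes ("\u0000") at several depths.
const dirty = {
  title: "Hello\u0000World",
  links: ["https://example.com\u0000", "https://example.org"],
  nested: { note: "ok\u0000" },
};

// cleanOfNull recurses through arrays and plain objects and strips every
// "\u0000" from string values; non-string primitives pass through unchanged.
const clean = cleanOfNull(dirty);
// clean === {
//   title: "HelloWorld",
//   links: ["https://example.com", "https://example.org"],
//   nested: { note: "ok" },
// }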

View File

@@ -16,9 +16,7 @@ export const loggingQueueName = "{loggingQueue}";
 export function getScrapeQueue() {
   if (!scrapeQueue) {
-    scrapeQueue = new Queue(
-      scrapeQueueName,
-      {
+    scrapeQueue = new Queue(scrapeQueueName, {
       connection: redisConnection,
       defaultJobOptions: {
         removeOnComplete: {
@@ -28,8 +26,7 @@ export function getScrapeQueue() {
         age: 90000, // 25 hours
       },
     },
-    }
-    );
+    });
     logger.info("Web scraper queue created");
   }
   return scrapeQueue;
@@ -37,9 +34,7 @@ export function getScrapeQueue() {
 export function getExtractQueue() {
   if (!extractQueue) {
-    extractQueue = new Queue(
-      extractQueueName,
-      {
+    extractQueue = new Queue(extractQueueName, {
       connection: redisConnection,
       defaultJobOptions: {
         removeOnComplete: {
@@ -49,14 +44,12 @@ export function getExtractQueue() {
         age: 90000, // 25 hours
       },
     },
-    }
-    );
+    });
     logger.info("Extraction queue created");
   }
   return extractQueue;
 }

 // === REMOVED IN FAVOR OF POLLING -- NOT RELIABLE
 // import { QueueEvents } from 'bullmq';
 // export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection.duplicate() });
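
The two queue factories above follow the same BullMQ pattern; a self-contained sketch of that pattern is below. The queue name, Redis URL, and job payload are placeholders, and mirroring removeOnComplete with removeOnFail is an assumption based on the 25-hour retention comment in the diff, not a confirmed detail of this file.

import { Queue } from "bullmq";
import IORedis from "ioredis";

// Shared Redis connection; BullMQ expects maxRetriesPerRequest to be null.
const connection = new IORedis(
  process.env.REDIS_URL ?? "redis://localhost:6379",
  { maxRetriesPerRequest: null },
);

// Lazily created singleton queue with the same retention policy as above:
// finished jobs are pruned automatically once they are 25 hours (90000 s) old.
let exampleQueue: Queue | undefined;
export function getExampleQueue(): Queue {
  if (!exampleQueue) {
    exampleQueue = new Queue("{exampleQueue}", {
      connection,
      defaultJobOptions: {
        removeOnComplete: { age: 90000 }, // 25 hours
        removeOnFail: { age: 90000 }, // 25 hours
      },
    });
  }
  return exampleQueue;
}

// Usage: enqueue a job; a worker listening on the same queue name picks it up.
export async function enqueueExample(url: string) {
  await getExampleQueue().add("example-job", { url });
}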

View File

@@ -89,13 +89,19 @@ const runningJobs: Set<string> = new Set();
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
     (async () => {
-      const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
+      const originUrl = sc.originUrl
+        ? normalizeUrlOnlyHostname(sc.originUrl)
+        : undefined;
       // Get all visited URLs from Redis
       const visitedUrls = await redisConnection.smembers(
         "crawl:" + job.data.crawl_id + ":visited",
       );
       // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
-      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
+      if (
+        visitedUrls.length > 0 &&
+        job.data.crawlerOptions !== null &&
+        originUrl
+      ) {
         // Fire and forget the upload to Supabase
         try {
           // Standardize URLs to canonical form (https, no www)
@@ -317,7 +323,10 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
   return err;
 };

-const processExtractJobInternal = async (token: string, job: Job & { id: string }) => {
+const processExtractJobInternal = async (
+  token: string,
+  job: Job & { id: string },
+) => {
   const logger = _logger.child({
     module: "extract-worker",
     method: "processJobInternal",
@@ -360,11 +369,14 @@ const processExtractJobInternal = async (token: string, job: Job & { id: string
     await updateExtract(job.data.extractId, {
       status: "failed",
-      error: error.error ?? error ?? "Unknown error, please contact help@firecrawl.dev. Extract id: " + job.data.extractId,
+      error:
+        error.error ??
+        error ??
+        "Unknown error, please contact help@firecrawl.dev. Extract id: " +
+          job.data.extractId,
     });
     // throw error;
   } finally {
     clearInterval(extendLockInterval);
   }
 };
@@ -635,7 +647,9 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
       sc,
       jobs.map((x) => ({ id: x.opts.jobId, url: x.data.url })),
     );
-    const lockedJobs = jobs.filter(x => lockedIds.find(y => y.id === x.opts.jobId));
+    const lockedJobs = jobs.filter((x) =>
+      lockedIds.find((y) => y.id === x.opts.jobId),
+    );
     logger.debug("Adding scrape jobs to Redis...");
     await addCrawlJobs(
       job.data.crawl_id,
@@ -790,7 +804,8 @@ async function processJob(job: Job & { id: string }, token: string) {
         ) {
           const crawler = crawlToCrawler(job.data.crawl_id, sc);
           if (
-            crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null &&
+            crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) ===
+              null &&
             !job.data.isCrawlSourceScrape
           ) {
             throw new Error(
@@ -1073,7 +1088,7 @@ async function processJob(job: Job & { id: string }, token: string) {
   console.log("All workers exited. Waiting for all jobs to finish...");
   while (runningJobs.size > 0) {
-    await new Promise(resolve => setTimeout(resolve, 500));
+    await new Promise((resolve) => setTimeout(resolve, 500));
   }
   process.exit(0);