Nick: formatting fixes

Author: Nicolas (2025-01-10 18:35:10 -03:00)
Parent: d1f3b96388
Commit: f4d10c5031
37 changed files with 674 additions and 499 deletions
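
The diffs below are consistent with a Prettier pass using its default options: double quotes, trailing commas, parentheses around single arrow-function parameters, and wrapping near 80 columns. A minimal config sketch that would produce this style (the repository's actual Prettier settings are not part of this commit and are assumed here):

// prettier.config.mjs -- hypothetical; the values mirror Prettier 3 defaults,
// which match the formatting applied throughout this commit.
export default {
  semi: true, // keep semicolons
  singleQuote: false, // double quotes, as in the rewritten string literals
  trailingComma: "all", // trailing commas on multi-line calls and literals
  arrowParens: "always", // `url =>` becomes `(url) =>`
  printWidth: 80, // long calls and ternaries get wrapped
};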


@ -179,11 +179,15 @@ export async function crawlController(req: Request, res: Response) {
const sitemap = sc.crawlerOptions.ignoreSitemap
? 0
: await crawler.tryGetSitemap(async urls => {
: await crawler.tryGetSitemap(async (urls) => {
if (urls.length === 0) return;
let jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
const jobs = urls.map(url => {
let jobPriority = await getJobPriority({
plan,
team_id,
basePriority: 21,
});
const jobs = urls.map((url) => {
const uuid = uuidv4();
return {
name: uuid,


@ -114,7 +114,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
const sitemap = sc.crawlerOptions?.ignoreSitemap
? 0
: await crawler.tryGetSitemap(async urls => {
: await crawler.tryGetSitemap(async (urls) => {
for (const url of urls) {
await lockURL(id, sc, url);
const jobId = uuidv4();


@ -115,7 +115,8 @@ export async function crawlStatusController(
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
sc.cancelled
? "cancelled"
: (validJobStatuses.every((x) => x[1] === "completed") && validJobStatuses.length > 0)
: validJobStatuses.every((x) => x[1] === "completed") &&
validJobStatuses.length > 0
? "completed"
: "scraping";


@ -7,11 +7,7 @@ import {
RequestWithAuth,
toLegacyCrawlerOptions,
} from "./types";
import {
crawlToCrawler,
saveCrawl,
StoredCrawl,
} from "../../lib/crawl-redis";
import { crawlToCrawler, saveCrawl, StoredCrawl } from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
import { logger as _logger } from "../../lib/logger";
@ -103,7 +99,8 @@ export async function crawlController(
await saveCrawl(id, sc);
await _addScrapeJobToBullMQ({
await _addScrapeJobToBullMQ(
{
url: req.body.url,
mode: "kickoff" as const,
team_id: req.auth.team_id,
@ -115,7 +112,11 @@ export async function crawlController(
crawl_id: id,
webhook: req.body.webhook,
v1: true,
}, {}, crypto.randomUUID(), 10);
},
{},
crypto.randomUUID(),
10,
);
const protocol = process.env.ENV === "local" ? req.protocol : "https";


@ -11,7 +11,11 @@ import { saveExtract } from "../../lib/extract/extract-redis";
import { getTeamIdSyncB } from "../../lib/extract/team-id-sync";
import { performExtraction } from "../../lib/extract/extraction-service";
export async function oldExtract(req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>, res: Response<ExtractResponse>, extractId: string){
export async function oldExtract(
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
res: Response<ExtractResponse>,
extractId: string,
) {
// Means that are in the non-queue system
// TODO: Remove this once all teams have transitioned to the new system
try {
@ -29,7 +33,7 @@ export async function oldExtract(req: RequestWithAuth<{}, ExtractResponse, Extra
error: "Internal server error",
});
}
}
}
/**
* Extracts data from the provided URLs based on the request parameters.
* Currently in beta.
@ -53,7 +57,10 @@ export async function extractController(
extractId,
};
if(await getTeamIdSyncB(req.auth.team_id) && req.body.origin !== "api-sdk") {
if (
(await getTeamIdSyncB(req.auth.team_id)) &&
req.body.origin !== "api-sdk"
) {
return await oldExtract(req, res, extractId);
}


@ -86,11 +86,15 @@ export async function getMapResults({
// If sitemapOnly is true, only get links from sitemap
if (crawlerOptions.sitemapOnly) {
const sitemap = await crawler.tryGetSitemap(urls => {
const sitemap = await crawler.tryGetSitemap(
(urls) => {
urls.forEach((x) => {
links.push(x);
});
}, true, true);
},
true,
true,
);
if (sitemap > 0) {
links = links
.slice(1)
@ -145,7 +149,9 @@ export async function getMapResults({
// Parallelize sitemap fetch with serper search
const [_, ...searchResults] = await Promise.all([
ignoreSitemap ? null : crawler.tryGetSitemap(urls => {
ignoreSitemap
? null
: crawler.tryGetSitemap((urls) => {
links.push(...urls);
}, true),
...(cachedResult ? [] : pagePromises),
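
Across these call sites, tryGetSitemap now streams batches of discovered URLs to a callback and resolves to the number of sitemap entries found. The signature implied by the calls in this commit, reconstructed from usage rather than taken from the actual declaration:

// Assumed shape only; the parameter names fromMap and onlySitemap come from
// the WebCrawler hunk later in this commit, and the numeric return value from
// the `sitemap > 0` checks above.
interface SitemapCapableCrawler {
  tryGetSitemap(
    urlsHandler: (urls: string[]) => unknown,
    fromMap?: boolean,
    onlySitemap?: boolean,
  ): Promise<number>;
}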


@ -18,7 +18,10 @@ export async function scrapeStatusController(req: any, res: any) {
const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
if (!allowedTeams.includes(job?.team_id) || job?.team_id !== req.auth.team_id) {
if (
!allowedTeams.includes(job?.team_id) ||
job?.team_id !== req.auth.team_id
) {
return res.status(403).json({
success: false,
error: "You are not allowed to access this resource.",


@ -200,7 +200,8 @@ export const extractV1Options = z
schema: z
.any()
.optional()
.refine((val) => {
.refine(
(val) => {
if (!val) return true; // Allow undefined schema
try {
const validate = ajv.compile(val);
@ -208,9 +209,11 @@ export const extractV1Options = z
} catch (e) {
return false;
}
}, {
},
{
message: "Invalid JSON schema.",
}),
},
),
limit: z.number().int().positive().finite().safe().optional(),
ignoreSitemap: z.boolean().default(false),
includeSubdomains: z.boolean().default(true),
@ -452,7 +455,7 @@ export type Document = {
description: string;
url: string;
};
}
};
export type ErrorResponse = {
success: false;
@ -477,7 +480,7 @@ export interface ScrapeResponseRequestTest {
export interface URLTrace {
url: string;
status: 'mapped' | 'scraped' | 'error';
status: "mapped" | "scraped" | "error";
timing: {
discoveredAt: string;
scrapedAt?: string;
@ -785,9 +788,18 @@ export function toLegacyDocument(
};
}
export const searchRequestSchema = z.object({
export const searchRequestSchema = z
.object({
query: z.string(),
limit: z.number().int().positive().finite().safe().max(10).optional().default(5),
limit: z
.number()
.int()
.positive()
.finite()
.safe()
.max(10)
.optional()
.default(5),
tbs: z.string().optional(),
filter: z.string().optional(),
lang: z.string().optional().default("en"),
@ -795,18 +807,27 @@ export const searchRequestSchema = z.object({
location: z.string().optional(),
origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000),
scrapeOptions: scrapeOptions.extend({
formats: z.array(z.enum([
scrapeOptions: scrapeOptions
.extend({
formats: z
.array(
z.enum([
"markdown",
"html",
"rawHtml",
"links",
"screenshot",
"screenshot@fullPage",
"extract"
])).default([])
}).default({}),
}).strict("Unrecognized key in body -- please review the v1 API documentation for request body changes");
"extract",
]),
)
.default([]),
})
.default({}),
})
.strict(
"Unrecognized key in body -- please review the v1 API documentation for request body changes",
);
export type SearchRequest = z.infer<typeof searchRequestSchema>;
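
A short usage sketch of the reformatted schema, assuming the fields not shown in this hunk are optional or defaulted:

// Hypothetical inputs; the defaults and the strict() message come from the
// schema above.
const ok = searchRequestSchema.parse({ query: "firecrawl" });
// Defaults are applied: ok.limit === 5, ok.lang === "en", ok.origin === "api",
// ok.timeout === 60000, and ok.scrapeOptions is filled from its own defaults.

const bad = searchRequestSchema.safeParse({ query: "firecrawl", foo: true });
// bad.success === false: strict() rejects the unrecognized "foo" key with the
// "Unrecognized key in body" message.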


@ -45,7 +45,10 @@ const serverAdapter = new ExpressAdapter();
serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
queues: [new BullAdapter(getScrapeQueue()), new BullAdapter(getExtractQueue())],
queues: [
new BullAdapter(getScrapeQueue()),
new BullAdapter(getExtractQueue()),
],
serverAdapter: serverAdapter,
});


@ -1,91 +1,89 @@
import { normalizeUrl, normalizeUrlOnlyHostname } from './canonical-url';
import { normalizeUrl, normalizeUrlOnlyHostname } from "./canonical-url";
describe('normalizeUrlOnlyHostname', () => {
it('should remove protocol and www from URL', () => {
const url = 'https://www.example.com';
const expected = 'example.com';
describe("normalizeUrlOnlyHostname", () => {
it("should remove protocol and www from URL", () => {
const url = "https://www.example.com";
const expected = "example.com";
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should remove only protocol if www is not present', () => {
const url = 'https://example.com';
const expected = 'example.com';
it("should remove only protocol if www is not present", () => {
const url = "https://example.com";
const expected = "example.com";
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should handle URLs without protocol', () => {
const url = 'www.example.com';
const expected = 'example.com';
it("should handle URLs without protocol", () => {
const url = "www.example.com";
const expected = "example.com";
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should handle URLs without protocol and www', () => {
const url = 'example.com';
const expected = 'example.com';
it("should handle URLs without protocol and www", () => {
const url = "example.com";
const expected = "example.com";
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should handle URLs with paths', () => {
const url = 'https://www.example.com/path/to/resource';
const expected = 'example.com';
it("should handle URLs with paths", () => {
const url = "https://www.example.com/path/to/resource";
const expected = "example.com";
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should handle invalid URLs gracefully', () => {
const url = 'not a valid url';
const expected = 'not a valid url';
it("should handle invalid URLs gracefully", () => {
const url = "not a valid url";
const expected = "not a valid url";
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
});
describe('normalizeUrl', () => {
it('should remove protocol and www from URL', () => {
const url = 'https://www.example.com';
const expected = 'example.com';
describe("normalizeUrl", () => {
it("should remove protocol and www from URL", () => {
const url = "https://www.example.com";
const expected = "example.com";
expect(normalizeUrl(url)).toBe(expected);
});
it('should remove only protocol if www is not present', () => {
const url = 'https://example.com';
const expected = 'example.com';
it("should remove only protocol if www is not present", () => {
const url = "https://example.com";
const expected = "example.com";
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs without protocol', () => {
const url = 'www.example.com';
const expected = 'example.com';
it("should handle URLs without protocol", () => {
const url = "www.example.com";
const expected = "example.com";
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs without protocol and www', () => {
const url = 'example.com';
const expected = 'example.com';
it("should handle URLs without protocol and www", () => {
const url = "example.com";
const expected = "example.com";
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs with paths', () => {
const url = 'https://www.example.com/path/to/resource';
const expected = 'example.com/path/to/resource';
it("should handle URLs with paths", () => {
const url = "https://www.example.com/path/to/resource";
const expected = "example.com/path/to/resource";
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs with trailing slash', () => {
const url = 'https://www.example.com/';
const expected = 'example.com';
it("should handle URLs with trailing slash", () => {
const url = "https://www.example.com/";
const expected = "example.com";
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs with trailing slash and path', () => {
const url = 'https://www.example.com/path/';
const expected = 'example.com/path';
it("should handle URLs with trailing slash and path", () => {
const url = "https://www.example.com/path/";
const expected = "example.com/path";
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle invalid URLs gracefully', () => {
const url = 'not a valid url';
const expected = 'not a valid url';
it("should handle invalid URLs gracefully", () => {
const url = "not a valid url";
const expected = "not a valid url";
expect(normalizeUrl(url)).toBe(expected);
});
});
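
The expectations above pin down the normalization behavior. A minimal sketch of an implementation that satisfies these tests (the real canonical-url module may handle more cases):

// Sketch only; written to match the test expectations above.
export function normalizeUrl(url: string): string {
  return url
    .replace(/^https?:\/\//, "") // drop the protocol
    .replace(/^www\./, "") // drop a leading www.
    .replace(/\/+$/, ""); // drop trailing slashes
}

export function normalizeUrlOnlyHostname(url: string): string {
  // Keep only the hostname portion of the normalized URL.
  return normalizeUrl(url).split("/")[0];
}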


@ -322,7 +322,7 @@ export async function lockURLs(
export async function lockURLsIndividually(
id: string,
sc: StoredCrawl,
jobs: { id: string; url: string; }[],
jobs: { id: string; url: string }[],
) {
const out: typeof jobs = [];


@ -6,6 +6,4 @@ export const extractConfig = {
MIN_REQUIRED_LINKS: 1,
};
export const CUSTOM_U_TEAMS = [
"874d40cc-a5c0-4e93-b661-9ddfbad5e51e"
]
export const CUSTOM_U_TEAMS = ["874d40cc-a5c0-4e93-b661-9ddfbad5e51e"];


@ -21,14 +21,19 @@ export async function getExtract(id: string): Promise<StoredExtract | null> {
return x ? JSON.parse(x) : null;
}
export async function updateExtract(id: string, extract: Partial<StoredExtract>) {
export async function updateExtract(
id: string,
extract: Partial<StoredExtract>,
) {
const current = await getExtract(id);
if (!current) return;
await redisConnection.set("extract:" + id, JSON.stringify({ ...current, ...extract }));
await redisConnection.set(
"extract:" + id,
JSON.stringify({ ...current, ...extract }),
);
await redisConnection.expire("extract:" + id, 24 * 60 * 60, "NX");
}
export async function getExtractExpiry(id: string): Promise<Date> {
const d = new Date();
const ttl = await redisConnection.pttl("extract:" + id);


@ -30,7 +30,7 @@ interface ExtractResult {
function getRootDomain(url: string): string {
try {
if(url.endsWith("/*")) {
if (url.endsWith("/*")) {
url = url.slice(0, -2);
}
const urlObj = new URL(url);
@ -40,14 +40,18 @@ function getRootDomain(url: string): string {
}
}
export async function performExtraction(extractId: string, options: ExtractServiceOptions): Promise<ExtractResult> {
export async function performExtraction(
extractId: string,
options: ExtractServiceOptions,
): Promise<ExtractResult> {
const { request, teamId, plan, subId } = options;
const urlTraces: URLTrace[] = [];
let docs: Document[] = [];
// Process URLs
const urlPromises = request.urls.map(url =>
processUrl({
const urlPromises = request.urls.map((url) =>
processUrl(
{
url,
prompt: request.prompt,
teamId,
@ -56,16 +60,19 @@ export async function performExtraction(extractId: string, options: ExtractServi
origin: request.origin,
limit: request.limit,
includeSubdomains: request.includeSubdomains,
}, urlTraces)
},
urlTraces,
),
);
const processedUrls = await Promise.all(urlPromises);
const links = processedUrls.flat().filter(url => url);
const links = processedUrls.flat().filter((url) => url);
if (links.length === 0) {
return {
success: false,
error: "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
error:
"No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
extractId,
urlTrace: urlTraces,
};
@ -73,14 +80,17 @@ export async function performExtraction(extractId: string, options: ExtractServi
// Scrape documents
const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
const scrapePromises = links.map(url =>
scrapeDocument({
const scrapePromises = links.map((url) =>
scrapeDocument(
{
url,
teamId,
plan,
origin: request.origin || "api",
timeout,
}, urlTraces)
},
urlTraces,
),
);
try {
@ -114,13 +124,16 @@ export async function performExtraction(extractId: string, options: ExtractServi
// Update token usage in traces
if (completions.numTokens) {
const totalLength = docs.reduce((sum, doc) => sum + (doc.markdown?.length || 0), 0);
const totalLength = docs.reduce(
(sum, doc) => sum + (doc.markdown?.length || 0),
0,
);
docs.forEach((doc) => {
if (doc.metadata?.sourceURL) {
const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL);
if (trace && trace.contentStats) {
trace.contentStats.tokensUsed = Math.floor(
((doc.markdown?.length || 0) / totalLength) * completions.numTokens
((doc.markdown?.length || 0) / totalLength) * completions.numTokens,
);
}
}
@ -185,7 +198,7 @@ export async function performExtraction(extractId: string, options: ExtractServi
let linksBilled = links.length * 5;
if(CUSTOM_U_TEAMS.includes(teamId)){
if (CUSTOM_U_TEAMS.includes(teamId)) {
linksBilled = 1;
}
// Bill team for usage
@ -213,12 +226,12 @@ export async function performExtraction(extractId: string, options: ExtractServi
updateExtract(extractId, {
status: "completed",
}).catch((error) => {
logger.error(`Failed to update extract ${extractId} status to completed: ${error}`);
logger.error(
`Failed to update extract ${extractId} status to completed: ${error}`,
);
});
});
return {
success: true,
data: completions.extract ?? {},


@ -1,6 +1,6 @@
import { Pinecone } from '@pinecone-database/pinecone';
import { Document } from '../../../controllers/v1/types';
import { logger } from '../../logger';
import { Pinecone } from "@pinecone-database/pinecone";
import { Document } from "../../../controllers/v1/types";
import { logger } from "../../logger";
import OpenAI from "openai";
const openai = new OpenAI({
@ -48,34 +48,43 @@ export async function indexPage({
document,
originUrl,
crawlId,
teamId
teamId,
}: {
document: Document;
originUrl: string;
crawlId?: string;
teamId?: string;
}
) {
}) {
try {
const index = pinecone.index(INDEX_NAME);
// Trim markdown if it's too long
let trimmedMarkdown = document.markdown;
if (trimmedMarkdown && Buffer.byteLength(trimmedMarkdown, 'utf-8') > MAX_METADATA_SIZE) {
trimmedMarkdown = trimmedMarkdown.slice(0, Math.floor(MAX_METADATA_SIZE / 2)); // Using half the size to be safe with UTF-8 encoding
if (
trimmedMarkdown &&
Buffer.byteLength(trimmedMarkdown, "utf-8") > MAX_METADATA_SIZE
) {
trimmedMarkdown = trimmedMarkdown.slice(
0,
Math.floor(MAX_METADATA_SIZE / 2),
); // Using half the size to be safe with UTF-8 encoding
}
// Create text to embed
const textToEmbed = [
document.metadata.title,
document.metadata.description,
trimmedMarkdown
].filter(Boolean).join('\n\n');
trimmedMarkdown,
]
.filter(Boolean)
.join("\n\n");
// Get embedding from OpenAI
const embedding = await getEmbedding(textToEmbed);
const normalizedUrl = normalizeUrl(document.metadata.sourceURL || document.metadata.url!);
const normalizedUrl = normalizeUrl(
document.metadata.sourceURL || document.metadata.url!,
);
// Prepare metadata
const metadata: PageMetadata = {
@ -86,29 +95,30 @@ export async function indexPage({
crawlId,
teamId,
markdown: trimmedMarkdown,
timestamp: Date.now()
timestamp: Date.now(),
};
// Upsert to Pinecone
await index.upsert([{
await index.upsert([
{
id: normalizedUrl,
values: embedding,
metadata: {
...metadata,
[document.metadata.sourceURL || document.metadata.url!]: true
}
}]);
[document.metadata.sourceURL || document.metadata.url!]: true,
},
},
]);
logger.debug('Successfully indexed page in Pinecone', {
logger.debug("Successfully indexed page in Pinecone", {
url: metadata.url,
crawlId
crawlId,
});
} catch (error) {
logger.error('Failed to index page in Pinecone', {
logger.error("Failed to index page in Pinecone", {
error,
url: document.metadata.sourceURL || document.metadata.url,
crawlId
crawlId,
});
}
}
@ -116,7 +126,7 @@ export async function indexPage({
export async function searchSimilarPages(
query: string,
originUrl?: string,
limit: number = 10
limit: number = 10,
) {
try {
const index = pinecone.index(INDEX_NAME);
@ -127,31 +137,30 @@ export async function searchSimilarPages(
const queryParams: any = {
vector: queryEmbedding,
topK: limit,
includeMetadata: true
includeMetadata: true,
};
const normalizedOriginUrl = originUrl ? normalizeUrl(originUrl) : undefined;
// Add filter if originUrl is provided
if (normalizedOriginUrl) {
queryParams.filter = {
originUrl: { $eq: normalizedOriginUrl }
originUrl: { $eq: normalizedOriginUrl },
};
}
const results = await index.query(queryParams);
return results.matches.map(match => ({
return results.matches.map((match) => ({
url: match.metadata?.url,
title: match.metadata?.title,
description: match.metadata?.description,
score: match.score,
markdown: match.metadata?.markdown
markdown: match.metadata?.markdown,
}));
} catch (error) {
logger.error('Failed to search similar pages in Pinecone', {
logger.error("Failed to search similar pages in Pinecone", {
error,
query,
originUrl
originUrl,
});
return [];
}


@ -9,8 +9,6 @@ const cohere = new CohereClient({
token: process.env.COHERE_API_KEY,
});
interface RankingResult {
mappedLinks: MapDocument[];
linksAndScores: {
@ -59,7 +57,6 @@ export async function rerankLinks(
searchQuery,
);
// First try with high threshold
let filteredLinks = filterAndProcessLinks(
mappedLinks,
@ -67,8 +64,6 @@ export async function rerankLinks(
extractConfig.INITIAL_SCORE_THRESHOLD,
);
// If we don't have enough high-quality links, try with lower threshold
if (filteredLinks.length < extractConfig.MIN_REQUIRED_LINKS) {
logger.info(
@ -102,7 +97,7 @@ export async function rerankLinks(
if (trace) {
trace.relevanceScore = score.score;
// If URL didn't make it through filtering, mark it as filtered out
if (!filteredLinks.some(link => link.url === score.link)) {
if (!filteredLinks.some((link) => link.url === score.link)) {
trace.warning = `Relevance score ${score.score} below threshold`;
trace.usedInCompletion = false;
}
@ -112,18 +107,18 @@ export async function rerankLinks(
const rankedLinks = filteredLinks.slice(0, extractConfig.MAX_RANKING_LIMIT);
// Mark URLs that will be used in completion
rankedLinks.forEach(link => {
const trace = urlTraces.find(t => t.url === link.url);
rankedLinks.forEach((link) => {
const trace = urlTraces.find((t) => t.url === link.url);
if (trace) {
trace.usedInCompletion = true;
}
});
// Mark URLs that were dropped due to ranking limit
filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach(link => {
const trace = urlTraces.find(t => t.url === link.url);
filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach((link) => {
const trace = urlTraces.find((t) => t.url === link.url);
if (trace) {
trace.warning = 'Excluded due to ranking limit';
trace.warning = "Excluded due to ranking limit";
trace.usedInCompletion = false;
}
});


@ -20,10 +20,13 @@ interface ProcessUrlOptions {
includeSubdomains?: boolean;
}
export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace[]): Promise<string[]> {
export async function processUrl(
options: ProcessUrlOptions,
urlTraces: URLTrace[],
): Promise<string[]> {
const trace: URLTrace = {
url: options.url,
status: 'mapped',
status: "mapped",
timing: {
discoveredAt: new Date().toISOString(),
},
@ -35,8 +38,8 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
trace.usedInCompletion = true;
return [options.url];
}
trace.status = 'error';
trace.error = 'URL is blocked';
trace.status = "error";
trace.error = "URL is blocked";
trace.usedInCompletion = false;
return [];
}
@ -46,9 +49,10 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
let rephrasedPrompt = options.prompt;
if (options.prompt) {
rephrasedPrompt = await generateBasicCompletion(
buildRefrasedPrompt(options.prompt, baseUrl)
) ?? options.prompt;
rephrasedPrompt =
(await generateBasicCompletion(
buildRefrasedPrompt(options.prompt, baseUrl),
)) ?? options.prompt;
}
try {
@ -70,11 +74,11 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
let uniqueUrls = removeDuplicateUrls(allUrls);
// Track all discovered URLs
uniqueUrls.forEach(discoveredUrl => {
if (!urlTraces.some(t => t.url === discoveredUrl)) {
uniqueUrls.forEach((discoveredUrl) => {
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
urlTraces.push({
url: discoveredUrl,
status: 'mapped',
status: "mapped",
timing: {
discoveredAt: new Date().toISOString(),
},
@ -102,12 +106,12 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
uniqueUrls = removeDuplicateUrls(allUrls);
// Track all discovered URLs
uniqueUrls.forEach(discoveredUrl => {
if (!urlTraces.some(t => t.url === discoveredUrl)) {
uniqueUrls.forEach((discoveredUrl) => {
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
urlTraces.push({
url: discoveredUrl,
status: 'mapped',
warning: 'Broader search. Not limiting map results to prompt.',
status: "mapped",
warning: "Broader search. Not limiting map results to prompt.",
timing: {
discoveredAt: new Date().toISOString(),
},
@ -118,11 +122,11 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
}
// Track all discovered URLs
uniqueUrls.forEach(discoveredUrl => {
if (!urlTraces.some(t => t.url === discoveredUrl)) {
uniqueUrls.forEach((discoveredUrl) => {
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
urlTraces.push({
url: discoveredUrl,
status: 'mapped',
status: "mapped",
timing: {
discoveredAt: new Date().toISOString(),
},
@ -155,9 +159,9 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
mappedLinks = await rerankLinks(mappedLinks, searchQuery, urlTraces);
}
return mappedLinks.map(x => x.url);
return mappedLinks.map((x) => x.url);
} catch (error) {
trace.status = 'error';
trace.status = "error";
trace.error = error.message;
trace.usedInCompletion = false;
return [];


@ -42,11 +42,18 @@ export const logger = winston.createLogger({
},
}),
transports: [
...(process.env.FIRECRAWL_LOG_TO_FILE ? [
...(process.env.FIRECRAWL_LOG_TO_FILE
? [
new winston.transports.File({
filename: "firecrawl-" + (process.argv[1].includes("worker") ? "worker" : "app") + "-" + crypto.randomUUID() + ".log",
})
] : []),
filename:
"firecrawl-" +
(process.argv[1].includes("worker") ? "worker" : "app") +
"-" +
crypto.randomUUID() +
".log",
}),
]
: []),
new winston.transports.Console({
format: winston.format.combine(
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),


@ -175,11 +175,10 @@ export async function runWebScraper({
}
// If the team is the background index team, return the response
if(team_id === process.env.BACKGROUND_INDEX_TEAM_ID!) {
if (team_id === process.env.BACKGROUND_INDEX_TEAM_ID!) {
return response;
}
billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => {
logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,


@ -192,7 +192,8 @@ v1Router.get(
wrap((req: any, res): any => crawlStatusController(req, res, true)),
);
v1Router.get("/scrape/:jobId",
v1Router.get(
"/scrape/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(scrapeStatusController),
);
@ -242,6 +243,3 @@ v1Router.get(
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(creditUsageController),
);


@ -219,18 +219,29 @@ export class WebCrawler {
const _urlsHandler = async (urls: string[]) => {
let uniqueURLs: string[] = [];
for (const url of urls) {
if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(url))) {
if (
await redisConnection.sadd(
"sitemap:" + this.jobId + ":links",
normalizeUrl(url),
)
) {
uniqueURLs.push(url);
}
}
await redisConnection.expire("sitemap:" + this.jobId + ":links", 3600, "NX");
await redisConnection.expire(
"sitemap:" + this.jobId + ":links",
3600,
"NX",
);
if (uniqueURLs.length > 0) {
urlsHandler(uniqueURLs);
}
};
let count = await this.tryFetchSitemapLinks(this.initialUrl, (urls: string[]) => {
let count = await this.tryFetchSitemapLinks(
this.initialUrl,
(urls: string[]) => {
if (fromMap && onlySitemap) {
return urlsHandler(urls);
} else {
@ -243,10 +254,16 @@ export class WebCrawler {
leftOfLimit -= filteredLinks.length;
return _urlsHandler(filteredLinks);
}
});
},
);
if (count > 0) {
if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(this.initialUrl))) {
if (
await redisConnection.sadd(
"sitemap:" + this.jobId + ":links",
normalizeUrl(this.initialUrl),
)
) {
urlsHandler([this.initialUrl]);
}
count++;
@ -470,8 +487,13 @@ export class WebCrawler {
return socialMediaOrEmail.some((ext) => url.includes(ext));
}
private async tryFetchSitemapLinks(url: string, urlsHandler: (urls: string[]) => unknown): Promise<number> {
const sitemapUrl = url.endsWith(".xml") ? url : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
private async tryFetchSitemapLinks(
url: string,
urlsHandler: (urls: string[]) => unknown,
): Promise<number> {
const sitemapUrl = url.endsWith(".xml")
? url
: `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
let sitemapCount: number = 0;
@ -482,37 +504,43 @@ export class WebCrawler {
this.logger,
);
} catch (error) {
this.logger.debug(
`Failed to fetch sitemap from ${sitemapUrl}`,
{ method: "tryFetchSitemapLinks", sitemapUrl, error },
);
this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
method: "tryFetchSitemapLinks",
sitemapUrl,
error,
});
}
// If this is a subdomain, also try to get sitemap from the main domain
try {
const urlObj = new URL(url);
const hostname = urlObj.hostname;
const domainParts = hostname.split('.');
const domainParts = hostname.split(".");
// Check if this is a subdomain (has more than 2 parts and not www)
if (domainParts.length > 2 && domainParts[0] !== 'www') {
if (domainParts.length > 2 && domainParts[0] !== "www") {
// Get the main domain by taking the last two parts
const mainDomain = domainParts.slice(-2).join('.');
const mainDomain = domainParts.slice(-2).join(".");
const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;
try {
// Get all links from the main domain's sitemap
sitemapCount += await getLinksFromSitemap(
{ sitemapUrl: mainDomainSitemapUrl, urlsHandler(urls) {
return urlsHandler(urls.filter(link => {
{
sitemapUrl: mainDomainSitemapUrl,
urlsHandler(urls) {
return urlsHandler(
urls.filter((link) => {
try {
const linkUrl = new URL(link);
return linkUrl.hostname.endsWith(hostname);
} catch {
}
}))
}, mode: "fire-engine" },
} catch {}
}),
);
},
mode: "fire-engine",
},
this.logger,
);
} catch (error) {


@ -15,7 +15,7 @@ export async function getLinksFromSitemap(
mode = "axios",
}: {
sitemapUrl: string;
urlsHandler(urls: string[]): unknown,
urlsHandler(urls: string[]): unknown;
mode?: "axios" | "fire-engine";
},
logger: Logger,
@ -31,7 +31,10 @@ export async function getLinksFromSitemap(
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
);
if (!response.success) {
logger.debug("Failed to scrape sitemap via TLSClient, falling back to axios...", { error: response.error })
logger.debug(
"Failed to scrape sitemap via TLSClient, falling back to axios...",
{ error: response.error },
);
const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = ar.data;
} else {
@ -63,14 +66,11 @@ export async function getLinksFromSitemap(
.map((sitemap) => sitemap.loc[0].trim());
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
getLinksFromSitemap(
{ sitemapUrl, urlsHandler, mode },
logger,
),
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger),
);
const results = await Promise.all(sitemapPromises);
count = results.reduce((a,x) => a + x)
count = results.reduce((a, x) => a + x);
} else if (root && root.url) {
// Check if any URLs point to additional sitemaps
const xmlSitemaps: string[] = root.url
@ -78,7 +78,7 @@ export async function getLinksFromSitemap(
(url) =>
url.loc &&
url.loc.length > 0 &&
url.loc[0].trim().toLowerCase().endsWith('.xml')
url.loc[0].trim().toLowerCase().endsWith(".xml"),
)
.map((url) => url.loc[0].trim());
@ -90,7 +90,10 @@ export async function getLinksFromSitemap(
logger,
),
);
count += (await Promise.all(sitemapPromises)).reduce((a,x) => a + x, 0);
count += (await Promise.all(sitemapPromises)).reduce(
(a, x) => a + x,
0,
);
}
const validUrls = root.url
@ -98,7 +101,7 @@ export async function getLinksFromSitemap(
(url) =>
url.loc &&
url.loc.length > 0 &&
!url.loc[0].trim().toLowerCase().endsWith('.xml') &&
!url.loc[0].trim().toLowerCase().endsWith(".xml") &&
!WebCrawler.prototype.isFile(url.loc[0].trim()),
)
.map((url) => url.loc[0].trim());


@ -3,7 +3,10 @@ import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { TimeoutError } from "../../error";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
import { InsecureConnectionError, makeSecureDispatcher } from "../utils/safeFetch";
import {
InsecureConnectionError,
makeSecureDispatcher,
} from "../utils/safeFetch";
export async function scrapeURLWithFetch(
meta: Meta,
@ -20,7 +23,9 @@ export async function scrapeURLWithFetch(
headers: meta.options.headers,
}),
(async () => {
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
await new Promise((resolve) =>
setTimeout(() => resolve(null), timeout),
);
throw new TimeoutError(
"Fetch was unable to scrape the page before timing out",
{ cause: { timeout } },
@ -28,7 +33,10 @@ export async function scrapeURLWithFetch(
})(),
]);
} catch (error) {
if (error instanceof TypeError && error.cause instanceof InsecureConnectionError) {
if (
error instanceof TypeError &&
error.cause instanceof InsecureConnectionError
) {
throw error.cause;
} else {
throw error;
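
The fetch engine races the request against a timer and converts a win by the timer into a TimeoutError. The same pattern as a generic, self-contained sketch:

// Generic sketch of the race-against-a-timer pattern used above; the timer is
// not cancelled when the work finishes first, matching the original code.
async function withTimeout<T>(work: Promise<T>, timeoutMs: number): Promise<T> {
  return await Promise.race([
    work,
    (async () => {
      await new Promise((resolve) => setTimeout(resolve, timeoutMs));
      throw new Error(`Timed out after ${timeoutMs} ms`);
    })(),
  ]);
}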


@ -3,7 +3,12 @@ import * as Sentry from "@sentry/node";
import { z } from "zod";
import { robustFetch } from "../../lib/fetch";
import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error";
import {
ActionError,
EngineError,
SiteError,
UnsupportedFileError,
} from "../../error";
const successSchema = z.object({
jobId: z.string(),
@ -37,10 +42,13 @@ const successSchema = z.object({
.optional(),
// chrome-cdp only -- file download handler
file: z.object({
file: z
.object({
name: z.string(),
content: z.string(),
}).optional().or(z.null()),
})
.optional()
.or(z.null()),
});
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
@ -121,7 +129,9 @@ export async function fireEngineCheckStatus(
typeof status.error === "string" &&
status.error.includes("File size exceeds")
) {
throw new UnsupportedFileError("File size exceeds " + status.error.split("File size exceeds ")[1]);
throw new UnsupportedFileError(
"File size exceeds " + status.error.split("File size exceeds ")[1],
);
} else if (
typeof status.error === "string" &&
// TODO: improve this later


@ -13,7 +13,13 @@ import {
FireEngineCheckStatusSuccess,
StillProcessingError,
} from "./checkStatus";
import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError } from "../../error";
import {
ActionError,
EngineError,
SiteError,
TimeoutError,
UnsupportedFileError,
} from "../../error";
import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";


@ -38,7 +38,7 @@ const useCache =
process.env.CACHE_REDIS_URL !== undefined;
export const engines: Engine[] = [
...(useCache ? [ "cache" as const ] : []),
...(useCache ? ["cache" as const] : []),
...(useFireEngine
? [
"fire-engine;chrome-cdp" as const,
@ -298,7 +298,6 @@ export function buildFallbackList(meta: Meta): {
engine: Engine;
unsupportedFeatures: Set<FeatureFlag>;
}[] {
if (meta.internalOptions.useCache !== true) {
const cacheIndex = engines.indexOf("cache");
if (cacheIndex !== -1) {


@ -7,11 +7,18 @@ import { v4 as uuid } from "uuid";
import * as undici from "undici";
import { makeSecureDispatcher } from "./safeFetch";
export async function fetchFileToBuffer(url: string, init?: undici.RequestInit): Promise<{
export async function fetchFileToBuffer(
url: string,
init?: undici.RequestInit,
): Promise<{
response: undici.Response;
buffer: Buffer;
}> {
const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) });
const response = await undici.fetch(url, {
...init,
redirect: "follow",
dispatcher: await makeSecureDispatcher(url),
});
return {
response,
buffer: Buffer.from(await response.arrayBuffer()),
@ -30,7 +37,11 @@ export async function downloadFile(
const tempFileWrite = createWriteStream(tempFilePath);
// TODO: maybe we could use tlsclient for this? for proxying
const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) });
const response = await undici.fetch(url, {
...init,
redirect: "follow",
dispatcher: await makeSecureDispatcher(url),
});
// This should never happen in the current state of JS/Undici (2024), but let's check anyways.
if (response.body === null) {


@ -5,36 +5,44 @@ import { Address6 } from "ip-address";
export class InsecureConnectionError extends Error {
constructor() {
super("Connection violated security rules.")
super("Connection violated security rules.");
}
}
function isIPv4Private(address: string): boolean {
const parts = address.split(".").map(x => parseInt(x, 10));
return parts[0] === 0 // Current (local, "this") network
|| parts[0] === 10 // Used for local communications within a private network
|| (parts[0] === 100 && parts[1] >= 64 && parts[1] < 128) // Shared address space for communications between a service provider and its subscribers when using a carrier-grade NAT
|| parts[0] === 127 // Used for loopback addresses to the local host
|| (parts[0] === 169 && parts[1] === 254) // Used for link-local addresses between two hosts on a single link when no IP address is otherwise specified, such as would have normally been retrieved from a DHCP server
|| (parts[0] === 127 && parts[1] >= 16 && parts[2] < 32) // Used for local communications within a private network
|| (parts[0] === 192 && parts[1] === 0 && parts[2] === 0) // IETF Porotocol Assignments, DS-Lite (/29)
|| (parts[0] === 192 && parts[1] === 0 && parts[2] === 2) // Assigned as TEST-NET-1, documentation and examples
|| (parts[0] === 192 && parts[1] === 88 && parts[2] === 99) // Reserved. Formerly used for IPv6 to IPv4 relay (included IPv6 address block 2002::/16).
|| (parts[0] === 192 && parts[1] === 168) // Used for local communications within a private network
|| (parts[0] === 192 && parts[1] >= 18 && parts[1] < 20) // Used for benchmark testing of inter-network communications between two separate subnets
|| (parts[0] === 198 && parts[1] === 51 && parts[2] === 100) // Assigned as TEST-NET-2, documentation and examples
|| (parts[0] === 203 && parts[1] === 0 && parts[2] === 113) // Assigned as TEST-NET-3, documentation and examples
|| (parts[0] >= 224 && parts[0] < 240) // In use for multicast (former Class D network)
|| (parts[0] === 233 && parts[1] === 252 && parts[2] === 0) // Assigned as MCAST-TEST-NET, documentation and examples (Note that this is part of the above multicast space.)
|| parts[0] >= 240 // Reserved for future use (former class E network)
|| (parts[0] === 255 && parts[1] === 255 && parts[2] === 255 && parts[3] === 255) // Reserved for the "limited broadcast" destination address
const parts = address.split(".").map((x) => parseInt(x, 10));
return (
parts[0] === 0 || // Current (local, "this") network
parts[0] === 10 || // Used for local communications within a private network
(parts[0] === 100 && parts[1] >= 64 && parts[1] < 128) || // Shared address space for communications between a service provider and its subscribers when using a carrier-grade NAT
parts[0] === 127 || // Used for loopback addresses to the local host
(parts[0] === 169 && parts[1] === 254) || // Used for link-local addresses between two hosts on a single link when no IP address is otherwise specified, such as would have normally been retrieved from a DHCP server
(parts[0] === 127 && parts[1] >= 16 && parts[2] < 32) || // Used for local communications within a private network
(parts[0] === 192 && parts[1] === 0 && parts[2] === 0) || // IETF Porotocol Assignments, DS-Lite (/29)
(parts[0] === 192 && parts[1] === 0 && parts[2] === 2) || // Assigned as TEST-NET-1, documentation and examples
(parts[0] === 192 && parts[1] === 88 && parts[2] === 99) || // Reserved. Formerly used for IPv6 to IPv4 relay (included IPv6 address block 2002::/16).
(parts[0] === 192 && parts[1] === 168) || // Used for local communications within a private network
(parts[0] === 192 && parts[1] >= 18 && parts[1] < 20) || // Used for benchmark testing of inter-network communications between two separate subnets
(parts[0] === 198 && parts[1] === 51 && parts[2] === 100) || // Assigned as TEST-NET-2, documentation and examples
(parts[0] === 203 && parts[1] === 0 && parts[2] === 113) || // Assigned as TEST-NET-3, documentation and examples
(parts[0] >= 224 && parts[0] < 240) || // In use for multicast (former Class D network)
(parts[0] === 233 && parts[1] === 252 && parts[2] === 0) || // Assigned as MCAST-TEST-NET, documentation and examples (Note that this is part of the above multicast space.)
parts[0] >= 240 || // Reserved for future use (former class E network)
(parts[0] === 255 &&
parts[1] === 255 &&
parts[2] === 255 &&
parts[3] === 255)
); // Reserved for the "limited broadcast" destination address
}
function isIPv6Private(ipv6) {
return new Address6(ipv6).getScope() !== "Global";
}
export function makeSecureDispatcher(url: string, options?: undici.Agent.Options) {
export function makeSecureDispatcher(
url: string,
options?: undici.Agent.Options,
) {
const agent = new undici.Agent({
connect: {
rejectUnauthorized: false, // bypass SSL failures -- this is fine
@ -46,12 +54,18 @@ export function makeSecureDispatcher(url: string, options?: undici.Agent.Options
agent.on("connect", (_, targets) => {
const client: undici.Client = targets.slice(-1)[0] as undici.Client;
const socketSymbol = Object.getOwnPropertySymbols(client).find(x => x.description === "socket")!;
const socketSymbol = Object.getOwnPropertySymbols(client).find(
(x) => x.description === "socket",
)!;
const socket: Socket | TLSSocket = (client as any)[socketSymbol];
if (socket.remoteAddress) {
if (socket.remoteFamily === "IPv4" ? isIPv4Private(socket.remoteAddress!) : isIPv6Private(socket.remoteAddress!)) {
socket.destroy(new InsecureConnectionError())
if (
socket.remoteFamily === "IPv4"
? isIPv4Private(socket.remoteAddress!)
: isIPv6Private(socket.remoteAddress!)
) {
socket.destroy(new InsecureConnectionError());
}
}
});
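
The dispatcher above destroys any socket whose remote address falls in private IPv4 or IPv6 space, acting as an SSRF guard for user-supplied URLs. A sketch of how it is wired into a fetch call, mirroring the fetchFileToBuffer and downloadFile call sites in this commit:

// Sketch only; the import path matches the one used elsewhere in this commit.
import * as undici from "undici";
import { InsecureConnectionError, makeSecureDispatcher } from "./safeFetch";

async function guardedFetch(url: string): Promise<undici.Response> {
  try {
    return await undici.fetch(url, {
      redirect: "follow",
      dispatcher: await makeSecureDispatcher(url),
    });
  } catch (error) {
    // The connect hook destroys the socket with InsecureConnectionError before
    // any request data reaches a private address.
    if (
      error instanceof TypeError &&
      error.cause instanceof InsecureConnectionError
    ) {
      throw error.cause;
    }
    throw error;
  }
}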


@ -420,7 +420,9 @@ export async function scrapeURL(
} else if (error instanceof ActionError) {
meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
} else if (error instanceof UnsupportedFileError) {
meta.logger.warn("scrapeURL: Tried to scrape unsupported file", { error });
meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
error,
});
} else {
Sentry.captureException(error);
meta.logger.error("scrapeURL: Unexpected error happened", { error });


@ -44,10 +44,15 @@ export function extractMetadata(
title = soup("title").first().text().trim() || undefined;
description = soup('meta[name="description"]').attr("content") || undefined;
const faviconLink = soup('link[rel="icon"]').attr("href") || soup('link[rel*="icon"]').first().attr("href") || undefined;
const faviconLink =
soup('link[rel="icon"]').attr("href") ||
soup('link[rel*="icon"]').first().attr("href") ||
undefined;
if (faviconLink) {
const baseUrl = new URL(meta.url).origin;
favicon = faviconLink.startsWith('http') ? faviconLink : `${baseUrl}${faviconLink}`;
favicon = faviconLink.startsWith("http")
? faviconLink
: `${baseUrl}${faviconLink}`;
}
// Assuming the language is part of the URL as per the regex pattern


@ -24,7 +24,6 @@ export function saveToCache(meta: Meta, document: Document): Document {
return document;
}
const key = cacheKey(meta.url, meta.options, meta.internalOptions);
if (key !== null) {


@ -8,13 +8,21 @@ describe("removeDefaultProperty", () => {
});
it("should remove the default property from a nested object", () => {
const input = { default: "test", nested: { default: "nestedTest", test: "nestedTest" } };
const input = {
default: "test",
nested: { default: "nestedTest", test: "nestedTest" },
};
const expectedOutput = { nested: { test: "nestedTest" } };
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
});
it("should remove the default property from an array of objects", () => {
const input = { array: [{ default: "test1", test: "test1" }, { default: "test2", test: "test2" }] };
const input = {
array: [
{ default: "test1", test: "test1" },
{ default: "test2", test: "test2" },
],
};
const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
});


@ -123,7 +123,7 @@ export async function generateOpenAICompletions(
let schema = options.schema;
if (schema) {
schema = removeDefaultProperty(schema);
}
}
if (schema && schema.type === "array") {
schema = {
@ -140,10 +140,10 @@ export async function generateOpenAICompletions(
properties: Object.fromEntries(
Object.entries(schema).map(([key, value]) => {
return [key, removeDefaultProperty(value)];
})
}),
),
required: Object.keys(schema),
additionalProperties: false
additionalProperties: false,
};
}
@ -240,14 +240,14 @@ export async function performLLMExtract(
}
export function removeDefaultProperty(schema: any): any {
if (typeof schema !== 'object' || schema === null) return schema;
if (typeof schema !== "object" || schema === null) return schema;
const { default: _, ...rest } = schema;
for (const key in rest) {
if (Array.isArray(rest[key])) {
rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));
} else if (typeof rest[key] === 'object' && rest[key] !== null) {
} else if (typeof rest[key] === "object" && rest[key] !== null) {
rest[key] = removeDefaultProperty(rest[key]);
}
}


@ -9,9 +9,11 @@ configDotenv();
function cleanOfNull<T>(x: T): T {
if (Array.isArray(x)) {
return x.map(x => cleanOfNull(x)) as T;
return x.map((x) => cleanOfNull(x)) as T;
} else if (typeof x === "object" && x !== null) {
return Object.fromEntries(Object.entries(x).map(([k,v]) => [k,cleanOfNull(v)])) as T
return Object.fromEntries(
Object.entries(x).map(([k, v]) => [k, cleanOfNull(v)]),
) as T;
} else if (typeof x === "string") {
return x.replaceAll("\u0000", "") as T;
} else {


@ -16,9 +16,7 @@ export const loggingQueueName = "{loggingQueue}";
export function getScrapeQueue() {
if (!scrapeQueue) {
scrapeQueue = new Queue(
scrapeQueueName,
{
scrapeQueue = new Queue(scrapeQueueName, {
connection: redisConnection,
defaultJobOptions: {
removeOnComplete: {
@ -28,8 +26,7 @@ export function getScrapeQueue() {
age: 90000, // 25 hours
},
},
}
);
});
logger.info("Web scraper queue created");
}
return scrapeQueue;
@ -37,9 +34,7 @@ export function getScrapeQueue() {
export function getExtractQueue() {
if (!extractQueue) {
extractQueue = new Queue(
extractQueueName,
{
extractQueue = new Queue(extractQueueName, {
connection: redisConnection,
defaultJobOptions: {
removeOnComplete: {
@ -49,14 +44,12 @@ export function getExtractQueue() {
age: 90000, // 25 hours
},
},
}
);
});
logger.info("Extraction queue created");
}
return extractQueue;
}
// === REMOVED IN FAVOR OF POLLING -- NOT RELIABLE
// import { QueueEvents } from 'bullmq';
// export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection.duplicate() });


@ -89,13 +89,19 @@ const runningJobs: Set<string> = new Set();
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
if (await finishCrawl(job.data.crawl_id)) {
(async () => {
const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
const originUrl = sc.originUrl
? normalizeUrlOnlyHostname(sc.originUrl)
: undefined;
// Get all visited URLs from Redis
const visitedUrls = await redisConnection.smembers(
"crawl:" + job.data.crawl_id + ":visited",
);
// Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
if (
visitedUrls.length > 0 &&
job.data.crawlerOptions !== null &&
originUrl
) {
// Fire and forget the upload to Supabase
try {
// Standardize URLs to canonical form (https, no www)
@ -317,7 +323,10 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
return err;
};
const processExtractJobInternal = async (token: string, job: Job & { id: string }) => {
const processExtractJobInternal = async (
token: string,
job: Job & { id: string },
) => {
const logger = _logger.child({
module: "extract-worker",
method: "processJobInternal",
@ -360,11 +369,14 @@ const processExtractJobInternal = async (token: string, job: Job & { id: string
await updateExtract(job.data.extractId, {
status: "failed",
error: error.error ?? error ?? "Unknown error, please contact help@firecrawl.dev. Extract id: " + job.data.extractId,
error:
error.error ??
error ??
"Unknown error, please contact help@firecrawl.dev. Extract id: " +
job.data.extractId,
});
// throw error;
} finally {
clearInterval(extendLockInterval);
}
};
@ -635,7 +647,9 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
sc,
jobs.map((x) => ({ id: x.opts.jobId, url: x.data.url })),
);
const lockedJobs = jobs.filter(x => lockedIds.find(y => y.id === x.opts.jobId));
const lockedJobs = jobs.filter((x) =>
lockedIds.find((y) => y.id === x.opts.jobId),
);
logger.debug("Adding scrape jobs to Redis...");
await addCrawlJobs(
job.data.crawl_id,
@ -790,7 +804,8 @@ async function processJob(job: Job & { id: string }, token: string) {
) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
if (
crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null &&
crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) ===
null &&
!job.data.isCrawlSourceScrape
) {
throw new Error(
@ -1073,7 +1088,7 @@ async function processJob(job: Job & { id: string }, token: string) {
console.log("All workers exited. Waiting for all jobs to finish...");
while (runningJobs.size > 0) {
await new Promise(resolve => setTimeout(resolve, 500));
await new Promise((resolve) => setTimeout(resolve, 500));
}
process.exit(0);