mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 10:29:00 +08:00
Nick: formatting fixes
This commit is contained in:
parent
d1f3b96388
commit
f4d10c5031
@ -204,7 +204,7 @@ export async function supaAuthenticateUser(
|
||||
}
|
||||
|
||||
chunk = await getACUC(normalizedApi);
|
||||
|
||||
|
||||
if (chunk === null) {
|
||||
return {
|
||||
success: false,
|
||||
|
@ -178,48 +178,52 @@ export async function crawlController(req: Request, res: Response) {
|
||||
await saveCrawl(id, sc);
|
||||
|
||||
const sitemap = sc.crawlerOptions.ignoreSitemap
|
||||
? 0
|
||||
: await crawler.tryGetSitemap(async urls => {
|
||||
if (urls.length === 0) return;
|
||||
|
||||
let jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
|
||||
const jobs = urls.map(url => {
|
||||
const uuid = uuidv4();
|
||||
return {
|
||||
name: uuid,
|
||||
data: {
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
team_id,
|
||||
plan,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: jobPriority,
|
||||
},
|
||||
};
|
||||
});
|
||||
? 0
|
||||
: await crawler.tryGetSitemap(async (urls) => {
|
||||
if (urls.length === 0) return;
|
||||
|
||||
await lockURLs(
|
||||
id,
|
||||
sc,
|
||||
jobs.map((x) => x.data.url),
|
||||
);
|
||||
await addCrawlJobs(
|
||||
id,
|
||||
jobs.map((x) => x.opts.jobId),
|
||||
);
|
||||
for (const job of jobs) {
|
||||
// add with sentry instrumentation
|
||||
await addScrapeJob(job.data as any, {}, job.opts.jobId);
|
||||
}
|
||||
let jobPriority = await getJobPriority({
|
||||
plan,
|
||||
team_id,
|
||||
basePriority: 21,
|
||||
});
|
||||
const jobs = urls.map((url) => {
|
||||
const uuid = uuidv4();
|
||||
return {
|
||||
name: uuid,
|
||||
data: {
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
team_id,
|
||||
plan,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: jobPriority,
|
||||
},
|
||||
};
|
||||
});
|
||||
|
||||
await lockURLs(
|
||||
id,
|
||||
sc,
|
||||
jobs.map((x) => x.data.url),
|
||||
);
|
||||
await addCrawlJobs(
|
||||
id,
|
||||
jobs.map((x) => x.opts.jobId),
|
||||
);
|
||||
for (const job of jobs) {
|
||||
// add with sentry instrumentation
|
||||
await addScrapeJob(job.data as any, {}, job.opts.jobId);
|
||||
}
|
||||
});
|
||||
|
||||
if (sitemap === 0) {
|
||||
await lockURL(id, sc, url);
|
||||
|
@ -114,29 +114,29 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
||||
|
||||
const sitemap = sc.crawlerOptions?.ignoreSitemap
|
||||
? 0
|
||||
: await crawler.tryGetSitemap(async urls => {
|
||||
for (const url of urls) {
|
||||
await lockURL(id, sc, url);
|
||||
const jobId = uuidv4();
|
||||
await addScrapeJob(
|
||||
{
|
||||
url,
|
||||
mode: "single_urls",
|
||||
team_id,
|
||||
plan: plan!,
|
||||
crawlerOptions,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
origin: "website-preview",
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
},
|
||||
{},
|
||||
jobId,
|
||||
);
|
||||
await addCrawlJob(id, jobId);
|
||||
}
|
||||
});
|
||||
: await crawler.tryGetSitemap(async (urls) => {
|
||||
for (const url of urls) {
|
||||
await lockURL(id, sc, url);
|
||||
const jobId = uuidv4();
|
||||
await addScrapeJob(
|
||||
{
|
||||
url,
|
||||
mode: "single_urls",
|
||||
team_id,
|
||||
plan: plan!,
|
||||
crawlerOptions,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
origin: "website-preview",
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
},
|
||||
{},
|
||||
jobId,
|
||||
);
|
||||
await addCrawlJob(id, jobId);
|
||||
}
|
||||
});
|
||||
|
||||
if (sitemap === 0) {
|
||||
await lockURL(id, sc, url);
|
||||
|
@ -115,7 +115,8 @@ export async function crawlStatusController(
|
||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
|
||||
sc.cancelled
|
||||
? "cancelled"
|
||||
: (validJobStatuses.every((x) => x[1] === "completed") && validJobStatuses.length > 0)
|
||||
: validJobStatuses.every((x) => x[1] === "completed") &&
|
||||
validJobStatuses.length > 0
|
||||
? "completed"
|
||||
: "scraping";
|
||||
|
||||
|
@ -7,11 +7,7 @@ import {
|
||||
RequestWithAuth,
|
||||
toLegacyCrawlerOptions,
|
||||
} from "./types";
|
||||
import {
|
||||
crawlToCrawler,
|
||||
saveCrawl,
|
||||
StoredCrawl,
|
||||
} from "../../lib/crawl-redis";
|
||||
import { crawlToCrawler, saveCrawl, StoredCrawl } from "../../lib/crawl-redis";
|
||||
import { logCrawl } from "../../services/logging/crawl_log";
|
||||
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
|
||||
import { logger as _logger } from "../../lib/logger";
|
||||
@ -103,20 +99,25 @@ export async function crawlController(
|
||||
|
||||
await saveCrawl(id, sc);
|
||||
|
||||
await _addScrapeJobToBullMQ({
|
||||
url: req.body.url,
|
||||
mode: "kickoff" as const,
|
||||
team_id: req.auth.team_id,
|
||||
plan: req.auth.plan,
|
||||
crawlerOptions,
|
||||
scrapeOptions: sc.scrapeOptions,
|
||||
internalOptions: sc.internalOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
webhook: req.body.webhook,
|
||||
v1: true,
|
||||
}, {}, crypto.randomUUID(), 10);
|
||||
|
||||
await _addScrapeJobToBullMQ(
|
||||
{
|
||||
url: req.body.url,
|
||||
mode: "kickoff" as const,
|
||||
team_id: req.auth.team_id,
|
||||
plan: req.auth.plan,
|
||||
crawlerOptions,
|
||||
scrapeOptions: sc.scrapeOptions,
|
||||
internalOptions: sc.internalOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
webhook: req.body.webhook,
|
||||
v1: true,
|
||||
},
|
||||
{},
|
||||
crypto.randomUUID(),
|
||||
10,
|
||||
);
|
||||
|
||||
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
||||
|
||||
return res.status(200).json({
|
||||
|
@ -11,25 +11,29 @@ import { saveExtract } from "../../lib/extract/extract-redis";
|
||||
import { getTeamIdSyncB } from "../../lib/extract/team-id-sync";
|
||||
import { performExtraction } from "../../lib/extract/extraction-service";
|
||||
|
||||
export async function oldExtract(req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>, res: Response<ExtractResponse>, extractId: string){
|
||||
export async function oldExtract(
|
||||
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
|
||||
res: Response<ExtractResponse>,
|
||||
extractId: string,
|
||||
) {
|
||||
// Means that are in the non-queue system
|
||||
// TODO: Remove this once all teams have transitioned to the new system
|
||||
try {
|
||||
const result = await performExtraction(extractId, {
|
||||
request: req.body,
|
||||
teamId: req.auth.team_id,
|
||||
plan: req.auth.plan ?? "free",
|
||||
subId: req.acuc?.sub_id ?? undefined,
|
||||
try {
|
||||
const result = await performExtraction(extractId, {
|
||||
request: req.body,
|
||||
teamId: req.auth.team_id,
|
||||
plan: req.auth.plan ?? "free",
|
||||
subId: req.acuc?.sub_id ?? undefined,
|
||||
});
|
||||
|
||||
return res.status(200).json(result);
|
||||
} catch (error) {
|
||||
return res.status(500).json({
|
||||
success: false,
|
||||
error: "Internal server error",
|
||||
});
|
||||
}
|
||||
return res.status(200).json(result);
|
||||
} catch (error) {
|
||||
return res.status(500).json({
|
||||
success: false,
|
||||
error: "Internal server error",
|
||||
});
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Extracts data from the provided URLs based on the request parameters.
|
||||
* Currently in beta.
|
||||
@ -53,7 +57,10 @@ export async function extractController(
|
||||
extractId,
|
||||
};
|
||||
|
||||
if(await getTeamIdSyncB(req.auth.team_id) && req.body.origin !== "api-sdk") {
|
||||
if (
|
||||
(await getTeamIdSyncB(req.auth.team_id)) &&
|
||||
req.body.origin !== "api-sdk"
|
||||
) {
|
||||
return await oldExtract(req, res, extractId);
|
||||
}
|
||||
|
||||
|
@ -86,11 +86,15 @@ export async function getMapResults({
|
||||
|
||||
// If sitemapOnly is true, only get links from sitemap
|
||||
if (crawlerOptions.sitemapOnly) {
|
||||
const sitemap = await crawler.tryGetSitemap(urls => {
|
||||
urls.forEach((x) => {
|
||||
links.push(x);
|
||||
});
|
||||
}, true, true);
|
||||
const sitemap = await crawler.tryGetSitemap(
|
||||
(urls) => {
|
||||
urls.forEach((x) => {
|
||||
links.push(x);
|
||||
});
|
||||
},
|
||||
true,
|
||||
true,
|
||||
);
|
||||
if (sitemap > 0) {
|
||||
links = links
|
||||
.slice(1)
|
||||
@ -145,9 +149,11 @@ export async function getMapResults({
|
||||
|
||||
// Parallelize sitemap fetch with serper search
|
||||
const [_, ...searchResults] = await Promise.all([
|
||||
ignoreSitemap ? null : crawler.tryGetSitemap(urls => {
|
||||
links.push(...urls);
|
||||
}, true),
|
||||
ignoreSitemap
|
||||
? null
|
||||
: crawler.tryGetSitemap((urls) => {
|
||||
links.push(...urls);
|
||||
}, true),
|
||||
...(cachedResult ? [] : pagePromises),
|
||||
]);
|
||||
|
||||
|
@ -8,7 +8,7 @@ export async function scrapeStatusController(req: any, res: any) {
|
||||
"511544f2-2fce-4183-9c59-6c29b02c69b5",
|
||||
"1ec9a0b3-6e7d-49a9-ad6c-9c598ba824c8",
|
||||
];
|
||||
|
||||
|
||||
if (!allowedTeams.includes(req.auth.team_id)) {
|
||||
return res.status(403).json({
|
||||
success: false,
|
||||
@ -18,7 +18,10 @@ export async function scrapeStatusController(req: any, res: any) {
|
||||
|
||||
const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
|
||||
|
||||
if (!allowedTeams.includes(job?.team_id) || job?.team_id !== req.auth.team_id) {
|
||||
if (
|
||||
!allowedTeams.includes(job?.team_id) ||
|
||||
job?.team_id !== req.auth.team_id
|
||||
) {
|
||||
return res.status(403).json({
|
||||
success: false,
|
||||
error: "You are not allowed to access this resource.",
|
||||
|
@ -200,17 +200,20 @@ export const extractV1Options = z
|
||||
schema: z
|
||||
.any()
|
||||
.optional()
|
||||
.refine((val) => {
|
||||
if (!val) return true; // Allow undefined schema
|
||||
try {
|
||||
const validate = ajv.compile(val);
|
||||
return typeof validate === "function";
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
}, {
|
||||
message: "Invalid JSON schema.",
|
||||
}),
|
||||
.refine(
|
||||
(val) => {
|
||||
if (!val) return true; // Allow undefined schema
|
||||
try {
|
||||
const validate = ajv.compile(val);
|
||||
return typeof validate === "function";
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
},
|
||||
{
|
||||
message: "Invalid JSON schema.",
|
||||
},
|
||||
),
|
||||
limit: z.number().int().positive().finite().safe().optional(),
|
||||
ignoreSitemap: z.boolean().default(false),
|
||||
includeSubdomains: z.boolean().default(true),
|
||||
@ -452,7 +455,7 @@ export type Document = {
|
||||
description: string;
|
||||
url: string;
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
export type ErrorResponse = {
|
||||
success: false;
|
||||
@ -477,7 +480,7 @@ export interface ScrapeResponseRequestTest {
|
||||
|
||||
export interface URLTrace {
|
||||
url: string;
|
||||
status: 'mapped' | 'scraped' | 'error';
|
||||
status: "mapped" | "scraped" | "error";
|
||||
timing: {
|
||||
discoveredAt: string;
|
||||
scrapedAt?: string;
|
||||
@ -785,28 +788,46 @@ export function toLegacyDocument(
|
||||
};
|
||||
}
|
||||
|
||||
export const searchRequestSchema = z.object({
|
||||
query: z.string(),
|
||||
limit: z.number().int().positive().finite().safe().max(10).optional().default(5),
|
||||
tbs: z.string().optional(),
|
||||
filter: z.string().optional(),
|
||||
lang: z.string().optional().default("en"),
|
||||
country: z.string().optional().default("us"),
|
||||
location: z.string().optional(),
|
||||
origin: z.string().optional().default("api"),
|
||||
timeout: z.number().int().positive().finite().safe().default(60000),
|
||||
scrapeOptions: scrapeOptions.extend({
|
||||
formats: z.array(z.enum([
|
||||
"markdown",
|
||||
"html",
|
||||
"rawHtml",
|
||||
"links",
|
||||
"screenshot",
|
||||
"screenshot@fullPage",
|
||||
"extract"
|
||||
])).default([])
|
||||
}).default({}),
|
||||
}).strict("Unrecognized key in body -- please review the v1 API documentation for request body changes");
|
||||
export const searchRequestSchema = z
|
||||
.object({
|
||||
query: z.string(),
|
||||
limit: z
|
||||
.number()
|
||||
.int()
|
||||
.positive()
|
||||
.finite()
|
||||
.safe()
|
||||
.max(10)
|
||||
.optional()
|
||||
.default(5),
|
||||
tbs: z.string().optional(),
|
||||
filter: z.string().optional(),
|
||||
lang: z.string().optional().default("en"),
|
||||
country: z.string().optional().default("us"),
|
||||
location: z.string().optional(),
|
||||
origin: z.string().optional().default("api"),
|
||||
timeout: z.number().int().positive().finite().safe().default(60000),
|
||||
scrapeOptions: scrapeOptions
|
||||
.extend({
|
||||
formats: z
|
||||
.array(
|
||||
z.enum([
|
||||
"markdown",
|
||||
"html",
|
||||
"rawHtml",
|
||||
"links",
|
||||
"screenshot",
|
||||
"screenshot@fullPage",
|
||||
"extract",
|
||||
]),
|
||||
)
|
||||
.default([]),
|
||||
})
|
||||
.default({}),
|
||||
})
|
||||
.strict(
|
||||
"Unrecognized key in body -- please review the v1 API documentation for request body changes",
|
||||
);
|
||||
|
||||
export type SearchRequest = z.infer<typeof searchRequestSchema>;
|
||||
|
||||
|
@ -45,7 +45,10 @@ const serverAdapter = new ExpressAdapter();
|
||||
serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
|
||||
|
||||
const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
|
||||
queues: [new BullAdapter(getScrapeQueue()), new BullAdapter(getExtractQueue())],
|
||||
queues: [
|
||||
new BullAdapter(getScrapeQueue()),
|
||||
new BullAdapter(getExtractQueue()),
|
||||
],
|
||||
serverAdapter: serverAdapter,
|
||||
});
|
||||
|
||||
@ -254,4 +257,4 @@ logger.info(`Worker ${process.pid} started`);
|
||||
// sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
||||
// sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
||||
// sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
||||
//
|
||||
//
|
||||
|
@ -1,91 +1,89 @@
|
||||
import { normalizeUrl, normalizeUrlOnlyHostname } from './canonical-url';
|
||||
import { normalizeUrl, normalizeUrlOnlyHostname } from "./canonical-url";
|
||||
|
||||
describe('normalizeUrlOnlyHostname', () => {
|
||||
it('should remove protocol and www from URL', () => {
|
||||
const url = 'https://www.example.com';
|
||||
const expected = 'example.com';
|
||||
describe("normalizeUrlOnlyHostname", () => {
|
||||
it("should remove protocol and www from URL", () => {
|
||||
const url = "https://www.example.com";
|
||||
const expected = "example.com";
|
||||
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
||||
});
|
||||
|
||||
it('should remove only protocol if www is not present', () => {
|
||||
const url = 'https://example.com';
|
||||
const expected = 'example.com';
|
||||
it("should remove only protocol if www is not present", () => {
|
||||
const url = "https://example.com";
|
||||
const expected = "example.com";
|
||||
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
||||
});
|
||||
|
||||
it('should handle URLs without protocol', () => {
|
||||
const url = 'www.example.com';
|
||||
const expected = 'example.com';
|
||||
it("should handle URLs without protocol", () => {
|
||||
const url = "www.example.com";
|
||||
const expected = "example.com";
|
||||
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
||||
});
|
||||
|
||||
it('should handle URLs without protocol and www', () => {
|
||||
const url = 'example.com';
|
||||
const expected = 'example.com';
|
||||
it("should handle URLs without protocol and www", () => {
|
||||
const url = "example.com";
|
||||
const expected = "example.com";
|
||||
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
||||
});
|
||||
|
||||
it('should handle URLs with paths', () => {
|
||||
const url = 'https://www.example.com/path/to/resource';
|
||||
const expected = 'example.com';
|
||||
it("should handle URLs with paths", () => {
|
||||
const url = "https://www.example.com/path/to/resource";
|
||||
const expected = "example.com";
|
||||
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
||||
});
|
||||
|
||||
it('should handle invalid URLs gracefully', () => {
|
||||
const url = 'not a valid url';
|
||||
const expected = 'not a valid url';
|
||||
it("should handle invalid URLs gracefully", () => {
|
||||
const url = "not a valid url";
|
||||
const expected = "not a valid url";
|
||||
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
|
||||
describe('normalizeUrl', () => {
|
||||
it('should remove protocol and www from URL', () => {
|
||||
const url = 'https://www.example.com';
|
||||
const expected = 'example.com';
|
||||
describe("normalizeUrl", () => {
|
||||
it("should remove protocol and www from URL", () => {
|
||||
const url = "https://www.example.com";
|
||||
const expected = "example.com";
|
||||
expect(normalizeUrl(url)).toBe(expected);
|
||||
});
|
||||
|
||||
it('should remove only protocol if www is not present', () => {
|
||||
const url = 'https://example.com';
|
||||
const expected = 'example.com';
|
||||
it("should remove only protocol if www is not present", () => {
|
||||
const url = "https://example.com";
|
||||
const expected = "example.com";
|
||||
expect(normalizeUrl(url)).toBe(expected);
|
||||
});
|
||||
|
||||
it('should handle URLs without protocol', () => {
|
||||
const url = 'www.example.com';
|
||||
const expected = 'example.com';
|
||||
it("should handle URLs without protocol", () => {
|
||||
const url = "www.example.com";
|
||||
const expected = "example.com";
|
||||
expect(normalizeUrl(url)).toBe(expected);
|
||||
});
|
||||
|
||||
it('should handle URLs without protocol and www', () => {
|
||||
const url = 'example.com';
|
||||
const expected = 'example.com';
|
||||
it("should handle URLs without protocol and www", () => {
|
||||
const url = "example.com";
|
||||
const expected = "example.com";
|
||||
expect(normalizeUrl(url)).toBe(expected);
|
||||
});
|
||||
|
||||
it('should handle URLs with paths', () => {
|
||||
const url = 'https://www.example.com/path/to/resource';
|
||||
const expected = 'example.com/path/to/resource';
|
||||
it("should handle URLs with paths", () => {
|
||||
const url = "https://www.example.com/path/to/resource";
|
||||
const expected = "example.com/path/to/resource";
|
||||
expect(normalizeUrl(url)).toBe(expected);
|
||||
});
|
||||
|
||||
it('should handle URLs with trailing slash', () => {
|
||||
const url = 'https://www.example.com/';
|
||||
const expected = 'example.com';
|
||||
it("should handle URLs with trailing slash", () => {
|
||||
const url = "https://www.example.com/";
|
||||
const expected = "example.com";
|
||||
expect(normalizeUrl(url)).toBe(expected);
|
||||
});
|
||||
|
||||
it('should handle URLs with trailing slash and path', () => {
|
||||
const url = 'https://www.example.com/path/';
|
||||
const expected = 'example.com/path';
|
||||
it("should handle URLs with trailing slash and path", () => {
|
||||
const url = "https://www.example.com/path/";
|
||||
const expected = "example.com/path";
|
||||
expect(normalizeUrl(url)).toBe(expected);
|
||||
});
|
||||
|
||||
it('should handle invalid URLs gracefully', () => {
|
||||
const url = 'not a valid url';
|
||||
const expected = 'not a valid url';
|
||||
it("should handle invalid URLs gracefully", () => {
|
||||
const url = "not a valid url";
|
||||
const expected = "not a valid url";
|
||||
expect(normalizeUrl(url)).toBe(expected);
|
||||
});
|
||||
});
|
||||
|
@ -322,13 +322,13 @@ export async function lockURLs(
|
||||
export async function lockURLsIndividually(
|
||||
id: string,
|
||||
sc: StoredCrawl,
|
||||
jobs: { id: string; url: string; }[],
|
||||
jobs: { id: string; url: string }[],
|
||||
) {
|
||||
const out: typeof jobs = [];
|
||||
|
||||
for (const job of jobs) {
|
||||
if (await lockURL(id, sc, job.url)) {
|
||||
out.push(job);
|
||||
out.push(job);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -6,6 +6,4 @@ export const extractConfig = {
|
||||
MIN_REQUIRED_LINKS: 1,
|
||||
};
|
||||
|
||||
export const CUSTOM_U_TEAMS = [
|
||||
"874d40cc-a5c0-4e93-b661-9ddfbad5e51e"
|
||||
]
|
||||
export const CUSTOM_U_TEAMS = ["874d40cc-a5c0-4e93-b661-9ddfbad5e51e"];
|
||||
|
@ -21,14 +21,19 @@ export async function getExtract(id: string): Promise<StoredExtract | null> {
|
||||
return x ? JSON.parse(x) : null;
|
||||
}
|
||||
|
||||
export async function updateExtract(id: string, extract: Partial<StoredExtract>) {
|
||||
export async function updateExtract(
|
||||
id: string,
|
||||
extract: Partial<StoredExtract>,
|
||||
) {
|
||||
const current = await getExtract(id);
|
||||
if (!current) return;
|
||||
await redisConnection.set("extract:" + id, JSON.stringify({ ...current, ...extract }));
|
||||
await redisConnection.set(
|
||||
"extract:" + id,
|
||||
JSON.stringify({ ...current, ...extract }),
|
||||
);
|
||||
await redisConnection.expire("extract:" + id, 24 * 60 * 60, "NX");
|
||||
}
|
||||
|
||||
|
||||
export async function getExtractExpiry(id: string): Promise<Date> {
|
||||
const d = new Date();
|
||||
const ttl = await redisConnection.pttl("extract:" + id);
|
||||
|
@ -30,7 +30,7 @@ interface ExtractResult {
|
||||
|
||||
function getRootDomain(url: string): string {
|
||||
try {
|
||||
if(url.endsWith("/*")) {
|
||||
if (url.endsWith("/*")) {
|
||||
url = url.slice(0, -2);
|
||||
}
|
||||
const urlObj = new URL(url);
|
||||
@ -40,32 +40,39 @@ function getRootDomain(url: string): string {
|
||||
}
|
||||
}
|
||||
|
||||
export async function performExtraction(extractId: string, options: ExtractServiceOptions): Promise<ExtractResult> {
|
||||
export async function performExtraction(
|
||||
extractId: string,
|
||||
options: ExtractServiceOptions,
|
||||
): Promise<ExtractResult> {
|
||||
const { request, teamId, plan, subId } = options;
|
||||
const urlTraces: URLTrace[] = [];
|
||||
let docs: Document[] = [];
|
||||
|
||||
// Process URLs
|
||||
const urlPromises = request.urls.map(url =>
|
||||
processUrl({
|
||||
url,
|
||||
prompt: request.prompt,
|
||||
teamId,
|
||||
plan,
|
||||
allowExternalLinks: request.allowExternalLinks,
|
||||
origin: request.origin,
|
||||
limit: request.limit,
|
||||
includeSubdomains: request.includeSubdomains,
|
||||
}, urlTraces)
|
||||
const urlPromises = request.urls.map((url) =>
|
||||
processUrl(
|
||||
{
|
||||
url,
|
||||
prompt: request.prompt,
|
||||
teamId,
|
||||
plan,
|
||||
allowExternalLinks: request.allowExternalLinks,
|
||||
origin: request.origin,
|
||||
limit: request.limit,
|
||||
includeSubdomains: request.includeSubdomains,
|
||||
},
|
||||
urlTraces,
|
||||
),
|
||||
);
|
||||
|
||||
const processedUrls = await Promise.all(urlPromises);
|
||||
const links = processedUrls.flat().filter(url => url);
|
||||
const links = processedUrls.flat().filter((url) => url);
|
||||
|
||||
if (links.length === 0) {
|
||||
return {
|
||||
success: false,
|
||||
error: "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
|
||||
error:
|
||||
"No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
|
||||
extractId,
|
||||
urlTrace: urlTraces,
|
||||
};
|
||||
@ -73,14 +80,17 @@ export async function performExtraction(extractId: string, options: ExtractServi
|
||||
|
||||
// Scrape documents
|
||||
const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
|
||||
const scrapePromises = links.map(url =>
|
||||
scrapeDocument({
|
||||
url,
|
||||
teamId,
|
||||
plan,
|
||||
origin: request.origin || "api",
|
||||
timeout,
|
||||
}, urlTraces)
|
||||
const scrapePromises = links.map((url) =>
|
||||
scrapeDocument(
|
||||
{
|
||||
url,
|
||||
teamId,
|
||||
plan,
|
||||
origin: request.origin || "api",
|
||||
timeout,
|
||||
},
|
||||
urlTraces,
|
||||
),
|
||||
);
|
||||
|
||||
try {
|
||||
@ -114,13 +124,16 @@ export async function performExtraction(extractId: string, options: ExtractServi
|
||||
|
||||
// Update token usage in traces
|
||||
if (completions.numTokens) {
|
||||
const totalLength = docs.reduce((sum, doc) => sum + (doc.markdown?.length || 0), 0);
|
||||
const totalLength = docs.reduce(
|
||||
(sum, doc) => sum + (doc.markdown?.length || 0),
|
||||
0,
|
||||
);
|
||||
docs.forEach((doc) => {
|
||||
if (doc.metadata?.sourceURL) {
|
||||
const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL);
|
||||
if (trace && trace.contentStats) {
|
||||
trace.contentStats.tokensUsed = Math.floor(
|
||||
((doc.markdown?.length || 0) / totalLength) * completions.numTokens
|
||||
((doc.markdown?.length || 0) / totalLength) * completions.numTokens,
|
||||
);
|
||||
}
|
||||
}
|
||||
@ -131,7 +144,7 @@ export async function performExtraction(extractId: string, options: ExtractServi
|
||||
// const rootDomains = new Set(request.urls.map(getRootDomain));
|
||||
// rootDomains.forEach(async url => {
|
||||
// const crawlId = crypto.randomUUID();
|
||||
|
||||
|
||||
// // Create and save crawl configuration first
|
||||
// const sc: StoredCrawl = {
|
||||
// originUrl: url,
|
||||
@ -155,7 +168,7 @@ export async function performExtraction(extractId: string, options: ExtractServi
|
||||
// parsePDF: true,
|
||||
// skipTlsVerification: false,
|
||||
// },
|
||||
// internalOptions: {
|
||||
// internalOptions: {
|
||||
// disableSmartWaitCache: true,
|
||||
// isBackgroundIndex: true
|
||||
// },
|
||||
@ -185,7 +198,7 @@ export async function performExtraction(extractId: string, options: ExtractServi
|
||||
|
||||
let linksBilled = links.length * 5;
|
||||
|
||||
if(CUSTOM_U_TEAMS.includes(teamId)){
|
||||
if (CUSTOM_U_TEAMS.includes(teamId)) {
|
||||
linksBilled = 1;
|
||||
}
|
||||
// Bill team for usage
|
||||
@ -213,12 +226,12 @@ export async function performExtraction(extractId: string, options: ExtractServi
|
||||
updateExtract(extractId, {
|
||||
status: "completed",
|
||||
}).catch((error) => {
|
||||
logger.error(`Failed to update extract ${extractId} status to completed: ${error}`);
|
||||
logger.error(
|
||||
`Failed to update extract ${extractId} status to completed: ${error}`,
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: completions.extract ?? {},
|
||||
@ -226,4 +239,4 @@ export async function performExtraction(extractId: string, options: ExtractServi
|
||||
warning: completions.warning,
|
||||
urlTrace: request.urlTrace ? urlTraces : undefined,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
import { Pinecone } from '@pinecone-database/pinecone';
|
||||
import { Document } from '../../../controllers/v1/types';
|
||||
import { logger } from '../../logger';
|
||||
import { Pinecone } from "@pinecone-database/pinecone";
|
||||
import { Document } from "../../../controllers/v1/types";
|
||||
import { logger } from "../../logger";
|
||||
import OpenAI from "openai";
|
||||
|
||||
const openai = new OpenAI({
|
||||
@ -48,34 +48,43 @@ export async function indexPage({
|
||||
document,
|
||||
originUrl,
|
||||
crawlId,
|
||||
teamId
|
||||
teamId,
|
||||
}: {
|
||||
document: Document;
|
||||
originUrl: string;
|
||||
crawlId?: string;
|
||||
teamId?: string;
|
||||
}
|
||||
) {
|
||||
}) {
|
||||
try {
|
||||
const index = pinecone.index(INDEX_NAME);
|
||||
|
||||
// Trim markdown if it's too long
|
||||
let trimmedMarkdown = document.markdown;
|
||||
if (trimmedMarkdown && Buffer.byteLength(trimmedMarkdown, 'utf-8') > MAX_METADATA_SIZE) {
|
||||
trimmedMarkdown = trimmedMarkdown.slice(0, Math.floor(MAX_METADATA_SIZE / 2)); // Using half the size to be safe with UTF-8 encoding
|
||||
if (
|
||||
trimmedMarkdown &&
|
||||
Buffer.byteLength(trimmedMarkdown, "utf-8") > MAX_METADATA_SIZE
|
||||
) {
|
||||
trimmedMarkdown = trimmedMarkdown.slice(
|
||||
0,
|
||||
Math.floor(MAX_METADATA_SIZE / 2),
|
||||
); // Using half the size to be safe with UTF-8 encoding
|
||||
}
|
||||
|
||||
// Create text to embed
|
||||
const textToEmbed = [
|
||||
document.metadata.title,
|
||||
document.metadata.description,
|
||||
trimmedMarkdown
|
||||
].filter(Boolean).join('\n\n');
|
||||
trimmedMarkdown,
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join("\n\n");
|
||||
|
||||
// Get embedding from OpenAI
|
||||
const embedding = await getEmbedding(textToEmbed);
|
||||
|
||||
const normalizedUrl = normalizeUrl(document.metadata.sourceURL || document.metadata.url!);
|
||||
const normalizedUrl = normalizeUrl(
|
||||
document.metadata.sourceURL || document.metadata.url!,
|
||||
);
|
||||
|
||||
// Prepare metadata
|
||||
const metadata: PageMetadata = {
|
||||
@ -86,29 +95,30 @@ export async function indexPage({
|
||||
crawlId,
|
||||
teamId,
|
||||
markdown: trimmedMarkdown,
|
||||
timestamp: Date.now()
|
||||
timestamp: Date.now(),
|
||||
};
|
||||
|
||||
// Upsert to Pinecone
|
||||
await index.upsert([{
|
||||
id: normalizedUrl,
|
||||
values: embedding,
|
||||
metadata: {
|
||||
...metadata,
|
||||
[document.metadata.sourceURL || document.metadata.url!]: true
|
||||
}
|
||||
}]);
|
||||
await index.upsert([
|
||||
{
|
||||
id: normalizedUrl,
|
||||
values: embedding,
|
||||
metadata: {
|
||||
...metadata,
|
||||
[document.metadata.sourceURL || document.metadata.url!]: true,
|
||||
},
|
||||
},
|
||||
]);
|
||||
|
||||
logger.debug('Successfully indexed page in Pinecone', {
|
||||
logger.debug("Successfully indexed page in Pinecone", {
|
||||
url: metadata.url,
|
||||
crawlId
|
||||
crawlId,
|
||||
});
|
||||
|
||||
} catch (error) {
|
||||
logger.error('Failed to index page in Pinecone', {
|
||||
logger.error("Failed to index page in Pinecone", {
|
||||
error,
|
||||
url: document.metadata.sourceURL || document.metadata.url,
|
||||
crawlId
|
||||
crawlId,
|
||||
});
|
||||
}
|
||||
}
|
||||
@ -116,7 +126,7 @@ export async function indexPage({
|
||||
export async function searchSimilarPages(
|
||||
query: string,
|
||||
originUrl?: string,
|
||||
limit: number = 10
|
||||
limit: number = 10,
|
||||
) {
|
||||
try {
|
||||
const index = pinecone.index(INDEX_NAME);
|
||||
@ -127,31 +137,30 @@ export async function searchSimilarPages(
|
||||
const queryParams: any = {
|
||||
vector: queryEmbedding,
|
||||
topK: limit,
|
||||
includeMetadata: true
|
||||
includeMetadata: true,
|
||||
};
|
||||
|
||||
const normalizedOriginUrl = originUrl ? normalizeUrl(originUrl) : undefined;
|
||||
// Add filter if originUrl is provided
|
||||
if (normalizedOriginUrl) {
|
||||
queryParams.filter = {
|
||||
originUrl: { $eq: normalizedOriginUrl }
|
||||
originUrl: { $eq: normalizedOriginUrl },
|
||||
};
|
||||
}
|
||||
|
||||
const results = await index.query(queryParams);
|
||||
return results.matches.map(match => ({
|
||||
return results.matches.map((match) => ({
|
||||
url: match.metadata?.url,
|
||||
title: match.metadata?.title,
|
||||
title: match.metadata?.title,
|
||||
description: match.metadata?.description,
|
||||
score: match.score,
|
||||
markdown: match.metadata?.markdown
|
||||
markdown: match.metadata?.markdown,
|
||||
}));
|
||||
|
||||
} catch (error) {
|
||||
logger.error('Failed to search similar pages in Pinecone', {
|
||||
logger.error("Failed to search similar pages in Pinecone", {
|
||||
error,
|
||||
query,
|
||||
originUrl
|
||||
originUrl,
|
||||
});
|
||||
return [];
|
||||
}
|
||||
|
@ -9,8 +9,6 @@ const cohere = new CohereClient({
|
||||
token: process.env.COHERE_API_KEY,
|
||||
});
|
||||
|
||||
|
||||
|
||||
interface RankingResult {
|
||||
mappedLinks: MapDocument[];
|
||||
linksAndScores: {
|
||||
@ -59,7 +57,6 @@ export async function rerankLinks(
|
||||
searchQuery,
|
||||
);
|
||||
|
||||
|
||||
// First try with high threshold
|
||||
let filteredLinks = filterAndProcessLinks(
|
||||
mappedLinks,
|
||||
@ -67,8 +64,6 @@ export async function rerankLinks(
|
||||
extractConfig.INITIAL_SCORE_THRESHOLD,
|
||||
);
|
||||
|
||||
|
||||
|
||||
// If we don't have enough high-quality links, try with lower threshold
|
||||
if (filteredLinks.length < extractConfig.MIN_REQUIRED_LINKS) {
|
||||
logger.info(
|
||||
@ -102,7 +97,7 @@ export async function rerankLinks(
|
||||
if (trace) {
|
||||
trace.relevanceScore = score.score;
|
||||
// If URL didn't make it through filtering, mark it as filtered out
|
||||
if (!filteredLinks.some(link => link.url === score.link)) {
|
||||
if (!filteredLinks.some((link) => link.url === score.link)) {
|
||||
trace.warning = `Relevance score ${score.score} below threshold`;
|
||||
trace.usedInCompletion = false;
|
||||
}
|
||||
@ -110,20 +105,20 @@ export async function rerankLinks(
|
||||
});
|
||||
|
||||
const rankedLinks = filteredLinks.slice(0, extractConfig.MAX_RANKING_LIMIT);
|
||||
|
||||
|
||||
// Mark URLs that will be used in completion
|
||||
rankedLinks.forEach(link => {
|
||||
const trace = urlTraces.find(t => t.url === link.url);
|
||||
rankedLinks.forEach((link) => {
|
||||
const trace = urlTraces.find((t) => t.url === link.url);
|
||||
if (trace) {
|
||||
trace.usedInCompletion = true;
|
||||
}
|
||||
});
|
||||
|
||||
// Mark URLs that were dropped due to ranking limit
|
||||
filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach(link => {
|
||||
const trace = urlTraces.find(t => t.url === link.url);
|
||||
filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach((link) => {
|
||||
const trace = urlTraces.find((t) => t.url === link.url);
|
||||
if (trace) {
|
||||
trace.warning = 'Excluded due to ranking limit';
|
||||
trace.warning = "Excluded due to ranking limit";
|
||||
trace.usedInCompletion = false;
|
||||
}
|
||||
});
|
||||
|
@ -20,10 +20,13 @@ interface ProcessUrlOptions {
|
||||
includeSubdomains?: boolean;
|
||||
}
|
||||
|
||||
export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace[]): Promise<string[]> {
|
||||
export async function processUrl(
|
||||
options: ProcessUrlOptions,
|
||||
urlTraces: URLTrace[],
|
||||
): Promise<string[]> {
|
||||
const trace: URLTrace = {
|
||||
url: options.url,
|
||||
status: 'mapped',
|
||||
status: "mapped",
|
||||
timing: {
|
||||
discoveredAt: new Date().toISOString(),
|
||||
},
|
||||
@ -35,8 +38,8 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
|
||||
trace.usedInCompletion = true;
|
||||
return [options.url];
|
||||
}
|
||||
trace.status = 'error';
|
||||
trace.error = 'URL is blocked';
|
||||
trace.status = "error";
|
||||
trace.error = "URL is blocked";
|
||||
trace.usedInCompletion = false;
|
||||
return [];
|
||||
}
|
||||
@ -46,9 +49,10 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
|
||||
|
||||
let rephrasedPrompt = options.prompt;
|
||||
if (options.prompt) {
|
||||
rephrasedPrompt = await generateBasicCompletion(
|
||||
buildRefrasedPrompt(options.prompt, baseUrl)
|
||||
) ?? options.prompt;
|
||||
rephrasedPrompt =
|
||||
(await generateBasicCompletion(
|
||||
buildRefrasedPrompt(options.prompt, baseUrl),
|
||||
)) ?? options.prompt;
|
||||
}
|
||||
|
||||
try {
|
||||
@ -70,11 +74,11 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
|
||||
let uniqueUrls = removeDuplicateUrls(allUrls);
|
||||
|
||||
// Track all discovered URLs
|
||||
uniqueUrls.forEach(discoveredUrl => {
|
||||
if (!urlTraces.some(t => t.url === discoveredUrl)) {
|
||||
uniqueUrls.forEach((discoveredUrl) => {
|
||||
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
|
||||
urlTraces.push({
|
||||
url: discoveredUrl,
|
||||
status: 'mapped',
|
||||
status: "mapped",
|
||||
timing: {
|
||||
discoveredAt: new Date().toISOString(),
|
||||
},
|
||||
@ -84,7 +88,7 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
|
||||
});
|
||||
|
||||
// retry if only one url is returned
|
||||
if (uniqueUrls.length <= 1) {
|
||||
if (uniqueUrls.length <= 1) {
|
||||
const retryMapResults = await getMapResults({
|
||||
url: baseUrl,
|
||||
teamId: options.teamId,
|
||||
@ -96,18 +100,18 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
|
||||
includeMetadata: true,
|
||||
includeSubdomains: options.includeSubdomains,
|
||||
});
|
||||
|
||||
|
||||
mappedLinks = retryMapResults.mapResults as MapDocument[];
|
||||
allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
|
||||
uniqueUrls = removeDuplicateUrls(allUrls);
|
||||
|
||||
// Track all discovered URLs
|
||||
uniqueUrls.forEach(discoveredUrl => {
|
||||
if (!urlTraces.some(t => t.url === discoveredUrl)) {
|
||||
uniqueUrls.forEach((discoveredUrl) => {
|
||||
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
|
||||
urlTraces.push({
|
||||
url: discoveredUrl,
|
||||
status: 'mapped',
|
||||
warning: 'Broader search. Not limiting map results to prompt.',
|
||||
status: "mapped",
|
||||
warning: "Broader search. Not limiting map results to prompt.",
|
||||
timing: {
|
||||
discoveredAt: new Date().toISOString(),
|
||||
},
|
||||
@ -118,11 +122,11 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
|
||||
}
|
||||
|
||||
// Track all discovered URLs
|
||||
uniqueUrls.forEach(discoveredUrl => {
|
||||
if (!urlTraces.some(t => t.url === discoveredUrl)) {
|
||||
uniqueUrls.forEach((discoveredUrl) => {
|
||||
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
|
||||
urlTraces.push({
|
||||
url: discoveredUrl,
|
||||
status: 'mapped',
|
||||
status: "mapped",
|
||||
timing: {
|
||||
discoveredAt: new Date().toISOString(),
|
||||
},
|
||||
@ -155,11 +159,11 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
|
||||
mappedLinks = await rerankLinks(mappedLinks, searchQuery, urlTraces);
|
||||
}
|
||||
|
||||
return mappedLinks.map(x => x.url);
|
||||
return mappedLinks.map((x) => x.url);
|
||||
} catch (error) {
|
||||
trace.status = 'error';
|
||||
trace.status = "error";
|
||||
trace.error = error.message;
|
||||
trace.usedInCompletion = false;
|
||||
return [];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -42,11 +42,18 @@ export const logger = winston.createLogger({
|
||||
},
|
||||
}),
|
||||
transports: [
|
||||
...(process.env.FIRECRAWL_LOG_TO_FILE ? [
|
||||
new winston.transports.File({
|
||||
filename: "firecrawl-" + (process.argv[1].includes("worker") ? "worker" : "app") + "-" + crypto.randomUUID() + ".log",
|
||||
})
|
||||
] : []),
|
||||
...(process.env.FIRECRAWL_LOG_TO_FILE
|
||||
? [
|
||||
new winston.transports.File({
|
||||
filename:
|
||||
"firecrawl-" +
|
||||
(process.argv[1].includes("worker") ? "worker" : "app") +
|
||||
"-" +
|
||||
crypto.randomUUID() +
|
||||
".log",
|
||||
}),
|
||||
]
|
||||
: []),
|
||||
new winston.transports.Console({
|
||||
format: winston.format.combine(
|
||||
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
|
||||
|
@ -175,11 +175,10 @@ export async function runWebScraper({
|
||||
}
|
||||
|
||||
// If the team is the background index team, return the response
|
||||
if(team_id === process.env.BACKGROUND_INDEX_TEAM_ID!) {
|
||||
if (team_id === process.env.BACKGROUND_INDEX_TEAM_ID!) {
|
||||
return response;
|
||||
}
|
||||
|
||||
|
||||
billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => {
|
||||
logger.error(
|
||||
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
|
||||
|
@ -192,7 +192,8 @@ v1Router.get(
|
||||
wrap((req: any, res): any => crawlStatusController(req, res, true)),
|
||||
);
|
||||
|
||||
v1Router.get("/scrape/:jobId",
|
||||
v1Router.get(
|
||||
"/scrape/:jobId",
|
||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||
wrap(scrapeStatusController),
|
||||
);
|
||||
@ -242,6 +243,3 @@ v1Router.get(
|
||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||
wrap(creditUsageController),
|
||||
);
|
||||
|
||||
|
||||
|
||||
|
@ -219,34 +219,51 @@ export class WebCrawler {
|
||||
const _urlsHandler = async (urls: string[]) => {
|
||||
let uniqueURLs: string[] = [];
|
||||
for (const url of urls) {
|
||||
if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(url))) {
|
||||
if (
|
||||
await redisConnection.sadd(
|
||||
"sitemap:" + this.jobId + ":links",
|
||||
normalizeUrl(url),
|
||||
)
|
||||
) {
|
||||
uniqueURLs.push(url);
|
||||
}
|
||||
}
|
||||
|
||||
await redisConnection.expire("sitemap:" + this.jobId + ":links", 3600, "NX");
|
||||
await redisConnection.expire(
|
||||
"sitemap:" + this.jobId + ":links",
|
||||
3600,
|
||||
"NX",
|
||||
);
|
||||
if (uniqueURLs.length > 0) {
|
||||
urlsHandler(uniqueURLs);
|
||||
}
|
||||
};
|
||||
|
||||
let count = await this.tryFetchSitemapLinks(this.initialUrl, (urls: string[]) => {
|
||||
if (fromMap && onlySitemap) {
|
||||
return urlsHandler(urls);
|
||||
} else {
|
||||
let filteredLinks = this.filterLinks(
|
||||
[...new Set(urls)],
|
||||
leftOfLimit,
|
||||
this.maxCrawledDepth,
|
||||
fromMap,
|
||||
);
|
||||
leftOfLimit -= filteredLinks.length;
|
||||
return _urlsHandler(filteredLinks);
|
||||
}
|
||||
});
|
||||
let count = await this.tryFetchSitemapLinks(
|
||||
this.initialUrl,
|
||||
(urls: string[]) => {
|
||||
if (fromMap && onlySitemap) {
|
||||
return urlsHandler(urls);
|
||||
} else {
|
||||
let filteredLinks = this.filterLinks(
|
||||
[...new Set(urls)],
|
||||
leftOfLimit,
|
||||
this.maxCrawledDepth,
|
||||
fromMap,
|
||||
);
|
||||
leftOfLimit -= filteredLinks.length;
|
||||
return _urlsHandler(filteredLinks);
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
if (count > 0) {
|
||||
if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(this.initialUrl))) {
|
||||
if (
|
||||
await redisConnection.sadd(
|
||||
"sitemap:" + this.jobId + ":links",
|
||||
normalizeUrl(this.initialUrl),
|
||||
)
|
||||
) {
|
||||
urlsHandler([this.initialUrl]);
|
||||
}
|
||||
count++;
|
||||
@ -470,8 +487,13 @@ export class WebCrawler {
|
||||
return socialMediaOrEmail.some((ext) => url.includes(ext));
|
||||
}
|
||||
|
||||
private async tryFetchSitemapLinks(url: string, urlsHandler: (urls: string[]) => unknown): Promise<number> {
|
||||
const sitemapUrl = url.endsWith(".xml") ? url : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
|
||||
private async tryFetchSitemapLinks(
|
||||
url: string,
|
||||
urlsHandler: (urls: string[]) => unknown,
|
||||
): Promise<number> {
|
||||
const sitemapUrl = url.endsWith(".xml")
|
||||
? url
|
||||
: `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
|
||||
|
||||
let sitemapCount: number = 0;
|
||||
|
||||
@ -482,37 +504,43 @@ export class WebCrawler {
|
||||
this.logger,
|
||||
);
|
||||
} catch (error) {
|
||||
this.logger.debug(
|
||||
`Failed to fetch sitemap from ${sitemapUrl}`,
|
||||
{ method: "tryFetchSitemapLinks", sitemapUrl, error },
|
||||
);
|
||||
this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
|
||||
method: "tryFetchSitemapLinks",
|
||||
sitemapUrl,
|
||||
error,
|
||||
});
|
||||
}
|
||||
|
||||
// If this is a subdomain, also try to get sitemap from the main domain
|
||||
try {
|
||||
const urlObj = new URL(url);
|
||||
const hostname = urlObj.hostname;
|
||||
const domainParts = hostname.split('.');
|
||||
|
||||
const domainParts = hostname.split(".");
|
||||
|
||||
// Check if this is a subdomain (has more than 2 parts and not www)
|
||||
if (domainParts.length > 2 && domainParts[0] !== 'www') {
|
||||
if (domainParts.length > 2 && domainParts[0] !== "www") {
|
||||
// Get the main domain by taking the last two parts
|
||||
const mainDomain = domainParts.slice(-2).join('.');
|
||||
const mainDomain = domainParts.slice(-2).join(".");
|
||||
const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
|
||||
const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;
|
||||
|
||||
try {
|
||||
// Get all links from the main domain's sitemap
|
||||
sitemapCount += await getLinksFromSitemap(
|
||||
{ sitemapUrl: mainDomainSitemapUrl, urlsHandler(urls) {
|
||||
return urlsHandler(urls.filter(link => {
|
||||
try {
|
||||
const linkUrl = new URL(link);
|
||||
return linkUrl.hostname.endsWith(hostname);
|
||||
} catch {
|
||||
}
|
||||
}))
|
||||
}, mode: "fire-engine" },
|
||||
{
|
||||
sitemapUrl: mainDomainSitemapUrl,
|
||||
urlsHandler(urls) {
|
||||
return urlsHandler(
|
||||
urls.filter((link) => {
|
||||
try {
|
||||
const linkUrl = new URL(link);
|
||||
return linkUrl.hostname.endsWith(hostname);
|
||||
} catch {}
|
||||
}),
|
||||
);
|
||||
},
|
||||
mode: "fire-engine",
|
||||
},
|
||||
this.logger,
|
||||
);
|
||||
} catch (error) {
|
||||
|
@ -15,7 +15,7 @@ export async function getLinksFromSitemap(
|
||||
mode = "axios",
|
||||
}: {
|
||||
sitemapUrl: string;
|
||||
urlsHandler(urls: string[]): unknown,
|
||||
urlsHandler(urls: string[]): unknown;
|
||||
mode?: "axios" | "fire-engine";
|
||||
},
|
||||
logger: Logger,
|
||||
@ -31,7 +31,10 @@ export async function getLinksFromSitemap(
|
||||
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
|
||||
);
|
||||
if (!response.success) {
|
||||
logger.debug("Failed to scrape sitemap via TLSClient, falling back to axios...", { error: response.error })
|
||||
logger.debug(
|
||||
"Failed to scrape sitemap via TLSClient, falling back to axios...",
|
||||
{ error: response.error },
|
||||
);
|
||||
const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||
content = ar.data;
|
||||
} else {
|
||||
@ -63,14 +66,11 @@ export async function getLinksFromSitemap(
|
||||
.map((sitemap) => sitemap.loc[0].trim());
|
||||
|
||||
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
|
||||
getLinksFromSitemap(
|
||||
{ sitemapUrl, urlsHandler, mode },
|
||||
logger,
|
||||
),
|
||||
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger),
|
||||
);
|
||||
|
||||
|
||||
const results = await Promise.all(sitemapPromises);
|
||||
count = results.reduce((a,x) => a + x)
|
||||
count = results.reduce((a, x) => a + x);
|
||||
} else if (root && root.url) {
|
||||
// Check if any URLs point to additional sitemaps
|
||||
const xmlSitemaps: string[] = root.url
|
||||
@ -78,7 +78,7 @@ export async function getLinksFromSitemap(
|
||||
(url) =>
|
||||
url.loc &&
|
||||
url.loc.length > 0 &&
|
||||
url.loc[0].trim().toLowerCase().endsWith('.xml')
|
||||
url.loc[0].trim().toLowerCase().endsWith(".xml"),
|
||||
)
|
||||
.map((url) => url.loc[0].trim());
|
||||
|
||||
@ -90,7 +90,10 @@ export async function getLinksFromSitemap(
|
||||
logger,
|
||||
),
|
||||
);
|
||||
count += (await Promise.all(sitemapPromises)).reduce((a,x) => a + x, 0);
|
||||
count += (await Promise.all(sitemapPromises)).reduce(
|
||||
(a, x) => a + x,
|
||||
0,
|
||||
);
|
||||
}
|
||||
|
||||
const validUrls = root.url
|
||||
@ -98,7 +101,7 @@ export async function getLinksFromSitemap(
|
||||
(url) =>
|
||||
url.loc &&
|
||||
url.loc.length > 0 &&
|
||||
!url.loc[0].trim().toLowerCase().endsWith('.xml') &&
|
||||
!url.loc[0].trim().toLowerCase().endsWith(".xml") &&
|
||||
!WebCrawler.prototype.isFile(url.loc[0].trim()),
|
||||
)
|
||||
.map((url) => url.loc[0].trim());
|
||||
|
@ -3,7 +3,10 @@ import { EngineScrapeResult } from "..";
|
||||
import { Meta } from "../..";
|
||||
import { TimeoutError } from "../../error";
|
||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||
import { InsecureConnectionError, makeSecureDispatcher } from "../utils/safeFetch";
|
||||
import {
|
||||
InsecureConnectionError,
|
||||
makeSecureDispatcher,
|
||||
} from "../utils/safeFetch";
|
||||
|
||||
export async function scrapeURLWithFetch(
|
||||
meta: Meta,
|
||||
@ -20,7 +23,9 @@ export async function scrapeURLWithFetch(
|
||||
headers: meta.options.headers,
|
||||
}),
|
||||
(async () => {
|
||||
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
|
||||
await new Promise((resolve) =>
|
||||
setTimeout(() => resolve(null), timeout),
|
||||
);
|
||||
throw new TimeoutError(
|
||||
"Fetch was unable to scrape the page before timing out",
|
||||
{ cause: { timeout } },
|
||||
@ -28,7 +33,10 @@ export async function scrapeURLWithFetch(
|
||||
})(),
|
||||
]);
|
||||
} catch (error) {
|
||||
if (error instanceof TypeError && error.cause instanceof InsecureConnectionError) {
|
||||
if (
|
||||
error instanceof TypeError &&
|
||||
error.cause instanceof InsecureConnectionError
|
||||
) {
|
||||
throw error.cause;
|
||||
} else {
|
||||
throw error;
|
||||
|
@ -3,7 +3,12 @@ import * as Sentry from "@sentry/node";
|
||||
import { z } from "zod";
|
||||
|
||||
import { robustFetch } from "../../lib/fetch";
|
||||
import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error";
|
||||
import {
|
||||
ActionError,
|
||||
EngineError,
|
||||
SiteError,
|
||||
UnsupportedFileError,
|
||||
} from "../../error";
|
||||
|
||||
const successSchema = z.object({
|
||||
jobId: z.string(),
|
||||
@ -35,12 +40,15 @@ const successSchema = z.object({
|
||||
})
|
||||
.array()
|
||||
.optional(),
|
||||
|
||||
|
||||
// chrome-cdp only -- file download handler
|
||||
file: z.object({
|
||||
name: z.string(),
|
||||
content: z.string(),
|
||||
}).optional().or(z.null()),
|
||||
file: z
|
||||
.object({
|
||||
name: z.string(),
|
||||
content: z.string(),
|
||||
})
|
||||
.optional()
|
||||
.or(z.null()),
|
||||
});
|
||||
|
||||
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
|
||||
@ -121,7 +129,9 @@ export async function fireEngineCheckStatus(
|
||||
typeof status.error === "string" &&
|
||||
status.error.includes("File size exceeds")
|
||||
) {
|
||||
throw new UnsupportedFileError("File size exceeds " + status.error.split("File size exceeds ")[1]);
|
||||
throw new UnsupportedFileError(
|
||||
"File size exceeds " + status.error.split("File size exceeds ")[1],
|
||||
);
|
||||
} else if (
|
||||
typeof status.error === "string" &&
|
||||
// TODO: improve this later
|
||||
|
@ -13,7 +13,13 @@ import {
|
||||
FireEngineCheckStatusSuccess,
|
||||
StillProcessingError,
|
||||
} from "./checkStatus";
|
||||
import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError } from "../../error";
|
||||
import {
|
||||
ActionError,
|
||||
EngineError,
|
||||
SiteError,
|
||||
TimeoutError,
|
||||
UnsupportedFileError,
|
||||
} from "../../error";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { Action } from "../../../../lib/entities";
|
||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||
|
@ -38,7 +38,7 @@ const useCache =
|
||||
process.env.CACHE_REDIS_URL !== undefined;
|
||||
|
||||
export const engines: Engine[] = [
|
||||
...(useCache ? [ "cache" as const ] : []),
|
||||
...(useCache ? ["cache" as const] : []),
|
||||
...(useFireEngine
|
||||
? [
|
||||
"fire-engine;chrome-cdp" as const,
|
||||
@ -298,7 +298,6 @@ export function buildFallbackList(meta: Meta): {
|
||||
engine: Engine;
|
||||
unsupportedFeatures: Set<FeatureFlag>;
|
||||
}[] {
|
||||
|
||||
if (meta.internalOptions.useCache !== true) {
|
||||
const cacheIndex = engines.indexOf("cache");
|
||||
if (cacheIndex !== -1) {
|
||||
|
@ -7,11 +7,18 @@ import { v4 as uuid } from "uuid";
|
||||
import * as undici from "undici";
|
||||
import { makeSecureDispatcher } from "./safeFetch";
|
||||
|
||||
export async function fetchFileToBuffer(url: string, init?: undici.RequestInit): Promise<{
|
||||
export async function fetchFileToBuffer(
|
||||
url: string,
|
||||
init?: undici.RequestInit,
|
||||
): Promise<{
|
||||
response: undici.Response;
|
||||
buffer: Buffer;
|
||||
}> {
|
||||
const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) });
|
||||
const response = await undici.fetch(url, {
|
||||
...init,
|
||||
redirect: "follow",
|
||||
dispatcher: await makeSecureDispatcher(url),
|
||||
});
|
||||
return {
|
||||
response,
|
||||
buffer: Buffer.from(await response.arrayBuffer()),
|
||||
@ -30,7 +37,11 @@ export async function downloadFile(
|
||||
const tempFileWrite = createWriteStream(tempFilePath);
|
||||
|
||||
// TODO: maybe we could use tlsclient for this? for proxying
|
||||
const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) });
|
||||
const response = await undici.fetch(url, {
|
||||
...init,
|
||||
redirect: "follow",
|
||||
dispatcher: await makeSecureDispatcher(url),
|
||||
});
|
||||
|
||||
// This should never happen in the current state of JS/Undici (2024), but let's check anyways.
|
||||
if (response.body === null) {
|
||||
|
@ -4,57 +4,71 @@ import * as undici from "undici";
|
||||
import { Address6 } from "ip-address";
|
||||
|
||||
export class InsecureConnectionError extends Error {
|
||||
constructor() {
|
||||
super("Connection violated security rules.")
|
||||
}
|
||||
constructor() {
|
||||
super("Connection violated security rules.");
|
||||
}
|
||||
}
|
||||
|
||||
function isIPv4Private(address: string): boolean {
|
||||
const parts = address.split(".").map(x => parseInt(x, 10));
|
||||
return parts[0] === 0 // Current (local, "this") network
|
||||
|| parts[0] === 10 // Used for local communications within a private network
|
||||
  const parts = address.split(".").map((x) => parseInt(x, 10));

  return (
    parts[0] === 0 || // Current (local, "this") network
    parts[0] === 10 || // Used for local communications within a private network
    (parts[0] === 100 && parts[1] >= 64 && parts[1] < 128) || // Shared address space for communications between a service provider and its subscribers when using a carrier-grade NAT
    parts[0] === 127 || // Used for loopback addresses to the local host
    (parts[0] === 169 && parts[1] === 254) || // Used for link-local addresses between two hosts on a single link when no IP address is otherwise specified, such as would have normally been retrieved from a DHCP server
    (parts[0] === 172 && parts[1] >= 16 && parts[1] < 32) || // Used for local communications within a private network (172.16.0.0/12)
    (parts[0] === 192 && parts[1] === 0 && parts[2] === 0) || // IETF Protocol Assignments, DS-Lite (/29)
    (parts[0] === 192 && parts[1] === 0 && parts[2] === 2) || // Assigned as TEST-NET-1, documentation and examples
    (parts[0] === 192 && parts[1] === 88 && parts[2] === 99) || // Reserved. Formerly used for IPv6 to IPv4 relay (included IPv6 address block 2002::/16).
    (parts[0] === 192 && parts[1] === 168) || // Used for local communications within a private network
    (parts[0] === 198 && parts[1] >= 18 && parts[1] < 20) || // Used for benchmark testing of inter-network communications between two separate subnets (198.18.0.0/15)
    (parts[0] === 198 && parts[1] === 51 && parts[2] === 100) || // Assigned as TEST-NET-2, documentation and examples
    (parts[0] === 203 && parts[1] === 0 && parts[2] === 113) || // Assigned as TEST-NET-3, documentation and examples
    (parts[0] >= 224 && parts[0] < 240) || // In use for multicast (former Class D network)
    (parts[0] === 233 && parts[1] === 252 && parts[2] === 0) || // Assigned as MCAST-TEST-NET, documentation and examples (Note that this is part of the above multicast space.)
    parts[0] >= 240 || // Reserved for future use (former Class E network)
    (parts[0] === 255 &&
      parts[1] === 255 &&
      parts[2] === 255 &&
      parts[3] === 255)
  ); // Reserved for the "limited broadcast" destination address
}

function isIPv6Private(ipv6) {
  return new Address6(ipv6).getScope() !== "Global";
}

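A quick sanity check of the private-range logic above, for orientation only. It assumes the two helpers are exported from this module and that Address6 comes from the ip-address package, neither of which this diff shows:

// Illustrative sketch, not part of the commit.
import { isIPv4Private, isIPv6Private } from "./safeFetch"; // import path is an assumption

console.assert(isIPv4Private("10.1.2.3")); // RFC 1918 private range
console.assert(isIPv4Private("172.20.0.1")); // 172.16.0.0/12
console.assert(isIPv4Private("169.254.10.1")); // link-local
console.assert(!isIPv4Private("8.8.8.8")); // public address
console.assert(isIPv6Private("::1")); // loopback is not classified as "Global" scope by the ip-address library
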
export function makeSecureDispatcher(
  url: string,
  options?: undici.Agent.Options,
) {
  const agent = new undici.Agent({
    connect: {
      rejectUnauthorized: false, // bypass SSL failures -- this is fine
      // lookup: secureLookup,
    },
    maxRedirections: 5000,
    ...options,
  });

  agent.on("connect", (_, targets) => {
    const client: undici.Client = targets.slice(-1)[0] as undici.Client;
    const socketSymbol = Object.getOwnPropertySymbols(client).find(
      (x) => x.description === "socket",
    )!;
    const socket: Socket | TLSSocket = (client as any)[socketSymbol];

    if (socket.remoteAddress) {
      if (
        socket.remoteFamily === "IPv4"
          ? isIPv4Private(socket.remoteAddress!)
          : isIPv6Private(socket.remoteAddress!)
      ) {
        socket.destroy(new InsecureConnectionError());
      }
    }
  });

  return agent;
}

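A minimal usage sketch of the dispatcher above. The helper name comes from this diff; the call site is an assumption:

// Sketch only: undici.request accepts a custom dispatcher, and connections whose
// resolved peer address is private/loopback are destroyed by the "connect" hook above.
import * as undici from "undici";

async function fetchPublicOnly(url: string): Promise<string> {
  const res = await undici.request(url, { dispatcher: makeSecureDispatcher(url) });
  return await res.body.text();
}
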
@ -420,7 +420,9 @@ export async function scrapeURL(
  } else if (error instanceof ActionError) {
    meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
  } else if (error instanceof UnsupportedFileError) {
    meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
      error,
    });
  } else {
    Sentry.captureException(error);
    meta.logger.error("scrapeURL: Unexpected error happened", { error });

@ -43,11 +43,16 @@ export function extractMetadata(
  try {
    title = soup("title").first().text().trim() || undefined;
    description = soup('meta[name="description"]').attr("content") || undefined;

    const faviconLink =
      soup('link[rel="icon"]').attr("href") ||
      soup('link[rel*="icon"]').first().attr("href") ||
      undefined;
    if (faviconLink) {
      const baseUrl = new URL(meta.url).origin;
      favicon = faviconLink.startsWith("http")
        ? faviconLink
        : `${baseUrl}${faviconLink}`;
    }

    // Assuming the language is part of the URL as per the regex pattern

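To make the favicon resolution above concrete, here is an illustrative trace (values are invented, not part of the diff):

// faviconLink = "/favicon.ico", meta.url = "https://example.com/docs/page"
// baseUrl  -> new URL("https://example.com/docs/page").origin === "https://example.com"
// favicon  -> "https://example.com/favicon.ico"
// An absolute link such as "https://cdn.example.com/icon.png" is kept as-is.
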
@ -24,7 +24,6 @@ export function saveToCache(meta: Meta, document: Document): Document {
    return document;
  }

  const key = cacheKey(meta.url, meta.options, meta.internalOptions);

  if (key !== null) {

@ -1,33 +1,41 @@
import { removeDefaultProperty } from "./llmExtract";

describe("removeDefaultProperty", () => {
  it("should remove the default property from a simple object", () => {
    const input = { default: "test", test: "test" };
    const expectedOutput = { test: "test" };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should remove the default property from a nested object", () => {
    const input = {
      default: "test",
      nested: { default: "nestedTest", test: "nestedTest" },
    };
    const expectedOutput = { nested: { test: "nestedTest" } };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should remove the default property from an array of objects", () => {
    const input = {
      array: [
        { default: "test1", test: "test1" },
        { default: "test2", test: "test2" },
      ],
    };
    const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should handle objects without a default property", () => {
    const input = { test: "test" };
    const expectedOutput = { test: "test" };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should handle null and non-object inputs", () => {
    expect(removeDefaultProperty(null)).toBeNull();
    expect(removeDefaultProperty("string")).toBe("string");
    expect(removeDefaultProperty(123)).toBe(123);
  });
});

@ -123,7 +123,7 @@ export async function generateOpenAICompletions(
  let schema = options.schema;
  if (schema) {
    schema = removeDefaultProperty(schema);
  }

  if (schema && schema.type === "array") {
    schema = {
@ -140,10 +140,10 @@ export async function generateOpenAICompletions(
        properties: Object.fromEntries(
          Object.entries(schema).map(([key, value]) => {
            return [key, removeDefaultProperty(value)];
          }),
        ),
        required: Object.keys(schema),
        additionalProperties: false,
      };
    }

@ -240,17 +240,17 @@ export async function performLLMExtract(
}

export function removeDefaultProperty(schema: any): any {
  if (typeof schema !== "object" || schema === null) return schema;

  const { default: _, ...rest } = schema;

  for (const key in rest) {
    if (Array.isArray(rest[key])) {
      rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));
    } else if (typeof rest[key] === "object" && rest[key] !== null) {
      rest[key] = removeDefaultProperty(rest[key]);
    }
  }

  return rest;
}

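A short usage sketch of removeDefaultProperty. The values are illustrative, and the motivation (strict structured-output schema validation rejecting "default") is an assumption suggested by the surrounding code, not something this diff states:

// A JSON Schema carrying "default" keywords at several depths.
const exampleSchema = {
  type: "object",
  properties: {
    title: { type: "string", default: "untitled" },
    tags: { type: "array", items: { type: "string", default: "" } },
  },
};

// Recursively strips every "default" before the schema is sent to the model.
const cleaned = removeDefaultProperty(exampleSchema);
// cleaned.properties.title -> { type: "string" }  (the "default" is gone at every level)
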
@ -9,9 +9,11 @@ configDotenv();

function cleanOfNull<T>(x: T): T {
  if (Array.isArray(x)) {
    return x.map((x) => cleanOfNull(x)) as T;
  } else if (typeof x === "object" && x !== null) {
    return Object.fromEntries(
      Object.entries(x).map(([k, v]) => [k, cleanOfNull(v)]),
    ) as T;
  } else if (typeof x === "string") {
    return x.replaceAll("\u0000", "") as T;
  } else {

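A small illustration of what the cleaner above does (illustrative values; the usual motivation, Postgres rejecting NUL bytes in text and jsonb values, is an assumption rather than something stated here):

// Strips \u0000 from every string, recursing through arrays and plain objects.
const dirty = { title: "Hello\u0000World", tags: ["a\u0000", "b"] };
const clean = cleanOfNull(dirty);
// clean -> { title: "HelloWorld", tags: ["a", "b"] }
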
@ -16,20 +16,17 @@ export const loggingQueueName = "{loggingQueue}";

export function getScrapeQueue() {
  if (!scrapeQueue) {
    scrapeQueue = new Queue(scrapeQueueName, {
      connection: redisConnection,
      defaultJobOptions: {
        removeOnComplete: {
          age: 90000, // 25 hours
        },
        removeOnFail: {
          age: 90000, // 25 hours
        },
      },
    });
    logger.info("Web scraper queue created");
  }
  return scrapeQueue;
@ -37,26 +34,22 @@ export function getScrapeQueue() {

export function getExtractQueue() {
  if (!extractQueue) {
    extractQueue = new Queue(extractQueueName, {
      connection: redisConnection,
      defaultJobOptions: {
        removeOnComplete: {
          age: 90000, // 25 hours
        },
        removeOnFail: {
          age: 90000, // 25 hours
        },
      },
    });
    logger.info("Extraction queue created");
  }
  return extractQueue;
}

// === REMOVED IN FAVOR OF POLLING -- NOT RELIABLE
// import { QueueEvents } from 'bullmq';
// export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection.duplicate() });

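For orientation, a hedged sketch of how a job might be enqueued on the scrape queue defined above. The payload is trimmed down and the priority value is arbitrary; real jobs in this codebase carry more fields:

// Sketch only: BullMQ's Queue#add(name, data, opts) with an explicit jobId.
import { v4 as uuidv4 } from "uuid";

async function enqueueExampleScrape(url: string) {
  const jobId = uuidv4();
  await getScrapeQueue().add(
    jobId,
    { url, mode: "single_urls" }, // minimal illustrative payload
    { jobId, priority: 10 },
  );
  return jobId;
}
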
@ -89,13 +89,19 @@ const runningJobs: Set<string> = new Set();
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
  if (await finishCrawl(job.data.crawl_id)) {
    (async () => {
      const originUrl = sc.originUrl
        ? normalizeUrlOnlyHostname(sc.originUrl)
        : undefined;
      // Get all visited URLs from Redis
      const visitedUrls = await redisConnection.smembers(
        "crawl:" + job.data.crawl_id + ":visited",
      );
      // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
      if (
        visitedUrls.length > 0 &&
        job.data.crawlerOptions !== null &&
        originUrl
      ) {
        // Fire and forget the upload to Supabase
        try {
          // Standardize URLs to canonical form (https, no www)
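The visited-URL set read above is a plain Redis set; a minimal sketch of both sides of that contract (the writer shown here is an assumption, since this diff only shows the smembers read):

// Sketch with the existing ioredis connection; only the read appears in this diff.
async function visitedSetExample(crawlId: string, url: string) {
  await redisConnection.sadd("crawl:" + crawlId + ":visited", url); // assumed writer
  return await redisConnection.smembers("crawl:" + crawlId + ":visited"); // reader used above
}
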
@ -317,7 +323,10 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
  return err;
};

const processExtractJobInternal = async (
  token: string,
  job: Job & { id: string },
) => {
  const logger = _logger.child({
    module: "extract-worker",
    method: "processJobInternal",
@ -348,23 +357,26 @@ const processExtractJobInternal = async (token: string, job: Job & { id: string
    }
  } catch (error) {
    logger.error(`🚫 Job errored ${job.id} - ${error}`, { error });

    Sentry.captureException(error, {
      data: {
        job: job.id,
      },
    });

    // Move job to failed state in Redis
    await job.moveToFailed(error, token, false);

    await updateExtract(job.data.extractId, {
      status: "failed",
      error:
        error.error ??
        error ??
        "Unknown error, please contact help@firecrawl.dev. Extract id: " +
          job.data.extractId,
    });
    // throw error;
  } finally {
    clearInterval(extendLockInterval);
  }
};
@ -635,7 +647,9 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
      sc,
      jobs.map((x) => ({ id: x.opts.jobId, url: x.data.url })),
    );
    const lockedJobs = jobs.filter((x) =>
      lockedIds.find((y) => y.id === x.opts.jobId),
    );
    logger.debug("Adding scrape jobs to Redis...");
    await addCrawlJobs(
      job.data.crawl_id,
@ -790,7 +804,8 @@ async function processJob(job: Job & { id: string }, token: string) {
      ) {
        const crawler = crawlToCrawler(job.data.crawl_id, sc);
        if (
          crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) ===
            null &&
          !job.data.isCrawlSourceScrape
        ) {
          throw new Error(
@ -1073,8 +1088,8 @@ async function processJob(job: Job & { id: string }, token: string) {
    console.log("All workers exited. Waiting for all jobs to finish...");

    while (runningJobs.size > 0) {
      await new Promise((resolve) => setTimeout(resolve, 500));
    }

    process.exit(0);
  })();
})();
