Mirror of https://github.com/mendableai/firecrawl
Nick: formatting fixes

parent d1f3b96388
commit f4d10c5031
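The changes in the diff below are mechanical formatting fixes: single-quoted strings become double-quoted, bare arrow-function parameters gain parentheses, trailing commas are added to multi-line literals and calls, and long expressions are wrapped. This is consistent with running Prettier over the API source. A minimal sketch of a config that would produce such output (hypothetical — the repository's actual formatter setup is not part of this commit) is:

    // prettier.config.cjs — hypothetical sketch, not taken from the repository
    module.exports = {
      semi: true,            // keep trailing semicolons
      singleQuote: false,    // 'single-quoted' strings become "double-quoted"
      trailingComma: "all",  // add trailing commas to multi-line literals and calls
      arrowParens: "always", // rewrite `url => ...` as `(url) => ...`
      printWidth: 80,        // wrap long calls and conditions across lines
    };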
@ -179,11 +179,15 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
|
|
||||||
const sitemap = sc.crawlerOptions.ignoreSitemap
|
const sitemap = sc.crawlerOptions.ignoreSitemap
|
||||||
? 0
|
? 0
|
||||||
: await crawler.tryGetSitemap(async urls => {
|
: await crawler.tryGetSitemap(async (urls) => {
|
||||||
if (urls.length === 0) return;
|
if (urls.length === 0) return;
|
||||||
|
|
||||||
let jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
|
let jobPriority = await getJobPriority({
|
||||||
const jobs = urls.map(url => {
|
plan,
|
||||||
|
team_id,
|
||||||
|
basePriority: 21,
|
||||||
|
});
|
||||||
|
const jobs = urls.map((url) => {
|
||||||
const uuid = uuidv4();
|
const uuid = uuidv4();
|
||||||
return {
|
return {
|
||||||
name: uuid,
|
name: uuid,
|
||||||
|
@ -114,7 +114,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
|
|
||||||
const sitemap = sc.crawlerOptions?.ignoreSitemap
|
const sitemap = sc.crawlerOptions?.ignoreSitemap
|
||||||
? 0
|
? 0
|
||||||
: await crawler.tryGetSitemap(async urls => {
|
: await crawler.tryGetSitemap(async (urls) => {
|
||||||
for (const url of urls) {
|
for (const url of urls) {
|
||||||
await lockURL(id, sc, url);
|
await lockURL(id, sc, url);
|
||||||
const jobId = uuidv4();
|
const jobId = uuidv4();
|
||||||
|
@ -115,7 +115,8 @@ export async function crawlStatusController(
|
|||||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
|
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
|
||||||
sc.cancelled
|
sc.cancelled
|
||||||
? "cancelled"
|
? "cancelled"
|
||||||
: (validJobStatuses.every((x) => x[1] === "completed") && validJobStatuses.length > 0)
|
: validJobStatuses.every((x) => x[1] === "completed") &&
|
||||||
|
validJobStatuses.length > 0
|
||||||
? "completed"
|
? "completed"
|
||||||
: "scraping";
|
: "scraping";
|
||||||
|
|
||||||
|
@ -7,11 +7,7 @@ import {
|
|||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
toLegacyCrawlerOptions,
|
toLegacyCrawlerOptions,
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import {
|
import { crawlToCrawler, saveCrawl, StoredCrawl } from "../../lib/crawl-redis";
|
||||||
crawlToCrawler,
|
|
||||||
saveCrawl,
|
|
||||||
StoredCrawl,
|
|
||||||
} from "../../lib/crawl-redis";
|
|
||||||
import { logCrawl } from "../../services/logging/crawl_log";
|
import { logCrawl } from "../../services/logging/crawl_log";
|
||||||
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
|
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
|
||||||
import { logger as _logger } from "../../lib/logger";
|
import { logger as _logger } from "../../lib/logger";
|
||||||
@ -103,7 +99,8 @@ export async function crawlController(
|
|||||||
|
|
||||||
await saveCrawl(id, sc);
|
await saveCrawl(id, sc);
|
||||||
|
|
||||||
await _addScrapeJobToBullMQ({
|
await _addScrapeJobToBullMQ(
|
||||||
|
{
|
||||||
url: req.body.url,
|
url: req.body.url,
|
||||||
mode: "kickoff" as const,
|
mode: "kickoff" as const,
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
@ -115,7 +112,11 @@ export async function crawlController(
|
|||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
webhook: req.body.webhook,
|
webhook: req.body.webhook,
|
||||||
v1: true,
|
v1: true,
|
||||||
}, {}, crypto.randomUUID(), 10);
|
},
|
||||||
|
{},
|
||||||
|
crypto.randomUUID(),
|
||||||
|
10,
|
||||||
|
);
|
||||||
|
|
||||||
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
||||||
|
|
||||||
|
@ -11,7 +11,11 @@ import { saveExtract } from "../../lib/extract/extract-redis";
|
|||||||
import { getTeamIdSyncB } from "../../lib/extract/team-id-sync";
|
import { getTeamIdSyncB } from "../../lib/extract/team-id-sync";
|
||||||
import { performExtraction } from "../../lib/extract/extraction-service";
|
import { performExtraction } from "../../lib/extract/extraction-service";
|
||||||
|
|
||||||
export async function oldExtract(req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>, res: Response<ExtractResponse>, extractId: string){
|
export async function oldExtract(
|
||||||
|
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
|
||||||
|
res: Response<ExtractResponse>,
|
||||||
|
extractId: string,
|
||||||
|
) {
|
||||||
// Means that are in the non-queue system
|
// Means that are in the non-queue system
|
||||||
// TODO: Remove this once all teams have transitioned to the new system
|
// TODO: Remove this once all teams have transitioned to the new system
|
||||||
try {
|
try {
|
||||||
@ -53,7 +57,10 @@ export async function extractController(
|
|||||||
extractId,
|
extractId,
|
||||||
};
|
};
|
||||||
|
|
||||||
if(await getTeamIdSyncB(req.auth.team_id) && req.body.origin !== "api-sdk") {
|
if (
|
||||||
|
(await getTeamIdSyncB(req.auth.team_id)) &&
|
||||||
|
req.body.origin !== "api-sdk"
|
||||||
|
) {
|
||||||
return await oldExtract(req, res, extractId);
|
return await oldExtract(req, res, extractId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -86,11 +86,15 @@ export async function getMapResults({
|
|||||||
|
|
||||||
// If sitemapOnly is true, only get links from sitemap
|
// If sitemapOnly is true, only get links from sitemap
|
||||||
if (crawlerOptions.sitemapOnly) {
|
if (crawlerOptions.sitemapOnly) {
|
||||||
const sitemap = await crawler.tryGetSitemap(urls => {
|
const sitemap = await crawler.tryGetSitemap(
|
||||||
|
(urls) => {
|
||||||
urls.forEach((x) => {
|
urls.forEach((x) => {
|
||||||
links.push(x);
|
links.push(x);
|
||||||
});
|
});
|
||||||
}, true, true);
|
},
|
||||||
|
true,
|
||||||
|
true,
|
||||||
|
);
|
||||||
if (sitemap > 0) {
|
if (sitemap > 0) {
|
||||||
links = links
|
links = links
|
||||||
.slice(1)
|
.slice(1)
|
||||||
@ -145,7 +149,9 @@ export async function getMapResults({
|
|||||||
|
|
||||||
// Parallelize sitemap fetch with serper search
|
// Parallelize sitemap fetch with serper search
|
||||||
const [_, ...searchResults] = await Promise.all([
|
const [_, ...searchResults] = await Promise.all([
|
||||||
ignoreSitemap ? null : crawler.tryGetSitemap(urls => {
|
ignoreSitemap
|
||||||
|
? null
|
||||||
|
: crawler.tryGetSitemap((urls) => {
|
||||||
links.push(...urls);
|
links.push(...urls);
|
||||||
}, true),
|
}, true),
|
||||||
...(cachedResult ? [] : pagePromises),
|
...(cachedResult ? [] : pagePromises),
|
||||||
|
@ -18,7 +18,10 @@ export async function scrapeStatusController(req: any, res: any) {
|
|||||||
|
|
||||||
const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
|
const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
|
||||||
|
|
||||||
if (!allowedTeams.includes(job?.team_id) || job?.team_id !== req.auth.team_id) {
|
if (
|
||||||
|
!allowedTeams.includes(job?.team_id) ||
|
||||||
|
job?.team_id !== req.auth.team_id
|
||||||
|
) {
|
||||||
return res.status(403).json({
|
return res.status(403).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: "You are not allowed to access this resource.",
|
error: "You are not allowed to access this resource.",
|
||||||
|
@ -200,7 +200,8 @@ export const extractV1Options = z
|
|||||||
schema: z
|
schema: z
|
||||||
.any()
|
.any()
|
||||||
.optional()
|
.optional()
|
||||||
.refine((val) => {
|
.refine(
|
||||||
|
(val) => {
|
||||||
if (!val) return true; // Allow undefined schema
|
if (!val) return true; // Allow undefined schema
|
||||||
try {
|
try {
|
||||||
const validate = ajv.compile(val);
|
const validate = ajv.compile(val);
|
||||||
@ -208,9 +209,11 @@ export const extractV1Options = z
|
|||||||
} catch (e) {
|
} catch (e) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}, {
|
},
|
||||||
|
{
|
||||||
message: "Invalid JSON schema.",
|
message: "Invalid JSON schema.",
|
||||||
}),
|
},
|
||||||
|
),
|
||||||
limit: z.number().int().positive().finite().safe().optional(),
|
limit: z.number().int().positive().finite().safe().optional(),
|
||||||
ignoreSitemap: z.boolean().default(false),
|
ignoreSitemap: z.boolean().default(false),
|
||||||
includeSubdomains: z.boolean().default(true),
|
includeSubdomains: z.boolean().default(true),
|
||||||
@ -452,7 +455,7 @@ export type Document = {
|
|||||||
description: string;
|
description: string;
|
||||||
url: string;
|
url: string;
|
||||||
};
|
};
|
||||||
}
|
};
|
||||||
|
|
||||||
export type ErrorResponse = {
|
export type ErrorResponse = {
|
||||||
success: false;
|
success: false;
|
||||||
@ -477,7 +480,7 @@ export interface ScrapeResponseRequestTest {
|
|||||||
|
|
||||||
export interface URLTrace {
|
export interface URLTrace {
|
||||||
url: string;
|
url: string;
|
||||||
status: 'mapped' | 'scraped' | 'error';
|
status: "mapped" | "scraped" | "error";
|
||||||
timing: {
|
timing: {
|
||||||
discoveredAt: string;
|
discoveredAt: string;
|
||||||
scrapedAt?: string;
|
scrapedAt?: string;
|
||||||
@ -785,9 +788,18 @@ export function toLegacyDocument(
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export const searchRequestSchema = z.object({
|
export const searchRequestSchema = z
|
||||||
|
.object({
|
||||||
query: z.string(),
|
query: z.string(),
|
||||||
limit: z.number().int().positive().finite().safe().max(10).optional().default(5),
|
limit: z
|
||||||
|
.number()
|
||||||
|
.int()
|
||||||
|
.positive()
|
||||||
|
.finite()
|
||||||
|
.safe()
|
||||||
|
.max(10)
|
||||||
|
.optional()
|
||||||
|
.default(5),
|
||||||
tbs: z.string().optional(),
|
tbs: z.string().optional(),
|
||||||
filter: z.string().optional(),
|
filter: z.string().optional(),
|
||||||
lang: z.string().optional().default("en"),
|
lang: z.string().optional().default("en"),
|
||||||
@ -795,18 +807,27 @@ export const searchRequestSchema = z.object({
|
|||||||
location: z.string().optional(),
|
location: z.string().optional(),
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
timeout: z.number().int().positive().finite().safe().default(60000),
|
timeout: z.number().int().positive().finite().safe().default(60000),
|
||||||
scrapeOptions: scrapeOptions.extend({
|
scrapeOptions: scrapeOptions
|
||||||
formats: z.array(z.enum([
|
.extend({
|
||||||
|
formats: z
|
||||||
|
.array(
|
||||||
|
z.enum([
|
||||||
"markdown",
|
"markdown",
|
||||||
"html",
|
"html",
|
||||||
"rawHtml",
|
"rawHtml",
|
||||||
"links",
|
"links",
|
||||||
"screenshot",
|
"screenshot",
|
||||||
"screenshot@fullPage",
|
"screenshot@fullPage",
|
||||||
"extract"
|
"extract",
|
||||||
])).default([])
|
]),
|
||||||
}).default({}),
|
)
|
||||||
}).strict("Unrecognized key in body -- please review the v1 API documentation for request body changes");
|
.default([]),
|
||||||
|
})
|
||||||
|
.default({}),
|
||||||
|
})
|
||||||
|
.strict(
|
||||||
|
"Unrecognized key in body -- please review the v1 API documentation for request body changes",
|
||||||
|
);
|
||||||
|
|
||||||
export type SearchRequest = z.infer<typeof searchRequestSchema>;
|
export type SearchRequest = z.infer<typeof searchRequestSchema>;
|
||||||
|
|
||||||
|
@ -45,7 +45,10 @@ const serverAdapter = new ExpressAdapter();
|
|||||||
serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
|
serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
|
||||||
|
|
||||||
const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
|
const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
|
||||||
queues: [new BullAdapter(getScrapeQueue()), new BullAdapter(getExtractQueue())],
|
queues: [
|
||||||
|
new BullAdapter(getScrapeQueue()),
|
||||||
|
new BullAdapter(getExtractQueue()),
|
||||||
|
],
|
||||||
serverAdapter: serverAdapter,
|
serverAdapter: serverAdapter,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -1,91 +1,89 @@
|
|||||||
import { normalizeUrl, normalizeUrlOnlyHostname } from './canonical-url';
|
import { normalizeUrl, normalizeUrlOnlyHostname } from "./canonical-url";
|
||||||
|
|
||||||
describe('normalizeUrlOnlyHostname', () => {
|
describe("normalizeUrlOnlyHostname", () => {
|
||||||
it('should remove protocol and www from URL', () => {
|
it("should remove protocol and www from URL", () => {
|
||||||
const url = 'https://www.example.com';
|
const url = "https://www.example.com";
|
||||||
const expected = 'example.com';
|
const expected = "example.com";
|
||||||
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should remove only protocol if www is not present', () => {
|
it("should remove only protocol if www is not present", () => {
|
||||||
const url = 'https://example.com';
|
const url = "https://example.com";
|
||||||
const expected = 'example.com';
|
const expected = "example.com";
|
||||||
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should handle URLs without protocol', () => {
|
it("should handle URLs without protocol", () => {
|
||||||
const url = 'www.example.com';
|
const url = "www.example.com";
|
||||||
const expected = 'example.com';
|
const expected = "example.com";
|
||||||
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should handle URLs without protocol and www', () => {
|
it("should handle URLs without protocol and www", () => {
|
||||||
const url = 'example.com';
|
const url = "example.com";
|
||||||
const expected = 'example.com';
|
const expected = "example.com";
|
||||||
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should handle URLs with paths', () => {
|
it("should handle URLs with paths", () => {
|
||||||
const url = 'https://www.example.com/path/to/resource';
|
const url = "https://www.example.com/path/to/resource";
|
||||||
const expected = 'example.com';
|
const expected = "example.com";
|
||||||
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should handle invalid URLs gracefully', () => {
|
it("should handle invalid URLs gracefully", () => {
|
||||||
const url = 'not a valid url';
|
const url = "not a valid url";
|
||||||
const expected = 'not a valid url';
|
const expected = "not a valid url";
|
||||||
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe("normalizeUrl", () => {
|
||||||
|
it("should remove protocol and www from URL", () => {
|
||||||
describe('normalizeUrl', () => {
|
const url = "https://www.example.com";
|
||||||
it('should remove protocol and www from URL', () => {
|
const expected = "example.com";
|
||||||
const url = 'https://www.example.com';
|
|
||||||
const expected = 'example.com';
|
|
||||||
expect(normalizeUrl(url)).toBe(expected);
|
expect(normalizeUrl(url)).toBe(expected);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should remove only protocol if www is not present', () => {
|
it("should remove only protocol if www is not present", () => {
|
||||||
const url = 'https://example.com';
|
const url = "https://example.com";
|
||||||
const expected = 'example.com';
|
const expected = "example.com";
|
||||||
expect(normalizeUrl(url)).toBe(expected);
|
expect(normalizeUrl(url)).toBe(expected);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should handle URLs without protocol', () => {
|
it("should handle URLs without protocol", () => {
|
||||||
const url = 'www.example.com';
|
const url = "www.example.com";
|
||||||
const expected = 'example.com';
|
const expected = "example.com";
|
||||||
expect(normalizeUrl(url)).toBe(expected);
|
expect(normalizeUrl(url)).toBe(expected);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should handle URLs without protocol and www', () => {
|
it("should handle URLs without protocol and www", () => {
|
||||||
const url = 'example.com';
|
const url = "example.com";
|
||||||
const expected = 'example.com';
|
const expected = "example.com";
|
||||||
expect(normalizeUrl(url)).toBe(expected);
|
expect(normalizeUrl(url)).toBe(expected);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should handle URLs with paths', () => {
|
it("should handle URLs with paths", () => {
|
||||||
const url = 'https://www.example.com/path/to/resource';
|
const url = "https://www.example.com/path/to/resource";
|
||||||
const expected = 'example.com/path/to/resource';
|
const expected = "example.com/path/to/resource";
|
||||||
expect(normalizeUrl(url)).toBe(expected);
|
expect(normalizeUrl(url)).toBe(expected);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should handle URLs with trailing slash', () => {
|
it("should handle URLs with trailing slash", () => {
|
||||||
const url = 'https://www.example.com/';
|
const url = "https://www.example.com/";
|
||||||
const expected = 'example.com';
|
const expected = "example.com";
|
||||||
expect(normalizeUrl(url)).toBe(expected);
|
expect(normalizeUrl(url)).toBe(expected);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should handle URLs with trailing slash and path', () => {
|
it("should handle URLs with trailing slash and path", () => {
|
||||||
const url = 'https://www.example.com/path/';
|
const url = "https://www.example.com/path/";
|
||||||
const expected = 'example.com/path';
|
const expected = "example.com/path";
|
||||||
expect(normalizeUrl(url)).toBe(expected);
|
expect(normalizeUrl(url)).toBe(expected);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should handle invalid URLs gracefully', () => {
|
it("should handle invalid URLs gracefully", () => {
|
||||||
const url = 'not a valid url';
|
const url = "not a valid url";
|
||||||
const expected = 'not a valid url';
|
const expected = "not a valid url";
|
||||||
expect(normalizeUrl(url)).toBe(expected);
|
expect(normalizeUrl(url)).toBe(expected);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
@ -322,7 +322,7 @@ export async function lockURLs(
|
|||||||
export async function lockURLsIndividually(
|
export async function lockURLsIndividually(
|
||||||
id: string,
|
id: string,
|
||||||
sc: StoredCrawl,
|
sc: StoredCrawl,
|
||||||
jobs: { id: string; url: string; }[],
|
jobs: { id: string; url: string }[],
|
||||||
) {
|
) {
|
||||||
const out: typeof jobs = [];
|
const out: typeof jobs = [];
|
||||||
|
|
||||||
|
@ -6,6 +6,4 @@ export const extractConfig = {
|
|||||||
MIN_REQUIRED_LINKS: 1,
|
MIN_REQUIRED_LINKS: 1,
|
||||||
};
|
};
|
||||||
|
|
||||||
export const CUSTOM_U_TEAMS = [
|
export const CUSTOM_U_TEAMS = ["874d40cc-a5c0-4e93-b661-9ddfbad5e51e"];
|
||||||
"874d40cc-a5c0-4e93-b661-9ddfbad5e51e"
|
|
||||||
]
|
|
||||||
|
@ -21,14 +21,19 @@ export async function getExtract(id: string): Promise<StoredExtract | null> {
|
|||||||
return x ? JSON.parse(x) : null;
|
return x ? JSON.parse(x) : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function updateExtract(id: string, extract: Partial<StoredExtract>) {
|
export async function updateExtract(
|
||||||
|
id: string,
|
||||||
|
extract: Partial<StoredExtract>,
|
||||||
|
) {
|
||||||
const current = await getExtract(id);
|
const current = await getExtract(id);
|
||||||
if (!current) return;
|
if (!current) return;
|
||||||
await redisConnection.set("extract:" + id, JSON.stringify({ ...current, ...extract }));
|
await redisConnection.set(
|
||||||
|
"extract:" + id,
|
||||||
|
JSON.stringify({ ...current, ...extract }),
|
||||||
|
);
|
||||||
await redisConnection.expire("extract:" + id, 24 * 60 * 60, "NX");
|
await redisConnection.expire("extract:" + id, 24 * 60 * 60, "NX");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
export async function getExtractExpiry(id: string): Promise<Date> {
|
export async function getExtractExpiry(id: string): Promise<Date> {
|
||||||
const d = new Date();
|
const d = new Date();
|
||||||
const ttl = await redisConnection.pttl("extract:" + id);
|
const ttl = await redisConnection.pttl("extract:" + id);
|
||||||
|
@ -40,14 +40,18 @@ function getRootDomain(url: string): string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function performExtraction(extractId: string, options: ExtractServiceOptions): Promise<ExtractResult> {
|
export async function performExtraction(
|
||||||
|
extractId: string,
|
||||||
|
options: ExtractServiceOptions,
|
||||||
|
): Promise<ExtractResult> {
|
||||||
const { request, teamId, plan, subId } = options;
|
const { request, teamId, plan, subId } = options;
|
||||||
const urlTraces: URLTrace[] = [];
|
const urlTraces: URLTrace[] = [];
|
||||||
let docs: Document[] = [];
|
let docs: Document[] = [];
|
||||||
|
|
||||||
// Process URLs
|
// Process URLs
|
||||||
const urlPromises = request.urls.map(url =>
|
const urlPromises = request.urls.map((url) =>
|
||||||
processUrl({
|
processUrl(
|
||||||
|
{
|
||||||
url,
|
url,
|
||||||
prompt: request.prompt,
|
prompt: request.prompt,
|
||||||
teamId,
|
teamId,
|
||||||
@ -56,16 +60,19 @@ export async function performExtraction(extractId: string, options: ExtractServi
|
|||||||
origin: request.origin,
|
origin: request.origin,
|
||||||
limit: request.limit,
|
limit: request.limit,
|
||||||
includeSubdomains: request.includeSubdomains,
|
includeSubdomains: request.includeSubdomains,
|
||||||
}, urlTraces)
|
},
|
||||||
|
urlTraces,
|
||||||
|
),
|
||||||
);
|
);
|
||||||
|
|
||||||
const processedUrls = await Promise.all(urlPromises);
|
const processedUrls = await Promise.all(urlPromises);
|
||||||
const links = processedUrls.flat().filter(url => url);
|
const links = processedUrls.flat().filter((url) => url);
|
||||||
|
|
||||||
if (links.length === 0) {
|
if (links.length === 0) {
|
||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
|
error:
|
||||||
|
"No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
|
||||||
extractId,
|
extractId,
|
||||||
urlTrace: urlTraces,
|
urlTrace: urlTraces,
|
||||||
};
|
};
|
||||||
@ -73,14 +80,17 @@ export async function performExtraction(extractId: string, options: ExtractServi
|
|||||||
|
|
||||||
// Scrape documents
|
// Scrape documents
|
||||||
const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
|
const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
|
||||||
const scrapePromises = links.map(url =>
|
const scrapePromises = links.map((url) =>
|
||||||
scrapeDocument({
|
scrapeDocument(
|
||||||
|
{
|
||||||
url,
|
url,
|
||||||
teamId,
|
teamId,
|
||||||
plan,
|
plan,
|
||||||
origin: request.origin || "api",
|
origin: request.origin || "api",
|
||||||
timeout,
|
timeout,
|
||||||
}, urlTraces)
|
},
|
||||||
|
urlTraces,
|
||||||
|
),
|
||||||
);
|
);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@ -114,13 +124,16 @@ export async function performExtraction(extractId: string, options: ExtractServi
|
|||||||
|
|
||||||
// Update token usage in traces
|
// Update token usage in traces
|
||||||
if (completions.numTokens) {
|
if (completions.numTokens) {
|
||||||
const totalLength = docs.reduce((sum, doc) => sum + (doc.markdown?.length || 0), 0);
|
const totalLength = docs.reduce(
|
||||||
|
(sum, doc) => sum + (doc.markdown?.length || 0),
|
||||||
|
0,
|
||||||
|
);
|
||||||
docs.forEach((doc) => {
|
docs.forEach((doc) => {
|
||||||
if (doc.metadata?.sourceURL) {
|
if (doc.metadata?.sourceURL) {
|
||||||
const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL);
|
const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL);
|
||||||
if (trace && trace.contentStats) {
|
if (trace && trace.contentStats) {
|
||||||
trace.contentStats.tokensUsed = Math.floor(
|
trace.contentStats.tokensUsed = Math.floor(
|
||||||
((doc.markdown?.length || 0) / totalLength) * completions.numTokens
|
((doc.markdown?.length || 0) / totalLength) * completions.numTokens,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -213,12 +226,12 @@ export async function performExtraction(extractId: string, options: ExtractServi
|
|||||||
updateExtract(extractId, {
|
updateExtract(extractId, {
|
||||||
status: "completed",
|
status: "completed",
|
||||||
}).catch((error) => {
|
}).catch((error) => {
|
||||||
logger.error(`Failed to update extract ${extractId} status to completed: ${error}`);
|
logger.error(
|
||||||
|
`Failed to update extract ${extractId} status to completed: ${error}`,
|
||||||
|
);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
data: completions.extract ?? {},
|
data: completions.extract ?? {},
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
import { Pinecone } from '@pinecone-database/pinecone';
|
import { Pinecone } from "@pinecone-database/pinecone";
|
||||||
import { Document } from '../../../controllers/v1/types';
|
import { Document } from "../../../controllers/v1/types";
|
||||||
import { logger } from '../../logger';
|
import { logger } from "../../logger";
|
||||||
import OpenAI from "openai";
|
import OpenAI from "openai";
|
||||||
|
|
||||||
const openai = new OpenAI({
|
const openai = new OpenAI({
|
||||||
@ -48,34 +48,43 @@ export async function indexPage({
|
|||||||
document,
|
document,
|
||||||
originUrl,
|
originUrl,
|
||||||
crawlId,
|
crawlId,
|
||||||
teamId
|
teamId,
|
||||||
}: {
|
}: {
|
||||||
document: Document;
|
document: Document;
|
||||||
originUrl: string;
|
originUrl: string;
|
||||||
crawlId?: string;
|
crawlId?: string;
|
||||||
teamId?: string;
|
teamId?: string;
|
||||||
}
|
}) {
|
||||||
) {
|
|
||||||
try {
|
try {
|
||||||
const index = pinecone.index(INDEX_NAME);
|
const index = pinecone.index(INDEX_NAME);
|
||||||
|
|
||||||
// Trim markdown if it's too long
|
// Trim markdown if it's too long
|
||||||
let trimmedMarkdown = document.markdown;
|
let trimmedMarkdown = document.markdown;
|
||||||
if (trimmedMarkdown && Buffer.byteLength(trimmedMarkdown, 'utf-8') > MAX_METADATA_SIZE) {
|
if (
|
||||||
trimmedMarkdown = trimmedMarkdown.slice(0, Math.floor(MAX_METADATA_SIZE / 2)); // Using half the size to be safe with UTF-8 encoding
|
trimmedMarkdown &&
|
||||||
|
Buffer.byteLength(trimmedMarkdown, "utf-8") > MAX_METADATA_SIZE
|
||||||
|
) {
|
||||||
|
trimmedMarkdown = trimmedMarkdown.slice(
|
||||||
|
0,
|
||||||
|
Math.floor(MAX_METADATA_SIZE / 2),
|
||||||
|
); // Using half the size to be safe with UTF-8 encoding
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create text to embed
|
// Create text to embed
|
||||||
const textToEmbed = [
|
const textToEmbed = [
|
||||||
document.metadata.title,
|
document.metadata.title,
|
||||||
document.metadata.description,
|
document.metadata.description,
|
||||||
trimmedMarkdown
|
trimmedMarkdown,
|
||||||
].filter(Boolean).join('\n\n');
|
]
|
||||||
|
.filter(Boolean)
|
||||||
|
.join("\n\n");
|
||||||
|
|
||||||
// Get embedding from OpenAI
|
// Get embedding from OpenAI
|
||||||
const embedding = await getEmbedding(textToEmbed);
|
const embedding = await getEmbedding(textToEmbed);
|
||||||
|
|
||||||
const normalizedUrl = normalizeUrl(document.metadata.sourceURL || document.metadata.url!);
|
const normalizedUrl = normalizeUrl(
|
||||||
|
document.metadata.sourceURL || document.metadata.url!,
|
||||||
|
);
|
||||||
|
|
||||||
// Prepare metadata
|
// Prepare metadata
|
||||||
const metadata: PageMetadata = {
|
const metadata: PageMetadata = {
|
||||||
@ -86,29 +95,30 @@ export async function indexPage({
|
|||||||
crawlId,
|
crawlId,
|
||||||
teamId,
|
teamId,
|
||||||
markdown: trimmedMarkdown,
|
markdown: trimmedMarkdown,
|
||||||
timestamp: Date.now()
|
timestamp: Date.now(),
|
||||||
};
|
};
|
||||||
|
|
||||||
// Upsert to Pinecone
|
// Upsert to Pinecone
|
||||||
await index.upsert([{
|
await index.upsert([
|
||||||
|
{
|
||||||
id: normalizedUrl,
|
id: normalizedUrl,
|
||||||
values: embedding,
|
values: embedding,
|
||||||
metadata: {
|
metadata: {
|
||||||
...metadata,
|
...metadata,
|
||||||
[document.metadata.sourceURL || document.metadata.url!]: true
|
[document.metadata.sourceURL || document.metadata.url!]: true,
|
||||||
}
|
},
|
||||||
}]);
|
},
|
||||||
|
]);
|
||||||
|
|
||||||
logger.debug('Successfully indexed page in Pinecone', {
|
logger.debug("Successfully indexed page in Pinecone", {
|
||||||
url: metadata.url,
|
url: metadata.url,
|
||||||
crawlId
|
crawlId,
|
||||||
});
|
});
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error('Failed to index page in Pinecone', {
|
logger.error("Failed to index page in Pinecone", {
|
||||||
error,
|
error,
|
||||||
url: document.metadata.sourceURL || document.metadata.url,
|
url: document.metadata.sourceURL || document.metadata.url,
|
||||||
crawlId
|
crawlId,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -116,7 +126,7 @@ export async function indexPage({
|
|||||||
export async function searchSimilarPages(
|
export async function searchSimilarPages(
|
||||||
query: string,
|
query: string,
|
||||||
originUrl?: string,
|
originUrl?: string,
|
||||||
limit: number = 10
|
limit: number = 10,
|
||||||
) {
|
) {
|
||||||
try {
|
try {
|
||||||
const index = pinecone.index(INDEX_NAME);
|
const index = pinecone.index(INDEX_NAME);
|
||||||
@ -127,31 +137,30 @@ export async function searchSimilarPages(
|
|||||||
const queryParams: any = {
|
const queryParams: any = {
|
||||||
vector: queryEmbedding,
|
vector: queryEmbedding,
|
||||||
topK: limit,
|
topK: limit,
|
||||||
includeMetadata: true
|
includeMetadata: true,
|
||||||
};
|
};
|
||||||
|
|
||||||
const normalizedOriginUrl = originUrl ? normalizeUrl(originUrl) : undefined;
|
const normalizedOriginUrl = originUrl ? normalizeUrl(originUrl) : undefined;
|
||||||
// Add filter if originUrl is provided
|
// Add filter if originUrl is provided
|
||||||
if (normalizedOriginUrl) {
|
if (normalizedOriginUrl) {
|
||||||
queryParams.filter = {
|
queryParams.filter = {
|
||||||
originUrl: { $eq: normalizedOriginUrl }
|
originUrl: { $eq: normalizedOriginUrl },
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const results = await index.query(queryParams);
|
const results = await index.query(queryParams);
|
||||||
return results.matches.map(match => ({
|
return results.matches.map((match) => ({
|
||||||
url: match.metadata?.url,
|
url: match.metadata?.url,
|
||||||
title: match.metadata?.title,
|
title: match.metadata?.title,
|
||||||
description: match.metadata?.description,
|
description: match.metadata?.description,
|
||||||
score: match.score,
|
score: match.score,
|
||||||
markdown: match.metadata?.markdown
|
markdown: match.metadata?.markdown,
|
||||||
}));
|
}));
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error('Failed to search similar pages in Pinecone', {
|
logger.error("Failed to search similar pages in Pinecone", {
|
||||||
error,
|
error,
|
||||||
query,
|
query,
|
||||||
originUrl
|
originUrl,
|
||||||
});
|
});
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
@ -9,8 +9,6 @@ const cohere = new CohereClient({
|
|||||||
token: process.env.COHERE_API_KEY,
|
token: process.env.COHERE_API_KEY,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
interface RankingResult {
|
interface RankingResult {
|
||||||
mappedLinks: MapDocument[];
|
mappedLinks: MapDocument[];
|
||||||
linksAndScores: {
|
linksAndScores: {
|
||||||
@ -59,7 +57,6 @@ export async function rerankLinks(
|
|||||||
searchQuery,
|
searchQuery,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
// First try with high threshold
|
// First try with high threshold
|
||||||
let filteredLinks = filterAndProcessLinks(
|
let filteredLinks = filterAndProcessLinks(
|
||||||
mappedLinks,
|
mappedLinks,
|
||||||
@ -67,8 +64,6 @@ export async function rerankLinks(
|
|||||||
extractConfig.INITIAL_SCORE_THRESHOLD,
|
extractConfig.INITIAL_SCORE_THRESHOLD,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// If we don't have enough high-quality links, try with lower threshold
|
// If we don't have enough high-quality links, try with lower threshold
|
||||||
if (filteredLinks.length < extractConfig.MIN_REQUIRED_LINKS) {
|
if (filteredLinks.length < extractConfig.MIN_REQUIRED_LINKS) {
|
||||||
logger.info(
|
logger.info(
|
||||||
@ -102,7 +97,7 @@ export async function rerankLinks(
|
|||||||
if (trace) {
|
if (trace) {
|
||||||
trace.relevanceScore = score.score;
|
trace.relevanceScore = score.score;
|
||||||
// If URL didn't make it through filtering, mark it as filtered out
|
// If URL didn't make it through filtering, mark it as filtered out
|
||||||
if (!filteredLinks.some(link => link.url === score.link)) {
|
if (!filteredLinks.some((link) => link.url === score.link)) {
|
||||||
trace.warning = `Relevance score ${score.score} below threshold`;
|
trace.warning = `Relevance score ${score.score} below threshold`;
|
||||||
trace.usedInCompletion = false;
|
trace.usedInCompletion = false;
|
||||||
}
|
}
|
||||||
@ -112,18 +107,18 @@ export async function rerankLinks(
|
|||||||
const rankedLinks = filteredLinks.slice(0, extractConfig.MAX_RANKING_LIMIT);
|
const rankedLinks = filteredLinks.slice(0, extractConfig.MAX_RANKING_LIMIT);
|
||||||
|
|
||||||
// Mark URLs that will be used in completion
|
// Mark URLs that will be used in completion
|
||||||
rankedLinks.forEach(link => {
|
rankedLinks.forEach((link) => {
|
||||||
const trace = urlTraces.find(t => t.url === link.url);
|
const trace = urlTraces.find((t) => t.url === link.url);
|
||||||
if (trace) {
|
if (trace) {
|
||||||
trace.usedInCompletion = true;
|
trace.usedInCompletion = true;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Mark URLs that were dropped due to ranking limit
|
// Mark URLs that were dropped due to ranking limit
|
||||||
filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach(link => {
|
filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach((link) => {
|
||||||
const trace = urlTraces.find(t => t.url === link.url);
|
const trace = urlTraces.find((t) => t.url === link.url);
|
||||||
if (trace) {
|
if (trace) {
|
||||||
trace.warning = 'Excluded due to ranking limit';
|
trace.warning = "Excluded due to ranking limit";
|
||||||
trace.usedInCompletion = false;
|
trace.usedInCompletion = false;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -20,10 +20,13 @@ interface ProcessUrlOptions {
|
|||||||
includeSubdomains?: boolean;
|
includeSubdomains?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace[]): Promise<string[]> {
|
export async function processUrl(
|
||||||
|
options: ProcessUrlOptions,
|
||||||
|
urlTraces: URLTrace[],
|
||||||
|
): Promise<string[]> {
|
||||||
const trace: URLTrace = {
|
const trace: URLTrace = {
|
||||||
url: options.url,
|
url: options.url,
|
||||||
status: 'mapped',
|
status: "mapped",
|
||||||
timing: {
|
timing: {
|
||||||
discoveredAt: new Date().toISOString(),
|
discoveredAt: new Date().toISOString(),
|
||||||
},
|
},
|
||||||
@ -35,8 +38,8 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
|
|||||||
trace.usedInCompletion = true;
|
trace.usedInCompletion = true;
|
||||||
return [options.url];
|
return [options.url];
|
||||||
}
|
}
|
||||||
trace.status = 'error';
|
trace.status = "error";
|
||||||
trace.error = 'URL is blocked';
|
trace.error = "URL is blocked";
|
||||||
trace.usedInCompletion = false;
|
trace.usedInCompletion = false;
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
@ -46,9 +49,10 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
|
|||||||
|
|
||||||
let rephrasedPrompt = options.prompt;
|
let rephrasedPrompt = options.prompt;
|
||||||
if (options.prompt) {
|
if (options.prompt) {
|
||||||
rephrasedPrompt = await generateBasicCompletion(
|
rephrasedPrompt =
|
||||||
buildRefrasedPrompt(options.prompt, baseUrl)
|
(await generateBasicCompletion(
|
||||||
) ?? options.prompt;
|
buildRefrasedPrompt(options.prompt, baseUrl),
|
||||||
|
)) ?? options.prompt;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@ -70,11 +74,11 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
|
|||||||
let uniqueUrls = removeDuplicateUrls(allUrls);
|
let uniqueUrls = removeDuplicateUrls(allUrls);
|
||||||
|
|
||||||
// Track all discovered URLs
|
// Track all discovered URLs
|
||||||
uniqueUrls.forEach(discoveredUrl => {
|
uniqueUrls.forEach((discoveredUrl) => {
|
||||||
if (!urlTraces.some(t => t.url === discoveredUrl)) {
|
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
|
||||||
urlTraces.push({
|
urlTraces.push({
|
||||||
url: discoveredUrl,
|
url: discoveredUrl,
|
||||||
status: 'mapped',
|
status: "mapped",
|
||||||
timing: {
|
timing: {
|
||||||
discoveredAt: new Date().toISOString(),
|
discoveredAt: new Date().toISOString(),
|
||||||
},
|
},
|
||||||
@ -102,12 +106,12 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
|
|||||||
uniqueUrls = removeDuplicateUrls(allUrls);
|
uniqueUrls = removeDuplicateUrls(allUrls);
|
||||||
|
|
||||||
// Track all discovered URLs
|
// Track all discovered URLs
|
||||||
uniqueUrls.forEach(discoveredUrl => {
|
uniqueUrls.forEach((discoveredUrl) => {
|
||||||
if (!urlTraces.some(t => t.url === discoveredUrl)) {
|
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
|
||||||
urlTraces.push({
|
urlTraces.push({
|
||||||
url: discoveredUrl,
|
url: discoveredUrl,
|
||||||
status: 'mapped',
|
status: "mapped",
|
||||||
warning: 'Broader search. Not limiting map results to prompt.',
|
warning: "Broader search. Not limiting map results to prompt.",
|
||||||
timing: {
|
timing: {
|
||||||
discoveredAt: new Date().toISOString(),
|
discoveredAt: new Date().toISOString(),
|
||||||
},
|
},
|
||||||
@ -118,11 +122,11 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Track all discovered URLs
|
// Track all discovered URLs
|
||||||
uniqueUrls.forEach(discoveredUrl => {
|
uniqueUrls.forEach((discoveredUrl) => {
|
||||||
if (!urlTraces.some(t => t.url === discoveredUrl)) {
|
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
|
||||||
urlTraces.push({
|
urlTraces.push({
|
||||||
url: discoveredUrl,
|
url: discoveredUrl,
|
||||||
status: 'mapped',
|
status: "mapped",
|
||||||
timing: {
|
timing: {
|
||||||
discoveredAt: new Date().toISOString(),
|
discoveredAt: new Date().toISOString(),
|
||||||
},
|
},
|
||||||
@ -155,9 +159,9 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
|
|||||||
mappedLinks = await rerankLinks(mappedLinks, searchQuery, urlTraces);
|
mappedLinks = await rerankLinks(mappedLinks, searchQuery, urlTraces);
|
||||||
}
|
}
|
||||||
|
|
||||||
return mappedLinks.map(x => x.url);
|
return mappedLinks.map((x) => x.url);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
trace.status = 'error';
|
trace.status = "error";
|
||||||
trace.error = error.message;
|
trace.error = error.message;
|
||||||
trace.usedInCompletion = false;
|
trace.usedInCompletion = false;
|
||||||
return [];
|
return [];
|
||||||
|
@ -42,11 +42,18 @@ export const logger = winston.createLogger({
|
|||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
transports: [
|
transports: [
|
||||||
...(process.env.FIRECRAWL_LOG_TO_FILE ? [
|
...(process.env.FIRECRAWL_LOG_TO_FILE
|
||||||
|
? [
|
||||||
new winston.transports.File({
|
new winston.transports.File({
|
||||||
filename: "firecrawl-" + (process.argv[1].includes("worker") ? "worker" : "app") + "-" + crypto.randomUUID() + ".log",
|
filename:
|
||||||
})
|
"firecrawl-" +
|
||||||
] : []),
|
(process.argv[1].includes("worker") ? "worker" : "app") +
|
||||||
|
"-" +
|
||||||
|
crypto.randomUUID() +
|
||||||
|
".log",
|
||||||
|
}),
|
||||||
|
]
|
||||||
|
: []),
|
||||||
new winston.transports.Console({
|
new winston.transports.Console({
|
||||||
format: winston.format.combine(
|
format: winston.format.combine(
|
||||||
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
|
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
|
||||||
|
@ -179,7 +179,6 @@ export async function runWebScraper({
|
|||||||
return response;
|
return response;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => {
|
billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => {
|
||||||
logger.error(
|
logger.error(
|
||||||
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
|
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
|
||||||
|
@ -192,7 +192,8 @@ v1Router.get(
|
|||||||
wrap((req: any, res): any => crawlStatusController(req, res, true)),
|
wrap((req: any, res): any => crawlStatusController(req, res, true)),
|
||||||
);
|
);
|
||||||
|
|
||||||
v1Router.get("/scrape/:jobId",
|
v1Router.get(
|
||||||
|
"/scrape/:jobId",
|
||||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||||
wrap(scrapeStatusController),
|
wrap(scrapeStatusController),
|
||||||
);
|
);
|
||||||
@ -242,6 +243,3 @@ v1Router.get(
|
|||||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||||
wrap(creditUsageController),
|
wrap(creditUsageController),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -219,18 +219,29 @@ export class WebCrawler {
|
|||||||
const _urlsHandler = async (urls: string[]) => {
|
const _urlsHandler = async (urls: string[]) => {
|
||||||
let uniqueURLs: string[] = [];
|
let uniqueURLs: string[] = [];
|
||||||
for (const url of urls) {
|
for (const url of urls) {
|
||||||
if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(url))) {
|
if (
|
||||||
|
await redisConnection.sadd(
|
||||||
|
"sitemap:" + this.jobId + ":links",
|
||||||
|
normalizeUrl(url),
|
||||||
|
)
|
||||||
|
) {
|
||||||
uniqueURLs.push(url);
|
uniqueURLs.push(url);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
await redisConnection.expire("sitemap:" + this.jobId + ":links", 3600, "NX");
|
await redisConnection.expire(
|
||||||
|
"sitemap:" + this.jobId + ":links",
|
||||||
|
3600,
|
||||||
|
"NX",
|
||||||
|
);
|
||||||
if (uniqueURLs.length > 0) {
|
if (uniqueURLs.length > 0) {
|
||||||
urlsHandler(uniqueURLs);
|
urlsHandler(uniqueURLs);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let count = await this.tryFetchSitemapLinks(this.initialUrl, (urls: string[]) => {
|
let count = await this.tryFetchSitemapLinks(
|
||||||
|
this.initialUrl,
|
||||||
|
(urls: string[]) => {
|
||||||
if (fromMap && onlySitemap) {
|
if (fromMap && onlySitemap) {
|
||||||
return urlsHandler(urls);
|
return urlsHandler(urls);
|
||||||
} else {
|
} else {
|
||||||
@ -243,10 +254,16 @@ export class WebCrawler {
|
|||||||
leftOfLimit -= filteredLinks.length;
|
leftOfLimit -= filteredLinks.length;
|
||||||
return _urlsHandler(filteredLinks);
|
return _urlsHandler(filteredLinks);
|
||||||
}
|
}
|
||||||
});
|
},
|
||||||
|
);
|
||||||
|
|
||||||
if (count > 0) {
|
if (count > 0) {
|
||||||
if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(this.initialUrl))) {
|
if (
|
||||||
|
await redisConnection.sadd(
|
||||||
|
"sitemap:" + this.jobId + ":links",
|
||||||
|
normalizeUrl(this.initialUrl),
|
||||||
|
)
|
||||||
|
) {
|
||||||
urlsHandler([this.initialUrl]);
|
urlsHandler([this.initialUrl]);
|
||||||
}
|
}
|
||||||
count++;
|
count++;
|
||||||
@ -470,8 +487,13 @@ export class WebCrawler {
|
|||||||
return socialMediaOrEmail.some((ext) => url.includes(ext));
|
return socialMediaOrEmail.some((ext) => url.includes(ext));
|
||||||
}
|
}
|
||||||
|
|
||||||
private async tryFetchSitemapLinks(url: string, urlsHandler: (urls: string[]) => unknown): Promise<number> {
|
private async tryFetchSitemapLinks(
|
||||||
const sitemapUrl = url.endsWith(".xml") ? url : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
|
url: string,
|
||||||
|
urlsHandler: (urls: string[]) => unknown,
|
||||||
|
): Promise<number> {
|
||||||
|
const sitemapUrl = url.endsWith(".xml")
|
||||||
|
? url
|
||||||
|
: `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
|
||||||
|
|
||||||
let sitemapCount: number = 0;
|
let sitemapCount: number = 0;
|
||||||
|
|
||||||
@ -482,37 +504,43 @@ export class WebCrawler {
|
|||||||
this.logger,
|
this.logger,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.debug(
|
this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
|
||||||
`Failed to fetch sitemap from ${sitemapUrl}`,
|
method: "tryFetchSitemapLinks",
|
||||||
{ method: "tryFetchSitemapLinks", sitemapUrl, error },
|
sitemapUrl,
|
||||||
);
|
error,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// If this is a subdomain, also try to get sitemap from the main domain
|
// If this is a subdomain, also try to get sitemap from the main domain
|
||||||
try {
|
try {
|
||||||
const urlObj = new URL(url);
|
const urlObj = new URL(url);
|
||||||
const hostname = urlObj.hostname;
|
const hostname = urlObj.hostname;
|
||||||
const domainParts = hostname.split('.');
|
const domainParts = hostname.split(".");
|
||||||
|
|
||||||
// Check if this is a subdomain (has more than 2 parts and not www)
|
// Check if this is a subdomain (has more than 2 parts and not www)
|
||||||
if (domainParts.length > 2 && domainParts[0] !== 'www') {
|
if (domainParts.length > 2 && domainParts[0] !== "www") {
|
||||||
// Get the main domain by taking the last two parts
|
// Get the main domain by taking the last two parts
|
||||||
const mainDomain = domainParts.slice(-2).join('.');
|
const mainDomain = domainParts.slice(-2).join(".");
|
||||||
const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
|
const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
|
||||||
const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;
|
const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Get all links from the main domain's sitemap
|
// Get all links from the main domain's sitemap
|
||||||
sitemapCount += await getLinksFromSitemap(
|
sitemapCount += await getLinksFromSitemap(
|
||||||
{ sitemapUrl: mainDomainSitemapUrl, urlsHandler(urls) {
|
{
|
||||||
return urlsHandler(urls.filter(link => {
|
sitemapUrl: mainDomainSitemapUrl,
|
||||||
|
urlsHandler(urls) {
|
||||||
|
return urlsHandler(
|
||||||
|
urls.filter((link) => {
|
||||||
try {
|
try {
|
||||||
const linkUrl = new URL(link);
|
const linkUrl = new URL(link);
|
||||||
return linkUrl.hostname.endsWith(hostname);
|
return linkUrl.hostname.endsWith(hostname);
|
||||||
} catch {
|
} catch {}
|
||||||
}
|
}),
|
||||||
}))
|
);
|
||||||
}, mode: "fire-engine" },
|
},
|
||||||
|
mode: "fire-engine",
|
||||||
|
},
|
||||||
this.logger,
|
this.logger,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
@ -15,7 +15,7 @@ export async function getLinksFromSitemap(
|
|||||||
mode = "axios",
|
mode = "axios",
|
||||||
}: {
|
}: {
|
||||||
sitemapUrl: string;
|
sitemapUrl: string;
|
||||||
urlsHandler(urls: string[]): unknown,
|
urlsHandler(urls: string[]): unknown;
|
||||||
mode?: "axios" | "fire-engine";
|
mode?: "axios" | "fire-engine";
|
||||||
},
|
},
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
@ -31,7 +31,10 @@ export async function getLinksFromSitemap(
|
|||||||
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
|
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
|
||||||
);
|
);
|
||||||
if (!response.success) {
|
if (!response.success) {
|
||||||
logger.debug("Failed to scrape sitemap via TLSClient, falling back to axios...", { error: response.error })
|
logger.debug(
|
||||||
|
"Failed to scrape sitemap via TLSClient, falling back to axios...",
|
||||||
|
{ error: response.error },
|
||||||
|
);
|
||||||
const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||||
content = ar.data;
|
content = ar.data;
|
||||||
} else {
|
} else {
|
||||||
@ -63,14 +66,11 @@ export async function getLinksFromSitemap(
|
|||||||
.map((sitemap) => sitemap.loc[0].trim());
|
.map((sitemap) => sitemap.loc[0].trim());
|
||||||
|
|
||||||
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
|
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
|
||||||
getLinksFromSitemap(
|
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger),
|
||||||
{ sitemapUrl, urlsHandler, mode },
|
|
||||||
logger,
|
|
||||||
),
|
|
||||||
);
|
);
|
||||||
|
|
||||||
const results = await Promise.all(sitemapPromises);
|
const results = await Promise.all(sitemapPromises);
|
||||||
count = results.reduce((a,x) => a + x)
|
count = results.reduce((a, x) => a + x);
|
||||||
} else if (root && root.url) {
|
} else if (root && root.url) {
|
||||||
// Check if any URLs point to additional sitemaps
|
// Check if any URLs point to additional sitemaps
|
||||||
const xmlSitemaps: string[] = root.url
|
const xmlSitemaps: string[] = root.url
|
||||||
@ -78,7 +78,7 @@ export async function getLinksFromSitemap(
|
|||||||
(url) =>
|
(url) =>
|
||||||
url.loc &&
|
url.loc &&
|
||||||
url.loc.length > 0 &&
|
url.loc.length > 0 &&
|
||||||
url.loc[0].trim().toLowerCase().endsWith('.xml')
|
url.loc[0].trim().toLowerCase().endsWith(".xml"),
|
||||||
)
|
)
|
||||||
.map((url) => url.loc[0].trim());
|
.map((url) => url.loc[0].trim());
|
||||||
|
|
||||||
@ -90,7 +90,10 @@ export async function getLinksFromSitemap(
|
|||||||
logger,
|
logger,
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
count += (await Promise.all(sitemapPromises)).reduce((a,x) => a + x, 0);
|
count += (await Promise.all(sitemapPromises)).reduce(
|
||||||
|
(a, x) => a + x,
|
||||||
|
0,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const validUrls = root.url
|
const validUrls = root.url
|
||||||
@ -98,7 +101,7 @@ export async function getLinksFromSitemap(
|
|||||||
(url) =>
|
(url) =>
|
||||||
url.loc &&
|
url.loc &&
|
||||||
url.loc.length > 0 &&
|
url.loc.length > 0 &&
|
||||||
!url.loc[0].trim().toLowerCase().endsWith('.xml') &&
|
!url.loc[0].trim().toLowerCase().endsWith(".xml") &&
|
||||||
!WebCrawler.prototype.isFile(url.loc[0].trim()),
|
!WebCrawler.prototype.isFile(url.loc[0].trim()),
|
||||||
)
|
)
|
||||||
.map((url) => url.loc[0].trim());
|
.map((url) => url.loc[0].trim());
|
||||||
|
@ -3,7 +3,10 @@ import { EngineScrapeResult } from "..";
|
|||||||
import { Meta } from "../..";
|
import { Meta } from "../..";
|
||||||
import { TimeoutError } from "../../error";
|
import { TimeoutError } from "../../error";
|
||||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||||
import { InsecureConnectionError, makeSecureDispatcher } from "../utils/safeFetch";
|
import {
|
||||||
|
InsecureConnectionError,
|
||||||
|
makeSecureDispatcher,
|
||||||
|
} from "../utils/safeFetch";
|
||||||
|
|
||||||
export async function scrapeURLWithFetch(
|
export async function scrapeURLWithFetch(
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
@ -20,7 +23,9 @@ export async function scrapeURLWithFetch(
|
|||||||
headers: meta.options.headers,
|
headers: meta.options.headers,
|
||||||
}),
|
}),
|
||||||
(async () => {
|
(async () => {
|
||||||
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
|
await new Promise((resolve) =>
|
||||||
|
setTimeout(() => resolve(null), timeout),
|
||||||
|
);
|
||||||
throw new TimeoutError(
|
throw new TimeoutError(
|
||||||
"Fetch was unable to scrape the page before timing out",
|
"Fetch was unable to scrape the page before timing out",
|
||||||
{ cause: { timeout } },
|
{ cause: { timeout } },
|
||||||
@ -28,7 +33,10 @@ export async function scrapeURLWithFetch(
|
|||||||
})(),
|
})(),
|
||||||
]);
|
]);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof TypeError && error.cause instanceof InsecureConnectionError) {
|
if (
|
||||||
|
error instanceof TypeError &&
|
||||||
|
error.cause instanceof InsecureConnectionError
|
||||||
|
) {
|
||||||
throw error.cause;
|
throw error.cause;
|
||||||
} else {
|
} else {
|
||||||
throw error;
|
throw error;
|
||||||
|
@ -3,7 +3,12 @@ import * as Sentry from "@sentry/node";
|
|||||||
import { z } from "zod";
|
import { z } from "zod";
|
||||||
|
|
||||||
import { robustFetch } from "../../lib/fetch";
|
import { robustFetch } from "../../lib/fetch";
|
||||||
import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error";
|
import {
|
||||||
|
ActionError,
|
||||||
|
EngineError,
|
||||||
|
SiteError,
|
||||||
|
UnsupportedFileError,
|
||||||
|
} from "../../error";
|
||||||
|
|
||||||
const successSchema = z.object({
|
const successSchema = z.object({
|
||||||
jobId: z.string(),
|
jobId: z.string(),
|
||||||
@ -37,10 +42,13 @@ const successSchema = z.object({
|
|||||||
.optional(),
|
.optional(),
|
||||||
|
|
||||||
// chrome-cdp only -- file download handler
|
// chrome-cdp only -- file download handler
|
||||||
file: z.object({
|
file: z
|
||||||
|
.object({
|
||||||
name: z.string(),
|
name: z.string(),
|
||||||
content: z.string(),
|
content: z.string(),
|
||||||
}).optional().or(z.null()),
|
})
|
||||||
|
.optional()
|
||||||
|
.or(z.null()),
|
||||||
});
|
});
|
||||||
|
|
||||||
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
|
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
|
||||||
@ -121,7 +129,9 @@ export async function fireEngineCheckStatus(
|
|||||||
typeof status.error === "string" &&
|
typeof status.error === "string" &&
|
||||||
status.error.includes("File size exceeds")
|
status.error.includes("File size exceeds")
|
||||||
) {
|
) {
|
||||||
throw new UnsupportedFileError("File size exceeds " + status.error.split("File size exceeds ")[1]);
|
throw new UnsupportedFileError(
|
||||||
|
"File size exceeds " + status.error.split("File size exceeds ")[1],
|
||||||
|
);
|
||||||
} else if (
|
} else if (
|
||||||
typeof status.error === "string" &&
|
typeof status.error === "string" &&
|
||||||
// TODO: improve this later
|
// TODO: improve this later
|
||||||
|
@ -13,7 +13,13 @@ import {
|
|||||||
FireEngineCheckStatusSuccess,
|
FireEngineCheckStatusSuccess,
|
||||||
StillProcessingError,
|
StillProcessingError,
|
||||||
} from "./checkStatus";
|
} from "./checkStatus";
|
||||||
import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError } from "../../error";
|
import {
|
||||||
|
ActionError,
|
||||||
|
EngineError,
|
||||||
|
SiteError,
|
||||||
|
TimeoutError,
|
||||||
|
UnsupportedFileError,
|
||||||
|
} from "../../error";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import { Action } from "../../../../lib/entities";
|
import { Action } from "../../../../lib/entities";
|
||||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||||
|
@ -298,7 +298,6 @@ export function buildFallbackList(meta: Meta): {
|
|||||||
engine: Engine;
|
engine: Engine;
|
||||||
unsupportedFeatures: Set<FeatureFlag>;
|
unsupportedFeatures: Set<FeatureFlag>;
|
||||||
}[] {
|
}[] {
|
||||||
|
|
||||||
if (meta.internalOptions.useCache !== true) {
|
if (meta.internalOptions.useCache !== true) {
|
||||||
const cacheIndex = engines.indexOf("cache");
|
const cacheIndex = engines.indexOf("cache");
|
||||||
if (cacheIndex !== -1) {
|
if (cacheIndex !== -1) {
|
||||||
@@ -7,11 +7,18 @@ import { v4 as uuid } from "uuid";
 import * as undici from "undici";
 import { makeSecureDispatcher } from "./safeFetch";
 
-export async function fetchFileToBuffer(url: string, init?: undici.RequestInit): Promise<{
+export async function fetchFileToBuffer(
+  url: string,
+  init?: undici.RequestInit,
+): Promise<{
   response: undici.Response;
   buffer: Buffer;
 }> {
-  const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) });
+  const response = await undici.fetch(url, {
+    ...init,
+    redirect: "follow",
+    dispatcher: await makeSecureDispatcher(url),
+  });
   return {
     response,
     buffer: Buffer.from(await response.arrayBuffer()),
@@ -30,7 +37,11 @@ export async function downloadFile(
   const tempFileWrite = createWriteStream(tempFilePath);
 
   // TODO: maybe we could use tlsclient for this? for proxying
-  const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) });
+  const response = await undici.fetch(url, {
+    ...init,
+    redirect: "follow",
+    dispatcher: await makeSecureDispatcher(url),
+  });
 
   // This should never happen in the current state of JS/Undici (2024), but let's check anyways.
   if (response.body === null) {
@@ -5,36 +5,44 @@ import { Address6 } from "ip-address";
 
 export class InsecureConnectionError extends Error {
   constructor() {
-    super("Connection violated security rules.")
+    super("Connection violated security rules.");
   }
 }
 
 function isIPv4Private(address: string): boolean {
-  const parts = address.split(".").map(x => parseInt(x, 10));
-  return parts[0] === 0 // Current (local, "this") network
-    || parts[0] === 10 // Used for local communications within a private network
-    || (parts[0] === 100 && parts[1] >= 64 && parts[1] < 128) // Shared address space for communications between a service provider and its subscribers when using a carrier-grade NAT
-    || parts[0] === 127 // Used for loopback addresses to the local host
-    || (parts[0] === 169 && parts[1] === 254) // Used for link-local addresses between two hosts on a single link when no IP address is otherwise specified, such as would have normally been retrieved from a DHCP server
-    || (parts[0] === 127 && parts[1] >= 16 && parts[2] < 32) // Used for local communications within a private network
-    || (parts[0] === 192 && parts[1] === 0 && parts[2] === 0) // IETF Porotocol Assignments, DS-Lite (/29)
-    || (parts[0] === 192 && parts[1] === 0 && parts[2] === 2) // Assigned as TEST-NET-1, documentation and examples
-    || (parts[0] === 192 && parts[1] === 88 && parts[2] === 99) // Reserved. Formerly used for IPv6 to IPv4 relay (included IPv6 address block 2002::/16).
-    || (parts[0] === 192 && parts[1] === 168) // Used for local communications within a private network
-    || (parts[0] === 192 && parts[1] >= 18 && parts[1] < 20) // Used for benchmark testing of inter-network communications between two separate subnets
-    || (parts[0] === 198 && parts[1] === 51 && parts[2] === 100) // Assigned as TEST-NET-2, documentation and examples
-    || (parts[0] === 203 && parts[1] === 0 && parts[2] === 113) // Assigned as TEST-NET-3, documentation and examples
-    || (parts[0] >= 224 && parts[0] < 240) // In use for multicast (former Class D network)
-    || (parts[0] === 233 && parts[1] === 252 && parts[2] === 0) // Assigned as MCAST-TEST-NET, documentation and examples (Note that this is part of the above multicast space.)
-    || parts[0] >= 240 // Reserved for future use (former class E network)
-    || (parts[0] === 255 && parts[1] === 255 && parts[2] === 255 && parts[3] === 255) // Reserved for the "limited broadcast" destination address
+  const parts = address.split(".").map((x) => parseInt(x, 10));
+  return (
+    parts[0] === 0 || // Current (local, "this") network
+    parts[0] === 10 || // Used for local communications within a private network
+    (parts[0] === 100 && parts[1] >= 64 && parts[1] < 128) || // Shared address space for communications between a service provider and its subscribers when using a carrier-grade NAT
+    parts[0] === 127 || // Used for loopback addresses to the local host
+    (parts[0] === 169 && parts[1] === 254) || // Used for link-local addresses between two hosts on a single link when no IP address is otherwise specified, such as would have normally been retrieved from a DHCP server
+    (parts[0] === 127 && parts[1] >= 16 && parts[2] < 32) || // Used for local communications within a private network
+    (parts[0] === 192 && parts[1] === 0 && parts[2] === 0) || // IETF Porotocol Assignments, DS-Lite (/29)
+    (parts[0] === 192 && parts[1] === 0 && parts[2] === 2) || // Assigned as TEST-NET-1, documentation and examples
+    (parts[0] === 192 && parts[1] === 88 && parts[2] === 99) || // Reserved. Formerly used for IPv6 to IPv4 relay (included IPv6 address block 2002::/16).
+    (parts[0] === 192 && parts[1] === 168) || // Used for local communications within a private network
+    (parts[0] === 192 && parts[1] >= 18 && parts[1] < 20) || // Used for benchmark testing of inter-network communications between two separate subnets
+    (parts[0] === 198 && parts[1] === 51 && parts[2] === 100) || // Assigned as TEST-NET-2, documentation and examples
+    (parts[0] === 203 && parts[1] === 0 && parts[2] === 113) || // Assigned as TEST-NET-3, documentation and examples
+    (parts[0] >= 224 && parts[0] < 240) || // In use for multicast (former Class D network)
+    (parts[0] === 233 && parts[1] === 252 && parts[2] === 0) || // Assigned as MCAST-TEST-NET, documentation and examples (Note that this is part of the above multicast space.)
+    parts[0] >= 240 || // Reserved for future use (former class E network)
+    (parts[0] === 255 &&
+      parts[1] === 255 &&
+      parts[2] === 255 &&
+      parts[3] === 255)
+  ); // Reserved for the "limited broadcast" destination address
 }
 
 function isIPv6Private(ipv6) {
   return new Address6(ipv6).getScope() !== "Global";
 }
 
-export function makeSecureDispatcher(url: string, options?: undici.Agent.Options) {
+export function makeSecureDispatcher(
+  url: string,
+  options?: undici.Agent.Options,
+) {
   const agent = new undici.Agent({
     connect: {
       rejectUnauthorized: false, // bypass SSL failures -- this is fine
@@ -46,12 +54,18 @@ export function makeSecureDispatcher(url: string, options?: undici.Agent.Options
 
   agent.on("connect", (_, targets) => {
     const client: undici.Client = targets.slice(-1)[0] as undici.Client;
-    const socketSymbol = Object.getOwnPropertySymbols(client).find(x => x.description === "socket")!;
+    const socketSymbol = Object.getOwnPropertySymbols(client).find(
+      (x) => x.description === "socket",
+    )!;
     const socket: Socket | TLSSocket = (client as any)[socketSymbol];
 
     if (socket.remoteAddress) {
-      if (socket.remoteFamily === "IPv4" ? isIPv4Private(socket.remoteAddress!) : isIPv6Private(socket.remoteAddress!)) {
-        socket.destroy(new InsecureConnectionError())
+      if (
+        socket.remoteFamily === "IPv4"
+          ? isIPv4Private(socket.remoteAddress!)
+          : isIPv6Private(socket.remoteAddress!)
+      ) {
+        socket.destroy(new InsecureConnectionError());
       }
     }
   });
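
Reviewer note: the dispatcher built above is what `fetchFileToBuffer` and `downloadFile` hand to undici, so every fetch routed through it drops connections that resolve to private IPv4 ranges or non-global IPv6 scopes (basic SSRF protection). A minimal usage sketch under the same assumptions as this diff; the URL is illustrative, and since undici may surface the destroyed-socket error wrapped in a fetch error, the `cause` is checked as well:

import * as undici from "undici";
import { InsecureConnectionError, makeSecureDispatcher } from "./safeFetch";

async function fetchPublicOnly(url: string): Promise<string> {
  try {
    const response = await undici.fetch(url, {
      redirect: "follow",
      dispatcher: await makeSecureDispatcher(url),
    });
    return await response.text();
  } catch (error) {
    // The agent destroys the socket with InsecureConnectionError when the
    // resolved address is private; undici may wrap it, hence the cause check.
    if (
      error instanceof InsecureConnectionError ||
      (error as any)?.cause instanceof InsecureConnectionError
    ) {
      throw new Error("Refusing to fetch a private/internal address: " + url);
    }
    throw error;
  }
}

// fetchPublicOnly("http://169.254.169.254/latest/meta-data") would be rejected.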
@@ -420,7 +420,9 @@ export async function scrapeURL(
   } else if (error instanceof ActionError) {
     meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
   } else if (error instanceof UnsupportedFileError) {
-    meta.logger.warn("scrapeURL: Tried to scrape unsupported file", { error });
+    meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
+      error,
+    });
   } else {
     Sentry.captureException(error);
     meta.logger.error("scrapeURL: Unexpected error happened", { error });
@@ -44,10 +44,15 @@ export function extractMetadata(
   title = soup("title").first().text().trim() || undefined;
   description = soup('meta[name="description"]').attr("content") || undefined;
 
-  const faviconLink = soup('link[rel="icon"]').attr("href") || soup('link[rel*="icon"]').first().attr("href") || undefined;
+  const faviconLink =
+    soup('link[rel="icon"]').attr("href") ||
+    soup('link[rel*="icon"]').first().attr("href") ||
+    undefined;
   if (faviconLink) {
     const baseUrl = new URL(meta.url).origin;
-    favicon = faviconLink.startsWith('http') ? faviconLink : `${baseUrl}${faviconLink}`;
+    favicon = faviconLink.startsWith("http")
+      ? faviconLink
+      : `${baseUrl}${faviconLink}`;
   }
 
   // Assuming the language is part of the URL as per the regex pattern
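
Reviewer note: the favicon handling above resolves relative hrefs against the page origin and leaves absolute URLs untouched. A standalone sketch of the same logic (the page URL and hrefs are made up); protocol-relative links such as `//cdn.example.com/icon.png` would still fall through to the concatenation branch:

// Mirrors the favicon branch in extractMetadata above; values are illustrative.
function resolveFavicon(
  pageUrl: string,
  faviconLink?: string,
): string | undefined {
  if (!faviconLink) return undefined;
  const baseUrl = new URL(pageUrl).origin;
  return faviconLink.startsWith("http")
    ? faviconLink
    : `${baseUrl}${faviconLink}`;
}

resolveFavicon("https://example.com/docs/page", "/favicon.ico");
// => "https://example.com/favicon.ico"
resolveFavicon("https://example.com/docs/page", "https://cdn.example.com/icon.png");
// => "https://cdn.example.com/icon.png"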
@@ -24,7 +24,6 @@ export function saveToCache(meta: Meta, document: Document): Document {
     return document;
   }
 
-
   const key = cacheKey(meta.url, meta.options, meta.internalOptions);
 
   if (key !== null) {
@@ -8,13 +8,21 @@ describe("removeDefaultProperty", () => {
   });
 
   it("should remove the default property from a nested object", () => {
-    const input = { default: "test", nested: { default: "nestedTest", test: "nestedTest" } };
+    const input = {
+      default: "test",
+      nested: { default: "nestedTest", test: "nestedTest" },
+    };
     const expectedOutput = { nested: { test: "nestedTest" } };
     expect(removeDefaultProperty(input)).toEqual(expectedOutput);
   });
 
   it("should remove the default property from an array of objects", () => {
-    const input = { array: [{ default: "test1", test: "test1" }, { default: "test2", test: "test2" }] };
+    const input = {
+      array: [
+        { default: "test1", test: "test1" },
+        { default: "test2", test: "test2" },
+      ],
+    };
     const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
     expect(removeDefaultProperty(input)).toEqual(expectedOutput);
   });
@@ -140,10 +140,10 @@ export async function generateOpenAICompletions(
       properties: Object.fromEntries(
         Object.entries(schema).map(([key, value]) => {
           return [key, removeDefaultProperty(value)];
-        })
+        }),
       ),
       required: Object.keys(schema),
-      additionalProperties: false
+      additionalProperties: false,
     };
   }
 
@@ -240,14 +240,14 @@ export async function performLLMExtract(
 }
 
 export function removeDefaultProperty(schema: any): any {
-  if (typeof schema !== 'object' || schema === null) return schema;
+  if (typeof schema !== "object" || schema === null) return schema;
 
   const { default: _, ...rest } = schema;
 
   for (const key in rest) {
     if (Array.isArray(rest[key])) {
       rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));
-    } else if (typeof rest[key] === 'object' && rest[key] !== null) {
+    } else if (typeof rest[key] === "object" && rest[key] !== null) {
       rest[key] = removeDefaultProperty(rest[key]);
     }
   }
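
Reviewer note: `removeDefaultProperty` strips every `default` key recursively before the schema is handed to the model in `generateOpenAICompletions` above. A quick illustration built from the test cases in this diff:

// Input mirrors the shapes exercised by the tests above.
const input = {
  default: "test",
  nested: { default: "nestedTest", test: "nestedTest" },
  array: [{ default: "test1", test: "test1" }],
};

const output = removeDefaultProperty(input);
// => { nested: { test: "nestedTest" }, array: [{ test: "test1" }] }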
@@ -9,9 +9,11 @@ configDotenv();
 
 function cleanOfNull<T>(x: T): T {
   if (Array.isArray(x)) {
-    return x.map(x => cleanOfNull(x)) as T;
+    return x.map((x) => cleanOfNull(x)) as T;
   } else if (typeof x === "object" && x !== null) {
-    return Object.fromEntries(Object.entries(x).map(([k,v]) => [k,cleanOfNull(v)])) as T
+    return Object.fromEntries(
+      Object.entries(x).map(([k, v]) => [k, cleanOfNull(v)]),
+    ) as T;
   } else if (typeof x === "string") {
     return x.replaceAll("\u0000", "") as T;
   } else {
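
Reviewer note: `cleanOfNull` walks arrays and objects and strips NUL characters (`\u0000`) from every string, presumably because Postgres text columns reject NUL bytes. A small illustration with made-up values:

const dirty = {
  title: "Hello\u0000World",
  links: ["https://example.com\u0000", "https://example.org"],
  meta: { description: "ok" },
};

const clean = cleanOfNull(dirty);
// => { title: "HelloWorld",
//      links: ["https://example.com", "https://example.org"],
//      meta: { description: "ok" } }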
@@ -16,9 +16,7 @@ export const loggingQueueName = "{loggingQueue}";
 
 export function getScrapeQueue() {
   if (!scrapeQueue) {
-    scrapeQueue = new Queue(
-      scrapeQueueName,
-      {
+    scrapeQueue = new Queue(scrapeQueueName, {
       connection: redisConnection,
       defaultJobOptions: {
         removeOnComplete: {
@@ -28,8 +26,7 @@ export function getScrapeQueue() {
           age: 90000, // 25 hours
         },
       },
-      }
-    );
+    });
     logger.info("Web scraper queue created");
   }
   return scrapeQueue;
@@ -37,9 +34,7 @@ export function getScrapeQueue() {
 
 export function getExtractQueue() {
   if (!extractQueue) {
-    extractQueue = new Queue(
-      extractQueueName,
-      {
+    extractQueue = new Queue(extractQueueName, {
       connection: redisConnection,
       defaultJobOptions: {
         removeOnComplete: {
@@ -49,14 +44,12 @@ export function getExtractQueue() {
           age: 90000, // 25 hours
         },
       },
-      }
-    );
+    });
     logger.info("Extraction queue created");
   }
   return extractQueue;
 }
 
 
 // === REMOVED IN FAVOR OF POLLING -- NOT RELIABLE
 // import { QueueEvents } from 'bullmq';
 // export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection.duplicate() });
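
Reviewer note: the queues above are plain BullMQ `Queue` instances created lazily, so producers enqueue work with `Queue.add(name, data, opts)`. A minimal sketch under the assumptions of this diff; the import path, job name, and payload shape are illustrative, not the service's real job contract:

import { getScrapeQueue } from "./queue-service"; // path assumed for illustration

async function enqueueExampleScrape() {
  // BullMQ's Queue.add(name, data, opts) returns the created Job; priority and
  // jobId are standard BullMQ job options.
  await getScrapeQueue().add(
    "my-scrape-job",
    { url: "https://example.com", mode: "single_urls" },
    { priority: 10, jobId: "my-job-id" },
  );
}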
@@ -89,13 +89,19 @@ const runningJobs: Set<string> = new Set();
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
     (async () => {
-      const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
+      const originUrl = sc.originUrl
+        ? normalizeUrlOnlyHostname(sc.originUrl)
+        : undefined;
       // Get all visited URLs from Redis
       const visitedUrls = await redisConnection.smembers(
         "crawl:" + job.data.crawl_id + ":visited",
       );
       // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
-      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
+      if (
+        visitedUrls.length > 0 &&
+        job.data.crawlerOptions !== null &&
+        originUrl
+      ) {
         // Fire and forget the upload to Supabase
         try {
           // Standardize URLs to canonical form (https, no www)
@@ -317,7 +323,10 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
   return err;
 };
 
-const processExtractJobInternal = async (token: string, job: Job & { id: string }) => {
+const processExtractJobInternal = async (
+  token: string,
+  job: Job & { id: string },
+) => {
   const logger = _logger.child({
     module: "extract-worker",
     method: "processJobInternal",
@@ -360,11 +369,14 @@ const processExtractJobInternal = async (token: string, job: Job & { id: string
 
     await updateExtract(job.data.extractId, {
       status: "failed",
-      error: error.error ?? error ?? "Unknown error, please contact help@firecrawl.dev. Extract id: " + job.data.extractId,
+      error:
+        error.error ??
+        error ??
+        "Unknown error, please contact help@firecrawl.dev. Extract id: " +
+          job.data.extractId,
     });
     // throw error;
   } finally {
 
     clearInterval(extendLockInterval);
   }
 };
@@ -635,7 +647,9 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
       sc,
       jobs.map((x) => ({ id: x.opts.jobId, url: x.data.url })),
     );
-    const lockedJobs = jobs.filter(x => lockedIds.find(y => y.id === x.opts.jobId));
+    const lockedJobs = jobs.filter((x) =>
+      lockedIds.find((y) => y.id === x.opts.jobId),
+    );
     logger.debug("Adding scrape jobs to Redis...");
     await addCrawlJobs(
       job.data.crawl_id,
@@ -790,7 +804,8 @@ async function processJob(job: Job & { id: string }, token: string) {
       ) {
         const crawler = crawlToCrawler(job.data.crawl_id, sc);
         if (
-          crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null &&
+          crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) ===
+            null &&
           !job.data.isCrawlSourceScrape
         ) {
           throw new Error(
@@ -1073,7 +1088,7 @@ async function processJob(job: Job & { id: string }, token: string) {
   console.log("All workers exited. Waiting for all jobs to finish...");
 
   while (runningJobs.size > 0) {
-    await new Promise(resolve => setTimeout(resolve, 500));
+    await new Promise((resolve) => setTimeout(resolve, 500));
   }
 
   process.exit(0);