Merge branch 'main' into nsc/admin-router
Commit 10e80f00cf
@@ -10,6 +10,7 @@ import { logCrawl } from "../../src/services/logging/crawl_log";
 import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
 import { createIdempotencyKey } from "../../src/services/idempotency/create";
 import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
+import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
 
 export async function crawlController(req: Request, res: Response) {
@@ -61,10 +62,11 @@ export async function crawlController(req: Request, res: Response) {
   const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
   const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
 
-  if (mode === "single_urls" && !url.includes(",")) {
+  if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
     try {
       const a = new WebScraperDataProvider();
       await a.setOptions({
+        jobId: uuidv4(),
         mode: "single_urls",
         urls: [url],
         crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },

@@ -9,9 +9,11 @@ import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
 import { numTokensFromString } from '../lib/LLM-extraction/helpers';
 import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
+import { v4 as uuidv4 } from "uuid";
 import { Logger } from '../lib/logger';
 
 export async function scrapeHelper(
+  jobId: string,
   req: Request,
   team_id: string,
   crawlerOptions: any,
@@ -36,6 +38,7 @@ export async function scrapeHelper(
 
   const a = new WebScraperDataProvider();
   await a.setOptions({
+    jobId,
     mode: "single_urls",
     urls: [url],
     crawlerOptions: {
@@ -128,8 +131,11 @@ export async function scrapeController(req: Request, res: Response) {
       checkCredits();
     }
 
+    const jobId = uuidv4();
+
     const startTime = new Date().getTime();
     const result = await scrapeHelper(
+      jobId,
       req,
       team_id,
       crawlerOptions,
@@ -170,6 +176,7 @@ export async function scrapeController(req: Request, res: Response) {
     }
 
     logJob({
+      job_id: jobId,
       success: result.success,
       message: result.error,
       num_docs: 1,

@@ -7,9 +7,11 @@ import { logJob } from "../services/logging/log_job";
 import { PageOptions, SearchOptions } from "../lib/entities";
 import { search } from "../search";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
+import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../lib/logger";
 
 export async function searchHelper(
+  jobId: string,
   req: Request,
   team_id: string,
   crawlerOptions: any,
@@ -76,6 +78,7 @@ export async function searchHelper(
 
   const a = new WebScraperDataProvider();
   await a.setOptions({
+    jobId,
     mode: "single_urls",
     urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
     crawlerOptions: {
@@ -149,6 +152,8 @@ export async function searchController(req: Request, res: Response) {
 
   const searchOptions = req.body.searchOptions ?? { limit: 7 };
 
+  const jobId = uuidv4();
+
   try {
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
       await checkTeamCredits(team_id, 1);
@@ -161,6 +166,7 @@ export async function searchController(req: Request, res: Response) {
     }
     const startTime = new Date().getTime();
     const result = await searchHelper(
+      jobId,
       req,
       team_id,
       crawlerOptions,
@@ -170,6 +176,7 @@ export async function searchController(req: Request, res: Response) {
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
     logJob({
+      job_id: jobId,
       success: result.success,
       message: result.error,
       num_docs: result.data ? result.data.length : 0,

@@ -4,6 +4,7 @@ async function example() {
   const example = new WebScraperDataProvider();
 
   await example.setOptions({
+    jobId: "TEST",
     mode: "crawl",
     urls: ["https://mendable.ai"],
     crawlerOptions: {},

@@ -9,6 +9,7 @@ import cluster from "cluster";
 import os from "os";
 import { Logger } from "./lib/logger";
 import { adminRouter } from "./routes/admin";
+import { ScrapeEvents } from "./lib/scrape-events";
 
 const { createBullBoard } = require("@bull-board/api");
 const { BullAdapter } = require("@bull-board/api/bullAdapter");
@@ -168,3 +169,12 @@ if (cluster.isMaster) {
 
   Logger.info(`Worker ${process.pid} started`);
 }
+
+const wsq = getWebScraperQueue();
+
+wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
+wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
+wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
+wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
+wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
+wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));

@@ -56,6 +56,7 @@ export type CrawlerOptions = {
 }
 
 export type WebScraperOptions = {
+  jobId: string;
   urls: string[];
   mode: "single_urls" | "sitemap" | "crawl";
   crawlerOptions?: CrawlerOptions;

apps/api/src/lib/scrape-events.ts (new file, 84 lines)
@@ -0,0 +1,84 @@
+import { Job, JobId } from "bull";
+import type { baseScrapers } from "../scraper/WebScraper/single_url";
+import { supabase_service as supabase } from "../services/supabase";
+import { Logger } from "./logger";
+
+export type ScrapeErrorEvent = {
+  type: "error",
+  message: string,
+  stack?: string,
+}
+
+export type ScrapeScrapeEvent = {
+  type: "scrape",
+  url: string,
+  worker?: string,
+  method: (typeof baseScrapers)[number],
+  result: null | {
+    success: boolean,
+    response_code?: number,
+    response_size?: number,
+    error?: string | object,
+    // proxy?: string,
+    time_taken: number,
+  },
+}
+
+export type ScrapeQueueEvent = {
+  type: "queue",
+  event: "waiting" | "active" | "completed" | "paused" | "resumed" | "removed" | "failed",
+  worker?: string,
+}
+
+export type ScrapeEvent = ScrapeErrorEvent | ScrapeScrapeEvent | ScrapeQueueEvent;
+
+export class ScrapeEvents {
+  static async insert(jobId: string, content: ScrapeEvent) {
+    if (jobId === "TEST") return null;
+
+    if (process.env.USE_DB_AUTHENTICATION) {
+      try {
+        const result = await supabase.from("scrape_events").insert({
+          job_id: jobId,
+          type: content.type,
+          content: content,
+          // created_at
+        }).select().single();
+        return (result.data as any).id;
+      } catch (error) {
+        Logger.error(`Error inserting scrape event: ${error}`);
+        return null;
+      }
+    }
+
+    return null;
+  }
+
+  static async updateScrapeResult(logId: number | null, result: ScrapeScrapeEvent["result"]) {
+    if (logId === null) return;
+
+    try {
+      const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any;
+      await supabase.from("scrape_events").update({
+        content: {
+          ...previousLog.content,
+          result,
+        }
+      }).eq("id", logId);
+    } catch (error) {
+      Logger.error(`Error updating scrape result: ${error}`);
+    }
+  }
+
+  static async logJobEvent(job: Job | JobId, event: ScrapeQueueEvent["event"]) {
+    try {
+      await this.insert(((job as any).id ? (job as any).id : job) as string, {
+        type: "queue",
+        event,
+        worker: process.env.FLY_MACHINE_ID,
+      });
+    } catch (error) {
+      Logger.error(`Error logging job event: ${error}`);
+    }
+  }
+}

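Below is a minimal usage sketch of the new ScrapeEvents helper added above. It is illustrative only: the import path, the jobId value, the example URL, and the result numbers are assumptions, and "scrapingBee" is used simply because it appears in baseScrapers; writes are skipped unless USE_DB_AUTHENTICATION is set.

import { ScrapeEvents } from "./lib/scrape-events"; // illustrative path; adjust to the caller's location

async function exampleScrapeLogging(jobId: string) {
  // Record the start of a scrape; returns the inserted row id, or null when
  // logging is disabled (no USE_DB_AUTHENTICATION) or jobId === "TEST".
  const logId = await ScrapeEvents.insert(jobId, {
    type: "scrape",
    url: "https://example.com",          // illustrative URL
    worker: process.env.FLY_MACHINE_ID,  // optional worker identifier
    method: "scrapingBee",
    result: null,                        // outcome not known yet
  });

  // Once the scrape finishes, attach the outcome to the same row.
  await ScrapeEvents.updateScrapeResult(logId, {
    success: true,
    response_code: 200,   // illustrative values
    response_size: 1024,
    time_taken: 350,
  });
}
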
@@ -11,6 +11,7 @@ import { billTeam } from "../services/billing/credit_billing";
 import { Document } from "../lib/entities";
 import { supabase_service } from "../services/supabase";
 import { Logger } from "../lib/logger";
+import { ScrapeEvents } from "../lib/scrape-events";
 
 export async function startWebScraperPipeline({
   job,
@@ -39,6 +40,7 @@ export async function startWebScraperPipeline({
     },
     onError: (error) => {
       Logger.error(`🐂 Job failed ${job.id}`);
+      ScrapeEvents.logJobEvent(job, "failed");
       job.moveToFailed(error);
     },
     team_id: job.data.team_id,
@@ -60,6 +62,7 @@ export async function runWebScraper({
   const provider = new WebScraperDataProvider();
   if (mode === "crawl") {
     await provider.setOptions({
+      jobId: bull_job_id,
       mode: mode,
       urls: [url],
       crawlerOptions: crawlerOptions,
@@ -68,6 +71,7 @@ export async function runWebScraper({
     });
   } else {
     await provider.setOptions({
+      jobId: bull_job_id,
       mode: mode,
       urls: url.split(","),
       crawlerOptions: crawlerOptions,
@@ -138,6 +142,7 @@ const saveJob = async (job: Job, result: any) => {
       // I think the job won't exist here anymore
     }
   }
+  ScrapeEvents.logJobEvent(job, "completed");
 } catch (error) {
   Logger.error(`🐂 Failed to update job status: ${error}`);
 }

@@ -42,6 +42,7 @@ describe('WebCrawler', () => {
 
 
     crawler = new WebCrawler({
+      jobId: "TEST",
       initialUrl: initialUrl,
       includes: [],
       excludes: [],
@@ -76,6 +77,7 @@ describe('WebCrawler', () => {
 
 
     crawler = new WebCrawler({
+      jobId: "TEST",
       initialUrl: initialUrl,
       includes: [],
       excludes: [],
@@ -104,6 +106,7 @@ describe('WebCrawler', () => {
 
 
     crawler = new WebCrawler({
+      jobId: "TEST",
       initialUrl: initialUrl,
       includes: [],
       excludes: [],
@@ -133,6 +136,7 @@ describe('WebCrawler', () => {
 
 
     crawler = new WebCrawler({
+      jobId: "TEST",
       initialUrl: initialUrl,
       includes: [],
       excludes: [],
@@ -161,6 +165,7 @@ describe('WebCrawler', () => {
 
     // Setup the crawler with the specific test case options
     const crawler = new WebCrawler({
+      jobId: "TEST",
       initialUrl: initialUrl,
       includes: [],
       excludes: [],
@@ -194,6 +199,7 @@ describe('WebCrawler', () => {
     const limit = 2; // Set a limit for the number of links
 
     crawler = new WebCrawler({
+      jobId: "TEST",
       initialUrl: initialUrl,
       includes: [],
       excludes: [],

@@ -15,8 +15,8 @@ describe('scrapSingleUrl', () => {
   const pageOptionsWithHtml: PageOptions = { includeHtml: true };
   const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };
 
-  const resultWithHtml = await scrapSingleUrl(url, pageOptionsWithHtml);
-  const resultWithoutHtml = await scrapSingleUrl(url, pageOptionsWithoutHtml);
+  const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml);
+  const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml);
 
   expect(resultWithHtml.html).toBeDefined();
   expect(resultWithoutHtml.html).toBeUndefined();
@@ -27,7 +27,7 @@ it('should return a list of links on the mendable.ai page', async () => {
   const url = 'https://mendable.ai';
   const pageOptions: PageOptions = { includeHtml: true };
 
-  const result = await scrapSingleUrl(url, pageOptions);
+  const result = await scrapSingleUrl("TEST", url, pageOptions);
 
   // Check if the result contains a list of links
   expect(result.linksOnPage).toBeDefined();

@@ -11,6 +11,7 @@ import { axiosTimeout } from "../../../src/lib/timeout";
 import { Logger } from "../../../src/lib/logger";
 
 export class WebCrawler {
+  private jobId: string;
   private initialUrl: string;
   private baseUrl: string;
   private includes: string[];
@@ -27,6 +28,7 @@ export class WebCrawler {
   private allowExternalContentLinks: boolean;
 
   constructor({
+    jobId,
     initialUrl,
     includes,
     excludes,
@@ -37,6 +39,7 @@ export class WebCrawler {
     allowBackwardCrawling = false,
     allowExternalContentLinks = false
   }: {
+    jobId: string;
     initialUrl: string;
     includes?: string[];
     excludes?: string[];
@@ -47,6 +50,7 @@ export class WebCrawler {
     allowBackwardCrawling?: boolean;
     allowExternalContentLinks?: boolean;
   }) {
+    this.jobId = jobId;
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin;
     this.includes = includes ?? [];
@@ -261,7 +265,7 @@ export class WebCrawler {
 
     // If it is the first link, fetch with single url
     if (this.visited.size === 1) {
-      const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
+      const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true });
       content = page.html ?? "";
       pageStatusCode = page.metadata?.pageStatusCode;
       pageError = page.metadata?.pageError || undefined;

@@ -22,6 +22,7 @@ import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
 import { Logger } from "../../lib/logger";
 
 export class WebScraperDataProvider {
+  private jobId: string;
   private bullJobId: string;
   private urls: string[] = [""];
   private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
@@ -66,6 +67,7 @@ export class WebScraperDataProvider {
       batchUrls.map(async (url, index) => {
         const existingHTML = allHtmls ? allHtmls[i + index] : "";
         const result = await scrapSingleUrl(
+          this.jobId,
           url,
           this.pageOptions,
           this.extractorOptions,
@@ -166,6 +168,7 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
     const crawler = new WebCrawler({
+      jobId: this.jobId,
       initialUrl: this.urls[0],
       includes: this.includes,
       excludes: this.excludes,
@@ -500,6 +503,7 @@ export class WebScraperDataProvider {
       throw new Error("Urls are required");
     }
 
+    this.jobId = options.jobId;
     this.bullJobId = options.bullJobId;
     this.urls = options.urls;
     this.mode = options.mode;

@@ -18,10 +18,11 @@ import { scrapWithPlaywright } from "./scrapers/playwright";
 import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
 import { extractLinks } from "./utils/utils";
 import { Logger } from "../../lib/logger";
+import { ScrapeEvents } from "../../lib/scrape-events";
 
 dotenv.config();
 
-const baseScrapers = [
+export const baseScrapers = [
   "fire-engine",
   "fire-engine;chrome-cdp",
   "scrapingBee",
@@ -118,6 +119,7 @@ function getScrapingFallbackOrder(
 
 
 export async function scrapSingleUrl(
+  jobId: string,
   urlToScrap: string,
   pageOptions: PageOptions = {
     onlyMainContent: true,
@@ -145,6 +147,15 @@ export async function scrapSingleUrl(
   } = { text: "", screenshot: "", metadata: {} };
   let screenshot = "";
 
+  const timer = Date.now();
+  const logInsertPromise = ScrapeEvents.insert(jobId, {
+    type: "scrape",
+    url,
+    worker: process.env.FLY_MACHINE_ID,
+    method,
+    result: null,
+  });
+
   switch (method) {
     case "fire-engine":
     case "fire-engine;chrome-cdp":
@@ -254,8 +265,19 @@ export async function scrapSingleUrl(
   }
   //* TODO: add an optional to return markdown or structured/extracted content
   let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
+  const text = await parseMarkdown(cleanedHtml);
+
+  const insertedLogId = await logInsertPromise;
+  ScrapeEvents.updateScrapeResult(insertedLogId, {
+    response_size: scraperResponse.text.length,
+    success: !scraperResponse.metadata.pageError && !!text,
+    error: scraperResponse.metadata.pageError,
+    response_code: scraperResponse.metadata.pageStatusCode,
+    time_taken: Date.now() - timer,
+  });
+
   return {
-    text: await parseMarkdown(cleanedHtml),
+    text,
     html: cleanedHtml,
     rawHtml: scraperResponse.text,
     screenshot: scraperResponse.screenshot,
@@ -379,6 +401,11 @@ export async function scrapSingleUrl(
     return document;
   } catch (error) {
     Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
+    ScrapeEvents.insert(jobId, {
+      type: "error",
+      message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
+      stack: error.stack,
+    });
     return {
       content: "",
       markdown: "",

@@ -8,6 +8,7 @@ import { logJob } from "./logging/log_job";
 import { initSDK } from '@hyperdx/node-opentelemetry';
 import { Job } from "bull";
 import { Logger } from "../lib/logger";
+import { ScrapeEvents } from "../lib/scrape-events";
 
 if (process.env.ENV === 'production') {
   initSDK({
@@ -20,6 +21,7 @@ const wsq = getWebScraperQueue();
 
 async function processJob(job: Job, done) {
   Logger.debug(`🐂 Worker taking job ${job.id}`);
+
   try {
     job.progress({
       current: 1,
@@ -114,3 +116,10 @@ wsq.process(
   Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
   processJob
 );
+
+wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
+wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
+wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
+wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
+wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
+wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
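As a companion to the queue wiring above, here is a hedged sketch of calling ScrapeEvents.logJobEvent directly. It relies only on what the new code shows: the method accepts either a Bull Job or a bare JobId, and getWebScraperQueue comes from the surrounding worker code; the import path and the job id string are purely illustrative.

import { ScrapeEvents } from "../lib/scrape-events"; // illustrative path; adjust to the caller's location

async function exampleQueueLogging() {
  // With a bare job id: logJobEvent treats the argument itself as the id.
  await ScrapeEvents.logJobEvent("example-job-id", "completed");

  // With a full Job instance, as the wsq.on(...) handlers do:
  // const job = await getWebScraperQueue().getJob("example-job-id");
  // if (job) await ScrapeEvents.logJobEvent(job, "active");
}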