Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-06 05:47:19 +08:00)
Commit 4f3d421c70
@@ -22,6 +22,7 @@ import { getScrapeQueue } from "../../services/queue-service";
 import { addScrapeJob } from "../../services/queue-jobs";
 import { Logger } from "../../lib/logger";
 import { getJobPriority } from "../../lib/job-priority";
+import { callWebhook } from "../../services/webhook";

 export async function crawlController(
   req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
@@ -150,6 +151,10 @@ export async function crawlController(
     await addCrawlJob(id, job.id);
   }

+  if(req.body.webhook) {
+    await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "crawl.started");
+  }
+
   return res.status(200).json({
     success: true,
     id,
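Taken on its own, this controller hunk means a `crawl.started` webhook event now fires as soon as a v1 crawl is accepted, whenever the request body carries a `webhook` URL. A minimal sketch of a request that would exercise it — the endpoint path, API key, and receiver URL are illustrative assumptions, not taken from this diff:

// Hypothetical client call; only the `webhook` field relies on behaviour added in this commit.
const resp = await fetch("https://api.firecrawl.dev/v1/crawl", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer fc-YOUR-API-KEY", // placeholder key
  },
  body: JSON.stringify({
    url: "https://example.com",
    webhook: "https://my-service.example.com/firecrawl-events", // hypothetical receiver
  }),
});
const { success, id } = await resp.json(); // matches the 200 response shape in the hunk above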
@@ -1,5 +1,5 @@
 import "dotenv/config";
-import "./sentry"
+import "./sentry";
 import * as Sentry from "@sentry/node";
 import { CustomError } from "../lib/custom-error";
 import {
@@ -17,12 +17,25 @@ import { Logger } from "../lib/logger";
 import { Worker } from "bullmq";
 import systemMonitor from "./system-monitor";
 import { v4 as uuidv4 } from "uuid";
-import { addCrawlJob, addCrawlJobDone, crawlToCrawler, finishCrawl, getCrawl, getCrawlJobs, lockURL } from "../lib/crawl-redis";
+import {
+  addCrawlJob,
+  addCrawlJobDone,
+  crawlToCrawler,
+  finishCrawl,
+  getCrawl,
+  getCrawlJobs,
+  lockURL,
+} from "../lib/crawl-redis";
 import { StoredCrawl } from "../lib/crawl-redis";
 import { addScrapeJob } from "./queue-jobs";
 import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
-import { addJobPriority, deleteJobPriority, getJobPriority } from "../../src/lib/job-priority";
+import {
+  addJobPriority,
+  deleteJobPriority,
+  getJobPriority,
+} from "../../src/lib/job-priority";
 import { PlanType } from "../types";
+import { getJobs } from "../../src/controllers/v1/crawl-status";

 if (process.env.ENV === "production") {
   initSDK({
@@ -52,25 +65,24 @@ const processJobInternal = async (token: string, job: Job) => {
     await job.extendLock(token, jobLockExtensionTime);
   }, jobLockExtendInterval);

-  await addJobPriority(job.data.team_id, job.id );
+  await addJobPriority(job.data.team_id, job.id);
   let err = null;
   try {
     const result = await processJob(job, token);
-    try{
+    try {
       if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
         await job.moveToCompleted(null, token, false);
       } else {
         await job.moveToCompleted(result.docs, token, false);
       }
-    }catch(e){
-    }
+    } catch (e) {}
   } catch (error) {
     console.log("Job failed, error:", error);
     Sentry.captureException(error);
     err = error;
     await job.moveToFailed(error, token, false);
   } finally {
-    await deleteJobPriority(job.data.team_id, job.id );
+    await deleteJobPriority(job.data.team_id, job.id);
     clearInterval(extendLockInterval);
   }

@@ -84,7 +96,10 @@ process.on("SIGINT", () => {
   isShuttingDown = true;
 });

-const workerFun = async (queueName: string, processJobInternal: (token: string, job: Job) => Promise<any>) => {
+const workerFun = async (
+  queueName: string,
+  processJobInternal: (token: string, job: Job) => Promise<any>
+) => {
   const worker = new Worker(queueName, null, {
     connection: redisConnection,
     lockDuration: 1 * 60 * 1000, // 1 minute
@@ -113,44 +128,60 @@ const workerFun = async (queueName: string, processJobInternal: (token: string,
     const job = await worker.getNextJob(token);
     if (job) {
       if (job.data && job.data.sentry && Sentry.isInitialized()) {
-        Sentry.continueTrace({ sentryTrace: job.data.sentry.trace, baggage: job.data.sentry.baggage }, () => {
-          Sentry.startSpan({
-            name: "Scrape job",
-            attributes: {
-              job: job.id,
-              worker: process.env.FLY_MACHINE_ID ?? worker.id,
-            },
-          }, async (span) => {
-            await Sentry.startSpan({
-              name: "Process scrape job",
-              op: "queue.process",
-              attributes: {
-                "messaging.message.id": job.id,
-                "messaging.destination.name": getScrapeQueue().name,
-                "messaging.message.body.size": job.data.sentry.size,
-                "messaging.message.receive.latency": Date.now() - (job.processedOn ?? job.timestamp),
-                "messaging.message.retry.count": job.attemptsMade,
-              }
-            }, async () => {
-              const res = await processJobInternal(token, job);
-              if (res !== null) {
-                span.setStatus({ code: 2 }); // ERROR
-              } else {
-                span.setStatus({ code: 1 }); // OK
-              }
-            });
-          });
-        });
+        Sentry.continueTrace(
+          {
+            sentryTrace: job.data.sentry.trace,
+            baggage: job.data.sentry.baggage,
+          },
+          () => {
+            Sentry.startSpan(
+              {
+                name: "Scrape job",
+                attributes: {
+                  job: job.id,
+                  worker: process.env.FLY_MACHINE_ID ?? worker.id,
+                },
+              },
+              async (span) => {
+                await Sentry.startSpan(
+                  {
+                    name: "Process scrape job",
+                    op: "queue.process",
+                    attributes: {
+                      "messaging.message.id": job.id,
+                      "messaging.destination.name": getScrapeQueue().name,
+                      "messaging.message.body.size": job.data.sentry.size,
+                      "messaging.message.receive.latency":
+                        Date.now() - (job.processedOn ?? job.timestamp),
+                      "messaging.message.retry.count": job.attemptsMade,
+                    },
+                  },
+                  async () => {
+                    const res = await processJobInternal(token, job);
+                    if (res !== null) {
+                      span.setStatus({ code: 2 }); // ERROR
+                    } else {
+                      span.setStatus({ code: 1 }); // OK
+                    }
+                  }
+                );
+              }
+            );
+          }
+        );
       } else {
-        Sentry.startSpan({
-          name: "Scrape job",
-          attributes: {
-            job: job.id,
-            worker: process.env.FLY_MACHINE_ID ?? worker.id,
-          },
-        }, () => {
-          processJobInternal(token, job);
-        });
+        Sentry.startSpan(
+          {
+            name: "Scrape job",
+            attributes: {
+              job: job.id,
+              worker: process.env.FLY_MACHINE_ID ?? worker.id,
+            },
+          },
+          () => {
+            processJobInternal(token, job);
+          }
+        );
       }

       await sleep(gotJobInterval);
@@ -167,13 +198,20 @@ async function processJob(job: Job, token: string) {

   // Check if the job URL is researchhub and block it immediately
   // TODO: remove this once solve the root issue
-  if (job.data.url && (job.data.url.includes("researchhub.com") || job.data.url.includes("ebay.com") || job.data.url.includes("youtube.com") || job.data.url.includes("microsoft.com") )) {
+  if (
+    job.data.url &&
+    (job.data.url.includes("researchhub.com") ||
+      job.data.url.includes("ebay.com") ||
+      job.data.url.includes("youtube.com") ||
+      job.data.url.includes("microsoft.com"))
+  ) {
     Logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`);
     const data = {
       success: false,
       docs: [],
       project_id: job.data.project_id,
-      error: "URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
+      error:
+        "URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
     };
     await job.moveToCompleted(data.docs, token, false);
     return data;
@@ -194,7 +232,7 @@ async function processJob(job: Job, token: string) {
     });

     // Better if we throw here so we capture with the correct error
-    if(!success) {
+    if (!success) {
       throw new Error(message);
     }
     const end = Date.now();
@@ -217,8 +255,26 @@ async function processJob(job: Job, token: string) {
       docs,
     };

-    if (job.data.mode === "crawl") {
-      await callWebhook(job.data.team_id, job.id as string, data, job.data.webhook, job.data.v1);
+    // No idea what this does and when it is called.
+    if (job.data.mode === "crawl" && !job.data.v1) {
+      callWebhook(
+        job.data.team_id,
+        job.id as string,
+        data,
+        job.data.webhook,
+        job.data.v1
+      );
+    }
+    if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
+      await callWebhook(
+        job.data.team_id,
+        job.data.crawl_id,
+        data,
+        job.data.webhook,
+        job.data.v1,
+        "crawl.page",
+        true
+      );
     }

     if (job.data.crawl_id) {
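Together with the error path later in this file, the two conditions above define the per-job routing: a legacy (v0) crawl job still posts the full job `data` keyed by `jobId` and is not awaited, while a v1 job posts an awaited `crawl.page` event addressed by its `crawl_id`. A condensed restatement of that routing as a sketch — the `JobDataLike` shape is inferred from the fields used in this hunk, not an exported type:

// Sketch only: mirrors the conditions above, not a drop-in replacement for the worker code.
interface JobDataLike {
  mode: string;
  v1?: boolean;
  webhook?: string;
  crawl_id?: string;
}

function webhookPlanForCompletedJob(jobId: string, data: JobDataLike) {
  if (data.mode === "crawl" && !data.v1) {
    // v0 behaviour: one webhook per job, carrying the docs, fired without awaiting delivery.
    return { event: "v0 payload", target: jobId, awaited: false };
  }
  if (data.webhook && data.mode !== "crawl" && data.v1) {
    // v1 behaviour: a "crawl.page" event per page, awaited before the job is marked done.
    return { event: "crawl.page", target: data.crawl_id, awaited: true };
  }
  return null; // no webhook is sent for this job at this point
}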
@@ -240,7 +296,7 @@ async function processJob(job: Job, token: string) {

       await addCrawlJobDone(job.data.crawl_id, job.id);

-      const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;
+      const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;

       if (!job.data.sitemapped) {
         if (!sc.cancelled) {
@@ -250,13 +306,16 @@ async function processJob(job: Job, token: string) {
             crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl),
             Infinity,
             sc.crawlerOptions?.maxDepth ?? 10
-          )
+          );

           for (const link of links) {
             if (await lockURL(job.data.crawl_id, sc, link)) {

               // This seems to work really welel
-              const jobPriority = await getJobPriority({plan:sc.plan as PlanType, team_id: sc.team_id, basePriority: job.data.crawl_id ? 20 : 10})
+              const jobPriority = await getJobPriority({
+                plan: sc.plan as PlanType,
+                team_id: sc.team_id,
+                basePriority: job.data.crawl_id ? 20 : 10,
+              });
               const jobId = uuidv4();

               // console.log("plan: ", sc.plan);
@@ -264,16 +323,21 @@ async function processJob(job: Job, token: string) {
               // console.log("base priority: ", job.data.crawl_id ? 20 : 10)
               // console.log("job priority: " , jobPriority, "\n\n\n")

-              const newJob = await addScrapeJob({
-                url: link,
-                mode: "single_urls",
-                crawlerOptions: sc.crawlerOptions,
-                team_id: sc.team_id,
-                pageOptions: sc.pageOptions,
-                origin: job.data.origin,
-                crawl_id: job.data.crawl_id,
-                v1: job.data.v1,
-              }, {}, jobId, jobPriority);
+              const newJob = await addScrapeJob(
+                {
+                  url: link,
+                  mode: "single_urls",
+                  crawlerOptions: sc.crawlerOptions,
+                  team_id: sc.team_id,
+                  pageOptions: sc.pageOptions,
+                  origin: job.data.origin,
+                  crawl_id: job.data.crawl_id,
+                  v1: job.data.v1,
+                },
+                {},
+                jobId,
+                jobPriority
+              );

               await addCrawlJob(job.data.crawl_id, newJob.id);
             }
@@ -282,67 +346,98 @@ async function processJob(job: Job, token: string) {
       }

       if (await finishCrawl(job.data.crawl_id)) {
-        const jobIDs = await getCrawlJobs(job.data.crawl_id);
-
-        const jobs = (await Promise.all(jobIDs.map(async x => {
-          if (x === job.id) {
-            return {
-              async getState() {
-                return "completed"
-              },
-              timestamp: Date.now(),
-              returnvalue: docs,
-            }
-          }
-
-          const j = await getScrapeQueue().getJob(x);
-
-          if (process.env.USE_DB_AUTHENTICATION === "true") {
-            const supabaseData = await supabaseGetJobById(j.id);
-
-            if (supabaseData) {
-              j.returnvalue = supabaseData.docs;
-            }
-          }
-
-          return j;
-        }))).sort((a, b) => a.timestamp - b.timestamp);
-        const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
-        const jobStatus = sc.cancelled || jobStatuses.some(x => x === "failed") ? "failed" : "completed";
-
-        const fullDocs = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
-
-        await logJob({
-          job_id: job.data.crawl_id,
-          success: jobStatus === "completed",
-          message: sc.cancelled ? "Cancelled" : message,
-          num_docs: fullDocs.length,
-          docs: [],
-          time_taken: (Date.now() - sc.createdAt) / 1000,
-          team_id: job.data.team_id,
-          mode: "crawl",
-          url: sc.originUrl,
-          crawlerOptions: sc.crawlerOptions,
-          pageOptions: sc.pageOptions,
-          origin: job.data.origin,
-        });
-
-        const data = {
-          success: jobStatus !== "failed",
-          result: {
-            links: fullDocs.map((doc) => {
-              return {
-                content: doc,
-                source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
-              };
-            }),
-          },
-          project_id: job.data.project_id,
-          error: message /* etc... */,
-          docs: fullDocs,
-        };
-
-        await callWebhook(job.data.team_id, job.data.crawl_id, data, job.data.webhook, job.data.v1);
+        if (!job.data.v1) {
+          const jobIDs = await getCrawlJobs(job.data.crawl_id);
+
+          const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
+          const jobStatuses = await Promise.all(jobs.map((x) => x.getState()));
+          const jobStatus =
+            sc.cancelled || jobStatuses.some((x) => x === "failed")
+              ? "failed"
+              : "completed";
+
+          const fullDocs = jobs.map((x) =>
+            Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue
+          );
+
+          await logJob({
+            job_id: job.data.crawl_id,
+            success: jobStatus === "completed",
+            message: sc.cancelled ? "Cancelled" : message,
+            num_docs: fullDocs.length,
+            docs: [],
+            time_taken: (Date.now() - sc.createdAt) / 1000,
+            team_id: job.data.team_id,
+            mode: "crawl",
+            url: sc.originUrl,
+            crawlerOptions: sc.crawlerOptions,
+            pageOptions: sc.pageOptions,
+            origin: job.data.origin,
+          });
+
+          const data = {
+            success: jobStatus !== "failed",
+            result: {
+              links: fullDocs.map((doc) => {
+                return {
+                  content: doc,
+                  source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
+                };
+              }),
+            },
+            project_id: job.data.project_id,
+            error: message /* etc... */,
+            docs: fullDocs,
+          };
+
+          // v0 web hooks, call when done with all the data
+          if (!job.data.v1) {
+            callWebhook(
+              job.data.team_id,
+              job.data.crawl_id,
+              data,
+              job.data.webhook,
+              job.data.v1,
+              "crawl.completed"
+            );
+          }
+        } else {
+          const jobIDs = await getCrawlJobs(job.data.crawl_id);
+          const jobStatuses = await Promise.all(jobIDs.map((x) => getScrapeQueue().getJobState(x)));
+          const jobStatus =
+            sc.cancelled || jobStatuses.some((x) => x === "failed")
+              ? "failed"
+              : "completed";

+          // v1 web hooks, call when done with no data, but with event completed
+          if (job.data.v1 && job.data.webhook) {
+            callWebhook(
+              job.data.team_id,
+              job.data.crawl_id,
+              [],
+              job.data.webhook,
+              job.data.v1,
+              "crawl.completed"
+            );
+          }
+
+          await logJob({
+            job_id: job.data.crawl_id,
+            success: jobStatus === "completed",
+            message: sc.cancelled ? "Cancelled" : message,
+            num_docs: jobIDs.length,
+            docs: [],
+            time_taken: (Date.now() - sc.createdAt) / 1000,
+            team_id: job.data.team_id,
+            mode: "crawl",
+            url: sc.originUrl,
+            crawlerOptions: sc.crawlerOptions,
+            pageOptions: sc.pageOptions,
+            origin: job.data.origin,
+          });
+        }
       }
     }

@@ -353,9 +448,9 @@ async function processJob(job: Job, token: string) {

     Sentry.captureException(error, {
       data: {
-        job: job.id
+        job: job.id,
       },
-    })
+    });

     if (error instanceof CustomError) {
       // Here we handle the error, then save the failed job
@@ -385,8 +480,24 @@ async function processJob(job: Job, token: string) {
         "Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
     };

-    if (job.data.mode === "crawl" || job.data.crawl_id) {
-      await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data, job.data.webhook, job.data.v1);
+    if (!job.data.v1 && (job.data.mode === "crawl" || job.data.crawl_id)) {
+      callWebhook(
+        job.data.team_id,
+        job.data.crawl_id ?? (job.id as string),
+        data,
+        job.data.webhook,
+        job.data.v1
+      );
+    }
+    if (job.data.v1) {
+      callWebhook(
+        job.data.team_id,
+        job.id as string,
+        [],
+        job.data.webhook,
+        job.data.v1,
+        "crawl.failed"
+      );
     }

     if (job.data.crawl_id) {
@@ -396,7 +507,8 @@ async function processJob(job: Job, token: string) {
       message:
         typeof error === "string"
           ? error
-          : error.message ?? "Something went wrong... Contact help@mendable.ai",
+          : error.message ??
+            "Something went wrong... Contact help@mendable.ai",
       num_docs: 0,
       docs: [],
       time_taken: 0,
@@ -417,7 +529,8 @@ async function processJob(job: Job, token: string) {
       message:
         typeof error === "string"
           ? error
-          : error.message ?? "Something went wrong... Contact help@mendable.ai",
+          : error.message ??
+            "Something went wrong... Contact help@mendable.ai",
       num_docs: 0,
       docs: [],
       time_taken: 0,
@@ -1,11 +1,24 @@
+import axios from "axios";
 import { legacyDocumentConverter } from "../../src/controllers/v1/types";
 import { Logger } from "../../src/lib/logger";
 import { supabase_service } from "./supabase";
+import { WebhookEventType } from "../types";

-export const callWebhook = async (teamId: string, jobId: string, data: any, specified?: string, v1 = false) => {
+export const callWebhook = async (
+  teamId: string,
+  id: string,
+  data: any | null,
+  specified?: string,
+  v1 = false,
+  eventType: WebhookEventType = "crawl.page",
+  awaitWebhook: boolean = false
+) => {
   try {
-    const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL?.replace("{{JOB_ID}}", jobId);
-    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+    const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL?.replace(
+      "{{JOB_ID}}",
+      id
+    );
+    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
     let webhookUrl = specified ?? selfHostedUrl;

     // Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set
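The widened signature is what the rest of this commit leans on: callers now pick a `WebhookEventType` and decide whether delivery should block via `awaitWebhook`. Two representative invocations, modelled on the controller and worker hunks; the surrounding function and its parameter values are placeholders, not code from the diff:

// Assumes callWebhook is imported from this module ("../../services/webhook" in the controller).
async function notifyExamples(teamId: string, crawlId: string, webhookUrl: string, pageData: any) {
  // Controller-style: announce the crawl; delivery itself is fire-and-forget (awaitWebhook defaults to false).
  await callWebhook(teamId, crawlId, null, webhookUrl, true, "crawl.started");

  // Worker-style: per-page v1 event with awaitWebhook = true, so the POST is awaited.
  await callWebhook(teamId, crawlId, pageData, webhookUrl, true, "crawl.page", true);
}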
@@ -17,7 +30,9 @@ export const callWebhook = async (teamId: string, jobId: string, data: any, spec
         .eq("team_id", teamId)
         .limit(1);
       if (error) {
-        Logger.error(`Error fetching webhook URL for team ID: ${teamId}, error: ${error.message}`);
+        Logger.error(
+          `Error fetching webhook URL for team ID: ${teamId}, error: ${error.message}`
+        );
         return null;
       }

@@ -29,10 +44,17 @@ export const callWebhook = async (teamId: string, jobId: string, data: any, spec
     }

     let dataToSend = [];
-    if (data.result.links && data.result.links.length !== 0) {
+    if (
+      data &&
+      data.result &&
+      data.result.links &&
+      data.result.links.length !== 0
+    ) {
       for (let i = 0; i < data.result.links.length; i++) {
         if (v1) {
-          dataToSend.push(legacyDocumentConverter(data.result.links[i].content))
+          dataToSend.push(
+            legacyDocumentConverter(data.result.links[i].content)
+          );
         } else {
           dataToSend.push({
             content: data.result.links[i].content.content,
@@ -43,19 +65,72 @@ export const callWebhook = async (teamId: string, jobId: string, data: any, spec
         }
       }
     }

-    await fetch(webhookUrl, {
-      method: "POST",
-      headers: {
-        "Content-Type": "application/json",
-      },
-      body: JSON.stringify({
-        success: data.success,
-        jobId: jobId,
-        data: dataToSend,
-        error: data.error || undefined,
-      }),
-    });
+    if (awaitWebhook) {
+      try {
+        await axios.post(
+          webhookUrl,
+          {
+            success: !v1
+              ? data.success
+              : eventType === "crawl.page"
+              ? data.success
+              : true,
+            type: eventType,
+            [v1 ? "id" : "jobId"]: id,
+            data: dataToSend,
+            error: !v1
+              ? data?.error || undefined
+              : eventType === "crawl.page"
+              ? data?.error || undefined
+              : undefined,
+          },
+          {
+            headers: {
+              "Content-Type": "application/json",
+            },
+            timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1)
+          }
+        );
+      } catch (error) {
+        Logger.error(
+          `Axios error (0) sending webhook for team ID: ${teamId}, error: ${error.message}`
+        );
+      }
+    } else {
+      axios
+        .post(
+          webhookUrl,
+          {
+            success: !v1
+              ? data.success
+              : eventType === "crawl.page"
+              ? data.success
+              : true,
+            type: eventType,
+            [v1 ? "id" : "jobId"]: id,
+            data: dataToSend,
+            error: !v1
+              ? data?.error || undefined
+              : eventType === "crawl.page"
+              ? data?.error || undefined
+              : undefined,
+          },
+          {
+            headers: {
+              "Content-Type": "application/json",
+            },
+            timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1)
+          }
+        )
+        .catch((error) => {
+          Logger.error(
+            `Axios error sending webhook for team ID: ${teamId}, error: ${error.message}`
+          );
+        });
+    }
   } catch (error) {
-    Logger.debug(`Error sending webhook for team ID: ${teamId}, error: ${error.message}`);
+    Logger.debug(
+      `Error sending webhook for team ID: ${teamId}, error: ${error.message}`
+    );
   }
 };
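Both branches assemble the same payload shape: `success`, `type`, an `id` (v1) or `jobId` (v0) key, `data`, and an optional `error`, delivered with a 10 s (v1) or 30 s (v0) timeout. A hedged sketch of what a subscriber's endpoint might look like, assuming Express on the receiving side — none of this is part of the diff:

import express from "express";

const app = express();
app.use(express.json());

// Hypothetical receiver for the POSTs issued by callWebhook above.
app.post("/firecrawl-events", (req, res) => {
  const { success, type, id, jobId, data, error } = req.body;
  // v1 payloads carry `id`; v0 payloads carry `jobId` instead.
  console.log(`event=${type ?? "v0"} job=${id ?? jobId} success=${success}`, error ?? "");
  if (Array.isArray(data)) {
    console.log(`received ${data.length} document(s)`);
  }
  // Respond quickly: v1 deliveries time out after 10 seconds, v0 after 30.
  res.sendStatus(200);
});

app.listen(3000);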
@@ -154,3 +154,6 @@ export type PlanType =
   | "growthdouble"
   | "free"
   | "";
+
+
+export type WebhookEventType = "crawl.page" | "crawl.started" | "crawl.completed" | "crawl.failed";
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.2.0",
+  "version": "1.2.1",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "build/cjs/index.js",
   "types": "types/index.d.ts",
@@ -111,6 +111,7 @@ export interface CrawlParams {
   allowExternalLinks?: boolean;
   ignoreSitemap?: boolean;
   scrapeOptions?: ScrapeParams;
+  webhook?: string;
 }

 /**
apps/js-sdk/firecrawl/types/index.d.ts (vendored)
@@ -103,6 +103,7 @@ export interface CrawlParams {
   allowExternalLinks?: boolean;
   ignoreSitemap?: boolean;
   scrapeOptions?: ScrapeParams;
+  webhook?: string;
 }
 /**
  * Response interface for crawling operations.
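With `webhook` now part of `CrawlParams` in both the SDK source and the vendored type declarations, a crawl started through the JS SDK can route its events to a receiver. A sketch under the assumption that `FirecrawlApp.crawlUrl` accepts these params in 1.2.1; the API key, receiver URL, and other options are placeholders:

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" }); // placeholder key

// `webhook` is the field added to CrawlParams in this diff; the other options are illustrative.
const result = await app.crawlUrl("https://example.com", {
  limit: 10,
  webhook: "https://my-service.example.com/firecrawl-events",
});

console.log(result);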