Merge branch 'main' into v1-webscraper

Commit c917c8fbcd
.gitmodules (vendored, new file, 6 lines)
@@ -0,0 +1,6 @@
+[submodule "apps/go-sdk/firecrawl"]
+	path = apps/go-sdk/firecrawl
+	url = https://github.com/mendableai/firecrawl-go
+[submodule "apps/go-sdk/examples"]
+	path = apps/go-sdk/examples
+	url = https://github.com/mendableai/firecrawl-go-examples
@@ -24,8 +24,8 @@ kill_timeout = '30s'
 
 [http_service.concurrency]
   type = "requests"
-  hard_limit = 100
-  soft_limit = 50
+  # hard_limit = 100
+  soft_limit = 100
 
 [[http_service.checks]]
   grace_period = "10s"
@@ -51,8 +51,8 @@ kill_timeout = '30s'
 
 [services.concurrency]
   type = 'connections'
-  hard_limit = 25
-  soft_limit = 20
+  # hard_limit = 25
+  soft_limit = 100
 
 [[vm]]
   size = 'performance-2x'
@@ -24,8 +24,8 @@ kill_timeout = '30s'
 
 [http_service.concurrency]
   type = "requests"
-  hard_limit = 200
-  soft_limit = 75
+  # hard_limit = 200
+  soft_limit = 200
 
 [[http_service.checks]]
   grace_period = "20s"
@@ -50,8 +50,8 @@ kill_timeout = '30s'
 
 [services.concurrency]
   type = 'connections'
-  hard_limit = 30
-  soft_limit = 12
+  # hard_limit = 30
+  soft_limit = 200
 
 [[vm]]
   size = 'performance-4x'
@@ -10,7 +10,9 @@ import { createIdempotencyKey } from "../../src/services/idempotency/create";
 import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
-import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import { getScrapeQueue } from "../../src/services/queue-service";
+import { checkAndUpdateURL } from "../../src/lib/validateUrl";
 
 export async function crawlController(req: Request, res: Response) {
   try {
@@ -42,10 +44,17 @@ export async function crawlController(req: Request, res: Response) {
       return res.status(402).json({ error: "Insufficient credits" });
     }
 
-    const url = req.body.url;
+    let url = req.body.url;
     if (!url) {
       return res.status(400).json({ error: "Url is required" });
     }
+    try {
+      url = checkAndUpdateURL(url).url;
+    } catch (e) {
+      return res
+        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
+        .json({ error: e.message ?? e });
+    }
 
     if (isUrlBlocked(url)) {
       return res
@@ -94,41 +103,50 @@ export async function crawlController(req: Request, res: Response) {
 
     await logCrawl(id, team_id);
 
-    let robots;
-
-    try {
-      robots = await this.getRobotsTxt();
-    } catch (_) {}
-
     const sc: StoredCrawl = {
       originUrl: url,
       crawlerOptions,
       pageOptions,
       team_id,
-      robots,
       createdAt: Date.now(),
     };
 
-    await saveCrawl(id, sc);
-
     const crawler = crawlToCrawler(id, sc);
 
+    try {
+      sc.robots = await crawler.getRobotsTxt();
+    } catch (_) {}
+
+    await saveCrawl(id, sc);
+
     const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
 
     if (sitemap !== null) {
-      for (const url of sitemap.map(x => x.url)) {
-        await lockURL(id, sc, url);
-        const job = await addScrapeJob({
-          url,
-          mode: "single_urls",
-          crawlerOptions: crawlerOptions,
-          team_id: team_id,
-          pageOptions: pageOptions,
-          origin: req.body.origin ?? defaultOrigin,
-          crawl_id: id,
-          sitemapped: true,
-        });
-        await addCrawlJob(id, job.id);
-      }
+      const jobs = sitemap.map(x => {
+        const url = x.url;
+        const uuid = uuidv4();
+        return {
+          name: uuid,
+          data: {
+            url,
+            mode: "single_urls",
+            crawlerOptions: crawlerOptions,
+            team_id: team_id,
+            pageOptions: pageOptions,
+            origin: req.body.origin ?? defaultOrigin,
+            crawl_id: id,
+            sitemapped: true,
+          },
+          opts: {
+            jobId: uuid,
+            priority: 20,
+          }
+        };
+      })
+
+      await lockURLs(id, jobs.map(x => x.data.url));
+      await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
+      await getScrapeQueue().addBulk(jobs);
     } else {
       await lockURL(id, sc, url);
       const job = await addScrapeJob({
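Note: the { name, data, opts } objects built in the hunk above match the entry shape accepted by BullMQ's Queue.addBulk(), and pre-assigning opts.jobId is what lets the controller feed jobs.map(...) straight into lockURLs and addCrawlJobs. A minimal self-contained sketch of that pattern follows; the queue name, Redis connection, and payload fields are illustrative assumptions, not taken from this diff.

import { Queue } from "bullmq";
import { randomUUID } from "crypto";

// Assumed local Redis connection, for illustration only.
const scrapeQueue = new Queue("scrape", { connection: { host: "localhost", port: 6379 } });

async function enqueueSitemapUrls(crawlId: string, urls: string[]) {
  // One entry per URL; choosing the jobId ourselves lets callers track the jobs later.
  const entries = urls.map((url) => {
    const id = randomUUID();
    return {
      name: id,
      data: { url, mode: "single_urls", crawl_id: crawlId, sitemapped: true },
      opts: { jobId: id, priority: 20 },
    };
  });
  // addBulk enqueues all entries in one round trip instead of one add() call per URL.
  return scrapeQueue.addBulk(entries);
}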
@@ -139,6 +157,8 @@ export async function crawlController(req: Request, res: Response) {
         pageOptions: pageOptions,
         origin: req.body.origin ?? defaultOrigin,
         crawl_id: id,
+      }, {
+        priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
       });
       await addCrawlJob(id, job.id);
     }
@@ -6,6 +6,7 @@ import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
 import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
 import { addScrapeJob } from "../../src/services/queue-jobs";
+import { checkAndUpdateURL } from "../../src/lib/validateUrl";
 
 export async function crawlPreviewController(req: Request, res: Response) {
   try {
@@ -21,10 +22,17 @@ export async function crawlPreviewController(req: Request, res: Response) {
       return res.status(status).json({ error });
     }
 
-    const url = req.body.url;
+    let url = req.body.url;
     if (!url) {
       return res.status(400).json({ error: "Url is required" });
     }
+    try {
+      url = checkAndUpdateURL(url).url;
+    } catch (e) {
+      return res
+        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
+        .json({ error: e.message ?? e });
+    }
 
     if (isUrlBlocked(url)) {
       return res
@@ -81,6 +89,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
       pageOptions,
       team_id,
       robots,
+      createdAt: Date.now(),
     };
 
     await saveCrawl(id, sc);
@@ -45,7 +45,7 @@ export async function scrapeHelper(
     pageOptions,
     extractorOptions,
     origin: req.body.origin ?? defaultOrigin,
-  });
+  }, {}, jobId);
 
   let doc;
   try {
@@ -9,6 +9,7 @@ import { search } from "../search";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../lib/logger";
+import { getScrapeQueue, scrapeQueueEvents } from "../services/queue-service";
 
 export async function searchHelper(
   jobId: string,
@@ -75,26 +76,28 @@ export async function searchHelper(
 
   // filter out social media links
 
-  const a = new WebScraperDataProvider();
-  await a.setOptions({
-    jobId,
-    mode: "single_urls",
-    urls: res.map((r) => r.url).slice(0, Math.min(searchOptions.limit ?? 5, 5)),
-    crawlerOptions: {
-      ...crawlerOptions,
-    },
-    pageOptions: {
-      ...pageOptions,
-      onlyMainContent: pageOptions?.onlyMainContent ?? true,
-      fetchPageContent: pageOptions?.fetchPageContent ?? true,
-      includeHtml: pageOptions?.includeHtml ?? false,
-      removeTags: pageOptions?.removeTags ?? [],
-      fallback: false,
-    },
-  });
+  const jobDatas = res.map(x => {
+    const url = x.url;
+    const uuid = uuidv4();
+    return {
+      name: uuid,
+      data: {
+        url,
+        mode: "single_urls",
+        crawlerOptions: crawlerOptions,
+        team_id: team_id,
+        pageOptions: pageOptions,
+      },
+      opts: {
+        jobId: uuid,
+        priority: 10,
+      }
+    };
+  })
 
-  const docs = await a.getDocuments(false);
+  const jobs = await getScrapeQueue().addBulk(jobDatas);
+
+  const docs = (await Promise.all(jobs.map(x => x.waitUntilFinished(scrapeQueueEvents, 60000)))).map(x => x[0]);
 
   if (docs.length === 0) {
     return { success: true, error: "No search results found", returnCode: 200 };
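Note: Job.waitUntilFinished(queueEvents, ttl) is the standard BullMQ call the new search path relies on to turn the enqueued scrape jobs back into documents. A minimal sketch under assumed queue name, connection, and timeout:

import { Queue, QueueEvents } from "bullmq";

const connection = { host: "localhost", port: 6379 }; // assumed local Redis
const queue = new Queue("scrape", { connection });
const queueEvents = new QueueEvents("scrape", { connection });

async function scrapeAll(urls: string[]) {
  const jobs = await queue.addBulk(
    urls.map((url) => ({ name: "search-scrape", data: { url }, opts: { priority: 10 } }))
  );
  // Resolves with each job's return value, or rejects if a job fails or the 60s TTL expires.
  return Promise.all(jobs.map((job) => job.waitUntilFinished(queueEvents, 60000)));
}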
@@ -109,19 +112,6 @@ export async function searchHelper(
     return { success: true, error: "No page found", returnCode: 200, data: docs };
   }
 
-  const billingResult = await billTeam(
-    team_id,
-    filteredDocs.length
-  );
-  if (!billingResult.success) {
-    return {
-      success: false,
-      error:
-        "Failed to bill team. Insufficient credits or subscription not found.",
-      returnCode: 402,
-    };
-  }
-
   return {
     success: true,
     data: filteredDocs,
@@ -2,6 +2,7 @@ import { Request, Response } from "express";
 import { Logger } from "../../src/lib/logger";
 import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis";
 import { getScrapeQueue } from "../../src/services/queue-service";
+import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
 
 export async function crawlJobStatusPreviewController(req: Request, res: Response) {
   try {
@@ -21,7 +22,19 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
   //   }
   // }
 
-  const jobs = await Promise.all(jobIDs.map(x => getScrapeQueue().getJob(x)));
+  const jobs = (await Promise.all(jobIDs.map(async x => {
+    const job = await getScrapeQueue().getJob(x);
+
+    if (process.env.USE_DB_AUTHENTICATION === "true") {
+      const supabaseData = await supabaseGetJobById(job.id);
+
+      if (supabaseData) {
+        job.returnvalue = supabaseData.docs;
+      }
+    }
+
+    return job;
+  }))).sort((a, b) => a.timestamp - b.timestamp);
   const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
   const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
 
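Note: getJob(), getState(), and returnvalue used above are stock BullMQ job APIs; overwriting job.returnvalue with the docs fetched from Supabase presumably lets the controller serve results even when the queue no longer retains them. A small sketch of the lookup pattern (queue name and connection are assumptions):

import { Queue } from "bullmq";

const queue = new Queue("scrape", { connection: { host: "localhost", port: 6379 } });

async function getJobSnapshot(jobId: string) {
  const job = await queue.getJob(jobId);
  if (!job) return null;
  const state = await job.getState(); // e.g. "completed", "failed", "active", "waiting"
  // returnvalue holds whatever the worker's processor function returned for this job.
  return { id: job.id, state, result: job.returnvalue, enqueuedAt: job.timestamp };
}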
@@ -8,6 +8,7 @@ export type StoredCrawl = {
   team_id: string;
   robots?: string;
   cancelled?: boolean;
+  createdAt: number;
 };
 
 export async function saveCrawl(id: string, crawl: StoredCrawl) {
@@ -30,6 +31,11 @@ export async function addCrawlJob(id: string, job_id: string) {
   await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
 }
 
+export async function addCrawlJobs(id: string, job_ids: string[]) {
+  await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
+  await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
+}
+
 export async function addCrawlJobDone(id: string, job_id: string) {
   await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
   await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
@@ -54,6 +60,13 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
   return res;
 }
 
+/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
+export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
+  const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
+  await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
+  return res;
+}
+
 export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
   const crawler = new WebCrawler({
     jobId: id,
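Note: addCrawlJobs and lockURLs above rely on Redis SADD returning the number of members that were actually new, so the visited set doubles as a lock, while EXPIRE ... NX (Redis 7+) only sets a TTL when the key has none. A minimal sketch of the same idea with ioredis (connection details assumed):

import Redis from "ioredis";

const redis = new Redis(); // assumed default localhost:6379

// True only if the URL was not already in this crawl's visited set,
// so two workers racing on the same URL cannot both "win" it.
async function tryLockUrl(crawlId: string, url: string): Promise<boolean> {
  const added = await redis.sadd("crawl:" + crawlId + ":visited", url);
  await redis.expire("crawl:" + crawlId + ":visited", 24 * 60 * 60, "NX"); // keep an existing TTL if one is set
  return added !== 0;
}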
@@ -65,6 +65,7 @@ export type WebScraperOptions = {
   extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
   bullJobId?: string;
+  priority?: number;
 };
 
 export interface DocumentUrl {
@@ -49,6 +49,7 @@ export async function startWebScraperPipeline({
     },
     team_id: job.data.team_id,
     bull_job_id: job.id.toString(),
+    priority: job.opts.priority,
   })) as { success: boolean; message: string; docs: Document[] };
 }
 export async function runWebScraper({
@@ -62,6 +63,7 @@ export async function runWebScraper({
   onError,
   team_id,
   bull_job_id,
+  priority,
 }: RunWebScraperParams): Promise<RunWebScraperResult> {
   try {
     const provider = new WebScraperDataProvider();
@@ -74,6 +76,7 @@ export async function runWebScraper({
       crawlerOptions: crawlerOptions,
       pageOptions: pageOptions,
       bullJobId: bull_job_id,
+      priority,
     });
   } else {
     await provider.setOptions({
@@ -83,6 +86,7 @@ export async function runWebScraper({
       extractorOptions,
       crawlerOptions: crawlerOptions,
      pageOptions: pageOptions,
+      priority,
     });
   }
   const docs = (await provider.getDocuments(false, (progress: Progress) => {
@@ -24,7 +24,7 @@ describe('scrapSingleUrl', () => {
 });
 
 it('should return a list of links on the firecrawl.ai page', async () => {
-  const url = 'https://example.com';
+  const url = 'https://flutterbricks.com';
   const pageOptions: PageOptions = { includeHtml: true };
 
   const result = await scrapSingleUrl("TEST", url, pageOptions);
@@ -33,5 +33,5 @@ it('should return a list of links on the firecrawl.ai page', async () => {
   expect(result.linksOnPage).toBeDefined();
   expect(Array.isArray(result.linksOnPage)).toBe(true);
   expect(result.linksOnPage.length).toBeGreaterThan(0);
-  expect(result.linksOnPage).toContain('https://www.iana.org/domains/example')
+  expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
 }, 10000);
@@ -44,6 +44,7 @@ export class WebScraperDataProvider {
   private crawlerMode: string = "default";
   private allowBackwardCrawling: boolean = false;
   private allowExternalContentLinks: boolean = false;
+  private priority?: number;
 
   authorize(): void {
     throw new Error("Method not implemented.");
@@ -72,7 +73,8 @@ export class WebScraperDataProvider {
         url,
         this.pageOptions,
         this.extractorOptions,
-        existingHTML
+        existingHTML,
+        this.priority,
       );
       processedUrls++;
       if (inProgress) {
@@ -593,6 +595,7 @@ export class WebScraperDataProvider {
       options.crawlerOptions?.allowBackwardCrawling ?? false;
     this.allowExternalContentLinks =
       options.crawlerOptions?.allowExternalContentLinks ?? false;
+    this.priority = options.priority;
 
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
@@ -26,6 +26,7 @@ export async function scrapWithFireEngine({
   fireEngineOptions = {},
   headers,
   options,
+  priority,
 }: {
   url: string;
   waitFor?: number;
@@ -35,6 +36,7 @@ export async function scrapWithFireEngine({
   fireEngineOptions?: FireEngineOptions;
   headers?: Record<string, string>;
   options?: any;
+  priority?: number;
 }): Promise<FireEngineResponse> {
   const logParams = {
     url,
@@ -78,6 +80,7 @@ export async function scrapWithFireEngine({
       fullPageScreenshot: fullPageScreenshotParam,
       headers: headers,
       pageOptions: pageOptions,
+      priority,
       ...fireEngineOptionsParam,
     },
     {
@@ -134,7 +134,8 @@ export async function scrapSingleUrl(
   extractorOptions: ExtractorOptions = {
     mode: "llm-extraction-from-markdown",
   },
-  existingHtml: string = ""
+  existingHtml: string = "",
+  priority?: number,
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
 
@@ -177,7 +178,8 @@ export async function scrapSingleUrl(
           headers: pageOptions.headers,
           fireEngineOptions: {
             engine: engine,
-          }
+          },
+          priority,
         });
         scraperResponse.text = response.html;
         scraperResponse.screenshot = response.screenshot;
@@ -234,5 +234,13 @@ export const urlSpecificParams = {
       engine: "tlsclient",
     },
   },
   },
+  "zoopla.co.uk":{
+    defaultScraper: "fire-engine",
+    params:{
+      fireEngineOptions:{
+        engine: "chrome-cdp",
+      },
+    },
+  }
 };
@@ -49,7 +49,7 @@ export async function checkAlerts() {
   };
 
   const checkAll = async () => {
-    await checkActiveJobs();
+    // await checkActiveJobs();
     await checkWaitingQueue();
   };
 
@@ -44,7 +44,7 @@ export async function logJob(job: FirecrawlJob) {
       },
     ]);
 
-    if (process.env.POSTHOG_API_KEY) {
+    if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
      let phLog = {
        distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
        ...(job.team_id !== "preview" && {
@@ -65,7 +65,6 @@ export async function logJob(job: FirecrawlJob) {
           extractor_options: job.extractor_options,
           num_tokens: job.num_tokens,
           retry: job.retry,
-          crawl_id: job.crawl_id,
         },
       };
       posthog.capture(phLog);
@@ -9,9 +9,9 @@ export async function addScrapeJob(
   jobId: string = uuidv4(),
 ): Promise<Job> {
   return await getScrapeQueue().add(jobId, webScraperOptions, {
-    priority: webScraperOptions.crawl_id ? 20 : 10,
     ...options,
     jobId,
+    priority: webScraperOptions.crawl_id ? 2 : 1,
   });
 }
 
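Note on the priority values above: in BullMQ a lower priority number is dequeued first (1 is the most urgent), so under either scheme standalone scrapes stay ahead of crawl sub-jobs as long as their number is smaller. An illustrative sketch (queue name, connection, and payloads assumed):

import { Queue } from "bullmq";

const queue = new Queue("scrape", { connection: { host: "localhost", port: 6379 } });

async function enqueueExamples() {
  // Served first: the smaller priority value wins when a worker picks the next job.
  await queue.add("standalone-scrape", { url: "https://example.com" }, { priority: 1 });
  // Served after the scrape above, all else being equal.
  await queue.add("crawl-page", { url: "https://example.com/about" }, { priority: 2 });
}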
@@ -10,17 +10,15 @@ import { startWebScraperPipeline } from "../main/runWebScraper";
 import { callWebhook } from "./webhook";
 import { logJob } from "./logging/log_job";
 import { initSDK } from "@hyperdx/node-opentelemetry";
-import { Job, QueueEvents, tryCatch } from "bullmq";
+import { Job } from "bullmq";
 import { Logger } from "../lib/logger";
 import { ScrapeEvents } from "../lib/scrape-events";
 import { Worker } from "bullmq";
 import systemMonitor from "./system-monitor";
 import { v4 as uuidv4 } from "uuid";
-import { WebCrawler } from "../scraper/WebScraper/crawler";
-import { getAdjustedMaxDepth } from "../scraper/WebScraper/utils/maxDepthUtils";
-import { addCrawlJob, addCrawlJobDone, crawlToCrawler, getCrawl, isCrawlFinished, lockURL } from "../lib/crawl-redis";
+import { addCrawlJob, addCrawlJobDone, crawlToCrawler, getCrawl, getCrawlJobs, isCrawlFinished, lockURL } from "../lib/crawl-redis";
 import { StoredCrawl } from "../lib/crawl-redis";
 import { addScrapeJob } from "./queue-jobs";
+import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
 
 if (process.env.ENV === "production") {
   initSDK({
@@ -151,33 +149,33 @@ async function processJob(job: Job, token: string) {
       await callWebhook(job.data.team_id, job.id as string, data);
     }
 
-    await logJob({
-      job_id: job.id as string,
-      success: success,
-      message: message,
-      num_docs: docs.length,
-      docs: docs,
-      time_taken: timeTakenInSeconds,
-      team_id: job.data.team_id,
-      mode: job.data.mode,
-      url: job.data.url,
-      crawlerOptions: job.data.crawlerOptions,
-      pageOptions: job.data.pageOptions,
-      origin: job.data.origin,
-      crawl_id: job.data.crawl_id,
-    });
-
     if (job.data.crawl_id) {
+      await logJob({
+        job_id: job.id as string,
+        success: success,
+        message: message,
+        num_docs: docs.length,
+        docs: docs,
+        time_taken: timeTakenInSeconds,
+        team_id: job.data.team_id,
+        mode: job.data.mode,
+        url: job.data.url,
+        crawlerOptions: job.data.crawlerOptions,
+        pageOptions: job.data.pageOptions,
+        origin: job.data.origin,
+        crawl_id: job.data.crawl_id,
+      });
+
       await addCrawlJobDone(job.data.crawl_id, job.id);
 
-      if (!job.data.sitemapped) {
-        const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;
+      const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;
 
+      if (!job.data.sitemapped) {
         if (!sc.cancelled) {
           const crawler = crawlToCrawler(job.data.crawl_id, sc);
 
           const links = crawler.filterLinks((data.docs[0].linksOnPage as string[])
-            .map(href => crawler.filterURL(href, sc.originUrl))
+            .map(href => crawler.filterURL(href.trim(), sc.originUrl))
             .filter(x => x !== null),
           Infinity,
           sc.crawlerOptions?.maxDepth ?? 10
@@ -202,6 +200,66 @@ async function processJob(job: Job, token: string) {
         }
       }
 
+      if (await isCrawlFinished(job.data.crawl_id)) {
+        const jobIDs = await getCrawlJobs(job.data.crawl_id);
+
+        const jobs = (await Promise.all(jobIDs.map(async x => {
+          if (x === job.id) {
+            return {
+              async getState() {
+                return "completed"
+              },
+              timestamp: Date.now(),
+              returnvalue: docs,
+            }
+          }
+
+          const j = await getScrapeQueue().getJob(x);
+
+          if (process.env.USE_DB_AUTHENTICATION === "true") {
+            const supabaseData = await supabaseGetJobById(j.id);
+
+            if (supabaseData) {
+              j.returnvalue = supabaseData.docs;
+            }
+          }
+
+          return j;
+        }))).sort((a, b) => a.timestamp - b.timestamp);
+        const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
+        const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
+
+        const fullDocs = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
+
+        await logJob({
+          job_id: job.data.crawl_id,
+          success: jobStatus === "completed",
+          message: message,
+          num_docs: fullDocs.length,
+          docs: [],
+          time_taken: (Date.now() - sc.createdAt) / 1000,
+          team_id: job.data.team_id,
+          mode: "crawl",
+          url: sc.originUrl,
+          crawlerOptions: sc.crawlerOptions,
+          pageOptions: sc.pageOptions,
+          origin: job.data.origin,
+        });
+
+        const data = {
+          success: jobStatus !== "failed",
+          result: {
+            links: fullDocs.map((doc) => {
+              return {
+                content: doc,
+                source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
+              };
+            }),
+          },
+          project_id: job.data.project_id,
+          error: message /* etc... */,
+          docs: fullDocs,
+        };
+
+        await callWebhook(job.data.team_id, job.id as string, data);
+      }
     }
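Note: isCrawlFinished is not shown in this diff. Given the crawl:{id}:jobs and crawl:{id}:jobs_done sets maintained above, one plausible, purely hypothetical implementation is a size comparison; the sketch below assumes ioredis and reuses only the key names visible in the diff.

import Redis from "ioredis";

const redis = new Redis(); // assumed default connection

// Hypothetical: the crawl is done once every registered job id has been marked done.
async function isCrawlFinished(crawlId: string): Promise<boolean> {
  const [total, done] = await Promise.all([
    redis.scard("crawl:" + crawlId + ":jobs"),
    redis.scard("crawl:" + crawlId + ":jobs_done"),
  ]);
  return total > 0 && done === total;
}

The in-memory stand-in returned for the current job in the block above exists because that job has not yet been marked completed in the queue at the moment the check runs.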
@@ -222,6 +280,9 @@ async function processJob(job: Job, token: string) {
       });
     }
     Logger.error(error);
+    if (error.stack) {
+      Logger.error(error.stack);
+    }
 
     logtail.error("Overall error ingesting", {
       job_id: job.id,
@@ -235,27 +296,51 @@ async function processJob(job: Job, token: string) {
       error:
         "Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
     };
 
     if (job.data.mode === "crawl" || job.data.crawl_id) {
       await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data);
     }
-    await logJob({
-      job_id: job.id as string,
-      success: false,
-      message:
-        typeof error === "string"
-          ? error
-          : error.message ?? "Something went wrong... Contact help@mendable.ai",
-      num_docs: 0,
-      docs: [],
-      time_taken: 0,
-      team_id: job.data.team_id,
-      mode: job.data.mode,
-      url: job.data.url,
-      crawlerOptions: job.data.crawlerOptions,
-      pageOptions: job.data.pageOptions,
-      origin: job.data.origin,
-      crawl_id: job.data.crawl_id,
-    });
+
+    if (job.data.crawl_id) {
+      await logJob({
+        job_id: job.id as string,
+        success: false,
+        message:
+          typeof error === "string"
+            ? error
+            : error.message ?? "Something went wrong... Contact help@mendable.ai",
+        num_docs: 0,
+        docs: [],
+        time_taken: 0,
+        team_id: job.data.team_id,
+        mode: job.data.mode,
+        url: job.data.url,
+        crawlerOptions: job.data.crawlerOptions,
+        pageOptions: job.data.pageOptions,
+        origin: job.data.origin,
+        crawl_id: job.data.crawl_id,
+      });
+
+      const sc = await getCrawl(job.data.crawl_id);
+
+      await logJob({
+        job_id: job.data.crawl_id,
+        success: false,
+        message:
+          typeof error === "string"
+            ? error
+            : error.message ?? "Something went wrong... Contact help@mendable.ai",
+        num_docs: 0,
+        docs: [],
+        time_taken: 0,
+        team_id: job.data.team_id,
+        mode: "crawl",
+        url: sc ? sc.originUrl : job.data.url,
+        crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
+        pageOptions: sc ? sc.pageOptions : job.data.pageOptions,
+        origin: job.data.origin,
+      });
+    }
     // done(null, data);
     return data;
   }
@@ -43,6 +43,7 @@ export interface RunWebScraperParams {
   onError: (error: Error) => void;
   team_id: string;
   bull_job_id: string;
+  priority?: number;
 }
 
 export interface RunWebScraperResult {
apps/go-sdk/examples/.gitignore (vendored, new file, 25 lines)
@@ -0,0 +1,25 @@
+# If you prefer the allow list template instead of the deny list, see community template:
+# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
+#
+# Binaries for programs and plugins
+*.exe
+*.exe~
+*.dll
+*.so
+*.dylib
+
+# Test binary, built with `go test -c`
+*.test
+
+# Output of the go coverage tool, specifically when used with LiteIDE
+*.out
+
+# Dependency directories (remove the comment below to include it)
+# vendor/
+
+# Go workspace file
+go.work
+go.work.sum
+
+# env file
+.env
apps/go-sdk/examples/LICENSE (new file, 21 lines)
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Mendable
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -6,11 +6,11 @@ import (
 	"log"
 
 	"github.com/google/uuid"
-	"github.com/mendableai/firecrawl/firecrawl"
+	"github.com/mendableai/firecrawl-go"
 )
 
 func main() {
-	app, err := firecrawl.NewFirecrawlApp("fc-YOUR-API-KEY", "http://localhost:3002")
+	app, err := firecrawl.NewFirecrawlApp("fc-YOUR_API_KEY", "https://api.firecrawl.dev")
 	if err != nil {
 		log.Fatalf("Failed to create FirecrawlApp: %v", err)
 	}
@@ -1,10 +1,9 @@
-module github.com/mendableai/firecrawl/apps/go-sdk/examples
+module github.com/mendableai/firecrawl-go-examples
 
 go 1.22.5
 
 replace github.com/mendableai/firecrawl => ../
 
-require (
-	github.com/google/uuid v1.6.0
-	github.com/mendableai/firecrawl v0.0.0-00010101000000-000000000000
-)
+require github.com/google/uuid v1.6.0
+
+require github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 // indirect
@@ -4,6 +4,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
 github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
+github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 h1:461um7fbSQYj2E3ETl8GINuRg5MTY3BdjMnogwUIhBs=
+github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46/go.mod h1:mTGbJ37fy43aaqonp/tdpzCH516jHFw/XVvfFi4QXHo=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
apps/go-sdk/firecrawl/.gitignore (vendored, new file, 2 lines)
@@ -0,0 +1,2 @@
+.env
+vendor
@@ -1,4 +1,4 @@
-module github.com/mendableai/firecrawl/apps/go-sdk
+module github.com/mendableai/firecrawl-go
 
 go 1.22.5
 
@@ -24,6 +24,7 @@
   "devDependencies": {
     "@types/jest": "^29.5.12",
     "@types/supertest": "^6.0.2",
+    "artillery": "^2.0.19",
     "typescript": "^5.4.5"
   }
 }
apps/test-suite/pnpm-lock.yaml (generated, 6884 lines): diff suppressed because it is too large.
@@ -1,5 +1,4 @@
 name: firecrawl
-version: '3.9'
 
 x-common-service: &common-service
   build: apps/api