Merge branch 'main' into v1-webscraper

Nicolas 2024-08-15 15:14:29 -04:00
commit c917c8fbcd
35 changed files with 7190 additions and 156 deletions

.gitmodules (new file)

@@ -0,0 +1,6 @@
+[submodule "apps/go-sdk/firecrawl"]
+	path = apps/go-sdk/firecrawl
+	url = https://github.com/mendableai/firecrawl-go
+[submodule "apps/go-sdk/examples"]
+	path = apps/go-sdk/examples
+	url = https://github.com/mendableai/firecrawl-go-examples

@@ -24,8 +24,8 @@ kill_timeout = '30s'
   [http_service.concurrency]
     type = "requests"
-    hard_limit = 100
-    soft_limit = 50
+    # hard_limit = 100
+    soft_limit = 100

 [[http_service.checks]]
   grace_period = "10s"

@@ -51,8 +51,8 @@ kill_timeout = '30s'
   [services.concurrency]
     type = 'connections'
-    hard_limit = 25
-    soft_limit = 20
+    # hard_limit = 25
+    soft_limit = 100

 [[vm]]
   size = 'performance-2x'

@@ -24,8 +24,8 @@ kill_timeout = '30s'
   [http_service.concurrency]
     type = "requests"
-    hard_limit = 200
-    soft_limit = 75
+    # hard_limit = 200
+    soft_limit = 200

 [[http_service.checks]]
   grace_period = "20s"

@@ -50,8 +50,8 @@ kill_timeout = '30s'
   [services.concurrency]
     type = 'connections'
-    hard_limit = 30
-    soft_limit = 12
+    # hard_limit = 30
+    soft_limit = 200

 [[vm]]
   size = 'performance-4x'

@@ -10,7 +10,9 @@ import { createIdempotencyKey } from "../../src/services/idempotency/create";
 import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
-import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import { getScrapeQueue } from "../../src/services/queue-service";
+import { checkAndUpdateURL } from "../../src/lib/validateUrl";

 export async function crawlController(req: Request, res: Response) {
   try {
@@ -42,10 +44,17 @@ export async function crawlController(req: Request, res: Response) {
       return res.status(402).json({ error: "Insufficient credits" });
     }

-    const url = req.body.url;
+    let url = req.body.url;
     if (!url) {
       return res.status(400).json({ error: "Url is required" });
     }

+    try {
+      url = checkAndUpdateURL(url).url;
+    } catch (e) {
+      return res
+        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
+        .json({ error: e.message ?? e });
+    }
+
     if (isUrlBlocked(url)) {
       return res
@@ -94,41 +103,50 @@ export async function crawlController(req: Request, res: Response) {
     await logCrawl(id, team_id);

-    let robots;
-    try {
-      robots = await this.getRobotsTxt();
-    } catch (_) {}
-
     const sc: StoredCrawl = {
       originUrl: url,
       crawlerOptions,
       pageOptions,
       team_id,
-      robots,
+      createdAt: Date.now(),
     };

-    await saveCrawl(id, sc);
-
     const crawler = crawlToCrawler(id, sc);

+    try {
+      sc.robots = await crawler.getRobotsTxt();
+    } catch (_) {}
+
+    await saveCrawl(id, sc);
+
     const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();

     if (sitemap !== null) {
-      for (const url of sitemap.map(x => x.url)) {
-        await lockURL(id, sc, url);
-        const job = await addScrapeJob({
-          url,
-          mode: "single_urls",
-          crawlerOptions: crawlerOptions,
-          team_id: team_id,
-          pageOptions: pageOptions,
-          origin: req.body.origin ?? defaultOrigin,
-          crawl_id: id,
-          sitemapped: true,
-        });
-        await addCrawlJob(id, job.id);
-      }
+      const jobs = sitemap.map(x => {
+        const url = x.url;
+        const uuid = uuidv4();
+        return {
+          name: uuid,
+          data: {
+            url,
+            mode: "single_urls",
+            crawlerOptions: crawlerOptions,
+            team_id: team_id,
+            pageOptions: pageOptions,
+            origin: req.body.origin ?? defaultOrigin,
+            crawl_id: id,
+            sitemapped: true,
+          },
+          opts: {
+            jobId: uuid,
+            priority: 20,
+          }
+        };
+      })
+
+      await lockURLs(id, jobs.map(x => x.data.url));
+      await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
+      await getScrapeQueue().addBulk(jobs);
     } else {
       await lockURL(id, sc, url);
       const job = await addScrapeJob({
@@ -139,6 +157,8 @@ export async function crawlController(req: Request, res: Response) {
         pageOptions: pageOptions,
         origin: req.body.origin ?? defaultOrigin,
         crawl_id: id,
+      }, {
+        priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
       });
       await addCrawlJob(id, job.id);
     }

@@ -6,6 +6,7 @@ import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
 import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
 import { addScrapeJob } from "../../src/services/queue-jobs";
+import { checkAndUpdateURL } from "../../src/lib/validateUrl";

 export async function crawlPreviewController(req: Request, res: Response) {
   try {
@@ -21,10 +22,17 @@ export async function crawlPreviewController(req: Request, res: Response) {
       return res.status(status).json({ error });
     }

-    const url = req.body.url;
+    let url = req.body.url;
     if (!url) {
       return res.status(400).json({ error: "Url is required" });
     }

+    try {
+      url = checkAndUpdateURL(url).url;
+    } catch (e) {
+      return res
+        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
+        .json({ error: e.message ?? e });
+    }
+
     if (isUrlBlocked(url)) {
       return res
@@ -81,6 +89,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
       pageOptions,
       team_id,
       robots,
+      createdAt: Date.now(),
     };

     await saveCrawl(id, sc);

@@ -45,7 +45,7 @@ export async function scrapeHelper(
     pageOptions,
     extractorOptions,
     origin: req.body.origin ?? defaultOrigin,
-  });
+  }, {}, jobId);

   let doc;
   try {

@@ -9,6 +9,7 @@ import { search } from "../search";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../lib/logger";
+import { getScrapeQueue, scrapeQueueEvents } from "../services/queue-service";

 export async function searchHelper(
   jobId: string,
@@ -75,26 +76,28 @@ export async function searchHelper(
   // filter out social media links

+  const jobDatas = res.map(x => {
+    const url = x.url;
+    const uuid = uuidv4();
+    return {
+      name: uuid,
+      data: {
+        url,
+        mode: "single_urls",
+        crawlerOptions: crawlerOptions,
+        team_id: team_id,
+        pageOptions: pageOptions,
+      },
+      opts: {
+        jobId: uuid,
+        priority: 10,
+      }
+    };
+  })
+
-  const a = new WebScraperDataProvider();
-  await a.setOptions({
-    jobId,
-    mode: "single_urls",
-    urls: res.map((r) => r.url).slice(0, Math.min(searchOptions.limit ?? 5, 5)),
-    crawlerOptions: {
-      ...crawlerOptions,
-    },
-    pageOptions: {
-      ...pageOptions,
-      onlyMainContent: pageOptions?.onlyMainContent ?? true,
-      fetchPageContent: pageOptions?.fetchPageContent ?? true,
-      includeHtml: pageOptions?.includeHtml ?? false,
-      removeTags: pageOptions?.removeTags ?? [],
-      fallback: false,
-    },
-  });
+  const jobs = await getScrapeQueue().addBulk(jobDatas);

-  const docs = await a.getDocuments(false);
+  const docs = (await Promise.all(jobs.map(x => x.waitUntilFinished(scrapeQueueEvents, 60000)))).map(x => x[0]);

   if (docs.length === 0) {
     return { success: true, error: "No search results found", returnCode: 200 };
@@ -109,19 +112,6 @@ export async function searchHelper(
     return { success: true, error: "No page found", returnCode: 200, data: docs };
   }

-  const billingResult = await billTeam(
-    team_id,
-    filteredDocs.length
-  );
-  if (!billingResult.success) {
-    return {
-      success: false,
-      error:
-        "Failed to bill team. Insufficient credits or subscription not found.",
-      returnCode: 402,
-    };
-  }
-
   return {
     success: true,
     data: filteredDocs,

@@ -2,6 +2,7 @@ import { Request, Response } from "express";
 import { Logger } from "../../src/lib/logger";
 import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis";
 import { getScrapeQueue } from "../../src/services/queue-service";
+import { supabaseGetJobById } from "../../src/lib/supabase-jobs";

 export async function crawlJobStatusPreviewController(req: Request, res: Response) {
   try {
@@ -21,7 +22,19 @@ export async function crawlJobStatusPreviewController(req: Request, res: Response) {
     //   }
     // }

-    const jobs = await Promise.all(jobIDs.map(x => getScrapeQueue().getJob(x)));
+    const jobs = (await Promise.all(jobIDs.map(async x => {
+      const job = await getScrapeQueue().getJob(x);
+
+      if (process.env.USE_DB_AUTHENTICATION === "true") {
+        const supabaseData = await supabaseGetJobById(job.id);
+
+        if (supabaseData) {
+          job.returnvalue = supabaseData.docs;
+        }
+      }
+
+      return job;
+    }))).sort((a, b) => a.timestamp - b.timestamp);

     const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
     const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";

@@ -8,6 +8,7 @@ export type StoredCrawl = {
   team_id: string;
   robots?: string;
   cancelled?: boolean;
+  createdAt: number;
 };

 export async function saveCrawl(id: string, crawl: StoredCrawl) {
@@ -30,6 +31,11 @@ export async function addCrawlJob(id: string, job_id: string) {
   await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
 }

+export async function addCrawlJobs(id: string, job_ids: string[]) {
+  await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
+  await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
+}
+
 export async function addCrawlJobDone(id: string, job_id: string) {
   await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
   await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
@@ -54,6 +60,13 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
   return res;
 }

+/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
+export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
+  const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
+  await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
+  return res;
+}
+
 export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
   const crawler = new WebCrawler({
     jobId: id,
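For orientation, here is a hedged usage sketch of the two bulk helpers added above, combined with BullMQ's addBulk the same way the crawl controller earlier in this diff does it. The wrapper function and the import paths are illustrative only, not code from the repository.

// Sketch: lock all URLs, register the job IDs against the crawl, then enqueue in bulk.
import { v4 as uuidv4 } from "uuid";
import { lockURLs, addCrawlJobs } from "../lib/crawl-redis";        // assumed path
import { getScrapeQueue } from "../services/queue-service";          // assumed path

async function enqueueSitemapUrls(crawlId: string, urls: string[], teamId: string) {
  const jobs = urls.map((url) => {
    const jobId = uuidv4();
    return {
      name: jobId,
      data: { url, mode: "single_urls", team_id: teamId, crawl_id: crawlId, sitemapped: true },
      opts: { jobId, priority: 20 },
    };
  });

  // lockURLs marks every URL as visited with a single SADD; it deliberately skips the
  // per-URL page-limit check, so the caller must have bounded `urls` beforehand
  // (here, by the sitemap), exactly as the NOTE comment above warns.
  await lockURLs(crawlId, jobs.map((j) => j.data.url));
  await addCrawlJobs(crawlId, jobs.map((j) => j.opts.jobId));
  await getScrapeQueue().addBulk(jobs);
}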

@@ -65,6 +65,7 @@ export type WebScraperOptions = {
   extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
   bullJobId?: string;
+  priority?: number;
 };

 export interface DocumentUrl {

@@ -49,6 +49,7 @@ export async function startWebScraperPipeline({
     },
     team_id: job.data.team_id,
     bull_job_id: job.id.toString(),
+    priority: job.opts.priority,
   })) as { success: boolean; message: string; docs: Document[] };
 }

 export async function runWebScraper({
@@ -62,6 +63,7 @@
   onError,
   team_id,
   bull_job_id,
+  priority,
 }: RunWebScraperParams): Promise<RunWebScraperResult> {
   try {
     const provider = new WebScraperDataProvider();
@@ -74,6 +76,7 @@
         crawlerOptions: crawlerOptions,
         pageOptions: pageOptions,
         bullJobId: bull_job_id,
+        priority,
       });
     } else {
       await provider.setOptions({
@@ -83,6 +86,7 @@
         extractorOptions,
         crawlerOptions: crawlerOptions,
         pageOptions: pageOptions,
+        priority,
       });
     }
     const docs = (await provider.getDocuments(false, (progress: Progress) => {

@@ -24,7 +24,7 @@ describe('scrapSingleUrl', () => {
 });

 it('should return a list of links on the firecrawl.ai page', async () => {
-  const url = 'https://example.com';
+  const url = 'https://flutterbricks.com';
   const pageOptions: PageOptions = { includeHtml: true };

   const result = await scrapSingleUrl("TEST", url, pageOptions);
@@ -33,5 +33,5 @@ it('should return a list of links on the firecrawl.ai page', async () => {
   expect(result.linksOnPage).toBeDefined();
   expect(Array.isArray(result.linksOnPage)).toBe(true);
   expect(result.linksOnPage.length).toBeGreaterThan(0);
-  expect(result.linksOnPage).toContain('https://www.iana.org/domains/example')
+  expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
 }, 10000);

@@ -44,6 +44,7 @@ export class WebScraperDataProvider {
   private crawlerMode: string = "default";
   private allowBackwardCrawling: boolean = false;
   private allowExternalContentLinks: boolean = false;
+  private priority?: number;

   authorize(): void {
     throw new Error("Method not implemented.");
@@ -72,7 +73,8 @@ export class WebScraperDataProvider {
           url,
           this.pageOptions,
           this.extractorOptions,
-          existingHTML
+          existingHTML,
+          this.priority,
         );
         processedUrls++;
         if (inProgress) {
@@ -593,6 +595,7 @@ export class WebScraperDataProvider {
       options.crawlerOptions?.allowBackwardCrawling ?? false;
     this.allowExternalContentLinks =
       options.crawlerOptions?.allowExternalContentLinks ?? false;
+    this.priority = options.priority;

     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {

@@ -26,6 +26,7 @@ export async function scrapWithFireEngine({
   fireEngineOptions = {},
   headers,
   options,
+  priority,
 }: {
   url: string;
   waitFor?: number;
@@ -35,6 +36,7 @@ export async function scrapWithFireEngine({
   fireEngineOptions?: FireEngineOptions;
   headers?: Record<string, string>;
   options?: any;
+  priority?: number;
 }): Promise<FireEngineResponse> {
   const logParams = {
     url,
@@ -78,6 +80,7 @@ export async function scrapWithFireEngine({
         fullPageScreenshot: fullPageScreenshotParam,
         headers: headers,
         pageOptions: pageOptions,
+        priority,
         ...fireEngineOptionsParam,
       },
       {

@@ -134,7 +134,8 @@ export async function scrapSingleUrl(
   extractorOptions: ExtractorOptions = {
     mode: "llm-extraction-from-markdown",
   },
-  existingHtml: string = ""
+  existingHtml: string = "",
+  priority?: number,
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
@@ -177,7 +178,8 @@ export async function scrapSingleUrl(
           headers: pageOptions.headers,
           fireEngineOptions: {
             engine: engine,
-          }
+          },
+          priority,
         });
         scraperResponse.text = response.html;
         scraperResponse.screenshot = response.screenshot;

@@ -234,5 +234,13 @@ export const urlSpecificParams = {
       engine: "tlsclient",
     },
   },
+  },
+  "zoopla.co.uk":{
+    defaultScraper: "fire-engine",
+    params:{
+      fireEngineOptions:{
+        engine: "chrome-cdp",
+      },
+    },
   }
 };

@@ -49,7 +49,7 @@ export async function checkAlerts() {
   };

   const checkAll = async () => {
-    await checkActiveJobs();
+    // await checkActiveJobs();
     await checkWaitingQueue();
   };

@@ -44,7 +44,7 @@ export async function logJob(job: FirecrawlJob) {
       },
     ]);

-  if (process.env.POSTHOG_API_KEY) {
+  if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
     let phLog = {
       distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
       ...(job.team_id !== "preview" && {
@@ -65,7 +65,6 @@ export async function logJob(job: FirecrawlJob) {
         extractor_options: job.extractor_options,
         num_tokens: job.num_tokens,
         retry: job.retry,
-        crawl_id: job.crawl_id,
       },
     };
     posthog.capture(phLog);

@@ -9,9 +9,9 @@ export async function addScrapeJob(
   jobId: string = uuidv4(),
 ): Promise<Job> {
   return await getScrapeQueue().add(jobId, webScraperOptions, {
+    priority: webScraperOptions.crawl_id ? 20 : 10,
     ...options,
     jobId,
-    priority: webScraperOptions.crawl_id ? 2 : 1,
   });
 }
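Note on the change above: in BullMQ a lower priority number is dequeued first, so with this default a standalone scrape (10) is picked up ahead of a crawl sub-job (20), and because the default now sits before the ...options spread, a caller-supplied priority (such as the 15 used for the first request of a crawl in the controller diff) overrides it. A self-contained sketch of that ordering follows; the queue name "scrapeQueue" and the local Redis connection are assumptions, not values from this diff.

// Sketch of BullMQ priority ordering: lower numbers win when both jobs are waiting.
import { Queue, Worker } from "bullmq";

const connection = { host: "localhost", port: 6379 }; // assumed Redis location
const queue = new Queue("scrapeQueue", { connection });

async function enqueue() {
  await queue.add("crawl-sub-job", { url: "https://example.com/a" }, { priority: 20 });
  await queue.add("scrape-request", { url: "https://example.com/b" }, { priority: 10 });
}

// With both jobs queued before the worker starts, "scrape-request" is processed first.
new Worker(
  "scrapeQueue",
  async (job) => {
    console.log(`processing ${job.name} (priority ${job.opts.priority})`);
  },
  { connection }
);

enqueue().catch((err) => console.error(err));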

@@ -10,17 +10,15 @@ import { startWebScraperPipeline } from "../main/runWebScraper";
 import { callWebhook } from "./webhook";
 import { logJob } from "./logging/log_job";
 import { initSDK } from "@hyperdx/node-opentelemetry";
-import { Job, QueueEvents, tryCatch } from "bullmq";
+import { Job } from "bullmq";
 import { Logger } from "../lib/logger";
-import { ScrapeEvents } from "../lib/scrape-events";
 import { Worker } from "bullmq";
 import systemMonitor from "./system-monitor";
 import { v4 as uuidv4 } from "uuid";
-import { WebCrawler } from "../scraper/WebScraper/crawler";
-import { getAdjustedMaxDepth } from "../scraper/WebScraper/utils/maxDepthUtils";
-import { addCrawlJob, addCrawlJobDone, crawlToCrawler, getCrawl, isCrawlFinished, lockURL } from "../lib/crawl-redis";
+import { addCrawlJob, addCrawlJobDone, crawlToCrawler, getCrawl, getCrawlJobs, isCrawlFinished, lockURL } from "../lib/crawl-redis";
 import { StoredCrawl } from "../lib/crawl-redis";
 import { addScrapeJob } from "./queue-jobs";
+import { supabaseGetJobById } from "../../src/lib/supabase-jobs";

 if (process.env.ENV === "production") {
   initSDK({
@@ -151,33 +149,33 @@ async function processJob(job: Job, token: string) {
       await callWebhook(job.data.team_id, job.id as string, data);
     }

-    await logJob({
-      job_id: job.id as string,
-      success: success,
-      message: message,
-      num_docs: docs.length,
-      docs: docs,
-      time_taken: timeTakenInSeconds,
-      team_id: job.data.team_id,
-      mode: job.data.mode,
-      url: job.data.url,
-      crawlerOptions: job.data.crawlerOptions,
-      pageOptions: job.data.pageOptions,
-      origin: job.data.origin,
-      crawl_id: job.data.crawl_id,
-    });
-
     if (job.data.crawl_id) {
+      await logJob({
+        job_id: job.id as string,
+        success: success,
+        message: message,
+        num_docs: docs.length,
+        docs: docs,
+        time_taken: timeTakenInSeconds,
+        team_id: job.data.team_id,
+        mode: job.data.mode,
+        url: job.data.url,
+        crawlerOptions: job.data.crawlerOptions,
+        pageOptions: job.data.pageOptions,
+        origin: job.data.origin,
+        crawl_id: job.data.crawl_id,
+      });
+
       await addCrawlJobDone(job.data.crawl_id, job.id);

-      if (!job.data.sitemapped) {
-        const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;
+      const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;

+      if (!job.data.sitemapped) {
         if (!sc.cancelled) {
           const crawler = crawlToCrawler(job.data.crawl_id, sc);

           const links = crawler.filterLinks((data.docs[0].linksOnPage as string[])
-            .map(href => crawler.filterURL(href, sc.originUrl))
+            .map(href => crawler.filterURL(href.trim(), sc.originUrl))
             .filter(x => x !== null),
             Infinity,
             sc.crawlerOptions?.maxDepth ?? 10
@@ -202,6 +200,66 @@ async function processJob(job: Job, token: string) {
       }

       if (await isCrawlFinished(job.data.crawl_id)) {
+        const jobIDs = await getCrawlJobs(job.data.crawl_id);
+
+        const jobs = (await Promise.all(jobIDs.map(async x => {
+          if (x === job.id) {
+            return {
+              async getState() {
+                return "completed"
+              },
+              timestamp: Date.now(),
+              returnvalue: docs,
+            }
+          }
+
+          const j = await getScrapeQueue().getJob(x);
+
+          if (process.env.USE_DB_AUTHENTICATION === "true") {
+            const supabaseData = await supabaseGetJobById(j.id);
+
+            if (supabaseData) {
+              j.returnvalue = supabaseData.docs;
+            }
+          }
+
+          return j;
+        }))).sort((a, b) => a.timestamp - b.timestamp);
+
+        const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
+        const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
+
+        const fullDocs = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
+
+        await logJob({
+          job_id: job.data.crawl_id,
+          success: jobStatus === "completed",
+          message: message,
+          num_docs: fullDocs.length,
+          docs: [],
+          time_taken: (Date.now() - sc.createdAt) / 1000,
+          team_id: job.data.team_id,
+          mode: "crawl",
+          url: sc.originUrl,
+          crawlerOptions: sc.crawlerOptions,
+          pageOptions: sc.pageOptions,
+          origin: job.data.origin,
+        });
+
+        const data = {
+          success: jobStatus !== "failed",
+          result: {
+            links: fullDocs.map((doc) => {
+              return {
+                content: doc,
+                source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
+              };
+            }),
+          },
+          project_id: job.data.project_id,
+          error: message /* etc... */,
+          docs: fullDocs,
+        };
+
         await callWebhook(job.data.team_id, job.id as string, data);
       }
     }
@@ -222,6 +280,9 @@ async function processJob(job: Job, token: string) {
      });
    }

    Logger.error(error);
+   if (error.stack) {
+     Logger.error(error.stack);
+   }

    logtail.error("Overall error ingesting", {
      job_id: job.id,
@@ -235,27 +296,51 @@ async function processJob(job: Job, token: string) {
      error:
        "Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
    };

    if (job.data.mode === "crawl" || job.data.crawl_id) {
      await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data);
    }

-   await logJob({
-     job_id: job.id as string,
-     success: false,
-     message:
-       typeof error === "string"
-         ? error
-         : error.message ?? "Something went wrong... Contact help@mendable.ai",
-     num_docs: 0,
-     docs: [],
-     time_taken: 0,
-     team_id: job.data.team_id,
-     mode: job.data.mode,
-     url: job.data.url,
-     crawlerOptions: job.data.crawlerOptions,
-     pageOptions: job.data.pageOptions,
-     origin: job.data.origin,
-     crawl_id: job.data.crawl_id,
-   });
+   if (job.data.crawl_id) {
+     await logJob({
+       job_id: job.id as string,
+       success: false,
+       message:
+         typeof error === "string"
+           ? error
+           : error.message ?? "Something went wrong... Contact help@mendable.ai",
+       num_docs: 0,
+       docs: [],
+       time_taken: 0,
+       team_id: job.data.team_id,
+       mode: job.data.mode,
+       url: job.data.url,
+       crawlerOptions: job.data.crawlerOptions,
+       pageOptions: job.data.pageOptions,
+       origin: job.data.origin,
+       crawl_id: job.data.crawl_id,
+     });
+
+     const sc = await getCrawl(job.data.crawl_id);
+
+     await logJob({
+       job_id: job.data.crawl_id,
+       success: false,
+       message:
+         typeof error === "string"
+           ? error
+           : error.message ?? "Something went wrong... Contact help@mendable.ai",
+       num_docs: 0,
+       docs: [],
+       time_taken: 0,
+       team_id: job.data.team_id,
+       mode: "crawl",
+       url: sc ? sc.originUrl : job.data.url,
+       crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
+       pageOptions: sc ? sc.pageOptions : job.data.pageOptions,
+       origin: job.data.origin,
+     });
+   }
+
    // done(null, data);
    return data;
  }

@@ -43,6 +43,7 @@ export interface RunWebScraperParams {
   onError: (error: Error) => void;
   team_id: string;
   bull_job_id: string;
+  priority?: number;
 }

 export interface RunWebScraperResult {

apps/go-sdk/examples/.gitignore (new file)

@@ -0,0 +1,25 @@
+# If you prefer the allow list template instead of the deny list, see community template:
+# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
+#
+# Binaries for programs and plugins
+*.exe
+*.exe~
+*.dll
+*.so
+*.dylib
+
+# Test binary, built with `go test -c`
+*.test
+
+# Output of the go coverage tool, specifically when used with LiteIDE
+*.out
+
+# Dependency directories (remove the comment below to include it)
+# vendor/
+
+# Go workspace file
+go.work
+go.work.sum
+
+# env file
+.env

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Mendable
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

@@ -6,11 +6,11 @@ import (
 	"log"

 	"github.com/google/uuid"
-	"github.com/mendableai/firecrawl/firecrawl"
+	"github.com/mendableai/firecrawl-go"
 )

 func main() {
-	app, err := firecrawl.NewFirecrawlApp("fc-YOUR-API-KEY", "http://localhost:3002")
+	app, err := firecrawl.NewFirecrawlApp("fc-YOUR_API_KEY", "https://api.firecrawl.dev")
 	if err != nil {
 		log.Fatalf("Failed to create FirecrawlApp: %v", err)
 	}

@@ -1,10 +1,9 @@
-module github.com/mendableai/firecrawl/apps/go-sdk/examples
+module github.com/mendableai/firecrawl-go-examples

 go 1.22.5

 replace github.com/mendableai/firecrawl => ../

-require (
-	github.com/google/uuid v1.6.0
-	github.com/mendableai/firecrawl v0.0.0-00010101000000-000000000000
-)
+require github.com/google/uuid v1.6.0
+
+require github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 // indirect

@@ -4,6 +4,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
 github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
+github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 h1:461um7fbSQYj2E3ETl8GINuRg5MTY3BdjMnogwUIhBs=
+github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46/go.mod h1:mTGbJ37fy43aaqonp/tdpzCH516jHFw/XVvfFi4QXHo=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=

apps/go-sdk/firecrawl/.gitignore (new file)

@@ -0,0 +1,2 @@
+.env
+vendor

@@ -1,4 +1,4 @@
-module github.com/mendableai/firecrawl/apps/go-sdk
+module github.com/mendableai/firecrawl-go

 go 1.22.5

@@ -24,6 +24,7 @@
   "devDependencies": {
     "@types/jest": "^29.5.12",
     "@types/supertest": "^6.0.2",
+    "artillery": "^2.0.19",
     "typescript": "^5.4.5"
   }
 }

File diff suppressed because it is too large.

@@ -1,5 +1,4 @@
 name: firecrawl
-version: '3.9'

 x-common-service: &common-service
   build: apps/api