Merge branch 'main' into v1-webscraper

Commit c917c8fbcd
.gitmodules (new file, vendored, 6 lines)
@@ -0,0 +1,6 @@
+[submodule "apps/go-sdk/firecrawl"]
+  path = apps/go-sdk/firecrawl
+  url = https://github.com/mendableai/firecrawl-go
+[submodule "apps/go-sdk/examples"]
+  path = apps/go-sdk/examples
+  url = https://github.com/mendableai/firecrawl-go-examples
@@ -24,8 +24,8 @@ kill_timeout = '30s'
 
   [http_service.concurrency]
     type = "requests"
-    hard_limit = 100
-    soft_limit = 50
+    # hard_limit = 100
+    soft_limit = 100
 
   [[http_service.checks]]
     grace_period = "10s"
@@ -51,8 +51,8 @@ kill_timeout = '30s'
 
   [services.concurrency]
     type = 'connections'
-    hard_limit = 25
-    soft_limit = 20
+    # hard_limit = 25
+    soft_limit = 100
 
 [[vm]]
   size = 'performance-2x'
@@ -24,8 +24,8 @@ kill_timeout = '30s'
 
   [http_service.concurrency]
     type = "requests"
-    hard_limit = 200
-    soft_limit = 75
+    # hard_limit = 200
+    soft_limit = 200
 
   [[http_service.checks]]
     grace_period = "20s"
@@ -50,8 +50,8 @@ kill_timeout = '30s'
 
   [services.concurrency]
     type = 'connections'
-    hard_limit = 30
-    soft_limit = 12
+    # hard_limit = 30
+    soft_limit = 200
 
 [[vm]]
   size = 'performance-4x'
@@ -10,7 +10,9 @@ import { createIdempotencyKey } from "../../src/services/idempotency/create";
 import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
-import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import { getScrapeQueue } from "../../src/services/queue-service";
+import { checkAndUpdateURL } from "../../src/lib/validateUrl";
 
 export async function crawlController(req: Request, res: Response) {
   try {
@@ -42,10 +44,17 @@ export async function crawlController(req: Request, res: Response) {
       return res.status(402).json({ error: "Insufficient credits" });
     }
 
-    const url = req.body.url;
+    let url = req.body.url;
     if (!url) {
       return res.status(400).json({ error: "Url is required" });
     }
+    try {
+      url = checkAndUpdateURL(url).url;
+    } catch (e) {
+      return res
+        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
+        .json({ error: e.message ?? e });
+    }
 
     if (isUrlBlocked(url)) {
       return res
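Both controllers now run the incoming URL through `checkAndUpdateURL` and map an `"Invalid URL"` error to a 400. The helper's implementation is not part of this diff; the sketch below is purely an assumption about the kind of normalization/validation such a helper performs, not the actual `validateUrl` code.

```ts
// Hypothetical stand-in for checkAndUpdateURL; the real implementation lives in
// src/lib/validateUrl and is not shown in this diff.
function checkAndUpdateURLSketch(url: string): { url: string } {
  // Assumption: prepend a protocol when the caller omitted one.
  if (!/^https?:\/\//i.test(url)) {
    url = "http://" + url;
  }
  try {
    // URL() throws on malformed input; surface the same message the controllers check for.
    new URL(url);
  } catch {
    throw new Error("Invalid URL");
  }
  return { url };
}

console.log(checkAndUpdateURLSketch("example.com").url); // "http://example.com"
```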
@@ -94,41 +103,50 @@ export async function crawlController(req: Request, res: Response) {
 
     await logCrawl(id, team_id);
 
-    let robots;
-
-    try {
-      robots = await this.getRobotsTxt();
-    } catch (_) {}
-
     const sc: StoredCrawl = {
       originUrl: url,
       crawlerOptions,
       pageOptions,
       team_id,
-      robots,
+      createdAt: Date.now(),
     };
 
-    await saveCrawl(id, sc);
-
     const crawler = crawlToCrawler(id, sc);
 
+    try {
+      sc.robots = await crawler.getRobotsTxt();
+    } catch (_) {}
+
+    await saveCrawl(id, sc);
+
     const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
 
     if (sitemap !== null) {
-      for (const url of sitemap.map(x => x.url)) {
-        await lockURL(id, sc, url);
-        const job = await addScrapeJob({
-          url,
-          mode: "single_urls",
-          crawlerOptions: crawlerOptions,
-          team_id: team_id,
-          pageOptions: pageOptions,
-          origin: req.body.origin ?? defaultOrigin,
-          crawl_id: id,
-          sitemapped: true,
-        });
-        await addCrawlJob(id, job.id);
-      }
+      const jobs = sitemap.map(x => {
+        const url = x.url;
+        const uuid = uuidv4();
+        return {
+          name: uuid,
+          data: {
+            url,
+            mode: "single_urls",
+            crawlerOptions: crawlerOptions,
+            team_id: team_id,
+            pageOptions: pageOptions,
+            origin: req.body.origin ?? defaultOrigin,
+            crawl_id: id,
+            sitemapped: true,
+          },
+          opts: {
+            jobId: uuid,
+            priority: 20,
+          }
+        };
+      })
+
+      await lockURLs(id, jobs.map(x => x.data.url));
+      await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
+      await getScrapeQueue().addBulk(jobs);
     } else {
       await lockURL(id, sc, url);
       const job = await addScrapeJob({
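For context, the sitemap branch above switches from enqueueing scrape jobs one at a time to building the whole job array up front and pushing it with BullMQ's `addBulk`. A minimal standalone sketch of that pattern follows; the queue name and connection are illustrative, and the payload fields are limited to what the diff itself shows.

```ts
import { Queue } from "bullmq";
import { v4 as uuidv4 } from "uuid";

// Illustrative queue; the real code obtains it via getScrapeQueue().
const scrapeQueue = new Queue("scrapeQueue", {
  connection: { host: "localhost", port: 6379 },
});

async function enqueueSitemapUrls(crawlId: string, urls: string[]) {
  // Build every job descriptor first so the URLs and job IDs can each be
  // registered in a single batch call, then hand the whole batch to BullMQ.
  const jobs = urls.map((url) => {
    const jobId = uuidv4();
    return {
      name: jobId,
      data: { url, mode: "single_urls", crawl_id: crawlId, sitemapped: true },
      opts: { jobId, priority: 20 }, // sitemapped crawl jobs get a lower priority than direct scrapes
    };
  });

  await scrapeQueue.addBulk(jobs); // one bulk insert instead of N individual add() calls
  return jobs.map((j) => j.opts.jobId);
}
```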
@@ -139,6 +157,8 @@ export async function crawlController(req: Request, res: Response) {
         pageOptions: pageOptions,
         origin: req.body.origin ?? defaultOrigin,
         crawl_id: id,
+      }, {
+        priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
       });
       await addCrawlJob(id, job.id);
     }
@@ -6,6 +6,7 @@ import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
 import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
 import { addScrapeJob } from "../../src/services/queue-jobs";
+import { checkAndUpdateURL } from "../../src/lib/validateUrl";
 
 export async function crawlPreviewController(req: Request, res: Response) {
   try {
@@ -21,10 +22,17 @@ export async function crawlPreviewController(req: Request, res: Response) {
       return res.status(status).json({ error });
     }
 
-    const url = req.body.url;
+    let url = req.body.url;
     if (!url) {
       return res.status(400).json({ error: "Url is required" });
     }
+    try {
+      url = checkAndUpdateURL(url).url;
+    } catch (e) {
+      return res
+        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
+        .json({ error: e.message ?? e });
+    }
 
     if (isUrlBlocked(url)) {
       return res
@@ -81,6 +89,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
     pageOptions,
     team_id,
     robots,
+    createdAt: Date.now(),
   };
 
   await saveCrawl(id, sc);
@@ -45,7 +45,7 @@ export async function scrapeHelper(
     pageOptions,
     extractorOptions,
     origin: req.body.origin ?? defaultOrigin,
-  });
+  }, {}, jobId);
 
   let doc;
   try {
@@ -9,6 +9,7 @@ import { search } from "../search";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../lib/logger";
+import { getScrapeQueue, scrapeQueueEvents } from "../services/queue-service";
 
 export async function searchHelper(
   jobId: string,
@@ -75,26 +76,28 @@ export async function searchHelper(
 
   // filter out social media links
 
+  const jobDatas = res.map(x => {
+    const url = x.url;
+    const uuid = uuidv4();
+    return {
+      name: uuid,
+      data: {
+        url,
+        mode: "single_urls",
+        crawlerOptions: crawlerOptions,
+        team_id: team_id,
+        pageOptions: pageOptions,
+      },
+      opts: {
+        jobId: uuid,
+        priority: 10,
+      }
+    };
+  })
 
-  const a = new WebScraperDataProvider();
-  await a.setOptions({
-    jobId,
-    mode: "single_urls",
-    urls: res.map((r) => r.url).slice(0, Math.min(searchOptions.limit ?? 5, 5)),
-    crawlerOptions: {
-      ...crawlerOptions,
-    },
-    pageOptions: {
-      ...pageOptions,
-      onlyMainContent: pageOptions?.onlyMainContent ?? true,
-      fetchPageContent: pageOptions?.fetchPageContent ?? true,
-      includeHtml: pageOptions?.includeHtml ?? false,
-      removeTags: pageOptions?.removeTags ?? [],
-      fallback: false,
-    },
-  });
+  const jobs = await getScrapeQueue().addBulk(jobDatas);
 
-  const docs = await a.getDocuments(false);
+  const docs = (await Promise.all(jobs.map(x => x.waitUntilFinished(scrapeQueueEvents, 60000)))).map(x => x[0]);
 
   if (docs.length === 0) {
     return { success: true, error: "No search results found", returnCode: 200 };
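The search path now enqueues one scrape job per search result and blocks until all of them finish, using BullMQ's `waitUntilFinished` against a `QueueEvents` listener with a 60 s timeout. A self-contained sketch of that wait pattern, assuming a locally created queue and events object (the real code imports them from queue-service):

```ts
import { Queue, QueueEvents } from "bullmq";

const connection = { host: "localhost", port: 6379 };
const scrapeQueue = new Queue("scrapeQueue", { connection });
const scrapeQueueEvents = new QueueEvents("scrapeQueue", { connection });

async function runAndCollect(urls: string[]) {
  const jobs = await scrapeQueue.addBulk(
    urls.map((url, i) => ({ name: `search-${i}`, data: { url }, opts: { priority: 10 } }))
  );

  // Each waitUntilFinished resolves with the worker's return value or rejects
  // on failure/timeout; Promise.all surfaces the first failure.
  const results = await Promise.all(
    jobs.map((job) => job.waitUntilFinished(scrapeQueueEvents, 60000))
  );
  return results;
}
```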
@@ -109,19 +112,6 @@ export async function searchHelper(
     return { success: true, error: "No page found", returnCode: 200, data: docs };
   }
 
-  const billingResult = await billTeam(
-    team_id,
-    filteredDocs.length
-  );
-  if (!billingResult.success) {
-    return {
-      success: false,
-      error:
-        "Failed to bill team. Insufficient credits or subscription not found.",
-      returnCode: 402,
-    };
-  }
-
   return {
     success: true,
     data: filteredDocs,
@@ -2,6 +2,7 @@ import { Request, Response } from "express";
 import { Logger } from "../../src/lib/logger";
 import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis";
 import { getScrapeQueue } from "../../src/services/queue-service";
+import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
 
 export async function crawlJobStatusPreviewController(req: Request, res: Response) {
   try {
@@ -21,7 +22,19 @@ export async function crawlJobStatusPreviewController(req: Request, res: Response) {
     // }
     // }
 
-    const jobs = await Promise.all(jobIDs.map(x => getScrapeQueue().getJob(x)));
+    const jobs = (await Promise.all(jobIDs.map(async x => {
+      const job = await getScrapeQueue().getJob(x);
+
+      if (process.env.USE_DB_AUTHENTICATION === "true") {
+        const supabaseData = await supabaseGetJobById(job.id);
+
+        if (supabaseData) {
+          job.returnvalue = supabaseData.docs;
+        }
+      }
+
+      return job;
+    }))).sort((a, b) => a.timestamp - b.timestamp);
     const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
     const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
 
@@ -8,6 +8,7 @@ export type StoredCrawl = {
   team_id: string;
   robots?: string;
   cancelled?: boolean;
+  createdAt: number;
 };
 
 export async function saveCrawl(id: string, crawl: StoredCrawl) {
@@ -30,6 +31,11 @@ export async function addCrawlJob(id: string, job_id: string) {
   await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
 }
 
+export async function addCrawlJobs(id: string, job_ids: string[]) {
+  await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
+  await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
+}
+
 export async function addCrawlJobDone(id: string, job_id: string) {
   await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
   await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
@@ -54,6 +60,13 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
   return res;
 }
 
+/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
+export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
+  const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
+  await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
+  return res;
+}
+
 export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
   const crawler = new WebCrawler({
     jobId: id,
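The new `addCrawlJobs` and `lockURLs` helpers are batch versions of their singular counterparts: one variadic `SADD` per call instead of one round trip per URL or job ID. A standalone sketch of the same idea with ioredis (the key pattern follows what is visible in the diff; the connection string is illustrative):

```ts
import Redis from "ioredis";

const redis = new Redis("redis://localhost:6379");

// Mark a batch of URLs as visited for a crawl; returns true when at least one
// URL was newly added (SADD reports the number of new members).
async function lockUrlsBatch(crawlId: string, urls: string[]): Promise<boolean> {
  const added = await redis.sadd(`crawl:${crawlId}:visited`, ...urls);
  // "NX" (Redis 7+) only sets the TTL if the key has none, so repeated calls
  // do not keep pushing the expiry forward.
  await redis.expire(`crawl:${crawlId}:visited`, 24 * 60 * 60, "NX");
  return added !== 0;
}
```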
@@ -65,6 +65,7 @@ export type WebScraperOptions = {
   extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
   bullJobId?: string;
+  priority?: number;
 };
 
 export interface DocumentUrl {
@@ -49,6 +49,7 @@ export async function startWebScraperPipeline({
     },
     team_id: job.data.team_id,
     bull_job_id: job.id.toString(),
+    priority: job.opts.priority,
   })) as { success: boolean; message: string; docs: Document[] };
 }
 export async function runWebScraper({
@@ -62,6 +63,7 @@ export async function runWebScraper({
   onError,
   team_id,
   bull_job_id,
+  priority,
 }: RunWebScraperParams): Promise<RunWebScraperResult> {
   try {
     const provider = new WebScraperDataProvider();
@@ -74,6 +76,7 @@ export async function runWebScraper({
       crawlerOptions: crawlerOptions,
       pageOptions: pageOptions,
       bullJobId: bull_job_id,
+      priority,
     });
   } else {
     await provider.setOptions({
@@ -83,6 +86,7 @@ export async function runWebScraper({
       extractorOptions,
       crawlerOptions: crawlerOptions,
       pageOptions: pageOptions,
+      priority,
     });
   }
   const docs = (await provider.getDocuments(false, (progress: Progress) => {
@@ -24,7 +24,7 @@ describe('scrapSingleUrl', () => {
 });
 
 it('should return a list of links on the firecrawl.ai page', async () => {
-  const url = 'https://example.com';
+  const url = 'https://flutterbricks.com';
   const pageOptions: PageOptions = { includeHtml: true };
 
   const result = await scrapSingleUrl("TEST", url, pageOptions);
@@ -33,5 +33,5 @@ it('should return a list of links on the firecrawl.ai page', async () => {
   expect(result.linksOnPage).toBeDefined();
   expect(Array.isArray(result.linksOnPage)).toBe(true);
   expect(result.linksOnPage.length).toBeGreaterThan(0);
-  expect(result.linksOnPage).toContain('https://www.iana.org/domains/example')
+  expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
 }, 10000);
@@ -44,6 +44,7 @@ export class WebScraperDataProvider {
   private crawlerMode: string = "default";
   private allowBackwardCrawling: boolean = false;
   private allowExternalContentLinks: boolean = false;
+  private priority?: number;
 
   authorize(): void {
     throw new Error("Method not implemented.");
@@ -72,7 +73,8 @@ export class WebScraperDataProvider {
           url,
           this.pageOptions,
           this.extractorOptions,
-          existingHTML
+          existingHTML,
+          this.priority,
         );
         processedUrls++;
         if (inProgress) {
@@ -593,6 +595,7 @@ export class WebScraperDataProvider {
       options.crawlerOptions?.allowBackwardCrawling ?? false;
     this.allowExternalContentLinks =
       options.crawlerOptions?.allowExternalContentLinks ?? false;
+    this.priority = options.priority;
 
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
@@ -26,6 +26,7 @@ export async function scrapWithFireEngine({
   fireEngineOptions = {},
   headers,
   options,
+  priority,
 }: {
   url: string;
   waitFor?: number;
@@ -35,6 +36,7 @@ export async function scrapWithFireEngine({
   fireEngineOptions?: FireEngineOptions;
   headers?: Record<string, string>;
   options?: any;
+  priority?: number;
 }): Promise<FireEngineResponse> {
   const logParams = {
     url,
@@ -78,6 +80,7 @@ export async function scrapWithFireEngine({
         fullPageScreenshot: fullPageScreenshotParam,
         headers: headers,
         pageOptions: pageOptions,
+        priority,
         ...fireEngineOptionsParam,
       },
       {
@@ -134,7 +134,8 @@ export async function scrapSingleUrl(
   extractorOptions: ExtractorOptions = {
     mode: "llm-extraction-from-markdown",
   },
-  existingHtml: string = ""
+  existingHtml: string = "",
+  priority?: number,
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
 
@@ -177,7 +178,8 @@ export async function scrapSingleUrl(
           headers: pageOptions.headers,
           fireEngineOptions: {
             engine: engine,
-          }
+          },
+          priority,
         });
         scraperResponse.text = response.html;
         scraperResponse.screenshot = response.screenshot;
@@ -234,5 +234,13 @@ export const urlSpecificParams = {
         engine: "tlsclient",
       },
     },
+  },
+  "zoopla.co.uk":{
+    defaultScraper: "fire-engine",
+    params:{
+      fireEngineOptions:{
+        engine: "chrome-cdp",
+      },
+    },
   }
 };
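The `urlSpecificParams` map gains an entry forcing `zoopla.co.uk` onto fire-engine with the `chrome-cdp` engine. A sketch of a hostname-based lookup over such a map follows; the lookup helper is illustrative, since the diff only shows the map entry itself.

```ts
// Illustrative shape of the per-site override table; only the zoopla.co.uk
// entry is taken from the diff.
const urlSpecificParams: Record<string, { defaultScraper: string; params: Record<string, unknown> }> = {
  "zoopla.co.uk": {
    defaultScraper: "fire-engine",
    params: { fireEngineOptions: { engine: "chrome-cdp" } },
  },
};

// Hypothetical helper: match an override by checking whether the hostname
// equals, or is a subdomain of, one of the configured domains.
function getSiteOverride(targetUrl: string) {
  const hostname = new URL(targetUrl).hostname;
  const key = Object.keys(urlSpecificParams).find(
    (domain) => hostname === domain || hostname.endsWith("." + domain)
  );
  return key ? urlSpecificParams[key] : undefined;
}

console.log(getSiteOverride("https://www.zoopla.co.uk/for-sale/")?.defaultScraper); // "fire-engine"
```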
@@ -49,7 +49,7 @@ export async function checkAlerts() {
     };
 
     const checkAll = async () => {
-      await checkActiveJobs();
+      // await checkActiveJobs();
       await checkWaitingQueue();
     };
 
@@ -44,7 +44,7 @@ export async function logJob(job: FirecrawlJob) {
       },
     ]);
 
-    if (process.env.POSTHOG_API_KEY) {
+    if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
      let phLog = {
        distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
        ...(job.team_id !== "preview" && {
@@ -65,7 +65,6 @@ export async function logJob(job: FirecrawlJob) {
           extractor_options: job.extractor_options,
           num_tokens: job.num_tokens,
           retry: job.retry,
-          crawl_id: job.crawl_id,
         },
       };
       posthog.capture(phLog);
@@ -9,9 +9,9 @@ export async function addScrapeJob(
   jobId: string = uuidv4(),
 ): Promise<Job> {
   return await getScrapeQueue().add(jobId, webScraperOptions, {
+    priority: webScraperOptions.crawl_id ? 20 : 10,
     ...options,
     jobId,
-    priority: webScraperOptions.crawl_id ? 2 : 1,
   });
 }
 
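Two things change in `addScrapeJob`: the default priorities move from 2/1 to 20/10, and the default is now spread before `...options`, which appears to be the point of the reordering, since callers such as the crawl controller (priority 15 or 20) can now override it. In BullMQ a lower priority number is processed first, so plain scrapes (10) jump ahead of sitemapped crawl jobs (20). A small sketch of that semantics, with an illustrative queue:

```ts
import { Queue } from "bullmq";

const queue = new Queue("scrapeQueue", { connection: { host: "localhost", port: 6379 } });

// Lower number = higher priority in BullMQ. Listing the default before the
// caller's options lets an explicit opts.priority win; the old code listed the
// default last, so callers could never override it.
async function addWithDefaultPriority(
  data: { url: string; crawl_id?: string },
  opts: { priority?: number } = {}
) {
  return queue.add("scrape", data, {
    priority: data.crawl_id ? 20 : 10,
    ...opts,
  });
}
```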
@@ -10,17 +10,15 @@ import { startWebScraperPipeline } from "../main/runWebScraper";
 import { callWebhook } from "./webhook";
 import { logJob } from "./logging/log_job";
 import { initSDK } from "@hyperdx/node-opentelemetry";
-import { Job, QueueEvents, tryCatch } from "bullmq";
+import { Job } from "bullmq";
 import { Logger } from "../lib/logger";
-import { ScrapeEvents } from "../lib/scrape-events";
 import { Worker } from "bullmq";
 import systemMonitor from "./system-monitor";
 import { v4 as uuidv4 } from "uuid";
-import { WebCrawler } from "../scraper/WebScraper/crawler";
-import { getAdjustedMaxDepth } from "../scraper/WebScraper/utils/maxDepthUtils";
-import { addCrawlJob, addCrawlJobDone, crawlToCrawler, getCrawl, isCrawlFinished, lockURL } from "../lib/crawl-redis";
+import { addCrawlJob, addCrawlJobDone, crawlToCrawler, getCrawl, getCrawlJobs, isCrawlFinished, lockURL } from "../lib/crawl-redis";
 import { StoredCrawl } from "../lib/crawl-redis";
 import { addScrapeJob } from "./queue-jobs";
+import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
 
 if (process.env.ENV === "production") {
   initSDK({
@@ -151,33 +149,33 @@ async function processJob(job: Job, token: string) {
       await callWebhook(job.data.team_id, job.id as string, data);
     }
 
-    await logJob({
-      job_id: job.id as string,
-      success: success,
-      message: message,
-      num_docs: docs.length,
-      docs: docs,
-      time_taken: timeTakenInSeconds,
-      team_id: job.data.team_id,
-      mode: job.data.mode,
-      url: job.data.url,
-      crawlerOptions: job.data.crawlerOptions,
-      pageOptions: job.data.pageOptions,
-      origin: job.data.origin,
-      crawl_id: job.data.crawl_id,
-    });
-
     if (job.data.crawl_id) {
+      await logJob({
+        job_id: job.id as string,
+        success: success,
+        message: message,
+        num_docs: docs.length,
+        docs: docs,
+        time_taken: timeTakenInSeconds,
+        team_id: job.data.team_id,
+        mode: job.data.mode,
+        url: job.data.url,
+        crawlerOptions: job.data.crawlerOptions,
+        pageOptions: job.data.pageOptions,
+        origin: job.data.origin,
+        crawl_id: job.data.crawl_id,
+      });
+
       await addCrawlJobDone(job.data.crawl_id, job.id);
 
-      if (!job.data.sitemapped) {
-        const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;
+      const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;
 
+      if (!job.data.sitemapped) {
         if (!sc.cancelled) {
           const crawler = crawlToCrawler(job.data.crawl_id, sc);
 
           const links = crawler.filterLinks((data.docs[0].linksOnPage as string[])
-            .map(href => crawler.filterURL(href, sc.originUrl))
+            .map(href => crawler.filterURL(href.trim(), sc.originUrl))
             .filter(x => x !== null),
             Infinity,
             sc.crawlerOptions?.maxDepth ?? 10
@@ -202,6 +200,66 @@ async function processJob(job: Job, token: string) {
       }
 
       if (await isCrawlFinished(job.data.crawl_id)) {
+        const jobIDs = await getCrawlJobs(job.data.crawl_id);
+
+        const jobs = (await Promise.all(jobIDs.map(async x => {
+          if (x === job.id) {
+            return {
+              async getState() {
+                return "completed"
+              },
+              timestamp: Date.now(),
+              returnvalue: docs,
+            }
+          }
+
+          const j = await getScrapeQueue().getJob(x);
+
+          if (process.env.USE_DB_AUTHENTICATION === "true") {
+            const supabaseData = await supabaseGetJobById(j.id);
+
+            if (supabaseData) {
+              j.returnvalue = supabaseData.docs;
+            }
+          }
+
+          return j;
+        }))).sort((a, b) => a.timestamp - b.timestamp);
+        const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
+        const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
+
+        const fullDocs = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
+
+        await logJob({
+          job_id: job.data.crawl_id,
+          success: jobStatus === "completed",
+          message: message,
+          num_docs: fullDocs.length,
+          docs: [],
+          time_taken: (Date.now() - sc.createdAt) / 1000,
+          team_id: job.data.team_id,
+          mode: "crawl",
+          url: sc.originUrl,
+          crawlerOptions: sc.crawlerOptions,
+          pageOptions: sc.pageOptions,
+          origin: job.data.origin,
+        });
+
+        const data = {
+          success: jobStatus !== "failed",
+          result: {
+            links: fullDocs.map((doc) => {
+              return {
+                content: doc,
+                source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
+              };
+            }),
+          },
+          project_id: job.data.project_id,
+          error: message /* etc... */,
+          docs: fullDocs,
+        };
+
         await callWebhook(job.data.team_id, job.id as string, data);
       }
     }
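When the last job of a crawl completes, the worker now collects every job of the crawl, substitutes its own in-memory result for the current job, optionally rehydrates return values from Supabase, sorts by timestamp, and derives an overall crawl status before logging and firing the webhook. The status derivation is the same expression used twice in the diff; a self-contained sketch of just that logic:

```ts
type JobState = "completed" | "failed" | "active" | "waiting" | "delayed";

// Mirror of the status expression in the diff: a cancelled crawl is "failed",
// otherwise the crawl is "completed" only when every job is, "failed" if any
// job is, and "active" in all remaining cases.
function deriveCrawlStatus(
  cancelled: boolean,
  states: JobState[]
): "completed" | "failed" | "active" {
  if (cancelled) return "failed";
  if (states.every((s) => s === "completed")) return "completed";
  if (states.some((s) => s === "failed")) return "failed";
  return "active";
}

// Example: one failed job marks the whole crawl failed.
console.log(deriveCrawlStatus(false, ["completed", "failed", "completed"])); // "failed"
```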
@@ -222,6 +280,9 @@ async function processJob(job: Job, token: string) {
       });
     }
     Logger.error(error);
+    if (error.stack) {
+      Logger.error(error.stack);
+    }
 
     logtail.error("Overall error ingesting", {
       job_id: job.id,
|
|||||||
error:
|
error:
|
||||||
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
|
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (job.data.mode === "crawl" || job.data.crawl_id) {
|
if (job.data.mode === "crawl" || job.data.crawl_id) {
|
||||||
await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data);
|
await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data);
|
||||||
}
|
}
|
||||||
await logJob({
|
|
||||||
job_id: job.id as string,
|
if (job.data.crawl_id) {
|
||||||
success: false,
|
await logJob({
|
||||||
message:
|
job_id: job.id as string,
|
||||||
typeof error === "string"
|
success: false,
|
||||||
? error
|
message:
|
||||||
: error.message ?? "Something went wrong... Contact help@mendable.ai",
|
typeof error === "string"
|
||||||
num_docs: 0,
|
? error
|
||||||
docs: [],
|
: error.message ?? "Something went wrong... Contact help@mendable.ai",
|
||||||
time_taken: 0,
|
num_docs: 0,
|
||||||
team_id: job.data.team_id,
|
docs: [],
|
||||||
mode: job.data.mode,
|
time_taken: 0,
|
||||||
url: job.data.url,
|
team_id: job.data.team_id,
|
||||||
crawlerOptions: job.data.crawlerOptions,
|
mode: job.data.mode,
|
||||||
pageOptions: job.data.pageOptions,
|
url: job.data.url,
|
||||||
origin: job.data.origin,
|
crawlerOptions: job.data.crawlerOptions,
|
||||||
crawl_id: job.data.crawl_id,
|
pageOptions: job.data.pageOptions,
|
||||||
});
|
origin: job.data.origin,
|
||||||
|
crawl_id: job.data.crawl_id,
|
||||||
|
});
|
||||||
|
|
||||||
|
const sc = await getCrawl(job.data.crawl_id);
|
||||||
|
|
||||||
|
await logJob({
|
||||||
|
job_id: job.data.crawl_id,
|
||||||
|
success: false,
|
||||||
|
message:
|
||||||
|
typeof error === "string"
|
||||||
|
? error
|
||||||
|
: error.message ?? "Something went wrong... Contact help@mendable.ai",
|
||||||
|
num_docs: 0,
|
||||||
|
docs: [],
|
||||||
|
time_taken: 0,
|
||||||
|
team_id: job.data.team_id,
|
||||||
|
mode: "crawl",
|
||||||
|
url: sc ? sc.originUrl : job.data.url,
|
||||||
|
crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
|
||||||
|
pageOptions: sc ? sc.pageOptions : job.data.pageOptions,
|
||||||
|
origin: job.data.origin,
|
||||||
|
});
|
||||||
|
}
|
||||||
// done(null, data);
|
// done(null, data);
|
||||||
return data;
|
return data;
|
||||||
}
|
}
|
||||||
|
@ -43,6 +43,7 @@ export interface RunWebScraperParams {
|
|||||||
onError: (error: Error) => void;
|
onError: (error: Error) => void;
|
||||||
team_id: string;
|
team_id: string;
|
||||||
bull_job_id: string;
|
bull_job_id: string;
|
||||||
|
priority?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface RunWebScraperResult {
|
export interface RunWebScraperResult {
|
||||||
|
apps/go-sdk/examples/.gitignore (new file, vendored, 25 lines)
@@ -0,0 +1,25 @@
+# If you prefer the allow list template instead of the deny list, see community template:
+# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
+#
+# Binaries for programs and plugins
+*.exe
+*.exe~
+*.dll
+*.so
+*.dylib
+
+# Test binary, built with `go test -c`
+*.test
+
+# Output of the go coverage tool, specifically when used with LiteIDE
+*.out
+
+# Dependency directories (remove the comment below to include it)
+# vendor/
+
+# Go workspace file
+go.work
+go.work.sum
+
+# env file
+.env
apps/go-sdk/examples/LICENSE (new file, 21 lines)
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Mendable
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -6,11 +6,11 @@ import (
   "log"
 
   "github.com/google/uuid"
-  "github.com/mendableai/firecrawl/firecrawl"
+  "github.com/mendableai/firecrawl-go"
 )
 
 func main() {
-  app, err := firecrawl.NewFirecrawlApp("fc-YOUR-API-KEY", "http://localhost:3002")
+  app, err := firecrawl.NewFirecrawlApp("fc-YOUR_API_KEY", "https://api.firecrawl.dev")
   if err != nil {
     log.Fatalf("Failed to create FirecrawlApp: %v", err)
   }
@@ -1,10 +1,9 @@
-module github.com/mendableai/firecrawl/apps/go-sdk/examples
+module github.com/mendableai/firecrawl-go-examples
 
 go 1.22.5
 
 replace github.com/mendableai/firecrawl => ../
 
-require (
-  github.com/google/uuid v1.6.0
-  github.com/mendableai/firecrawl v0.0.0-00010101000000-000000000000
-)
+require github.com/google/uuid v1.6.0
+
+require github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 // indirect
@@ -4,6 +4,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
 github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
+github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 h1:461um7fbSQYj2E3ETl8GINuRg5MTY3BdjMnogwUIhBs=
+github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46/go.mod h1:mTGbJ37fy43aaqonp/tdpzCH516jHFw/XVvfFi4QXHo=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
2
apps/go-sdk/firecrawl/.gitignore
vendored
Normal file
2
apps/go-sdk/firecrawl/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
.env
|
||||||
|
vendor
|
@@ -1,4 +1,4 @@
-module github.com/mendableai/firecrawl/apps/go-sdk
+module github.com/mendableai/firecrawl-go
 
 go 1.22.5
 
@@ -24,6 +24,7 @@
   "devDependencies": {
     "@types/jest": "^29.5.12",
     "@types/supertest": "^6.0.2",
+    "artillery": "^2.0.19",
     "typescript": "^5.4.5"
   }
 }
apps/test-suite/pnpm-lock.yaml (generated, 6884 lines changed; diff suppressed because it is too large)
@@ -1,5 +1,4 @@
 name: firecrawl
-version: '3.9'
 
 x-common-service: &common-service
   build: apps/api