Merge branch 'main' into v1-webscraper

Commit c917c8fbcd by Nicolas, 2024-08-15 15:14:29 -04:00
35 changed files with 7190 additions and 156 deletions

.gitmodules (vendored, new file, +6 lines)

@ -0,0 +1,6 @@
[submodule "apps/go-sdk/firecrawl"]
path = apps/go-sdk/firecrawl
url = https://github.com/mendableai/firecrawl-go
[submodule "apps/go-sdk/examples"]
path = apps/go-sdk/examples
url = https://github.com/mendableai/firecrawl-go-examples


@ -24,8 +24,8 @@ kill_timeout = '30s'
[http_service.concurrency]
type = "requests"
hard_limit = 100
soft_limit = 50
# hard_limit = 100
soft_limit = 100
[[http_service.checks]]
grace_period = "10s"
@ -51,8 +51,8 @@ kill_timeout = '30s'
[services.concurrency]
type = 'connections'
hard_limit = 25
soft_limit = 20
# hard_limit = 25
soft_limit = 100
[[vm]]
size = 'performance-2x'


@ -24,8 +24,8 @@ kill_timeout = '30s'
[http_service.concurrency]
type = "requests"
hard_limit = 200
soft_limit = 75
# hard_limit = 200
soft_limit = 200
[[http_service.checks]]
grace_period = "20s"
@ -50,8 +50,8 @@ kill_timeout = '30s'
[services.concurrency]
type = 'connections'
hard_limit = 30
soft_limit = 12
# hard_limit = 30
soft_limit = 200
[[vm]]
size = 'performance-4x'


@ -10,7 +10,9 @@ import { createIdempotencyKey } from "../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../src/lib/logger";
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
import { getScrapeQueue } from "../../src/services/queue-service";
import { checkAndUpdateURL } from "../../src/lib/validateUrl";
export async function crawlController(req: Request, res: Response) {
try {
@ -42,10 +44,17 @@ export async function crawlController(req: Request, res: Response) {
return res.status(402).json({ error: "Insufficient credits" });
}
const url = req.body.url;
let url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
try {
url = checkAndUpdateURL(url).url;
} catch (e) {
return res
.status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
.json({ error: e.message ?? e });
}
if (isUrlBlocked(url)) {
return res
@ -94,41 +103,50 @@ export async function crawlController(req: Request, res: Response) {
await logCrawl(id, team_id);
let robots;
try {
robots = await this.getRobotsTxt();
} catch (_) {}
const sc: StoredCrawl = {
originUrl: url,
crawlerOptions,
pageOptions,
team_id,
robots,
createdAt: Date.now(),
};
await saveCrawl(id, sc);
const crawler = crawlToCrawler(id, sc);
try {
sc.robots = await crawler.getRobotsTxt();
} catch (_) {}
await saveCrawl(id, sc);
const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
if (sitemap !== null) {
for (const url of sitemap.map(x => x.url)) {
await lockURL(id, sc, url);
const job = await addScrapeJob({
url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: team_id,
pageOptions: pageOptions,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id,
sitemapped: true,
});
await addCrawlJob(id, job.id);
}
const jobs = sitemap.map(x => {
const url = x.url;
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: team_id,
pageOptions: pageOptions,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id,
sitemapped: true,
},
opts: {
jobId: uuid,
priority: 20,
}
};
})
await lockURLs(id, jobs.map(x => x.data.url));
await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
await getScrapeQueue().addBulk(jobs);
} else {
await lockURL(id, sc, url);
const job = await addScrapeJob({
@ -139,6 +157,8 @@ export async function crawlController(req: Request, res: Response) {
pageOptions: pageOptions,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id,
}, {
priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
});
await addCrawlJob(id, job.id);
}
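
The sitemap branch above swaps the previous per-URL addScrapeJob calls for a single bulk enqueue: lock every URL, register every job ID, then push all jobs in one addBulk call. A minimal sketch of that pattern against BullMQ; the queue name "scrapeQueue" and the helper name enqueueSitemapUrls are illustrative, not the repo's identifiers.

import { Queue } from "bullmq";
import { v4 as uuidv4 } from "uuid";

const scrapeQueue = new Queue("scrapeQueue");

async function enqueueSitemapUrls(crawlId: string, urls: string[]) {
  const jobs = urls.map((url) => {
    const jobId = uuidv4();
    return {
      name: jobId,
      data: { url, mode: "single_urls", crawl_id: crawlId, sitemapped: true },
      // BullMQ treats a lower priority number as more urgent, so 20 keeps
      // sitemap-discovered pages behind the priority-15 "request 0" job.
      opts: { jobId, priority: 20 },
    };
  });
  // One bulk call instead of one add() round trip per URL.
  return scrapeQueue.addBulk(jobs);
}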


@ -6,6 +6,7 @@ import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../src/lib/logger";
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
import { addScrapeJob } from "../../src/services/queue-jobs";
import { checkAndUpdateURL } from "../../src/lib/validateUrl";
export async function crawlPreviewController(req: Request, res: Response) {
try {
@ -21,10 +22,17 @@ export async function crawlPreviewController(req: Request, res: Response) {
return res.status(status).json({ error });
}
const url = req.body.url;
let url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
try {
url = checkAndUpdateURL(url).url;
} catch (e) {
return res
.status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
.json({ error: e.message ?? e });
}
if (isUrlBlocked(url)) {
return res
@ -81,6 +89,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
pageOptions,
team_id,
robots,
createdAt: Date.now(),
};
await saveCrawl(id, sc);
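
Both crawl controllers now normalize the incoming URL through checkAndUpdateURL before the blocklist check, returning 400 when the helper reports an invalid URL and 500 for anything else it throws. A hedged sketch of what such a normalizer can look like; the body below is an assumption for illustration, not the actual src/lib/validateUrl implementation.

function checkAndUpdateURLSketch(url: string): { url: string } {
  // Assumption: prepend a default scheme when the caller omits one.
  if (!/^https?:\/\//i.test(url)) {
    url = "https://" + url;
  }
  try {
    new URL(url); // throws on malformed input
  } catch {
    // The controllers map this exact message to a 400 response.
    throw new Error("Invalid URL");
  }
  return { url };
}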


@ -45,7 +45,7 @@ export async function scrapeHelper(
pageOptions,
extractorOptions,
origin: req.body.origin ?? defaultOrigin,
});
}, {}, jobId);
let doc;
try {


@ -9,6 +9,7 @@ import { search } from "../search";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../lib/logger";
import { getScrapeQueue, scrapeQueueEvents } from "../services/queue-service";
export async function searchHelper(
jobId: string,
@ -75,26 +76,28 @@ export async function searchHelper(
// filter out social media links
const jobDatas = res.map(x => {
const url = x.url;
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: team_id,
pageOptions: pageOptions,
},
opts: {
jobId: uuid,
priority: 10,
}
};
})
const jobs = await getScrapeQueue().addBulk(jobDatas);
const a = new WebScraperDataProvider();
await a.setOptions({
jobId,
mode: "single_urls",
urls: res.map((r) => r.url).slice(0, Math.min(searchOptions.limit ?? 5, 5)),
crawlerOptions: {
...crawlerOptions,
},
pageOptions: {
...pageOptions,
onlyMainContent: pageOptions?.onlyMainContent ?? true,
fetchPageContent: pageOptions?.fetchPageContent ?? true,
includeHtml: pageOptions?.includeHtml ?? false,
removeTags: pageOptions?.removeTags ?? [],
fallback: false,
},
});
const docs = await a.getDocuments(false);
const docs = (await Promise.all(jobs.map(x => x.waitUntilFinished(scrapeQueueEvents, 60000)))).map(x => x[0]);
if (docs.length === 0) {
return { success: true, error: "No search results found", returnCode: 200 };
@ -109,19 +112,6 @@ export async function searchHelper(
return { success: true, error: "No page found", returnCode: 200, data: docs };
}
const billingResult = await billTeam(
team_id,
filteredDocs.length
);
if (!billingResult.success) {
return {
success: false,
error:
"Failed to bill team. Insufficient credits or subscription not found.",
returnCode: 402,
};
}
return {
success: true,
data: filteredDocs,
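
The search helper no longer drives WebScraperDataProvider directly; it enqueues one scrape job per search result and awaits them through a QueueEvents listener. A minimal sketch of that flow, assuming the queue name "scrapeQueue" matches whatever getScrapeQueue() uses; runSearchBatch is an illustrative name.

import { Queue, QueueEvents } from "bullmq";

const scrapeQueue = new Queue("scrapeQueue");
const scrapeQueueEvents = new QueueEvents("scrapeQueue");

async function runSearchBatch(
  jobDatas: { name: string; data: any; opts: { jobId: string; priority: number } }[]
) {
  const jobs = await scrapeQueue.addBulk(jobDatas);
  // waitUntilFinished resolves with each job's return value, or rejects once
  // the 60 s TTL elapses; the worker returns an array of documents, hence [0].
  const results = await Promise.all(
    jobs.map((job) => job.waitUntilFinished(scrapeQueueEvents, 60000))
  );
  return results.map((r) => (Array.isArray(r) ? r[0] : r));
}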


@ -2,6 +2,7 @@ import { Request, Response } from "express";
import { Logger } from "../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis";
import { getScrapeQueue } from "../../src/services/queue-service";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
try {
@ -21,7 +22,19 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
// }
// }
const jobs = await Promise.all(jobIDs.map(x => getScrapeQueue().getJob(x)));
const jobs = (await Promise.all(jobIDs.map(async x => {
const job = await getScrapeQueue().getJob(x);
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobById(job.id);
if (supabaseData) {
job.returnvalue = supabaseData.docs;
}
}
return job;
}))).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
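
The crawl-level status is derived from the states of its child jobs; the same ternary chain reappears in the queue worker further down. Restated as a small helper for readability (the function name is illustrative):

function deriveCrawlStatus(cancelled: boolean, jobStates: string[]): "completed" | "failed" | "active" {
  // A cancelled crawl is reported as failed; otherwise it is completed only
  // when every child job completed, failed if any child failed, and active
  // while work is still in flight.
  return cancelled
    ? "failed"
    : jobStates.every((s) => s === "completed")
    ? "completed"
    : jobStates.some((s) => s === "failed")
    ? "failed"
    : "active";
}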


@ -8,6 +8,7 @@ export type StoredCrawl = {
team_id: string;
robots?: string;
cancelled?: boolean;
createdAt: number;
};
export async function saveCrawl(id: string, crawl: StoredCrawl) {
@ -30,6 +31,11 @@ export async function addCrawlJob(id: string, job_id: string) {
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
}
export async function addCrawlJobs(id: string, job_ids: string[]) {
await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
}
export async function addCrawlJobDone(id: string, job_id: string) {
await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
@ -54,6 +60,13 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
return res;
}
/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
return res;
}
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
const crawler = new WebCrawler({
jobId: id,
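
lockURLs and addCrawlJobs batch the existing per-item Redis writes into a single SADD each: SADD reports how many members were actually new, and EXPIRE ... NX (Redis 7+) only sets the 24-hour TTL if the key has none yet. A minimal ioredis sketch of the same idea; the connection setup is assumed, not taken from the repo.

import Redis from "ioredis";

const redis = new Redis(); // assumes the default localhost connection

async function lockURLsSketch(crawlId: string, urls: string[]): Promise<boolean> {
  // SADD returns the count of members that were not already in the set,
  // so 0 means every URL in the batch had been visited before.
  const added = await redis.sadd(`crawl:${crawlId}:visited`, ...urls);
  await redis.expire(`crawl:${crawlId}:visited`, 24 * 60 * 60, "NX");
  return added !== 0;
}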


@ -65,6 +65,7 @@ export type WebScraperOptions = {
extractorOptions?: ExtractorOptions;
concurrentRequests?: number;
bullJobId?: string;
priority?: number;
};
export interface DocumentUrl {


@ -49,6 +49,7 @@ export async function startWebScraperPipeline({
},
team_id: job.data.team_id,
bull_job_id: job.id.toString(),
priority: job.opts.priority,
})) as { success: boolean; message: string; docs: Document[] };
}
export async function runWebScraper({
@ -62,6 +63,7 @@ export async function runWebScraper({
onError,
team_id,
bull_job_id,
priority,
}: RunWebScraperParams): Promise<RunWebScraperResult> {
try {
const provider = new WebScraperDataProvider();
@ -74,6 +76,7 @@ export async function runWebScraper({
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
bullJobId: bull_job_id,
priority,
});
} else {
await provider.setOptions({
@ -83,6 +86,7 @@ export async function runWebScraper({
extractorOptions,
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
priority,
});
}
const docs = (await provider.getDocuments(false, (progress: Progress) => {


@ -24,7 +24,7 @@ describe('scrapSingleUrl', () => {
});
it('should return a list of links on the firecrawl.ai page', async () => {
const url = 'https://example.com';
const url = 'https://flutterbricks.com';
const pageOptions: PageOptions = { includeHtml: true };
const result = await scrapSingleUrl("TEST", url, pageOptions);
@ -33,5 +33,5 @@ it('should return a list of links on the firecrawl.ai page', async () => {
expect(result.linksOnPage).toBeDefined();
expect(Array.isArray(result.linksOnPage)).toBe(true);
expect(result.linksOnPage.length).toBeGreaterThan(0);
expect(result.linksOnPage).toContain('https://www.iana.org/domains/example')
expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
}, 10000);


@ -44,6 +44,7 @@ export class WebScraperDataProvider {
private crawlerMode: string = "default";
private allowBackwardCrawling: boolean = false;
private allowExternalContentLinks: boolean = false;
private priority?: number;
authorize(): void {
throw new Error("Method not implemented.");
@ -72,7 +73,8 @@ export class WebScraperDataProvider {
url,
this.pageOptions,
this.extractorOptions,
existingHTML
existingHTML,
this.priority,
);
processedUrls++;
if (inProgress) {
@ -593,6 +595,7 @@ export class WebScraperDataProvider {
options.crawlerOptions?.allowBackwardCrawling ?? false;
this.allowExternalContentLinks =
options.crawlerOptions?.allowExternalContentLinks ?? false;
this.priority = options.priority;
// make sure all urls start with https://
this.urls = this.urls.map((url) => {


@ -26,6 +26,7 @@ export async function scrapWithFireEngine({
fireEngineOptions = {},
headers,
options,
priority,
}: {
url: string;
waitFor?: number;
@ -35,6 +36,7 @@ export async function scrapWithFireEngine({
fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>;
options?: any;
priority?: number;
}): Promise<FireEngineResponse> {
const logParams = {
url,
@ -78,6 +80,7 @@ export async function scrapWithFireEngine({
fullPageScreenshot: fullPageScreenshotParam,
headers: headers,
pageOptions: pageOptions,
priority,
...fireEngineOptionsParam,
},
{


@ -134,7 +134,8 @@ export async function scrapSingleUrl(
extractorOptions: ExtractorOptions = {
mode: "llm-extraction-from-markdown",
},
existingHtml: string = ""
existingHtml: string = "",
priority?: number,
): Promise<Document> {
urlToScrap = urlToScrap.trim();
@ -177,7 +178,8 @@ export async function scrapSingleUrl(
headers: pageOptions.headers,
fireEngineOptions: {
engine: engine,
}
},
priority,
});
scraperResponse.text = response.html;
scraperResponse.screenshot = response.screenshot;


@ -234,5 +234,13 @@ export const urlSpecificParams = {
engine: "tlsclient",
},
},
},
"zoopla.co.uk":{
defaultScraper: "fire-engine",
params:{
fireEngineOptions:{
engine: "chrome-cdp",
},
},
}
};


@ -49,7 +49,7 @@ export async function checkAlerts() {
};
const checkAll = async () => {
await checkActiveJobs();
// await checkActiveJobs();
await checkWaitingQueue();
};


@ -44,7 +44,7 @@ export async function logJob(job: FirecrawlJob) {
},
]);
if (process.env.POSTHOG_API_KEY) {
if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
let phLog = {
distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
...(job.team_id !== "preview" && {
@ -65,7 +65,6 @@ export async function logJob(job: FirecrawlJob) {
extractor_options: job.extractor_options,
num_tokens: job.num_tokens,
retry: job.retry,
crawl_id: job.crawl_id,
},
};
posthog.capture(phLog);


@ -9,9 +9,9 @@ export async function addScrapeJob(
jobId: string = uuidv4(),
): Promise<Job> {
return await getScrapeQueue().add(jobId, webScraperOptions, {
priority: webScraperOptions.crawl_id ? 20 : 10,
...options,
jobId,
priority: webScraperOptions.crawl_id ? 2 : 1,
});
}
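
In this hunk the default priority is computed before the ...options spread, so explicit caller options win (the crawl controller passes 15 for the first request of a crawl and 20 for sitemap jobs), and in BullMQ a lower priority number is served first. A sketch of the resulting helper, hedged because added and removed lines are not marked in this rendering; the queue name stands in for getScrapeQueue().

import { Job, Queue } from "bullmq";
import { v4 as uuidv4 } from "uuid";

const scrapeQueue = new Queue("scrapeQueue");

async function addScrapeJobSketch(
  webScraperOptions: { crawl_id?: string } & Record<string, unknown>,
  options: Record<string, unknown> = {},
  jobId: string = uuidv4(),
): Promise<Job> {
  return scrapeQueue.add(jobId, webScraperOptions, {
    // Default: crawl sub-jobs at 20, standalone scrapes at 10 (lower runs first).
    priority: webScraperOptions.crawl_id ? 20 : 10,
    // Explicit options override the default, e.g. priority 15 for request 0.
    ...options,
    jobId,
  });
}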


@ -10,17 +10,15 @@ import { startWebScraperPipeline } from "../main/runWebScraper";
import { callWebhook } from "./webhook";
import { logJob } from "./logging/log_job";
import { initSDK } from "@hyperdx/node-opentelemetry";
import { Job, QueueEvents, tryCatch } from "bullmq";
import { Job } from "bullmq";
import { Logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
import { Worker } from "bullmq";
import systemMonitor from "./system-monitor";
import { v4 as uuidv4 } from "uuid";
import { WebCrawler } from "../scraper/WebScraper/crawler";
import { getAdjustedMaxDepth } from "../scraper/WebScraper/utils/maxDepthUtils";
import { addCrawlJob, addCrawlJobDone, crawlToCrawler, getCrawl, isCrawlFinished, lockURL } from "../lib/crawl-redis";
import { addCrawlJob, addCrawlJobDone, crawlToCrawler, getCrawl, getCrawlJobs, isCrawlFinished, lockURL } from "../lib/crawl-redis";
import { StoredCrawl } from "../lib/crawl-redis";
import { addScrapeJob } from "./queue-jobs";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
if (process.env.ENV === "production") {
initSDK({
@ -151,33 +149,33 @@ async function processJob(job: Job, token: string) {
await callWebhook(job.data.team_id, job.id as string, data);
}
await logJob({
job_id: job.id as string,
success: success,
message: message,
num_docs: docs.length,
docs: docs,
time_taken: timeTakenInSeconds,
team_id: job.data.team_id,
mode: job.data.mode,
url: job.data.url,
crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
});
if (job.data.crawl_id) {
await logJob({
job_id: job.id as string,
success: success,
message: message,
num_docs: docs.length,
docs: docs,
time_taken: timeTakenInSeconds,
team_id: job.data.team_id,
mode: job.data.mode,
url: job.data.url,
crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
});
await addCrawlJobDone(job.data.crawl_id, job.id);
if (!job.data.sitemapped) {
const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;
const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;
if (!job.data.sitemapped) {
if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
const links = crawler.filterLinks((data.docs[0].linksOnPage as string[])
.map(href => crawler.filterURL(href, sc.originUrl))
.map(href => crawler.filterURL(href.trim(), sc.originUrl))
.filter(x => x !== null),
Infinity,
sc.crawlerOptions?.maxDepth ?? 10
@ -202,6 +200,66 @@ async function processJob(job: Job, token: string) {
}
if (await isCrawlFinished(job.data.crawl_id)) {
const jobIDs = await getCrawlJobs(job.data.crawl_id);
const jobs = (await Promise.all(jobIDs.map(async x => {
if (x === job.id) {
return {
async getState() {
return "completed"
},
timestamp: Date.now(),
returnvalue: docs,
}
}
const j = await getScrapeQueue().getJob(x);
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobById(j.id);
if (supabaseData) {
j.returnvalue = supabaseData.docs;
}
}
return j;
}))).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
const fullDocs = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
await logJob({
job_id: job.data.crawl_id,
success: jobStatus === "completed",
message: message,
num_docs: fullDocs.length,
docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id,
mode: "crawl",
url: sc.originUrl,
crawlerOptions: sc.crawlerOptions,
pageOptions: sc.pageOptions,
origin: job.data.origin,
});
const data = {
success: jobStatus !== "failed",
result: {
links: fullDocs.map((doc) => {
return {
content: doc,
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
};
}),
},
project_id: job.data.project_id,
error: message /* etc... */,
docs: fullDocs,
};
await callWebhook(job.data.team_id, job.id as string, data);
}
}
@ -222,6 +280,9 @@ async function processJob(job: Job, token: string) {
});
}
Logger.error(error);
if (error.stack) {
Logger.error(error.stack);
}
logtail.error("Overall error ingesting", {
job_id: job.id,
@ -235,27 +296,51 @@ async function processJob(job: Job, token: string) {
error:
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
};
if (job.data.mode === "crawl" || job.data.crawl_id) {
await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data);
}
await logJob({
job_id: job.id as string,
success: false,
message:
typeof error === "string"
? error
: error.message ?? "Something went wrong... Contact help@mendable.ai",
num_docs: 0,
docs: [],
time_taken: 0,
team_id: job.data.team_id,
mode: job.data.mode,
url: job.data.url,
crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
});
if (job.data.crawl_id) {
await logJob({
job_id: job.id as string,
success: false,
message:
typeof error === "string"
? error
: error.message ?? "Something went wrong... Contact help@mendable.ai",
num_docs: 0,
docs: [],
time_taken: 0,
team_id: job.data.team_id,
mode: job.data.mode,
url: job.data.url,
crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
});
const sc = await getCrawl(job.data.crawl_id);
await logJob({
job_id: job.data.crawl_id,
success: false,
message:
typeof error === "string"
? error
: error.message ?? "Something went wrong... Contact help@mendable.ai",
num_docs: 0,
docs: [],
time_taken: 0,
team_id: job.data.team_id,
mode: "crawl",
url: sc ? sc.originUrl : job.data.url,
crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
pageOptions: sc ? sc.pageOptions : job.data.pageOptions,
origin: job.data.origin,
});
}
// done(null, data);
return data;
}


@ -43,6 +43,7 @@ export interface RunWebScraperParams {
onError: (error: Error) => void;
team_id: string;
bull_job_id: string;
priority?: number;
}
export interface RunWebScraperResult {

apps/go-sdk/examples/.gitignore (vendored, new file, +25 lines)

@ -0,0 +1,25 @@
# If you prefer the allow list template instead of the deny list, see community template:
# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
#
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib
# Test binary, built with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
# Dependency directories (remove the comment below to include it)
# vendor/
# Go workspace file
go.work
go.work.sum
# env file
.env


@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Mendable
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


@ -6,11 +6,11 @@ import (
"log"
"github.com/google/uuid"
"github.com/mendableai/firecrawl/firecrawl"
"github.com/mendableai/firecrawl-go"
)
func main() {
app, err := firecrawl.NewFirecrawlApp("fc-YOUR-API-KEY", "http://localhost:3002")
app, err := firecrawl.NewFirecrawlApp("fc-YOUR_API_KEY", "https://api.firecrawl.dev")
if err != nil {
log.Fatalf("Failed to create FirecrawlApp: %v", err)
}


@ -1,10 +1,9 @@
module github.com/mendableai/firecrawl/apps/go-sdk/examples
module github.com/mendableai/firecrawl-go-examples
go 1.22.5
replace github.com/mendableai/firecrawl => ../
require (
github.com/google/uuid v1.6.0
github.com/mendableai/firecrawl v0.0.0-00010101000000-000000000000
)
require github.com/google/uuid v1.6.0
require github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 // indirect


@ -4,6 +4,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 h1:461um7fbSQYj2E3ETl8GINuRg5MTY3BdjMnogwUIhBs=
github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46/go.mod h1:mTGbJ37fy43aaqonp/tdpzCH516jHFw/XVvfFi4QXHo=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=

apps/go-sdk/firecrawl/.gitignore (vendored, new file, +2 lines)

@ -0,0 +1,2 @@
.env
vendor


@ -1,4 +1,4 @@
module github.com/mendableai/firecrawl/apps/go-sdk
module github.com/mendableai/firecrawl-go
go 1.22.5


@ -24,6 +24,7 @@
"devDependencies": {
"@types/jest": "^29.5.12",
"@types/supertest": "^6.0.2",
"artillery": "^2.0.19",
"typescript": "^5.4.5"
}
}

File diff suppressed because it is too large.


@ -1,5 +1,4 @@
name: firecrawl
version: '3.9'
x-common-service: &common-service
build: apps/api