Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
Synced 2025-08-13 22:35:53 +08:00

Merge branch 'main' into nsc/llm-usage-extract

This commit is contained in commit 260a726f37.
@@ -14,7 +14,7 @@
     "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
     "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
     "test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'",
-    "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'",
+    "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth|src/scraper/scrapeURL)'",
     "test:snips": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false src/__tests__/snips/*.test.ts",
     "workers": "nodemon --exec ts-node src/services/queue-worker.ts",
     "worker:production": "node dist/src/services/queue-worker.js",
@@ -60,9 +60,6 @@ content-type: application/json
   "sitemapOnly": true
 }
-
-Authorization: Bearer {{$dotenv TEST_API_KEY}}
-

 ### Extract Firecrawl Title
 # @name extractFirecrawl
 POST {{baseUrl}}/v1/extract HTTP/1.1
apps/api/src/controllers/v1/crawl-errors.ts (new file, 81 lines)
@@ -0,0 +1,81 @@
import { Response } from "express";
import {
  CrawlErrorsResponse,
  CrawlStatusParams,
  CrawlStatusResponse,
  ErrorResponse,
  RequestWithAuth,
} from "./types";
import {
  getCrawl,
  getCrawlExpiry,
  getCrawlJobs,
  getDoneJobsOrdered,
  getDoneJobsOrderedLength,
  getThrottledJobs,
  isCrawlFinished,
} from "../../lib/crawl-redis";
import { getScrapeQueue, redisConnection } from "../../services/queue-service";
import {
  supabaseGetJobById,
  supabaseGetJobsById,
} from "../../lib/supabase-jobs";
import { configDotenv } from "dotenv";
import { Job, JobState } from "bullmq";
import { logger } from "../../lib/logger";
configDotenv();

export async function getJob(id: string) {
  const job = await getScrapeQueue().getJob(id);
  if (!job) return job;

  return job;
}

export async function getJobs(ids: string[]) {
  const jobs: (Job & { id: string })[] = (
    await Promise.all(ids.map((x) => getScrapeQueue().getJob(x)))
  ).filter((x) => x) as (Job & { id: string })[];

  return jobs;
}

export async function crawlErrorsController(
  req: RequestWithAuth<CrawlStatusParams, undefined, CrawlErrorsResponse>,
  res: Response<CrawlErrorsResponse>,
) {
  const sc = await getCrawl(req.params.jobId);
  if (!sc) {
    return res.status(404).json({ success: false, error: "Job not found" });
  }

  if (sc.team_id !== req.auth.team_id) {
    return res.status(403).json({ success: false, error: "Forbidden" });
  }

  let jobStatuses = await Promise.all(
    (await getCrawlJobs(req.params.jobId)).map(
      async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
    ),
  );

  const failedJobIDs: string[] = [];

  for (const [id, status] of jobStatuses) {
    if (
      status === "failed"
    ) {
      failedJobIDs.push(id);
    }
  }

  res.status(200).json({
    errors: (await getJobs(failedJobIDs)).map(x => ({
      id: x.id,
      timestamp: x.finishedOn !== undefined ? (new Date(x.finishedOn).toISOString()) : undefined,
      url: x.data.url,
      error: x.failedReason,
    })),
    robotsBlocked: await redisConnection.smembers("crawl:" + req.params.jobId + ":robots_blocked"),
  });
}
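For reference, a successful response from this new controller is shaped like the sketch below. The values are made up; only the field names come from the code above (timestamp is job.finishedOn rendered as an ISO string, error is the BullMQ failedReason, and robotsBlocked is read from the crawl:<id>:robots_blocked set).

// Hypothetical payload from GET /v1/crawl/:jobId/errors (illustrative values only).
const exampleCrawlErrorsResponse = {
  errors: [
    {
      id: "scrape-job-id",                           // placeholder job ID
      timestamp: "2025-01-01T00:00:00.000Z",         // job.finishedOn as ISO string
      url: "https://example.com/broken-page",
      error: "Request failed with status code 500",  // failedReason from BullMQ
    },
  ],
  robotsBlocked: ["https://example.com/admin"],      // members of crawl:<id>:robots_blocked
};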
@@ -13,6 +13,7 @@ import {
   getDoneJobsOrderedLength,
   getThrottledJobs,
   isCrawlFinished,
+  isCrawlFinishedLocked,
 } from "../../lib/crawl-redis";
 import { getScrapeQueue } from "../../services/queue-service";
 import {
@@ -117,7 +118,7 @@ export async function crawlStatusController(
     sc.cancelled
       ? "cancelled"
       : validJobStatuses.every((x) => x[1] === "completed") &&
-          await isCrawlFinished(req.params.jobId)
+          (await isCrawlFinishedLocked(req.params.jobId) || await isCrawlFinished(req.params.jobId))
         ? "completed"
        : "scraping";
@@ -85,6 +85,11 @@ export async function getMapResults({

   const crawler = crawlToCrawler(id, sc);

+  try {
+    sc.robots = await crawler.getRobotsTxt();
+    await crawler.importRobotsTxt(sc.robots);
+  } catch (_) {}
+
   // If sitemapOnly is true, only get links from sitemap
   if (crawlerOptions.sitemapOnly) {
     const sitemap = await crawler.tryGetSitemap(
@@ -34,7 +34,7 @@ export const url = z.preprocess(
     .url()
     .regex(/^https?:\/\//, "URL uses unsupported protocol")
     .refine(
-      (x) => /\.[a-z]{2,}([\/?#]|$)/i.test(x),
+      (x) => /\.[a-z]{2,}(:\d+)?([\/?#]|$)/i.test(x),
       "URL must have a valid top-level domain or be a valid path",
     )
     .refine((x) => {
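The only change in this refine is the optional (:\d+)? group, which lets the TLD check accept an explicit port. A minimal sketch of the effect; the URLs are made-up examples:

// Standalone check mirroring the updated regex.
const hasTldOrPath = (x: string) => /\.[a-z]{2,}(:\d+)?([\/?#]|$)/i.test(x);

console.log(hasTldOrPath("https://example.com/page"));      // true (accepted before and after)
console.log(hasTldOrPath("https://example.com:8080/page")); // true (newly accepted: port follows the TLD)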
@@ -569,6 +569,19 @@ export type CrawlStatusResponse =
       data: Document[];
     };

+
+export type CrawlErrorsResponse =
+  | ErrorResponse
+  | {
+      errors: {
+        id: string,
+        timestamp?: string,
+        url: string,
+        error: string,
+      }[];
+      robotsBlocked: string[];
+    };
+
 type AuthObject = {
   team_id: string;
   plan: PlanType | undefined;
@@ -128,6 +128,7 @@ export async function isCrawlFinished(id: string) {
   return (
     (await redisConnection.scard("crawl:" + id + ":jobs_done")) ===
       (await redisConnection.scard("crawl:" + id + ":jobs"))
+    && (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
   );
 }

@@ -135,6 +136,10 @@ export async function isCrawlFinishedLocked(id: string) {
   return await redisConnection.exists("crawl:" + id + ":finish");
 }

+export async function finishCrawlKickoff(id: string) {
+  await redisConnection.set("crawl:" + id + ":kickoff:finish", "yes", "EX", 24 * 60 * 60);
+}
+
 export async function finishCrawl(id: string) {
   if (await isCrawlFinished(id)) {
     _logger.debug("Marking crawl as finished.", {
@@ -152,6 +157,9 @@ export async function finishCrawl(id: string) {
       module: "crawl-redis",
       method: "finishCrawl",
       crawlId: id,
+      jobs_done: (await redisConnection.scard("crawl:" + id + ":jobs_done")),
+      jobs: (await redisConnection.scard("crawl:" + id + ":jobs")),
+      kickoff_finished: (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null,
     });
   }
 }
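The new crawl:<id>:kickoff:finish key means a crawl can only count as finished once the kickoff phase has explicitly declared itself done. A minimal sketch of that gate, assuming an ioredis client named redis connected to a local Redis instance (the project's actual redisConnection wiring is not shown here):

import Redis from "ioredis";

const redis = new Redis(); // assumption: local Redis on the default port

// Sketch of the gate added to isCrawlFinished: every queued job must be done
// AND finishCrawlKickoff must have set the kickoff:finish key.
async function isCrawlFinishedSketch(id: string): Promise<boolean> {
  const jobsDone = await redis.scard("crawl:" + id + ":jobs_done");
  const jobs = await redis.scard("crawl:" + id + ":jobs");
  const kickoffFinished = (await redis.get("crawl:" + id + ":kickoff:finish")) !== null;
  return jobsDone === jobs && kickoffFinished;
}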
@@ -28,6 +28,7 @@ import { extractStatusController } from "../controllers/v1/extract-status";
 import { creditUsageController } from "../controllers/v1/credit-usage";
 import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { searchController } from "../controllers/v1/search";
+import { crawlErrorsController } from "../controllers/v1/crawl-errors";

 function checkCreditsMiddleware(
   minimum?: number,
@@ -192,6 +193,18 @@ v1Router.get(
   wrap((req: any, res): any => crawlStatusController(req, res, true)),
 );

+v1Router.get(
+  "/crawl/:jobId/errors",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  wrap(crawlErrorsController),
+);
+
+v1Router.get(
+  "/batch/scrape/:jobId/errors",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  wrap(crawlErrorsController),
+);
+
 v1Router.get(
   "/scrape/:jobId",
   authMiddleware(RateLimiterMode.CrawlStatus),
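With these routes registered, the error listing can be fetched directly over HTTP using the same Bearer auth as the other v1 endpoints. A minimal sketch with fetch (Node 18+); the base URL, crawl ID, and API key are placeholders:

const baseUrl = "http://localhost:3002";   // assumption: self-hosted API
const crawlId = "<your-crawl-job-id>";     // placeholder

const res = await fetch(`${baseUrl}/v1/crawl/${crawlId}/errors`, {
  headers: { Authorization: `Bearer ${process.env.TEST_API_KEY}` },
});
const body = await res.json();             // { errors: [...], robotsBlocked: [...] } on success
console.log(body.errors?.length, "failed scrapes");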
@ -2,7 +2,7 @@ import axios, { AxiosError } from "axios";
|
||||
import cheerio, { load } from "cheerio";
|
||||
import { URL } from "url";
|
||||
import { getLinksFromSitemap } from "./sitemap";
|
||||
import robotsParser from "robots-parser";
|
||||
import robotsParser, { Robot } from "robots-parser";
|
||||
import { getURLDepth } from "./utils/maxDepthUtils";
|
||||
import { axiosTimeout } from "../../lib/timeout";
|
||||
import { logger as _logger } from "../../lib/logger";
|
||||
@ -20,7 +20,7 @@ export class WebCrawler {
|
||||
private crawledUrls: Map<string, string> = new Map();
|
||||
private limit: number;
|
||||
private robotsTxtUrl: string;
|
||||
public robots: any;
|
||||
public robots: Robot;
|
||||
private generateImgAltText: boolean;
|
||||
private allowBackwardCrawling: boolean;
|
||||
private allowExternalContentLinks: boolean;
|
||||
@ -63,7 +63,7 @@ export class WebCrawler {
|
||||
this.includes = Array.isArray(includes) ? includes : [];
|
||||
this.excludes = Array.isArray(excludes) ? excludes : [];
|
||||
this.limit = limit;
|
||||
this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
|
||||
this.robotsTxtUrl = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
|
||||
this.robots = robotsParser(this.robotsTxtUrl, "");
|
||||
// Deprecated, use limit instead
|
||||
this.maxCrawledLinks = maxCrawledLinks ?? limit;
|
||||
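The robotsTxtUrl change avoids a double slash when the base URL already ends in /. A small sketch of the two inputs; the URLs are illustrative:

// New behaviour: only insert a slash when the base URL does not already end with one.
const robotsTxtUrl = (baseUrl: string) =>
  `${baseUrl}${baseUrl.endsWith("/") ? "" : "/"}robots.txt`;

console.log(robotsTxtUrl("https://example.com"));  // https://example.com/robots.txt
console.log(robotsTxtUrl("https://example.com/")); // https://example.com/robots.txt (was //robots.txt before)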
@@ -217,45 +217,46 @@ export class WebCrawler {
     };

     const _urlsHandler = async (urls: string[]) => {
-      let uniqueURLs: string[] = [];
-      for (const url of urls) {
-        if (
-          await redisConnection.sadd(
-            "sitemap:" + this.jobId + ":links",
-            normalizeUrl(url),
-          )
-        ) {
-          uniqueURLs.push(url);
-        }
-      }
-
-      await redisConnection.expire(
-        "sitemap:" + this.jobId + ":links",
-        3600,
-        "NX",
-      );
-      if (uniqueURLs.length > 0) {
-        urlsHandler(uniqueURLs);
-      }
+      if (fromMap && onlySitemap) {
+        return urlsHandler(urls);
+      } else {
+        let filteredLinks = this.filterLinks(
+          [...new Set(urls)],
+          leftOfLimit,
+          this.maxCrawledDepth,
+          fromMap,
+        );
+        leftOfLimit -= filteredLinks.length;
+        let uniqueURLs: string[] = [];
+        for (const url of filteredLinks) {
+          if (
+            await redisConnection.sadd(
+              "sitemap:" + this.jobId + ":links",
+              normalizeUrl(url),
+            )
+          ) {
+            uniqueURLs.push(url);
+          }
+        }
+
+        await redisConnection.expire(
+          "sitemap:" + this.jobId + ":links",
+          3600,
+          "NX",
+        );
+        if (uniqueURLs.length > 0) {
+          return urlsHandler(uniqueURLs);
+        }
+      }
     };

-    let count = await this.tryFetchSitemapLinks(
-      this.initialUrl,
-      (urls: string[]) => {
-        if (fromMap && onlySitemap) {
-          return urlsHandler(urls);
-        } else {
-          let filteredLinks = this.filterLinks(
-            [...new Set(urls)],
-            leftOfLimit,
-            this.maxCrawledDepth,
-            fromMap,
-          );
-          leftOfLimit -= filteredLinks.length;
-          return _urlsHandler(filteredLinks);
-        }
-      },
-    );
+    let count = (await Promise.all([
+      this.tryFetchSitemapLinks(
+        this.initialUrl,
+        _urlsHandler,
+      ),
+      ...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
+    ])).reduce((a,x) => a+x, 0);

     if (count > 0) {
       if (
@@ -298,6 +299,16 @@ export class WebCrawler {
       this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
     ) {
       return fullUrl;
+    } else if (
+      this.isInternalLink(fullUrl) &&
+      this.noSections(fullUrl) &&
+      !this.matchesExcludes(path) &&
+      !this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
+    ) {
+      (async() => {
+        await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl);
+        await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX");
+      })();
     }
   } else {
     // EXTERNAL LINKS
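tryGetSitemap now also fans out to any sitemaps declared in robots.txt via robots-parser's getSitemaps(). A minimal sketch of that call in isolation, with a made-up robots.txt body instead of a fetched one:

import robotsParser from "robots-parser";

// Illustrative robots.txt content; the crawler fetches the real file over HTTP.
const robots = robotsParser(
  "https://example.com/robots.txt",
  "User-agent: *\nAllow: /\nSitemap: https://example.com/sitemap.xml",
);

console.log(robots.getSitemaps()); // ["https://example.com/sitemap.xml"]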
@@ -49,12 +49,14 @@ const excludeNonMainTags = [

 const forceIncludeMainTags = ["#main"];

-export const removeUnwantedElements = (
+export const htmlTransform = (
   html: string,
+  url: string,
   scrapeOptions: ScrapeOptions,
 ) => {
-  const soup = load(html);
+  let soup = load(html);

   // remove unwanted elements
   if (
     scrapeOptions.includeTags &&
     scrapeOptions.includeTags.filter((x) => x.trim().length !== 0).length > 0
@@ -66,7 +68,8 @@ export const removeUnwantedElements = (
         newRoot.append(soup(element).clone());
       });
     });
-    return newRoot.html() ?? "";
+
+    soup = load(newRoot.html() ?? "");
   }

   soup("script, style, noscript, meta, head").remove();
@@ -114,6 +117,42 @@ export const removeUnwantedElements = (
     });
   }

+  // always return biggest image
+  soup("img[srcset]").each((_, el) => {
+    const sizes = el.attribs.srcset.split(",").map(x => {
+      const tok = x.trim().split(" ");
+      return {
+        url: tok[0],
+        size: parseInt((tok[1] ?? "1x").slice(0, -1), 10),
+        isX: (tok[1] ?? "").endsWith("x")
+      };
+    });
+
+    if (sizes.every(x => x.isX) && el.attribs.src) {
+      sizes.push({
+        url: el.attribs.src,
+        size: 1,
+        isX: true,
+      });
+    }
+
+    sizes.sort((a,b) => b.size - a.size);
+
+    el.attribs.src = sizes[0]?.url;
+  });
+
+  // absolute links
+  soup("img[src]").each((_, el) => {
+    try {
+      el.attribs.src = new URL(el.attribs.src, url).href;
+    } catch (_) {}
+  });
+  soup("a[href]").each((_, el) => {
+    try {
+      el.attribs.href = new URL(el.attribs.href, url).href;
+    } catch (_) {}
+  });
+
   const cleanedHtml = soup.html();
   return cleanedHtml;
 };
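The srcset handling rewrites img src to the largest candidate, treating the original src as a 1x candidate when every descriptor is density-based. A standalone sketch of the selection rule, with a made-up srcset value:

// Illustrative input; mirrors the parsing and sorting done in htmlTransform.
const srcset = "small.jpg 480w, large.jpg 1080w";
const sizes = srcset.split(",").map((x) => {
  const tok = x.trim().split(" ");
  return {
    url: tok[0],
    size: parseInt((tok[1] ?? "1x").slice(0, -1), 10), // strip the trailing "w" or "x"
    isX: (tok[1] ?? "").endsWith("x"),
  };
});
sizes.sort((a, b) => b.size - a.size);
console.log(sizes[0].url); // "large.jpg" -- the value written back into el.attribs.src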
@@ -1,7 +1,7 @@
 import { parseMarkdown } from "../../../lib/html-to-markdown";
 import { Meta } from "..";
 import { Document } from "../../../controllers/v1/types";
-import { removeUnwantedElements } from "../lib/removeUnwantedElements";
+import { htmlTransform } from "../lib/removeUnwantedElements";
 import { extractLinks } from "../lib/extractLinks";
 import { extractMetadata } from "../lib/extractMetadata";
 import { performLLMExtract } from "./llmExtract";
@@ -41,7 +41,7 @@ export function deriveHTMLFromRawHTML(
     );
   }

-  document.html = removeUnwantedElements(document.rawHtml, meta.options);
+  document.html = htmlTransform(document.rawHtml, document.metadata.url ?? document.metadata.sourceURL ?? meta.url, meta.options);
   return document;
 }
@@ -23,6 +23,7 @@ import {
   addCrawlJobs,
   crawlToCrawler,
   finishCrawl,
+  finishCrawlKickoff,
   generateURLPermutations,
   getCrawl,
   getCrawlJobCount,
@@ -675,9 +676,17 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {

     logger.debug("Done queueing jobs!");

+    await finishCrawlKickoff(job.data.crawl_id);
     await finishCrawlIfNeeded(job, sc);

     return { success: true };
   } catch (error) {
     logger.error("An error occurred!", { error });
+    await finishCrawlKickoff(job.data.crawl_id);
+    const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
+    if (sc) {
+      await finishCrawlIfNeeded(job, sc);
+    }
     return { success: false, error };
   }
 }
@@ -711,6 +720,7 @@ async function processJob(job: Job & { id: string }, token: string) {
     teamId: job.data?.team_id ?? undefined,
   });
   logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url });
+  const start = Date.now();

   // Check if the job URL is researchhub and block it immediately
   // TODO: remove this once solve the root issue
@@ -737,7 +747,6 @@ async function processJob(job: Job & { id: string }, token: string) {
       current_step: "SCRAPING",
       current_url: "",
     });
-    const start = Date.now();

     if (job.data.crawl_id) {
       const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
@@ -988,6 +997,19 @@ async function processJob(job: Job & { id: string }, token: string) {
     logger.info(`🐂 Job done ${job.id}`);
     return data;
   } catch (error) {
+    if (job.data.crawl_id) {
+      const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
+
+      logger.debug("Declaring job as done...");
+      await addCrawlJobDone(job.data.crawl_id, job.id, false);
+      await redisConnection.srem(
+        "crawl:" + job.data.crawl_id + ":visited_unique",
+        normalizeURL(job.data.url, sc),
+      );
+
+      await finishCrawlIfNeeded(job, sc);
+    }
+
     const isEarlyTimeout =
       error instanceof Error && error.message === "timeout";
     const isCancelled =
@@ -1041,6 +1063,9 @@ async function processJob(job: Job & { id: string }, token: string) {
     );
   }

+  const end = Date.now();
+  const timeTakenInSeconds = (end - start) / 1000;
+
   logger.debug("Logging job to DB...");
   await logJob(
     {
@@ -1053,7 +1078,7 @@ async function processJob(job: Job & { id: string }, token: string) {
           "Something went wrong... Contact help@mendable.ai"),
       num_docs: 0,
       docs: [],
-      time_taken: 0,
+      time_taken: timeTakenInSeconds,
       team_id: job.data.team_id,
       mode: job.data.mode,
       url: job.data.url,
@@ -1064,39 +1089,6 @@ async function processJob(job: Job & { id: string }, token: string) {
       },
       true,
     );
-
-    if (job.data.crawl_id) {
-      const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
-
-      logger.debug("Declaring job as done...");
-      await addCrawlJobDone(job.data.crawl_id, job.id, false);
-      await redisConnection.srem(
-        "crawl:" + job.data.crawl_id + ":visited_unique",
-        normalizeURL(job.data.url, sc),
-      );
-
-      await finishCrawlIfNeeded(job, sc);
-
-      // await logJob({
-      //   job_id: job.data.crawl_id,
-      //   success: false,
-      //   message:
-      //     typeof error === "string"
-      //       ? error
-      //       : error.message ??
-      //         "Something went wrong... Contact help@mendable.ai",
-      //   num_docs: 0,
-      //   docs: [],
-      //   time_taken: 0,
-      //   team_id: job.data.team_id,
-      //   mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
-      //   url: sc ? sc.originUrl ?? job.data.url : job.data.url,
-      //   crawlerOptions: sc ? sc.crawlerOptions : undefined,
-      //   scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions,
-      //   origin: job.data.origin,
-      // });
-    }
     // done(null, data);
     return data;
   }
 }
@@ -1126,5 +1118,6 @@ async function processJob(job: Job & { id: string }, token: string) {
     await new Promise((resolve) => setTimeout(resolve, 500));
   }

+  console.log("All jobs finished. Worker out!");
   process.exit(0);
 })();
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.14.0",
+  "version": "1.14.1",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -279,9 +279,11 @@ export interface ErrorResponse {
  */
 export class FirecrawlError extends Error {
   statusCode: number;
-  constructor(message: string, statusCode: number) {
+  details?: any;
+  constructor(message: string, statusCode: number, details?: any) {
     super(message);
     this.statusCode = statusCode;
+    this.details = details;
   }
 }
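With the optional details field, SDK callers can inspect structured error information when the API returns it. A minimal usage sketch (ES module, Node 18+); the API key and prompt are placeholders:

import FirecrawlApp, { FirecrawlError } from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" }); // placeholder key

try {
  await app.extract(["https://example.com"], { prompt: "Extract the page title" });
} catch (err) {
  if (err instanceof FirecrawlError) {
    // details is populated from error.response?.data?.details when available
    console.error(err.statusCode, err.message, err.details);
  }
}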
@@ -312,6 +314,26 @@ export interface SearchResponse {
   error?: string;
 }

+/**
+ * Response interface for crawl/batch scrape error monitoring.
+ */
+export interface CrawlErrorsResponse {
+  /**
+   * Scrapes that errored out + error details
+   */
+  errors: {
+    id: string,
+    timestamp?: string,
+    url: string,
+    error: string,
+  }[];
+
+  /**
+   * URLs blocked by robots.txt
+   */
+  robotsBlocked: string[];
+};
+
 /**
  * Main class for interacting with the Firecrawl API.
  * Provides methods for scraping, searching, crawling, and mapping web content.
@@ -619,6 +641,29 @@ export default class FirecrawlApp {
     return { success: false, error: "Internal server error." };
   }

+  /**
+   * Returns information about crawl errors.
+   * @param id - The ID of the crawl operation.
+   * @returns Information about crawl errors.
+   */
+  async checkCrawlErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.deleteRequest(
+        `${this.apiUrl}/v1/crawl/${id}/errors`,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "check crawl errors");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
   /**
    * Cancels a crawl job using the Firecrawl API.
    * @param id - The ID of the crawl operation.
@@ -881,6 +926,29 @@
     return { success: false, error: "Internal server error." };
   }

+  /**
+   * Returns information about batch scrape errors.
+   * @param id - The ID of the batch scrape operation.
+   * @returns Information about batch scrape errors.
+   */
+  async checkBatchScrapeErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.deleteRequest(
+        `${this.apiUrl}/v1/batch/scrape/${id}/errors`,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "check batch scrape errors");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
   /**
    * Extracts information from URLs using the Firecrawl API.
    * Currently in Beta. Expect breaking changes on future minor versions.
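Both new SDK methods follow the same calling pattern as the existing status checks. A minimal usage sketch; the IDs are placeholders for values returned when a crawl or batch scrape is started:

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" }); // placeholder key

const crawlErrors = await app.checkCrawlErrors("<crawl-id>");
const batchErrors = await app.checkBatchScrapeErrors("<batch-scrape-id>");

// Narrow away the ErrorResponse branch before reading the fields.
if ("errors" in crawlErrors) {
  console.log(crawlErrors.errors, crawlErrors.robotsBlocked);
}
if ("errors" in batchErrors) {
  console.log(batchErrors.errors.length, "failed batch scrapes");
}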
@@ -941,9 +1009,9 @@
         this.handleError(response, "extract");
       }
     } catch (error: any) {
-      throw new FirecrawlError(error.message, 500);
+      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
     }
-    return { success: false, error: "Internal server error." };
+    return { success: false, error: "Internal server error."};
   }

   /**
@@ -985,7 +1053,7 @@
         this.handleError(response, "start extract job");
       }
     } catch (error: any) {
-      throw new FirecrawlError(error.message, 500);
+      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
     }
     return { success: false, error: "Internal server error." };
   }
@@ -120,7 +120,10 @@ class FirecrawlApp:
             json=scrape_params,
         )
         if response.status_code == 200:
-            response = response.json()
+            try:
+                response = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
             if response['success'] and 'data' in response:
                 return response['data']
             elif "error" in response:
@@ -159,7 +162,10 @@
        if response.status_code != 200:
            raise Exception(f"Request failed with status code {response.status_code}")

-        return response.json()
+        try:
+            return response.json()
+        except:
+            raise Exception(f'Failed to parse Firecrawl response as JSON.')

    def crawl_url(self, url: str,
                  params: Optional[Dict[str, Any]] = None,
@@ -194,7 +200,10 @@
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
-            id = response.json().get('id')
+            try:
+                id = response.json().get('id')
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            return self._monitor_job_status(id, headers, poll_interval)

        else:
@@ -223,7 +232,10 @@
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
-            return response.json()
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
        else:
            self._handle_error(response, 'start crawl job')
@@ -245,7 +257,10 @@
        headers = self._prepare_headers()
        response = self._get_request(f'{self.api_url}{endpoint}', headers)
        if response.status_code == 200:
-            status_data = response.json()
+            try:
+                status_data = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            if status_data['status'] == 'completed':
                if 'data' in status_data:
                    data = status_data['data']
@@ -261,7 +276,10 @@
                        if status_response.status_code != 200:
                            logger.error(f"Failed to fetch next page: {status_response.status_code}")
                            break
-                        next_data = status_response.json()
+                        try:
+                            next_data = status_response.json()
+                        except:
+                            raise Exception(f'Failed to parse Firecrawl response as JSON.')
                        data.extend(next_data.get('data', []))
                        status_data = next_data
            except Exception as e:
@@ -291,6 +309,26 @@
        else:
            self._handle_error(response, 'check crawl status')

+    def check_crawl_errors(self, id: str) -> Dict[str, Any]:
+        """
+        Returns information about crawl errors.
+
+        Args:
+            id (str): The ID of the crawl job.
+
+        Returns:
+            Dict[str, Any]: Information about crawl errors.
+        """
+        headers = self._prepare_headers()
+        response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
+        if response.status_code == 200:
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, "check crawl errors")
+
    def cancel_crawl(self, id: str) -> Dict[str, Any]:
        """
        Cancel an asynchronous crawl job using the Firecrawl API.
@@ -304,7 +342,10 @@
        headers = self._prepare_headers()
        response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
        if response.status_code == 200:
-            return response.json()
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
        else:
            self._handle_error(response, "cancel crawl job")
@@ -352,7 +393,10 @@
            json=json_data,
        )
        if response.status_code == 200:
-            response = response.json()
+            try:
+                response = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            if response['success'] and 'links' in response:
                return response
            elif 'error' in response:
@@ -395,7 +439,10 @@
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
-            id = response.json().get('id')
+            try:
+                id = response.json().get('id')
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            return self._monitor_job_status(id, headers, poll_interval)

        else:
@@ -424,7 +471,10 @@
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
-            return response.json()
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
        else:
            self._handle_error(response, 'start batch scrape job')
@@ -464,7 +514,10 @@
        headers = self._prepare_headers()
        response = self._get_request(f'{self.api_url}{endpoint}', headers)
        if response.status_code == 200:
-            status_data = response.json()
+            try:
+                status_data = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            if status_data['status'] == 'completed':
                if 'data' in status_data:
                    data = status_data['data']
@@ -480,7 +533,10 @@
                        if status_response.status_code != 200:
                            logger.error(f"Failed to fetch next page: {status_response.status_code}")
                            break
-                        next_data = status_response.json()
+                        try:
+                            next_data = status_response.json()
+                        except:
+                            raise Exception(f'Failed to parse Firecrawl response as JSON.')
                        data.extend(next_data.get('data', []))
                        status_data = next_data
            except Exception as e:
@@ -510,6 +566,25 @@
        else:
            self._handle_error(response, 'check batch scrape status')

+    def check_batch_scrape_errors(self, id: str) -> Dict[str, Any]:
+        """
+        Returns information about batch scrape errors.
+
+        Args:
+            id (str): The ID of the crawl job.
+
+        Returns:
+            Dict[str, Any]: Information about crawl errors.
+        """
+        headers = self._prepare_headers()
+        response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
+        if response.status_code == 200:
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, "check batch scrape errors")
+
    def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Any:
        """
@@ -550,7 +625,10 @@
                headers
            )
            if response.status_code == 200:
-                data = response.json()
+                try:
+                    data = response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
                if data['success']:
                    job_id = data.get('id')
                    if not job_id:
@@ -563,7 +641,10 @@
                        headers
                    )
                    if status_response.status_code == 200:
-                        status_data = status_response.json()
+                        try:
+                            status_data = status_response.json()
+                        except:
+                            raise Exception(f'Failed to parse Firecrawl response as JSON.')
                        if status_data['status'] == 'completed':
                            if status_data['success']:
                                return status_data
@@ -601,7 +682,10 @@
        try:
            response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
            if response.status_code == 200:
-                return response.json()
+                try:
+                    return response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
            else:
                self._handle_error(response, "get extract status")
        except Exception as e:
@@ -641,7 +725,10 @@
        try:
            response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
            if response.status_code == 200:
-                return response.json()
+                try:
+                    return response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
            else:
                self._handle_error(response, "async extract")
        except Exception as e:
@@ -771,16 +858,22 @@

                status_response = self._get_request(api_url, headers)
                if status_response.status_code == 200:
-                    status_data = status_response.json()
+                    try:
+                        status_data = status_response.json()
+                    except:
+                        raise Exception(f'Failed to parse Firecrawl response as JSON.')
                    if status_data['status'] == 'completed':
                        if 'data' in status_data:
                            data = status_data['data']
                            while 'next' in status_data:
-                                if len(status_data['data']) == 0:
-                                    break
-                                status_response = self._get_request(status_data['next'], headers)
-                                status_data = status_response.json()
-                                data.extend(status_data.get('data', []))
+                                if len(status_data['data']) == 0:
+                                    break
+                                status_response = self._get_request(status_data['next'], headers)
+                                try:
+                                    status_data = status_response.json()
+                                except:
+                                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
+                                data.extend(status_data.get('data', []))
                            status_data['data'] = data
                            return status_data
                else:
@@ -804,8 +897,12 @@
        Raises:
            Exception: An exception with a message containing the status code and error details from the response.
        """
-        error_message = response.json().get('error', 'No error message provided.')
-        error_details = response.json().get('details', 'No additional error details provided.')
+        try:
+            error_message = response.json().get('error', 'No error message provided.')
+            error_details = response.json().get('details', 'No additional error details provided.')
+        except:
+            raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
+

        if response.status_code == 402:
            message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"