Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-13 23:45:52 +08:00)

Commit 260a726f37: Merge branch 'main' into nsc/llm-usage-extract
@@ -14,7 +14,7 @@
     "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
     "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
     "test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'",
-    "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'",
+    "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth|src/scraper/scrapeURL)'",
     "test:snips": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false src/__tests__/snips/*.test.ts",
     "workers": "nodemon --exec ts-node src/services/queue-worker.ts",
     "worker:production": "node dist/src/services/queue-worker.js",
@@ -60,9 +60,6 @@ content-type: application/json
     "sitemapOnly": true
 }
 
-Authorization: Bearer {{$dotenv TEST_API_KEY}}
-
-
 ### Extract Firecrawl Title
 # @name extractFirecrawl
 POST {{baseUrl}}/v1/extract HTTP/1.1
apps/api/src/controllers/v1/crawl-errors.ts (new file, 81 lines)
@@ -0,0 +1,81 @@
+import { Response } from "express";
+import {
+  CrawlErrorsResponse,
+  CrawlStatusParams,
+  CrawlStatusResponse,
+  ErrorResponse,
+  RequestWithAuth,
+} from "./types";
+import {
+  getCrawl,
+  getCrawlExpiry,
+  getCrawlJobs,
+  getDoneJobsOrdered,
+  getDoneJobsOrderedLength,
+  getThrottledJobs,
+  isCrawlFinished,
+} from "../../lib/crawl-redis";
+import { getScrapeQueue, redisConnection } from "../../services/queue-service";
+import {
+  supabaseGetJobById,
+  supabaseGetJobsById,
+} from "../../lib/supabase-jobs";
+import { configDotenv } from "dotenv";
+import { Job, JobState } from "bullmq";
+import { logger } from "../../lib/logger";
+configDotenv();
+
+export async function getJob(id: string) {
+  const job = await getScrapeQueue().getJob(id);
+  if (!job) return job;
+
+  return job;
+}
+
+export async function getJobs(ids: string[]) {
+  const jobs: (Job & { id: string })[] = (
+    await Promise.all(ids.map((x) => getScrapeQueue().getJob(x)))
+  ).filter((x) => x) as (Job & { id: string })[];
+
+  return jobs;
+}
+
+export async function crawlErrorsController(
+  req: RequestWithAuth<CrawlStatusParams, undefined, CrawlErrorsResponse>,
+  res: Response<CrawlErrorsResponse>,
+) {
+  const sc = await getCrawl(req.params.jobId);
+  if (!sc) {
+    return res.status(404).json({ success: false, error: "Job not found" });
+  }
+
+  if (sc.team_id !== req.auth.team_id) {
+    return res.status(403).json({ success: false, error: "Forbidden" });
+  }
+
+  let jobStatuses = await Promise.all(
+    (await getCrawlJobs(req.params.jobId)).map(
+      async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
+    ),
+  );
+
+  const failedJobIDs: string[] = [];
+
+  for (const [id, status] of jobStatuses) {
+    if (
+      status === "failed"
+    ) {
+      failedJobIDs.push(id);
+    }
+  }
+
+  res.status(200).json({
+    errors: (await getJobs(failedJobIDs)).map(x => ({
+      id: x.id,
+      timestamp: x.finishedOn !== undefined ? (new Date(x.finishedOn).toISOString()) : undefined,
+      url: x.data.url,
+      error: x.failedReason,
+    })),
+    robotsBlocked: await redisConnection.smembers("crawl:" + req.params.jobId + ":robots_blocked"),
+  });
+}
@@ -13,6 +13,7 @@ import {
   getDoneJobsOrderedLength,
   getThrottledJobs,
   isCrawlFinished,
+  isCrawlFinishedLocked,
 } from "../../lib/crawl-redis";
 import { getScrapeQueue } from "../../services/queue-service";
 import {
@@ -117,7 +118,7 @@ export async function crawlStatusController(
     sc.cancelled
       ? "cancelled"
       : validJobStatuses.every((x) => x[1] === "completed") &&
-          await isCrawlFinished(req.params.jobId)
+          (await isCrawlFinishedLocked(req.params.jobId) || await isCrawlFinished(req.params.jobId))
         ? "completed"
         : "scraping";
 
@@ -85,6 +85,11 @@ export async function getMapResults({
 
   const crawler = crawlToCrawler(id, sc);
 
+  try {
+    sc.robots = await crawler.getRobotsTxt();
+    await crawler.importRobotsTxt(sc.robots);
+  } catch (_) {}
+
   // If sitemapOnly is true, only get links from sitemap
   if (crawlerOptions.sitemapOnly) {
     const sitemap = await crawler.tryGetSitemap(
@@ -34,7 +34,7 @@ export const url = z.preprocess(
     .url()
     .regex(/^https?:\/\//, "URL uses unsupported protocol")
     .refine(
-      (x) => /\.[a-z]{2,}([\/?#]|$)/i.test(x),
+      (x) => /\.[a-z]{2,}(:\d+)?([\/?#]|$)/i.test(x),
       "URL must have a valid top-level domain or be a valid path",
     )
     .refine((x) => {
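Note (added for illustration, not part of the commit): the schema change above adds an optional `(:\d+)?` group so that URLs carrying an explicit port still pass the top-level-domain check. A minimal TypeScript sketch of the before/after behavior:

    const oldPattern = /\.[a-z]{2,}([\/?#]|$)/i;
    const newPattern = /\.[a-z]{2,}(:\d+)?([\/?#]|$)/i;

    oldPattern.test("https://example.com:8080/docs"); // false - the port broke the match
    newPattern.test("https://example.com:8080/docs"); // true  - the port is now tolerated
    newPattern.test("https://example.com/docs");      // true  - unchanged for URLs without a port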
@@ -569,6 +569,19 @@ export type CrawlStatusResponse =
       data: Document[];
     };
 
+
+export type CrawlErrorsResponse =
+  | ErrorResponse
+  | {
+      errors: {
+        id: string,
+        timestamp?: string,
+        url: string,
+        error: string,
+      }[];
+      robotsBlocked: string[];
+    };
+
 type AuthObject = {
   team_id: string;
   plan: PlanType | undefined;
@@ -128,6 +128,7 @@ export async function isCrawlFinished(id: string) {
   return (
     (await redisConnection.scard("crawl:" + id + ":jobs_done")) ===
       (await redisConnection.scard("crawl:" + id + ":jobs"))
+      && (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
   );
 }
 
@@ -135,6 +136,10 @@ export async function isCrawlFinishedLocked(id: string) {
   return await redisConnection.exists("crawl:" + id + ":finish");
 }
 
+export async function finishCrawlKickoff(id: string) {
+  await redisConnection.set("crawl:" + id + ":kickoff:finish", "yes", "EX", 24 * 60 * 60);
+}
+
 export async function finishCrawl(id: string) {
   if (await isCrawlFinished(id)) {
     _logger.debug("Marking crawl as finished.", {
@@ -152,6 +157,9 @@ export async function finishCrawl(id: string) {
       module: "crawl-redis",
       method: "finishCrawl",
       crawlId: id,
+      jobs_done: (await redisConnection.scard("crawl:" + id + ":jobs_done")),
+      jobs: (await redisConnection.scard("crawl:" + id + ":jobs")),
+      kickoff_finished: (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null,
     });
   }
 }
@@ -28,6 +28,7 @@ import { extractStatusController } from "../controllers/v1/extract-status";
 import { creditUsageController } from "../controllers/v1/credit-usage";
 import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { searchController } from "../controllers/v1/search";
+import { crawlErrorsController } from "../controllers/v1/crawl-errors";
 
 function checkCreditsMiddleware(
   minimum?: number,
@@ -192,6 +193,18 @@ v1Router.get(
   wrap((req: any, res): any => crawlStatusController(req, res, true)),
 );
 
+v1Router.get(
+  "/crawl/:jobId/errors",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  wrap(crawlErrorsController),
+);
+
+v1Router.get(
+  "/batch/scrape/:jobId/errors",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  wrap(crawlErrorsController),
+);
+
 v1Router.get(
   "/scrape/:jobId",
   authMiddleware(RateLimiterMode.CrawlStatus),
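Note (added for illustration, not part of the commit): the two routes above expose crawlErrorsController for crawls and for batch scrapes. A hedged TypeScript sketch of calling the crawl variant over HTTP; baseUrl, apiKey, and jobId are placeholders, and the commented shape mirrors the CrawlErrorsResponse type added in this commit:

    const resp = await fetch(`${baseUrl}/v1/crawl/${jobId}/errors`, {
      headers: { Authorization: `Bearer ${apiKey}` },
    });
    const body = await resp.json();
    // Expected shape on success:
    // {
    //   errors: [{ id: "...", timestamp: "2025-01-01T00:00:00.000Z", url: "...", error: "..." }],
    //   robotsBlocked: ["https://example.com/blocked-by-robots"]
    // }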
@@ -2,7 +2,7 @@ import axios, { AxiosError } from "axios";
 import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
-import robotsParser from "robots-parser";
+import robotsParser, { Robot } from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../lib/timeout";
 import { logger as _logger } from "../../lib/logger";
@@ -20,7 +20,7 @@ export class WebCrawler {
   private crawledUrls: Map<string, string> = new Map();
   private limit: number;
   private robotsTxtUrl: string;
-  public robots: any;
+  public robots: Robot;
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
@@ -63,7 +63,7 @@ export class WebCrawler {
     this.includes = Array.isArray(includes) ? includes : [];
     this.excludes = Array.isArray(excludes) ? excludes : [];
     this.limit = limit;
-    this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
+    this.robotsTxtUrl = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
     this.robots = robotsParser(this.robotsTxtUrl, "");
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
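Note (added for illustration, not part of the commit): the robots.txt URL construction above now avoids a double slash when the base URL already ends with "/". Restated as a standalone TypeScript helper:

    const robotsTxtUrl = (baseUrl: string) =>
      `${baseUrl}${baseUrl.endsWith("/") ? "" : "/"}robots.txt`;

    robotsTxtUrl("https://example.com");  // "https://example.com/robots.txt"
    robotsTxtUrl("https://example.com/"); // "https://example.com/robots.txt"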
@@ -217,45 +217,46 @@ export class WebCrawler {
     };
 
     const _urlsHandler = async (urls: string[]) => {
-      let uniqueURLs: string[] = [];
-      for (const url of urls) {
-        if (
-          await redisConnection.sadd(
-            "sitemap:" + this.jobId + ":links",
-            normalizeUrl(url),
-          )
-        ) {
-          uniqueURLs.push(url);
-        }
-      }
-
-      await redisConnection.expire(
-        "sitemap:" + this.jobId + ":links",
-        3600,
-        "NX",
-      );
-      if (uniqueURLs.length > 0) {
-        urlsHandler(uniqueURLs);
-      }
+      if (fromMap && onlySitemap) {
+        return urlsHandler(urls);
+      } else {
+        let filteredLinks = this.filterLinks(
+          [...new Set(urls)],
+          leftOfLimit,
+          this.maxCrawledDepth,
+          fromMap,
+        );
+        leftOfLimit -= filteredLinks.length;
+        let uniqueURLs: string[] = [];
+        for (const url of filteredLinks) {
+          if (
+            await redisConnection.sadd(
+              "sitemap:" + this.jobId + ":links",
+              normalizeUrl(url),
+            )
+          ) {
+            uniqueURLs.push(url);
+          }
+        }
+
+        await redisConnection.expire(
+          "sitemap:" + this.jobId + ":links",
+          3600,
+          "NX",
+        );
+        if (uniqueURLs.length > 0) {
+          return urlsHandler(uniqueURLs);
+        }
+      }
     };
 
-    let count = await this.tryFetchSitemapLinks(
-      this.initialUrl,
-      (urls: string[]) => {
-        if (fromMap && onlySitemap) {
-          return urlsHandler(urls);
-        } else {
-          let filteredLinks = this.filterLinks(
-            [...new Set(urls)],
-            leftOfLimit,
-            this.maxCrawledDepth,
-            fromMap,
-          );
-          leftOfLimit -= filteredLinks.length;
-          return _urlsHandler(filteredLinks);
-        }
-      },
-    );
+    let count = (await Promise.all([
+      this.tryFetchSitemapLinks(
+        this.initialUrl,
+        _urlsHandler,
+      ),
+      ...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
+    ])).reduce((a,x) => a+x, 0);
 
     if (count > 0) {
       if (
@@ -298,6 +299,16 @@ export class WebCrawler {
           this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
         ) {
           return fullUrl;
+        } else if (
+          this.isInternalLink(fullUrl) &&
+          this.noSections(fullUrl) &&
+          !this.matchesExcludes(path) &&
+          !this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
+        ) {
+          (async() => {
+            await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl);
+            await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX");
+          })();
         }
       } else {
         // EXTERNAL LINKS
@@ -49,12 +49,14 @@ const excludeNonMainTags = [
 
 const forceIncludeMainTags = ["#main"];
 
-export const removeUnwantedElements = (
+export const htmlTransform = (
   html: string,
+  url: string,
   scrapeOptions: ScrapeOptions,
 ) => {
-  const soup = load(html);
+  let soup = load(html);
+
+  // remove unwanted elements
   if (
     scrapeOptions.includeTags &&
     scrapeOptions.includeTags.filter((x) => x.trim().length !== 0).length > 0
@@ -66,7 +68,8 @@ export const removeUnwantedElements = (
         newRoot.append(soup(element).clone());
       });
     });
-    return newRoot.html() ?? "";
+
+    soup = load(newRoot.html() ?? "");
   }
 
   soup("script, style, noscript, meta, head").remove();
@@ -114,6 +117,42 @@ export const removeUnwantedElements = (
     });
   }
 
+  // always return biggest image
+  soup("img[srcset]").each((_, el) => {
+    const sizes = el.attribs.srcset.split(",").map(x => {
+      const tok = x.trim().split(" ");
+      return {
+        url: tok[0],
+        size: parseInt((tok[1] ?? "1x").slice(0, -1), 10),
+        isX: (tok[1] ?? "").endsWith("x")
+      };
+    });
+
+    if (sizes.every(x => x.isX) && el.attribs.src) {
+      sizes.push({
+        url: el.attribs.src,
+        size: 1,
+        isX: true,
+      });
+    }
+
+    sizes.sort((a,b) => b.size - a.size);
+
+    el.attribs.src = sizes[0]?.url;
+  });
+
+  // absolute links
+  soup("img[src]").each((_, el) => {
+    try {
+      el.attribs.src = new URL(el.attribs.src, url).href;
+    } catch (_) {}
+  });
+  soup("a[href]").each((_, el) => {
+    try {
+      el.attribs.href = new URL(el.attribs.href, url).href;
+    } catch (_) {}
+  });
+
   const cleanedHtml = soup.html();
   return cleanedHtml;
 };
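Note (added for illustration, not part of the commit): the block above adds two passes to htmlTransform. The first rewrites an img's src to the largest srcset candidate; the second resolves relative src/href values against the page URL. An illustrative before/after in TypeScript comments, assuming the page URL is https://example.com/blog/post:

    // Before:
    //   <img src="/img/small.png" srcset="/img/small.png 480w, /img/large.png 1024w">
    //   <a href="../about">About</a>
    // After the srcset pass, src points at the 1024w candidate; after the
    // absolute-links pass both attributes become absolute:
    //   <img src="https://example.com/img/large.png" ...>
    //   <a href="https://example.com/about">About</a>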
@@ -1,7 +1,7 @@
 import { parseMarkdown } from "../../../lib/html-to-markdown";
 import { Meta } from "..";
 import { Document } from "../../../controllers/v1/types";
-import { removeUnwantedElements } from "../lib/removeUnwantedElements";
+import { htmlTransform } from "../lib/removeUnwantedElements";
 import { extractLinks } from "../lib/extractLinks";
 import { extractMetadata } from "../lib/extractMetadata";
 import { performLLMExtract } from "./llmExtract";
@@ -41,7 +41,7 @@ export function deriveHTMLFromRawHTML(
     );
   }
 
-  document.html = removeUnwantedElements(document.rawHtml, meta.options);
+  document.html = htmlTransform(document.rawHtml, document.metadata.url ?? document.metadata.sourceURL ?? meta.url, meta.options);
   return document;
 }
 
@@ -23,6 +23,7 @@ import {
   addCrawlJobs,
   crawlToCrawler,
   finishCrawl,
+  finishCrawlKickoff,
   generateURLPermutations,
   getCrawl,
   getCrawlJobCount,
@@ -675,9 +676,17 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
 
     logger.debug("Done queueing jobs!");
 
+    await finishCrawlKickoff(job.data.crawl_id);
+    await finishCrawlIfNeeded(job, sc);
+
     return { success: true };
   } catch (error) {
     logger.error("An error occurred!", { error });
+    await finishCrawlKickoff(job.data.crawl_id);
+    const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
+    if (sc) {
+      await finishCrawlIfNeeded(job, sc);
+    }
     return { success: false, error };
   }
 }
@@ -711,6 +720,7 @@ async function processJob(job: Job & { id: string }, token: string) {
     teamId: job.data?.team_id ?? undefined,
   });
   logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url });
+  const start = Date.now();
 
   // Check if the job URL is researchhub and block it immediately
   // TODO: remove this once solve the root issue
@@ -737,7 +747,6 @@ async function processJob(job: Job & { id: string }, token: string) {
       current_step: "SCRAPING",
       current_url: "",
     });
-    const start = Date.now();
 
     if (job.data.crawl_id) {
       const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
@@ -988,6 +997,19 @@ async function processJob(job: Job & { id: string }, token: string) {
     logger.info(`🐂 Job done ${job.id}`);
     return data;
   } catch (error) {
+    if (job.data.crawl_id) {
+      const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
+
+      logger.debug("Declaring job as done...");
+      await addCrawlJobDone(job.data.crawl_id, job.id, false);
+      await redisConnection.srem(
+        "crawl:" + job.data.crawl_id + ":visited_unique",
+        normalizeURL(job.data.url, sc),
+      );
+
+      await finishCrawlIfNeeded(job, sc);
+    }
+
     const isEarlyTimeout =
       error instanceof Error && error.message === "timeout";
     const isCancelled =
@@ -1041,6 +1063,9 @@ async function processJob(job: Job & { id: string }, token: string) {
       );
     }
 
+    const end = Date.now();
+    const timeTakenInSeconds = (end - start) / 1000;
+
     logger.debug("Logging job to DB...");
     await logJob(
       {
@@ -1053,7 +1078,7 @@ async function processJob(job: Job & { id: string }, token: string) {
           "Something went wrong... Contact help@mendable.ai"),
         num_docs: 0,
         docs: [],
-        time_taken: 0,
+        time_taken: timeTakenInSeconds,
         team_id: job.data.team_id,
         mode: job.data.mode,
         url: job.data.url,
@@ -1064,39 +1089,6 @@ async function processJob(job: Job & { id: string }, token: string) {
       },
       true,
     );
 
-    if (job.data.crawl_id) {
-      const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
-
-      logger.debug("Declaring job as done...");
-      await addCrawlJobDone(job.data.crawl_id, job.id, false);
-      await redisConnection.srem(
-        "crawl:" + job.data.crawl_id + ":visited_unique",
-        normalizeURL(job.data.url, sc),
-      );
-
-      await finishCrawlIfNeeded(job, sc);
-
-      // await logJob({
-      //   job_id: job.data.crawl_id,
-      //   success: false,
-      //   message:
-      //     typeof error === "string"
-      //       ? error
-      //       : error.message ??
-      //         "Something went wrong... Contact help@mendable.ai",
-      //   num_docs: 0,
-      //   docs: [],
-      //   time_taken: 0,
-      //   team_id: job.data.team_id,
-      //   mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
-      //   url: sc ? sc.originUrl ?? job.data.url : job.data.url,
-      //   crawlerOptions: sc ? sc.crawlerOptions : undefined,
-      //   scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions,
-      //   origin: job.data.origin,
-      // });
-    }
-    // done(null, data);
     return data;
   }
 }
@@ -1126,5 +1118,6 @@ async function processJob(job: Job & { id: string }, token: string) {
     await new Promise((resolve) => setTimeout(resolve, 500));
   }
 
+  console.log("All jobs finished. Worker out!");
   process.exit(0);
 })();
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.14.0",
+  "version": "1.14.1",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -279,9 +279,11 @@ export interface ErrorResponse {
  */
 export class FirecrawlError extends Error {
   statusCode: number;
-  constructor(message: string, statusCode: number) {
+  details?: any;
+  constructor(message: string, statusCode: number, details?: any) {
     super(message);
     this.statusCode = statusCode;
+    this.details = details;
   }
 }
 
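Note (added for illustration, not part of the commit): with the extra constructor parameter above, SDK callers can inspect server-provided details when a request fails. A hedged usage sketch assuming an existing FirecrawlApp instance named app; the exact shape of details depends on the API response and is not specified by this diff:

    try {
      await app.extract(["https://example.com"], { prompt: "Extract the page title" });
    } catch (err) {
      if (err instanceof FirecrawlError) {
        console.error(err.statusCode, err.message);
        console.error(err.details); // populated from error.response?.data?.details when available
      }
    }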
@@ -312,6 +314,26 @@ export interface SearchResponse {
   error?: string;
 }
 
+/**
+ * Response interface for crawl/batch scrape error monitoring.
+ */
+export interface CrawlErrorsResponse {
+  /**
+   * Scrapes that errored out + error details
+   */
+  errors: {
+    id: string,
+    timestamp?: string,
+    url: string,
+    error: string,
+  }[];
+
+  /**
+   * URLs blocked by robots.txt
+   */
+  robotsBlocked: string[];
+};
+
 /**
  * Main class for interacting with the Firecrawl API.
  * Provides methods for scraping, searching, crawling, and mapping web content.
@@ -619,6 +641,29 @@ export default class FirecrawlApp {
     return { success: false, error: "Internal server error." };
   }
 
+  /**
+   * Returns information about crawl errors.
+   * @param id - The ID of the crawl operation.
+   * @returns Information about crawl errors.
+   */
+  async checkCrawlErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.deleteRequest(
+        `${this.apiUrl}/v1/crawl/${id}/errors`,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "check crawl errors");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
   /**
    * Cancels a crawl job using the Firecrawl API.
    * @param id - The ID of the crawl operation.
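Note (added for illustration, not part of the commit): a short TypeScript usage sketch for the new SDK method; the API key and crawl ID are placeholders:

    const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });
    const errorsInfo = await app.checkCrawlErrors(crawlId);
    if ("errors" in errorsInfo) {
      for (const e of errorsInfo.errors) {
        console.log(`${e.url} failed: ${e.error}`);
      }
      console.log("Blocked by robots.txt:", errorsInfo.robotsBlocked);
    }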
@@ -881,6 +926,29 @@ export default class FirecrawlApp {
     return { success: false, error: "Internal server error." };
   }
 
+  /**
+   * Returns information about batch scrape errors.
+   * @param id - The ID of the batch scrape operation.
+   * @returns Information about batch scrape errors.
+   */
+  async checkBatchScrapeErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.deleteRequest(
+        `${this.apiUrl}/v1/batch/scrape/${id}/errors`,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "check batch scrape errors");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
   /**
    * Extracts information from URLs using the Firecrawl API.
    * Currently in Beta. Expect breaking changes on future minor versions.
@@ -941,9 +1009,9 @@ export default class FirecrawlApp {
         this.handleError(response, "extract");
       }
     } catch (error: any) {
-      throw new FirecrawlError(error.message, 500);
+      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
     }
-    return { success: false, error: "Internal server error." };
+    return { success: false, error: "Internal server error."};
   }
 
   /**
@@ -985,7 +1053,7 @@ export default class FirecrawlApp {
         this.handleError(response, "start extract job");
       }
     } catch (error: any) {
-      throw new FirecrawlError(error.message, 500);
+      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
     }
     return { success: false, error: "Internal server error." };
   }
@@ -120,7 +120,10 @@ class FirecrawlApp:
             json=scrape_params,
         )
         if response.status_code == 200:
-            response = response.json()
+            try:
+                response = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
             if response['success'] and 'data' in response:
                 return response['data']
             elif "error" in response:
@@ -159,7 +162,10 @@ class FirecrawlApp:
         if response.status_code != 200:
             raise Exception(f"Request failed with status code {response.status_code}")
 
-        return response.json()
+        try:
+            return response.json()
+        except:
+            raise Exception(f'Failed to parse Firecrawl response as JSON.')
 
     def crawl_url(self, url: str,
                   params: Optional[Dict[str, Any]] = None,
@@ -194,7 +200,10 @@ class FirecrawlApp:
             json_data.update(params)
         response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
         if response.status_code == 200:
-            id = response.json().get('id')
+            try:
+                id = response.json().get('id')
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
             return self._monitor_job_status(id, headers, poll_interval)
 
         else:
@@ -223,7 +232,10 @@ class FirecrawlApp:
             json_data.update(params)
         response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
         if response.status_code == 200:
-            return response.json()
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:
             self._handle_error(response, 'start crawl job')
 
@@ -245,7 +257,10 @@ class FirecrawlApp:
         headers = self._prepare_headers()
         response = self._get_request(f'{self.api_url}{endpoint}', headers)
         if response.status_code == 200:
-            status_data = response.json()
+            try:
+                status_data = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
             if status_data['status'] == 'completed':
                 if 'data' in status_data:
                     data = status_data['data']
@@ -261,7 +276,10 @@ class FirecrawlApp:
                         if status_response.status_code != 200:
                             logger.error(f"Failed to fetch next page: {status_response.status_code}")
                             break
-                        next_data = status_response.json()
+                        try:
+                            next_data = status_response.json()
+                        except:
+                            raise Exception(f'Failed to parse Firecrawl response as JSON.')
                         data.extend(next_data.get('data', []))
                         status_data = next_data
                 except Exception as e:
@@ -291,6 +309,26 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'check crawl status')
 
+    def check_crawl_errors(self, id: str) -> Dict[str, Any]:
+        """
+        Returns information about crawl errors.
+
+        Args:
+            id (str): The ID of the crawl job.
+
+        Returns:
+            Dict[str, Any]: Information about crawl errors.
+        """
+        headers = self._prepare_headers()
+        response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
+        if response.status_code == 200:
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, "check crawl errors")
+
     def cancel_crawl(self, id: str) -> Dict[str, Any]:
         """
         Cancel an asynchronous crawl job using the Firecrawl API.
@@ -304,7 +342,10 @@ class FirecrawlApp:
         headers = self._prepare_headers()
         response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
         if response.status_code == 200:
-            return response.json()
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:
             self._handle_error(response, "cancel crawl job")
 
@@ -352,7 +393,10 @@ class FirecrawlApp:
             json=json_data,
         )
         if response.status_code == 200:
-            response = response.json()
+            try:
+                response = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
             if response['success'] and 'links' in response:
                 return response
             elif 'error' in response:
@@ -395,7 +439,10 @@ class FirecrawlApp:
             json_data.update(params)
         response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
         if response.status_code == 200:
-            id = response.json().get('id')
+            try:
+                id = response.json().get('id')
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
             return self._monitor_job_status(id, headers, poll_interval)
 
         else:
@@ -424,7 +471,10 @@ class FirecrawlApp:
             json_data.update(params)
         response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
         if response.status_code == 200:
-            return response.json()
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:
             self._handle_error(response, 'start batch scrape job')
 
@@ -464,7 +514,10 @@ class FirecrawlApp:
         headers = self._prepare_headers()
         response = self._get_request(f'{self.api_url}{endpoint}', headers)
         if response.status_code == 200:
-            status_data = response.json()
+            try:
+                status_data = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
             if status_data['status'] == 'completed':
                 if 'data' in status_data:
                     data = status_data['data']
@@ -480,7 +533,10 @@ class FirecrawlApp:
                         if status_response.status_code != 200:
                             logger.error(f"Failed to fetch next page: {status_response.status_code}")
                             break
-                        next_data = status_response.json()
+                        try:
+                            next_data = status_response.json()
+                        except:
+                            raise Exception(f'Failed to parse Firecrawl response as JSON.')
                         data.extend(next_data.get('data', []))
                         status_data = next_data
                 except Exception as e:
@@ -510,6 +566,25 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'check batch scrape status')
 
+    def check_batch_scrape_errors(self, id: str) -> Dict[str, Any]:
+        """
+        Returns information about batch scrape errors.
+
+        Args:
+            id (str): The ID of the crawl job.
+
+        Returns:
+            Dict[str, Any]: Information about crawl errors.
+        """
+        headers = self._prepare_headers()
+        response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
+        if response.status_code == 200:
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, "check batch scrape errors")
+
     def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Any:
         """
@@ -550,7 +625,10 @@ class FirecrawlApp:
                 headers
             )
             if response.status_code == 200:
-                data = response.json()
+                try:
+                    data = response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
                 if data['success']:
                     job_id = data.get('id')
                     if not job_id:
@@ -563,7 +641,10 @@ class FirecrawlApp:
                         headers
                     )
                     if status_response.status_code == 200:
-                        status_data = status_response.json()
+                        try:
+                            status_data = status_response.json()
+                        except:
+                            raise Exception(f'Failed to parse Firecrawl response as JSON.')
                         if status_data['status'] == 'completed':
                             if status_data['success']:
                                 return status_data
@@ -601,7 +682,10 @@ class FirecrawlApp:
         try:
            response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
            if response.status_code == 200:
-                return response.json()
+                try:
+                    return response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
            else:
                self._handle_error(response, "get extract status")
        except Exception as e:
@@ -641,7 +725,10 @@ class FirecrawlApp:
         try:
            response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
            if response.status_code == 200:
-                return response.json()
+                try:
+                    return response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
            else:
                self._handle_error(response, "async extract")
        except Exception as e:
@@ -771,16 +858,22 @@ class FirecrawlApp:
 
             status_response = self._get_request(api_url, headers)
             if status_response.status_code == 200:
-                status_data = status_response.json()
+                try:
+                    status_data = status_response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
                 if status_data['status'] == 'completed':
                     if 'data' in status_data:
                         data = status_data['data']
                         while 'next' in status_data:
                             if len(status_data['data']) == 0:
                                 break
                             status_response = self._get_request(status_data['next'], headers)
-                            status_data = status_response.json()
-                            data.extend(status_data.get('data', []))
+                            try:
+                                status_data = status_response.json()
+                            except:
+                                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+                            data.extend(status_data.get('data', []))
                         status_data['data'] = data
                         return status_data
                     else:
@@ -804,8 +897,12 @@ class FirecrawlApp:
         Raises:
             Exception: An exception with a message containing the status code and error details from the response.
         """
-        error_message = response.json().get('error', 'No error message provided.')
-        error_details = response.json().get('details', 'No additional error details provided.')
+        try:
+            error_message = response.json().get('error', 'No error message provided.')
+            error_details = response.json().get('details', 'No additional error details provided.')
+        except:
+            raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
+
 
         if response.status_code == 402:
             message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"