Merge branch 'main' into nsc/llm-usage-extract

Nicolas 2025-01-17 23:02:12 -03:00
commit 260a726f37
15 changed files with 438 additions and 112 deletions

View File

@ -14,7 +14,7 @@
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
"test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
"test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'",
"test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'",
"test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth|src/scraper/scrapeURL)'",
"test:snips": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false src/__tests__/snips/*.test.ts",
"workers": "nodemon --exec ts-node src/services/queue-worker.ts",
"worker:production": "node dist/src/services/queue-worker.js",

View File

@ -60,9 +60,6 @@ content-type: application/json
"sitemapOnly": true
}
Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Extract Firecrawl Title
# @name extractFirecrawl
POST {{baseUrl}}/v1/extract HTTP/1.1

View File

@ -0,0 +1,81 @@
import { Response } from "express";
import {
CrawlErrorsResponse,
CrawlStatusParams,
CrawlStatusResponse,
ErrorResponse,
RequestWithAuth,
} from "./types";
import {
getCrawl,
getCrawlExpiry,
getCrawlJobs,
getDoneJobsOrdered,
getDoneJobsOrderedLength,
getThrottledJobs,
isCrawlFinished,
} from "../../lib/crawl-redis";
import { getScrapeQueue, redisConnection } from "../../services/queue-service";
import {
supabaseGetJobById,
supabaseGetJobsById,
} from "../../lib/supabase-jobs";
import { configDotenv } from "dotenv";
import { Job, JobState } from "bullmq";
import { logger } from "../../lib/logger";
configDotenv();
export async function getJob(id: string) {
const job = await getScrapeQueue().getJob(id);
if (!job) return job;
return job;
}
export async function getJobs(ids: string[]) {
const jobs: (Job & { id: string })[] = (
await Promise.all(ids.map((x) => getScrapeQueue().getJob(x)))
).filter((x) => x) as (Job & { id: string })[];
return jobs;
}
export async function crawlErrorsController(
req: RequestWithAuth<CrawlStatusParams, undefined, CrawlErrorsResponse>,
res: Response<CrawlErrorsResponse>,
) {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ success: false, error: "Job not found" });
}
if (sc.team_id !== req.auth.team_id) {
return res.status(403).json({ success: false, error: "Forbidden" });
}
let jobStatuses = await Promise.all(
(await getCrawlJobs(req.params.jobId)).map(
async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
),
);
const failedJobIDs: string[] = [];
for (const [id, status] of jobStatuses) {
if (
status === "failed"
) {
failedJobIDs.push(id);
}
}
res.status(200).json({
errors: (await getJobs(failedJobIDs)).map(x => ({
id: x.id,
timestamp: x.finishedOn !== undefined ? (new Date(x.finishedOn).toISOString()) : undefined,
url: x.data.url,
error: x.failedReason,
})),
robotsBlocked: await redisConnection.smembers("crawl:" + req.params.jobId + ":robots_blocked"),
});
}
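For orientation, here is a minimal client-side sketch of how this new errors endpoint could be called once the route is registered later in this diff. It is not part of the commit: the base URL, API key, and job ID are placeholders, and the response shape follows the CrawlErrorsResponse type added in this commit.

// Hypothetical client-side call to the new errors endpoint (placeholders throughout).
async function fetchCrawlErrors(jobId: string, apiKey: string) {
  const res = await fetch(`https://api.firecrawl.dev/v1/crawl/${jobId}/errors`, {
    headers: { Authorization: `Bearer ${apiKey}` },
  });
  const body = await res.json();
  // body.errors: { id, timestamp?, url, error }[]; body.robotsBlocked: string[]
  for (const e of body.errors ?? []) {
    console.log(`${e.url} failed: ${e.error}`);
  }
  console.log(`${(body.robotsBlocked ?? []).length} URLs blocked by robots.txt`);
}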

View File

@ -13,6 +13,7 @@ import {
getDoneJobsOrderedLength,
getThrottledJobs,
isCrawlFinished,
isCrawlFinishedLocked,
} from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import {
@ -117,7 +118,7 @@ export async function crawlStatusController(
sc.cancelled
? "cancelled"
: validJobStatuses.every((x) => x[1] === "completed") &&
await isCrawlFinished(req.params.jobId)
(await isCrawlFinishedLocked(req.params.jobId) || await isCrawlFinished(req.params.jobId))
? "completed"
: "scraping";

View File

@ -85,6 +85,11 @@ export async function getMapResults({
const crawler = crawlToCrawler(id, sc);
try {
sc.robots = await crawler.getRobotsTxt();
await crawler.importRobotsTxt(sc.robots);
} catch (_) {}
// If sitemapOnly is true, only get links from sitemap
if (crawlerOptions.sitemapOnly) {
const sitemap = await crawler.tryGetSitemap(

View File

@ -34,7 +34,7 @@ export const url = z.preprocess(
.url()
.regex(/^https?:\/\//, "URL uses unsupported protocol")
.refine(
(x) => /\.[a-z]{2,}([\/?#]|$)/i.test(x),
(x) => /\.[a-z]{2,}(:\d+)?([\/?#]|$)/i.test(x),
"URL must have a valid top-level domain or be a valid path",
)
.refine((x) => {
@ -569,6 +569,19 @@ export type CrawlStatusResponse =
data: Document[];
};
export type CrawlErrorsResponse =
| ErrorResponse
| {
errors: {
id: string,
timestamp?: string,
url: string,
error: string,
}[];
robotsBlocked: string[];
};
type AuthObject = {
team_id: string;
plan: PlanType | undefined;
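Circling back to the url schema change at the start of this file's hunks: the only difference is the optional (:\d+)? group, which lets hostnames with an explicit port pass the TLD check. A quick comparison, illustrative only and not part of the diff:

// Illustrative only: the old pattern rejects a port after the TLD, the new one accepts it.
const oldTldCheck = /\.[a-z]{2,}([\/?#]|$)/i;
const newTldCheck = /\.[a-z]{2,}(:\d+)?([\/?#]|$)/i;
const withPort = "https://example.com:8080/page";
console.log(oldTldCheck.test(withPort)); // false
console.log(newTldCheck.test(withPort)); // true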

View File

@ -128,6 +128,7 @@ export async function isCrawlFinished(id: string) {
return (
(await redisConnection.scard("crawl:" + id + ":jobs_done")) ===
(await redisConnection.scard("crawl:" + id + ":jobs"))
&& (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
);
}
@ -135,6 +136,10 @@ export async function isCrawlFinishedLocked(id: string) {
return await redisConnection.exists("crawl:" + id + ":finish");
}
export async function finishCrawlKickoff(id: string) {
await redisConnection.set("crawl:" + id + ":kickoff:finish", "yes", "EX", 24 * 60 * 60);
}
export async function finishCrawl(id: string) {
if (await isCrawlFinished(id)) {
_logger.debug("Marking crawl as finished.", {
@ -152,6 +157,9 @@ export async function finishCrawl(id: string) {
module: "crawl-redis",
method: "finishCrawl",
crawlId: id,
jobs_done: (await redisConnection.scard("crawl:" + id + ":jobs_done")),
jobs: (await redisConnection.scard("crawl:" + id + ":jobs")),
kickoff_finished: (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null,
});
}
}
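In plain terms, the change above means a crawl only counts as finished once every queued job is done and the kickoff job has flagged (via the crawl:<id>:kickoff:finish key set by finishCrawlKickoff) that it is done enqueuing URLs. A condensed restatement of the check, assuming an ioredis client as used elsewhere in the codebase; this is a sketch, not the library code:

// Condensed restatement of the finished check above; `redis` is assumed to be an ioredis client.
import Redis from "ioredis";

async function crawlLooksFinished(redis: Redis, id: string): Promise<boolean> {
  const [done, total, kickoffDone] = await Promise.all([
    redis.scard(`crawl:${id}:jobs_done`),
    redis.scard(`crawl:${id}:jobs`),
    redis.get(`crawl:${id}:kickoff:finish`),
  ]);
  // Without the kickoff flag, done === total can be true while the kickoff job is still queueing URLs.
  return done === total && kickoffDone !== null;
}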

View File

@ -28,6 +28,7 @@ import { extractStatusController } from "../controllers/v1/extract-status";
import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { searchController } from "../controllers/v1/search";
import { crawlErrorsController } from "../controllers/v1/crawl-errors";
function checkCreditsMiddleware(
minimum?: number,
@ -192,6 +193,18 @@ v1Router.get(
wrap((req: any, res): any => crawlStatusController(req, res, true)),
);
v1Router.get(
"/crawl/:jobId/errors",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlErrorsController),
);
v1Router.get(
"/batch/scrape/:jobId/errors",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlErrorsController),
);
v1Router.get(
"/scrape/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),

View File

@ -2,7 +2,7 @@ import axios, { AxiosError } from "axios";
import cheerio, { load } from "cheerio";
import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import robotsParser from "robots-parser";
import robotsParser, { Robot } from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../lib/timeout";
import { logger as _logger } from "../../lib/logger";
@ -20,7 +20,7 @@ export class WebCrawler {
private crawledUrls: Map<string, string> = new Map();
private limit: number;
private robotsTxtUrl: string;
public robots: any;
public robots: Robot;
private generateImgAltText: boolean;
private allowBackwardCrawling: boolean;
private allowExternalContentLinks: boolean;
@ -63,7 +63,7 @@ export class WebCrawler {
this.includes = Array.isArray(includes) ? includes : [];
this.excludes = Array.isArray(excludes) ? excludes : [];
this.limit = limit;
this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
this.robotsTxtUrl = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
this.robots = robotsParser(this.robotsTxtUrl, "");
// Deprecated, use limit instead
this.maxCrawledLinks = maxCrawledLinks ?? limit;
@ -217,45 +217,46 @@ export class WebCrawler {
};
const _urlsHandler = async (urls: string[]) => {
let uniqueURLs: string[] = [];
for (const url of urls) {
if (
await redisConnection.sadd(
"sitemap:" + this.jobId + ":links",
normalizeUrl(url),
)
) {
uniqueURLs.push(url);
if (fromMap && onlySitemap) {
return urlsHandler(urls);
} else {
let filteredLinks = this.filterLinks(
[...new Set(urls)],
leftOfLimit,
this.maxCrawledDepth,
fromMap,
);
leftOfLimit -= filteredLinks.length;
let uniqueURLs: string[] = [];
for (const url of filteredLinks) {
if (
await redisConnection.sadd(
"sitemap:" + this.jobId + ":links",
normalizeUrl(url),
)
) {
uniqueURLs.push(url);
}
}
}
await redisConnection.expire(
"sitemap:" + this.jobId + ":links",
3600,
"NX",
);
if (uniqueURLs.length > 0) {
urlsHandler(uniqueURLs);
await redisConnection.expire(
"sitemap:" + this.jobId + ":links",
3600,
"NX",
);
if (uniqueURLs.length > 0) {
return urlsHandler(uniqueURLs);
}
}
};
let count = await this.tryFetchSitemapLinks(
this.initialUrl,
(urls: string[]) => {
if (fromMap && onlySitemap) {
return urlsHandler(urls);
} else {
let filteredLinks = this.filterLinks(
[...new Set(urls)],
leftOfLimit,
this.maxCrawledDepth,
fromMap,
);
leftOfLimit -= filteredLinks.length;
return _urlsHandler(filteredLinks);
}
},
);
let count = (await Promise.all([
this.tryFetchSitemapLinks(
this.initialUrl,
_urlsHandler,
),
...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
])).reduce((a,x) => a+x, 0);
if (count > 0) {
if (
@ -298,6 +299,16 @@ export class WebCrawler {
this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
) {
return fullUrl;
} else if (
this.isInternalLink(fullUrl) &&
this.noSections(fullUrl) &&
!this.matchesExcludes(path) &&
!this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
) {
(async() => {
await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl);
await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX");
})();
}
} else {
// EXTERNAL LINKS

View File

@ -49,12 +49,14 @@ const excludeNonMainTags = [
const forceIncludeMainTags = ["#main"];
export const removeUnwantedElements = (
export const htmlTransform = (
html: string,
url: string,
scrapeOptions: ScrapeOptions,
) => {
const soup = load(html);
let soup = load(html);
// remove unwanted elements
if (
scrapeOptions.includeTags &&
scrapeOptions.includeTags.filter((x) => x.trim().length !== 0).length > 0
@ -66,7 +68,8 @@ export const removeUnwantedElements = (
newRoot.append(soup(element).clone());
});
});
return newRoot.html() ?? "";
soup = load(newRoot.html() ?? "");
}
soup("script, style, noscript, meta, head").remove();
@ -114,6 +117,42 @@ export const removeUnwantedElements = (
});
}
// always return biggest image
soup("img[srcset]").each((_, el) => {
const sizes = el.attribs.srcset.split(",").map(x => {
const tok = x.trim().split(" ");
return {
url: tok[0],
size: parseInt((tok[1] ?? "1x").slice(0, -1), 10),
isX: (tok[1] ?? "").endsWith("x")
};
});
if (sizes.every(x => x.isX) && el.attribs.src) {
sizes.push({
url: el.attribs.src,
size: 1,
isX: true,
});
}
sizes.sort((a,b) => b.size - a.size);
el.attribs.src = sizes[0]?.url;
});
// absolute links
soup("img[src]").each((_, el) => {
try {
el.attribs.src = new URL(el.attribs.src, url).href;
} catch (_) {}
});
soup("a[href]").each((_, el) => {
try {
el.attribs.href = new URL(el.attribs.href, url).href;
} catch (_) {}
});
const cleanedHtml = soup.html();
return cleanedHtml;
};
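As a worked example of the srcset handling introduced above: the transform parses each candidate's descriptor, sorts descending by the numeric value, and promotes the largest candidate to src. A standalone rerun of that parsing logic with an invented srcset value:

// Standalone rerun of the srcset parsing above, with an invented example value.
const srcset = "hero-480.jpg 480w, hero-1024.jpg 1024w, hero-1600.jpg 1600w";
const candidates = srcset.split(",").map((x) => {
  const tok = x.trim().split(" ");
  return {
    url: tok[0],
    size: parseInt((tok[1] ?? "1x").slice(0, -1), 10), // "1600w" -> 1600, "2x" -> 2
    isX: (tok[1] ?? "").endsWith("x"),
  };
});
candidates.sort((a, b) => b.size - a.size);
console.log(candidates[0].url); // "hero-1600.jpg" becomes the <img> src

When every candidate uses an x descriptor, the existing src is also pushed in as a 1x entry, so the original image stays in the comparison.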

View File

@ -1,7 +1,7 @@
import { parseMarkdown } from "../../../lib/html-to-markdown";
import { Meta } from "..";
import { Document } from "../../../controllers/v1/types";
import { removeUnwantedElements } from "../lib/removeUnwantedElements";
import { htmlTransform } from "../lib/removeUnwantedElements";
import { extractLinks } from "../lib/extractLinks";
import { extractMetadata } from "../lib/extractMetadata";
import { performLLMExtract } from "./llmExtract";
@ -41,7 +41,7 @@ export function deriveHTMLFromRawHTML(
);
}
document.html = removeUnwantedElements(document.rawHtml, meta.options);
document.html = htmlTransform(document.rawHtml, document.metadata.url ?? document.metadata.sourceURL ?? meta.url, meta.options);
return document;
}

View File

@ -23,6 +23,7 @@ import {
addCrawlJobs,
crawlToCrawler,
finishCrawl,
finishCrawlKickoff,
generateURLPermutations,
getCrawl,
getCrawlJobCount,
@ -675,9 +676,17 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
logger.debug("Done queueing jobs!");
await finishCrawlKickoff(job.data.crawl_id);
await finishCrawlIfNeeded(job, sc);
return { success: true };
} catch (error) {
logger.error("An error occurred!", { error });
await finishCrawlKickoff(job.data.crawl_id);
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
if (sc) {
await finishCrawlIfNeeded(job, sc);
}
return { success: false, error };
}
}
@ -711,6 +720,7 @@ async function processJob(job: Job & { id: string }, token: string) {
teamId: job.data?.team_id ?? undefined,
});
logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url });
const start = Date.now();
// Check if the job URL is researchhub and block it immediately
// TODO: remove this once solve the root issue
@ -737,7 +747,6 @@ async function processJob(job: Job & { id: string }, token: string) {
current_step: "SCRAPING",
current_url: "",
});
const start = Date.now();
if (job.data.crawl_id) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
@ -988,6 +997,19 @@ async function processJob(job: Job & { id: string }, token: string) {
logger.info(`🐂 Job done ${job.id}`);
return data;
} catch (error) {
if (job.data.crawl_id) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
logger.debug("Declaring job as done...");
await addCrawlJobDone(job.data.crawl_id, job.id, false);
await redisConnection.srem(
"crawl:" + job.data.crawl_id + ":visited_unique",
normalizeURL(job.data.url, sc),
);
await finishCrawlIfNeeded(job, sc);
}
const isEarlyTimeout =
error instanceof Error && error.message === "timeout";
const isCancelled =
@ -1041,6 +1063,9 @@ async function processJob(job: Job & { id: string }, token: string) {
);
}
const end = Date.now();
const timeTakenInSeconds = (end - start) / 1000;
logger.debug("Logging job to DB...");
await logJob(
{
@ -1053,7 +1078,7 @@ async function processJob(job: Job & { id: string }, token: string) {
"Something went wrong... Contact help@mendable.ai"),
num_docs: 0,
docs: [],
time_taken: 0,
time_taken: timeTakenInSeconds,
team_id: job.data.team_id,
mode: job.data.mode,
url: job.data.url,
@ -1064,39 +1089,6 @@ async function processJob(job: Job & { id: string }, token: string) {
},
true,
);
if (job.data.crawl_id) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
logger.debug("Declaring job as done...");
await addCrawlJobDone(job.data.crawl_id, job.id, false);
await redisConnection.srem(
"crawl:" + job.data.crawl_id + ":visited_unique",
normalizeURL(job.data.url, sc),
);
await finishCrawlIfNeeded(job, sc);
// await logJob({
// job_id: job.data.crawl_id,
// success: false,
// message:
// typeof error === "string"
// ? error
// : error.message ??
// "Something went wrong... Contact help@mendable.ai",
// num_docs: 0,
// docs: [],
// time_taken: 0,
// team_id: job.data.team_id,
// mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
// url: sc ? sc.originUrl ?? job.data.url : job.data.url,
// crawlerOptions: sc ? sc.crawlerOptions : undefined,
// scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions,
// origin: job.data.origin,
// });
}
// done(null, data);
return data;
}
}
@ -1126,5 +1118,6 @@ async function processJob(job: Job & { id: string }, token: string) {
await new Promise((resolve) => setTimeout(resolve, 500));
}
console.log("All jobs finished. Worker out!");
process.exit(0);
})();
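One behavioral fix in the worker above: `const start = Date.now()` now runs before the scrape is attempted, so the failure branch logs a real duration (timeTakenInSeconds) instead of time_taken: 0. A minimal, generic illustration of the pattern, not the worker code itself:

// Generic sketch: capture the start time outside the try so the catch path can report a real duration.
async function timedJob<T>(run: () => Promise<T>): Promise<T> {
  const start = Date.now();
  try {
    return await run();
  } catch (error) {
    const timeTakenInSeconds = (Date.now() - start) / 1000;
    console.error(`job failed after ${timeTakenInSeconds}s`, error);
    throw error;
  }
}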

View File

@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "1.14.0",
"version": "1.14.1",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@ -279,9 +279,11 @@ export interface ErrorResponse {
*/
export class FirecrawlError extends Error {
statusCode: number;
constructor(message: string, statusCode: number) {
details?: any;
constructor(message: string, statusCode: number, details?: any) {
super(message);
this.statusCode = statusCode;
this.details = details;
}
}
@ -312,6 +314,26 @@ export interface SearchResponse {
error?: string;
}
/**
* Response interface for crawl/batch scrape error monitoring.
*/
export interface CrawlErrorsResponse {
/**
* Scrapes that errored out + error details
*/
errors: {
id: string,
timestamp?: string,
url: string,
error: string,
}[];
/**
* URLs blocked by robots.txt
*/
robotsBlocked: string[];
};
/**
* Main class for interacting with the Firecrawl API.
* Provides methods for scraping, searching, crawling, and mapping web content.
@ -619,6 +641,29 @@ export default class FirecrawlApp {
return { success: false, error: "Internal server error." };
}
/**
* Returns information about crawl errors.
* @param id - The ID of the crawl operation.
* @returns Information about crawl errors.
*/
async checkCrawlErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse> {
const headers = this.prepareHeaders();
try {
const response: AxiosResponse = await this.deleteRequest(
`${this.apiUrl}/v1/crawl/${id}/errors`,
headers
);
if (response.status === 200) {
return response.data;
} else {
this.handleError(response, "check crawl errors");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
}
return { success: false, error: "Internal server error." };
}
/**
* Cancels a crawl job using the Firecrawl API.
* @param id - The ID of the crawl operation.
@ -881,6 +926,29 @@ export default class FirecrawlApp {
return { success: false, error: "Internal server error." };
}
/**
* Returns information about batch scrape errors.
* @param id - The ID of the batch scrape operation.
* @returns Information about batch scrape errors.
*/
async checkBatchScrapeErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse> {
const headers = this.prepareHeaders();
try {
const response: AxiosResponse = await this.deleteRequest(
`${this.apiUrl}/v1/batch/scrape/${id}/errors`,
headers
);
if (response.status === 200) {
return response.data;
} else {
this.handleError(response, "check batch scrape errors");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
}
return { success: false, error: "Internal server error." };
}
/**
* Extracts information from URLs using the Firecrawl API.
* Currently in Beta. Expect breaking changes on future minor versions.
@ -941,9 +1009,9 @@ export default class FirecrawlApp {
this.handleError(response, "extract");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
throw new FirecrawlError(error.message, 500, error.response?.data?.details);
}
return { success: false, error: "Internal server error." };
return { success: false, error: "Internal server error."};
}
/**
@ -985,7 +1053,7 @@ export default class FirecrawlApp {
this.handleError(response, "start extract job");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
throw new FirecrawlError(error.message, 500, error.response?.data?.details);
}
return { success: false, error: "Internal server error." };
}
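A hypothetical usage sketch for the SDK surface added above (checkCrawlErrors, checkBatchScrapeErrors, and the new details field on FirecrawlError). The API key and job ID are placeholders:

// Hypothetical usage of the new SDK methods; apiKey and jobId are placeholders.
import FirecrawlApp, { CrawlErrorsResponse, ErrorResponse } from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

async function reportCrawlErrors(jobId: string): Promise<void> {
  const result: CrawlErrorsResponse | ErrorResponse = await app.checkCrawlErrors(jobId);
  if ("errors" in result) {
    for (const e of result.errors) {
      console.log(`[${e.timestamp ?? "no timestamp"}] ${e.url}: ${e.error}`);
    }
    console.log(`${result.robotsBlocked.length} URLs blocked by robots.txt`);
  } else {
    console.error("Could not fetch errors:", result.error);
  }
}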

View File

@ -120,7 +120,10 @@ class FirecrawlApp:
json=scrape_params,
)
if response.status_code == 200:
response = response.json()
try:
response = response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
if response['success'] and 'data' in response:
return response['data']
elif "error" in response:
@ -159,7 +162,10 @@ class FirecrawlApp:
if response.status_code != 200:
raise Exception(f"Request failed with status code {response.status_code}")
return response.json()
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
def crawl_url(self, url: str,
params: Optional[Dict[str, Any]] = None,
@ -194,7 +200,10 @@ class FirecrawlApp:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
if response.status_code == 200:
id = response.json().get('id')
try:
id = response.json().get('id')
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
return self._monitor_job_status(id, headers, poll_interval)
else:
@ -223,7 +232,10 @@ class FirecrawlApp:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
if response.status_code == 200:
return response.json()
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, 'start crawl job')
@ -245,7 +257,10 @@ class FirecrawlApp:
headers = self._prepare_headers()
response = self._get_request(f'{self.api_url}{endpoint}', headers)
if response.status_code == 200:
status_data = response.json()
try:
status_data = response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
if status_data['status'] == 'completed':
if 'data' in status_data:
data = status_data['data']
@ -261,7 +276,10 @@ class FirecrawlApp:
if status_response.status_code != 200:
logger.error(f"Failed to fetch next page: {status_response.status_code}")
break
next_data = status_response.json()
try:
next_data = status_response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
data.extend(next_data.get('data', []))
status_data = next_data
except Exception as e:
@ -291,6 +309,26 @@ class FirecrawlApp:
else:
self._handle_error(response, 'check crawl status')
def check_crawl_errors(self, id: str) -> Dict[str, Any]:
"""
Returns information about crawl errors.
Args:
id (str): The ID of the crawl job.
Returns:
Dict[str, Any]: Information about crawl errors.
"""
headers = self._prepare_headers()
response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
if response.status_code == 200:
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, "check crawl errors")
def cancel_crawl(self, id: str) -> Dict[str, Any]:
"""
Cancel an asynchronous crawl job using the Firecrawl API.
@ -304,7 +342,10 @@ class FirecrawlApp:
headers = self._prepare_headers()
response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
if response.status_code == 200:
return response.json()
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, "cancel crawl job")
@ -352,7 +393,10 @@ class FirecrawlApp:
json=json_data,
)
if response.status_code == 200:
response = response.json()
try:
response = response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
if response['success'] and 'links' in response:
return response
elif 'error' in response:
@ -395,7 +439,10 @@ class FirecrawlApp:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
if response.status_code == 200:
id = response.json().get('id')
try:
id = response.json().get('id')
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
return self._monitor_job_status(id, headers, poll_interval)
else:
@ -424,7 +471,10 @@ class FirecrawlApp:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
if response.status_code == 200:
return response.json()
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, 'start batch scrape job')
@ -464,7 +514,10 @@ class FirecrawlApp:
headers = self._prepare_headers()
response = self._get_request(f'{self.api_url}{endpoint}', headers)
if response.status_code == 200:
status_data = response.json()
try:
status_data = response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
if status_data['status'] == 'completed':
if 'data' in status_data:
data = status_data['data']
@ -480,7 +533,10 @@ class FirecrawlApp:
if status_response.status_code != 200:
logger.error(f"Failed to fetch next page: {status_response.status_code}")
break
next_data = status_response.json()
try:
next_data = status_response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
data.extend(next_data.get('data', []))
status_data = next_data
except Exception as e:
@ -510,6 +566,25 @@ class FirecrawlApp:
else:
self._handle_error(response, 'check batch scrape status')
def check_batch_scrape_errors(self, id: str) -> Dict[str, Any]:
"""
Returns information about batch scrape errors.
Args:
id (str): The ID of the batch scrape job.
Returns:
Dict[str, Any]: Information about batch scrape errors.
"""
headers = self._prepare_headers()
response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
if response.status_code == 200:
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, "check batch scrape errors")
def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Any:
"""
@ -550,7 +625,10 @@ class FirecrawlApp:
headers
)
if response.status_code == 200:
data = response.json()
try:
data = response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
if data['success']:
job_id = data.get('id')
if not job_id:
@ -563,7 +641,10 @@ class FirecrawlApp:
headers
)
if status_response.status_code == 200:
status_data = status_response.json()
try:
status_data = status_response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
if status_data['status'] == 'completed':
if status_data['success']:
return status_data
@ -601,7 +682,10 @@ class FirecrawlApp:
try:
response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
if response.status_code == 200:
return response.json()
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, "get extract status")
except Exception as e:
@ -641,7 +725,10 @@ class FirecrawlApp:
try:
response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
if response.status_code == 200:
return response.json()
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, "async extract")
except Exception as e:
@ -771,16 +858,22 @@ class FirecrawlApp:
status_response = self._get_request(api_url, headers)
if status_response.status_code == 200:
status_data = status_response.json()
try:
status_data = status_response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
if status_data['status'] == 'completed':
if 'data' in status_data:
data = status_data['data']
while 'next' in status_data:
if len(status_data['data']) == 0:
break
status_response = self._get_request(status_data['next'], headers)
status_data = status_response.json()
data.extend(status_data.get('data', []))
if len(status_data['data']) == 0:
break
status_response = self._get_request(status_data['next'], headers)
try:
status_data = status_response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
data.extend(status_data.get('data', []))
status_data['data'] = data
return status_data
else:
@ -804,8 +897,12 @@ class FirecrawlApp:
Raises:
Exception: An exception with a message containing the status code and error details from the response.
"""
error_message = response.json().get('error', 'No error message provided.')
error_details = response.json().get('details', 'No additional error details provided.')
try:
error_message = response.json().get('error', 'No error message provided.')
error_details = response.json().get('details', 'No additional error details provided.')
except:
raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
if response.status_code == 402:
message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"