diff --git a/apps/api/package.json b/apps/api/package.json
index ca03d9c2..88233235 100644
--- a/apps/api/package.json
+++ b/apps/api/package.json
@@ -14,7 +14,7 @@
     "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
     "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
     "test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'",
-    "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'",
+    "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth|src/scraper/scrapeURL)'",
     "test:snips": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false src/__tests__/snips/*.test.ts",
     "workers": "nodemon --exec ts-node src/services/queue-worker.ts",
     "worker:production": "node dist/src/services/queue-worker.js",
diff --git a/apps/api/requests.http b/apps/api/requests.http
index 95195e9f..d8cc3633 100644
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -60,9 +60,6 @@ content-type: application/json
   "sitemapOnly": true
 }
 
-Authorization: Bearer {{$dotenv TEST_API_KEY}}
-
-
 ### Extract Firecrawl Title
 # @name extractFirecrawl
 POST {{baseUrl}}/v1/extract HTTP/1.1
diff --git a/apps/api/src/controllers/v1/crawl-errors.ts b/apps/api/src/controllers/v1/crawl-errors.ts
new file mode 100644
index 00000000..b64d02fa
--- /dev/null
+++ b/apps/api/src/controllers/v1/crawl-errors.ts
@@ -0,0 +1,81 @@
+import { Response } from "express";
+import {
+  CrawlErrorsResponse,
+  CrawlStatusParams,
+  CrawlStatusResponse,
+  ErrorResponse,
+  RequestWithAuth,
+} from "./types";
+import {
+  getCrawl,
+  getCrawlExpiry,
+  getCrawlJobs,
+  getDoneJobsOrdered,
+  getDoneJobsOrderedLength,
+  getThrottledJobs,
+  isCrawlFinished,
+} from "../../lib/crawl-redis";
+import { getScrapeQueue, redisConnection } from "../../services/queue-service";
+import {
+  supabaseGetJobById,
+  supabaseGetJobsById,
+} from "../../lib/supabase-jobs";
+import { configDotenv } from "dotenv";
+import { Job, JobState } from "bullmq";
+import { logger } from "../../lib/logger";
+configDotenv();
+
+export async function getJob(id: string) {
+  const job = await getScrapeQueue().getJob(id);
+  if (!job) return job;
+
+  return job;
+}
+
+export async function getJobs(ids: string[]) {
+  const jobs: (Job & { id: string })[] = (
+    await Promise.all(ids.map((x) => getScrapeQueue().getJob(x)))
+  ).filter((x) => x) as (Job & { id: string })[];
+
+  return jobs;
+}
+
+export async function crawlErrorsController(
+  req: RequestWithAuth<CrawlStatusParams, undefined, CrawlErrorsResponse>,
+  res: Response<CrawlErrorsResponse>,
+) {
+  const sc = await getCrawl(req.params.jobId);
+  if (!sc) {
+    return res.status(404).json({ success: false, error: "Job not found" });
+  }
+
+  if (sc.team_id !== req.auth.team_id) {
+    return res.status(403).json({ success: false, error: "Forbidden" });
+  }
+
+  let jobStatuses = await Promise.all(
+    (await getCrawlJobs(req.params.jobId)).map(
+      async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
+    ),
+  );
+
+  const failedJobIDs: string[] = [];
+
+  for (const [id, status] of jobStatuses) {
+    if (
+      status === "failed"
+    ) {
+      failedJobIDs.push(id);
+    }
+  }
+
+  res.status(200).json({
+    errors: (await getJobs(failedJobIDs)).map(x => ({
+      id: x.id,
+      timestamp: x.finishedOn !== undefined ? (new Date(x.finishedOn).toISOString()) : undefined,
+      url: x.data.url,
+      error: x.failedReason,
+    })),
+    robotsBlocked: await redisConnection.smembers("crawl:" + req.params.jobId + ":robots_blocked"),
+  });
+}
diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts
index 02046364..694b3dca 100644
--- a/apps/api/src/controllers/v1/crawl-status.ts
+++ b/apps/api/src/controllers/v1/crawl-status.ts
@@ -13,6 +13,7 @@ import {
   getDoneJobsOrderedLength,
   getThrottledJobs,
   isCrawlFinished,
+  isCrawlFinishedLocked,
 } from "../../lib/crawl-redis";
 import { getScrapeQueue } from "../../services/queue-service";
 import {
@@ -117,7 +118,7 @@ export async function crawlStatusController(
     sc.cancelled
       ? "cancelled"
       : validJobStatuses.every((x) => x[1] === "completed") &&
-          await isCrawlFinished(req.params.jobId)
+          (await isCrawlFinishedLocked(req.params.jobId) || await isCrawlFinished(req.params.jobId))
         ? "completed"
         : "scraping";
 
diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts
index 2afae0d4..c8f7dd96 100644
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@@ -85,6 +85,11 @@ export async function getMapResults({
 
   const crawler = crawlToCrawler(id, sc);
 
+  try {
+    sc.robots = await crawler.getRobotsTxt();
+    await crawler.importRobotsTxt(sc.robots);
+  } catch (_) {}
+
   // If sitemapOnly is true, only get links from sitemap
   if (crawlerOptions.sitemapOnly) {
     const sitemap = await crawler.tryGetSitemap(
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 78c35902..479cc33f 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -34,7 +34,7 @@ export const url = z.preprocess(
     .url()
     .regex(/^https?:\/\//, "URL uses unsupported protocol")
     .refine(
-      (x) => /\.[a-z]{2,}([\/?#]|$)/i.test(x),
+      (x) => /\.[a-z]{2,}(:\d+)?([\/?#]|$)/i.test(x),
      "URL must have a valid top-level domain or be a valid path",
     )
     .refine((x) => {
@@ -569,6 +569,19 @@ export type CrawlStatusResponse =
       data: Document[];
     };
 
+
+export type CrawlErrorsResponse =
+  | ErrorResponse
+  | {
+      errors: {
+        id: string,
+        timestamp?: string,
+        url: string,
+        error: string,
+      }[];
+      robotsBlocked: string[];
+    };
+
 type AuthObject = {
   team_id: string;
   plan: PlanType | undefined;
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index 5b25969e..add189ba 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -128,6 +128,7 @@ export async function isCrawlFinished(id: string) {
   return (
     (await redisConnection.scard("crawl:" + id + ":jobs_done")) ===
     (await redisConnection.scard("crawl:" + id + ":jobs"))
+    && (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
   );
 }
 
@@ -135,6 +136,10 @@ export async function isCrawlFinishedLocked(id: string) {
   return await redisConnection.exists("crawl:" + id + ":finish");
 }
 
+export async function finishCrawlKickoff(id: string) {
+  await redisConnection.set("crawl:" + id + ":kickoff:finish", "yes", "EX", 24 * 60 * 60);
+}
+
 export async function finishCrawl(id: string) {
   if (await isCrawlFinished(id)) {
     _logger.debug("Marking crawl as finished.", {
@@ -152,6 +157,9 @@
       module: "crawl-redis",
      method: "finishCrawl",
      crawlId: id,
+      jobs_done: (await redisConnection.scard("crawl:" + id + ":jobs_done")),
+      jobs: (await redisConnection.scard("crawl:" + id + ":jobs")),
+      kickoff_finished: (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null,
     });
   }
 }
diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts
index a916dd40..4aacfe18 100644
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@@ -28,6 +28,7 @@ import { extractStatusController } from "../controllers/v1/extract-status";
 import { creditUsageController } from "../controllers/v1/credit-usage";
 import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { searchController } from "../controllers/v1/search";
+import { crawlErrorsController } from "../controllers/v1/crawl-errors";
 
 function checkCreditsMiddleware(
   minimum?: number,
@@ -192,6 +193,18 @@ v1Router.get(
   wrap((req: any, res): any => crawlStatusController(req, res, true)),
 );
 
+v1Router.get(
+  "/crawl/:jobId/errors",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  wrap(crawlErrorsController),
+);
+
+v1Router.get(
+  "/batch/scrape/:jobId/errors",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  wrap(crawlErrorsController),
+);
+
 v1Router.get(
   "/scrape/:jobId",
   authMiddleware(RateLimiterMode.CrawlStatus),
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 5662fff9..7d4be97b 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -2,7 +2,7 @@ import axios, { AxiosError } from "axios";
 import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
-import robotsParser from "robots-parser";
+import robotsParser, { Robot } from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../lib/timeout";
 import { logger as _logger } from "../../lib/logger";
@@ -20,7 +20,7 @@ export class WebCrawler {
   private crawledUrls: Map<string, string> = new Map();
   private limit: number;
   private robotsTxtUrl: string;
-  public robots: any;
+  public robots: Robot;
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
@@ -63,7 +63,7 @@ export class WebCrawler {
     this.includes = Array.isArray(includes) ? includes : [];
     this.excludes = Array.isArray(excludes) ? excludes : [];
     this.limit = limit;
-    this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
+    this.robotsTxtUrl = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
     this.robots = robotsParser(this.robotsTxtUrl, "");
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
@@ -217,45 +217,46 @@ export class WebCrawler {
       };
 
       const _urlsHandler = async (urls: string[]) => {
-        let uniqueURLs: string[] = [];
-        for (const url of urls) {
-          if (
-            await redisConnection.sadd(
-              "sitemap:" + this.jobId + ":links",
-              normalizeUrl(url),
-            )
-          ) {
-            uniqueURLs.push(url);
+        if (fromMap && onlySitemap) {
+          return urlsHandler(urls);
+        } else {
+          let filteredLinks = this.filterLinks(
+            [...new Set(urls)],
+            leftOfLimit,
+            this.maxCrawledDepth,
+            fromMap,
+          );
+          leftOfLimit -= filteredLinks.length;
+          let uniqueURLs: string[] = [];
+          for (const url of filteredLinks) {
+            if (
+              await redisConnection.sadd(
+                "sitemap:" + this.jobId + ":links",
+                normalizeUrl(url),
+              )
+            ) {
+              uniqueURLs.push(url);
+            }
           }
-        }
-        await redisConnection.expire(
-          "sitemap:" + this.jobId + ":links",
-          3600,
-          "NX",
-        );
-        if (uniqueURLs.length > 0) {
-          urlsHandler(uniqueURLs);
+          await redisConnection.expire(
+            "sitemap:" + this.jobId + ":links",
+            3600,
+            "NX",
+          );
+          if (uniqueURLs.length > 0) {
+            return urlsHandler(uniqueURLs);
+          }
         }
       };
 
-      let count = await this.tryFetchSitemapLinks(
-        this.initialUrl,
-        (urls: string[]) => {
-          if (fromMap && onlySitemap) {
-            return urlsHandler(urls);
-          } else {
-            let filteredLinks = this.filterLinks(
-              [...new Set(urls)],
-              leftOfLimit,
-              this.maxCrawledDepth,
-              fromMap,
-            );
-            leftOfLimit -= filteredLinks.length;
-            return _urlsHandler(filteredLinks);
-          }
-        },
-      );
+      let count = (await Promise.all([
+        this.tryFetchSitemapLinks(
+          this.initialUrl,
+          _urlsHandler,
+        ),
+        ...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
+      ])).reduce((a,x) => a+x, 0);
 
       if (count > 0) {
         if (
@@ -298,6 +299,16 @@ export class WebCrawler {
         this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
       ) {
         return fullUrl;
+      } else if (
+        this.isInternalLink(fullUrl) &&
+        this.noSections(fullUrl) &&
+        !this.matchesExcludes(path) &&
+        !this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
+      ) {
+        (async() => {
+          await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl);
+          await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX");
+        })();
       }
     } else {
       // EXTERNAL LINKS
diff --git a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts
index 3536211d..57015557 100644
--- a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts
@@ -49,12 +49,14 @@ const excludeNonMainTags = [
 
 const forceIncludeMainTags = ["#main"];
 
-export const removeUnwantedElements = (
+export const htmlTransform = (
   html: string,
+  url: string,
   scrapeOptions: ScrapeOptions,
 ) => {
-  const soup = load(html);
+  let soup = load(html);
+
+  // remove unwanted elements
   if (
     scrapeOptions.includeTags &&
     scrapeOptions.includeTags.filter((x) => x.trim().length !== 0).length > 0
@@ -66,7 +68,8 @@
         newRoot.append(soup(element).clone());
       });
     });
-    return newRoot.html() ?? "";
+
+    soup = load(newRoot.html() ?? "");
   }
 
   soup("script, style, noscript, meta, head").remove();
@@ -114,6 +117,42 @@
     });
   }
 
+  // always return biggest image
+  soup("img[srcset]").each((_, el) => {
+    const sizes = el.attribs.srcset.split(",").map(x => {
+      const tok = x.trim().split(" ");
+      return {
+        url: tok[0],
+        size: parseInt((tok[1] ?? "1x").slice(0, -1), 10),
+        isX: (tok[1] ?? "").endsWith("x")
"").endsWith("x") + }; + }); + + if (sizes.every(x => x.isX) && el.attribs.src) { + sizes.push({ + url: el.attribs.src, + size: 1, + isX: true, + }); + } + + sizes.sort((a,b) => b.size - a.size); + + el.attribs.src = sizes[0]?.url; + }); + + // absolute links + soup("img[src]").each((_, el) => { + try { + el.attribs.src = new URL(el.attribs.src, url).href; + } catch (_) {} + }); + soup("a[href]").each((_, el) => { + try { + el.attribs.href = new URL(el.attribs.href, url).href; + } catch (_) {} + }); + const cleanedHtml = soup.html(); return cleanedHtml; }; diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts index e14896ef..54bf0d46 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/index.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts @@ -1,7 +1,7 @@ import { parseMarkdown } from "../../../lib/html-to-markdown"; import { Meta } from ".."; import { Document } from "../../../controllers/v1/types"; -import { removeUnwantedElements } from "../lib/removeUnwantedElements"; +import { htmlTransform } from "../lib/removeUnwantedElements"; import { extractLinks } from "../lib/extractLinks"; import { extractMetadata } from "../lib/extractMetadata"; import { performLLMExtract } from "./llmExtract"; @@ -41,7 +41,7 @@ export function deriveHTMLFromRawHTML( ); } - document.html = removeUnwantedElements(document.rawHtml, meta.options); + document.html = htmlTransform(document.rawHtml, document.metadata.url ?? document.metadata.sourceURL ?? meta.url, meta.options); return document; } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index d13cb198..de8f5567 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -23,6 +23,7 @@ import { addCrawlJobs, crawlToCrawler, finishCrawl, + finishCrawlKickoff, generateURLPermutations, getCrawl, getCrawlJobCount, @@ -675,9 +676,17 @@ async function processKickoffJob(job: Job & { id: string }, token: string) { logger.debug("Done queueing jobs!"); + await finishCrawlKickoff(job.data.crawl_id); + await finishCrawlIfNeeded(job, sc); + return { success: true }; } catch (error) { logger.error("An error occurred!", { error }); + await finishCrawlKickoff(job.data.crawl_id); + const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; + if (sc) { + await finishCrawlIfNeeded(job, sc); + } return { success: false, error }; } } @@ -711,6 +720,7 @@ async function processJob(job: Job & { id: string }, token: string) { teamId: job.data?.team_id ?? 
   });
   logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url });
+  const start = Date.now();
 
   // Check if the job URL is researchhub and block it immediately
   // TODO: remove this once solve the root issue
@@ -737,7 +747,6 @@
       current_step: "SCRAPING",
       current_url: "",
     });
-    const start = Date.now();
 
     if (job.data.crawl_id) {
       const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
@@ -988,6 +997,19 @@ async function processJob(job: Job & { id: string }, token: string) {
     logger.info(`🐂 Job done ${job.id}`);
     return data;
   } catch (error) {
+    if (job.data.crawl_id) {
+      const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
+
+      logger.debug("Declaring job as done...");
+      await addCrawlJobDone(job.data.crawl_id, job.id, false);
+      await redisConnection.srem(
+        "crawl:" + job.data.crawl_id + ":visited_unique",
+        normalizeURL(job.data.url, sc),
+      );
+
+      await finishCrawlIfNeeded(job, sc);
+    }
+
     const isEarlyTimeout =
       error instanceof Error && error.message === "timeout";
     const isCancelled =
@@ -1041,6 +1063,9 @@
       );
     }
 
+    const end = Date.now();
+    const timeTakenInSeconds = (end - start) / 1000;
+
     logger.debug("Logging job to DB...");
     await logJob(
       {
@@ -1053,7 +1078,7 @@
             "Something went wrong... Contact help@mendable.ai"),
         num_docs: 0,
         docs: [],
-        time_taken: 0,
+        time_taken: timeTakenInSeconds,
         team_id: job.data.team_id,
         mode: job.data.mode,
         url: job.data.url,
@@ -1064,39 +1089,6 @@
       },
       true,
     );
-
-    if (job.data.crawl_id) {
-      const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
-
-      logger.debug("Declaring job as done...");
-      await addCrawlJobDone(job.data.crawl_id, job.id, false);
-      await redisConnection.srem(
-        "crawl:" + job.data.crawl_id + ":visited_unique",
-        normalizeURL(job.data.url, sc),
-      );
-
-      await finishCrawlIfNeeded(job, sc);
-
-      // await logJob({
-      //   job_id: job.data.crawl_id,
-      //   success: false,
-      //   message:
-      //     typeof error === "string"
-      //       ? error
-      //       : error.message ??
-      //         "Something went wrong... Contact help@mendable.ai",
-      //   num_docs: 0,
-      //   docs: [],
-      //   time_taken: 0,
-      //   team_id: job.data.team_id,
-      //   mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
-      //   url: sc ? sc.originUrl ?? job.data.url : job.data.url,
-      //   crawlerOptions: sc ? sc.crawlerOptions : undefined,
-      //   scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions,
-      //   origin: job.data.origin,
-      // });
-    }
-    // done(null, data);
     return data;
   }
 }
@@ -1126,5 +1118,6 @@
     await new Promise((resolve) => setTimeout(resolve, 500));
   }
 
+  console.log("All jobs finished. Worker out!");
Worker out!"); process.exit(0); })(); diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index ef159121..bd527ac3 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.14.0", + "version": "1.14.1", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 48905dc9..1d1715ed 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -279,9 +279,11 @@ export interface ErrorResponse { */ export class FirecrawlError extends Error { statusCode: number; - constructor(message: string, statusCode: number) { + details?: any; + constructor(message: string, statusCode: number, details?: any) { super(message); this.statusCode = statusCode; + this.details = details; } } @@ -312,6 +314,26 @@ export interface SearchResponse { error?: string; } +/** + * Response interface for crawl/batch scrape error monitoring. + */ +export interface CrawlErrorsResponse { + /** + * Scrapes that errored out + error details + */ + errors: { + id: string, + timestamp?: string, + url: string, + error: string, + }[]; + + /** + * URLs blocked by robots.txt + */ + robotsBlocked: string[]; +}; + /** * Main class for interacting with the Firecrawl API. * Provides methods for scraping, searching, crawling, and mapping web content. @@ -619,6 +641,29 @@ export default class FirecrawlApp { return { success: false, error: "Internal server error." }; } + /** + * Returns information about crawl errors. + * @param id - The ID of the crawl operation. + * @returns Information about crawl errors. + */ + async checkCrawlErrors(id: string): Promise { + const headers = this.prepareHeaders(); + try { + const response: AxiosResponse = await this.deleteRequest( + `${this.apiUrl}/v1/crawl/${id}/errors`, + headers + ); + if (response.status === 200) { + return response.data; + } else { + this.handleError(response, "check crawl errors"); + } + } catch (error: any) { + throw new FirecrawlError(error.message, 500); + } + return { success: false, error: "Internal server error." }; + } + /** * Cancels a crawl job using the Firecrawl API. * @param id - The ID of the crawl operation. @@ -881,6 +926,29 @@ export default class FirecrawlApp { return { success: false, error: "Internal server error." }; } + /** + * Returns information about batch scrape errors. + * @param id - The ID of the batch scrape operation. + * @returns Information about batch scrape errors. + */ + async checkBatchScrapeErrors(id: string): Promise { + const headers = this.prepareHeaders(); + try { + const response: AxiosResponse = await this.deleteRequest( + `${this.apiUrl}/v1/batch/scrape/${id}/errors`, + headers + ); + if (response.status === 200) { + return response.data; + } else { + this.handleError(response, "check batch scrape errors"); + } + } catch (error: any) { + throw new FirecrawlError(error.message, 500); + } + return { success: false, error: "Internal server error." }; + } + /** * Extracts information from URLs using the Firecrawl API. * Currently in Beta. Expect breaking changes on future minor versions. 
@@ -941,9 +1009,9 @@ export default class FirecrawlApp {
         this.handleError(response, "extract");
       }
     } catch (error: any) {
-      throw new FirecrawlError(error.message, 500);
+      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
     }
-    return { success: false, error: "Internal server error." };
+    return { success: false, error: "Internal server error."};
   }
 
   /**
@@ -985,7 +1053,7 @@
         this.handleError(response, "start extract job");
       }
     } catch (error: any) {
-      throw new FirecrawlError(error.message, 500);
+      throw new FirecrawlError(error.message, 500, error.response?.data?.details);
     }
     return { success: false, error: "Internal server error." };
   }
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 41f8badf..078e6a56 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -120,7 +120,10 @@ class FirecrawlApp:
             json=scrape_params,
         )
         if response.status_code == 200:
-            response = response.json()
+            try:
+                response = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
             if response['success'] and 'data' in response:
                 return response['data']
             elif "error" in response:
@@ -159,7 +162,10 @@
         if response.status_code != 200:
             raise Exception(f"Request failed with status code {response.status_code}")
 
-        return response.json()
+        try:
+            return response.json()
+        except:
+            raise Exception(f'Failed to parse Firecrawl response as JSON.')
 
     def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None,
@@ -194,7 +200,10 @@
         json_data.update(params)
         response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
         if response.status_code == 200:
-            id = response.json().get('id')
+            try:
+                id = response.json().get('id')
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
             return self._monitor_job_status(id, headers, poll_interval)
 
         else:
@@ -223,7 +232,10 @@
         json_data.update(params)
         response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
         if response.status_code == 200:
-            return response.json()
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:
             self._handle_error(response, 'start crawl job')
 
@@ -245,7 +257,10 @@
         headers = self._prepare_headers()
         response = self._get_request(f'{self.api_url}{endpoint}', headers)
         if response.status_code == 200:
-            status_data = response.json()
+            try:
+                status_data = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
             if status_data['status'] == 'completed':
                 if 'data' in status_data:
                     data = status_data['data']
@@ -261,7 +276,10 @@
                         if status_response.status_code != 200:
                             logger.error(f"Failed to fetch next page: {status_response.status_code}")
                             break
-                        next_data = status_response.json()
+                        try:
+                            next_data = status_response.json()
+                        except:
+                            raise Exception(f'Failed to parse Firecrawl response as JSON.')
                         data.extend(next_data.get('data', []))
                         status_data = next_data
                 except Exception as e:
@@ -291,6 +309,26 @@
         else:
             self._handle_error(response, 'check crawl status')
 
+    def check_crawl_errors(self, id: str) -> Dict[str, Any]:
+        """
+        Returns information about crawl errors.
+
+        Args:
+            id (str): The ID of the crawl job.
+
+        Returns:
+            Dict[str, Any]: Information about crawl errors.
+ """ + headers = self._prepare_headers() + response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers) + if response.status_code == 200: + try: + return response.json() + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, "check crawl errors") + def cancel_crawl(self, id: str) -> Dict[str, Any]: """ Cancel an asynchronous crawl job using the Firecrawl API. @@ -304,7 +342,10 @@ class FirecrawlApp: headers = self._prepare_headers() response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers) if response.status_code == 200: - return response.json() + try: + return response.json() + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') else: self._handle_error(response, "cancel crawl job") @@ -352,7 +393,10 @@ class FirecrawlApp: json=json_data, ) if response.status_code == 200: - response = response.json() + try: + response = response.json() + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') if response['success'] and 'links' in response: return response elif 'error' in response: @@ -395,7 +439,10 @@ class FirecrawlApp: json_data.update(params) response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) if response.status_code == 200: - id = response.json().get('id') + try: + id = response.json().get('id') + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') return self._monitor_job_status(id, headers, poll_interval) else: @@ -424,7 +471,10 @@ class FirecrawlApp: json_data.update(params) response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) if response.status_code == 200: - return response.json() + try: + return response.json() + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') else: self._handle_error(response, 'start batch scrape job') @@ -464,7 +514,10 @@ class FirecrawlApp: headers = self._prepare_headers() response = self._get_request(f'{self.api_url}{endpoint}', headers) if response.status_code == 200: - status_data = response.json() + try: + status_data = response.json() + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') if status_data['status'] == 'completed': if 'data' in status_data: data = status_data['data'] @@ -480,7 +533,10 @@ class FirecrawlApp: if status_response.status_code != 200: logger.error(f"Failed to fetch next page: {status_response.status_code}") break - next_data = status_response.json() + try: + next_data = status_response.json() + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') data.extend(next_data.get('data', [])) status_data = next_data except Exception as e: @@ -510,6 +566,25 @@ class FirecrawlApp: else: self._handle_error(response, 'check batch scrape status') + def check_batch_scrape_errors(self, id: str) -> Dict[str, Any]: + """ + Returns information about batch scrape errors. + + Args: + id (str): The ID of the crawl job. + + Returns: + Dict[str, Any]: Information about crawl errors. 
+ """ + headers = self._prepare_headers() + response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers) + if response.status_code == 200: + try: + return response.json() + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, "check batch scrape errors") def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Any: """ @@ -550,7 +625,10 @@ class FirecrawlApp: headers ) if response.status_code == 200: - data = response.json() + try: + data = response.json() + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') if data['success']: job_id = data.get('id') if not job_id: @@ -563,7 +641,10 @@ class FirecrawlApp: headers ) if status_response.status_code == 200: - status_data = status_response.json() + try: + status_data = status_response.json() + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') if status_data['status'] == 'completed': if status_data['success']: return status_data @@ -601,7 +682,10 @@ class FirecrawlApp: try: response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers) if response.status_code == 200: - return response.json() + try: + return response.json() + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') else: self._handle_error(response, "get extract status") except Exception as e: @@ -641,7 +725,10 @@ class FirecrawlApp: try: response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers) if response.status_code == 200: - return response.json() + try: + return response.json() + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') else: self._handle_error(response, "async extract") except Exception as e: @@ -771,16 +858,22 @@ class FirecrawlApp: status_response = self._get_request(api_url, headers) if status_response.status_code == 200: - status_data = status_response.json() + try: + status_data = status_response.json() + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') if status_data['status'] == 'completed': if 'data' in status_data: data = status_data['data'] while 'next' in status_data: - if len(status_data['data']) == 0: - break - status_response = self._get_request(status_data['next'], headers) - status_data = status_response.json() - data.extend(status_data.get('data', [])) + if len(status_data['data']) == 0: + break + status_response = self._get_request(status_data['next'], headers) + try: + status_data = status_response.json() + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') + data.extend(status_data.get('data', [])) status_data['data'] = data return status_data else: @@ -804,8 +897,12 @@ class FirecrawlApp: Raises: Exception: An exception with a message containing the status code and error details from the response. """ - error_message = response.json().get('error', 'No error message provided.') - error_details = response.json().get('details', 'No additional error details provided.') + try: + error_message = response.json().get('error', 'No error message provided.') + error_details = response.json().get('details', 'No additional error details provided.') + except: + raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response) + if response.status_code == 402: message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"