Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-20 12:39:07 +08:00)

Commit bb27594443: Merge branch 'main' into nsc/extract-queue
@@ -61,7 +61,7 @@ export async function batchScrapeController(
   }

   logger.debug("Batch scrape " + id + " starting", {
-    urlsLength: urls,
+    urlsLength: urls.length,
     appendToId: req.body.appendToId,
     account: req.account,
   });
@@ -157,10 +157,10 @@ export async function crawlStatusController(
       continue;
     }

-    if (job.returnvalue === undefined) {
+    if (job.returnvalue === undefined || job.returnvalue === null) {
       logger.warn(
         "Job was considered done, but returnvalue is undefined!",
-        { jobId: job.id, state },
+        { jobId: job.id, state, returnvalue: job.returnvalue },
       );
       continue;
     }
apps/api/src/lib/canonical-url.test.ts (new file, 91 lines)
@@ -0,0 +1,91 @@
+import { normalizeUrl, normalizeUrlOnlyHostname } from './canonical-url';
+
+describe('normalizeUrlOnlyHostname', () => {
+  it('should remove protocol and www from URL', () => {
+    const url = 'https://www.example.com';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should remove only protocol if www is not present', () => {
+    const url = 'https://example.com';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should handle URLs without protocol', () => {
+    const url = 'www.example.com';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should handle URLs without protocol and www', () => {
+    const url = 'example.com';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should handle URLs with paths', () => {
+    const url = 'https://www.example.com/path/to/resource';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should handle invalid URLs gracefully', () => {
+    const url = 'not a valid url';
+    const expected = 'not a valid url';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+});
+
+
+
+describe('normalizeUrl', () => {
+  it('should remove protocol and www from URL', () => {
+    const url = 'https://www.example.com';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should remove only protocol if www is not present', () => {
+    const url = 'https://example.com';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle URLs without protocol', () => {
+    const url = 'www.example.com';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle URLs without protocol and www', () => {
+    const url = 'example.com';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle URLs with paths', () => {
+    const url = 'https://www.example.com/path/to/resource';
+    const expected = 'example.com/path/to/resource';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle URLs with trailing slash', () => {
+    const url = 'https://www.example.com/';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle URLs with trailing slash and path', () => {
+    const url = 'https://www.example.com/path/';
+    const expected = 'example.com/path';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle invalid URLs gracefully', () => {
+    const url = 'not a valid url';
+    const expected = 'not a valid url';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+});
apps/api/src/lib/canonical-url.ts (new file, 19 lines)
@@ -0,0 +1,19 @@
+export function normalizeUrl(url: string) {
+  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
+  if (url.endsWith("/")) {
+    url = url.slice(0, -1);
+  }
+  return url;
+}
+
+export function normalizeUrlOnlyHostname(url: string) {
+  try {
+    const hostname = new URL(url).hostname;
+    return hostname.replace(/^www\./, "");
+  } catch (error) {
+    return url
+      .replace(/^https?:\/\//, "")
+      .replace(/^www\./, "")
+      .split("/")[0];
+  }
+}
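
For reference, a minimal usage sketch of the two helpers above; it is not part of the commit, and the expected outputs are taken from the test file rather than re-derived:

// Illustrative only: exercising the new canonical-url helpers.
import { normalizeUrl, normalizeUrlOnlyHostname } from "./canonical-url";

console.log(normalizeUrl("https://www.example.com/path/"));            // "example.com/path"
console.log(normalizeUrlOnlyHostname("https://www.example.com/path")); // "example.com"
console.log(normalizeUrlOnlyHostname("not a valid url"));              // "not a valid url"
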
@@ -14,10 +14,13 @@ interface ScrapeDocumentOptions {
   timeout: number;
 }

-export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: URLTrace[]): Promise<Document | null> {
+export async function scrapeDocument(
+  options: ScrapeDocumentOptions,
+  urlTraces: URLTrace[],
+): Promise<Document | null> {
   const trace = urlTraces.find((t) => t.url === options.url);
   if (trace) {
-    trace.status = 'scraped';
+    trace.status = "scraped";
     trace.timing.scrapedAt = new Date().toISOString();
   }

@@ -35,7 +38,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
       mode: "single_urls",
       team_id: options.teamId,
       scrapeOptions: scrapeOptions.parse({}),
-      internalOptions: {},
+      internalOptions: {
+        useCache: true,
+      },
       plan: options.plan,
       origin: options.origin,
       is_scrape: true,
@@ -61,7 +66,7 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
   } catch (error) {
     logger.error(`Error in scrapeDocument: ${error}`);
     if (trace) {
-      trace.status = 'error';
+      trace.status = "error";
       trace.error = error.message;
     }
     return null;
@@ -84,7 +84,7 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
   });

   // retry if only one url is returned
-  if (uniqueUrls.length === 1) {
+  if (uniqueUrls.length <= 1) {
     const retryMapResults = await getMapResults({
       url: baseUrl,
       teamId: options.teamId,
@@ -10,6 +10,9 @@ export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
   const entry = await getEntryFromCache(key);
   if (entry === null) throw new EngineError("Cache missed");

+  // Set fromCache flag to indicate this document was retrieved from cache
+  meta.internalOptions.fromCache = true;
+
   return {
     url: entry.url,
     html: entry.html,
@@ -298,6 +298,15 @@ export function buildFallbackList(meta: Meta): {
   engine: Engine;
   unsupportedFeatures: Set<FeatureFlag>;
 }[] {
+
+  if (meta.internalOptions.useCache !== true) {
+    const cacheIndex = engines.indexOf("cache");
+    if (cacheIndex !== -1) {
+      engines.splice(cacheIndex, 1);
+    }
+  } else {
+    meta.logger.debug("Cache engine enabled by useCache option");
+  }
   const prioritySum = [...meta.featureFlags].reduce(
     (a, x) => a + featureFlagOptions[x].priority,
     0,
@@ -151,9 +151,10 @@ export type InternalOptions = {

   v0CrawlOnlyUrls?: boolean;
   v0DisableJsDom?: boolean;

   useCache?: boolean;
   disableSmartWaitCache?: boolean; // Passed along to fire-engine
   isBackgroundIndex?: boolean;
+  fromCache?: boolean; // Indicates if the document was retrieved from cache
 };

 export type EngineResultsTracker = {
@@ -3,6 +3,10 @@ import { Meta } from "..";
 import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";

 export function saveToCache(meta: Meta, document: Document): Document {
+  if (meta.internalOptions.useCache !== true) {
+    return document;
+  }
+
   if (
     document.metadata.statusCode! < 200 ||
     document.metadata.statusCode! >= 300
@@ -15,6 +19,12 @@ export function saveToCache(meta: Meta, document: Document): Document {
     );
   }

+  // If the document was retrieved from cache, we don't need to save it
+  if (meta.internalOptions.fromCache) {
+    return document;
+  }
+
   const key = cacheKey(meta.url, meta.options, meta.internalOptions);

   if (key !== null) {
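
Taken together, the cache hunks above (the useCache request flag, the cache engine gate in the fallback list, the fromCache marker set on a cache hit, and the early returns in saveToCache) form a small flow. A hedged sketch of that flow, with names simplified rather than the repository's exact call path:

// Sketch only: how useCache/fromCache are intended to interact, per the hunks above.
type CacheFlags = { useCache?: boolean; fromCache?: boolean };

// buildFallbackList: the "cache" engine only participates when useCache was requested.
function planEngines(engines: string[], flags: CacheFlags): string[] {
  return flags.useCache === true ? engines : engines.filter((e) => e !== "cache");
}

// scrapeCache: a cache hit marks the document as served from cache.
function markCacheHit(flags: CacheFlags): void {
  flags.fromCache = true;
}

// saveToCache: write only when caching was requested and the document
// did not itself come from the cache (avoids re-saving a cache hit).
function shouldSaveToCache(flags: CacheFlags): boolean {
  return flags.useCache === true && flags.fromCache !== true;
}
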
@@ -53,6 +53,8 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 import { performExtraction } from "../lib/extract/extraction-service";
+import { supabase_service } from "../services/supabase";
+import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";

 configDotenv();

@@ -80,6 +82,69 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;

 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
+    (async () => {
+      const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
+      // Get all visited URLs from Redis
+      const visitedUrls = await redisConnection.smembers(
+        "crawl:" + job.data.crawl_id + ":visited",
+      );
+      // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
+      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
+        // Fire and forget the upload to Supabase
+        try {
+          // Standardize URLs to canonical form (https, no www)
+          const standardizedUrls = [
+            ...new Set(
+              visitedUrls.map((url) => {
+                return normalizeUrl(url);
+              }),
+            ),
+          ];
+          // First check if entry exists for this origin URL
+          const { data: existingMap } = await supabase_service
+            .from("crawl_maps")
+            .select("urls")
+            .eq("origin_url", originUrl)
+            .single();
+
+          if (existingMap) {
+            // Merge URLs, removing duplicates
+            const mergedUrls = [
+              ...new Set([...existingMap.urls, ...standardizedUrls]),
+            ];
+
+            const { error } = await supabase_service
+              .from("crawl_maps")
+              .update({
+                urls: mergedUrls,
+                num_urls: mergedUrls.length,
+                updated_at: new Date().toISOString(),
+              })
+              .eq("origin_url", originUrl);
+
+            if (error) {
+              _logger.error("Failed to update crawl map", { error });
+            }
+          } else {
+            // Insert new entry if none exists
+            const { error } = await supabase_service.from("crawl_maps").insert({
+              origin_url: originUrl,
+              urls: standardizedUrls,
+              num_urls: standardizedUrls.length,
+              created_at: new Date().toISOString(),
+              updated_at: new Date().toISOString(),
+            });
+
+            if (error) {
+              _logger.error("Failed to save crawl map", { error });
+            }
+          }
+        } catch (error) {
+          _logger.error("Error saving crawl map", { error });
+        }
+      }
+    })();
+
     if (!job.data.v1) {
       const jobIDs = await getCrawlJobs(job.data.crawl_id);

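
The long block above boils down to: normalize and dedupe the visited URLs, then either merge them into the existing crawl_maps row for the origin hostname or insert a new row. A condensed, hedged sketch of that decision (buildCrawlMapRow is an invented helper; the real worker calls the Supabase client inline as shown):

// Sketch only: the merge-or-insert bookkeeping behind the crawl_maps upload.
import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url"; // same import the worker adds

interface CrawlMapRow {
  origin_url: string;
  urls: string[];
  num_urls: number;
}

function buildCrawlMapRow(
  originUrl: string,
  visitedUrls: string[],
  existing?: CrawlMapRow,
): CrawlMapRow {
  // Standardize visited URLs to canonical form and drop duplicates
  const standardized = [...new Set(visitedUrls.map((url) => normalizeUrl(url)))];
  // Merge with any existing row for this origin, otherwise start fresh
  const urls = existing
    ? [...new Set([...existing.urls, ...standardized])]
    : standardized;
  return {
    origin_url: normalizeUrlOnlyHostname(originUrl),
    urls,
    num_urls: urls.length,
  };
}
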
@@ -582,16 +647,16 @@ async function indexJob(job: Job & { id: string }, document: Document) {
     document.markdown &&
     job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID!
   ) {
-    indexPage({
-      document: document,
-      originUrl: job.data.crawl_id
-        ? (await getCrawl(job.data.crawl_id))?.originUrl!
-        : document.metadata.sourceURL!,
-      crawlId: job.data.crawl_id,
-      teamId: job.data.team_id,
-    }).catch((error) => {
-      _logger.error("Error indexing page", { error });
-    });
+    // indexPage({
+    //   document: document,
+    //   originUrl: job.data.crawl_id
+    //     ? (await getCrawl(job.data.crawl_id))?.originUrl!
+    //     : document.metadata.sourceURL!,
+    //   crawlId: job.data.crawl_id,
+    //   teamId: job.data.team_id,
+    // }).catch((error) => {
+    //   _logger.error("Error indexing page", { error });
+    // });
   }
 }

@@ -696,7 +761,8 @@ async function processJob(job: Job & { id: string }, token: string) {
         doc.metadata.url !== undefined &&
         doc.metadata.sourceURL !== undefined &&
         normalizeURL(doc.metadata.url, sc) !==
-          normalizeURL(doc.metadata.sourceURL, sc)
+          normalizeURL(doc.metadata.sourceURL, sc) &&
+        job.data.crawlerOptions !== null // only on crawls, don't care on batch scrape
       ) {
         const crawler = crawlToCrawler(job.data.crawl_id, sc);
         if (
@@ -828,9 +894,10 @@ async function processJob(job: Job & { id: string }, token: string) {
               newJobId: jobId,
             });
           } else {
-            logger.debug("Could not lock URL " + JSON.stringify(link), {
-              url: link,
-            });
+            // TODO: removed this, ok? too many 'not useful' logs (?) Mogery!
+            // logger.debug("Could not lock URL " + JSON.stringify(link), {
+            //   url: link,
+            // });
           }
         }
       }
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.11.2",
+  "version": "1.11.3",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -565,23 +565,39 @@ export default class FirecrawlApp {
       if ("data" in statusData) {
         let data = statusData.data;
         while (typeof statusData === 'object' && 'next' in statusData) {
+          if (data.length === 0) {
+            break
+          }
           statusData = (await this.getRequest(statusData.next, headers)).data;
           data = data.concat(statusData.data);
         }
         allData = data;
       }
     }
-    return ({
+
+    let resp: CrawlStatusResponse | ErrorResponse = {
       success: response.data.success,
       status: response.data.status,
       total: response.data.total,
       completed: response.data.completed,
       creditsUsed: response.data.creditsUsed,
       expiresAt: new Date(response.data.expiresAt),
-      next: response.data.next,
-      data: allData,
-      error: response.data.error,
-    })
+      data: allData
+    }
+
+    if (!response.data.success && response.data.error) {
+      resp = {
+        ...resp,
+        success: false,
+        error: response.data.error
+      } as ErrorResponse;
+    }
+
+    if (response.data.next) {
+      (resp as CrawlStatusResponse).next = response.data.next;
+    }
+
+    return resp;
   } else {
     this.handleError(response, "check crawl status");
   }
@@ -799,23 +815,39 @@ export default class FirecrawlApp {
       if ("data" in statusData) {
         let data = statusData.data;
         while (typeof statusData === 'object' && 'next' in statusData) {
+          if (data.length === 0) {
+            break
+          }
           statusData = (await this.getRequest(statusData.next, headers)).data;
           data = data.concat(statusData.data);
         }
         allData = data;
       }
     }
-    return ({
+
+    let resp: BatchScrapeStatusResponse | ErrorResponse = {
      success: response.data.success,
      status: response.data.status,
      total: response.data.total,
      completed: response.data.completed,
      creditsUsed: response.data.creditsUsed,
      expiresAt: new Date(response.data.expiresAt),
-      next: response.data.next,
-      data: allData,
-      error: response.data.error,
-    })
+      data: allData
+    }
+
+    if (!response.data.success && response.data.error) {
+      resp = {
+        ...resp,
+        success: false,
+        error: response.data.error
+      } as ErrorResponse;
+    }
+
+    if (response.data.next) {
+      (resp as BatchScrapeStatusResponse).next = response.data.next;
+    }
+
+    return resp;
   } else {
     this.handleError(response, "check batch scrape status");
   }
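
A hedged usage sketch for the reshaped SDK return values above; it assumes the published @mendable/firecrawl-js exports (FirecrawlApp, CrawlStatusResponse, ErrorResponse) and the checkCrawlStatus method referenced by this file's error handler, and is illustrative rather than the SDK's documented example:

// Sketch only: consuming the CrawlStatusResponse | ErrorResponse union.
import FirecrawlApp, {
  CrawlStatusResponse,
  ErrorResponse,
} from "@mendable/firecrawl-js";

async function printCrawl(app: FirecrawlApp, crawlId: string) {
  const status = await app.checkCrawlStatus(crawlId);

  if (!status.success) {
    // Error branch: success is false and error carries the message.
    console.error("crawl failed:", (status as ErrorResponse).error);
    return;
  }

  // Success branch: data already holds every paginated page, and next is
  // only present when the API reported another page to fetch.
  const ok = status as CrawlStatusResponse;
  console.log(`${ok.status}: ${ok.completed}/${ok.total} pages`);
  console.log("documents:", ok.data.length, "next:", ok.next ?? "none");
}
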
@@ -971,6 +1003,9 @@ export default class FirecrawlApp {
       if ("data" in statusData) {
         let data = statusData.data;
         while (typeof statusData === 'object' && 'next' in statusData) {
+          if (data.length === 0) {
+            break
+          }
           statusResponse = await this.getRequest(statusData.next, headers);
           statusData = statusResponse.data;
           data = data.concat(statusData.data);
@@ -2,7 +2,7 @@ import express, { Request, Response } from 'express';
 import bodyParser from 'body-parser';
 import { chromium, Browser, BrowserContext, Route, Request as PlaywrightRequest } from 'playwright';
 import dotenv from 'dotenv';
-import randomUseragent from 'random-useragent';
+import UserAgent from 'user-agents';
 import { getError } from './helpers/get_error';

 dotenv.config();

@@ -60,7 +60,7 @@ const initializeBrowser = async () => {
     ]
   });

-  const userAgent = randomUseragent.getRandom();
+  const userAgent = new UserAgent().toString();
   const viewport = { width: 1280, height: 800 };

   const contextOptions: any = {
@@ -16,12 +16,12 @@
     "dotenv": "^16.4.5",
     "express": "^4.19.2",
     "playwright": "^1.45.0",
-    "random-useragent": "^0.5.0"
+    "user-agents": "^1.1.410"
   },
   "devDependencies": {
     "@types/express": "^4.17.21",
     "@types/node": "^20.14.9",
-    "@types/random-useragent": "^0.3.3",
+    "@types/user-agents": "^1.0.4",
     "ts-node": "^10.9.2",
     "typescript": "^5.5.2"
   }
@@ -13,7 +13,7 @@ import os

 from .firecrawl import FirecrawlApp # noqa

-__version__ = "1.8.0"
+__version__ = "1.8.1"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -250,6 +250,8 @@ class FirecrawlApp:
         if 'data' in status_data:
             data = status_data['data']
             while 'next' in status_data:
+                if len(status_data['data']) == 0:
+                    break
                 next_url = status_data.get('next')
                 if not next_url:
                     logger.warning("Expected 'next' URL is missing.")
@@ -267,16 +269,24 @@ class FirecrawlApp:
                     break
             status_data['data'] = data

-        return {
-            'success': True,
+        response = {
             'status': status_data.get('status'),
             'total': status_data.get('total'),
             'completed': status_data.get('completed'),
             'creditsUsed': status_data.get('creditsUsed'),
             'expiresAt': status_data.get('expiresAt'),
-            'data': status_data.get('data'),
-            'error': status_data.get('error'),
-            'next': status_data.get('next', None)
+            'data': status_data.get('data')
         }
+
+        if 'error' in status_data:
+            response['error'] = status_data['error']
+
+        if 'next' in status_data:
+            response['next'] = status_data['next']
+
+        return {
+            'success': False if 'error' in status_data else True,
+            **response
+        }
     else:
         self._handle_error(response, 'check crawl status')
@@ -459,6 +469,8 @@ class FirecrawlApp:
         if 'data' in status_data:
             data = status_data['data']
             while 'next' in status_data:
+                if len(status_data['data']) == 0:
+                    break
                 next_url = status_data.get('next')
                 if not next_url:
                     logger.warning("Expected 'next' URL is missing.")
@@ -476,16 +488,24 @@ class FirecrawlApp:
                     break
             status_data['data'] = data

-        return {
-            'success': True,
+        response = {
             'status': status_data.get('status'),
             'total': status_data.get('total'),
             'completed': status_data.get('completed'),
             'creditsUsed': status_data.get('creditsUsed'),
             'expiresAt': status_data.get('expiresAt'),
-            'data': status_data.get('data'),
-            'error': status_data.get('error'),
-            'next': status_data.get('next', None)
+            'data': status_data.get('data')
         }
+
+        if 'error' in status_data:
+            response['error'] = status_data['error']
+
+        if 'next' in status_data:
+            response['next'] = status_data['next']
+
+        return {
+            'success': False if 'error' in status_data else True,
+            **response
+        }
     else:
         self._handle_error(response, 'check batch scrape status')
@@ -669,6 +689,8 @@ class FirecrawlApp:
         if 'data' in status_data:
             data = status_data['data']
             while 'next' in status_data:
+                if len(status_data['data']) == 0:
+                    break
                 status_response = self._get_request(status_data['next'], headers)
                 status_data = status_response.json()
                 data.extend(status_data.get('data', []))