Merge branch 'main' into nsc/extract-queue

Commit bb27594443
@@ -61,7 +61,7 @@ export async function batchScrapeController(
   }

   logger.debug("Batch scrape " + id + " starting", {
-    urlsLength: urls,
+    urlsLength: urls.length,
     appendToId: req.body.appendToId,
     account: req.account,
   });
@@ -157,10 +157,10 @@ export async function crawlStatusController(
       continue;
     }

-    if (job.returnvalue === undefined) {
+    if (job.returnvalue === undefined || job.returnvalue === null) {
       logger.warn(
         "Job was considered done, but returnvalue is undefined!",
-        { jobId: job.id, state },
+        { jobId: job.id, state, returnvalue: job.returnvalue },
       );
       continue;
     }
apps/api/src/lib/canonical-url.test.ts (new file, +91 lines)
@@ -0,0 +1,91 @@
+import { normalizeUrl, normalizeUrlOnlyHostname } from './canonical-url';
+
+describe('normalizeUrlOnlyHostname', () => {
+  it('should remove protocol and www from URL', () => {
+    const url = 'https://www.example.com';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should remove only protocol if www is not present', () => {
+    const url = 'https://example.com';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should handle URLs without protocol', () => {
+    const url = 'www.example.com';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should handle URLs without protocol and www', () => {
+    const url = 'example.com';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should handle URLs with paths', () => {
+    const url = 'https://www.example.com/path/to/resource';
+    const expected = 'example.com';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it('should handle invalid URLs gracefully', () => {
+    const url = 'not a valid url';
+    const expected = 'not a valid url';
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+});
+
+
+
+describe('normalizeUrl', () => {
+  it('should remove protocol and www from URL', () => {
+    const url = 'https://www.example.com';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should remove only protocol if www is not present', () => {
+    const url = 'https://example.com';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle URLs without protocol', () => {
+    const url = 'www.example.com';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle URLs without protocol and www', () => {
+    const url = 'example.com';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle URLs with paths', () => {
+    const url = 'https://www.example.com/path/to/resource';
+    const expected = 'example.com/path/to/resource';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle URLs with trailing slash', () => {
+    const url = 'https://www.example.com/';
+    const expected = 'example.com';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle URLs with trailing slash and path', () => {
+    const url = 'https://www.example.com/path/';
+    const expected = 'example.com/path';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+
+  it('should handle invalid URLs gracefully', () => {
+    const url = 'not a valid url';
+    const expected = 'not a valid url';
+    expect(normalizeUrl(url)).toBe(expected);
+  });
+});
apps/api/src/lib/canonical-url.ts (new file, +19 lines)
@@ -0,0 +1,19 @@
+export function normalizeUrl(url: string) {
+  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
+  if (url.endsWith("/")) {
+    url = url.slice(0, -1);
+  }
+  return url;
+}
+
+export function normalizeUrlOnlyHostname(url: string) {
+  try {
+    const hostname = new URL(url).hostname;
+    return hostname.replace(/^www\./, "");
+  } catch (error) {
+    return url
+      .replace(/^https?:\/\//, "")
+      .replace(/^www\./, "")
+      .split("/")[0];
+  }
+}
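For orientation, a minimal usage sketch of the two helpers introduced above; the expected outputs mirror the new test file, and the snippet itself is illustrative rather than part of the diff (it assumes it sits next to the new module):

import { normalizeUrl, normalizeUrlOnlyHostname } from "./canonical-url";

// normalizeUrl keeps the path: it strips the protocol, a leading "www.", and a trailing slash.
console.log(normalizeUrl("https://www.example.com/path/")); // "example.com/path"

// normalizeUrlOnlyHostname reduces a URL to its bare hostname, falling back to
// string stripping when URL parsing throws (e.g. for "not a valid url").
console.log(normalizeUrlOnlyHostname("https://www.example.com/path/to/resource")); // "example.com"
console.log(normalizeUrlOnlyHostname("not a valid url")); // "not a valid url"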
@@ -14,10 +14,13 @@ interface ScrapeDocumentOptions {
   timeout: number;
 }

-export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: URLTrace[]): Promise<Document | null> {
+export async function scrapeDocument(
+  options: ScrapeDocumentOptions,
+  urlTraces: URLTrace[],
+): Promise<Document | null> {
   const trace = urlTraces.find((t) => t.url === options.url);
   if (trace) {
-    trace.status = 'scraped';
+    trace.status = "scraped";
     trace.timing.scrapedAt = new Date().toISOString();
   }

@@ -35,7 +38,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
       mode: "single_urls",
       team_id: options.teamId,
       scrapeOptions: scrapeOptions.parse({}),
-      internalOptions: {},
+      internalOptions: {
+        useCache: true,
+      },
       plan: options.plan,
       origin: options.origin,
       is_scrape: true,
@@ -61,7 +66,7 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
   } catch (error) {
     logger.error(`Error in scrapeDocument: ${error}`);
     if (trace) {
-      trace.status = 'error';
+      trace.status = "error";
       trace.error = error.message;
     }
     return null;
@@ -84,7 +84,7 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
   });

   // retry if only one url is returned
-  if (uniqueUrls.length === 1) {
+  if (uniqueUrls.length <= 1) {
     const retryMapResults = await getMapResults({
       url: baseUrl,
       teamId: options.teamId,
@@ -10,6 +10,9 @@ export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
   const entry = await getEntryFromCache(key);
   if (entry === null) throw new EngineError("Cache missed");

+  // Set fromCache flag to indicate this document was retrieved from cache
+  meta.internalOptions.fromCache = true;
+
   return {
     url: entry.url,
     html: entry.html,
@@ -298,6 +298,15 @@ export function buildFallbackList(meta: Meta): {
   engine: Engine;
   unsupportedFeatures: Set<FeatureFlag>;
 }[] {
+
+  if (meta.internalOptions.useCache !== true) {
+    const cacheIndex = engines.indexOf("cache");
+    if (cacheIndex !== -1) {
+      engines.splice(cacheIndex, 1);
+    }
+  } else {
+    meta.logger.debug("Cache engine enabled by useCache option");
+  }
   const prioritySum = [...meta.featureFlags].reduce(
     (a, x) => a + featureFlagOptions[x].priority,
     0,
@@ -151,9 +151,10 @@ export type InternalOptions = {

   v0CrawlOnlyUrls?: boolean;
   v0DisableJsDom?: boolean;
+  useCache?: boolean;
   disableSmartWaitCache?: boolean; // Passed along to fire-engine
   isBackgroundIndex?: boolean;
+  fromCache?: boolean; // Indicates if the document was retrieved from cache
 };

 export type EngineResultsTracker = {
@@ -3,6 +3,10 @@ import { Meta } from "..";
 import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";

 export function saveToCache(meta: Meta, document: Document): Document {
+  if (meta.internalOptions.useCache !== true) {
+    return document;
+  }
+
   if (
     document.metadata.statusCode! < 200 ||
     document.metadata.statusCode! >= 300
@@ -15,6 +19,12 @@ export function saveToCache(meta: Meta, document: Document): Document {
     );
   }

+  // If the document was retrieved from cache, we don't need to save it
+  if (meta.internalOptions.fromCache) {
+    return document;
+  }
+
+
   const key = cacheKey(meta.url, meta.options, meta.internalOptions);

   if (key !== null) {
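Taken together, the cache changes above make caching opt-in per job: internalOptions.useCache keeps the cache engine in the fallback list, scrapeCache marks hits with fromCache, and saveToCache skips documents that either did not request caching or already came from it. A minimal sketch of that flow, using simplified stand-in types rather than the real Meta/Document shapes (illustrative only, not code from the repo):

type InternalOptions = { useCache?: boolean; fromCache?: boolean };
type Meta = { url: string; internalOptions: InternalOptions };
type Document = { url: string; html: string };

const cache = new Map<string, Document>();

function scrapeCacheSketch(meta: Meta): Document | null {
  if (meta.internalOptions.useCache !== true) return null; // engine removed from the fallback list
  const hit = cache.get(meta.url);
  if (!hit) return null; // corresponds to the "Cache missed" EngineError
  meta.internalOptions.fromCache = true; // mark the hit so it is not re-saved later
  return hit;
}

function saveToCacheSketch(meta: Meta, document: Document): Document {
  if (meta.internalOptions.useCache !== true) return document; // caching not requested
  if (meta.internalOptions.fromCache) return document; // already came from the cache
  cache.set(meta.url, document);
  return document;
}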
@@ -53,6 +53,8 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 import { performExtraction } from "../lib/extract/extraction-service";
+import { supabase_service } from "../services/supabase";
+import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";

 configDotenv();

@@ -80,6 +82,69 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;

 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
+    (async () => {
+      const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
+      // Get all visited URLs from Redis
+      const visitedUrls = await redisConnection.smembers(
+        "crawl:" + job.data.crawl_id + ":visited",
+      );
+      // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
+      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
+        // Fire and forget the upload to Supabase
+        try {
+          // Standardize URLs to canonical form (https, no www)
+          const standardizedUrls = [
+            ...new Set(
+              visitedUrls.map((url) => {
+                return normalizeUrl(url);
+              }),
+            ),
+          ];
+          // First check if entry exists for this origin URL
+          const { data: existingMap } = await supabase_service
+            .from("crawl_maps")
+            .select("urls")
+            .eq("origin_url", originUrl)
+            .single();
+
+          if (existingMap) {
+            // Merge URLs, removing duplicates
+            const mergedUrls = [
+              ...new Set([...existingMap.urls, ...standardizedUrls]),
+            ];
+
+            const { error } = await supabase_service
+              .from("crawl_maps")
+              .update({
+                urls: mergedUrls,
+                num_urls: mergedUrls.length,
+                updated_at: new Date().toISOString(),
+              })
+              .eq("origin_url", originUrl);
+
+            if (error) {
+              _logger.error("Failed to update crawl map", { error });
+            }
+          } else {
+            // Insert new entry if none exists
+            const { error } = await supabase_service.from("crawl_maps").insert({
+              origin_url: originUrl,
+              urls: standardizedUrls,
+              num_urls: standardizedUrls.length,
+              created_at: new Date().toISOString(),
+              updated_at: new Date().toISOString(),
+            });
+
+            if (error) {
+              _logger.error("Failed to save crawl map", { error });
+            }
+          }
+        } catch (error) {
+          _logger.error("Error saving crawl map", { error });
+        }
+      }
+    })();
+
     if (!job.data.v1) {
       const jobIDs = await getCrawlJobs(job.data.crawl_id);

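The worker block above canonicalizes every visited URL and unions it with whatever is already stored for the origin before writing the crawl_maps row. As a side note, that dedup/merge step boils down to a helper like the following (illustrative only; mergeCrawlMapUrls is not a function in the repo):

import { normalizeUrl } from "../lib/canonical-url";

// Canonicalize freshly visited URLs and union them with the URLs already stored
// in the crawl_maps row, dropping duplicates via a Set.
function mergeCrawlMapUrls(existingUrls: string[], visitedUrls: string[]): string[] {
  const standardized = visitedUrls.map((url) => normalizeUrl(url));
  return [...new Set([...existingUrls, ...standardized])];
}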
@@ -582,16 +647,16 @@ async function indexJob(job: Job & { id: string }, document: Document) {
     document.markdown &&
     job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID!
   ) {
-    indexPage({
-      document: document,
-      originUrl: job.data.crawl_id
-        ? (await getCrawl(job.data.crawl_id))?.originUrl!
-        : document.metadata.sourceURL!,
-      crawlId: job.data.crawl_id,
-      teamId: job.data.team_id,
-    }).catch((error) => {
-      _logger.error("Error indexing page", { error });
-    });
+    // indexPage({
+    //   document: document,
+    //   originUrl: job.data.crawl_id
+    //     ? (await getCrawl(job.data.crawl_id))?.originUrl!
+    //     : document.metadata.sourceURL!,
+    //   crawlId: job.data.crawl_id,
+    //   teamId: job.data.team_id,
+    // }).catch((error) => {
+    //   _logger.error("Error indexing page", { error });
+    // });
   }
 }

@@ -696,7 +761,8 @@ async function processJob(job: Job & { id: string }, token: string) {
       doc.metadata.url !== undefined &&
       doc.metadata.sourceURL !== undefined &&
       normalizeURL(doc.metadata.url, sc) !==
-        normalizeURL(doc.metadata.sourceURL, sc)
+        normalizeURL(doc.metadata.sourceURL, sc) &&
+      job.data.crawlerOptions !== null // only on crawls, don't care on batch scrape
     ) {
       const crawler = crawlToCrawler(job.data.crawl_id, sc);
       if (
@@ -828,9 +894,10 @@ async function processJob(job: Job & { id: string }, token: string) {
             newJobId: jobId,
           });
         } else {
-          logger.debug("Could not lock URL " + JSON.stringify(link), {
-            url: link,
-          });
+          // TODO: removed this, ok? too many 'not useful' logs (?) Mogery!
+          // logger.debug("Could not lock URL " + JSON.stringify(link), {
+          //   url: link,
+          // });
         }
       }
     }
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.11.2",
+  "version": "1.11.3",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -565,23 +565,39 @@ export default class FirecrawlApp {
        if ("data" in statusData) {
          let data = statusData.data;
          while (typeof statusData === 'object' && 'next' in statusData) {
+            if (data.length === 0) {
+              break
+            }
            statusData = (await this.getRequest(statusData.next, headers)).data;
            data = data.concat(statusData.data);
          }
          allData = data;
        }
      }
-      return ({
+
+      let resp: CrawlStatusResponse | ErrorResponse = {
        success: response.data.success,
        status: response.data.status,
        total: response.data.total,
        completed: response.data.completed,
        creditsUsed: response.data.creditsUsed,
        expiresAt: new Date(response.data.expiresAt),
-        next: response.data.next,
-        data: allData,
-        error: response.data.error,
-      })
+        data: allData
+      }
+
+      if (!response.data.success && response.data.error) {
+        resp = {
+          ...resp,
+          success: false,
+          error: response.data.error
+        } as ErrorResponse;
+      }
+
+      if (response.data.next) {
+        (resp as CrawlStatusResponse).next = response.data.next;
+      }
+
+      return resp;
    } else {
      this.handleError(response, "check crawl status");
    }
@@ -799,23 +815,39 @@ export default class FirecrawlApp {
        if ("data" in statusData) {
          let data = statusData.data;
          while (typeof statusData === 'object' && 'next' in statusData) {
+            if (data.length === 0) {
+              break
+            }
            statusData = (await this.getRequest(statusData.next, headers)).data;
            data = data.concat(statusData.data);
          }
          allData = data;
        }
      }
-      return ({
+
+      let resp: BatchScrapeStatusResponse | ErrorResponse = {
        success: response.data.success,
        status: response.data.status,
        total: response.data.total,
        completed: response.data.completed,
        creditsUsed: response.data.creditsUsed,
        expiresAt: new Date(response.data.expiresAt),
-        next: response.data.next,
-        data: allData,
-        error: response.data.error,
-      })
+        data: allData
+      }
+
+      if (!response.data.success && response.data.error) {
+        resp = {
+          ...resp,
+          success: false,
+          error: response.data.error
+        } as ErrorResponse;
+      }
+
+      if (response.data.next) {
+        (resp as BatchScrapeStatusResponse).next = response.data.next;
+      }
+
+      return resp;
    } else {
      this.handleError(response, "check batch scrape status");
    }
@@ -971,6 +1003,9 @@ export default class FirecrawlApp {
        if ("data" in statusData) {
          let data = statusData.data;
          while (typeof statusData === 'object' && 'next' in statusData) {
+            if (data.length === 0) {
+              break
+            }
            statusResponse = await this.getRequest(statusData.next, headers);
            statusData = statusResponse.data;
            data = data.concat(statusData.data);
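From the caller's side, the SDK refactor above means the status methods now return either a CrawlStatusResponse (with paginated pages already folded into data, and next only set when more pages remain) or an ErrorResponse carrying the API error. A hedged consumer sketch; the crawl id is a placeholder and the constructor/method shapes are assumed to match the published SDK rather than taken from this diff:

import FirecrawlApp, { CrawlStatusResponse, ErrorResponse } from "@mendable/firecrawl-js";

async function reportCrawl(crawlId: string) {
  const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });
  const status = await app.checkCrawlStatus(crawlId);

  if (status.success === false) {
    // ErrorResponse branch: the SDK now copies the API error onto the response.
    console.error("Crawl failed:", (status as ErrorResponse).error);
    return;
  }

  // CrawlStatusResponse branch: paginated pages have already been merged into `data`.
  const crawl = status as CrawlStatusResponse;
  console.log(`${crawl.completed}/${crawl.total} pages, ${crawl.data.length} documents`);
  if (crawl.next) {
    console.log("More results available at:", crawl.next);
  }
}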
@@ -2,7 +2,7 @@ import express, { Request, Response } from 'express';
 import bodyParser from 'body-parser';
 import { chromium, Browser, BrowserContext, Route, Request as PlaywrightRequest } from 'playwright';
 import dotenv from 'dotenv';
-import randomUseragent from 'random-useragent';
+import UserAgent from 'user-agents';
 import { getError } from './helpers/get_error';

 dotenv.config();
@@ -60,7 +60,7 @@ const initializeBrowser = async () => {
     ]
   });

-  const userAgent = randomUseragent.getRandom();
+  const userAgent = new UserAgent().toString();
   const viewport = { width: 1280, height: 800 };

   const contextOptions: any = {
@@ -16,12 +16,12 @@
     "dotenv": "^16.4.5",
     "express": "^4.19.2",
     "playwright": "^1.45.0",
-    "random-useragent": "^0.5.0"
+    "user-agents": "^1.1.410"
   },
   "devDependencies": {
     "@types/express": "^4.17.21",
     "@types/node": "^20.14.9",
-    "@types/random-useragent": "^0.3.3",
+    "@types/user-agents": "^1.0.4",
     "ts-node": "^10.9.2",
     "typescript": "^5.5.2"
   }
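The playwright-service change above swaps random-useragent for the user-agents package; the new call shape is simply instantiate-and-stringify. A tiny sketch (the printed value is illustrative):

import UserAgent from "user-agents";

// Previously: randomUseragent.getRandom(); now a UserAgent instance is generated
// and stringified to obtain a realistic user-agent string for the browser context.
const userAgent = new UserAgent().toString();
console.log(userAgent); // e.g. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ..."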
@@ -13,7 +13,7 @@ import os

 from .firecrawl import FirecrawlApp # noqa

-__version__ = "1.8.0"
+__version__ = "1.8.1"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -250,6 +250,8 @@ class FirecrawlApp:
        if 'data' in status_data:
            data = status_data['data']
            while 'next' in status_data:
+                if len(status_data['data']) == 0:
+                    break
                next_url = status_data.get('next')
                if not next_url:
                    logger.warning("Expected 'next' URL is missing.")
@@ -267,16 +269,24 @@ class FirecrawlApp:
                    break
            status_data['data'] = data

-            return {
-                'success': True,
+            response = {
                'status': status_data.get('status'),
                'total': status_data.get('total'),
                'completed': status_data.get('completed'),
                'creditsUsed': status_data.get('creditsUsed'),
                'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
            }
        else:
            self._handle_error(response, 'check crawl status')
@@ -459,6 +469,8 @@ class FirecrawlApp:
        if 'data' in status_data:
            data = status_data['data']
            while 'next' in status_data:
+                if len(status_data['data']) == 0:
+                    break
                next_url = status_data.get('next')
                if not next_url:
                    logger.warning("Expected 'next' URL is missing.")
@@ -476,16 +488,24 @@ class FirecrawlApp:
                    break
            status_data['data'] = data

-            return {
-                'success': True,
+            response = {
                'status': status_data.get('status'),
                'total': status_data.get('total'),
                'completed': status_data.get('completed'),
                'creditsUsed': status_data.get('creditsUsed'),
                'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
            }
        else:
            self._handle_error(response, 'check batch scrape status')
@@ -669,6 +689,8 @@ class FirecrawlApp:
        if 'data' in status_data:
            data = status_data['data']
            while 'next' in status_data:
+                if len(status_data['data']) == 0:
+                    break
                status_response = self._get_request(status_data['next'], headers)
                status_data = status_response.json()
                data.extend(status_data.get('data', []))