Merge branch 'main' into nsc/extract-queue

Nicolas 2025-01-06 13:01:15 -03:00
commit bb27594443
17 changed files with 313 additions and 51 deletions

View File

@@ -61,7 +61,7 @@ export async function batchScrapeController(
   }

   logger.debug("Batch scrape " + id + " starting", {
-    urlsLength: urls,
+    urlsLength: urls.length,
     appendToId: req.body.appendToId,
     account: req.account,
   });

View File

@@ -157,10 +157,10 @@ export async function crawlStatusController(
       continue;
     }

-    if (job.returnvalue === undefined) {
+    if (job.returnvalue === undefined || job.returnvalue === null) {
       logger.warn(
         "Job was considered done, but returnvalue is undefined!",
-        { jobId: job.id, state },
+        { jobId: job.id, state, returnvalue: job.returnvalue },
       );
       continue;
     }

View File

@@ -0,0 +1,91 @@
import { normalizeUrl, normalizeUrlOnlyHostname } from './canonical-url';

describe('normalizeUrlOnlyHostname', () => {
  it('should remove protocol and www from URL', () => {
    const url = 'https://www.example.com';
    const expected = 'example.com';
    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
  });

  it('should remove only protocol if www is not present', () => {
    const url = 'https://example.com';
    const expected = 'example.com';
    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
  });

  it('should handle URLs without protocol', () => {
    const url = 'www.example.com';
    const expected = 'example.com';
    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
  });

  it('should handle URLs without protocol and www', () => {
    const url = 'example.com';
    const expected = 'example.com';
    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
  });

  it('should handle URLs with paths', () => {
    const url = 'https://www.example.com/path/to/resource';
    const expected = 'example.com';
    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
  });

  it('should handle invalid URLs gracefully', () => {
    const url = 'not a valid url';
    const expected = 'not a valid url';
    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
  });
});

describe('normalizeUrl', () => {
  it('should remove protocol and www from URL', () => {
    const url = 'https://www.example.com';
    const expected = 'example.com';
    expect(normalizeUrl(url)).toBe(expected);
  });

  it('should remove only protocol if www is not present', () => {
    const url = 'https://example.com';
    const expected = 'example.com';
    expect(normalizeUrl(url)).toBe(expected);
  });

  it('should handle URLs without protocol', () => {
    const url = 'www.example.com';
    const expected = 'example.com';
    expect(normalizeUrl(url)).toBe(expected);
  });

  it('should handle URLs without protocol and www', () => {
    const url = 'example.com';
    const expected = 'example.com';
    expect(normalizeUrl(url)).toBe(expected);
  });

  it('should handle URLs with paths', () => {
    const url = 'https://www.example.com/path/to/resource';
    const expected = 'example.com/path/to/resource';
    expect(normalizeUrl(url)).toBe(expected);
  });

  it('should handle URLs with trailing slash', () => {
    const url = 'https://www.example.com/';
    const expected = 'example.com';
    expect(normalizeUrl(url)).toBe(expected);
  });

  it('should handle URLs with trailing slash and path', () => {
    const url = 'https://www.example.com/path/';
    const expected = 'example.com/path';
    expect(normalizeUrl(url)).toBe(expected);
  });

  it('should handle invalid URLs gracefully', () => {
    const url = 'not a valid url';
    const expected = 'not a valid url';
    expect(normalizeUrl(url)).toBe(expected);
  });
});

View File

@@ -0,0 +1,19 @@
export function normalizeUrl(url: string) {
  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
  if (url.endsWith("/")) {
    url = url.slice(0, -1);
  }
  return url;
}

export function normalizeUrlOnlyHostname(url: string) {
  try {
    const hostname = new URL(url).hostname;
    return hostname.replace(/^www\./, "");
  } catch (error) {
    return url
      .replace(/^https?:\/\//, "")
      .replace(/^www\./, "")
      .split("/")[0];
  }
}
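
For quick reference, a usage sketch of the two helpers (behaviour taken from the tests above): normalizeUrl keeps the path and only strips the protocol, leading www, and any trailing slash, while normalizeUrlOnlyHostname reduces the URL to its bare hostname.

import { normalizeUrl, normalizeUrlOnlyHostname } from "./canonical-url";

// Keeps the path: protocol, leading "www." and the trailing slash are stripped
normalizeUrl("https://www.example.com/path/"); // "example.com/path"

// Reduces to the hostname only, even when a path is present
normalizeUrlOnlyHostname("https://www.example.com/path/to/resource"); // "example.com"

// Invalid input falls through unchanged (the hostname variant also drops anything after "/")
normalizeUrl("not a valid url"); // "not a valid url"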

View File

@@ -14,10 +14,13 @@ interface ScrapeDocumentOptions {
   timeout: number;
 }

-export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: URLTrace[]): Promise<Document | null> {
+export async function scrapeDocument(
+  options: ScrapeDocumentOptions,
+  urlTraces: URLTrace[],
+): Promise<Document | null> {
   const trace = urlTraces.find((t) => t.url === options.url);
   if (trace) {
-    trace.status = 'scraped';
+    trace.status = "scraped";
     trace.timing.scrapedAt = new Date().toISOString();
   }
@@ -35,7 +38,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
       mode: "single_urls",
       team_id: options.teamId,
       scrapeOptions: scrapeOptions.parse({}),
-      internalOptions: {},
+      internalOptions: {
+        useCache: true,
+      },
       plan: options.plan,
       origin: options.origin,
       is_scrape: true,
@@ -61,7 +66,7 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
   } catch (error) {
     logger.error(`Error in scrapeDocument: ${error}`);
     if (trace) {
-      trace.status = 'error';
+      trace.status = "error";
       trace.error = error.message;
     }
     return null;

View File

@@ -84,7 +84,7 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
   });

   // retry if only one url is returned
-  if (uniqueUrls.length === 1) {
+  if (uniqueUrls.length <= 1) {
     const retryMapResults = await getMapResults({
       url: baseUrl,
       teamId: options.teamId,

View File

@@ -10,6 +10,9 @@ export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
   const entry = await getEntryFromCache(key);
   if (entry === null) throw new EngineError("Cache missed");

+  // Set fromCache flag to indicate this document was retrieved from cache
+  meta.internalOptions.fromCache = true;
+
   return {
     url: entry.url,
     html: entry.html,

View File

@@ -298,6 +298,15 @@ export function buildFallbackList(meta: Meta): {
   engine: Engine;
   unsupportedFeatures: Set<FeatureFlag>;
 }[] {
+  if (meta.internalOptions.useCache !== true) {
+    const cacheIndex = engines.indexOf("cache");
+    if (cacheIndex !== -1) {
+      engines.splice(cacheIndex, 1);
+    }
+  } else {
+    meta.logger.debug("Cache engine enabled by useCache option");
+  }
+
   const prioritySum = [...meta.featureFlags].reduce(
     (a, x) => a + featureFlagOptions[x].priority,
     0,

View File

@@ -151,9 +151,10 @@ export type InternalOptions = {
   v0CrawlOnlyUrls?: boolean;
   v0DisableJsDom?: boolean;
+  useCache?: boolean;
   disableSmartWaitCache?: boolean; // Passed along to fire-engine
   isBackgroundIndex?: boolean;
+  fromCache?: boolean; // Indicates if the document was retrieved from cache
 };

 export type EngineResultsTracker = {

View File

@ -3,6 +3,10 @@ import { Meta } from "..";
import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache"; import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";
export function saveToCache(meta: Meta, document: Document): Document { export function saveToCache(meta: Meta, document: Document): Document {
if (meta.internalOptions.useCache !== true) {
return document;
}
if ( if (
document.metadata.statusCode! < 200 || document.metadata.statusCode! < 200 ||
document.metadata.statusCode! >= 300 document.metadata.statusCode! >= 300
@ -15,6 +19,12 @@ export function saveToCache(meta: Meta, document: Document): Document {
); );
} }
// If the document was retrieved from cache, we don't need to save it
if (meta.internalOptions.fromCache) {
return document;
}
const key = cacheKey(meta.url, meta.options, meta.internalOptions); const key = cacheKey(meta.url, meta.options, meta.internalOptions);
if (key !== null) { if (key !== null) {
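
Taken together, the cache changes above gate everything on two internal flags: useCache (set by the caller, as scrapeDocument does above) decides whether the cache engine is tried at all and whether results are written back, and fromCache (set by the cache engine on a hit) prevents re-saving a document that was just read from the cache. A simplified, self-contained sketch of that gating logic; the Meta shape is reduced to the relevant fields, and the real checks live in buildFallbackList, scrapeCache, and saveToCache:

// Simplified sketch -- not the actual Meta type, just the fields used for cache gating
type CacheMeta = { internalOptions: { useCache?: boolean; fromCache?: boolean } };

function cacheEngineEnabled(meta: CacheMeta): boolean {
  // Mirrors buildFallbackList: the "cache" engine is dropped unless useCache is explicitly true
  return meta.internalOptions.useCache === true;
}

function shouldWriteToCache(meta: CacheMeta, statusCode: number): boolean {
  // Mirrors saveToCache: skip when caching wasn't requested, the response wasn't a 2xx,
  // or the document itself came out of the cache (fromCache is set by scrapeCache on a hit)
  if (meta.internalOptions.useCache !== true) return false;
  if (statusCode < 200 || statusCode >= 300) return false;
  if (meta.internalOptions.fromCache) return false;
  return true;
}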

View File

@ -53,6 +53,8 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { indexPage } from "../lib/extract/index/pinecone"; import { indexPage } from "../lib/extract/index/pinecone";
import { Document } from "../controllers/v1/types"; import { Document } from "../controllers/v1/types";
import { performExtraction } from "../lib/extract/extraction-service"; import { performExtraction } from "../lib/extract/extraction-service";
import { supabase_service } from "../services/supabase";
import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";
configDotenv(); configDotenv();
@@ -80,6 +82,69 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;

 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
+    (async () => {
+      const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
+      // Get all visited URLs from Redis
+      const visitedUrls = await redisConnection.smembers(
+        "crawl:" + job.data.crawl_id + ":visited",
+      );
+
+      // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
+      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
+        // Fire and forget the upload to Supabase
+        try {
+          // Standardize URLs to canonical form (https, no www)
+          const standardizedUrls = [
+            ...new Set(
+              visitedUrls.map((url) => {
+                return normalizeUrl(url);
+              }),
+            ),
+          ];
+
+          // First check if entry exists for this origin URL
+          const { data: existingMap } = await supabase_service
+            .from("crawl_maps")
+            .select("urls")
+            .eq("origin_url", originUrl)
+            .single();
+
+          if (existingMap) {
+            // Merge URLs, removing duplicates
+            const mergedUrls = [
+              ...new Set([...existingMap.urls, ...standardizedUrls]),
+            ];
+
+            const { error } = await supabase_service
+              .from("crawl_maps")
+              .update({
+                urls: mergedUrls,
+                num_urls: mergedUrls.length,
+                updated_at: new Date().toISOString(),
+              })
+              .eq("origin_url", originUrl);
+            if (error) {
+              _logger.error("Failed to update crawl map", { error });
+            }
+          } else {
+            // Insert new entry if none exists
+            const { error } = await supabase_service.from("crawl_maps").insert({
+              origin_url: originUrl,
+              urls: standardizedUrls,
+              num_urls: standardizedUrls.length,
+              created_at: new Date().toISOString(),
+              updated_at: new Date().toISOString(),
+            });
+            if (error) {
+              _logger.error("Failed to save crawl map", { error });
+            }
+          }
+        } catch (error) {
+          _logger.error("Error saving crawl map", { error });
+        }
+      }
+    })();
+
     if (!job.data.v1) {
       const jobIDs = await getCrawlJobs(job.data.crawl_id);
@ -582,16 +647,16 @@ async function indexJob(job: Job & { id: string }, document: Document) {
document.markdown && document.markdown &&
job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID! job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID!
) { ) {
indexPage({ // indexPage({
document: document, // document: document,
originUrl: job.data.crawl_id // originUrl: job.data.crawl_id
? (await getCrawl(job.data.crawl_id))?.originUrl! // ? (await getCrawl(job.data.crawl_id))?.originUrl!
: document.metadata.sourceURL!, // : document.metadata.sourceURL!,
crawlId: job.data.crawl_id, // crawlId: job.data.crawl_id,
teamId: job.data.team_id, // teamId: job.data.team_id,
}).catch((error) => { // }).catch((error) => {
_logger.error("Error indexing page", { error }); // _logger.error("Error indexing page", { error });
}); // });
} }
} }
@ -696,7 +761,8 @@ async function processJob(job: Job & { id: string }, token: string) {
doc.metadata.url !== undefined && doc.metadata.url !== undefined &&
doc.metadata.sourceURL !== undefined && doc.metadata.sourceURL !== undefined &&
normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.url, sc) !==
normalizeURL(doc.metadata.sourceURL, sc) normalizeURL(doc.metadata.sourceURL, sc) &&
job.data.crawlerOptions !== null // only on crawls, don't care on batch scrape
) { ) {
const crawler = crawlToCrawler(job.data.crawl_id, sc); const crawler = crawlToCrawler(job.data.crawl_id, sc);
if ( if (
@ -828,9 +894,10 @@ async function processJob(job: Job & { id: string }, token: string) {
newJobId: jobId, newJobId: jobId,
}); });
} else { } else {
logger.debug("Could not lock URL " + JSON.stringify(link), { // TODO: removed this, ok? too many 'not useful' logs (?) Mogery!
url: link, // logger.debug("Could not lock URL " + JSON.stringify(link), {
}); // url: link,
// });
} }
} }
} }

View File

@ -1,6 +1,6 @@
{ {
"name": "@mendable/firecrawl-js", "name": "@mendable/firecrawl-js",
"version": "1.11.2", "version": "1.11.3",
"description": "JavaScript SDK for Firecrawl API", "description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js", "main": "dist/index.js",
"types": "dist/index.d.ts", "types": "dist/index.d.ts",

View File

@@ -565,23 +565,39 @@ export default class FirecrawlApp {
         if ("data" in statusData) {
           let data = statusData.data;
           while (typeof statusData === 'object' && 'next' in statusData) {
+            if (data.length === 0) {
+              break
+            }
             statusData = (await this.getRequest(statusData.next, headers)).data;
             data = data.concat(statusData.data);
           }
           allData = data;
         }
       }
-      return ({
+
+      let resp: CrawlStatusResponse | ErrorResponse = {
         success: response.data.success,
         status: response.data.status,
         total: response.data.total,
         completed: response.data.completed,
         creditsUsed: response.data.creditsUsed,
         expiresAt: new Date(response.data.expiresAt),
-        next: response.data.next,
-        data: allData,
-        error: response.data.error,
-      })
+        data: allData
+      }
+
+      if (!response.data.success && response.data.error) {
+        resp = {
+          ...resp,
+          success: false,
+          error: response.data.error
+        } as ErrorResponse;
+      }
+
+      if (response.data.next) {
+        (resp as CrawlStatusResponse).next = response.data.next;
+      }
+
+      return resp;
     } else {
       this.handleError(response, "check crawl status");
     }
@@ -799,23 +815,39 @@ export default class FirecrawlApp {
         if ("data" in statusData) {
           let data = statusData.data;
           while (typeof statusData === 'object' && 'next' in statusData) {
+            if (data.length === 0) {
+              break
+            }
             statusData = (await this.getRequest(statusData.next, headers)).data;
             data = data.concat(statusData.data);
           }
           allData = data;
         }
       }
-      return ({
+
+      let resp: BatchScrapeStatusResponse | ErrorResponse = {
         success: response.data.success,
         status: response.data.status,
         total: response.data.total,
         completed: response.data.completed,
         creditsUsed: response.data.creditsUsed,
         expiresAt: new Date(response.data.expiresAt),
-        next: response.data.next,
-        data: allData,
-        error: response.data.error,
-      })
+        data: allData
+      }
+
+      if (!response.data.success && response.data.error) {
+        resp = {
+          ...resp,
+          success: false,
+          error: response.data.error
+        } as ErrorResponse;
+      }
+
+      if (response.data.next) {
+        (resp as BatchScrapeStatusResponse).next = response.data.next;
+      }
+
+      return resp;
     } else {
       this.handleError(response, "check batch scrape status");
     }
@@ -971,6 +1003,9 @@ export default class FirecrawlApp {
       if ("data" in statusData) {
         let data = statusData.data;
         while (typeof statusData === 'object' && 'next' in statusData) {
+          if (data.length === 0) {
+            break
+          }
           statusResponse = await this.getRequest(statusData.next, headers);
           statusData = statusResponse.data;
           data = data.concat(statusData.data);
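
With this change, error and next are only present on the SDK response when they actually apply, so callers should branch on success before reading them. A hedged usage sketch, assuming the usual FirecrawlApp.checkCrawlStatus entry point and that CrawlStatusResponse and ErrorResponse are exported as in the diff:

import FirecrawlApp, { CrawlStatusResponse, ErrorResponse } from "@mendable/firecrawl-js";

async function printCrawlStatus(crawlId: string) {
  const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });
  const status: CrawlStatusResponse | ErrorResponse = await app.checkCrawlStatus(crawlId);

  if (!status.success) {
    // ErrorResponse branch: `error` is set only when the request failed
    console.error("Crawl failed:", status.error);
    return;
  }

  console.log(`${status.completed}/${status.total} pages (${status.status})`);
  if (status.next) {
    // `next` is only included when more paginated results are available
    console.log("More results at:", status.next);
  }
}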

View File

@@ -2,7 +2,7 @@ import express, { Request, Response } from 'express';
 import bodyParser from 'body-parser';
 import { chromium, Browser, BrowserContext, Route, Request as PlaywrightRequest } from 'playwright';
 import dotenv from 'dotenv';
-import randomUseragent from 'random-useragent';
+import UserAgent from 'user-agents';
 import { getError } from './helpers/get_error';

 dotenv.config();

@@ -60,7 +60,7 @@ const initializeBrowser = async () => {
     ]
   });

-  const userAgent = randomUseragent.getRandom();
+  const userAgent = new UserAgent().toString();
   const viewport = { width: 1280, height: 800 };

   const contextOptions: any = {

View File

@ -16,12 +16,12 @@
"dotenv": "^16.4.5", "dotenv": "^16.4.5",
"express": "^4.19.2", "express": "^4.19.2",
"playwright": "^1.45.0", "playwright": "^1.45.0",
"random-useragent": "^0.5.0" "user-agents": "^1.1.410"
}, },
"devDependencies": { "devDependencies": {
"@types/express": "^4.17.21", "@types/express": "^4.17.21",
"@types/node": "^20.14.9", "@types/node": "^20.14.9",
"@types/random-useragent": "^0.3.3", "@types/user-agents": "^1.0.4",
"ts-node": "^10.9.2", "ts-node": "^10.9.2",
"typescript": "^5.5.2" "typescript": "^5.5.2"
} }

View File

@@ -13,7 +13,7 @@ import os

 from .firecrawl import FirecrawlApp # noqa

-__version__ = "1.8.0"
+__version__ = "1.8.1"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -250,6 +250,8 @@ class FirecrawlApp:
                 if 'data' in status_data:
                     data = status_data['data']
                     while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            break
                         next_url = status_data.get('next')
                         if not next_url:
                             logger.warning("Expected 'next' URL is missing.")
@@ -267,16 +269,24 @@ class FirecrawlApp:
                             break
                     status_data['data'] = data

-            return {
-                'success': True,
+            response = {
                 'status': status_data.get('status'),
                 'total': status_data.get('total'),
                 'completed': status_data.get('completed'),
                 'creditsUsed': status_data.get('creditsUsed'),
                 'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
             }
         else:
             self._handle_error(response, 'check crawl status')
@@ -459,6 +469,8 @@ class FirecrawlApp:
                 if 'data' in status_data:
                     data = status_data['data']
                     while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            break
                         next_url = status_data.get('next')
                         if not next_url:
                             logger.warning("Expected 'next' URL is missing.")
@@ -476,16 +488,24 @@ class FirecrawlApp:
                             break
                     status_data['data'] = data

-            return {
-                'success': True,
+            response = {
                 'status': status_data.get('status'),
                 'total': status_data.get('total'),
                 'completed': status_data.get('completed'),
                 'creditsUsed': status_data.get('creditsUsed'),
                 'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
             }
         else:
             self._handle_error(response, 'check batch scrape status')
@@ -669,6 +689,8 @@ class FirecrawlApp:
             if 'data' in status_data:
                 data = status_data['data']
                 while 'next' in status_data:
+                    if len(status_data['data']) == 0:
+                        break
                     status_response = self._get_request(status_data['next'], headers)
                     status_data = status_response.json()
                     data.extend(status_data.get('data', []))