Merge branch 'main' into nsc/extract-queue

Nicolas 2025-01-06 13:01:15 -03:00
commit bb27594443
17 changed files with 313 additions and 51 deletions

View File

@@ -61,7 +61,7 @@ export async function batchScrapeController(
}
logger.debug("Batch scrape " + id + " starting", {
urlsLength: urls,
urlsLength: urls.length,
appendToId: req.body.appendToId,
account: req.account,
});

View File

@@ -157,10 +157,10 @@ export async function crawlStatusController(
continue;
}
if (job.returnvalue === undefined) {
if (job.returnvalue === undefined || job.returnvalue === null) {
logger.warn(
"Job was considered done, but returnvalue is undefined!",
{ jobId: job.id, state },
{ jobId: job.id, state, returnvalue: job.returnvalue },
);
continue;
}

View File

@@ -0,0 +1,91 @@
import { normalizeUrl, normalizeUrlOnlyHostname } from './canonical-url';
describe('normalizeUrlOnlyHostname', () => {
it('should remove protocol and www from URL', () => {
const url = 'https://www.example.com';
const expected = 'example.com';
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should remove only protocol if www is not present', () => {
const url = 'https://example.com';
const expected = 'example.com';
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should handle URLs without protocol', () => {
const url = 'www.example.com';
const expected = 'example.com';
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should handle URLs without protocol and www', () => {
const url = 'example.com';
const expected = 'example.com';
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should handle URLs with paths', () => {
const url = 'https://www.example.com/path/to/resource';
const expected = 'example.com';
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
it('should handle invalid URLs gracefully', () => {
const url = 'not a valid url';
const expected = 'not a valid url';
expect(normalizeUrlOnlyHostname(url)).toBe(expected);
});
});
describe('normalizeUrl', () => {
it('should remove protocol and www from URL', () => {
const url = 'https://www.example.com';
const expected = 'example.com';
expect(normalizeUrl(url)).toBe(expected);
});
it('should remove only protocol if www is not present', () => {
const url = 'https://example.com';
const expected = 'example.com';
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs without protocol', () => {
const url = 'www.example.com';
const expected = 'example.com';
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs without protocol and www', () => {
const url = 'example.com';
const expected = 'example.com';
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs with paths', () => {
const url = 'https://www.example.com/path/to/resource';
const expected = 'example.com/path/to/resource';
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs with trailing slash', () => {
const url = 'https://www.example.com/';
const expected = 'example.com';
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle URLs with trailing slash and path', () => {
const url = 'https://www.example.com/path/';
const expected = 'example.com/path';
expect(normalizeUrl(url)).toBe(expected);
});
it('should handle invalid URLs gracefully', () => {
const url = 'not a valid url';
const expected = 'not a valid url';
expect(normalizeUrl(url)).toBe(expected);
});
});

View File

@@ -0,0 +1,19 @@
export function normalizeUrl(url: string) {
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
return url;
}
export function normalizeUrlOnlyHostname(url: string) {
try {
const hostname = new URL(url).hostname;
return hostname.replace(/^www\./, "");
} catch (error) {
return url
.replace(/^https?:\/\//, "")
.replace(/^www\./, "")
.split("/")[0];
}
}
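For reference, a small usage sketch of the two helpers above. The expected outputs follow the tests added in this commit; the relative import path mirrors the test file and is otherwise assumed.
import { normalizeUrl, normalizeUrlOnlyHostname } from "./canonical-url";
// normalizeUrl strips the protocol, a leading "www.", and a trailing slash, but keeps the path
normalizeUrl("https://www.example.com/path/"); // "example.com/path"
// normalizeUrlOnlyHostname reduces the URL to its bare hostname
normalizeUrlOnlyHostname("https://www.example.com/path/to/resource"); // "example.com"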

View File

@@ -14,10 +14,13 @@ interface ScrapeDocumentOptions {
timeout: number;
}
export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: URLTrace[]): Promise<Document | null> {
export async function scrapeDocument(
options: ScrapeDocumentOptions,
urlTraces: URLTrace[],
): Promise<Document | null> {
const trace = urlTraces.find((t) => t.url === options.url);
if (trace) {
trace.status = 'scraped';
trace.status = "scraped";
trace.timing.scrapedAt = new Date().toISOString();
}
@@ -35,7 +38,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
mode: "single_urls",
team_id: options.teamId,
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
internalOptions: {
useCache: true,
},
plan: options.plan,
origin: options.origin,
is_scrape: true,
@@ -61,7 +66,7 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
} catch (error) {
logger.error(`Error in scrapeDocument: ${error}`);
if (trace) {
trace.status = 'error';
trace.status = "error";
trace.error = error.message;
}
return null;

View File

@@ -84,7 +84,7 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
});
// retry if only one url is returned
if (uniqueUrls.length === 1) {
if (uniqueUrls.length <= 1) {
const retryMapResults = await getMapResults({
url: baseUrl,
teamId: options.teamId,

View File

@@ -10,6 +10,9 @@ export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
const entry = await getEntryFromCache(key);
if (entry === null) throw new EngineError("Cache missed");
// Set fromCache flag to indicate this document was retrieved from cache
meta.internalOptions.fromCache = true;
return {
url: entry.url,
html: entry.html,

View File

@@ -298,6 +298,15 @@ export function buildFallbackList(meta: Meta): {
engine: Engine;
unsupportedFeatures: Set<FeatureFlag>;
}[] {
if (meta.internalOptions.useCache !== true) {
const cacheIndex = engines.indexOf("cache");
if (cacheIndex !== -1) {
engines.splice(cacheIndex, 1);
}
} else {
meta.logger.debug("Cache engine enabled by useCache option");
}
const prioritySum = [...meta.featureFlags].reduce(
(a, x) => a + featureFlagOptions[x].priority,
0,

View File

@@ -151,9 +151,10 @@ export type InternalOptions = {
v0CrawlOnlyUrls?: boolean;
v0DisableJsDom?: boolean;
useCache?: boolean;
disableSmartWaitCache?: boolean; // Passed along to fire-engine
isBackgroundIndex?: boolean;
fromCache?: boolean; // Indicates if the document was retrieved from cache
};
export type EngineResultsTracker = {
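A minimal sketch of how the two new flags are intended to interact, pieced together from the other hunks in this commit; the caller shape here is assumed and is not part of the diff.
// Opting a scrape into the cache: buildFallbackList keeps the "cache" engine
// in its engine list only when useCache is true.
const internalOptions = { useCache: true };
// On a cache hit, scrapeCache() sets internalOptions.fromCache = true, and
// saveToCache() then returns the document early instead of re-writing it.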

View File

@@ -3,6 +3,10 @@ import { Meta } from "..";
import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";
export function saveToCache(meta: Meta, document: Document): Document {
if (meta.internalOptions.useCache !== true) {
return document;
}
if (
document.metadata.statusCode! < 200 ||
document.metadata.statusCode! >= 300
@@ -15,6 +19,12 @@ export function saveToCache(meta: Meta, document: Document): Document {
);
}
// If the document was retrieved from cache, we don't need to save it
if (meta.internalOptions.fromCache) {
return document;
}
const key = cacheKey(meta.url, meta.options, meta.internalOptions);
if (key !== null) {

View File

@@ -53,6 +53,8 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { indexPage } from "../lib/extract/index/pinecone";
import { Document } from "../controllers/v1/types";
import { performExtraction } from "../lib/extract/extraction-service";
import { supabase_service } from "../services/supabase";
import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";
configDotenv();
@@ -80,6 +82,69 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
if (await finishCrawl(job.data.crawl_id)) {
(async () => {
const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
// Get all visited URLs from Redis
const visitedUrls = await redisConnection.smembers(
"crawl:" + job.data.crawl_id + ":visited",
);
// Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
// Fire and forget the upload to Supabase
try {
// Standardize URLs to canonical form (https, no www)
const standardizedUrls = [
...new Set(
visitedUrls.map((url) => {
return normalizeUrl(url);
}),
),
];
// First check if entry exists for this origin URL
const { data: existingMap } = await supabase_service
.from("crawl_maps")
.select("urls")
.eq("origin_url", originUrl)
.single();
if (existingMap) {
// Merge URLs, removing duplicates
const mergedUrls = [
...new Set([...existingMap.urls, ...standardizedUrls]),
];
const { error } = await supabase_service
.from("crawl_maps")
.update({
urls: mergedUrls,
num_urls: mergedUrls.length,
updated_at: new Date().toISOString(),
})
.eq("origin_url", originUrl);
if (error) {
_logger.error("Failed to update crawl map", { error });
}
} else {
// Insert new entry if none exists
const { error } = await supabase_service.from("crawl_maps").insert({
origin_url: originUrl,
urls: standardizedUrls,
num_urls: standardizedUrls.length,
created_at: new Date().toISOString(),
updated_at: new Date().toISOString(),
});
if (error) {
_logger.error("Failed to save crawl map", { error });
}
}
} catch (error) {
_logger.error("Error saving crawl map", { error });
}
}
})();
if (!job.data.v1) {
const jobIDs = await getCrawlJobs(job.data.crawl_id);
@@ -582,16 +647,16 @@ async function indexJob(job: Job & { id: string }, document: Document) {
document.markdown &&
job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID!
) {
indexPage({
document: document,
originUrl: job.data.crawl_id
? (await getCrawl(job.data.crawl_id))?.originUrl!
: document.metadata.sourceURL!,
crawlId: job.data.crawl_id,
teamId: job.data.team_id,
}).catch((error) => {
_logger.error("Error indexing page", { error });
});
// indexPage({
// document: document,
// originUrl: job.data.crawl_id
// ? (await getCrawl(job.data.crawl_id))?.originUrl!
// : document.metadata.sourceURL!,
// crawlId: job.data.crawl_id,
// teamId: job.data.team_id,
// }).catch((error) => {
// _logger.error("Error indexing page", { error });
// });
}
}
@@ -696,7 +761,8 @@ async function processJob(job: Job & { id: string }, token: string) {
doc.metadata.url !== undefined &&
doc.metadata.sourceURL !== undefined &&
normalizeURL(doc.metadata.url, sc) !==
normalizeURL(doc.metadata.sourceURL, sc)
normalizeURL(doc.metadata.sourceURL, sc) &&
job.data.crawlerOptions !== null // only on crawls, don't care on batch scrape
) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
if (
@@ -828,9 +894,10 @@ async function processJob(job: Job & { id: string }, token: string) {
newJobId: jobId,
});
} else {
logger.debug("Could not lock URL " + JSON.stringify(link), {
url: link,
});
// TODO: removed this, ok? too many 'not useful' logs (?) Mogery!
// logger.debug("Could not lock URL " + JSON.stringify(link), {
// url: link,
// });
}
}
}

View File

@@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "1.11.2",
"version": "1.11.3",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@@ -565,23 +565,39 @@ export default class FirecrawlApp {
if ("data" in statusData) {
let data = statusData.data;
while (typeof statusData === 'object' && 'next' in statusData) {
if (data.length === 0) {
break
}
statusData = (await this.getRequest(statusData.next, headers)).data;
data = data.concat(statusData.data);
}
allData = data;
}
}
return ({
let resp: CrawlStatusResponse | ErrorResponse = {
success: response.data.success,
status: response.data.status,
total: response.data.total,
completed: response.data.completed,
creditsUsed: response.data.creditsUsed,
expiresAt: new Date(response.data.expiresAt),
next: response.data.next,
data: allData,
error: response.data.error,
})
data: allData
}
if (!response.data.success && response.data.error) {
resp = {
...resp,
success: false,
error: response.data.error
} as ErrorResponse;
}
if (response.data.next) {
(resp as CrawlStatusResponse).next = response.data.next;
}
return resp;
} else {
this.handleError(response, "check crawl status");
}
@@ -799,23 +815,39 @@ export default class FirecrawlApp {
if ("data" in statusData) {
let data = statusData.data;
while (typeof statusData === 'object' && 'next' in statusData) {
if (data.length === 0) {
break
}
statusData = (await this.getRequest(statusData.next, headers)).data;
data = data.concat(statusData.data);
}
allData = data;
}
}
return ({
let resp: BatchScrapeStatusResponse | ErrorResponse = {
success: response.data.success,
status: response.data.status,
total: response.data.total,
completed: response.data.completed,
creditsUsed: response.data.creditsUsed,
expiresAt: new Date(response.data.expiresAt),
next: response.data.next,
data: allData,
error: response.data.error,
})
data: allData
}
if (!response.data.success && response.data.error) {
resp = {
...resp,
success: false,
error: response.data.error
} as ErrorResponse;
}
if (response.data.next) {
(resp as BatchScrapeStatusResponse).next = response.data.next;
}
return resp;
} else {
this.handleError(response, "check batch scrape status");
}
@@ -971,6 +1003,9 @@ export default class FirecrawlApp {
if ("data" in statusData) {
let data = statusData.data;
while (typeof statusData === 'object' && 'next' in statusData) {
if (data.length === 0) {
break
}
statusResponse = await this.getRequest(statusData.next, headers);
statusData = statusResponse.data;
data = data.concat(statusData.data);
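A hedged usage sketch of the status check that these pagination guards protect; the response fields match the diff above, and the SDK entry points (FirecrawlApp constructor, checkCrawlStatus) are the published ones, with the API key and crawl id as placeholders.
import FirecrawlApp from "@mendable/firecrawl-js";
const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });
async function printCrawlStatus(crawlId: string) {
  const status = await app.checkCrawlStatus(crawlId);
  if (status.success) {
    // Paginated pages are already merged into status.data by the loop shown above.
    console.log(status.status, `${status.completed}/${status.total}`, status.data.length);
  } else {
    console.error(status.error);
  }
}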

View File

@@ -2,7 +2,7 @@ import express, { Request, Response } from 'express';
import bodyParser from 'body-parser';
import { chromium, Browser, BrowserContext, Route, Request as PlaywrightRequest } from 'playwright';
import dotenv from 'dotenv';
import randomUseragent from 'random-useragent';
import UserAgent from 'user-agents';
import { getError } from './helpers/get_error';
dotenv.config();
@@ -60,7 +60,7 @@ const initializeBrowser = async () => {
]
});
const userAgent = randomUseragent.getRandom();
const userAgent = new UserAgent().toString();
const viewport = { width: 1280, height: 800 };
const contextOptions: any = {

View File

@@ -16,12 +16,12 @@
"dotenv": "^16.4.5",
"express": "^4.19.2",
"playwright": "^1.45.0",
"random-useragent": "^0.5.0"
"user-agents": "^1.1.410"
},
"devDependencies": {
"@types/express": "^4.17.21",
"@types/node": "^20.14.9",
"@types/random-useragent": "^0.3.3",
"@types/user-agents": "^1.0.4",
"ts-node": "^10.9.2",
"typescript": "^5.5.2"
}

View File

@@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp # noqa
__version__ = "1.8.0"
__version__ = "1.8.1"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -250,6 +250,8 @@ class FirecrawlApp:
if 'data' in status_data:
data = status_data['data']
while 'next' in status_data:
if len(status_data['data']) == 0:
break
next_url = status_data.get('next')
if not next_url:
logger.warning("Expected 'next' URL is missing.")
@@ -267,16 +269,24 @@ class FirecrawlApp:
break
status_data['data'] = data
return {
'success': True,
response = {
'status': status_data.get('status'),
'total': status_data.get('total'),
'completed': status_data.get('completed'),
'creditsUsed': status_data.get('creditsUsed'),
'expiresAt': status_data.get('expiresAt'),
'data': status_data.get('data'),
'error': status_data.get('error'),
'next': status_data.get('next', None)
'data': status_data.get('data')
}
if 'error' in status_data:
response['error'] = status_data['error']
if 'next' in status_data:
response['next'] = status_data['next']
return {
'success': False if 'error' in status_data else True,
**response
}
else:
self._handle_error(response, 'check crawl status')
@@ -459,6 +469,8 @@ class FirecrawlApp:
if 'data' in status_data:
data = status_data['data']
while 'next' in status_data:
if len(status_data['data']) == 0:
break
next_url = status_data.get('next')
if not next_url:
logger.warning("Expected 'next' URL is missing.")
@@ -476,16 +488,24 @@ class FirecrawlApp:
break
status_data['data'] = data
return {
'success': True,
response = {
'status': status_data.get('status'),
'total': status_data.get('total'),
'completed': status_data.get('completed'),
'creditsUsed': status_data.get('creditsUsed'),
'expiresAt': status_data.get('expiresAt'),
'data': status_data.get('data'),
'error': status_data.get('error'),
'next': status_data.get('next', None)
'data': status_data.get('data')
}
if 'error' in status_data:
response['error'] = status_data['error']
if 'next' in status_data:
response['next'] = status_data['next']
return {
'success': False if 'error' in status_data else True,
**response
}
else:
self._handle_error(response, 'check batch scrape status')
@@ -669,6 +689,8 @@ class FirecrawlApp:
if 'data' in status_data:
data = status_data['data']
while 'next' in status_data:
if len(status_data['data']) == 0:
break
status_response = self._get_request(status_data['next'], headers)
status_data = status_response.json()
data.extend(status_data.get('data', []))