diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index d3ec7aa..19e552d 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -857,12 +857,12 @@ export class CrawlerHost extends RPCHost { nominalUrl?: URL, urlValidMs?: number ) { - if (crawlerOptions.engine?.toLowerCase().includes('lm')) { + const engine = crawlerOptions.engine?.toLowerCase() || ''; + if (engine.includes('lm')) { const output: FormattedPage = { title: snapshot.title, content: snapshot.parsed?.textContent, url: snapshot.href, - pageshotUrl: snapshot.pageshotUrl, [Symbol.dispose]: () => undefined, }; diff --git a/backend/functions/src/services/curl.ts b/backend/functions/src/services/curl.ts index 38a524d..05d7806 100644 --- a/backend/functions/src/services/curl.ts +++ b/backend/functions/src/services/curl.ts @@ -2,10 +2,11 @@ import { marshalErrorLike } from 'civkit/lang'; import { AsyncService } from 'civkit/async-service'; import { singleton } from 'tsyringe'; -import { Curl } from 'node-libcurl'; +import { Curl, HeaderInfo } from 'node-libcurl'; import { PageSnapshot, ScrappingOptions } from './puppeteer'; import { Logger } from '../shared/services/logger'; import { JSDomControl } from './jsdom'; +import { AssertionFailureError } from 'civkit'; @singleton() export class CurlControl extends AsyncService { @@ -26,7 +27,11 @@ export class CurlControl extends AsyncService { } async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions) { - const html = await new Promise((resolve, reject) => { + const result = await new Promise<{ + statusCode: number, + data: string, + headers: Buffer | HeaderInfo[], + }>((resolve, reject) => { const curl = new Curl(); curl.setOpt('URL', urlToCrawl.toString()); curl.setOpt(Curl.option.FOLLOWLOCATION, true); @@ -52,23 +57,31 @@ export class CurlControl extends AsyncService { } curl.on('end', (statusCode, data, headers) => { - this.logger.debug(`CURL: ${urlToCrawl}`, { statusCode, headers }); - resolve(data.toString()); + this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers }); + resolve({ + statusCode, + data: data.toString(), + headers, + }); curl.close(); }); curl.on('error', (err) => { this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) }); curl.close(); - reject(err); + reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`)); }); curl.perform(); }); + if (result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) { + throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`); + } + const snapshot = { href: urlToCrawl.toString(), - html: html, + html: result.data, title: '', text: '', } as PageSnapshot;