diff --git a/src/api/crawler.ts b/src/api/crawler.ts index d1f103f..5bbd48a 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -9,6 +9,7 @@ import { RawString, ApplicationError, DataStreamBrokenError, + assignMeta, } from 'civkit/civ-rpc'; import { marshalErrorLike } from 'civkit/lang'; import { Defer } from 'civkit/defer'; @@ -755,6 +756,8 @@ export class CrawlerHost extends RPCHost { throw new AssertionFailureError(`Remote server did not return a body: ${urlToCrawl}`); } const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName); + draftSnapshot.status = sideLoaded.status; + draftSnapshot.statusText = sideLoaded.statusText; yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts); return; } @@ -822,6 +825,8 @@ export class CrawlerHost extends RPCHost { } return Promise.reject(err); }); + draftSnapshot.status = sideLoaded.status; + draftSnapshot.statusText = sideLoaded.statusText; if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) { yield draftSnapshot; return; @@ -849,6 +854,8 @@ export class CrawlerHost extends RPCHost { } return Promise.reject(err); }); + proxySnapshot.status = proxyLoaded.status; + proxySnapshot.statusText = proxyLoaded.statusText; if (proxyLoaded.status === 200 && crawlerOpts?.browserIsNotRequired()) { } analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html); @@ -931,6 +938,7 @@ export class CrawlerHost extends RPCHost { } Object.assign(formatted, { usage: { tokens: amount } }); + assignMeta(formatted, { usage: { tokens: amount } }); return amount; } diff --git a/src/services/curl.ts b/src/services/curl.ts index 8b6ff7b..bc61661 100644 --- a/src/services/curl.ts +++ b/src/services/curl.ts @@ -98,6 +98,7 @@ export class CurlControl extends AsyncService { urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) { return new Promise<{ statusCode: number, + statusText?: string, data?: 
FancyFile, headers: HeaderInfo[], }>((resolve, reject) => { @@ -179,6 +180,7 @@ export class CurlControl extends AsyncService { }); curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB let status = -1; + let statusText: string|undefined; let contentEncoding = ''; curl.once('end', () => { if (curlStream) { @@ -208,6 +210,7 @@ export class CurlControl extends AsyncService { } } const lastResHeaders = headers[headers.length - 1]; + statusText = (lastResHeaders as HeaderInfo).result?.reason; for (const [k, v] of Object.entries(lastResHeaders)) { const kl = k.toLowerCase(); if (kl === 'content-type') { @@ -227,6 +230,7 @@ export class CurlControl extends AsyncService { } resolve({ statusCode: status, + statusText, data: undefined, headers: headers as HeaderInfo[], }); @@ -236,6 +240,7 @@ export class CurlControl extends AsyncService { if (!stream) { resolve({ statusCode: status, + statusText, data: undefined, headers: headers as HeaderInfo[], }); @@ -289,6 +294,7 @@ export class CurlControl extends AsyncService { this.tempFileManager.bindPathTo(fancyFile, fpath); resolve({ statusCode: status, + statusText, data: fancyFile, headers: headers as HeaderInfo[], }); @@ -343,6 +349,7 @@ export class CurlControl extends AsyncService { return { statusCode: r.statusCode, + statusText: r.statusText, data: r.data, headers: fakeHeaderInfos.concat(r.headers), }; @@ -392,6 +399,7 @@ export class CurlControl extends AsyncService { sideLoadOpts, chain: curlResult.headers, status: curlResult.statusCode, + statusText: curlResult.statusText, headers: lastHeaders, contentType, contentDisposition, diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index 3b3d024..49fdfc6 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -1176,8 +1176,8 @@ export class PuppeteerControl extends AsyncService { try { const pSubFrameSnapshots = this.snapshotChildFrames(page); snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = 
await this.takeScreenShot(page); - pageshot = await this.takeScreenShot(page, { fullPage: true }); + screenshot = (await this.takeScreenShot(page)) || screenshot; + pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot; if (snapshot) { snapshot.childFrames = await pSubFrameSnapshots; } @@ -1224,8 +1224,8 @@ export class PuppeteerControl extends AsyncService { .then(async () => { const pSubFrameSnapshots = this.snapshotChildFrames(page); snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = await this.takeScreenShot(page); - pageshot = await this.takeScreenShot(page, { fullPage: true }); + screenshot = (await this.takeScreenShot(page)) || screenshot; + pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot; if (snapshot) { snapshot.childFrames = await pSubFrameSnapshots; } @@ -1267,8 +1267,8 @@ export class PuppeteerControl extends AsyncService { break; } if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) { - screenshot = await this.takeScreenShot(page); - pageshot = await this.takeScreenShot(page, { fullPage: true }); + screenshot = (await this.takeScreenShot(page)) || screenshot; + pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot; lastHTML = snapshot.html; } if (snapshot || screenshot) { @@ -1326,6 +1326,364 @@ export class PuppeteerControl extends AsyncService { return r.filter(Boolean); }
+    /**
+     * Navigate a page obtained from `getNextPage()` to `parsedUrl` and resolve with
+     * a single snapshot of it.
+     *
+     * Resolves once: either after navigation has finalized without error, or as soon
+     * as a snapshot reporting `lastMutationIdle` is available. The result carries the
+     * `status`/`statusText` of the main-frame navigation response, when one was seen.
+     *
+     * @param parsedUrl - URL to load.
+     * @param options - Scrapping options (locale, cookies, proxy, side-load, timeout, ...).
+     * @returns The snapshot extended with the navigation response status.
+     * @throws ParamValidationError when the supplied cookies cannot be set.
+     * @throws AssertionFailureError when navigation fails and no usable snapshot exists,
+     *   or when navigation finalizes without any snapshot.
+     * @throws SecurityCompromiseError when the page emits an `abuse` event.
+     * @throws ServiceCrashedError when this service emits `crippled` while waiting.
+     */
+    async simpleScrap(parsedUrl: URL, options: ScrappingOptions = {}): Promise<PageSnapshot> {
+        // parsedUrl.search = '';
+        const url = parsedUrl.toString();
+        let snapshot: PageSnapshot | undefined;
+        let navigationResponse: HTTPResponse | undefined;
+        // Attach the status of the main-frame navigation response (if any) to a snapshot.
+        const withNavigationStatus = (s: PageSnapshot) => ({
+            ...s,
+            status: navigationResponse?.status(),
+            statusText: navigationResponse?.statusText(),
+        } as PageSnapshot);
+        const page = await this.getNextPage();
+        this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
+        this.pagePhase.set(page, 'active');
+        page.on('response', (resp) => {
+            this.blackHoleDetector.itWorked();
+            const req = resp.request();
+            if (req.frame() === page.mainFrame() && req.isNavigationRequest()) {
+                navigationResponse = resp;
+            }
+        });
+        page.on('request', async (req) => {
+            if (req.isInterceptResolutionHandled()) {
+                return;
+            }
+            // Continue the request at priority 1 with `options.extraHeaders` merged over its headers.
+            const continueWithExtraHeaders = () => {
+                const overrides = req.continueRequestOverrides();
+
+                return req.continue({
+                    ...overrides,
+                    headers: {
+                        ...req.headers(),
+                        ...overrides?.headers,
+                        ...options.extraHeaders,
+                    }
+                }, 1);
+            };
+            const reqUrlParsed = new URL(req.url());
+            if (!reqUrlParsed.protocol.startsWith('http')) {
+                return req.continue(req.continueRequestOverrides(), 0);
+            }
+            const typ = req.resourceType();
+            if (typ === 'media') {
+                // Non-cooperative answer to block all media requests.
+                return req.abort('blockedbyclient');
+            }
+            if (!options.proxyResources) {
+                const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
+                if (!isDocRequest) {
+                    if (options.extraHeaders) {
+                        return continueWithExtraHeaders();
+                    }
+
+                    return req.continue(req.continueRequestOverrides(), 0);
+                }
+            }
+            const sideload = options.sideLoad;
+
+            const impersonate = sideload?.impersonate[reqUrlParsed.href];
+            if (impersonate) {
+                let body;
+                if (impersonate.body) {
+                    body = await readFile(await impersonate.body.filePath);
+                    if (req.isInterceptResolutionHandled()) {
+                        return;
+                    }
+                }
+                return req.respond({
+                    status: impersonate.status,
+                    headers: impersonate.headers,
+                    contentType: impersonate.contentType,
+                    body: body ? Uint8Array.from(body) : undefined,
+                }, 999);
+            }
+
+            const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
+            const ctx = this.lifeCycleTrack.get(page);
+            if (proxy && ctx) {
+                return await this.asyncLocalContext.bridge(ctx, async () => {
+                    try {
+                        const curled = await this.curlControl.sideLoad(reqUrlParsed, {
+                            ...options,
+                            method: req.method(),
+                            body: req.postData(),
+                            extraHeaders: {
+                                ...req.headers(),
+                                ...options.extraHeaders,
+                            },
+                            proxyUrl: proxy
+                        });
+                        if (req.isInterceptResolutionHandled()) {
+                            return;
+                        }
+
+                        if (curled.chain.length === 1) {
+                            if (!curled.file) {
+                                return req.respond({
+                                    status: curled.status,
+                                    headers: _.omit(curled.headers, 'result'),
+                                    contentType: curled.contentType,
+                                }, 3);
+                            }
+                            const body = await readFile(await curled.file.filePath);
+                            if (req.isInterceptResolutionHandled()) {
+                                return;
+                            }
+                            return req.respond({
+                                status: curled.status,
+                                headers: _.omit(curled.headers, 'result'),
+                                contentType: curled.contentType,
+                                body: Uint8Array.from(body),
+                            }, 3);
+                        }
+                        options.sideLoad ??= curled.sideLoadOpts;
+                        _.merge(options.sideLoad, curled.sideLoadOpts);
+                        const firstReq = curled.chain[0];
+
+                        return req.respond({
+                            status: firstReq.result!.code,
+                            headers: _.omit(firstReq, 'result'),
+                        }, 3);
+                    } catch (err: any) {
+                        this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
+                    }
+                    if (req.isInterceptResolutionHandled()) {
+                        return;
+                    }
+
+                    return continueWithExtraHeaders();
+                });
+            }
+
+            if (req.isInterceptResolutionHandled()) {
+                return;
+            }
+
+            return continueWithExtraHeaders();
+        });
+
+        const sn = this.snMap.get(page);
+        this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
+        if (options.locale) {
+            // Add headers via request interception to walk around this bug
+            // https://github.com/puppeteer/puppeteer/issues/10235
+            // await page.setExtraHTTPHeaders({
+            //     'Accept-Language': options.locale
+            // });
+
+            // This function is evaluated inside the page, where `options` does not exist,
+            // so the locale is passed in as an argument instead of being captured.
+            await page.evaluateOnNewDocument((locale: string) => {
+                Object.defineProperty(navigator, "language", {
+                    get: function () {
+                        return locale;
+                    }
+                });
+                Object.defineProperty(navigator, "languages", {
+                    get: function () {
+                        return [locale];
+                    }
+                });
+            }, options.locale);
+        }
+
+        if (options.cookies) {
+            const mapped = options.cookies.map((x) => {
+                const draft: CookieParam = {
+                    name: x.name,
+                    value: encodeURIComponent(x.value),
+                    secure: x.secure,
+                    domain: x.domain,
+                    path: x.path,
+                    expires: x.expires ? Math.floor(x.expires.valueOf() / 1000) : undefined,
+                    sameSite: x.sameSite as any,
+                };
+                if (!draft.expires && x.maxAge) {
+                    draft.expires = Math.floor(Date.now() / 1000) + x.maxAge;
+                }
+                if (!draft.domain) {
+                    draft.url = parsedUrl.toString();
+                }
+
+                return draft;
+            });
+            try {
+                await page.setCookie(...mapped);
+            } catch (err: any) {
+                this.logger.warn(`Page ${sn}: Failed to set cookies`, { err });
+                throw new ParamValidationError({
+                    path: 'cookies',
+                    message: `Failed to set cookies: ${err?.message}`
+                });
+            }
+        }
+        if (options.overrideUserAgent) {
+            await page.setUserAgent(options.overrideUserAgent);
+        }
+        if (options.viewport) {
+            await page.setViewport(options.viewport);
+        }
+
+        let nextSnapshotDeferred = Defer();
+        const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));
+        this.once('crippled', crippleListener);
+        nextSnapshotDeferred.promise.finally(() => {
+            this.off('crippled', crippleListener);
+        });
+        let finalized = false;
+        const hdl = (s: any) => {
+            if (snapshot === s) {
+                return;
+            }
+            snapshot = s;
+            if (snapshot) {
+                const kit = this.pageReqCtrl.get(page);
+                snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt;
+                snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt;
+            }
+            // Very deep or very large snapshots are recorded but do not wake the waiting loop.
+            if (s?.maxElemDepth && s.maxElemDepth > 256) {
+                return;
+            }
+            if (s?.elemCount && s.elemCount > 10_000) {
+                return;
+            }
+            nextSnapshotDeferred.resolve(s);
+            nextSnapshotDeferred = Defer();
+            this.once('crippled', crippleListener);
+            nextSnapshotDeferred.promise.finally(() => {
+                this.off('crippled', crippleListener);
+            });
+        };
+        page.on('snapshot', hdl);
+        page.once('abuse', (event: any) => {
+            this.emit('abuse', { ...event, url: parsedUrl });
+            if (snapshot?.href && parsedUrl.href !== snapshot.href) {
+                this.emit('abuse', { ...event, url: snapshot.href });
+            }
+
+            nextSnapshotDeferred.reject(
+                new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
+            );
+        });
+
+        const timeout = options.timeoutMs || 30_000;
+        const goToOptions: GoToOptions = {
+            waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
+            timeout,
+        };
+
+        if (options.referer) {
+            goToOptions.referer = options.referer;
+        }
+
+        const gotoPromise = page.goto(url, goToOptions)
+            .catch((err) => {
+                if (err instanceof TimeoutError) {
+                    this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err });
+                    return new AssertionFailureError({
+                        message: `Failed to goto ${url}: ${err}`,
+                        cause: err,
+                    });
+                }
+
+                this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err });
+                return new AssertionFailureError({
+                    message: `Failed to goto ${url}: ${err}`,
+                    cause: err,
+                });
+            }).then(async (stuff) => {
+                // This check is necessary because without snapshot, the condition of the page is unclear
+                // Calling evaluate directly may stall the process.
+                if (!snapshot) {
+                    if (stuff instanceof Error) {
+                        finalized = true;
+                        throw stuff;
+                    }
+                }
+                try {
+                    const pSubFrameSnapshots = this.snapshotChildFrames(page);
+                    snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
+                    if (snapshot) {
+                        snapshot.childFrames = await pSubFrameSnapshots;
+                    }
+                } catch (err: any) {
+                    this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
+                    if (stuff instanceof Error) {
+                        finalized = true;
+                        throw stuff;
+                    }
+                }
+                if (!snapshot?.html) {
+                    if (stuff instanceof Error) {
+                        finalized = true;
+                        throw stuff;
+                    }
+                }
+
+                finalized = true;
+                if (snapshot?.html) {
+                    this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
+                    this.emit(
+                        'crawled',
+                        withNavigationStatus(snapshot),
+                        { ...options, url: parsedUrl }
+                    );
+                }
+            });
+
+        try {
+            while (true) {
+                const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
+                if (options.minIntervalMs) {
+                    ckpt.push(delay(options.minIntervalMs));
+                }
+                let error;
+                await Promise.race(ckpt).catch((err) => error = err);
+                if (finalized && !error) {
+                    if (!snapshot) {
+                        throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
+                    }
+
+                    return withNavigationStatus(snapshot);
+                }
+
+                if (snapshot?.lastMutationIdle) {
+                    return withNavigationStatus(snapshot);
+                }
+                if (error) {
+                    throw error;
+                }
+            }
+        } finally {
+            this.pagePhase.set(page, 'background');
+            page.off('snapshot', hdl);
+            this.ditchPage(page);
+            nextSnapshotDeferred.resolve();
+        }
+    }
+
} const puppeteerControl = container.resolve(PuppeteerControl); diff --git a/thinapps-shared b/thinapps-shared index 8c31e85..07d2319 160000 ---
a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 8c31e85dc52dfcc7d1d86df0328df3a94319b534 +Subproject commit 07d23193d85b1d3c8bbd5d0b024a6884ecfe17fd