mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 00:15:58 +08:00
Merge remote-tracking branch 'origin/main' into mongodb
This commit is contained in:
commit
d874c4890d
@ -116,6 +116,10 @@ export class CrawlerHost extends RPCHost {
|
||||
if (snapshot.isIntermediate) {
|
||||
return;
|
||||
}
|
||||
if (!snapshot.lastMutationIdle) {
|
||||
// Never reached mutationIdle, presumably too short timeout
|
||||
return;
|
||||
}
|
||||
if (options.locale) {
|
||||
Reflect.set(snapshot, 'locale', options.locale);
|
||||
}
|
||||
@ -236,6 +240,7 @@ export class CrawlerHost extends RPCHost {
|
||||
const uid = await auth.solveUID();
|
||||
let chargeAmount = 0;
|
||||
const crawlerOptions = ctx.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
|
||||
const tierPolicy = await this.saasAssertTierPolicy(crawlerOptions, auth);
|
||||
|
||||
// Use koa ctx.URL, a standard URL object to avoid node.js framework prop naming confusion
|
||||
const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(`${ctx.URL.pathname}${ctx.URL.search}`), crawlerOptions);
|
||||
@ -294,15 +299,13 @@ export class CrawlerHost extends RPCHost {
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
return;
|
||||
}
|
||||
if (chargeAmount) {
|
||||
apiRoll._ref?.set({
|
||||
chargeAmount,
|
||||
}, { merge: true }).catch((err) => this.logger.warn(`Failed to log charge amount in apiRoll`, { err }));
|
||||
}
|
||||
apiRoll.chargeAmount = chargeAmount;
|
||||
});
|
||||
}
|
||||
|
||||
if (!uid) {
|
||||
// Enforce no proxy is allocated for anonymous users due to abuse.
|
||||
crawlerOptions.proxy = 'none';
|
||||
const blockade = (await DomainBlockade.fromFirestoreQuery(
|
||||
DomainBlockade.COLLECTION
|
||||
.where('domain', '==', targetUrl.hostname.toLowerCase())
|
||||
@ -313,7 +316,6 @@ export class CrawlerHost extends RPCHost {
|
||||
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
||||
}
|
||||
}
|
||||
|
||||
const crawlOpts = await this.configure(crawlerOptions);
|
||||
if (crawlerOptions.robotsTxt) {
|
||||
await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
|
||||
@ -335,10 +337,7 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
chargeAmount = this.assignChargeAmount(formatted, tierPolicy);
|
||||
sseStream.write({
|
||||
event: 'data',
|
||||
data: formatted,
|
||||
@ -376,11 +375,7 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
||||
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
chargeAmount = this.assignChargeAmount(formatted, tierPolicy);
|
||||
|
||||
if (scrapped?.pdfs?.length && !chargeAmount) {
|
||||
continue;
|
||||
@ -402,10 +397,7 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
chargeAmount = this.assignChargeAmount(formatted, tierPolicy);
|
||||
|
||||
return formatted;
|
||||
}
|
||||
@ -431,10 +423,7 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
chargeAmount = this.assignChargeAmount(formatted, tierPolicy);
|
||||
|
||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||
@ -461,12 +450,8 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
chargeAmount = this.assignChargeAmount(formatted, tierPolicy);
|
||||
|
||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||
|
||||
@ -798,6 +783,8 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && !this.knownUrlThatSideLoadingWouldCrashTheBrowser(urlToCrawl)) {
|
||||
const sideLoadSnapshotPermitted = crawlerOpts?.browserIsNotRequired() &&
|
||||
[RESPOND_TIMING.HTML, RESPOND_TIMING.VISIBLE_CONTENT].includes(crawlerOpts.presumedRespondTiming);
|
||||
try {
|
||||
const altOpts = { ...crawlOpts };
|
||||
let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
|
||||
@ -832,11 +819,12 @@ export class CrawlerHost extends RPCHost {
|
||||
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
||||
draftSnapshot.title ??= analyzed.title;
|
||||
draftSnapshot.isIntermediate = true;
|
||||
if (crawlerOpts?.browserIsNotRequired()) {
|
||||
if (sideLoadSnapshotPermitted) {
|
||||
yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
|
||||
}
|
||||
let fallbackProxyIsUsed = false;
|
||||
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
|
||||
if (
|
||||
((!crawlOpts?.allocProxy || crawlOpts.allocProxy !== 'none') && !crawlOpts?.proxyUrl) &&
|
||||
(analyzed.tokens < 42 || sideLoaded.status !== 200)
|
||||
) {
|
||||
const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
|
||||
@ -858,7 +846,7 @@ export class CrawlerHost extends RPCHost {
|
||||
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
||||
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
||||
proxySnapshot.isIntermediate = true;
|
||||
if (crawlerOpts?.browserIsNotRequired()) {
|
||||
if (sideLoadSnapshotPermitted) {
|
||||
yield this.jsdomControl.narrowSnapshot(proxySnapshot, crawlOpts);
|
||||
}
|
||||
sideLoaded = proxyLoaded;
|
||||
@ -907,18 +895,14 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
}
|
||||
|
||||
assignChargeAmount(formatted: FormattedPage, crawlerOptions?: CrawlerOptions) {
|
||||
assignChargeAmount(formatted: FormattedPage, saasTierPolicy?: Parameters<typeof this.saasApplyTierPolicy>[0]) {
|
||||
if (!formatted) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
let amount = 0;
|
||||
if (formatted.content) {
|
||||
const x1 = estimateToken(formatted.content);
|
||||
if (crawlerOptions?.respondWith?.toLowerCase().includes('lm')) {
|
||||
amount += x1 * 2;
|
||||
}
|
||||
amount += x1;
|
||||
amount = estimateToken(formatted.content);
|
||||
} else if (formatted.description) {
|
||||
amount += estimateToken(formatted.description);
|
||||
}
|
||||
@ -935,6 +919,10 @@ export class CrawlerHost extends RPCHost {
|
||||
amount += 765;
|
||||
}
|
||||
|
||||
if (saasTierPolicy) {
|
||||
amount = this.saasApplyTierPolicy(saasTierPolicy, amount);
|
||||
}
|
||||
|
||||
Object.assign(formatted, { usage: { tokens: amount } });
|
||||
assignMeta(formatted, { usage: { tokens: amount } });
|
||||
|
||||
@ -1308,4 +1296,54 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
async saasAssertTierPolicy(opts: CrawlerOptions, auth: JinaEmbeddingsAuthDTO) {
|
||||
let chargeScalar = 1;
|
||||
let minimalCharge = 0;
|
||||
|
||||
if (opts.withGeneratedAlt) {
|
||||
await auth.assertTier(0, 'Alt text generation');
|
||||
minimalCharge = 765;
|
||||
}
|
||||
|
||||
if (opts.injectPageScript || opts.injectFrameScript) {
|
||||
await auth.assertTier(0, 'Script injection');
|
||||
minimalCharge = 4_000;
|
||||
}
|
||||
|
||||
if (opts.withIframe) {
|
||||
await auth.assertTier(0, 'Iframe');
|
||||
}
|
||||
|
||||
if (opts.engine === ENGINE_TYPE.CF_BROWSER_RENDERING) {
|
||||
await auth.assertTier(0, 'Cloudflare browser rendering');
|
||||
minimalCharge = 4_000;
|
||||
}
|
||||
|
||||
if (opts.respondWith.includes('lm') || opts.engine?.includes('lm')) {
|
||||
await auth.assertTier(0, 'Language model');
|
||||
minimalCharge = 4_000;
|
||||
chargeScalar = 3;
|
||||
}
|
||||
|
||||
if (opts.proxy && opts.proxy !== 'none') {
|
||||
await auth.assertTier(['auto', 'any'].includes(opts.proxy) ? 0 : 2, 'Proxy allocation');
|
||||
chargeScalar = 5;
|
||||
}
|
||||
|
||||
return {
|
||||
budget: opts.tokenBudget || 0,
|
||||
chargeScalar,
|
||||
minimalCharge,
|
||||
};
|
||||
}
|
||||
|
||||
saasApplyTierPolicy(policy: Awaited<ReturnType<typeof this.saasAssertTierPolicy>>, chargeAmount: number) {
|
||||
const effectiveChargeAmount = policy.chargeScalar * Math.max(chargeAmount, policy.minimalCharge);
|
||||
if (policy.budget && policy.budget < effectiveChargeAmount) {
|
||||
throw new BudgetExceededError(`Token budget (${policy.budget}) exceeded, intended charge amount ${effectiveChargeAmount}`);
|
||||
}
|
||||
|
||||
return effectiveChargeAmount;
|
||||
}
|
||||
}
|
||||
|
@ -318,11 +318,6 @@ export class SearcherHost extends RPCHost {
|
||||
throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
|
||||
}
|
||||
|
||||
if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
|
||||
delete crawlOpts.timeoutMs;
|
||||
}
|
||||
|
||||
|
||||
let lastScrapped: any[] | undefined;
|
||||
const targetResultCount = crawlWithoutContent ? count : count + 2;
|
||||
const trimmedResults = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x));
|
||||
|
@ -133,7 +133,7 @@ export class SerpHost extends RPCHost {
|
||||
@Param('num', { validate: (v: number) => v >= 0 && v <= 20 })
|
||||
num?: number,
|
||||
@Param('gl', { validate: (v: string) => WORLD_COUNTRY_CODES.includes(v?.toLowerCase()) }) gl?: string,
|
||||
@Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) hl?: string,
|
||||
@Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) _hl?: string,
|
||||
@Param('location') location?: string,
|
||||
@Param('page') page?: number,
|
||||
@Param('fallback') fallback?: boolean,
|
||||
@ -294,7 +294,7 @@ export class SerpHost extends RPCHost {
|
||||
q,
|
||||
num,
|
||||
gl,
|
||||
hl,
|
||||
// hl,
|
||||
location,
|
||||
page,
|
||||
}, crawlerOptions);
|
||||
@ -324,7 +324,7 @@ export class SerpHost extends RPCHost {
|
||||
q: realQuery,
|
||||
num,
|
||||
gl,
|
||||
hl,
|
||||
// hl,
|
||||
location,
|
||||
}, crawlerOptions);
|
||||
if (results?.length) {
|
||||
@ -341,7 +341,7 @@ export class SerpHost extends RPCHost {
|
||||
q: realQuery,
|
||||
num,
|
||||
gl,
|
||||
hl,
|
||||
// hl,
|
||||
location,
|
||||
}, crawlerOptions);
|
||||
}
|
||||
|
@ -655,8 +655,11 @@ export class CrawlerOptions extends AutoCastable {
|
||||
if (this.respondWith.includes('lm')) {
|
||||
return false;
|
||||
}
|
||||
if (this.withIframe) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return false;
|
||||
return !snapshot.isIntermediate;
|
||||
}
|
||||
|
||||
isCacheQueryApplicable() {
|
||||
|
@ -17,6 +17,7 @@ import { AsyncLocalContext } from '../services/async-context';
|
||||
import envConfig from '../shared/services/secrets';
|
||||
import { JinaEmbeddingsDashboardHTTP } from '../shared/3rd-party/jina-embeddings';
|
||||
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
||||
import { TierFeatureConstraintError } from '../services/errors';
|
||||
|
||||
const authDtoLogger = logger.child({ service: 'JinaAuthDTO' });
|
||||
|
||||
@ -236,6 +237,30 @@ export class JinaEmbeddingsAuthDTO extends AutoCastable {
|
||||
return this.user!;
|
||||
}
|
||||
|
||||
async assertTier(n: number, feature?: string) {
|
||||
let user;
|
||||
try {
|
||||
user = await this.assertUser();
|
||||
} catch (err) {
|
||||
if (err instanceof AuthenticationRequiredError) {
|
||||
throw new AuthenticationRequiredError({
|
||||
message: `Authentication is required to use this feature${feature ? ` (${feature})` : ''}. Please provide a valid API key.`
|
||||
});
|
||||
}
|
||||
|
||||
throw err;
|
||||
}
|
||||
|
||||
const tier = parseInt(user.metadata?.speed_level);
|
||||
if (isNaN(tier) || tier < n) {
|
||||
throw new TierFeatureConstraintError({
|
||||
message: `Your current plan does not support this feature${feature ? ` (${feature})` : ''}. Please upgrade your plan.`
|
||||
});
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
getRateLimits(...tags: string[]) {
|
||||
const descs = tags.map((x) => this.user?.customRateLimits?.[x] || []).flat().filter((x) => x.isEffective());
|
||||
|
||||
|
@ -27,7 +27,7 @@ export class EmailUnverifiedError extends ApplicationError { }
|
||||
export class InsufficientCreditsError extends ApplicationError { }
|
||||
|
||||
@StatusCode(40202)
|
||||
export class FreeFeatureLimitError extends ApplicationError { }
|
||||
export class TierFeatureConstraintError extends ApplicationError { }
|
||||
|
||||
@StatusCode(40203)
|
||||
export class InsufficientBalanceError extends ApplicationError { }
|
||||
|
@ -846,7 +846,6 @@ export class PuppeteerControl extends AsyncService {
|
||||
async *scrap(parsedUrl: URL, options: ScrappingOptions = {}): AsyncGenerator<PageSnapshot | undefined> {
|
||||
// parsedUrl.search = '';
|
||||
const url = parsedUrl.toString();
|
||||
|
||||
let snapshot: PageSnapshot | undefined;
|
||||
let screenshot: Buffer | undefined;
|
||||
let pageshot: Buffer | undefined;
|
||||
@ -1097,7 +1096,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
nextSnapshotDeferred.promise.finally(() => {
|
||||
this.off('crippled', crippleListener);
|
||||
});
|
||||
let finalized = false;
|
||||
let successfullyDone;
|
||||
const hdl = (s: any) => {
|
||||
if (snapshot === s) {
|
||||
return;
|
||||
@ -1143,6 +1142,42 @@ export class PuppeteerControl extends AsyncService {
|
||||
goToOptions.referer = options.referer;
|
||||
}
|
||||
|
||||
let waitForPromise: Promise<any> | undefined;
|
||||
let finalizationPromise: Promise<any> | undefined;
|
||||
const doFinalization = async () => {
|
||||
if (waitForPromise) {
|
||||
// SuccessfullyDone is meant for the finish of the page.
|
||||
// It doesn't matter if you are expecting something and it didn't show up.
|
||||
await waitForPromise.catch(() => void 0);
|
||||
}
|
||||
successfullyDone ??= true;
|
||||
try {
|
||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||
screenshot = (await this.takeScreenShot(page)) || screenshot;
|
||||
pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
|
||||
if (snapshot) {
|
||||
snapshot.childFrames = await pSubFrameSnapshots;
|
||||
}
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
|
||||
}
|
||||
if (!snapshot?.html) {
|
||||
return;
|
||||
}
|
||||
|
||||
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
||||
this.emit(
|
||||
'crawled',
|
||||
{
|
||||
...snapshot,
|
||||
status: navigationResponse?.status(),
|
||||
statusText: navigationResponse?.statusText(),
|
||||
pdfs: _.uniq(pdfUrls), screenshot, pageshot,
|
||||
},
|
||||
{ ...options, url: parsedUrl }
|
||||
);
|
||||
};
|
||||
const delayPromise = delay(timeout);
|
||||
const gotoPromise = page.goto(url, goToOptions)
|
||||
.catch((err) => {
|
||||
@ -1170,50 +1205,14 @@ export class PuppeteerControl extends AsyncService {
|
||||
// Calling evaluate directly may stall the process.
|
||||
if (!snapshot) {
|
||||
if (stuff instanceof Error) {
|
||||
finalized = true;
|
||||
throw stuff;
|
||||
}
|
||||
}
|
||||
await Promise.race([Promise.allSettled([...pageScriptEvaluations, ...frameScriptEvaluations]), delayPromise])
|
||||
.catch(() => void 0);
|
||||
try {
|
||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||
screenshot = (await this.takeScreenShot(page)) || screenshot;
|
||||
pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
|
||||
if (snapshot) {
|
||||
snapshot.childFrames = await pSubFrameSnapshots;
|
||||
}
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
|
||||
if (stuff instanceof Error) {
|
||||
finalized = true;
|
||||
throw stuff;
|
||||
}
|
||||
}
|
||||
if (!snapshot?.html) {
|
||||
if (stuff instanceof Error) {
|
||||
finalized = true;
|
||||
throw stuff;
|
||||
}
|
||||
}
|
||||
|
||||
finalized = true;
|
||||
if (snapshot?.html) {
|
||||
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
||||
this.emit(
|
||||
'crawled',
|
||||
{
|
||||
...snapshot,
|
||||
status: navigationResponse?.status(),
|
||||
statusText: navigationResponse?.statusText(),
|
||||
pdfs: _.uniq(pdfUrls), screenshot, pageshot,
|
||||
},
|
||||
{ ...options, url: parsedUrl }
|
||||
);
|
||||
}
|
||||
finalizationPromise = doFinalization();
|
||||
return stuff;
|
||||
});
|
||||
let waitForPromise: Promise<any> | undefined;
|
||||
if (options.waitForSelector) {
|
||||
const t0 = Date.now();
|
||||
waitForPromise = nextSnapshotDeferred.promise.then(() => {
|
||||
@ -1224,22 +1223,16 @@ export class PuppeteerControl extends AsyncService {
|
||||
const p = (Array.isArray(options.waitForSelector) ?
|
||||
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
|
||||
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
|
||||
.then(async () => {
|
||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||
screenshot = (await this.takeScreenShot(page)) || screenshot;
|
||||
pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
|
||||
if (snapshot) {
|
||||
snapshot.childFrames = await pSubFrameSnapshots;
|
||||
}
|
||||
finalized = true;
|
||||
.then(() => {
|
||||
successfullyDone = true;
|
||||
})
|
||||
.catch((err) => {
|
||||
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err });
|
||||
waitForPromise = undefined;
|
||||
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err });
|
||||
});
|
||||
return p as any;
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
try {
|
||||
@ -1254,11 +1247,8 @@ export class PuppeteerControl extends AsyncService {
|
||||
}
|
||||
let error;
|
||||
await Promise.race(ckpt).catch((err) => error = err);
|
||||
if (finalized && !error) {
|
||||
if (successfullyDone && !error) {
|
||||
if (!snapshot && !screenshot) {
|
||||
if (error) {
|
||||
throw error;
|
||||
}
|
||||
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
||||
}
|
||||
yield {
|
||||
@ -1286,10 +1276,20 @@ export class PuppeteerControl extends AsyncService {
|
||||
if (error) {
|
||||
throw error;
|
||||
}
|
||||
if (successfullyDone) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
await finalizationPromise;
|
||||
yield {
|
||||
...snapshot,
|
||||
status: navigationResponse?.status(),
|
||||
statusText: navigationResponse?.statusText(),
|
||||
pdfs: _.uniq(pdfUrls), screenshot, pageshot
|
||||
} as PageSnapshot;
|
||||
} finally {
|
||||
this.pagePhase.set(page, 'background');
|
||||
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
|
||||
Promise.allSettled([gotoPromise, waitForPromise, finalizationPromise]).finally(() => {
|
||||
page.off('snapshot', hdl);
|
||||
this.ditchPage(page);
|
||||
});
|
||||
@ -1329,369 +1329,6 @@ export class PuppeteerControl extends AsyncService {
|
||||
return r.filter(Boolean);
|
||||
}
|
||||
|
||||
async simpleScrap(parsedUrl: URL, options: ScrappingOptions = {}): Promise<PageSnapshot> {
|
||||
// parsedUrl.search = '';
|
||||
const url = parsedUrl.toString();
|
||||
let snapshot: PageSnapshot | undefined;
|
||||
let navigationResponse: HTTPResponse | undefined;
|
||||
const page = await this.getNextPage();
|
||||
this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
|
||||
this.pagePhase.set(page, 'active');
|
||||
page.on('response', (resp) => {
|
||||
this.blackHoleDetector.itWorked();
|
||||
const req = resp.request();
|
||||
if (req.frame() === page.mainFrame() && req.isNavigationRequest()) {
|
||||
navigationResponse = resp;
|
||||
}
|
||||
if (!resp.ok()) {
|
||||
return;
|
||||
}
|
||||
});
|
||||
page.on('request', async (req) => {
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
const reqUrlParsed = new URL(req.url());
|
||||
if (!reqUrlParsed.protocol.startsWith('http')) {
|
||||
const overrides = req.continueRequestOverrides();
|
||||
|
||||
return req.continue(overrides, 0);
|
||||
}
|
||||
const typ = req.resourceType();
|
||||
if (typ === 'media') {
|
||||
// Non-cooperative answer to block all media requests.
|
||||
return req.abort('blockedbyclient');
|
||||
}
|
||||
if (!options.proxyResources) {
|
||||
const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
|
||||
if (!isDocRequest) {
|
||||
if (options.extraHeaders) {
|
||||
const overrides = req.continueRequestOverrides();
|
||||
const continueArgs = [{
|
||||
...overrides,
|
||||
headers: {
|
||||
...req.headers(),
|
||||
...overrides?.headers,
|
||||
...options.extraHeaders,
|
||||
}
|
||||
}, 1] as const;
|
||||
|
||||
return req.continue(continueArgs[0], continueArgs[1]);
|
||||
}
|
||||
const overrides = req.continueRequestOverrides();
|
||||
|
||||
return req.continue(overrides, 0);
|
||||
}
|
||||
}
|
||||
const sideload = options.sideLoad;
|
||||
|
||||
const impersonate = sideload?.impersonate[reqUrlParsed.href];
|
||||
if (impersonate) {
|
||||
let body;
|
||||
if (impersonate.body) {
|
||||
body = await readFile(await impersonate.body.filePath);
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
return req.respond({
|
||||
status: impersonate.status,
|
||||
headers: impersonate.headers,
|
||||
contentType: impersonate.contentType,
|
||||
body: body ? Uint8Array.from(body) : undefined,
|
||||
}, 999);
|
||||
}
|
||||
|
||||
const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
|
||||
const ctx = this.lifeCycleTrack.get(page);
|
||||
if (proxy && ctx) {
|
||||
return await this.asyncLocalContext.bridge(ctx, async () => {
|
||||
try {
|
||||
const curled = await this.curlControl.sideLoad(reqUrlParsed, {
|
||||
...options,
|
||||
method: req.method(),
|
||||
body: req.postData(),
|
||||
extraHeaders: {
|
||||
...req.headers(),
|
||||
...options.extraHeaders,
|
||||
},
|
||||
proxyUrl: proxy
|
||||
});
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
|
||||
if (curled.chain.length === 1) {
|
||||
if (!curled.file) {
|
||||
return req.respond({
|
||||
status: curled.status,
|
||||
headers: _.omit(curled.headers, 'result'),
|
||||
contentType: curled.contentType,
|
||||
}, 3);
|
||||
}
|
||||
const body = await readFile(await curled.file.filePath);
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
return req.respond({
|
||||
status: curled.status,
|
||||
headers: _.omit(curled.headers, 'result'),
|
||||
contentType: curled.contentType,
|
||||
body: Uint8Array.from(body),
|
||||
}, 3);
|
||||
}
|
||||
options.sideLoad ??= curled.sideLoadOpts;
|
||||
_.merge(options.sideLoad, curled.sideLoadOpts);
|
||||
const firstReq = curled.chain[0];
|
||||
|
||||
return req.respond({
|
||||
status: firstReq.result!.code,
|
||||
headers: _.omit(firstReq, 'result'),
|
||||
}, 3);
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
|
||||
}
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
const overrides = req.continueRequestOverrides();
|
||||
const continueArgs = [{
|
||||
...overrides,
|
||||
headers: {
|
||||
...req.headers(),
|
||||
...overrides?.headers,
|
||||
...options.extraHeaders,
|
||||
}
|
||||
}, 1] as const;
|
||||
|
||||
return req.continue(continueArgs[0], continueArgs[1]);
|
||||
});
|
||||
}
|
||||
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
const overrides = req.continueRequestOverrides();
|
||||
const continueArgs = [{
|
||||
...overrides,
|
||||
headers: {
|
||||
...req.headers(),
|
||||
...overrides?.headers,
|
||||
...options.extraHeaders,
|
||||
}
|
||||
}, 1] as const;
|
||||
|
||||
return req.continue(continueArgs[0], continueArgs[1]);
|
||||
});
|
||||
|
||||
const sn = this.snMap.get(page);
|
||||
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
||||
if (options.locale) {
|
||||
// Add headers via request interception to walk around this bug
|
||||
// https://github.com/puppeteer/puppeteer/issues/10235
|
||||
// await page.setExtraHTTPHeaders({
|
||||
// 'Accept-Language': options.locale
|
||||
// });
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, "language", {
|
||||
get: function () {
|
||||
return options.locale;
|
||||
}
|
||||
});
|
||||
Object.defineProperty(navigator, "languages", {
|
||||
get: function () {
|
||||
return [options.locale];
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
if (options.cookies) {
|
||||
const mapped = options.cookies.map((x) => {
|
||||
const draft: CookieParam = {
|
||||
name: x.name,
|
||||
value: encodeURIComponent(x.value),
|
||||
secure: x.secure,
|
||||
domain: x.domain,
|
||||
path: x.path,
|
||||
expires: x.expires ? Math.floor(x.expires.valueOf() / 1000) : undefined,
|
||||
sameSite: x.sameSite as any,
|
||||
};
|
||||
if (!draft.expires && x.maxAge) {
|
||||
draft.expires = Math.floor(Date.now() / 1000) + x.maxAge;
|
||||
}
|
||||
if (!draft.domain) {
|
||||
draft.url = parsedUrl.toString();
|
||||
}
|
||||
|
||||
return draft;
|
||||
});
|
||||
try {
|
||||
await page.setCookie(...mapped);
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Page ${sn}: Failed to set cookies`, { err });
|
||||
throw new ParamValidationError({
|
||||
path: 'cookies',
|
||||
message: `Failed to set cookies: ${err?.message}`
|
||||
});
|
||||
}
|
||||
}
|
||||
if (options.overrideUserAgent) {
|
||||
await page.setUserAgent(options.overrideUserAgent);
|
||||
}
|
||||
if (options.viewport) {
|
||||
await page.setViewport(options.viewport);
|
||||
}
|
||||
|
||||
let nextSnapshotDeferred = Defer();
|
||||
const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));
|
||||
this.once('crippled', crippleListener);
|
||||
nextSnapshotDeferred.promise.finally(() => {
|
||||
this.off('crippled', crippleListener);
|
||||
});
|
||||
let finalized = false;
|
||||
const hdl = (s: any) => {
|
||||
if (snapshot === s) {
|
||||
return;
|
||||
}
|
||||
snapshot = s;
|
||||
if (snapshot) {
|
||||
const kit = this.pageReqCtrl.get(page);
|
||||
snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt;
|
||||
snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt;
|
||||
}
|
||||
if (s?.maxElemDepth && s.maxElemDepth > 256) {
|
||||
return;
|
||||
}
|
||||
if (s?.elemCount && s.elemCount > 10_000) {
|
||||
return;
|
||||
}
|
||||
nextSnapshotDeferred.resolve(s);
|
||||
nextSnapshotDeferred = Defer();
|
||||
this.once('crippled', crippleListener);
|
||||
nextSnapshotDeferred.promise.finally(() => {
|
||||
this.off('crippled', crippleListener);
|
||||
});
|
||||
};
|
||||
page.on('snapshot', hdl);
|
||||
page.once('abuse', (event: any) => {
|
||||
this.emit('abuse', { ...event, url: parsedUrl });
|
||||
if (snapshot?.href && parsedUrl.href !== snapshot.href) {
|
||||
this.emit('abuse', { ...event, url: snapshot.href });
|
||||
}
|
||||
|
||||
nextSnapshotDeferred.reject(
|
||||
new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
|
||||
);
|
||||
});
|
||||
|
||||
const timeout = options.timeoutMs || 30_000;
|
||||
const goToOptions: GoToOptions = {
|
||||
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
|
||||
timeout,
|
||||
};
|
||||
|
||||
if (options.referer) {
|
||||
goToOptions.referer = options.referer;
|
||||
}
|
||||
|
||||
const gotoPromise = page.goto(url, goToOptions)
|
||||
.catch((err) => {
|
||||
if (err instanceof TimeoutError) {
|
||||
this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err });
|
||||
return new AssertionFailureError({
|
||||
message: `Failed to goto ${url}: ${err}`,
|
||||
cause: err,
|
||||
});
|
||||
}
|
||||
|
||||
this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err });
|
||||
return new AssertionFailureError({
|
||||
message: `Failed to goto ${url}: ${err}`,
|
||||
cause: err,
|
||||
});
|
||||
}).then(async (stuff) => {
|
||||
// This check is necessary because without snapshot, the condition of the page is unclear
|
||||
// Calling evaluate directly may stall the process.
|
||||
if (!snapshot) {
|
||||
if (stuff instanceof Error) {
|
||||
finalized = true;
|
||||
throw stuff;
|
||||
}
|
||||
}
|
||||
try {
|
||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
|
||||
if (stuff instanceof Error) {
|
||||
finalized = true;
|
||||
throw stuff;
|
||||
}
|
||||
}
|
||||
if (!snapshot?.html) {
|
||||
if (stuff instanceof Error) {
|
||||
finalized = true;
|
||||
throw stuff;
|
||||
}
|
||||
}
|
||||
|
||||
finalized = true;
|
||||
if (snapshot?.html) {
|
||||
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
||||
this.emit(
|
||||
'crawled',
|
||||
{
|
||||
...snapshot,
|
||||
status: navigationResponse?.status(),
|
||||
statusText: navigationResponse?.statusText(),
|
||||
},
|
||||
{ ...options, url: parsedUrl }
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
try {
|
||||
while (true) {
|
||||
const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
|
||||
if (options.minIntervalMs) {
|
||||
ckpt.push(delay(options.minIntervalMs));
|
||||
}
|
||||
let error;
|
||||
await Promise.race(ckpt).catch((err) => error = err);
|
||||
if (finalized && !error) {
|
||||
if (!snapshot) {
|
||||
if (error) {
|
||||
throw error;
|
||||
}
|
||||
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
||||
}
|
||||
return {
|
||||
...snapshot,
|
||||
status: navigationResponse?.status(),
|
||||
statusText: navigationResponse?.statusText(),
|
||||
} as PageSnapshot;
|
||||
}
|
||||
|
||||
if (snapshot?.lastMutationIdle) {
|
||||
return {
|
||||
...snapshot,
|
||||
status: navigationResponse?.status(),
|
||||
statusText: navigationResponse?.statusText(),
|
||||
} as PageSnapshot;
|
||||
}
|
||||
if (error) {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
this.pagePhase.set(page, 'background');
|
||||
page.off('snapshot', hdl);
|
||||
this.ditchPage(page);
|
||||
nextSnapshotDeferred.resolve();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
const puppeteerControl = container.resolve(PuppeteerControl);
|
||||
|
@ -827,7 +827,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
|
||||
}
|
||||
snapshot.html = await readFile(await file.filePath, encoding);
|
||||
const innerCharset = snapshot.html.slice(0, 1024).match(/<meta[^>]+text\/html;\s*?charset=([^>"]+)\"/i)?.[1]?.toLowerCase();
|
||||
let innerCharset;
|
||||
const peek = snapshot.html.slice(0, 1024);
|
||||
innerCharset ??= peek.match(/<meta[^>]+text\/html;\s*?charset=([^>"]+)/i)?.[1]?.toLowerCase();
|
||||
innerCharset ??= peek.match(/<meta[^>]+charset="([^>"]+)\"/i)?.[1]?.toLowerCase();
|
||||
if (innerCharset && innerCharset !== encoding) {
|
||||
snapshot.html = await readFile(await file.filePath, innerCharset);
|
||||
}
|
||||
|
@ -19,6 +19,7 @@ import { AsyncLocalContext } from '../services/async-context';
|
||||
import finalizer, { Finalizer } from '../services/finalizer';
|
||||
import { SerpHost } from '../api/serp';
|
||||
import koaCompress from 'koa-compress';
|
||||
import { getAuditionMiddleware } from '../shared/utils/audition';
|
||||
|
||||
@singleton()
|
||||
export class SERPStandAloneServer extends KoaServer {
|
||||
@ -107,6 +108,7 @@ export class SERPStandAloneServer extends KoaServer {
|
||||
}
|
||||
|
||||
registerRoutes(): void {
|
||||
this.koaApp.use(getAuditionMiddleware());
|
||||
this.koaApp.use(koaCompress({
|
||||
filter(type) {
|
||||
if (type.startsWith('text/')) {
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit f89255cd6546641f72eefba140a4aef96a0e03fc
|
||||
Subproject commit 08ded7b8eceee7e931d52e77c87103f28c3ba9e8
|
Loading…
x
Reference in New Issue
Block a user