mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-20 02:29:13 +08:00
feat: domain profile (#1127)
* feat: domain profile * fix * fix * fix * fix * fix * refactor: curl as direct engine * fix --------- Co-authored-by: yanlong.wang <yanlong.wang@naiver.org>
This commit is contained in:
parent
6c23342cbf
commit
54abc175bb
@ -9,19 +9,20 @@ import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
|||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||||
import { Request, Response } from 'express';
|
import { Request, Response } from 'express';
|
||||||
import { Curl } from 'node-libcurl';
|
|
||||||
const pNormalizeUrl = import("@esm2cjs/normalize-url");
|
const pNormalizeUrl = import("@esm2cjs/normalize-url");
|
||||||
import { Crawled } from '../db/crawled';
|
import { Crawled } from '../db/crawled';
|
||||||
import { randomUUID } from 'crypto';
|
import { randomUUID } from 'crypto';
|
||||||
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||||
|
|
||||||
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
||||||
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
|
import { CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options';
|
||||||
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
||||||
import { DomainBlockade } from '../db/domain-blockade';
|
import { DomainBlockade } from '../db/domain-blockade';
|
||||||
|
import { DomainProfile } from '../db/domain-profile';
|
||||||
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
|
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
|
||||||
import { JSDomControl } from '../services/jsdom';
|
import { JSDomControl } from '../services/jsdom';
|
||||||
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
||||||
|
import { CurlControl } from '../services/curl';
|
||||||
|
|
||||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||||
withIframe?: boolean | 'quoted';
|
withIframe?: boolean | 'quoted';
|
||||||
@ -50,10 +51,12 @@ export class CrawlerHost extends RPCHost {
|
|||||||
cacheValidMs = 1000 * 3600;
|
cacheValidMs = 1000 * 3600;
|
||||||
urlValidMs = 1000 * 3600 * 4;
|
urlValidMs = 1000 * 3600 * 4;
|
||||||
abuseBlockMs = 1000 * 3600;
|
abuseBlockMs = 1000 * 3600;
|
||||||
|
domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
protected globalLogger: Logger,
|
protected globalLogger: Logger,
|
||||||
protected puppeteerControl: PuppeteerControl,
|
protected puppeteerControl: PuppeteerControl,
|
||||||
|
protected curlControl: CurlControl,
|
||||||
protected jsdomControl: JSDomControl,
|
protected jsdomControl: JSDomControl,
|
||||||
protected snapshotFormatter: SnapshotFormatter,
|
protected snapshotFormatter: SnapshotFormatter,
|
||||||
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||||
@ -63,7 +66,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
) {
|
) {
|
||||||
super(...arguments);
|
super(...arguments);
|
||||||
|
|
||||||
puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => {
|
puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ExtraScrappingOptions & { url: URL; }) => {
|
||||||
if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
|
if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -78,8 +81,15 @@ export class CrawlerHost extends RPCHost {
|
|||||||
if (options.locale) {
|
if (options.locale) {
|
||||||
Reflect.set(snapshot, 'locale', options.locale);
|
Reflect.set(snapshot, 'locale', options.locale);
|
||||||
}
|
}
|
||||||
|
|
||||||
await this.setToCache(options.url, snapshot);
|
await this.setToCache(options.url, snapshot);
|
||||||
|
|
||||||
|
if (!options.engine) {
|
||||||
|
try {
|
||||||
|
await this.exploreDirectEngine(options.url, options, snapshot);
|
||||||
|
} catch (err) {
|
||||||
|
this.logger.warn(`Failed to explore direct engine option for ${options.url.href}`, { err });
|
||||||
|
}
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
|
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
|
||||||
@ -245,8 +255,21 @@ export class CrawlerHost extends RPCHost {
|
|||||||
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
const crawlOpts = await this.configure(crawlerOptions);
|
const crawlOpts = await this.configure(crawlerOptions);
|
||||||
|
|
||||||
|
if (!crawlOpts.engine) {
|
||||||
|
const domainProfile = (await DomainProfile.fromFirestoreQuery(
|
||||||
|
DomainProfile.COLLECTION
|
||||||
|
.where('origin', '==', targetUrl.origin.toLowerCase())
|
||||||
|
.limit(1)
|
||||||
|
))[0];
|
||||||
|
|
||||||
|
if (domainProfile?.engine) {
|
||||||
|
crawlOpts.engine = domainProfile.engine;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
||||||
const sseStream = new OutputServerEventStream();
|
const sseStream = new OutputServerEventStream();
|
||||||
@ -388,6 +411,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
|
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
|
||||||
@ -574,7 +598,6 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (crawlerOpts?.pdf) {
|
if (crawlerOpts?.pdf) {
|
||||||
|
|
||||||
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
|
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
|
||||||
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
|
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
|
||||||
const fakeSnapshot = {
|
const fakeSnapshot = {
|
||||||
@ -590,55 +613,9 @@ export class CrawlerHost extends RPCHost {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (crawlerOpts?.engine?.toLowerCase() === 'curl') {
|
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
|
||||||
const html = await new Promise<string>((resolve, reject) => {
|
yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
|
||||||
const curl = new Curl();
|
|
||||||
curl.setOpt('URL', urlToCrawl.toString());
|
|
||||||
curl.setOpt(Curl.option.FOLLOWLOCATION, true);
|
|
||||||
|
|
||||||
if (crawlOpts?.timeoutMs) {
|
|
||||||
curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs);
|
|
||||||
}
|
|
||||||
if (crawlOpts?.overrideUserAgent) {
|
|
||||||
curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
|
|
||||||
}
|
|
||||||
if (crawlOpts?.extraHeaders) {
|
|
||||||
curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
|
|
||||||
}
|
|
||||||
if (crawlOpts?.proxyUrl) {
|
|
||||||
curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
|
|
||||||
}
|
|
||||||
if (crawlOpts?.cookies) {
|
|
||||||
curl.setOpt(Curl.option.COOKIE, crawlOpts.cookies.join('; '));
|
|
||||||
}
|
|
||||||
if (crawlOpts?.referer) {
|
|
||||||
curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
curl.on('end', (statusCode, data, headers) => {
|
|
||||||
this.logger.info(`Successfully requested ${urlToCrawl} by curl`, { statusCode, headers });
|
|
||||||
resolve(data.toString());
|
|
||||||
curl.close();
|
|
||||||
});
|
|
||||||
|
|
||||||
curl.on('error', (err) => {
|
|
||||||
this.logger.error(`Failed to request ${urlToCrawl} by curl`, { err: marshalErrorLike(err) });
|
|
||||||
reject(err);
|
|
||||||
curl.close();
|
|
||||||
});
|
|
||||||
|
|
||||||
curl.perform();
|
|
||||||
});
|
|
||||||
|
|
||||||
const fakeSnapshot = {
|
|
||||||
href: urlToCrawl.toString(),
|
|
||||||
html: html,
|
|
||||||
title: '',
|
|
||||||
text: '',
|
|
||||||
} as PageSnapshot;
|
|
||||||
|
|
||||||
yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -760,7 +737,6 @@ export class CrawlerHost extends RPCHost {
|
|||||||
this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
|
this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
|
||||||
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
||||||
this.threadLocal.set('userAgent', opts.userAgent);
|
this.threadLocal.set('userAgent', opts.userAgent);
|
||||||
this.threadLocal.set('engine', opts.engine);
|
|
||||||
if (opts.timeout) {
|
if (opts.timeout) {
|
||||||
this.threadLocal.set('timeout', opts.timeout * 1000);
|
this.threadLocal.set('timeout', opts.timeout * 1000);
|
||||||
}
|
}
|
||||||
@ -775,13 +751,13 @@ export class CrawlerHost extends RPCHost {
|
|||||||
targetSelector: opts.targetSelector,
|
targetSelector: opts.targetSelector,
|
||||||
waitForSelector: opts.waitForSelector,
|
waitForSelector: opts.waitForSelector,
|
||||||
overrideUserAgent: opts.userAgent,
|
overrideUserAgent: opts.userAgent,
|
||||||
engine: opts.engine,
|
|
||||||
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
||||||
withIframe: opts.withIframe,
|
withIframe: opts.withIframe,
|
||||||
withShadowDom: opts.withShadowDom,
|
withShadowDom: opts.withShadowDom,
|
||||||
locale: opts.locale,
|
locale: opts.locale,
|
||||||
referer: opts.referer,
|
referer: opts.referer,
|
||||||
viewport: opts.viewport,
|
viewport: opts.viewport,
|
||||||
|
engine: opts.engine,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (opts.locale) {
|
if (opts.locale) {
|
||||||
@ -849,4 +825,37 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Probes whether a plain curl fetch ("direct" engine) produces the same
 * markdown as an already-known snapshot, then stores the result as the
 * engine preference for the page's origin.
 *
 * The stored `engine` is read back when a request does not name an engine,
 * by querying `DomainProfile` on its `origin` field.
 *
 * @param targetUrl - URL to fetch again through `curlControl`.
 * @param crawlerOptions - Scrapping options forwarded to the curl fetch.
 * @param knownSnapshot - Baseline snapshot to compare against; its `href`
 *   determines the origin the profile is stored under.
 * @returns `true` once the profile has been saved.
 */
async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) {
    const directSnapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions);

    const directFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', directSnapshot);
    const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot);

    // Direct fetching is only trusted when both renderings are non-empty and
    // identical after trimming; anything else falls back to the browser.
    const sameContent = Boolean(
        directFormatted.content && knownFormatted.content &&
        directFormatted.content.trim() === knownFormatted.content.trim()
    );
    const engine = sameContent ? ENGINE_TYPE.DIRECT : ENGINE_TYPE.BROWSER;

    // Key the profile by the origin of the snapshot's final href.
    // NOTE(review): the read path looks profiles up by the requested URL's
    // origin, so a cross-origin redirect stores under a different key - confirm intent.
    const realUrl = new URL(knownSnapshot.href);
    const origin = realUrl.origin.toLowerCase();

    // Look the record up by the same field and value it is stored under.
    // Querying a field the record never sets would always miss and make
    // every call save a brand-new profile instead of updating the existing one.
    const profile = (await DomainProfile.fromFirestoreQuery(
        DomainProfile.COLLECTION
            .where('origin', '==', origin)
            .limit(1)
    ))[0] || new DomainProfile();

    profile.origin = origin;
    profile.triggerReason ??= 'Auto Explore';
    profile.triggerUrl = realUrl.href;
    profile.engine = engine;
    profile.createdAt ??= new Date();
    profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs);

    await DomainProfile.save(profile);

    return true;
}
|
||||||
}
|
}
|
||||||
|
34
backend/functions/src/db/domain-profile.ts
Normal file
34
backend/functions/src/db/domain-profile.ts
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
import { Also, Prop } from 'civkit';
|
||||||
|
import { FirestoreRecord } from '../shared/lib/firestore';
|
||||||
|
import { ENGINE_TYPE } from '../dto/scrapping-options';
|
||||||
|
|
||||||
|
/**
 * Firestore record holding the preferred crawl engine for a web origin.
 * Stored in the `domainProfiles` collection.
 */
@Also({ dictOf: Object })
export class DomainProfile extends FirestoreRecord {
    static override collectionName = 'domainProfiles';

    override _id!: string;

    /** Origin this profile applies to; the field lookups filter on. */
    @Prop({ required: true })
    origin!: string;

    /** Why the profile was created. */
    @Prop({ required: true })
    triggerReason!: string;

    /** URL whose crawl produced or last refreshed this profile. */
    @Prop()
    triggerUrl?: string;

    /** Engine to use for this origin; validated against `ENGINE_TYPE`. */
    @Prop({ required: true, type: ENGINE_TYPE })
    engine!: string;

    /** When the profile was first created. */
    @Prop()
    createdAt!: Date;

    /** When the profile is considered expired. */
    @Prop()
    expireAt?: Date;

    // Permits extra, undeclared fields on the record.
    [k: string]: any;
}
|
@ -11,6 +11,12 @@ export enum CONTENT_FORMAT {
|
|||||||
SCREENSHOT = 'screenshot',
|
SCREENSHOT = 'screenshot',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Crawl engines selectable through the `engine` option.
 * The string values are the wire values and must stay stable.
 */
export enum ENGINE_TYPE {
    /** Render the page in the browser. */
    BROWSER = 'browser',
    /** Fetch the page directly over HTTP without a browser. */
    DIRECT = 'direct',
    /** NOTE(review): presumably a vision-language-model pipeline - confirm. */
    VLM = 'vlm',
}
|
||||||
|
|
||||||
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
||||||
|
|
||||||
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
|
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
|
||||||
@ -182,7 +188,7 @@ class Viewport extends AutoCastable {
|
|||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
'X-Engine': {
|
'X-Engine': {
|
||||||
description: 'Specify the engine to use for crawling.\n\nDefault: puppeteer, supported: puppeteer, curl',
|
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm',
|
||||||
in: 'header',
|
in: 'header',
|
||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
@ -277,7 +283,9 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
@Prop()
|
@Prop()
|
||||||
userAgent?: string;
|
userAgent?: string;
|
||||||
|
|
||||||
@Prop({ default: 'puppeteer' })
|
@Prop({
|
||||||
|
type: ENGINE_TYPE,
|
||||||
|
})
|
||||||
engine?: string;
|
engine?: string;
|
||||||
|
|
||||||
@Prop({
|
@Prop({
|
||||||
@ -477,6 +485,26 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
isRequestingCompoundContentFormat() {
|
isRequestingCompoundContentFormat() {
|
||||||
return !CONTENT_FORMAT_VALUES.has(this.respondWith);
|
return !CONTENT_FORMAT_VALUES.has(this.respondWith);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Tells whether this request is a plain content/markdown request with no
 * script injection, custom viewport, or caller-supplied pdf/html.
 *
 * @returns `true` only when `respondWith` is `content` or `markdown` and
 *   none of those customisations are set.
 */
isGeneralMarkdownRequest() {
    const asksForMarkdown =
        this.respondWith === CONTENT_FORMAT.CONTENT ||
        this.respondWith === CONTENT_FORMAT.MARKDOWN;
    if (!asksForMarkdown) {
        return false;
    }

    // Evaluated in the same order as the individual checks they replace.
    const isCustomised = Boolean(
        this.injectFrameScript?.length ||
        this.injectPageScript?.length ||
        this.viewport ||
        this.pdf ||
        this.html
    );

    return !isCustomised;
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
|
export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
|
||||||
|
82
backend/functions/src/services/curl.ts
Normal file
82
backend/functions/src/services/curl.ts
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
import { marshalErrorLike } from 'civkit/lang';
|
||||||
|
import { AsyncService } from 'civkit/async-service';
|
||||||
|
import { singleton } from 'tsyringe';
|
||||||
|
|
||||||
|
import { Curl } from 'node-libcurl';
|
||||||
|
import { PageSnapshot, ScrappingOptions } from './puppeteer';
|
||||||
|
import { Logger } from '../shared/services/logger';
|
||||||
|
import { JSDomControl } from './jsdom';
|
||||||
|
|
||||||
|
/**
 * Fetches a page over HTTP with libcurl and turns the response body into a
 * `PageSnapshot` via `JSDomControl.narrowSnapshot`.
 */
@singleton()
export class CurlControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected jsdomControl: JSDomControl,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Applies the request URL and the relevant scrapping options to a curl handle.
     * Redirects are always followed; every other option is set only when provided.
     */
    protected applyCurlOptions(curl: Curl, urlToCrawl: URL, crawlOpts?: ScrappingOptions) {
        curl.setOpt('URL', urlToCrawl.toString());
        curl.setOpt(Curl.option.FOLLOWLOCATION, true);

        if (crawlOpts?.timeoutMs) {
            curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs);
        }
        if (crawlOpts?.overrideUserAgent) {
            curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
        }
        if (crawlOpts?.extraHeaders) {
            curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
        }
        if (crawlOpts?.proxyUrl) {
            curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
        }
        if (crawlOpts?.cookies?.length) {
            // Only name/value pairs are sent; other cookie attributes are not forwarded.
            const cookieChunks = crawlOpts.cookies.map((cookie) => `${cookie.name}=${cookie.value}`);
            curl.setOpt(Curl.option.COOKIE, cookieChunks.join('; '));
        }
        if (crawlOpts?.referer) {
            curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
        }
    }

    /**
     * Downloads `urlToCrawl` with curl and builds a snapshot from the body.
     *
     * @param urlToCrawl - URL to request.
     * @param crawlOpts - Optional scrapping options (timeout, user agent,
     *   headers, proxy, cookies, referer) applied to the request and passed
     *   on to `narrowSnapshot`.
     * @returns The snapshot returned by `jsdomControl.narrowSnapshot`.
     * @throws Rejects with the curl error when the transfer fails, or with
     *   the thrown error when configuring or starting the transfer fails.
     */
    async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions) {
        const html = await new Promise<string>((resolve, reject) => {
            const curl = new Curl();

            curl.on('end', (statusCode, data, headers) => {
                this.logger.debug(`CURL: ${urlToCrawl}`, { statusCode, headers });
                resolve(data.toString());
                curl.close();
            });

            curl.on('error', (err) => {
                this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) });
                curl.close();
                reject(err);
            });

            try {
                this.applyCurlOptions(curl, urlToCrawl, crawlOpts);
                curl.perform();
            } catch (err) {
                // A synchronous failure never reaches the 'end'/'error'
                // handlers, so release the native handle here to avoid a leak.
                curl.close();
                reject(err);
            }
        });

        // Title and text start empty; only href and html are known here.
        const snapshot = {
            href: urlToCrawl.toString(),
            html: html,
            title: '',
            text: '',
        } as PageSnapshot;

        const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);

        return curlSnapshot!;
    }

}
|
@ -1 +1 @@
|
|||||||
Subproject commit 98e9bf19bc6859c79eff516275cf1120e59e47bf
|
Subproject commit 439f633d464f3fd5fe288313766a43163190b60f
|
Loading…
x
Reference in New Issue
Block a user