diff --git a/src/api/crawler.ts b/src/api/crawler.ts index 7c7ad5d..2e4ad2b 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -32,14 +32,16 @@ import { GlobalLogger } from '../services/logger'; import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit'; import { AsyncLocalContext } from '../services/async-context'; import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry'; -import { BudgetExceededError, InsufficientBalanceError, SecurityCompromiseError } from '../services/errors'; +import { + BudgetExceededError, InsufficientBalanceError, + SecurityCompromiseError, ServiceBadApproachError, ServiceBadAttemptError +} from '../services/errors'; import { countGPTToken as estimateToken } from '../shared/utils/openai'; import { ProxyProvider } from '../shared/services/proxy-provider'; import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; import { RobotsTxtService } from '../services/robots-text'; -import { ServiceBadAttemptError } from '../shared/lib/errors'; export interface ExtraScrappingOptions extends ScrappingOptions { withIframe?: boolean | 'quoted'; @@ -758,7 +760,9 @@ export class CrawlerHost extends RPCHost { let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html); draftSnapshot.title ??= analyzed.title; let fallbackProxyIsUsed = false; - if ((!crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) && (analyzed.tokens < 42 || sideLoaded.status !== 200)) { + if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) && + (analyzed.tokens < 42 || sideLoaded.status !== 200) + ) { const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts); if (!proxyLoaded.file) { throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`); @@ -904,7 +908,7 @@ export class CrawlerHost extends RPCHost { } this.threadLocal.set('retainImages', opts.retainImages); this.threadLocal.set('noGfm', opts.noGfm); - this.threadLocal.set('DNT', Boolean(opts.doNotTrack)) + this.threadLocal.set('DNT', Boolean(opts.doNotTrack)); const crawlOpts: ExtraScrappingOptions = { proxyUrl: opts.proxyUrl, @@ -1146,6 +1150,9 @@ export class CrawlerHost extends RPCHost { } @retryWith((err) => { + if (err instanceof ServiceBadApproachError) { + return false; + } if (err instanceof ServiceBadAttemptError) { // Keep trying return true; @@ -1157,6 +1164,9 @@ export class CrawlerHost extends RPCHost { return undefined; }, 3) async sideLoadWithAllocatedProxy(url: URL, opts?: ExtraScrappingOptions) { + if (opts?.allocProxy === 'none') { + return this.curlControl.sideLoad(url, opts); + } const proxy = await this.proxyProvider.alloc(opts?.allocProxy); const r = await this.curlControl.sideLoad(url, { ...opts, diff --git a/src/services/alt-text.ts b/src/services/alt-text.ts index ea27621..b206909 100644 --- a/src/services/alt-text.ts +++ b/src/services/alt-text.ts @@ -1,6 +1,6 @@ import { AssertionFailureError, AsyncService, HashManager } from 'civkit'; import { singleton } from 'tsyringe'; -import { Logger } from '../shared/services/logger'; +import { GlobalLogger } from './logger'; import { CanvasService } from '../shared/services/canvas'; import { ImageInterrogationManager } from '../shared/services/common-iminterrogate'; import { ImgBrief } from './puppeteer'; @@ -16,7 +16,7 @@ export class AltTextService extends AsyncService { logger = this.globalLogger.child({ service: this.constructor.name }); constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, protected imageInterrogator: ImageInterrogationManager, protected canvasService: CanvasService, protected asyncLocalContext: AsyncLocalContext diff --git a/src/services/brave-search.ts b/src/services/brave-search.ts index ec83c45..8dcfe2c 100644 --- a/src/services/brave-search.ts +++ b/src/services/brave-search.ts @@ -1,10 +1,10 @@ import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit'; import { singleton } from 'tsyringe'; -import { Logger } from '../shared/services/logger'; +import { GlobalLogger } from './logger'; import { SecretExposer } from '../shared/services/secrets'; import { BraveSearchHTTP, WebSearchQueryParams } from '../shared/3rd-party/brave-search'; import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip'; -import { AsyncContext } from '../shared'; +import { AsyncLocalContext } from './async-context'; import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types'; import type { Request, Response } from 'express'; import { BlackHoleDetector } from './blackhole-detector'; @@ -17,10 +17,10 @@ export class BraveSearchService extends AsyncService { braveSearchHTTP!: BraveSearchHTTP; constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, protected secretExposer: SecretExposer, protected geoipControl: GeoIPService, - protected threadLocal: AsyncContext, + protected threadLocal: AsyncLocalContext, protected blackHoleDetector: BlackHoleDetector, ) { super(...arguments); diff --git a/src/services/cf-browser-rendering.ts b/src/services/cf-browser-rendering.ts index 1bb724c..72fa469 100644 --- a/src/services/cf-browser-rendering.ts +++ b/src/services/cf-browser-rendering.ts @@ -1,6 +1,7 @@ import { container, singleton } from 'tsyringe'; import { AsyncService } from 'civkit/async-service'; -import { Logger, SecretExposer } from '../shared'; +import { SecretExposer } from '../shared/services/secrets'; +import { GlobalLogger } from './logger'; import { CloudFlareHTTP } from '../shared/3rd-party/cloud-flare'; @singleton() @@ -10,7 +11,7 @@ export class CFBrowserRendering extends AsyncService { client!: CloudFlareHTTP; constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, protected secretExposer: SecretExposer, ) { super(...arguments); diff --git a/src/services/curl.ts b/src/services/curl.ts index e3903da..55453c8 100644 --- a/src/services/curl.ts +++ b/src/services/curl.ts @@ -5,9 +5,10 @@ import { Curl, CurlCode, CurlFeature, HeaderInfo } from 'node-libcurl'; import { parseString as parseSetCookieString } from 'set-cookie-parser'; import { ScrappingOptions } from './puppeteer'; -import { Logger } from '../shared/services/logger'; +import { GlobalLogger } from './logger'; import { AssertionFailureError, FancyFile } from 'civkit'; -import { ServiceBadAttemptError, TempFileManager } from '../shared'; +import { ServiceBadAttemptError, ServiceBadApproachError } from './errors'; +import { TempFileManager } from '../services/temp-file'; import { createBrotliDecompress, createInflate, createGunzip } from 'zlib'; import { ZSTDDecompress } from 'simple-zstd'; import _ from 'lodash'; @@ -32,7 +33,7 @@ export class CurlControl extends AsyncService { lifeCycleTrack = new WeakMap(); constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, protected tempFileManager: TempFileManager, protected asyncLocalContext: AsyncLocalContext, ) { @@ -328,7 +329,7 @@ export class CurlControl extends AsyncService { }; } if (!location && cookieRedirects > 1) { - throw new ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Browser required to solve complex cookie preconditions.`); + throw new ServiceBadApproachError(`Failed to access ${urlToCrawl}: Browser required to solve complex cookie preconditions.`); } nextHopUrl = new URL(location || '', nextHopUrl); diff --git a/src/services/errors.ts b/src/services/errors.ts index f867021..44246db 100644 --- a/src/services/errors.ts +++ b/src/services/errors.ts @@ -14,6 +14,12 @@ export class ServiceCrashedError extends ApplicationError { } @StatusCode(50303) export class ServiceNodeResourceDrainError extends ApplicationError { } +@StatusCode(50304) +export class ServiceBadAttemptError extends ApplicationError { } + +@StatusCode(50305) +export class ServiceBadApproachError extends ServiceBadAttemptError { } + @StatusCode(40104) export class EmailUnverifiedError extends ApplicationError { } diff --git a/src/services/geoip.ts b/src/services/geoip.ts index b6b4781..5ff38ee 100644 --- a/src/services/geoip.ts +++ b/src/services/geoip.ts @@ -2,7 +2,7 @@ import { container, singleton } from 'tsyringe'; import fsp from 'fs/promises'; import { CityResponse, Reader } from 'maxmind'; import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit'; -import { Logger } from '../shared'; +import { GlobalLogger } from './logger'; import path from 'path'; export enum GEOIP_SUPPORTED_LANGUAGES { @@ -61,7 +61,7 @@ export class GeoIPService extends AsyncService { mmdbCity!: Reader; constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, ) { super(...arguments); } diff --git a/src/services/jsdom.ts b/src/services/jsdom.ts index b79b0a3..1bf2ac0 100644 --- a/src/services/jsdom.ts +++ b/src/services/jsdom.ts @@ -1,13 +1,13 @@ import { container, singleton } from 'tsyringe'; import { AsyncService, marshalErrorLike } from 'civkit'; -import { Logger } from '../shared/services/logger'; +import { GlobalLogger } from './logger'; import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer'; import { Readability } from '@mozilla/readability'; import TurndownService from 'turndown'; import { Threaded } from '../services/threaded'; import type { ExtraScrappingOptions } from '../api/crawler'; import { tailwindClasses } from '../utils/tailwind-classes'; -import { countGPTToken } from '../shared'; +import { countGPTToken } from '../shared/utils/openai'; const pLinkedom = import('linkedom'); @@ -19,7 +19,7 @@ export class JSDomControl extends AsyncService { linkedom!: Awaited; constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, ) { super(...arguments); } diff --git a/src/services/lm.ts b/src/services/lm.ts index f3991fb..f7ff56b 100644 --- a/src/services/lm.ts +++ b/src/services/lm.ts @@ -2,7 +2,7 @@ import { AsyncService } from 'civkit/async-service'; import { singleton } from 'tsyringe'; import { PageSnapshot } from './puppeteer'; -import { Logger } from '../shared/services/logger'; +import { GlobalLogger } from './logger'; import _ from 'lodash'; import { AssertionFailureError } from 'civkit'; import { LLMManager } from '../shared/services/common-llm'; @@ -16,7 +16,7 @@ export class LmControl extends AsyncService { logger = this.globalLogger.child({ service: this.constructor.name }); constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, protected commonLLM: LLMManager, protected jsdomControl: JSDomControl, ) { diff --git a/src/services/pdf-extract.ts b/src/services/pdf-extract.ts index edbab1e..4a8e388 100644 --- a/src/services/pdf-extract.ts +++ b/src/services/pdf-extract.ts @@ -3,10 +3,10 @@ import { singleton } from 'tsyringe'; import _ from 'lodash'; import { TextItem } from 'pdfjs-dist/types/src/display/api'; import { AsyncService, HashManager } from 'civkit'; -import { Logger } from '../shared/services/logger'; +import { GlobalLogger } from './logger'; import { PDFContent } from '../db/pdf'; import dayjs from 'dayjs'; -import { FirebaseStorageBucketControl } from '../shared'; +import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; import { randomUUID } from 'crypto'; import type { PDFDocumentLoadingTask } from 'pdfjs-dist'; import path from 'path'; @@ -55,7 +55,7 @@ export class PDFExtractor extends AsyncService { cacheRetentionMs = 1000 * 3600 * 24 * 7; constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, protected firebaseObjectStorage: FirebaseStorageBucketControl, protected asyncLocalContext: AsyncLocalContext, ) { diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index a3391bf..446a428 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -2,14 +2,13 @@ import os from 'os'; import fs from 'fs'; import { container, singleton } from 'tsyringe'; import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick, ParamValidationError, FancyFile } from 'civkit'; -import { Logger } from '../shared/services/logger'; +import { GlobalLogger } from './logger'; import type { Browser, CookieParam, GoToOptions, HTTPResponse, Page, Viewport } from 'puppeteer'; import type { Cookie } from 'set-cookie-parser'; import puppeteer from 'puppeteer-extra'; import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources'; -import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy'; import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors'; import { TimeoutError } from 'puppeteer'; import _ from 'lodash'; @@ -108,9 +107,6 @@ puppeteer.use(puppeteerBlockResources({ blockedTypes: new Set(['media']), interceptResolutionPriority: 1, })); -puppeteer.use(puppeteerPageProxy({ - interceptResolutionPriority: 1, -})); const SIMULATE_SCROLL = ` (function () { @@ -472,7 +468,7 @@ export class PuppeteerControl extends AsyncService { lifeCycleTrack = new WeakMap(); constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, protected asyncLocalContext: AsyncLocalContext, protected curlControl: CurlControl, protected blackHoleDetector: BlackHoleDetector, diff --git a/src/services/robots-text.ts b/src/services/robots-text.ts index 4e786fb..657c13c 100644 --- a/src/services/robots-text.ts +++ b/src/services/robots-text.ts @@ -1,13 +1,12 @@ import { singleton } from 'tsyringe'; +import { URL } from 'url'; import { DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc'; import { AsyncService } from 'civkit/async-service'; import { HashManager } from 'civkit/hash'; import { marshalErrorLike } from 'civkit/lang'; -import { Logger } from '../shared/services/logger'; -import { BraveSearchHTTP } from '../shared/3rd-party/brave-search'; -import { FirebaseStorageBucketControl } from '../shared'; -import { URL } from 'url'; +import { GlobalLogger } from './logger'; +import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; import { Threaded } from '../services/threaded'; @@ -18,10 +17,8 @@ export class RobotsTxtService extends AsyncService { logger = this.globalLogger.child({ service: this.constructor.name }); - braveSearchHTTP!: BraveSearchHTTP; - constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, protected firebaseStorageBucketControl: FirebaseStorageBucketControl, ) { super(...arguments); diff --git a/src/services/serper-search.ts b/src/services/serper-search.ts index 3be7c11..4ec5930 100644 --- a/src/services/serper-search.ts +++ b/src/services/serper-search.ts @@ -1,9 +1,9 @@ import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit'; import { singleton } from 'tsyringe'; -import { Logger } from '../shared/services/logger'; +import { GlobalLogger } from './logger'; import { SecretExposer } from '../shared/services/secrets'; import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip'; -import { AsyncContext } from '../shared'; +import { AsyncLocalContext } from './async-context'; import { SerperGoogleHTTP, SerperSearchQueryParams, WORLD_COUNTRIES } from '../shared/3rd-party/serper-search'; import { BlackHoleDetector } from './blackhole-detector'; import { Context } from './registry'; @@ -16,10 +16,10 @@ export class SerperSearchService extends AsyncService { serperSearchHTTP!: SerperGoogleHTTP; constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, protected secretExposer: SecretExposer, protected geoipControl: GeoIPService, - protected threadLocal: AsyncContext, + protected threadLocal: AsyncLocalContext, protected blackHoleDetector: BlackHoleDetector, ) { super(...arguments); diff --git a/src/services/snapshot-formatter.ts b/src/services/snapshot-formatter.ts index 3fd6cb1..b3562ba 100644 --- a/src/services/snapshot-formatter.ts +++ b/src/services/snapshot-formatter.ts @@ -2,7 +2,7 @@ import { randomUUID } from 'crypto'; import { container, singleton } from 'tsyringe'; import { AssertionFailureError, AsyncService, FancyFile, HashManager, marshalErrorLike } from 'civkit'; import TurndownService, { Filter, Rule } from 'turndown'; -import { Logger } from '../shared/services/logger'; +import { GlobalLogger } from './logger'; import { PageSnapshot } from './puppeteer'; import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; import { AsyncContext } from '../shared/services/async-context'; @@ -16,7 +16,7 @@ import { STATUS_CODES } from 'http'; import type { CrawlerOptions } from '../dto/crawler-options'; import { readFile } from 'fs/promises'; import { pathToFileURL } from 'url'; -import { countGPTToken } from '../shared'; +import { countGPTToken } from '../shared/utils/openai'; export interface FormattedPage { @@ -82,7 +82,7 @@ export class SnapshotFormatter extends AsyncService { gfmNoTable = [highlightedCodeBlock, gfmPlugin.strikethrough, gfmPlugin.taskListItems]; constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, protected jsdomControl: JSDomControl, protected altTextService: AltTextService, protected pdfExtractor: PDFExtractor, diff --git a/thinapps-shared b/thinapps-shared index 20417f5..7556390 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 20417f5bb7f8c773a835304f0624a180b558ff65 +Subproject commit 755639081df7640733bb5f704460892a1a9059e7