diff --git a/.gitignore b/.gitignore index a070a88f..71ba6dd2 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,7 @@ apps/test-suite/node_modules/ apps/test-suite/.env apps/test-suite/logs -apps/test-suite/load-test-results/test-run-report.json \ No newline at end of file +apps/test-suite/load-test-results/test-run-report.json + +apps/playwright-service-ts/node_modules/ +apps/playwright-service-ts/package-lock.json \ No newline at end of file diff --git a/apps/playwright-service-ts/README.md b/apps/playwright-service-ts/README.md new file mode 100644 index 00000000..3e6d0aa1 --- /dev/null +++ b/apps/playwright-service-ts/README.md @@ -0,0 +1,47 @@ +# Playwright Scrape API + +This is a simple web scraping service built with Express and Playwright. + +## Features + +- Scrapes HTML content from specified URLs. +- Blocks requests to known ad-serving domains. +- Blocks media files to reduce bandwidth usage when `BLOCK_MEDIA=true` is set. +- Uses random user-agent strings to avoid detection. +- Retries with a `networkidle` wait when the initial page load fails, to ensure the page is fully rendered. + +## Install +```bash +npm install +npx playwright install +``` + +## RUN +```bash +npm run build +npm start +``` +OR +```bash +npm run dev +``` + +## USE + +```bash +curl -X POST http://localhost:3003/scrape \ +-H "Content-Type: application/json" \ +-d '{ + "url": "https://example.com", + "wait_after_load": 1000, + "timeout": 15000, + "headers": { + "Custom-Header": "value" + }, + "check_selector": "#content" +}' +``` + +## USING WITH FIRECRAWL + +Add `PLAYWRIGHT_MICROSERVICE_URL=http://localhost:3003/scrape` to `/apps/api/.env` to configure the API to use this Playwright microservice for scraping operations. 
diff --git a/apps/playwright-service-ts/api.ts b/apps/playwright-service-ts/api.ts
new file mode 100644
index 00000000..90a4eb87
--- /dev/null
+++ b/apps/playwright-service-ts/api.ts
@@ -0,0 +1,319 @@
+import express, { Request, Response } from 'express';
+import bodyParser from 'body-parser';
+import { chromium, Browser, BrowserContext, Page, Route, Request as PlaywrightRequest } from 'playwright';
+import dotenv from 'dotenv';
+import randomUseragent from 'random-useragent';
+import { getError } from './helpers/get_error';
+
+dotenv.config();
+
+const app = express();
+const port = process.env.PORT || 3003;
+
+app.use(bodyParser.json());
+
+// Media blocking is opt-in: set BLOCK_MEDIA=true (case-insensitive) to enable it.
+const BLOCK_MEDIA = (process.env.BLOCK_MEDIA || 'False').toUpperCase() === 'TRUE';
+
+// Optional upstream proxy; credentials are used only when both are provided.
+const PROXY_SERVER = process.env.PROXY_SERVER || null;
+const PROXY_USERNAME = process.env.PROXY_USERNAME || null;
+const PROXY_PASSWORD = process.env.PROXY_PASSWORD || null;
+
+// Requests to these domains, or to any of their subdomains, are aborted.
+const AD_SERVING_DOMAINS = [
+  'doubleclick.net',
+  'adservice.google.com',
+  'googlesyndication.com',
+  'googletagservices.com',
+  'googletagmanager.com',
+  'google-analytics.com',
+  'adsystem.com',
+  'adservice.com',
+  'adnxs.com',
+  'ads-twitter.com',
+  'facebook.net',
+  'fbcdn.net',
+  'amazon-adsystem.com'
+];
+
+/** JSON body accepted by `POST /scrape`. */
+interface UrlModel {
+  url: string;
+  wait_after_load?: number;
+  timeout?: number;
+  headers?: { [key: string]: string };
+  check_selector?: string;
+}
+
+/** Options object accepted by `browser.newContext()`. */
+type ContextOptions = NonNullable<Parameters<Browser['newContext']>[0]>;
+
+let browser: Browser | undefined;
+let context: BrowserContext | undefined;
+// Pending initialization, shared so concurrent callers never launch two browsers.
+let contextReady: Promise<BrowserContext> | undefined;
+
+/** Returns true when `hostname` is a blocked ad domain or one of its subdomains. */
+const isAdServingHost = (hostname: string): boolean =>
+  AD_SERVING_DOMAINS.some(domain => hostname === domain || hostname.endsWith(`.${domain}`));
+
+/**
+ * Launches headless Chromium, creates the shared browser context (random
+ * user agent, optional proxy) and installs the request-blocking routes.
+ *
+ * @returns The new context, which is also stored in `context`.
+ * @throws Any Playwright launch/setup error; the browser is closed first.
+ */
+const initializeBrowser = async (): Promise<BrowserContext> => {
+  const launched = await chromium.launch({
+    headless: true,
+    args: [
+      '--no-sandbox',
+      '--disable-setuid-sandbox',
+      '--disable-dev-shm-usage',
+      '--disable-accelerated-2d-canvas',
+      '--no-first-run',
+      '--no-zygote',
+      '--single-process',
+      '--disable-gpu'
+    ]
+  });
+
+  try {
+    const contextOptions: ContextOptions = {
+      userAgent: randomUseragent.getRandom() || undefined,
+      viewport: { width: 1280, height: 800 },
+    };
+
+    if (PROXY_SERVER && PROXY_USERNAME && PROXY_PASSWORD) {
+      contextOptions.proxy = {
+        server: PROXY_SERVER,
+        username: PROXY_USERNAME,
+        password: PROXY_PASSWORD,
+      };
+    } else if (PROXY_SERVER) {
+      contextOptions.proxy = {
+        server: PROXY_SERVER,
+      };
+    }
+
+    const created = await launched.newContext(contextOptions);
+
+    if (BLOCK_MEDIA) {
+      await created.route('**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}', (route: Route) => route.abort());
+    }
+
+    // Intercept all requests to avoid loading ads. Playwright runs the most
+    // recently registered matching handler first, so non-ad requests must be
+    // passed on with fallback(): continue() would send them straight to the
+    // network and the media-blocking route above would never run.
+    await created.route('**/*', (route: Route, request: PlaywrightRequest) => {
+      const hostname = new URL(request.url()).hostname;
+
+      if (isAdServingHost(hostname)) {
+        console.log(hostname);
+        return route.abort();
+      }
+      return route.fallback();
+    });
+
+    browser = launched;
+    context = created;
+    return created;
+  } catch (error) {
+    // Do not leak the Chromium process when context setup fails.
+    await launched.close().catch(() => undefined);
+    throw error;
+  }
+};
+
+/**
+ * Returns the shared context, launching the browser on first use.
+ * Concurrent callers await the same in-flight initialization.
+ */
+const ensureContext = async (): Promise<BrowserContext> => {
+  if (browser && context) {
+    return context;
+  }
+
+  const pending = contextReady ?? initializeBrowser();
+  contextReady = pending;
+  try {
+    return await pending;
+  } finally {
+    // Cleared on success and on failure so that a failed launch can be retried.
+    if (contextReady === pending) {
+      contextReady = undefined;
+    }
+  }
+};
+
+/** Closes the shared context and browser, if they exist. */
+const shutdownBrowser = async (): Promise<void> => {
+  const openContext = context;
+  const openBrowser = browser;
+  context = undefined;
+  browser = undefined;
+
+  if (openContext) {
+    await openContext.close();
+  }
+  if (openBrowser) {
+    await openBrowser.close();
+  }
+};
+
+/**
+ * Returns true only for well-formed http(s) URLs. Other schemes (file:,
+ * chrome:, data:, ...) are rejected so callers cannot point the browser at
+ * local resources.
+ */
+const isValidUrl = (urlString: string): boolean => {
+  try {
+    const { protocol } = new URL(urlString);
+    return protocol === 'http:' || protocol === 'https:';
+  } catch (_) {
+    return false;
+  }
+};
+
+/**
+ * Navigates `page` to `url` and returns the rendered HTML and HTTP status.
+ *
+ * @param waitUntil Navigation event to wait for.
+ * @param waitAfterLoad Extra delay in ms after navigation (skipped when <= 0).
+ * @param timeout Timeout in ms for navigation and for the selector wait.
+ * @param checkSelector Optional selector that must appear on the page.
+ * @throws If navigation fails or `checkSelector` does not appear in time.
+ */
+const scrapePage = async (page: Page, url: string, waitUntil: 'load' | 'networkidle', waitAfterLoad: number, timeout: number, checkSelector: string | undefined) => {
+  console.log(`Navigating to ${url} with waitUntil: ${waitUntil} and timeout: ${timeout}ms`);
+  const response = await page.goto(url, { waitUntil, timeout });
+
+  if (waitAfterLoad > 0) {
+    await page.waitForTimeout(waitAfterLoad);
+  }
+
+  if (checkSelector) {
+    try {
+      await page.waitForSelector(checkSelector, { timeout });
+    } catch (error) {
+      throw new Error('Required selector not found');
+    }
+  }
+
+  return {
+    content: await page.content(),
+    status: response ? response.status() : null,
+  };
+};
+
+/**
+ * Scrapes with a plain `load` wait first and retries once with `networkidle`
+ * if that attempt throws. An error from the second attempt propagates.
+ */
+const scrapeWithFallback = async (page: Page, url: string, waitAfterLoad: number, timeout: number, checkSelector: string | undefined) => {
+  try {
+    // Strategy 1: Normal
+    console.log('Attempting strategy 1: Normal load');
+    return await scrapePage(page, url, 'load', waitAfterLoad, timeout, checkSelector);
+  } catch (error) {
+    // Strategy 2: Wait until networkidle
+    console.log('Strategy 1 failed, attempting strategy 2: Wait until networkidle');
+    return scrapePage(page, url, 'networkidle', waitAfterLoad, timeout, checkSelector);
+  }
+};
+
+/**
+ * POST /scrape: renders the requested URL and responds with
+ * `{ content, pageStatusCode, pageError }`. Responds 400 for a missing or
+ * invalid URL and 500 when the page cannot be fetched.
+ */
+app.post('/scrape', async (req: Request, res: Response) => {
+  const { url, wait_after_load = 0, timeout = 15000, headers, check_selector }: UrlModel = req.body ?? {};
+
+  console.log(`================= Scrape Request =================`);
+  console.log(`URL: ${url}`);
+  console.log(`Wait After Load: ${wait_after_load}`);
+  console.log(`Timeout: ${timeout}`);
+  console.log(`Headers: ${headers ? JSON.stringify(headers) : 'None'}`);
+  console.log(`Check Selector: ${check_selector ? check_selector : 'None'}`);
+  console.log(`==================================================`);
+
+  if (!url) {
+    return res.status(400).json({ error: 'URL is required' });
+  }
+
+  // The body is untrusted JSON, so `url` may not actually be a string.
+  if (typeof url !== 'string' || !isValidUrl(url)) {
+    return res.status(400).json({ error: 'Invalid URL' });
+  }
+
+  if (!PROXY_SERVER) {
+    console.warn('⚠️ WARNING: No proxy server provided. Your IP address may be blocked.');
+  }
+
+  let page: Page | undefined;
+  try {
+    const activeContext = await ensureContext();
+    page = await activeContext.newPage();
+
+    // Set headers if provided
+    if (headers) {
+      await page.setExtraHTTPHeaders(headers);
+    }
+
+    const result = await scrapeWithFallback(page, url, wait_after_load, timeout, check_selector);
+    const pageStatusCode = result.status;
+    const pageError = pageStatusCode !== 200 ? getError(pageStatusCode) : false;
+
+    if (!pageError) {
+      console.log(`✅ Scrape successful!`);
+    } else {
+      console.log(`🚨 Scrape failed with status code: ${pageStatusCode} ${pageError}`);
+    }
+
+    return res.json({
+      content: result.content,
+      pageStatusCode,
+      pageError
+    });
+  } catch (error) {
+    // An async handler must not reject: Express 4 does not catch it.
+    console.error('Scrape failed:', error);
+    return res.status(500).json({ error: 'An error occurred while fetching the page.' });
+  } finally {
+    // Always release the page, including on error paths.
+    if (page) {
+      await page.close().catch(() => undefined);
+    }
+  }
+});
+
+app.listen(port, () => {
+  ensureContext()
+    .then(() => {
+      console.log(`Server is running on port ${port}`);
+    })
+    .catch((error) => {
+      console.error('Failed to launch the browser:', error);
+      process.exit(1);
+    });
+});
+
+/** Closes the browser, then exits; the exit code reflects whether that worked. */
+const handleShutdownSignal = () => {
+  shutdownBrowser()
+    .then(() => {
+      console.log('Browser closed');
+      process.exit(0);
+    })
+    .catch((error) => {
+      console.error('Failed to close the browser cleanly:', error);
+      process.exit(1);
+    });
+};
+
+process.on('SIGINT', handleShutdownSignal);
+process.on('SIGTERM', handleShutdownSignal);
diff --git a/apps/playwright-service-ts/helpers/get_error.ts b/apps/playwright-service-ts/helpers/get_error.ts
new file mode 100644
index 00000000..8cdb18d0
--- /dev/null
+++ b/apps/playwright-service-ts/helpers/get_error.ts
@@ -0,0 +1,75 @@
+// Inspired by the firecrawl repo (@rafaelsideguide).
+
+/**
+ * Reason phrases for the redirect, client-error and server-error status
+ * codes that are reported as scrape errors.
+ */
+const ERROR_MESSAGES: Readonly<Record<number, string>> = {
+  300: "Multiple Choices",
+  301: "Moved Permanently",
+  302: "Found",
+  303: "See Other",
+  304: "Not Modified",
+  305: "Use Proxy",
+  307: "Temporary Redirect",
+  308: "Permanent Redirect",
+  400: "Bad Request",
+  401: "Unauthorized",
+  403: "Forbidden",
+  404: "Not Found",
+  405: "Method Not Allowed",
+  406: "Not Acceptable",
+  407: "Proxy Authentication Required",
+  408: "Request Timeout",
+  409: "Conflict",
+  410: "Gone",
+  411: "Length Required",
+  412: "Precondition Failed",
+  413: "Payload Too Large",
+  414: "URI Too Long",
+  415: "Unsupported Media Type",
+  416: "Range Not Satisfiable",
+  417: "Expectation Failed",
+  418: "I'm a teapot",
+  421: "Misdirected Request",
+  422: "Unprocessable Entity",
+  423: "Locked",
+  424: "Failed Dependency",
+  425: "Too Early",
+  426: "Upgrade Required",
+  428: "Precondition Required",
+  429: "Too Many Requests",
+  431: "Request Header Fields Too Large",
+  451: "Unavailable For Legal Reasons",
+  500: "Internal Server Error",
+  501: "Not Implemented",
+  502: "Bad Gateway",
+  503: "Service Unavailable",
+  504: "Gateway Timeout",
+  505: "HTTP Version Not Supported",
+  506: "Variant Also Negotiates",
+  507: "Insufficient Storage",
+  508: "Loop Detected",
+  510: "Not Extended",
+  511: "Network Authentication Required",
+  599: "Network Connect Timeout Error"
+};
+
+/**
+ * Maps an HTTP status code to a human-readable error message.
+ *
+ * @param statusCode Status of the navigation response, or `null` when there was none.
+ * @returns `'No response received'` for `null`, `null` for any code below 300,
+ *   otherwise the reason phrase, or `'Unknown Error'` for unlisted codes.
+ */
+export const getError = (statusCode: number | null): string | null => {
+  if (statusCode === null) {
+    return "No response received";
+  }
+
+  if (statusCode < 300) {
+    return null;
+  }
+
+  return ERROR_MESSAGES[statusCode] ?? "Unknown Error";
+};
diff --git a/apps/playwright-service-ts/package.json b/apps/playwright-service-ts/package.json
new file mode 100644
index 00000000..fe15209f
--- /dev/null
+++ b/apps/playwright-service-ts/package.json
@@ -0,0 +1,28 @@
+{
+  "name": "playwright-scraper-api",
+  "version": "1.0.0",
+  "description": "scraper api with playwright",
+  "main": "dist/api.js",
+  "scripts": {
+    "start": "node dist/api.js",
+    "build": "tsc --outDir dist",
+    "dev": "ts-node api.ts"
+  },
+  "keywords": [],
+  "author": "Jeff Pereira",
+  "license": "ISC",
+  "dependencies": {
+    "body-parser": "^1.20.2",
+    "dotenv": "^16.4.5",
+    "express": "^4.19.2",
+    "playwright": "^1.45.0",
+    "random-useragent": "^0.5.0"
+  },
+  "devDependencies": {
+    "@types/express": "^4.17.21",
+    "@types/node": "^20.14.9",
+    "@types/random-useragent": "^0.3.3",
+    "ts-node": "^10.9.2",
+    "typescript": "^5.5.2"
+  }
+}
diff --git a/apps/playwright-service-ts/tsconfig.json b/apps/playwright-service-ts/tsconfig.json new file mode 100644 index 00000000..e71d09a0 --- /dev/null +++ b/apps/playwright-service-ts/tsconfig.json @@ -0,0 +1,110 @@ +{ + "compilerOptions": { + /* Visit https://aka.ms/tsconfig to read more about this file */ + + /* Projects */ + // "incremental": true, /* Save .tsbuildinfo files to allow for 
incremental compilation of projects. */ + // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */ + // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */ + // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */ + // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */ + // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ + + /* Language and Environment */ + "target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ + // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ + // "jsx": "preserve", /* Specify what JSX code is generated. */ + // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */ + // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */ + // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */ + // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */ + // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */ + // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */ + // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */ + // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. 
*/ + // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ + + /* Modules */ + "module": "commonjs", /* Specify what module code is generated. */ + // "rootDir": "./", /* Specify the root folder within your source files. */ + // "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */ + // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ + // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ + // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ + // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */ + // "types": [], /* Specify type package names to be included without being referenced in a source file. */ + // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ + // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */ + // "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */ + // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */ + // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */ + // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */ + // "resolveJsonModule": true, /* Enable importing .json files. */ + // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */ + // "noResolve": true, /* Disallow 'import's, 'require's or ''s from expanding the number of files TypeScript should add to a project. 
*/ + + /* JavaScript Support */ + // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */ + // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */ + // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */ + + /* Emit */ + // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ + // "declarationMap": true, /* Create sourcemaps for d.ts files. */ + // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ + // "sourceMap": true, /* Create source map files for emitted JavaScript files. */ + // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ + // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */ + // "outDir": "./", /* Specify an output folder for all emitted files. */ + // "removeComments": true, /* Disable emitting comments. */ + // "noEmit": true, /* Disable emitting files from a compilation. */ + // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */ + // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ + // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ + // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ + // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ + // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ + // "newLine": "crlf", /* Set the newline character for emitting files. 
*/ + // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */ + // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */ + // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ + // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ + // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ + + /* Interop Constraints */ + // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */ + // "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */ + // "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */ + // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */ + "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */ + // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */ + "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */ + + /* Type Checking */ + "strict": true, /* Enable all strict type-checking options. */ + // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */ + // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */ + // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. 
*/ + // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */ + // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */ + // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */ + // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */ + // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */ + // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */ + // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */ + // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */ + // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */ + // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ + // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */ + // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */ + // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */ + // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */ + // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ + + /* Completeness */ + // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ + "skipLibCheck": true /* Skip type checking all .d.ts files. */ + }, + "include": ["**/*"], + "exclude": ["node_modules", "**/*.spec.ts"] +}