From 62d11f21fc1fa5cab3506cf9eb37336755128cb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 20 Feb 2025 15:39:19 +0100 Subject: [PATCH] self-host: remove old playwright-service --- apps/playwright-service/.gitignore | 152 ----------------------- apps/playwright-service/Dockerfile | 38 ------ apps/playwright-service/README.md | 0 apps/playwright-service/get_error.py | 63 ---------- apps/playwright-service/main.py | 111 ----------------- apps/playwright-service/requests.http | 0 apps/playwright-service/requirements.txt | 4 - apps/playwright-service/runtime.txt | 1 - 8 files changed, 369 deletions(-) delete mode 100644 apps/playwright-service/.gitignore delete mode 100644 apps/playwright-service/Dockerfile delete mode 100644 apps/playwright-service/README.md delete mode 100644 apps/playwright-service/get_error.py delete mode 100644 apps/playwright-service/main.py delete mode 100644 apps/playwright-service/requests.http delete mode 100644 apps/playwright-service/requirements.txt delete mode 100644 apps/playwright-service/runtime.txt diff --git a/apps/playwright-service/.gitignore b/apps/playwright-service/.gitignore deleted file mode 100644 index de2d5e08..00000000 --- a/apps/playwright-service/.gitignore +++ /dev/null @@ -1,152 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ diff --git a/apps/playwright-service/Dockerfile b/apps/playwright-service/Dockerfile deleted file mode 100644 index 5f7d2db9..00000000 --- a/apps/playwright-service/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -FROM python:3.11-slim - -ENV PYTHONUNBUFFERED=1 -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PIP_DISABLE_PIP_VERSION_CHECK=1 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - gcc \ - libstdc++6 - -WORKDIR /app - -# Install Python dependencies -COPY requirements.txt ./ - -# Remove py which is pulled in by retry, py is not needed and is a CVE -RUN pip install --no-cache-dir --upgrade -r requirements.txt && \ - pip uninstall -y py && \ - playwright install chromium && playwright install-deps chromium && \ - ln -s /usr/local/bin/supervisord /usr/bin/supervisord - -# Cleanup for CVEs and size reduction -# https://github.com/tornadoweb/tornado/issues/3107 -# xserver-common and xvfb included by playwright installation but not needed after -# perl-base is part of the base Python Debian image but not needed for Danswer functionality -# perl-base could only be removed with --allow-remove-essential - - - - - -COPY . ./ - -EXPOSE $PORT -# run fast api hypercorn -CMD hypercorn main:app --bind [::]:$PORT -# CMD ["hypercorn", "main:app", "--bind", "[::]:$PORT"] -# CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port $PORT"] diff --git a/apps/playwright-service/README.md b/apps/playwright-service/README.md deleted file mode 100644 index e69de29b..00000000 diff --git a/apps/playwright-service/get_error.py b/apps/playwright-service/get_error.py deleted file mode 100644 index a33de5e0..00000000 --- a/apps/playwright-service/get_error.py +++ /dev/null @@ -1,63 +0,0 @@ -def get_error(status_code: int) -> str: - error_messages = { - 300: "Multiple Choices", - 301: "Moved Permanently", - 302: "Found", - 303: "See Other", - 304: "Not Modified", - 305: "Use Proxy", - 307: "Temporary Redirect", - 308: "Permanent Redirect", - 309: "Resume Incomplete", - 310: "Too Many Redirects", - 311: "Unavailable For Legal Reasons", - 312: "Previously Used", - 313: "I'm Used", - 314: "Switch Proxy", - 315: "Temporary Redirect", - 316: "Resume Incomplete", - 317: "Too Many Redirects", - 400: "Bad Request", - 401: "Unauthorized", - 403: "Forbidden", - 404: "Not Found", - 405: "Method Not Allowed", - 406: "Not Acceptable", - 407: "Proxy Authentication Required", - 408: "Request Timeout", - 409: "Conflict", - 410: "Gone", - 411: "Length Required", - 412: "Precondition Failed", - 413: "Payload Too Large", - 414: "URI Too Long", - 415: "Unsupported Media Type", - 416: "Range Not Satisfiable", - 417: "Expectation Failed", - 418: "I'm a teapot", - 421: "Misdirected Request", - 422: "Unprocessable Entity", - 423: "Locked", - 424: "Failed Dependency", - 425: "Too Early", - 426: "Upgrade Required", - 428: "Precondition Required", - 429: "Too Many Requests", - 431: "Request Header Fields Too Large", - 451: "Unavailable For Legal Reasons", - 500: "Internal Server Error", - 501: "Not Implemented", - 502: "Bad Gateway", - 503: "Service Unavailable", - 504: "Gateway Timeout", - 505: "HTTP Version Not Supported", - 506: "Variant Also Negotiates", - 507: "Insufficient Storage", - 508: "Loop Detected", - 510: "Not Extended", - 511: "Network Authentication Required", - 599: "Network Connect Timeout Error" - } - if status_code < 300: - return None - return error_messages.get(status_code, "Unknown Error") diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py deleted file mode 100644 index 3607dd6b..00000000 --- a/apps/playwright-service/main.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -This module provides a FastAPI application that uses Playwright to fetch and return -the HTML content of a specified URL. It supports optional proxy settings and media blocking. -""" - -from os import environ - -from fastapi import FastAPI, Response -from fastapi.responses import JSONResponse -from playwright.async_api import Browser, async_playwright -from pydantic import BaseModel -from get_error import get_error - -PROXY_SERVER = environ.get("PROXY_SERVER", None) -PROXY_USERNAME = environ.get("PROXY_USERNAME", None) -PROXY_PASSWORD = environ.get("PROXY_PASSWORD", None) -BLOCK_MEDIA = environ.get("BLOCK_MEDIA", "False").upper() == "TRUE" - -app = FastAPI() - -class UrlModel(BaseModel): - """Model representing the URL and associated parameters for the request.""" - url: str - wait_after_load: int = 0 - timeout: int = 15000 - headers: dict = None - -browser: Browser = None - -@app.on_event("startup") -async def startup_event(): - """Event handler for application startup to initialize the browser.""" - global browser - playwright = await async_playwright().start() - browser = await playwright.chromium.launch() - -@app.on_event("shutdown") -async def shutdown_event(): - """Event handler for application shutdown to close the browser.""" - await browser.close() - -@app.get("/health/liveness") -def liveness_probe(): - """Endpoint for liveness probe.""" - return JSONResponse(content={"status": "ok"}, status_code=200) - - -@app.get("/health/readiness") -async def readiness_probe(): - """Endpoint for readiness probe. Checks if the browser instance is ready.""" - if browser: - return JSONResponse(content={"status": "ok"}, status_code=200) - return JSONResponse(content={"status": "Service Unavailable"}, status_code=503) - - -@app.post("/html") -async def root(body: UrlModel): - """ - Endpoint to fetch and return HTML content of a given URL. - - Args: - body (UrlModel): The URL model containing the target URL, wait time, and timeout. - - Returns: - JSONResponse: The HTML content of the page. - """ - context = None - if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD: - context = await browser.new_context( - proxy={ - "server": PROXY_SERVER, - "username": PROXY_USERNAME, - "password": PROXY_PASSWORD, - } - ) - else: - context = await browser.new_context() - - if BLOCK_MEDIA: - await context.route( - "**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}", - handler=lambda route, request: route.abort(), - ) - - page = await context.new_page() - - # Set headers if provided - if body.headers: - await page.set_extra_http_headers(body.headers) - - response = await page.goto( - body.url, - wait_until="load", - timeout=body.timeout, - ) - page_status_code = response.status - page_error = get_error(page_status_code) - # Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases were "load" / "networkidle" is not enough - if body.wait_after_load > 0: - await page.wait_for_timeout(body.wait_after_load) - - page_content = await page.content() - await context.close() - json_compatible_item_data = { - "content": page_content, - "pageStatusCode": page_status_code, - } - - if page_error is not None: - json_compatible_item_data["pageError"] = page_error - return JSONResponse(content=json_compatible_item_data) diff --git a/apps/playwright-service/requests.http b/apps/playwright-service/requests.http deleted file mode 100644 index e69de29b..00000000 diff --git a/apps/playwright-service/requirements.txt b/apps/playwright-service/requirements.txt deleted file mode 100644 index fd3d1232..00000000 --- a/apps/playwright-service/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -hypercorn==0.17.3 -fastapi==0.111.0 -playwright==1.44.0 -uvicorn \ No newline at end of file diff --git a/apps/playwright-service/runtime.txt b/apps/playwright-service/runtime.txt deleted file mode 100644 index 902b2c90..00000000 --- a/apps/playwright-service/runtime.txt +++ /dev/null @@ -1 +0,0 @@ -3.11 \ No newline at end of file