self-host: remove old playwright-service

This commit is contained in:
Gergő Móricz 2025-02-20 15:39:19 +01:00
parent d533718312
commit 62d11f21fc
8 changed files with 0 additions and 369 deletions

View File

@ -1,152 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

View File

@ -1,38 +0,0 @@
FROM python:3.11-slim
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
libstdc++6
WORKDIR /app
# Install Python dependencies
COPY requirements.txt ./
# Remove py which is pulled in by retry, py is not needed and is a CVE
RUN pip install --no-cache-dir --upgrade -r requirements.txt && \
pip uninstall -y py && \
playwright install chromium && playwright install-deps chromium && \
ln -s /usr/local/bin/supervisord /usr/bin/supervisord
# Cleanup for CVEs and size reduction
# https://github.com/tornadoweb/tornado/issues/3107
# xserver-common and xvfb included by playwright installation but not needed after
# perl-base is part of the base Python Debian image but not needed for Danswer functionality
# perl-base could only be removed with --allow-remove-essential
COPY . ./
EXPOSE $PORT
# run fast api hypercorn
CMD hypercorn main:app --bind [::]:$PORT
# CMD ["hypercorn", "main:app", "--bind", "[::]:$PORT"]
# CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port $PORT"]

View File

@ -1,63 +0,0 @@
def get_error(status_code: int) -> str:
error_messages = {
300: "Multiple Choices",
301: "Moved Permanently",
302: "Found",
303: "See Other",
304: "Not Modified",
305: "Use Proxy",
307: "Temporary Redirect",
308: "Permanent Redirect",
309: "Resume Incomplete",
310: "Too Many Redirects",
311: "Unavailable For Legal Reasons",
312: "Previously Used",
313: "I'm Used",
314: "Switch Proxy",
315: "Temporary Redirect",
316: "Resume Incomplete",
317: "Too Many Redirects",
400: "Bad Request",
401: "Unauthorized",
403: "Forbidden",
404: "Not Found",
405: "Method Not Allowed",
406: "Not Acceptable",
407: "Proxy Authentication Required",
408: "Request Timeout",
409: "Conflict",
410: "Gone",
411: "Length Required",
412: "Precondition Failed",
413: "Payload Too Large",
414: "URI Too Long",
415: "Unsupported Media Type",
416: "Range Not Satisfiable",
417: "Expectation Failed",
418: "I'm a teapot",
421: "Misdirected Request",
422: "Unprocessable Entity",
423: "Locked",
424: "Failed Dependency",
425: "Too Early",
426: "Upgrade Required",
428: "Precondition Required",
429: "Too Many Requests",
431: "Request Header Fields Too Large",
451: "Unavailable For Legal Reasons",
500: "Internal Server Error",
501: "Not Implemented",
502: "Bad Gateway",
503: "Service Unavailable",
504: "Gateway Timeout",
505: "HTTP Version Not Supported",
506: "Variant Also Negotiates",
507: "Insufficient Storage",
508: "Loop Detected",
510: "Not Extended",
511: "Network Authentication Required",
599: "Network Connect Timeout Error"
}
if status_code < 300:
return None
return error_messages.get(status_code, "Unknown Error")

View File

@ -1,111 +0,0 @@
"""
This module provides a FastAPI application that uses Playwright to fetch and return
the HTML content of a specified URL. It supports optional proxy settings and media blocking.
"""
from os import environ
from fastapi import FastAPI, Response
from fastapi.responses import JSONResponse
from playwright.async_api import Browser, async_playwright
from pydantic import BaseModel
from get_error import get_error
PROXY_SERVER = environ.get("PROXY_SERVER", None)
PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
PROXY_PASSWORD = environ.get("PROXY_PASSWORD", None)
BLOCK_MEDIA = environ.get("BLOCK_MEDIA", "False").upper() == "TRUE"
app = FastAPI()
class UrlModel(BaseModel):
"""Model representing the URL and associated parameters for the request."""
url: str
wait_after_load: int = 0
timeout: int = 15000
headers: dict = None
browser: Browser = None
@app.on_event("startup")
async def startup_event():
"""Event handler for application startup to initialize the browser."""
global browser
playwright = await async_playwright().start()
browser = await playwright.chromium.launch()
@app.on_event("shutdown")
async def shutdown_event():
"""Event handler for application shutdown to close the browser."""
await browser.close()
@app.get("/health/liveness")
def liveness_probe():
"""Endpoint for liveness probe."""
return JSONResponse(content={"status": "ok"}, status_code=200)
@app.get("/health/readiness")
async def readiness_probe():
"""Endpoint for readiness probe. Checks if the browser instance is ready."""
if browser:
return JSONResponse(content={"status": "ok"}, status_code=200)
return JSONResponse(content={"status": "Service Unavailable"}, status_code=503)
@app.post("/html")
async def root(body: UrlModel):
"""
Endpoint to fetch and return HTML content of a given URL.
Args:
body (UrlModel): The URL model containing the target URL, wait time, and timeout.
Returns:
JSONResponse: The HTML content of the page.
"""
context = None
if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
context = await browser.new_context(
proxy={
"server": PROXY_SERVER,
"username": PROXY_USERNAME,
"password": PROXY_PASSWORD,
}
)
else:
context = await browser.new_context()
if BLOCK_MEDIA:
await context.route(
"**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}",
handler=lambda route, request: route.abort(),
)
page = await context.new_page()
# Set headers if provided
if body.headers:
await page.set_extra_http_headers(body.headers)
response = await page.goto(
body.url,
wait_until="load",
timeout=body.timeout,
)
page_status_code = response.status
page_error = get_error(page_status_code)
# Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases were "load" / "networkidle" is not enough
if body.wait_after_load > 0:
await page.wait_for_timeout(body.wait_after_load)
page_content = await page.content()
await context.close()
json_compatible_item_data = {
"content": page_content,
"pageStatusCode": page_status_code,
}
if page_error is not None:
json_compatible_item_data["pageError"] = page_error
return JSONResponse(content=json_compatible_item_data)

View File

@ -1,4 +0,0 @@
hypercorn==0.17.3
fastapi==0.111.0
playwright==1.44.0
uvicorn

View File

@ -1 +0,0 @@
3.11