mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-13 14:59:00 +08:00
Rework Dockerfile.scratch (#2525)
### What problem does this PR solve?

Rework Dockerfile.scratch:

- Multi-stage Dockerfile
- Removed conda
- Replaced pip with poetry
- Added missing dependencies and fixed package version conflicts
- Added deepdoc models

### Type of change

- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
This commit is contained in:
parent
4a6a2a0f1b
commit
d8a43416f5
@ -1,56 +1,91 @@
|
||||
FROM ubuntu:22.04
|
||||
# base stage
|
||||
FROM ubuntu:24.04 AS base
|
||||
USER root
|
||||
|
||||
WORKDIR /ragflow
|
||||
|
||||
RUN apt-get update && apt-get install -y wget curl build-essential libopenmpi-dev
|
||||
RUN rm -f /etc/apt/apt.conf.d/docker-clean \
|
||||
&& echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache
|
||||
|
||||
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
|
||||
bash ~/miniconda.sh -b -p /root/miniconda3 && \
|
||||
rm ~/miniconda.sh && ln -s /root/miniconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
|
||||
echo ". /root/miniconda3/etc/profile.d/conda.sh" >> ~/.bashrc && \
|
||||
echo "conda activate base" >> ~/.bashrc
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
apt update && apt-get --no-install-recommends install -y ca-certificates
|
||||
|
||||
ENV PATH /root/miniconda3/bin:$PATH
|
||||
# If you are located in China, you can use the Tsinghua mirror to speed up apt.
|
||||
RUN sed -i 's|http://archive.ubuntu.com|https://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list.d/ubuntu.sources
|
||||
|
||||
RUN conda create -y --name py11 python=3.11
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
apt update && apt install -y curl libpython3-dev nginx openmpi-bin openmpi-common libopenmpi-dev libglib2.0-0 libglx-mesa0 \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& curl -sSL https://install.python-poetry.org | python3 -
|
||||
|
||||
ENV CONDA_DEFAULT_ENV py11
|
||||
ENV CONDA_PREFIX /root/miniconda3/envs/py11
|
||||
ENV PATH $CONDA_PREFIX/bin:$PATH
|
||||
# Skip writing .pyc files and make the OpenMPI shared libraries visible to the
# dynamic loader. The library directory must be an absolute path: a relative
# "usr/lib/..." entry would be resolved against the process's current working
# directory. Any pre-existing LD_LIBRARY_PATH is appended only when it is
# non-empty, so the value never ends in a stray ":" (an empty search entry).
ENV PYTHONDONTWRITEBYTECODE=1 \
    LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/openmpi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
|
||||
|
||||
RUN curl -sL https://deb.nodesource.com/setup_14.x | bash -
|
||||
RUN apt-get install -y nodejs
|
||||
# Configure Poetry
|
||||
ENV POETRY_NO_INTERACTION=1
|
||||
ENV POETRY_VIRTUALENVS_IN_PROJECT=true
|
||||
ENV POETRY_VIRTUALENVS_CREATE=true
|
||||
ENV POETRY_REQUESTS_TIMEOUT=15
|
||||
|
||||
RUN apt-get install -y nginx
|
||||
# builder stage
|
||||
FROM base AS builder
|
||||
USER root
|
||||
|
||||
ADD ./web ./web
|
||||
ADD ./api ./api
|
||||
ADD ./conf ./conf
|
||||
ADD ./deepdoc ./deepdoc
|
||||
ADD ./rag ./rag
|
||||
ADD ./requirements.txt ./requirements.txt
|
||||
ADD ./agent ./agent
|
||||
ADD ./graphrag ./graphrag
|
||||
WORKDIR /ragflow
|
||||
|
||||
RUN apt install openmpi-bin openmpi-common libopenmpi-dev
|
||||
ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu/openmpi/lib:$LD_LIBRARY_PATH
|
||||
RUN rm /root/miniconda3/envs/py11/compiler_compat/ld
|
||||
RUN cd ./web && npm i --force && npm run build
|
||||
RUN conda run -n py11 pip install -i https://mirrors.aliyun.com/pypi/simple/ -r ./requirements.txt
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libglib2.0-0 libgl1-mesa-glx && \
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
apt update && apt install -y nodejs npm && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN conda run -n py11 pip install -i https://mirrors.aliyun.com/pypi/simple/ ollama
|
||||
RUN conda run -n py11 python -m nltk.downloader punkt
|
||||
RUN conda run -n py11 python -m nltk.downloader wordnet
|
||||
# If you are located in China, you can use the npmmirror (Taobao) registry to speed up npm and yarn.
|
||||
RUN npm config set registry https://registry.npmmirror.com/
|
||||
|
||||
# https://yarnpkg.com/getting-started/install
|
||||
COPY web web
|
||||
RUN cd web && npm install -g corepack && corepack enable && yarn install && yarn run build
|
||||
|
||||
# install dependencies from poetry.lock file
|
||||
COPY pyproject.toml poetry.toml poetry.lock ./
|
||||
RUN --mount=type=cache,target=/root/.cache/pypoetry,sharing=locked \
|
||||
/root/.local/bin/poetry install --sync --no-cache --no-root
|
||||
|
||||
# production stage
|
||||
FROM base AS production
|
||||
USER root
|
||||
|
||||
WORKDIR /ragflow
|
||||
|
||||
# Install python packages' dependencies
|
||||
# cv2 requires libGL.so.1
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
apt update && apt install -y --no-install-recommends nginx libgl1 vim less && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY web web
|
||||
COPY api api
|
||||
COPY conf conf
|
||||
COPY deepdoc deepdoc
|
||||
COPY rag rag
|
||||
COPY agent agent
|
||||
COPY graphrag graphrag
|
||||
COPY pyproject.toml poetry.toml poetry.lock ./
|
||||
|
||||
# Copy compiled web pages
|
||||
COPY --from=builder /ragflow/web/dist /ragflow/web/dist
|
||||
|
||||
# Copy Python environment and packages
|
||||
ENV VIRTUAL_ENV=/ragflow/.venv
|
||||
COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
|
||||
ENV PATH="${VIRTUAL_ENV}/bin:/root/.local/bin:${PATH}"
|
||||
|
||||
# Download nltk data
|
||||
RUN python3 -m nltk.downloader wordnet punkt punkt_tab
|
||||
|
||||
# Copy models downloaded via download_deps.sh
|
||||
COPY det.onnx layout.laws.onnx layout.manual.onnx layout.onnx layout.paper.onnx ocr.res rec.onnx tsr.onnx updown_concat_xgb.model /ragflow/rag/res/deepdoc/
|
||||
|
||||
ENV PYTHONPATH=/ragflow/
|
||||
ENV HF_ENDPOINT=https://hf-mirror.com
|
||||
|
||||
ADD docker/entrypoint.sh ./entrypoint.sh
|
||||
COPY docker/entrypoint.sh ./entrypoint.sh
|
||||
RUN chmod +x ./entrypoint.sh
|
||||
|
||||
ENTRYPOINT ["./entrypoint.sh"]
|
||||
|
@ -46,7 +46,7 @@ def update_progress():
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print("""
|
||||
print(r"""
|
||||
____ ______ __
|
||||
/ __ \ ____ _ ____ _ / ____// /____ _ __
|
||||
/ /_/ // __ `// __ `// /_ / // __ \| | /| / /
|
||||
|
@ -33,10 +33,13 @@ REDIS_PASSWORD=infini_rag_flow
|
||||
|
||||
SVR_HTTP_PORT=9380
|
||||
|
||||
RAGFLOW_VERSION=dev
|
||||
RAGFLOW_VERSION=poetry
|
||||
|
||||
TIMEZONE='Asia/Shanghai'
|
||||
|
||||
# Behind the Great Firewall (GFW), use the following huggingface.co mirror:
|
||||
HF_ENDPOINT=https://hf-mirror.com
|
||||
|
||||
######## OS setup for ES ###########
|
||||
# sysctl vm.max_map_count
|
||||
# sudo sysctl -w vm.max_map_count=262144
|
||||
|
@ -15,6 +15,7 @@ services:
|
||||
- ${SVR_HTTP_PORT}:9380
|
||||
- 80:80
|
||||
- 443:443
|
||||
- 5678:5678
|
||||
volumes:
|
||||
- ./service_conf.yaml:/ragflow/conf/service_conf.yaml
|
||||
- ./ragflow-logs:/ragflow/logs
|
||||
@ -23,7 +24,7 @@ services:
|
||||
- ./nginx/nginx.conf:/etc/nginx/nginx.conf
|
||||
environment:
|
||||
- TZ=${TIMEZONE}
|
||||
- HF_ENDPOINT=https://huggingface.co
|
||||
- HF_ENDPOINT=${HF_ENDPOINT}
|
||||
- MACOS=${MACOS}
|
||||
networks:
|
||||
- ragflow
|
||||
|
38
download_deps.sh
Normal file
38
download_deps.sh
Normal file
@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# download URL
#
# Fetch URL into the current directory. The local file name is the last path
# segment of the URL; nothing is fetched when a file of that name already
# exists.
#
# Returns 0 when the file was already present or was fetched successfully,
# otherwise the non-zero exit status of wget.
download()
{
    echo "download $1"
    # Strip everything up to and including the last "/" to get the file name.
    # https://stackoverflow.com/questions/3162385/how-to-split-a-string-in-shell-and-get-the-last-field
    local fn=${1##*/}
    if [ ! -f "$fn" ] ; then
        # Fetch under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file that the
        # existence check above would treat as a finished download.
        # NOTE(review): --no-check-certificate turns off TLS certificate
        # verification for these downloads; confirm this is still required.
        if wget --no-check-certificate -O "$fn.part" "$1" ; then
            mv -- "$fn.part" "$fn"
        else
            local rc=$?
            rm -f -- "$fn.part"
            return "$rc"
        fi
    fi
}
|
||||
|
||||
# Files to fetch from the InfiniFlow/deepdoc and InfiniFlow/text_concat_xgb_v1.0
# repositories on huggingface.co. Each one is saved in the current directory
# under the last path segment of its URL.
names=(
    "https://huggingface.co/InfiniFlow/deepdoc/resolve/main/det.onnx"
    "https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.laws.onnx"
    "https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.manual.onnx"
    "https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.onnx"
    "https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.paper.onnx"
    "https://huggingface.co/InfiniFlow/deepdoc/resolve/main/ocr.res"
    "https://huggingface.co/InfiniFlow/deepdoc/resolve/main/rec.onnx"
    "https://huggingface.co/InfiniFlow/deepdoc/resolve/main/tsr.onnx"
    "https://huggingface.co/InfiniFlow/text_concat_xgb_v1.0/resolve/main/updown_concat_xgb.model"
)

# Remove zero-byte copies of the files listed in `names`, so that an empty
# leftover from a failed transfer is fetched again instead of being skipped.
# Only the expected download targets are touched: the earlier
# `find . -size 0 | xargs rm -f` walked the whole tree under the current
# directory and removed every empty file it found.
remove_empty_downloads()
{
    local url fn
    for url in "${names[@]}"; do
        fn=${url##*/}
        if [ -f "$fn" ] && [ ! -s "$fn" ]; then
            rm -f -- "$fn"
        fi
    done
}

remove_empty_downloads

# Fetch every file in order and stop at the first failure. The exit status is
# 1 rather than -1, which is outside the valid 0-255 range for `exit`.
for url in "${names[@]}"; do
    if ! download "$url"; then
        exit 1
    fi
done

remove_empty_downloads
|
10528
poetry.lock
generated
Normal file
10528
poetry.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
4
poetry.toml
Normal file
4
poetry.toml
Normal file
@ -0,0 +1,4 @@
|
||||
[virtualenvs]
|
||||
in-project = true
|
||||
create = true
|
||||
prefer-active-python = true
|
129
pyproject.toml
Normal file
129
pyproject.toml
Normal file
@ -0,0 +1,129 @@
|
||||
[tool.poetry]
|
||||
name = "ragflow"
|
||||
version = "0.11.0"
|
||||
description = "[RAGFlow](https://ragflow.io/) is an open-source RAG (Retrieval-Augmented Generation) engine based on deep document understanding. It offers a streamlined RAG workflow for businesses of any scale, combining LLM (Large Language Models) to provide truthful question-answering capabilities, backed by well-founded citations from various complex formatted data."
|
||||
authors = ["Your Name <you@example.com>"]
|
||||
license = "https://github.com/infiniflow/ragflow/blob/main/LICENSE"
|
||||
readme = "README.md"
|
||||
package-mode = false
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = ">=3.12,<3.13"
|
||||
datrie = "0.8.2"
|
||||
akshare = "1.14.72"
|
||||
azure-storage-blob = "12.22.0"
|
||||
azure-identity = "1.17.1"
|
||||
azure-storage-file-datalake = "12.16.0"
|
||||
anthropic = "=0.34.1"
|
||||
arxiv = "2.1.3"
|
||||
aspose-slides = "24.8.0"
|
||||
bcembedding = "0.1.3"
|
||||
bio = "1.7.1"
|
||||
boto3 = "1.34.140"
|
||||
botocore = "1.34.140"
|
||||
cachetools = "5.3.3"
|
||||
chardet = "5.2.0"
|
||||
cn2an = "0.5.22"
|
||||
cohere = "5.6.2"
|
||||
dashscope = "1.14.1"
|
||||
deepl = "1.18.0"
|
||||
demjson3 = "3.0.6"
|
||||
discord-py = "2.3.2"
|
||||
duckduckgo-search = "6.1.9"
|
||||
editdistance = "0.8.1"
|
||||
elastic-transport = "8.12.0"
|
||||
elasticsearch = "8.12.1"
|
||||
elasticsearch-dsl = "8.12.0"
|
||||
fastembed = "^0.3.6"
|
||||
fasttext = "0.9.3"
|
||||
filelock = "3.15.4"
|
||||
flagembedding = "1.2.10"
|
||||
flask = "3.0.3"
|
||||
flask-cors = "5.0.0"
|
||||
flask-login = "0.6.3"
|
||||
flask-session = "0.8.0"
|
||||
google-search-results = "2.4.2"
|
||||
groq = "0.9.0"
|
||||
hanziconv = "0.3.2"
|
||||
html-text = "0.6.2"
|
||||
httpx = "0.27.0"
|
||||
huggingface-hub = "^0.25.0"
|
||||
infinity-emb = "0.0.51"
|
||||
itsdangerous = "2.1.2"
|
||||
markdown = "3.6"
|
||||
markdown-to-json = "2.1.1"
|
||||
minio = "7.2.4"
|
||||
mistralai = "0.4.2"
|
||||
nltk = "3.9.1"
|
||||
numpy = "1.26.4"
|
||||
ollama = "0.2.1"
|
||||
onnxruntime = "1.17.3"
|
||||
onnxruntime-gpu = "1.17.1"
|
||||
openai = "1.12.0"
|
||||
opencv-python = "4.9.0.80"
|
||||
opencv-python-headless = "4.9.0.80"
|
||||
openpyxl = "3.1.2"
|
||||
ormsgpack = "1.5.0"
|
||||
pandas = "2.2.2"
|
||||
pdfplumber = "0.10.4"
|
||||
peewee = "3.17.1"
|
||||
pillow = "10.3.0"
|
||||
protobuf = "5.27.2"
|
||||
psycopg2-binary = "2.9.9"
|
||||
pyclipper = "1.3.0.post5"
|
||||
pycryptodomex = "3.20.0"
|
||||
pypdf = "^5.0.0"
|
||||
pytest = "8.2.2"
|
||||
python-dotenv = "1.0.1"
|
||||
python-dateutil = "2.8.2"
|
||||
python-pptx = "0.6.23"
|
||||
pywencai = "0.12.2"
|
||||
qianfan = "0.4.6"
|
||||
ranx = "0.3.20"
|
||||
readability-lxml = "0.8.1"
|
||||
redis = "5.0.3"
|
||||
requests = "2.32.2"
|
||||
replicate = "0.31.0"
|
||||
roman-numbers = "1.0.2"
|
||||
ruamel-base = "1.0.0"
|
||||
scholarly = "1.7.11"
|
||||
scikit-learn = "1.5.0"
|
||||
selenium = "4.22.0"
|
||||
setuptools = "70.0.0"
|
||||
shapely = "2.0.5"
|
||||
six = "1.16.0"
|
||||
strenum = "0.4.15"
|
||||
tabulate = "0.9.0"
|
||||
tencentcloud-sdk-python = "3.0.1215"
|
||||
tika = "2.6.0"
|
||||
tiktoken = "0.6.0"
|
||||
torch = "2.3.0"
|
||||
transformers = "4.38.1"
|
||||
umap = "0.1.1"
|
||||
vertexai = "1.64.0"
|
||||
volcengine = "1.0.146"
|
||||
voyageai = "0.2.3"
|
||||
webdriver-manager = "4.0.1"
|
||||
werkzeug = "3.0.3"
|
||||
wikipedia = "1.4.0"
|
||||
word2number = "1.1"
|
||||
xgboost = "2.1.0"
|
||||
xpinyin = "0.7.6"
|
||||
yfinance = "0.1.96"
|
||||
zhipuai = "2.0.1"
|
||||
ruamel-yaml = "^0.18.6"
|
||||
google-generativeai = "^0.8.1"
|
||||
python-docx = "^1.1.2"
|
||||
pypdf2 = "^3.0.1"
|
||||
graspologic = "^3.4.1"
|
||||
pymysql = "^1.1.1"
|
||||
|
||||
|
||||
[[tool.poetry.source]]
|
||||
name = "tsinghua"
|
||||
url = "https://pypi.tuna.tsinghua.edu.cn/simple/"
|
||||
priority = "primary"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
@ -4,7 +4,7 @@ azure-identity==1.17.1
|
||||
azure-storage-file-datalake==12.16.0
|
||||
anthropic===0.34.1
|
||||
arxiv==2.1.3
|
||||
Aspose.Slides==24.2.0
|
||||
Aspose.Slides==24.8.0
|
||||
BCEmbedding==0.1.3
|
||||
Bio==1.7.1
|
||||
boto3==1.34.140
|
||||
@ -43,7 +43,7 @@ Markdown==3.6
|
||||
markdown_to_json==2.1.1
|
||||
minio==7.2.4
|
||||
mistralai==0.4.2
|
||||
nltk==3.9
|
||||
nltk==3.9.1
|
||||
numpy==1.26.4
|
||||
ollama==0.2.1
|
||||
onnxruntime==1.17.3
|
||||
@ -57,7 +57,6 @@ pandas==2.2.2
|
||||
pdfplumber==0.10.4
|
||||
peewee==3.17.1
|
||||
Pillow==10.3.0
|
||||
pipreqs==0.5.0
|
||||
protobuf==5.27.2
|
||||
psycopg2-binary==2.9.9
|
||||
pyclipper==1.3.0.post5
|
||||
|
39
ubuntu.sources
Normal file
39
ubuntu.sources
Normal file
@ -0,0 +1,39 @@
|
||||
Types: deb
|
||||
URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
|
||||
Suites: noble noble-updates noble-backports
|
||||
Components: main restricted universe multiverse
|
||||
Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg
|
||||
|
||||
# Source-package (deb-src) mirrors are commented out by default to speed up `apt update`; uncomment them if you need them.
|
||||
# Types: deb-src
|
||||
# URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
|
||||
# Suites: noble noble-updates noble-backports
|
||||
# Components: main restricted universe multiverse
|
||||
# Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg
|
||||
|
||||
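# The security-update sources below cover both the official archive and the mirror site; edit the comments to switch between them if needed.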
|
||||
Types: deb
|
||||
URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
|
||||
Suites: noble-security
|
||||
Components: main restricted universe multiverse
|
||||
Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg
|
||||
|
||||
# Types: deb-src
|
||||
# URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
|
||||
# Suites: noble-security
|
||||
# Components: main restricted universe multiverse
|
||||
# Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg
|
||||
|
||||
# Pre-release (proposed) sources; enabling them is not recommended.
|
||||
|
||||
# Types: deb
|
||||
# URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
|
||||
# Suites: noble-proposed
|
||||
# Components: main restricted universe multiverse
|
||||
# Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg
|
||||
|
||||
# # Types: deb-src
|
||||
# # URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
|
||||
# # Suites: noble-proposed
|
||||
# # Components: main restricted universe multiverse
|
||||
# # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg
|
28006
web/package-lock.json
generated
28006
web/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -8,7 +8,7 @@
|
||||
"lint": "umi lint --eslint-only",
|
||||
"prepare": "cd .. && husky web/.husky",
|
||||
"setup": "umi setup",
|
||||
"start": "npm run dev",
|
||||
"start": "yarn dev",
|
||||
"test": "jest --no-cache --coverage"
|
||||
},
|
||||
"lint-staged": {
|
||||
|
14510
web/yarn.lock
Normal file
14510
web/yarn.lock
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user