Feat: extend S3 storage compatibility and add knowledge base ID prefix (#6355)

### What problem does this PR solve?

- Added support for S3-compatible protocols.
- Enabled the use of knowledge base ID as a file prefix when storing
files in S3.
- Updated docker/README.md to include detailed S3 and OSS configuration
instructions.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
RedBookOfMemory 2025-03-31 16:09:43 +08:00 committed by GitHub
parent 46b5e32cd7
commit e2b66628f4
4 changed files with 81 additions and 12 deletions

`docker/README.md`

@@ -136,6 +136,24 @@ The [.env](./.env) file contains important environment variables for Docker.
- `password`: The password for MinIO.
- `host`: The MinIO serving IP *and* port inside the Docker container. Defaults to `minio:9000`.
- `oss`
- `access_key`: The access key ID used to authenticate requests to the OSS service.
- `secret_key`: The secret access key used to authenticate requests to the OSS service.
- `endpoint_url`: The URL of the OSS service endpoint.
- `region`: The OSS region where the bucket is located.
- `bucket`: The name of the OSS bucket where files will be stored. Set this if you want all files stored in a single, specified bucket.
- `prefix_path`: Optional. A prefix path to prepend to file names in the OSS bucket, which can help organize files within the bucket.
- `s3`:
- `access_key`: The access key ID used to authenticate requests to the S3 service.
- `secret_key`: The secret access key used to authenticate requests to the S3 service.
- `endpoint_url`: The URL of the S3-compatible service endpoint. This is necessary when using an S3-compatible protocol instead of the default AWS S3 endpoint.
- `bucket`: The name of the S3 bucket where files will be stored. Set this if you want all files stored in a single, specified bucket.
- `region`: The AWS region where the S3 bucket is located. This is important for directing requests to the correct data center.
- `signature_version`: Optional. The version of the signature to use for authenticating requests. Common versions include `v4`.
- `addressing_style`: Optional. The style of addressing to use for the S3 endpoint. This can be `path` or `virtual`.
- `prefix_path`: Optional. A prefix path to prepend to file names in the S3 bucket, which can help organize files within the bucket. A sample configuration for an S3-compatible service is shown after this list.
- `oauth`
The OAuth configuration for signing up or signing in to RAGFlow using a third-party account. It is disabled by default. To enable this feature, uncomment the corresponding lines in **service_conf.yaml.template**.
- `github`: The GitHub authentication settings for your application. Visit the [Github Developer Settings page](https://github.com/settings/developers) to obtain your client_id and secret_key.
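
For reference, here is a sketch of how the `s3` entries above might be filled in inside **service_conf.yaml.template** for an S3-compatible store such as MinIO; the endpoint, credentials, bucket, and prefix below are placeholders, not values from this PR:

```yaml
# Hypothetical values for an S3-compatible MinIO deployment; substitute your own.
s3:
  access_key: 'minioadmin'
  secret_key: 'minioadmin'
  endpoint_url: 'http://minio:9000'
  region: 'us-east-1'
  bucket: 'ragflow-bucket'
  prefix_path: 'ragflow-files'
  signature_version: 'v4'
  addressing_style: 'path'
```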

`docker/docker-compose.yml`

@@ -1,6 +1,7 @@
include:
- ./docker-compose-base.yml
# To ensure that the container processes the locally modified `service_conf.yaml.template` instead of the one included in its image, you need to mount the local `service_conf.yaml.template` to the container.
services:
ragflow:
depends_on:
@@ -20,6 +21,7 @@ services:
- ./nginx/proxy.conf:/etc/nginx/proxy.conf
- ./nginx/nginx.conf:/etc/nginx/nginx.conf
- ../history_data_agent:/ragflow/history_data_agent
- ./service_conf.yaml.template:/ragflow/conf/service_conf.yaml.template
env_file: .env
environment:

`docker/service_conf.yaml.template`

@@ -37,6 +37,11 @@ redis:
# access_key: 'access_key'
# secret_key: 'secret_key'
# region: 'region'
# endpoint_url: 'endpoint_url'
# bucket: 'bucket'
# prefix_path: 'prefix_path'
# signature_version: 'v4'
# addressing_style: 'path'
# oss:
# access_key: '${ACCESS_KEY}'
# secret_key: '${SECRET_KEY}'

`rag/utils/s3_conn.py`

@@ -17,6 +17,7 @@
import logging
import boto3
from botocore.exceptions import ClientError
from botocore.config import Config
import time
from io import BytesIO
from rag.utils import singleton
@@ -30,8 +31,34 @@ class RAGFlowS3:
self.access_key = self.s3_config.get('access_key', None)
self.secret_key = self.s3_config.get('secret_key', None)
self.region = self.s3_config.get('region', None)
self.endpoint_url = self.s3_config.get('endpoint_url', None)
self.signature_version = self.s3_config.get('signature_version', None)
self.addressing_style = self.s3_config.get('addressing_style', None)
self.bucket = self.s3_config.get('bucket', None)
self.prefix_path = self.s3_config.get('prefix_path', None)
self.__open__()
@staticmethod
def use_default_bucket(method):
def wrapper(self, bucket, *args, **kwargs):
# If there is a default bucket, use the default bucket
actual_bucket = self.bucket if self.bucket else bucket
return method(self, actual_bucket, *args, **kwargs)
return wrapper
@staticmethod
def use_prefix_path(method):
def wrapper(self, bucket, fnm, *args, **kwargs):
# If the prefix path is set, use the prefix path.
# The bucket passed from the upstream call is
# used as the file prefix. This is especially useful when you're using the default bucket
if self.prefix_path:
fnm = f"{self.prefix_path}/{bucket}/{fnm}"
else:
fnm = f"{bucket}/{fnm}"
return method(self, bucket, fnm, *args, **kwargs)
return wrapper
def __open__(self):
try:
if self.conn:
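
The two decorators above control where an object finally lands: `use_default_bucket` swaps in the configured default bucket, and `use_prefix_path` folds the caller-supplied bucket argument (in RAGFlow, typically the knowledge base ID) into the object key. Below is a minimal, standalone sketch of that behaviour; the `DemoStore` class and all values are hypothetical, not part of this PR.

```python
# Standalone illustration (hypothetical DemoStore, not the PR's code) of how the
# use_prefix_path / use_default_bucket decorators resolve the final bucket and key.
def use_default_bucket(method):
    def wrapper(self, bucket, *args, **kwargs):
        # Fall back to the configured default bucket when one is set.
        return method(self, self.bucket if self.bucket else bucket, *args, **kwargs)
    return wrapper

def use_prefix_path(method):
    def wrapper(self, bucket, fnm, *args, **kwargs):
        # The caller-supplied "bucket" (e.g. a knowledge base ID) becomes part of the key.
        fnm = f"{self.prefix_path}/{bucket}/{fnm}" if self.prefix_path else f"{bucket}/{fnm}"
        return method(self, bucket, fnm, *args, **kwargs)
    return wrapper

class DemoStore:
    def __init__(self, bucket=None, prefix_path=None):
        self.bucket = bucket
        self.prefix_path = prefix_path

    @use_prefix_path
    @use_default_bucket
    def put(self, bucket, fnm, binary):
        print(f"PUT s3://{bucket}/{fnm} ({len(binary)} bytes)")

# A knowledge-base ID passed as "bucket" ends up as a key prefix inside the default bucket:
DemoStore(bucket="ragflow-bucket", prefix_path="prod").put("kb_42", "doc.pdf", b"...")
# -> PUT s3://ragflow-bucket/prod/kb_42/doc.pdf (3 bytes)
```

Because `@use_prefix_path` is applied outermost, the key is rewritten first (using the original bucket argument) and the default bucket is substituted afterwards, so many knowledge bases can share one physical bucket while keeping distinct key prefixes.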
@@ -40,19 +67,27 @@ class RAGFlowS3:
pass
try:
s3_params = {'aws_access_key_id': self.access_key, 'aws_secret_access_key': self.secret_key}
if 'region' in self.s3_config:
s3_params['region_name'] = self.region
if 'endpoint_url' in self.s3_config:
s3_params['endpoint_url'] = self.endpoint_url
# Collect the optional signature_version and addressing_style into one botocore Config
# so that setting both does not silently overwrite the first.
s3_extra = {}
if 'signature_version' in self.s3_config:
s3_extra['signature_version'] = self.signature_version
if 'addressing_style' in self.s3_config:
s3_extra['addressing_style'] = self.addressing_style
if s3_extra:
s3_params['config'] = Config(s3=s3_extra)
self.conn = boto3.client('s3', **s3_params)
except Exception:
logging.exception(f"Fail to connect at region {self.region} or endpoint {self.endpoint_url}")
def __close__(self):
del self.conn
self.conn = None
@use_default_bucket
def bucket_exists(self, bucket):
try:
logging.debug(f"head_bucket bucketname {bucket}")
@@ -64,8 +99,9 @@ class RAGFlowS3:
return exists
def health(self):
bucket = self.bucket
fnm = "txtxtxtxt1"
fnm, binary = f"{self.prefix_path}/{fnm}" if self.prefix_path else fnm, b"_t@@@1"
if not self.bucket_exists(bucket):
self.conn.create_bucket(Bucket=bucket)
logging.debug(f"create bucket {bucket} ********")
@@ -79,6 +115,8 @@
def list(self, bucket, dir, recursive=True):
return []
@use_prefix_path
@use_default_bucket
def put(self, bucket, fnm, binary):
logging.debug(f"bucket name {bucket}; filename :{fnm}:")
for _ in range(1):
@@ -94,12 +132,16 @@
self.__open__()
time.sleep(1)
@use_prefix_path
@use_default_bucket
def rm(self, bucket, fnm):
try:
self.conn.delete_object(Bucket=bucket, Key=fnm)
except Exception:
logging.exception(f"Fail rm {bucket}/{fnm}")
@use_prefix_path
@use_default_bucket
def get(self, bucket, fnm):
for _ in range(1):
try:
@@ -112,18 +154,20 @@
time.sleep(1)
return
@use_prefix_path
@use_default_bucket
def obj_exist(self, bucket, fnm):
try:
if self.conn.head_object(Bucket=bucket, Key=fnm):
return True
except ClientError as e:
if e.response['Error']['Code'] == '404':
return False
else:
raise
@use_prefix_path
@use_default_bucket
def get_presigned_url(self, bucket, fnm, expires):
for _ in range(10):
try:
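
The diff is truncated here inside `get_presigned_url`. For orientation, a presigned download URL with boto3 is generally produced like this (a generic sketch, not RAGFlow's exact implementation; endpoint, credentials, bucket, and key are placeholders):

```python
import boto3

# Hypothetical client and object; adjust endpoint, credentials, bucket, and key.
conn = boto3.client('s3', endpoint_url='http://minio:9000',
                    aws_access_key_id='minioadmin', aws_secret_access_key='minioadmin')
url = conn.generate_presigned_url(
    'get_object',
    Params={'Bucket': 'ragflow-bucket', 'Key': 'prod/kb_42/doc.pdf'},
    ExpiresIn=3600,  # link lifetime in seconds
)
print(url)
```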