From 5f9d236d22d3ba5b590d91f3c77d4438ca42671c Mon Sep 17 00:00:00 2001
From: Yuichiro Utsumi <81412151+utsumi-fj@users.noreply.github.com>
Date: Thu, 13 Mar 2025 17:32:34 +0900
Subject: [PATCH] Feat: Add pg_bigm for keyword search in pgvector (#13876)

Add an optional pg_bigm-backed keyword search path to the pgvector store.

- PGVECTOR_PG_BIGM (default: false) toggles the feature.
- When enabled, collection creation also creates the pg_bigm extension and
  a GIN bigram index on the text column, and full-text search scores rows
  with bigm_similarity() instead of ts_rank().
- The pgvector container gets a wrapper entrypoint that builds pg_bigm from
  source when PG_BIGM=true and preloads it through a server flag.

Signed-off-by: Yuichiro Utsumi
---
Notes:
  The wrapper entrypoint preloads pg_bigm with
  "-c shared_preload_libraries=pg_bigm" instead of running sed on
  postgresql.conf: that file does not exist before the first initdb, and
  its location follows PGDATA, which PGVECTOR_PGDATA may override.
  The blob hash on the docker-entrypoint.sh "index" line was not
  regenerated after this revision; regenerate it before relying on a
  3-way apply.

 api/configs/middleware/vdb/pgvector_config.py |  5 +++
 .../rag/datasource/vdb/pgvector/pgvector.py   | 41 +++++++++++++++----
 docker/.env.example                           |  2 +
 docker/docker-compose-template.yaml           |  5 +++
 docker/docker-compose.yaml                    |  7 ++++
 docker/pgvector/docker-entrypoint.sh          | 31 ++++++++++++++
 6 files changed, 82 insertions(+), 9 deletions(-)
 create mode 100755 docker/pgvector/docker-entrypoint.sh

diff --git a/api/configs/middleware/vdb/pgvector_config.py b/api/configs/middleware/vdb/pgvector_config.py
index 4561a9a7ca..9f5f7284d7 100644
--- a/api/configs/middleware/vdb/pgvector_config.py
+++ b/api/configs/middleware/vdb/pgvector_config.py
@@ -43,3 +43,8 @@ class PGVectorConfig(BaseSettings):
         description="Max connection of the PostgreSQL database",
         default=5,
     )
+
+    PGVECTOR_PG_BIGM: bool = Field(
+        description="Whether to use pg_bigm module for full text search",
+        default=False,
+    )
diff --git a/api/core/rag/datasource/vdb/pgvector/pgvector.py b/api/core/rag/datasource/vdb/pgvector/pgvector.py
index c8a1e4f90c..06083af9ca 100644
--- a/api/core/rag/datasource/vdb/pgvector/pgvector.py
+++ b/api/core/rag/datasource/vdb/pgvector/pgvector.py
@@ -25,6 +25,7 @@ class PGVectorConfig(BaseModel):
     database: str
     min_connection: int
     max_connection: int
+    pg_bigm: bool = False
 
     @model_validator(mode="before")
     @classmethod
@@ -62,12 +63,18 @@ CREATE INDEX IF NOT EXISTS embedding_cosine_v1_idx ON {table_name}
 USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64);
 """
 
+SQL_CREATE_INDEX_PG_BIGM = """
+CREATE INDEX IF NOT EXISTS bigm_idx ON {table_name}
+USING gin (text gin_bigm_ops);
+"""
+
 
 class PGVector(BaseVector):
     def __init__(self, collection_name: str, config: PGVectorConfig):
         super().__init__(collection_name)
         self.pool = self._create_connection_pool(config)
         self.table_name = f"embedding_{collection_name}"
+        self.pg_bigm = config.pg_bigm
 
     def get_type(self) -> str:
         return VectorType.PGVECTOR
@@ -176,15 +183,27 @@ class PGVector(BaseVector):
         top_k = kwargs.get("top_k", 5)
 
         with self._get_cursor() as cur:
-            cur.execute(
-                f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), plainto_tsquery(%s)) AS score
-                FROM {self.table_name}
-                WHERE to_tsvector(text) @@ plainto_tsquery(%s)
-                ORDER BY score DESC
-                LIMIT {top_k}""",
-                # f"'{query}'" is required in order to account for whitespace in query
-                (f"'{query}'", f"'{query}'"),
-            )
+            if self.pg_bigm:
+                cur.execute("SET pg_bigm.similarity_limit TO 0.000001")
+                cur.execute(
+                    f"""SELECT meta, text, bigm_similarity(unistr(%s), coalesce(text, '')) AS score
+                    FROM {self.table_name}
+                    WHERE text =%% unistr(%s)
+                    ORDER BY score DESC
+                    LIMIT {top_k}""",
+                    # f"'{query}'" is required in order to account for whitespace in query
+                    (f"'{query}'", f"'{query}'"),
+                )
+            else:
+                cur.execute(
+                    f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), plainto_tsquery(%s)) AS score
+                    FROM {self.table_name}
+                    WHERE to_tsvector(text) @@ plainto_tsquery(%s)
+                    ORDER BY score DESC
+                    LIMIT {top_k}""",
+                    # f"'{query}'" is required in order to account for whitespace in query
+                    (f"'{query}'", f"'{query}'"),
+                )
 
             docs = []
 
@@ -214,6 +233,9 @@ class PGVector(BaseVector):
                 # ref: https://github.com/pgvector/pgvector?tab=readme-ov-file#indexing
                 if dimension <= 2000:
                     cur.execute(SQL_CREATE_INDEX.format(table_name=self.table_name))
+                if self.pg_bigm:
+                    cur.execute("CREATE EXTENSION IF NOT EXISTS pg_bigm")
+                    cur.execute(SQL_CREATE_INDEX_PG_BIGM.format(table_name=self.table_name))
             redis_client.set(collection_exist_cache_key, 1, ex=3600)
 
 
@@ -237,5 +259,6 @@ class PGVectorFactory(AbstractVectorFactory):
                 database=dify_config.PGVECTOR_DATABASE or "postgres",
                 min_connection=dify_config.PGVECTOR_MIN_CONNECTION,
                 max_connection=dify_config.PGVECTOR_MAX_CONNECTION,
+                pg_bigm=dify_config.PGVECTOR_PG_BIGM,
             ),
         )
diff --git a/docker/.env.example b/docker/.env.example
index 4bcb604bad..def2f4d41e 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -431,6 +431,8 @@ PGVECTOR_PASSWORD=difyai123456
 PGVECTOR_DATABASE=dify
 PGVECTOR_MIN_CONNECTION=1
 PGVECTOR_MAX_CONNECTION=5
+PGVECTOR_PG_BIGM=false
+PGVECTOR_PG_BIGM_VERSION=1.2-20240606
 
 # pgvecto-rs configurations, only available when VECTOR_STORE is `pgvecto-rs`
 PGVECTO_RS_HOST=pgvecto-rs
diff --git a/docker/docker-compose-template.yaml b/docker/docker-compose-template.yaml
index 2879f2194f..2f844caa88 100644
--- a/docker/docker-compose-template.yaml
+++ b/docker/docker-compose-template.yaml
@@ -322,8 +322,13 @@ services:
       POSTGRES_DB: ${PGVECTOR_POSTGRES_DB:-dify}
       # postgres data directory
       PGDATA: ${PGVECTOR_PGDATA:-/var/lib/postgresql/data/pgdata}
+      # pg_bigm module for full text search
+      PG_BIGM: ${PGVECTOR_PG_BIGM:-false}
+      PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606}
     volumes:
       - ./volumes/pgvector/data:/var/lib/postgresql/data
+      - ./pgvector/docker-entrypoint.sh:/docker-entrypoint.sh
+    entrypoint: [ '/docker-entrypoint.sh' ]
     healthcheck:
       test: [ 'CMD', 'pg_isready' ]
       interval: 1s
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
index a6d71d687d..fca95d3946 100644
--- a/docker/docker-compose.yaml
+++ b/docker/docker-compose.yaml
@@ -157,6 +157,8 @@ x-shared-env: &shared-api-worker-env
   PGVECTOR_DATABASE: ${PGVECTOR_DATABASE:-dify}
   PGVECTOR_MIN_CONNECTION: ${PGVECTOR_MIN_CONNECTION:-1}
   PGVECTOR_MAX_CONNECTION: ${PGVECTOR_MAX_CONNECTION:-5}
+  PGVECTOR_PG_BIGM: ${PGVECTOR_PG_BIGM:-false}
+  PGVECTOR_PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606}
   PGVECTO_RS_HOST: ${PGVECTO_RS_HOST:-pgvecto-rs}
   PGVECTO_RS_PORT: ${PGVECTO_RS_PORT:-5432}
   PGVECTO_RS_USER: ${PGVECTO_RS_USER:-postgres}
@@ -741,8 +743,13 @@ services:
       POSTGRES_DB: ${PGVECTOR_POSTGRES_DB:-dify}
       # postgres data directory
       PGDATA: ${PGVECTOR_PGDATA:-/var/lib/postgresql/data/pgdata}
+      # pg_bigm module for full text search
+      PG_BIGM: ${PGVECTOR_PG_BIGM:-false}
+      PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606}
     volumes:
       - ./volumes/pgvector/data:/var/lib/postgresql/data
+      - ./pgvector/docker-entrypoint.sh:/docker-entrypoint.sh
+    entrypoint: [ '/docker-entrypoint.sh' ]
     healthcheck:
       test: [ 'CMD', 'pg_isready' ]
       interval: 1s
diff --git a/docker/pgvector/docker-entrypoint.sh b/docker/pgvector/docker-entrypoint.sh
new file mode 100755
index 0000000000..262eacfb13
--- /dev/null
+++ b/docker/pgvector/docker-entrypoint.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Wrapper entrypoint for the pgvector container: optionally builds pg_bigm
+# from source, then delegates to the image's stock entrypoint.
+
+PG_MAJOR=16
+# Extra server flags appended to the final "postgres" command line.
+PG_ARGS=()
+
+if [ "${PG_BIGM}" = "true" ]; then
+  # install pg_bigm
+  apt-get update
+  apt-get install -y curl make gcc postgresql-server-dev-${PG_MAJOR}
+
+  curl -LO https://github.com/pgbigm/pg_bigm/archive/refs/tags/v${PG_BIGM_VERSION}.tar.gz
+  tar xf v${PG_BIGM_VERSION}.tar.gz
+  cd pg_bigm-${PG_BIGM_VERSION} || exit 1
+  make USE_PGXS=1 PG_CONFIG=/usr/bin/pg_config
+  make USE_PGXS=1 PG_CONFIG=/usr/bin/pg_config install
+
+  cd - || exit 1
+  rm -rf v${PG_BIGM_VERSION}.tar.gz pg_bigm-${PG_BIGM_VERSION}
+
+  # enable pg_bigm with a server flag instead of editing postgresql.conf:
+  # that file does not exist before the first initdb, and PGDATA may be
+  # overridden, so an in-place sed on a fixed path is unreliable.
+  PG_ARGS+=(-c shared_preload_libraries=pg_bigm)
+fi
+
+# Run the original entrypoint script
+exec /usr/local/bin/docker-entrypoint.sh postgres "${PG_ARGS[@]}"