mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-04-22 05:39:42 +08:00
Feat: Add pg_bigm for keyword search in pgvector (#13876)
Signed-off-by: Yuichiro Utsumi <utsumi.yuichiro@fujitsu.com>
This commit is contained in:
parent
59f5a82261
commit
5f9d236d22
@ -43,3 +43,8 @@ class PGVectorConfig(BaseSettings):
|
|||||||
description="Max connection of the PostgreSQL database",
|
description="Max connection of the PostgreSQL database",
|
||||||
default=5,
|
default=5,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
PGVECTOR_PG_BIGM: bool = Field(
|
||||||
|
description="Whether to use pg_bigm module for full text search",
|
||||||
|
default=False,
|
||||||
|
)
|
||||||
|
@ -25,6 +25,7 @@ class PGVectorConfig(BaseModel):
|
|||||||
database: str
|
database: str
|
||||||
min_connection: int
|
min_connection: int
|
||||||
max_connection: int
|
max_connection: int
|
||||||
|
pg_bigm: bool = False
|
||||||
|
|
||||||
@model_validator(mode="before")
|
@model_validator(mode="before")
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -62,12 +63,18 @@ CREATE INDEX IF NOT EXISTS embedding_cosine_v1_idx ON {table_name}
|
|||||||
USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64);
|
USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64);
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
SQL_CREATE_INDEX_PG_BIGM = """
|
||||||
|
CREATE INDEX IF NOT EXISTS bigm_idx ON {table_name}
|
||||||
|
USING gin (text gin_bigm_ops);
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class PGVector(BaseVector):
|
class PGVector(BaseVector):
|
||||||
def __init__(self, collection_name: str, config: PGVectorConfig):
|
def __init__(self, collection_name: str, config: PGVectorConfig):
|
||||||
super().__init__(collection_name)
|
super().__init__(collection_name)
|
||||||
self.pool = self._create_connection_pool(config)
|
self.pool = self._create_connection_pool(config)
|
||||||
self.table_name = f"embedding_{collection_name}"
|
self.table_name = f"embedding_{collection_name}"
|
||||||
|
self.pg_bigm = config.pg_bigm
|
||||||
|
|
||||||
def get_type(self) -> str:
|
def get_type(self) -> str:
|
||||||
return VectorType.PGVECTOR
|
return VectorType.PGVECTOR
|
||||||
@ -176,6 +183,18 @@ class PGVector(BaseVector):
|
|||||||
top_k = kwargs.get("top_k", 5)
|
top_k = kwargs.get("top_k", 5)
|
||||||
|
|
||||||
with self._get_cursor() as cur:
|
with self._get_cursor() as cur:
|
||||||
|
if self.pg_bigm:
|
||||||
|
cur.execute("SET pg_bigm.similarity_limit TO 0.000001")
|
||||||
|
cur.execute(
|
||||||
|
f"""SELECT meta, text, bigm_similarity(unistr(%s), coalesce(text, '')) AS score
|
||||||
|
FROM {self.table_name}
|
||||||
|
WHERE text =%% unistr(%s)
|
||||||
|
ORDER BY score DESC
|
||||||
|
LIMIT {top_k}""",
|
||||||
|
# f"'{query}'" is required in order to account for whitespace in query
|
||||||
|
(f"'{query}'", f"'{query}'"),
|
||||||
|
)
|
||||||
|
else:
|
||||||
cur.execute(
|
cur.execute(
|
||||||
f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), plainto_tsquery(%s)) AS score
|
f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), plainto_tsquery(%s)) AS score
|
||||||
FROM {self.table_name}
|
FROM {self.table_name}
|
||||||
@ -214,6 +233,9 @@ class PGVector(BaseVector):
|
|||||||
# ref: https://github.com/pgvector/pgvector?tab=readme-ov-file#indexing
|
# ref: https://github.com/pgvector/pgvector?tab=readme-ov-file#indexing
|
||||||
if dimension <= 2000:
|
if dimension <= 2000:
|
||||||
cur.execute(SQL_CREATE_INDEX.format(table_name=self.table_name))
|
cur.execute(SQL_CREATE_INDEX.format(table_name=self.table_name))
|
||||||
|
if self.pg_bigm:
|
||||||
|
cur.execute("CREATE EXTENSION IF NOT EXISTS pg_bigm")
|
||||||
|
cur.execute(SQL_CREATE_INDEX_PG_BIGM.format(table_name=self.table_name))
|
||||||
redis_client.set(collection_exist_cache_key, 1, ex=3600)
|
redis_client.set(collection_exist_cache_key, 1, ex=3600)
|
||||||
|
|
||||||
|
|
||||||
@ -237,5 +259,6 @@ class PGVectorFactory(AbstractVectorFactory):
|
|||||||
database=dify_config.PGVECTOR_DATABASE or "postgres",
|
database=dify_config.PGVECTOR_DATABASE or "postgres",
|
||||||
min_connection=dify_config.PGVECTOR_MIN_CONNECTION,
|
min_connection=dify_config.PGVECTOR_MIN_CONNECTION,
|
||||||
max_connection=dify_config.PGVECTOR_MAX_CONNECTION,
|
max_connection=dify_config.PGVECTOR_MAX_CONNECTION,
|
||||||
|
pg_bigm=dify_config.PGVECTOR_PG_BIGM,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
@ -431,6 +431,8 @@ PGVECTOR_PASSWORD=difyai123456
|
|||||||
PGVECTOR_DATABASE=dify
|
PGVECTOR_DATABASE=dify
|
||||||
PGVECTOR_MIN_CONNECTION=1
|
PGVECTOR_MIN_CONNECTION=1
|
||||||
PGVECTOR_MAX_CONNECTION=5
|
PGVECTOR_MAX_CONNECTION=5
|
||||||
|
PGVECTOR_PG_BIGM=false
|
||||||
|
PGVECTOR_PG_BIGM_VERSION=1.2-20240606
|
||||||
|
|
||||||
# pgvecto-rs configurations, only available when VECTOR_STORE is `pgvecto-rs`
|
# pgvecto-rs configurations, only available when VECTOR_STORE is `pgvecto-rs`
|
||||||
PGVECTO_RS_HOST=pgvecto-rs
|
PGVECTO_RS_HOST=pgvecto-rs
|
||||||
|
@ -322,8 +322,13 @@ services:
|
|||||||
POSTGRES_DB: ${PGVECTOR_POSTGRES_DB:-dify}
|
POSTGRES_DB: ${PGVECTOR_POSTGRES_DB:-dify}
|
||||||
# postgres data directory
|
# postgres data directory
|
||||||
PGDATA: ${PGVECTOR_PGDATA:-/var/lib/postgresql/data/pgdata}
|
PGDATA: ${PGVECTOR_PGDATA:-/var/lib/postgresql/data/pgdata}
|
||||||
|
# pg_bigm module for full text search
|
||||||
|
PG_BIGM: ${PGVECTOR_PG_BIGM:-false}
|
||||||
|
PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606}
|
||||||
volumes:
|
volumes:
|
||||||
- ./volumes/pgvector/data:/var/lib/postgresql/data
|
- ./volumes/pgvector/data:/var/lib/postgresql/data
|
||||||
|
- ./pgvector/docker-entrypoint.sh:/docker-entrypoint.sh
|
||||||
|
entrypoint: [ '/docker-entrypoint.sh' ]
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: [ 'CMD', 'pg_isready' ]
|
test: [ 'CMD', 'pg_isready' ]
|
||||||
interval: 1s
|
interval: 1s
|
||||||
|
@ -157,6 +157,8 @@ x-shared-env: &shared-api-worker-env
|
|||||||
PGVECTOR_DATABASE: ${PGVECTOR_DATABASE:-dify}
|
PGVECTOR_DATABASE: ${PGVECTOR_DATABASE:-dify}
|
||||||
PGVECTOR_MIN_CONNECTION: ${PGVECTOR_MIN_CONNECTION:-1}
|
PGVECTOR_MIN_CONNECTION: ${PGVECTOR_MIN_CONNECTION:-1}
|
||||||
PGVECTOR_MAX_CONNECTION: ${PGVECTOR_MAX_CONNECTION:-5}
|
PGVECTOR_MAX_CONNECTION: ${PGVECTOR_MAX_CONNECTION:-5}
|
||||||
|
PGVECTOR_PG_BIGM: ${PGVECTOR_PG_BIGM:-false}
|
||||||
|
PGVECTOR_PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606}
|
||||||
PGVECTO_RS_HOST: ${PGVECTO_RS_HOST:-pgvecto-rs}
|
PGVECTO_RS_HOST: ${PGVECTO_RS_HOST:-pgvecto-rs}
|
||||||
PGVECTO_RS_PORT: ${PGVECTO_RS_PORT:-5432}
|
PGVECTO_RS_PORT: ${PGVECTO_RS_PORT:-5432}
|
||||||
PGVECTO_RS_USER: ${PGVECTO_RS_USER:-postgres}
|
PGVECTO_RS_USER: ${PGVECTO_RS_USER:-postgres}
|
||||||
@ -741,8 +743,13 @@ services:
|
|||||||
POSTGRES_DB: ${PGVECTOR_POSTGRES_DB:-dify}
|
POSTGRES_DB: ${PGVECTOR_POSTGRES_DB:-dify}
|
||||||
# postgres data directory
|
# postgres data directory
|
||||||
PGDATA: ${PGVECTOR_PGDATA:-/var/lib/postgresql/data/pgdata}
|
PGDATA: ${PGVECTOR_PGDATA:-/var/lib/postgresql/data/pgdata}
|
||||||
|
# pg_bigm module for full text search
|
||||||
|
PG_BIGM: ${PGVECTOR_PG_BIGM:-false}
|
||||||
|
PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606}
|
||||||
volumes:
|
volumes:
|
||||||
- ./volumes/pgvector/data:/var/lib/postgresql/data
|
- ./volumes/pgvector/data:/var/lib/postgresql/data
|
||||||
|
- ./pgvector/docker-entrypoint.sh:/docker-entrypoint.sh
|
||||||
|
entrypoint: [ '/docker-entrypoint.sh' ]
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: [ 'CMD', 'pg_isready' ]
|
test: [ 'CMD', 'pg_isready' ]
|
||||||
interval: 1s
|
interval: 1s
|
||||||
|
24
docker/pgvector/docker-entrypoint.sh
Executable file
24
docker/pgvector/docker-entrypoint.sh
Executable file
@ -0,0 +1,24 @@
|
|||||||
|
#!/bin/bash
#
# Entrypoint wrapper for the pgvector container.
#
# When PG_BIGM is "true", this script builds and installs the pg_bigm
# extension (release PG_BIGM_VERSION) from source, enables it in
# postgresql.conf, and then hands control to the image's original
# entrypoint. When PG_BIGM is anything else it only runs the original
# entrypoint.
#
# Environment:
#   PG_BIGM          "true" to install and enable pg_bigm.
#   PG_BIGM_VERSION  pg_bigm release tag to download (without the leading "v").
#   PGDATA           PostgreSQL data directory; falls back to
#                    /var/lib/postgresql/data/pgdata when unset.

PG_MAJOR=16

if [ "${PG_BIGM}" = "true" ]; then
  # install pg_bigm
  apt-get update
  apt-get install -y curl make gcc "postgresql-server-dev-${PG_MAJOR}"

  curl -LO "https://github.com/pgbigm/pg_bigm/archive/refs/tags/v${PG_BIGM_VERSION}.tar.gz"
  tar xf "v${PG_BIGM_VERSION}.tar.gz"
  cd "pg_bigm-${PG_BIGM_VERSION}" || exit 1
  make USE_PGXS=1 PG_CONFIG=/usr/bin/pg_config
  make USE_PGXS=1 PG_CONFIG=/usr/bin/pg_config install

  cd - || exit 1
  rm -rf "v${PG_BIGM_VERSION}.tar.gz" "pg_bigm-${PG_BIGM_VERSION}"

  # enable pg_bigm
  # Follow PGDATA instead of a fixed path so a customised data directory
  # is honoured; the fallback is the path this script used previously.
  PG_CONF="${PGDATA:-/var/lib/postgresql/data/pgdata}/postgresql.conf"
  if [ -f "${PG_CONF}" ]; then
    # Only a commented-out shared_preload_libraries line is rewritten, so
    # running this on every container start is harmless.
    sed -i -e 's/^#\s*shared_preload_libraries.*/shared_preload_libraries = '\''pg_bigm'\''/' "${PG_CONF}"
  else
    # The config file is absent until the data directory has been
    # initialised; say so instead of letting sed fail with an opaque error.
    echo "pg_bigm: ${PG_CONF} not found; shared_preload_libraries not updated (restart the container after the data directory has been initialised)" >&2
  fi
fi

# Run the original entrypoint script
exec /usr/local/bin/docker-entrypoint.sh postgres
|
Loading…
x
Reference in New Issue
Block a user