# --- Original git patch metadata (preserved as comments) -------------------
# From 584d04bcef2b41db8f6442119da5bf9c25549308 Mon Sep 17 00:00:00 2001
# From: Buqian Zheng
# Date: Wed, 20 Mar 2024 18:15:00 +0800
# Subject: [PATCH] Added a simple example of performing sparse dense hybrid
#   search (#1990)
# Signed-off-by: Buqian Zheng
# File: examples/hello_hybrid_sparse_dense.py (new file)
# ---------------------------------------------------------------------------

# A demo showing hybrid semantic search with dense and sparse vectors using Milvus.
# You can optionally choose to use the BGE-M3 model to embed the text as dense
# and sparse vectors, or simply use randomly generated vectors as the example.

# To use the BGE-M3 model, you need to install the optional `model` module in
# pymilvus:
#   pip install pymilvus[model]
use_bge_m3 = True

# The overall steps are as follows:
# 1. embed the text as dense and sparse vectors
# 2. set up a Milvus collection to store the dense and sparse vectors
# 3. insert the data into Milvus
# 4. search and inspect the result!
import random
import string

import numpy as np

from pymilvus import (
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection, AnnSearchRequest, RRFRanker, connections,
)

# 1. prepare a small corpus to search
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]
# Pad the corpus with 1000 randomly generated texts so the search has a larger
# haystack: each text is 10 "words" of 1-8 random lowercase letters.
docs.extend(
    ' '.join(
        ''.join(random.choice(string.ascii_lowercase) for _ in range(random.randint(1, 8)))
        for _ in range(10)
    )
    for _ in range(1000)
)
query = "Who started AI research?"
def random_embedding(texts):
    """Return mock dense and sparse embeddings for *texts*.

    Stands in for a real embedding model so the demo can run without
    downloading BGE-M3. For each text:
    - dense: a random 768-dim float vector,
    - sparse: a dict of 20-30 random dimension-index -> weight pairs drawn
      from a 1000-dim space.
    """
    rng = np.random.default_rng()
    return {
        # Fix: use the same Generator (`rng`) for the dense matrix instead of
        # the legacy global-state np.random.rand, for consistency.
        "dense": rng.random((len(texts), 768)),
        "sparse": [
            {d: rng.random() for d in random.sample(range(1000), random.randint(20, 30))}
            for _ in texts
        ],
    }

dense_dim = 768
ef = random_embedding

if use_bge_m3:
    # The BGE-M3 model can embed texts as both dense and sparse vectors.
    # It is included in the optional `model` module in pymilvus; to install it,
    # simply run "pip install pymilvus[model]".
    from pymilvus.model.hybrid import BGEM3EmbeddingFunction
    ef = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")
    dense_dim = ef.dim["dense"]

docs_embeddings = ef(docs)
query_embeddings = ef([query])

# 2. set up the Milvus collection and indexes
connections.connect("default", host="localhost", port="19530")

# Specify the data schema for the new collection.
fields = [
    # Use an auto-generated id as the primary key
    FieldSchema(name="pk", dtype=DataType.VARCHAR,
                is_primary=True, auto_id=True, max_length=100),
    # Store the original text so it can be returned alongside search hits
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=512),
    # Milvus supports both sparse and dense vectors; store each in a separate
    # field so a hybrid search can run over both.
    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR,
                dim=dense_dim),
]
schema = CollectionSchema(fields, "")
col_name = 'hybrid_demo'
# Now create the new collection with the above name and schema.
col = Collection(col_name, schema, consistency_level="Strong")

# We need to create indices for the vector fields. The indices will be loaded
# into memory for efficient search.
# Build one index per vector field; both are loaded into memory on col.load().
sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
dense_index = {"index_type": "FLAT", "metric_type": "L2"}
col.create_index("sparse_vector", sparse_index)
col.create_index("dense_vector", dense_index)
col.load()

# 3. insert the text together with its sparse/dense vector representations
col.insert([docs, docs_embeddings["sparse"], docs_embeddings["dense"]])
col.flush()

# 4. run the hybrid search and inspect the result!
top_k = 2  # number of docs closest to the query that we want back

# One ANN request per vector field, each with the metric its index was built with.
sparse_req = AnnSearchRequest(
    query_embeddings["sparse"], "sparse_vector",
    {"metric_type": "IP"}, limit=top_k,
)
dense_req = AnnSearchRequest(
    query_embeddings["dense"], "dense_vector",
    {"metric_type": "L2"}, limit=top_k,
)

# Fuse the per-field top-K results with Reciprocal Rank Fusion.
res = col.hybrid_search(
    [sparse_req, dense_req], rerank=RRFRanker(),
    limit=top_k, output_fields=['text'],
)

# Milvus currently accepts a single query per hybrid search request, so the
# answer set is res[0]; batched hybrid queries will arrive in a future release.
for hit in res[0]:
    print(f'text: {hit.fields["text"]} distance {hit.distance}')

# If you are using BGE-M3 to generate the embedding, you should see the following:
# text: Alan Turing was the first person to conduct substantial research in AI. distance 0.032786883413791656
# text: Artificial intelligence was founded as an academic discipline in 1956. distance 0.016129031777381897

# Drop the collection to clean up the data.
utility.drop_collection(col_name)