Vector Search with the Python Couchbase SDK raises couchbase.exceptions.QueryIndexNotFoundException

New update

As you could see in my last example, we were able to do the vector search in Couchbase, but we had not retrieved the rest of the fields of the document. Those fields are necessary to obtain, for example, the chunks from the vector store retriever, so that we can feed those chunks to the LLM.

In this update of the example I want to show how to proceed to obtain the fields that we require from the documents that we obtain as a result of the vector search

from datetime import timedelta

import couchbase.search as search
from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.exceptions import CouchbaseException
from couchbase.options import (
    ClusterOptions,
    ClusterTimeoutOptions,
    QueryOptions,
    SearchOptions,
)
from couchbase.vector_search import VectorQuery, VectorSearch

from langchain_core.documents.base import Document

# Example: run a Couchbase FTS vector search and wrap each hit in a LangChain
# Document so the results can feed an LLM pipeline.

# Credentials belong in the authenticator only — do not duplicate them as
# connection-string query parameters (username/password are not valid
# connection-string options).
authenticator = PasswordAuthenticator('Administrator', 'xxxxxx')

# In Python SDK 4.x, timeouts are configured via ClusterOptions /
# ClusterTimeoutOptions; a bare `timeout=30` kwarg on Cluster() is not a
# supported option.
cluster = Cluster(
    'couchbase://localhost',
    ClusterOptions(
        authenticator,
        timeout_options=ClusterTimeoutOptions(connect_timeout=timedelta(seconds=30)),
    ),
)

# Open the bucket and its default scope; the search index is a scoped index
# (chatui._default.color_index), so the search is issued through the scope.
bucket = cluster.bucket('chatui')
scope = bucket.default_scope()

# 5-dimensional query vector — must match "dims": 5 of the 'colorvect_l2'
# vector field in the index definition.
query_vector = [0.255, 0.239, 0.213, 0.218, 0.197]

search_index = "color_index"

try:
    # MatchNoneQuery supplies no text predicate; the vector search alone
    # selects the candidate rows.
    search_req = search.SearchRequest.create(search.MatchNoneQuery()).with_vector_search(
        VectorSearch.from_vector_query(
            VectorQuery('colorvect_l2', query_vector, num_candidates=10)
        )
    )

    # `fields` asks the index to return these stored fields with each hit;
    # each one must be indexed with "store": true in the index definition,
    # otherwise row.fields will not contain it.
    result = scope.search(
        search_index,
        search_req,
        SearchOptions(limit=10, fields=["id", "color", "brightness", "description"]),
    )

    documents = []
    for row in result.rows():
        document_data = row.fields
        data = {
            "id": document_data.get("id", ""),
            "color": document_data.get("color", ""),
            "brightness": document_data.get("brightness", ""),
            "description": document_data.get("description", "")
        }
        # Build a LangChain Document from the stored fields of each hit.
        documents.append(Document(page_content=document_data.get("id", ""), metadata=data))

    if not documents:
        raise ValueError("No relevant documents found")

    for document in documents:
        print(document.metadata)

    print("Reported total rows: {}".format(result.metadata().metrics().total_rows()))
    search_meta_data = result.metadata()
    print(search_meta_data)

except CouchbaseException:
    # Surface the full stack trace for any SDK-level failure — e.g. a missing
    # index, which manifests as QueryIndexNotFoundException.
    import traceback
    traceback.print_exc()

what is new in this code

we have imported

from langchain_core.documents.base import Document

with the objective of constructing a Langchain Document from the response of the vector search

We have eliminated the N1QL query that previously only served as a guide to verify that the search results were as expected, this time we can ignore it for better clarity of the example

The search code line has been modified so that it includes the “id”, “color”, “brightness”, and “description” fields of the document in the search result

result = scope.search(search_index, search_req, SearchOptions(limit=10, fields=["id", "color", "brightness", "description"]))

now we can extract each field from the search result

     documents = []
     for row in result.rows():
         document_data = row.fields
         data = {
             "id": document_data.get("id", ""),
             "color": document_data.get("color", ""),
             "brightness": document_data.get("brightness", ""),
             "description": document_data.get("description", "")
         }

and thus build a LangChain Document to be able to incorporate it into our LLM

documents.append(Document(page_content=document_data.get("id", ""), metadata=data))

Now, this is not all, because we must modify the color_index so that these fields can be attached to the search result

{
  "type": "fulltext-index",
  "name": "chatui._default.color_index",
  "uuid": "d948b8effe3de3d7",
  "sourceType": "gocbcore",
  "sourceName": "chatui",
  "sourceUUID": "03324e97fac08d21b52d3354c3508270",
  "planParams": {
    "maxPartitionsPerPIndex": 1024,
    "indexPartitions": 1
  },
  "params": {
    "doc_config": {
      "docid_prefix_delim": "",
      "docid_regexp": "",
      "mode": "scope.collection.type_field",
      "type_field": "type"
    },
    "mapping": {
      "analysis": {},
      "default_analyzer": "standard",
      "default_datetime_parser": "dateTimeOptional",
      "default_field": "_all",
      "default_mapping": {
        "dynamic": true,
        "enabled": true,
        "properties": {
          "brightness": {
            "dynamic": false,
            "enabled": true,
            "fields": [
              {
                "index": true,
                "name": "brightness",
                "store": true,
                "type": "number"
              }
            ]
          },
          "color": {
            "dynamic": false,
            "enabled": true,
            "fields": [
              {
                "index": true,
                "name": "color",
                "store": true,
                "type": "text"
              }
            ]
          },
          "colorvect_l2": {
            "dynamic": false,
            "enabled": true,
            "fields": [
              {
                "dims": 5,
                "index": true,
                "name": "colorvect_l2",
                "similarity": "l2_norm",
                "type": "vector",
                "vector_index_optimized_for": "recall"
              }
            ]
          },
          "description": {
            "dynamic": false,
            "enabled": true,
            "fields": [
              {
                "index": true,
                "name": "description",
                "store": true,
                "type": "text"
              }
            ]
          },
          "id": {
            "dynamic": false,
            "enabled": true,
            "fields": [
              {
                "index": true,
                "name": "id",
                "store": true,
                "type": "text"
              }
            ]
          }
        }
      },
      "default_type": "_default",
      "docvalues_dynamic": false,
      "index_dynamic": true,
      "store_dynamic": false,
      "type_field": "_type"
    },
    "store": {
      "indexType": "scorch",
      "segmentVersion": 16
    }
  },
  "sourceParams": {}
}

We must pay special attention to setting the "store" attribute to true, since otherwise the field referenced in the index will not be included in the search response.

With this we obtain a complete response from the vector search with everything necessary to build a langchain_core.documents.base.Document

{'id': '#FFEFD5', 'color': 'papaya whip', 'brightness': 240.82, 'description': 'Papaya whip is a soft and mellow color that can be described as a light shade of peach or coral. It has a calming and soothing effect, similar to the tropical fruit it is named after. This color is perfect for creating a warm and inviting atmosphere, and it pairs well with other pastel shades or neutral tones. Papaya whip is a versatile color that can be used in both fashion and interior design, adding a touch of elegance and sophistication to any space.'}
{'id': '#33C4FF', 'color': 'blue sky', 'brightness': 240.82, 'description': 'color similar to the blue sky in a sunny day.'}
Reported total rows: 2
SearchMetaData:{'client_context_id': '907f46-f7b8-594d-c6f1-5808906dbeebfe', 'metrics': {'took': 818764, 'total_rows': 2, 'max_score': 4.759954384762136e-06, 'success_partition_count': 1, 'error_partition_count': 0}, 'errors': {}}

Thanks for your attention. I hope I haven’t been too verbose, but I think this can be useful to many people — I would have liked to find a complete example of Couchbase vector search like this myself.

Cheers :tropical_drink:

Glad that you found the solution. Just curious as to why you don’t use the LangChain Vector Store integration for Couchbase (https://python.langchain.com/docs/integrations/vectorstores/couchbase/)
That should get you back LangChain documents for vector search. The documentation also touches on some of the points you mentioned in this thread.

1 Like

Hi @nithishr.

Yes, that’s correct, LangChain provides a Couchbase implementation for vectorstore, but I wanted to be sure I understand internally what it does. The CouchbaseVectorStore class from langchain_community.vectorstores does exactly the same thing I’m proposing here, which is to implement search with the Couchbase SDK. My problems with this came mainly with the treatment of the index, and this would have happened to me equally with langchain_community’s couchbase.py

 search_req = search.SearchRequest.create(
             VectorSearch.from_vector_query(
                 VectorQuery(
                     self._embedding_key,
                     embedding,
                     k,
                 )
             )
         )
         try:
             if self._scoped_index:
                 search_iter = self._scope.search(
                     self._index_name,
                     search_req,
                     SearchOptions(
                         limit=k,
                         fields=fields,
                         raw=search_options,
                     ),
                 )

             else:
                 search_iter = self._cluster.search(
                     index=self._index_name,
                     request=search_req,
                     options=SearchOptions(limit=k, fields=fields, raw=search_options),
                 )

             docs_with_score = []

             # Parse the results
             for row in search_iter.rows():
                 text = row.fields.pop(self._text_key, "")

                 # Format the metadata from Couchbase
                 metadata = self._format_metadata(row.fields)

                 score = row.score
                 doc = Document(page_content=text, metadata=metadata)
                 docs_with_score.append((doc, score))

The important thing about this thread is that we understand how vector search works in Couchbase, with the Couchbase SDK itself.

Thanks for the advice

This topic was automatically closed 90 days after the last reply. New replies are no longer allowed.