Access to index without SQL++ query

Hello all,
As far as I know the only way to use an index in a Couchbase bucket is via some SELECT SQL++ query for which the engine determines that index can be used to execute that query.

I’m wondering why Couchbase does not provide “direct” access to index with basic operations such as find some value in index, should be O(log n) as index are sorted (from my understanding) and iterate in the index, so get next/prev value(s) by ascending order (with a limit value). A bit like a basic cursor or iterator like in C++ std::map or std::set.

Not saying SQL++ SELECT has no use of course but IMO there are simple cases where one knows which index to use to perform searches in a bucket, index being created on that purpose.

Thanks
Best

SELECT does access the index directly. Checkout the at the query plan.

select * from travel-sample.inventory.airport where airportname = ‘Albany Intl’

{
  "#operator": "Authorize",
  "#planPreparedTime": "2025-05-08T10:41:02.914-07:00",
  "#stats": {
    "#phaseSwitches": 4,
    "execTime": "743ns",
    "servTime": "2.054µs"
  },
  "privileges": {
    "List": [
      {
        "Target": "default:travel-sample.inventory.airport",
        "Priv": 7,
        "Props": 0
      }
    ]
  },
  "~child": {
    "#operator": "Sequence",
    "#stats": {
      "#phaseSwitches": 2,
      "execTime": "1.75µs"
    },
    "~children": [
      {
        "#operator": "IndexScan3",
        "#stats": {
          "#itemsIn": 1,
          "#itemsOut": 1,
          "#phaseSwitches": 7,
          "execTime": "14.616µs",
          "kernTime": "1.656µs",
          "servTime": "2.025003ms"
        },
        "bucket": "travel-sample",
        "index": "def_inventory_airport_airportname",    <------------------------ using this index
        "index_id": "8aa68a427e5ef6fb",
        "index_projection": {
          "primary_key": true
        },
        "keyspace": "airport",
        "namespace": "default",
        "optimizer_estimates": {
          "cardinality": 1,
          "cost": 12.180133283987164,
          "fr_cost": 12.180133283987164,
          "size": 12
        },
        "scope": "inventory",
        "spans": [
          {
            "exact": true,           <-----------------------------------  exact match
            "range": [
              {
                "high": "\"Albany Intl\"",
                "inclusion": 3,
                "index_key": "`airportname`",
                "low": "\"Albany Intl\""
              }
            ]
          }
        ],
        "using": "gsi", 
        "#time_normal": "00:00.002",
        "#time_absolute": 0.002039619
      },
      {
        "#operator": "Fetch",
        "#stats": {
          "#itemsIn": 1,                       <--------------------- one item
          "#itemsOut": 1,
          "#phaseSwitches": 10,
          "execTime": "43.023µs",
          "kernTime": "2.054953ms",
          "servTime": "314.389µs"
        },
        "bucket": "travel-sample",
        "keyspace": "airport",
        "namespace": "default",
        "optimizer_estimates": {
          "cardinality": 1,
          "cost": 24.967534071388343,
          "fr_cost": 24.967534071388343,
          "size": 248
        },
        "scope": "inventory",
        "#time_normal": "00:00.000",
        "#time_absolute": 0.000357412
      },
      {
        "#operator": "Sequence",
        "#stats": {
          "#phaseSwitches": 1,
          "execTime": "2.486µs"
        },
        "~children": [
          {
            "#operator": "Filter",
            "#stats": {
              "#itemsIn": 1,
              "#itemsOut": 1,
              "#phaseSwitches": 8,
              "execTime": "15.897µs",
              "kernTime": "2.428284ms"
            },
            "condition": "((`airport`.`airportname`) = \"Albany Intl\")",
            "optimizer_estimates": {
              "cardinality": 1,
              "cost": 24.983282087136367,
              "fr_cost": 24.983282087136367,
              "size": 248
            },
            "#time_normal": "00:00.000",
            "#time_absolute": 0.000015897
          },
          {
            "#operator": "InitialProject",
            "#stats": {
              "#itemsIn": 1,
              "#itemsOut": 1,
              "#phaseSwitches": 7,
              "execTime": "4.474µs",
              "kernTime": "2.452348ms"
            },
            "discard_original": true,
            "optimizer_estimates": {
              "cardinality": 1,
              "cost": 24.99903010288439,
              "fr_cost": 24.99903010288439,
              "size": 248
            },
            "preserve_order": true,
            "result_terms": [
              {
                "expr": "self",
                "star": true
              }
            ],
            "#time_normal": "00:00.000",
            "#time_absolute": 0.000004474
          }
        ],
        "#time_normal": "00:00.000",
        "#time_absolute": 0.000002486
      },
      {
        "#operator": "Stream",
        "#stats": {
          "#itemsIn": 1,
          "#itemsOut": 1,
          "#phaseSwitches": 3,
          "execTime": "45.881µs"
        },
        "optimizer_estimates": {
          "cardinality": 1,
          "cost": 24.99903010288439,
          "fr_cost": 24.99903010288439,
          "size": 248
        },
        "serializable": true,
        "#time_normal": "00:00.000",
        "#time_absolute": 0.000045881
      }
    ],
    "#time_normal": "00:00.000",
    "#time_absolute": 0.00000175
  },
  "~versions": [
    "7.6.2-N1QL",
    "7.6.2-3721-enterprise"
  ],
  "#time_normal": "00:00.000",
  "#time_absolute": 0.0000020539999999999997

select * from travel-sample.inventory.airport where airportname >= ‘Albany Intl’ and airportname <= ‘Bangor Intl’

{
  "#operator": "Authorize",
  "#planPreparedTime": "2025-05-08T10:47:20.423-07:00",
  "#stats": {
    "#phaseSwitches": 4,
    "execTime": "1.066µs",
    "servTime": "3.848µs"
  },
  "privileges": {
    "List": [
      {
        "Target": "default:travel-sample.inventory.airport",
        "Priv": 7,
        "Props": 0
      }
    ]
  },
  "~child": {
    "#operator": "Sequence",
    "#stats": {
      "#phaseSwitches": 2,
      "execTime": "1.865µs"
    },
    "~children": [
      {
        "#operator": "IndexScan3",
        "#stats": {
          "#heartbeatYields": 1,
          "#itemsIn": 94,
          "#itemsOut": 94,
          "#phaseSwitches": 379,
          "execTime": "46.95µs",
          "kernTime": "45.757µs",
          "servTime": "1.170633ms"
        },
        "bucket": "travel-sample",
        "index": "def_inventory_airport_airportname",    <---------------------- index
        "index_id": "8aa68a427e5ef6fb",
        "index_projection": {
          "primary_key": true
        },
        "keyspace": "airport",
        "namespace": "default",
        "optimizer_estimates": {
          "cardinality": 81.99999999999993,
          "cost": 26.77092928694737,
          "fr_cost": 12.180133283987164,
          "size": 12
        },
        "scope": "inventory",
        "spans": [
          {
            "exact": true,
            "range": [
              {
                "high": "\"Bangor Intl\"",
                "inclusion": 3,
                "index_key": "`airportname`",
                "low": "\"Albany Intl\""
              }
            ]
          }
        ],
        "using": "gsi",
        "#time_normal": "00:00.001",
        "#time_absolute": 0.0012175830000000002
      },
      {
        "#operator": "Fetch",
        "#stats": {
          "#heartbeatYields": 2,
          "#itemsIn": 94,
          "#itemsOut": 94,
          "#phaseSwitches": 386,
          "execTime": "244.364µs",
          "kernTime": "1.308167ms",
          "servTime": "4.414366ms"
        },
        "bucket": "travel-sample",
        "keyspace": "airport",
        "namespace": "default",
        "optimizer_estimates": {
          "cardinality": 81.99999999999993,
          "cost": 103.33779385384418,
          "fr_cost": 24.967534071388343,
          "size": 248
        },
        "scope": "inventory",
        "#time_normal": "00:00.004",
        "#time_absolute": 0.004658730000000001
      },
      {
        "#operator": "Sequence",
        "#stats": {
          "#phaseSwitches": 1,
          "execTime": "7.784µs"
        },
        "~children": [
          {
            "#operator": "Filter",
            "#stats": {
              "#heartbeatYields": 2,
              "#itemsIn": 94,
              "#itemsOut": 94,
              "#phaseSwitches": 380,
              "execTime": "319.771µs",
              "kernTime": "5.751345ms"
            },
            "condition": "((\"Albany Intl\" <= (`airport`.`airportname`)) and ((`airport`.`airportname`) <= \"Bangor Intl\"))",
            "optimizer_estimates": {
              "cardinality": 81.99999999999993,
              "cost": 104.62913114518211,
              "fr_cost": 24.983282087136367,
              "size": 248
            },
            "#time_normal": "00:00.000",
            "#time_absolute": 0.000319771
          },
          {
            "#operator": "InitialProject",
            "#stats": {
              "#itemsIn": 94,
              "#itemsOut": 94,
              "#phaseSwitches": 286,
              "execTime": "35.619µs",
              "kernTime": "5.54178ms"
            },
            "discard_original": true,
            "optimizer_estimates": {
              "cardinality": 81.99999999999993,
              "cost": 105.92046843652005,
              "fr_cost": 24.99903010288439,
              "size": 248
            },
            "preserve_order": true,
            "result_terms": [
              {
                "expr": "self",
                "star": true
              }
            ],
            "#time_normal": "00:00.000",
            "#time_absolute": 0.000035619
          }
        ],
        "#time_normal": "00:00.000",
        "#time_absolute": 0.000007784
      },
      {
        "#operator": "Stream",
        "#stats": {
          "#itemsIn": 94,
          "#itemsOut": 94,
          "#phaseSwitches": 96,
          "execTime": "989.888µs"
        },
        "optimizer_estimates": {
          "cardinality": 81.99999999999993,
          "cost": 105.92046843652005,
          "fr_cost": 24.99903010288439,
          "size": 248
        },
        "serializable": true,
        "#time_normal": "00:00.000",
        "#time_absolute": 0.000989888
      }
    ],
    "#time_normal": "00:00.000",
    "#time_absolute": 0.000001865
  },
  "~versions": [
    "7.6.2-N1QL",
    "7.6.2-3721-enterprise"
  ],
  "#time_normal": "00:00.000",
  "#time_absolute": 0.000004914000000000001
}

Hi, thanks for quick reply. Yes I understand, what I would like is to avoid writing any SQL++ query at all, just access the index itself and perform looks up directly. No need for Couchbase server optimizer to compute any plan as I know already the index I want to use.
Anyway I think this will remain a dream :slight_smile:

what I would like is to avoid writing any SQL++ query

You would still need to indicate what index you want accessed and how you want to access it (ie. exact match, less than, greater than etc, or IN a list). And any other filtering. And what you want projected. And any transformations (toUpperCase etc). And then you would want to use the data from that document to retrieve another document (join). Then you would need a flexible format for the result - such that it could have any number of rows, and any number of columns and it could be streamed. Fifty years of refining those requirements has resulted in SQL++.

If you want to use the KV API but leverage an index, you can retrieve meta().id using SQL++ and then use that in the kv api to retrieve the document. This pattern is actually the fastest way query and get data from Couchbase.

Yes I don’t say SQL++ is useless, it has plenty of features for sure, but I have only simple searches in mind, exact match, ranges (less than, greater than), projection would be just index keys no more, no transformations, no join. Re format for the result, same as today, JSON array with rows and columns.
Typically using an index roughly as one uses a std::map C++ container, in particular calling lower_bound, upper_found or find (for exact match) and iterate with an iterator/cursor.
Is KV API same as functions using lcb_CMDGET (we use libcouchbase), or something different please ?
Thanks

The kv api us get, insert, replace, remove etc. And rangescan. All the calls work only on exact match of the document id except for rangescan - which can do ranges of the document id. Rangescan requires executing the request against every vbucket (1024 of them) and is resource intensive. The kv api exists because it is many times more efficient than the query api because the requests are made directly to the node where the document resides versus SQL++ which needs to fetch the documents from the nodes where they reside before returning them to the client. There is no such possibility for gsi indexes as the location of the data cannot be determined from the gsi key (but it can be determined from the document key).

What is the motivation for an index api? If it is ease-of-use for the application, a wrapper for the query api (SQL++) can provide any api for the application to call. If it is to avoid parsing the SQL++ on every call, the option adhoc=false does that.

if you looking command line cbindex --help

ex:

cbindex -auth user:pass  -type=scan -bucket b1 -index idx1 -high="[1]" -low="[1]" -incl 3

composite

cbindex -auth user:pass  -type=scan -bucket b1 -index idx1 -high="[1, 5]" -low="[1,5]" -incl 3

best recommended and supported way is SQL++ covered query with use index hint

1 Like

Thanks for both replies. We use kv api get function after the index results to fetch each document and very happy with it. It’s fast and convenient. As you suggested, all these calls are wrapped in specific code the rest of application calls.

Motivation for an index API is mainly rooted on the fact we already know which index to use for a given query case, because we created that index precisely in such a way it’s optimal for that query. So why spend time running a query parser, planner etc… even once to tell us the index to use?

Another thing is that we might have issues using composite indexes with ranges on multiple keys. Not much idea what the problem is and not obvious from query plan. We suspect the index is scanned linearly to fetch matching rows but no much evidence that’s the case.
selects whose where pattern is as following (example with 2 keys, but we might have more).

select ... where [k1,k2]<=[$1,$2] and [k1,k2]>=[$3,$4] and k1 is not missing and k2 is not missing limit $5

Thanks, have a good day
Best

issues using composite indexes with ranges on multiple keys

Okay. Now this is starting to make sense.

where [k1,k2]<=[$1,$2] and [k1,k2]>=[$3,$4]

the index is scanned linearly

(scanned linearly from the lower bounds to the upper bounds)
That is correct. Because there is no other way to evaluate inequality on parameterized values. Even a specialized API will need to scan the indexes from the minimum parameter to the maximum parameter. @vsr1 can help you get an efficient query.

Also - since that query is a conjunction of two predicates with different parameters , it cannot be specified with a range api. But it can be specified with SQL++

In your case use prepare statement with USE INDEX and you will be good parse/plan etc is one time few ms. Instead you reinvent wheel and maintain by your self.
Things get complex when you composite you have non-equal predicates composite

where [k1,k2]>= [10,“p”] and [k1,k2] < [20, “x”]

This not same as k1 BETWEEN 10 AND 20 AND k2 BETWEEN “p” AND “x”

Array comparison done first element if duplicate then goes next element

If document has k1 = 12 and k2 “a”
With array comparison returns true vs other scalar comparison false

As far indexer concern it follows similar concept as btree.
i.e it may use start/stop positions after that it will apply individual key parts.

I think there is a misunderstanding on “the index is scanned linearly”. @jo1 means that the select he gave seems to make a full index scan. “Linear” in the sense of O(N). Is it the case?

He expects the select to take a lexicographically-sorted slice of the index (hence the comparison on whole JSON vectors), he understands that an interval on k1 and an interval on k2 is different, but that’s not what he wants. If the select he gave does not do a full index scan, he is happy.

Incidentally, Couchbase’s documentation specifically says that from version 6.5 statements should not be prepared: “With Couchbase Server 6.5 and newer, if a statement has placeholders, and a placeholder is supplied, the Query Service will generate specially optimized plans. Therefore, if you are supplying the placeholder each time, adhoc = true will actually return a better-optimized plan (at the price of generating a fresh plan for each query).” You seem to say that prepared is better?

Source: Query | Couchbase Docs

I don’t think it says “should”. It says that not prepared will generate a specially optimized query plan. (I think it should say “may” instead of “will”). Which is correct. So the application can choose between a (possibly) better query plan versus parsing and creating a query plan on every execution. It says that in the next sentence in the documentation.

Therefore, if you are supplying the placeholder each time, adhoc = true will actually return a better-optimized plan (at the price of generating a fresh plan for each query).

Prepared: Thanks! We have that bit sorted. :slight_smile:

I understand from your reply that we need to make some arbitrage between 2 costs: on one side: parsing, generating new better plan for each query, on other side: generating not so good plan but only once. This is just terrible, disappointing.

That’s precisely because of such concerns I’d prefer to drive index scans directly without all this complication.

It should not be disappointing, it should be expected. Everything is a trade-off. space vs. time, time vs. money, accuracy vs speed.

generating new better plan for each query, on other side: generating not so good plan but only once

This is a property of searching. It’s much easier to locate what you are searching for if you know what you are searching for. Without knowing what you are searching for the plan has to accommodate searching for anything. Consider WHERE Name LIKE $name. Without knowing if name is “%mith”, “S%” or “Smith”, event with an ordered index, it’s necessary to consider every key in the index for Name to handle the case ‘%mith’. A full index scan. But if you know that $name is “S%”, you can start scanning where Name starts with ‘S’ and stop scanning where Name starts with ‘T’ - an index scan only from ‘S’ to ‘T’. And if you know that $name is “Smith” - an index lookup will suffice.

I’d prefer to drive index scans directly

USE INDEX does that.

Most cases (99% ) prepare statements good no further optimization needed and lot of those improved over the time. In your case simple should good.

Some examples: Like one mentioned @mreiche some index aggregations on IN clause,

The prepare bit is just the side question that arose, how about the main question on the cost to execute the lexicographic select? Is it O(N) or O(log(N))? Does it do a full index scan or does it use the values in the JSON vector bounds?

Does it do a full index scan or does it use the values in the JSON vector bounds?

What does the query plan show?

If I use this (equivalent) query on the travel-sample, and create the recommended index on [ airportname, city]

select airport.* from travel-sample.inventory.airport where [ airportname, city] >= ['Albany Intl', 'Albany' ] and [ airportname, city] <= ['Alice Intl' ,'Alice' ]

The resulting query processes only the 9 matching documents. Even without specifying USE INDEX. (there are 1968 airport documents in the colllection).

And look at the range used in the query - it’s exactly what your “index API” would use.

"~children": [
      {
        "#operator": "IndexScan3",
        "#stats": {
          "#itemsIn": 9,
          "#itemsOut": 9,
          "#phaseSwitches": 39,
          "execTime": "24.127µs",
          "kernTime": "2.825µs",
          "servTime": "1.239574ms"
        },
        "bucket": "travel-sample",
        "index": "adv_airportnamecity",
        "index_id": "b24708afc103be3f",
        "index_projection": {
          "primary_key": true
        },
        "keyspace": "airport",
        "namespace": "default",
        "optimizer_estimates": {
          "cardinality": 1.000000000000183,
          "cost": 12.181079088630932,
          "fr_cost": 12.1810790886309,
          "size": 12
        },
        "scope": "inventory",
        "spans": [
          {
            "exact": true,
            "range": [
              {
                "high": "[\"Alice Intl\", \"Alice\"]",
                "inclusion": 3,
                "index_key": "[`airportname`, `city`]",
                "low": "[\"Albany Intl\", \"Albany\"]"
              }
            ]
          }
        ],

I sort of btree scan. Specific details vary but some thing like this
Plan shows span.
It takes low value position there and keep scan until high value

https://www.couchbase.com/blog/wp-content/uploads/2017/10/N1QL-A-Practical-Guide-2nd-Edition.pdf Page 135

Hi Michael, thanks for the example.
I have also the travel-sample bucket locally but I don’t have the index you use in your example. What’s its definition please?

Thanks