Covering array index with arbitrary predicates

Hi there,

I have documents with nested arrays (array of legs contained in array of segments):

{
  "type": "test",
  "id": "CUZ-LIM",
  "segments": [{
    "legs": [
      {
        "flightCode": {
          "carrierCode": "LA",
          "flightNumber": 2062
        },
        "arrival": "2018-10-08T19:43:00.000Z",
        "arrivalStation": "LIM",
        "departure": "2018-10-08T18:11:00.000Z",
        "departureStation": "CUZ",
        "key": "LA2062 CUZ#20181008T1811~LIM#20181008T1943"
      }
    ]
  }]
}

Note this is a simplified version of the actual documents, which are considerably bigger.

I have been trying to create an index and query that both are covered in order to provide decent performance, since my bucket has several hundreds of thousands of documents and fetches to the data service would make the query very slow.

The query must allow to filter by several properties of the leg arbitrarily, i.e. that I may filter by departureStation only, departureStation and arrivalStation together, and so forth depending on the criteria given by the user to create the query.

I have finally succeeded with an index like this:

CREATE INDEX `nested-array-1-prop-idx` ON `index-test`(
  type,
  ALL ARRAY
  (
    ALL ARRAY leg.departureStation FOR leg IN segment.legs END
  )
  FOR segment IN segments END
)
WHERE (`type` = 'test')

And some examples of queries that cover the index:

SELECT meta(`index-test`).id
FROM `index-test`
WHERE type = 'test'
AND ANY segment IN segments SATISFIES
  (ANY leg IN segment.legs SATISFIES leg.departureStation = 'CUZ' END)
END

{
  "plan": {
    "#operator": "Sequence",
    "~children": [
      {
        "#operator": "DistinctScan",
        "scan": {
          "#operator": "IndexScan2",
          "covers": [
            "cover ((`index-test`.`type`))",
            "cover ((all (array (all (array (`leg`.`departureStation`) for `leg` in (`segment`.`legs`) end)) for `segment` in (`index-test`.`segments`) end)))",
            "cover ((meta(`index-test`).`id`))"
          ],
          "filter_covers": {
            "cover ((`index-test`.`type`))": "test",
            "cover (any `segment` in (`index-test`.`segments`) satisfies any `leg` in (`segment`.`legs`) satisfies ((`leg`.`departureStation`) = \"CUZ\") end end)": true
          },
          "index": "nested-array-1-prop-idx",
          "index_id": "2ad8c4a634728f0c",
          "index_projection": {
            "entry_keys": [
              0
            ],
            "primary_key": true
          },
          "keyspace": "index-test",
          "namespace": "default",
          "spans": [
            {
              "exact": true,
              "range": [
                {
                  "high": "\"test\"",
                  "inclusion": 3,
                  "low": "\"test\""
                },
                {
                  "high": "\"CUZ\"",
                  "inclusion": 3,
                  "low": "\"CUZ\""
                }
              ]
            }
          ],
          "using": "gsi"
        }
      },
      {
        "#operator": "Parallel",
        "~child": {
          "#operator": "Sequence",
          "~children": [
            {
              "#operator": "Filter",
              "condition": "((cover ((`index-test`.`type`)) = \"test\") and cover (any `segment` in (`index-test`.`segments`) satisfies any `leg` in (`segment`.`legs`) satisfies ((`leg`.`departureStation`) = \"CUZ\") end end))"
            },
            {
              "#operator": "InitialProject",
              "result_terms": [
                {
                  "expr": "cover ((meta(`index-test`).`id`))"
                }
              ]
            },
            {
              "#operator": "FinalProject"
            }
          ]
        }
      }
    ]
  },
  "text": "SELECT meta(`index-test`).id\nFROM `index-test`\nWHERE type = 'test'\nAND ANY segment IN segments SATISFIES\n  (ANY leg IN segment.legs SATISFIES leg.departureStation = 'CUZ' END)\nEND"
}

Or:

SELECT meta(`index-test`).id
FROM `index-test`
WHERE type = 'test'
AND ANY segment IN segments SATISFIES
  ANY leg IN segment.legs
    SATISFIES leg.departureStation IS NOT MISSING
    AND leg.arrivalStation = 'LIM'
    AND flightCode.carrierCode = 'LA'
    AND flightCode.flightNumber = 2062
  END
END

{
  "plan": {
    "#operator": "Sequence",
    "~children": [
      {
        "#operator": "DistinctScan",
        "scan": {
          "#operator": "IndexScan2",
          "covers": [
            "cover ((`index-test`.`type`))",
            "cover ((all (array (all (array (`leg`.`departureStation`) for `leg` in (`segment`.`legs`) end)) for `segment` in (`index-test`.`segments`) end)))",
            "cover ((meta(`index-test`).`id`))"
          ],
          "filter_covers": {
            "cover ((`index-test`.`type`))": "test",
            "cover (any `segment` in (`index-test`.`segments`) satisfies any `leg` in (`segment`.`legs`) satisfies (((((`leg`.`departureStation`) is not missing) and ((`leg`.`arrivalStation`) = \"LIM\")) and (((`index-test`.`flightCode`).`carrierCode`) = \"LA\")) and (((`index-test`.`flightCode`).`flightNumber`) = 2062)) end end)": true,
            "cover (any `segment` in (`index-test`.`segments`) satisfies any `leg` in (`segment`.`legs`) satisfies (((`leg`.`departureStation`) is not missing) and ((`leg`.`arrivalStation`) = \"LIM\") and (((`index-test`.`flightCode`).`carrierCode`) = \"LA\") and (((`index-test`.`flightCode`).`flightNumber`) = 2062)) end end)": true
          },
          "index": "nested-array-1-prop-idx",
          "index_id": "2ad8c4a634728f0c",
          "index_projection": {
            "entry_keys": [
              0
            ],
            "primary_key": true
          },
          "keyspace": "index-test",
          "namespace": "default",
          "spans": [
            {
              "exact": true,
              "range": [
                {
                  "high": "\"test\"",
                  "inclusion": 3,
                  "low": "\"test\""
                },
                {
                  "inclusion": 1,
                  "low": "null"
                }
              ]
            }
          ],
          "using": "gsi"
        }
      },
      {
        "#operator": "Parallel",
        "~child": {
          "#operator": "Sequence",
          "~children": [
            {
              "#operator": "Filter",
              "condition": "((cover ((`index-test`.`type`)) = \"test\") and cover (any `segment` in (`index-test`.`segments`) satisfies any `leg` in (`segment`.`legs`) satisfies (((((`leg`.`departureStation`) is not missing) and ((`leg`.`arrivalStation`) = \"LIM\")) and (((`index-test`.`flightCode`).`carrierCode`) = \"LA\")) and (((`index-test`.`flightCode`).`flightNumber`) = 2062)) end end))"
            },
            {
              "#operator": "InitialProject",
              "result_terms": [
                {
                  "expr": "cover ((meta(`index-test`).`id`))"
                }
              ]
            },
            {
              "#operator": "FinalProject"
            }
          ]
        }
      }
    ]
  },
  "text": "SELECT meta(`index-test`).id\nFROM `index-test`\nWHERE type = 'test'\nAND ANY segment IN segments SATISFIES\n  ANY leg IN segment.legs\n    SATISFIES leg.departureStation IS NOT MISSING\n    AND leg.arrivalStation = 'LIM'\n    AND flightCode.carrierCode = 'LA'\n    AND flightCode.flightNumber = 2062\n  END\nEND"
}

Note that in my index definition I’m only indexing leg.departureStation and the query is covered as long as leg.departureStation is provided in the query, with a value or with IS NOT MISSING.

I’m wondering if there is a clearer or more intuitive way of achieving the same goal.

I’m using Couchbase 5.0.

Cheers.

The best option you have is use FTS service

https://blog.couchbase.com/search-and-rescue-7-reasons-for-n1ql-sql-developers-to-use-search/
Use 2.1 https://blog.couchbase.com/n1ql-and-search-how-to-leverage-fts-index-in-n1ql-query/

or directly use FTS without N1QL

2.2, 2.3 will be available in next release 6.5.0

Hi @vsr1,

Many thanks for taking the time to reply.

After reading both articles I’m not sure of how I could benefit of FTS, I don’t have any previous experience with Couchbase’s FTS (nor other FTS engines such as ElasticSearch) and I’d need to deep dive into it for a better understanding :slight_smile:

Even though FTS solved the problem, I’m still interested in the array index approach in order to create a covered array index and query where predicates on the array’s objects properties can be applied arbitrarily.

I know the following is possible, as explained in the docs:

CREATE INDEX idx_flight_day ON `travel-sample`
    ( DISTINCT ARRAY [v.flight, v.day] FOR v IN schedule END)
WHERE type = "route" ;

SELECT meta().id FROM `travel-sample`
WHERE type = "route"
AND ANY v in schedule SATISFIES [v.flight, v.day] = ["US681", 2] END;

Or this:

CREATE INDEX idx_sched_simple
ON `travel-sample` (ALL schedule)
WHERE type = "route";

SELECT * FROM `travel-sample`
WHERE type = "route"
AND ANY v IN schedule
SATISFIES v = {"day":2, "flight": "US681", "utc": "19:20:00"} END;

But it seems that in both cases it’s required to provide predicates for all the indexed properties, in order for the query to be covered. Am I right in my statement?

Additionally, I’d like to know if my approach as explained in my original post is correct.

Cheers.

PS: forgot to mention in my original post that I’m using Couchbase 5.0

I have just realised that even though the EXPLAIN shows that it covers the index, the results of the query are not correct.

I have tried another approach, consisting of creating as many indexes as properties in the array I need to filter:

CREATE INDEX `type-idx` ON `index-test`(
  type
)
WHERE (`type` = 'test')

CREATE INDEX `leg-departure-station-idx` ON `index-test`(
  ALL ARRAY
  (
    ALL ARRAY leg.departureStation FOR leg IN segment.legs END
  )
  FOR segment IN segments END
)
WHERE (`type` = 'test')

CREATE INDEX `leg-arrival-station-idx` ON `index-test`(
  ALL ARRAY
  (
    ALL ARRAY leg.arrivalStation FOR leg IN segment.legs END
  )
  FOR segment IN segments END
)
WHERE (`type` = 'test')

Performance is acceptable but not great, since it does IntersectScan but doesn’t seem to be covered (there is a Fetch) . Would it be possible to create a query that covers the multiple indexes?

Cheers.

Array predicates are complex and it will not possible to cover.
Array Index can only index single element from array. If you have dynamic nature you need to create separate index for each one. If need to cover that you must include outside array as scalar index key as trailing index key.

Also checkout Covering index for nested array