FTS Query size problem

Hello,
I’m new to FTS and I’m trying to check if this solution is doable. I created an Index with all the fields that I need for a type of document. I also used the type identifier to limit the index to only those type of documents. I put the checks on index, store and docvalues and I used the keyword analyzer.

Now I’m running some queries and I’m getting some problems. I’m trying to use both SEARCH with the jsonobject and USE INDEX (USING FTS) with standard queries. I put here the query and the plan for both cases. The problems is that if I try to use the skip of 0, 10, 100, 1000 I get the results very fast. If I try to use the skip with 10000 or more the result is a lot slower and I also get more documents than the limit that I put (50).

Query with SEARCH and Object

SELECT *
FROM collection AS d USE INDEX (USING FTS)
WHERE ObjectType = 'Item' AND SEARCH(d, {
"query": {
        "must": {
            "conjuncts":[
                {
                    "field": "IsDeleted",
                    "bool": false
                },{
                    "field": "Project",
                    "match": "value"
                },{
                    "field": "IsTechnical",
                    "bool": false
                },{
                    "field": "Execution",
                    "match": "Y"
                }
            ]
        }
    },
    "sort": [
            {
                "by": "field",
                "field": "ModifiedOn",
                "desc": true,
                "mode": "default",
                "missing": "last",
                "type": "string"
            }
    ],
    "size":50,
    "from":1000})

This query runs in 180ms with no problems. This is the planner

{
  "#operator": "Authorize",
  "#stats": {
    "#phaseSwitches": 4,
    "execTime": "931ns",
    "servTime": "382.072µs"
  },
  "privileges": {
    "List": [
      {
        "Target": "collection",
        "Priv": 7,
        "Props": 0
      }
    ]
  },
  "~child": {
    "#operator": "Sequence",
    "#stats": {
      "#phaseSwitches": 2,
      "execTime": "1.613µs"
    },
    "~children": [
      {
        "#operator": "IndexFtsSearch",
        "#stats": {
          "#heartbeatYields": 1,
          "#itemsOut": 50,
          "#phaseSwitches": 203,
          "execTime": "53.269µs",
          "kernTime": "103.643µs",
          "servTime": "134.745816ms"
        },
        "as": "d",
        "bucket": "bucketname",
        "index": "Item_General",
        "index_id": "39a880613af2ed8e",
        "keyspace": "_default",
        "namespace": "default",
        "scope": "_default",
        "search_info": {
          "field": "\"\"",
          "outname": "out",
          "query": "{\"from\": 1000, \"query\": {\"must\": {\"conjuncts\": [{\"bool\": false, \"field\": \"IsDeleted\"}, {\"field\": \"Project\", \"match\": \"project\"}, {\"bool\": false, \"field\": \"IsTechnical\"}, {\"field\": \"Execution\", \"match\": \"Y\"}]}}, \"size\": 50, \"sort\": [{\"by\": \"field\", \"desc\": true, \"field\": \"ModifiedOn\", \"missing\": \"last\", \"mode\": \"default\", \"type\": \"string\"}]}"
        },
        "using": "fts",
        "#time_normal": "00:00.134",
        "#time_absolute": 0.13479908499999999
      },
      {
        "#operator": "Fetch",
        "#stats": {
          "#heartbeatYields": 1,
          "#itemsIn": 50,
          "#itemsOut": 50,
          "#phaseSwitches": 210,
          "execTime": "156.421µs",
          "kernTime": "134.872425ms",
          "servTime": "10.028027ms"
        },
        "as": "d",
        "bucket": "bucketname",
        "keyspace": "_default",
        "namespace": "default",
        "scope": "_default",
        "#time_normal": "00:00.010",
        "#time_absolute": 0.010184448
      },
      {
        "#operator": "Sequence",
        "#stats": {
          "#phaseSwitches": 2,
          "execTime": "155.711µs",
          "kernTime": "70ns",
          "state": "running"
        },
        "~children": [
          {
            "#operator": "Filter",
            "#stats": {
              "#itemsIn": 50,
              "#itemsOut": 50,
              "#phaseSwitches": 204,
              "execTime": "7.798589ms",
              "kernTime": "139.749025ms"
            },
            "condition": "(((`d`.`ObjectType`) = \"Item\") and search(`d`, {\"from\": 1000, \"query\": {\"must\": {\"conjuncts\": [{\"bool\": false, \"field\": \"IsDeleted\"}, {\"field\": \"Project\", \"match\": \"project\"}, {\"bool\": false, \"field\": \"IsTechnical\"}, {\"field\": \"Execution\", \"match\": \"Y\"}]}}, \"size\": 50, \"sort\": [{\"by\": \"field\", \"desc\": true, \"field\": \"ModifiedOn\", \"missing\": \"last\", \"mode\": \"default\", \"type\": \"string\"}]}))",
            "#time_normal": "00:00.007",
            "#time_absolute": 0.007798589
          },
          {
            "#operator": "InitialProject",
            "#stats": {
              "#itemsIn": 50,
              "#itemsOut": 50,
              "#phaseSwitches": 156,
              "execTime": "47.378µs",
              "kernTime": "142.827314ms"
            },
            "discard_original": true,
            "result_terms": [
              {
                "expr": "self",
                "star": true
              }
            ],
            "#time_normal": "00:00.000",
            "#time_absolute": 0.000047378
          }
        ],
        "#time_normal": "00:00.000",
        "#time_absolute": 0.000155711
      },
      {
        "#operator": "Stream",
        "#stats": {
          "#itemsIn": 50,
          "#itemsOut": 50,
          "#phaseSwitches": 52,
          "execTime": "4.786682ms"
        },
        "#time_normal": "00:00.004",
        "#time_absolute": 0.004786682
      }
    ],
    "#time_normal": "00:00.000",
    "#time_absolute": 0.000001613
  },
  "~versions": [
    "7.2.4-N1QL",
    "7.2.4-7070-enterprise"
  ],
  "#time_normal": "00:00.000",
  "#time_absolute": 0.000382072
}

If I put the from to 10000 or more I get 25625+ results instead of the normal size of 50.

I saw now that there is a default limit of 10k elements of (size + from).

When I create the same query with standard syntax, but using the FlexIndex, I get the results without this limitation, but I need to wait a lot of time.

The solutions that I saw is to use the search_after, but in our application we have a ‘go to page’ feature that isn’t viable with this type of logic (we can still use it, but if I am at page 0 and I want to go to page 10000 it isn’t very useful). Are there any solutions? Increasing the default value to a higher number can cause problems and/or improve the results response time? Is it better to use the search with object query or the standard query with use index (using FTS) or is it the same?

If you look the FTS index scan took about 134ms
and the OS(kernTime) 0.103ms.

         "#operator": "IndexFtsSearch",
        "#stats": {
          "#heartbeatYields": 1,
          "#itemsOut": 50,
          "#phaseSwitches": 203,
          "execTime": "53.269µs",
          "kernTime": "103.643µs",
          "servTime": "134.745816ms"
        },

the fetching of 50 documents b/c of SELECT *
took 10ms

        "#operator": "Fetch",
        "#stats": {
          "#heartbeatYields": 1,
          "#itemsIn": 50,
          "#itemsOut": 50,
          "#phaseSwitches": 210,
          "execTime": "156.421µs",
          "kernTime": "134.872425ms",
          "servTime": "10.028027ms"
        },

Do you need to do a FTS to answer your question?
Based on the below you have exact or = matches

              {
                  "field": "IsDeleted",
                  "bool": false
              },{
                  "field": "Project",
                  "match": "value"
              },{
                  "field": "IsTechnical",
                  "bool": false
              },{
                  "field": "Execution",
                  "match": "Y"
              }

You could do the above with a GSI normal query and leverage index pagination push down to get a fast results when you want to do LIMIT and OFFSET too.

PRO TIP bool in FTS can be painful b/c if you have 100M docs 50/50 true/false the index scan can return 50M results to process.

1 Like

Thank you for the fast reply. This is just a simple test that I was doing with limited filters. Right now we are using GSI, but we are developing a new feature with the pagination and the problem is that my documents are very large (+40 fields) and the user should be able to filter/order(asc or desc) for at least half of them so they advised us to use the flex indexes with the FTS.

Obviously this is the simplest case where the user doesn’t use any filter at all.

There are a few blog about FTS Indexes and optimization.
https://www.couchbase.com/blog?s=full%20text%20search

For Pagination you’ll want to limit the FTS indexes size via limiting the tokenization length … etc.

1 Like