Slow join performance

select UPPER(country) as c,AVG(a.score) as avg from scoretable a
UNNEST a.countries country
inner join scoretable b on keys lower(a.source)||'-'||lower(a.category)
where a.type = 'article' AND 
STR_TO_MILLIS(a.published) >= STR_TO_MILLIS('2017-01-15 00:00:00Z') AND 
STR_TO_MILLIS(a.published) < STR_TO_MILLIS('2017-01-26') AND
a.score IS NOT NULL AND
b.industry_group = 'capital goods'
group by UPPER(country)

Is there any tips to optimize this query? it is taking around 2 secs. I have creates indexes

create index published_country_idx on scoretable(type,STR_TO_MILLIS(published),score,countries,source,category) where type = 'article' and sentiment_score is not missing

and

create index src_cat_idx on rss(source,category) where type = 'article'

Here is my explain

[
{
“plan”: {
#operator”: “Sequence”,
“~children”: [
{
#operator”: “IndexScan”,
“covers”: [
“cover ((a.type))”,
“cover (str_to_millis((a.published)))”,
“cover ((a.score))”,
“cover ((a.countries))”,
“cover ((a.source))”,
“cover ((a.category))”,
“cover ((meta(a).id))”
],
“filter_covers”: {
“cover (((a.score) is not missing))”: true,
“cover ((a.type))”: “article”
},
“index”: “published_country_idx”,
“index_id”: “ebdf52951b05aa47”,
“keyspace”: “scoretable”,
“namespace”: “default”,
“spans”: [
{
“Range”: {
“High”: [
“"article"”,
“1485388800000”
],
“Inclusion”: 0,
“Low”: [
“"article"”,
“1484438400000”,
“null”
]
}
}
],
“using”: “gsi”
},
{
#operator”: “Parallel”,
“~child”: {
#operator”: “Sequence”,
“~children”: [
{
#operator”: “Unnest”,
“as”: “country”,
“expr”: “cover ((a.countries))”
},
{
#operator”: “Join”,
“as”: “b”,
“keyspace”: “scoretable”,
“namespace”: “default”,
“on_keys”: “((lower(cover ((a.source))) || "-") || lower(cover ((a.category))))”
},
{
#operator”: “Filter”,
“condition”: “(((((cover ((a.type)) = "article") and (str_to_millis("2017-01-15 00:00:00Z") <= cover (str_to_millis((a.published))))) and (cover (str_to_millis((a.published))) < str_to_millis("2017-01-26"))) and (cover ((a.score)) is not null)) and (((b.industry_group) = "capital goods") or ((b.industry_group) = "transportation")))”
},
{
#operator”: “InitialGroup”,
“aggregates”: [
“avg(cover ((a.score)))”
],
“group_keys”: [
“upper(country)”
]
}
]
}
},
{
#operator”: “IntermediateGroup”,
“aggregates”: [
“avg(cover ((a.score)))”
],
“group_keys”: [
“upper(country)”
]
},
{
#operator”: “FinalGroup”,
“aggregates”: [
“avg(cover ((a.score)))”
],
“group_keys”: [
“upper(country)”
]
},
{
#operator”: “Parallel”,
“~child”: {
#operator”: “Sequence”,
“~children”: [
{
#operator”: “InitialProject”,
“result_terms”: [
{
“as”: “c”,
“expr”: “upper(country)”
},
{
“as”: “avg”,
“expr”: “avg(cover ((a.score)))”
}
]
},
{
#operator”: “FinalProject”
}
]
}
}
]
},
“text”: “select UPPER(country) as c,AVG(a.score) as avg from scoretable a\r\nUNNEST a.countries country\r\njoin scoretable b on keys lower(a.source)||‘-’||lower(a.category)\r\nwhere a.type = ‘article’ AND \r\nSTR_TO_MILLIS(a.published) >= STR_TO_MILLIS(‘2017-01-15 00:00:00Z’) AND \r\nSTR_TO_MILLIS(a.published) < STR_TO_MILLIS(‘2017-01-26’) AND\r\na.score IS NOT NULL AND\r\n(b.industry_group = ‘capital goods’ or b.industry_group = ‘transportation’)\r\ngroup by UPPER(country)”
}
]

Remove all STR_TO_MILLIS() in your query. Date strings can be compared directly.

Try this index. Post the EXPLAIN here.

create index idx on scoretable( published, score, countries, source, category ) where type = 'article' and score is not null;

As JOIN doesn’t depend on UNNEST, move UNNEST after JOIN. If the predicate on b eliminating lot of joined items, make join as subquery and UNNEST,GROUP in parent query.

1 Like

Hi geraldss,

Removing STR_TO_MILLIS does not help at all, the query speed is around the same. For query like this, does adding more nodes help in the query performance? Is query performance linearly proportional to the total number of nodes?

The WHERE condition on b helps to remove a lot of items. I tried removing GROUP BY totally. However, the query without any aggregation is even slower.

Try this.

  SELECT UPPER(country) AS c,AVG(q.score) AS avg FROM
                 (SELECT a.score, a.countries FROM  scoretable a
                 INNER JOIN scoretable b ON KEYS LOWER(a.source)||'-'||LOWER(a.category)
                 WHERE a.type = 'article' AND a.published >= '2017-01-15 00:00:00Z' AND
                       a.published < '2017-01-26' AND a.score IS NOT NULL AND 
                       b.industry_group = 'capital goods') AS q
    UNNEST q.countries country
    GROUP BY UPPER(country);