Often you want to limit the number of group-by values in a table, and bucket everything else into an 'other' category.
In the top_states_by_eleveation
query below, we have a query with two stages. The first stage calculates the top states and nests the data to be aggregated. The second pipeline stage produces the actual aggregation.
source: airports is duckdb.table('../data/airports.parquet') extend { measure: airport_count is count() avg_elevation is elevation.avg() view: top_states_by_elevation is { group_by: state aggregate: avg_elevation calculate: is_other is row_number() > 5 nest: data is { group_by: code, elevation } } -> { group_by: state is pick state when not is_other else 'OTHER' aggregate: avg_elevation is data.elevation.avg() airport_count is data.count() # hidden sort is data.count() {where: not is_other } order_by: sort desc } }
Basic Query
run: airports -> top_states_by_elevation
state | avg_elevation | airport_count |
---|---|---|
CO | 6,255.864 | 425 |
NM | 5,419.635 | 181 |
UT | 5,066 | 140 |
NV | 4,029.977 | 128 |
WY | 5,619.365 | 115 |
OTHER | 910.346 | 18,781 |
[ { "state": "CO", "avg_elevation": 6255.863529411765, "airport_count": 425, "sort": 425 }, { "state": "NM", "avg_elevation": 5419.635359116022, "airport_count": 181, "sort": 181 }, { "state": "UT", "avg_elevation": 5066, "airport_count": 140, "sort": 140 }, { "state": "NV", "avg_elevation": 4029.9765625, "airport_count": 128, "sort": 128 }, { "state": "WY", "avg_elevation": 5619.365217391304, "airport_count": 115, "sort": 115 }, { "state": "OTHER", "avg_elevation": 910.3459902148479, "airport_count": 18781, "sort": 0 } ]
WITH __stage0 AS ( SELECT group_set, airports."state" as "state__0", CASE WHEN group_set=0 THEN AVG(airports."elevation") END as "avg_elevation__0", ROW_NUMBER() OVER(PARTITION BY group_set ORDER BY CASE WHEN group_set=0 THEN AVG(airports."elevation") END desc NULLS LAST )>5 as "is_other__0", CASE WHEN group_set=1 THEN airports."code" END as "code__1", CASE WHEN group_set=1 THEN airports."elevation" END as "elevation__1" FROM '../data/airports.parquet' as airports CROSS JOIN (SELECT UNNEST(GENERATE_SERIES(0,1,1)) as group_set ) as group_set GROUP BY 1,2,5,6 ) , __stage1 AS ( SELECT "state__0" as "state", MAX(CASE WHEN group_set=0 THEN "avg_elevation__0" END) as "avg_elevation", MAX(CASE WHEN group_set=0 THEN "is_other__0" END) as "is_other", COALESCE(LIST({ "code": "code__1", "elevation": "elevation__1"} ORDER BY "code__1" asc NULLS LAST) FILTER (WHERE group_set=1),[]) as "data" FROM __stage0 GROUP BY 1 ) SELECT CASE WHEN COALESCE(NOT(base."is_other"),FALSE) THEN base."state" ELSE 'OTHER' END as "state", AVG(data_0."elevation") as "avg_elevation", COUNT(DISTINCT base."state" || 'x' || data_0_outer.__row_id) as "airport_count", COUNT(DISTINCT CASE WHEN COALESCE(NOT(base."is_other"),FALSE) THEN base."state" || 'x' || data_0_outer.__row_id END) as "sort" FROM __stage1 as base LEFT JOIN LATERAL (SELECT UNNEST(GENERATE_SERIES(1, length(base."data"),1)) as __row_id, UNNEST(base."data"), 1 as ignoreme) as data_0_outer(__row_id, data_0,ignoreme) ON data_0_outer.ignoreme=1 GROUP BY 1 ORDER BY 4 desc NULLS LAST
Nested Query
run: airports -> { group_by: `Facility Type` is fac_type aggregate: airport_count avg_elevation nest: top_states_by_elevation }
Facility Type | airport_count | avg_elevation | top_states_by_elevation | |||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AIRPORT | 13,925 | 1,237.044 |
| |||||||||||||||||||||
HELIPORT | 5,135 | 950.513 |
| |||||||||||||||||||||
SEAPLANE BASE | 473 | 488.822 |
| |||||||||||||||||||||
ULTRALIGHT | 125 | 806.144 |
| |||||||||||||||||||||
STOLPORT | 86 | 1,375.047 |
| |||||||||||||||||||||
GLIDERPORT | 37 | 1,611.405 |
| |||||||||||||||||||||
BALLOONPORT | 12 | 1,047.25 |
|
[ { "Facility Type": "AIRPORT", "airport_count": 13925, "avg_elevation": 1237.0441651705567, "top_states_by_elevation": [ { "state": "OTHER", "avg_elevation": 1237.0441651705567, "airport_count": 13925, "sort": 0 } ] }, { "Facility Type": "HELIPORT", "airport_count": 5135, "avg_elevation": 950.5125608568646, "top_states_by_elevation": [ { "state": "OTHER", "avg_elevation": 950.5125608568646, "airport_count": 5135, "sort": 0 } ] }, { "Facility Type": "SEAPLANE BASE", "airport_count": 473, "avg_elevation": 488.82241014799155, "top_states_by_elevation": [ { "state": "FL", "avg_elevation": 51.93023255813954, "airport_count": 43, "sort": 43 }, { "state": "IN", "avg_elevation": 797.0833333333334, "airport_count": 12, "sort": 12 }, { "state": "OTHER", "avg_elevation": 524.9162679425838, "airport_count": 418, "sort": 0 } ] }, { "Facility Type": "ULTRALIGHT", "airport_count": 125, "avg_elevation": 806.144, "top_states_by_elevation": [ { "state": "AZ", "avg_elevation": 2025.7142857142858, "airport_count": 7, "sort": 7 }, { "state": "FL", "avg_elevation": 67.2, "airport_count": 5, "sort": 5 }, { "state": "MO", "avg_elevation": 829, "airport_count": 4, "sort": 4 }, { "state": "WI", "avg_elevation": 831.6666666666666, "airport_count": 3, "sort": 3 }, { "state": "OTHER", "avg_elevation": 758.877358490566, "airport_count": 106, "sort": 0 } ] }, { "Facility Type": "STOLPORT", "airport_count": 86, "avg_elevation": 1375.046511627907, "top_states_by_elevation": [ { "state": "FL", "avg_elevation": 87, "airport_count": 13, "sort": 13 }, { "state": "CO", "avg_elevation": 6211.666666666667, "airport_count": 6, "sort": 6 }, { "state": "OTHER", "avg_elevation": 1191.8358208955224, "airport_count": 67, "sort": 0 } ] }, { "Facility Type": "GLIDERPORT", "airport_count": 37, "avg_elevation": 1611.4054054054054, "top_states_by_elevation": [ { "state": "FL", "avg_elevation": 38.75, "airport_count": 4, "sort": 4 }, { "state": "PA", "avg_elevation": 971.6666666666666, "airport_count": 3, "sort": 3 }, { "state": "AZ", "avg_elevation": 3539, "airport_count": 2, "sort": 2 }, { "state": "NV", "avg_elevation": 4300, "airport_count": 1, "sort": 1 }, { "state": "OTHER", "avg_elevation": 1673.111111111111, "airport_count": 27, "sort": 0 } ] }, { "Facility Type": "BALLOONPORT", "airport_count": 12, "avg_elevation": 1047.25, "top_states_by_elevation": [ { "state": "OTHER", "avg_elevation": 1510, "airport_count": 5, "sort": 0 }, { "state": "IL", "avg_elevation": 811.5, "airport_count": 2, "sort": 2 }, { "state": "NJ", "avg_elevation": 265, "airport_count": 2, "sort": 2 }, { "state": "KS", "avg_elevation": 1250, "airport_count": 1, "sort": 1 }, { "state": "OH", "avg_elevation": 1164, "airport_count": 1, "sort": 1 }, { "state": "VA", "avg_elevation": 450, "airport_count": 1, "sort": 1 } ] } ]
WITH __stage0 AS ( SELECT group_set, airports."fac_type" as "Facility Type__0", CASE WHEN group_set=0 THEN COUNT( 1) END as "airport_count__0", CASE WHEN group_set=0 THEN AVG(airports."elevation") END as "avg_elevation__0", CASE WHEN group_set IN (1,2) THEN airports."state" END as "state__1", CASE WHEN group_set=1 THEN AVG(airports."elevation") END as "avg_elevation__1", ROW_NUMBER() OVER(PARTITION BY group_set, airports."fac_type" ORDER BY CASE WHEN group_set=1 THEN AVG(airports."elevation") END desc NULLS LAST )>5 as "is_other__1", CASE WHEN group_set=2 THEN airports."code" END as "code__2", CASE WHEN group_set=2 THEN airports."elevation" END as "elevation__2" FROM '../data/airports.parquet' as airports CROSS JOIN (SELECT UNNEST(GENERATE_SERIES(0,2,1)) as group_set ) as group_set GROUP BY 1,2,5,8,9 ) , __stage1 AS ( SELECT CASE WHEN group_set=2 THEN 1 ELSE group_set END as group_set, "Facility Type__0" as "Facility Type__0", FIRST("airport_count__0") FILTER (WHERE "airport_count__0" IS NOT NULL) as "airport_count__0", FIRST("avg_elevation__0") FILTER (WHERE "avg_elevation__0" IS NOT NULL) as "avg_elevation__0", CASE WHEN group_set IN (1,2) THEN "state__1" END as "state__1", FIRST("avg_elevation__1") FILTER (WHERE "avg_elevation__1" IS NOT NULL) as "avg_elevation__1", FIRST("is_other__1") FILTER (WHERE "is_other__1" IS NOT NULL) as "is_other__1", COALESCE(LIST({ "code": "code__2", "elevation": "elevation__2"} ORDER BY "code__2" asc NULLS LAST) FILTER (WHERE group_set=2),[]) as "data__1" FROM __stage0 GROUP BY 1,2,5 ) SELECT "Facility Type__0" as "Facility Type", MAX(CASE WHEN group_set=0 THEN "airport_count__0" END) as "airport_count", MAX(CASE WHEN group_set=0 THEN "avg_elevation__0" END) as "avg_elevation", (WITH __stage0 AS ( SELECT CASE WHEN COALESCE(NOT(base."is_other"),FALSE) THEN base."state" ELSE 'OTHER' END as "state", AVG(data_0."elevation") as "avg_elevation", COUNT(DISTINCT base."__distinct_key" || 'x' || data_0_outer.__row_id) as "airport_count", COUNT(DISTINCT CASE WHEN COALESCE(NOT(base."is_other"),FALSE) THEN base."__distinct_key" || 'x' || data_0_outer.__row_id END) as "sort" FROM (SELECT GEN_RANDOM_UUID() as __distinct_key, x.* FROM (SELECT UNNEST(COALESCE(LIST({ "state": "state__1", "avg_elevation": "avg_elevation__1", "is_other": "is_other__1", "data": "data__1"} ORDER BY "avg_elevation__1" desc NULLS LAST) FILTER (WHERE group_set=1),[])) as base) as x) as base LEFT JOIN LATERAL (SELECT UNNEST(GENERATE_SERIES(1, length(base."data"),1)) as __row_id, UNNEST(base."data"), 1 as ignoreme) as data_0_outer(__row_id, data_0,ignoreme) ON data_0_outer.ignoreme=1 GROUP BY 1 ORDER BY 4 desc NULLS LAST ) SELECT LIST(STRUCT_PACK("state","avg_elevation","airport_count","sort")) FROM __stage0 ) as "top_states_by_elevation" FROM __stage1 GROUP BY 1 ORDER BY 2 desc NULLS LAST