Often you want to limit the number of group-by values in a table, and bucket everything else into an 'other' category.
In the top_states_by_eleveation
query below, we have a query with two stages. The first stage calculates the top states and nests the data to be aggregated. The second pipeline stage produces the actual aggregation.
source: airports is duckdb.table('../data/airports.parquet') extend { measure: airport_count is count() avg_elevation is elevation.avg() view: top_states_by_elevation is { group_by: state aggregate: avg_elevation calculate: row_num is row_number() nest: data is { group_by: code, elevation } } -> { group_by: state is pick state when row_num < 5 else 'OTHER' aggregate: avg_elevation is data.elevation.avg() airport_count is data.count() } }
Basic Query
run: airports -> top_states_by_elevation
state | avg_elevation | airport_count |
---|---|---|
CO | 6,255.864 | 425 |
WY | 5,619.365 | 115 |
NM | 5,419.635 | 181 |
UT | 5,066 | 140 |
OTHER | 931.438 | 18,932 |
[ { "state": "CO", "avg_elevation": 6255.863529411765, "airport_count": 425 }, { "state": "WY", "avg_elevation": 5619.365217391304, "airport_count": 115 }, { "state": "NM", "avg_elevation": 5419.635359116022, "airport_count": 181 }, { "state": "UT", "avg_elevation": 5066, "airport_count": 140 }, { "state": "OTHER", "avg_elevation": 931.4379357701247, "airport_count": 18932 } ]
WITH __stage0 AS ( SELECT group_set, airports."state" as "state__0", CASE WHEN group_set=0 THEN AVG(airports."elevation") END as "avg_elevation__0", ROW_NUMBER() OVER(PARTITION BY group_set ORDER BY CASE WHEN group_set=0 THEN AVG(airports."elevation") END desc NULLS LAST ) as "row_num__0", CASE WHEN group_set=1 THEN airports."code" END as "code__1", CASE WHEN group_set=1 THEN airports."elevation" END as "elevation__1" FROM '../data/airports.parquet' as airports CROSS JOIN (SELECT UNNEST(GENERATE_SERIES(0,1,1)) as group_set ) as group_set GROUP BY 1,2,5,6 ) , __stage1 AS ( SELECT "state__0" as "state", MAX(CASE WHEN group_set=0 THEN "avg_elevation__0" END) as "avg_elevation", MAX(CASE WHEN group_set=0 THEN "row_num__0" END) as "row_num", COALESCE(LIST({ "code": "code__1", "elevation": "elevation__1"} ORDER BY "code__1" asc NULLS LAST) FILTER (WHERE group_set=1),[]) as "data" FROM __stage0 GROUP BY 1 ORDER BY 2 desc NULLS LAST ) SELECT CASE WHEN base."row_num"<5 THEN base."state" ELSE 'OTHER' END as "state", AVG(base.data[data_0.__row_id]."elevation") as "avg_elevation", COUNT( 1) as "airport_count" FROM __stage1 as base LEFT JOIN (select UNNEST(generate_series(1, array_length(base."data"), 1)) as __row_id) as data_0 ON data_0.__row_id <= array_length(base."data") GROUP BY 1 ORDER BY 2 desc NULLS LAST
Nested Query
run: airports -> { group_by: `Facility Type` is fac_type aggregate: airport_count avg_elevation nest: top_states_by_elevation }
Facility Type | airport_count | avg_elevation | top_states_by_elevation | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AIRPORT | 13,925 | 1,237.044 |
| ||||||||||||||||||
HELIPORT | 5,135 | 950.513 |
| ||||||||||||||||||
SEAPLANE BASE | 473 | 488.822 |
| ||||||||||||||||||
ULTRALIGHT | 125 | 806.144 |
| ||||||||||||||||||
STOLPORT | 86 | 1,375.047 |
| ||||||||||||||||||
GLIDERPORT | 37 | 1,611.405 |
| ||||||||||||||||||
BALLOONPORT | 12 | 1,047.25 |
|
[ { "Facility Type": "AIRPORT", "airport_count": 13925, "avg_elevation": 1237.0441651705567, "top_states_by_elevation": [ { "state": "CO", "avg_elevation": 5873.0642570281125, "airport_count": 249 }, { "state": "WY", "avg_elevation": 5570.0329670329675, "airport_count": 91 }, { "state": "NM", "avg_elevation": 5467.63870967742, "airport_count": 155 }, { "state": "UT", "avg_elevation": 5257.18556701031, "airport_count": 97 }, { "state": "OTHER", "avg_elevation": 1042.4617865446637, "airport_count": 13333 } ] }, { "Facility Type": "HELIPORT", "airport_count": 5135, "avg_elevation": 950.5125608568646, "top_states_by_elevation": [ { "state": "NM", "avg_elevation": 5170.76, "airport_count": 25 }, { "state": "AZ", "avg_elevation": 2034.198113207547, "airport_count": 106 }, { "state": "OTHER", "avg_elevation": 947.9042931937173, "airport_count": 4775 }, { "state": "LA", "avg_elevation": 42.55458515283843, "airport_count": 229 } ] }, { "Facility Type": "SEAPLANE BASE", "airport_count": 473, "avg_elevation": 488.82241014799155, "top_states_by_elevation": [ { "state": "NM", "avg_elevation": 4201, "airport_count": 1 }, { "state": "MT", "avg_elevation": 3194, "airport_count": 2 }, { "state": "NE", "avg_elevation": 1946, "airport_count": 1 }, { "state": "ID", "avg_elevation": 1884.6, "airport_count": 5 }, { "state": "OTHER", "avg_elevation": 450.9806034482759, "airport_count": 464 } ] }, { "Facility Type": "ULTRALIGHT", "airport_count": 125, "avg_elevation": 806.144, "top_states_by_elevation": [ { "state": "CO", "avg_elevation": 8110, "airport_count": 1 }, { "state": "MT", "avg_elevation": 3540, "airport_count": 1 }, { "state": "AZ", "avg_elevation": 2025.7142857142858, "airport_count": 7 }, { "state": "NY", "avg_elevation": 1530, "airport_count": 2 }, { "state": "OTHER", "avg_elevation": 630.5087719298245, "airport_count": 114 } ] }, { "Facility Type": "STOLPORT", "airport_count": 86, "avg_elevation": 1375.046511627907, "top_states_by_elevation": [ { "state": "CO", "avg_elevation": 6211.666666666667, "airport_count": 6 }, { "state": "NV", "avg_elevation": 4900, "airport_count": 1 }, { "state": "AZ", "avg_elevation": 4772, "airport_count": 1 }, { "state": "CA", "avg_elevation": 4040.5, "airport_count": 2 }, { "state": "OTHER", "avg_elevation": 831.9868421052631, "airport_count": 76 } ] }, { "Facility Type": "GLIDERPORT", "airport_count": 37, "avg_elevation": 1611.4054054054054, "top_states_by_elevation": [ { "state": "CO", "avg_elevation": 7061.666666666667, "airport_count": 3 }, { "state": "NV", "avg_elevation": 4300, "airport_count": 1 }, { "state": "AZ", "avg_elevation": 3539, "airport_count": 2 }, { "state": "KS", "avg_elevation": 2088.5, "airport_count": 2 }, { "state": "OTHER", "avg_elevation": 789.0344827586207, "airport_count": 29 } ] }, { "Facility Type": "BALLOONPORT", "airport_count": 12, "avg_elevation": 1047.25, "top_states_by_elevation": [ { "state": "CO", "avg_elevation": 5050, "airport_count": 1 }, { "state": "KS", "avg_elevation": 1250, "airport_count": 1 }, { "state": "OH", "avg_elevation": 1164, "airport_count": 1 }, { "state": "MI", "avg_elevation": 980, "airport_count": 1 }, { "state": "OTHER", "avg_elevation": 515.375, "airport_count": 8 } ] } ]
WITH __stage0 AS ( SELECT group_set, airports."fac_type" as "Facility Type__0", CASE WHEN group_set=0 THEN COUNT( 1) END as "airport_count__0", CASE WHEN group_set=0 THEN AVG(airports."elevation") END as "avg_elevation__0", CASE WHEN group_set IN (1,2) THEN airports."state" END as "state__1", CASE WHEN group_set=1 THEN AVG(airports."elevation") END as "avg_elevation__1", ROW_NUMBER() OVER(PARTITION BY group_set, airports."fac_type" ORDER BY CASE WHEN group_set=1 THEN AVG(airports."elevation") END desc NULLS LAST ) as "row_num__1", CASE WHEN group_set=2 THEN airports."code" END as "code__2", CASE WHEN group_set=2 THEN airports."elevation" END as "elevation__2" FROM '../data/airports.parquet' as airports CROSS JOIN (SELECT UNNEST(GENERATE_SERIES(0,2,1)) as group_set ) as group_set GROUP BY 1,2,5,8,9 ) , __stage1 AS ( SELECT CASE WHEN group_set=2 THEN 1 ELSE group_set END as group_set, "Facility Type__0" as "Facility Type__0", FIRST("airport_count__0") FILTER (WHERE "airport_count__0" IS NOT NULL) as "airport_count__0", FIRST("avg_elevation__0") FILTER (WHERE "avg_elevation__0" IS NOT NULL) as "avg_elevation__0", CASE WHEN group_set IN (1,2) THEN "state__1" END as "state__1", FIRST("avg_elevation__1") FILTER (WHERE "avg_elevation__1" IS NOT NULL) as "avg_elevation__1", FIRST("row_num__1") FILTER (WHERE "row_num__1" IS NOT NULL) as "row_num__1", COALESCE(LIST({ "code": "code__2", "elevation": "elevation__2"} ORDER BY "code__2" asc NULLS LAST) FILTER (WHERE group_set=2),[]) as "data__1" FROM __stage0 GROUP BY 1,2,5 ) SELECT "Facility Type__0" as "Facility Type", MAX(CASE WHEN group_set=0 THEN "airport_count__0" END) as "airport_count", MAX(CASE WHEN group_set=0 THEN "avg_elevation__0" END) as "avg_elevation", (WITH __stage0 AS ( SELECT CASE WHEN base."row_num"<5 THEN base."state" ELSE 'OTHER' END as "state", AVG(base.data[data_0.__row_id]."elevation") as "avg_elevation", COUNT( 1) as "airport_count" FROM (SELECT UNNEST(COALESCE(LIST({ "state": "state__1", "avg_elevation": "avg_elevation__1", "row_num": "row_num__1", "data": "data__1"} ORDER BY "avg_elevation__1" desc NULLS LAST) FILTER (WHERE group_set=1),[])) as base) as base LEFT JOIN (select UNNEST(generate_series(1, 100000, 1)) as __row_id) as data_0 ON data_0.__row_id <= array_length(base."data") GROUP BY 1 ORDER BY 2 desc NULLS LAST ) SELECT LIST(ROW("state","avg_elevation","airport_count")) FROM __stage0 ) as "top_states_by_elevation" FROM __stage1 GROUP BY 1 ORDER BY 2 desc NULLS LAST