chDB#
JupySQL integrates with chDB so you can run SQL queries in a Jupyter notebook. Jump into any section to learn more!
Pre-requisites for .parquet
file#
%pip install jupysql chdb pyarrow --quiet
Note: you may need to restart the kernel to use updated packages.
from chdb import dbapi
conn = dbapi.connect()
%load_ext sql
%sql conn --alias chdb
Get a sample .parquet
file:#
from urllib.request import urlretrieve
_ = urlretrieve(
"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet",
"yellow_tripdata_2021-01.parquet",
)
Query on S3/HTTP/File#
Query a local file
%%sql
SELECT
passenger_count, AVG(trip_distance) AS avg_trip_distance
FROM file("yellow_tripdata_2021-01.parquet")
GROUP BY passenger_count
Running query in 'chdb'
passenger_count | avg_trip_distance |
---|---|
None | 29.665125772734516 |
0 | 2.5424466811344746 |
7 | 11.134 |
4 | 2.8681984015618376 |
3 | 2.7576410606578126 |
5 | 2.694099520730797 |
1 | 2.6805563237138768 |
2 | 2.794832592116103 |
8 | 1.05 |
6 | 2.5745177825092656 |
Run a file over HTTP
%%sql
SELECT
RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID)
FROM url('https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_0.parquet')
-- query on s3 --
-- FROM s3('xxxx')
GROUP BY
RegionID
ORDER BY c
DESC LIMIT 10
Running query in 'chdb'
RegionID | sum(AdvEngineID) | c | avg(ResolutionWidth) | uniqExact(UserID) |
---|---|---|---|---|
229 | 38044 | 426435 | 1612.7871867928288 | 27961 |
2 | 12801 | 148193 | 1593.8708913376474 | 10413 |
208 | 2673 | 30614 | 1490.6151107336514 | 3073 |
1 | 1802 | 28577 | 1623.8516989187108 | 1720 |
34 | 508 | 14329 | 1592.897201479517 | 1428 |
47 | 1041 | 13661 | 1637.8519142083303 | 943 |
158 | 78 | 13294 | 1576.340604784113 | 1110 |
7 | 1166 | 11679 | 1627.319034163884 | 647 |
42 | 642 | 11547 | 1625.601021910453 | 956 |
184 | 30 | 10157 | 1614.6938072265432 | 987 |