🦆 DuckDB Cheatsheet
One-page quick reference — Installation, data import/export, SQL queries, JSON, performance, Python integration, and common patterns.
1️⃣ Installation
| Method | Command |
|---|---|
| Homebrew (macOS) | brew install duckdb |
| pip (Python) | pip install duckdb |
| Docker | docker run --rm -it duckdb/duckdb |
| CLI (Linux/macOS binary) | curl -fsSL https://install.duckdb.org \| sh |
-- Verify installation
SELECT version();
-- ┌──────────────┐
-- │ version() │
-- ├──────────────┤
-- │ v1.3.0 │
-- └──────────────┘
2️⃣ CSV & Parquet Import/Export
-- Read CSV (auto-detect schema)
SELECT * FROM read_csv_auto('data.csv');
-- Read multiple CSV files at once
SELECT * FROM read_csv_auto('data/*.csv', header=true, delim=',');
-- Export to CSV
COPY (SELECT * FROM users WHERE active = true) TO 'active_users.csv'
WITH (HEADER, DELIMITER ',');
-- Read Parquet
SELECT * FROM read_parquet('sales.parquet');
-- Export to Parquet (compressed, columnar)
COPY orders TO 'orders.parquet' (FORMAT PARQUET);
-- Combine CSV + Parquet in a single query
SELECT * FROM read_csv_auto('recent.csv')
UNION ALL
SELECT * FROM read_parquet('archive.parquet');
3️⃣ SQL Queries
SELECT & WHERE
SELECT name, salary, department
FROM employees
WHERE salary > 80000
AND department IN ('Engineering', 'Product')
ORDER BY salary DESC
LIMIT 10;
GROUP BY with HAVING
SELECT department, COUNT(*) AS headcount, ROUND(AVG(salary), 0) AS avg_sal
FROM employees
GROUP BY department
HAVING COUNT(*) > 5
ORDER BY avg_sal DESC;
Window Functions
SELECT name, department, salary,
RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS rank,
DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dense_rank,
LAG(salary) OVER (PARTITION BY department ORDER BY salary) AS prev_salary,
AVG(salary) OVER (PARTITION BY department) AS dept_avg
FROM employees;
JOIN Types
-- INNER / LEFT / RIGHT / FULL OUTER / CROSS / ANTI / SEMI
SELECT e.name, d.name AS dept
FROM employees e
LEFT JOIN departments d ON e.dept_id = d.id;
-- ANTI JOIN: employees without a department
SELECT * FROM employees e
ANTI JOIN departments d ON e.dept_id = d.id;
-- SEMI JOIN: employees whose dept is in list
SELECT * FROM employees e
SEMI JOIN departments d ON e.dept_id = d.id;
4️⃣ Performance Tips
-- 🔍 Explain + Analyze to see query plan & timing
EXPLAIN ANALYZE
SELECT department, SUM(salary) FROM employees GROUP BY department;
-- 🧊 Materialized CTEs (avoid recomputation)
WITH high_earners AS MATERIALIZED (
SELECT * FROM employees WHERE salary > 100000
)
SELECT department, COUNT(*) FROM high_earners GROUP BY department;
-- 📊 Create indexes on filtered/joined columns
CREATE INDEX idx_employees_dept ON employees(dept_id);
CREATE INDEX idx_employees_salary ON employees(salary);
-- 🚀 Point lookups use the ART index created by CREATE INDEX above
SET enable_progress_bar = false;  -- (unrelated: silences the CLI progress bar in scripts)
SELECT * FROM employees WHERE id = 42; -- uses ART index
-- 🧠 Sample for quick exploration on large datasets
SELECT * FROM huge_table USING SAMPLE 1%;
5️⃣ JSON Processing
-- Read JSON files directly
SELECT * FROM read_json_auto('logs.json');
-- Parse inline JSON
SELECT json_extract('{"name": "Alice", "age": 30}', '$.name');
-- "Alice"
-- Extract multiple fields with json_transform (structure mirrors the JSON, values are types)
SELECT json_transform(
'{"name":"Alice","address":{"city":"NYC"}}',
'{"name":"VARCHAR","address":{"city":"VARCHAR"}}'
);
-- VARIANT type (requires DuckDB ≥ v1.4): store mixed-type JSON
CREATE TABLE logs AS
SELECT '{"event":"click","userId":42}'::VARIANT AS data
UNION ALL
SELECT '{"event":"pageview","duration":2.5}'::VARIANT;
-- Query VARIANT column with nested access
SELECT data['event']::VARCHAR AS event_type FROM logs;
-- JSON array → rows with UNNEST (structure describes one element of the array)
SELECT unnest(json_transform(
'[{"x":1},{"x":2}]',
'[{"x":"INTEGER"}]'
)) AS items;
6️⃣ Python Integration
import duckdb
import pandas as pd
# In-memory database
con = duckdb.connect()
# Query pandas DataFrames directly
df = pd.DataFrame({"id": [1, 2, 3], "value": [10, 20, 30]})
result = con.execute("SELECT * FROM df WHERE value > 15").fetchdf()
print(result)
# id value
# 0 2 20
# 1 3 30
# Register a DataFrame as a named view
con.register("my_view", df)
con.execute("SELECT AVG(value) FROM my_view").fetchone()
# (20.0,)
# Create table from DataFrame
con.execute("CREATE TABLE my_table AS SELECT * FROM df")
# Efficient bulk insert from Python objects
con.executemany(
"INSERT INTO my_table VALUES (?, ?)",
[(4, 40), (5, 50), (6, 60)]
)
# Write query results to CSV/Parquet from Python
con.execute("COPY (SELECT * FROM df) TO 'output.parquet' (FORMAT PARQUET)")
7️⃣ Common Patterns
PIVOT / UNPIVOT
-- PIVOT: rows → columns
PIVOT sales ON quarter USING SUM(amount)
GROUP BY product;
-- UNPIVOT: columns → rows
UNPIVOT monthly_sales
ON jan, feb, mar
INTO NAME month VALUE amount;
LIST & STRUCT
-- Aggregate into a list
SELECT department, LIST(name ORDER BY name) AS employees
FROM employees GROUP BY department;
-- Build a struct
SELECT {'name': name, 'salary': salary, 'level': 'senior'} AS profile
FROM employees WHERE salary > 120000;
UNNEST (array → rows)
-- Flatten a list column
SELECT department, unnest(projects) AS project
FROM employee_projects;
-- Generate a date series
SELECT unnest(generate_series(DATE '2024-01-01', DATE '2024-01-10', INTERVAL 1 DAY)) AS dt;
Friendly Quantities
SELECT
total_sales,
total_sales // 1e6 AS sales_millions,
ROUND(total_sales / 1e6, 2) AS sales_millions_rounded
FROM quarterly_report;
8️⃣ 💰 Monetization Ideas
Turn DuckDB skills into income — check out our detailed guides:
| Post | Topic |
|---|---|
| DuckDB as new jq | Replace jq for JSON processing in data pipelines |
| DuckDB Online Shell | Build a web-based SQL service |
| DuckDB x Delta Lake | Power analytics with lakehouse architecture |
| DuckDB + DeepSeek / SmallPond | Distributed data processing |
| DuckDB Cross-Database Joins | Multi-source data federation |
| DuckDB Spatial / Geometry | Location analytics with built-in spatial |
| DuckLake v1 Intro | Data lakehouse on DuckDB |
📬 Subscribe to DuckDB Lab
Get the latest DuckDB tips, benchmarks, and monetization strategies delivered to your inbox.
👉 Subscribe via Email — or visit our Contact page.
Last updated: 2026-05-11 · DuckDB v1.3