🦆 DuckDB 速查表
一页快速参考 — 安装、数据导入/导出、SQL 查询、JSON 处理、性能优化、Python 集成与常用模式。
1️⃣ 安装
| 方式 | 命令 |
|---|---|
| Homebrew (macOS) | brew install duckdb |
| pip (Python) | pip install duckdb |
| Docker | docker run --rm -it duckdb/duckdb |
| CLI (Linux/macOS 二进制) | curl -fsSL https://install.duckdb.org | sh |
-- 验证安装
SELECT version();
-- ┌──────────────┐
-- │ version() │
-- ├──────────────┤
-- │ v1.3.0 │
-- └──────────────┘
2️⃣ CSV & Parquet 导入/导出
-- 读取 CSV(自动检测模式)
SELECT * FROM read_csv_auto('data.csv');
-- 批量读取多个 CSV 文件
SELECT * FROM read_csv_auto('data/*.csv', header=true, delim=',');
-- 导出为 CSV
COPY (SELECT * FROM users WHERE active = true) TO 'active_users.csv'
WITH (HEADER, DELIMITER ',');
-- 读取 Parquet
SELECT * FROM read_parquet('sales.parquet');
-- 导出为 Parquet(压缩列式存储)
COPY orders TO 'orders.parquet' (FORMAT PARQUET);
-- 同一查询合并 CSV 和 Parquet
SELECT * FROM read_csv_auto('recent.csv')
UNION ALL
SELECT * FROM read_parquet('archive.parquet');
3️⃣ SQL 查询
SELECT & WHERE
SELECT name, salary, department
FROM employees
WHERE salary > 80000
AND department IN ('Engineering', 'Product')
ORDER BY salary DESC
LIMIT 10;
GROUP BY + HAVING
SELECT department, COUNT(*) AS headcount, ROUND(AVG(salary), 0) AS avg_sal
FROM employees
GROUP BY department
HAVING COUNT(*) > 5
ORDER BY avg_sal DESC;
窗口函数
SELECT name, department, salary,
RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS rank,
DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dense_rank,
LAG(salary) OVER (PARTITION BY department ORDER BY salary) AS prev_salary,
AVG(salary) OVER (PARTITION BY department) AS dept_avg
FROM employees;
JOIN 类型
-- INNER / LEFT / RIGHT / FULL OUTER / CROSS / ANTI / SEMI
SELECT e.name, d.name AS dept
FROM employees e
LEFT JOIN departments d ON e.dept_id = d.id;
-- ANTI JOIN:无部门的员工
SELECT * FROM employees e
ANTI JOIN departments d ON e.dept_id = d.id;
-- SEMI JOIN:部门在列表中的员工
SELECT * FROM employees e
SEMI JOIN departments d ON e.dept_id = d.id;
4️⃣ 性能优化技巧
-- 🔍 EXPLAIN ANALYZE 查看执行计划与耗时
EXPLAIN ANALYZE
SELECT department, SUM(salary) FROM employees GROUP BY department;
-- 🧊 物化 CTE(避免重复计算)
WITH high_earners AS MATERIALIZED (
SELECT * FROM employees WHERE salary > 100000
)
SELECT department, COUNT(*) FROM high_earners GROUP BY department;
-- 📊 在过滤/连接列上创建索引
CREATE INDEX idx_employees_dept ON employees(dept_id);
CREATE INDEX idx_employees_salary ON employees(salary);
-- 🚀 使用 ART 索引加速点查询
SET enable_progress_bar = false;
SELECT * FROM employees WHERE id = 42; -- 使用 ART 索引
-- 🧠 大数据集快速探索采样
SELECT * FROM huge_table USING SAMPLE 1%;
5️⃣ JSON 处理
-- 直接读取 JSON 文件
SELECT * FROM read_json_auto('logs.json');
-- 解析内联 JSON
SELECT json_extract('{"name": "Alice", "age": 30}', '$.name');
-- "Alice"
-- json_transform 提取多个字段
SELECT json_transform(
'{"name":"Alice","address":{"city":"NYC"}}',
'{"name":"VARCHAR","city":"$.address.city"}'
);
-- VARIANT 类型:存储混合类型 JSON
CREATE TABLE logs AS
SELECT '{"event":"click","userId":42}'::VARIANT AS data
UNION ALL
SELECT '{"event":"pageview","duration":2.5}'::VARIANT;
-- 查询 VARIANT 列
SELECT data['event']::VARCHAR AS event_type FROM logs;
-- JSON 数组展开为行
SELECT unnest(json_transform(
'[{"x":1},{"x":2}]',
'["VARCHAR"]'
)) AS items;
6️⃣ Python 集成
import duckdb
import pandas as pd
# 内存数据库
con = duckdb.connect()
# 直接查询 pandas DataFrame
df = pd.DataFrame({"id": [1, 2, 3], "value": [10, 20, 30]})
result = con.execute("SELECT * FROM df WHERE value > 15").fetchdf()
print(result)
# id value
# 0 2 20
# 1 3 30
# 注册 DataFrame 为命名视图
con.register("my_view", df)
con.execute("SELECT AVG(value) FROM my_view").fetchone()
# (20.0,)
# 从 DataFrame 创建表
con.execute("CREATE TABLE my_table AS SELECT * FROM df")
# 批量从 Python 对象插入
con.executemany(
"INSERT INTO my_table VALUES (?, ?)",
[(4, 40), (5, 50), (6, 60)]
)
# 从 Python 导出结果为 CSV/Parquet
con.execute("COPY (SELECT * FROM df) TO 'output.parquet' (FORMAT PARQUET)")
7️⃣ 常用模式
PIVOT / UNPIVOT
-- PIVOT:行转列
PIVOT sales ON quarter USING SUM(amount)
GROUP BY product;
-- UNPIVOT:列转行
UNPIVOT monthly_sales
ON jan, feb, mar
INTO NAME month VALUE amount;
LIST & STRUCT
-- 聚合为列表
SELECT department, LIST(name ORDER BY name) AS employees
FROM employees GROUP BY department;
-- 构建结构体
SELECT {'name': name, 'salary': salary, 'level': 'senior'} AS profile
FROM employees WHERE salary > 120000;
UNNEST(数组展平为行)
-- 展开列表列
SELECT department, unnest(projects) AS project
FROM employee_projects;
-- 生成日期序列
SELECT unnest(generate_series(DATE '2024-01-01', DATE '2024-01-10', INTERVAL 1 DAY)) AS dt;
友好数量显示
SELECT
total_sales,
total_sales // 1e6 AS sales_millions,
ROUND(total_sales / 1e6, 2) AS sales_millions_rounded
FROM quarterly_report;
8️⃣ 💰 变现思路
将 DuckDB 技能转化为收入 — 查看我们的详细指南:
| 文章 | 主题 |
|---|---|
| DuckDB 替代 jq | 用 DuckDB 处理 JSON 数据管道 |
| DuckDB 在线 Shell | 构建网页版 SQL 服务 |
| DuckDB × Delta Lake | Lakehouse 架构分析 |
| DuckDB + DeepSeek / SmallPond | 分布式数据处理 |
| DuckDB 跨库 JOIN | 多源数据联邦查询 |
| DuckDB 空间 / Geometry | 内置空间数据分析 |
| DuckLake v1 介绍 | DuckDB 数据湖仓 |
📬 订阅 DuckDB 实验室
获取最新的 DuckDB 技巧、性能评测和变现策略,直接发送到您的邮箱。
👉 [通过邮件订阅](mailto:admin@duckdblab.org?subject=订阅%20DuckDB%20实验室&body=我想订阅 DuckDB 实验室的更新通知。) — 或访问我们的联系页面。
最后更新:2026-05-11 · DuckDB v1.3