DuckDB 速查表

DuckDB 一页速查 — 安装、CSV/JSON、SQL、性能优化、Python 集成等

🦆 DuckDB 速查表

一页快速参考 — 安装、数据导入/导出、SQL 查询、JSON 处理、性能优化、Python 集成与常用模式。


1️⃣ 安装

方式命令
Homebrew (macOS)brew install duckdb
pip (Python)pip install duckdb
Dockerdocker run --rm -it duckdb/duckdb
CLI (Linux/macOS 二进制)curl -fsSL https://install.duckdb.org | sh
-- 验证安装
SELECT version();
-- ┌──────────────┐
-- │  version()   │
-- ├──────────────┤
-- │ v1.3.0       │
-- └──────────────┘

2️⃣ CSV & Parquet 导入/导出

-- 读取 CSV(自动检测模式)
SELECT * FROM read_csv_auto('data.csv');

-- 批量读取多个 CSV 文件
SELECT * FROM read_csv_auto('data/*.csv', header=true, delim=',');

-- 导出为 CSV
COPY (SELECT * FROM users WHERE active = true) TO 'active_users.csv'
  WITH (HEADER, DELIMITER ',');

-- 读取 Parquet
SELECT * FROM read_parquet('sales.parquet');

-- 导出为 Parquet(压缩列式存储)
COPY orders TO 'orders.parquet' (FORMAT PARQUET);

-- 同一查询合并 CSV 和 Parquet
SELECT * FROM read_csv_auto('recent.csv')
UNION ALL
SELECT * FROM read_parquet('archive.parquet');

3️⃣ SQL 查询

SELECT & WHERE

SELECT name, salary, department
FROM employees
WHERE salary > 80000
  AND department IN ('Engineering', 'Product')
ORDER BY salary DESC
LIMIT 10;

GROUP BY + HAVING

SELECT department, COUNT(*) AS headcount, ROUND(AVG(salary), 0) AS avg_sal
FROM employees
GROUP BY department
HAVING COUNT(*) > 5
ORDER BY avg_sal DESC;

窗口函数

SELECT name, department, salary,
  RANK()       OVER (PARTITION BY department ORDER BY salary DESC) AS rank,
  DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dense_rank,
  LAG(salary)  OVER (PARTITION BY department ORDER BY salary)      AS prev_salary,
  AVG(salary)  OVER (PARTITION BY department)                      AS dept_avg
FROM employees;

JOIN 类型

-- INNER / LEFT / RIGHT / FULL OUTER / CROSS / ANTI / SEMI
SELECT e.name, d.name AS dept
FROM employees e
LEFT JOIN departments d ON e.dept_id = d.id;

-- ANTI JOIN:无部门的员工
SELECT * FROM employees e
ANTI JOIN departments d ON e.dept_id = d.id;

-- SEMI JOIN:部门在列表中的员工
SELECT * FROM employees e
SEMI JOIN departments d ON e.dept_id = d.id;

4️⃣ 性能优化技巧

-- 🔍 EXPLAIN ANALYZE 查看执行计划与耗时
EXPLAIN ANALYZE
SELECT department, SUM(salary) FROM employees GROUP BY department;

-- 🧊 物化 CTE(避免重复计算)
WITH high_earners AS MATERIALIZED (
  SELECT * FROM employees WHERE salary > 100000
)
SELECT department, COUNT(*) FROM high_earners GROUP BY department;

-- 📊 在过滤/连接列上创建索引
CREATE INDEX idx_employees_dept ON employees(dept_id);
CREATE INDEX idx_employees_salary ON employees(salary);

-- 🚀 使用 ART 索引加速点查询
SET enable_progress_bar = false;
SELECT * FROM employees WHERE id = 42;  -- 使用 ART 索引

-- 🧠 大数据集快速探索采样
SELECT * FROM huge_table USING SAMPLE 1%;

5️⃣ JSON 处理

-- 直接读取 JSON 文件
SELECT * FROM read_json_auto('logs.json');

-- 解析内联 JSON
SELECT json_extract('{"name": "Alice", "age": 30}', '$.name');
-- "Alice"

-- json_transform 提取多个字段
SELECT json_transform(
  '{"name":"Alice","address":{"city":"NYC"}}',
  '{"name":"VARCHAR","city":"$.address.city"}'
);

-- VARIANT 类型:存储混合类型 JSON
CREATE TABLE logs AS
SELECT '{"event":"click","userId":42}'::VARIANT AS data
UNION ALL
SELECT '{"event":"pageview","duration":2.5}'::VARIANT;

-- 查询 VARIANT 列
SELECT data['event']::VARCHAR AS event_type FROM logs;

-- JSON 数组展开为行
SELECT unnest(json_transform(
  '[{"x":1},{"x":2}]',
  '["VARCHAR"]'
)) AS items;

6️⃣ Python 集成

import duckdb
import pandas as pd

# 内存数据库
con = duckdb.connect()

# 直接查询 pandas DataFrame
df = pd.DataFrame({"id": [1, 2, 3], "value": [10, 20, 30]})
result = con.execute("SELECT * FROM df WHERE value > 15").fetchdf()
print(result)
#    id  value
# 0   2     20
# 1   3     30

# 注册 DataFrame 为命名视图
con.register("my_view", df)
con.execute("SELECT AVG(value) FROM my_view").fetchone()
# (20.0,)

# 从 DataFrame 创建表
con.execute("CREATE TABLE my_table AS SELECT * FROM df")

# 批量从 Python 对象插入
con.executemany(
    "INSERT INTO my_table VALUES (?, ?)",
    [(4, 40), (5, 50), (6, 60)]
)

# 从 Python 导出结果为 CSV/Parquet
con.execute("COPY (SELECT * FROM df) TO 'output.parquet' (FORMAT PARQUET)")

7️⃣ 常用模式

PIVOT / UNPIVOT

-- PIVOT:行转列
PIVOT sales ON quarter USING SUM(amount)
  GROUP BY product;

-- UNPIVOT:列转行
UNPIVOT monthly_sales
  ON jan, feb, mar
  INTO NAME month VALUE amount;

LIST & STRUCT

-- 聚合为列表
SELECT department, LIST(name ORDER BY name) AS employees
FROM employees GROUP BY department;

-- 构建结构体
SELECT {'name': name, 'salary': salary, 'level': 'senior'} AS profile
FROM employees WHERE salary > 120000;

UNNEST(数组展平为行)

-- 展开列表列
SELECT department, unnest(projects) AS project
FROM employee_projects;

-- 生成日期序列
SELECT unnest(generate_series(DATE '2024-01-01', DATE '2024-01-10', INTERVAL 1 DAY)) AS dt;

友好数量显示

SELECT
  total_sales,
  total_sales // 1e6 AS sales_millions,
  ROUND(total_sales / 1e6, 2) AS sales_millions_rounded
FROM quarterly_report;

8️⃣ 💰 变现思路

将 DuckDB 技能转化为收入 — 查看我们的详细指南:

文章主题
DuckDB 替代 jq用 DuckDB 处理 JSON 数据管道
DuckDB 在线 Shell构建网页版 SQL 服务
DuckDB × Delta LakeLakehouse 架构分析
DuckDB + DeepSeek / SmallPond分布式数据处理
DuckDB 跨库 JOIN多源数据联邦查询
DuckDB 空间 / Geometry内置空间数据分析
DuckLake v1 介绍DuckDB 数据湖仓

📬 订阅 DuckDB 实验室

获取最新的 DuckDB 技巧、性能评测和变现策略,直接发送到您的邮箱。

👉 [通过邮件订阅](mailto:admin@duckdblab.org?subject=订阅%20DuckDB%20实验室&body=我想订阅 DuckDB 实验室的更新通知。) — 或访问我们的联系页面


最后更新:2026-05-11 · DuckDB v1.3

Licensed under CC BY-NC-SA 4.0