DuckDB SQL 语法概述
DuckDB 的 SQL 语法基于 PostgreSQL 标准,并在其基础上做了大量针对分析场景的增强。本文覆盖了 DuckDB SQL 语法的核心内容,包括标准 SQL 操作和 DuckDB 独有语法特性。
如果你是 SQL 新手,建议先阅读 DuckDB 入门教程;如果已经有一定基础,本文可作为日常使用的 DuckDB SQL 语法速查手册。
数据定义语言 (DDL)
创建表
-- 标准建表
CREATE TABLE employees (
id INTEGER PRIMARY KEY,
name VARCHAR(100),
department VARCHAR,
salary DECIMAL(10,2),
hire_date DATE
);
-- 从查询结果创建表
CREATE TABLE high_earners AS
SELECT * FROM employees WHERE salary > 100000;
-- 临时表(会话结束后自动删除)
CREATE TEMP TABLE temp_results AS
SELECT department, AVG(salary) AS avg_salary
FROM employees GROUP BY department;
修改表结构
-- 添加列
ALTER TABLE employees ADD COLUMN email VARCHAR;
-- 删除列
ALTER TABLE employees DROP COLUMN email;
-- 重命名列
ALTER TABLE employees RENAME COLUMN salary TO base_salary;
数据查询语言 (DQL) — SELECT
基础 SELECT 语法
-- 基本查询
SELECT name, department, salary
FROM employees
WHERE department = 'Engineering'
ORDER BY salary DESC
LIMIT 10;
DuckDB 特有的 SELECT 增强
DuckDB 提供了一些标准 SQL 中没有的便捷语法:
-- GROUP BY ALL:自动按 SELECT 中所有非聚合列分组
SELECT department, year(hire_date) AS hire_year, COUNT(*)
FROM employees
GROUP BY ALL;
-- 等价于 GROUP BY department, year(hire_date)
-- COLUMNS():对多个列应用相同表达式
SELECT COLUMNS('salary|bonus') * 1.1 AS salary_increase
FROM employees;
-- EXCLUDE:排除指定列
SELECT * EXCLUDE (salary, ssn)
FROM employees;
-- REPLACE:替换指定列的表达式
SELECT * REPLACE (salary * 1.1 AS salary)
FROM employees;
数据操作语言 (DML)
INSERT
-- 单行插入
INSERT INTO employees VALUES (5, 'Eve', 'Engineering', 130000, '2026-01-15');
-- 多行插入
INSERT INTO employees VALUES
(6, 'Frank', 'Sales', 90000, '2026-02-01'),
(7, 'Grace', 'Marketing', 95000, '2026-02-15');
-- 从查询插入
INSERT INTO high_earners
SELECT * FROM employees WHERE salary > 100000;
-- 从文件插入(DuckDB 特色)
INSERT INTO employees
SELECT * FROM read_csv_auto('new_employees.csv');
UPDATE 和 DELETE
-- 更新
UPDATE employees
SET salary = salary * 1.05
WHERE department = 'Engineering' AND salary < 100000;
-- 删除
DELETE FROM employees WHERE id = 5;
聚合函数与 GROUP BY
标准聚合
SELECT
department,
COUNT(*) AS employee_count,
SUM(salary) AS total_salary,
AVG(salary) AS avg_salary,
MIN(salary) AS min_salary,
MAX(salary) AS max_salary
FROM employees
GROUP BY department
HAVING COUNT(*) > 1
ORDER BY avg_salary DESC;
高级聚合
-- 中位数
SELECT department, MEDIAN(salary) AS median_salary
FROM employees GROUP BY department;
-- 百分位数
SELECT department,
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY salary) AS q1,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY salary) AS q3
FROM employees GROUP BY department;
-- 多种统计量
SELECT department,
AVG(salary) AS mean,
STDDEV(sample(salary)) AS stddev,
SKEWNESS(salary) AS skew,
KURTOSIS(salary) AS kurt
FROM employees GROUP BY department;
窗口函数
基本窗口函数
SELECT
name,
department,
salary,
-- 排名
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS rank,
-- 累计和
SUM(salary) OVER (PARTITION BY department ORDER BY name) AS running_total,
-- 全局聚合
AVG(salary) OVER () AS company_avg,
-- 与平均值差距
salary - AVG(salary) OVER () AS diff_from_avg
FROM employees;
滑动窗口
SELECT
date,
amount,
-- 前3天移动平均
AVG(amount) OVER (
ORDER BY date
ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
) AS moving_avg_3d,
-- 年初至今累计
SUM(amount) OVER (
PARTITION BY year(date)
ORDER BY date
) AS ytd_total
FROM daily_sales;
窗口过滤与排序
-- 窗口函数中使用 FILTER
SELECT
department,
AVG(salary) AS avg_all,
AVG(salary) FILTER (WHERE salary > 100000) AS avg_high_only
FROM employees
GROUP BY department;
公用表表达式 (CTE)
基础 CTE
WITH department_stats AS (
SELECT
department,
AVG(salary) AS avg_dept_salary
FROM employees
GROUP BY department
)
SELECT
e.name,
e.department,
e.salary,
d.avg_dept_salary,
e.salary - d.avg_dept_salary AS salary_diff
FROM employees e
JOIN department_stats d ON e.department = d.department
WHERE e.salary > d.avg_dept_salary
ORDER BY salary_diff DESC;
递归 CTE
-- 生成日期序列
WITH RECURSIVE dates AS (
SELECT '2026-01-01'::DATE AS date
UNION ALL
SELECT date + 1
FROM dates
WHERE date < '2026-01-31'
)
SELECT * FROM dates;
-- 组织树查询
WITH RECURSIVE org_tree AS (
SELECT id, name, manager_id, 1 AS level
FROM org_chart
WHERE manager_id IS NULL
UNION ALL
SELECT e.id, e.name, e.manager_id, t.level + 1
FROM org_chart e
JOIN org_tree t ON e.manager_id = t.id
)
SELECT * FROM org_tree ORDER BY level, name;
UNION 和集合操作
-- UNION (去重)
SELECT name, department FROM current_employees
UNION
SELECT name, department FROM former_employees
ORDER BY name;
-- UNION ALL (保留重复,更快)
SELECT region, revenue FROM sales_q1
UNION ALL
SELECT region, revenue FROM sales_q2;
-- INTERSECT 和 EXCEPT
SELECT product FROM products_2025
INTERSECT
SELECT product FROM products_2026;
-- 返回2025和2026都有的产品
SELECT product FROM products_2025
EXCEPT
SELECT product FROM products_2026;
-- 返回2025有但2026没有的产品
PIVOT / UNPIVOT
PIVOT:行转列
-- 将不同部门的工资统计转置为列
PIVOT employees
ON department
USING AVG(salary) AS avg_salary,
COUNT(*) AS count
GROUP BY hire_year;
-- 使用 PIVOT 的 SQL 标准语法
SELECT *
FROM (SELECT department, salary FROM employees)
PIVOT (
AVG(salary)
FOR department IN ('Engineering', 'Sales', 'Marketing')
) AS p;
UNPIVOT:列转行
-- 将季度列转置为行
UNPIVOT quarterly_sales
ON q1, q2, q3, q4
INTO
NAME quarter
VALUE revenue;
DuckDB 特有语法与函数
列表和结构体
-- 列表操作
SELECT [1, 2, 3] AS numbers,
list_value(1, 2, 3) AS also_numbers,
list_sort([3, 1, 2]) AS sorted;
-- 结构体
SELECT {'name': 'Alice', 'salary': 120000} AS employee,
(employee).name AS name;
-- UNNEST:展开嵌套数据
SELECT name, unnest(skills) AS skill
FROM (VALUES ('Alice', ['SQL', 'Python', 'Java'])) AS t(name, skills);
日期时间函数
SELECT
CURRENT_DATE AS today,
DATE_TRUNC('month', '2026-05-21'::DATE) AS month_start,
DATE_DIFF('month', '2026-01-01'::DATE, '2026-12-31'::DATE) AS months_diff,
DATE_ADD('2026-01-01'::DATE, INTERVAL 3 MONTH) AS three_months_later,
EXTRACT(YEAR FROM '2026-05-21'::DATE) AS year;
字符串函数
SELECT
UPPER('hello') AS upper,
LOWER('HELLO') AS lower,
LENGTH('DuckDB') AS len,
CONCAT('Hello', ' ', 'DuckDB') AS greeting,
SPLIT_PART('a,b,c', ',', 2) AS second,
REGEXP_MATCHES('[email protected]', '\w+@\w+\.\w+') AS is_email;
性能优化提示
-- 1. 使用 EXPLAIN 查看执行计划
EXPLAIN SELECT * FROM employees WHERE department = 'Engineering';
-- 2. 创建索引加速过滤
CREATE INDEX idx_emp_dept ON employees(department);
-- 3. 限制并行度
SET threads = 4;
-- 4. 调整内存
SET memory_limit = '8GB';
-- 5. 使用 Parquet 替代 CSV
COPY (SELECT * FROM employees) TO 'employees.parquet' (FORMAT PARQUET);

相关文章
📘 博客: https://duckdblab.org #DuckDB #SQL语法 #SQL教程 #数据分析