
一、窗口函数:被严重低估的商业分析利器
很多数据分析师对窗口函数的认知还停留在"面试考点"或"SQL 炫技"层面。但如果你正在用 DuckDB 搭建数据产品、自动化报表或者商业分析工具,窗口函数就是那条从 SQL 到利润的最后一公里。
为什么?因为窗口函数让你能在不破坏原始数据粒度的前提下,完成所有常见的商业分析场景:
- 客户留存率计算(cohort analysis)
- 滚动收益/损失统计
- 同组内排名与 Top-N 筛选
- 前后行比较(环比、同比)
- 累计求和与移动平均
这些不是学术练习——每一个都对应着可收费的商业分析服务。
二、实战场景 1:客户留存 Cohort 分析
假设你经营一个 SaaS 产品,需要向客户展示月度留存率——这是你可以按项目收费的核心分析模块。
传统方法:你需要多表 JOIN 或者子查询,代码又长又慢。
DuckDB 窗口函数方案:
-- 模拟用户注册和活跃数据
CREATE TABLE user_activity AS
WITH dates AS (
SELECT unnest(generate_series(DATE '2025-01-01', DATE '2025-12-31', INTERVAL 1 DAY)) AS day
),
users AS (
SELECT unnest(generate_series(1, 5000)) AS user_id
),
user_cohorts AS (
SELECT
u.user_id,
MIN(da.day) AS first_active_day,
date_trunc('month', MIN(da.day)) AS cohort_month
FROM users u
CROSS JOIN dates da
WHERE random() < 0.3 -- 30% 概率某天活跃
GROUP BY u.user_id
),
daily_activity AS (
SELECT
uc.user_id,
uc.cohort_month,
date_trunc('month', da.day) AS activity_month,
COUNT(*) AS active_days
FROM user_cohorts uc
CROSS JOIN dates da
WHERE random() < 0.15 -- 每月 15% 概率活跃
GROUP BY uc.user_id, date_trunc('month', da.day)
)
SELECT * FROM daily_activity;
-- 核心:计算每个 cohort 的留存人数
WITH cohort_counts AS (
SELECT
cohort_month,
COUNT(DISTINCT user_id) AS cohort_size
FROM user_activity
GROUP BY cohort_month
),
monthly_retention AS (
SELECT
cohort_month,
activity_month,
datediff('month', cohort_month, activity_month) AS months_since_join,
COUNT(DISTINCT user_id) AS retained_users
FROM user_activity
GROUP BY cohort_month, activity_month, months_since_join
)
SELECT
mr.cohort_month,
mr.months_since_join,
mr.retained_users,
cc.cohort_size,
ROUND(mr.retained_users * 100.0 / cc.cohort_size, 2) AS retention_rate_pct,
-- 关键窗口函数:计算环比变化
LAG(mr.retained_users) OVER (
PARTITION BY mr.cohort_month
ORDER BY mr.months_since_join
) AS prev_month_retained,
CASE
WHEN LAG(mr.retained_users) OVER (
PARTITION BY mr.cohort_month
ORDER BY mr.months_since_join
) IS NOT NULL
THEN ROUND(
(mr.retained_users - LAG(mr.retained_users) OVER (
PARTITION BY mr.cohort_month
ORDER BY mr.months_since_join
)) * 100.0 / LAG(mr.retained_users) OVER (
PARTITION BY mr.cohort_month
ORDER BY mr.months_since_join
), 2
)
ELSE NULL
END AS mom_change_pct
FROM monthly_retention mr
JOIN cohort_counts cc ON mr.cohort_month = cc.cohort_month
WHERE mr.months_since_join <= 6
ORDER BY mr.cohort_month, mr.months_since_join;
商业价值:这份留存分析报告可以直接卖给 SaaS 公司作为月度付费服务的核心交付物。一套模板,换数据就能复用,边际成本趋近于零。
三、实战场景 2:电商 GMV 滚动趋势 + 异常检测
电商客户最关心的指标之一是 GMV 趋势。用窗口函数,你可以在一行 SQL 内同时完成趋势分析和异常标记。
-- 模拟电商订单数据
CREATE TABLE orders AS
SELECT
generate_series(1, 10000) AS order_id,
date '2025-01-01' + (random() * 365)::INT AS order_date,
('electronics'::VARCHAR, 'clothing'::VARCHAR, 'food'::VARCHAR, 'home'::VARCHAR)[floor(random()*4)+1] AS category,
ROUND((random() * 500 + 10)::NUMERIC, 2) AS amount
FROM generate_series(1, 10000);
-- 每日/每周 GMV + 移动平均 + 异常标记
WITH daily_gmv AS (
SELECT
order_date::DATE AS day,
SUM(amount) AS daily_gmv,
COUNT(*) AS order_count
FROM orders
GROUP BY 1
),
enriched AS (
SELECT
day,
daily_gmv,
order_count,
-- 7日移动平均
AVG(daily_gmv) OVER (
ORDER BY day
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS ma_7day,
-- 30日移动平均
AVG(daily_gmv) OVER (
ORDER BY day
ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
) AS ma_30day,
-- 昨日对比(环比)
LAG(daily_gmv, 1) OVER (ORDER BY day) AS prev_day_gmv,
-- 上周同日对比
LAG(daily_gmv, 7) OVER (ORDER BY day) AS same_week_prev_gmv,
-- 标准差(用于异常检测)
STDDEV(daily_gmv) OVER (
ORDER BY day
ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
) AS rolling_stddev,
-- 排名
RANK() OVER (
PARTITION BY EXTRACT(MONTH FROM day)
ORDER BY daily_gmv DESC
) AS day_rank_in_month,
-- 累计GMV(年初至今)
SUM(daily_gmv) OVER (
ORDER BY day
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS ytd_gmv
FROM daily_gmv
)
SELECT
day,
ROUND(daily_gmv, 2) AS daily_gmv,
ROUND(ma_7day, 2) AS ma_7day,
ROUND(ma_30day, 2) AS ma_30day,
ROUND(prev_day_gmv, 2) AS prev_day_gmv,
ROUND(same_week_prev_gmv, 2) AS same_week_prev_gmv,
ROUND(ytd_gmv, 2) AS ytd_gmv,
-- 异常标记:超过2个标准差
CASE
WHEN ABS(daily_gmv - ma_30day) > 2 * COALESCE(rolling_stddev, daily_gmv * 0.1)
THEN '⚠️ 异常'
ELSE '✓ 正常'
END AS anomaly_flag
FROM enriched
ORDER BY day
LIMIT 30;
商业价值:这套查询模板可以封装成一个"电商健康度监控 SaaS",按月收费。每个电商客户只需替换自己的订单表,查询逻辑完全复用。
四、实战场景 3:员工绩效排名 + 奖金分配
这是很多中小企业愿意付费的场景——自动化绩效考核。
-- 模拟销售团队业绩数据
CREATE TABLE sales_performance AS
SELECT
generate_series(1, 200) AS emp_id,
('张三'::VARCHAR, '李四'::VARCHAR, '王五'::VARCHAR, '赵六'::VARCHAR,
'钱七'::VARCHAR, '孙八'::VARCHAR, '周九'::VARCHAR, '吴十'::VARCHAR)
[floor(random()*8)+1] AS name,
('华东'::VARCHAR, '华南'::VARCHAR, '华北'::VARCHAR, '西部'::VARCHAR)[floor(random()*4)+1] AS region,
date '2025-01-01' + (random()*365)::INT AS month_date,
ROUND((random() * 100000 + 5000)::NUMERIC, 2) AS revenue,
floor(random() * 50 + 10)::INT AS deals_closed
FROM generate_series(1, 200);
-- 绩效排名 + 奖金计算
WITH monthly_stats AS (
SELECT
name,
region,
month_date,
SUM(revenue) AS total_revenue,
SUM(deals_closed) AS total_deals,
-- 区域排名
RANK() OVER (
PARTITION BY region, month_date
ORDER BY SUM(revenue) DESC
) AS region_rank,
-- 全公司排名
RANK() OVER (
PARTITION BY month_date
ORDER BY SUM(revenue) DESC
) AS company_rank,
-- 与区域平均的差距百分比
ROUND(
(SUM(revenue) - AVG(SUM(revenue)) OVER (PARTITION BY region, month_date))
* 100.0 / AVG(SUM(revenue)) OVER (PARTITION BY region, month_date),
2
) AS vs_region_avg_pct,
-- 环比增长
LAG(SUM(revenue)) OVER (
PARTITION BY name, region
ORDER BY month_date
) AS prev_month_revenue
FROM sales_performance
GROUP BY name, region, month_date
)
SELECT
name,
region,
month_date,
ROUND(total_revenue, 2) AS total_revenue,
total_deals,
region_rank,
company_rank,
ROUND(vs_region_avg_pct, 2) AS vs_region_avg_pct,
ROUND(prev_month_revenue, 2) AS prev_month_revenue,
CASE
WHEN prev_month_revenue IS NOT NULL
THEN ROUND((total_revenue - prev_month_revenue) * 100.0 / prev_month_revenue, 2)
ELSE NULL
END AS mom_growth_pct,
-- 奖金计算:排名越前奖金越高
CASE
WHEN company_rank = 1 THEN total_revenue * 0.15
WHEN company_rank <= 3 THEN total_revenue * 0.10
WHEN company_rank <= 10 THEN total_revenue * 0.05
ELSE total_revenue * 0.02
END AS bonus_amount
FROM monthly_stats
ORDER BY month_date, company_rank
LIMIT 50;
商业价值:这套绩效分析系统可以直接打包成"中小企业 HR 数据产品",卖给需要自动化绩效考核的中小公司。一份代码,N 个客户。
五、窗口函数 vs 传统方法对比
| 维度 | 传统 GROUP BY + JOIN | DuckDB 窗口函数 | DuckDB 优势 |
|---|---|---|---|
| 代码行数 | 50-100 行 | 10-20 行 | 减少 70%+ 代码量 |
| 执行速度 | 多次全表扫描 | 单次扫描 | 快 3-10 倍 |
| 可读性 | 嵌套子查询地狱 | 线性逻辑流 | 维护成本降低 60% |
| 灵活性 | 固定分组 | 任意滑动窗口 | 支持实时动态分析 |
| 内存占用 | 中间结果膨胀 | 流式处理 | 大表无需额外存储 |
六、变现建议:把窗口函数变成印钞机
窗口函数的真正价值不在于技术本身,而在于它能以极低的边际成本解决高频商业问题。以下是三种变现路径:
路径 1:标准化分析模板产品
将上述 Cohort 留存、GMV 趋势、绩效排名的查询封装成可配置的模板。客户只需提供数据连接,你交付完整的分析看板。定价:¥500-2000/模板/月。
路径 2:数据即服务(DaaS)
搭建一个"电商/零售行业数据监控平台",用窗口函数实时计算核心指标,通过 API 或邮件日报交付。定价:¥2000-5000/月/客户。
路径 3:咨询 + 培训组合
为企业做数据团队培训,教他们用窗口函数替代低效的 Pandas/Excel 流程。单次咨询 ¥5000-20000,培训 ¥3000-8000/天。
关键洞察:窗口函数是 DuckDB 生态中最容易被低估的变现杠杆。一条好的窗口函数查询,可以同时服务于留存分析、异常检测、排名统计三个商业场景——而代码量不超过 30 行。
学习更多 DuckDB 实战经验 → duckdblab.org