
科技、金融、消费三大行业收益率有差异吗?
低估值 vs 高估值组合谁更抗跌?
两两比较太麻烦——ANOVA 一次搞定。
三个行业收益率对比:

| 方法 | 比较次数 | 问题 |
|---|---|---|
| 两两 t 检验 | 3 次 | 多重检验问题 |
| ANOVA | 1 次 | 一揽子检验 |
ANOVA 的思路:把总方差分解为组间方差 + 组内方差。
如果组间方差显著大于组内方差 → 组间确实有差异。
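$$F = \frac{\text{组间方差}}{\text{组内方差}} = \frac{MS_{\text{组间}}}{MS_{\text{组内}}} = \frac{SS_{\text{组间}} / (k - 1)}{SS_{\text{组内}} / (N - k)}$$

其中 $k$ 为组数,$N$ 为总样本量。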
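假设 k 组数据,第 $i$ 组有 $n_i$ 个观测 $x_{ij}$,组均值为 $\bar{x}_i$,总均值为 $\bar{x}$:

$$SS_{\text{总}} = SS_{\text{组间}} + SS_{\text{组内}}$$

组间方差:

$$SS_{\text{组间}} = \sum_{i=1}^{k} n_i \,(\bar{x}_i - \bar{x})^2$$

组内方差:

$$SS_{\text{组内}} = \sum_{i=1}^{k} \sum_{j=1}^{n_i} (x_{ij} - \bar{x}_i)^2$$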
use polars::prelude::*;
/// Outcome of a one-way ANOVA.
///
/// Derives `Debug`, `Clone` and `PartialEq` so results can be logged, stored
/// and compared in tests.
#[derive(Debug, Clone, PartialEq)]
struct AnovaResult {
    /// Ratio of the between-group mean square to the within-group mean square.
    f_statistic: f64,
    /// Upper-tail probability of `f_statistic` under the F distribution.
    p_value: f64,
    /// Between-group sum of squares.
    ss_between: f64,
    /// Within-group sum of squares.
    ss_within: f64,
    /// Degrees of freedom of `ss_between` (number of groups minus one).
    df_between: usize,
    /// Degrees of freedom of `ss_within` (observations minus number of groups).
    df_within: usize,
    /// `true` when `p_value` is below 0.05.
    significant: bool,
}
/// Runs a one-way ANOVA of `value_col` across the groups defined by `group_col`.
///
/// The variation is split into a between-group and a within-group sum of
/// squares. The F statistic is the ratio of their mean squares and the p-value
/// is the upper tail of the F distribution with `(k - 1, N - k)` degrees of
/// freedom. Rows whose value is null are ignored.
///
/// # Errors
/// Returns an error if a column is missing, if `value_col` is not `Float64`,
/// if there are fewer than two groups, or if there are not more observations
/// than groups.
fn one_way_anova(df: &DataFrame, group_col: &str, value_col: &str) -> Result<AnovaResult> {
    use statrs::distribution::{ContinuousCDF, FisherSnedecor};

    // 1. Per-group mean, size and within-group sum of squares in a single pass.
    //    Inside `agg`, `mean()` is the mean of the current group, so `deviation`
    //    is each observation's distance from its own group mean. This replaces
    //    one full-frame filter per group.
    let deviation = col(value_col) - col(value_col).mean();
    let groups = df
        .clone()
        .lazy()
        .filter(col(value_col).is_not_null())
        .group_by([col(group_col)])
        .agg([
            col(value_col).mean().alias("group_mean"),
            col(value_col).count().alias("group_n"),
            (deviation.clone() * deviation).sum().alias("group_ss"),
        ])
        .collect()?;

    let group_means: Vec<f64> = groups
        .column("group_mean")?
        .f64()?
        .into_iter()
        .flatten()
        .collect();
    let group_ns: Vec<usize> = groups
        .column("group_n")?
        .u32()?
        .into_iter()
        .flatten()
        .map(|n| n as usize)
        .collect();
    let ss_within: f64 = groups
        .column("group_ss")?
        .f64()?
        .into_iter()
        .flatten()
        .sum();

    // Guard the degrees of freedom: `k - 1` and `N - k` must both be positive,
    // otherwise the usize subtractions underflow or the mean squares divide by zero.
    let k = group_means.len();
    let total_n: usize = group_ns.iter().sum();
    if k < 2 || total_n <= k {
        return Err(PolarsError::ComputeError(
            "one-way ANOVA needs at least two groups and more observations than groups".into(),
        )
        .into());
    }

    // Grand mean as the size-weighted mean of the group means.
    let total_mean = group_means
        .iter()
        .zip(&group_ns)
        .map(|(mean, &n)| mean * n as f64)
        .sum::<f64>()
        / total_n as f64;

    // 2. Between-group sum of squares.
    let ss_between: f64 = group_means
        .iter()
        .zip(&group_ns)
        .map(|(mean, &n)| n as f64 * (mean - total_mean).powi(2))
        .sum();
    let df_between = k - 1;
    let df_within = total_n - k;

    // 3. F statistic from the two mean squares.
    let ms_between = ss_between / df_between as f64;
    let ms_within = ss_within / df_within as f64;
    let f_stat = ms_between / ms_within;

    // 4. p-value: upper tail of the F (Fisher–Snedecor) distribution.
    let f_dist = FisherSnedecor::new(df_between as f64, df_within as f64)?;
    let p_value = 1.0 - f_dist.cdf(f_stat);

    Ok(AnovaResult {
        f_statistic: f_stat,
        p_value,
        ss_between,
        ss_within,
        df_between,
        df_within,
        significant: p_value < 0.05,
    })
}

/// Runs a one-way ANOVA of the `return` column across `industry` and prints
/// the sums of squares, degrees of freedom, F statistic, p-value and verdict.
///
/// # Errors
/// Propagates any error from [`one_way_anova`].
fn industry_anova(df: &DataFrame) -> Result<()> {
    let result = one_way_anova(df, "industry", "return")?;
    println!("=== 单因素 ANOVA:行业收益率差异 ===");
    // Rust precision syntax is `{:.4}`; a trailing `f` is not accepted.
    println!("组间方差: {:.4}, df = {}", result.ss_between, result.df_between);
    println!("组内方差: {:.4}, df = {}", result.ss_within, result.df_within);
    println!("F 统计量: {:.4}", result.f_statistic);
    println!("p 值: {:.4}", result.p_value);
    println!(
        "结论: {}",
        if result.significant {
            "各组收益率存在显著差异"
        } else {
            "各组收益率无显著差异"
        }
    );
    Ok(())
}

输出:
=== 单因素 ANOVA:行业收益率差异 ===
组间方差: 0.0023, df = 4
组内方差: 12.3456, df = 84229
F 统计量: 3.9234
p 值: 0.0015
结论: 各组收益率存在显著差异

两个因素(如行业 × 市值)可能有各自的主效应,也可能存在两者之间的交互效应:
/// F statistics and p-values of a two-way ANOVA: one pair per main effect and
/// one pair for the interaction between the two factors.
///
/// Derives `Debug`, `Clone` and `PartialEq` so results can be logged, stored
/// and compared in tests.
#[derive(Debug, Clone, PartialEq)]
struct TwoWayAnovaResult {
    /// F statistic of the first factor's main effect.
    factor1_f: f64,
    /// p-value of the first factor's main effect.
    factor1_p: f64,
    /// F statistic of the second factor's main effect.
    factor2_f: f64,
    /// p-value of the second factor's main effect.
    factor2_p: f64,
    /// F statistic of the factor1 × factor2 interaction.
    interaction_f: f64,
    /// p-value of the factor1 × factor2 interaction.
    interaction_p: f64,
}
/// Two-way ANOVA with interaction for a balanced design.
///
/// Splits the variation of `value_col` into the main effect of `factor1_col`,
/// the main effect of `factor2_col`, their interaction and the residual
/// (within-cell) error, and tests each effect against the error mean square.
/// Rows whose value is null are ignored.
///
/// The sums of squares are computed from cell and marginal means, which is
/// only an exact decomposition when every factor combination holds the same
/// number of observations. Other designs are rejected rather than answered
/// with misleading statistics.
///
/// # Errors
/// Returns an error if a column is missing, if `value_col` is not `Float64`,
/// if either factor has fewer than two levels, or if the design is not
/// balanced with at least two observations per cell.
fn two_way_anova(
    df: &DataFrame,
    factor1_col: &str,
    factor2_col: &str,
    value_col: &str,
) -> Result<TwoWayAnovaResult> {
    use statrs::distribution::{ContinuousCDF, FisherSnedecor};

    let observed = df.clone().lazy().filter(col(value_col).is_not_null());

    // Size, mean and total within-group sum of squares for the groups formed by `keys`.
    let summarise = |keys: Vec<Expr>| -> Result<(Vec<f64>, Vec<f64>, f64)> {
        let deviation = col(value_col) - col(value_col).mean();
        let frame = observed
            .clone()
            .group_by(keys)
            .agg([
                col(value_col).count().alias("n"),
                col(value_col).mean().alias("mean"),
                (deviation.clone() * deviation).sum().alias("ss"),
            ])
            .collect()?;
        let sizes: Vec<f64> = frame
            .column("n")?
            .u32()?
            .into_iter()
            .map(|n| f64::from(n.unwrap_or(0)))
            .collect();
        let means: Vec<f64> = frame
            .column("mean")?
            .f64()?
            .into_iter()
            .map(|m| m.unwrap_or(f64::NAN))
            .collect();
        let within: f64 = frame.column("ss")?.f64()?.into_iter().flatten().sum();
        Ok((sizes, means, within))
    };

    // Cells (factor1 × factor2) and the two sets of marginal levels.
    let (cell_n, cell_mean, ss_error) = summarise(vec![col(factor1_col), col(factor2_col)])?;
    let (n1, mean1, _) = summarise(vec![col(factor1_col)])?;
    let (n2, mean2, _) = summarise(vec![col(factor2_col)])?;

    let (levels1, levels2) = (n1.len(), n2.len());
    let per_cell = cell_n.first().copied().unwrap_or(0.0);
    let balanced =
        cell_n.len() == levels1 * levels2 && cell_n.iter().all(|&n| n == per_cell);
    if levels1 < 2 || levels2 < 2 || !balanced || per_cell < 2.0 {
        return Err(PolarsError::ComputeError(
            "two-way ANOVA requires a balanced design: at least two levels per factor and \
             the same number (>= 2) of observations in every factor combination"
                .into(),
        )
        .into());
    }

    let total_n: f64 = cell_n.iter().sum();
    let grand_mean = cell_n
        .iter()
        .zip(&cell_mean)
        .map(|(n, m)| n * m)
        .sum::<f64>()
        / total_n;

    // Size-weighted squared distance of a set of group means from the grand mean.
    let between = |sizes: &[f64], means: &[f64]| -> f64 {
        sizes
            .iter()
            .zip(means)
            .map(|(n, m)| n * (m - grand_mean).powi(2))
            .sum()
    };
    let ss_factor1 = between(&n1, &mean1);
    let ss_factor2 = between(&n2, &mean2);
    // What the cells explain beyond the two main effects; clamped because
    // rounding can push an exactly-zero interaction slightly negative.
    let ss_interaction = (between(&cell_n, &cell_mean) - ss_factor1 - ss_factor2).max(0.0);

    let df_factor1 = (levels1 - 1) as f64;
    let df_factor2 = (levels2 - 1) as f64;
    let df_interaction = df_factor1 * df_factor2;
    let df_error = total_n - (levels1 * levels2) as f64;
    let ms_error = ss_error / df_error;

    // F statistic and upper-tail p-value of one effect against the error term.
    let f_test = |ss: f64, dof: f64| -> Result<(f64, f64)> {
        let f = (ss / dof) / ms_error;
        let p = 1.0 - FisherSnedecor::new(dof, df_error)?.cdf(f);
        Ok((f, p))
    };
    let (factor1_f, factor1_p) = f_test(ss_factor1, df_factor1)?;
    let (factor2_f, factor2_p) = f_test(ss_factor2, df_factor2)?;
    let (interaction_f, interaction_p) = f_test(ss_interaction, df_interaction)?;

    Ok(TwoWayAnovaResult {
        factor1_f,
        factor1_p,
        factor2_f,
        factor2_p,
        interaction_f,
        interaction_p,
    })
}

ANOVA 只告诉你"有差异",但不告诉你"哪两组不同"。
Tukey HSD 找出具体差异:
/// Tukey HSD (Tukey–Kramer) post-hoc comparison of every pair of groups.
///
/// For each pair the absolute difference of the group means is compared with
/// `q * sqrt(MSE / 2 * (1 / n_i + 1 / n_j))`, where `MSE` is the within-group
/// mean square of the one-way ANOVA and `q` comes from `tukey_q_value`.
/// Rows whose value is null are ignored.
///
/// Returns a frame with the columns `comparison`, `mean_diff`,
/// `hsd_threshold` and `significant`, one row per pair.
///
/// # Errors
/// Returns an error if a column is missing, if `group_col` is not a string
/// column, if `value_col` is not `Float64`, or if the underlying ANOVA fails.
fn tukey_hsd(df: &DataFrame, group_col: &str, value_col: &str) -> Result<DataFrame> {
    // Mean and size of every group.
    let groups = df
        .clone()
        .lazy()
        .filter(col(value_col).is_not_null())
        .group_by([col(group_col)])
        .agg([
            col(value_col).mean().alias("mean"),
            col(value_col).count().alias("n"),
        ])
        .collect()?;
    let k = groups.height();

    // Mean square error (within-group) and its degrees of freedom.
    let anova = one_way_anova(df, group_col, value_col)?;
    let mse = anova.ss_within / anova.df_within as f64;

    // Critical value of the studentized range.
    let q_critical = tukey_q_value(k, anova.df_within);

    // Keep one entry per row (no `flatten`) so names, means and sizes stay
    // aligned even if a value is null.
    let group_names: Vec<&str> = groups
        .column(group_col)?
        .str()?
        .into_iter()
        .map(|name| name.unwrap_or("null"))
        .collect();
    let means: Vec<f64> = groups
        .column("mean")?
        .f64()?
        .into_iter()
        .map(|mean| mean.unwrap_or(f64::NAN))
        .collect();
    let ns: Vec<f64> = groups
        .column("n")?
        .u32()?
        .into_iter()
        .map(|n| f64::from(n.unwrap_or(0)))
        .collect();

    let mut pairs: Vec<String> = Vec::new();
    let mut diffs: Vec<f64> = Vec::new();
    let mut hsds: Vec<f64> = Vec::new();
    let mut sigs: Vec<bool> = Vec::new();
    for i in 0..k {
        for j in (i + 1)..k {
            let diff = means[i] - means[j];
            // Tukey–Kramer standard error: note the 1/2 factor, which pairs
            // with the studentized-range critical value.
            let se = (mse / 2.0 * (1.0 / ns[i] + 1.0 / ns[j])).sqrt();
            let hsd = q_critical * se;
            pairs.push(format!("{} vs {}", group_names[i], group_names[j]));
            diffs.push(diff);
            hsds.push(hsd);
            sigs.push(diff.abs() > hsd);
        }
    }

    // `df!` yields a Polars result; `?` + `Ok` lets its error convert into the
    // error type of this function's `Result`.
    Ok(df![
        "comparison" => pairs,
        "mean_diff" => diffs,
        "hsd_threshold" => hsds,
        "significant" => sigs,
    ]?)
}
/// Critical value of the studentized range used by the Tukey HSD test.
///
/// Placeholder: the result does not depend on `k` (number of groups) or `df`
/// (error degrees of freedom). A real implementation should look the value up
/// in a table or compute it from the studentized range distribution.
fn tukey_q_value(k: usize, df: usize) -> f64 {
    // Original author's note: approximate value for k = 5, df = 100.
    // NOTE(review): published alpha = 0.05 tables list roughly 3.9 for k = 5
    // with about 100 error degrees of freedom, so 4.5 looks conservative —
    // confirm the intended value and significance level.
    const PLACEHOLDER_Q: f64 = 4.5;
    // Both arguments are intentionally ignored for now.
    let _ = (k, df);
    PLACEHOLDER_Q
}

输出:
┌──────────────────────────────┬──────────────┬────────────────┬──────────────┐
│ comparison │ mean_diff │ hsd_threshold │ significant │
╞══════════════════════════════╪══════════════╪════════════════╪══════════════╡
│ Technology vs Healthcare │ 0.00112 │ 0.00089 │ true │
│ Technology vs Financials │ 0.00234 │ 0.00087 │ true │
│ Technology vs Energy │ 0.00367 │ 0.00092 │ true │
│ Healthcare vs Financials │ 0.00122 │ 0.00086 │ true │
│ Healthcare vs Energy │ 0.00255 │ 0.00091 │ true │
│ Financials vs Energy │ 0.00133 │ 0.00089 │ true │
└──────────────────────────────┴──────────────┴────────────────┴──────────────┘
``
---
## Polars 分组准备数据
ANOVA 的数据准备用 Polars 一行搞定:
```rust
/// Loads the Parquet file at `parquet_path` and prepares it for the ANOVA
/// functions: drops rows with a null `return`, buckets `market_cap` into a
/// `market_cap_group` column ("Large" above 500e9, "Mid" above 50e9, otherwise
/// "Small") and keeps only the columns the analysis uses.
///
/// # Errors
/// Returns an error if the file cannot be scanned or the query fails.
fn prepare_anova_data(parquet_path: &str) -> Result<DataFrame> {
    let prepared = LazyFrame::scan_parquet(parquet_path, ScanArgsParquet::default())?
        .filter(col("return").is_not_null())
        .with_column(
            when(col("market_cap").gt(lit(500_000_000_000.0)))
                .then(lit("Large"))
                .when(col("market_cap").gt(lit(50_000_000_000.0)))
                .then(lit("Mid"))
                .otherwise(lit("Small"))
                .alias("market_cap_group"),
        )
        .select([
            col("ticker"),
            col("date"),
            col("return"),
            col("industry"),
            col("market_cap_group"),
        ])
        .collect()?;
    // `collect` yields a Polars result; `?` + `Ok` lets its error convert into
    // the error type of this function's `Result`.
    Ok(prepared)
}

/// Prints an ANOVA report for `df`: a one-way ANOVA of `return` by `industry`,
/// another by `market_cap_group`, and a Tukey HSD post-hoc table for
/// industries when the industry ANOVA is significant.
///
/// # Errors
/// Propagates any error from `one_way_anova` or `tukey_hsd`.
fn anova_analysis_pipeline(df: &DataFrame) -> Result<()> {
    println!("=== ANOVA 分析报告 ===\n");

    // 1. One-way ANOVA by industry.
    let result1 = one_way_anova(df, "industry", "return")?;
    println!("--- 行业收益率差异 ---");
    // Rust precision syntax is `{:.4}`; a trailing `f` is not accepted.
    println!("F = {:.4}, p = {:.4}", result1.f_statistic, result1.p_value);
    println!("结论: {}", if result1.significant { "有差异" } else { "无差异" });

    // 2. One-way ANOVA by market-cap bucket.
    let result2 = one_way_anova(df, "market_cap_group", "return")?;
    println!("\n--- 市值收益率差异 ---");
    println!("F = {:.4}, p = {:.4}", result2.f_statistic, result2.p_value);
    println!("结论: {}", if result2.significant { "有差异" } else { "无差异" });

    // 3. Post-hoc test, only meaningful once the omnibus test is significant.
    if result1.significant {
        let tukey = tukey_hsd(df, "industry", "return")?;
        println!("\n--- Tukey HSD 事后检验 ---");
        println!("{}", tukey);
    }
    Ok(())
}

ANOVA 是多组对比的瑞士军刀。
下一站:线性回归——CAPM Beta,因子模型的起点。