SQL Server2005/2008中有没有类似于Linear Regression functions in Oracle的线性回归功能?
发布于 2010-03-29 17:58:01
据我所知,没有。不过,写一篇文章非常简单。下面给出了y= alpha + beta *x+epsilon时的常数α和斜率β:
-- test data (GroupIDs 1, 2 normal regressions, 3, 4 = no variance)
WITH some_table(GroupID, x, y) AS
( SELECT 1, 1, 1 UNION SELECT 1, 2, 2 UNION SELECT 1, 3, 1.3
UNION SELECT 1, 4, 3.75 UNION SELECT 1, 5, 2.25 UNION SELECT 2, 95, 85
UNION SELECT 2, 85, 95 UNION SELECT 2, 80, 70 UNION SELECT 2, 70, 65
UNION SELECT 2, 60, 70 UNION SELECT 3, 1, 2 UNION SELECT 3, 1, 3
UNION SELECT 4, 1, 2 UNION SELECT 4, 2, 2),
-- linear regression query
/*WITH*/ mean_estimates AS
( SELECT GroupID
,AVG(x * 1.) AS xmean
,AVG(y * 1.) AS ymean
FROM some_table
GROUP BY GroupID
),
stdev_estimates AS
( SELECT pd.GroupID
-- T-SQL STDEV() implementation is not numerically stable
,CASE SUM(SQUARE(x - xmean)) WHEN 0 THEN 1
ELSE SQRT(SUM(SQUARE(x - xmean)) / (COUNT(*) - 1)) END AS xstdev
, SQRT(SUM(SQUARE(y - ymean)) / (COUNT(*) - 1)) AS ystdev
FROM some_table pd
INNER JOIN mean_estimates pm ON pm.GroupID = pd.GroupID
GROUP BY pd.GroupID, pm.xmean, pm.ymean
),
standardized_data AS -- increases numerical stability
( SELECT pd.GroupID
,(x - xmean) / xstdev AS xstd
,CASE ystdev WHEN 0 THEN 0 ELSE (y - ymean) / ystdev END AS ystd
FROM some_table pd
INNER JOIN stdev_estimates ps ON ps.GroupID = pd.GroupID
INNER JOIN mean_estimates pm ON pm.GroupID = pd.GroupID
),
standardized_beta_estimates AS
( SELECT GroupID
,CASE WHEN SUM(xstd * xstd) = 0 THEN 0
ELSE SUM(xstd * ystd) / (COUNT(*) - 1) END AS betastd
FROM standardized_data pd
GROUP BY GroupID
)
SELECT pb.GroupID
,ymean - xmean * betastd * ystdev / xstdev AS Alpha
,betastd * ystdev / xstdev AS Beta
FROM standardized_beta_estimates pb
INNER JOIN stdev_estimates ps ON ps.GroupID = pb.GroupID
INNER JOIN mean_estimates pm ON pm.GroupID = pb.GroupID
在这里,GroupID
用于显示如何按源数据表中的某个值进行分组。如果您只想要表中所有数据的统计信息(而不是特定的子组),您可以删除它和连接。为了清楚起见,我使用了WITH
语句。作为替代,您可以使用子查询。请注意表中使用的数据类型的精度,因为如果相对于数据的精度不够高,数值稳定性可能会迅速恶化。
编辑:(在回答Peter的问题时获得额外的统计信息,如评论中的R2 )
您可以使用相同的技术轻松地计算其他统计数据。下面是一个包含R2、相关性和样本协方差的版本:
-- test data (GroupIDs 1, 2 normal regressions, 3, 4 = no variance)
WITH some_table(GroupID, x, y) AS
( SELECT 1, 1, 1 UNION SELECT 1, 2, 2 UNION SELECT 1, 3, 1.3
UNION SELECT 1, 4, 3.75 UNION SELECT 1, 5, 2.25 UNION SELECT 2, 95, 85
UNION SELECT 2, 85, 95 UNION SELECT 2, 80, 70 UNION SELECT 2, 70, 65
UNION SELECT 2, 60, 70 UNION SELECT 3, 1, 2 UNION SELECT 3, 1, 3
UNION SELECT 4, 1, 2 UNION SELECT 4, 2, 2),
-- linear regression query
/*WITH*/ mean_estimates AS
( SELECT GroupID
,AVG(x * 1.) AS xmean
,AVG(y * 1.) AS ymean
FROM some_table pd
GROUP BY GroupID
),
stdev_estimates AS
( SELECT pd.GroupID
-- T-SQL STDEV() implementation is not numerically stable
,CASE SUM(SQUARE(x - xmean)) WHEN 0 THEN 1
ELSE SQRT(SUM(SQUARE(x - xmean)) / (COUNT(*) - 1)) END AS xstdev
, SQRT(SUM(SQUARE(y - ymean)) / (COUNT(*) - 1)) AS ystdev
FROM some_table pd
INNER JOIN mean_estimates pm ON pm.GroupID = pd.GroupID
GROUP BY pd.GroupID, pm.xmean, pm.ymean
),
standardized_data AS -- increases numerical stability
( SELECT pd.GroupID
,(x - xmean) / xstdev AS xstd
,CASE ystdev WHEN 0 THEN 0 ELSE (y - ymean) / ystdev END AS ystd
FROM some_table pd
INNER JOIN stdev_estimates ps ON ps.GroupID = pd.GroupID
INNER JOIN mean_estimates pm ON pm.GroupID = pd.GroupID
),
standardized_beta_estimates AS
( SELECT GroupID
,CASE WHEN SUM(xstd * xstd) = 0 THEN 0
ELSE SUM(xstd * ystd) / (COUNT(*) - 1) END AS betastd
FROM standardized_data
GROUP BY GroupID
)
SELECT pb.GroupID
,ymean - xmean * betastd * ystdev / xstdev AS Alpha
,betastd * ystdev / xstdev AS Beta
,CASE ystdev WHEN 0 THEN 1 ELSE betastd * betastd END AS R2
,betastd AS Correl
,betastd * xstdev * ystdev AS Covar
FROM standardized_beta_estimates pb
INNER JOIN stdev_estimates ps ON ps.GroupID = pb.GroupID
INNER JOIN mean_estimates pm ON pm.GroupID = pb.GroupID
EDIT 2通过标准化数据(而不仅仅是居中)和由于numerical stability issues而替换STDEV
来提高数值稳定性。对我来说,目前的实现似乎是稳定性和复杂性之间的最佳折衷。我可以通过用一个数值稳定的在线算法替换我的标准差来提高稳定性,但这会使实现变得复杂(并减慢它)。类似地,在有限的测试中,使用Kahan(-Babuška-Neumaier)补偿SUM
和AVG
的实现似乎会稍微好一些,但会使查询更加复杂。只要我不知道T-SQL如何实现SUM
和AVG
(例如,它可能已经在使用成对求和),我就不能保证这样的修改总是提高准确性。
发布于 2010-07-20 00:51:45
这是另一种基于blog post on Linear Regression in T-SQL的方法,它使用以下公式:
不过,博客中的SQL建议使用游标。以下是我使用的forum answer的美化版本:
table
-----
X (numeric)
Y (numeric)
/**
* m = (nSxy - SxSy) / (nSxx - SxSx)
* b = Ay - (Ax * m)
* N.B. S = Sum, A = Mean
*/
DECLARE @n INT
SELECT @n = COUNT(*) FROM table
SELECT (@n * SUM(X*Y) - SUM(X) * SUM(Y)) / (@n * SUM(X*X) - SUM(X) * SUM(X)) AS M,
AVG(Y) - AVG(X) *
(@n * SUM(X*Y) - SUM(X) * SUM(Y)) / (@n * SUM(X*X) - SUM(X) * SUM(X)) AS B
FROM table
发布于 2014-01-08 02:07:01
我实际上已经使用Gram-Schmidt正交化编写了一个SQL例程。它以及其他机器学习和预测例程都可以在sqldatamine.blogspot.com上找到
在Brad Larson的建议下,我在这里添加了代码,而不仅仅是将用户直接添加到我的博客。这将产生与Excel中最线性函数相同的结果。我的主要资料来源是Hastie,Tibshirni和Friedman的《统计学习的元素》(2008)。
--Create a table of data
create table #rawdata (id int,area float, rooms float, odd float, price float)
insert into #rawdata select 1, 2201,3,1,400
insert into #rawdata select 2, 1600,3,0,330
insert into #rawdata select 3, 2400,3,1,369
insert into #rawdata select 4, 1416,2,1,232
insert into #rawdata select 5, 3000,4,0,540
--Insert the data into x & y vectors
select id xid, 0 xn,1 xv into #x from #rawdata
union all
select id, 1,rooms from #rawdata
union all
select id, 2,area from #rawdata
union all
select id, 3,odd from #rawdata
select id yid, 0 yn, price yv into #y from #rawdata
--create a residuals table and insert the intercept (1)
create table #z (zid int, zn int, zv float)
insert into #z select id , 0 zn,1 zv from #rawdata
--create a table for the orthoganal (#c) & regression(#b) parameters
create table #c(cxn int, czn int, cv float)
create table #b(bn int, bv float)
--@p is the number of independent variables including the intercept (@p = 0)
declare @p int
set @p = 1
--Loop through each independent variable and estimate the orthagonal parameter (#c)
-- then estimate the residuals and insert into the residuals table (#z)
while @p <= (select max(xn) from #x)
begin
insert into #c
select xn cxn, zn czn, sum(xv*zv)/sum(zv*zv) cv
from #x join #z on xid = zid where zn = @p-1 and xn>zn group by xn, zn
insert into #z
select zid, xn,xv- sum(cv*zv)
from #x join #z on xid = zid join #c on czn = zn and cxn = xn where xn = @p and zn<xn group by zid, xn,xv
set @p = @p +1
end
--Loop through each independent variable and estimate the regression parameter by regressing the orthoganal
-- resiuduals on the dependent variable y
while @p>=0
begin
insert into #b
select zn, sum(yv*zv)/ sum(zv*zv)
from #z join
(select yid, yv-isnull(sum(bv*xv),0) yv from #x join #y on xid = yid left join #b on xn=bn group by yid, yv) y
on zid = yid where zn = @p group by zn
set @p = @p-1
end
--The regression parameters
select * from #b
--Actual vs. fit with error
select yid, yv, fit, yv-fit err from #y join
(select xid, sum(xv*bv) fit from #x join #b on xn = bn group by xid) f
on yid = xid
--R Squared
select 1-sum(power(err,2))/sum(power(yv,2)) from
(select yid, yv, fit, yv-fit err from #y join
(select xid, sum(xv*bv) fit from #x join #b on xn = bn group by xid) f
on yid = xid) d
https://stackoverflow.com/questions/2536895
复制相似问题