-- test data (GroupIDs 1, 2 normal regressions, 3, 4 = no variance)
WITH some_table(GroupID, x, y) AS
(       SELECT 1,  1,  1    UNION SELECT 1,  2,  2    UNION SELECT 1,  3,  1.3  
  UNION SELECT 1,  4,  3.75 UNION SELECT 1,  5,  2.25 UNION SELECT 2, 95, 85    
  UNION SELECT 2, 85, 95    UNION SELECT 2, 80, 70    UNION SELECT 2, 70, 65    
  UNION SELECT 2, 60, 70    UNION SELECT 3,  1,  2    UNION SELECT 3,  1, 3
  UNION SELECT 4,  1,  2    UNION SELECT 4,  2,  2),
 -- linear regression query
/*WITH*/ mean_estimates AS
(   SELECT GroupID
          ,AVG(x * 1.)                                             AS xmean
          ,AVG(y * 1.)                                             AS ymean
    FROM some_table
    GROUP BY GroupID
),
stdev_estimates AS
(   SELECT pd.GroupID
          -- T-SQL STDEV() implementation is not numerically stable
          ,CASE      SUM(SQUARE(x - xmean)) WHEN 0 THEN 1 
           ELSE SQRT(SUM(SQUARE(x - xmean)) / (COUNT(*) - 1)) END AS xstdev
          ,     SQRT(SUM(SQUARE(y - ymean)) / (COUNT(*) - 1))     AS ystdev
    FROM some_table pd
    INNER JOIN mean_estimates  pm ON pm.GroupID = pd.GroupID
    GROUP BY pd.GroupID, pm.xmean, pm.ymean
),
standardized_data AS                   -- increases numerical stability
(   SELECT pd.GroupID
          ,(x - xmean) / xstdev                                    AS xstd
          ,CASE ystdev WHEN 0 THEN 0 ELSE (y - ymean) / ystdev END AS ystd
    FROM some_table pd
    INNER JOIN stdev_estimates ps ON ps.GroupID = pd.GroupID
    INNER JOIN mean_estimates  pm ON pm.GroupID = pd.GroupID
),
standardized_beta_estimates AS
(   SELECT GroupID
          ,CASE WHEN SUM(xstd * xstd) = 0 THEN 0
                ELSE SUM(xstd * ystd) / (COUNT(*) - 1) END         AS betastd
    FROM standardized_data pd
    GROUP BY GroupID
)
SELECT pb.GroupID
      ,ymean - xmean * betastd * ystdev / xstdev                   AS Alpha
      ,betastd * ystdev / xstdev                                   AS Beta
FROM standardized_beta_estimates pb
INNER JOIN stdev_estimates ps ON ps.GroupID = pb.GroupID
INNER JOIN mean_estimates  pm ON pm.GroupID = pb.GroupID

在这里，GroupID用于显示如何按源数据表中的某个值进行分组。如果您只想要表中所有数据的统计信息(而不是特定的子组)，您可以删除它和连接。为了清楚起见，我使用了WITH语句。作为替代，您可以使用子查询。请注意表中使用的数据类型的精度，因为如果相对于数据的精度不够高，数值稳定性可能会迅速恶化。

编辑：(在回答Peter的问题时获得额外的统计信息，如评论中的R2 )

您可以使用相同的技术轻松地计算其他统计数据。下面是一个包含R2、相关性和样本协方差的版本：

-- test data (GroupIDs 1, 2 normal regressions, 3, 4 = no variance)
WITH some_table(GroupID, x, y) AS
(       SELECT 1,  1,  1    UNION SELECT 1,  2,  2    UNION SELECT 1,  3,  1.3  
  UNION SELECT 1,  4,  3.75 UNION SELECT 1,  5,  2.25 UNION SELECT 2, 95, 85    
  UNION SELECT 2, 85, 95    UNION SELECT 2, 80, 70    UNION SELECT 2, 70, 65    
  UNION SELECT 2, 60, 70    UNION SELECT 3,  1,  2    UNION SELECT 3,  1, 3
  UNION SELECT 4,  1,  2    UNION SELECT 4,  2,  2),
 -- linear regression query
/*WITH*/ mean_estimates AS
(   SELECT GroupID
          ,AVG(x * 1.)                                             AS xmean
          ,AVG(y * 1.)                                             AS ymean
    FROM some_table pd
    GROUP BY GroupID
),
stdev_estimates AS
(   SELECT pd.GroupID
          -- T-SQL STDEV() implementation is not numerically stable
          ,CASE      SUM(SQUARE(x - xmean)) WHEN 0 THEN 1 
           ELSE SQRT(SUM(SQUARE(x - xmean)) / (COUNT(*) - 1)) END AS xstdev
          ,     SQRT(SUM(SQUARE(y - ymean)) / (COUNT(*) - 1))     AS ystdev
    FROM some_table pd
    INNER JOIN mean_estimates  pm ON pm.GroupID = pd.GroupID
    GROUP BY pd.GroupID, pm.xmean, pm.ymean
),
standardized_data AS                   -- increases numerical stability
(   SELECT pd.GroupID
          ,(x - xmean) / xstdev                                    AS xstd
          ,CASE ystdev WHEN 0 THEN 0 ELSE (y - ymean) / ystdev END AS ystd
    FROM some_table pd
    INNER JOIN stdev_estimates ps ON ps.GroupID = pd.GroupID
    INNER JOIN mean_estimates  pm ON pm.GroupID = pd.GroupID
),
standardized_beta_estimates AS
(   SELECT GroupID
          ,CASE WHEN SUM(xstd * xstd) = 0 THEN 0
                ELSE SUM(xstd * ystd) / (COUNT(*) - 1) END         AS betastd
    FROM standardized_data
    GROUP BY GroupID
)
SELECT pb.GroupID
      ,ymean - xmean * betastd * ystdev / xstdev                   AS Alpha
      ,betastd * ystdev / xstdev                                   AS Beta
      ,CASE ystdev WHEN 0 THEN 1 ELSE betastd * betastd END        AS R2
      ,betastd                                                     AS Correl
      ,betastd * xstdev * ystdev                                   AS Covar
FROM standardized_beta_estimates pb
INNER JOIN stdev_estimates ps ON ps.GroupID = pb.GroupID
INNER JOIN mean_estimates  pm ON pm.GroupID = pb.GroupID

EDIT 2通过标准化数据(而不仅仅是居中)和由于numerical stability issues而替换STDEV来提高数值稳定性。对我来说，目前的实现似乎是稳定性和复杂性之间的最佳折衷。我可以通过用一个数值稳定的在线算法替换我的标准差来提高稳定性，但这会使实现变得复杂(并减慢它)。类似地，在有限的测试中，使用Kahan(-Babuška-Neumaier)补偿SUM和AVG的实现似乎会稍微好一些，但会使查询更加复杂。只要我不知道T-SQL如何实现SUM和AVG (例如，它可能已经在使用成对求和)，我就不能保证这样的修改总是提高准确性。

票数 47

Stack Overflow用户

发布于 2010-07-20 00:51:45

这是另一种基于blog post on Linear Regression in T-SQL的方法，它使用以下公式：

不过，博客中的SQL建议使用游标。以下是我使用的forum answer的美化版本：

table
-----
X (numeric)
Y (numeric)

/**
 * m = (nSxy - SxSy) / (nSxx - SxSx)
 * b = Ay - (Ax * m)
 * N.B. S = Sum, A = Mean
 */
DECLARE @n INT
SELECT @n = COUNT(*) FROM table
SELECT (@n * SUM(X*Y) - SUM(X) * SUM(Y)) / (@n * SUM(X*X) - SUM(X) * SUM(X)) AS M,
       AVG(Y) - AVG(X) *
       (@n * SUM(X*Y) - SUM(X) * SUM(Y)) / (@n * SUM(X*X) - SUM(X) * SUM(X)) AS B
FROM table

票数 22

Stack Overflow用户

发布于 2014-01-08 02:07:01

我实际上已经使用Gram-Schmidt正交化编写了一个SQL例程。它以及其他机器学习和预测例程都可以在sqldatamine.blogspot.com上找到

在Brad Larson的建议下，我在这里添加了代码，而不仅仅是将用户直接添加到我的博客。这将产生与Excel中最线性函数相同的结果。我的主要资料来源是Hastie，Tibshirni和Friedman的《统计学习的元素》(2008)。

--Create a table of data
create table #rawdata (id int,area float, rooms float, odd float,  price float)

insert into #rawdata select 1, 2201,3,1,400
insert into #rawdata select 2, 1600,3,0,330
insert into #rawdata select 3, 2400,3,1,369
insert into #rawdata select 4, 1416,2,1,232
insert into #rawdata select 5, 3000,4,0,540

--Insert the data into x & y vectors
select id xid, 0 xn,1 xv into #x from #rawdata
union all
select id, 1,rooms  from #rawdata
union all
select id, 2,area  from #rawdata
union all
select id, 3,odd  from #rawdata

select id yid, 0 yn, price yv  into #y from #rawdata

--create a residuals table and insert the intercept (1)
create table #z (zid int, zn int, zv float)
insert into #z select id , 0 zn,1 zv from #rawdata

--create a table for the orthoganal (#c) & regression(#b) parameters
create table #c(cxn int, czn int, cv float) 
create table #b(bn int, bv float) 


--@p is the number of independent variables including the intercept (@p = 0)
declare @p int
set @p = 1


--Loop through each independent variable and estimate the orthagonal parameter (#c)
-- then estimate the residuals and insert into the residuals table (#z)
while @p <= (select max(xn) from #x)
begin   
        insert into #c
    select  xn cxn,  zn czn, sum(xv*zv)/sum(zv*zv) cv 
        from #x join  #z on  xid = zid where zn = @p-1 and xn>zn group by xn, zn

    insert into #z
    select zid, xn,xv- sum(cv*zv) 
        from #x join #z on xid = zid   join  #c  on  czn = zn and cxn = xn  where xn = @p and zn<xn  group by zid, xn,xv

    set @p = @p +1
end

--Loop through each independent variable and estimate the regression parameter by regressing the orthoganal
-- resiuduals on the dependent variable y
while @p>=0 
begin

    insert into #b
    select zn, sum(yv*zv)/ sum(zv*zv) 
        from #z  join 
            (select yid, yv-isnull(sum(bv*xv),0) yv from #x join #y on xid = yid left join #b on  xn=bn group by yid, yv) y
        on zid = yid where zn = @p  group by zn

    set @p = @p-1
end

--The regression parameters
select * from #b

--Actual vs. fit with error
select yid, yv, fit, yv-fit err from #y join 
    (select xid, sum(xv*bv) fit from #x join #b on xn = bn  group by xid) f
     on yid = xid

--R Squared
select 1-sum(power(err,2))/sum(power(yv,2)) from 
(select yid, yv, fit, yv-fit err from #y join 
    (select xid, sum(xv*bv) fit from #x join #b on xn = bn  group by xid) f
     on yid = xid) d

票数 5

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/2536895

复制

相似问题

问SQL Server中是否有线性回归功能？
EN

回答 8

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问SQL Server中是否有线性回归功能？EN

回答 8

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问SQL Server中是否有线性回归功能？
EN