# 机器学习实战-预测数值型数据：回归

``````from numpy import *

def loadDataSet(fileName):
    """Parse a tab-delimited text file into feature and target lists.

    Each row holds numFeat feature columns followed by one target value
    in the last column.  Returns (dataMat, labelMat): a list of feature
    rows and a list of float targets.
    """
    # Count feature columns from the first line; the last column is the target.
    numFeat = len(open(fileName).readline().split('\t')) - 1
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    fr.close()
    return dataMat, labelMat

def standRegres(xArr, yArr):
    """Ordinary least-squares regression via the normal equations.

    xArr: list of feature rows; yArr: list of target values.
    Returns the coefficient column vector ws, or None (with a message)
    when X^T X is singular and cannot be inverted.
    """
    xMat = asmatrix(xArr); yMat = asmatrix(yArr).T
    xTx = xMat.T * xMat
    # A zero determinant means X^T X has no inverse.
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    # solve() is numerically preferable to forming the inverse explicitly;
    # the original also computed ws = xTx.I * (xMat.T*yMat), then overwrote
    # it with this equivalent call — only one computation is needed.
    ws = linalg.solve(xTx, xMat.T * yMat)
    return ws

``````

``````In [30]: import regression
...: ws = regression.standRegres(xArr,yArr)
In [34]: xArr[0:2]
Out[34]: [[1.0, 0.067732], [1.0, 0.42781]]
``````

``````In [35]: ws
Out[35]:
matrix([[ 3.00774324],
[ 1.69532264]])
``````

ws存放的是回归系数。下面开始使用新的ws计算yHat：

# Predict with the fitted coefficients: yHat = X * ws.
xMat = mat(xArr)
yMat = mat(yArr)
yHat = xMat*ws
``````

``````import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)  # was missing: 'ax' is used below but never created
# Scatter the raw points (column 1 is the feature; column 0 is the constant 1.0).
ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0])
xCopy = xMat.copy()
xCopy.sort(0)  # sort first so the fitted line is drawn left-to-right
yHat = xCopy*ws
ax.plot(xCopy[:,1],yHat)
plt.show()
``````

``````In [41]: yHat = xMat*ws

In [42]: corrcoef(yHat.T,yMat)
Out[42]:
array([[ 1.        ,  0.98647356],
[ 0.98647356,  1.        ]])
``````

``````#局部加权线性回归函数
def lwlr(testPoint, xArr, yArr, k=1.0):
    """Locally weighted linear regression estimate at one query point.

    Each training row gets a Gaussian-kernel weight based on its distance
    to testPoint; k is the kernel width (small k -> very local fit, risk
    of overfitting).  Returns the prediction testPoint * ws, or None when
    the weighted X^T X matrix is singular.
    """
    xMat = asmatrix(xArr); yMat = asmatrix(yArr).T
    m = shape(xMat)[0]
    weights = asmatrix(eye(m))  # diagonal weight matrix, one entry per sample
    for j in range(m):
        diffMat = testPoint - xMat[j, :]
        # Gaussian kernel: weight decays with squared distance from testPoint.
        weights[j, j] = exp(diffMat * diffMat.T / (-2.0 * k**2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws

def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Run lwlr() on every row of testArr; returns a 1-D array of estimates."""
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        yHat[i] = lwlr(testArr[i], xArr, yArr, k)
    return yHat
``````

``````xArr,yArr = regression.loadDataSet('ex0.txt')
#对单点估计
In [53]: yArr[0]
Out[53]: 3.176513

In [58]: regression.lwlr(xArr[0],xArr,yArr,1.0)
Out[58]: matrix([[ 3.12204471]])

In [59]: regression.lwlr(xArr[0],xArr,yArr,0.001)
Out[59]: matrix([[ 3.20175729]])

yHat = regression.lwlrTest(xArr,xArr,yArr,0.003)#为了得到数据集所有的点
``````

# Order the data points by the feature column so the fitted curve can be
# drawn left-to-right; argsort returns the ordering indices.
xMat=mat(xArr)
srtInd=xMat[:,1].argsort(0)
# Fancy-indexing a matrix with a column of indices yields a 3-D result;
# [:,0,:] collapses it back to a 2-D matrix in sorted order.
xSort=xMat[srtInd][:,0,:]
``````

``````import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)  # was missing: 'ax' is used below but never created
ax.plot(xSort[:,1],yHat[srtInd])
# s: marker size, c: marker color (alpha would set transparency, marker the style)
ax.scatter(xMat[:,1].flatten().A[0],mat(yArr).T.flatten().A[0], s=2,c='red')
plt.show()
``````

k=0.003
k=0.01
k=1.0

k=0.003的时候显然是过拟合，k=1.0的时候和最小二乘法差不多。但局部加权线性回归也存在计算量的问题，因为它对每个点做预测时都必须使用整个数据集。

``````#加入regression.py
def rssError(yArr, yHatArr):
    """Residual sum of squares between actual and predicted value arrays."""
    # The 'def' header was lost in transcription; it is required — the
    # function is called later as regression.rssError(...).
    return ((yArr-yHatArr)**2).sum()
``````

``````abX,abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99],abX[0:99],abY[0:99],0.1)
yHat1 = regression.lwlrTest(abX[0:99],abX[0:99],abY[0:99],1)
yHat10 = regression.lwlrTest(abX[0:99],abX[0:99],abY[0:99],10)
``````

``````In [92]: regression.rssError(abY[0:99],yHat01.T)
Out[92]: 56.820227823583345

Out[93]: 429.89056187016047

Out[94]: 549.11817088250825
``````

``````In [102]: yHat01 = regression.lwlrTest(abX[100:199],abX[0:99],abY[0:99],0.1)
...:
Out[102]: 23989.306318956587

In [103]: yHat1 = regression.lwlrTest(abX[100:199],abX[0:99],abY[0:99],1)

Out[103]: 573.52614418975463

In [104]: yHat10 = regression.lwlrTest(abX[100:199],abX[0:99],abY[0:99],10)
``````

``````In [105]: ws = regression.standRegres(abX[0:99],abY[0:99])

In [106]: yHat = mat(abX[100:199])*ws

Out[107]: 518.636315324674
``````

``````def ridgeRegres(xMat,yMat,lam=0.2):#计算回归函数，lam即Python保留关键字lambda
xTx = xMat.T*xMat
denom = xTx + eye(shape(xMat)[1])*lam#岭
if linalg.det(denom) == 0.0:
print "This martrix is singular, cannot do inverse"
return
ws = denom.I*(xMat.T*yMat)
return ws

def ridgeTest(xArr,yArr):
xMat = mat(xArr); yMat = mat(yArr).T
yMean = mean(yMat,0)
yMat = yMat - yMean
xMeans = mean(xMat,0)
xVar = var(xMat,0)
xMat = (xMat - xMeans)/xVar
numTestPts = 30
wMat = zeros((numTestPts,shape(xMat)[1]))
for i in range(numTestPts):
ws = ridgeRegres(xMat,yMat,exp(i-10))
wMat[i,:]=ws.T
return wMat
``````

``````import regression
ridgeWeights = regression.ridgeTest(abX,abY)
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)  # was missing: 'ax' is used below but never created
# Each curve tracks one coefficient as lambda = exp(i-10) grows with row index i.
ax.plot(ridgeWeights)
plt.show()
``````
log(lambda)

``````数据标准化，使其分布满足0均值和单位方差：
在每轮迭代过程中：
    设置当前最小误差lowestError为正无穷
    对每个特征：
        增大或缩小：
            改变一个系数得到一个新的W
            计算新W下的误差
            如果误差Error小于当前最小误差lowestError：设置Wbest等于当前的W
    将W设置为新的Wbest
``````
``````#前向逐步线性回归
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Forward stagewise linear regression (a greedy lasso approximation).

    xArr: feature rows; yArr: targets; eps: step applied to a single
    coefficient per iteration; numIt: number of iterations.
    Returns a numIt x n matrix whose row i is the weight vector after
    iteration i.
    """
    xMat = asmatrix(xArr); yMat = asmatrix(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean        # center the target
    xMat = regularize(xMat)    # standardize features: zero mean, unit variance
    m, n = shape(xMat)
    returnMat = zeros((numIt, n))
    # Two working copies for the greedy search.
    ws = zeros((n, 1)); wsTest = ws.copy(); wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)
        lowestError = inf
        # Greedy step: try nudging each coefficient up and down by eps and
        # keep whichever single change lowers the squared error the most.
        # (The inner sign loop and error test were lost in the transcribed
        # snippet, leaving 'sign' unbound — restored here.)
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = ((yMat.A - yTest.A)**2).sum()  # residual sum of squares
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat

def regularize(xMat):
    """Return a copy of xMat standardized column-wise (zero mean; divided
    by the column variance, as in the book's version)."""
    inMat = xMat.copy()
    inMeans = mean(inMat, 0)
    inVar = var(inMat, 0)
    inMat = (inMat - inMeans)/inVar
    return inMat
``````

``````In [22]: import regression
...: regression.stageWise(xArr,yArr,0.01,200)
...:
[[ 0.  0.  0.  0.  0.  0.  0.  0.]]
[[ 0.    0.    0.    0.01  0.    0.    0.    0.  ]]
[[ 0.    0.    0.    0.02  0.    0.    0.    0.  ]]
[[ 0.    0.    0.    0.03  0.    0.    0.    0.  ]]
[[ 0.    0.    0.    0.04  0.    0.    0.    0.  ]]
[[ 0.    0.    0.    0.05  0.    0.    0.    0.  ]]
[[ 0.    0.    0.    0.06  0.    0.    0.    0.  ]]
[[ 0.    0.    0.01  0.06  0.    0.    0.    0.  ]]
[[ 0.    0.    0.01  0.06  0.    0.    0.    0.01]]
[[ 0.    0.    0.01  0.06  0.    0.    0.    0.02]]
#省略部分
[[ 0.05  0.    0.09  0.03  0.31 -0.64  0.    0.36]]
[[ 0.04  0.    0.09  0.03  0.31 -0.64  0.    0.36]]
[[ 0.05  0.    0.09  0.03  0.31 -0.64  0.    0.36]]
[[ 0.04  0.    0.09  0.03  0.31 -0.64  0.    0.36]]
[[ 0.05  0.    0.09  0.03  0.31 -0.64  0.    0.36]]
[[ 0.04  0.    0.09  0.03  0.31 -0.64  0.    0.36]]
[[ 0.05  0.    0.09  0.03  0.31 -0.64  0.    0.36]]
[[ 0.04  0.    0.09  0.03  0.31 -0.64  0.    0.36]]
``````

``````regression.stageWise(xArr,yArr,0.001,5000)

array([[ 0.   ,  0.   ,  0.   , ...,  0.   ,  0.   ,  0.   ],
[ 0.   ,  0.   ,  0.   , ...,  0.   ,  0.   ,  0.   ],
[ 0.   ,  0.   ,  0.   , ...,  0.   ,  0.   ,  0.   ],
...,
[ 0.043, -0.011,  0.12 , ..., -0.963, -0.105,  0.187],
[ 0.044, -0.011,  0.12 , ..., -0.963, -0.105,  0.187],
[ 0.043, -0.011,  0.12 , ..., -0.963, -0.105,  0.187]])
``````

``````In [25]: xMat = mat(xArr)
...: yMat = mat(yArr).T
...: xMat = regression.regularize(xMat)
...: yM = mean(yMat,0)
...: yMat = yMat - yM
...: weights = regression.standRegres(xMat,yMat.T)
In [27]: weights.T
Out[27]:
matrix([[ 0.0430442 , -0.02274163,  0.13214087,  0.02075182,  2.22403814,
-0.99895312, -0.11725427,  0.16622915]])
``````

``````下面的部分涉及谷歌api，但是原文太旧，找不到新版怎么操作，暂时搁浅...
``````