Predicting Numeric Data: Regression

Posted by CopperDong on 2017-10-09


1. The standard regression function
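loadDataSet parses a tab-delimited text file into a list of feature vectors and a list of target values (the last column). standRegres then solves ordinary least squares in closed form via the normal equation,

    ws = (X.T*X)^-1 * X.T*y

which requires X.T*X to be invertible, hence the determinant check in the code.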

from numpy import *

def loadDataSet(fileName):      #general function to parse tab-delimited floats
    numFeat = len(open(fileName).readline().split('\t')) - 1 #get number of fields 
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr =[]
        curLine = line.strip().split('\t')
        for i in range(numFeat):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat,labelMat

def standRegres(xArr, yArr):
    xMat = mat(xArr); yMat = mat(yArr).T
    xTx = xMat.T*xMat
    if linalg.det(xTx) == 0.0:        # X.T*X must be invertible
        print "This matrix is singular, cannot do inverse"
        return
    ws = xTx.I * (xMat.T*yMat)        # normal equation solution
    return ws

>>> import regression
>>> from numpy import *
>>> xArr,yArr=regression.loadDataSet('ex0.txt')
>>> xArr[0:2]
[[1.0, 0.067732], [1.0, 0.42781]]
>>> ws = regression.standRegres(xArr,yArr)
>>> ws
matrix([[ 3.00774324],
        [ 1.69532264]])
>>> xMat=mat(xArr)
>>> yMat=mat(yArr)
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0].flatten().A[0])
<matplotlib.collections.PathCollection object at 0x7f4905380fd0>
>>> xCopy=xMat.copy()
>>> xCopy.sort(0)
>>> yHat=xCopy*ws
>>> ax.plot(xCopy[:,1],yHat)
[<matplotlib.lines.Line2D object at 0x7f4905391950>]
>>> plt.show()
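One way to gauge the quality of the fit is the correlation between the predicted values and the true values; a minimal sketch using numpy's corrcoef (output omitted):

>>> yHat = xMat*ws                # predictions on the original, unsorted points
>>> corrcoef(yHat.T, mat(yArr))   # 2x2 matrix; the off-diagonal entry is the correlation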

2. Locally weighted linear regression

One problem with linear regression is that it can underfit the data, because it computes the unbiased estimate with the smallest mean squared error. Some methods therefore deliberately introduce a little bias into the estimate in order to lower the mean squared error of the predictions. One such method is locally weighted linear regression (LWLR), which fits a separate weighted regression for each query point, giving nearby points more influence.
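Concretely, for a query point testPoint the regression coefficients become

    ws = (X.T * W * X)^-1 * X.T * W * y

where W is a diagonal matrix of per-sample weights assigned by a Gaussian kernel, W[j,j] = exp(-|x_j - testPoint|^2 / (2*k**2)). The parameter k controls how quickly the weights fall off with distance; this is exactly what lwlr below computes.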


def lwlr(testPoint,xArr,yArr,k=1.0):
    xMat = mat(xArr); yMat = mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye((m)))                  # diagonal weight matrix, one entry per sample
    for j in range(m):
        diffMat = testPoint - xMat[j,:]
        weights[j,j] = exp(diffMat*diffMat.T/(-2.0*k**2))   # Gaussian kernel
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print "This matrix is singular, cannot do inverse"
        return
    ws = xTx.I * (xMat.T * (weights*yMat))   # weighted normal equation
    return testPoint*ws

def lwlrTest(testArr,xArr,yArr,k=1.0):
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):                       # run lwlr once per test point
        yHat[i] = lwlr(testArr[i],xArr,yArr,k)
    return yHat

>>> reload(regression)
<module 'regression' from 'regression.py'>
>>> xArr,yArr=regression.loadDataSet('ex0.txt')
>>> yArr[0]
3.176513
>>> regression.lwlr(xArr[0],xArr,yArr,1.0)
matrix([[ 3.12204471]])
>>> regression.lwlr(xArr[0],xArr,yArr,0.001)
matrix([[ 3.20175729]])
>>> yHat=regression.lwlrTest(xArr,xArr,yArr,0.003)
>>> xMat=mat(xArr)
>>> srtInd=xMat[:,1].argsort(0)
>>> xSort=xMat[srtInd][:,0,:]
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.plot(xSort[:,1],yHat[srtInd])
[<matplotlib.lines.Line2D object at 0x7f4905a37390>]
>>> ax.scatter(xMat[:,1].flatten().A[0], mat(yArr).T.flatten().A[0], s=2, c='red')
<matplotlib.collections.PathCollection object at 0x7f4905a37a50>
>>> plt.show()
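The kernel width matters: with k=1.0 the weights decay so slowly that the fit is almost identical to ordinary least squares (underfitting), while a very small k such as 0.003 lets the curve chase the noise in individual points (overfitting). An intermediate value usually gives the best balance.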

3. Example: predicting the age of abalone

def rssError(yArr,yHatArr): #yArr and yHatArr both need to be arrays
    return ((yArr-yHatArr)**2).sum()
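A sketch of how rssError can drive the abalone experiment: train LWLR on the first 99 samples, predict the following slice, and sum the squared errors (k=0.1 here is one of several kernel widths worth comparing; the numeric result depends on the data):

>>> abX,abY = regression.loadDataSet('abalone.txt')
>>> yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
>>> regression.rssError(abY[100:199], yHat01)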

4. Shrinking the coefficients to "understand" the data

What if the data has more features than sample points? Can linear regression and the methods above still be used for prediction? The answer is no: when there are more features than samples, X.T*X is singular, so computing (X.T*X)^-1 fails.

To solve this problem, statisticians introduced ridge regression.
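Ridge regression adds a multiple of the identity to the matrix before inverting, which makes it nonsingular; the coefficients become

    ws = (X.T*X + lam*I)^-1 * X.T*y

where I is the identity matrix with one row and column per feature. The larger lam is, the more the weights are shrunk toward zero.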


def ridgeRegres(xMat, yMat, lam=0.2):
    xTx = xMat.T*xMat
    denom = xTx + eye(shape(xMat)[1])*lam    # add lam*I to make denom invertible
    if linalg.det(denom) == 0.0:             # can still be singular if lam == 0
        print "This matrix is singular, cannot do inverse"
        return
    ws = denom.I * (xMat.T*yMat)
    return ws

def ridgeTest(xArr,yArr):
    xMat = mat(xArr); yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean                      # center y
    #standardize X: zero-mean each column, then scale by its variance
    xMeans = mean(xMat,0)
    xVar = var(xMat,0)
    xMat = (xMat - xMeans)/xVar
    numTestPts = 30
    wMat = zeros((numTestPts,shape(xMat)[1]))
    for i in range(numTestPts):              # try 30 values of lam on a log scale
        ws = ridgeRegres(xMat,yMat,exp(i-10))
        wMat[i,:]=ws.T
    return wMat

>>> reload(regression)
<module 'regression' from 'regression.py'>
>>> abX,abY=regression.loadDataSet('abalone.txt')
>>> ridgeWeights=regression.ridgeTest(abX,abY)
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.plot(ridgeWeights)
[<matplotlib.lines.Line2D object at 0x7f4905a8e410>, <matplotlib.lines.Line2D object at 0x7f4905a8e750>, <matplotlib.lines.Line2D object at 0x7f4905a8ecd0>, <matplotlib.lines.Line2D object at 0x7f4905a8ee50>, <matplotlib.lines.Line2D object at 0x7f490534d4d0>, <matplotlib.lines.Line2D object at 0x7f490534d6d0>, <matplotlib.lines.Line2D object at 0x7f490534de90>, <matplotlib.lines.Line2D object at 0x7f490534dad0>]
>>> plt.show()
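Each curve in the plot traces one coefficient across the 30 values of lam; since ridgeTest uses lam = exp(i-10), the x-axis is effectively log(lam), shifted by 10. At the far left, where lam is tiny, the coefficients match ordinary least squares; at the far right they are all shrunk toward zero. The best lam usually lies somewhere in between and can be chosen by cross-validation.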

5. The lasso and forward stagewise regression
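The lasso constrains the sum of the absolute values of the weights, which drives some coefficients exactly to zero, but solving it directly requires quadratic programming. Forward stagewise regression, implemented in stageWise below, is a simple greedy algorithm that gives similar results: at each iteration it tries nudging every weight up or down by a small step eps and keeps whichever change reduces the squared error the most.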


def regularize(xMat):
    inMat = xMat.copy()
    inMeans = mean(inMat,0)                  # zero-mean each column
    inVar = var(inMat,0)
    inMat = (inMat - inMeans)/inVar          # then scale by its variance
    return inMat

def stageWise(xArr,yArr,eps=0.01,numIt=100):
    xMat = mat(xArr); yMat=mat(yArr).T
    yMean = mean(yMat,0)
    yMat = yMat - yMean     #can also regularize ys but will get smaller coef
    xMat = regularize(xMat)
    m,n=shape(xMat)
    returnMat = zeros((numIt,n))             # history of the weights, one row per iteration
    ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
    for i in range(numIt):
        print ws.T
        lowestError = inf
        for j in range(n):                   # try moving each weight up and down by eps
            for sign in [-1,1]:
                wsTest = ws.copy()
                wsTest[j] += eps*sign
                yTest = xMat*wsTest
                rssE = rssError(yMat.A,yTest.A)
                if rssE < lowestError:       # keep the step with the smallest squared error
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i,:]=ws.T
    return returnMat

>>> reload(regression)
>>> xArr,yArr=regression.loadDataSet('abalone.txt')
>>> regression.stageWise(xArr,yArr,0.01,200)
>>> regression.stageWise(xArr,yArr,0.001,5000)
>>> xMat=mat(xArr)
>>> yMat=mat(yArr).T
>>> xMat=regression.regularize(xMat)
>>> yM = mean(yMat,0)
>>> yMat=yMat-yM
>>> weights=regression.standRegres(xMat,yMat.T)
>>> weights.T
matrix([[ 0.0430442 , -0.02274163,  0.13214087,  0.02075182,  2.22403814,
         -0.99895312, -0.11725427,  0.16622915]])
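These standRegres weights, computed on the same standardized data, serve as a reference: ordinary least squares gives the unconstrained solution, and as eps shrinks and numIt grows the stagewise coefficients approach it. Inspecting which weights stageWise leaves at zero is what makes the method useful for understanding the data.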
