Tree Regression in Python: A Detailed Walkthrough (Machine Learning in Action)
- 2020-06-19 10:46:03
- OfStack
This article shares a complete tree-regression implementation for your reference. The code is as follows:
#-*- coding:utf-8 -*-
#!/usr/bin/python
'''
Regression trees: regression prediction of continuous target values.
'''
# Test code:
#   import regTrees as RT; RT.RtTreeTest(); RT.RtTreeTest('ex0.txt'); RT.RtTreeTest('ex2.txt')
#   import regTrees as RT; RT.RtTreeTest('ex2.txt', ops=(10000, 4))
#   import regTrees as RT; RT.pruneTest()
# Model tree test:
#   import regTrees as RT; RT.modeTreeTest(ops=(1, 10))
# Compare the model tree with the ordinary regression tree using the correlation coefficient:
#   import regTrees as RT; RT.MRTvsSRT()
from numpy import *
# Parse a tab-delimited data file into a list of rows of floating-point values
def loadDataSet(fileName):
    """Read a tab-delimited numeric text file into a list of float rows.

    Args:
        fileName: path of the text file; each non-blank line holds
            tab-separated numbers.

    Returns:
        A list with one entry per non-blank line, each entry being a list of
        floats.

    Raises:
        ValueError: if a field cannot be converted with ``float``.
    """
    dataMat = []
    # The context manager closes the file even if a conversion fails.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            # Build a real list: on Python 3 ``map`` is a lazy iterator, which
            # cannot be turned into a numeric matrix afterwards.
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
# Binary-split the data set on one feature (column) and a threshold value:
# rows whose value in that column is greater than the threshold go into one matrix, all remaining rows go into the other matrix
def binSplitDataSet(dataSet, feature, value):
    """Split the rows of ``dataSet`` in two by thresholding one column.

    Args:
        dataSet: 2-D NumPy matrix or array of samples (one row per sample).
        feature: index of the column to compare.
        value: threshold.

    Returns:
        ``(mat0, mat1)`` where ``mat0`` holds every row whose ``feature``
        column is greater than ``value`` and ``mat1`` holds every other row.
        Either part may have zero rows.
    """
    # nonzero(...)[0] yields the row indices where the comparison holds.
    # No trailing ``[0]`` on the result: that would keep only the first
    # matching row and raise IndexError when no row matches.
    mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1
# Constant leaf node
def regLeaf(dataSet):
    """Build a constant leaf: the mean of the last column of ``dataSet``."""
    targetColumn = dataSet[:, -1]  # the last column is the target variable
    return mean(targetColumn)
# Total squared error of the target variable (variance times the number of rows)
def regErr(dataSet):
    """Return the total squared error of the last column of ``dataSet``.

    This is the variance of the last column multiplied by the number of rows.
    """
    rowCount = shape(dataSet)[0]
    targetVariance = var(dataSet[:, -1])
    return targetVariance * rowCount
# Choose the best feature to split on and the corresponding split value
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Find the best binary split of ``dataSet``, or decide to make a leaf.

    Args:
        dataSet: 2-D NumPy matrix/array; the last column is the target and
            every other column is a candidate split feature.
        leafType: callable that builds a leaf value from a data set.
        errType: callable that returns the total error of a data set.
        ops: ``(tolS, tolN)`` -- the minimum error reduction a split must
            achieve, and the minimum number of rows allowed on each side.

    Returns:
        ``(featureIndex, splitValue)`` for the best split, or
        ``(None, leafType(dataSet))`` when no split should be made.
    """
    tolS, tolN = ops[0], ops[1]

    # Flatten through asarray(): iterating a matrix column directly yields
    # 1x1 matrices, which are unhashable and cannot be put into a set.
    targetValues = asarray(dataSet[:, -1]).ravel().tolist()
    if len(set(targetValues)) == 1:
        # Every target is identical, so there is nothing left to split.
        return None, leafType(dataSet)

    n = shape(dataSet)[1]
    S = errType(dataSet)  # error of the unsplit data set
    bestS = inf
    bestIndex = 0
    bestValue = 0
    for featIndex in range(n - 1):  # the last column is the target
        candidateValues = set(asarray(dataSet[:, featIndex]).ravel().tolist())
        for splitVal in candidateValues:
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if shape(mat0)[0] < tolN or shape(mat1)[0] < tolN:
                continue  # one side would be too small
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS

    # Reject the split when it does not reduce the error by at least tolS.
    # This also covers the case where no candidate was accepted and bestS
    # is still inf.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)

    # Safety check: never return a split that leaves one side too small.
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if shape(mat0)[0] < tolN or shape(mat1)[0] < tolN:
        return None, leafType(dataSet)
    return bestIndex, bestValue
# Build a regression tree. Arguments: NumPy data set, leaf function, error function, and user-set parameters (minimum error reduction, minimum sample size)
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Recursively build a tree from ``dataSet``.

    Args:
        dataSet: 2-D NumPy matrix/array whose last column is the target.
        leafType: callable that builds a leaf value from a data set.
        errType: callable that returns the total error of a data set.
        ops: ``(tolS, tolN)`` stopping parameters, passed on to
            ``chooseBestSplit``.

    Returns:
        A leaf value (whatever ``leafType`` returns) when the data should not
        be split, otherwise a dict with keys ``'spInd'`` (split feature
        index), ``'spVal'`` (split value), ``'left'`` (subtree for rows
        greater than ``spVal``) and ``'right'`` (subtree for the other rows).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # Identity test: feature index 0 is a valid split and must not be
    # confused with "no split".
    if feat is None:
        return val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }
# Regression tree test (no post-pruning is performed)
def RtTreeTest(filename='ex00.txt',ops=(1,4)):
    """Load ``filename``, build a regression tree with ``ops`` and print it.

    Args:
        filename: tab-delimited data file read with ``loadDataSet``.
        ops: ``(tolS, tolN)`` stopping parameters passed to ``createTree``.
    """
    MyDat = loadDataSet(filename)
    # asmatrix() replaces the ``mat`` alias, which NumPy 2.0 removed.
    MyMat = asmatrix(MyDat)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(createTree(MyMat, ops=ops))
# Check whether an object is a tree (trees are stored as dictionaries)
def isTree(obj):
    """Return True when ``obj`` is a (sub)tree, i.e. a dict, else False."""
    # isinstance() also accepts dict subclasses and cannot be fooled by an
    # unrelated class that merely happens to be named 'dict'.
    return isinstance(obj, dict)
# Collapse a tree recursively into a single value: the mean of its two (collapsed) branches
def getMean(tree):
    """Collapse ``tree`` into one number and return it.

    Every branch that is itself a tree is first replaced, in place, by its
    own collapsed value; the result is the plain average of the two branch
    values.
    """
    # Right before left, matching the original evaluation order.
    for side in ('right', 'left'):
        if isTree(tree[side]):
            tree[side] = getMean(tree[side])
    return (tree['left'] + tree['right']) / 2.0
# Post-pruning. Arguments: the tree to prune and the test data used for pruning
def prune(tree, testData):
    """Post-prune ``tree`` (in place) using held-out ``testData``.

    Subtrees are pruned first. Whenever both branches of a node are leaves,
    they are merged into their average if that lowers the squared error on
    the test data that reaches the node.

    Args:
        tree: dict tree with keys ``'spInd'``, ``'spVal'``, ``'left'`` and
            ``'right'``; leaf values must be numeric.
        testData: 2-D NumPy matrix/array whose last column is the target.

    Returns:
        The pruned tree, or a single number when the whole node was merged.
    """
    if shape(testData)[0] == 0:
        # No test sample reaches this node: collapse it to its mean.
        return getMean(tree)

    # Split once; the same partition serves the recursion and the merge test.
    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)

    if isTree(tree['left']) or isTree(tree['right']):
        # A branch is still a subtree after pruning, so no merge is possible.
        return tree

    # Both branches are leaves: compare the error with and without merging.
    errorNoMerge = sum(power(lSet[:, -1] - tree['left'], 2)) + \
        sum(power(rSet[:, -1] - tree['right'], 2))
    treeMean = (tree['left'] + tree['right']) / 2.0
    errorMerge = sum(power(testData[:, -1] - treeMean, 2))
    if errorMerge < errorNoMerge:
        # Single-argument print(...) works on Python 2 and Python 3.
        print("merging")
        return treeMean
    return tree
def pruneTest():
    """Grow a tree on 'ex2.txt', prune it with 'ex2test.txt', print the result."""
    MyDat = loadDataSet('ex2.txt')
    # asmatrix() replaces the ``mat`` alias, which NumPy 2.0 removed.
    MyMat = asmatrix(MyDat)
    # ops=(0,1): no minimum error reduction and a minimum of one row per
    # side, i.e. no pre-pruning, so the largest possible tree is grown.
    MyTree = createTree(MyMat, ops=(0,1))
    MyDatTest = loadDataSet('ex2test.txt')
    MyMatTest = asmatrix(MyDatTest)
    # Single-argument print(...) works on Python 2 and Python 3.
    print(prune(MyTree, MyMatTest))
###### Model trees: the leaf nodes are linear models #########
# Linear model
def linearSolve(dataSet):
    """Fit an ordinary least-squares linear model to ``dataSet``.

    Args:
        dataSet: 2-D NumPy matrix/array; the last column is the target and
            the other columns are the features.

    Returns:
        ``(ws, X, Y)``: the coefficient column vector (intercept first), the
        design matrix (a leading column of ones followed by the features) and
        the target column.

    Raises:
        NameError: if the determinant of ``X.T * X`` is exactly zero, so the
            matrix cannot be inverted.
    """
    # Matrix semantics are required for the ``*`` products below; asmatrix()
    # does not copy when the input is already a matrix.
    dataSet = asmatrix(dataSet)
    m, n = shape(dataSet)
    X = asmatrix(ones((m, n)))       # column 0 stays 1.0 for the intercept
    X[:, 1:n] = dataSet[:, 0:n-1]    # remaining columns: the features
    Y = dataSet[:, -1]               # target column
    xTx = X.T * X
    if linalg.det(xTx) == 0.0:
        raise NameError(' The determinant is zero , If the inverse matrix cannot be calculated, it can be increased appropriately ops The first 2 A value ')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y
# Model leaf node
def modelLeaf(dataSet):
    """Build a model-tree leaf: the coefficients ``linearSolve`` fits to ``dataSet``."""
    # linearSolve returns (ws, X, Y); only the coefficients form the leaf.
    return linearSolve(dataSet)[0]
# Compute the error of the linear model
def modelErr(dataSet):
    """Return the sum of squared residuals of the linear fit to ``dataSet``."""
    coefficients, features, targets = linearSolve(dataSet)
    residuals = targets - features * coefficients
    return sum(power(residuals, 2))
# Model tree test
def modeTreeTest(filename='ex2.txt',ops=(1,4)):
    """Load ``filename``, build a model tree with ``ops`` and print it.

    Args:
        filename: tab-delimited data file read with ``loadDataSet``.
        ops: ``(tolS, tolN)`` stopping parameters passed to ``createTree``.
    """
    MyDat = loadDataSet(filename)
    # asmatrix() replaces the ``mat`` alias, which NumPy 2.0 removed.
    MyMat = asmatrix(MyDat)
    # Plug in the linear-model leaf builder and its matching error function.
    # Single-argument print(...) works on Python 2 and Python 3.
    print(createTree(MyMat, leafType=modelLeaf, errType=modelErr, ops=ops))
# Evaluating model performance
# Prediction function for a constant leaf node: returns the leaf value directly
def regTreeEval(model, inDat):
    """Evaluate a constant leaf: return ``model`` converted to ``float``.

    ``inDat`` is not used; it is accepted so that this function has the same
    signature as ``modelTreeEval``.
    """
    leafValue = float(model)
    return leafValue
def modelTreeEval(model, inDat):
    """Evaluate a linear-model leaf on one sample.

    Args:
        model: coefficient column vector of length n+1, intercept first.
        inDat: one sample as a 1 x n matrix/array of feature values.

    Returns:
        The predicted value as a Python float.
    """
    n = shape(inDat)[1]
    # asmatrix() replaces the ``mat`` alias, which NumPy 2.0 removed.
    X = asmatrix(ones((1, n + 1)))   # leading 1.0 multiplies the intercept
    X[:, 1:n+1] = inDat
    # .item() extracts the single element explicitly; calling float() on a
    # 1x1 matrix relies on deprecated array-to-scalar conversion.
    return float((X * model).item())
# Tree prediction function
def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Predict the target of one sample by walking ``tree``.

    Args:
        tree: a dict tree (keys ``'spInd'``, ``'spVal'``, ``'left'``,
            ``'right'``) or a bare leaf value.
        inData: one sample's feature values (1 x n matrix, 1-D array or
            sequence).
        modelEval: callable ``(leaf, inData)`` that turns a leaf into a
            prediction.

    Returns:
        Whatever ``modelEval`` returns for the leaf the sample ends up in.
    """
    if not isTree(tree):
        return modelEval(tree, inData)
    # Flatten the sample before indexing: ``inData[k]`` on a 1 x n matrix
    # selects row k rather than feature k, which only works by accident when
    # there is a single feature.
    featureValue = asarray(inData).ravel()[tree['spInd']]
    # Values greater than the split value go left, matching binSplitDataSet.
    branch = tree['left'] if featureValue > tree['spVal'] else tree['right']
    # The recursive call evaluates the branch itself when it is a leaf.
    return treeForeCast(branch, inData, modelEval)
# Compute the predicted values for a whole test set
def createForeCast(tree, testData, modelEval=regTreeEval):
    """Predict the target of every sample in ``testData``.

    Args:
        tree: tree (or bare leaf) as accepted by ``treeForeCast``.
        testData: samples, one per row, holding feature values only.
        modelEval: leaf evaluation callable passed on to ``treeForeCast``.

    Returns:
        An m x 1 matrix of predictions, where m is ``len(testData)``.
    """
    m = len(testData)
    # asmatrix() replaces the ``mat`` alias, which NumPy 2.0 removed.
    yHat = asmatrix(zeros((m, 1)))
    for i in range(m):
        # Each sample is handed over as a 1 x n matrix.
        yHat[i, 0] = treeForeCast(tree, asmatrix(testData[i]), modelEval)
    return yHat
# Comparison of prediction results between constant regression tree and linear model regression tree
def MRTvsSRT():
    """Compare a constant-leaf regression tree with a model tree.

    Both trees are trained on 'bikeSpeedVsIq_train.txt' and evaluated on
    'bikeSpeedVsIq_test.txt'. The correlation coefficient between the
    predictions and the true targets is printed for each tree, followed by a
    message naming the better one.
    """
    # asmatrix() replaces the ``mat`` alias, which NumPy 2.0 removed.
    TestMat = asmatrix(loadDataSet('bikeSpeedVsIq_test.txt'))
    TrainMat = asmatrix(loadDataSet('bikeSpeedVsIq_train.txt'))

    # Ordinary (constant-leaf) regression tree. Column 0 of the test data is
    # the feature and column 1 is the target.
    StaTree = createTree(TrainMat, ops=(1,20))
    StaYHat = createForeCast(StaTree, TestMat[:,0], regTreeEval)
    StaCorr = corrcoef(StaYHat, TestMat[:,1], rowvar=0)[0,1]

    # Model tree (linear-model leaves).
    ModeTree = createTree(TrainMat, leafType=modelLeaf, errType=modelErr, ops=(1,20))
    ModeYHat = createForeCast(ModeTree, TestMat[:,0], modelTreeEval)
    ModeCorr = corrcoef(ModeYHat, TestMat[:,1], rowvar=0)[0,1]

    # Single-argument print(...) works on Python 2 and Python 3.
    print(" Ordinary regression tree Correlation coefficient of prediction results R2: %f" % (StaCorr))
    print(" Model regression tree Correlation coefficient of prediction results R2: %f" % (ModeCorr))
    if ModeCorr > StaCorr:
        print(" The effect of model regression tree is better than that of ordinary regression tree ")
    else:
        print(" The effect of regression tree is better than that of normal tree ")