Python code implementing the ID3 decision tree algorithm
- 2020-06-19 10:43:47
- OfStack
This article shares concrete Python code for the ID3 decision tree algorithm for your reference. The details are as follows.
'''
Created on Jan 30, 2015
@author: Shi Shuai
'''
from math import log
import operator
import re
def fileToDataSet(fileName):
    '''
    Reads the sample set data from a file. Each line holds one sample:
    the values are separated by whitespace, with the class label in the
    last column.
    Parameters:
        fileName: path of the file holding the sample set data
    Returns:
        dataSet: the sample set data as a 2-dimensional array
    '''
    dataSet = []
    p = re.compile(r"\s+")
    with open(fileName, mode='r') as file:
        for line in file:
            dataSet.append(p.split(line.strip()))
    return dataSet
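For reference, the whitespace split above behaves like this on a single line (the weather-style values are invented for illustration):

```python
import re

# a hypothetical line from a sample file: whitespace-separated values,
# class label in the last column
line = "sunny  hot   no\n"
p = re.compile(r"\s+")
sample = p.split(line.strip())
# sample is ['sunny', 'hot', 'no']
```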
def calculateShannonEntropy(dataSet):
    '''
    Calculates the Shannon entropy of the class labels of the sample set.
    Parameters:
        dataSet: the sample set data, a 2-dimensional array
    Returns:
        shannonEntropy: the information entropy of the class labels
    '''
    dataCount = len(dataSet)
    classCountDic = {}
    for data in dataSet:
        label = data[-1]
        if label not in classCountDic:
            classCountDic[label] = 0
        classCountDic[label] += 1
    shannonEntropy = 0.0
    for key in classCountDic:
        prob = float(classCountDic[key]) / dataCount
        shannonEntropy -= prob * log(prob, 2)
    return shannonEntropy
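As a quick check of the entropy formula, here is the same computation on a hand-made label column (the 2-of-5 / 3-of-5 split is an assumption for illustration):

```python
from math import log
from collections import Counter

# 2 'yes' and 3 'no' labels: H = -(2/5)*log2(2/5) - (3/5)*log2(3/5)
labels = ['yes', 'yes', 'no', 'no', 'no']
counts = Counter(labels)
entropy = -sum(c / len(labels) * log(c / len(labels), 2) for c in counts.values())
# entropy is about 0.971 bits
```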
def splitDataSet(dataSet, axis, value):
    '''
    Splits the sample set on one feature: keeps only the samples whose
    value in the given feature column equals `value`, and removes that
    column from the returned data.
    Parameters:
        dataSet: the sample set data to split, a 2-dimensional array
        axis: position of the feature among the sample set's columns
        value: feature value the kept samples must have
    Returns:
        splitedDataSet: the subset of samples whose feature at `axis`
        equals `value`, with that column removed
    '''
    splitedDataSet = []
    for data in dataSet:
        if data[axis] == value:
            splitedData = data[:axis]
            splitedData.extend(data[axis+1:])
            splitedDataSet.append(splitedData)
    return splitedDataSet
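The split is equivalent to a single list comprehension; a toy run on made-up weather rows:

```python
dataSet = [['sunny', 'hot', 'no'],
           ['rainy', 'mild', 'yes'],
           ['sunny', 'cool', 'yes']]
axis, value = 0, 'sunny'
# keep rows whose column `axis` equals `value`, dropping that column
subset = [row[:axis] + row[axis+1:] for row in dataSet if row[axis] == value]
# subset is [['hot', 'no'], ['cool', 'yes']]
```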
def chooseBestFeatureToSlipt(dataSet):
    '''
    For each feature, computes the difference between the entropy of the
    whole sample set and the weighted entropy of the subsets obtained by
    splitting on that feature (the information gain), and returns the
    feature whose split maximizes that difference.
    Parameters:
        dataSet: the sample set data to split, a 2-dimensional array
    Returns:
        bestFeature: column position of the feature whose split yields
        the largest information gain
    '''
    bestFeature = -1
    dataSetShannonEntropy = calculateShannonEntropy(dataSet)
    infoGain = 0
    featureCount = len(dataSet[0]) - 1
    for i in range(featureCount):
        featureSet = set(example[i] for example in dataSet)
        splitedDataSetShannonEntropy = 0
        for feature in featureSet:
            splitedDataSet = splitDataSet(dataSet, i, feature)
            splitedDataSetShannonEntropy += float(len(splitedDataSet)) / len(dataSet) * calculateShannonEntropy(splitedDataSet)
        if dataSetShannonEntropy - splitedDataSetShannonEntropy > infoGain:
            infoGain = dataSetShannonEntropy - splitedDataSetShannonEntropy
            bestFeature = i
    return bestFeature
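The per-feature information gain this function computes can be checked by hand on a tiny set where the first feature separates the classes perfectly (the data is invented, so the gain of exactly 1 bit is expected):

```python
from math import log
from collections import Counter

def entropy(rows):
    # Shannon entropy of the class labels in the last column
    counts = Counter(row[-1] for row in rows)
    return -sum(c / len(rows) * log(c / len(rows), 2) for c in counts.values())

dataSet = [['sunny', 'no'], ['sunny', 'no'], ['rainy', 'yes'], ['rainy', 'yes']]
base = entropy(dataSet)  # 1.0 bit: classes are evenly split
weighted = sum(
    len(subset) / len(dataSet) * entropy(subset)
    for value in {row[0] for row in dataSet}
    for subset in [[row for row in dataSet if row[0] == value]]
)
gain = base - weighted  # splitting on feature 0 removes all uncertainty
```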
def majorityClass(classList):
    '''
    Returns the most frequent class in the class list.
    Parameters:
        classList: class list, a 1-dimensional array
    Returns:
        the most frequent class in the list
    '''
    classCountDic = {}
    for label in classList:
        if label not in classCountDic:
            classCountDic[label] = 0
        classCountDic[label] += 1
    sortedClassCount = sorted(classCountDic.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
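The same majority vote can be expressed with `collections.Counter`; a sketch on an invented label list:

```python
from collections import Counter

classList = ['yes', 'no', 'yes', 'yes', 'no']
# most_common(1) returns [(label, count)] for the most frequent label
majority = Counter(classList).most_common(1)[0][0]
# majority is 'yes' (3 votes against 2)
```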
def createTree(dataSet, features):
    '''
    Builds the decision tree from the training sample set data.
    Parameters:
        dataSet: training sample set data, a 2-dimensional array
        features: names of the features corresponding to each column of
        the training data, a 1-dimensional array
    Returns:
        tree: the decision tree built from the training sample set data
    '''
    subFeatures = features[:]
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    if len(dataSet[0]) == 1:
        return majorityClass(classList)
    bestFeature = chooseBestFeatureToSlipt(dataSet)
    label = subFeatures[bestFeature]
    tree = {label: {}}
    del subFeatures[bestFeature]
    featureSet = set(example[bestFeature] for example in dataSet)
    for feature in featureSet:
        splitedDataSet = splitDataSet(dataSet, bestFeature, feature)
        tree[label][feature] = createTree(splitedDataSet, subFeatures)
    return tree
def classify(inX, tree, features):
    '''
    Classifies one sample with the decision tree.
    Parameters:
        inX: feature-value vector of the sample to classify, a
        1-dimensional array
        tree: the decision tree built by createTree
        features: names of the features corresponding to each column of
        the training data, a 1-dimensional array
    Returns:
        label: the class the decision tree assigns to the sample
    '''
    feature = list(tree.keys())[0]
    featureIndex = features.index(feature)
    secondTree = tree[feature][inX[featureIndex]]
    if isinstance(secondTree, dict):
        label = classify(inX, secondTree, features)
    else:
        label = secondTree
    return label
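Putting it together: `createTree` returns a nested dict and `classify` walks it. The tree below is hand-written in the shape `createTree` produces (the outlook/windy features and their values are assumptions for illustration), with an equivalent self-contained walk:

```python
# hypothetical tree in the nested-dict shape createTree produces
tree = {'outlook': {'sunny': 'no',
                    'rainy': {'windy': {'true': 'no', 'false': 'yes'}}}}
features = ['outlook', 'windy']

def walk(inX, tree, features):
    # at each node, look up the sample's value for the node's feature
    # and descend until a leaf (a class label) is reached
    feature = next(iter(tree))
    branch = tree[feature][inX[features.index(feature)]]
    return walk(inX, branch, features) if isinstance(branch, dict) else branch

result = walk(['rainy', 'false'], tree, features)
# result is 'yes'
```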