Python implementation of the decision tree ID3 algorithm: sample code
- 2020-10-23 21:03:07
- OfStack
The ID3 decision tree algorithm is explained in great detail in Zhou Zhihua's "watermelon book" (Machine Learning) and in Li Hang's Statistical Learning Methods. How do we actually implement it? The core steps are as follows.
Step 1: Calculate the Shannon entropy
from math import log
import operator

# Calculate the Shannon entropy of a data set
def calculate_entropy(data):
    label_counts = {}
    for feature_vec in data:
        label = feature_vec[-1]  # the last column is the class label
        if label not in label_counts.keys():
            label_counts[label] = 0
        label_counts[label] += 1
    count = len(data)
    entropy = 0.0
    for key in label_counts:
        prob = float(label_counts[key]) / count
        entropy -= prob * log(prob, 2)
    return entropy
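As a quick sanity check, the entropy can be computed on a small hand-made data set. The toy_data below is a hypothetical example (not part of the original code); its last column is the class label:

# Hypothetical toy data set: one feature column plus the class label
toy_data = [
    [1, 'yes'],
    [1, 'yes'],
    [0, 'no'],
    [0, 'no'],
    [1, 'yes'],
]
print(calculate_entropy(toy_data))  # about 0.971 for a 3/2 class split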
Step 2: Calculate the information gain of a feature
# Calculate the information gain of one feature
# index: the column number in data of the feature whose gain is computed
# entropy: the Shannon entropy of the whole data set
def calculate_relative_entropy(data, index, entropy):
    feat_list = [number[index] for number in data]  # all values of this feature (one column)
    uniqual_vals = set(feat_list)
    new_entropy = 0
    for value in uniqual_vals:
        sub_data = split_data(data, index, value)
        prob = len(sub_data) / float(len(data))
        new_entropy += prob * calculate_entropy(sub_data)  # weighted sum of the subsets' entropies
    relative_entropy = entropy - new_entropy  # the information gain
    return relative_entropy
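As a sketch (again using the hypothetical toy_data above, and assuming the split_data helper listed at the end of this article has already been defined), the gain of a single feature can be checked directly:

base = calculate_entropy(toy_data)
# feature 0 separates 'yes' from 'no' perfectly, so its gain equals the base entropy
print(calculate_relative_entropy(toy_data, 0, base))  # about 0.971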
Step 3: Select the feature with the maximum information gain
# Select the feature with the maximum information gain
def choose_max_relative_entropy(data):
    num_feature = len(data[0]) - 1
    base_entropy = calculate_entropy(data)  # Shannon entropy of the whole data set
    best_infor_gain = 0
    best_feature = -1
    for i in range(num_feature):
        info_gain = calculate_relative_entropy(data, i, base_entropy)
        # keep the feature with the largest information gain
        if info_gain > best_infor_gain:
            best_infor_gain = info_gain
            best_feature = i
    return best_feature
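Continuing the same hypothetical example, the function returns the column index of the winning feature:

print(choose_max_relative_entropy(toy_data))  # 0, the only (and best) feature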
Step 4: Build the decision tree
def create_decision_tree(data, labels):
    class_list = [example[-1] for example in data]
    # all samples belong to the same class: stop splitting
    if class_list.count(class_list[-1]) == len(class_list):
        return class_list[-1]
    # all features have been used up: return the majority class
    if len(data[0]) == 1:
        return most_class(class_list)
    # choose the feature with the highest information gain as the split attribute
    best_feat = choose_max_relative_entropy(data)
    best_feat_label = labels[best_feat]  # name of the chosen feature
    decision_tree = {best_feat_label: {}}  # the tree is stored as a nested dictionary
    del(labels[best_feat])  # remove the chosen feature from the labels list
    feat_values = [example[best_feat] for example in data]
    unique_values = set(feat_values)
    for value in unique_values:
        sub_labels = labels[:]
        # build the data subset for this value and recurse
        decision_tree[best_feat_label][value] = create_decision_tree(split_data(data, best_feat, value), sub_labels)
    return decision_tree
Two helper functions are used when building the decision tree:
# Return the majority class when all features have been used up
def most_class(class_list):
    class_count = {}
    for vote in class_list:
        if vote not in class_count.keys():
            class_count[vote] = 0
        class_count[vote] += 1
    sorted_class_count = sorted(class_count.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_class_count[0][0]
# Helper function: takes three arguments (the data set to split, a feature index, and a feature value)
# and returns the subset of samples with that value, with the splitting feature column removed
def split_data(data, axis, value):
    ret_data = []
    for feat_vec in data:
        if feat_vec[axis] == value:
            reduce_feat_vec = feat_vec[:axis]
            reduce_feat_vec.extend(feat_vec[axis + 1:])
            ret_data.append(reduce_feat_vec)
    return ret_data
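Putting the pieces together, the tree comes back as a nested dictionary. The data set and feature names below are hypothetical and only illustrate how the functions are called:

if __name__ == '__main__':
    # columns: outlook, temperature, class label
    data = [
        ['sunny', 'hot', 'no'],
        ['sunny', 'mild', 'no'],
        ['rainy', 'hot', 'yes'],
        ['rainy', 'mild', 'yes'],
    ]
    labels = ['outlook', 'temperature']
    tree = create_decision_tree(data, labels)
    print(tree)  # e.g. {'outlook': {'sunny': 'no', 'rainy': 'yes'}} (key order may vary)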