Python implementation of the decision tree ID3 algorithm: sample code
- 2020-10-23 21:03:07
- OfStack
The ID3 decision tree algorithm is explained in great detail in Zhou Zhihua's "watermelon book" (Machine Learning) and in Li Hang's Statistical Learning Methods. How do we actually implement it? The core steps are as follows.
Step 1: Calculate the Shannon entropy
from math import log
import operator

# Calculate the Shannon entropy of a data set
def calculate_entropy(data):
    label_counts = {}
    for feature_vec in data:
        label = feature_vec[-1]  # the last column is the class label
        if label not in label_counts.keys():
            label_counts[label] = 0
        label_counts[label] += 1
    count = len(data)
    entropy = 0.0
    for key in label_counts:
        prob = float(label_counts[key]) / count
        entropy -= prob * log(prob, 2)
    return entropy
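As a quick sanity check, the entropy can be computed on a small hand-made data set. The toy_data below is a hypothetical example (not part of the original code); its last column is the class label:

# Hypothetical toy data set: one feature column plus the class label
toy_data = [
    [1, 'yes'],
    [1, 'yes'],
    [0, 'no'],
    [0, 'no'],
    [1, 'yes'],
]
print(calculate_entropy(toy_data))  # about 0.971 for a 3/2 class split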
Step 2: Calculate the information gain of a feature
# Calculate the information gain of one feature
# index: the column number in data of the feature whose gain is computed
# entropy: the Shannon entropy of the whole data set
def calculate_relative_entropy(data, index, entropy):
    feat_list = [number[index] for number in data]  # all values of this feature (one column)
    uniqual_vals = set(feat_list)
    new_entropy = 0
    for value in uniqual_vals:
        sub_data = split_data(data, index, value)
        prob = len(sub_data) / float(len(data))
        new_entropy += prob * calculate_entropy(sub_data)  # weighted sum of the subsets' entropies
    relative_entropy = entropy - new_entropy  # the information gain
    return relative_entropy
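As a sketch (again using the hypothetical toy_data above, and assuming the split_data helper listed at the end of this article has already been defined), the gain of a single feature can be checked directly:

base = calculate_entropy(toy_data)
# feature 0 separates 'yes' from 'no' perfectly, so its gain equals the base entropy
print(calculate_relative_entropy(toy_data, 0, base))  # about 0.971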
Step 3: Select the feature with the maximum information gain
# Select the feature with the maximum information gain
def choose_max_relative_entropy(data):
    num_feature = len(data[0]) - 1
    base_entropy = calculate_entropy(data)  # Shannon entropy of the whole data set
    best_infor_gain = 0
    best_feature = -1
    for i in range(num_feature):
        info_gain = calculate_relative_entropy(data, i, base_entropy)
        # keep the feature with the largest information gain
        if info_gain > best_infor_gain:
            best_infor_gain = info_gain
            best_feature = i
    return best_feature
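Continuing the same hypothetical example, the function returns the column index of the winning feature:

print(choose_max_relative_entropy(toy_data))  # 0, the only (and best) feature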
Step 4: Build the decision tree
def create_decision_tree(data, labels):
    class_list = [example[-1] for example in data]
    # all samples belong to the same class: stop splitting
    if class_list.count(class_list[-1]) == len(class_list):
        return class_list[-1]
    # all features have been used up: return the majority class
    if len(data[0]) == 1:
        return most_class(class_list)
    # choose the feature with the highest information gain as the split attribute
    best_feat = choose_max_relative_entropy(data)
    best_feat_label = labels[best_feat]  # name of the chosen feature
    decision_tree = {best_feat_label: {}}  # the tree is stored as a nested dictionary
    del(labels[best_feat])  # remove the chosen feature from the labels list
    feat_values = [example[best_feat] for example in data]
    unique_values = set(feat_values)
    for value in unique_values:
        sub_labels = labels[:]
        # build the data subset for this value and recurse
        decision_tree[best_feat_label][value] = create_decision_tree(split_data(data, best_feat, value), sub_labels)
    return decision_tree
Two helper functions are used when building the decision tree:
# Return the majority class when all features have been used up
def most_class(class_list):
    class_count = {}
    for vote in class_list:
        if vote not in class_count.keys():
            class_count[vote] = 0
        class_count[vote] += 1
    sorted_class_count = sorted(class_count.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_class_count[0][0]
# Helper function: takes three arguments (the data set to split, a feature index, and a feature value)
# and returns the subset of samples with that value, with the splitting feature column removed
def split_data(data, axis, value):
    ret_data = []
    for feat_vec in data:
        if feat_vec[axis] == value:
            reduce_feat_vec = feat_vec[:axis]
            reduce_feat_vec.extend(feat_vec[axis + 1:])
            ret_data.append(reduce_feat_vec)
    return ret_data
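Putting the pieces together, the tree comes back as a nested dictionary. The data set and feature names below are hypothetical and only illustrate how the functions are called:

if __name__ == '__main__':
    # columns: outlook, temperature, class label
    data = [
        ['sunny', 'hot', 'no'],
        ['sunny', 'mild', 'no'],
        ['rainy', 'hot', 'yes'],
        ['rainy', 'mild', 'yes'],
    ]
    labels = ['outlook', 'temperature']
    tree = create_decision_tree(data, labels)
    print(tree)  # e.g. {'outlook': {'sunny': 'no', 'rainy': 'yes'}} (key order may vary)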