#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 2 10:51:33 2017
@author: lee
"""
from collections import defaultdict
import math
def CreateDataSet():
    data = [[1, 1, 'yes'],
            [1, 1, 'yes'],
            [1, 0, 'no'],
            [0, 1, 'no'],
            [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return data, labels
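# Each row of the sample set is [value of 'no surfacing', value of 'flippers',
# class label]; e.g. [1, 0, 'no'] has feature values 1 and 0 and class 'no'.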
def CreateTree(data, labels, eps=-1):
    # class label of every training example
    data_labels = [y[-1] for y in data]
    # feature values of the first example; empty once no features remain
    unique_feature = set(data[0][:-1])
    # (1) if all examples belong to one class, return a single-node tree
    # pitfall: testing len(labels) == 1 here would be wrong
    if data_labels.count(data_labels[0]) == len(data_labels):
        return data_labels[0]
    # (2) empty feature set: return the majority class as a single-node tree
    if len(unique_feature) == 0:
        dic = defaultdict(int)
        for i in range(len(data_labels)):
            dic[data_labels[i]] += 1
        # taking the max of a dict:
        #   max(dic, key=dic.get)                 -> key of the largest value
        #   max(dic.items(), key=lambda x: x[1])  -> (key, value) pair
        return max(dic, key=dic.get)
    # (3) general case: ID3 feature selection (information gain)
    maxfindex, maxgain = Maxinformation_gain(data, labels)
    # C4.5 feature selection (information gain ratio):
    #maxfindex, maxgain = Maxinformation_gain_ratio(data, labels)
    # (4) best gain below the threshold eps: return the majority class
    # (the default eps=-1 disables pre-pruning, since gain is always >= 0)
    if maxgain < eps:
        dic = defaultdict(int)
        for i in range(len(data_labels)):
            dic[data_labels[i]] += 1
        return max(dic, key=dic.get)
    # (5) split on the chosen feature Ag:
    # take its name and remove it from labels
    maxfeature = labels[maxfindex]
    del labels[maxfindex]
    # column of values of the chosen feature
    featureVal = [value[maxfindex] for value in data]
    # represent the tree as a nested dict
    idtree = {maxfeature: {}}
    # distinct values of the feature
    featureValunq = set(featureVal)
    # (6) build one subtree per feature value, passing a copy of labels
    for fval in featureValunq:
        idtree[maxfeature][fval] = CreateTree(SplitData(data, maxfindex, fval), labels[:], eps)
    return idtree
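# Pre-pruning example: CreateTree(data, labels[:], eps=1.0) on the sample set
# would stop at the root and return the majority class 'no', because the best
# information gain there (≈ 0.420, see the worked example after
# Maxinformation_gain) is below the threshold.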
# pick the feature with the largest information gain;
# returns (feature index, information gain of that feature)
def Maxinformation_gain(data, labels):
    feature_gain = {}
    data_labels = [y[-1] for y in data]
    entropyD = entropy(data_labels)
    # information gain of each feature: g(D,A) = H(D) - H(D|A)
    for f in range(len(labels)):
        featureVal = [value[f] for value in data]
        entropyF = ConditionalEntropy(featureVal, data_labels)
        feature_gain[f] = entropyD - entropyF
    result = max(feature_gain.items(), key=lambda x: x[1])
    return result[0], result[1]
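# On the sample dataset: H(D) ≈ 0.971, H(D|no surfacing) ≈ 0.551 and
# H(D|flippers) = 0.800, so the gains are ≈ 0.420 and ≈ 0.171 and the
# function returns (0, 0.420...).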
# split the dataset on feature index split_index and value split_value,
# keeping the matching rows with that feature column removed
def SplitData(data, split_index, split_value):
    subdata = []
    for row in data:
        if row[split_index] == split_value:
            temp1 = row[:split_index]
            temp2 = row[split_index+1:]
            subdata.append(temp1 + temp2)
    return subdata
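# Example on the sample dataset: splitting on feature 0 with value 1 keeps the
# three matching rows and drops that column:
#   SplitData(data, 0, 1)  ->  [[1, 'yes'], [1, 'yes'], [0, 'no']]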
# empirical entropy of a label list: H(D) = -sum_k p_k * log2(p_k)
def entropy(y):
    n = len(y)
    elist = defaultdict(int)
    result = 0.
    for i in range(len(y)):
        elist[y[i]] += 1
    for i in elist.keys():
        # p is never 0 here, since only observed labels are counted
        p = elist[i] / float(n)
        result -= p * math.log(p, 2)
    return result
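# Worked example: for the sample labels ['yes', 'yes', 'no', 'no', 'no'],
#   H(D) = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971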
# empirical conditional entropy: H(D|A) = sum_i |D_i|/|D| * H(D_i)
def ConditionalEntropy(datalist, y):
    data_u = set(datalist)
    data_dic = defaultdict(int)
    y_dic = defaultdict(list)
    n = len(datalist)
    result = 0.
    # count each feature value and collect the labels that go with it
    for i in range(len(datalist)):
        data_dic[datalist[i]] += 1
        y_dic[datalist[i]].append(y[i])
    for i in data_u:
        result += data_dic[i] / float(n) * entropy(y_dic[i])
    return result
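# Worked example, feature 'no surfacing' on the sample data (values 1,1,1,0,0):
#   H(D|A) = (3/5)*H(['yes','yes','no']) + (2/5)*H(['no','no'])
#          = 0.6*0.918 + 0.4*0 ≈ 0.551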
# pick the feature with the largest information gain ratio (C4.5);
# returns (feature index, gain ratio of that feature)
def Maxinformation_gain_ratio(data, labels):
    result = {}
    data_labels = [y[-1] for y in data]
    entropyD = entropy(data_labels)
    # gain ratio of each feature: g_R(D,A) = g(D,A) / H_A(D)
    for f in range(len(labels)):
        featureVal = [value[f] for value in data]
        entropyF = ConditionalEntropy(featureVal, data_labels)
        feature_gain = entropyD - entropyF
        feature_data_en = FeatureDataEntropy(featureVal)
        # a feature with a single value has H_A(D) == 0; give it ratio 0
        # so the division cannot blow up
        result[f] = feature_gain / feature_data_en if feature_data_en > 0 else 0.
    best = max(result.items(), key=lambda x: x[1])
    return best[0], best[1]
# entropy of the dataset with respect to a feature's values (split information):
# H_A(D) = -sum_i |D_i|/|D| * log2(|D_i|/|D|)
def FeatureDataEntropy(datalist):
    data_dic = defaultdict(int)
    result = 0.
    n = len(datalist)
    # count each distinct feature value
    for i in range(n):
        data_dic[datalist[i]] += 1
    for i in data_dic.keys():
        p = data_dic[i] / float(n)
        result -= p * math.log(p, 2)
    return result
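# Worked example: for 'no surfacing' (three 1s, two 0s),
#   H_A(D) = -(3/5)*log2(3/5) - (2/5)*log2(2/5) ≈ 0.971,
# so its gain ratio is g_R ≈ 0.420 / 0.971 ≈ 0.433.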
data,label = CreateDataSet()
print CreateTree(data,label)
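# Expected output on the sample dataset:
#   {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

# A minimal classification sketch (Classify is an assumed helper, not part of
# the book's algorithm): walk the nested dict until a leaf string is reached.
def Classify(tree, feature_names, sample):
    feature = tree.keys()[0]                      # feature tested at this node
    subtree = tree[feature][sample[feature_names.index(feature)]]
    if isinstance(subtree, dict):                 # internal node: keep walking
        return Classify(subtree, feature_names, sample)
    return subtree                                # leaf: the predicted class

data2, label2 = CreateDataSet()
tree = CreateTree(data2, label2[:])   # pass a copy: CreateTree mutates labels
print Classify(tree, label2, [1, 0])  # -> 'no'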