决策树学习笔记

csv文件
构建决策树并进行预测

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74

from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
# from sklearn.externals.six import StringIO

# Load the training data from CSV. The first row holds the column names;
# every later row is one sample whose last column is the class label.
# newline='' is what the csv module documentation asks for, and the with
# block guarantees the file is closed once all rows have been read.
with open(r'E:\Python\practice\Decision_Tree\Class_buys_computer.csv', 'rt', newline='') as allElectronicsDate:
    reader = csv.reader(allElectronicsDate)  # yields one list of strings per row
    headers = next(reader)

    print(headers)

    featureList = []  # one {column name: value} dict per sample
    labelList = []    # class label (last column) of each sample

    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; indexing it would raise.
            continue
        print(row)
        labelList.append(row[-1])
        # Column 0 is skipped (presumably a row id -- confirm against the CSV)
        # and the last column is the label, so only the columns in between
        # are used as features.
        rowDict = dict(zip(headers[1:-1], row[1:-1]))
        featureList.append(rowDict)

print(featureList)
print(labelList)

# DictVectorizer comes from scikit-learn (not the Python standard library).
# fit_transform one-hot encodes the string-valued feature dicts, e.g.
# [{'a': 'b'}, {'a': 'd'}] becomes 0/1 columns named 'a=b' and 'a=d';
# toarray() turns the sparse result into a dense NumPy array.
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()

print("dummyX:"+str(dummyX))
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on older releases. tolist() keeps
# the printed value a plain list, as the old method returned.
if hasattr(vec, "get_feature_names_out"):
    vecFeatureNames = vec.get_feature_names_out().tolist()
else:
    vecFeatureNames = vec.get_feature_names()
print(vecFeatureNames)  # column names matching the columns of dummyX

print("labellist:"+str(labelList))


# LabelBinarizer is also a scikit-learn class; fit_transform encodes the
# class labels as 0/1 indicator values.
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print("dummyY:"+str(dummyY))


# Build a decision-tree classifier that picks splits by information gain
# (criterion='entropy') and train it on the encoded features and labels.
# fit() returns the fitted estimator itself, so construction and training
# can be chained into one expression.
clf = tree.DecisionTreeClassifier(criterion='entropy').fit(dummyX, dummyY)
print(f"clf:{clf}")



# Export the fitted tree in Graphviz DOT format. The .dot file can then be
# rendered from a shell, e.g.:
#     dot -Tpdf allElectronicInformationGainOri.dot -o tree.pdf
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on older releases.
if hasattr(vec, "get_feature_names_out"):
    dotFeatureNames = vec.get_feature_names_out().tolist()
else:
    dotFeatureNames = vec.get_feature_names()
with open("allElectronicInformationGainOri.dot", 'w') as f:
    # export_graphviz writes into out_file; its return value is not needed
    # here, so the file handle is no longer rebound inside the with block.
    tree.export_graphviz(clf, feature_names=dotFeatureNames, out_file=f)

# Take the first training sample as a template for a new, unseen sample.
oneRowX = dummyX[0, :]
print("oneRowX:" + str(oneRowX))

# dummyX[0, :] is a view into dummyX, so work on a copy: editing the view in
# place would silently overwrite the first training row (and oneRowX).
newRowX = oneRowX.copy()

# Flip two one-hot columns to make a different sample. Which features columns
# 0 and 2 stand for depends on the vectorizer's column order -- check the
# printed feature names before changing these indices.
newRowX[0] = 1
newRowX[2] = 0
print("newRowX:" + str(newRowX))
# Example from the original notes: newRowX:[ 1. 0. 0. 0. 1. 1. 0. 0. 1. 0.]

# predict() expects a 2-D array of shape (n_samples, n_features), so turn the
# single row into a 1 x n_features matrix.
newRowX = newRowX.reshape(1, -1)
print("newRowX:" + str(newRowX))
# Example from the original notes: newRowX:[[ 1. 0. 0. 0. 1. 1. 0. 0. 1. 0.]]

# Classify the new sample with the trained tree and show the result, which
# was previously computed but never displayed.
predictedY = clf.predict(newRowX)
print("predictedY:" + str(predictedY))

输出的dot文件可以使用Graphviz软件转换为PDF,例如在命令行执行:dot -Tpdf allElectronicInformationGainOri.dot -o tree.pdf