


from sklearn.feature_extraction import DictVectorizer

import csv

from sklearn import tree

from sklearn import preprocessing

from sklearn.externals.six import StringIO

# Load the AllElectronics training table into two parallel lists:
#   featureList - one {column name: value} dict per data row
#   labelList   - the class label of each row (the last CSV column)
featureList = []
labelList = []

# The context manager closes the CSV file even if parsing fails.
with open(r"D:\workspace\python\files\AllElectronics.csv") as allElectronicsData:
    reader = csv.reader(allElectronicsData)

    # The first line holds the column names; next() works on Python 2 and 3,
    # unlike the Python-2-only reader.next().
    headers = next(reader)
    print (headers)

    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; there is nothing to record.
            continue

        # The last column is the target class.
        labelList.append(row[-1])

        # Column 0 (the record id) and the last column (the label) are
        # excluded; every column in between becomes a feature.
        rowDict = {}
        for i in range(1, len(row) - 1):
            rowDict[headers[i]] = row[i]
        featureList.append(rowDict)

print (featureList)

# Vectorize features: DictVectorizer turns each {attribute: value} dict into
# a numeric row, which is then expanded to a dense array.
vec = DictVectorizer()
encodedFeatures = vec.fit_transform(featureList)
dummyX = encodedFeatures.toarray()
print ("dummyx:%s" % (dummyX,))
print (vec.get_feature_names())
print ("labelList:%s" % (labelList,))

# Vectorize class labels.
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print ("dummyY:%s" % (dummyY,))

# Use a decision tree for classification, splitting on entropy.
# fit() returns the fitted estimator, so construction and training chain.
clf = tree.DecisionTreeClassifier(criterion='entropy').fit(dummyX, dummyY)
print ("clf:%s" % (clf,))

# Visualize the model: write the fitted tree in Graphviz .dot format.
with open("allElectornicinformationGainOri.dot", 'w') as f:
    # The tree is written straight into f, so the call's return value is not
    # needed and f is deliberately not rebound.
    tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)

# Convert the .dot file into a PDF of the tree with Graphviz:
#   dot -Tpdf <file.dot> -o output.pdf

# Take the first encoded training row, tweak it and ask the tree to classify
# it. NOTE: this is the failing version; the predict() call at the end raises
# the ValueError shown in the traceback that follows this listing.

# Selecting a single row gives a 1-D array.
oneRowx = dummyX[0,:]

print ("oneRowx"+str(oneRowx))


# Plain assignment: newRowX is the same array object as oneRowx, not a copy.
newRowX = oneRowx

# Pitfall: be very careful about the NumPy array's dimensions here!

newRowX[0] = 0

newRowX[2] = 1

# reshape() returns a new array; the result is discarded here, so newRowX
# itself is still 1-D.
newRowX.reshape(1, -1)

print ("newRowx:" + str(newRowX))

# Fails with "ValueError: Expected 2D array, got 1D array instead" because
# the argument is 1-D.
predictedY = clf.predict(oneRowx)

print ("predictedY"+str(predictedY))


Traceback (most recent call last):

File "D:/workspace/python/.idea/decision_tree.py", line 55, in <module>

predictedY = clf.predict(oneRowx)

File "C:\Python27\lib\site-packages\sklearn\tree\tree.py", line 412, in predict

X = self._validate_X_predict(X, check_input)

File "C:\Python27\lib\site-packages\sklearn\tree\tree.py", line 373, in _validate_X_predict

X = check_array(X, dtype=DTYPE, accept_sparse="csr")

File "C:\Python27\lib\site-packages\sklearn\utils\validation.py", line 441, in check_array

"if it contains a single sample.".format(array))

ValueError: Expected 2D array, got 1D array instead:

array=[0. 0. 1. 0. 1. 1. 0. 0. 1. 0.].

Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.



from sklearn.feature_extraction import DictVectorizer

import csv

from sklearn import tree

from sklearn import preprocessing

from sklearn.externals.six import StringIO

# Load the AllElectronics training table into two parallel lists:
#   featureList - one {column name: value} dict per data row
#   labelList   - the class label of each row (the last CSV column)
featureList = []
labelList = []

# The context manager closes the CSV file even if parsing fails.
with open(r"D:\workspace\python\files\AllElectronics.csv") as allElectronicsData:
    reader = csv.reader(allElectronicsData)

    # The first line holds the column names; next() works on Python 2 and 3,
    # unlike the Python-2-only reader.next().
    headers = next(reader)
    print (headers)

    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; there is nothing to record.
            continue

        # The last column is the target class.
        labelList.append(row[-1])

        # Column 0 (the record id) and the last column (the label) are
        # excluded; every column in between becomes a feature.
        rowDict = {}
        for i in range(1, len(row) - 1):
            rowDict[headers[i]] = row[i]
        featureList.append(rowDict)

print (featureList)

# Vectorize features: DictVectorizer turns each {attribute: value} dict into
# a numeric row, which is then expanded to a dense array.
vec = DictVectorizer()
encodedFeatures = vec.fit_transform(featureList)
dummyX = encodedFeatures.toarray()
print ("dummyx:%s" % (dummyX,))
print (vec.get_feature_names())
print ("labelList:%s" % (labelList,))

# Vectorize class labels.
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print ("dummyY:%s" % (dummyY,))

# Use a decision tree for classification, splitting on entropy.
# fit() returns the fitted estimator, so construction and training chain.
clf = tree.DecisionTreeClassifier(criterion='entropy').fit(dummyX, dummyY)
print ("clf:%s" % (clf,))

# Visualize the model: write the fitted tree in Graphviz .dot format.
with open("allElectornicinformationGainOri.dot", 'w') as f:
    # The tree is written straight into f, so the call's return value is not
    # needed and f is deliberately not rebound.
    tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)

# Convert the .dot file into a PDF of the tree with Graphviz:
#   dot -Tpdf <file.dot> -o output.pdf

# Classify a single sample derived from the first training row.
# predict() requires a 2-D array of shape (n_samples, n_features), so the
# row is reshaped to (1, n_features) up front.
oneRowx = dummyX[0, :].reshape(1, -1)
print ("oneRowx"+str(oneRowx))

# Work on a copy: a plain assignment (or the reshape above, which can return
# a view) would make the edits below modify oneRowx and dummyX as well.
newRowX = oneRowx.copy()

# Pitfall: mind the NumPy dimensions - the array is 2-D now, so each
# assignment needs both a row index and a column index.
newRowX[0, 0] = 0
newRowX[0, 2] = 1
print ("newRowx:" + str(newRowX))

# Predict on the modified sample.
predictedY = clf.predict(newRowX)
print ("predictedY"+str(predictedY))


C:\Python27\python.exe D:/workspace/python/.idea/decision_tree.py

['RID', 'age', 'income', 'student', 'credit_rating', 'class_buys_computer']

[{'credit_rating': 'fair', 'age': 'youth', 'student': 'no', 'income': 'high'}, {'credit_rating': 'excellent', 'age': 'youth', 'student': 'no', 'income': 'high'}, {'credit_rating': 'fair', 'age': 'middle_aged', 'student': 'no', 'income': 'high'}, {'credit_rating': 'fair', 'age': 'senior', 'student': 'no', 'income': 'medium'}, {'credit_rating': 'fair', 'age': 'senior', 'student': 'yes', 'income': 'low'}, {'credit_rating': 'excellent', 'age': 'senior', 'student': 'yes', 'income': 'low'}, {'credit_rating': 'excellent', 'age': 'middle_aged', 'student': 'yes', 'income': 'low'}, {'credit_rating': 'fair', 'age': 'youth', 'student': 'no', 'income': 'medium'}, {'credit_rating': 'fair', 'age': 'youth', 'student': 'yes', 'income': 'low'}, {'credit_rating': 'fair', 'age': 'senior', 'student': 'yes', 'income': 'medium'}, {'credit_rating': 'excellent', 'age': 'youth', 'student': 'yes', 'income': 'medium'}, {'credit_rating': 'excellent', 'age': 'middle_aged', 'student': 'no', 'income': 'medium'}, {'credit_rating': 'fair', 'age': 'middle_aged', 'student': 'yes', 'income': 'high'}, {'credit_rating': 'excellent', 'age': 'senior', 'student': 'no', 'income': 'medium'}]

dummyx:[[0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]

[0. 0. 1. 1. 0. 1. 0. 0. 1. 0.]

[1. 0. 0. 0. 1. 1. 0. 0. 1. 0.]

[0. 1. 0. 0. 1. 0. 0. 1. 1. 0.]

[0. 1. 0. 0. 1. 0. 1. 0. 0. 1.]

[0. 1. 0. 1. 0. 0. 1. 0. 0. 1.]

[1. 0. 0. 1. 0. 0. 1. 0. 0. 1.]

[0. 0. 1. 0. 1. 0. 0. 1. 1. 0.]

[0. 0. 1. 0. 1. 0. 1. 0. 0. 1.]

[0. 1. 0. 0. 1. 0. 0. 1. 0. 1.]

[0. 0. 1. 1. 0. 0. 0. 1. 0. 1.]

[1. 0. 0. 1. 0. 0. 0. 1. 1. 0.]

[1. 0. 0. 0. 1. 1. 0. 0. 0. 1.]

[0. 1. 0. 1. 0. 0. 0. 1. 1. 0.]]

['age=middle_aged', 'age=senior', 'age=youth', 'credit_rating=excellent', 'credit_rating=fair', 'income=high', 'income=low', 'income=medium', 'student=no', 'student=yes']

labelList:['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']















clf:DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,

max_features=None, max_leaf_nodes=None,

min_impurity_decrease=0.0, min_impurity_split=None,

min_samples_leaf=1, min_samples_split=2,

min_weight_fraction_leaf=0.0, presort=False, random_state=None,


oneRowx[[0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]]

newRowx:[[0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]]


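总结:注意 NumPy 数组的维度。`clf.predict` 需要二维输入,所以单个样本要先用 `reshape(1, -1)` 变成二维,赋值时也要相应改用二维下标(如 `newRowX[0][0]`)——也就是第二段代码相对第一段修改的位置(原文标红处)。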

