# Decision Tree classifier
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split # sklearn.cross_validation was removed in newer scikit-learn versions
from sklearn import metrics
# Titanic example: predict 'Survived' from the three features 'Sex', 'Pclass', 'Age'
df2=pd.read_csv("kaggle_titanic_train.csv",encoding="big5") # Titanic training data
df3=df2[['Sex','Pclass','Age','Survived']].copy() # copy() so the column assignment below does not raise SettingWithCopyWarning
df3.head()
# Data preprocessing
label_encoder = preprocessing.LabelEncoder()
encoded_Sex = label_encoder.fit_transform(df3["Sex"])
df3["Sex"]=encoded_Sex
df3=df3[df3['Age'].notnull()] # drop rows where Age is NaN
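# (Optional sketch, not in the original walkthrough) quick sanity checks on the preprocessed
# data, assuming df3 and label_encoder defined above
print(label_encoder.classes_) # LabelEncoder mapping, e.g. ['female' 'male'] -> 0 / 1
print(df3.isnull().sum()) # confirm no NaN remains in the selected columns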
# Split into training and test data
x=df3[['Sex','Pclass','Age']]
y=df3[['Survived']]
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=20170817) # random_state fixes the seed so the split is reproducible
# Standardization: so that training is not biased toward any single feature
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
sc.fit(x_train)
x_train_nor=sc.transform(x_train)
x_test_nor=sc.transform(x_test)
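# (Optional sketch, not in the original walkthrough) verify the scaling: after StandardScaler
# each training feature column should have mean ~0 and standard deviation ~1
print(x_train_nor.mean(axis=0))
print(x_train_nor.std(axis=0))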
# Build the classifier
tree=DecisionTreeClassifier(criterion='gini',max_depth=5) # split quality measured by Gini impurity, tree depth limited to 5
tree_clf=tree.fit(x_train_nor,y_train)
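# (Optional sketch, not in the original walkthrough) print the learned rules as text;
# export_text is available in recent scikit-learn versions, feature names match the columns above
from sklearn.tree import export_text
print(export_text(tree_clf, feature_names=['Sex', 'Pclass', 'Age']))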
# Predict on the test set
y_test_predicted = tree_clf.predict(x_test_nor)
print(y_test_predicted)
[0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0
0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0
0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 0 1
1 0 0 0 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
1 1 1 0 1 0 1 0 0 1 0 1 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0
0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0]
# Ground truth
print(y_test.head()) # show only the first 5 rows
Survived
249 0
523 1
549 1
857 1
296 0
......
# Performance of the decision tree classifier
# measured with accuracy
accuracy = metrics.accuracy_score(y_test, y_test_predicted)
print('Accuracy:',accuracy)
output:
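# (Optional sketch, not in the original walkthrough) beyond accuracy, a confusion matrix and
# per-class precision/recall give a fuller picture of performance on the same test split
print(metrics.confusion_matrix(y_test, y_test_predicted))
print(metrics.classification_report(y_test, y_test_predicted))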