# Decision Tree classifier

import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn versions
from sklearn import metrics

# Titanic example: use the three features 'Sex', 'Pclass', 'Age' to predict 'Survived'

df2=pd.read_csv("kaggle_titanic_train.csv",encoding="big5") # Titanic training data
df3=df2[['Sex','Pclass','Age','Survived']].copy()  # .copy() avoids pandas' SettingWithCopyWarning when columns are modified below
df3.head()

(screenshot t1.PNG: first five rows of df3)

 

# Data preprocessing

label_encoder = preprocessing.LabelEncoder()
encoded_Sex = label_encoder.fit_transform(df3["Sex"])

df3["Sex"]=encoded_Sex
df3=df3[df3['Age'].notnull()]  # drop rows where Age is NaN
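
If you are not sure which integer each category was mapped to, here is a small standalone sketch of LabelEncoder's behavior: classes_ is sorted alphabetically, so 'female' becomes 0 and 'male' becomes 1.

# Standalone sketch: how LabelEncoder assigns integers to string categories
from sklearn import preprocessing

enc = preprocessing.LabelEncoder()
print(enc.fit_transform(["male", "female", "female", "male"]))  # [1 0 0 1]
print(enc.classes_)  # ['female' 'male'] -> the index in this array is the encoded value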


# Split into training and test sets
x=df3[['Sex','Pclass','Age']]
y=df3[['Survived']]
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=20170817) # random_state: fixed seed so the split is reproducible

 

# Standardization: rescale the features so no single feature dominates training just because of its scale
# (decision trees are not sensitive to feature scaling, but this keeps the pipeline consistent with other models)
from sklearn.preprocessing  import StandardScaler
sc=StandardScaler()

sc.fit(x_train)
x_train_nor=sc.transform(x_train)
x_test_nor=sc.transform(x_test)

# Build the classifier
tree=DecisionTreeClassifier(criterion='gini',max_depth=5) # split criterion: Gini impurity; maximum tree depth 5
tree_clf=tree.fit(x_train_nor,y_train)
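
As an aside, the 'gini' criterion scores a node by its Gini impurity, i.e. 1 minus the sum of squared class proportions. The gini() helper below is only an illustrative sketch of that formula, not part of scikit-learn.

import numpy as np

def gini(labels):
    # Gini impurity = 1 - sum(p_k^2), where p_k is the proportion of class k in the node
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

print(gini([0, 0, 1, 1]))  # 0.5 -> maximally mixed node for two classes
print(gini([0, 0, 0, 0]))  # 0.0 -> pure node, nothing left to split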


# Predict
y_test_predicted = tree_clf.predict(x_test_nor)
print(y_test_predicted)

[0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0
 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 0 1
 1 0 0 0 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 1 1 1 0 1 0 1 0 0 1 0 1 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0]

# Ground-truth labels
print(y_test.head())  # only the first 5 rows

     Survived
249         0
523         1
549         1
857         1
296         0

......

 

# Performance of the decision tree classifier
# measured with accuracy

accuracy = metrics.accuracy_score(y_test, y_test_predicted)
print('Accuracy:',accuracy)

output:

Accuracy: 0.8
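
Accuracy is simply the fraction of test rows where the prediction matches the true label. As a sanity check, the same number can be computed by hand (a sketch that reuses y_test and y_test_predicted from above):

import numpy as np
# Fraction of matching predictions; this should equal metrics.accuracy_score above
manual_accuracy = np.mean(np.asarray(y_test).ravel() == y_test_predicted)
print(manual_accuracy)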

 

PS: Visualizing the tree is still unsolved! (one possible approach is sketched below)
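
One possible way to get the visualization: scikit-learn 0.21+ ships sklearn.tree.plot_tree, which only needs matplotlib. This is a sketch that assumes the fitted tree_clf and the three feature names from above; the class names for 0/1 are assumptions.

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize=(16, 8))
plot_tree(tree_clf,
          feature_names=['Sex', 'Pclass', 'Age'],
          class_names=['Not survived', 'Survived'],  # assumed display names for labels 0/1
          filled=True)  # color nodes by majority class
plt.show()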
