#統計資訊
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Use the ggplot look for the figures produced below.
plt.style.use('ggplot')

# Load 2012MLB.csv; the file is read with the Big5 codec.
df1 = pd.read_csv("2012MLB.csv", encoding="big5")

# df1.cov()  # covariance matrix (left disabled)

# Summary statistics, comparable to R's summary(). The result must be printed
# explicitly: a bare expression statement shows nothing when this file is run
# as a script.
print(df1.describe())
# Outlier detection with a box plot.
y = np.random.randn(1000)  # 1000 random samples from the standard normal distribution

# Draw the box plot exactly once and keep the returned dict of artists.
# Calling plt.boxplot() a second time after plt.show() would open a new figure
# that is never displayed and stays current, so the next Series.plot() call
# would draw on top of it.
r = plt.boxplot(y, showfliers=True)
plt.show()

# r['fliers'][0] is the line artist holding the outlier markers; get_data()
# returns (xdata, ydata), so index 1 gives the outlier values themselves.
print('異常値: ', r['fliers'][0].get_data()[1])
# Distribution shape and standard deviation of the 'HR' column.
df2 = df1.loc[:, 'HR']

# Kernel density estimate of HR, titled 'pdf'.
df2.plot.kde(title='pdf')
plt.show()

# Standard deviation of HR as computed by pandas' Series.std().
hr_std = df2.std()
print('標準差 :', hr_std)
#相關性分析
#相關程度不等於有因果關係
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Correlation analysis between 'H' and 'HR'.
# (The ggplot style is also applied at the top of the script; re-applying it
# here is harmless.)
plt.style.use('ggplot')

# SimHei is a CJK-capable font, needed so the Chinese plot title renders.
plt.rcParams['font.family'] = 'SimHei'
# Use the ASCII hyphen for negative tick labels: matplotlib's default Unicode
# minus sign (U+2212) can show up as an empty box with CJK fonts like SimHei.
plt.rcParams['axes.unicode_minus'] = False

# Pairwise correlation matrix of the two columns.
print('corr: ', df1[['H', 'HR']].corr())

# Scatter plot of HR against H.
df1.plot(x='H', y='HR', kind='scatter', title='散布圖')
plt.show()