Killers in Kaggle Competition

1. XGBoost Model

import pandas as pd

'''
Compare the predictive power of a random forest and the XGBoost model on
whether passengers aboard the Titanic survived
'''
'''
***************************************************************
***************************************************************
'''

'''
Random forest: predicting whether Titanic passengers survived
'''
# Download the Titanic data from a URL
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

# Use pclass, age and sex as the training features
# (.copy() avoids pandas' SettingWithCopyWarning when filling values below)
X = titanic[['pclass','age','sex']].copy()
y = titanic['survived']

# Impute the missing age values with the mean of the known ages
X['age'].fillna(X['age'].mean(),inplace=True)

# Split the data, randomly sampling 25% for the test set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=33)
print(X_train)
print(y_train)

# Import DictVectorizer from sklearn.feature_extraction
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)

# Vectorize the features (one-hot encode the categorical columns)
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))
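# DictVectorizer sorts the feature names, so the resulting columns are
# ['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male'];
# the list is available as vec.feature_names_ if you want to verify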
print(X_train)
print(y_train)

# Fit a random forest classifier with its default settings and score it on the test set
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
print('the accuracy of RandomForestClassifier on the test set:',rfc.score(X_test,y_test))

'''
XGBoost: predicting whether Titanic passengers survived
'''
from xgboost import XGBClassifier
xgbc = XGBClassifier()
xgbc.fit(X_train,y_train)
print('the accuracy of XGBoost on the test set:',xgbc.score(X_test,y_test))


     pclass        age     sex
1086    3rd  31.194181    male
12      1st  31.194181  female
1036    3rd  31.194181    male
833     3rd  32.000000    male
1108    3rd  31.194181    male
562     2nd  41.000000    male
437     2nd  48.000000  female
663     3rd  26.000000    male
669     3rd  19.000000    male
507     2nd  31.194181    male
1167    3rd  31.194181    male
821     3rd   9.000000    male
327     2nd  32.000000  female
715     3rd  21.000000    male
308     1st  31.194181  female
1274    3rd  31.194181    male
640     3rd  40.000000    male
72      1st  70.000000    male
1268    3rd  31.194181    male
1024    3rd  31.194181    male
1047    3rd  31.194181  female
940     3rd  31.194181    male
350     2nd  20.000000  female
892     3rd  31.194181    male
555     2nd  30.000000  female
176     1st  36.000000    male
107     1st  31.194181  female
475     2nd  34.000000  female
330     2nd  23.000000    male
533     2nd  34.000000    male
...     ...        ...     ...
235     1st  24.000000    male
465     2nd  22.000000  female
210     1st  31.194181    male
579     2nd  40.000000  female
650     3rd  23.000000    male
1031    3rd  31.194181    male
99      1st  24.000000  female
969     3rd  31.194181    male
535     2nd  31.194181    male
403     2nd  31.194181    male
744     3rd  45.000000    male
344     2nd  26.000000    male
84      1st  31.194181    male
528     2nd  20.000000    male
1270    3rd  31.194181    male
662     3rd  40.000000    male
395     2nd  42.000000    male
1196    3rd  31.194181    male
543     2nd  23.000000    male
845     3rd  31.194181    male
813     3rd  25.000000    male
61      1st  31.194181  female
102     1st  23.000000  female
195     1st  28.000000    male
57      1st  27.000000    male
1225    3rd  31.194181    male
658     3rd  31.194181  female
578     2nd  12.000000  female
391     2nd  18.000000    male
1044    3rd  31.194181  female

[984 rows x 3 columns]
1086    0
12      1
1036    0
833     0
1108    0
562     0
437     1
663     0
669     0
507     0
1167    1
821     1
327     1
715     0
308     1
1274    0
640     0
72      0
1268    0
1024    0
1047    1
940     1
350     1
892     0
555     1
176     1
107     1
475     1
330     0
533     0
       ..
235     0
465     0
210     1
579     1
650     0
1031    0
99      1
969     0
535     0
403     0
744     1
344     0
84      1
528     0
1270    0
662     0
395     0
1196    0
543     0
845     0
813     0
61      1
102     1
195     0
57      1
1225    0
658     1
578     1
391     0
1044    0
Name: survived, dtype: int64
[[ 31.19418104   0.           0.           1.           0.           1.        ]
 [ 31.19418104   1.           0.           0.           1.           0.        ]
 [ 31.19418104   0.           0.           1.           0.           1.        ]
 ..., 
 [ 12.           0.           1.           0.           1.           0.        ]
 [ 18.           0.           1.           0.           0.           1.        ]
 [ 31.19418104   0.           0.           1.           1.           0.        ]]
1086    0
12      1
1036    0
833     0
1108    0
562     0
437     1
663     0
669     0
507     0
1167    1
821     1
327     1
715     0
308     1
1274    0
640     0
72      0
1268    0
1024    0
1047    1
940     1
350     1
892     0
555     1
176     1
107     1
475     1
330     0
533     0
       ..
235     0
465     0
210     1
579     1
650     0
1031    0
99      1
969     0
535     0
403     0
744     1
344     0
84      1
528     0
1270    0
662     0
395     0
1196    0
543     0
845     0
813     0
61      1
102     1
195     0
57      1
1225    0
658     1
578     1
391     0
1044    0
Name: survived, dtype: int64
the accuracy of RandomForestClassifier on the test set: 0.775075987842
the accuracy of XGBoost on the test set: 0.787234042553
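XGBoost edges out the random forest even with default settings, and the gap usually widens once its hyperparameters are tuned. Below is a minimal sketch of a grid search over a few common knobs, reusing the X_train/X_test split from above; the parameter values are illustrative assumptions, not results from the original run.

from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Illustrative search space; these values are assumptions, not tuned results
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.3],
}

gs = GridSearchCV(XGBClassifier(), param_grid, cv=5)
gs.fit(X_train, y_train)

print('best params:', gs.best_params_)
print('test accuracy of the tuned XGBoost:', gs.score(X_test, y_test))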

2. TensorFlow Framework

2.1 Hello Google TensorFlow

import numpy as np

'''
Use TensorFlow to print a string
'''
import tensorflow as tf

# Define a TensorFlow string constant 'Hello Google Tensorflow! ',
# named greeting, as a node in the computation graph
greeting = tf.constant('Hello Google Tensorflow! ')
# Start a session
sess = tf.Session()
# Run the greeting op inside the session
result = sess.run(greeting)
# Print the result of the run
print(result)
# Close the session; this is the explicit way to release it
sess.close()
b'Hello Google Tensorflow! '
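These snippets use the TensorFlow 1.x graph API. TensorFlow 2.x removed tf.Session in favor of eager execution; if you are on 2.x, here is a minimal sketch of running the same graph through the v1 compatibility layer:

import tensorflow as tf

# TF 2.x executes eagerly by default; the v1 compatibility layer restores sessions
tf.compat.v1.disable_eager_execution()

greeting = tf.constant('Hello Google Tensorflow! ')
with tf.compat.v1.Session() as sess:
    print(sess.run(greeting))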
import tensorflow as tf
'''
Use TensorFlow to compute a linear function
'''
# Declare matrix1 as a 1x2 row-vector constant
matrix1 = tf.constant([[3,3]])
# Declare matrix2 as a 2x1 column-vector constant
matrix2 = tf.constant([[2],[2]])

# product multiplies the two operands above, forming a new op
product = tf.matmul(matrix1,matrix2)

# Add the scalar constant 2 to product to form the final linear op
linear = tf.add(product,tf.constant(2))

# Run the linear op directly in a session; this executes the whole graph assembled from the ops above
with tf.Session() as sess:
    result = sess.run(linear)
    print(result)
[[14]]
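The result is easy to verify by hand: [3 3]·[2 2]^T + 2 = 3*2 + 3*2 + 2 = 14. The same computation in plain NumPy makes a quick sanity check:

import numpy as np

matrix1 = np.array([[3, 3]])    # 1x2 row vector
matrix2 = np.array([[2], [2]])  # 2x1 column vector

# matrix product plus a scalar, mirroring the TensorFlow graph above
print(matrix1 @ matrix2 + 2)    # [[14]]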

2.2 Building a Classifier

import numpy as np
import pandas as pd
import tensorflow as tf

'''
Use TensorFlow to build a custom linear classifier that predicts
benign/malignant breast cancer tumors
'''

# Read the breast cancer training and test data from local files with pandas
train = pd.read_csv('breast-cancer-train.csv')
test = pd.read_csv('breast-cancer-test.csv')
print(train)
print(test)

# Separate the features from the classification target
X_train = np.float32(train[['Clump Thickness','Cell Size']].T)
y_train = np.float32(train['Type'].T)
X_test = np.float32(test[['Clump Thickness','Cell Size']].T)
y_test = np.float32(test['Type'].T)
print(X_train)
print(X_train.shape)
print(X_test)
print(X_test.shape)

# Define a TensorFlow variable b as the intercept of the linear model, initialized to 0.0
b = tf.Variable(tf.zeros([1]))
# Define a TensorFlow variable W as the coefficients, initialized uniformly at random in [-1.0, 1.0]
W = tf.Variable(tf.random_uniform([1,2],-1.0,1.0))


# Explicitly define the linear function
y = tf.matmul(W,X_train) + b
# Use reduce_mean to compute the mean squared error over the training set
loss = tf.reduce_mean(tf.square(y-y_train))
# Estimate W and b by gradient descent with a learning rate of 0.01, much like SGDRegressor in scikit-learn
optimizer = tf.train.GradientDescentOptimizer(0.01)
# Minimize the squared loss as the optimization objective
train_optimizer = optimizer.minimize(loss)
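# Each run of train_optimizer applies one gradient descent update:
#   W <- W - 0.01 * dLoss/dW
#   b <- b - 0.01 * dLoss/db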

# Initialize all variables
init = tf.global_variables_initializer()
# Open a TensorFlow session
sess = tf.Session()
# Run the variable initialization op
sess.run(init)

# Train the parameters for 1000 iterations, logging every 10 steps
for step in range(0,1000):
    sess.run(train_optimizer)
    if step % 10 == 0:
        print(step,sess.run(W),sess.run(b))
# Prepare the test samples
test_negative = test.loc[test['Type'] == 0][['Clump Thickness','Cell Size']]
test_positive = test.loc[test['Type'] == 1][['Clump Thickness','Cell Size']]

# Plot using the final learned parameters
import matplotlib.pyplot as plt

plt.scatter(test_negative['Clump Thickness'],test_negative['Cell Size'],marker='o',s=200,c='red')
plt.scatter(test_positive['Clump Thickness'],test_positive['Cell Size'],marker='x',s=150,c='black')

plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')

lx = np.arange(0,12)

# Note that 0.5 is the decision threshold: on the boundary
# W[0][0]*x1 + W[0][1]*x2 + b = 0.5, so solving for x2 gives:
ly = (0.5 - sess.run(b) - lx * sess.run(W)[0][0])/sess.run(W)[0][1]

plt.plot(lx,ly,color='green')
plt.show()
     Unnamed: 0  Clump Thickness  Cell Size  Type
0           163                1          1     0
1           286               10         10     1
2           612               10         10     1
3           517                1          1     0
4           464                1          1     0
5           277                1          1     0
6           408                3          2     0
7           104               10         10     1
8           114                3          2     0
9           627                1          1     0
10          545                1          1     0
11          467                6          6     1
12           92                1          1     0
13            7                1          2     0
14           89                1          1     0
15          528                1          3     0
16          380                1          1     0
17          521                1          1     0
18          539                1          1     0
19          363                4          4     0
20          638                1          1     0
21          140                1          1     0
22           28                1          1     0
23           43                6          5     1
24           42               10         10     1
25           73                4          5     1
26          167                8         10     1
27          210               10         10     1
28          610                4          3     1
29           66                1          1     0
..          ...              ...        ...   ...
494         252                3          3     0
495          21                5          5     1
496         313                1          1     0
497         459                1          3     0
498         160                7          7     1
499         276                1          1     0
500         191                5         10     1
501         385                3          2     0
502         413                1          2     0
503         491                8          9     1
504         343                1          1     0
505         308                8          7     1
506         661                1          1     0
507         130                1          3     0
508         663                1          3     0
509          99                5          6     1
510         372                1          2     0
511          87                6          6     1
512         458                1          2     0
513         330                4          7     1
514         214               10         10     1
515         466                6          6     1
516         121                2          1     0
517         614                1          1     0
518          20                3          2     1
519          71               10          2     1
520         106               10         10     1
521         270                4          7     1
522         435                8         10     1
523         102                1          2     0

[524 rows x 4 columns]
     Unnamed: 0  Clump Thickness  Cell Size  Type
0           158                1          2     0
1           499                1          1     0
2           396                1          1     0
3           155                5          5     1
4           321                1          1     0
5           212                1          1     0
6           234                3          2     0
7           289                6          6     1
8           300                4         10     1
9           356                3          3     1
10          672                1          1     0
11          328               10          3     1
12          199                1          1     0
13           78                1          1     0
14          598                1          1     0
15          569               10          8     1
16          446                1          1     0
17          506               10         10     1
18          626                6          6     1
19          603                4          6     1
20          360               10         10     1
21          338                1          1     0
22          668                7          4     1
23          290                1          1     0
24          284                4          5     1
25          331                1          1     0
26          477                1          1     0
27           54                5          5     1
28          248                1          1     0
29          223                5          6     1
..          ...              ...        ...   ...
145         302               10         10     1
146         552                2          2     0
147         215                7          8     1
148         235                1          4     0
149          18                7          7     1
150         250                2          2     0
151         260                5          8     1
152         430                3          1     0
153         264                9          4     1
154          61                1          1     0
155         213               10         10     1
156         377                1          1     0
157          29                1          3     0
158         182                1          1     0
159         306                1          1     0
160         388                1          1     0
161         329                4          6     1
162         437                1          1     0
163         296                3          4     0
164         584                1          1     0
165         342                1          1     0
166         436               10         10     1
167         579                1          1     0
168         326                1          1     1
169         362                2          2     0
170         617                1          1     0
171         578                1          1     0
172         231                8          7     1
173         336                5          5     1
174         655                1          1     0

[175 rows x 4 columns]
[[  1.  10.  10. ...,   4.   8.   1.]
 [  1.  10.  10. ...,   7.  10.   2.]]
(2, 524)
[[  1.   1.   1.   5.   1.   1.   3.   6.   4.   3.   1.  10.   1.   1.
    1.  10.   1.  10.   6.   4.  10.   1.   7.   1.   4.   1.   1.   5.
    1.   5.   1.   1.   1.   5.   1.   1.   1.  10.   1.  10.   1.   3.
   10.   1.   1.   1.   2.   4.   1.   1.   2.   1.  10.   1.   3.   1.
    1.   6.   1.   1.   1.   1.  10.   3.   1.   1.  10.   6.   1.   2.
    3.   1.   9.   1.   1.   1.   1.   3.   1.   1.   1.   1.   1.   1.
    1.  10.   3.   1.   1.   1.   2.   1.  10.   1.   1.  10.   1.   1.
    1.   1.   1.   1.   1.   4.   4.   1.   8.   1.   1.   5.   7.   3.
    1.   3.   3.   1.   1.   1.   1.   1.   1.   7.   1.   1.  10.   3.
    1.   3.   7.   4.   1.   1.  10.   1.   6.   1.  10.   1.   1.   3.
    2.   3.   1.   1.   1.  10.   2.   7.   1.   7.   2.   5.   3.   9.
    1.  10.   1.   1.   1.   1.   1.   4.   1.   3.   1.   1.  10.   1.
    1.   2.   1.   1.   8.   5.   1.]
 [  2.   1.   1.   5.   1.   1.   2.   6.  10.   3.   1.   3.   1.   1.
    1.   8.   1.  10.   6.   6.  10.   1.   4.   1.   5.   1.   1.   5.
    1.   6.   1.   1.   1.   4.   1.   1.   1.  10.   1.   8.   1.   3.
   10.   1.   1.   1.   2.   5.   4.   1.   2.   1.  10.   1.   4.   4.
    1.   5.   1.   1.   1.   3.   4.   1.   1.   1.   8.   6.   1.   1.
    3.   1.   9.   1.   1.   1.   1.   2.   2.   1.   1.   1.   1.   1.
    1.  10.   5.   1.   1.   1.   1.   1.  10.   3.   3.   4.   1.   1.
    1.   1.   1.   1.   1.   6.   2.   2.   7.   1.   1.   3.   7.   3.
    1.   4.   1.   2.   1.   1.   1.   3.   1.  10.   1.   1.   3.   6.
    1.   2.   4.   3.   1.   1.  10.   1.   7.   1.   7.   1.   1.   1.
    1.   3.   1.   1.   3.  10.   2.   8.   4.   7.   2.   8.   1.   4.
    1.  10.   1.   3.   1.   1.   1.   6.   1.   4.   1.   1.  10.   1.
    1.   2.   1.   1.   7.   5.   1.]]
(2, 175)
0 [[ 0.36668363 -0.50090975]] [ 0.07310659]
10 [[ 0.42902589 -0.31664807]] [ 0.08392984]
20 [[ 0.37770221 -0.26446074]] [ 0.07397496]
30 [[ 0.33350319 -0.21931998]] [ 0.06422745]
40 [[ 0.29543743 -0.18026219]] [ 0.05475901]
50 [[ 0.26265171 -0.1464566 ]] [ 0.04562278]
60 [[ 0.23441158 -0.11718685]] [ 0.03685662]
70 [[ 0.21008505 -0.09183522]] [ 0.02848593]
80 [[ 0.18912806 -0.06986891]] [ 0.02052602]
90 [[ 0.17107238 -0.05082829]] [ 0.01298404]
100 [[ 0.15551494 -0.03431684]] [ 0.00586066]
110 [[ 0.14210881 -0.01999237]] [-0.00084857]
120 [[ 0.13055533 -0.00755955]] [-0.00715207]
130 [[ 0.12059743  0.00323657]] [-0.0130613]
140 [[ 0.11201376  0.01261609]] [-0.01858996]
150 [[ 0.10461382  0.02076911]] [-0.02375336]
160 [[ 0.09823353  0.02785983]] [-0.02856789]
170 [[ 0.09273166  0.03403014]] [-0.03305059]
180 [[ 0.0879866   0.03940263]] [-0.03721882]
190 [[ 0.08389364  0.0440833 ]] [-0.04109001]
200 [[ 0.08036258  0.0481638 ]] [-0.04468136]
210 [[ 0.07731578  0.05172339]] [-0.04800977]
220 [[ 0.07468636  0.05483068]] [-0.05109166]
230 [[ 0.07241672  0.05754501]] [-0.05394287]
240 [[ 0.07045723  0.05991777]] [-0.05657862]
250 [[ 0.06876517  0.06199349]] [-0.05901347]
260 [[ 0.06730371  0.06381072]] [-0.06126123]
270 [[ 0.06604112  0.06540291]] [-0.06333503]
280 [[ 0.06495008  0.06679903]] [-0.06524725]
290 [[ 0.06400704  0.06802423]] [-0.06700955]
300 [[ 0.06319169  0.06910033]] [-0.06863291]
310 [[ 0.06248654  0.07004629]] [-0.07012761]
320 [[ 0.06187651  0.07087857]] [-0.07150328]
330 [[ 0.06134861  0.07161149]] [-0.07276891]
340 [[ 0.06089162  0.07225746]] [-0.07393289]
350 [[ 0.06049588  0.07282734]] [-0.07500301]
360 [[ 0.06015305  0.07333054]] [-0.07598653]
370 [[ 0.05985595  0.07377529]] [-0.07689022]
380 [[ 0.05959837  0.07416874]] [-0.07772031]
390 [[ 0.05937495  0.07451714]] [-0.07848261]
400 [[ 0.05918108  0.07482593]] [-0.07918249]
410 [[ 0.05901278  0.07509987]] [-0.07982493]
420 [[ 0.05886659  0.07534314]] [-0.08041453]
430 [[ 0.05873957  0.07555936]] [-0.0809555]
440 [[ 0.05862912  0.07575173]] [-0.08145178]
450 [[ 0.05853304  0.07592303]] [-0.08190699]
460 [[ 0.05844941  0.07607572]] [-0.08232445]
470 [[ 0.05837657  0.07621194]] [-0.08270727]
480 [[ 0.05831309  0.07633358]] [-0.08305824]
490 [[ 0.05825774  0.07644231]] [-0.08337997]
500 [[ 0.05820943  0.07653957]] [-0.08367487]
510 [[ 0.05816725  0.07662665]] [-0.08394514]
520 [[ 0.05813038  0.07670469]] [-0.08419282]
530 [[ 0.05809814  0.07677467]] [-0.08441976]
540 [[ 0.05806994  0.07683749]] [-0.0846277]
550 [[ 0.05804524  0.07689392]] [-0.08481821]
560 [[ 0.05802359  0.07694465]] [-0.08499274]
570 [[ 0.0580046  0.0769903]] [-0.08515258]
580 [[ 0.05798792  0.0770314 ]] [-0.085299]
590 [[ 0.05797327  0.07706842]] [-0.08543309]
600 [[ 0.05796038  0.0771018 ]] [-0.0855559]
610 [[ 0.05794904  0.07713193]] [-0.08566836]
620 [[ 0.05793905  0.07715912]] [-0.08577135]
630 [[ 0.05793025  0.07718369]] [-0.08586565]
640 [[ 0.05792246  0.0772059 ]] [-0.08595198]
650 [[ 0.05791559  0.07722599]] [-0.08603103]
660 [[ 0.05790952  0.07724417]] [-0.08610339]
670 [[ 0.05790414  0.07726063]] [-0.08616965]
680 [[ 0.05789939  0.07727554]] [-0.0862303]
690 [[ 0.05789516  0.07728906]] [-0.08628582]
700 [[ 0.05789141  0.07730132]] [-0.08633664]
710 [[ 0.05788808  0.07731244]] [-0.08638318]
720 [[ 0.05788512  0.07732254]] [-0.08642577]
730 [[ 0.05788249  0.0773317 ]] [-0.08646475]
740 [[ 0.05788014  0.07734002]] [-0.08650042]
750 [[ 0.05787804  0.07734759]] [-0.08653308]
760 [[ 0.05787617  0.07735448]] [-0.08656296]
770 [[ 0.0578745   0.07736073]] [-0.08659032]
780 [[ 0.057873    0.07736642]] [-0.08661535]
790 [[ 0.05787166  0.0773716 ]] [-0.08663826]
800 [[ 0.05787047  0.07737631]] [-0.08665923]
810 [[ 0.05786939  0.07738061]] [-0.08667842]
820 [[ 0.05786842  0.07738452]] [-0.08669598]
830 [[ 0.05786756  0.07738808]] [-0.08671205]
840 [[ 0.05786679  0.07739132]] [-0.08672676]
850 [[ 0.05786609  0.07739428]] [-0.08674022]
860 [[ 0.05786546  0.07739697]] [-0.08675253]
870 [[ 0.05786489  0.07739943]] [-0.0867638]
880 [[ 0.05786439  0.07740167]] [-0.08677411]
890 [[ 0.05786392  0.07740371]] [-0.08678355]
900 [[ 0.05786351  0.07740557]] [-0.08679216]
910 [[ 0.05786314  0.07740727]] [-0.08680007]
920 [[ 0.0578628   0.07740883]] [-0.0868073]
930 [[ 0.05786249  0.07741025]] [-0.08681391]
940 [[ 0.05786222  0.07741154]] [-0.08681997]
950 [[ 0.05786196  0.07741272]] [-0.0868255]
960 [[ 0.05786174  0.0774138 ]] [-0.08683058]
970 [[ 0.05786153  0.07741478]] [-0.08683521]
980 [[ 0.05786135  0.07741567]] [-0.08683946]
990 [[ 0.05786117  0.0774165 ]] [-0.08684334]
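
The log shows W and b converging after a few hundred steps. To put a number on the fit, here is a minimal sketch (an addition, not part of the original script) that reuses sess, W, b, X_test and y_test from above, classifies the test set against the same 0.5 threshold, and reports accuracy:

# Score each test sample with the learned linear function
test_scores = sess.run(tf.matmul(W, X_test) + b)

# Predict malignant (1) when the score exceeds the 0.5 threshold
y_pred = (test_scores[0] > 0.5).astype(np.float32)

# Fraction of test labels predicted correctly
print('test accuracy:', np.mean(y_pred == y_test))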
