import random
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
# Read a reproducible random subset of the training rows: the lines to leave
# out are chosen up front and handed to read_csv, so the skipped rows are
# never parsed into the DataFrame.
random_seed = 9
subset = 0.4
n_rows = 1183747  # presumably the number of data rows in the CSV -- confirm
train_rows = int(n_rows * subset)
random.seed(random_seed)
# Candidate line numbers start at 1 so that line 0 (the header) is always
# kept.  `range` is used instead of the Python-2-only `xrange` so the script
# runs on both Python 2 and Python 3.
skip = sorted(random.sample(range(1, n_rows + 1), n_rows - train_rows))
data = pd.read_csv(
    "/home/zfu/Kaggle/bosch/train_numeric.csv",
    index_col=0,
    dtype=np.float32,
    skiprows=skip,
)
# Split the label column off from the feature matrix.
y = data.pop('Response').values
X = data.values
# Splitter consumed by the cross-validation loops further down; the original
# listing used `skf` without defining it.  Five unshuffled folds match the
# fold indices shown in the captured output.
skf = StratifiedKFold(n_splits=5)
# GPU run: booster settings using the 'grow_gpu' tree updater.
param = dict(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=5,
    eta=0.3,
    silent=0,
    updater='grow_gpu',  # the CPU counterpart is 'grow_colmaker'
)
# Number of boosting rounds passed to xgb.train for every fold.
num_round = 20
# Cross-validate with the current `param` settings: for every fold produced
# by `skf`, train a booster on the training part, time the training call, and
# report the evaluation result on the held-out part.
for i, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    # Only the xgb.train call is timed; DMatrix construction and evaluation
    # are outside the measured interval.
    tmp = time.time()
    bst = xgb.train(param, dtrain, num_round)
    boost_time = time.time() - tmp
    res = bst.eval(xgb.DMatrix(X_test, label=y_test))
    print("Fold {}: {}, Boost Time {}".format(i, res, str(boost_time)))
    # Drop the booster before the next fold (presumably to release the
    # memory it holds before the next one is trained).
    del bst
('TRAIN:', array([ 94682, 94683, 94684, ..., 473495, 473496, 473497]), 'TEST:', array([ 0, 1, 2, ..., 97097, 97390, 97619]))
[00:48:52] Allocated 2651/8022 MB on GeForce GTX 1070
Fold 1: [0] eval-auc:0.687818, Boost Time 13.2141840458
('TRAIN:', array([ 0, 1, 2, ..., 473495, 473496, 473497]), 'TEST:', array([ 94682, 94683, 94684, ..., 191560, 191830, 191922]))
[00:49:07] Allocated 2649/8006 MB on GeForce GTX 1070
Fold 2: [0] eval-auc:0.699430, Boost Time 9.08604192734
('TRAIN:', array([ 0, 1, 2, ..., 473495, 473496, 473497]), 'TEST:', array([189390, 189391, 189392, ..., 286293, 286337, 286657]))
[00:49:21] Allocated 2650/8006 MB on GeForce GTX 1070
Fold 3: [0] eval-auc:0.679230, Boost Time 9.11538410187
('TRAIN:', array([ 0, 1, 2, ..., 473495, 473496, 473497]), 'TEST:', array([284088, 284089, 284090, ..., 379736, 379934, 380831]))
[00:49:35] Allocated 2648/8006 MB on GeForce GTX 1070
Fold 4: [0] eval-auc:0.682657, Boost Time 9.08212804794
('TRAIN:', array([ 0, 1, 2, ..., 379736, 379934, 380831]), 'TEST:', array([378786, 378787, 378788, ..., 473495, 473496, 473497]))
[00:49:49] Allocated 2650/8006 MB on GeForce GTX 1070
Fold 5: [0] eval-auc:0.698049, Boost Time 9.16434597969
# CPU run: same booster settings as the GPU run, but with the
# 'grow_colmaker' tree updater.  A fresh dict is built (rather than
# overwriting keys on an existing `param`) so this section works on its own
# and cannot inherit stale settings from an earlier run.
param = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 5,
    'eta': 0.3,
    'silent': 0,
    'updater': 'grow_colmaker',  # the GPU counterpart is 'grow_gpu'
}
# Number of boosting rounds passed to xgb.train for every fold.
num_round = 20
# Repeat the cross-validation with whatever `param` currently holds: train
# one booster per fold from `skf`, time the training call, and print the
# evaluation result on the held-out rows.
for i, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    # The timer brackets xgb.train alone, so the reported time excludes
    # DMatrix construction and evaluation.
    tmp = time.time()
    bst = xgb.train(param, dtrain, num_round)
    boost_time = time.time() - tmp
    res = bst.eval(xgb.DMatrix(X_test, label=y_test))
    print("Fold {}: {}, Boost Time {}".format(i, res, str(boost_time)))
    # Discard the trained booster before moving on to the next fold.
    del bst
('TRAIN:', array([ 94682, 94683, 94684, ..., 473495, 473496, 473497]), 'TEST:', array([ 0, 1, 2, ..., 97097, 97390, 97619]))
Fold 1: [0] eval-auc:0.691973, Boost Time 30.7943990231
('TRAIN:', array([ 0, 1, 2, ..., 473495, 473496, 473497]), 'TEST:', array([ 94682, 94683, 94684, ..., 191560, 191830, 191922]))
Fold 2: [0] eval-auc:0.702624, Boost Time 28.8307631016
('TRAIN:', array([ 0, 1, 2, ..., 473495, 473496, 473497]), 'TEST:', array([189390, 189391, 189392, ..., 286293, 286337, 286657]))
Fold 3: [0] eval-auc:0.669979, Boost Time 28.8397688866
('TRAIN:', array([ 0, 1, 2, ..., 473495, 473496, 473497]), 'TEST:', array([284088, 284089, 284090, ..., 379736, 379934, 380831]))
Fold 4: [0] eval-auc:0.674863, Boost Time 28.8023629189
('TRAIN:', array([ 0, 1, 2, ..., 379736, 379934, 380831]), 'TEST:', array([378786, 378787, 378788, ..., 473495, 473496, 473497]))
Fold 5: [0] eval-auc:0.696640, Boost Time 30.3664522171
No comments:
Post a Comment