# Travelers Auto Insurance Claims Prediction for a case competition

A lot like Kaggle projects I experienced. Train and test data set are given, scoring using Gini index.

Here is some sample code I wrote in Python.

Technique used:

Python Scikit-learn, Xgboost, imbalanced-learn for SMOTE over-sampling, etc.

Compared multiple algorithms before chose the best and tuning parameters.

 LinearRegression Ridge Regression LASSO Linear Regression Elastic Net Regression KNN DecisionTree SVM Bagged Decision Trees Random Forest Extra Trees (Bagging) AdaBoost (Boosting) Stochastic Gradient Boosting (Boosting) XGBoost Regressor XGBoost XGBClassifier
travelers

### environment¶

#### under Linux:¶

• pip install xgboost
• pip install -U imbalanced-learn
In [43]:
import xgboost as xgb
import seaborn
# from gini import Gini

In [2]:
import pandas as pd
import numpy as np
from time import time
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [3]:
# load data
print (train1.shape,train2.shape)
print (hold.shape)
combine=pd.concat([train1,train2])
train=combine.copy()

((22610, 11), (22629, 11))
(22617, 10)


### factorize categorical data¶

In [4]:
cat_vars = train.select_dtypes(['object']).columns

for var in cat_vars:
train[var] = pd.factorize(train[var])[0]
hold[var] = pd.factorize(hold[var])[0]

In [4]:
train.head(1)

Out[4]:
id claimcst0 veh_value exposure veh_body veh_age gender area agecat clm numclaims
0 1 0.0 6.43 0.241898 0 1 0 0 3 0 0

### data clusters explore¶

In [5]:
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.cluster import KMeans

In [6]:
pca=PCA(2)
after=pca.fit_transform(train)
print(after.shape)
plt.scatter(after[:,0],after[:,1],alpha=.3)

(45239, 2)

Out[6]:
<matplotlib.collections.PathCollection at 0x7f9d1893e048>
In [7]:
K=KMeans(2)
gp=K.fit_transform(after)
print (gp.shape)
plt.scatter(gp[:,0],gp[:,1],alpha=.2,color='k')

(45239, 2)

Out[7]:
<matplotlib.collections.PathCollection at 0x7f9d18614e48>

### split train data to train, validate¶

In [6]:
# preprocessing
y_train =train.pop('clm')
# y_claim_train=train.pop('claimcst0')
x_train = train.drop(['id','numclaims'], axis = 1)

y_hold = hold.pop('clm')
hold = hold.drop(['id','numclaims'], axis = 1)

In [7]:
from sklearn.cross_validation import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(
x_train, y_train, test_size = 0.25, random_state = 345,stratify=y_train)

In [8]:
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape, y_train.sum(),y_valid.sum()

Out[8]:
((33929, 8), (11310, 8), (33929,), (11310,), 2313, 771)

### compare predicting clm0 rather than clm¶

In [9]:
y_train_clm=y_train.copy()
y_valid_clm=y_valid.copy()

In [10]:
y_train_clm0=x_train.pop('claimcst0')
y_valid_clm0=x_valid.pop('claimcst0')

In [11]:
x_train.head(1)

Out[11]:
veh_value exposure veh_body veh_age gender area agecat
17594 3.26 0.819075 3 2 0 3 5

### comment below if to use clm0¶

In [12]:
# y_train_clm=y_train.copy()
# y_valid_clm=y_valid.copy()
# y_train=x_train.pop('claimcst0')
# y_valid=x_valid.pop('claimcst0')

In [13]:
x_train.head(1)

Out[13]:
veh_value exposure veh_body veh_age gender area agecat
17594 3.26 0.819075 3 2 0 3 5
In [14]:
fig=plt.figure()

ax=y_train_clm.plot.hist(bins=2,alpha=.5)
ax.set_title('training set clm')
ax=y_valid_clm.plot.hist(bins=2, alpha=.4,color='r')
ax.set_title('validate set clm')


### balance imbalanced training data set¶

In [15]:
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import BalanceCascade

sm = SMOTE(kind='regular')
# sm = SMOTE(kind='svm')

# sm= RandomOverSampler()
x_train,y_train=sm.fit_sample(x_train,y_train)
x_train=pd.DataFrame(x_train)
y_train=pd.Series(y_train)
print x_train.shape, y_train.shape, y_train.sum(), y_valid.sum()

(63232, 7) (63232,) 31616 771

In [16]:
fig=plt.figure()

ax=y_train.plot.hist(bins=2,alpha=.5)
ax.set_title('training set clm')
ax=y_valid.plot.hist(bins=2, alpha=.4,color='r')
ax.set_title('validate set clm')

In [23]:
x_train.head(1)

Out[23]:
0 1 2 3 4 5 6
0 3.26 0.819075 3.0 2.0 0.0 3.0 5.0
In [24]:
try:
x_train.columns=train.drop(['id','numclaims'], axis = 1).columns
except:
x_train.columns=train.drop(['id','numclaims','claimcst0'], axis = 1).columns

In [ ]:


In [25]:
# print (x_train['claimcst0']>0).sum()==(y_train).sum()

In [26]:
# y_train=x_train.pop('claimcst0')

In [27]:
# y_valid=x_valid.pop('claimcst0')

In [28]:
x_train.head(1)

Out[28]:
veh_value exposure veh_body veh_age gender area agecat
0 3.26 0.819075 3.0 2.0 0.0 3.0 5.0
In [29]:
x_valid.head(1)

Out[29]:
veh_value exposure veh_body veh_age gender area agecat
15047 0.79 0.775908 2 4 1 2 5
In [293]:
x_valid.dtypes

Out[293]:
veh_value    float64
exposure     float64
veh_body       int64
veh_age        int64
gender         int64
area           int64
agecat         int64
dtype: object

### define nomalized gini¶

In [294]:
# calculate gini
def Gini(y_true, y_pred):
# check and get number of samples
assert y_true.shape == y_pred.shape
n_samples = y_true.shape[0]

# sort rows on prediction column
# (from largest to smallest)
arr = np.array([y_true, y_pred]).transpose()
true_order = arr[arr[:,0].argsort()][::-1,0]
pred_order = arr[arr[:,1].argsort()][::-1,0]

# get Lorenz curves
L_true = np.cumsum(true_order) / np.sum(true_order)
L_pred = np.cumsum(pred_order) / np.sum(pred_order)
L_ones = np.linspace(1/n_samples, 1, n_samples)

# get Gini coefficients (area between curves)
G_true = np.sum(L_ones - L_true)
G_pred = np.sum(L_ones - L_pred)

# normalize to true Gini coefficient
return G_pred/G_true

In [ ]:


In [39]:
# save to csv for submit & R gini calculation
def save():
aa=pd.DataFrame(y_valid_clm0)
pp=pd.Series(pred,index=aa.index)
R_test=pd.concat([aa,pp],axis=1)
R_test['split']='val'
R_test.columns=['act','pred','split']
R_test.to_csv('other_methods.csv')


#### confusion_matrix¶

In [66]:
from sklearn.metrics import confusion_matrix


## test a list of algorithms¶

### general without tuning parameters¶

• LinearRegression
• Ridge Regression
• LASSO Linear Regression
• Elastic Net Regression
• KNN
• DecisionTree
• SVM
• Bagged Decision Trees
• Random Forest
• Extra Trees (Bagging)
• Stochastic Gradient Boosting (Boosting)
• XGBoost Regressor
• XGBoost XGBClassifier

#### compare predicting clm0 rather than clm¶

In [ ]:



### LinearRegression¶

In [156]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(normalize=True)
model

Out[156]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)
In [157]:
start=time()
model.fit(x_train,y_train)
pred=model.predict(x_valid)
result = Gini(y_valid, pred)
print ('used :%.4f s'%(time()-start))
print ('gini :%.6f '%result)

used :0.0053 s
gini :0.211997


on clm 0.200869

In [158]:
judge=pd.Series(pred).apply(lambda x:1 if x>pd.Series(pred).mean() else 0)

In [160]:
confusion_matrix(y_valid_clm,judge)

Out[160]:
array([[5452, 5087],
[ 250,  521]])

### Ridge Regression¶

In [161]:
from sklearn.linear_model import Ridge
model=Ridge(normalize=True)
model

Out[161]:
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
normalize=True, random_state=None, solver='auto', tol=0.001)
In [162]:
start=time()
model.fit(x_train,y_train)
pred=model.predict(x_valid)
result = Gini(y_valid, pred)
print ('used :%.4f s'%(time()-start))
print ('gini :%.6f '%result)

used :0.0046 s
gini :0.213310


on clm 0.201315

### LASSO Linear Regression¶

In [167]:
from sklearn.linear_model import Lasso
model=Lasso()
model

Out[167]:
Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
normalize=False, positive=False, precompute=False, random_state=None,
selection='cyclic', tol=0.0001, warm_start=False)
In [168]:
start=time()
model.fit(x_train,y_train)
pred=model.predict(x_valid)
result = Gini(y_valid_clm0, pred)
print ('used :%.4f s'%(time()-start))
print ('gini :%.6f '%result)

used :0.0154 s
gini :0.005823


### Elastic Net Regression¶

In [169]:
from sklearn.linear_model import ElasticNet
model = ElasticNet()
model

Out[169]:
ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
max_iter=1000, normalize=False, positive=False, precompute=False,
random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
In [170]:
start=time()
model.fit(x_train,y_train)
pred=model.predict(x_valid)
result = Gini(y_valid_clm0, pred)
print ('used :%.4f s'%(time()-start))
print ('gini :%.6f '%result)

used :0.0069 s
gini :0.005823


### KNN¶

In [73]:
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor(800)
model

Out[73]:
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=800, p=2,
weights='uniform')
In [74]:
start=time()
model.fit(x_train,y_train)
pred=model.predict(x_valid)
result = Gini(y_valid_clm0, pred)
print ('used :%.4f s'%(time()-start))
print ('gini :%.6f '%result)

used :3.5301 s
gini :0.056210

In [81]:
pd.Series(pred).describe()

Out[81]:
count    11310.000000
mean         0.495275
std          0.071319
min          0.271250
25%          0.447500
50%          0.497500
75%          0.545000
max          0.737500
dtype: float64

### Decision Tree¶

In [171]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(max_depth=5,random_state=7,
min_samples_leaf=1,max_leaf_nodes=None,
min_samples_split=5,
min_weight_fraction_leaf=.1)
model

Out[171]:
DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
max_leaf_nodes=None, min_impurity_split=1e-07,
min_samples_leaf=1, min_samples_split=5,
min_weight_fraction_leaf=0.1, presort=False, random_state=7,
splitter='best')
In [172]:
start=time()
model.fit(x_train,y_train)
pred=model.predict(x_valid)
result = Gini(y_valid_clm0, pred)
print ('used :%.4f s'%(time()-start))
print ('gini :%.6f '%result)

used :0.0879 s
gini :0.251430

In [173]:
train_set=model.predict(x_train)
train_set_gini = Gini(y_train, train_set)
train_set_gini

Out[173]:
0.33359751419411399

### SVM¶

In [169]:
from sklearn.svm import SVR
model = SVR(cache_size=20000)
model

Out[169]:
SVR(C=1.0, cache_size=20000, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
In [63]:
start=time()
model.fit(x_train,y_train)
pred=model.predict(x_valid)
result = Gini(x_valid_clm0, pred)
print ('used :%.4f s'%(time()-start))
print ('gini :%.6f '%result)

used :108.2684 s
gini :0.058458


### Bagged Decision Trees¶

In [304]:
from sklearn.ensemble import BaggingRegressor
model = BaggingRegressor(n_estimators=300,verbose=1,n_jobs=-1,
max_features=.9, max_samples=.9,)
model

Out[304]:
BaggingRegressor(base_estimator=None, bootstrap=True,
bootstrap_features=False, max_features=0.9, max_samples=0.9,
n_estimators=300, n_jobs=-1, oob_score=False, random_state=None,
verbose=1, warm_start=False)
In [305]:
start=time()
model.fit(x_train,y_train)
pred=model.predict(x_valid)
result = Gini(y_valid_clm0, pred)
print ('used :%.4f s'%(time()-start))
print ('gini :%.6f '%result)

[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    5.3s remaining:   15.8s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    5.6s finished
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.2s remaining:    0.6s

used :6.3984 s
gini :0.139489

[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.4s finished


### Random Forest (Bagging)¶

In [306]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=600,max_depth=3,n_jobs=-1)
model

Out[306]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
max_features='auto', max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=600, n_jobs=-1, oob_score=False, random_state=None,
verbose=0, warm_start=False)
In [307]:
start=time()
model.fit(x_train,y_train)
pred=model.predict(x_valid)
result = Gini(y_valid_clm0, pred)
print ('used :%.4f s'%(time()-start))
print ('gini :%.6f '%result)

used :4.5205 s
gini :0.211273

In [308]:
pred

Out[308]:
array([ 0.3596833 ,  0.34252775,  0.12714231, ...,  0.16006586,
0.31602501,  0.12386025])
In [223]:
save()


on clm 0.205095

### Extra Trees (Bagging)¶

In [309]:
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor(n_jobs=-1,n_estimators=1000,max_depth=3,
bootstrap=True,)
model

Out[309]:
ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=3,
max_features='auto', max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=1000, n_jobs=-1, oob_score=False, random_state=None,
verbose=0, warm_start=False)
In [310]:
start=time()
model.fit(x_train,y_train)
pred=model.predict(x_valid)
result = Gini(y_valid_clm0, pred)
print ('used :%.4f s'%(time()-start))
print ('gini :%.6f '%result)

used :3.4390 s
gini :0.190100

In [311]:
save()


on clm 0.197320

In [312]:
from sklearn.ensemble import AdaBoostRegressor
model

Out[312]:
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
n_estimators=50, random_state=None)
In [313]:
start=time()
model.fit(x_train,y_train)
pred=model.predict(x_valid)
result = Gini(y_valid_clm0, pred)
print ('used :%.4f s'%(time()-start))
print ('gini :%.6f '%result)

used :0.7475 s
gini :0.201253


on clm 0.204600

## Stochastic Gradient Boosting (Boosting)¶

In [314]:
from sklearn.ensemble import GradientBoostingRegressor
subsample=.9)
model

Out[314]:
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
max_leaf_nodes=None, min_impurity_split=1e-07,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=300,
presort='auto', random_state=None, subsample=0.9, verbose=0,
warm_start=False)
In [315]:
start=time()
model.fit(x_train,y_train)
pred=model.predict(x_valid)
result = Gini(y_valid_clm0, pred)
print ('used :%.4f s'%(time()-start))
print ('gini :%.6f '%result)

used :4.8759 s
gini :0.170067


on clm 0.221349 (train on clm and cal gini with clm0)

In [316]:
train_set=model.predict(x_train)
train_set_gini = Gini(y_train, train_set)
train_set_gini

Out[316]:
0.8828912822708892

on clm 0.41400336589109327

### Regressor¶

In [317]:
from xgboost import XGBRegressor
model = XGBRegressor(learning_rate=0.01,max_depth=3,)
model

Out[317]:
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
learning_rate=0.01, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
objective='reg:linear', reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, seed=0, silent=True, subsample=1)
In [318]:
start=time()
model.fit(x_train,y_train)
pred=model.predict(x_valid)
result = Gini(y_valid_clm0, pred)
print ('used :%.4f s'%(time()-start))
print ('gini :%.6f '%result)

used :0.5483 s
gini :0.214805


### Classifier¶

In [319]:
from xgboost import XGBClassifier

model = XGBClassifier(learning_rate=0.0087,objective='binary:logistic',
n_estimators=300,reg_lambda=3,
subsample=.6,colsample_bylevel=.6,
)
model

Out[319]:
XGBClassifier(base_score=0.5, colsample_bylevel=0.6, colsample_bytree=1,
gamma=0, learning_rate=0.0087, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=300, nthread=-1,
objective='binary:logistic', reg_alpha=0, reg_lambda=3,
scale_pos_weight=1, seed=0, silent=True, subsample=0.6)
In [320]:
start=time()
model.fit(x_train,y_train)
pred=model.predict(x_valid)
result = Gini(y_valid_clm0, pred)
print ('used :%.4f s'%(time()-start))
print ('gini :%.6f '%result)

used :1.5666 s
gini :0.009133

In [ ]:


In [ ]:


In [321]:
x_train.head(1)

Out[321]:
veh_value exposure veh_body veh_age gender area agecat
0 3.26 0.819075 3.0 2.0 0.0 3.0 5.0
In [322]:
x_valid.head(1)

Out[322]:
veh_value exposure veh_body veh_age gender area agecat
15047 0.79 0.775908 2 4 1 2 5
In [ ]:



# xgboost¶

## linear¶

In [32]:
# apply xgboost
d_train = xgb.DMatrix(x_train, y_train)
d_valid = xgb.DMatrix(x_valid, y_valid_clm0)
d_test = xgb.DMatrix(x_valid)

# d_hold=xgb.DMatrix(x_hold)
def use_gini(a, b):
y = b.get_label()
return 'gini', Gini(y, a)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

In [48]:
xgb_params = {
#     'base_score': 0.5,
'colsample_bylevel': 0.6,
'colsample_bytree': 0.8,
#  'gamma': 0,
'learning_rate': 0.01,
#  'max_delta_step': 1,
'max_depth': 1,
'min_child_weight': 9,
'missing': None,

#  'objective': 'reg:linear',

'objective': 'binary:logistic',
'reg_alpha': 9,
'reg_lambda': 9,
'scale_pos_weight': 1,
'seed': 27,
'silent': 1,
'subsample': 0.9}

In [82]:
start=time()

clf = xgb.train(xgb_params, d_train, 502, watchlist, early_stopping_rounds=1111,
verbose_eval=50,
feval=use_gini,
maximize=True)

'!!! used %.2f s'%(time()-start)

[0]	train-gini:-0.186319	valid-gini:0.114123
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 1111 rounds.
[50]	train-gini:0.325454	valid-gini:0.191548
[100]	train-gini:0.35463	valid-gini:0.201859
[150]	train-gini:0.368934	valid-gini:0.215929
[200]	train-gini:0.373168	valid-gini:0.231201
[250]	train-gini:0.374377	valid-gini:0.23014
[300]	train-gini:0.379856	valid-gini:0.234736
[350]	train-gini:0.386277	valid-gini:0.234685
[400]	train-gini:0.391889	valid-gini:0.233804
[450]	train-gini:0.393826	valid-gini:0.231898
[500]	train-gini:0.401135	valid-gini:0.239523

Out[82]:
'!!! used 5.60 s'
In [83]:
clf.best_iteration,clf.best_ntree_limit,clf.best_score

Out[83]:
(501, 502, 0.240186)
In [84]:
pred=clf.predict(d_test)

In [85]:
save()


#### feature importance¶

In [96]:
feature_score=pd.Series(clf.get_fscore())
feature_score.sort()
feature_score.plot.barh(color='r',alpha=.3)

Out[96]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efc04a8f5d0>
In [ ]:


In [ ]:



## logistic¶

In [10]:
xgb_params3 = {
#     'base_score': 0.5,
'colsample_bylevel': 0.6,
#  'colsample_bytree': 0.8,
#  'gamma': 0,
'learning_rate': 0.0086,
'max_depth': 1,
'min_child_weight': .5,
#  'missing': None,

#  'objective': 'reg:linear',

'objective': 'binary:logistic',
'reg_alpha': 0.5,
'reg_lambda': 3,
'scale_pos_weight': .5,
'seed': 27,
'silent': 1,
'subsample': 0.54,
'eval_metric':'auc'}

In [98]:
start=time()

clf = xgb.train(xgb_params3, d_train, 1597, watchlist, early_stopping_rounds=666,
verbose_eval=50,
feval=use_gini,
maximize=True
)

'!!! used %.2f s'%(time()-start)

[0]	train-gini:-0.43203	valid-gini:0.077052
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 666 rounds.
[50]	train-gini:0.309446	valid-gini:0.192613
[100]	train-gini:0.351892	valid-gini:0.206371
[150]	train-gini:0.368609	valid-gini:0.215305
[200]	train-gini:0.365294	valid-gini:0.226315
[250]	train-gini:0.370303	valid-gini:0.229143
[300]	train-gini:0.36993	valid-gini:0.233933
[350]	train-gini:0.377201	valid-gini:0.23231
[400]	train-gini:0.381864	valid-gini:0.229641
[450]	train-gini:0.385241	valid-gini:0.230075
[500]	train-gini:0.391504	valid-gini:0.235022
[550]	train-gini:0.39584	valid-gini:0.23102
[600]	train-gini:0.399529	valid-gini:0.235704
[650]	train-gini:0.400685	valid-gini:0.23321
[700]	train-gini:0.402738	valid-gini:0.233753
[750]	train-gini:0.404563	valid-gini:0.233582
[800]	train-gini:0.405778	valid-gini:0.233925
[850]	train-gini:0.407957	valid-gini:0.236965
[900]	train-gini:0.410187	valid-gini:0.236511
[950]	train-gini:0.412109	valid-gini:0.237146
[1000]	train-gini:0.414372	valid-gini:0.237582
[1050]	train-gini:0.416637	valid-gini:0.238728
[1100]	train-gini:0.418962	valid-gini:0.239321
[1150]	train-gini:0.421577	valid-gini:0.239511
[1200]	train-gini:0.423235	valid-gini:0.239334
[1250]	train-gini:0.425139	valid-gini:0.23904
[1300]	train-gini:0.426691	valid-gini:0.238922
[1350]	train-gini:0.428596	valid-gini:0.238845
[1400]	train-gini:0.430435	valid-gini:0.239261
[1450]	train-gini:0.431998	valid-gini:0.239357
[1500]	train-gini:0.433579	valid-gini:0.239216
[1550]	train-gini:0.434974	valid-gini:0.239915

Out[98]:
'!!! used 19.15 s'
In [100]:
clf.best_iteration,clf.best_ntree_limit,clf.best_score

Out[100]:
(1596, 1597, 0.240364)
In [101]:
pred=clf.predict(d_test)

In [102]:
save()

In [ ]:


In [103]:
feature_score=pd.Series(clf.get_fscore())
feature_score.sort()
feature_score.plot.barh(color='g',alpha=.4)

Out[103]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efc04a14f10>

### feature convert to dummies might help feature selection¶

In [10]:
def make_dummies(data):
for i in ['veh_body', 'veh_age', 'gender', 'area', 'agecat']:
data[i]=data[i].astype('int')
data=pd.concat([data,pd.get_dummies(data[i])],axis=1)

a=['veh_body_'+str(i) for i in range(13) ]
b=['veh_age_'+str(i) for i in range(1,5) ]
c=['gender_'+str(i) for i in range(2) ]
d=['area_'+str(i) for i in range(6) ]
e=['agecat_'+str(i) for i in range(1,7) ]
data.pop('veh_body')
data.pop('veh_age')
data.pop('gender')
data.pop('area')
data.pop('agecat')
data.columns=['veh_value','exposure']+a+b+c+d+e
return data

In [11]:
x_train=make_dummies(x_train)
# x_valid=make_dummies(x_valid)

In [12]:
x_train.shape

Out[12]:
(45239, 33)
In [13]:
x_train.head(1)

Out[13]:
veh_value exposure veh_body_0 veh_body_1 veh_body_2 veh_body_3 veh_body_4 veh_body_5 veh_body_6 veh_body_7 ... area_2 area_3 area_4 area_5 agecat_1 agecat_2 agecat_3 agecat_4 agecat_5 agecat_6
0 6.43 0.241898 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0

1 rows × 33 columns

# use all data to train the model¶

In [ ]:


In [ ]:


In [14]:
# load data
print (train1.shape,train2.shape)
print (hold.shape)
combine=pd.concat([train1,train2])
train=combine.copy()

cat_vars = train.select_dtypes(['object']).columns

for var in cat_vars:
train[var] = pd.factorize(train[var])[0]
hold[var] = pd.factorize(hold[var])[0]

# preprocessing
y_train =train.pop('clm')
# y_claim_train=train.pop('claimcst0')
x_train = train.drop(['id','numclaims'], axis = 1)

y_hold = hold.pop('clm')
hold = hold.drop(['id','numclaims'], axis = 1)

y_train_clm0=x_train.pop('claimcst0')

((22610, 11), (22629, 11))
(22617, 10)

In [15]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(kind='regular')

x_train,y_train=sm.fit_sample(x_train,y_train)
x_train=pd.DataFrame(x_train)
y_train=pd.Series(y_train)
print x_train.shape, y_train.shape, y_train.sum()
x_train.columns=train.drop(['id','numclaims','claimcst0'], axis = 1).columns

(84310, 7) (84310,) 42155

In [16]:
x_train.head(1)

Out[16]:
veh_value exposure veh_body veh_age gender area agecat
0 6.43 0.241898 0.0 1.0 0.0 0.0 3.0

## make dummies¶

In [17]:
x_train=make_dummies(x_train)
hold=make_dummies(hold)

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [18]:
# apply xgboost
d_train = xgb.DMatrix(x_train, y_train)
d_valid = xgb.DMatrix(x_train, y_train)
d_hold = xgb.DMatrix(hold)

# d_hold=xgb.DMatrix(x_hold)
def use_gini(a, b):
y = b.get_label()
return 'gini', Gini(y, a)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

In [19]:
xgb_params = {
#     'base_score': 0.5,
'colsample_bylevel': 0.6,
'colsample_bytree': 0.8,
#  'gamma': 0,
'learning_rate': 0.01,
#  'max_delta_step': 1,
'max_depth': 1,
'min_child_weight': 9,
'missing': None,

#  'objective': 'reg:linear',

'objective': 'binary:logistic',
'reg_alpha': 9,
'reg_lambda': 9,
'scale_pos_weight': 1,
'seed': 27,
'silent': 1,
'subsample': 0.9}

In [20]:
# linear

start=time()

clf = xgb.train(xgb_params, d_train, int(502/0.75), watchlist, early_stopping_rounds=1111,
verbose_eval=50,
feval=use_gini,
maximize=True)

'!!! used %.2f s'%(time()-start)

[0]	train-gini:-0.127339	valid-gini:-0.127339
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 1111 rounds.
[50]	train-gini:0.223111	valid-gini:0.223111
[100]	train-gini:0.273884	valid-gini:0.273884
[150]	train-gini:0.353242	valid-gini:0.353242
[200]	train-gini:0.383093	valid-gini:0.383093
[250]	train-gini:0.377712	valid-gini:0.377712
[300]	train-gini:0.378535	valid-gini:0.378535
[350]	train-gini:0.379602	valid-gini:0.379602
[400]	train-gini:0.381008	valid-gini:0.381008
[450]	train-gini:0.38618	valid-gini:0.38618
[500]	train-gini:0.391242	valid-gini:0.391242
[550]	train-gini:0.393174	valid-gini:0.393174
[600]	train-gini:0.394663	valid-gini:0.394663
[650]	train-gini:0.396026	valid-gini:0.396026

Out[20]:
'!!! used 15.14 s'
In [21]:
clf.best_iteration,clf.best_ntree_limit,clf.best_score

Out[21]:
(665, 666, 0.396638)
In [22]:
pred=clf.predict(d_hold)

In [17]:
save2()## final prediction

In [21]:
feature_score=pd.Series(clf.get_fscore())
feature_score.sort()
feature_score.plot.barh(color='m',alpha=.4)

Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x7faf62e58890>

## optional drop features¶

In [25]:
from sklearn.metrics import confusion_matrix
judge=pd.Series(pred).apply(lambda x:1 if x>0.522029 else 0)

In [26]:
confusion_matrix(y_hold,judge)

Out[26]:
array([[10801, 10276],
[  440,  1100]])
In [27]:
pd.Series(pred).describe()

Out[27]:
count    22617.000000
mean         0.456469
std          0.157445
min          0.098007
25%          0.324611
50%          0.525218
75%          0.600220
max          0.645061
dtype: float64
In [28]:
(0.595391-0.522029)/5*8+0.522029

Out[28]:
0.6394082
In [29]:
(pd.Series(pred)>0.62).sum()

Out[29]:
758
In [30]:
y_hold.sum()

Out[30]:
1540
In [31]:
1540/22617.

Out[31]:
0.06809037449705974
In [32]:
xgb_params4 = {
#     'base_score': 0.5,
'colsample_bylevel': 0.6,
#  'colsample_bytree': 0.8,
#  'gamma': 0,
'learning_rate': 0.0086,
'max_depth': 1,
'min_child_weight': .5,
#  'missing': None,

#  'objective': 'reg:linear',

'objective': 'binary:logistic',
'reg_alpha': 0.5,
'reg_lambda': 3,
'scale_pos_weight': .5,
'seed': 27,
'silent': 1,
'subsample': 0.54,
'eval_metric':'auc'}

In [34]:
start=time()

clf = xgb.train(xgb_params4, d_train, 500, watchlist, early_stopping_rounds=666,
verbose_eval=50,
feval=use_gini,
maximize=True
)

'!!! used %.2f s'%(time()-start)

[0]	train-gini:-0.127759	valid-gini:-0.127759
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 666 rounds.
[50]	train-gini:0.252112	valid-gini:0.252112
[100]	train-gini:0.307819	valid-gini:0.307819
[150]	train-gini:0.368028	valid-gini:0.368028
[200]	train-gini:0.371885	valid-gini:0.371885
[250]	train-gini:0.371882	valid-gini:0.371882
[300]	train-gini:0.378054	valid-gini:0.378054
[350]	train-gini:0.375776	valid-gini:0.375776
[400]	train-gini:0.380253	valid-gini:0.380253
[450]	train-gini:0.386351	valid-gini:0.386351

Out[34]:
'!!! used 11.42 s'
In [35]:
clf.best_iteration,clf.best_ntree_limit,clf.best_score

Out[35]:
(494, 495, 0.391914)
In [37]:
pred=clf.predict(d_hold)

In [38]:
result=pd.Series(pred)

In [40]:
judge=pd.Series(pred).apply(lambda x:1 if x>0.494940 else 0)

In [41]:
confusion_matrix(y_hold,judge)

Out[41]:
array([[21077,     0],
[ 1540,     0]])
In [42]:
result.describe()

Out[42]:
count    22617.000000
mean         0.315732
std          0.116792
min          0.081827
25%          0.199839
50%          0.369745
75%          0.422930
max          0.453982
dtype: float64

