Santander Value Prediction

 
Santander Value Prediction
In [92]:
import numpy as np
import pandas as pd
from IPython.display import display
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
In [2]:
# Load data
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")
submission_data = pd.read_csv('./data/sample_submission.csv')
In [4]:
train_data.describe()
Out[4]:
target 48df886f9 0deb4b6a8 34b15f335 a8cb14b00 2f0771a37 30347e683 d08d1fbe3 6ee66e115 20aa07010 ... 3ecc09859 9281abeea 8675bec0b 3a13ed79a f677d4d13 71b203550 137efaa80 fb36b89d9 7e293fbaf 9fc776466
count 4.459000e+03 4.459000e+03 4.459000e+03 4.459000e+03 4.459000e+03 4.459000e+03 4.459000e+03 4.459000e+03 4.459000e+03 4.459000e+03 ... 4.459000e+03 4.459000e+03 4.459000e+03 4.459000e+03 4459.000000 4.459000e+03 4.459000e+03 4.459000e+03 4.459000e+03 4.459000e+03
mean 5.944923e+06 1.465493e+04 1.390895e+03 2.672245e+04 4.530164e+03 2.640996e+04 3.070811e+04 1.686522e+04 4.669208e+03 2.569407e+06 ... 4.676057e+05 4.446239e+05 8.056219e+05 7.812966e+05 143.529939 1.213809e+05 3.573451e+04 3.123741e+05 9.219960e+04 2.279100e+05
std 8.234312e+06 3.893298e+05 6.428302e+04 5.699652e+05 2.359124e+05 1.514730e+06 5.770590e+05 7.512756e+05 1.879449e+05 9.610183e+06 ... 4.068038e+06 4.428889e+06 4.513246e+06 6.839451e+06 9584.318507 4.720709e+06 1.614622e+06 4.318501e+06 1.635993e+06 1.811139e+06
min 3.000000e+04 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 6.000000e+05 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
50% 2.260000e+06 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
75% 8.000000e+06 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 6.000000e+05 ... 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
max 4.000000e+07 2.000000e+07 4.000000e+06 2.000000e+07 1.480000e+07 1.000000e+08 2.070800e+07 4.000000e+07 1.040000e+07 3.196120e+08 ... 7.600000e+07 1.235880e+08 1.300000e+08 1.444000e+08 640000.000000 3.013120e+08 1.064200e+08 1.400000e+08 6.176800e+07 4.320000e+07

8 rows × 4992 columns

In [5]:
test_data.describe()
Out[5]:
48df886f9 0deb4b6a8 34b15f335 a8cb14b00 2f0771a37 30347e683 d08d1fbe3 6ee66e115 20aa07010 dc5a8f1d8 ... 3ecc09859 9281abeea 8675bec0b 3a13ed79a f677d4d13 71b203550 137efaa80 fb36b89d9 7e293fbaf 9fc776466
count 4.934200e+04 4.934200e+04 4.934200e+04 4.934200e+04 4.934200e+04 4.934200e+04 4.934200e+04 4.934200e+04 4.934200e+04 4.934200e+04 ... 4.934200e+04 4.934200e+04 4.934200e+04 4.934200e+04 4.934200e+04 4.934200e+04 4.934200e+04 4.934200e+04 4.934200e+04 4.934200e+04
mean 5.773787e+04 6.258726e+04 1.036752e+05 6.289853e+04 6.713354e+04 8.083879e+04 6.181014e+04 5.515752e+04 1.406324e+06 8.128668e+04 ... 1.193910e+05 1.355955e+05 3.242217e+05 1.437856e+05 9.302367e+04 8.047145e+04 6.076865e+04 1.323210e+05 1.675766e+05 1.282487e+05
std 1.745182e+06 2.322787e+06 2.586951e+06 2.765941e+06 3.206124e+06 2.845031e+06 2.780137e+06 1.923517e+06 6.872366e+06 2.378938e+06 ... 3.115190e+06 2.598454e+06 3.782996e+06 3.663374e+06 5.041000e+06 2.100210e+06 2.040655e+06 3.592018e+06 3.761816e+06 2.413798e+06
min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
50% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
75% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
max 1.504447e+08 2.283295e+08 2.758171e+08 3.972621e+08 4.667591e+08 2.852223e+08 4.863751e+08 2.043290e+08 3.435658e+08 2.310167e+08 ... 5.351692e+08 1.236547e+08 3.793398e+08 4.025480e+08 9.657530e+08 1.680065e+08 2.497913e+08 3.200000e+08 3.186300e+08 2.189782e+08

8 rows × 4991 columns

In [6]:
train_data.head()
Out[6]:
ID target 48df886f9 0deb4b6a8 34b15f335 a8cb14b00 2f0771a37 30347e683 d08d1fbe3 6ee66e115 ... 3ecc09859 9281abeea 8675bec0b 3a13ed79a f677d4d13 71b203550 137efaa80 fb36b89d9 7e293fbaf 9fc776466
0 000d6aaf2 38000000.0 0.0 0 0.0 0 0 0 0 0 ... 0.0 0.0 0.0 0 0 0 0 0 0 0
1 000fbd867 600000.0 0.0 0 0.0 0 0 0 0 0 ... 0.0 0.0 0.0 0 0 0 0 0 0 0
2 0027d6b71 10000000.0 0.0 0 0.0 0 0 0 0 0 ... 0.0 0.0 0.0 0 0 0 0 0 0 0
3 0028cbf45 2000000.0 0.0 0 0.0 0 0 0 0 0 ... 0.0 0.0 0.0 0 0 0 0 0 0 0
4 002a68644 14400000.0 0.0 0 0.0 0 0 0 0 0 ... 0.0 0.0 0.0 0 0 0 0 0 0 0

5 rows × 4993 columns

In [9]:
any(np.isnan(train_data.isnull().sum().values))

# No missing values in the training data
Out[9]:
False
In [10]:
any(np.isnan(test_data.isnull().sum().values))
Out[10]:
False
In [3]:
# Since there are so many parameters, let's use XGBoost to select features
X = train_data.iloc[:, 2:].values
y = train_data['target'].values
In [4]:
print(X.shape)
print(y.shape)
(4459, 4991)
(4459,)
In [46]:
plt.plot(y)
plt.gcf().set_size_inches(12, 2)
plt.show()
In [14]:
model = XGBRegressor()
model.fit(X, y)
Out[14]:
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
In [90]:
def RMSLE(y_predict, y_test):
    """Evaluation metric Root Mean Squared Logarithmic Error"""
    return np.sqrt(np.nanmean((np.log(y_predict+1) - np.log(y_test+1))**2))
In [16]:
y_predict = model.predict(X)
print(RMSLE(y_predict, y))
1.8393296647239583
In [17]:
def extract_important_features(feature_importances, tol=1E-6):
    # filter out features that has minimal importance
    important_feature_ind = np.where(feature_importances > 1E-6)[0]
    important_features = model.feature_importances_[important_feature_ind]
    sorted_important_feature_ind = np.array([x for _,x in sorted(
                    zip(important_features,important_feature_ind))])[::-1]
    return sorted_important_feature_ind
In [18]:
sorted_important_feature_ind = extract_important_features(model.feature_importances_)
In [19]:
len(sorted_important_feature_ind)
Out[19]:
272
In [20]:
model.feature_importances_[sorted_important_feature_ind]
Out[20]:
array([0.03448276, 0.02758621, 0.01896552, 0.01896552, 0.01724138,
       0.0137931 , 0.0137931 , 0.01206897, 0.01206897, 0.01206897,
       0.01034483, 0.01034483, 0.01034483, 0.01034483, 0.00862069,
       0.00862069, 0.00862069, 0.00862069, 0.00862069, 0.00862069,
       0.00862069, 0.00862069, 0.00689655, 0.00689655, 0.00689655,
       0.00689655, 0.00689655, 0.00689655, 0.00689655, 0.00689655,
       0.00689655, 0.00689655, 0.00689655, 0.00689655, 0.00689655,
       0.00689655, 0.00517241, 0.00517241, 0.00517241, 0.00517241,
       0.00517241, 0.00517241, 0.00517241, 0.00517241, 0.00517241,
       0.00517241, 0.00517241, 0.00517241, 0.00517241, 0.00517241,
       0.00517241, 0.00517241, 0.00517241, 0.00517241, 0.00517241,
       0.00517241, 0.00517241, 0.00517241, 0.00517241, 0.00517241,
       0.00517241, 0.00517241, 0.00517241, 0.00517241, 0.00517241,
       0.00517241, 0.00517241, 0.00517241, 0.00344828, 0.00344828,
       0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
       0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
       0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
       0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
       0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
       0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
       0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
       0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
       0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
       0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
       0.00344828, 0.00344828, 0.00344828, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414, 0.00172414, 0.00172414, 0.00172414,
       0.00172414, 0.00172414], dtype=float32)

Let's start with simple XGBoost and RandomForest using the important features

In [21]:
model_xgb = XGBRegressor()
for rs in range(30, 40):
    X_train, X_test, y_train, y_test = train_test_split(X[:, sorted_important_feature_ind],
                                                        y, test_size=0.2, random_state=rs)
    model_xgb.fit(X_train, y_train)
    y_predict = model_xgb.predict(X_test)
    print(RMSLE(y_predict, y_test))
1.8809839111057105
1.8483280111644638
1.881242016749642
1.9396014034429652
1.9274425890035274
1.8658012261166985
1.8866572076949215
1.8630890970897818
1.8624094329275218
1.8569099930555
In [22]:
# XGBoost with multiple iterations
num_iter = 10
index = sorted_important_feature_ind
models_xgboost = [[]]*num_iter
for i in range(num_iter):
    models_xgboost[i] = XGBRegressor()
    rmsle_val = []
    for rs in range(30, 50):
        X_train, X_test, y_train, y_test = train_test_split(X[:, index],
                                                            y, test_size=0.2, random_state=rs)
        models_xgboost[i].fit(X_train, y_train)
        y_predict = models_xgboost[i].predict(X_test)
        rmsle_val.append(RMSLE(y_predict, y_test))
    print(i, ':', np.mean(rmsle_val), ', with ', len(index), 'features')
    # Look at important features leftover
    temp_index = extract_important_features(models_xgboost[i].feature_importances_)
    index = index[temp_index] # iteratively updating the features leftover
print(np.array(index))
0 : 1.879653783075668 , with  272 features
1 : 1.8738600296682244 , with  158 features
2 : 1.8742622487559337 , with  153 features
3 : 1.8739001845143464 , with  153 features
4 : 1.87413516508366 , with  153 features
5 : 1.8738438205473869 , with  153 features
6 : 1.874271821477142 , with  153 features
7 : 1.8739384327796984 , with  153 features
8 : 1.8743543756941254 , with  152 features
9 : 1.8739778846146096 , with  152 features
[4881  733 3927  808 4872  401 1791  916 1140 4949 1675 4855 3686 3890
 4434 3771 4660 4528 2221 2232 3311 3702 4981   31 4066 3660 4554 1427
 3711 2721 2364 2656 4802  925 1948  898  168 2882 1530 3706 3672  964
 4243 3217 1097 2193  493  878 3088 4916 1358 2185 2875 3979  920  562
 4204 2500 1057 1142 4408 3836 4167 4101 3866 2424 1556 1447  693  303
  118 4696 4581 3826 3742 3413 3216 2719 2695 2477 2237 1729 1325  655
 4939 4758 4629 4494 4330 4268 4228 4021 3992 3928 3856 3735 3553 3514
 3378 3189 2984 2662 2552 2292 2161 1605 1486  435   17 4720 3800 1052
  108 4808  594 1935 3664 4274 4414  572 2995  908  355 3508 4098 1564
 1781  658  550  602 1224 1942 2835 3939   53 2598  306  705 3994 4507
    8 1571 4358 3570 3724 1653 2755 3592 3323 1614 1742 1044]
In [24]:
np.savez('./data/good_feature_indices.npz', index=index)
In [49]:
all(X[:, index].std(axis=0)>10)
Out[49]:
True
In [102]:
# Properly train the XGBoost
import xgboost as xgb
X_train, X_test, y_train, y_test = train_test_split(X[:, index],
                                                        y, test_size=0.2, random_state=rs)
params = {'objective': 'reg:linear',
          'eval_metric': 'rmse',
          'eta': 0.005,
          'max_depth': 15,
          'subsample': 0.7,
          'colsample_bytree': 0.5,
          'alpha':0,
          'random_state': 42,
          'silent': True}
xgtrain = xgb.DMatrix(X_train, y_train)
xgvals = xgb.DMatrix(X_test, y_test)
watchlist = [(xgtrain, 'train'), (xgvals, 'valid')]
model_xgb = xgb.train(params, xgtrain, 5000, watchlist, maximize=False,
                early_stopping_rounds=30, verbose_eval=100)
y_predict = model_xgb.predict(xgb.DMatrix(X_test), ntree_limit=model_xgb.best_ntree_limit)
print(RMSLE(y_predict, y_test))
[0]	train-rmse:1.02196e+07	valid-rmse:9.74635e+06
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 30 rounds.
[100]	train-rmse:7.91576e+06	valid-rmse:8.17766e+06
[200]	train-rmse:6.56827e+06	valid-rmse:7.44525e+06
[300]	train-rmse:5.74311e+06	valid-rmse:7.12225e+06
[400]	train-rmse:5.19471e+06	valid-rmse:6.98255e+06
[500]	train-rmse:4.8035e+06	valid-rmse:6.91851e+06
[600]	train-rmse:4.51141e+06	valid-rmse:6.89654e+06
[700]	train-rmse:4.27312e+06	valid-rmse:6.87879e+06
[800]	train-rmse:4.08279e+06	valid-rmse:6.86502e+06
[900]	train-rmse:3.92134e+06	valid-rmse:6.85755e+06
[1000]	train-rmse:3.78599e+06	valid-rmse:6.85299e+06
Stopping. Best iteration:
[1001]	train-rmse:3.78421e+06	valid-rmse:6.85291e+06

1.8350776254166632
In [25]:
# RandomForest
from sklearn.ensemble import RandomForestRegressor
model_rf = RandomForestRegressor()
for rs in range(30, 50):
    X_train, X_test, y_train, y_test = train_test_split(X[:, index],
                                                        y, test_size=0.2, random_state=rs)
    model_rf.fit(X_train, y_train)
    y_predict = model_rf.predict(X_test)
    print(RMSLE(y_predict, y_test))
1.7094329194641205
1.7522741420919987
1.6933996598001222
1.6998235178998526
1.7675607282881904
1.722015348543114
1.7097789463192337
1.6880881139813322
1.7214386789183165
1.7032671527531487
1.7342282381291443
1.6786444480156406
1.729162794140566
1.773401796259706
1.6631786944228732
1.6429815660539353
1.6843935172942497
1.6820915907398473
1.7529989029777104
1.8052003311433578
In [97]:
# Light GBM
import lightgbm as lgbm
from lightgbm import LGBMRegressor

X_train, X_test, y_train, y_test = train_test_split(X[:, index],
                                                        y, test_size=0.2, random_state=rs)
params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 35,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        "bagging_frequency" : 5,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "random_seed": 42
    }
lgtrain = lgb.Dataset(X_train, label=y_train)
lgvals = lgb.Dataset(X_test, label=y_test)
model_lgbm = lgbm.train(params, lgtrain, 5000, valid_sets=[lgvals],
                early_stopping_rounds=100, verbose_eval=50, evals_result={})
y_predict = model_lgbm.predict(X_test, num_iteration=model_lgbm.best_iteration)
print(RMSLE(y_predict, y_test))
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmse: 7.58464e+06
[100]	valid_0's rmse: 7.38148e+06
[150]	valid_0's rmse: 7.22995e+06
[200]	valid_0's rmse: 7.12488e+06
[250]	valid_0's rmse: 7.04591e+06
[300]	valid_0's rmse: 6.99291e+06
[350]	valid_0's rmse: 6.95134e+06
[400]	valid_0's rmse: 6.91854e+06
[450]	valid_0's rmse: 6.89377e+06
[500]	valid_0's rmse: 6.87643e+06
[550]	valid_0's rmse: 6.86276e+06
[600]	valid_0's rmse: 6.84935e+06
[650]	valid_0's rmse: 6.83984e+06
[700]	valid_0's rmse: 6.83347e+06
[750]	valid_0's rmse: 6.82647e+06
[800]	valid_0's rmse: 6.82324e+06
[850]	valid_0's rmse: 6.82004e+06
[900]	valid_0's rmse: 6.81528e+06
[950]	valid_0's rmse: 6.8138e+06
[1000]	valid_0's rmse: 6.81035e+06
[1050]	valid_0's rmse: 6.80922e+06
[1100]	valid_0's rmse: 6.80835e+06
[1150]	valid_0's rmse: 6.80727e+06
[1200]	valid_0's rmse: 6.80699e+06
[1250]	valid_0's rmse: 6.80589e+06
[1300]	valid_0's rmse: 6.80644e+06
[1350]	valid_0's rmse: 6.8065e+06
Early stopping, best iteration is:
[1279]	valid_0's rmse: 6.80535e+06
1.8045660073299785
In [124]:
def combine_models(model_lgbm, model_xgb, model_rf, X_submit):
    y_predict_lgbm = model_lgbm.predict(X_submit, num_iteration=model_lgbm.best_iteration)
    y_predict_xgb =  model_xgb.predict(xgb.DMatrix(X_submit), ntree_limit=model_xgb.best_ntree_limit)
    y_predict_rnf = model_rf.predict(X_submit)
    y_predict = np.c_[y_predict_lgbm, y_predict_xgb, y_predict_rnf]
    y_predict_mean = np.average(y_predict, axis=1, weights=
                            [1./1.8045660073299785,1./1.8350776254166632, 1./1.8052003311433578])

    return y_predict_mean

y_predict_all = combine_models(model_lgbm, model_xgb, model_rf, X_test)
y_predict_all[y_predict_all<0] = np.nanmean(y_predict_all[y_predict_all>0])
print(RMSLE(y_predict_all, y_test))
1.8228209376354063
In [120]:
print(np.where(y_predict_all<0)[0])
print(y_predict_all[153])
[153 167 247 459 623 678]
-43615.47158482677

First round of submission: Take a weighted average of the predictions

In [114]:
X_submit = test_data.iloc[:, 1:].iloc[:, index].values
y_predict_submit = combine_models(model_lgbm, model_xgb, model_rf, X_submit)
In [125]:
# Quick and dirty, get rid of the predicted negative values for submission purpose. 
# Need to further improve model
y_predict_submit[y_predict_submit<0] = np.nanmean(y_predict_submit[y_predict_submit>0])
submission_data['target'] = y_predict_submit
submission_data.to_csv('./data/first_submission.csv', index=False)

Simple MLP

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X[:, index], y, test_size=0.2, random_state=42)
In [53]:
import tensorflow as tf
def RMSLE_tf(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean((tf.log(y_pred+1) - tf.log(y_true+1))**2))
In [ ]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l2

model = Sequential([
    Dense(32, input_shape=(152,), kernel_initializer="random_uniform"), # input layer
    BatchNormalization(), # batch normalization
    Activation('relu'),
    Dense(100, kernel_initializer="random_uniform", W_regularizer=l2(0.01),
         activity_regularizer=l2(0.01)), # hidden layer 1 with 100 neurons
    BatchNormalization(),
    Activation('relu'),
    Dense(300, kernel_initializer="random_uniform", W_regularizer=l2(0.01),
         activity_regularizer=l2(0.01)), # hidden layer 2 with 300 neurons
    BatchNormalization(),
    Activation('relu'),
    Dense(100, kernel_initializer="random_uniform", W_regularizer=l2(0.01),
         activity_regularizer=l2(0.01)), # hidden layer 3 with 100 neurons
    BatchNormalization(),
    Activation('relu'),
    Dense(1, kernel_initializer="random_uniform"), # output layer with 1 neurons
])

model.compile(optimizer='adam', loss=RMSLE_tf)

model.fit(X_train, y_train, epochs=500, batch_size=50)

model.evaluate(X_test, y_test, batch_size=10) # [loss]
In [ ]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Based on https://www.kaggle.com/alexpengxiao/preprocessing-model-averaging-by-xgb-lgb-1-39"""
    def __init__(self, models, weights=None):
        self.models = models
        self.weigths = weights

    # we define clones of the original models to fit the data in
    # the reason of clone is avoiding affect the original base models
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]  
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)
        return self

    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([ model.predict(X) for model in self.models_ ])
        if self.weights is None:
            return np.mean(predictions, axis=1)
        else:
            return np.average(predictions, axis=1, weights=self.weights)