In [92]:
import numpy as np
import pandas as pd
from IPython.display import display
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
In [2]:
# Load data
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")
submission_data = pd.read_csv('./data/sample_submission.csv')
In [4]:
train_data.describe()
Out[4]:
In [5]:
test_data.describe()
Out[5]:
In [6]:
train_data.head()
Out[6]:
In [9]:
train_data.isnull().values.any()
# No missing values in the training data
Out[9]:
In [10]:
test_data.isnull().values.any()
Out[10]:
In [3]:
# Since there are so many features, let's use XGBoost's feature importances to select a subset
X = train_data.iloc[:, 2:].values
y = train_data['target'].values
In [4]:
print(X.shape)
print(y.shape)
In [46]:
plt.plot(y)
plt.gcf().set_size_inches(12, 2)
plt.show()
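Since the evaluation metric is logarithmic, a histogram of log1p(target) is often more informative than the raw trace; a quick sketch:
In [ ]:
# Sketch: look at the target distribution on a log scale, since the metric is RMSLE
plt.hist(np.log1p(y), bins=50)
plt.xlabel('log1p(target)')
plt.ylabel('count')
plt.show()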
In [14]:
model = XGBRegressor()
model.fit(X, y)
Out[14]:
In [90]:
def RMSLE(y_predict, y_test):
    """Evaluation metric: Root Mean Squared Logarithmic Error (RMSLE)."""
    return np.sqrt(np.nanmean((np.log(y_predict + 1) - np.log(y_test + 1))**2))
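As a sanity check of the metric, it can be compared against the square root of scikit-learn's mean_squared_log_error on a small toy example (a sketch; the arrays here are made up):
In [ ]:
# Sketch: the custom RMSLE should agree with sqrt(mean_squared_log_error) on toy data
from sklearn.metrics import mean_squared_log_error
y_true_demo = np.array([1.0, 10.0, 100.0])
y_pred_demo = np.array([2.0, 8.0, 120.0])
print(RMSLE(y_pred_demo, y_true_demo))
print(np.sqrt(mean_squared_log_error(y_true_demo, y_pred_demo)))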
In [16]:
y_predict = model.predict(X)
print(RMSLE(y_predict, y))
In [17]:
def extract_important_features(feature_importances, tol=1E-6):
    # Filter out features that have minimal importance
    important_feature_ind = np.where(feature_importances > tol)[0]
    important_features = feature_importances[important_feature_ind]
    sorted_important_feature_ind = np.array([x for _, x in sorted(
        zip(important_features, important_feature_ind))])[::-1]
    return sorted_important_feature_ind
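An equivalent way to get essentially the same ordering (up to ties) is to argsort the importances directly; a sketch, not used below:
In [ ]:
# Sketch: indices of features with importance above tol, sorted by descending importance
def extract_important_features_alt(feature_importances, tol=1E-6):
    order = np.argsort(feature_importances)[::-1]
    return order[feature_importances[order] > tol]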
In [18]:
sorted_important_feature_ind = extract_important_features(model.feature_importances_)
In [19]:
len(sorted_important_feature_ind)
Out[19]:
In [20]:
model.feature_importances_[sorted_important_feature_ind]
Out[20]:
Let's start with a simple XGBoost model and a RandomForest using only the important features
In [21]:
model_xgb = XGBRegressor()
for rs in range(30, 40):
X_train, X_test, y_train, y_test = train_test_split(X[:, sorted_important_feature_ind],
y, test_size=0.2, random_state=rs)
model_xgb.fit(X_train, y_train)
y_predict = model_xgb.predict(X_test)
print(RMSLE(y_predict, y_test))
In [22]:
# XGBoost with multiple iterations
num_iter = 10
index = sorted_important_feature_ind
models_xgboost = [[]]*num_iter
for i in range(num_iter):
models_xgboost[i] = XGBRegressor()
rmsle_val = []
for rs in range(30, 50):
X_train, X_test, y_train, y_test = train_test_split(X[:, index],
y, test_size=0.2, random_state=rs)
models_xgboost[i].fit(X_train, y_train)
y_predict = models_xgboost[i].predict(X_test)
rmsle_val.append(RMSLE(y_predict, y_test))
print(i, ':', np.mean(rmsle_val), ', with ', len(index), 'features')
# Look at important features leftover
temp_index = extract_important_features(models_xgboost[i].feature_importances_)
index = index[temp_index] # iteratively updating the features leftover
print(np.array(index))
In [24]:
np.savez('./data/good_feature_indices.npz', index=index)
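The saved indices can be reloaded in a later session (assuming the same path):
In [ ]:
# Sketch: reload the selected feature indices saved above
index = np.load('./data/good_feature_indices.npz')['index']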
In [49]:
# Sanity check: every selected feature has a standard deviation above 10 (non-trivial variance)
all(X[:, index].std(axis=0) > 10)
Out[49]:
In [102]:
# Properly train the XGBoost
import xgboost as xgb
X_train, X_test, y_train, y_test = train_test_split(X[:, index],
y, test_size=0.2, random_state=rs)
params = {'objective': 'reg:linear',
'eval_metric': 'rmse',
'eta': 0.005,
'max_depth': 15,
'subsample': 0.7,
'colsample_bytree': 0.5,
'alpha':0,
'random_state': 42,
'silent': True}
xgtrain = xgb.DMatrix(X_train, y_train)
xgvals = xgb.DMatrix(X_test, y_test)
watchlist = [(xgtrain, 'train'), (xgvals, 'valid')]
model_xgb = xgb.train(params, xgtrain, 5000, watchlist, maximize=False,
early_stopping_rounds=30, verbose_eval=100)
y_predict = model_xgb.predict(xgb.DMatrix(X_test), ntree_limit=model_xgb.best_ntree_limit)
print(RMSLE(y_predict, y_test))
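A single train/validation split can be optimistic here; a sketch of cross-validating the same parameters with xgb.cv for a less split-dependent RMSE estimate:
In [ ]:
# Sketch: 5-fold CV with the same parameters; gives a less split-dependent RMSE estimate
cv_results = xgb.cv(params, xgb.DMatrix(X[:, index], y), num_boost_round=5000,
                    nfold=5, early_stopping_rounds=30, verbose_eval=200, seed=42)
print(cv_results['test-rmse-mean'].min())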
In [25]:
# RandomForest
from sklearn.ensemble import RandomForestRegressor
model_rf = RandomForestRegressor()
for rs in range(30, 50):
X_train, X_test, y_train, y_test = train_test_split(X[:, index],
y, test_size=0.2, random_state=rs)
model_rf.fit(X_train, y_train)
y_predict = model_rf.predict(X_test)
print(RMSLE(y_predict, y_test))
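The default RandomForestRegressor here is quite small; a sketch with more trees (hypothetical settings) usually gives a lower and more stable RMSLE:
In [ ]:
# Sketch: a larger random forest (hypothetical settings) for comparison
model_rf_big = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=42)
model_rf_big.fit(X_train, y_train)
print(RMSLE(model_rf_big.predict(X_test), y_test))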
In [97]:
# Light GBM
import lightgbm as lgbm
from lightgbm import LGBMRegressor
X_train, X_test, y_train, y_test = train_test_split(X[:, index],
y, test_size=0.2, random_state=rs)
params = {
"objective" : "regression",
"metric" : "rmse",
"num_leaves" : 35,
"learning_rate" : 0.005,
"bagging_fraction" : 0.7,
"feature_fraction" : 0.5,
"bagging_frequency" : 5,
"bagging_seed" : 42,
"verbosity" : -1,
"random_seed": 42
}
lgtrain = lgbm.Dataset(X_train, label=y_train)
lgvals = lgbm.Dataset(X_test, label=y_test)
model_lgbm = lgbm.train(params, lgtrain, 5000, valid_sets=[lgvals],
early_stopping_rounds=100, verbose_eval=50, evals_result={})
y_predict = model_lgbm.predict(X_test, num_iteration=model_lgbm.best_iteration)
print(RMSLE(y_predict, y_test))
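To see which of the selected features the booster actually relies on, LightGBM's built-in importance plot can be used (a sketch):
In [ ]:
# Sketch: plot the top features used by the trained LightGBM model
lgbm.plot_importance(model_lgbm, max_num_features=20)
plt.show()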
In [124]:
def combine_models(model_lgbm, model_xgb, model_rf, X_submit):
y_predict_lgbm = model_lgbm.predict(X_submit, num_iteration=model_lgbm.best_iteration)
y_predict_xgb = model_xgb.predict(xgb.DMatrix(X_submit), ntree_limit=model_xgb.best_ntree_limit)
y_predict_rnf = model_rf.predict(X_submit)
y_predict = np.c_[y_predict_lgbm, y_predict_xgb, y_predict_rnf]
y_predict_mean = np.average(y_predict, axis=1, weights=
[1./1.8045660073299785,1./1.8350776254166632, 1./1.8052003311433578])
return y_predict_mean
y_predict_all = combine_models(model_lgbm, model_xgb, model_rf, X_test)
y_predict_all[y_predict_all<0] = np.nanmean(y_predict_all[y_predict_all>0])
print(RMSLE(y_predict_all, y_test))
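The ensemble weights above are hard-coded inverse validation RMSLEs; a sketch of deriving them from the current validation scores instead:
In [ ]:
# Sketch: derive the ensemble weights from the current validation RMSLEs instead of hard-coding them
rmsle_lgbm = RMSLE(model_lgbm.predict(X_test, num_iteration=model_lgbm.best_iteration), y_test)
rmsle_xgb = RMSLE(model_xgb.predict(xgb.DMatrix(X_test), ntree_limit=model_xgb.best_ntree_limit), y_test)
rmsle_rf = RMSLE(model_rf.predict(X_test), y_test)
weights = 1.0 / np.array([rmsle_lgbm, rmsle_xgb, rmsle_rf])
print(weights / weights.sum())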
In [120]:
print(np.where(y_predict_all<0)[0])
print(y_predict_all[153])
First round of submission: Take a weighted average of the predictions
In [114]:
X_submit = test_data.iloc[:, 1:].iloc[:, index].values
y_predict_submit = combine_models(model_lgbm, model_xgb, model_rf, X_submit)
In [125]:
# Quick and dirty: replace negative predictions with the mean of the positive ones for submission purposes.
# The model still needs further improvement.
y_predict_submit[y_predict_submit<0] = np.nanmean(y_predict_submit[y_predict_submit>0])
submission_data['target'] = y_predict_submit
submission_data.to_csv('./data/first_submission.csv', index=False)
Simple MLP
In [52]:
X_train, X_test, y_train, y_test = train_test_split(X[:, index], y, test_size=0.2, random_state=42)
In [53]:
import tensorflow as tf
def RMSLE_tf(y_true, y_pred):
return tf.sqrt(tf.reduce_mean((tf.log(y_pred+1) - tf.log(y_true+1))**2))
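tf.log is undefined for negative inputs, and an untrained network can easily produce negative predictions; a sketch of a guarded loss that clips predictions at zero first:
In [ ]:
# Sketch: clip predictions at zero before the log so the loss never sees a negative argument
def RMSLE_tf_safe(y_true, y_pred):
    y_pred = tf.maximum(y_pred, 0.0)
    return tf.sqrt(tf.reduce_mean((tf.log(y_pred + 1) - tf.log(y_true + 1))**2))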
In [ ]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l2
model = Sequential([
    Dense(32, input_shape=(152,), kernel_initializer="random_uniform"),  # input layer
    BatchNormalization(),  # batch normalization
    Activation('relu'),
    Dense(100, kernel_initializer="random_uniform", kernel_regularizer=l2(0.01),
          activity_regularizer=l2(0.01)),  # hidden layer 1 with 100 neurons
    BatchNormalization(),
    Activation('relu'),
    Dense(300, kernel_initializer="random_uniform", kernel_regularizer=l2(0.01),
          activity_regularizer=l2(0.01)),  # hidden layer 2 with 300 neurons
    BatchNormalization(),
    Activation('relu'),
    Dense(100, kernel_initializer="random_uniform", kernel_regularizer=l2(0.01),
          activity_regularizer=l2(0.01)),  # hidden layer 3 with 100 neurons
    BatchNormalization(),
    Activation('relu'),
    Dense(1, kernel_initializer="random_uniform"),  # output layer with 1 neuron
])
model.compile(optimizer='adam', loss=RMSLE_tf)
model.fit(X_train, y_train, epochs=500, batch_size=50)
model.evaluate(X_test, y_test, batch_size=10) # [loss]
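The raw features span very different scales, so the MLP will usually train more stably on standardized inputs; a sketch with StandardScaler (not part of the original pipeline):
In [ ]:
# Sketch: standardize the inputs before training the MLP
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
model.fit(X_train_scaled, y_train, epochs=500, batch_size=50)
model.evaluate(X_test_scaled, y_test, batch_size=10)  # [loss]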
In [ ]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
"""Based on https://www.kaggle.com/alexpengxiao/preprocessing-model-averaging-by-xgb-lgb-1-39"""
def __init__(self, models, weights=None):
self.models = models
        self.weights = weights
    # We define clones of the original models to fit the data on;
    # cloning avoids modifying the original base models.
def fit(self, X, y):
self.models_ = [clone(x) for x in self.models]
# Train cloned base models
for model in self.models_:
model.fit(X, y)
return self
    # Now we make predictions with the cloned models and average them
def predict(self, X):
predictions = np.column_stack([ model.predict(X) for model in self.models_ ])
if self.weights is None:
return np.mean(predictions, axis=1)
else:
return np.average(predictions, axis=1, weights=self.weights)
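A usage sketch of the class with the two sklearn-style regressors from above (the weights here are illustrative):
In [ ]:
# Sketch: average an XGBRegressor and a RandomForestRegressor with illustrative weights
averaged = AveragingModels(models=[XGBRegressor(), RandomForestRegressor()], weights=[0.6, 0.4])
averaged.fit(X_train, y_train)
print(RMSLE(averaged.predict(X_test), y_test))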