In [1]:
# Do some imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from IPython.display import display
import datetime
import sys
sys.path.append("D:/Edward/Documents/Assignments/Scripts/Python/generic")
from plots import SetFont # helps us fix some font problem for displaying Chinese characters
In [64]:
excelsheet_X_train = "./data/X序列_train.xls"
excelsheet_y_train = "./data/Y序列_train.xls"
# Training data
X_train_original = pd.read_excel(excelsheet_X_train, "Sheet1", skiprows=[1])
y1_train = pd.read_excel(excelsheet_y_train, "Y1", header=None)
y2_train = pd.read_excel(excelsheet_y_train, "Y2", header=None)
y3_train = pd.read_excel(excelsheet_y_train, "Y3", header=None)
# Test data
excelsheet_X_train = "./data/X序列_test.xls"
X_test_original = pd.read_excel(excelsheet_X_train, "Sheet1", skiprows=[1])
In [62]:
X_train_original.describe()
Out[62]:
In [3]:
X_train_original.info()
In [22]:
print(X_train_original.shape)
print(y1_train.shape)
# not the same size. Need to split the data
In [5]:
X_train_original.columns
Out[5]:
In [6]:
display(X_train_original.head())
display(X_train_original.tail())
In [25]:
print(X_train_original['指标名称'][0])
print(y1_train.iloc[:,0][0])
In [3]:
X_train_original["IsInY1"] = np.nan
X_train_original["IsInY2"] = np.nan
X_train_original["IsInY3"] = np.nan
for i in X_train_original.index:
Current_Index = X_train_original.loc[i, '指标名称']
X_train_original.loc[i, "IsInY1"] = np.any(y1_train.iloc[:, 0] == X_train_original.iloc[i,0])
X_train_original.loc[i, "IsInY2"] = np.any(y2_train.iloc[:, 0] == X_train_original.iloc[i,0])
X_train_original.loc[i, "IsInY3"] = np.any(y3_train.iloc[:, 0] == X_train_original.iloc[i,0])
In [67]:
print("Y1", int(X_train_original["IsInY1"].values.astype("int").sum()), "/", X_train_original.shape[0])
print("Y2", int(X_train_original["IsInY2"].values.astype("int").sum()), "/", X_train_original.shape[0])
print("Y3", int(X_train_original["IsInY3"].values.astype("int").sum()), "/", X_train_original.shape[0])
Look at only Y1 now¶
Make X1_train
In [4]:
# Drop some rows
X1_train = X_train_original.drop(np.where(np.logical_not(X_train_original.loc[:, "IsInY1"]))[0], axis=0)
In [5]:
# Find out which columns only have NaNs: Count number of NaNs in each column
#print(X1_train.isnull().sum())
# Drop anything that has greater than 100 NaNs
count_NaNs = X1_train.isnull().sum()
drop_columns = count_NaNs.loc[count_NaNs >100].index
X1_train = X1_train.drop(drop_columns, axis=1)
In [6]:
# Fill the NaNs
X1_train = X1_train.fillna(method='ffill')
In [102]:
print(X1_train.shape)
print(y1_train.shape)
np.all([x == y for x, y in zip(X1_train.iloc[:,0],y1_train.iloc[:,0])]) # there is 1 to 1 correpsondence in X1_train, y1_train
Out[102]:
In [41]:
# Save cleaned data
X1_train.drop(["IsInY1","IsInY2","IsInY3"], inplace=True, axis=1)
X1_train.to_csv("./data/Cleaned_X1_序列_train.csv", index=False, encoding='utf-8')
In [3]:
# Load data for next session
X1_train = pd.read_csv("./data/Cleaned_X1_序列_train.csv", encoding="utf", engine='python')
headers = [u'指标名称', u'中债国债到期收益率:3个月', u'中债国债到期收益率:1个月', u'中债国债到期收益率:1年',
u'中债国债到期收益率:5年', u'中债国债到期收益率:10年', u'上证综合指数', u'沪深300指数', u'中债总指数',
u'中债国债总指数', u'中债金融债券总指数']
In [141]:
# Find out if dates are even
dates = np.asarray([matplotlib.dates.date2num(i.date()) for i in X1_train["指标名称"]]) # convert to num first
intervals = np.diff(dates)
print(np.unique(intervals)) # dates are not even intervals. Most of the data is continuous
fig = plt.figure(figsize=(5,3))
sns.distplot(intervals, kde=False, bins = np.unique(intervals))
plt.show()
In [4]:
# Combine data for ease of plotting
df = X1_train.copy()
df["Y"] = y1_train.iloc[:, 1]
In [44]:
# Do some histogram plots for the features
df.hist(bins=50, figsize=(20,15))
# Fix some font issues with Chinese titles in pandas
SetFont(axes, plt.gcf(), fontsize=8, fontname='c:\\windows\\fonts\\simsun.ttc', items=["title"])
[ax.set_title(ax.get_title(), fontproperties=fontprop) for ax in plt.gcf().axes]
plt.show()
In [48]:
# Plot as time series
axes= df.plot(x="指标名称", kind='line', subplots=True, grid=True, layout=(4, 3), figsize=(10,10))
SetFont(axes, plt.gcf(), fontsize=8, fontname='c:\\windows\\fonts\\simsun.ttc', items=["legend", "xlab"])
plt.show()
In [10]:
# A lot of features seems to be higly correlated. Do a correlation matrix
from pandas.plotting import scatter_matrix
sns.set_style(style="white", rc=None)
# do correlation those columns
headers = [u'中债国债到期收益率:3个月',u'中债国债到期收益率:10年', u'沪深300指数', u'中债总指数']
axes= scatter_matrix(df[headers], figsize=(9,9))
SetFont(axes, plt.gcf(), fontsize=10, fontname='c:\\windows\\fonts\\simsun.ttc', items=["ylab", "xlab"])
plt.show()
Ideas for forecasting this time series with 4 selected features¶
- Can also do dimensionality reduction on the 10 features and select the first 4 PCs to train (perhaps for later)
- Simple ARIMA model for forecasting
- XGBoost with regression
- Deep neural network using RNN / (GRD / LSTM)
XGBoost regression¶
In [38]:
# Taking only these features
headers = [u'中债国债到期收益率:3个月',u'中债国债到期收益率:10年', u'沪深300指数', u'中债总指数']
df_sub = df[headers]
X1_train_mat = np.asarray(df_sub)
y1_train_mat = np.asarray(df["Y"])
In [45]:
# Separating data into X_train, y_train, X_val, y_val
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=50) # for 10 folds of crossvalidation
In [48]:
# Start with a simple xgboost, ignoring the time information and see what it does
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
accuracy_scores_list = []
xgb_reg = XGBRegressor()
for epoch, (train_index, test_index) in enumerate(tscv.split(X1_train_mat)):
# Get data
X_train, X_test = X1_train_mat[train_index], X1_train_mat[test_index]
y_train, y_test = y1_train_mat[train_index], y1_train_mat[test_index]
# Train the XGBoost
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_test)
acs = np.sqrt(mean_squared_error(y_test, y_pred))
#print("RMSE:", acs)
accuracy_scores_list.append(acs)
print(accuracy_scores_list)
In [110]:
np.mean(accuracy_scores_list)
Out[110]:
In [51]:
import xgboost as xgb
xgb.plot_importance(xgb_reg)
plt.show()
In [59]:
# Plot prediction vs. actual data
y1_pred_all = xgb_reg.predict(X1_train_mat)
plt.plot_date(X1_train[u'指标名称'], y1_train_mat, color='#1f77b4', label="original")
plt.plot_date(X1_train[u'指标名称'], y_pred_all, color='#ff7f0e', label="predicted", alpha=0.2)
plt.xlabel("Year")
plt.ylabel("Y1")
plt.legend()
plt.show()
In [70]:
# Make predictions using the trained XGBoost Regressor
X1_test = X_test_original[[u'指标名称'] + headers]
X1_test.isnull().sum()
# has some missing data. Need to later fit with spline
Out[70]:
In [84]:
# Fit the features with spline
X1_test_filled = X1_test.copy()
X1_test_filled[headers] = X1_test_filled[headers].interpolate(method="cubic", axis=0)
X1_test_filled[headers] =X1_test_filled[headers].fillna(axis=0, method="bfill")
X1_test_filled[headers] =X1_test_filled[headers].fillna(axis=0, method="ffill")
In [87]:
# Plot as time series
axes= X1_test.plot(x="指标名称", kind='line', subplots=True, grid=True, layout=(4, 3), figsize=(10,10))
SetFont(axes, plt.gcf(), fontsize=8, fontname='c:\\windows\\fonts\\simsun.ttc', items=["legend", "xlab"])
plt.suptitle("Original")
plt.show()
axes= X1_test_filled.plot(x="指标名称", kind='line', subplots=True, grid=True, layout=(4, 3), figsize=(10,10))
SetFont(axes, plt.gcf(), fontsize=8, fontname='c:\\windows\\fonts\\simsun.ttc', items=["legend", "xlab"])
plt.suptitle("Filled")
plt.show()
In [96]:
X1_test_filled_mat = np.asarray(X1_test_filled[headers])
Y1_pred = xgb_reg.predict(X1_test_filled_mat)
In [109]:
plt.plot_date(X1_test_filled["指标名称"], Y1_pred)
plt.xlabel("Time")
plt.ylabel("Y1_Predicted_XGBoost")
plt.savefig("./results/Y1_prediction_XGBoostReg.png", bbox_inches='tight', dpi=300)
plt.show()
In [98]:
# Writing the Y1 prediction to csv
Y1_pred_df = pd.DataFrame({"Date":X1_test_filled["指标名称"], "Y1":Y1_pred})
Y1_pred_df.to_csv("./results/Y1_prediction_XGBoostReg.csv", index=False, sep="\t")
In [112]:
# save the model
import pickle
pickle.dump(xgb_reg, open("./results/XGBoostReg_y1.model", "wb"))
#loaded_model = pickle.load(open("./results/XGBoostReg_y1.model", "rb"))