CMBC Fintech Problem - DeepTime

In [1]:

# Do some imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from IPython.display import display
import datetime

import sys
sys.path.append("D:/Edward/Documents/Assignments/Scripts/Python/generic")
from plots import SetFont # helps us fix some font problem for displaying Chinese characters

D:\Anaconda3\lib\site-packages\matplotlib\__init__.py:1405: UserWarning:
This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

  warnings.warn(_use_error_msg)

In [64]:

excelsheet_X_train = "./data/X序列_train.xls"
excelsheet_y_train =  "./data/Y序列_train.xls"

# Training data
X_train_original = pd.read_excel(excelsheet_X_train, "Sheet1", skiprows=[1])
y1_train = pd.read_excel(excelsheet_y_train, "Y1", header=None)
y2_train = pd.read_excel(excelsheet_y_train, "Y2", header=None)
y3_train = pd.read_excel(excelsheet_y_train, "Y3", header=None)

# Test data
excelsheet_X_train = "./data/X序列_test.xls"
X_test_original = pd.read_excel(excelsheet_X_train, "Sheet1", skiprows=[1])

In [62]:

X_train_original.describe()

Out[62]:

	GDP:实际同比增长:全球	美国:GDP:现价:环比折年率:季调	欧元区:GDP平减指数:季调	全球商品贸易量增长率:出口金额:全球	工业增加值:当月同比	工业增加值:累计同比	固定资产投资完成额:累计同比	房地产开发投资完成额:累计同比	产量:发电量:当月同比	产量:发电量:累计同比	...	市净率:中证200	滚动市盈率(TTM):沪深两市	滚动市盈率(TTM):深圳市场	滚动市盈率(TTM):深市主板	滚动市盈率(TTM):中小板	滚动市盈率(TTM):创业板	中债总指数	中债国债总指数	中债金融债券总指数	中债信用债总指数
count	36.000000	275.000000	84.000000	20.000000	312.000000	210.000000	261.000000	187.000000	251.000000	311.000000	...	3273.000000	1136.000000	1136.000000	1136.000000	1136.000000	1136.000000	3502.000000	3502.000000	3502.000000	2248.000000
mean	3.517139	6.588000	92.173143	5.100000	12.758590	12.716571	24.687356	24.162567	8.636828	9.025223	...	3.223309	15.913327	29.539789	21.149058	35.858248	54.809419	125.261976	124.936082	126.800902	119.904897
std	1.248717	4.746882	9.065830	5.374403	5.479889	4.431903	11.468376	9.269454	6.915276	5.479765	...	1.335584	4.276788	9.118093	4.954066	11.299075	22.798515	17.983413	18.305711	17.451678	15.502401
min	0.013000	-7.700000	77.300000	-12.000000	-21.100000	-2.930000	5.500000	1.000000	-13.700000	-11.800000	...	1.601100	11.690000	18.970000	15.720000	21.770000	27.090000	99.771300	97.860100	99.338200	97.377500
25%	2.719000	4.200000	83.047000	2.875000	9.200000	9.400000	17.300000	19.600000	4.800000	5.560000	...	2.196900	13.020000	24.080000	17.840000	28.280000	37.607500	109.332775	108.730825	111.480550	109.335050
50%	3.497500	6.100000	92.497500	5.250000	12.650000	11.650000	24.500000	24.300000	8.310000	9.040000	...	2.972400	14.165000	25.955000	19.370000	31.990000	49.020000	127.947800	127.369550	128.846600	115.906500
75%	4.188750	8.850000	100.235500	8.625000	16.100000	16.475000	29.500000	31.200000	13.500000	12.600000	...	3.697400	17.930000	31.087500	23.792500	37.962500	61.940000	139.943875	140.425800	140.846175	130.304275
max	5.673000	26.800000	106.491000	14.000000	29.400000	29.200000	65.500000	57.100000	36.462500	36.462500	...	8.098700	33.100000	68.350000	42.050000	84.110000	147.060000	167.752800	167.477800	169.529000	154.893300

8 rows × 44 columns

In [3]:

X_train_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7764 entries, 0 to 7763
Data columns (total 45 columns):
指标名称                  7764 non-null datetime64[ns]
GDP:实际同比增长:全球         36 non-null float64
美国:GDP:现价:环比折年率:季调    275 non-null float64
欧元区:GDP平减指数:季调        84 non-null float64
全球商品贸易量增长率:出口金额:全球    20 non-null float64
工业增加值:当月同比            312 non-null float64
工业增加值:累计同比            210 non-null float64
固定资产投资完成额:累计同比        261 non-null float64
房地产开发投资完成额:累计同比       187 non-null float64
产量:发电量:当月同比           251 non-null float64
产量:发电量:累计同比           311 non-null float64
M0:同比                 257 non-null float64
M1:同比                 250 non-null float64
M2:同比                 250 non-null float64
中债国债到期收益率:3个月         3494 non-null float64
中债国债到期收益率:1个月         3493 non-null float64
中债国债到期收益率:1年          3494 non-null float64
中债国债到期收益率:5年          3494 non-null float64
中债国债到期收益率:10年         3494 non-null float64
CPI:当月同比              312 non-null float64
PPI:全部工业品:当月同比        231 non-null float64
PPI:全部工业品:累计同比        156 non-null float64
RPI:当月同比              396 non-null float64
RPI:累计同比              168 non-null float64
上证综合指数                6125 non-null float64
沪深300指数               3392 non-null float64
上证50指数                2914 non-null float64
中证500指数               2671 non-null float64
恒生AH股溢价指数             2536 non-null float64
国泰君安个人投资者投资景气指数       88 non-null float64
市盈率:沪深300             1068 non-null float64
市盈率:中证100             1068 non-null float64
市盈率:中证200             1068 non-null float64
市净率:沪深300             3920 non-null float64
市净率:中证100             3649 non-null float64
市净率:中证200             3273 non-null float64
滚动市盈率(TTM):沪深两市       1136 non-null float64
滚动市盈率(TTM):深圳市场       1136 non-null float64
滚动市盈率(TTM):深市主板       1136 non-null float64
滚动市盈率(TTM):中小板        1136 non-null float64
滚动市盈率(TTM):创业板        1136 non-null float64
中债总指数                 3502 non-null float64
中债国债总指数               3502 non-null float64
中债金融债券总指数             3502 non-null float64
中债信用债总指数              2248 non-null float64
dtypes: datetime64[ns](1), float64(44)
memory usage: 2.7 MB

In [22]:

print(X_train_original.shape)
print(y1_train.shape)

# not the same size. Need to split the data

(7764, 45)
(3392, 2)

In [5]:

X_train_original.columns

Out[5]:

Index(['指标名称', 'GDP:实际同比增长:全球', '美国:GDP:现价:环比折年率:季调', '欧元区:GDP平减指数:季调',
       '全球商品贸易量增长率:出口金额:全球', '工业增加值:当月同比', '工业增加值:累计同比', '固定资产投资完成额:累计同比',
       '房地产开发投资完成额:累计同比', '产量:发电量:当月同比', '产量:发电量:累计同比', 'M0:同比', 'M1:同比',
       'M2:同比', '中债国债到期收益率:3个月', '中债国债到期收益率:1个月', '中债国债到期收益率:1年',
       '中债国债到期收益率:5年', '中债国债到期收益率:10年', 'CPI:当月同比', 'PPI:全部工业品:当月同比',
       'PPI:全部工业品:累计同比', 'RPI:当月同比', 'RPI:累计同比', '上证综合指数', '沪深300指数', '上证50指数',
       '中证500指数', '恒生AH股溢价指数', '国泰君安个人投资者投资景气指数', '市盈率:沪深300', '市盈率:中证100',
       '市盈率:中证200', '市净率:沪深300', '市净率:中证100', '市净率:中证200', '滚动市盈率(TTM):沪深两市',
       '滚动市盈率(TTM):深圳市场', '滚动市盈率(TTM):深市主板', '滚动市盈率(TTM):中小板',
       '滚动市盈率(TTM):创业板', '中债总指数', '中债国债总指数', '中债金融债券总指数', '中债信用债总指数'],
      dtype='object')

In [6]:

display(X_train_original.head())
display(X_train_original.tail())

	指标名称	GDP:实际同比增长:全球	美国:GDP:现价:环比折年率:季调	欧元区:GDP平减指数:季调	全球商品贸易量增长率:出口金额:全球	工业增加值:当月同比	工业增加值:累计同比	固定资产投资完成额:累计同比	房地产开发投资完成额:累计同比	产量:发电量:当月同比	...	市净率:中证200	滚动市盈率(TTM):沪深两市	滚动市盈率(TTM):深圳市场	滚动市盈率(TTM):深市主板	滚动市盈率(TTM):中小板	滚动市盈率(TTM):创业板	中债总指数	中债国债总指数	中债金融债券总指数	中债信用债总指数
0	1947-06-30	NaN	5.3	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	1947-09-30	NaN	6.4	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	1947-12-31	NaN	17.3	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	1948-03-31	NaN	9.3	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	1948-06-30	NaN	10.5	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 45 columns

	指标名称	GDP:实际同比增长:全球	美国:GDP:现价:环比折年率:季调	欧元区:GDP平减指数:季调	全球商品贸易量增长率:出口金额:全球	工业增加值:当月同比	工业增加值:累计同比	固定资产投资完成额:累计同比	房地产开发投资完成额:累计同比	产量:发电量:当月同比	...	市净率:中证200	滚动市盈率(TTM):沪深两市	滚动市盈率(TTM):深圳市场	滚动市盈率(TTM):深市主板	滚动市盈率(TTM):中小板	滚动市盈率(TTM):创业板	中债总指数	中债国债总指数	中债金融债券总指数	中债信用债总指数
7759	2015-12-27	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	3.0719	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
7760	2015-12-28	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	2.9855	22.75	44.89	26.14	54.99	91.62	167.7263	167.4778	169.4162	154.7450
7761	2015-12-29	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	3.0084	23.08	45.99	26.45	57.46	92.36	167.7528	167.4174	169.5290	154.8247
7762	2015-12-30	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	2.9765	23.26	46.64	27.00	57.98	93.77	167.7114	167.4101	169.4554	154.8933
7763	2015-12-31	3.398	1.3	106.491	NaN	5.9	6.1	10.0	1.0	-3.7	...	2.9336	22.96	45.80	26.55	57.00	91.50	167.7430	167.4459	169.4835	154.8913

5 rows × 45 columns

In [25]:

print(X_train_original['指标名称'][0])
print(y1_train.iloc[:,0][0])

1947-06-30 00:00:00
2002-01-04 00:00:00

In [3]:

X_train_original["IsInY1"] = np.nan
X_train_original["IsInY2"] = np.nan
X_train_original["IsInY3"] = np.nan

for i in X_train_original.index:
    Current_Index = X_train_original.loc[i, '指标名称']
    X_train_original.loc[i, "IsInY1"] = np.any(y1_train.iloc[:, 0] == X_train_original.iloc[i,0])
    X_train_original.loc[i, "IsInY2"] = np.any(y2_train.iloc[:, 0] == X_train_original.iloc[i,0])
    X_train_original.loc[i, "IsInY3"] = np.any(y3_train.iloc[:, 0] == X_train_original.iloc[i,0])

In [67]:

print("Y1", int(X_train_original["IsInY1"].values.astype("int").sum()), "/", X_train_original.shape[0])
print("Y2", int(X_train_original["IsInY2"].values.astype("int").sum()), "/", X_train_original.shape[0])
print("Y3", int(X_train_original["IsInY3"].values.astype("int").sum()), "/", X_train_original.shape[0])

Y1 3392 / 7764
Y2 3494 / 7764
Y3 2822 / 7764

Look at only Y1 now¶

Make X1_train

In [4]:

# Drop some rows
X1_train = X_train_original.drop(np.where(np.logical_not(X_train_original.loc[:, "IsInY1"]))[0], axis=0)

In [5]:

# Find out which columns only have NaNs: Count number of NaNs in each column
#print(X1_train.isnull().sum())

# Drop anything that has greater than 100 NaNs
count_NaNs = X1_train.isnull().sum()
drop_columns = count_NaNs.loc[count_NaNs >100].index
X1_train = X1_train.drop(drop_columns, axis=1)

In [6]:

# Fill the NaNs
X1_train = X1_train.fillna(method='ffill')

In [102]:

print(X1_train.shape)
print(y1_train.shape)
np.all([x == y for x, y in zip(X1_train.iloc[:,0],y1_train.iloc[:,0])]) # there is 1 to 1 correpsondence in X1_train, y1_train

(3392, 14)
(3392, 2)

Out[102]:

True

In [41]:

# Save cleaned data
X1_train.drop(["IsInY1","IsInY2","IsInY3"], inplace=True, axis=1)
X1_train.to_csv("./data/Cleaned_X1_序列_train.csv", index=False, encoding='utf-8')

In [3]:

# Load data for next session
X1_train = pd.read_csv("./data/Cleaned_X1_序列_train.csv", encoding="utf", engine='python')
headers = [u'指标名称', u'中债国债到期收益率:3个月', u'中债国债到期收益率:1个月', u'中债国债到期收益率:1年',
       u'中债国债到期收益率:5年', u'中债国债到期收益率:10年', u'上证综合指数', u'沪深300指数', u'中债总指数',
       u'中债国债总指数', u'中债金融债券总指数']

In [141]:

# Find out if dates are even 
dates = np.asarray([matplotlib.dates.date2num(i.date()) for i in X1_train["指标名称"]]) # convert to num first

intervals = np.diff(dates)
print(np.unique(intervals)) # dates are not even intervals. Most of the data is continuous
fig = plt.figure(figsize=(5,3))
sns.distplot(intervals, kde=False, bins = np.unique(intervals))
plt.show()

[ 1.  2.  3.  4.  5.  6.  8.  9. 10. 11. 12. 13. 17.]

In [4]:

# Combine data for ease of plotting
df = X1_train.copy()
df["Y"] = y1_train.iloc[:, 1]

In [44]:

# Do some histogram plots for the features
df.hist(bins=50, figsize=(20,15))
# Fix some font issues with Chinese titles in pandas
SetFont(axes, plt.gcf(), fontsize=8, fontname='c:\\windows\\fonts\\simsun.ttc', items=["title"])
[ax.set_title(ax.get_title(), fontproperties=fontprop) for ax in plt.gcf().axes]
plt.show()

In [48]:

# Plot as time series
axes= df.plot(x="指标名称", kind='line', subplots=True, grid=True, layout=(4, 3), figsize=(10,10))
SetFont(axes, plt.gcf(), fontsize=8, fontname='c:\\windows\\fonts\\simsun.ttc', items=["legend", "xlab"])
plt.show()

In [10]:

# A lot of features seems to be higly correlated. Do a correlation matrix
from pandas.plotting import scatter_matrix
sns.set_style(style="white", rc=None)
# do correlation those columns
headers = [u'中债国债到期收益率:3个月',u'中债国债到期收益率:10年', u'沪深300指数', u'中债总指数']
axes= scatter_matrix(df[headers], figsize=(9,9))
SetFont(axes, plt.gcf(), fontsize=10, fontname='c:\\windows\\fonts\\simsun.ttc', items=["ylab", "xlab"])

plt.show()

Ideas for forecasting this time series with 4 selected features¶

Can also do dimensionality reduction on the 10 features and select the first 4 PCs to train (perhaps for later)
Simple ARIMA model for forecasting
XGBoost with regression
Deep neural network using RNN / (GRD / LSTM)

XGBoost regression¶

In [38]:

# Taking only these features
headers = [u'中债国债到期收益率:3个月',u'中债国债到期收益率:10年', u'沪深300指数', u'中债总指数']
df_sub = df[headers]
X1_train_mat = np.asarray(df_sub)
y1_train_mat = np.asarray(df["Y"])

In [45]:

# Separating data into X_train, y_train, X_val, y_val
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=50) # for 10 folds of crossvalidation

In [48]:

# Start with a simple xgboost, ignoring the time information and see what it does
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
accuracy_scores_list = []
xgb_reg = XGBRegressor()
for epoch, (train_index, test_index) in enumerate(tscv.split(X1_train_mat)):
    # Get data
    X_train, X_test = X1_train_mat[train_index], X1_train_mat[test_index]
    y_train, y_test = y1_train_mat[train_index], y1_train_mat[test_index]
    # Train the XGBoost
    xgb_reg.fit(X_train, y_train)
    y_pred = xgb_reg.predict(X_test)
    acs = np.sqrt(mean_squared_error(y_test, y_pred))
    #print("RMSE:", acs)
    accuracy_scores_list.append(acs)

print(accuracy_scores_list)

[0.05574110536573941, 0.016619024135980067, 0.033881089297445995, 0.013101490564228898, 0.007774167837661698, 0.008569569113488041, 0.005166861410345484, 0.007832514961462039, 0.10005812582163125, 0.058317227561839116, 0.2211929768332808, 0.014995624191141943, 0.008868785821808294, 0.008205008667429285, 0.00576499377245283, 0.00802906259572069, 1.2052946434474727, 0.8126311930967228, 0.15654579347225767, 1.750743392239421, 0.044947404216504394, 0.21488408349684798, 0.2582022184042565, 0.042088168768110695, 0.026614885513594116, 0.01723346169905295, 0.008961623272351706, 0.005099523525062128, 0.00442548052878379, 0.017016211342525782, 0.009355822744739896, 0.0065087937224623344, 0.004312519221825024, 0.013912412268863143, 0.01962128467142125, 0.02687753017935029, 0.019629388004812065, 0.01798221459584771, 0.015101587288185747, 0.01878468314918182, 0.017443180140183975, 0.019159339317003296, 0.018499472288233895, 0.02056343811438474, 0.012219307694427372, 0.016117898906048884, 0.0048321426922448155, 0.03204277687340917, 0.011639229400366083, 0.005903663363517704]

In [110]:

np.mean(accuracy_scores_list)

Out[110]:

0.1089862479122226

In [51]:

import xgboost as xgb
xgb.plot_importance(xgb_reg)
plt.show()

In [59]:

# Plot prediction vs. actual data
y1_pred_all = xgb_reg.predict(X1_train_mat)

plt.plot_date(X1_train[u'指标名称'], y1_train_mat, color='#1f77b4', label="original")
plt.plot_date(X1_train[u'指标名称'], y_pred_all, color='#ff7f0e', label="predicted", alpha=0.2)
plt.xlabel("Year")
plt.ylabel("Y1")
plt.legend()
plt.show()

In [70]:

# Make predictions using the trained XGBoost Regressor
X1_test = X_test_original[[u'指标名称'] + headers]
X1_test.isnull().sum()
# has some missing data. Need to later fit with spline

Out[70]:

指标名称               0
中债国债到期收益率:3个月    259
中债国债到期收益率:10年    259
沪深300指数          275
中债总指数            260
dtype: int64

In [84]:

# Fit the features with spline
X1_test_filled = X1_test.copy()
X1_test_filled[headers] = X1_test_filled[headers].interpolate(method="cubic", axis=0)

X1_test_filled[headers] =X1_test_filled[headers].fillna(axis=0, method="bfill")
X1_test_filled[headers] =X1_test_filled[headers].fillna(axis=0, method="ffill")

In [87]:

# Plot as time series
axes= X1_test.plot(x="指标名称", kind='line', subplots=True, grid=True, layout=(4, 3), figsize=(10,10))
SetFont(axes, plt.gcf(), fontsize=8, fontname='c:\\windows\\fonts\\simsun.ttc', items=["legend", "xlab"])
plt.suptitle("Original")
plt.show()

axes= X1_test_filled.plot(x="指标名称", kind='line', subplots=True, grid=True, layout=(4, 3), figsize=(10,10))
SetFont(axes, plt.gcf(), fontsize=8, fontname='c:\\windows\\fonts\\simsun.ttc', items=["legend", "xlab"])
plt.suptitle("Filled")
plt.show()

In [96]:

X1_test_filled_mat = np.asarray(X1_test_filled[headers])
Y1_pred = xgb_reg.predict(X1_test_filled_mat)

In [109]:

plt.plot_date(X1_test_filled["指标名称"], Y1_pred)
plt.xlabel("Time")
plt.ylabel("Y1_Predicted_XGBoost")
plt.savefig("./results/Y1_prediction_XGBoostReg.png", bbox_inches='tight', dpi=300)
plt.show()

In [98]:

# Writing the Y1 prediction to csv
Y1_pred_df = pd.DataFrame({"Date":X1_test_filled["指标名称"], "Y1":Y1_pred})
Y1_pred_df.to_csv("./results/Y1_prediction_XGBoostReg.csv", index=False, sep="\t")

In [112]:

# save the model
import pickle
pickle.dump(xgb_reg, open("./results/XGBoostReg_y1.model", "wb"))

#loaded_model = pickle.load(open("./results/XGBoostReg_y1.model", "rb"))