Analysis of Medicare Expenditure Data¶
In [1]:
%pwd
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
# Add some custom helper functions
exec(open('D:/Edward/Documents/Assignments/Scripts/Python/init_py.py').read())
In [ ]:
df_state_county_2016 = pd.read_excel('data/State_County_All_Report.xlsm',
                                     'State_county 2016', skiprows=1, na_values='*')
In [2]:
# Parse percentage strings like '12.3%' into floats
def parse_percent(sheet):
    def f_parse_percent(percent_str):
        if not isinstance(percent_str, str): return
        return float(percent_str.replace('%', '').strip())
    for c in sheet.columns:
        if isinstance(sheet[c][0], str) and '%' in sheet[c][0]:
            # Convert percentage strings to floats
            for i in sheet.index:
                if not isinstance(sheet.loc[i, c], str): continue
                sheet.loc[i, c] = f_parse_percent(sheet.loc[i, c])
    return sheet
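A vectorized alternative (a sketch; it assumes each percent column holds only '%' strings and/or NaN) would avoid the per-cell loop:
In [ ]:
# Vectorized sketch: strip '%' and cast a whole column at once.
def parse_percent_vectorized(sheet):
    for c in sheet.columns:
        col = sheet[c]
        if col.dtype == object and col.astype(str).str.contains('%').any():
            sheet[c] = col.str.replace('%', '', regex=False).str.strip().astype(float)
    return sheet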
In [ ]:
# Keep only the two-letter state abbreviation in the State column
df_state_county_2016 = parse_percent(df_state_county_2016)
for i in df_state_county_2016.index:
    if i == 0: continue  # skip the national row
    df_state_county_2016.loc[i, 'State'] = df_state_county_2016.loc[i, 'State'][:2]
In [ ]:
# Save a checkpoint so the analysis can resume from here next time
df_state_county_2016.to_csv('data/State_County_2016.csv', index=False)
In [3]:
df_state_county_2016 = pd.read_csv('data/State_County_2016.csv')
In [4]:
df_state_county_2016.head()
Out[4]:
1 Explore State Data¶
In [5]:
df_2016_state = df_state_county_2016.loc[df_state_county_2016['County'] == 'STATE TOTAL', :]
# Drop non-state entries: PR (Puerto Rico), DC (District of Columbia), VI (Virgin Islands)
df_2016_state = df_2016_state.set_index('State')
df_2016_state = df_2016_state.drop(index=['PR', 'DC', 'VI'])
In [6]:
df_2016_state.describe()
Out[6]:
2 Plot Demographics of Each State¶
In [7]:
# Preparation for US state map outline
import bokeh
from bokeh.plotting import figure, output_notebook, show, output_file, ColumnDataSource
from bokeh.models import HoverTool
import json
with open(
        'D:/Edward/Documents/Assignments/Scripts/Python/Plots/resource/CBUS_states_scaled_20m.json',
        'r') as fid:
    states = json.load(fid)
del states['DC']
del states['PR']
# Get the outline coordinates of each state
state_xs = [states[code]["lons"] for code in states]
state_ys = [states[code]["lats"] for code in states]
state_name = [states[code]['name'] for code in states]
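The list comprehensions above assume each record in the JSON file carries a state name plus the polygon coordinates of its outline, roughly like this (a hypothetical example; the actual file contents are not shown here):
In [ ]:
# Assumed (hypothetical) shape of one record in the states JSON file;
# real entries would hold much longer coordinate lists.
states_example = {
    "AK": {"name": "Alaska",
           "lons": [-151.0, -152.3],  # patch x-coordinates
           "lats": [63.5, 64.1]}}     # patch y-coordinates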
In [8]:
from matplotlib.colors import rgb2hex

def value2color(values, cmap=plt.get_cmap('Blues')):
    # Map values in [0, 1] to hex color strings
    colors = [rgb2hex(cmap(v)) for v in values]
    return colors

def sortColumns(df, column):
    # Order the values to match the order of the state outlines
    values = [df.loc[code, column] for code in states]
    return values

def column2color(df, column, cmap=plt.get_cmap('Blues')):
    values = sortColumns(df, column)
    colors = value2color(values, cmap=cmap)
    return colors

def Create_geo_plot(df, column, title, width=600, height=300, cmap=plt.get_cmap('plasma'), topercent=True):
    # Aggregate all the data for the current plot
    values = sortColumns(df, column)
    color_values = value2color((np.array(values) - min(values)) /
                               (max(values) - min(values)), cmap)  # normalize to [0, 1] for the colormap
    data_source = ColumnDataSource(data=dict(
        x=state_xs, y=state_ys, name=state_name,
        value=["{:.2f}%".format(v) for v in values] if topercent else values,
        color=color_values))
    # Create the plot
    p = figure(title=title, toolbar_location='left',
               tools='pan, wheel_zoom, box_zoom, reset, hover, save',
               x_axis_location=None, y_axis_location=None,  # turn off axes for maps
               plot_width=width, plot_height=height)
    p.grid.grid_line_color = None  # no grid lines
    # Color the states
    p.patches('x', 'y', source=data_source, fill_color='color',
              fill_alpha=1, line_color='white', line_width=0.5)
    hover = p.select_one(HoverTool)
    hover.point_policy = 'follow_mouse'
    hover.tooltips = [('State', '@name'),
                      (column, '@value')]
    output_notebook()
    show(p)
In [9]:
# Ethnicity African American
Create_geo_plot(df=df_2016_state, column='Percent African American',
                title="2016 % African Americans by State", cmap=plt.get_cmap('plasma'))
In [10]:
# Ethnicity Hispanic
Create_geo_plot(df=df_2016_state, column='Percent Hispanic',
                title="2016 % Hispanic by State", cmap=plt.get_cmap('plasma'))
In [11]:
# Ethnicity White
Create_geo_plot(df=df_2016_state, column='Percent Non-Hispanic White',
                title="2016 % White by State", cmap=plt.get_cmap('plasma'))
In [12]:
# Plot care quality based on geographic locations
attributes = ["PQI11 Bacterial Pneumonia Admission Rate (age < 65)",
"PQI11 Bacterial Pneumonia Admission Rate (age 65-74)",
"PQI11 Bacterial Pneumonia Admission Rate (age 75+)"]
#attributes = df_2016_state.columns[216:241]
df_sub = df_2016_state[attributes]
df_sub = df_sub.apply(pd.to_numeric)
df_2016_state['Total Care Quality'] = df_sub[attributes].sum(axis=1)
Create_geo_plot(df=df_2016_state, column='Total Care Quality',
                title='Pneumonia 2016 Total Care Quality', cmap=plt.get_cmap('plasma'), topercent=False)
3 Standardized Per Capita Cost vs. Prevention Quality¶
(bacterial pneumonia admission rate)
In [30]:
# Pairwise correlation heatmap
def vis_corr_mat(rho, vmax=1.0, vmin=-1.0, center=0, ax=None, annot=False):
    mask = np.zeros_like(rho, dtype=bool)  # nothing masked by default
    # mask[np.triu_indices_from(mask)] = True  # uncomment to hide the upper triangle
    if ax is None:
        fig, ax = plt.subplots(figsize=(5, 5))
    else:
        fig = ax.figure
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(rho, mask=mask, cmap=cmap, vmax=vmax, vmin=vmin, center=center,
                square=True, linewidth=0.5, cbar_kws={"shrink": .5}, annot=annot, ax=ax)
    return fig, ax
In [31]:
attributes = ["Standardized Risk-Adjusted Per Capita Costs",
"PQI11 Bacterial Pneumonia Admission Rate (age < 65)",
"PQI11 Bacterial Pneumonia Admission Rate (age 65-74)",
"PQI11 Bacterial Pneumonia Admission Rate (age 75+)"] # do correlation those columns
df_plot = df_2016_state[attributes]
df_plot = df_plot.apply(pd.to_numeric)
df_plot.columns = ['Per Capita Cost', 'PQI11 (<65)', 'PQI11 (65-74)', 'PQI11 (75+)']
In [32]:
from pandas.plotting import scatter_matrix
scatter_matrix(df_plot, figsize=(8,6))
plt.suptitle('2016 Care Quality vs. Cost: PQI11 Bacterial Pneumonia Admission Rate')
plt.show()
In [33]:
corrs = np.corrcoef(df_plot.values.T)
fig, ax = vis_corr_mat(corrs, vmin=0, center=0.5, annot=True)
fig.set_size_inches(3, 3)
plt.gca().set_xticklabels(['Cost' ,'PQI11 (<65)', 'PQI11 (65-74)', 'PQI11 (75+)'], rotation=45)
plt.gca().set_yticklabels(['Cost', 'PQI11 (<65)', 'PQI11 (65-74)', 'PQI11 (75+)'], rotation=45)
plt.title('Pneumonia Prevention Quality vs. Cost')
plt.show()
In [34]:
df_plot['Total Quality'] = df_plot['PQI11 (<65)'] + df_plot['PQI11 (65-74)'] + df_plot['PQI11 (75+)']
df_plot['Cost to quality ratio'] = df_plot['Per Capita Cost'] / df_plot['Total Quality']
df_plot['Quality to cost ratio'] = df_plot['Total Quality'] / df_plot['Per Capita Cost']
df_plot = df_plot.sort_values('Cost to quality ratio')
df_plot.head()
Out[34]:
In [35]:
df_qual_vs_eth = df_2016_state[['Percent Non-Hispanic White', 'Percent African American',
                                'Percent Hispanic', 'Percent Other/Unknown']]
df_qual_vs_eth = df_qual_vs_eth.join(df_plot['Cost to quality ratio'])
df_qual_vs_eth.columns = ['White', 'African American', 'Hispanic', 'Other', 'Quality']
df_qual_vs_eth.head()
Out[35]:
In [36]:
# Cost-to-quality ratio vs. percent White, African American, and Hispanic
scatter_matrix(df_qual_vs_eth, figsize=(8,6))
plt.suptitle('2016 Pneumonia Care Quality vs. Ethnicity')
plt.show()
In [37]:
corrs = np.corrcoef(df_qual_vs_eth.values.T)
fig, ax = vis_corr_mat(corrs, annot=True)
fig.set_size_inches(3, 3)
plt.gca().set_xticklabels(['White', 'African American', 'Hispanic', 'Other', 'Quality'], rotation=45)
plt.gca().set_yticklabels(['White', 'African American', 'Hispanic', 'Other', 'Quality'], rotation=45)
plt.title('Quality vs. Ethnicity')
plt.show()
4 Factors Impacting Medicare Expenditure Over Time¶
In [132]:
df_state_all = pd.read_csv('./data/State_all.csv')
df_state_all.head()
Out[132]:
Question: which factors contribute most to high Medicare expenditure?
In [133]:
# Hand-picked features after reviewing the meaning of each column
feature_cols = [[1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15],  # basic demographics
                [28, 29, 30, 31, 32, 33],  # IP: 32/33 for utility rate (stays vs. days covered per 1000)
                [40, 41, 42, 43, 44, 45],  # PAC: LTCH: 44/45 for utility rate
                [52, 53, 54, 55, 56, 57],  # PAC: IRF: 56/57 for utility rate
                [64, 65, 66, 67, 68, 69],  # PAC: SNF: 68/69 for utility rate
                [76, 77, 78, 79, 80, 81],  # PAC: HH: 80/81 for utility rate
                [88, 89, 90, 91, 92, 93],  # Hospice: 92/93 for utility rate
                [100, 101, 102, 103, 104], # OP: 104: visits
                [111, 112, 113, 114, 115], # FQHC/RHC: 115: visits
                [122, 123, 124, 125, 126], # Outpatient dialysis: 126: events
                [133, 134, 135, 136, 137], # ASC: 137: events
                [144, 145, 146, 147, 148], # EM: 148: events
                [155, 156, 157, 158, 159], # Procedures: 159: events
                [166, 167, 168, 169, 170], # Imaging: 170: events
                [177, 178, 179, 180, 181], # DME: 181: events
                [188, 189, 190, 191, 192], # Tests: 192: events
                [199, 200, 201, 202, 203], # Ambulance: 203: events
                [210, 211, 212, 213],      # Part B drug costs
                list(range(214, 243)),     # quality of care metrics
                list(range(243, 249))      # utility rates to be calculated
                ]
targets = 'Standardized Risk-Adjusted Per Capita Costs'
In [134]:
# Add 6 columns that calculate the utility rate of 6 services
df_state_all['IP Utility Rate'] = df_state_all.iloc[:, 32-1] / df_state_all.iloc[:, 33-1]
df_state_all['PAC: LTCH Utility Rate'] = df_state_all.iloc[:, 44-1] / df_state_all.iloc[:, 45-1]
df_state_all['PAC: IRF Utility Rate'] = df_state_all.iloc[:, 56-1] / df_state_all.iloc[:, 57-1]
df_state_all['PAC: SNF Utility Rate'] = df_state_all.iloc[:, 68-1] / df_state_all.iloc[:, 69-1]
df_state_all['PAC: HH Utility Rate'] = df_state_all.iloc[:, 80-1] / df_state_all.iloc[:, 81-1]
df_state_all['Hospice Utility Rate'] = df_state_all.iloc[:, 92-1] / df_state_all.iloc[:, 93-1]
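The six assignments above follow one pattern, so an equivalent loop form (a sketch using the same 1-based column numbers noted in the feature list) would be:
In [ ]:
# Equivalent loop form (sketch): map each service to the (numerator,
# denominator) 1-based column numbers from the comments above.
utility_cols = {'IP Utility Rate': (32, 33),
                'PAC: LTCH Utility Rate': (44, 45),
                'PAC: IRF Utility Rate': (56, 57),
                'PAC: SNF Utility Rate': (68, 69),
                'PAC: HH Utility Rate': (80, 81),
                'Hospice Utility Rate': (92, 93)}
for name, (num, den) in utility_cols.items():
    df_state_all[name] = df_state_all.iloc[:, num-1] / df_state_all.iloc[:, den-1]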
In [135]:
index = np.array([item for sublist in feature_cols for item in sublist])-1
features = df_state_all.columns[index].values
len(features)
Out[135]:
In [136]:
def extract_important_features(feature_importances, tol=1E-6):
    # Filter out features whose importance falls below tol
    important_feature_ind = np.where(feature_importances > tol)[0]
    important_features = feature_importances[important_feature_ind]
    # Sort the surviving indices by importance, descending
    sorted_important_feature_ind = np.array([x for _, x in sorted(
        zip(important_features, important_feature_ind))])[::-1]
    sorted_important_features = feature_importances[sorted_important_feature_ind]
    return sorted_important_features, sorted_important_feature_ind
In [137]:
# Use XGBoost to determine the contributing factors of per capita cost over the years
import xgboost as xgb
from xgboost import XGBRegressor, plot_importance
xgb_rg = XGBRegressor()
X = df_state_all[features].drop(columns=['State']).values
y = df_state_all[targets].values
xgb_rg.fit(X, y)
important_features, cols_index = extract_important_features(xgb_rg.feature_importances_)
cols_index = cols_index + 1  # shift by 1 to index back into features, since the State column was dropped before fitting
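As a quick sanity check, the scikit-learn style score method reports the training R-squared (a sketch; this measures fit on the training data only, not generalization):
In [ ]:
# Training R^2 (sketch): values near 1.0 mean the booster fit the
# training data closely; use cross-validation for an honest estimate.
print("Training R^2: {:.3f}".format(xgb_rg.score(X, y)))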
In [138]:
# Save for other analyses
np.savez('./data/xgboost_state_feature_importance.npz', feature_cols=feature_cols,
         features=features, important_features=important_features, cols_index=cols_index)
In [140]:
# Features ranked by their importance
features[cols_index]
Out[140]:
In [141]:
cols_index
Out[141]:
In [142]:
num_features = 10
ax = plot_importance(xgb_rg, max_num_features=num_features)
_ = ax.set_yticklabels(features[cols_index][:num_features][::-1])
In [145]:
fscores_dict = xgb_rg.get_booster().get_fscore()
# Keep only features with an F-score greater than or equal to 4 (p < 0.05)
significant_cols = []
for k, v in fscores_dict.items():
    if v >= 4:
        current_col = int(k[1:])  # keys look like 'f12'; strip the leading 'f'
        significant_cols.append(current_col)
significant_cols = np.array(significant_cols)
print(significant_cols)  # 55 important features
print(len(significant_cols))
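To make those 'fN' booster keys readable, they can be mapped back to column names (a sketch; it assumes the booster numbered its features in the order of features with the State column dropped, matching how X was built above):
In [ ]:
# Sketch: translate the booster's 'fN' keys into feature names and rank them.
feat_names = [f for f in features if f != 'State']
fscores_named = pd.Series({feat_names[int(k[1:])]: v for k, v in fscores_dict.items()})
print(fscores_named.sort_values(ascending=False).head(10))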
In [146]:
# Plot these 55 features
num_features = 55
ax = plot_importance(xgb_rg, max_num_features=num_features)
_ = ax.set_yticklabels(features[cols_index][:num_features][::-1])
ax.figure.set_size_inches(5,15)
5 Forecast the Medicare Cost in 2017-2019¶
In [168]:
# Preparing data
gp = df_state_all.groupby(by=['Year'], as_index=False)
df_nation = gp.sum()
df_nation['Year'] = pd.to_datetime(df_nation['Year'], format='%Y')
df_nation.set_index(['Year'], inplace=True)
In [378]:
df_nation[targets]
Out[378]:
In [376]:
_ = plt.plot(df_nation.index, df_nation[targets]/1000)
_ = plt.ylabel('Per Capita Cost (thousand dollars)')
_ = plt.xlabel('Year')
_ = plt.title('Current data')
Forecast directly on the national-level total cost¶
In [173]:
from statsmodels.tsa.statespace.structural import UnobservedComponents
indpro_mod = UnobservedComponents(df_nation[targets],
                                  level=True,
                                  trend=True,
                                  stochastic_level=True,
                                  stochastic_trend=True)
indpro_res = indpro_mod.fit(method='powell', disp=False)
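This level/trend combination is the 'local linear trend' specification, which statsmodels also accepts as a string shorthand (equivalent, assuming a reasonably recent statsmodels version):
In [ ]:
# Equivalent shorthand for the same local linear trend specification:
indpro_mod = UnobservedComponents(df_nation[targets], level='local linear trend')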
In [217]:
#fig, ax = plt.subplots(figsize=(10,5))
cost_array = df_nation[targets].values
fres = indpro_res.get_forecast('2019-01-01')
cost_array = np.append(cost_array, fres.predicted_mean.values)
fres_ci = fres.conf_int()
lower_fill = fres_ci['lower Standardized Risk-Adjusted Per Capita Costs'].values
upper_fill = fres_ci['upper Standardized Risk-Adjusted Per Capita Costs'].values
In [231]:
fig, ax = plt.subplots(figsize=(10,5))
year_array = np.arange(2007, 2020,1)
ax.plot(year_array[:11], cost_array[:11]/1000)
ax.plot(year_array[10:], cost_array[10:]/1000)
_ = ax.fill_between([2017, 2018, 2019], lower_fill/1000, upper_fill/1000, alpha=0.1, color='#ff7f0e')
_ = plt.ylabel('Per Capita Cost (thousand dollars)')
plt.gca().set_xticks(year_array)
_ = plt.xlabel('Year')
plt.gcf().set_size_inches(7, 3)
_ = ax.text(2018, 415, 'Forecast', va='center', ha='center')
Out[231]:
Using a Kalman Filter and XGBoost¶
Forecast the 137 features for each state, then predict the cost with the XGBoost model.
In [339]:
def forecast_feature_by_state(df_state_all, features, state='AK', forecast_end=2019, concat=False):
    df_state_sub = df_state_all.loc[df_state_all['State'] == state, features]
    forecast_start = df_state_sub['Year'].values[-1] + 1
    df_state_sub['Year'] = pd.to_datetime(df_state_sub['Year'], format='%Y')
    df_state_sub.set_index('Year', inplace=True)
    # Deal with missing data; note that chaining .fillna(inplace=True) onto a
    # copy would be a no-op, so reassign instead
    df_state_sub = df_state_sub.fillna(method='ffill').fillna(method='bfill')
    # Iterate over each feature
    forecast_df = pd.DataFrame({}, columns=[df_state_sub.index.name] + df_state_sub.columns.tolist())
    forecast_df.set_index('Year', inplace=True)
    for n, f in enumerate(features):
        if f == 'State' or f == 'Year': continue
        indpro_mod = UnobservedComponents(df_state_sub[f],
                                          level=True,
                                          trend=True,
                                          stochastic_level=True,
                                          stochastic_trend=True)
        try:
            indpro_res = indpro_mod.fit(method='powell', disp=False)
            fres = indpro_res.get_forecast(str(forecast_end) + '-01-01')
            out_array = fres.predicted_mean.values
        except Exception:  # fall back to NaN if the fit or forecast fails
            out_array = np.nan * np.ones(forecast_end - forecast_start + 1)
        for a, y in zip(out_array, range(forecast_start, forecast_end + 1)):
            forecast_df.loc[pd.to_datetime(str(y) + '-01-01'), f] = a
    forecast_df['State'] = state
    if concat:
        forecast_df = df_state_sub.append(forecast_df)
    return forecast_df
In [342]:
# Initialize a dataframe to collect the forecast data for all states
forecast_all = pd.DataFrame({}, columns=features)
In [344]:
import warnings
warnings.filterwarnings("ignore")
# Iterate over all states
for st in df_state_all['State'].unique().tolist():
    print(st)
    forecast_df = forecast_feature_by_state(df_state_all, features, state=st, forecast_end=2019)
    # Turn Year back into an integer
    forecast_df = forecast_df.reset_index()
    forecast_df['Year'] = [int(str(y)[:4]) for y in forecast_df['Year'].values]
    forecast_all = forecast_all.append(forecast_df)
In [348]:
forecast_all = forecast_all[features]
forecast_all = forecast_all.sort_values(['Year', 'State'])
forecast_all.to_csv('./data/forecast_state.csv', index=False)
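One defensive check worth adding before predicting (a sketch): the booster is sensitive to column order, so confirm the forecast matrix matches the training layout.
In [ ]:
# Sketch: the model was trained on features minus 'State', in order;
# verify the forecast matrix lines up before trusting the predictions.
train_cols = [f for f in features if f != 'State']
assert list(forecast_all.drop(columns='State').columns) == train_cols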
In [349]:
# Apply xgboost model
y_forecast = xgb_rg.predict(forecast_all.drop(columns='State').values)
In [370]:
# Summarize
forecast_all['Forecasted Cost'] = y_forecast
gp = forecast_all.groupby(by=['Year'], as_index=False)
df_forecast_nation = gp.sum()
#df_forecast_nation['Year'] = pd.to_datetime(df_nation['Year'], format='%Y')
df_forecast_nation
Out[370]:
In [375]:
# Do the plots
fig, ax = plt.subplots(figsize=(10,5))
year_array = np.arange(2007, 2020,1)
cost_array[10:] = df_forecast_nation['Forecasted Cost'].values
ax.plot(year_array[:11], cost_array[:11]/1000)
ax.plot(year_array[10:], cost_array[10:]/1000)
_ = plt.ylabel('Per Capita Cost (thousand dollars)')
plt.gca().set_xticks(year_array)
_ = plt.xlabel('Year')
plt.gcf().set_size_inches(7, 3)
_ = ax.text(2018, 480, 'Forecast', va='center', ha='center', color='#ff7f0e')