Analysis of Medicare Expenditure Data¶

Explore State Data
Plot demographics of Each State
Standarized Per Capita Cost vs. Prevention Quality
Factors Impacting Medicare Expenditure Over Time
Forecast

In [1]:

%pwd
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
# Add some custom helper funcionts
exec(open('D:/Edward/Documents/Assignments/Scripts/Python/init_py.py').read())

In [ ]:

df_state_county_2016 = pd.read_excel('data/State_County_All_Report.xlsm',
                                     'State_county 2016', skiprows=1, na_values='*')

In [2]:

# Parse the %
def parse_percent(sheet):
    def f_parse_percent(percent_str):
        if not isinstance(percent_str, str): return
        return float(percent_str.replace('%', '').strip())

    for c in sheet.columns:
        if isinstance(sheet[c][0], str) and '%' in sheet[c][0]:
            # percentage to floats
            for i in sheet.index:
                if not isinstance(sheet.loc[i, c], str): continue
                sheet.loc[i, c] = f_parse_percent(sheet.loc[i, c])
    return sheet

In [ ]:

# Separate state and county in the county column
df_state_county_2016 = parse_percent(df_state_county_2016)
for i in df_state_county_2016.index:
    if i == 0: continue # skip national
    df_state_county_2016.loc[i, 'State'] = df_state_county_2016.loc[i, 'State'][:2]

In [ ]:

# Continue for next time
df_state_county_2016.to_csv('data/State_County_2016.csv', index=False)

In [3]:

df_state_county_2016 = pd.read_csv('data/State_County_2016.csv')

In [4]:

df_state_county_2016.head()

Out[4]:

	State	County	Beneficiaries with Part A and Part B	FFS Beneficiaries	MA Beneficiaries	MA Participation Rate	Average Age	Percent Female	Percent Male	Percent Non-Hispanic White	...	PQI11 Bacterial Pneumonia Admission Rate (age < 65)	PQI11 Bacterial Pneumonia Admission Rate (age 65-74)	PQI11 Bacterial Pneumonia Admission Rate (age 75+)	PQI12 UTI Admission Rate (age < 65)	PQI12 UTI Admission Rate (age 65-74)	PQI12 UTI Admission Rate (age 75+)	PQI15 Asthma in Younger Adults Admission Rate (age < 40)	PQI16 Lower Extremity Amputation Admission Rate (age < 65)	PQI16 Lower Extremity Amputation Admission Rate (age 65-74)	PQI16 Lower Extremity Amputation Admission Rate (age 75+)
0	National	National	53403309.0	33981644.0	19421665.0	36.37	71.0	54.88	45.12	79.57	...	575	407	1160	320	244	1040	186	190	58	52
1	AK	STATE TOTAL	79181.0	78265.0	916.0	1.16	70.0	50.36	49.64	74.33	...	506	272	951	197	130	627	NaN	163	31	57
2	AK	Aleutians East	110.0	110.0	0.0	0.00	72.0	44.55	55.45	NaN	...
3	AK	Aleutians West	NaN	142.0	NaN	NaN	70.0	50.00	50.00	NaN	...
4	AK	Anchorage	30394.0	30037.0	357.0	1.17	70.0	52.69	47.31	70.97	...

5 rows × 242 columns

1 Explore State Data¶

In [5]:

df_2016_state = df_state_county_2016.loc[df_state_county_2016['County'] == 'STATE TOTAL',:]
# Drop some states: PR: Puerto Rico, DC: District of Columbia, VI: virgin island
df_2016_state = df_2016_state.set_index('State')
df_2016_state = df_2016_state.drop(index=['PR', 'DC', 'VI'], axis=0)

In [6]:

df_2016_state.describe()

Out[6]:

	Beneficiaries with Part A and Part B	FFS Beneficiaries	MA Beneficiaries	MA Participation Rate	Average Age	Percent Female	Percent Male	Percent Non-Hispanic White	Percent African American	Percent Hispanic	...	Part B Drugs Standardized Costs	Part B Drugs Standardized Costs as % of Total Standardized Costs	Part B Drugs Per Capita Standardized Costs	Part B Drugs Per User Standardized Costs	# Part B Drugs Users	% of Beneficiaries Using Part B Drugs	Emergency Department Visits	Number of Acute Hospital Readmissions	Hospital Readmission Rate	Emergency Department Visits per 1000 Beneficiaries
count	5.000000e+01	5.000000e+01	5.000000e+01	50.000000	50.000000	50.000000	50.000000	50.000000	50.000000	50.000000	...	5.000000e+01	50.000000	50.000000	50.000000	5.000000e+01	50.000000	5.000000e+01	50.00000	50.000000	50.000000
mean	1.052091e+06	6.756657e+05	3.764251e+05	30.027200	71.320000	54.468800	45.531200	82.612400	7.472800	4.082800	...	2.908489e+08	4.289200	387.387400	751.723200	3.621756e+05	51.065800	4.621015e+05	30517.00000	16.962600	674.080000
std	1.075333e+06	6.078273e+05	4.881643e+05	13.094007	1.019003	1.461346	1.461346	11.603567	7.355684	5.160649	...	3.079078e+08	1.052734	106.384195	155.547972	3.314068e+05	7.057579	4.042967e+05	29927.67246	1.746014	80.200473
min	7.918100e+04	7.826500e+04	9.160000e+02	1.160000	69.000000	50.360000	42.900000	30.810000	0.220000	0.460000	...	1.695230e+07	1.930000	145.370000	406.710000	3.011400e+04	33.690000	4.544900e+04	1983.00000	12.750000	512.000000
25%	3.118230e+05	2.265815e+05	9.775725e+04	21.452500	71.000000	53.477500	44.450000	76.780000	1.812500	1.152500	...	7.881647e+07	3.557500	300.330000	667.090000	9.990275e+04	47.390000	1.467042e+05	7988.50000	15.742500	625.250000
50%	7.700850e+05	4.831460e+05	2.542365e+05	30.065000	71.000000	54.760000	45.240000	85.340000	5.370000	2.200000	...	1.986363e+08	4.505000	397.660000	772.470000	2.551820e+05	52.715000	3.769325e+05	21707.00000	17.340000	667.000000
75%	1.206511e+06	8.552588e+05	4.666140e+05	39.015000	72.000000	55.550000	46.522500	90.892500	10.377500	5.512500	...	3.529428e+08	5.092500	462.312500	870.572500	4.657168e+05	56.420000	6.373462e+05	41139.25000	18.307500	733.250000
max	5.336426e+06	2.813732e+06	2.522694e+06	60.750000	73.000000	57.100000	49.640000	96.260000	28.110000	28.980000	...	1.387269e+09	6.370000	618.610000	1086.540000	1.413679e+06	62.480000	1.730332e+06	122232.00000	19.810000	841.000000

8 rows × 213 columns

2 Plot demographics of each state¶

In [7]:

# Preparation for US state map outline
import bokeh
from bokeh.plotting import figure, output_notebook, show, output_file, output_notebook, ColumnDataSource
from bokeh.models import HoverTool
import json
with open(
    'D:/Edward/Documents/Assignments/Scripts/Python/Plots/resource/CBUS_states_scaled_20m.json',
    'r') as fid:
    states = json.load(fid)

del states['DC']
del states['PR']
# Get coordinates of the states
state_xs = [states[code]["lons"] for code in states]
state_ys = [states[code]["lats"] for code in states]
state_name=[states[code]['name'] for code in states]

In [8]:

from matplotlib.colors import rgb2hex
def value2color(values, cmap=plt.get_cmap('Blues')):
    colors = [rgb2hex(cmap(v)) for v in values]
    return colors
def sortColumns(df, column):
    values = [df.loc[code, column] for code in states] # sorting
    return values
def column2color(column, cmap=plt.get_cmap('Blues')):
    values = sortColumns(column)
    colors = value2color(values, cmap=cmap)
    return colors
def Create_geo_plot(df, column, title, width=600, height=300, cmap=plt.get_cmap('plasma'), topercent=True):
    # Aggregate all the data for the current plot
    values = sortColumns(df, column)
    color_values = value2color((np.array(values) - min(values)) / \
                               (max(values) - min(values)), cmap) # normalization for color
    data_source = ColumnDataSource(data=dict(
        x=state_xs, y=state_ys, name=state_name,
        value=["{:.2f}%".format(v) for v in values] if topercent else values,
        color=color_values))

    # Create the plot
    p = figure(title=title, toolbar_location='left',
              tools='pan, wheel_zoom, box_zoom, reset, hover, save',
              x_axis_location=None, y_axis_location=None, # turn off axes for maps
              plot_width=width, plot_height=height)
    p.grid.grid_line_color = None # set no grid

    # Color the states
    p.patches('x', 'y', source=data_source, fill_color='color',
              fill_alpha=1, line_color='white', line_width=0.5)

    hover = p.select_one(HoverTool)
    hover.point_policy='follow_mouse'
    hover.tooltips = [('State', '@name'),
                      (column,'@value')]

    output_notebook()
    show(p)

In [9]:

# Ethnicity African American
Create_geo_plot(df=df_2016_state, column='Percent African American',
                title="2016 % African Americans by State",cmap=plt.get_cmap('plasma'))

Loading BokehJS ...

In [10]:

# Ethnicity Hispanic
Create_geo_plot(df=df_2016_state, column='Percent Hispanic',
                title="2016 % Hispanic by State",cmap=plt.get_cmap('plasma'))

Loading BokehJS ...

In [11]:

# Ethnicity White
Create_geo_plot(df=df_2016_state, column='Percent Non-Hispanic White',
                title="2016 % White by State",cmap=plt.get_cmap('plasma'))

Loading BokehJS ...

In [12]:

# Plot care quality based on geographic locations
attributes = ["PQI11 Bacterial Pneumonia Admission Rate (age < 65)",
              "PQI11 Bacterial Pneumonia Admission Rate (age 65-74)",
              "PQI11 Bacterial Pneumonia Admission Rate (age 75+)"]
#attributes = df_2016_state.columns[216:241]
df_sub = df_2016_state[attributes]
df_sub = df_sub.apply(pd.to_numeric)

df_2016_state['Total Care Quality'] = df_sub[attributes].sum(axis=1)
Create_geo_plot(df=df_2016_state, column='Total Care Quality',
               title='Pneumonia 2016 Total Care Quality', cmap=plt.get_cmap('plasma'), topercent=False)

Loading BokehJS ...

D:\Anaconda3\lib\site-packages\bokeh\core\json_encoder.py:80: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  elif np.issubdtype(type(obj), np.float):

3 Standarized Per Capita Cost vs. Prevention Quality¶

(bacterial penumonia admission rate)

In [30]:

# pair wise corelation
def vis_corr_mat(rho, vmax=1.0, vmin=-1.0, center=0, ax=None, annot=False):
    mask = np.zeros_like(rho, dtype=np.bool)
    mask[:,:] = False
    #mask[np.triu_indices_from(mask)]=True
    if ax is None:
        fig, ax = plt.subplots(figsize=(5, 5))
    else:
        fig = ax.figure
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(rho, mask=mask, cmap=cmap, vmax=vmax, vmin=vmin, center=center, square=True,
                linewidth=0.5, cbar_kws={"shrink":.5}, annot=annot)

    return fig, ax

In [31]:

attributes = ["Standardized Risk-Adjusted Per Capita Costs",
              "PQI11 Bacterial Pneumonia Admission Rate (age < 65)",
              "PQI11 Bacterial Pneumonia Admission Rate (age 65-74)",
              "PQI11 Bacterial Pneumonia Admission Rate (age 75+)"] # do correlation those columns
df_plot = df_2016_state[attributes]
df_plot = df_plot.apply(pd.to_numeric)
df_plot.columns = ['Per Capita Cost', 'PQI11 (<65)', 'PQI11 (65-74)', 'PQI11 (75+)']

In [32]:

from pandas.plotting import scatter_matrix

scatter_matrix(df_plot, figsize=(8,6))
plt.suptitle('2016 Care Quality vs. Cost: PQI10 Dehydration Admission Rate')
plt.show()

In [33]:

corrs = np.corrcoef(df_plot.values.T)
fig, ax = vis_corr_mat(corrs, vmin=0, center=0.5, annot=True)
fig.set_size_inches(3, 3)
plt.gca().set_xticklabels(['Cost' ,'PQI11 (<65)', 'PQI11 (65-74)', 'PQI11 (75+)'], rotation=45)
plt.gca().set_yticklabels(['Cost', 'PQI11 (<65)', 'PQI11 (65-74)', 'PQI11 (75+)'], rotation=45)
plt.title('Peumonia Prevention Quality vs. Cost')
plt.show()

In [34]:

df_plot['Total Quality'] = df_plot['PQI11 (<65)']+df_plot['PQI11 (65-74)']+df_plot['PQI11 (75+)']
df_plot['Cost to quality ratio'] = df_plot['Per Capita Cost'] / df_plot['Total Quality']
df_plot = df_plot.sort_values('Cost to quality ratio')
df_plot['Quality to cost ratio'] = df_plot['Total Quality'] /df_plot['Per Capita Cost']
df_plot = df_plot.sort_values('Cost to quality ratio')
df_plot.head()

Out[34]:

	Per Capita Cost	PQI11 (<65)	PQI11 (65-74)	PQI11 (75+)	Total Quality	Cost to quality ratio	Quality to cost ratio
State
KY	9969.61	832	776	1774	3382	2.947844	0.339231
WV	9616.95	706	680	1567	2953	3.256671	0.307062
ND	10134.24	785	508	1680	2973	3.408759	0.293362
AR	10431.59	693	587	1730	3010	3.465645	0.288547
TN	10331.81	778	586	1569	2933	3.522608	0.283881

In [35]:

df_qual_vs_eth = df_2016_state[['Percent Non-Hispanic White', 'Percent African American',
                            'Percent Hispanic', 'Percent Other/Unknown']]

df_qual_vs_eth = df_qual_vs_eth.join(df_plot['Cost to quality ratio'])
df_qual_vs_eth.columns = ['White', 'African American', 'Hispanic', 'Other', 'Quality']
df_qual_vs_eth.head()

Out[35]:

	White	African American	Hispanic	Other	Quality
State
AK	74.33	2.63	2.61	20.44	5.214997
AL	77.96	20.02	0.62	1.39	4.071006
AR	86.29	10.74	1.20	1.77	3.465645
AZ	82.83	2.39	7.96	6.82	5.888230
CA	64.20	5.42	16.73	13.64	5.708042

In [36]:

# Cost to quality ratio vs. Percent Hispanic, white, and African american
scatter_matrix(df_qual_vs_eth, figsize=(8,6))
plt.suptitle('2016 Pneumonia Care Quality vs. Ethnicity')
plt.show()

In [37]:

corrs = np.corrcoef(df_qual_vs_eth.values.T)
fig, ax = vis_corr_mat(corrs, annot=True)
fig.set_size_inches(3, 3)
plt.gca().set_xticklabels(['White', 'African American', 'Hispanic', 'Other', 'Quality'], rotation=45)
plt.gca().set_yticklabels(['White', 'African American', 'Hispanic', 'Other', 'Quality'], rotation=45)
plt.title('Quality vs. Ethnicity')
plt.show()

4 Factors Impacting Medicare Expenditure Over Time¶

In [132]:

df_state_all = pd.read_csv('./data/State_all.csv')
df_state_all.head()

Out[132]:

	State	Year	Beneficiaries with Part A and Part B	FFS Beneficiaries	MA Beneficiaries	MA Participation Rate	Average Age	Percent Female	Percent Male	Percent Non-Hispanic White	...	PQI11 Bacterial Pneumonia Admission Rate (age < 65)	PQI11 Bacterial Pneumonia Admission Rate (age 65-74)	PQI11 Bacterial Pneumonia Admission Rate (age 75+)	PQI12 UTI Admission Rate (age < 65)	PQI12 UTI Admission Rate (age 65-74)	PQI12 UTI Admission Rate (age 75+)	PQI15 Asthma in Younger Adults Admission Rate (age < 40)	PQI16 Lower Extremity Amputation Admission Rate (age < 65)	PQI16 Lower Extremity Amputation Admission Rate (age 65-74)	PQI16 Lower Extremity Amputation Admission Rate (age 75+)
0	AK	2007	53634.0	53203.0	431.0	0.80	70.0	50.93	49.07	74.65	...	853	858	1913	380.0	164	886	NaN	123.0	NaN	NaN
1	AL	2007	773340.0	645421.0	127919.0	16.54	70.0	56.60	43.40	80.45	...	1045	1016	2354	484.0	450	1491	524.0	138.0	71.0	80.0
2	AR	2007	490161.0	430612.0	59549.0	12.15	70.0	55.68	44.32	88.06	...	1188	1153	2919	409.0	376	1477	391.0	99.0	55.0	69.0
3	AZ	2007	807720.0	496814.0	310906.0	38.49	72.0	52.88	47.12	85.32	...	755	582	1552	369.0	230	864	306.0	133.0	25.0	33.0
4	CA	2007	4055674.0	2520032.0	1535642.0	37.86	71.0	54.53	45.47	66.02	...	747	610	1894	385.0	279	1128	293.0	106.0	45.0	46.0

5 rows × 242 columns

Question: Which factors are important for contributing high Medicare expenditure?

In [133]:

# Hand picking features after understanding the importance of each column!
feature_cols = [[1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15], # basic demographics
                [28, 29, 30, 31, 32, 33], # IP: 32/33 for utility rate (stays vs. days covered per 1000) 
                [40, 41, 42, 43, 44, 45], # PAC: LTCH: 44/45 for utility rate
                [52, 53, 54, 55, 56, 57], # PAC: IRF: 56/57 for utility rate
                [64, 65, 66, 67, 68, 69], # PAC: SNF: 68/69 for utility rate
                [76, 77, 78, 79, 80, 81], # PAC: HH: 80/81 for utility rate
                [88, 89, 90, 91, 92, 93], # Hospice: 92/93 for utility rate
                [100, 101, 102, 103, 104],# OP: 104: visits
                [111, 112, 113, 114, 115],# FQHC/RHC, 115: visits
                [122, 123, 124, 125, 126],# Outpatient dialysis, 126: events
                [133, 134, 135, 136, 137],# ASC: 137: events
                [144, 145, 146, 147, 148],# EM: 148: events
                [155, 156, 157, 158, 159],# Procedure: 159: events
                [166, 167, 168, 169, 170],# Imaging: 170: events
                [177, 178, 179, 180, 181],# DME: 181: events
                [188, 189, 190, 191, 192],# Tests: 192: events
                [199, 200, 201, 202, 203],# Ambulence: 203: events
                [210, 211, 212, 213], # Part B drug costs
                list(range(214, 243)),# quality of care metrics
                list(range(243, 249)) # utility rate to be calculated
               ]
targets = 'Standardized Risk-Adjusted Per Capita Costs'

In [134]:

# Add 6 columns that calculte the utility rate of 6 services
df_state_all['IP Utility Rate'] = df_state_all.iloc[:, 32-1] / df_state_all.iloc[:, 33-1]
df_state_all['PAC: LTCH Utility Rate'] =  df_state_all.iloc[:, 44-1] / df_state_all.iloc[:, 45-1]
df_state_all['PAC: IRF Utility Rate'] =  df_state_all.iloc[:, 56-1] / df_state_all.iloc[:, 57-1]
df_state_all['PAC: SNF Utility Rate'] =  df_state_all.iloc[:, 68-1] / df_state_all.iloc[:, 69-1]
df_state_all['PAC: HH Utility Rate'] =  df_state_all.iloc[:, 80-1] / df_state_all.iloc[:, 81-1]
df_state_all['Hospice Utility Rate'] =  df_state_all.iloc[:, 92-1] / df_state_all.iloc[:, 93-1]

In [135]:

index = np.array([item for sublist in feature_cols for item in sublist])-1
features = df_state_all.columns[index].values
len(features)

Out[135]:

In [136]:

def extract_important_features(feature_importances, tol=1E-6):
    # filter out features that has minimal importance
    important_feature_ind = np.where(feature_importances > 1E-6)[0]
    important_features = feature_importances[important_feature_ind]
    sorted_important_feature_ind = np.array([x for _,x in sorted(
                    zip(important_features,important_feature_ind))])[::-1]
    sorted_important_features = feature_importances[sorted_important_feature_ind]
    return sorted_important_features, sorted_important_feature_ind

In [137]:

# Use XGBoost to determine the contributing factors of per capita cost over the years
import xgboost as xgb
from xgboost import XGBRegressor, plot_importance
xgb_rg = XGBRegressor()
X = df_state_all[features].drop(columns=['State']).values
y = df_state_all[targets].values

xgb_rg.fit(X, y)
important_features, cols_index = extract_important_features(xgb_rg.feature_importances_)
cols_index = cols_index+1 # since we dropped state column

In [138]:

# Save for other analyses
np.savez('./data/xgboost_state_feature_importance.npz', feature_cols=feature_cols,
         features=features, important_features=important_features, cols_index=cols_index)

In [140]:

# Features ranked by its importance
features[cols_index]

Out[140]:

array(['OP Per User Standardized Costs',
       'IP Per Capita Standardized Costs', 'Year',
       'Percent Eligible for Medicaid',
       'PAC: LTCH Per Capita Standardized Costs',
       'Beneficiaries with Part A and Part B',
       'OP Per Capita Standardized Costs',
       'PAC: HH Per Capita Standardized Costs',
       'IP Per User Standardized Costs',
       'Procedures Per Capita Standardized Costs',
       'PAC: SNF Per User Standardized Costs', 'PAC: SNF Utility Rate',
       '% of Beneficiaries Using E&M',
       'PAC: LTCH Per User Standardized Costs',
       'Part B Drugs Per Capita Standardized Costs',
       'FQHC/RHC Per User Standardized Costs',
       'PAC: SNF Per Capita Standardized Costs',
       'PQI10 Dehydration Admission Rate (age < 65)',
       'Procedures Per User Standardized Costs',
       'ASC Per Capita Standardized Costs',
       'Outpatient Dialysis Facility Per User Standardized Costs',
       '% of Beneficiaries Using Hospice',
       'PQI11 Bacterial Pneumonia Admission Rate (age < 65)',
       'OP Visits Per 1000 Beneficiaries',
       'PAC: IRF Per User Standardized Costs',
       'PAC: IRF Per Capita Standardized Costs', 'Percent Other/Unknown',
       'Percent Non-Hispanic White', 'MA Beneficiaries',
       'PQI12 UTI Admission Rate (age < 65)',
       'PQI07 Hypertension Admission Rate (age < 65)',
       '% of Beneficiaries Using Ambulance',
       'Hospice Per Capita Standardized Costs',
       'PAC: LTCH Covered Days Per 1000 Beneficiaries',
       'PQI10 Dehydration Admission Rate (age 65-74)',
       'Hospital Readmission Rate',
       'Part B Drugs Per User Standardized Costs',
       'ASC Events Per 1000 Beneficiaries',
       'FQHC/RHC Visits Per 1000 Beneficiaries',
       'Hospice Per User Standardized Costs',
       'PAC: HH Episodes Per 1000 Beneficiaries',
       '% of Beneficiaries Using PAC: LTCH',
       '# PAC: LTCH Users (with a covered stay)', 'Percent Female',
       'PQI16 Lower Extremity Amputation Admission Rate (age 65-74)',
       'PQI10 Dehydration Admission Rate (age 75+)',
       'PQI07 Hypertension Admission Rate (age 75+)',
       'PQI07 Hypertension Admission Rate (age 65-74)',
       '# FQHC/RHC Users', 'PAC: HH Visits Per 1000 Beneficiaries',
       '% of Beneficiaries Using PAC: HH',
       'PAC: HH Per User Standardized Costs',
       'PAC: IRF Covered Days Per 1000 Beneficiaries',
       '% of Beneficiaries Using IP', 'FFS Beneficiaries',
       'PQI08 CHF Admission Rate (age 75+)',
       'PQI08 CHF Admission Rate (age < 65)',
       '% of Beneficiaries Using Part B Drugs',
       'Ambulance Events Per 1000 Beneficiaries',
       'Ambulance Per User Standardized Costs',
       'Ambulance Per Capita Standardized Costs',
       '% of Beneficiaries Using DME',
       'Imaging Events Per 1000 Beneficiaries',
       'Procedure Events Per 1000 Beneficiaries',
       '% of Beneficiaries Using Procedures',
       'E&M Events Per 1000 Beneficiaries',
       '% of Beneficiaries Using ASC',
       'Outpatient Dialysis Facility Per Capita Standardized Costs',
       '% of Beneficiaries Using FQHC/RHC', '% of Beneficiaries Using OP',
       'Hospice Covered Days Per 1000 Beneficiaries',
       'Hospice Covered Stays Per 1000 Beneficiaries',
       'IP Users (with a covered stay)', 'Percent African American',
       'MA Participation Rate', 'Hospice Utility Rate',
       'PAC: LTCH Utility Rate', 'IP Utility Rate',
       'PQI16 Lower Extremity Amputation Admission Rate (age < 65)',
       'PQI15 Asthma in Younger Adults Admission Rate (age < 40)',
       'PQI12 UTI Admission Rate (age 65-74)',
       'PQI11 Bacterial Pneumonia Admission Rate (age 65-74)',
       'PQI05 COPD or Asthma in Older Adults Admission Rate (age 75+)',
       'PQI05 COPD or Asthma in Older Adults Admission Rate (age 40-64)',
       'Emergency Department Visits per 1000 Beneficiaries',
       'Tests Per Capita Standardized Costs', '# Procedure Users',
       'E&M Per Capita Standardized Costs',
       'ASC Per User Standardized Costs',
       'IP Covered Stays Per 1000 Beneficiaries', 'Average HCC Score',
       'Percent Hispanic', 'PAC: IRF Utility Rate',
       'PQI12 UTI Admission Rate (age 75+)',
       'PQI05 COPD or Asthma in Older Adults Admission Rate (age 65-74)',
       'PQI03 Diabetes LT Complication Admission Rate (age < 65)',
       '% of Beneficiaries Using Tests', '# Test Users', '# DME Users',
       'DME Per Capita Standardized Costs',
       'Imaging Per Capita Standardized Costs',
       'Outpatient Dialysis Facility Events Per 1000 Beneficiaries',
       '% of Beneficiaries Using Outpatient Dialysis Facility',
       'FQHC/RHC Per Capita Standardized Costs', '# OP Users',
       'PAC: SNF Covered Days Per 1000 Beneficiaries',
       'PAC: IRF Covered Stays Per 1000 Beneficiaries',
       '% of Beneficiaries Using PAC: IRF',
       'PAC: LTCH Covered Stays Per 1000 Beneficiaries'], dtype=object)

In [141]:

cols_index

Out[141]:

array([ 51,  14,   1,  12,  20,   2,  50,  38,  15,  75,  33, 136,  73,
        21, 100,  56,  32, 120,  76,  65,  61,  47, 123,  54,  27,  26,
        11,   8,   4, 126, 114,  98,  44,  25, 121, 106, 101,  69,  59,
        45,  42,  23,  22,   7, 131, 122, 116, 115,  57,  43,  41,  39,
        31,  17,   3, 119, 117, 103,  99,  96,  95,  88,  84,  79,  78,
        74,  68,  60,  58,  53,  49,  48,  16,   9,   5, 138, 134, 133,
       130, 129, 127, 124, 113, 111, 107,  90,  77,  70,  66,  18,  13,
        10, 135, 128, 112, 108,  93,  92,  87,  85,  80,  64,  63,  55,
        52,  37,  30,  29,  24], dtype=int64)

In [142]:

num_features = 10
ax = plot_importance(xgb_rg, max_num_features=num_features)
_ =  ax.set_yticklabels(features[cols_index][:num_features][::-1])

In [145]:

fscores_dict = xgb_rg.get_booster().get_fscore()
# Get features with F-scores that's only greater than or equal to 4 (p<0.05)
significant_cols = []
for k, v in fscores_dict.items():
    if v >= 4:
        current_col = int(k[1:])
        significant_cols.append(current_col)
significant_cols = np.array(significant_cols)
print(significant_cols) # 55 important features
print(len(significant_cols))

[ 74  10  32  50   7  19  14  37  49 113  26  46  31  43  41  99  72   0
  11  22 119  25  38  40   6  20 122  64  55  13   3 125 120  97  60  42
 115  44 135  30 114 100 130 121  56   1  21  24  68  75   2  53  58 105
  16]
55

In [146]:

# Plot these 55 features
num_features = 55
ax = plot_importance(xgb_rg, max_num_features=num_features)
_ =  ax.set_yticklabels(features[cols_index][:num_features][::-1])
ax.figure.set_size_inches(5,15)

5 Forecast the Medicare Cost in 2017 and 2018¶

In [168]:

# Preparing data
gp = df_state_all.groupby(by=['Year'], as_index=False)
df_nation = gp.sum()
df_nation['Year'] = pd.to_datetime(df_nation['Year'], format='%Y')
df_nation.set_index(['Year'], inplace=True)

In [378]:

df_nation[targets]

Out[378]:

Year
2007-01-01    408185.46
2008-01-01    435001.87
2009-01-01    449867.98
2010-01-01    461558.53
2011-01-01    459797.58
2012-01-01    476134.91
2013-01-01    479433.30
2014-01-01    475021.95
2015-01-01    493828.59
2016-01-01    500015.52
Name: Standardized Risk-Adjusted Per Capita Costs, dtype: float64

In [376]:

_ = plt.plot(df_nation.index, df_nation[targets]/1000)
_ = plt.ylabel('Per Capita Cost (thousand dollars)')
_ = plt.xlabel('Year')
_ = plt.title('Current data')

Forecast directly on the cost of the national level total cost¶

In [173]:

from statsmodels.tsa.statespace.structural import UnobservedComponents

indpro_mod = UnobservedComponents(df_nation[targets],
                                  level=True,
                                  trend=True,
                                  stochastic_level=True,
                                  stochastic_trend=True)
indpro_res = indpro_mod.fit(method='powell', disp=False)

In [217]:

#fig, ax = plt.subplots(figsize=(10,5))

cost_array = df_nation[targets].values
fres = indpro_res.get_forecast('2019-01-01')
cost_array = np.append(cost_array, fres.predicted_mean.values)
fres_ci = fres.conf_int()
lower_fill = fres_ci['lower Standardized Risk-Adjusted Per Capita Costs'].values
upper_fill = fres_ci['upper Standardized Risk-Adjusted Per Capita Costs'].values

In [231]:

fig, ax = plt.subplots(figsize=(10,5))
year_array = np.arange(2007, 2020,1)
ax.plot(year_array[:11], cost_array[:11]/1000)
ax.plot(year_array[10:], cost_array[10:]/1000)
_ = ax.fill_between([2017, 2018, 2019], lower_fill/1000, upper_fill/1000, alpha=0.1, color='#ff7f0e')
_ = plt.ylabel('Per Capita Cost (thousand dollars)')
plt.gca().set_xticks(year_array)
_ = plt.xlabel('Year')
plt.gcf().set_size_inches(7, 3)
_ = ax.text(2018, 415, 'Forecast', va='center', ha='center')

Out[231]:

Text(2018,415,'Forecast')

Using Kalman Filter and XGBoost¶

Forecast the 137 features by each state and predict with the XGBoost model

In [339]:

def forecast_feature_by_state(df_state_all, features, state='AK', forecast_end=2019, concat=False):
    df_state_sub = df_state_all.loc[df_state_all['State'] == state, features]
    forecast_start = df_state_sub['Year'].values[-1]+1
    df_state_sub['Year'] = pd.to_datetime(df_state_sub['Year'], format='%Y')
    df_state_sub.set_index('Year', inplace=True)
    df_state_sub.fillna(method='ffill').fillna(method='bfill', inplace=True)
    # iterate over each feature
    forecast_df = pd.DataFrame({}, columns = [df_state_sub.index.name] + df_state_sub.columns.tolist())
    forecast_df.set_index('Year', inplace=True)
    for n, f in enumerate(features):
        if f == 'State' or f == 'Year': continue
        # deal with missing data

        indpro_mod = UnobservedComponents(df_state_sub[f],
                                          level=True,
                                          trend=True,
                                          stochastic_level=True,
                                          stochastic_trend=True)
        try:
            indpro_res = indpro_mod.fit(method='powell', disp=False)
            fres = indpro_res.get_forecast(str(forecast_end)+'-01-01')
            out_array = fres.predicted_mean.values
        except:
            out_array = np.nan * np.ones(forecast_end-forecast_start+1)
        for a, y in zip(out_array, range(forecast_start, forecast_end+1)):
            forecast_df.loc[pd.to_datetime(str(y)+'-01-01'), f] = a

    forecast_df['State'] = state
    if concat:
        forecast_df = df_state_sub.append(forecast_df)

    return forecast_df

In [342]:

# INitialize a dataframe to save all the states forecast data
forecast_all = pd.DataFrame({}, columns=features)

In [344]:

import warnings
warnings.filterwarnings("ignore")
# Iterate over all states
for st in df_state_all['State'].unique().tolist():
    print(st)
    forecast_df = forecast_feature_by_state(df_state_all, features, state=st, forecast_end=2019)
    # Turn Year back to number
    forecast_df = forecast_df.reset_index()
    forecast_df['Year'] = [int(str(y)[:4]) for y in forecast_df['Year'].values]
    forecast_all = forecast_all.append(forecast_df)

In [348]:

forecast_all = forecast_all[features]
forecast_all = forecast_all.sort_values(['Year', 'State'])
forecast_all.to_csv('./data/forecast_state.csv', index=False)

In [349]:

# Apply xgboost model
y_forecast = xgb_rg.predict(forecast_all.drop(columns='State', axis=1).values)

In [370]:

# Summarize
forecast_all['Forecasted Cost'] = y_forecast
gp = forecast_all.groupby(by=['Year'], as_index=False)
df_forecast_nation = gp.sum()
#df_forecast_nation['Year'] = pd.to_datetime(df_nation['Year'], format='%Y')

df_forecast_nation

Out[370]:

	Year	Forecasted Cost
0	2017	500057.87500
1	2018	501675.37500
2	2019	501400.15625

In [375]:

# Do the plots
fig, ax = plt.subplots(figsize=(10,5))
year_array = np.arange(2007, 2020,1)
cost_array[10:] = df_forecast_nation['Forecasted Cost'].values
ax.plot(year_array[:11], cost_array[:11]/1000)
ax.plot(year_array[10:], cost_array[10:]/1000)
_ = plt.ylabel('Per Capita Cost (thousand dollars)')
plt.gca().set_xticks(year_array)
_ = plt.xlabel('Year')
plt.gcf().set_size_inches(7, 3)
_ = ax.text(2018, 480, 'Forecast', va='center', ha='center', color='#ff7f0e')