Medicare Expenditure

 
Data Incubator Capstone Project - State Level
In [1]:
%pwd
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
# Add some custom helper funcionts
exec(open('D:/Edward/Documents/Assignments/Scripts/Python/init_py.py').read())
In [ ]:
df_state_county_2016 = pd.read_excel('data/State_County_All_Report.xlsm',
                                     'State_county 2016', skiprows=1, na_values='*')
In [2]:
# Parse the %
def parse_percent(sheet):
    def f_parse_percent(percent_str):
        if not isinstance(percent_str, str): return
        return float(percent_str.replace('%', '').strip())

    for c in sheet.columns:
        if isinstance(sheet[c][0], str) and '%' in sheet[c][0]:
            # percentage to floats
            for i in sheet.index:
                if not isinstance(sheet.loc[i, c], str): continue
                sheet.loc[i, c] = f_parse_percent(sheet.loc[i, c])
    return sheet
In [ ]:
# Separate state and county in the county column
df_state_county_2016 = parse_percent(df_state_county_2016)
for i in df_state_county_2016.index:
    if i == 0: continue # skip national
    df_state_county_2016.loc[i, 'State'] = df_state_county_2016.loc[i, 'State'][:2]
In [ ]:
# Continue for next time
df_state_county_2016.to_csv('data/State_County_2016.csv', index=False)
In [3]:
df_state_county_2016 = pd.read_csv('data/State_County_2016.csv')
In [4]:
df_state_county_2016.head()
Out[4]:
State County Beneficiaries with Part A and Part B FFS Beneficiaries MA Beneficiaries MA Participation Rate Average Age Percent Female Percent Male Percent Non-Hispanic White ... PQI11 Bacterial Pneumonia Admission Rate (age < 65) PQI11 Bacterial Pneumonia Admission Rate (age 65-74) PQI11 Bacterial Pneumonia Admission Rate (age 75+) PQI12 UTI Admission Rate (age < 65) PQI12 UTI Admission Rate (age 65-74) PQI12 UTI Admission Rate (age 75+) PQI15 Asthma in Younger Adults Admission Rate (age < 40) PQI16 Lower Extremity Amputation Admission Rate (age < 65) PQI16 Lower Extremity Amputation Admission Rate (age 65-74) PQI16 Lower Extremity Amputation Admission Rate (age 75+)
0 National National 53403309.0 33981644.0 19421665.0 36.37 71.0 54.88 45.12 79.57 ... 575 407 1160 320 244 1040 186 190 58 52
1 AK STATE TOTAL 79181.0 78265.0 916.0 1.16 70.0 50.36 49.64 74.33 ... 506 272 951 197 130 627 NaN 163 31 57
2 AK Aleutians East 110.0 110.0 0.0 0.00 72.0 44.55 55.45 NaN ...
3 AK Aleutians West NaN 142.0 NaN NaN 70.0 50.00 50.00 NaN ...
4 AK Anchorage 30394.0 30037.0 357.0 1.17 70.0 52.69 47.31 70.97 ...

5 rows × 242 columns

1 Explore State Data

In [5]:
df_2016_state = df_state_county_2016.loc[df_state_county_2016['County'] == 'STATE TOTAL',:]
# Drop some states: PR: Puerto Rico, DC: District of Columbia, VI: virgin island
df_2016_state = df_2016_state.set_index('State')
df_2016_state = df_2016_state.drop(index=['PR', 'DC', 'VI'], axis=0)
In [6]:
df_2016_state.describe()
Out[6]:
Beneficiaries with Part A and Part B FFS Beneficiaries MA Beneficiaries MA Participation Rate Average Age Percent Female Percent Male Percent Non-Hispanic White Percent African American Percent Hispanic ... Part B Drugs Standardized Costs Part B Drugs Standardized Costs as % of Total Standardized Costs Part B Drugs Per Capita Standardized Costs Part B Drugs Per User Standardized Costs # Part B Drugs Users % of Beneficiaries Using Part B Drugs Emergency Department Visits Number of Acute Hospital Readmissions Hospital Readmission Rate Emergency Department Visits per 1000 Beneficiaries
count 5.000000e+01 5.000000e+01 5.000000e+01 50.000000 50.000000 50.000000 50.000000 50.000000 50.000000 50.000000 ... 5.000000e+01 50.000000 50.000000 50.000000 5.000000e+01 50.000000 5.000000e+01 50.00000 50.000000 50.000000
mean 1.052091e+06 6.756657e+05 3.764251e+05 30.027200 71.320000 54.468800 45.531200 82.612400 7.472800 4.082800 ... 2.908489e+08 4.289200 387.387400 751.723200 3.621756e+05 51.065800 4.621015e+05 30517.00000 16.962600 674.080000
std 1.075333e+06 6.078273e+05 4.881643e+05 13.094007 1.019003 1.461346 1.461346 11.603567 7.355684 5.160649 ... 3.079078e+08 1.052734 106.384195 155.547972 3.314068e+05 7.057579 4.042967e+05 29927.67246 1.746014 80.200473
min 7.918100e+04 7.826500e+04 9.160000e+02 1.160000 69.000000 50.360000 42.900000 30.810000 0.220000 0.460000 ... 1.695230e+07 1.930000 145.370000 406.710000 3.011400e+04 33.690000 4.544900e+04 1983.00000 12.750000 512.000000
25% 3.118230e+05 2.265815e+05 9.775725e+04 21.452500 71.000000 53.477500 44.450000 76.780000 1.812500 1.152500 ... 7.881647e+07 3.557500 300.330000 667.090000 9.990275e+04 47.390000 1.467042e+05 7988.50000 15.742500 625.250000
50% 7.700850e+05 4.831460e+05 2.542365e+05 30.065000 71.000000 54.760000 45.240000 85.340000 5.370000 2.200000 ... 1.986363e+08 4.505000 397.660000 772.470000 2.551820e+05 52.715000 3.769325e+05 21707.00000 17.340000 667.000000
75% 1.206511e+06 8.552588e+05 4.666140e+05 39.015000 72.000000 55.550000 46.522500 90.892500 10.377500 5.512500 ... 3.529428e+08 5.092500 462.312500 870.572500 4.657168e+05 56.420000 6.373462e+05 41139.25000 18.307500 733.250000
max 5.336426e+06 2.813732e+06 2.522694e+06 60.750000 73.000000 57.100000 49.640000 96.260000 28.110000 28.980000 ... 1.387269e+09 6.370000 618.610000 1086.540000 1.413679e+06 62.480000 1.730332e+06 122232.00000 19.810000 841.000000

8 rows × 213 columns

2 Plot demographics of each state

In [7]:
# Preparation for US state map outline
import bokeh
from bokeh.plotting import figure, output_notebook, show, output_file, output_notebook, ColumnDataSource
from bokeh.models import HoverTool
import json
with open(
    'D:/Edward/Documents/Assignments/Scripts/Python/Plots/resource/CBUS_states_scaled_20m.json',
    'r') as fid:
    states = json.load(fid)

del states['DC']
del states['PR']
# Get coordinates of the states
state_xs = [states[code]["lons"] for code in states]
state_ys = [states[code]["lats"] for code in states]
state_name=[states[code]['name'] for code in states]
In [8]:
from matplotlib.colors import rgb2hex
def value2color(values, cmap=plt.get_cmap('Blues')):
    colors = [rgb2hex(cmap(v)) for v in values]
    return colors
def sortColumns(df, column):
    values = [df.loc[code, column] for code in states] # sorting
    return values
def column2color(column, cmap=plt.get_cmap('Blues')):
    values = sortColumns(column)
    colors = value2color(values, cmap=cmap)
    return colors
def Create_geo_plot(df, column, title, width=600, height=300, cmap=plt.get_cmap('plasma'), topercent=True):
    # Aggregate all the data for the current plot
    values = sortColumns(df, column)
    color_values = value2color((np.array(values) - min(values)) / \
                               (max(values) - min(values)), cmap) # normalization for color
    data_source = ColumnDataSource(data=dict(
        x=state_xs, y=state_ys, name=state_name,
        value=["{:.2f}%".format(v) for v in values] if topercent else values,
        color=color_values))

    # Create the plot
    p = figure(title=title, toolbar_location='left',
              tools='pan, wheel_zoom, box_zoom, reset, hover, save',
              x_axis_location=None, y_axis_location=None, # turn off axes for maps
              plot_width=width, plot_height=height)
    p.grid.grid_line_color = None # set no grid

    # Color the states
    p.patches('x', 'y', source=data_source, fill_color='color',
              fill_alpha=1, line_color='white', line_width=0.5)

    hover = p.select_one(HoverTool)
    hover.point_policy='follow_mouse'
    hover.tooltips = [('State', '@name'),
                      (column,'@value')]

    output_notebook()
    show(p)
In [9]:
# Ethnicity African American
Create_geo_plot(df=df_2016_state, column='Percent African American',
                title="2016 % African Americans by State",cmap=plt.get_cmap('plasma'))
Loading BokehJS ...
In [10]:
# Ethnicity Hispanic
Create_geo_plot(df=df_2016_state, column='Percent Hispanic',
                title="2016 % Hispanic by State",cmap=plt.get_cmap('plasma'))
Loading BokehJS ...
In [11]:
# Ethnicity White
Create_geo_plot(df=df_2016_state, column='Percent Non-Hispanic White',
                title="2016 % White by State",cmap=plt.get_cmap('plasma'))
Loading BokehJS ...
In [12]:
# Plot care quality based on geographic locations
attributes = ["PQI11 Bacterial Pneumonia Admission Rate (age < 65)",
              "PQI11 Bacterial Pneumonia Admission Rate (age 65-74)",
              "PQI11 Bacterial Pneumonia Admission Rate (age 75+)"]
#attributes = df_2016_state.columns[216:241]
df_sub = df_2016_state[attributes]
df_sub = df_sub.apply(pd.to_numeric)

df_2016_state['Total Care Quality'] = df_sub[attributes].sum(axis=1)
Create_geo_plot(df=df_2016_state, column='Total Care Quality',
               title='Pneumonia 2016 Total Care Quality', cmap=plt.get_cmap('plasma'), topercent=False)
Loading BokehJS ...
D:\Anaconda3\lib\site-packages\bokeh\core\json_encoder.py:80: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  elif np.issubdtype(type(obj), np.float):

3 Standarized Per Capita Cost vs. Prevention Quality

(bacterial penumonia admission rate)

In [30]:
# pair wise corelation
def vis_corr_mat(rho, vmax=1.0, vmin=-1.0, center=0, ax=None, annot=False):
    mask = np.zeros_like(rho, dtype=np.bool)
    mask[:,:] = False
    #mask[np.triu_indices_from(mask)]=True
    if ax is None:
        fig, ax = plt.subplots(figsize=(5, 5))
    else:
        fig = ax.figure
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(rho, mask=mask, cmap=cmap, vmax=vmax, vmin=vmin, center=center, square=True,
                linewidth=0.5, cbar_kws={"shrink":.5}, annot=annot)

    return fig, ax
In [31]:
attributes = ["Standardized Risk-Adjusted Per Capita Costs",
              "PQI11 Bacterial Pneumonia Admission Rate (age < 65)",
              "PQI11 Bacterial Pneumonia Admission Rate (age 65-74)",
              "PQI11 Bacterial Pneumonia Admission Rate (age 75+)"] # do correlation those columns
df_plot = df_2016_state[attributes]
df_plot = df_plot.apply(pd.to_numeric)
df_plot.columns = ['Per Capita Cost', 'PQI11 (<65)', 'PQI11 (65-74)', 'PQI11 (75+)']
In [32]:
from pandas.plotting import scatter_matrix

scatter_matrix(df_plot, figsize=(8,6))
plt.suptitle('2016 Care Quality vs. Cost: PQI10 Dehydration Admission Rate')
plt.show()
In [33]:
corrs = np.corrcoef(df_plot.values.T)
fig, ax = vis_corr_mat(corrs, vmin=0, center=0.5, annot=True)
fig.set_size_inches(3, 3)
plt.gca().set_xticklabels(['Cost' ,'PQI11 (<65)', 'PQI11 (65-74)', 'PQI11 (75+)'], rotation=45)
plt.gca().set_yticklabels(['Cost', 'PQI11 (<65)', 'PQI11 (65-74)', 'PQI11 (75+)'], rotation=45)
plt.title('Peumonia Prevention Quality vs. Cost')
plt.show()
In [34]:
df_plot['Total Quality'] = df_plot['PQI11 (<65)']+df_plot['PQI11 (65-74)']+df_plot['PQI11 (75+)']
df_plot['Cost to quality ratio'] = df_plot['Per Capita Cost'] / df_plot['Total Quality']
df_plot = df_plot.sort_values('Cost to quality ratio')
df_plot['Quality to cost ratio'] = df_plot['Total Quality'] /df_plot['Per Capita Cost']
df_plot = df_plot.sort_values('Cost to quality ratio')
df_plot.head()
Out[34]:
Per Capita Cost PQI11 (<65) PQI11 (65-74) PQI11 (75+) Total Quality Cost to quality ratio Quality to cost ratio
State
KY 9969.61 832 776 1774 3382 2.947844 0.339231
WV 9616.95 706 680 1567 2953 3.256671 0.307062
ND 10134.24 785 508 1680 2973 3.408759 0.293362
AR 10431.59 693 587 1730 3010 3.465645 0.288547
TN 10331.81 778 586 1569 2933 3.522608 0.283881
In [35]:
df_qual_vs_eth = df_2016_state[['Percent Non-Hispanic White', 'Percent African American',
                            'Percent Hispanic', 'Percent Other/Unknown']]

df_qual_vs_eth = df_qual_vs_eth.join(df_plot['Cost to quality ratio'])
df_qual_vs_eth.columns = ['White', 'African American', 'Hispanic', 'Other', 'Quality']
df_qual_vs_eth.head()
Out[35]:
White African American Hispanic Other Quality
State
AK 74.33 2.63 2.61 20.44 5.214997
AL 77.96 20.02 0.62 1.39 4.071006
AR 86.29 10.74 1.20 1.77 3.465645
AZ 82.83 2.39 7.96 6.82 5.888230
CA 64.20 5.42 16.73 13.64 5.708042
In [36]:
# Cost to quality ratio vs. Percent Hispanic, white, and African american
scatter_matrix(df_qual_vs_eth, figsize=(8,6))
plt.suptitle('2016 Pneumonia Care Quality vs. Ethnicity')
plt.show()
In [37]:
corrs = np.corrcoef(df_qual_vs_eth.values.T)
fig, ax = vis_corr_mat(corrs, annot=True)
fig.set_size_inches(3, 3)
plt.gca().set_xticklabels(['White', 'African American', 'Hispanic', 'Other', 'Quality'], rotation=45)
plt.gca().set_yticklabels(['White', 'African American', 'Hispanic', 'Other', 'Quality'], rotation=45)
plt.title('Quality vs. Ethnicity')
plt.show()

4 Factors Impacting Medicare Expenditure Over Time

In [132]:
df_state_all = pd.read_csv('./data/State_all.csv')
df_state_all.head()
Out[132]:
State Year Beneficiaries with Part A and Part B FFS Beneficiaries MA Beneficiaries MA Participation Rate Average Age Percent Female Percent Male Percent Non-Hispanic White ... PQI11 Bacterial Pneumonia Admission Rate (age < 65) PQI11 Bacterial Pneumonia Admission Rate (age 65-74) PQI11 Bacterial Pneumonia Admission Rate (age 75+) PQI12 UTI Admission Rate (age < 65) PQI12 UTI Admission Rate (age 65-74) PQI12 UTI Admission Rate (age 75+) PQI15 Asthma in Younger Adults Admission Rate (age < 40) PQI16 Lower Extremity Amputation Admission Rate (age < 65) PQI16 Lower Extremity Amputation Admission Rate (age 65-74) PQI16 Lower Extremity Amputation Admission Rate (age 75+)
0 AK 2007 53634.0 53203.0 431.0 0.80 70.0 50.93 49.07 74.65 ... 853 858 1913 380.0 164 886 NaN 123.0 NaN NaN
1 AL 2007 773340.0 645421.0 127919.0 16.54 70.0 56.60 43.40 80.45 ... 1045 1016 2354 484.0 450 1491 524.0 138.0 71.0 80.0
2 AR 2007 490161.0 430612.0 59549.0 12.15 70.0 55.68 44.32 88.06 ... 1188 1153 2919 409.0 376 1477 391.0 99.0 55.0 69.0
3 AZ 2007 807720.0 496814.0 310906.0 38.49 72.0 52.88 47.12 85.32 ... 755 582 1552 369.0 230 864 306.0 133.0 25.0 33.0
4 CA 2007 4055674.0 2520032.0 1535642.0 37.86 71.0 54.53 45.47 66.02 ... 747 610 1894 385.0 279 1128 293.0 106.0 45.0 46.0

5 rows × 242 columns

Question: Which factors are important for contributing high Medicare expenditure?

In [133]:
# Hand picking features after understanding the importance of each column!
feature_cols = [[1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15], # basic demographics
                [28, 29, 30, 31, 32, 33], # IP: 32/33 for utility rate (stays vs. days covered per 1000) 
                [40, 41, 42, 43, 44, 45], # PAC: LTCH: 44/45 for utility rate
                [52, 53, 54, 55, 56, 57], # PAC: IRF: 56/57 for utility rate
                [64, 65, 66, 67, 68, 69], # PAC: SNF: 68/69 for utility rate
                [76, 77, 78, 79, 80, 81], # PAC: HH: 80/81 for utility rate
                [88, 89, 90, 91, 92, 93], # Hospice: 92/93 for utility rate
                [100, 101, 102, 103, 104],# OP: 104: visits
                [111, 112, 113, 114, 115],# FQHC/RHC, 115: visits
                [122, 123, 124, 125, 126],# Outpatient dialysis, 126: events
                [133, 134, 135, 136, 137],# ASC: 137: events
                [144, 145, 146, 147, 148],# EM: 148: events
                [155, 156, 157, 158, 159],# Procedure: 159: events
                [166, 167, 168, 169, 170],# Imaging: 170: events
                [177, 178, 179, 180, 181],# DME: 181: events
                [188, 189, 190, 191, 192],# Tests: 192: events
                [199, 200, 201, 202, 203],# Ambulence: 203: events
                [210, 211, 212, 213], # Part B drug costs
                list(range(214, 243)),# quality of care metrics
                list(range(243, 249)) # utility rate to be calculated
               ]
targets = 'Standardized Risk-Adjusted Per Capita Costs'
In [134]:
# Add 6 columns that calculte the utility rate of 6 services
df_state_all['IP Utility Rate'] = df_state_all.iloc[:, 32-1] / df_state_all.iloc[:, 33-1]
df_state_all['PAC: LTCH Utility Rate'] =  df_state_all.iloc[:, 44-1] / df_state_all.iloc[:, 45-1]
df_state_all['PAC: IRF Utility Rate'] =  df_state_all.iloc[:, 56-1] / df_state_all.iloc[:, 57-1]
df_state_all['PAC: SNF Utility Rate'] =  df_state_all.iloc[:, 68-1] / df_state_all.iloc[:, 69-1]
df_state_all['PAC: HH Utility Rate'] =  df_state_all.iloc[:, 80-1] / df_state_all.iloc[:, 81-1]
df_state_all['Hospice Utility Rate'] =  df_state_all.iloc[:, 92-1] / df_state_all.iloc[:, 93-1]
In [135]:
index = np.array([item for sublist in feature_cols for item in sublist])-1
features = df_state_all.columns[index].values
len(features)
Out[135]:
139
In [136]:
def extract_important_features(feature_importances, tol=1E-6):
    # filter out features that has minimal importance
    important_feature_ind = np.where(feature_importances > 1E-6)[0]
    important_features = feature_importances[important_feature_ind]
    sorted_important_feature_ind = np.array([x for _,x in sorted(
                    zip(important_features,important_feature_ind))])[::-1]
    sorted_important_features = feature_importances[sorted_important_feature_ind]
    return sorted_important_features, sorted_important_feature_ind
In [137]:
# Use XGBoost to determine the contributing factors of per capita cost over the years
import xgboost as xgb
from xgboost import XGBRegressor, plot_importance
xgb_rg = XGBRegressor()
X = df_state_all[features].drop(columns=['State']).values
y = df_state_all[targets].values

xgb_rg.fit(X, y)
important_features, cols_index = extract_important_features(xgb_rg.feature_importances_)
cols_index = cols_index+1 # since we dropped state column
In [138]:
# Save for other analyses
np.savez('./data/xgboost_state_feature_importance.npz', feature_cols=feature_cols,
         features=features, important_features=important_features, cols_index=cols_index)
In [140]:
# Features ranked by its importance
features[cols_index]
Out[140]:
array(['OP Per User Standardized Costs',
       'IP Per Capita Standardized Costs', 'Year',
       'Percent Eligible for Medicaid',
       'PAC: LTCH Per Capita Standardized Costs',
       'Beneficiaries with Part A and Part B',
       'OP Per Capita Standardized Costs',
       'PAC: HH Per Capita Standardized Costs',
       'IP Per User Standardized Costs',
       'Procedures Per Capita Standardized Costs',
       'PAC: SNF Per User Standardized Costs', 'PAC: SNF Utility Rate',
       '% of Beneficiaries Using E&M',
       'PAC: LTCH Per User Standardized Costs',
       'Part B Drugs Per Capita Standardized Costs',
       'FQHC/RHC Per User Standardized Costs',
       'PAC: SNF Per Capita Standardized Costs',
       'PQI10 Dehydration Admission Rate (age < 65)',
       'Procedures Per User Standardized Costs',
       'ASC Per Capita Standardized Costs',
       'Outpatient Dialysis Facility Per User Standardized Costs',
       '% of Beneficiaries Using Hospice',
       'PQI11 Bacterial Pneumonia Admission Rate (age < 65)',
       'OP Visits Per 1000 Beneficiaries',
       'PAC: IRF Per User Standardized Costs',
       'PAC: IRF Per Capita Standardized Costs', 'Percent Other/Unknown',
       'Percent Non-Hispanic White', 'MA Beneficiaries',
       'PQI12 UTI Admission Rate (age < 65)',
       'PQI07 Hypertension Admission Rate (age < 65)',
       '% of Beneficiaries Using Ambulance',
       'Hospice Per Capita Standardized Costs',
       'PAC: LTCH Covered Days Per 1000 Beneficiaries',
       'PQI10 Dehydration Admission Rate (age 65-74)',
       'Hospital Readmission Rate',
       'Part B Drugs Per User Standardized Costs',
       'ASC Events Per 1000 Beneficiaries',
       'FQHC/RHC Visits Per 1000 Beneficiaries',
       'Hospice Per User Standardized Costs',
       'PAC: HH Episodes Per 1000 Beneficiaries',
       '% of Beneficiaries Using PAC: LTCH',
       '# PAC: LTCH Users (with a covered stay)', 'Percent Female',
       'PQI16 Lower Extremity Amputation Admission Rate (age 65-74)',
       'PQI10 Dehydration Admission Rate (age 75+)',
       'PQI07 Hypertension Admission Rate (age 75+)',
       'PQI07 Hypertension Admission Rate (age 65-74)',
       '# FQHC/RHC Users', 'PAC: HH Visits Per 1000 Beneficiaries',
       '% of Beneficiaries Using PAC: HH',
       'PAC: HH Per User Standardized Costs',
       'PAC: IRF Covered Days Per 1000 Beneficiaries',
       '% of Beneficiaries Using IP', 'FFS Beneficiaries',
       'PQI08 CHF Admission Rate (age 75+)',
       'PQI08 CHF Admission Rate (age < 65)',
       '% of Beneficiaries Using Part B Drugs',
       'Ambulance Events Per 1000 Beneficiaries',
       'Ambulance Per User Standardized Costs',
       'Ambulance Per Capita Standardized Costs',
       '% of Beneficiaries Using DME',
       'Imaging Events Per 1000 Beneficiaries',
       'Procedure Events Per 1000 Beneficiaries',
       '% of Beneficiaries Using Procedures',
       'E&M Events Per 1000 Beneficiaries',
       '% of Beneficiaries Using ASC',
       'Outpatient Dialysis Facility Per Capita Standardized Costs',
       '% of Beneficiaries Using FQHC/RHC', '% of Beneficiaries Using OP',
       'Hospice Covered Days Per 1000 Beneficiaries',
       'Hospice Covered Stays Per 1000 Beneficiaries',
       'IP Users (with a covered stay)', 'Percent African American',
       'MA Participation Rate', 'Hospice Utility Rate',
       'PAC: LTCH Utility Rate', 'IP Utility Rate',
       'PQI16 Lower Extremity Amputation Admission Rate (age < 65)',
       'PQI15 Asthma in Younger Adults Admission Rate (age < 40)',
       'PQI12 UTI Admission Rate (age 65-74)',
       'PQI11 Bacterial Pneumonia Admission Rate (age 65-74)',
       'PQI05 COPD or Asthma in Older Adults Admission Rate (age 75+)',
       'PQI05 COPD or Asthma in Older Adults Admission Rate (age 40-64)',
       'Emergency Department Visits per 1000 Beneficiaries',
       'Tests Per Capita Standardized Costs', '# Procedure Users',
       'E&M Per Capita Standardized Costs',
       'ASC Per User Standardized Costs',
       'IP Covered Stays Per 1000 Beneficiaries', 'Average HCC Score',
       'Percent Hispanic', 'PAC: IRF Utility Rate',
       'PQI12 UTI Admission Rate (age 75+)',
       'PQI05 COPD or Asthma in Older Adults Admission Rate (age 65-74)',
       'PQI03 Diabetes LT Complication Admission Rate (age < 65)',
       '% of Beneficiaries Using Tests', '# Test Users', '# DME Users',
       'DME Per Capita Standardized Costs',
       'Imaging Per Capita Standardized Costs',
       'Outpatient Dialysis Facility Events Per 1000 Beneficiaries',
       '% of Beneficiaries Using Outpatient Dialysis Facility',
       'FQHC/RHC Per Capita Standardized Costs', '# OP Users',
       'PAC: SNF Covered Days Per 1000 Beneficiaries',
       'PAC: IRF Covered Stays Per 1000 Beneficiaries',
       '% of Beneficiaries Using PAC: IRF',
       'PAC: LTCH Covered Stays Per 1000 Beneficiaries'], dtype=object)
In [141]:
cols_index
Out[141]:
array([ 51,  14,   1,  12,  20,   2,  50,  38,  15,  75,  33, 136,  73,
        21, 100,  56,  32, 120,  76,  65,  61,  47, 123,  54,  27,  26,
        11,   8,   4, 126, 114,  98,  44,  25, 121, 106, 101,  69,  59,
        45,  42,  23,  22,   7, 131, 122, 116, 115,  57,  43,  41,  39,
        31,  17,   3, 119, 117, 103,  99,  96,  95,  88,  84,  79,  78,
        74,  68,  60,  58,  53,  49,  48,  16,   9,   5, 138, 134, 133,
       130, 129, 127, 124, 113, 111, 107,  90,  77,  70,  66,  18,  13,
        10, 135, 128, 112, 108,  93,  92,  87,  85,  80,  64,  63,  55,
        52,  37,  30,  29,  24], dtype=int64)
In [142]:
num_features = 10
ax = plot_importance(xgb_rg, max_num_features=num_features)
_ =  ax.set_yticklabels(features[cols_index][:num_features][::-1])
In [145]:
fscores_dict = xgb_rg.get_booster().get_fscore()
# Get features with F-scores that's only greater than or equal to 4 (p<0.05)
significant_cols = []
for k, v in fscores_dict.items():
    if v >= 4:
        current_col = int(k[1:])
        significant_cols.append(current_col)
significant_cols = np.array(significant_cols)
print(significant_cols) # 55 important features
print(len(significant_cols))
[ 74  10  32  50   7  19  14  37  49 113  26  46  31  43  41  99  72   0
  11  22 119  25  38  40   6  20 122  64  55  13   3 125 120  97  60  42
 115  44 135  30 114 100 130 121  56   1  21  24  68  75   2  53  58 105
  16]
55
In [146]:
# Plot these 55 features
num_features = 55
ax = plot_importance(xgb_rg, max_num_features=num_features)
_ =  ax.set_yticklabels(features[cols_index][:num_features][::-1])
ax.figure.set_size_inches(5,15)

5 Forecast the Medicare Cost in 2017 and 2018

In [168]:
# Preparing data
gp = df_state_all.groupby(by=['Year'], as_index=False)
df_nation = gp.sum()
df_nation['Year'] = pd.to_datetime(df_nation['Year'], format='%Y')
df_nation.set_index(['Year'], inplace=True)
In [378]:
df_nation[targets]
Out[378]:
Year
2007-01-01    408185.46
2008-01-01    435001.87
2009-01-01    449867.98
2010-01-01    461558.53
2011-01-01    459797.58
2012-01-01    476134.91
2013-01-01    479433.30
2014-01-01    475021.95
2015-01-01    493828.59
2016-01-01    500015.52
Name: Standardized Risk-Adjusted Per Capita Costs, dtype: float64
In [376]:
_ = plt.plot(df_nation.index, df_nation[targets]/1000)
_ = plt.ylabel('Per Capita Cost (thousand dollars)')
_ = plt.xlabel('Year')
_ = plt.title('Current data')

Forecast directly on the cost of the national level total cost

In [173]:
from statsmodels.tsa.statespace.structural import UnobservedComponents

indpro_mod = UnobservedComponents(df_nation[targets],
                                  level=True,
                                  trend=True,
                                  stochastic_level=True,
                                  stochastic_trend=True)
indpro_res = indpro_mod.fit(method='powell', disp=False)
In [217]:
#fig, ax = plt.subplots(figsize=(10,5))

cost_array = df_nation[targets].values
fres = indpro_res.get_forecast('2019-01-01')
cost_array = np.append(cost_array, fres.predicted_mean.values)
fres_ci = fres.conf_int()
lower_fill = fres_ci['lower Standardized Risk-Adjusted Per Capita Costs'].values
upper_fill = fres_ci['upper Standardized Risk-Adjusted Per Capita Costs'].values
In [231]:
fig, ax = plt.subplots(figsize=(10,5))
year_array = np.arange(2007, 2020,1)
ax.plot(year_array[:11], cost_array[:11]/1000)
ax.plot(year_array[10:], cost_array[10:]/1000)
_ = ax.fill_between([2017, 2018, 2019], lower_fill/1000, upper_fill/1000, alpha=0.1, color='#ff7f0e')
_ = plt.ylabel('Per Capita Cost (thousand dollars)')
plt.gca().set_xticks(year_array)
_ = plt.xlabel('Year')
plt.gcf().set_size_inches(7, 3)
_ = ax.text(2018, 415, 'Forecast', va='center', ha='center')
Out[231]:
Text(2018,415,'Forecast')

Using Kalman Filter and XGBoost

Forecast the 137 features by each state and predict with the XGBoost model

In [339]:
def forecast_feature_by_state(df_state_all, features, state='AK', forecast_end=2019, concat=False):
    df_state_sub = df_state_all.loc[df_state_all['State'] == state, features]
    forecast_start = df_state_sub['Year'].values[-1]+1
    df_state_sub['Year'] = pd.to_datetime(df_state_sub['Year'], format='%Y')
    df_state_sub.set_index('Year', inplace=True)
    df_state_sub.fillna(method='ffill').fillna(method='bfill', inplace=True)
    # iterate over each feature
    forecast_df = pd.DataFrame({}, columns = [df_state_sub.index.name] + df_state_sub.columns.tolist())
    forecast_df.set_index('Year', inplace=True)
    for n, f in enumerate(features):
        if f == 'State' or f == 'Year': continue
        # deal with missing data

        indpro_mod = UnobservedComponents(df_state_sub[f],
                                          level=True,
                                          trend=True,
                                          stochastic_level=True,
                                          stochastic_trend=True)
        try:
            indpro_res = indpro_mod.fit(method='powell', disp=False)
            fres = indpro_res.get_forecast(str(forecast_end)+'-01-01')
            out_array = fres.predicted_mean.values
        except:
            out_array = np.nan * np.ones(forecast_end-forecast_start+1)
        for a, y in zip(out_array, range(forecast_start, forecast_end+1)):
            forecast_df.loc[pd.to_datetime(str(y)+'-01-01'), f] = a

    forecast_df['State'] = state
    if concat:
        forecast_df = df_state_sub.append(forecast_df)

    return forecast_df
In [342]:
# INitialize a dataframe to save all the states forecast data
forecast_all = pd.DataFrame({}, columns=features)
In [344]:
import warnings
warnings.filterwarnings("ignore")
# Iterate over all states
for st in df_state_all['State'].unique().tolist():
    print(st)
    forecast_df = forecast_feature_by_state(df_state_all, features, state=st, forecast_end=2019)
    # Turn Year back to number
    forecast_df = forecast_df.reset_index()
    forecast_df['Year'] = [int(str(y)[:4]) for y in forecast_df['Year'].values]
    forecast_all = forecast_all.append(forecast_df)
In [348]:
forecast_all = forecast_all[features]
forecast_all = forecast_all.sort_values(['Year', 'State'])
forecast_all.to_csv('./data/forecast_state.csv', index=False)
In [349]:
# Apply xgboost model
y_forecast = xgb_rg.predict(forecast_all.drop(columns='State', axis=1).values)
In [370]:
# Summarize
forecast_all['Forecasted Cost'] = y_forecast
gp = forecast_all.groupby(by=['Year'], as_index=False)
df_forecast_nation = gp.sum()
#df_forecast_nation['Year'] = pd.to_datetime(df_nation['Year'], format='%Y')

df_forecast_nation
Out[370]:
Year Forecasted Cost
0 2017 500057.87500
1 2018 501675.37500
2 2019 501400.15625
In [375]:
# Do the plots
fig, ax = plt.subplots(figsize=(10,5))
year_array = np.arange(2007, 2020,1)
cost_array[10:] = df_forecast_nation['Forecasted Cost'].values
ax.plot(year_array[:11], cost_array[:11]/1000)
ax.plot(year_array[10:], cost_array[10:]/1000)
_ = plt.ylabel('Per Capita Cost (thousand dollars)')
plt.gca().set_xticks(year_array)
_ = plt.xlabel('Year')
plt.gcf().set_size_inches(7, 3)
_ = ax.text(2018, 480, 'Forecast', va='center', ha='center', color='#ff7f0e')