### importing relevant libraries and setting configuration
%matplotlib inline
#data exploration libraries
import pandas as pd
import numpy as np

#visualization libraries
import matplotlib.pyplot as plt
import plotly as py
import plotly.figure_factory as ff
import plotly.express as px
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from bubbly.bubbly import bubbleplot
from bokeh.palettes import viridis
colors = viridis(4)
init_notebook_mode(connected=True)
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)


import math
import glob

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 50)


#read data in
df_m = pd.read_csv("../data/Drug_Overdose_Mortality/Underlying Cause of Death-County-2018.csv", 
                   delimiter='\t',
                  dtype={'County Code': str})

#normalize deaths by population
df_m['Norm_Deaths'] = ((df_m['Deaths']/df_m['Population'])*100000).apply(np.ceil)

#generate state code
df_m['State'] = df_m['County'].str[-2:]


df_m.head(5)

  Notes              County County Code  \
0   NaN  Autauga County, AL       01001   
1   NaN  Autauga County, AL       01001   
2   NaN  Baldwin County, AL       01003   
3   NaN  Baldwin County, AL       01003   
4   NaN  Baldwin County, AL       01003   

                          Drug/Alcohol Induced Cause  \
0  Drug poisonings (overdose) Unintentional (X40-...   
1       Drug poisonings (overdose) Suicide (X60-X64)   
2  Drug poisonings (overdose) Unintentional (X40-...   
3       Drug poisonings (overdose) Suicide (X60-X64)   
4  Drug poisonings (overdose) Undetermined (Y10-Y14)   

  Drug/Alcohol Induced Cause Code  Deaths  Population  Crude Rate  \
0                              D1    69.0   1087149.0         6.3   
1                              D2    14.0   1087149.0  Unreliable   
2                              D1   424.0   3758097.0        11.3   
3                              D2    71.0   3758097.0         1.9   
4                              D4    19.0   3758097.0  Unreliable   

   Norm_Deaths State  
0          7.0    AL  
1          2.0    AL  
2         12.0    AL  
3          2.0    AL  
4          1.0    AL


df_m_state = df_m.groupby(['State'])['Norm_Deaths'].agg('mean').reset_index()


fig = px.bar(df_m_state.sort_values(by=['Norm_Deaths']), x='State', y='Norm_Deaths', #color='Norm_Deaths',
            labels={
                     "State": "State Code",
                     "Norm_Deaths": "Number of opioid deaths per 100K population",
                 },
            title="2019 Mean opioid deaths across counties of states per 100K population",
            color_discrete_sequence=colors,
            width=1000, height=600
            )


fig.show()


## sum all cases of opioid deaths and normalize
df_sca = df_m.groupby(['County', 'County Code', 'Population'])['Deaths'].agg('sum').reset_index()
df_sca['State'] = df_sca['County'].str[-2:]
df_sca['Norm_Deaths'] = ((df_sca['Deaths']/df_sca['Population'])*100000).apply(np.ceil)

fig = px.scatter(df_sca, y="Norm_Deaths", x="Population", color="State",
                 log_x=True,
                 hover_data=['Population', 'Deaths', 'County'],
                 labels={
                     'Population': 'Population (Log Scale)',
                     "Norm_Deaths": "Number of opioid deaths per 100K population",
                 },
                 title="2019 Opioid deaths across counties of states per 100K population",
                 color_discrete_sequence=colors,
                )


fig.show()


fig = px.scatter(df_sca.loc[(df_sca['State']=='NC') | (df_sca['State']=='KY')], 
                 y="Norm_Deaths", x="Population", color="State",
                 log_x=True,
                 hover_data=['Population', 'Deaths', 'County'],
                 labels={
                     'Population': 'Population (Log Scale)',
                     "Norm_Deaths": "Number of opioid deaths per 100K population",
                 },
                 title="2019 Opioid deaths across counties in NC and KY per 100K population ",
                 color_discrete_map={
                     'NC': '#5283AD',
                     'KY': '#FDAB5A'
                 },
                )


fig.show()


pie = df_m.groupby(['Drug/Alcohol Induced Cause'])['Norm_Deaths'].agg('mean').reset_index()
pie.sort_values('Drug/Alcohol Induced Cause')

fig = px.pie(pie, 
             values=pie['Norm_Deaths'], 
             names=pie['Drug/Alcohol Induced Cause'],
             title='Different types of drug induced mortalities',
             color_discrete_sequence=colors)


fig.show()


s_bar = df_m.loc[(df_m['State']=='NC')|(df_m['State']=='KY')].groupby(['State','Drug/Alcohol Induced Cause'])['Norm_Deaths'].agg('mean').reset_index()


s_bar

  State                         Drug/Alcohol Induced Cause  Norm_Deaths
0    KY                      All other drug-induced causes     1.833333
1    KY       Drug poisonings (overdose) Suicide (X60-X64)     2.107143
2    KY  Drug poisonings (overdose) Undetermined (Y10-Y14)     2.733333
3    KY  Drug poisonings (overdose) Unintentional (X40-...    18.612069
4    NC                      All other drug-induced causes     1.272727
5    NC       Drug poisonings (overdose) Suicide (X60-X64)     2.492958
6    NC  Drug poisonings (overdose) Undetermined (Y10-Y14)     1.000000
7    NC  Drug poisonings (overdose) Unintentional (X40-...    12.575758


fig = px.bar(s_bar, x="State", y="Norm_Deaths", color="Drug/Alcohol Induced Cause", 
             title="Types of drug induced mortalities - NC vs KY",
             color_discrete_map={
             'Drug poisonings (overdose) Unintentional (X40-X44)': '#440154',
             'All other drug-induced causes': '#FDE724',
             'Drug poisonings (overdose) Undetermined (Y10-Y14)': '#30678D',
             'Drug poisonings (overdose) Suicide (X60-X64)': '#35B778'
        })


fig.show()


df_d_1 = pd.read_csv('../data/Drug_Overdose_Mortality/Death_Count_1999_2010.txt', 
                     delimiter='\t',dtype={'County Code': str})
df_d_2 = pd.read_csv('../data/Drug_Overdose_Mortality/Death_Count_2011-2019.txt',
                     delimiter='\t',dtype={'County Code': str})
## Concatenating datasets
df_d = pd.concat([df_d_1,df_d_2])
## Calculating normalized deaths and state code
df_d['Norm_Deaths'] = ((df_d['Deaths']/df_d['Population'])*100000).apply(np.ceil)
df_d['State'] = df_d['County'].str[-2:]

df_time = df_d.groupby(['State', 'Year Code'])['Norm_Deaths'].agg('mean').reset_index()
df_time = df_time.sort_values(by=['State', 'Year Code'])

df_time.head(2)

  State  Year Code  Norm_Deaths
0    AK       1999         11.0
1    AK       2000         10.0


fig = px.line(df_time.loc[df_time['State'].isin(list(df_time['State'].unique())[:48])], 
             x="Year Code", y='Norm_Deaths', color="State",
             labels={
                     "State": "State Code",
                     "Year Code": "Years",
                     "Norm_Deaths": "Number of opioid deaths per 100K population",
                 },
            title="1999-2019 Mean opioid related deaths per 100K population for States",
            color_discrete_sequence=colors
            )


fig.show()


fig = px.line(df_time.loc[df_time['State'].isin(list(df_time['State'].unique())[:48])], 
              x="Year Code", y='Norm_Deaths', color="State",
              labels={
                     "State": "State Code",
                     "Year Code": "Years",
                     "Norm_Deaths": "Number of opioid deaths per 100K population",
                 },
              title="1999-2019 Mean opioid related deaths per 100K population for NC and KY",
              color_discrete_sequence=colors,
             )

# set color of all traces to lightgrey
fig.update_traces({"line":{"color":"lightgrey"}}, 
                 opacity=.5)

# color KY line to red
fig.update_traces(patch={"line":{"color":"#FDAB5A", "width":3}}, 
                  selector={"legendgroup":"KY"},
                 opacity=1)

# color NC line to red
fig.update_traces(patch={"line":{"color":"#5283AD", "width":3}}, 
                  selector={"legendgroup":"NC"},
                 opacity=1)

# remove the legend, y-axis and add a title
# fig.update_layout(title="Normalized Mortality Deaths",
#                 showlegend=False,
#                 yaxis={"visible":False})


### rendering the figure for presentation
fig.show()


# prepare data

## sum all cases of opioid deaths
tmp = df_m.groupby(['County', 'County Code', 'Population'])['Deaths'].agg('sum').reset_index()
tmp['State'] = tmp['County'].str[-2:]
tmp['Norm_Deaths'] = ((tmp['Deaths']/tmp['Population'])*100000).apply(np.ceil)

## get states data
nc = tmp.loc[(tmp['State']=='NC')]['Norm_Deaths']
ky = tmp.loc[(tmp['State']=='KY')]['Norm_Deaths']

trace1 = go.Histogram(
    x=nc,
    opacity=0.75,
    name = "NC",
    nbinsx=40,
    marker=dict(color='#5283AD', line=dict(width=1, color='grey')))
trace2 = go.Histogram(
    x=ky,
    opacity=0.75,
    name = "KY",
    nbinsx=40,
    marker=dict(color='#FDAB5A', line=dict(width=1, color='grey')))

data = [trace1, trace2]
layout = go.Layout(barmode='overlay',
                   title='Distribution of per capita opioid deaths for different counties in NC and KY',
                   xaxis=dict(title='Number of opioid deaths per 100K population'),
                   yaxis=dict( title='Number of counties'),
)

fig = go.Figure(data=data, layout=layout)
fig.add_vline(x=nc.mean(), line_width=3,
              annotation_text="NC Mean = {:,.0f}".format(nc.mean()),
              annotation_position="top left",
              line_dash="dash", line_color="#5283AD")
fig.add_vline(x=ky.mean(), line_width=3,
              annotation_text="KY Mean = {:,.0f}".format(ky.mean()),
              annotation_position="top right",
              line_dash="dash", line_color="#FDAB5A")


### rendering the figure for presentation
iplot(fig)


fig = px.box(tmp.loc[(tmp['State']=='NC') | (tmp['State']=='KY')], 
             x="State", y="Norm_Deaths", points="all", color="State",hover_data=["County"],
             labels={
                     "State": "State Code",
                     "Year Code": "Years",
                     "Norm_Deaths": "Number of opioid deaths per 100K population",
                 },
              title="Box plot of NC and KY",
             color_discrete_sequence=["#FDAB5A","#5283AD"])


fig.show()


### Read in the excel files and create a dataframe out of them
f = glob.glob('../data/Employment/raw/xls/*.xlsx')
dfs = []

for each in f:
    df_temp = pd.read_excel(each, engine='openpyxl', skiprows=6, skipfooter=3, header=None, converters={1:str,2:str})
    dfs.append(df_temp)

df_emp = pd.concat(dfs)
df_emp = df_emp.drop(df_emp.columns[[5]], axis=1) ### drop a null column

df_emp.columns = ['LAUS', 'State_FIPS','County_FIPS','County','Year', 'Labor Force',
                  'Employed', 'Unemployed', 'Unemployment Rate'
                 ] ### assign column names
df_emp['FIPS'] = df_emp['State_FIPS'] + df_emp['County_FIPS'] ### create the FIPS code for counties

mapping = {'N.A.': 0} #### fill 0 for missing unemployment rate
df_emp = df_emp.applymap(lambda s: mapping.get(s) if s in mapping else s)

df_e = pd.merge(df_d, df_emp,  how='inner', left_on=['County Code','Year Code'], 
              right_on = ['FIPS','Year']) ### merge with mortality data

df_e['Unemployment'] = (df_e['Unemployed'] / df_e['Population'])*10000000 ### normalize unemployment by population


fig = px.scatter(df_e.loc[((df_e['State']=='NC') | (df_e['State']=='KY')) & (df_e['Year_x']==2019)], 
                x="Population", y="Norm_Deaths",
                size="Unemployment Rate", color="State",
                hover_name="County_x", log_x=True, size_max=60,
                labels={
                     "State": "State Code",
                     "Year Code": "Years",
                     "Population": "Population (Log Normalized)",
                     "Norm_Deaths": "Number of opioid deaths per 100K population",
                 },
                title="Bubble plot of NC and KY - Size of the bubble shows unemployment rate",
                color_discrete_sequence=["#FDAB5A","#5283AD"]
                )


fig.show()


df_h = df_e.loc[(df_e['Year_x']==2019)][['Norm_Deaths', 'Deaths', 'Population', 'Labor Force', 'Employed',
                   'Unemployed', 'Unemployment Rate', 'Unemployment']] ### Select columns and year
corr = df_h.corr() ### find correlation values

fig = px.imshow(corr,
                labels=dict(x="Variables", y="Variables Correlation", color="Correlation"),
                title="Variable Correlation HeatMap",
                color_continuous_scale=px.colors.sequential.Viridis
               )
fig.update_xaxes(side="top")


fig.show()


from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

df_map = df_m.groupby(['County Code','County','Population'])['Norm_Deaths'].agg('mean').reset_index()
    
fig = px.choropleth(df_map, geojson=counties, locations=df_map['County Code'], 
                           color='Norm_Deaths',
                           color_continuous_scale="Viridis",
                           range_color=(0, 12),
                           scope="usa",
                           labels={'Norm_Deaths':'Deaths / 100K population'},
                           hover_name="County", hover_data=["Norm_Deaths", "Population"]
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})


### rendering figure for presentation
fig.show()


df_dyna = df_e.loc[((df_e['State']=='NC') | (df_e['State']=='KY'))]
df_dyna.sort_values('Year_x', inplace=True)

fig = px.scatter(df_dyna, 
           x="Population", y="Norm_Deaths", animation_frame="Year_x", 
           animation_group="County_x",
           size="Unemployed", color="State", hover_name="State",
           log_x=True, size_max=60, range_x=[10000,10000000], range_y=[0,150],
           labels={
                     "State": "State Code",
                     "Year Code": "Years",
                     "Population": "Population (Log Normalized)",
                     "Norm_Deaths": "Number of opioid deaths per 100K population",
                     "Year_x": "Year",
                 },
           title="Temporal bubble plot of NC and KY - Size of the bubble shows unemployment rate",
           color_discrete_sequence=["#5283AD", "#FDAB5A"])


fig.show()


### read data
df_g = pd.read_excel("../data/County_Dataset/Guilford Overdose/narcan administration 2014-March 2019.xlsx", engine='openpyxl')

### Get date time field and extract the year information
df_g['Date'] = df_g['Incident Date'].dt.normalize()
df_g['Year'] = pd.DatetimeIndex(df_g['Date']).year


px.set_mapbox_access_token(open(".mapbox").read())
fig = px.scatter_mapbox(df_g, lat='Scene GPS Location Latitude', 
                        lon='Scene GPS Location Longitude', 
                        center=dict(lat=36.065891, lon=-79.789995), 
                        animation_frame="Year",
                        zoom=9,
                        width=1000, height=700,
                        mapbox_style="stamen-terrain"
                    )


fig.show()


fig = px.density_mapbox(df_g, lat='Scene GPS Location Latitude', 
                        lon='Scene GPS Location Longitude', 
                        radius=10,
                        center=dict(lat=36.065891, lon=-79.789995), zoom=10,
                        animation_frame="Year",
                        width=1000, height=700,
                        mapbox_style="stamen-terrain")


fig.show()

Lesson #3 - Visualization and Exploratory Analysis¶

Opioid overdose analytics¶

Software used (open source):¶

Datasets¶

1. Drug Overdose Dataset¶

2. Local Area Unemployment Statistics¶

3. Guilford County Emergency Medical Services (EMS) call data¶

Exploration of Drug Overdose Dataset¶

1. Standard Visualization¶

2. Advanced Visualizations¶

3. Dynamic Visualizations¶