The purpose of this project is to delve into the data science pipeline, all the way from Data Collection to Insight & Policy Decision. To accomplish this, we will be taking a look at COVID-19 data, specifically case, death, and vaccination data.
Vaccines have long been a subject of controversy, and the COVID-19 vaccine brought the issue to even greater prominence. Many people in the United States have questioned the effectiveness of the vaccine and suggested that it does more harm than good. In fact, the politics of vaccination have become an even stronger dividing line than demographics: surveys of the population have found that more than 90% of Democrats are vaccinated, compared to just 58% of Republicans, and 23% of Republicans say that they will "definitely not" be getting the vaccine.
Now that we have lived with COVID-19 for a few years, we have aggregated plenty of data here in the United States, and the goal of this tutorial is to determine if the data we have shows a connection between the number of COVID-19 vaccinations and the number of COVID-19 cases and deaths.
My hypothesis is as follows:
The increase in COVID-19 vaccinations over time will result in a decrease in the rate at which new cases of COVID-19 are recorded.
During this stage of the data science pipeline, we need to find data that will aid us in our testing and analysis. We will be using the pandas library to convert this online data into something that is usable for all of our data-science-related tasks.
# Data science libraries for python
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Extra functionality for jupyter notebook formatting
from IPython.display import display, HTML
There are two main sources of data and one supplementary source of data that we will need.
The first main source is a collection of COVID-19 case and death data from the CDC. This will supply us with all of the information we need about cases and deaths. More specifically, it contains aggregated totals for cases and deaths, the day on which each total was recorded, and the U.S. state it came from. We will convert this csv file into a pandas DataFrame named df_cd.
The second main source is a collection of COVID-19 vaccine data from USAFACTS. This contains everything we need to know about COVID-19 vaccine data over the last few years. This includes the number of partially vaccinated people, the number of fully vaccinated people, the date that each data entry was aggregated, the U.S. state that the data entry originates from, and population data for that state. We will convert this csv file into a pandas DataFrame named df_v.
The supplementary source of data is a list of U.S. states (and the District of Columbia) and their respective abbreviations. This data will come in handy during the data cleaning process: the U.S. state information in the two main tables does not match, and converting everything to abbreviations will make handling the data significantly more streamlined. We will convert this csv file into a pandas DataFrame named df_states.
df_cd = pd.read_csv('Weekly_United_States_COVID-19_Cases_and_Deaths_by_State.csv')
df_v = pd.read_csv('COVID19_CDC_Vaccination_CSV_Download.csv')
df_states = pd.read_csv('https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv')
The following code generates a 5-row preview of each data table that we just collected. As you can see, there is a heap of helpful data at our fingertips, but there is also a large amount of inconsistency between the tables, along with plenty of data we don't need. We will resolve this in the next stage of the data science pipeline: Data Processing.
# display() simply allows back-to-back previews of DataFrames, all from one jupyter codeblock
display(df_cd.head())
display(df_v.head())
display(df_states.head())
| | date_updated | state | start_date | end_date | tot_cases | new_cases | tot_deaths | new_deaths | new_historic_cases | new_historic_deaths |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01/23/2020 | AK | 01/16/2020 | 01/22/2020 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 01/30/2020 | AK | 01/23/2020 | 01/29/2020 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 02/06/2020 | AK | 01/30/2020 | 02/05/2020 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 02/13/2020 | AK | 02/06/2020 | 02/12/2020 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 02/20/2020 | AK | 02/13/2020 | 02/19/2020 | 0 | 0 | 0 | 0 | 0 | 0 |
| | DATE | GEOGRAPHY_LEVEL | GEOGRAPHY_NAME | DEMOGRAPHIC_GROUP | DEMOGRAPHIC_CATEGORY | PARTIALLY_OR_FULLY_VACCINATED_PERSONS | FULLY_VACCINATED_PERSONS | POPULATION | PARTIALLY_OF_FULLY_VACCINATED_PERCENT | FULLY_VACCINATED_PERCENT | TOTAL_DOSES_ADMINISTERED | TOTAL_DOSES_DISTRIBUTED | SOURCEINFO | USAFACTS_INGESTION_DATE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-04-26 | Nation | United States | Total | Total | 270047396.0 | 230533196.0 | 332008832.0 | 0.813374 | 0.694359 | 675442636.0 | 979617855.0 | CDC State and National Vaccination Data | 2023-05-01T01:25:24.415Z |
| 1 | 2023-04-26 | State | California | Total | Total | 33596928.0 | 29580494.0 | 39512223.0 | 0.850292 | 0.748642 | 88296470.0 | 120437235.0 | CDC State and National Vaccination Data | 2023-05-01T01:25:24.415Z |
| 2 | 2023-04-26 | State | Arkansas | Total | Total | 2114069.0 | 1719511.0 | 3017804.0 | 0.700532 | 0.569789 | 4867436.0 | 8343140.0 | CDC State and National Vaccination Data | 2023-05-01T01:25:24.415Z |
| 3 | 2023-04-26 | State | Arizona | Total | Total | 5699178.0 | 4819349.0 | 7278717.0 | 0.782992 | 0.662115 | 14616367.0 | 19864230.0 | CDC State and National Vaccination Data | 2023-05-01T01:25:24.415Z |
| 4 | 2023-04-26 | State | Alabama | Total | Total | 3192073.0 | 2610908.0 | 4903185.0 | 0.651020 | 0.532492 | 7011237.0 | 12305740.0 | CDC State and National Vaccination Data | 2023-05-01T01:25:24.415Z |
| | State | Abbreviation |
|---|---|---|
| 0 | Alabama | AL |
| 1 | Alaska | AK |
| 2 | Arizona | AZ |
| 3 | Arkansas | AR |
| 4 | California | CA |
It is now time to clean up the data and prepare it for analysis. This will mainly involve identifying issues with the data and finding missing data, then deciding how to handle it.
First things first: the vaccination data has a lot of NULL data that we do not want. We can easily get rid of it by dropping any rows that contain NaN values.
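Before dropping anything, it can be helpful to see how much data is actually affected. This quick check is my addition and is not required for the rest of the tutorial:

# Optional: count how many rows of the vaccination data contain at least one NaN
print(df_v.isna().any(axis=1).sum(), 'of', len(df_v), 'rows contain a NaN value')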
df_v = df_v.dropna()
With the NaN rows removed, we'll start by examining the list of states that each dataset includes. To do so, I will print the unique values of the DataFrame column that contains the state.
print(df_cd.state.unique())
print()
print(df_v.GEOGRAPHY_NAME.unique())
['AK' 'AL' 'AR' 'AS' 'AZ' 'CA' 'CO' 'CT' 'DC' 'DE' 'FL' 'FSM' 'GA' 'GU' 'HI' 'IA' 'ID' 'IL' 'IN' 'KS' 'KY' 'LA' 'MA' 'MD' 'ME' 'MI' 'MN' 'MO' 'MP' 'MS' 'MT' 'NC' 'ND' 'NE' 'NH' 'NJ' 'NM' 'NV' 'NY' 'NYC' 'OH' 'OK' 'OR' 'PA' 'PR' 'PW' 'RI' 'RMI' 'SC' 'SD' 'TN' 'TX' 'UT' 'VA' 'VI' 'VT' 'WA' 'WI' 'WV' 'WY']

['United States' 'California' 'Arkansas' 'Arizona' 'Alabama' 'Delaware' 'Colorado' 'Alaska' 'Connecticut' 'Georgia' 'Idaho' 'Hawaii' 'Florida' 'Illinois' 'Iowa' 'District of Columbia' 'Indiana' 'Kansas' 'Maine' 'Kentucky' 'Louisiana' 'Michigan' 'Maryland' 'Minnesota' 'Mississippi' 'Massachusetts' 'Missouri' 'Nebraska' 'Montana' 'New Hampshire' 'New York State' 'Nevada' 'New Mexico' 'New Jersey' 'North Dakota' 'North Carolina' 'Oregon' 'Oklahoma' 'Ohio' 'Pennsylvania' 'South Carolina' 'Rhode Island' 'Texas' 'Tennessee' 'Utah' 'South Dakota' 'Vermont' 'Virginia' 'Washington' 'Wyoming' 'Wisconsin' 'West Virginia' 'Northern Mariana Islands' 'Virgin Islands' 'Puerto Rico' 'Guam' 'American Samoa' 'Federated States of Micronesia' 'Marshall Islands' 'Indian Health Svc' 'Republic of Palau']
There are already a couple of issues that jump out at us. First of all, the two datasets are not in the same format: one uses abbreviations, and the other lists the full proper name. For the sake of simplicity and convenience, we will convert the proper names to abbreviations.
Before we can do that, we need to solve the other big issue, which is the presence of categories and territories that are not one of the 50 U.S. states. As it turns out, there is COVID-19 data collected for U.S. territories like Puerto Rico or the Federated States of Micronesia. However, none of the usual states are missing, so we will have more than enough data to work with if we simply focus on the 50 official U.S. states and Washington DC. To do so, we will only include states that have a match in our abbreviation mapping table df_states.
BUT WAIT! One of the states has something strange going on: New York. First of all, in df_v it is represented as "New York State" instead of "New York", as it is in df_states. We will fix this through a simple pandas replacement. In addition, df_cd has both "NY" and "NYC". The dataset's website offers an explanation for this discrepancy: "New York State’s reported case and death counts do not include New York City’s counts as they separately report nationally notifiable conditions to CDC." Since New York and New York City are represented as separate jurisdictions in one dataset, we will need to combine them under just "NY".
# Rename so that it can interact with df_states
df_v = df_v.replace({'GEOGRAPHY_NAME': 'New York State'}, 'New York')
# Make everything classified as NYC just show up as NY. Note: this means that NY has 2 entries for each week instead of 1.
df_cd = df_cd.replace({'state': 'NYC'}, 'NY')
# Exclude everything except data for states in df_states
df_cd = df_cd[df_cd.state.isin(df_states.Abbreviation)]
df_v = df_v[df_v.GEOGRAPHY_NAME.isin(df_states.State)]
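Note that because the former "NY" and "NYC" rows now carry the same label, each week still has two rows for New York. The tutorial keeps them as-is, but if you would rather have a single weekly row per state, a minimal sketch of one way to collapse them (summing only the count columns) would be:

# Optional sketch: collapse the duplicate weekly New York rows by summing their counts
count_cols = ['tot_cases', 'new_cases', 'tot_deaths', 'new_deaths',
              'new_historic_cases', 'new_historic_deaths']
df_cd_single_ny = df_cd.groupby(['end_date', 'state'], as_index=False)[count_cols].sum()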
The vaccination data provides extra entries that split the data by demographics. For the sake of this tutorial, we are not interested in the demographics, so we will remove them.
df_v = df_v[(df_v.DEMOGRAPHIC_GROUP == 'Total') & (df_v.DEMOGRAPHIC_CATEGORY == 'Total')]
Some renaming, reorganizing, and dropping is required in order to clean up the data.
For df_cd, the only date column we need is end_date, since we only care about the aggregated totals reported at the end of each week.
For df_v, we only need each state's population and how many people were partially or fully vaccinated. We will remove everything else.
# Drop columns that are unnecessary for analysis
df_cd = df_cd.drop(['date_updated', 'start_date'], axis=1)
df_v = df_v.drop(['GEOGRAPHY_LEVEL', 'DEMOGRAPHIC_GROUP', 'DEMOGRAPHIC_CATEGORY', 'TOTAL_DOSES_ADMINISTERED', 'TOTAL_DOSES_DISTRIBUTED', 'SOURCEINFO', 'USAFACTS_INGESTION_DATE'], axis=1)
# Rename columns for clarity and consistency
df_cd = df_cd.rename(columns={'end_date': 'date'})
df_cd = df_cd[['date', 'state', 'tot_cases', 'new_cases', 'tot_deaths', 'new_deaths', 'new_historic_cases', 'new_historic_deaths']]
df_v = df_v.rename(columns={'DATE': 'date',
'GEOGRAPHY_NAME': 'state',
'PARTIALLY_OR_FULLY_VACCINATED_PERSONS': 'partial_or_full_vaccination',
'FULLY_VACCINATED_PERSONS': 'full_vaccination',
'POPULATION': 'pop',
'PARTIALLY_OF_FULLY_VACCINATED_PERCENT': 'partial_or_full_vaccination_pct',
'FULLY_VACCINATED_PERCENT': 'full_vaccination_pct'})
Finally, we can remap the full proper state names in the vaccination data to their respective abbreviations outlined in df_states.
# Convert every proper state name to the abbreviation in the same row for each state in df_states
df_v.state = df_v.state.map(df_states.set_index('State').Abbreviation)
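Because Series.map leaves NaN wherever a name has no match in the mapping, a quick sanity check (my addition) confirms that every remaining state name was converted successfully:

# Every state name should have found an abbreviation; unmatched names would be NaN here
assert df_v.state.notna().all()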
It is good practice to convert any dates to datetime objects. This facilitates things like sorting, resampling, and aggregation over periods of time.
df_cd.date = pd.to_datetime(df_cd.date)
df_v.date = pd.to_datetime(df_v.date)
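As a small illustration of why this matters (this example is not used later in the tutorial), a datetime index makes time-based aggregation a one-liner:

# Example only: nationwide monthly totals of new cases, enabled by the datetime column
monthly_cases = df_cd.set_index('date')['new_cases'].resample('M').sum()
print(monthly_cases.head())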
Now that both tables are organized and clean, we can combine them. We will use an inner merge, because we only want to consider days for which we have all the data we need for our analysis.
NOTE: This will remove a lot of data from df_v, because df_v has daily information while df_cd has weekly information, and we are only keeping the days the two have in common. In our case, this is exactly what we want, but great caution should be taken during the merging process to ensure that you do not lose any essential data.
df = pd.merge(df_cd, df_v, 'inner', on=['date', 'state'], copy=True)
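If you want to see exactly what an inner merge would discard, one option (a sketch, not part of the analysis) is to run the same merge as an outer join with indicator=True and count where each row came from:

# Rows marked 'both' survive the inner merge; 'left_only' and 'right_only' rows are dropped
merge_check = pd.merge(df_cd, df_v, 'outer', on=['date', 'state'], indicator=True)
print(merge_check['_merge'].value_counts())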
And now to see what our clean DataFrame looks like...
df.head()
| | date | state | tot_cases | new_cases | tot_deaths | new_deaths | new_historic_cases | new_historic_deaths | partial_or_full_vaccination | full_vaccination | pop | partial_or_full_vaccination_pct | full_vaccination_pct |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-01-20 | AK | 51178 | 1377 | 253 | 27 | 0 | 0 | 56911.0 | 11283.0 | 731545.0 | 0.077796 | 0.015424 |
| 1 | 2021-02-03 | AK | 53305 | 946 | 279 | 18 | 0 | 0 | 98076.0 | 27528.0 | 731545.0 | 0.134067 | 0.037630 |
| 2 | 2021-02-10 | AK | 54294 | 989 | 280 | 1 | 0 | 0 | 112448.0 | 43552.0 | 731545.0 | 0.153713 | 0.059534 |
| 3 | 2021-02-17 | AK | 55101 | 807 | 288 | 8 | 0 | 0 | 130751.0 | 62474.0 | 731545.0 | 0.178733 | 0.085400 |
| 4 | 2021-02-24 | AK | 55986 | 885 | 289 | 1 | 0 | 0 | 156027.0 | 87061.0 | 731545.0 | 0.213284 | 0.119010 |
Fantastic! Everything is clear, legible, and easy to work with.
To make things even better, we don't have any lingering NaN data:
# Display any row that contains a NaN value
df[df.isna().any(axis=1)]
| date | state | tot_cases | new_cases | tot_deaths | new_deaths | new_historic_cases | new_historic_deaths | partial_or_full_vaccination | full_vaccination | pop | partial_or_full_vaccination_pct | full_vaccination_pct |
|---|---|---|---|---|---|---|---|---|---|---|---|---|

(empty result: no rows contain NaN values)
It's time to see what our data actually looks like!
# Create 4 subplots, displayed in a 2 by 2 layout
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20,8))
# Group by state, plotting each state as its own series of points
for key, group in df.groupby('state'):
    ax1.scatter(group.date, group.tot_cases, 6)
    ax2.scatter(group.date, group.new_cases, 6)
    ax3.scatter(group.date, group.partial_or_full_vaccination, 6)
    ax4.scatter(group.date, group.partial_or_full_vaccination_pct, 6)
# Label each subplot
ax1.set(title='State-separated Total COVID-19 Cases over time', ylabel='Number of cases')
ax2.set(title='State-separated New Weekly COVID-19 Cases over time', ylabel='Number of cases')
ax3.set(title='State-separated Total COVID-19 Vaccinations over time', ylabel='Number of vaccinations')
ax4.set(title='State-separated Total COVID-19 Percent Vaccinations over time', ylabel='Number of vaccinations as percent of population')
plt.show()
What are we actually looking at in these graphs?
Top-left graph: The number of total COVID-19 cases in each state from 2021 to the present. Each state is represented by a different color, and each individual observation is represented by a point on the scatterplot.
Top-right graph: The number of COVID-19 cases in each state from 2021 to the present, represented by a weekly total. Each state is represented by a different color, and each individual observation is represented by a point on the scatterplot. Notice how the data is much more erratic with highs and lows, compared to the top-left graph which only increases as time goes on (you can't un-count a COVID-19 case).
Bottom-left graph: The number of administered COVID-19 vaccinations in each state. Each state is represented by a different color.
Bottom-right graph: The number of administered COVID-19 vaccinations in each state, divided by the population of the state. Each state is represented by a different color. Notice how when you account for the population of the state, the data is more clustered together. It is important to recognize properties such as this, as external factors that you are not even considering for your model can have a major impact on your model's results.
Just by looking at the graphs, we can gain a lot of valuable information. First, the heights of the data points differ greatly between states in the graphs that do not account for population, but the curves for the different states all follow the same general shape: they have peaks, dips, and plateaus in the same places. This indicates that the state in which the data was collected does not play a major part in the distribution of either cases or vaccinations. That is to say: using the state as a variable in the machine learning model will not generate fruitful results.
Instead, we can vastly simplify our task by taking the mean of the data at each date. When we do this, the graphs change as follows:
# Take averages of each column, grouped by date
df_avg = df.groupby('date', as_index=False)[['tot_cases', 'new_cases', 'partial_or_full_vaccination', 'partial_or_full_vaccination_pct']].mean()
# 2 by 2 grid of plots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20,8))
ax1.scatter(df_avg.date, df_avg.tot_cases, 6)
ax1.set(title='Total COVID-19 Cases over time', ylabel='Number of cases')
ax2.scatter(df_avg.date, df_avg.new_cases, 6)
ax2.set(title='New Weekly COVID-19 Cases over time', ylabel='Number of cases')
ax3.scatter(df_avg.date, df_avg.partial_or_full_vaccination, 6)
ax3.set(title='Total COVID-19 Vaccinations over time', ylabel='Number of vaccinations')
ax4.scatter(df_avg.date, df_avg.partial_or_full_vaccination_pct, 6)
ax4.set(title='Total COVID-19 Percent Vaccinations over time', ylabel='Number of vaccinations as percent of population')
plt.show()
Now the graphs are much less cluttered and easier to understand, and we didn't lose the distributions' unique shapes.
Now, what we all have been waiting for... the Model!
First we need to ask: What are we actually trying to test? If we look back at our hypothesis, we want to know whether or not an increase in COVID-19 vaccinations coincides with a decrease in new COVID-19 cases.
It looks like we have the two variables we need to test ready to go: new weekly cases (df_avg.new_cases) and total vaccinations over time (df_avg.partial_or_full_vaccination).
But before we create the machine learning model, let's take another look at the data.
fig, ax = plt.subplots(1, 1)
plt.scatter(df_avg.partial_or_full_vaccination, df_avg.new_cases, 6)
ax.set(title='Number of Vaccinations vs. Number of New Weekly Cases', xlabel='Number of vaccinations', ylabel='Number of cases')
plt.show()
This plot shows both variables of interest, and it looks... interesting. It is difficult to discern any specific correlation just by looking at the graph as-is. Nevertheless, we will plug it into our model.
The model itself is a Linear Regression model from the scikit-learn library (sklearn). Scikit-learn offers many different machine learning models to experiment with, and you can learn more about them and about how each model works under the hood by reading the documentation.
The Linear Regression in this example uses ordinary least squares: the loss is the residual sum of squares, and that is the quantity the regression works to minimize.
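Concretely, for a single feature $x$ the model chooses the intercept $\beta_0$ and slope $\beta_1$ that minimize the residual sum of squares:

$$\min_{\beta_0,\,\beta_1} \; \sum_{i} \left(y_i - (\beta_0 + \beta_1 x_i)\right)^2$$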
To fit the model, we randomly split the data. 75% of the data is used to train the model, and 25% is used to test the model.
Scoring of the data is described in the scikit learn documentation:
The coefficient of determination is defined as $(1-\frac{u}{v})$, where $u$ is the residual sum of squares and $v$ is the total sum of squares.
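Written out in terms of the test observations $y_i$, their predictions $\hat{y}_i$, and their mean $\bar{y}$, that is:

$$R^2 = 1 - \frac{u}{v}, \qquad u = \sum_{i}\left(y_i - \hat{y}_i\right)^2, \qquad v = \sum_{i}\left(y_i - \bar{y}\right)^2$$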
# Identifying our data: the population-normalized vaccination percentage is the feature (X), new weekly cases the target (y)
X = np.array(df_avg.partial_or_full_vaccination_pct).reshape(-1, 1)
y = np.array(df_avg.new_cases).reshape(-1, 1)
# Splitting the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# Creating the linear regression model and fitting it to our data
reg = LinearRegression()
reg.fit(X_train, y_train)
# Displaying the scatterplot from above with the line of best fit overlayed
plt.scatter(X, y, 6)
plt.plot(X, reg.predict(X), label='y = {:.2f}x + {:.2f}'.format(reg.coef_[0][0], reg.intercept_[0]))
plt.title('Number of Vaccinations vs. Number of New Weekly Cases')
plt.xlabel('Number of vaccinations')
plt.ylabel('Number of cases')
plt.legend()
plt.show()
# Model score
print('Score: ' + str(reg.score(X_test, y_test)))
Score: 0.003763777326642126
We have our results! But how do we interpret them?
First we'll look at the line. Our coefficient is positive, but small relative to the scale of the data. This means that the Linear Regression model found only a very slight positive correlation between the number of total vaccinations and the number of new cases.
Oh no! That conflicts with our hypothesis! Were we wrong? Maybe.
Before we discuss that, let's look at the score...
...and it's more bad news. A score of 1.0 is the best possible, and scores get worse the lower they go: a value this close to 0 means the model explains almost none of the variation in new cases, and a negative score indicates a very poor fit, worse than simply predicting the mean.
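To make the score more concrete, here is the same quantity computed by hand on the test set (a sketch; because train_test_split shuffles randomly, the exact number will vary from run to run unless a fixed random_state is passed):

# Reproduce reg.score(X_test, y_test) directly from its definition
y_pred = reg.predict(X_test)
u = ((y_test - y_pred) ** 2).sum()         # residual sum of squares
v = ((y_test - y_test.mean()) ** 2).sum()  # total sum of squares
print('Manual R^2:', 1 - u / v)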
A residual plot confirms the bad news, as the residuals do not cluster around the x-axis the way they would for a well-fitted model.
# Residuals calculated by taking the difference between the actual y values and the predicted y values
plt.scatter(X, y - reg.predict(X), 6)
plt.plot(X, np.zeros((len(X), 1)))  # horizontal reference line at y = 0 (avoids hard-coding the row count)
plt.title('Residual Plot of Number of Vaccinations vs. Number of New Weekly Cases')
plt.xlabel('Number of vaccinations')
plt.ylabel('Residuals of number of cases')
plt.show()
Let's try something else. It is pretty clear that there is a massive spike in new cases between November 2021 and April 2022. What if we treat that data as an outlier and remove it from the regression analysis?
# Used to select only rows which lie outside of this date range
datefilter = (df_avg.date < '2021-11-01') | (df_avg.date > '2022-04-01')
fig, ax = plt.subplots(1, 1)
plt.scatter(df_avg.partial_or_full_vaccination[datefilter], df_avg.new_cases[datefilter], 6)
ax.set(title='Number of Vaccinations vs. Number of New Weekly Cases', xlabel='Number of vaccinations', ylabel='Number of cases')
plt.show()
The data with the "outlier" date range cut out is shown above. Now to rebuild the model.
# Same modeling process as before, using the filtered data
X = np.array(df_avg.partial_or_full_vaccination_pct[datefilter]).reshape(-1, 1)
y = np.array(df_avg.new_cases[datefilter]).reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
reg = LinearRegression()
reg.fit(X_train, y_train)
plt.scatter(X, y, 6)
plt.plot(X, reg.predict(X), label='y = {:.2f}x + {:.2f}'.format(reg.coef_[0][0], reg.intercept_[0]))
plt.title('Number of Vaccinations vs. Number of New Weekly Cases')
plt.xlabel('Number of vaccinations')
plt.ylabel('Number of cases')
plt.legend()
plt.show()
print('Score: ' + str(reg.score(X_test, y_test)))
Score: -0.07761144181084223
Look at that! Our line of best fit now slopes downward, and the negative coefficient indicates a negative correlation, just as we predicted! Does that mean we fixed our model?
Definitely not.
First of all, take a look at the score: it is even worse! You can tell very clearly from the graph that the points do not follow a straight line that looks anything like our line of best fit. No matter how tempting, we can't just remove data because doing so seems to improve our results. The massive spike in COVID-19 cases is a very important feature of the data, even if our model can't explain why it's there. People in the real world probably care about an enormous spike in cases of COVID-19, don't you think?
We can try to manipulate the data all we want, but nothing we do will change the fact that a linear model is a poor fit for this set of data.
Our model was very poor. Does this mean all of our work was for nothing? Does getting vaccinated not do anything to prevent COVID-19?
Our work was NOT a waste. Despite our model's results, we DO NOT have the information to conclude anything about the second question. The only things we can conclude are that our model was not a good fit for our data and that there is NOT a strong linear correlation between the two specific variables we tested. The fact of the matter is that COVID-19 is an extremely complicated phenomenon that cannot be boiled down to these two variables. A significant number of factors influence whether or not a person gets infected with COVID-19.
However, this is not a dead end! All we did was rule out something that does not work well. Perhaps there is a non-linear machine learning model that fits this data quite nicely and reveals astonishing trends (one such experiment is sketched below). Data science is a constantly evolving, dynamic process, not a strict assembly line. We can always go back and forth and try new strategies, new data, new models, and so on.
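As one illustration of what trying a different model could look like, here is a minimal sketch that swaps the straight line for a polynomial curve, reusing the most recent train/test split. The degree is an arbitrary choice for demonstration, not a claim that a polynomial is the right model for this data:

# Sketch: fit a degree-3 polynomial instead of a straight line
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

poly_reg = make_pipeline(PolynomialFeatures(degree=3), LinearRegression())
poly_reg.fit(X_train, y_train)
print('Polynomial model score:', poly_reg.score(X_test, y_test))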
Our model may not be able to give us many definitive answers, but it does give us insight into the nature of the numbers surrounding COVID-19 and the kind of effort it takes to analyze the true impact that COVID-19 has had on our lives. Now that you have followed this tutorial from start to finish, you have traversed the entire data science pipeline. Congratulations! I hope you learned everything you wanted to learn, and I hope this piqued your interest in the world of data science.
If you're looking for more, try some of these sources!
hevodata.com's explanation of the data science pipeline
GeeksforGeeks' comprehensive guide to data science with Python
The impact of vaccination on COVID-19 outbreaks in the United States - Moghadas, Seyed M., et al.