Skip to main content

Sham Sui Po, Hong Kong

Plotly COVID19 Dataset Exploration

Github Repository

Covid 19 :: Preprocessed Dataset

Johns Hopkins University Center for Systems Science and Engineering - COVID 19 Dataset preprocessed by @laxmimerit.

This dataset is a coronavirus disease 2019 (COVID-19) time series listing confirmed cases, reported deaths and reported recoveries. Data is disaggregated by country (and sometimes subregion). COVID-19 is caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) and has had a worldwide effect. On March 11, 2020, the World Health Organization (WHO) declared it a pandemic, pointing to the more than 118,000 cases of the coronavirus illness in over 110 countries and territories around the world at the time.

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import math
import random
from datetime import datetime

# Shared hex colour palette. Judging by the names and later usage these are
# presumably: cnf = confirmed, dth = deaths, rec = recovered, act = active.
cnf, dth, rec, act = '#393e46', '#ff2e63', '#21bf73', '#fe9801'

# Switch Plotly into offline notebook mode; `connected=True` is passed
# through unchanged from the original cell.
import plotly as py
py.offline.init_notebook_mode(connected=True)

Datasets

# Fetch the four preprocessed CSV files with wget (IPython shell escapes);
# `-P datasets` stores each download under the local `datasets/` directory,
# which is where the read_csv calls below look for them.
!wget https://github.com/laxmimerit/Covid-19-Preprocessed-Dataset/raw/main/preprocessed/country_daywise.csv -P datasets
!wget https://github.com/laxmimerit/Covid-19-Preprocessed-Dataset/raw/main/preprocessed/daywise.csv -P datasets
!wget https://github.com/laxmimerit/Covid-19-Preprocessed-Dataset/raw/main/preprocessed/covid_19_data_cleaned.csv -P datasets
!wget https://github.com/laxmimerit/Covid-19-Preprocessed-Dataset/raw/main/preprocessed/countrywise.csv -P datasets
# Load the cleaned per-location time series; `Date` is parsed to datetime.
df_data_clean = pd.read_csv(
    'datasets/covid_19_data_cleaned.csv',
    parse_dates=['Date'],
)
# Replace missing province/state values with an empty string so the column
# holds strings only.
df_data_clean = df_data_clean.fillna({'Province/State': ''})
df_data_clean.tail(5)
DateProvince/StateCountryLatLongConfirmedRecoveredDeathsActive
3371802023-03-05Timor-Leste-8.8742125.72750000
3371812023-03-06Timor-Leste-8.8742125.72750000
3371822023-03-07Timor-Leste-8.8742125.72750000
3371832023-03-08Timor-Leste-8.8742125.72750000
3371842023-03-09Timor-Leste-8.8742125.72750000
# Load the per-country, per-day table; `Date` is parsed to datetime.
df_data_country_daywise = pd.read_csv(
    'datasets/country_daywise.csv',
    parse_dates=['Date'],
)
df_data_country_daywise.tail(5)
DateCountryConfirmedDeathsRecoveredActiveNew CasesNew RecoveredNew Deaths
2291352023-03-03Zimbabwe26412756680258459000
2291362023-03-04Zimbabwe26412756680258459000
2291372023-03-05Zimbabwe26412756680258459000
2291382023-03-06Zimbabwe26412756680258459000
2291392023-03-07Zimbabwe26412756680258459000
# Load the per-day table; `Date` is parsed to datetime.
df_data_daywise = pd.read_csv(
    'datasets/daywise.csv',
    parse_dates=['Date'],
)
df_data_daywise.tail(5)
DateConfirmedDeathsRecoveredActiveNew CasesDeaths / 100 CasesRecovered / 100 CasesDeaths / 100 RecoveredNo. of Countries
11352023-03-03675914580687732506690372551826691.020.0inf201
11362023-03-0467596877568776010669091174541951.020.0inf201
11372023-03-0567602490168777490669147152599881.020.0inf201
11382023-03-0667608294168781150669204826631961.020.0inf201
11392023-03-07676213378687903806693343401304371.020.0inf201
# Load the per-country summary table (no date column is parsed here).
countrywise_csv = 'datasets/countrywise.csv'
df_data_countrywise = pd.read_csv(countrywise_csv)
df_data_countrywise.tail(5)
CountryConfirmedDeathsRecoveredActiveNew CasesDeaths / 100 CasesRecovered / 100 CasesDeaths / 100 RecoveredPopulationCases / Million PeopleConfirmed last week1 week change1 week % increase
196West Bank and Gaza7032285708069752000.810.00.04543126154789.00000070322800.00
197Winter Olympics 20225350053500.000.00.00155266.96938853500.00
198Yemen11945215909786018.070.00.029825968400.0000001194500.00
199Zambia3431354057033907801.180.00.01838395618665.0000003430121230.04
200Zimbabwe2641275668025845902.150.00.01486292717771.0000002639212060.08

Exploration

# Count missing values per column (`isna` is the same check as `isnull`).
df_data_clean.isna().sum()
Date0
Province/State0
Country0
Lat0
Long0
Confirmed0
Recovered0
Deaths0
Active0
dtype: int64
# Print the frame summary, then display only the rows whose country is China.
df_data_clean.info()
df_data_clean[df_data_clean['Country'] == 'China']
DateProvince/StateCountryLatLongConfirmedRecoveredDeathsActive
674372020-01-22AnhuiChina31.8257117.22641001
674382020-01-23AnhuiChina31.8257117.22649009
674392020-01-24AnhuiChina31.8257117.2264150015
674402020-01-25AnhuiChina31.8257117.2264390039
674412020-01-26AnhuiChina31.8257117.2264600060
...
3337512023-03-05HenanChina33.8820113.61400000
3337522023-03-06HenanChina33.8820113.61400000
3337532023-03-07HenanChina33.8820113.61400000
3337542023-03-08HenanChina33.8820113.61400000
3337552023-03-09HenanChina33.8820113.61400000

Worldwide Covid-19 Cases

# Daily total of confirmed cases over all rows sharing a date.
# The column is selected *before* aggregating so only `Confirmed` is summed,
# instead of summing every numeric column and discarding all but one.
confirmed_infections = (
    df_data_clean
    .groupby('Date')['Confirmed']
    .sum()
    .reset_index()
)
confirmed_infections.tail(5)

Confirmed Infection Cases

            Date  Confirmed
0     2020-01-22        557
1     2020-01-23        657
2     2020-01-24        944
3     2020-01-25       1437
4     2020-01-26       2120
...
1138  2023-03-05  676024901
1139  2023-03-06  676082941
1140  2023-03-07  676213378
1141  2023-03-08  676392824
1142  2023-03-09  676570149
1143 rows × 2 columns
# Daily total of recovered cases over all rows sharing a date.
# The column is selected *before* aggregating so only `Recovered` is summed,
# instead of summing every numeric column and discarding all but one.
recovered_cases = (
    df_data_clean
    .groupby('Date')['Recovered']
    .sum()
    .reset_index()
)
recovered_cases.tail(5)

Recovered Cases

            Date  Recovered
0     2020-01-22         30
1     2020-01-23         32
2     2020-01-24         39
3     2020-01-25         42
4     2020-01-26         56
...
1138  2023-03-05          0
1139  2023-03-06          0
1140  2023-03-07          0
1141  2023-03-08          0
1142  2023-03-09          0
1143 rows × 2 columns
# Daily total of deaths over all rows sharing a date.
# The column is selected *before* aggregating so only `Deaths` is summed,
# instead of summing every numeric column and discarding all but one.
deaths = (
    df_data_clean
    .groupby('Date')['Deaths']
    .sum()
    .reset_index()
)
deaths.tail(5)

Deaths

            Date   Deaths
1138  2023-03-05  6877749
1139  2023-03-06  6878115
1140  2023-03-07  6879038
1141  2023-03-08  6880483
1142  2023-03-09  6881802
# Line chart of the worldwide daily totals: confirmed, recovered and deaths.
fig = go.Figure()
fig.update_layout(
    title='Worldwide Covid-19 Cases',
    xaxis_tickfont_size=14,
    yaxis=dict(title='Number of Cases'),
)

# One entry per line: (source frame, value column, legend label, line colour).
worldwide_series = [
    (confirmed_infections, 'Confirmed', 'Confirmed Infections', 'orange'),
    (recovered_cases, 'Recovered', 'Recovered Cases', 'dodgerblue'),
    (deaths, 'Deaths', 'Deaths', 'red'),
]

for series_df, value_column, legend_label, line_colour in worldwide_series:
    fig.add_trace(go.Scatter(
        x=series_df['Date'],
        y=series_df[value_column],
        mode='lines',
        name=legend_label,
        line=dict(color=line_colour, width=2),
    ))

# Render explicitly, like the other figures in this notebook, instead of
# relying on the notebook echoing the value of the last expression.
fig.show()

Covid 19 :: Preprocessed Dataset

Worldwide Case Density

# Animated density map of confirmed cases, one animation frame per date.
#
# The dates are turned into strings for use as animation-frame labels. This is
# done on a copy so that `df_data_clean['Date']` keeps its datetime dtype for
# the cells that follow.
density_df = df_data_clean.assign(Date=df_data_clean['Date'].astype(str))

fig = px.density_mapbox(
    data_frame=density_df,
    lat='Lat',
    lon='Long',
    # Weight every location by its confirmed count. Without `z` each point
    # counts equally, and since the same locations appear on every date the
    # animation would not change from frame to frame.
    z='Confirmed',
    hover_name='Country',
    hover_data=['Confirmed', 'Recovered', 'Deaths'],
    animation_frame='Date',
    color_continuous_scale='Portland',
    radius=7,
    zoom=0,
    height=700,
)

fig.update_layout(
    mapbox_style="open-street-map",
    title='Worldwide Covid-19 Cases',
    mapbox_center_lon=0,
)

fig.show()

Covid 19 :: Preprocessed Dataset

Cruising Corona

# Build one sub-frame per cruise ship (Diamond Princess, Grand Princess,
# MS Zaandam) by matching the ship name in the location columns.

# Make sure `Date` is datetime again in case an earlier cell stringified it.
df_data_clean['Date'] = pd.to_datetime(df_data_clean['Date'])

province_col = df_data_clean['Province/State']
country_col = df_data_clean['Country']

# Boolean row masks. The Diamond Princess mask previously searched
# Province/State for 'Grand Princess' (copy-paste slip), which pulled Grand
# Princess rows into the Diamond Princess frame.
cruise_ships_diamond = (
    province_col.str.contains('Diamond Princess')
    | country_col.str.contains('Diamond Princess')
)
grand_princess_mask = (
    province_col.str.contains('Grand Princess')
    | country_col.str.contains('Grand Princess')
)
cruise_ships_zaandam = country_col.str.contains('MS Zaandam')

cruise_df_diamond = df_data_clean[cruise_ships_diamond]
cruise_df_grand = df_data_clean[grand_princess_mask]
# Backward compatibility: the plotting cell reads the Grand Princess *frame*
# through the name `cruise_ships_grand`, so keep that name bound to it.
cruise_ships_grand = cruise_df_grand
cruise_df_zaandam = df_data_clean[cruise_ships_zaandam]
cruise_df_zaandam
DateProvince/StateCountryLatLongConfirmedRecoveredDeathsActive
2000252020-01-22MS Zaandam0.00.00000
2000262020-01-23MS Zaandam0.00.00000
2000272020-01-24MS Zaandam0.00.00000
2000282020-01-25MS Zaandam0.00.00000
2000292020-01-26MS Zaandam0.00.00000
...
2011632023-03-05MS Zaandam0.00.09027
2011642023-03-06MS Zaandam0.00.09027
2011652023-03-07MS Zaandam0.00.09027
2011662023-03-08MS Zaandam0.00.09027
2011672023-03-09MS Zaandam0.00.09027
1143 rows × 9 columns
# Line chart of reported deaths over time for each cruise ship.
fig = go.Figure()
fig.update_layout(
    title='Covid-19 Deaths on Cruise Ships',
    xaxis_tickfont_size=14,
    # The traces plot the `Deaths` column, so label the axis accordingly
    # (it previously read 'Number of Cases').
    yaxis=dict(title='Number of Deaths'),
)

# One entry per line: (ship frame, legend label, line colour).
# `cruise_ships_grand` holds the Grand Princess frame despite its name.
cruise_series = [
    (cruise_df_diamond, 'Diamond Princess', 'fuchsia'),
    (cruise_ships_grand, 'Grand Princess', 'dodgerblue'),
    (cruise_df_zaandam, 'MS Zaandam', 'mediumspringgreen'),
]

for ship_df, ship_name, line_colour in cruise_series:
    fig.add_trace(go.Scatter(
        x=ship_df['Date'],
        y=ship_df['Deaths'],
        mode='lines',
        name=ship_name,
        line=dict(color=line_colour, width=2),
    ))

# Render explicitly, like the other figures in this notebook, instead of
# relying on the notebook echoing the value of the last expression.
fig.show()

Covid 19 :: Preprocessed Dataset

# Worldwide totals per date for the four case columns.
case_columns = ['Confirmed', 'Deaths', 'Recovered', 'Active']
cases_by_date = df_data_clean.groupby('Date')[case_columns]
time_plot_df = cases_by_date.sum(numeric_only=True).reset_index()

time_plot_df.head(5)
         Date  Confirmed  Deaths  Recovered  Active
0  2020-01-22        557      17         30     510
1  2020-01-23        657      18         32     607
2  2020-01-24        944      26         39     879
3  2020-01-25       1437      42         42    1353
4  2020-01-26       2120      56         56    2008
# Keep only the row(s) carrying the most recent date, with a fresh 0-based index.
newest_date = time_plot_df['Date'].max()
is_newest = time_plot_df['Date'] == newest_date
latest_values = time_plot_df.loc[is_newest].reset_index(drop=True)
latest_values
         Date  Confirmed   Deaths  Recovered     Active
0  2023-03-09  676570149  6881802          0  669688347
# Treemap of the Active / Deaths / Recovered split on the most recent date.
#
# The counts in `time_plot_df` are running totals per date, so feeding the
# whole time series to the treemap would add the same cases up once per day.
# Only the latest snapshot is melted here.
latest_snapshot = time_plot_df[
    time_plot_df['Date'] == time_plot_df['Date'].max()
]
tp_df = latest_snapshot.melt(
    id_vars='Date',
    value_vars=['Active', 'Deaths', 'Recovered'],
)

fig = px.treemap(
    tp_df,
    path=['variable'],
    values='value',
    height=250,
    width=800,
    # Tie each category to its palette colour by name rather than by position.
    color='variable',
    color_discrete_map={'Active': act, 'Deaths': dth, 'Recovered': rec},
)

fig.data[0].textinfo = 'label+text+value'
fig.show()

Covid 19 :: Preprocessed Dataset

# Stacked area chart of Active / Deaths / Recovered counts over time.
tp_df = time_plot_df.melt(
    id_vars='Date',
    value_vars=['Active', 'Deaths', 'Recovered'],
    var_name='Case',
    value_name='Count',
)

fig = px.area(
    tp_df,
    x='Date',
    y='Count',
    color='Case',
    height=600,
    title='Cases over Time',
    # Map colours by category name. The earlier positional sequence
    # [rec, dth, act] was applied to the series order Active, Deaths,
    # Recovered, so Active and Recovered ended up with each other's colour.
    color_discrete_map={'Active': act, 'Deaths': dth, 'Recovered': rec},
)

fig.update_layout(
    xaxis_rangeslider_visible=True,
)

fig.show()

Covid 19 :: Preprocessed Dataset