import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import date
import csv
import seaborn as sns
# Enlarge the default figure size used by every plot created below.
plt.rc('figure', figsize=(10, 6))
# Record the run date and echo it as day, abbreviated month name and year.
today = date.today()
d4 = f"{today:%d-%b-%Y}"
print('Date: ' + d4)
Date: 18-Nov-2021
# Load the registrations export (decoded as Latin-1) and take a first look at it.
registrations_path = '/work/registrations_tgp_16092021.csv'
df_dm = pd.read_csv(registrations_path, encoding='latin-1')
df_dm.head()  # first rows; only displayed when run interactively
df_dm.info()  # column names, non-null counts and dtypes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6700 entries, 0 to 6699
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 AGE 6700 non-null int64
1 SEX 6700 non-null object
2 REGDATE 6700 non-null object
3 STATUS 6700 non-null object
dtypes: int64(1), object(3)
memory usage: 209.5+ KB
# Convert the REGDATE column to datetimes so the `.dt` accessor can be used
# on it, then print the frame summary again to check the resulting dtype.
regdate_parsed = pd.to_datetime(df_dm['REGDATE'])
df_dm['REGDATE'] = regdate_parsed
df_dm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6700 entries, 0 to 6699
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 AGE 6700 non-null int64
1 SEX 6700 non-null object
2 REGDATE 6700 non-null datetime64[ns]
3 STATUS 6700 non-null object
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 209.5+ KB
# Registrations per calendar year (the result is only shown interactively).
df_dm['REGDATE'].dt.year.value_counts().sort_index()

# Recode SEX from text labels to integers: Unknown -> 2, Male -> 1, Female -> 0.
# The substitutions run in this order, one label at a time, and the column is
# converted to a numeric dtype once every label has been replaced.
for sex_label, sex_code in (('Unknown', '2'), ('Male', '1'), ('Female', '0')):
    df_dm['SEX'] = df_dm.SEX.str.replace(sex_label, sex_code)
df_dm['SEX'] = pd.to_numeric(df_dm['SEX'])
df_dm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6700 entries, 0 to 6699
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 AGE 6700 non-null int64
1 SEX 6700 non-null int64
2 REGDATE 6700 non-null datetime64[ns]
3 STATUS 6700 non-null object
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 209.5+ KB
# Scatter plot of AGE against REGDATE on a dedicated figure; each point is
# coloured by its SEX value through the viridis colormap.
fig, ax = plt.subplots()
df_dm.plot.scatter(
    x='REGDATE',
    y='AGE',
    c="SEX",
    s=5,
    cmap='viridis',
    alpha=0.65,
    linewidth=1,
    title='Registrations all time =. Scatter Plot AGE / REGDATE / SEX',
    ax=ax,
)
# All-time registration counts by calendar year, by month and by weekday.
#
# Each chart gets its own figure. A Series plot drawn without an explicit
# `ax` reuses the current axes when a figure is already open, so when this
# file runs as a plain script the bars would otherwise be drawn on top of the
# previous chart.
bar_color = (0.4, 0.4, 0.6, 0.6)  # RGBA tuple shared by the month and weekday charts
regdate = df_dm['REGDATE']

plt.figure()
regdate.dt.year.value_counts().sort_index().plot(
    kind='bar',
    title='Registration per Year',
)

plt.figure()
regdate.dt.month.value_counts().sort_index().plot(
    kind='bar',
    color=bar_color,
    title='Registrations by Month of the Year (Jan - Dec) - All Time',
)

# `dt.weekday` numbers the days 0 (Monday) through 6 (Sunday). This chart
# previously carried the month chart's title, which mislabelled it.
plt.figure()
regdate.dt.weekday.value_counts().sort_index().plot(
    kind='bar',
    color=bar_color,
    title='Registrations by Day of the Week (Mon - Sun) - All Time',
)
# Registration counts for single years: months of 2019, then months,
# weekdays and ISO weeks of 2021.
#
# Each chart gets its own figure; a Series plot drawn without an explicit
# `ax` reuses the current axes when a figure is already open, which would
# stack these charts on top of each other in a plain script.
regdate_2019 = df_dm.loc[df_dm['REGDATE'].dt.year == 2019, 'REGDATE']
regdate_2021 = df_dm.loc[df_dm['REGDATE'].dt.year == 2021, 'REGDATE']

plt.figure()
regdate_2019.dt.month.value_counts().sort_index().plot(
    kind='bar',
    title='Registration by Month of the Year (Jan - Dec) - 2019',
)

plt.figure()
regdate_2021.dt.month.value_counts().sort_index().plot(
    kind='bar',
    title='Registration by Month of the Year (Jan - Dec) - 2021',
)

# `dt.weekday` numbers the days 0 (Monday) through 6 (Sunday). This chart
# previously reused the month title, which mislabelled it.
plt.figure()
regdate_2021.dt.weekday.value_counts().sort_index().plot(
    kind='bar',
    title='Registration by Day of the Week (Mon - Sun) - 2021',
)

# `Series.dt.week` is deprecated (pandas emits a FutureWarning for it);
# `dt.isocalendar().week` is the replacement named in that warning. The stray
# ")" at the end of the title has been dropped.
plt.figure()
regdate_2021.dt.isocalendar().week.value_counts().sort_index().plot(
    kind='bar',
    color=(0.4, 0.4, 0.6, 0.6),
    title='Registration in 2021 - Weeks',
)
<ipython-input-23-2e4d26c6d009>:1: FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead.
df_dm[(df_dm.REGDATE.dt.year == 2021)].REGDATE.dt.week.value_counts().sort_index().plot(kind='bar', color=(0.4, 0.4, 0.6, 0.6), title='Registration in 2021 - Weeks)')
# Per-year registration charts: ISO-week counts for some years, month counts
# for others. Every title is built from the year actually used in the filter,
# so a chart can no longer be labelled with a different year or granularity
# than the data it shows.
#
# NOTE(review): the original titles disagreed with the filters -- the 2013
# data was titled "2019", the 2019 data "2020", the 2020 month chart "Weekly",
# and the 2018 chart "20181". The filter years are kept unchanged here; please
# confirm that 2013 (rather than 2019) is the year intended for the second
# weekly chart.
#
# Each chart gets its own figure; a Series plot drawn without an explicit
# `ax` reuses the current axes when a figure is already open.
for weekly_year in (2021, 2013, 2019):
    in_year = df_dm['REGDATE'].dt.year == weekly_year
    iso_weeks = df_dm.loc[in_year, 'REGDATE'].dt.isocalendar().week
    plt.figure()
    iso_weeks.value_counts().sort_index().plot(
        kind='bar',
        title=f'Weekly Registration {weekly_year}',
    )

for monthly_year in (2020, 2018):
    in_year = df_dm['REGDATE'].dt.year == monthly_year
    months = df_dm.loc[in_year, 'REGDATE'].dt.month
    plt.figure()
    months.value_counts().sort_index().plot(
        kind='bar',
        title=f'Registration by Month of the Year (Jan - Dec) - {monthly_year}',
    )