Objective
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from google.colab import drive
# Mount Google Drive at /content/gdrive so the CSV below can be read from it
drive.mount('/content/gdrive')
# Load the raw weatherAUS CSV into a DataFrame
df=pd.read_csv(r'/content/gdrive/MyDrive/ANALYTICS/weatherAUS.csv')
# Work on a copy so that df keeps the data exactly as it was loaded
rainAUS=df.copy()
Data Quality
# Report the number of rows in the raw dataset
rowCount = df.shape[0]
print(f'the dataset have {df.shape[0]} entries')

# Per-column quality summary: missing-value count and fraction,
# number of distinct values, and dtype
nullCounts = df.isnull().sum()
dfInfo = pd.DataFrame(
    {
        'nullQtd': nullCounts,
        'nullPerc': nullCounts / rowCount,
        'uniqueValues': df.nunique(),
        'types': df.dtypes,
    }
)

# Columns ordered from the highest to the lowest fraction of missing values
dfInfo.sort_values(by='nullPerc', ascending=False)

# Number of non-null 'Date' entries per location, smallest first
df.groupby('Location').agg({'Date': 'count'}).sort_values(by='Date', ascending=True)
Information about the quantitative columns
# Summary statistics from describe(), transposed so each column of df is one row
df.describe().T
Information about the qualitative data
# describe() restricted to object-dtype columns, transposed so each column of df is one row
df.describe(include=['O']).T
Data Preparation
# --- Build the target: the rainfall recorded on the following record of the same location ---
# Parse the dates once. Ordering on the parsed values is chronological regardless of
# how the raw 'Date' strings are formatted, and the parse is reused for the day gap below.
parsedDate = pd.to_datetime(rainAUS['Date'])

# Sort a single time (stable, so rows sharing a date keep their original relative order)
# and reuse the same per-location grouping for both shifted columns.
chronological = rainAUS.iloc[parsedDate.to_numpy().argsort(kind='stable')]
byLocation = chronological.groupby(['Location'])

# shift(-1) pulls the value of the next row within each location; the assignment
# aligns on the index, so the row order of rainAUS itself is left untouched.
rainAUS['rainNextDay'] = byLocation['Rainfall'].shift(-1)
rainAUS['nextDay'] = byLocation['Date'].shift(-1)

# Gap in days between a row and the next row of the same location
# (NaN for the last row of each location, which has no successor)
rainAUS['datediff'] = -(parsedDate - pd.to_datetime(rainAUS['nextDay'])).dt.days
rainAUS['datediff'].value_counts().sort_values(ascending = False)

# Keep only rows whose successor is exactly one day later; everything else
# (gaps in the series and the last row per location) goes to `removed`
isConsecutive = rainAUS['datediff'] == 1
removed = rainAUS.loc[~isConsecutive]
filtered_rainAUSF = rainAUS.loc[isConsecutive]

# Keep only the columns with less than 20% missing values
nullShare = filtered_rainAUSF.isnull().sum() / filtered_rainAUSF.shape[0]
filtered_rainAUSF = filtered_rainAUSF.loc[:, nullShare < 0.2]
print(filtered_rainAUSF.shape)

# Rows without a target value cannot be used
filtered_rainAUSF = filtered_rainAUSF.loc[filtered_rainAUSF['rainNextDay'].notnull()]
print(filtered_rainAUSF.shape)

# Dropping fields which are not needed
# (rebinding instead of inplace=True: the frame comes from .loc selections, and
# in-place mutation of such a frame is what triggers SettingWithCopyWarning)
filtered_rainAUSF = filtered_rainAUSF.drop(columns=['nextDay', 'datediff', 'RainTomorrow'])

# Deleting rows which have missing values
filtered_rainAUSF = filtered_rainAUSF.dropna()

# Final dataset to be applied at the model(s)
filtered_rainAUSF.to_csv('/content/gdrive/MyDrive/ANALYTICS/filtered.csv')