Introduction
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('/work/data/df_cleaned.csv')
df
df.dtypes
df['fper'] = pd.to_datetime(df['fper'])
df['despropie'] = df['despropie'].astype('category')
df['destipbr'] = df['destipbr'].astype('category')
df['descimi'] = df['descimi'].astype('category')
df['desestru'] = df['desestru'].astype('category')
df['despared'] = df['despared'].astype('category')
df['descubi'] = df['descubi'].astype('category')
df['descres'] = df['descres'].astype('category')
df['DESCTIPOBR'] = df['DESCTIPOBR'].astype('category')
df['DESCDISPSUT'] = df['DESCDISPSUT'].astype('category')
df['DESCDISPUSO'] = df['DESCDISPUSO'].astype('category')
df['DESCDISPARC'] = df['DESCDISPARC'].astype('category')
df['DESCDISPRFI'] = df['DESCDISPRFI'].astype('category')
df['DESCDISPRFX'] = df['DESCDISPRFX'].astype('category')
df['DESCAREAUR'] = df['DESCAREAUR'].astype('category')
df.dtypes
df['DESCAREAUR'].unique()
Data Analysis
Univariate Analysis
sns.set_theme()
sns.countplot(y='desprovin', data=df)
df.groupby(['desprovin'])[['descodcantf']].count().sort_values(by='descodcantf', ascending=False)
df_5_provinces = df.groupby(['desprovin'])[['descodcantf']].count().sort_values(by='descodcantf', ascending=False).head(5).sum()
df_rest_provinces = df.groupby(['desprovin'])[['descodcantf']].count().sort_values(by='descodcantf', ascending=False)[5:].sum()
sizes_pie = [float(df_5_provinces), float(df_rest_provinces)]
labels_pie = ['5 Provinces with Most Constructions', 'Rest of Provinces']
fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(sizes_pie, labels=labels_pie, autopct='%1.1f%%')
ax.axis('equal')
plt.title('The 5 Provinces with most constructions compared to the rest')
plt.show()
df.groupby(['descodcantf'])[['desmes']].count().sort_values(by='desmes', ascending=False)
df_5_cantons = df.groupby(['descodcantf'])[['desmes']].count().sort_values(by='desmes', ascending=False).head(5).sum()
df_rest_cantons = df.groupby(['descodcantf'])[['desmes']].count().sort_values(by='desmes', ascending=False)[5:].sum()
sizes_pie = [float(df_5_cantons), float(df_rest_cantons)]
labels_pie = ['5 Cantons with Most Constructions', 'Rest of Cantons']
fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(sizes_pie, labels=labels_pie, autopct='%1.1f%%')
ax.axis('equal')
plt.title('The 5 Cantons with most constructions compared to the rest')
plt.show()
df.groupby(['desprovin'])[['CVAE']].sum().sort_values(by='CVAE', ascending=False)
df_5_provinces_cost = df.groupby(['desprovin'])[['CVAE']].sum().sort_values(by='CVAE', ascending=False).head(5).sum()
df_rest_provinces_cost = df.groupby(['desprovin'])[['CVAE']].sum().sort_values(by='CVAE', ascending=False)[5:].sum()
sizes_pie = [float(df_5_provinces_cost), float(df_rest_provinces_cost)]
labels_pie = ['5 Provinces with Most Spending', 'Rest of Provinces']
fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(sizes_pie, labels=labels_pie, autopct='%1.1f%%')
ax.axis('equal')
plt.title('The 5 Provinces with most construction spending compared to the rest')
plt.show()
df.groupby(['descodcantf'])[['CVAE']].sum().sort_values(by='CVAE', ascending=False)
df_5_cantons_cost = df.groupby(['descodcantf'])[['CVAE']].sum().sort_values(by='CVAE', ascending=False).head(5).sum()
df_rest_cantons_cost = df.groupby(['descodcantf'])[['CVAE']].sum().sort_values(by='CVAE', ascending=False)[5:].sum()
sizes_pie = [float(df_5_cantons_cost), float(df_rest_cantons_cost)]
labels_pie = ['5 Cantons with Most Spending', 'Rest of Cantons']
fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(sizes_pie, labels=labels_pie, autopct='%1.1f%%')
ax.axis('equal')
plt.title('The 5 cantons with most construction spending compared to the rest')
plt.show()
df_date_index = df.set_index('fper')
df_date_index['Quarter'] = df_date_index.index.quarter
df_date_index['Month'] = df_date_index.index.month
df_date_index['Day'] = df_date_index.index.weekday
df_date_index
df_date_index.groupby('Quarter')[['CVAE']].count().plot.bar(rot=0)
df_date_index.groupby('Quarter')[['CVAE']].sum().plot.bar(rot=0)
df_date_index.groupby('Month')[['CVAE']].count().plot.bar(rot=0)
df_date_index.groupby('Month')[['CVAE']].sum().plot.bar(rot=0)
sns.countplot(y='despropie', data=df_date_index)
df_date_index.groupby('despropie')[['CVAE']].sum().plot.bar(rot=0)
pd.options.display.float_format = '{:,.2f}'.format
df_date_index.groupby(['despropie', 'DESCDISPUSO'])[['CVAE']].median()
sns.countplot(y='destipbr', data=df_date_index)
sns.countplot(y='descimi', data=df_date_index)
sns.countplot(y='desestru', data=df_date_index)
sns.countplot(y='despared', data=df_date_index)
sns.countplot(y='descubi', data=df_date_index)
sns.countplot(y='descres', data=df_date_index)
sns.countplot(y='DESCTIPOBR', data=df_date_index)
sns.countplot(y='DESCDISPSUT', data=df_date_index)
sns.countplot(y='DESCDISPUSO', data=df_date_index)
sns.countplot(y='DESCDISPARC', data=df_date_index)
sns.countplot(data=df_date_index, y='DESCDISPRFI')
sns.countplot(y='DESCDISPRFX', data=df_date_index)
sns.countplot(y='DESCAREAUR', data=df_date_index)
sns.displot(data=df_date_index, x='nucu', kde=True)
df_date_index[['nucu']].describe()
sns.displot(data=df_date_index, x='nudo', kde=True)
df_date_index[['nudo']].describe()
sns.displot(data=df_date_index, x='nupi', kde=True)
df_date_index[['nupi']].describe()
df_date_index[['nupi', 'tonp', 'NUVICAL']].describe()
df_date_index.describe()
Bivariate Analysis
#Type of property (private/public) and type of construction
sns.countplot(x='despropie', hue='destipbr' ,data=df_date_index)
#Public property and type of construction
sns.countplot(data=df_date_index[(df_date_index['despropie'] == 'Pública')], x='despropie', hue='destipbr')
plt.legend(loc='upper left')
#Type of property (private/public) and type of structure
sns.countplot(x='despropie', hue='desestru' ,data=df_date_index)
plt.legend(loc='upper right')
#Public property and type of construction
sns.countplot(data=df_date_index[(df_date_index['despropie'] == 'Pública')], x='despropie', hue='desestru')
plt.legend(loc='upper left')
#Type of property (private/public) and type of use
sns.countplot(hue='despropie', y='DESCDISPUSO' ,data=df_date_index)
plt.legend(loc='upper right')
#Public property and type of construction
sns.countplot(data=df_date_index[(df_date_index['despropie'] == 'Pública')], y='DESCDISPUSO', hue='despropie')
plt.legend(loc='lower right')
#Type of property (private/public) and financing type
sns.countplot(hue='despropie', y='DESCDISPRFI' ,data=df_date_index)
plt.legend(loc='upper right')
#Public property and financing type
sns.countplot(data=df_date_index[(df_date_index['despropie'] == 'Pública')], y='DESCDISPRFI', hue='despropie')
plt.legend(loc='lower right')
#Quarter
df_date_index.groupby(['Quarter', 'despropie'])[['despropie']].count()
df_date_index[(df_date_index['despropie'] == 'Pública')].groupby(['Quarter', 'despropie'])[['desestru']].count().plot.bar(rot=45)
df_price = df_date_index.groupby(['despropie', 'DESCDISPUSO'])[['CVAE', 'CARCO']].sum()
df_price['Price_m2'] = df_price['CVAE'] / df_price['CARCO']
df_price
df_price['Price_m2'] = df_price['Price_m2'].fillna(0)
fig, ax = plt.subplots(figsize=(10, 7))
sns.lineplot(data=df_price, x='DESCDISPUSO', y='Price_m2', hue='despropie')
plt.xticks(rotation=90)
fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(data=df_price.reset_index(), x='DESCDISPUSO', y='Price_m2', hue='despropie')
plt.xticks(rotation=90)
df_date_index['Price_m2'] = df_date_index['CVAE'] / df_date_index['CARCO']
df_date_index['Price_m2'] = df_date_index['Price_m2'].fillna(0)
df_date_index['Price_m2'].describe()
fig, ax = plt.subplots(figsize=(40, 40))
sns.heatmap(df_date_index.corr(), annot=True, square=True, annot_kws={"fontsize":15})
df_date_index['DESCDISPRFI'].unique()
df_date_index.dtypes
#Correlation number of floors and rooms
sns.regplot(data=df_date_index, x='tonp', y='NUCUCAL')
plt.ylim(0, 50)
#Corr construction area and green area
sns.regplot(data=df_date_index, x='CARCO', y='CAESV')
#Corr land area and construction area
sns.regplot(data=df_date_index, x='CSUTE', y='CARCO')
#Corr land area and green area
sns.regplot(data=df_date_index, x='CSUTE', y='CAESV')
#Corr construction area and green area
sns.regplot(data=df_date_index, x='CARCO', y='CAESV')
#Corr construction area and total price of construction
sns.regplot(data=df_date_index, x='CARCO', y='CVAE')
#Corr land and total price of construction
sns.regplot(data=df_date_index, x='CSUTE', y='CVAE')
#Corr green area and total price of construction
sns.regplot(data=df_date_index, x='CAESV', y='CVAE')
df_date_index.dtypes
Multivariate Analysis
#Correlation number of floors and rooms
sns.scatterplot(data=df_date_index, x='tonp', y='NUCUCAL', hue='despropie')
plt.ylim(0, 50)
#Corr construction area and green area by type of property
sns.scatterplot(data=df_date_index, x='CARCO', y='CAESV', hue='despropie')
#Corr land area and construction area by type of property
sns.scatterplot(data=df_date_index, x='CSUTE', y='CARCO', hue='despropie')
#Corr land area and green area
sns.scatterplot(data=df_date_index, x='CSUTE', y='CAESV', hue='despropie')
df_date_index.groupby(['despropie', 'desestru'])[['tonp']].max()
#Corr construction area and total price of construction by type of property
sns.scatterplot(data=df_date_index, x='CARCO', y='CVAE', hue='despropie')
plt.ylim(0, 2000000)
plt.xlim(0, 15000)
#Corr green area and total price of construction by type of property
sns.scatterplot(data=df_date_index, x='CAESV', y='CVAE', hue='despropie')
plt.ylim(0, 2000000)
plt.xlim(0, 15000)
#Corr construction area and total price of construction by type of structure
sns.scatterplot(data=df_date_index, x='CARCO', y='CVAE', hue='desestru')
#Corr green area and total price of construction by type of structure
sns.scatterplot(data=df_date_index, x='CAESV', y='CVAE', hue='desestru')
#Corr construction area and green area by type of construction
sns.scatterplot(data=df_date_index, x='CARCO', y='CAESV', hue='desestru')