import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statistics
!pip install missingno
import missingno as msno
import numpy as np
from scipy.stats import pearsonr
import os
import json
from pandas.io.json import json_normalize
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
Requirement already satisfied: missingno in /root/venv/lib/python3.7/site-packages (0.5.0)
Requirement already satisfied: scipy in /shared-libs/python3.7/py/lib/python3.7/site-packages (from missingno) (1.7.1)
Requirement already satisfied: matplotlib in /shared-libs/python3.7/py/lib/python3.7/site-packages (from missingno) (3.4.3)
Requirement already satisfied: numpy in /shared-libs/python3.7/py/lib/python3.7/site-packages (from missingno) (1.19.5)
Requirement already satisfied: seaborn in /shared-libs/python3.7/py/lib/python3.7/site-packages (from missingno) (0.11.2)
Requirement already satisfied: pyparsing>=2.2.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib->missingno) (2.4.7)
Requirement already satisfied: cycler>=0.10 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->missingno) (0.10.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->missingno) (1.3.2)
Requirement already satisfied: pillow>=6.2.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->missingno) (8.3.2)
Requirement already satisfied: python-dateutil>=2.7 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib->missingno) (2.8.2)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from cycler>=0.10->matplotlib->missingno) (1.16.0)
Requirement already satisfied: pandas>=0.23 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from seaborn->missingno) (1.2.5)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.23->seaborn->missingno) (2021.3)
WARNING: You are using pip version 21.2.4; however, version 21.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
#règle quelques soucis d'affichage des graphiques
%matplotlib inline
sns.set_context('notebook')
%config InlineBackend.figure_format = 'retina'
def GraphStat1(df,Var,scale_factor,xmin,ymin):
#df = data Var= index scale_factor=dimension x y xmin ymin
# Tracé des histogrammes des principales colonnes continues quantitatives
fig = plt.figure(figsize=(12,10))
ligne_subset=1
for Bcl in Var:
ax = fig.add_subplot(4,3,ligne_subset)
ax.hist(df[Bcl], bins=30, color='grey')
ax.set_title(Bcl)
ax.vlines(df[Bcl].mean(), *ax.get_ylim(), color='blue', ls='-', lw=1.5)
ax.vlines(df[Bcl].median(), *ax.get_ylim(), color='green', ls='-.', lw=1.5)
ax.vlines(df[Bcl].mode(), *ax.get_ylim(), color='red', ls='-.', lw=1.5)
ax.legend(['mean', 'median', 'mode'])
ax.title.set_fontweight('bold')
ligne_subset=ligne_subset+1
plt.xlim(xmin * scale_factor, xmax * scale_factor)
plt.ylim(ymin * scale_factor, ymax * scale_factor)
plt.tight_layout(w_pad=2, h_pad=0.65)
plt.gcf().set_size_inches(20,15)
def unique_valeur_liste(list_tri):
unique = []
for valeur in list_tri:
if valeur in unique:
continue
else:
unique.append(valeur)
return unique
## Fonctions
###Fonction valmanque, paramètre (dataframe, pourcent,1 ou 0)
###Imprime les colonnes avec le nombre de ligne avec valeurs ainsi que le pourcentage sans valeurs
###puis retourne une liste contenant les nom de colonnes ayant un pourcentage inférieur a la valeur transmise.
### retourne aussi le nombre de lignes non nulles correspondant à la colonne ValNa
def valmanque(df,perct,imp):
# recherche des valeurs manquantes
NutriColonne=df.columns.values
ListeNaN=[]
ValNaN=df.isna().sum()
#nb Lignes
LigneNonNull=[]
NbLignes=len(df)
if imp == 1:
print("nombre de lignes :",NbLignes)
print("nombre de ligne contenant des valeurs nul:")
print(ValNaN)
print()
print("données par ligne après supression des valeurs null")
i=0
i2=0
while i< len(df.columns):
LigneNonNull.append(NbLignes-ValNaN[i])
if imp == 1 and (ValNaN[i]*100/NbLignes)>=perct:
print('-----------------------------------------')
print(NutriColonne[i])
print(NbLignes-ValNaN[i],' nombres de lignes remplies')
print(ValNaN[i]*100/NbLignes,' % vide')
if (ValNaN[i]*100/NbLignes)>=perct:
ListeNaN.append(NutriColonne[i])
i2=i2+1
i=i+1
print(i2,'sur',i,'variable avec plus de ',perct, 'pourcent de valeurs manquantes')
return(ListeNaN)
#fonction donnant des données staistiques de bases en indiquant un dataframe et une liste de noms de colonnes
#retourne graphiques de corrélations et un tableau de résultat
def Analyse_1(df,Var):
# Analyse statistique de base (describe fait pareil,mais je voulais ma version...)
#création du dataframe
Resume = pd.DataFrame(columns=['minimum','maximum','somme','moyenne','médiane','mode,','varianceVar','varianceStd'])
for bcl in Var:
mini=(df[bcl].min())
maxi=(df[bcl].max())
total=(df[bcl].sum())
moy=(df[bcl].mean())
mediane=(df[bcl].median())
Modee=(df[bcl].mode())
Varr=(df[bcl].var(ddof=0))
Stdd=(df[bcl].std(ddof=0))
#création du dataframe de résultat 'ajout row'
df_new_row = pd.DataFrame(data=np.array([[mini,maxi,total,moy,mediane,Modee,Varr,Stdd]]), columns=['minimum','maximum','somme','moyenne','médiane','mode,','varianceVar','varianceStd'])
Resume = pd.concat([Resume,df_new_row], ignore_index=True)
plt.plot(df[bcl])
plt.show()
insert_index = 0
insert_colname = 'Nom'
insert_values = Var # this can be a numpy array too
Resume.insert(loc=insert_index, column=insert_colname, value=insert_values)
return(Resume)
# Fonction créant un graphique de corellation en indiquant du dataframe et le nom de l'indicateur
def correl(df,indicateur):
corr=df.corr(method=indicateur)
plt.figure(figsize = (16,5))
sns.heatmap(corr,
xticklabels=corr.columns,
yticklabels=corr.columns,
cmap='RdBu_r',
annot=True,
linewidth=1)
#revoie les colonnes de df qui ne font pas partie de la liste Var
def diff(df,Var):
df1=pd.DataFrame
for Bcl in Var:
if Bcl in df.columns:
df1=df.drop(labels=Bcl, axis=1)
print (df1.shape)
return(df1)
Seattle2015 = pd.read_csv('2015-building-energy-benchmarking.csv')
Seattle2016 = pd.read_csv('2016-building-energy-benchmarking.csv')
S15=Seattle2015.copy()
S16=Seattle2016.copy()
# Contenue des deux DB
print ("Le dataset 2015 compte {} individues et {} variables".format(Seattle2015.shape[0], Seattle2015.shape[1]))
print ("Le dataset 2016 compte {} individues et {} variables".format(Seattle2016.shape[0], Seattle2016.shape[1]))
Le dataset 2015 compte 3340 individues et 47 variables
Le dataset 2016 compte 3376 individues et 46 variables
Seattle2015.columns
Seattle2016.columns
# Nom des variables en listes
columns_2015 = Seattle2015.columns.to_list()
columns_2016 = Seattle2016.columns.to_list()
print(columns_2015)
print(columns_2016)
['OSEBuildingID', 'DataYear', 'BuildingType', 'PrimaryPropertyType', 'PropertyName', 'TaxParcelIdentificationNumber', 'Location', 'CouncilDistrictCode', 'Neighborhood', 'YearBuilt', 'NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)', 'ListOfAllPropertyUseTypes', 'LargestPropertyUseType', 'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseType', 'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseType', 'ThirdLargestPropertyUseTypeGFA', 'YearsENERGYSTARCertified', 'ENERGYSTARScore', 'SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)', 'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)', 'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)', 'NaturalGas(kBtu)', 'OtherFuelUse(kBtu)', 'GHGEmissions(MetricTonsCO2e)', 'GHGEmissionsIntensity(kgCO2e/ft2)', 'DefaultData', 'Comment', 'ComplianceStatus', 'Outlier', '2010 Census Tracts', 'Seattle Police Department Micro Community Policing Plan Areas', 'City Council Districts', 'SPD Beats', 'Zip Codes']
['OSEBuildingID', 'DataYear', 'BuildingType', 'PrimaryPropertyType', 'PropertyName', 'Address', 'City', 'State', 'ZipCode', 'TaxParcelIdentificationNumber', 'CouncilDistrictCode', 'Neighborhood', 'Latitude', 'Longitude', 'YearBuilt', 'NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)', 'ListOfAllPropertyUseTypes', 'LargestPropertyUseType', 'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseType', 'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseType', 'ThirdLargestPropertyUseTypeGFA', 'YearsENERGYSTARCertified', 'ENERGYSTARScore', 'SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)', 'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)', 'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)', 'NaturalGas(kBtu)', 'DefaultData', 'Comments', 'ComplianceStatus', 'Outlier', 'TotalGHGEmissions', 'GHGEmissionsIntensity']
S15
S16
S15.shape
S16.shape
S15['TotalEmi']=S15['GHGEmissions(MetricTonsCO2e)']
S16['TotalEmi']=S16['TotalGHGEmissions']
#comparaison entre le GHGEmissions(MetricTonsCO2e) de 2015 remplacé par TotalGHGEmissions de 2016
fig, ax = plt.subplots()
ax.set_xlim(-500,1000)
ax.set(xlabel='', ylabel='')
sns.kdeplot(S15['GHGEmissions(MetricTonsCO2e)'], shade=True)
sns.kdeplot(S16['TotalGHGEmissions'], shade=True)
sns.set(rc = {'figure.figsize':(50,16)})
ax.tick_params(axis = 'both', which = 'major', labelsize = 24)
ax.tick_params(axis = 'both', which = 'minor', labelsize = 16)
plt.legend(labels=["GHGEmissions(MetricTonsCO2e)","TotalGHGEmissions"],title = "Comparaison des répartitions des deux variables",
fontsize = '30', title_fontsize = "30")
plt.xlabel('')
plt.ylabel('')
S1516= pd.concat([S15, S16], axis=0)
S1516.reset_index(drop=True,inplace=True)
S1516
S1516 = S1516.dropna(subset=["LargestPropertyUseTypeGFA"]) #supression des lignes qui ne renseigne pas sur LargestPropertyUseTypeGFA
S1516.columns
#recherche de 'esidential' pour éviter le majuscules/minuscules des variables contenant residential
df_sub=S1516.copy()
sub ='esidential'
# creating and passing series to new column
df_sub["residential"]= df_sub["BuildingType"].str.find(sub)
S1516 = df_sub[df_sub.residential != -1]
S1516
Sdrop1516=['OSEBuildingID','PrimaryPropertyType','PropertyName', 'TaxParcelIdentificationNumber', 'Location','BuildingType',
'CouncilDistrictCode', 'Neighborhood','YearsENERGYSTARCertified',
'DefaultData', 'Comment', 'ComplianceStatus', 'Outlier','PrimaryPropertyType',
'2010 Census Tracts','Seattle Police Department Micro Community Policing Plan Areas',
'City Council Districts', 'SPD Beats', 'Zip Codes', 'Address', 'City', 'State', 'ZipCode', 'Latitude', 'Longitude','Comments','ListOfAllPropertyUseTypes','LargestPropertyUseType','SecondLargestPropertyUseType','ThirdLargestPropertyUseType']
S1516=S1516.drop(labels=Sdrop1516, axis=1)
S1516.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3008 entries, 0 to 6715
Data columns (total 29 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 DataYear 3008 non-null int64
1 YearBuilt 3008 non-null int64
2 NumberofBuildings 3008 non-null float64
3 NumberofFloors 3001 non-null float64
4 PropertyGFATotal 3008 non-null int64
5 PropertyGFAParking 3008 non-null int64
6 PropertyGFABuilding(s) 3008 non-null int64
7 LargestPropertyUseTypeGFA 3008 non-null float64
8 SecondLargestPropertyUseTypeGFA 1642 non-null float64
9 ThirdLargestPropertyUseTypeGFA 669 non-null float64
10 ENERGYSTARScore 1980 non-null float64
11 SiteEUI(kBtu/sf) 3006 non-null float64
12 SiteEUIWN(kBtu/sf) 3006 non-null float64
13 SourceEUI(kBtu/sf) 3007 non-null float64
14 SourceEUIWN(kBtu/sf) 3007 non-null float64
15 SiteEnergyUse(kBtu) 3007 non-null float64
16 SiteEnergyUseWN(kBtu) 3006 non-null float64
17 SteamUse(kBtu) 3007 non-null float64
18 Electricity(kWh) 3007 non-null float64
19 Electricity(kBtu) 3007 non-null float64
20 NaturalGas(therms) 3007 non-null float64
21 NaturalGas(kBtu) 3007 non-null float64
22 OtherFuelUse(kBtu) 1467 non-null float64
23 GHGEmissions(MetricTonsCO2e) 1467 non-null float64
24 GHGEmissionsIntensity(kgCO2e/ft2) 1467 non-null float64
25 TotalEmi 3007 non-null float64
26 TotalGHGEmissions 1540 non-null float64
27 GHGEmissionsIntensity 1540 non-null float64
28 residential 3008 non-null int64
dtypes: float64(23), int64(6)
memory usage: 705.0 KB
valmanque(S1516,90,1)
nombre de lignes : 3008
nombre de ligne contenant des valeurs nul:
DataYear 0
YearBuilt 0
NumberofBuildings 0
NumberofFloors 7
PropertyGFATotal 0
PropertyGFAParking 0
PropertyGFABuilding(s) 0
LargestPropertyUseTypeGFA 0
SecondLargestPropertyUseTypeGFA 1366
ThirdLargestPropertyUseTypeGFA 2339
ENERGYSTARScore 1028
SiteEUI(kBtu/sf) 2
SiteEUIWN(kBtu/sf) 2
SourceEUI(kBtu/sf) 1
SourceEUIWN(kBtu/sf) 1
SiteEnergyUse(kBtu) 1
SiteEnergyUseWN(kBtu) 2
SteamUse(kBtu) 1
Electricity(kWh) 1
Electricity(kBtu) 1
NaturalGas(therms) 1
NaturalGas(kBtu) 1
OtherFuelUse(kBtu) 1541
GHGEmissions(MetricTonsCO2e) 1541
GHGEmissionsIntensity(kgCO2e/ft2) 1541
TotalEmi 1
TotalGHGEmissions 1468
GHGEmissionsIntensity 1468
residential 0
dtype: int64
données par ligne après supression des valeurs null
0 sur 29 variable avec plus de 90 pourcent de valeurs manquantes
S1516.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3008 entries, 0 to 6715
Data columns (total 29 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 DataYear 3008 non-null int64
1 YearBuilt 3008 non-null int64
2 NumberofBuildings 3008 non-null float64
3 NumberofFloors 3001 non-null float64
4 PropertyGFATotal 3008 non-null int64
5 PropertyGFAParking 3008 non-null int64
6 PropertyGFABuilding(s) 3008 non-null int64
7 LargestPropertyUseTypeGFA 3008 non-null float64
8 SecondLargestPropertyUseTypeGFA 1642 non-null float64
9 ThirdLargestPropertyUseTypeGFA 669 non-null float64
10 ENERGYSTARScore 1980 non-null float64
11 SiteEUI(kBtu/sf) 3006 non-null float64
12 SiteEUIWN(kBtu/sf) 3006 non-null float64
13 SourceEUI(kBtu/sf) 3007 non-null float64
14 SourceEUIWN(kBtu/sf) 3007 non-null float64
15 SiteEnergyUse(kBtu) 3007 non-null float64
16 SiteEnergyUseWN(kBtu) 3006 non-null float64
17 SteamUse(kBtu) 3007 non-null float64
18 Electricity(kWh) 3007 non-null float64
19 Electricity(kBtu) 3007 non-null float64
20 NaturalGas(therms) 3007 non-null float64
21 NaturalGas(kBtu) 3007 non-null float64
22 OtherFuelUse(kBtu) 1467 non-null float64
23 GHGEmissions(MetricTonsCO2e) 1467 non-null float64
24 GHGEmissionsIntensity(kgCO2e/ft2) 1467 non-null float64
25 TotalEmi 3007 non-null float64
26 TotalGHGEmissions 1540 non-null float64
27 GHGEmissionsIntensity 1540 non-null float64
28 residential 3008 non-null int64
dtypes: float64(23), int64(6)
memory usage: 705.0 KB
S1516.columns
df1=S1516.copy()
df1
df1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3008 entries, 0 to 6715
Data columns (total 29 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 DataYear 3008 non-null int64
1 YearBuilt 3008 non-null int64
2 NumberofBuildings 3008 non-null float64
3 NumberofFloors 3001 non-null float64
4 PropertyGFATotal 3008 non-null int64
5 PropertyGFAParking 3008 non-null int64
6 PropertyGFABuilding(s) 3008 non-null int64
7 LargestPropertyUseTypeGFA 3008 non-null float64
8 SecondLargestPropertyUseTypeGFA 1642 non-null float64
9 ThirdLargestPropertyUseTypeGFA 669 non-null float64
10 ENERGYSTARScore 1980 non-null float64
11 SiteEUI(kBtu/sf) 3006 non-null float64
12 SiteEUIWN(kBtu/sf) 3006 non-null float64
13 SourceEUI(kBtu/sf) 3007 non-null float64
14 SourceEUIWN(kBtu/sf) 3007 non-null float64
15 SiteEnergyUse(kBtu) 3007 non-null float64
16 SiteEnergyUseWN(kBtu) 3006 non-null float64
17 SteamUse(kBtu) 3007 non-null float64
18 Electricity(kWh) 3007 non-null float64
19 Electricity(kBtu) 3007 non-null float64
20 NaturalGas(therms) 3007 non-null float64
21 NaturalGas(kBtu) 3007 non-null float64
22 OtherFuelUse(kBtu) 1467 non-null float64
23 GHGEmissions(MetricTonsCO2e) 1467 non-null float64
24 GHGEmissionsIntensity(kgCO2e/ft2) 1467 non-null float64
25 TotalEmi 3007 non-null float64
26 TotalGHGEmissions 1540 non-null float64
27 GHGEmissionsIntensity 1540 non-null float64
28 residential 3008 non-null int64
dtypes: float64(23), int64(6)
memory usage: 705.0 KB
df1=df1.fillna(1) #inplace 1 pour éviter dévision par 0
#les deux calcule suivant devraient donner à peux près les mêmes valeurs, les petites différences peuvent venir d'autres PropertyUse mais devraient être minime
df1['energie_surface_largest'] = df1['SiteEnergyUse(kBtu)']/df1['LargestPropertyUseTypeGFA'] #
df1['energie_total'] = df1['SiteEnergyUse(kBtu)']/df1['PropertyGFATotal']
df1['ratio'] = df1['energie_surface_largest']/df1['energie_total'] # comparaison entre les deux résultats
print(df1['ratio'].median())
print(df1['ratio'].mean())
1.0730919534473478
1.3024738041322368
df1
a=(df1['energie_total'].median())
b=(df1['energie_total'].mean())
c=(df1['energie_surface_largest'].median())
d=(df1['energie_surface_largest'].mean())
print(d-c)
print(b-a)
26.014426236635927
20.6199557614309
df1.columns
var=['SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)',
'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)',
'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)',
'NaturalGas(kBtu)', 'OtherFuelUse(kBtu)','GHGEmissions(MetricTonsCO2e)', 'GHGEmissionsIntensity(kgCO2e/ft2)',
'TotalGHGEmissions', 'GHGEmissionsIntensity', 'residential','ratio','energie_surface_largest']
df1=df1.drop(labels=var, axis=1)
msno.matrix(df1)
df1.describe()
df1=df1[df1.select_dtypes(include=[np.number]).ge(0).all(1)]
df1.describe()
Var=df1.columns
fig = plt.figure(figsize=(12,10))
# for i, ax, c in enumerate(zip(fig.ravel(), sorted(quant_cols))):
ligne_subset=1
for Bcl in Var:
ax = fig.add_subplot(5,3,ligne_subset)
sns.histplot(df1[Bcl])
ax.set_title(Bcl)
ax.vlines(df1[Bcl].mean(), *ax.get_ylim(), color='blue', ls='-', lw=1.5)
ax.vlines(df1[Bcl].median(), *ax.get_ylim(), color='green', ls='-.', lw=1.5)
ax.vlines(df1[Bcl].mode(), *ax.get_ylim(), color='red', ls='-.', lw=1.5)
ax.legend(['mean', 'median', 'mode'])
ax.title.set_fontweight('bold')
ligne_subset=ligne_subset+1
# plt.tight_layout(w_pad=4.5, h_pad=-1)
plt.tight_layout(w_pad=2, h_pad=0.65)
plt.gcf().set_size_inches(20,15)
plt.show()
fig = plt.figure(figsize=(15,15))
ax1 = fig.add_axes()
sel_cols = Var
df1[sel_cols].boxplot(vert=False, patch_artist= False, ax=ax1)
plt.suptitle('Dispersion of quantitative data', fontsize=14,
fontweight='bold')
plt.show()
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:5: MatplotlibDeprecationWarning: Calling add_axes() without argument is deprecated since 3.3 and will be removed two minor releases later. You may want to use add_subplot() instead.
"""
correl(df1,'pearson')
df1.to_csv('Seattle_clean.csv',index = False)