import pandas as pd
import numpy as np
import klib
train = pd.read_csv('/work/Data/ml_case_training_data.csv')
train_hist = pd.read_csv('/work/Data/ml_case_training_hist_data.csv')
train_out = pd.read_csv('/work/Data/ml_case_training_output.csv')
train.head()
train_hist.head()
train_out.head()
klib.missingval_plot(train)
klib.missingval_plot(train_hist)
klib.missingval_plot(train_out)
No missing values found in the dataset.
train_cleaned = klib.data_cleaning(train)
train_hist_cleaned = klib.data_cleaning(train_hist)
train_cleaned['date_first_activ'] = pd.to_datetime(train_cleaned['date_first_activ'])
train_cleaned['date_modif_prod'] = pd.to_datetime(train_cleaned['date_modif_prod'])
train_cleaned['date_renewal'] = pd.to_datetime(train_cleaned['date_renewal'])
mean_col = 0
for column in train_cleaned:
try:
mean_col = train_cleaned[column].mean()
train_cleaned[column] = train_cleaned[column].fillna(mean_col)
except:
continue
klib.missingval_plot(train_cleaned)
Shape of cleaned data: (16096, 31)Remaining NAs: 77537
Changes:
Dropped rows: 0
of which 0 duplicates. (Rows: [])
Dropped columns: 1
of which 0 single valued. Columns: []
Dropped missing values: 16096
Reduced memory by at least: 1.92 MB (-48.85%)
Shape of cleaned data: (193002, 8)Remaining NAs: 8154
Changes:
Dropped rows: 0
of which 0 duplicates. (Rows: [])
Dropped columns: 0
of which 0 single valued. Columns: []
Dropped missing values: 0
Reduced memory by at least: 4.23 MB (-35.91%)
train_cleaned = train_cleaned.merge(train_out,how='inner',on='id')
train_cleaned.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16096 entries, 0 to 16095
Data columns (total 32 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 16096 non-null object
1 activity_new 6551 non-null category
2 channel_sales 11878 non-null category
3 cons_12m 16096 non-null int32
4 cons_gas_12m 16096 non-null int32
5 cons_last_month 16096 non-null int32
6 date_activ 16096 non-null string
7 date_end 16094 non-null category
8 date_first_activ 16096 non-null datetime64[ns]
9 date_modif_prod 16096 non-null datetime64[ns]
10 date_renewal 16056 non-null category
11 forecast_base_bill_ele 16096 non-null float32
12 forecast_base_bill_year 16096 non-null float32
13 forecast_bill_12m 16096 non-null float32
14 forecast_cons 16096 non-null float32
15 forecast_cons_12m 16096 non-null float32
16 forecast_cons_year 16096 non-null int32
17 forecast_discount_energy 16096 non-null float32
18 forecast_meter_rent_12m 16096 non-null float32
19 forecast_price_energy_p1 16096 non-null float32
20 forecast_price_energy_p2 16096 non-null float32
21 forecast_price_pow_p1 16096 non-null float32
22 has_gas 16096 non-null category
23 imp_cons 16096 non-null float32
24 margin_gross_pow_ele 16096 non-null float32
25 margin_net_pow_ele 16096 non-null float32
26 nb_prod_act 16096 non-null int8
27 net_margin 16096 non-null float32
28 num_years_antig 16096 non-null int8
29 origin_up 16009 non-null category
30 pow_max 16096 non-null float32
31 churn 16096 non-null int64
dtypes: category(6), datetime64[ns](2), float32(15), int32(4), int64(1), int8(2), object(1), string(1)
memory usage: 2.1+ MB
train_full = train_cleaned.merge(train_hist_cleaned,how='inner',on='id')
train_full.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 193002 entries, 0 to 193001
Data columns (total 39 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 193002 non-null string
1 activity_new 78570 non-null category
2 channel_sales 142407 non-null category
3 cons_12m 193002 non-null int32
4 cons_gas_12m 193002 non-null int32
5 cons_last_month 193002 non-null int32
6 date_activ 193002 non-null string
7 date_end 192981 non-null category
8 date_first_activ 193002 non-null datetime64[ns]
9 date_modif_prod 193002 non-null datetime64[ns]
10 date_renewal 192525 non-null category
11 forecast_base_bill_ele 193002 non-null float32
12 forecast_base_bill_year 193002 non-null float32
13 forecast_bill_12m 193002 non-null float32
14 forecast_cons 193002 non-null float32
15 forecast_cons_12m 193002 non-null float32
16 forecast_cons_year 193002 non-null int32
17 forecast_discount_energy 193002 non-null float32
18 forecast_meter_rent_12m 193002 non-null float32
19 forecast_price_energy_p1 193002 non-null float32
20 forecast_price_energy_p2 193002 non-null float32
21 forecast_price_pow_p1 193002 non-null float32
22 has_gas 193002 non-null category
23 imp_cons 193002 non-null float32
24 margin_gross_pow_ele 193002 non-null float32
25 margin_net_pow_ele 193002 non-null float32
26 nb_prod_act 193002 non-null int8
27 net_margin 193002 non-null float32
28 num_years_antig 193002 non-null int8
29 origin_up 191960 non-null category
30 pow_max 193002 non-null float32
31 churn 193002 non-null int64
32 price_date 193002 non-null category
33 price_p1_var 191643 non-null float32
34 price_p2_var 191643 non-null float32
35 price_p3_var 191643 non-null float32
36 price_p1_fix 191643 non-null float32
37 price_p2_fix 191643 non-null float32
38 price_p3_fix 191643 non-null float32
dtypes: category(7), datetime64[ns](2), float32(21), int32(4), int64(1), int8(2), string(2)
memory usage: 29.5 MB
klib.corr_plot(train_full,annot=False)