client_data.csv
● id = client company identifier
● activity_new = category of the company’s activity
● channel_sales = code of the sales channel
● cons_12m = electricity consumption of the past 12 months
● cons_gas_12m = gas consumption of the past 12 months
● cons_last_month = electricity consumption of the last month
● date_activ = date of activation of the contract
● date_end = registered date of the end of the contract
● date_modif_prod = date of the last modification of the product
● date_renewal = date of the next contract renewal
● forecast_cons_12m = forecasted electricity consumption for next 12 months
● forecast_cons_year = forecasted electricity consumption for the next calendar year
● forecast_discount_energy = forecasted value of current discount
● forecast_meter_rent_12m = forecasted bill of meter rental for the next 2 months
● forecast_price_energy_off_peak = forecasted energy price for 1st period (off peak)
● forecast_price_energy_peak = forecasted energy price for 2nd period (peak)
● forecast_price_pow_off_peak = forecasted power price for 1st period (off peak)
● has_gas = indicated if client is also a gas client
● imp_cons = current paid consumption
● margin_gross_pow_ele = gross margin on power subscription
● margin_net_pow_ele = net margin on power subscription
● nb_prod_act = number of active products and services
● net_margin = total net margin
● num_years_antig = antiquity of the client (in number of years)
● origin_up = code of the electricity campaign the customer first subscribed to
● pow_max = subscribed power
● churn = has the client churned over the next 3 months
price_data.csv
● id = client company identifier
● price_date = reference date
● price_off_peak_var = price of energy for the 1st period (off peak)
● price_peak_var = price of energy for the 2nd period (peak)
● price_mid_peak_var = price of energy for the 3rd period (mid peak)
● price_off_peak_fix = price of power for the 1st period (off peak)
● price_peak_fix = price of power for the 2nd period (peak)
● price_mid_peak_fix = price of power for the 3rd period (mid peak)
Note: some fields are hashed text strings. This preserves the privacy of the original data but the commercial meaning is retained and so they may have predictive power
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from scipy.stats import variation
from statistics import stdev
from statistics import mean
# Importing the 2 csv files
client_df = pd.read_csv("client_data.csv")
price_df = pd.read_csv("price_data.csv")
client_df.shape
client_df.dtypes
print("Each row in Client data is of a unique company:", client_df.id.nunique() == client_df.shape[0])
client_df.channel_sales.value_counts()
client_df['origin_up'].value_counts()
client_df.hist(bins=50, figsize=(20,15))
plt.show()
def coef_variation(data):
num_client_data = data.select_dtypes(include=['int64','float64'])
coef_var = []
for column in num_client_data.columns:
c = variation(num_client_data[column],axis=0)
coef_var.append(c)
coef_var = pd.DataFrame(coef_var, index=num_client_data.columns)
coef_var = coef_var.transpose()
return coef_var
coef_variation(client_df)
client_df.describe()
client_df['date_activ'] = pd.to_datetime(client_df['date_activ'])
client_df['date_end'] = pd.to_datetime(client_df['date_end'])
client_df['date_modif_prod'] = pd.to_datetime(client_df['date_modif_prod'])
client_df['date_renewal'] = pd.to_datetime(client_df['date_renewal'])
date_df = client_df.iloc[:,5:9]
def year_from_date(data):
for column in data.columns:
data[column+'_year'] = data[column].dt.year
return data.head()
# Creating year columns for the existing columns
year_from_date(date_df)
date_df.drop(date_df[['date_activ','date_end','date_modif_prod','date_renewal']],axis=1, inplace=True)
date_df.hist(bins=50, figsize=(12,8))
plt.show()
x = [(client_df.has_gas.value_counts(normalize=True)[1]*100), (client_df.has_gas.value_counts(normalize=True)[0]*100)]
labels = 'has gas', 'no gas'
sizes = x
explode = (0, 0.1) # only "explode" the 2nd slice (i.e. 'Hogs')
fig1, ax1 = plt.subplots(figsize=(10,5))
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
x = [(client_df.churn.value_counts()[0]), (client_df.churn.value_counts()[1])]
labels = 'Not Churn', 'Churn'
sizes = x
explode = (0, 0.1) # only "explode" the 2nd slice (i.e. 'Hogs')
fig1, ax1 = plt.subplots(figsize=(10,5))
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
price_df.id.value_counts()
id_below_12 = []
for i in range(0,price_df.id.nunique()):
if (price_df.id.value_counts().values[i] < 12):
x = price_df.id.value_counts().index[i]
id_below_12.append(x)
client_df[client_df.id.isin(id_below_12) == True].churn
price_date = reference date
price_df.dtypes
price_df['price_date'] = pd.to_datetime(price_df['price_date'])
price_df['price_date'].dt.year.value_counts()
variance_coef = coef_variation(price_df)
(variance_coef)
price_df.describe()
price_df.hist(bins=20, figsize=(20,15))
plt.show()
# Using the code below, a new dataframe that has the price sensitivity data was created and saved as dif_in_price
"""
energy_off_peak = []
for i in (price_df['id'].unique()):
x = max(price_df[price_df['id'] == i]['price_off_peak_var'])
y = min(price_df[price_df['id'] == i]['price_off_peak_var'])
c = x-y
energy_off_peak.append(c)
energy_peak = []
for i in (price_df['id'].unique()):
x = max(price_df[price_df['id'] == i]['price_peak_var'])
y = min(price_df[price_df['id'] == i]['price_peak_var'])
c = x-y
energy_peak.append(c)
energy_mid_peak = []
for i in (price_df['id'].unique()):
x = max(price_df[price_df['id'] == i]['price_mid_peak_var'])
y = min(price_df[price_df['id'] == i]['price_mid_peak_var'])
c = x-y
energy_mid_peak.append(c)
power_off_peak = []
for i in (price_df['id'].unique()):
x = max(price_df[price_df['id'] == i]['price_off_peak_fix'])
y = min(price_df[price_df['id'] == i]['price_off_peak_fix'])
c = x-y
power_off_peak.append(c)
power_peak = []
for i in (price_df['id'].unique()):
x = max(price_df[price_df['id'] == i]['price_peak_fix'])
y = min(price_df[price_df['id'] == i]['price_peak_fix'])
c = x-y
power_peak.append(c)
power_mid_peak = []
for i in (price_df['id'].unique()):
x = max(price_df[price_df['id'] == i]['price_off_peak_fix'])
y = min(price_df[price_df['id'] == i]['price_off_peak_fix'])
c = x-y
power_mid_peak.append(c)
"""
# dif_in_price = pd.DataFrame({'energy_off_peak':energy_off_peak, 'energy_peak':energy_peak, 'energy_mid_peak':energy_mid_peak, 'power_off_peak':power_off_peak, 'power_peak':power_peak, 'power_mid_peak':power_mid_peak })
dif_price = pd.read_csv('dif_in_price.csv')
dif_price['id'] = price_df.id.unique()
dif_price.head()
client_info = client_df[['id','churn']]
new_df = pd.merge(dif_price, client_info)
new_df.head()
new_df.churn.value_counts(normalize=True).values
labels = 'churn', 'not churn'
sizes = new_df.churn.value_counts(normalize=True).values
explode = (0, 0.1) # only "explode" the 2nd slice (i.e. 'Hogs')
fig1, ax1 = plt.subplots(figsize=(10,5))
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
new_df[new_df['churn'] == 1].describe()
new_df[new_df['churn'] == 0].describe()
churn = []
not_churn = []
for column in (new_df.columns[0:6].tolist()):
x = mean(new_df[new_df['churn'] == 1][column])
y = mean(new_df[new_df['churn'] == 0][column])
churn.append(x)
not_churn.append(y)
churn[0:3]
(new_df.columns[0:6].tolist())
labels = ['energy_off_peak', 'energy_peak', 'energy_mid_peak']
x = np.arange(len(labels)) # the label locations
width = 0.35 # the width of the bars
fig, ax = plt.subplots(figsize = (10,5))
rects1 = ax.bar(x - width/2, churn[0:3], width, label='Churn')
rects2 = ax.bar(x + width/2, not_churn[0:3], width, label='Not Churn')
# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_title('mean of energy price sensitivity')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
fig.tight_layout()
plt.show()
# plt.savefig('mean_energy_pricesens.jpg')
labels = ['power_off_peak', 'power_peak', 'power_mid_peak']
x = np.arange(len(labels)) # the label locations
width = 0.35 # the width of the bars
fig, ax = plt.subplots(figsize = (10,5))
rects1 = ax.bar(x - width/2, churn[3:6], width, label='Churn')
rects2 = ax.bar(x + width/2, not_churn[3:6], width, label='Not Churn')
# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_title('mean of power price sensitivity')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
fig.tight_layout()
plt.show()
# plt.savefig("mean_power_pricesens.jpg")