import pandas as pd, numpy as np
from sklearn.preprocessing import OneHotEncoder
# Load the survey data. The path is handed straight to pandas so that the
# requested UTF-8 encoding is really applied: a handle opened in text mode
# without an explicit encoding is decoded with the platform default instead,
# which can garble the accented answers matched later in the script.
df = pd.read_csv("Hadopi.csv", sep=",", encoding="UTF-8")

# Load the replacement column names; only the header (colnames.columns) is
# used below. The file is still opened with the platform default encoding,
# as before, and the context manager closes it even if parsing fails.
with open("COLS.csv", "r") as cols_file:
    colnames = pd.read_csv(cols_file, sep=",")

# Rename the data columns positionally: the i-th column of df takes the i-th
# name from COLS.csv. Having more names than data columns is an error (it
# raised IndexError in the index-based loop this replaces).
if len(colnames.columns) > len(df.columns):
    raise IndexError(
        "COLS.csv provides %d column names but Hadopi.csv only has %d columns"
        % (len(colnames.columns), len(df.columns))
    )
rename = dict(zip(df.columns, colnames.columns))
df.rename(columns=rename, inplace=True)
# Columns removed up front: the ones judged useless for the analysis and the
# ones that contain too many nulls.
columns_to_discard = [
    "SEXE_1",
    "TRANCHE_AGE_2",
    "SEXE_TRANCHE_AGE",
    "REG(UDA)",
    "REG2(BUDA)",
    "REGION(13C)",
    "PROVINCE",
    "CAT_SOCIOPRO(pcf4)",
    "FREQ_UTILISATION_APP_INTERNET(QRS1)",
    "TYPE_CONSO_AUTRES",
    "TYPE_CONSO_AUCUN",
    "ALEA",
    "DISPQUAL",
    "SYS_BLOCK_SET_1",
    "poids",
    "APPAREIL_MUSIQUE_FAVORI",
    "GOUT_MUSICAUX_DEFINITION_1",
    "GOUT_MUSICAUX_DEFINITION_2",
    "DECOUVERTE_SUR_SERVICE_PRINCIPALE",
    "ACCORD_DECOUVERTE_DECOUVERTE_FACILE_EN_LIGNE",
    "ACCORD_DECOUVERTE_TROP_DE_CHOIX",
    "ACCORD_DECOUVERTE_ECOUTE_MEME_MORCEAUX",
    "ACCORD_DECOUVERTE_LASSE_RAPIDEMENT_DES_MORCEAUX",
    "ACCORD_DECOUVERTE_AIME_SUIVRE_ARTISTES_SUR_RES_SOC",
    "ACCORD_DECOUVERTE_TOUT_SERVICES_STREAMING_SONT_EQUIVALENTS",
    "COMPTE_YOUTUBE",
    "YOUTUBE_ECOUTE_DESCRIPTION_1",
    "YOUTUBE_ECOUTE_DESCRIPTION_2",
    "PROPORTION_YOUTUBE_SANS_ECRAN",
    "PROPORTION_YOUTUBE_AVEC_ECRAN",
    "RAISON_POUR_ABONNEMENT_RECOMMANDATION_PROCHE",
    "RAISON_POUR_ABONNEMENT_CONTENU_UNIQUE",
    "RAISON_POUR_ABONNEMENT_OFFRE",
    "RAISON_POUR_ABONNEMENT_UX",
    "RAISON_POUR_ABONNEMENT_IMPRESSION",
    "RAISON_POUR_ABONNEMENT_QUALITE_SON",
    "RAISON_POUR_ABONNEMENT_CATALOGUE",
    "RAISON_POUR_ABONNEMENT_OFFRE_PROMOTIONNELLE",
    "RAISON_POUR_ABONNEMENT_ADAPTEE_APPAREIL",
    "RAISON_POUR_ABONNEMENT_REMUNERATION_ARTISTES",
    "RAISON_POUR_ABONNEMENT_PAS_DE_CHOIX",
    "RAISON_POUR_ABONNEMENT_AUTRES",
    "BUDGET_MUSIQUE_CONCERT_FESTIVAL",
    "BUDGET_MUSIQUE_CD_VINYLE",
    "BUDGET_MUSIQUE_STREAMING",
    "BUDGET_MUSIQUE_TELECHARGEMENT",
    "ACCORD_DECOUVERTE_ON_SE_LASSE_RAPIDEMENT_DES_MORCEAUX",
    "ACCORD_DECOUVERTE_ON_ECOUTE_MEME_MORCEAUX",
    "ACCORD_DECOUVERTE_ON_AIME_SUIVRE_ARTISTES_SUR_RES_SOC",
    "DUREE_CONSO_MUSIQUE_VIDEO_CLIP",
    "MOYEN_ILLEGAL_NOTE_CONTENU",
    "MOYEN_ILLEGAL_NOTE_SON",
    "MOYEN_ILLEGAL_NOTE_UX",
    "MOYEN_ILLEGAL_NOTE_DIVERSITE_CATALOGUE",
    "MOYEN_ILLEGAL_NOTE_ACTUALISATION_CATALOGUE",
    "MOYEN_ILLEGAL_NOTE_FACILITE_DECOUVERTE",
    "AVIS_CONSOMATION_MUSIQUE",
    "CONSOMATION_ILLEGALE_PASSE",
]
# Remove them from df in place; a missing name raises KeyError (pandas default).
df.drop(columns=columns_to_discard, inplace=True)
# Drop the rows whose INTENTION_ABONNEMENT_STREAMING holds the "#NULL!" marker
# (214 records dropped, per the original note). The explicit copy makes df an
# independent frame, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
df = df[df["INTENTION_ABONNEMENT_STREAMING"] != "#NULL!"].copy()

# Replace the remaining "#NULL!" markers column by column.
for col in df:
    # Numeric and boolean columns cannot hold the string markers tested below,
    # so they are left untouched.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue

    # Distinct values of the column, computed once for all the tests below.
    present = set(df[col].unique())

    # Columns that use '0' as an answer (except NOTE/BUDGET columns) become
    # booleans: "#NULL!" counts as '0' (False), any other value is True.
    if "0" in present and "NOTE" not in col and "BUDGET" not in col:
        df[col] = ~df[col].isin(["0", "#NULL!"])
        # The column is boolean now; the string replacements do not apply.
        continue

    # Otherwise "#NULL!" is replaced by the column's own "negative" answer.
    # The first candidate present in the column wins (Jamais, then Aucun,
    # then Non); once applied, no "#NULL!" is left for the later candidates.
    for fallback in ("Jamais", "Aucun", "Non"):
        if fallback in present:
            df[col] = np.where(df[col] == "#NULL!", fallback, df[col])
            break
# Build the column to predict, CONSO_ILLEGALE, from the illegal-consumption
# means/reasons columns and the per-content-type consumption columns.
moyen_conso_illegal = [
    "MOYEN_CONSOMATION_ILLEGAL_STREAMING",
    "MOYEN_CONSOMATION_ILLEGAL_RES_SOC",
    "MOYEN_CONSOMATION_ILLEGAL_PEERTOPEER",
    "MOYEN_CONSOMATION_ILLEGAL_TELECHARGEMENT",
    "MOYEN_CONSOMATION_ILLEGAL_BOITIER_TV",
    "MOYEN_CONSOMATION_ILLEGAL_CLOUD",
    "MOYEN_CONSOMATION_ILLEGAL_EXTERNAL_DRIVE",
    "MOYEN_CONSOMATION_ILLEGAL_AUTRES",
    "RAISON_CONSOMATION_ILLEGALE_PRIX_ELEVE",
    "RAISON_CONSOMATION_ILLEGALE_STOCKAGE_LOCAL",
    "RAISON_CONSOMATION_ILLEGALE_OFFRE_PLUS_LARGE",
    "RAISON_CONSOMATION_ILLEGALE_DECOUVERTE_PLUS_FACILE",
    "RAISON_CONSOMATION_ILLEGALE_HABITUDE",
    "RAISON_CONSOMATION_ILLEGALE_GRATUITE",
    "RAISON_CONSOMATION_ILLEGALE_AUTRES",
]
type_conso_illegal = [
    "TYPE_CONSO_MUSIQUE_VIDEO_CLIP",
    "TYPE_CONSO_FILMS",
    "TYPE_CONSO_SERIESTV",
    "TYPE_CONSO_PHOTOS",
    "TYPE_CONSO_JEUXVIDEOS",
    "TYPE_CONSO_LIVRES",
    "TYPE_CONSO_LOGICIELS",
    "TYPE_CONSO_PRESSE",
    "TYPE_CONSO_SPORTS_EN_DIRECT",
]
# Answers that admit at least some illegal consumption.
values_conso_illegal = [
    "Généralement de façon légale même s'il peut m'arriver de le faire de manière illégale",
    "Autant de manière légale qu'illégale",
    "Uniquement de manière illégale",
    "Généralement de façon illégale même s'il peut m'arriver de le faire de manière légale",
]

# A row is flagged when any of the source columns holds True or one of the
# answers listed above.
target_sources = moyen_conso_illegal + type_conso_illegal
illegal_markers = [True, *values_conso_illegal]
flagged_cells = df[target_sources].isin(illegal_markers)
df["CONSO_ILLEGALE"] = flagged_cells.any(axis=1)

# The source columns are folded into the target, so remove them.
df.drop(columns=target_sources, inplace=True)

# Renumber the rows 0..n-1 after the earlier row removals.
df.reset_index(drop=True, inplace=True)
# One-hot encode every column that still holds string (object dtype) values.
enc = OneHotEncoder()
object_cols = [c for c in df.columns if df[c].dtypes == "object"]

encoded_parts = []
for c in object_cols:
    # Encode the x distinct values of the column as x indicator columns.
    encoded = enc.fit_transform(df[[c]]).toarray()
    # Name each indicator after the category the encoder actually assigned to
    # it. enc.categories_ is the encoder's own ordering; a separately sorted
    # list (e.g. case-insensitive) may order the values differently and so
    # attach labels to the wrong indicator columns.
    labels = [c + "_" + str(category) for category in enc.categories_[0]]
    # Reuse df's index so the indicators line up row by row with df.
    encoded_parts.append(pd.DataFrame(encoded, columns=labels, index=df.index))

# Swap the original string columns for their indicator columns in one go:
# untouched columns keep their order and the indicators are appended after
# them, grouped by source column.
df = pd.concat([df.drop(columns=object_cols)] + encoded_parts, axis=1)
# Report the size of the prepared dataset: number of rows, then number of columns.
row_count, column_count = df.shape
print(row_count)
print(column_count)
import seaborn as sns
# Draw the pairwise correlation matrix of df as a heatmap; the figure size is
# set to 50x50 through seaborn's rc settings.
heatmap_settings = {"figure.figsize": (50, 50)}
sns.set(rc=heatmap_settings)
correlations = df.corr()
sns.heatmap(correlations)
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# plot_confusion_matrix was removed in scikit-learn 1.2. When it is missing,
# bind the same name to its replacement, which takes the same
# (estimator, X, y) arguments, so the calls below work on old and new releases.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay

    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator

# Features are every column except the CONSO_ILLEGALE target.
X = df.drop(columns=["CONSO_ILLEGALE"])
y = df["CONSO_ILLEGALE"]

# Hold out 25% of the rows for scoring. The split is seeded like the
# estimators below so the printed scores are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# NOTE(review): every confusion matrix below is computed on the training
# split, while the printed scores use the held-out split — confirm this is
# intended.

# --- Decision tree ---
dtc = DecisionTreeClassifier(max_depth=4, random_state=0)
dtc.fit(X_train, y_train)
print("Score de l'arbre de décision :", dtc.score(X_test, y_test))
plt.figure(figsize=(35, 10))
tree.plot_tree(
    dtc,
    class_names=["legal", "illegal"],
    filled=True,
    # A plain list rather than a pandas Index: some scikit-learn releases
    # validate this parameter as a list.
    feature_names=list(X.columns),
    fontsize=11,
)
# Smaller default figure size for the confusion-matrix plots.
sns.set(rc={"figure.figsize": (5, 5)})
plot_confusion_matrix(dtc, X_train, y_train)
plt.show()

# --- Random forest ---
rfc = RandomForestClassifier(max_depth=5, random_state=0)
rfc.fit(X_train, y_train)
print("Score de la random forest :", rfc.score(X_test, y_test))
plot_confusion_matrix(rfc, X_train, y_train)
plt.show()

# --- k-nearest neighbours ---
knc = KNeighborsClassifier(n_neighbors=5)
knc.fit(X_train, y_train)
print("Score du k voisin le plus proche :", knc.score(X_test, y_test))
plot_confusion_matrix(knc, X_train, y_train)
plt.show()