Casusbeschrijving
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import scipy.stats as stats
import plotly.express as px
#import graphviz
import pylab as pl
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score,mean_squared_error, r2_score, classification_report, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits import mplot3d
# Load the primary IMDb movie dataset.
movie_df = pd.read_csv('movie.csv')
movie_df.head(1)
# Load the external movie dataset; this file is semicolon-separated.
extern_movies_df = pd.read_csv('movies_extern.csv', sep = ';')
extern_movies_df.head(1)
# Exploration: missing-value counts and the distinct values of a few columns.
movie_df.isnull().sum()
movie_df['language'].unique()
movie_df['country'].unique()
movie_df['facenumber_in_poster'].sort_values().unique()
# Restrict the dataset to the columns used in the analyses below.
movie_df = movie_df[
[
'imdb_score', 'gross', 'genres', 'movie_title',
'cast_total_facebook_likes', 'facenumber_in_poster',
'country', 'title_year', 'budget', 'movie_facebook_likes',
'director_facebook_likes', 'duration', 'content_rating', 'language', 'director_name'
]
]
movie_df.head(1)
# Strip non-breaking spaces, newlines, tabs and double quotes from all string cells.
movie_df = movie_df.replace(['\xa0', '\n', '\t','"'], '', regex = True)
movie_df.movie_title = movie_df.movie_title.astype("string")
extern_movies_df.rename({'title': 'movie_title'}, axis=1, inplace = True) # Rename the column so it matches our original dataset
extern_movies_df.movie_title = extern_movies_df.movie_title.astype("string")
# Inner-join the external data on movie title; the shared 'budget' column is
# disambiguated with the suffixes _original / _extern.
movie_df = pd.merge(movie_df, extern_movies_df[['movie_title','revenue', 'budget', 'vote_average', 'vote_count', 'popularity']], how='inner', on='movie_title', suffixes=('_original', '_extern'))
# Keep one row per title (the merge can duplicate titles).
movie_df.drop_duplicates(subset = 'movie_title', inplace = True)
movie_df.dtypes
# One-hot encode the pipe-separated genres column into indicator columns.
dummies_genres = movie_df.genres.str.get_dummies()
movie_df = movie_df.join(dummies_genres)
movie_df.head(1)
movie_df.facenumber_in_poster.isnull().sum()
movie_df.country = movie_df.country.astype('category')
movie_df.title_year = movie_df.title_year.astype(int)
movie_df.budget_original.isnull().sum()
movie_df.director_facebook_likes = movie_df.director_facebook_likes.astype(int)
# For titles released after 2006, treat 0 Facebook likes as missing.
# NOTE(review): assigning the filtered Series back also turns every pre-2007
# row into NaN for these columns — presumably intentional (Facebook data is
# only meaningful for recent titles), but confirm.
movie_df.movie_facebook_likes = movie_df[movie_df['title_year'] > 2006].movie_facebook_likes.replace(0,np.nan)
movie_df.cast_total_facebook_likes = movie_df[movie_df['title_year'] > 2006].cast_total_facebook_likes.replace(0,np.nan)
movie_df.director_facebook_likes = movie_df[movie_df['title_year'] > 2006].director_facebook_likes.replace(0,np.nan)
movie_df.duration.isna().sum()
movie_df.content_rating = movie_df.content_rating.astype('category')
movie_df.language = movie_df.language.astype('category')
# Inspect the vote_count distribution before outlier handling.
movie_df.vote_count.plot.box()
print("Voor het verwijderen van de outliers")
print(movie_df.vote_count.describe())
q1 = movie_df.vote_count.quantile(0.25)
q3 = movie_df.vote_count.quantile(0.75)
iqr = q3 - q1
# Weak (mild) outliers: between 1.5*IQR and 3*IQR outside the quartiles.
# Bugfix: the upper fences must be anchored at q3, not q1.
zwakke_outliers = movie_df[((movie_df['vote_count'] < q1 - 1.5 * iqr) & (movie_df['vote_count'] > q1 - 3 * iqr)) | ((movie_df['vote_count'] > q3 + 1.5 * iqr) & (movie_df['vote_count'] < q3 + 3 * iqr)) ]
# Strong (extreme) outliers: more than 3*IQR outside the quartiles.
# Note: this blanks the whole matching row (all columns become NaN) rather
# than dropping it; later dropna calls remove the rows.
movie_df[(movie_df['vote_count'] < q1 - 3 * iqr) | (movie_df['vote_count'] > q3 + 3 * iqr)] = np.nan
print("\nNa het verwijderen van de outliers")
print(movie_df.vote_count.describe())
movie_df.vote_count.plot.box()
# Convert popularity to float once with pd.to_numeric and drop non-numeric
# rows. This replaces the previous filter + redundant dropna() + astype(float)
# chain, which converted the column twice; the end state is identical.
movie_df['popularity'] = pd.to_numeric(movie_df['popularity'], errors='coerce')
movie_df = movie_df[movie_df['popularity'].notnull()]
movie_df.popularity.describe()
movie_df.dtypes
# Research dataset 1: financial and popularity columns.
res_1_df = movie_df[
[
'movie_title', 'imdb_score', 'gross', 'budget_original', 'budget_extern', 'movie_facebook_likes',
'cast_total_facebook_likes', 'director_facebook_likes', 'title_year', 'vote_average', 'vote_count', 'popularity', 'revenue'
]
].copy()
res_1_df.describe()
# Require the core financial/score columns; the Facebook-likes columns may be
# partially missing as long as at least one of the three is present.
res_1_df.dropna(subset=['gross', 'revenue', 'budget_original', 'budget_extern', 'imdb_score', 'vote_average', 'vote_count', 'popularity'], inplace=True)
res_1_df.dropna(subset=['movie_facebook_likes', 'director_facebook_likes', 'cast_total_facebook_likes'], inplace=True, how='all')
res_1_df['movie_facebook_likes'].plot.hist(bins=20).set_title('frequency van de movie facebook likes')
# Remaining missing likes count as 0 so the three columns can be summed.
res_1_df[['movie_facebook_likes', 'cast_total_facebook_likes', 'director_facebook_likes']] = res_1_df[['movie_facebook_likes', 'cast_total_facebook_likes', 'director_facebook_likes']].fillna(0)
res_1_df['total_facebook_likes'] = res_1_df['movie_facebook_likes'] + res_1_df['cast_total_facebook_likes'] + res_1_df['director_facebook_likes']
# NOTE(review): like the earlier filtered assignments, this also NaN's all
# pre-2007 rows, which the next dropna then removes — confirm that is intended.
res_1_df.total_facebook_likes = res_1_df[res_1_df['title_year'] > 2006].total_facebook_likes.replace(0,np.nan)
res_1_df.dropna(subset=['total_facebook_likes'],inplace=True)
res_1_df.total_facebook_likes = res_1_df.total_facebook_likes.astype(int)
# Distribution of imdb_score: histogram and boxplot side by side.
fig, axes = plt.subplots(nrows=1,ncols=2,figsize=(12,4))
res_1_df[['imdb_score']].plot.hist(bins=15, ax=axes[0], subplots=True)
res_1_df[['imdb_score']].plot.box(ax=axes[1], subplots=True)
# Relation between score, total likes and gross (point size/colour = gross).
sns.scatterplot(x='imdb_score', y='total_facebook_likes', size='gross', hue='gross', data=res_1_df[['imdb_score', 'total_facebook_likes', 'gross']])
# Correlation matrix of the numeric research columns.
# numeric_only=True keeps non-numeric columns (movie_title is a string dtype)
# out of the computation; pandas >= 2.0 raises without it, while older
# versions silently dropped those columns — behaviour is unchanged there.
corrMatrix2 = res_1_df.corr(numeric_only=True)
fig, axes = plt.subplots(figsize=(10,10))
sns.heatmap(corrMatrix2, annot=True,ax=axes)
# Scatterplots of gross against each candidate predictor.
fig, axes = plt.subplots(2,2,figsize=(20,10))
sns.scatterplot(x='imdb_score', y='gross', hue='gross', data=res_1_df[['imdb_score', 'gross']],ax=axes[0][0])
axes[0][0].set_title('Scatterplot van imbd score en gross')
sns.scatterplot(x='movie_facebook_likes', y='gross', hue='gross', data=res_1_df[['movie_facebook_likes', 'gross']],ax=axes[0][1])
axes[0][1].set_title('Scatterplot van de films FB likes en gross')
sns.scatterplot(x='total_facebook_likes', y='gross', hue='gross', data=res_1_df[['total_facebook_likes', 'gross']], ax=axes[1][0])
axes[1][0].set_title('Scatterplot van alle FB likes en gross')
# Research question 1A: predict gross from imdb_score + total_facebook_likes.
x_research_1_A = res_1_df[['imdb_score', 'total_facebook_likes']]
y_research_1_A = res_1_df[['gross']]
x_research_1_A_train, x_research_1_A_test, y_research_1_A_train, y_research_1_A_test = train_test_split(x_research_1_A, y_research_1_A, random_state=0)
# Research question 1B: predict revenue from budget_extern + vote_count.
x_research_1_B = res_1_df[['budget_extern', 'vote_count']]
y_research_1_B = res_1_df[['revenue']]
x_research_1_B_train, x_research_1_B_test, y_research_1_B_train, y_research_1_B_test = train_test_split(x_research_1_B, y_research_1_B, random_state=0)
# Bugfix: the total must be the number of rows that was actually split
# (res_1_df), not the full movie_df.
print(f'Totaal A: {len(res_1_df)}, 75%: {len(x_research_1_A_train)}, 25%: {len(y_research_1_A_test)}')
print(f'Totaal B: {len(res_1_df)}, 75%: {len(x_research_1_B_train)}, 25%: {len(y_research_1_B_test)}')
# Baseline for 1A: always predict the mean gross of the training set.
y_baseline_1_A_means = pd.DataFrame({'means' : np.repeat(y_research_1_A_train.gross.mean(), y_research_1_A_train.shape[0])})
# NOTE(review): the baseline RMSE is computed on the *training* targets,
# while the models below are scored on the test set — confirm comparability.
baseline_1_A_RMSE_score = np.sqrt(mean_squared_error(y_research_1_A_train, y_baseline_1_A_means))
baseline_1_A_RMSE_score
# Baseline for 1B: always predict the mean revenue of the training set.
y_baseline_1_B_means = pd.DataFrame({'means' : np.repeat(y_research_1_B_train.revenue.mean(), y_research_1_B_train.shape[0])})
baseline_1_B_RMSE_score = np.sqrt(mean_squared_error(y_research_1_B_train, y_baseline_1_B_means))
baseline_1_B_RMSE_score
print(x_research_1_A_train.shape)
print(y_research_1_A_test.shape)
# Linear regression for question 1A (gross ~ imdb_score + total likes).
linreg_research_1_A = LinearRegression()
linreg_research_1_A.fit(x_research_1_A_train, y_research_1_A_train)
y_research_1_A_pred = linreg_research_1_A.predict(x_research_1_A_test)
# Evaluate on the held-out test set: RMSE and R².
linreg_1_A_RMSE_score = np.sqrt(mean_squared_error(y_research_1_A_test, y_research_1_A_pred))
linreg_1_A_RMSE_score
linreg_1_A_r2_score = r2_score(y_research_1_A_test, y_research_1_A_pred)
linreg_1_A_r2_score
# Linear regression for question 1B (revenue ~ budget_extern + vote_count).
linreg_research_1_B = LinearRegression()
linreg_research_1_B.fit(x_research_1_B_train, y_research_1_B_train)
y_research_1_B_pred = linreg_research_1_B.predict(x_research_1_B_test)
linreg_1_B_RMSE_score = np.sqrt(mean_squared_error(y_research_1_B_test, y_research_1_B_pred))
linreg_1_B_RMSE_score
linreg_1_B_r2_score = r2_score(y_research_1_B_test, y_research_1_B_pred)
linreg_1_B_r2_score
# Ridge regression (L2-regularised, default alpha=1.0) for question 1A.
ridge_research_1_A = Ridge()
ridge_research_1_A.fit(x_research_1_A_train, y_research_1_A_train)
y_ridge_research_1_A_pred = ridge_research_1_A.predict(x_research_1_A_test)
ridge_1_A_RMSE_score = np.sqrt(mean_squared_error(y_research_1_A_test, y_ridge_research_1_A_pred))
ridge_1_A_RMSE_score
ridge_1_A_r2_score = r2_score(y_research_1_A_test, y_ridge_research_1_A_pred)
ridge_1_A_r2_score
# Ridge regression for question 1B, evaluated the same way.
ridge_research_1_B = Ridge()
ridge_research_1_B.fit(x_research_1_B_train, y_research_1_B_train)
y_ridge_research_1_B_pred = ridge_research_1_B.predict(x_research_1_B_test)
ridge_1_B_RMSE_score = np.sqrt(mean_squared_error(y_research_1_B_test, y_ridge_research_1_B_pred))
ridge_1_B_RMSE_score
ridge_1_B_r2_score = r2_score(y_research_1_B_test, y_ridge_research_1_B_pred)
ridge_1_B_r2_score
# Interactive 3D scatterplots of each feature pair against its target.
fig = px.scatter_3d(res_1_df, x='imdb_score', y='total_facebook_likes', z='gross')
fig.show()
fig = px.scatter_3d(res_1_df, x='budget_extern', y='vote_count', z='revenue')
fig.show()
# Summary of research question 1: compare each model's RMSE against the
# standard deviation of its target and the baseline.
# Bugfix: every label spelled "RSME" instead of RMSE.
print(f"σ van gross : {res_1_df.gross.std().round()}")
print(f"RMSE van Baseline A: {baseline_1_A_RMSE_score.round()}")
print(f"RMSE van Lineare Regressie A: {linreg_1_A_RMSE_score.round()}")
print(f"R2 van Lineare Regressie A: {linreg_1_A_r2_score.round(2)}")
print('\n')
print(f"RMSE van Ridge Regressie A: {ridge_1_A_RMSE_score.round()}")
print(f"R2 van Ridge Regressie A: {ridge_1_A_r2_score.round(2)}")
print('\n')
print('\n')
print(f"σ van revenue : {res_1_df.revenue.std().round()}")
print(f"RMSE van Baseline B: {baseline_1_B_RMSE_score.round()}")
print(f"RMSE van Lineare Regressie B: {linreg_1_B_RMSE_score.round()}")
print(f"R2 van Lineare Regressie B: {linreg_1_B_r2_score.round(2)}")
print('\n')
print(f"RMSE van Ridge Regressie B: {ridge_1_B_RMSE_score.round()}")
print(f"R2 van Ridge Regressie B: {ridge_1_B_r2_score.round(2)}")
Onderzoekvraag 2
# Research dataset 2: duration as target plus year/budget/categorical columns
# and the genre indicator columns created by get_dummies earlier.
res_2_df = movie_df[
[
'duration','title_year', 'budget_original', 'budget_extern','content_rating', 'language', 'country', 'genres',
'Action', 'Adventure','Animation',
'Biography','Comedy','Crime','Documentary',
'Drama','Family','Fantasy','Film-Noir','History',
'Horror','Music','Musical',
'Mystery','News','Romance',
'Sci-Fi','Sport','Thriller',
'War','Western',
]
].copy()
res_2_df.describe()
Extra data verwerking
NaN's verwerken
# Show only the columns that still contain NaN's.
temp = res_2_df.isna().sum()
temp = temp[temp>0]
temp
# Drop all rows with NaN in the columns needed for research question 2.
res_2_df.dropna(subset=['duration', 'budget_original','content_rating','language'], inplace=True)
# Bugfix: these f-strings contained no placeholder — the value was passed as a
# separate print() argument. Embed it directly; the printed output is unchanged.
print(f"Nan's bij duration. Na de drop commando: {res_2_df.duration.isna().sum()}")
print(f"Nan's bij budget_original. Na de drop commando: {res_2_df.budget_original.isna().sum()}")
print(f"Nan's bij content_rating. Na de drop commando: {res_2_df.content_rating.isna().sum()}")
print(f"Nan's bij language. Na de drop commando: {res_2_df.language.isna().sum()}")
# One-hot encode content rating, language and country.
for column in ['content_rating', 'language', 'country']:
    temp = res_2_df[column].str.get_dummies()
    res_2_df = res_2_df.join(temp)
res_2_df.head(1)
Outliers
# For each numeric column, count the weak (between 1.5×IQR and 3×IQR outside
# the quartiles) and strong (more than 3×IQR outside) outliers on both sides,
# then show all four boxplots side by side.
kolommen = ['duration','title_year','budget_original','budget_extern']
for enkele_kolom in kolommen:
    kolom = res_2_df[enkele_kolom].sort_values()
    Q1 = kolom.quantile(0.25)
    Q3 = kolom.quantile(0.75)
    IQR = Q3 - Q1
    # Tukey fences; 'right'/'left' reproduce the <=/> and >=/< bounds exactly.
    zwak_laag = kolom.between(Q1 - 3*IQR, Q1 - 1.5*IQR, inclusive='right').sum()
    zwak_hoog = kolom.between(Q3 + 1.5*IQR, Q3 + 3*IQR, inclusive='left').sum()
    sterk_laag = (kolom < Q1 - 3*IQR).sum()
    sterk_hoog = (kolom > Q3 + 3*IQR).sum()
    print(f"{enkele_kolom} zwakke outliers lager: {zwak_laag}")
    print(f"{enkele_kolom} sterke outliers lager: {sterk_laag}")
    print(f"{enkele_kolom} zwakke outliers hoger: {zwak_hoog}")
    print(f"{enkele_kolom} sterke outliers hoger: {sterk_hoog}")
fig,axes = plt.subplots(1,4,figsize=(20,10))
for volgnummer, enkele_kolom in enumerate(kolommen):
    res_2_df.boxplot(column=enkele_kolom, ax=axes[volgnummer])
Correlatie onderzoek
# Correlation of duration with year and the two budget columns.
corrMatrix_1 = res_2_df[
[
'title_year', 'duration',
'budget_original', 'budget_extern'
]
].corr()
# Correlation of duration with the genre indicator columns.
corrMatrix_2 = res_2_df[
[
'duration', 'Action', 'Adventure','Animation',
'Biography','Comedy','Crime','Documentary',
'Drama','Family','Fantasy','Film-Noir','History',
'Horror','Music','Musical',
'Mystery','News','Romance',
'Sci-Fi','Sport','Thriller',
'War','Western'
]
].corr()
# Experimental columns for later: content-rating dummies.
corrMatrix_3 = res_2_df[
[
'duration','Approved','G' ,'GP','M','NC-17','Not Rated','PG','PG-13','Passed','R','TV-14','TV-G','Unrated','X'
]
].corr()
# Language dummies.
corrMatrix_4 = res_2_df[
[
'duration','English', 'Mandarin', 'Aboriginal', 'Spanish','French',
'Filipino', 'Russian', 'Maya', 'Kazakh', 'Cantonese',
'Japanese', 'Aramaic', 'Italian', 'Dutch', 'Dari',
'German', 'Mongolian', 'Bosnian', 'Korean',
'Hungarian', 'Hindi', 'Thai', 'Portuguese', 'Norwegian',
'Czech', 'Danish', 'Zulu', 'Hebrew', 'Arabic',
'Vietnamese', 'Indonesian', 'Romanian', 'Persian'
]
].corr()
# Country dummies.
# NOTE(review): 'New Line' appears in this country list — it looks like a data
# artifact in the source file; verify against the raw CSV.
corrMatrix_5 = res_2_df[
[
'duration',
'UK', 'USA', 'New Zealand', 'Australia',
'Germany', 'New Line', 'France', 'China',
'Canada', 'Mexico', 'Spain', 'Hong Kong',
'Czech Republic', 'Soviet Union', 'South Korea', 'Peru',
'Italy', 'Japan', 'Aruba', 'Denmark',
'Libya', 'Ireland', 'Romania', 'West Germany',
'South Africa', 'Chile', 'Russia', 'Netherlands',
'Hungary', 'Panama', 'Belgium', 'Greece',
'Taiwan', 'Bulgaria', 'Iran', 'Georgia',
'India', 'Thailand', 'Norway', 'Philippines',
'Brazil', 'Finland', 'Bahamas', 'Iceland',
'Argentina', 'Colombia', 'Poland', 'Israel',
'Egypt', 'Kyrgyzstan', 'Afghanistan', 'Switzerland'
]
].corr()
# Draw all five correlation heatmaps in one tall figure.
fig,axes = plt.subplots(5,1,figsize=(20,50))
sns.heatmap(corrMatrix_1, annot=True,ax=axes[0])
sns.heatmap(corrMatrix_2, annot=True,ax=axes[1])
sns.heatmap(corrMatrix_3, annot=True,ax=axes[2])
sns.heatmap(corrMatrix_4, annot=True,ax=axes[3])
sns.heatmap(corrMatrix_5, annot=True,ax=axes[4])
Conclusie correlatie onderzoek
Modeling
Baseline model
# Baseline model for research question 2: predict duration with a
# DummyRegressor that always predicts the training mean.
X_baseline_2 = res_2_df[
[
'title_year',
'budget_original', 'budget_extern',
'Action', 'Adventure','Animation',
'Biography','Comedy','Crime','Documentary',
'Drama','Family','Fantasy','Film-Noir','History',
'Horror','Music','Musical',
'Mystery','News','Romance',
'Sci-Fi','Sport','Thriller',
'War','Western'
]
]
Y_baseline_2 = res_2_df[['duration']]
baseline_2_DummyRegre = DummyRegressor(strategy='mean')
baseline_2_DummyRegre.fit(X_baseline_2,Y_baseline_2)
Dummy_2_pred = baseline_2_DummyRegre.predict(X_baseline_2)
Dummyreg_2_RMSE_score = np.sqrt(mean_squared_error(Y_baseline_2, Dummy_2_pred))
Dummyreg_2_RMSE_score
# Bugfix: score() takes (X, y); the targets were being passed as the features.
# A DummyRegressor ignores X when predicting, so the R² value is unchanged,
# but the call is now semantically correct.
baseline_2_DummyRegre_score = baseline_2_DummyRegre.score(X_baseline_2,Y_baseline_2)
print(baseline_2_DummyRegre_score)
# 75/25 train/test split for research question 2.
x_research_2_train, x_research_2_test, y_research_2_train, y_research_2_test = train_test_split(X_baseline_2, Y_baseline_2, random_state=0)
print(x_research_2_train.shape)
print(y_research_2_test.shape)
# Linear regression: duration ~ year + budgets + genre indicators.
linreg_research_2 = LinearRegression()
linreg_research_2.fit(x_research_2_train, y_research_2_train)
y_research_2_pred = linreg_research_2.predict(x_research_2_test)
# Evaluate on the held-out test set: RMSE and R².
linreg_2_RMSE_score = np.sqrt(mean_squared_error(y_research_2_test, y_research_2_pred))
linreg_2_RMSE_score
research_2_liniear_r2_score = r2_score(y_research_2_test,y_research_2_pred)
research_2_liniear_r2_score
Experimentatie
Ridge regressie model
# Ridge regression (L2-regularised, default alpha=1.0) on the same features
# and split as the linear model above.
ridge_research_2 = Ridge()
ridge_research_2.fit(x_research_2_train, y_research_2_train)
y_ridge_research_2_pred = ridge_research_2.predict(x_research_2_test)
ridge_2_RMSE_score = np.sqrt(mean_squared_error(y_research_2_test, y_ridge_research_2_pred))
ridge_2_RMSE_score
research_2_ridge_r2_score = r2_score(y_research_2_test,y_ridge_research_2_pred)
research_2_ridge_r2_score
Data experiment
# Data experiment: predict duration from only four genre indicators.
smaller_data_pool_experiment_x = res_2_df[
[
'Drama', 'History','War','Biography'
]
]
x_experiment_2_train, x_experiment_2_test, y_experiment_2_train, y_experiment_2_test = train_test_split(smaller_data_pool_experiment_x, Y_baseline_2, random_state=0)
# Fit linear and ridge models on the reduced feature set.
linreg_experiment = LinearRegression()
linreg_experiment.fit(x_experiment_2_train, y_experiment_2_train)
ridge_experiment = Ridge()
ridge_experiment.fit(x_experiment_2_train, y_experiment_2_train)
y_linear_experiment_2_pred = linreg_experiment.predict(x_experiment_2_test)
y_ridge_experiment_2_pred = ridge_experiment.predict(x_experiment_2_test)
# Evaluate both models on the test set: RMSE and R².
linreg_2_experiment_RMSE_score = np.sqrt(mean_squared_error(y_experiment_2_test, y_linear_experiment_2_pred))
linreg_2_experiment_RMSE_score
ridge_2_experiment_RMSE_score = np.sqrt(mean_squared_error(y_experiment_2_test, y_ridge_experiment_2_pred))
ridge_2_experiment_RMSE_score
experiment_2_liniear_r2_score = r2_score(y_experiment_2_test,y_linear_experiment_2_pred)
experiment_2_liniear_r2_score
experiment_2_ridge_r2_score = r2_score(y_experiment_2_test,y_ridge_experiment_2_pred)
experiment_2_ridge_r2_score
Conclusie
# Summary of research question 2.
# Bugfix: the reported values are R² scores, not accuracies (accuracy is a
# classification metric); label them accordingly, matching the R2 labels
# used for research question 1.
print('Het resultaat van de Baseline,Linear en Ridge model met de niet optionele kolommen')
print(f"R2 van Baseline : {baseline_2_DummyRegre_score}")
print(f"R2 van Linear : {research_2_liniear_r2_score}")
print(f"R2 van Ridge : {research_2_ridge_r2_score}")
print('\n')
print("Het resultaat van de data experiment")
print(f"R2 van Linear : {experiment_2_liniear_r2_score}")
print(f"R2 van Ridge : {experiment_2_ridge_r2_score}")
# Research question 3: cluster directors on profit margin.
res_3_df = movie_df[
[
'budget_extern',
'revenue',
'director_name'
]
].copy()
res_3_df.head(5)
# Treat 0 budget/revenue as missing and drop incomplete rows.
res_3_df = res_3_df.replace(0, np.nan)
res_3_df = res_3_df.dropna(how='any', axis=0)
res_3_df
# Profit margin as a percentage of revenue, rounded to 2 decimals.
res_3_df["winstmarge"] = ((res_3_df.revenue - res_3_df.budget_extern) / res_3_df.revenue * 100).round(2)
res_3_df
# Weak/strong outlier counts for the profit margin (Tukey fences).
kolom = res_3_df['winstmarge'].sort_values()
Q1 = kolom.quantile(0.25)
Q3 = kolom.quantile(0.75)
IQR = Q3 - Q1
# Weak: between 1.5×IQR and 3×IQR outside a quartile; strong: beyond 3×IQR.
winstmarge_zwakke_uitschieters_lower = kolom[(kolom <= Q1 - 1.5*IQR) & (kolom > Q1 - 3*IQR)].count()
winstmarge_zwakke_uitschieters_upper = kolom[(kolom >= Q3 + 1.5*IQR) & (kolom < Q3 + 3*IQR)].count()
winstmarge_sterke_uitschieters_lower = kolom[kolom < Q1 - 3*IQR].count()
winstmarge_sterke_uitschieters_upper = kolom[kolom > Q3 + 3*IQR].count()
print(f"winstmarge zwakke outliers lager: {winstmarge_zwakke_uitschieters_lower}")
print(f"winstmarge sterke outliers lager: {winstmarge_sterke_uitschieters_lower}")
print(f"winstmarge zwakke outliers hoger: {winstmarge_zwakke_uitschieters_upper}")
print(f"winstmarge sterke outliers hoger: {winstmarge_sterke_uitschieters_upper}")
res_3_df.boxplot(column='winstmarge')
# Drop extreme losses: a margin below -100% means costs exceeded double the revenue.
res_3_df = res_3_df[res_3_df['winstmarge'] > -100]
res_3_df
# Aggregate per director: number of movies and mean profit margin.
amount_movies = res_3_df.groupby('director_name', as_index=False).size()
res_3_df = pd.DataFrame(res_3_df.groupby('director_name', as_index=False).agg({'winstmarge': "mean"}))
res_3_df.head(1)
res_3_df = res_3_df.merge(amount_movies, on=['director_name'])
res_3_df.rename(columns={'size':'amount_movies'}, inplace=True)
res_3_df.director_name = res_3_df.director_name.astype('category')
# Label-encode the director names into integers.
# NOTE(review): director_name_int is not used by the clustering below, which
# clusters on winstmarge and amount_movies only.
le = preprocessing.LabelEncoder()
le.fit(res_3_df['director_name'].astype(str))
res_3_df['director_name_int'] = le.transform(res_3_df['director_name'])
res_3_df
kmeans_res_3 = KMeans(n_clusters=5, random_state=0)
kmeans_3_scores = []
# Elbow method: fit k = 1..15 and record score() (the negative inertia).
for k in range(1, 16):
    kmeans_res_3.set_params(n_clusters=k, random_state=0)
    kmeans_res_3.fit(res_3_df[['winstmarge', 'amount_movies']])
    kmeans_3_scores.append(kmeans_res_3.score(res_3_df[['winstmarge', 'amount_movies']]))
temp_x = np.arange(1,16,step=1)
plt.title('Elbow methode voor bepalen van K')
plt.plot(temp_x,kmeans_3_scores)
# Final model with k=5 (chosen from the elbow plot above).
kmeans_res_3.set_params(n_clusters=5, random_state=0)
kmeans_res_3.fit(res_3_df[['winstmarge', 'amount_movies']])
kmeans_res_3.labels_
kmeans_res_3.score(res_3_df[['winstmarge', 'amount_movies']])
# Silhouette score of the final clustering (range -1..1, higher is better).
model_3_score = metrics.silhouette_score(res_3_df[['winstmarge', 'amount_movies']], kmeans_res_3.labels_, metric='euclidean')
print(model_3_score)
res_3_df
# Attach the cluster labels and visualize the clustering.
res_3_df['clusters'] = kmeans_res_3.labels_
sns.scatterplot(x='winstmarge', y='amount_movies', size='clusters', hue='clusters', data=res_3_df[['winstmarge', 'amount_movies', 'clusters']])
Conclusie KMeans
# Bugfix: kmeans_3_scores[4] is the KMeans score at k=5, i.e. the negative
# inertia — not an accuracy; label it correctly. Also fix the "silouette" typo.
print(f"KMeans score (negatieve inertie) van het model: {kmeans_3_scores[4]}")
print(f"silhouette score van het model: {model_3_score}")