# Let's import the packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
plt.style.use('ggplot')
from matplotlib.pyplot import figure
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12, 8)
# Load the movies dataset and take a first look at its structure.
df_movies = pd.read_csv('/work/dataset/movies.csv')
df_movies.head()  # head() shows the first five rows by default
df_movies.columns
df_movies.info()
# Per-column flag: True when the column holds at least one missing value.
df_movies.isna().any()
# Report, for every column, the percentage of rows whose value is missing.
for col in df_movies.columns:
    # The mean of the boolean null-mask is the fraction of missing entries.
    pt_missing = np.mean(df_movies[col].isnull()) * 100
    print('{} - {}%'.format(col, round(pt_missing, 3)))
# The 'budget' column has the highest percentage of missing data.
df_movies[df_movies['budget'].isnull()]
# Budget is essential for this analysis, so incomplete rows are removed even
# though that discards close to 30% of the data. Note that how='any' drops a
# row when ANY of its columns is null, not only 'budget'.
# .copy() gives df_mov its own data, so adding columns to it further down
# does not trigger pandas' SettingWithCopyWarning.
df_mov = df_movies.dropna(how='any', axis=0).copy()
df_mov.dtypes
# Split the free-text 'released' column into year, date and country parts.
released_text = df_mov['released'].astype(str)

# Year: the first run of four digits found in the text.
released_year = released_text.str.extract('([0-9]{4})')
released_year
df_mov['released_year'] = released_year

# Date: everything before the first ', ' separator.
released_date = released_text.str.split(', ').str.get(0)
released_date
df_mov['released_date'] = released_date

# Country: the text between the first '(' and the ')' that follows it.
released_country = (
    released_text.str.split('(').str.get(1).str.split(')').str.get(0)
)
released_country
df_mov['released_country'] = released_country
# There are 708 rows where 'year' does not match the year parsed from
# 'released'. The parsed year is text, so convert it to a number before
# comparing it with the 'year' column.
df_mov['released_year'] = df_mov['released_year'].astype('float64')
year_differs = df_mov['released_year'] != df_mov['year']
df_mov[year_differs]
# To display every row instead of a truncated view, enable:
# pd.set_option('display.max_rows', None)

# The result is not stored: this only shows the de-duplicated frame so it can
# be compared with df_mov (the author found no duplicate rows).
df_mov.drop_duplicates()

# Distribution of gross revenue, to spot outliers.
df_mov.boxplot(column='gross')
# Top 15 companies by total gross revenue.
company_gross_sum = df_mov.groupby('company')[['gross']].sum()
company_gross_sum_s = (
    company_gross_sum
    .sort_values(by='gross', ascending=False)
    .head(15)['gross']       # keep the 15 largest totals as a Series
    .astype('int64')         # whole numbers read better than float totals
)
company_gross_sum_s
# Gross revenue per rating category, one point per movie.
sns.swarmplot(data=df_mov, x="rating", y="gross")
# View the data ordered by gross revenue, highest first. sort_values returns
# a sorted copy; df_mov itself keeps its original row order.
df_mov.sort_values('gross', ascending=False)
# Let's start looking at correlation: pairwise plots of the columns.
sns.pairplot(data=df_mov)
# Correlation matrices between all numeric columns.
# df_mov still contains text columns (e.g. 'company', 'rating', 'released').
# Since pandas 2.0, DataFrame.corr() no longer drops such columns silently and
# raises instead, so the numeric columns are selected explicitly first. This
# form also works on older pandas versions.
numeric_cols = df_mov.select_dtypes(include=['number', 'bool'])
numeric_cols.corr(method='pearson')
numeric_cols.corr(method='kendall')
numeric_cols.corr(method='spearman')
# Correlation heatmap (Pearson, the default method).
sns.heatmap(numeric_cols.corr(), annot=True)
plt.title('Correlation Matrix for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()
# Budget against gross earnings as a plain scatter plot.
plt.scatter(df_mov['budget'], df_mov['gross'])
plt.title('Budget vs Gross Earnings')
plt.xlabel('Budget for Film')
plt.ylabel('Gross Earnings')
plt.show()
# Regression plots of gross earnings against budget and then against score:
# red points with a blue fitted line.
for feature in ('budget', 'score'):
    sns.regplot(
        data=df_mov,
        x=feature,
        y='gross',
        scatter_kws={'color': 'red'},
        line_kws={'color': 'blue'},
    )
df_mov.head()
# Build a numerically encoded version of the data so that text columns can be
# included in the correlation matrix as well.
# A deep copy (the default) is used so that overwriting columns below can
# never leak back into df_mov, which a shallow copy shares its data with.
df_mov_c = df_mov.copy()
for col_name in df_mov_c.columns:
    if df_mov_c[col_name].dtype == 'object':
        # Replace each distinct text value by its integer category code.
        df_mov_c[col_name] = df_mov_c[col_name].astype('category').cat.codes
df_mov_c.head()
# NOTE: category codes are arbitrary labels, so correlations that involve an
# encoded column should be read with care.
sns.heatmap(df_mov_c.corr(), annot=True)
plt.title('Correlation Matrix for All Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()
# Flatten the correlation matrix into one value per (column, column) pair.
corr_mat = df_mov_c.corr()
corr_pairs = corr_mat.unstack()
print(corr_pairs)
# Order the pairs from the most negative to the most positive correlation
# (quicksort is the default sorting algorithm of sort_values).
sorted_pairs = corr_pairs.sort_values()
print(sorted_pairs)
# Keep only the strongly correlated pairs (absolute value > 0.5). Every
# column's correlation with itself (1.0) is part of this selection, and each
# pair appears twice, once per ordering.
is_strong = sorted_pairs.abs() > 0.5
strong_pairs = sorted_pairs[is_strong]
print(strong_pairs)