# portfolio_python

# Let's import the packages
import matplotlib
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.pyplot import figure

plt.style.use('ggplot')
# %matplotlib inline   <- IPython magic; only valid inside a notebook cell
matplotlib.rcParams['figure.figsize'] = (12, 8)

# Load the raw movies dataset from disk.
df_movies = pd.read_csv('/work/dataset/movies.csv')

# First look at the data: leading rows, column names, dtypes / non-null counts.
df_movies.head(5)

df_movies.columns

df_movies.info()

# Per column: does it contain at least one missing value?
df_movies.isnull().any()

# Let's loop through the data and see the percentage of missing values
# in every column (mean of the boolean null mask, scaled to percent).
for col in df_movies.columns:
    pt_missing = np.mean(df_movies[col].isnull()) * 100
    print('{} - {}%'.format(col, round(pt_missing, 3)))

# Author's note: the budget column has the highest percentage of missing data.
# Show the rows where the budget is missing.
df_movies[df_movies['budget'].isnull()]

# Author's note: the budget column is really important for the analysis, so
# even though it is missing for almost 30% of the data, the null values are
# erased.  Note that how='any' drops a row when ANY column is null, not only
# budget.  The explicit .copy() makes df_mov an independent frame, so the
# column assignments below do not trigger SettingWithCopyWarning.
df_mov = df_movies.dropna(how='any', axis=0).copy()

df_mov.dtypes

# Let's pull apart the 'released' column.
# NOTE(review): the parsing below presumably expects values shaped like
# "June 13, 1980 (United States)" - confirm against the dataset.
released_text = df_mov['released'].astype(str)

# First run of four digits found in the text (kept as a string for now).
df_mov['released_year'] = released_text.str.extract(r'([0-9]{4})', expand=False)

# Everything before the first ", ".
df_mov['released_date'] = released_text.str.split(', ').str[0]

# Text between the first "(" and the following ")".
df_mov['released_country'] = released_text.str.split('(').str[1].str.split(')').str[0]

# Convert the extracted year to float so it can be compared with 'year'.
df_mov['released_year'] = df_mov['released_year'].astype('float64')

# Rows where 'year' does not match the release year
# (author's note: 708 rows at the time of writing).
df_mov[df_mov['released_year'] != df_mov['year']]

# Uncomment to display every row when printing a DataFrame:
# pd.set_option('display.max_rows', None)

# Inspection only: returns a de-duplicated frame without modifying df_mov
# (author's note: there are no duplicates).
df_mov.drop_duplicates()

# Distribution of gross revenue.
df_mov.boxplot(column=['gross'])

# Looking at the top 15 companies by gross revenue:
# total gross per company, largest first, first 15 rows, as integers.
company_gross_sum = df_mov.groupby('company')[['gross']].sum()
company_gross_sum_s = company_gross_sum.sort_values('gross', ascending=False)[:15]
company_gross_sum_s = company_gross_sum_s['gross'].astype('int64')
company_gross_sum_s

# Gross revenue per rating category.
sns.swarmplot(x="rating", y="gross", data=df_mov)

# Order our data by gross, highest first.  The sorted frame is only
# displayed; df_mov keeps its original row order.
df_mov.sort_values(by=['gross'], ascending=False)

# Let's start looking at correlation.
sns.pairplot(df_mov)

# Correlation matrices between all numeric columns.
# df_mov still contains text columns; on pandas >= 2.0 DataFrame.corr() no
# longer drops them silently and raises instead, so restrict the frame to
# numeric dtypes first.
df_mov_numeric = df_mov.select_dtypes(include='number')

df_mov_numeric.corr(method='pearson')

df_mov_numeric.corr(method='kendall')

df_mov_numeric.corr(method='spearman')

# Correlation heatmap (default method, i.e. Pearson).
sns.heatmap(df_mov_numeric.corr(), annot=True)
plt.title('Correlation Matrix for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

# Scatter plot with Budget vs Gross
plt.scatter(x=df_mov['budget'], y=df_mov['gross'])
plt.title('Budget vs Gross Earnings')
plt.ylabel('Gross Earnings')
plt.xlabel('Budget for Film')
plt.show()

# Same relationship with a fitted regression line.
sns.regplot(x='budget', y='gross', data=df_mov,
            scatter_kws={'color': 'red'}, line_kws={'color': 'blue'})

# Score vs gross with a fitted regression line.
sns.regplot(x='score', y='gross', data=df_mov,
            scatter_kws={'color': 'red'}, line_kws={'color': 'blue'})

df_mov.head()

# Work on an independent copy so that overwriting columns with category
# codes can never leak back into df_mov (a shallow copy shares its data
# with the original).
df_mov_c = df_mov.copy()

# Replace every object (text) column by the integer codes of its categories,
# so that all columns can take part in the correlation matrix.
for col_name in df_mov_c.columns:
    if df_mov_c[col_name].dtype == 'object':
        df_mov_c[col_name] = df_mov_c[col_name].astype('category').cat.codes

df_mov_c.head()

# Correlation heatmap including the encoded text columns.
sns.heatmap(df_mov_c.corr(), annot=True)
plt.title('Correlation Matrix for All Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

# Flatten the correlation matrix into a Series indexed by (feature, feature).
corr_mat = df_mov_c.corr()
corr_pairs = corr_mat.unstack()
print(corr_pairs)

# Sort the pairs from the most negative to the most positive correlation.
sorted_pairs = corr_pairs.sort_values(kind='quicksort')
print(sorted_pairs)

# We can now take a look at the ones that have a high correlation
# (absolute value > 0.5).  Each feature paired with itself is part of
# corr_pairs, so those diagonal entries show up here as well.
strong_pairs = sorted_pairs[sorted_pairs.abs() > 0.5]
print(strong_pairs)