import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# When sample_data is True the merged frame is reduced to a random sample of
# `size` rows before any further processing.
sample_data = False
size = 1000

# low_memory=False makes pandas parse the whole file in a single pass so every
# column gets one inferred dtype. The chunked parsing used by low_memory=True
# triggered a DtypeWarning about mixed types in column 5 of this file.
titles_df = pd.read_csv('/work/data/ImdbTitleBasics.csv', low_memory=False)

# Additional tables that are currently not loaded:
#names_df = pd.read_csv('ImdbTitleCrew.csv')
#names_df = pd.read_csv('ImdbTitleRatings.csv')
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3166: DtypeWarning: Columns (5) have mixed types.Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
names_df = pd.read_csv('/work/data/ImdbName.csv')
principals_df = pd.read_csv('/work/data/ImdbTitlePrincipals.csv')

# Peek at the first rows of each raw table.
titles_df.head(5)
names_df.head(5)
principals_df.head(5)

# Attach title details (joined on tconst) and person details (joined on
# nconst) to every principals row. Both merges use pandas' default join type.
df = principals_df.merge(titles_df, on='tconst').merge(names_df, on='nconst')

if sample_data:
    # Clamp to the frame length: DataFrame.sample raises ValueError when n
    # exceeds the number of rows (sampling is done without replacement).
    df = df.sample(n=min(size, len(df)))

df.head()
df.describe(include='all')

df = df.drop(columns=['characters', 'primaryTitle'])

# This script treats the literal string "\N" as the missing-value marker, so
# the null counts only become meaningful after it is replaced with NaN.
df['endYear'].describe()
df['endYear'].isnull().sum()
df = df.replace("\\N", np.nan)
df['endYear'].isnull().sum()
df.isnull().sum()

df['job'].head(20)
df = df.drop(columns=['job', 'primaryProfession'])

df[df['endYear'].notnull()]

# Keep only rows that have both a runtime and a genres value.
df = df[df['runtimeMinutes'].notnull() & df['genres'].notnull()]
df.isnull().sum()

# Let pandas pick the best nullable dtype for every column.
df = df.convert_dtypes()
df.dtypes
df.head()
df.nunique()
Splitting genres
# Split the comma-separated genres string into one column per genre; the new
# columns are labelled with the integers 0, 1, 2, ...
df_genres = df['genres'].str.split(",", expand=True)
df_genres[0].unique()

df = df.drop(columns=['genres', 'knownForTitles'])
df_clean = pd.concat([df, df_genres], axis=1)

categorical_cols = ['category', 'titleType', 'isAdult']
df_clean[categorical_cols] = df_clean[categorical_cols].astype('category')

# Rows without a runtime were filtered out earlier, so a strict cast is safe.
df_clean['runtimeMinutes'] = df_clean['runtimeMinutes'].astype('int')

# The year columns can still contain missing values. With those present,
# astype('int', errors='ignore') silently returned the columns unchanged and
# left them non-numeric, which made the box plot below fail with
# "Neither the `x` nor `y` variable appears to be numeric".
# Coerce them to float64 instead so missing or unparseable entries become NaN.
year_cols = ['startYear', 'endYear', 'birthYear', 'deathYear']
for year_col in year_cols:
    df_clean[year_col] = pd.to_numeric(
        df_clean[year_col], errors='coerce'
    ).astype('float64')
df_clean.head()

df_clean.value_counts(['category']).plot(kind='barh')
df_clean.value_counts(['titleType']).plot(kind='barh')
df_clean.value_counts(['primaryName'])[:10].plot(kind='barh')

actor_categories = ['actor', 'actress']
df_actors = df_clean[df_clean.category.isin(actor_categories)]
# `category` is a categorical column, so the filtered frame still carries the
# levels that were filtered out; pass `order` so only the two plotted groups
# get a box.
sns.catplot(x='category', y="startYear", kind='box', data=df_actors,
            order=actor_categories)
TypeError: Neither the `x` nor `y` variable appears to be numeric.
# Wide canvas so the tick labels along the x-axis have room.
fig, ax = plt.subplots(figsize=(40, 10))

# Column 0 of df_clean is the first entry of the comma-split genres string.
# NOTE(review): the title and x-label say "Category"/"Categories" while the
# x variable is that genre column -- confirm the intended wording.
b = sns.pointplot(data=df_clean, x=0, y="startYear", ax=ax)

# Shared font size for the title and both axis labels.
label_font = {"fontsize": 50}
b.set_title("Category per Year", **label_font)
b.set_xlabel("Categories", **label_font)
b.set_ylabel("Year", **label_font)
b.tick_params(labelsize=20)