import pandas as pd
import numpy as np
import re #Modulo de Expresiones Regulares
moviesDf = pd.read_csv('movies_dataset.csv', sep=',')
#Verificamos el dataset moviesDf
moviesDf.head()
moviesDf.info()
#Función para extraer y concatenar 'name'
def extract_concatenate_name(value):
try:
pattern = r"'name':\s+'(.*?)'"
match = re.findall(pattern, value)
return ', '.join(match)
except (TypeError, AttributeError):
return ''
#Normalizamos la columna "belongs_to_collection"
moviesDf['collection'] = moviesDf['belongs_to_collection'].apply(extract_concatenate_name)
moviesDf['collection'].head(10)
#Normalizamos la columna "genres"
moviesDf['genre'] = moviesDf['genres'].apply(extract_concatenate_name)
moviesDf['genre'].head(10)
#Normalizamos la columna la columna "production_companies"
moviesDf['company'] = moviesDf['production_companies'].apply(extract_concatenate_name)
moviesDf['company'].head(10)
#Normalizamos la columna "production_countries"
moviesDf['country'] = moviesDf['production_countries'].apply(extract_concatenate_name)
moviesDf['country'].head(10)
#Normalizamos la columna "spoken_languages"
moviesDf['language'] = moviesDf['spoken_languages'].apply(extract_concatenate_name)
moviesDf['language'].head(10)
moviesDf.head(0)
moviesDf['budget'].fillna(0, inplace=True)
print('# de valores NULL en el campo budget:', moviesDf.budget.isna().sum())
moviesDf['revenue'].fillna(0, inplace=True)
print('# de valores NULL en el campo revenue:', moviesDf.revenue.isna().sum())
#Damos formato a todos los valores de la columna 'release_date'
#A los que no se les puede dar formato se les asigna NaT (Not a Time)
moviesDf['release_date'] = pd.to_datetime(moviesDf['release_date'], errors='coerce')
#Eliminamos los registros NaT
moviesDf.dropna(subset = ['release_date'], inplace=True)
#Verificamor que no existan NaT
moviesDf['release_date'].isnull().sum()
#Verificamos que todos los registros tengan el mismo formato AAAA-MM-DD
moviesDf['release_date'] = pd.to_datetime(moviesDf['release_date']).dt.strftime('%Y-%m-%d')
#Creamos la columna 'release_year' que va a contener el año de la columna 'release_date'
moviesDf['release_year'] = pd.to_datetime(moviesDf['release_date']).dt.year
#Verificamos las columnas 'release_date','release_year'
moviesDf[['release_date','release_year']].tail()
#Damos formato a los registros de las columnas 'budget' y 'revenue' a tipo numerico
#A los que no se les puede dar formato se les asigna el valor NaN (Not a Number)
moviesDf['budget'] = pd.to_numeric(moviesDf['budget'], errors='coerce')
moviesDf['revenue'] = pd.to_numeric(moviesDf['revenue'], errors='coerce')
#Realizamos el calculo del retorno de inversión 'return'
moviesDf['return'] = moviesDf['revenue']/moviesDf['budget']
moviesDf['return'].fillna(0, inplace=True)
#moviesDf.drop(columns=['video', 'imdb_id', 'adult', 'original_title', 'vote_count', 'poster_path', 'homepage'], inplace = True)
moviesDf = moviesDf[['id', 'title', 'tagline', 'overview', 'runtime',
'original_language', 'genre', 'collection',
'status', 'release_date', 'release_year',
'popularity','vote_average',
'company', 'country', 'language',
'revenue', 'budget', 'return']]
moviesDf.head()
movie_listing = moviesDf.to_csv('movie_listing.csv', index=False, header=True, sep=';')