# Curso de ciencia de datos

import pandas as pd import numpy as np import matplotlib import matplotlib.pyplot as plt import seaborn as sns import urllib.request as Req from zipfile import ZipFile import plotly.graph_objects as go import plotly.express as px # import plotly.offline as pyo # pyo.init_notebook_mode()

# Download the course dataset archive and load the Pokedex CSV into `df`.
url = r'https://import.cdn.thinkific.com/220744/courses/1648061/pokemon_dataset-220322-181028.zip'

Req.urlretrieve(url, r'pokemon_dataset.zip')

# Context managers close both the archive and the CSV member as soon as the
# frame has been read; the original left both handles open.
with ZipFile(r'pokemon_dataset.zip') as pokemon_zip:
    with pokemon_zip.open('pokedex_mastermind.csv') as archivo:
        # The first CSV column becomes the row index.
        df = pd.read_csv(archivo, index_col=[0])

# Notebook display: members of the archive (the list stays available after
# the archive is closed).
pokemon_zip.filelist

df

# First look at the raw frame. Every bare expression below is a notebook
# display and does not modify `df`.

# NOTE(review): this only references the bound method (there are no call
# parentheses), so nothing is computed here - confirm what was intended.
df['name'].median

# True when at least one fully duplicated row exists.
df.duplicated().any()

# Distinct values present in the duplicate mask.
df.duplicated().unique()

df.columns

# Peek at the first rows, the last rows and five random rows.
df.head()

df.tail()

df.sample(5)

# Preview of a hand-picked subset of columns.
df[['defense', 'sp_attack', 'sp_defense', 'speed', 'catch_rate', 'percentage_male', 'against_normal', 'against_fire', 'against_water', 'against_electric', 'against_grass', 'against_ice', 'against_fight', 'against_poison']].head()

df.columns

# Move the "against_*" columns out of `df` into their own frame, `weakness`.
# The column list is written once and reused for both the selection and the
# drop, so the two can no longer drift apart.
cols = [
    'against_normal', 'against_fire', 'against_water', 'against_electric',
    'against_grass', 'against_ice', 'against_fight', 'against_poison',
    'against_ground', 'against_flying', 'against_psychic', 'against_bug',
    'against_rock', 'against_ghost', 'against_dragon', 'against_dark',
    'against_steel', 'against_fairy',
]

# Explicit copy: `weakness` is an independent frame, so renaming its columns
# below cannot affect `df`.
weakness = df[cols].copy()

weakness.columns

# Remove the shared prefix so each column is named after the bare type.
weakness.columns = [x.replace('against_', '') for x in weakness.columns]

weakness.head()

# The data now lives in `weakness`; remove those columns from the main frame.
df.drop(cols, axis=1, inplace=True)

df.head()

# Inspect the object-dtype (text) columns.
df.select_dtypes(include=[object])

df.select_dtypes(include=[object]).columns

cols = ['status', 'type_1', 'type_2']

# A single column can be lower-cased directly with df['status'].str.lower();
# for several columns the same transformation is applied column by column.
for columna in cols:
    df[columna] = df[columna].str.lower()

df.select_dtypes(include=[object])

df['percentage_male'].unique()

# Strip the percent sign and convert the remaining text to float.
df['percentage_male'] = df['percentage_male'].str.replace('%', '').astype(float)

df['percentage_male'].unique()

# Convert the weight column from pounds to kilograms.
LB_TO_KG = 0.453592  # kilograms per pound

# Overwrite the values first and rename afterwards: the column keeps its
# position in the frame and no temporary column has to be created, copied
# back and dropped as the original did.
df['weight_pounds'] = (df['weight_pounds'] * LB_TO_KG).round(2)

df.rename(columns={'weight_pounds': 'weight_kg'}, inplace=True)

df['weight_kg']

df

# Preview the first fifty names.
df['name'][:50]

# Boolean mask of the rows whose name contains the text 'Mega'.
es_mega = df['name'].str.contains('Mega')
es_mega

df[es_mega]

df[es_mega]['name']

len(df[es_mega]['name'])

nombres = df[es_mega]['name']

nombres[3]

# Matches everything from the start of the string up to, but not including,
# the last 'Mega'; replacing the match with '' removes whatever precedes it.
pat = '^.*(?=(Mega))'

nombres.str.replace(pat, '', regex=True)

df.head()

# Try the pattern on the whole column, then store the cleaned names.
nombres_limpios = df['name'].str.replace(pat, '', regex=True)
nombres_limpios

df['name'] = nombres_limpios

df.head()

# Survey the missing values, then fill the gaps in the secondary type.
df.describe()

df.head()

df.isna()

df.isna().any()

# Rows without a secondary type.
sin_tipo_2 = df['type_2'].isna()
sin_tipo_2

df[sin_tipo_2]

len(df[sin_tipo_2])

df['type_2'].unique()

# Assign the filled column back instead of calling fillna(inplace=True) on
# the selected column: the in-place form acts on an intermediate object,
# which pandas warns about and which leaves `df` unchanged when
# Copy-on-Write is active.
df['type_2'] = df['type_2'].fillna('none')

# Rows that have no catch rate recorded.
sin_datos = df['catch_rate'].isna()
df[sin_datos]

son_normales = df['status'] == 'normal'
son_legendarios = df['status'] == 'legendary'

# Mean catch rate of each status, computed before anything is filled in.
media_normales = df.loc[son_normales, 'catch_rate'].mean()
media_legendarios = df.loc[son_legendarios, 'catch_rate'].mean()

df.describe()

df.describe()['catch_rate']

print(f'Media de atrapabilidad de Pokemon normales es {media_normales}')
print(f'Media de atrapabilidad de Pokemon legendarios es {media_legendarios}')

# A missing value takes the mean of its own status group.
df.loc[sin_datos & son_legendarios, 'catch_rate'] = media_legendarios
df.loc[sin_datos & son_normales, 'catch_rate'] = media_normales

df['catch_rate'].isna().any()

df['catch_rate'] = df['catch_rate'].round(2)

df.isna().any()

# Rows that still have no weight.
df.loc[df['weight_kg'].isna()]

# Weight written into the 'Eternatus Eternamax' row(s).
peso = 950
pokemon = df['name'].eq('Eternatus Eternamax')

df.loc[pokemon, 'weight_kg'] = peso

df.loc[pokemon]

# Final check for remaining gaps, plus a dtype/non-null summary.
df.isna().any()

df.info()

# Count Pokemon per primary/secondary type combination and plot a heatmap.
df['type_1'].value_counts()

pd.pivot_table(df, index='type_1')

# Number of distinct names for each (type_1, type_2) pair; pairs that never
# occur are shown as 0. Built once and reused for display and plotting.
contar_tipos = pd.pivot_table(
    df,
    index='type_1',
    columns='type_2',
    values='name',
    aggfunc=lambda x: x.value_counts().count(),
    fill_value=0,
)
contar_tipos

fig = plt.figure(figsize=(18, 15))
g = sns.heatmap(contar_tipos, cmap='coolwarm', annot=True, center=25)
g.set_title('Cantidad de pokemon por tipo', fontsize=20)
# The pivot puts type_2 on the columns (x axis) and type_1 on the index
# (y axis); the original labels were the other way round.
g.set_xlabel('Tipo 2', fontsize=15)
g.set_ylabel('Tipo 1', fontsize=15)
plt.show()

# Legendary Pokemon and how many of them each generation has.
legendarios = df[df['status'] == 'legendary']
legendarios

legendarios['generation'].value_counts()

# Count plot, per generation, of every row whose status is not 'normal',
# with one colour per status.
no_normales = df[df['status'] != 'normal']
g = sns.catplot(
    data=no_normales,
    y='generation',
    hue='status',
    kind='count',
    palette='coolwarm',
    edgecolor='black',
    alpha=0.8,
)
g.fig.set_size_inches(16, 8)
g.set(xlim=(0, 18))
g.fig.suptitle('Cantidad de pokemon legendarios por generacion')
plt.show()

df.columns

stat_columns = ['hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed']

# Overall power of a Pokemon: the row-wise sum of the six stat columns.
df['total'] = df[stat_columns].sum(axis=1)

df

# The same box plot for two groupings. The second title used to be a copy of
# the first ("por generacion") even though that plot groups by primary type.
for columna, titulo in (
    ('generation', 'Poder de los pokemon por generacion'),
    ('type_1', 'Poder de los pokemon por tipo'),
):
    g = sns.catplot(data=df, x=columna, y='total', kind='box', palette='Set2')
    g.fig.set_size_inches(16, 8)
    g.fig.suptitle(titulo)
    plt.show()

def stats_medios(tipo, dataframe=None):
    """Return the mean stats of every row whose primary type is *tipo*.

    Args:
        tipo: Value compared against the ``type_1`` column.
        dataframe: Frame to read from. When omitted, the module-level ``df``
            is used, which is what the original one-argument form always did.

    Returns:
        A Series indexed by stat name (hp, attack, defense, sp_attack,
        sp_defense, speed) with the column means of the matching rows. The
        values are NaN when no row matches.
    """
    stat_columns = ['hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed']
    if dataframe is None:
        dataframe = df  # fall back to the notebook's global frame
    return dataframe.loc[dataframe['type_1'] == tipo, stat_columns].mean()

# Mean stats of the two primary types compared in the radar chart below.
tipo_agua, tipo_fuego = map(stats_medios, ('water', 'fire'))

def stats_pokemon(nombre, dataframe=None):
    """Return the stats of the Pokemon called *nombre*.

    The original body filtered ``type_1`` with an undefined variable ``tipo``,
    so every call raised ``NameError``. It now filters the ``name`` column
    with the argument it actually receives.

    Args:
        nombre: Value compared against the ``name`` column.
        dataframe: Frame to read from. When omitted, the module-level ``df``
            is used.

    Returns:
        A Series indexed by stat name (hp, attack, defense, sp_attack,
        sp_defense, speed). If several rows share the name their values are
        averaged; the values are NaN when no row matches.
    """
    stat_columns = ['hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed']
    if dataframe is None:
        dataframe = df  # fall back to the notebook's global frame
    return dataframe.loc[dataframe['name'] == nombre, stat_columns].mean()

tipo_agua

# Radar chart with one filled trace per type, drawn from the mean stats
# computed earlier.
fig = go.Figure()
for etiqueta, medias in (('water', tipo_agua), ('fire', tipo_fuego)):
    fig.add_trace(
        go.Scatterpolar(r=medias.values, theta=medias.index, name=etiqueta, fill='toself')
    )

fig.update_layout(
    title='Grafica comparativa stats',
    showlegend=True,
    polar={'radialaxis': {'visible': False}},  # hide the radial axis
    font=dict(family='verdana', size=16, color='white'),
    template='plotly_dark',
    height=500,
    width=900,
    legend_bgcolor='rgb(117, 112, 179)',
)
fig.show()

def comparar_stats(df, lista_tipos):
    """Show a radar chart comparing the mean stats of several primary types.

    Args:
        df: Frame with a ``type_1`` column and the six stat columns
            (hp, attack, defense, sp_attack, sp_defense, speed).
        lista_tipos: Iterable of ``type_1`` values; one trace is drawn per
            value.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    stat_columns = ['hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed']
    fig = go.Figure()
    for tipo in lista_tipos:
        # Average the rows of the frame that was passed in. The original
        # delegated to stats_medios(), which reads the global frame, so the
        # `df` argument was silently ignored.
        stats = df.loc[df['type_1'] == tipo, stat_columns].mean()
        fig.add_trace(
            go.Scatterpolar(
                r=stats.values,
                theta=stats.index,
                name=tipo,
                fill='toself',
                opacity=0.5,
                hoveron='points',
                hovertemplate='%{theta}: %{r}',
            )
        )
    fig.update_layout(
        showlegend=True,
        polar={'radialaxis': {'visible': False}},
        title='Grafica comparativa stats',
        font=dict(family='verdana', size=16, color='white'),
        template='plotly_dark',
        height=500,
        width=900,
        legend_bgcolor='rgb(117, 112, 179)',
    )
    fig.show()


tipos = ['water', 'fire', 'poison', 'dragon']

comparar_stats(df, tipos)

def comparar_pokemons(dataframe, pokemon_list):
    """Show a radar chart with one trace per Pokemon name in *pokemon_list*.

    For each name, the rows of *dataframe* whose ``name`` equals it are
    selected and their six stat columns are plotted. The figure is displayed
    with ``fig.show()``; nothing is returned.
    """
    stat_columns = ['hp', 'attack', 'defense', 'sp_attack', 'sp_defense','speed']
    fig = go.Figure()
    for pokemon in pokemon_list:
        coincide = dataframe['name'] == pokemon
        # squeeze() collapses a single matching row into a Series.
        datos = dataframe.loc[coincide, stat_columns].squeeze()
        fig.add_trace(
            go.Scatterpolar(
                r=datos.values.tolist(),
                theta=datos.index.tolist(),
                name=pokemon,
                fill='toself',
                hoverinfo='text+theta+name',
            )
        )
    fig.update_layout(
        title=go.layout.Title(text='Comparacion Pokémon'),
        showlegend=True,
        polar={'radialaxis': {'visible': False}},
        font=dict(family='verdana', size=16, color='white'),
        template='plotly_dark',
        height=500,
        width=900,
        legend_bgcolor='rgb(117, 112, 179)',
    )
    fig.show()


pokemons = ['Rayquaza', 'Mimikyu', 'Pikachu']
comparar_pokemons(df, pokemons)

# Hex colour used for each primary type.
pokemon_colors = {
    'normal': '#A8A77A', 'fire': '#EE8130', 'water': '#6390F0',
    'electric': '#F7D02C', 'grass': '#7AC74C', 'ice': '#96D9D6',
    'fighting': '#C22E28', 'poison': '#A33EA1', 'ground': '#E2BF65',
    'flying': '#A98FF3', 'psychic': '#F95587', 'bug': '#A6B91A',
    'rock': '#B6A136', 'ghost': '#735797', 'dragon': '#6F35FC',
    'dark': '#705746', 'steel': '#B7B7CE', 'fairy': '#D685AD',
}


def comparar_pokemons(dataframe, pokemon_list, colors=None):
    """Show a radar chart of stats, one trace per Pokemon, coloured by type.

    Args:
        dataframe: Frame with ``name``, ``type_1`` and the six stat columns.
        pokemon_list: Iterable of names to plot.
        colors: Optional mapping from ``type_1`` value to a colour string.
            Defaults to the module-level ``pokemon_colors`` palette, which
            replaces the identical dictionary the original rebuilt on every
            call.

    Raises:
        ValueError: If a name in *pokemon_list* has no row in *dataframe*.
            The original failed in that case with an unrelated
            "unhashable type" TypeError.
        KeyError: If the Pokemon's ``type_1`` is not a key of *colors*.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    if colors is None:
        colors = pokemon_colors
    stat_columns = ['hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed']
    fig = go.Figure()
    for pokemon in pokemon_list:
        filas = dataframe.loc[dataframe['name'] == pokemon]
        if filas.empty:
            raise ValueError(f'Pokemon not found: {pokemon!r}')
        # Use the first matching row so a repeated name still yields one
        # flat list of values and one scalar type.
        datos = filas[stat_columns].iloc[0]
        pokecolor = colors[filas['type_1'].iloc[0]]
        fig.add_trace(
            go.Scatterpolar(
                r=datos.values.tolist(),
                theta=datos.index.tolist(),
                name=pokemon,
                fill='toself',
                fillcolor=pokecolor,
                marker_line_color=pokecolor,
                opacity=0.7,
                hovertemplate="%{theta}: %{r}",
                hoveron="points",
            )
        )
    fig.update_layout(
        title=go.layout.Title(text='Comparacion Pokémon por Stats'),
        polar={'radialaxis': {'visible': False}},
        showlegend=True,
        font=dict(family='verdana', size=16, color='white'),
        template='plotly_dark',
        height=500,
        width=900,
        legend_bgcolor='rgb(117, 112, 179)',
    )
    fig.show()


pokemons = ['Squirtle', 'Charmander', 'Bulbasaur']
comparar_pokemons(df, pokemons)

df.head()

# Top five rows by height and by weight, indexed by name.
mas_altos = df.sort_values(by='height_m', ascending=False)[['name', 'height_m']][:5].set_index('name')

mas_altos

mas_pesados = df.sort_values(by='weight_kg', ascending=False)[['name', 'weight_kg']][:5].set_index('name')

mas_pesados

# Body-mass index is weight divided by the SQUARE of the height (kg / m^2).
# The original divided by the height only.
df['BMI'] = df['weight_kg'] / df['height_m'] ** 2

df['BMI']

# Five lowest BMI values (ascending sort).
bmi_mas_bajo = df.sort_values(by='BMI')[['name', 'BMI']][:5].set_index('name')

bmi_mas_bajo

# Collapse each single-column frame into a Series for plotting.
mas_pesados = mas_pesados.squeeze()
bmi_mas_bajo = bmi_mas_bajo.squeeze()
mas_altos = mas_altos.squeeze()

# One bar chart per ranking; the three panels share the same formatting, so
# they are drawn in a loop instead of three copies of the same code.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 5))
paneles = (
    (ax1, 'Mas altos', mas_altos),
    (ax2, 'Mas pesados', mas_pesados),
    (ax3, 'Menor BMI', bmi_mas_bajo),
)
for ax, titulo, serie in paneles:
    sns.barplot(ax=ax, x=serie.index, y=serie.values)
    ax.set_title(titulo)
    ax.set(xlabel=None)
    ax.set_xticklabels(serie.index, rotation=45, horizontalalignment='right')
plt.show()

# The female share is the complement of the male percentage.
df['percentage_female'] = df['percentage_male'].rsub(100)

df

pokemon_hembra = df['percentage_female'].mean()
pokemon_macho = df['percentage_male'].mean()

pokemon_hembra

# Pie chart of the two mean percentages.
fig = px.pie(
    names=['Femeninos', 'Masculinos'],
    values=[pokemon_hembra, pokemon_macho],
    color_discrete_sequence=['#479B55', '#FA0087'],
)
fig.update_layout(
    title='% de pokemon por sexo',
    font=dict(family='verdana', size=16, color='white'),
    template='plotly_dark',
    height=500,
    width=900,
    legend_bgcolor='rgb(117, 112, 179)',
)
fig.show()

df.query('attack > defense').head()

# Two points describing the diagonal attack == defense, from 0 to 200.
linea = pd.DataFrame({'x': [0, 200], 'y': [0, 200]})

linea

# Scatter of every Pokemon coloured by primary type, plus the red diagonal,
# merged into a single figure.
fig1 = px.scatter(
    df,
    x='defense',
    y='attack',
    color='type_1',
    hover_data=['name'],
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig2 = px.line(linea, x='x', y='y')
fig2.update_traces(line_color='red')

fig3 = go.Figure(data=(*fig1.data, *fig2.data))
fig3.update_layout(
    title='Pokemon por ataque/defensa',
    xaxis_title='Defensa',
    yaxis_title='Ataque',
    font=dict(family='verdana', size=16, color='white'),
    template='plotly_dark',
    height=500,
    width=900,
    legend_bgcolor='rgb(117, 112, 179)',
)
fig3.show()

!pip install statsmodels==0.13.2  # Deepnote requested this package in order to run the next two charts

# Keyword arguments shared by both attack/defense scatter plots.
comunes = dict(
    x='attack',
    y='defense',
    hover_data=['name'],
    marginal_x='box',
    marginal_y='box',
    opacity=0.8,
    trendline='ols',
)
# Layout shared by both figures.
estilo = dict(
    font=dict(family='verdana', size=16, color='white'),
    template='plotly_dark',
    height=500,
    width=900,
    legend_bgcolor='rgb(117, 112, 179)',
)

# One trend line fitted over all rows.
fig1 = px.scatter(df, color_discrete_sequence=['#FA0087'], **comunes)
fig1.update_layout(title='Grafica de ataque/defensa con linea de tendencia general', **estilo)
fig1.show()

# One trend line per primary type (points are coloured by type_1).
fig2 = px.scatter(df, color='type_1', color_discrete_sequence=px.colors.qualitative.Light24, **comunes)
fig2.update_layout(title='Grafica de ataque/defensa con linea de tendencia por tipo', **estilo)
fig2.show()

# Columns included in the correlation analysis.
columns_corr_high = [
    'hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed',
    'catch_rate', 'height_m', 'weight_kg', 'BMI',
]

# Pearson correlation matrix, shown as a colour-graded table.
correlaciones = df[columns_corr_high].corr(method='pearson')
correlaciones.style.background_gradient(cmap='coolwarm', axis=None)

# The same matrix as an interactive heatmap.
fig1 = px.imshow(correlaciones, color_continuous_scale='Portland')
fig1.update_layout(
    title='Heatmaps de correlacion de nuestro dataset por metodo Pearson',
    font=dict(family='verdana', size=16, color='white'),
    template='plotly_dark',
    height=750,
    width=900,
)
fig1.show()

# Correlation of every column with weight_kg, strongest first.
con_peso = correlaciones[['weight_kg']].sort_values(by='weight_kg', ascending=False)
con_peso.style.background_gradient(cmap='coolwarm', axis=None)

# Violin plot (with inner box and all points) of weight per primary type.
fig1 = px.violin(
    df,
    x='type_1',
    y='weight_kg',
    color='type_1',
    box=True,
    points="all",
    hover_data=df.columns,
    color_discrete_sequence=px.colors.qualitative.Light24,
)
fig1.update_layout(
    # Title typo fixed: "determionar" -> "determinar".
    title='Grafica para determinar efecto de la variable peso en el tipo de pokemon',
    font=dict(family='verdana', size=16, color='white'),
    template='plotly_dark',
    height=500,
    width=900,
    legend_bgcolor='rgb(117, 112, 179)',
)
fig1.show()