Primero un repaso de pandas
import pandas as pd
# Build a small demo DataFrame from a dict that maps column name -> values.
data = {
    'Nombre': ['Juan', 'Jose', 'Valeria', 'Agustina'],
    'Edad': [25, 31, 27, 25],
    'País': ['Paraguay', 'Argentina', 'Brazil', 'Portugal'],
}
df = pd.DataFrame(data)
df
# Print two column subsets; a list of labels selects several columns at once.
print(df.loc[:, ['Nombre', 'País']])
print(df.loc[:, ['Nombre', 'Edad']])
# Load the songs CSV into a DataFrame; this rebinds `data`.
data = pd.read_csv('/work/canciones-2018.csv')
data.head()  # head() defaults to the first 5 rows
# Take the `artists` column and look up the entry with index label 5.
artista = data['artists']
artista[5]
# Row at integer position 15, returned as a Series.
info = data.iloc[15]
info
data.tail()
data.shape
data.columns
# Summary statistics for the `tempo` column.
data['tempo'].describe()
# Rows ordered by index label, highest first (axis 0 is the default).
# The sorted frame is not assigned, so `data` itself is unchanged.
data.sort_index(ascending=False)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Load the salary dataset used to fit the regression model.
dataset = pd.read_csv('/work/salarios.csv')
dataset.head(5)
dataset.shape
# Features: every column except the last. Target: the last column.
# The target is selected with -1 rather than a hard-coded position, so the
# feature/target split stays disjoint however many columns the file has.
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
# Training needs four arrays: train/test features and train/test targets.
# test_size=0.2 holds out 20% of the rows; random_state=0 makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=0)
print(X_train)
print(X_test)
# Fit an ordinary least-squares linear model on the training split.
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
Datos de Entrenamiento
# Training split: scatter the observed points and overlay the fitted line.
viz_train = plt  # alias for the pyplot module
# Title and axis labels go on the same current axes the plots draw on.
viz_train.title('Salario vs Experiencia')
viz_train.xlabel('Experiencia')
viz_train.ylabel('Salario')
viz_train.scatter(X_train, Y_train, color='blue')
# Model predictions for the training inputs, drawn as the regression line.
viz_train.plot(X_train, regressor.predict(X_train), color='black')
viz_train.show()
Datos de Prueba
# Test split: scatter the held-out points against the fitted line.
viz_train = plt  # alias for the pyplot module
# Title and axis labels go on the same current axes the plots draw on.
viz_train.title('Salario vs Experiencia')
viz_train.xlabel('Experiencia')
viz_train.ylabel('Salario')
viz_train.scatter(X_test, Y_test, color='red')
# The line is drawn from the model's predictions on the training inputs.
viz_train.plot(X_train, regressor.predict(X_train), color='black')
viz_train.show()
Revisamos la precisión del modelo
# Score the fitted model on the held-out test split. For LinearRegression,
# score() returns the coefficient of determination (R^2), not a classification accuracy.
regressor.score(X_test, Y_test)
Agrego la variable país
# Load the country table and attach its code column to the salary data.
data2 = pd.read_csv('/work/paises.csv')
data2.head()  # head() defaults to the first 5 rows
# Assigning a Series as a column aligns it on the row index.
dataset['Cod_Pais'] = data2['Cod_Pais']
dataset.head()
# Reorder the columns so both predictors come first and `Salario` is last.
dataset = dataset.loc[:, ['Aexperiencia', 'Cod_Pais', 'Salario']]
dataset.head()
dataset.shape
# Predictors are the first two columns; the target is the last one.
x = dataset[['Aexperiencia', 'Cod_Pais']].values
y = dataset['Salario'].values
# Training needs four arrays: train/test features and train/test targets.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=0)
# fit() returns the estimator itself, so construction and fitting can be chained.
regressor = LinearRegression().fit(X_train, Y_train)
# R^2 of the two-predictor model on the held-out test split.
regressor.score(X_test, Y_test)
Observamos que la precisión del modelo aumentó al agregar una variable adicional.