# libraries
import numpy as np
import pandas as pd
import seaborn as sns
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
# load the data
dataframe = pd.read_csv('avocado-updated-2020.csv')
dataframe.head()
# summary statistics of the dataframe
dataframe.describe()
print(dataframe.columns)  # column headers
print(dataframe.dtypes)   # data types
# data cleaning and preparation
# columns whose meaning is not clear are dropped
# the 'year' column is dropped because its format does not match the real data (it is re-derived from 'date' below)
df = dataframe.drop(['total_bags','small_bags','large_bags','xlarge_bags','year'], axis=1)
df['date'] = pd.to_datetime(df['date'], errors='coerce')  # convert the 'date' column from object to datetime
df['year'] = df['date'].dt.year    # extract the year
df['month'] = df['date'].dt.month  # extract the month
df['day'] = df['date'].dt.day      # extract the day
# encode the type and geography text columns as numeric dummy (one-hot) columns
tipo = pd.get_dummies(df['type'])
ubicacion = pd.get_dummies(df['geography'])
df2 = pd.concat((df, tipo, ubicacion), axis=1)
# drop the original text columns
df2 = df2.drop(['type', 'geography'], axis=1)
df2
print(df2.dtypes)   # data types
print(df2.columns)  # columns
# mean
media = df2['average_price'].mean()
# median
mediana = df2['average_price'].median()
print('mean = {} \nmedian = {}'.format(media, mediana))
# frequency histogram
df2['average_price'].plot.hist(bins=20)  # bins = number of intervals
# distribution plot (histogram); hue draws one histogram per avocado type
sns.displot(df2, x='average_price', hue='organic')
# 0 = conventional, 1 = organic
# standard deviation
desviacion_estandar = df2['average_price'].std()
# range = max value - min value
rango = df2['average_price'].max() - df2['average_price'].min()
# quartiles
mediana = df2['average_price'].median()
Q1 = df2['average_price'].quantile(q=0.25)  # 25% of the data fall below this value
Q3 = df2['average_price'].quantile(q=0.75)  # 75% of the data fall below this value
min_val = df2['average_price'].min()
max_val = df2['average_price'].max()
print('minimum = {min_val}\nquartile (Q1) = {Q1}\nmedian = {mediana}\nquartile (Q3) = {Q3}\nmaximum = {max_val}\n'.format(
    min_val=min_val,
    Q1=Q1,
    mediana=mediana,
    Q3=Q3,
    max_val=max_val))
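# As a cross-check (not part of the original analysis): describe() reports the same
# five-number summary plus the mean, standard deviation and count in a single call.
df2['average_price'].describe()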
# outlier detection limits (Tukey fences based on the interquartile range)
iqr = Q3 - Q1
minlimit = Q1 - 1.5 * iqr
maxlimit = Q3 + 1.5 * iqr
print(iqr, minlimit, maxlimit)
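# A minimal sketch (not in the original flow; df2 is left unchanged) showing how the
# 1.5*IQR limits computed above could be used to flag outlying prices.
outliers = df2[(df2['average_price'] < minlimit) | (df2['average_price'] > maxlimit)]
print(len(outliers), 'prices fall outside the IQR limits')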
sns.histplot(df2['average_price'])
# box plot
sns.boxplot(df2['average_price'])
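# Standardize the numeric features (zero mean, unit variance) so that the PCA below
# is not dominated by the columns with the largest raw scales.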
scaler = StandardScaler()
scaled = scaler.fit_transform(
df2[['total_volume', '4046', '4225', '4770', 'year',
'month', 'day']].values
)
# covariance matrix of the standardized features
covariance_matrix = np.cov(scaled.T)
covariance_matrix
# eigendecomposition: each eigenvalue measures the variance captured along its eigenvector
eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)
# percentage of the total variance explained by each component
variance_explained = []
for i in eigen_values:
    variance_explained.append((i / sum(eigen_values)) * 100)
print(variance_explained)
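# A small sketch (not in the original notebook): accumulate the same percentages,
# largest first, to decide how many principal components are worth keeping.
cumulative = np.cumsum(sorted(variance_explained, reverse=True))
print(cumulative)  # e.g. keep components until ~90-95% of the variance is covered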
from sklearn.decomposition import PCA
pca = PCA(n_components=7)  # keep all seven components of the standardized features
pca.fit(scaled)
pca.explained_variance_ratio_
reduced_scaled = pca.transform(scaled)
# add the principal-component scores back to the dataframe as new columns
df2['pca_1'] = reduced_scaled[:,0]
df2['pca_2'] = reduced_scaled[:,1]
df2['pca_3'] = reduced_scaled[:,2]
df2['pca_4'] = reduced_scaled[:,3]
df2['pca_5'] = reduced_scaled[:,4]
df2['pca_6'] = reduced_scaled[:,5]
df2['pca_7'] = reduced_scaled[:,6]
df2.head()
df2.columns
y = df2['average_price']
x = df2[[ 'conventional', 'organic', 'Albany', 'Atlanta',
'Baltimore/Washington', 'Boise', 'Boston', 'Buffalo/Rochester',
'California', 'Charlotte', 'Chicago', 'Cincinnati/Dayton', 'Columbus',
'Dallas/Ft. Worth', 'Denver', 'Detroit', 'Grand Rapids', 'Great Lakes',
'Harrisburg/Scranton', 'Hartford/Springfield', 'Houston',
'Indianapolis', 'Jacksonville', 'Las Vegas', 'Los Angeles',
'Louisville', 'Miami/Ft. Lauderdale', 'Midsouth', 'Nashville',
'New Orleans/Mobile', 'New York', 'Northeast', 'Northern New England',
'Orlando', 'Philadelphia', 'Phoenix/Tucson', 'Pittsburgh', 'Plains',
'Portland', 'Raleigh/Greensboro', 'Richmond/Norfolk', 'Roanoke',
'Sacramento', 'San Diego', 'San Francisco', 'Seattle', 'South Carolina',
'South Central', 'Southeast', 'Spokane', 'St. Louis', 'Syracuse',
'Tampa', 'Total U.S.', 'West', 'West Tex/New Mexico', 'pca_1', 'pca_2',
'pca_3', 'pca_4', 'pca_5','pca_6']]
# degree-2 polynomial expansion of the features
pf = PolynomialFeatures(degree=2)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
X_train_poli = pf.fit_transform(X_train)
X_test_poli = pf.transform(X_test)  # reuse the transformer fitted on the training set
lr_multiple = linear_model.LinearRegression()
lr_multiple.fit(X_train_poli, y_train)
Y_pred_multiple = lr_multiple.predict(X_test_poli)
print('MULTIPLE LINEAR REGRESSION MODEL (with degree-2 polynomial features)')
print()
print('Slopes / coefficients "a":')
print(lr_multiple.coef_)
print('Intercept "b":')
print(lr_multiple.intercept_)
print('Model score (R^2) on the training data:')
print(lr_multiple.score(X_train_poli, y_train))
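# A hedged follow-up (not in the original script): the score above is measured on the
# training data, so as a sanity check the same R^2 can also be reported on the held-out test set.
print('Model score (R^2) on the test data:')
print(lr_multiple.score(X_test_poli, y_test))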