import numpy as np

# np.random.normal draws samples from a Gaussian (normal) distribution.
# Its arguments are: the mean of the values, the standard deviation
# (the width of the distribution), and the number of samples to draw.
vector = np.random.normal(0.0, 1.0, 100)
print(vector)
[-0.89385224 -0.71242315 -0.66415388 -0.23318502 -0.42476325 -0.57853583
0.13304424 1.97059121 0.88517244 0.81111603 1.73277407 -0.52247641
1.29609177 -0.30730706 -0.41288624 0.280289 -0.67698037 0.20992178
0.67617206 0.78337695 1.72457566 0.20513955 1.00629615 1.29755871
-0.96283742 -0.01931545 0.27152081 0.05751077 2.32180864 1.55290908
-2.07786128 -1.10119609 0.90785671 -1.67369697 1.01518037 1.63747582
0.74118659 -0.22612085 -0.67941925 -0.60881657 -1.41966784 0.51665574
0.1318715 0.85991152 1.05093903 -0.12921382 1.60897648 0.23738756
1.61343032 0.92745436 -0.27776464 -0.54896702 0.68062772 0.25356839
-0.31967813 0.03755295 -0.27085572 -1.37434335 1.35296455 -0.90357417
1.40050337 -1.27430763 -0.4688332 0.01285907 -0.79192215 0.79437669
1.04179521 1.3297819 1.10847176 0.23086931 -0.11903196 -0.74957627
-2.1232936 -0.49599166 0.0595764 0.48160452 -0.86227876 1.63133594
1.2107841 0.27676407 0.32381802 -0.81725664 1.06798806 -0.28855058
-0.77255143 0.49354495 -1.63412872 1.00473733 -1.21409072 -0.21430972
1.39427824 -0.15744584 -0.79118041 0.12101523 -0.63323703 0.02752219
0.49768619 1.18037512 -0.60134508 0.12845829]
#Cargamos los vectores Et que contienen los valores aleatorios
vectEt1 = np.random.normal(0.0,1.0,100)
vectEt2 = np.random.normal(0.0,1.0,100)
#Creamos los vectores, en este caso, vacios
vectorX1= np.zeros([100,1])
vectorX2= np.zeros([100,1])
#Solo el primer valor de cada vector es cero
vectorX1[0],vectorX2[0] = 0,0
#Se realiza la fórmula
for i in range(1,100):
vectorX1[i] = 0.5 * vectorX1[i-1] + vectEt1[i]
vectorX2[i] = 0.8 * vectorX2[i-1] + vectEt2[i]
#Se procede a imprimir los vectores para verificar que hayan salido bien
print("Vector X1")
print(vectorX1)
print("Vector X2")
print(vectorX2)
Vector X1
[[ 0.00000000e+00]
[ 5.87063898e-02]
[ 7.71320765e-02]
[ 7.41537193e-01]
[ 1.95589779e-01]
[ 1.17745182e+00]
[-6.48975701e-01]
[-6.21007009e-01]
[-7.57549816e-01]
[-5.11426262e-01]
[-1.42599805e+00]
[-1.12904257e+00]
[-5.15822787e-01]
[-7.01561059e-01]
[ 1.64026822e+00]
[ 2.14381397e+00]
[ 7.37407177e-01]
[ 9.42031883e-01]
[ 2.33253152e-01]
[ 9.78993421e-01]
[ 3.46504029e-01]
[-9.93397573e-01]
[-7.93761925e-01]
[ 2.55895859e-01]
[ 1.59011539e+00]
[ 2.24865065e+00]
[ 1.22458983e+00]
[ 4.16035412e-01]
[ 1.46745869e+00]
[-2.48293264e-01]
[-3.77492412e-01]
[-2.45528667e-01]
[-4.33047872e-01]
[-9.70736907e-01]
[-3.13298924e+00]
[-2.05911398e+00]
[ 6.51621393e-01]
[ 2.28911337e+00]
[ 1.08546036e+00]
[-2.93825261e-02]
[-3.70784950e-01]
[ 1.38341365e+00]
[-3.37347895e-02]
[-4.78158690e-01]
[ 6.10424668e-01]
[-2.81689348e-01]
[-7.44392724e-01]
[-1.42027229e+00]
[ 5.54589142e-02]
[-1.18464300e-01]
[-2.22967300e+00]
[-2.09246318e+00]
[-9.32713217e-01]
[-2.63278966e+00]
[-9.72358739e-02]
[ 2.39027769e-01]
[-2.21728484e+00]
[ 2.58099262e-01]
[ 2.14980346e-01]
[-3.55488955e-01]
[ 2.37726753e+00]
[ 8.39289617e-01]
[ 1.27165181e+00]
[ 9.52612537e-01]
[-1.34197338e-01]
[-6.49603378e-02]
[ 1.64947494e-01]
[-6.25687844e-01]
[ 1.40857590e-03]
[ 1.21952712e+00]
[ 1.25349693e-01]
[ 2.24487813e-01]
[-7.06910076e-03]
[ 8.78896867e-01]
[-1.54376173e-01]
[ 5.26632812e-02]
[ 1.10279158e+00]
[ 1.77976033e+00]
[ 5.26383404e-01]
[-1.26126349e+00]
[ 9.29264639e-01]
[ 1.03999012e+00]
[-7.88286000e-01]
[-1.29569010e+00]
[ 6.97174451e-01]
[-1.67776537e+00]
[-1.75161863e+00]
[-1.59070817e+00]
[-2.33050109e+00]
[-1.03136769e+00]
[-2.80385585e+00]
[-2.81480376e+00]
[-6.22597295e-01]
[ 8.81674913e-01]
[ 1.95338795e+00]
[-1.92930107e-01]
[-2.25835714e-02]
[ 1.15959352e+00]
[-1.19743484e+00]
[ 4.92635592e-02]]
Vector X2
[[ 0.00000000e+00]
[ 7.14944275e-01]
[ 3.05763273e-01]
[ 5.99884006e-01]
[-5.27940529e-01]
[-4.95355502e-01]
[-1.23256883e+00]
[-8.63572125e-01]
[ 5.13079838e-01]
[ 5.92761596e-01]
[ 1.20997410e+00]
[ 7.86195724e-01]
[ 2.21768498e-01]
[ 1.07099871e+00]
[ 1.13904039e+00]
[ 8.61199637e-01]
[-5.13190220e-01]
[-1.97394417e+00]
[-1.51013067e+00]
[ 7.59469719e-01]
[ 1.90780105e-01]
[ 4.10687980e-01]
[-8.27248979e-01]
[-3.71219292e-01]
[-1.45750529e+00]
[-1.44772483e+00]
[-1.10603370e+00]
[-2.42138098e+00]
[-6.79045699e-01]
[ 3.53371483e-01]
[-7.05851403e-01]
[-6.24686371e-01]
[-2.47242923e+00]
[-3.83140384e+00]
[-3.83607900e+00]
[-1.22437452e+00]
[-6.75332704e-01]
[-8.81629637e-01]
[-1.63115387e+00]
[-1.88760878e+00]
[-2.63436505e+00]
[-4.73866934e-01]
[ 2.15596205e+00]
[ 1.84543869e+00]
[ 1.84591642e+00]
[ 2.70386140e-01]
[ 9.90248997e-01]
[-1.56107504e+00]
[-1.92671553e+00]
[-1.99793375e+00]
[-1.43602638e+00]
[-2.04206961e+00]
[-3.60507042e+00]
[-3.70932861e+00]
[-2.46888274e+00]
[-1.11338799e+00]
[ 6.70886832e-01]
[-3.27273034e-01]
[ 1.12974468e+00]
[ 6.88809663e-01]
[ 1.84662963e+00]
[ 1.10950774e+00]
[ 1.38709274e+00]
[ 9.48026400e-02]
[ 2.52904005e-01]
[ 1.18447027e+00]
[ 1.46789106e+00]
[ 1.51019068e+00]
[ 2.88859884e+00]
[ 7.41571511e-01]
[-4.91289392e-01]
[-2.27067058e+00]
[-2.02079801e+00]
[ 8.79397709e-02]
[-9.11475958e-01]
[-8.29504568e-01]
[-1.62662183e+00]
[-2.30912422e+00]
[-4.41958881e+00]
[-4.13349767e+00]
[-2.41852665e+00]
[-2.34137730e+00]
[-9.76257108e-01]
[-1.26205136e+00]
[-2.43571686e-01]
[-2.51969042e+00]
[-1.51628096e+00]
[-2.00470992e+00]
[-5.59194008e-02]
[-1.42722689e-01]
[-1.18313591e+00]
[-2.10695011e+00]
[-2.11995033e+00]
[-1.50538195e+00]
[-3.27715993e-01]
[ 1.23213278e-03]
[ 6.37199713e-01]
[ 1.18541945e+00]
[-3.66210385e-01]
[-6.49238291e-01]]
# Same procedure as before: Y combines X1 and X2 plus a fresh noise
# vector Vt, row by row.
vectVt = np.random.normal(0.0, 1.0, 100)
vectorY = np.zeros([100, 1])
for i in range(100):
    # y[i] = 2*x1[i] + 3*x2[i] + v[i]
    vectorY[i] = 2 * vectorX1[i] + 3 * vectorX2[i] + vectVt[i]
print(vectorY)
[[ 0.20192574]
[ 1.68984169]
[ 1.03472749]
[ 4.88488164]
[ -1.46407245]
[ 0.41697382]
[ -6.21810613]
[ -3.81097002]
[ 0.12935662]
[ -0.17349567]
[ 0.73816392]
[ 0.21894142]
[ 1.17340666]
[ 2.19451605]
[ 6.45618477]
[ 8.28647401]
[ -0.15700518]
[ -2.90365836]
[ -5.34636272]
[ 5.06870925]
[ 0.98491292]
[ 0.18405376]
[ -4.03484914]
[ -0.96484507]
[ -1.25115296]
[ -0.96386802]
[ -1.63218066]
[ -7.10609873]
[ -0.15954244]
[ -0.29677929]
[ -2.04007405]
[ -2.54643784]
[-10.00915174]
[-11.99569653]
[-19.85722499]
[ -7.8643731 ]
[ 0.40486161]
[ 1.42297063]
[ -3.26181359]
[ -5.03887533]
[ -9.16061184]
[ 1.4713664 ]
[ 5.64427463]
[ 3.8466475 ]
[ 7.32247508]
[ 0.56300612]
[ 1.33442403]
[ -7.77643394]
[ -6.37711325]
[ -6.11308638]
[ -7.61691652]
[ -9.24473862]
[-11.35337915]
[-15.26272857]
[ -5.89282552]
[ -4.47949558]
[ -0.49143284]
[ 0.42529422]
[ 4.98806454]
[ 1.63407078]
[ 9.27537988]
[ 5.03161247]
[ 5.43652325]
[ 2.4881261 ]
[ 0.05181207]
[ 2.65268904]
[ 4.60176754]
[ 4.05387491]
[ 7.8110873 ]
[ 4.8862235 ]
[ -2.55279808]
[ -7.62768994]
[ -6.52789996]
[ 1.99616254]
[ -2.21580226]
[ -3.13915151]
[ -1.77271295]
[ -4.56118244]
[-12.32201956]
[-15.62308929]
[ -6.22195328]
[ -4.89134806]
[ -4.13641422]
[ -5.72105463]
[ 0.60570288]
[-10.46578539]
[ -7.29063033]
[ -9.89632497]
[ -5.30267626]
[ -3.34637073]
[ -8.95296386]
[-11.5456506 ]
[ -7.07848527]
[ -3.04079616]
[ 3.14697141]
[ 1.140591 ]
[ 1.92618695]
[ 6.02666033]
[ -4.3179347 ]
[ -1.55218448]]
# Stack the two column vectors side by side into a 100x2 design matrix.
xtrain = np.hstack((vectorX1, vectorX2))
ytrain = vectorY
def ols_estimator(X, y):
    """Ordinary least squares: solve y ~ [X | 1] w.

    A column of ones is appended to X so the model has an intercept;
    the returned vector therefore holds one slope coefficient per
    column of X followed by the intercept in the last position.
    """
    design = np.column_stack((X, np.ones(X.shape[0])))
    # lstsq handles the normal equations internally, so no explicit
    # transpose or inversion of the design matrix is required.
    solution, *_ = np.linalg.lstsq(design, y, rcond=None)
    return solution
# Estimate the OLS coefficients for the simulated data; b holds
# [slope_X1, slope_X2, intercept] (the ones column is appended last
# inside ols_estimator).
b = ols_estimator(xtrain,ytrain)
# Only the two slope coefficients are reported here.
print(f"Los coeficientes b de la ecuacion 'y = Xb + v' son {b[0]} y {b[1]}")
Los coeficientes b de la ecuacion 'y = Xb + v' son [1.87707921] y [3.03819255]
from sklearn.model_selection import train_test_split

# Split into training and test data; 20% of the samples are held out.
X_train, X_test, y_train, y_test = train_test_split(xtrain, ytrain, test_size=0.2)

import sklearn.linear_model

# Fit an ordinary least-squares regression model on the training split.
regresion = sklearn.linear_model.LinearRegression()
regresion.fit(X_train, y_train)

# The prediction itself is secondary here; the fitted coefficients are
# what we want to compare against the manual OLS estimate.
Y_pred_multiple = regresion.predict(X_test)
print(f"Los coeficientes b de la ecuacion 'y = Xb + v' son {regresion.coef_}")
Los coeficientes b de la ecuacion 'y = Xb + v' son [[1.98335644 3.0382466 ]]
import pandas as pd
import os

# Collect the names of the dataset files inside ./content/.
file = [f for f in os.listdir('./content/')]
file
# Inspect where the datasets start.
file[1]
# Load the first file and tag its rows with the year encoded in the
# file name (e.g. "...2014.csv" -> 2014).
frame = pd.read_csv('./content/'+file[0],sep=',')
frame['YEAR'] = int(file[0][-8:-4])
# Append the remaining files into the same DataFrame.
for i in file[1:7]:
    aux = pd.read_table('./content/'+i,sep=',')
    # BUG FIX: cast the year to int.  The original stored a string
    # here while the first file got an int, so YEAR held mixed types
    # and later numeric filters (e.g. YEAR == 2018) silently missed
    # every row loaded in this loop.
    aux['YEAR'] = int(i[-8:-4])
    # DataFrame.append was deprecated and removed in pandas 2.x;
    # pd.concat is the supported equivalent.
    frame = pd.concat([frame, aux])
# Show the combined frame.
print(frame)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3186: DtypeWarning: Columns (1,2) have mixed types.Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3186: DtypeWarning: Columns (5,6,7) have mixed types.Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
RESIDENCIA UBIGEO_RENIEC UBIGEO_INEI CONTINENTE PAIS \
0 Nacional 10101 10101 América Perú
1 Nacional 10101 10101 América Perú
2 Nacional 10101 10101 América Perú
3 Nacional 10101 10101 América Perú
4 Nacional 10101 10101 América Perú
... ... ... ... ... ...
420838 Extranjero 10000 NaN Oceanía Samoa Americana
420839 Extranjero 10000 NaN Oceanía Samoa Americana
420840 Extranjero 10000 NaN Oceanía Samoa Americana
420841 Extranjero 10000 NaN Oceanía Palau
420842 Extranjero 10000 NaN Oceanía República de Fiyi
DEPARTAMENTO PROVINCIA DISTRITO SEXO EDAD CANTIDAD YEAR
0 Amazonas Chachapoyas Chachapoyas Hombre 0 315 2014
1 Amazonas Chachapoyas Chachapoyas Hombre 1 337 2014
2 Amazonas Chachapoyas Chachapoyas Hombre 2 311 2014
3 Amazonas Chachapoyas Chachapoyas Hombre 3 302 2014
4 Amazonas Chachapoyas Chachapoyas Hombre 4 276 2014
... ... ... ... ... ... ... ...
420838 NaN NaN NaN Hombre 58 1 2019
420839 NaN NaN NaN Hombre 63 1 2019
420840 NaN NaN NaN Mujer 47 1 2019
420841 NaN NaN NaN Hombre 28 1 2019
420842 NaN NaN NaN Mujer 45 1 2019
[2726214 rows x 12 columns]
# Keep only the columns needed for this question; .copy() so the
# assignments below do not write into a view of frame
# (SettingWithCopyWarning).
dat = frame[['RESIDENCIA','SEXO','YEAR','CANTIDAD']].copy()
# BUG FIX: YEAR may hold strings for the files appended after the
# first one; normalize to int so the equality filters below match
# every row instead of only the first file's rows.
dat['YEAR'] = dat['YEAR'].astype(int)
# Filter: only people whose residence is abroad.
dat = dat[dat['RESIDENCIA']=='Extranjero']
# Filter: only men.
dat_masc = dat[dat['SEXO']=='Hombre']
# Filters for the years 2018 and 2019.
dat_masc_2018 = dat_masc[dat_masc['YEAR']== 2018]
dat_masc_2019 = dat_masc[dat_masc['YEAR']== 2019]
# Concatenate both year subsets into one frame.
dat_masc_2018_2019 = pd.concat([dat_masc_2018,dat_masc_2019])
# CANTIDAD comes from a text dataset, so cast before summing.
dat_masc_2018_2019['CANTIDAD'] = dat_masc_2018_2019['CANTIDAD'].astype(int)
sum(dat_masc_2018_2019['CANTIDAD'])
# Normalize dtypes: files appended after the first were read as text.
frame['YEAR'] = frame['YEAR'].astype(int)
frame['EDAD'] = frame['EDAD'].astype(int)
frame['CANTIDAD'] = frame['CANTIDAD'].astype(int)
# Attributes needed for this question.
data2 = frame[['DEPARTAMENTO','SEXO','YEAR','EDAD','CANTIDAD']]
# Filters: women from Arequipa, years 2016-2018.
data_areq = data2[data2['DEPARTAMENTO']=='Arequipa']
data_areq_women = data_areq[data_areq['SEXO']=='Mujer']
data_areq_women = data_areq_women[data_areq_women['YEAR'] <= 2018]
data_areq_women = data_areq_women[data_areq_women['YEAR'] >= 2016]
# BUG FIX: the original overwrote the "EDAD >= 10" subset with a fresh
# "EDAD <= 19" filter taken from the full women set, so the lower age
# bound was never applied.  Both bounds must hold for ages 10-19.
data_areq_young_women = data_areq_women[
    (data_areq_women['EDAD'] >= 10) & (data_areq_women['EDAD'] <= 19)
]
# Total count of women in Arequipa.
women_areq = sum(data_areq_women['CANTIDAD'])
# Total count of young (10-19) women in Arequipa.
teenagers_areq = sum(data_areq_young_women['CANTIDAD'])
print(f"% de adolecentes mujeres en arequipa en relacion a cant. de mujeres: {teenagers_areq/women_areq*100}%")
% de adolecentes mujeres en arequipa en relacion a cant. de mujeres: 30.659709627397945%
data_adults = frame[['YEAR','EDAD','CANTIDAD']]
data_major_adults = data_adults[data_adults['EDAD'] >= 60]
#se crea un nuevo atributo multiplicando los atributos edad y cantidad
data_major_adults['EDAD_POR_CANTIDAD'] = data_major_adults['EDAD']*data_major_adults['CANTIDAD']
#Se agrupa la info por año.
adults_group = data_major_adults.groupby(['YEAR'],as_index=False).sum()
#se crea un nuevo atributo dividiento edad por cantidad, entre cantidad
adults_group['PROM_EDAD'] = adults_group['EDAD_POR_CANTIDAD']/adults_group['CANTIDAD']
#Se ordena de acuerdo al mayor promedio de edad
adults_group.sort_values(by=['PROM_EDAD'],inplace=True, ascending=False)
print(adults_group[['YEAR','PROM_EDAD']])
YEAR PROM_EDAD
4 2018 70.588833
5 2019 70.584048
3 2017 70.565418
6 2020 70.541285
2 2016 70.526283
1 2015 70.485849
0 2014 70.465538
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
after removing the cwd from sys.path.
# BUG FIX: the computation above filtered EDAD >= 60, but the original
# message claimed 65; the reported age threshold now matches the code.
print("El mayor promedio de edad de personas adultas mayores a 60 años es del 2018")
El mayor promedio de edad de personas adultas mayores a 65 años es del 2018
# Same procedure as before, grouping men by province.
data4 = frame[['SEXO','EDAD','CANTIDAD','PROVINCIA']]
# FIX: .copy() so the EDAD_POR_CANTIDAD assignment below writes into
# an independent frame instead of a view of data4
# (SettingWithCopyWarning; the write can silently fail on a view).
males = data4[data4['SEXO'] == 'Hombre'].copy()
# Weighted-age numerator: age times the number of people of that age.
males['EDAD_POR_CANTIDAD'] = males['EDAD']*males['CANTIDAD']
males_group = males.groupby(['PROVINCIA'],as_index=False).sum()
# Weighted mean age per province.
males_group['PROM_EDAD'] = males_group['EDAD_POR_CANTIDAD']/males_group['CANTIDAD']
males_group.sort_values(by=['PROM_EDAD'],inplace=True, ascending=False)
males_group[['PROVINCIA','PROM_EDAD']].head()
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
after removing the cwd from sys.path.
PROVINCIAobject
PROM_EDADfloat64
0
-
41.82878088
77
General Sánchez Cerro
40.99518064
187
Tarata
40.2765677
35
Candarave
39.67347385
135
Ocros
38.57218234
# Same procedure again, now grouping women by province.
# FIX: .copy() so the column assignment below does not write into a
# view of data4 (SettingWithCopyWarning).
females = data4[data4['SEXO'] == 'Mujer'].copy()
# Weighted-age numerator: age times the number of people of that age.
females['EDAD_POR_CANTIDAD'] = females['EDAD']*females['CANTIDAD']
females_group = females.groupby(['PROVINCIA'],as_index=False).sum()
# Weighted mean age per province.
females_group['PROM_EDAD'] = females_group['EDAD_POR_CANTIDAD']/females_group['CANTIDAD']
females_group.sort_values(by=['PROM_EDAD'],inplace=True, ascending=False)
females_group[['PROVINCIA','PROM_EDAD']].head()
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
This is separate from the ipykernel package so we can avoid doing imports until
PROVINCIAobject
PROM_EDADfloat64
0
-
42.76880441
187
Tarata
40.89236374
77
General Sánchez Cerro
40.7635862
35
Candarave
39.95143929
135
Ocros
38.70039095
import seaborn as sns
import matplotlib.pyplot as plt

# Mean age per sex, ordered from highest to lowest average.
data_adults = frame[['SEXO','EDAD']]
groups_adults = data_adults.groupby(['SEXO'], as_index=False).mean()
groups_adults = groups_adults.sort_values(by=['EDAD'], ascending=False)
groups_adults
SEXOobject
EDADfloat64
1
Mujer
47.02549187
0
Hombre
46.27154824
# Box plot of age by sex.
plt.figure(figsize=(13, 13))
sns.boxplot(x='SEXO', y='EDAD', data=frame, palette='flare')

# Age histogram restricted to women...
framefemale = frame[frame['SEXO'] == 'Mujer']
framefemale['EDAD'].hist(color='plum', bins=30, figsize=(10, 5))

# ...and the same histogram restricted to men.
framemale = frame[frame['SEXO'] == 'Hombre']
framemale['EDAD'].hist(color='sandybrown', bins=30, figsize=(10, 5))
from scipy import stats

# One-way ANOVA: does the mean age differ between men and women?
adults = frame[['SEXO','EDAD']]
group_adult = adults.groupby('SEXO')
anova = stats.f_oneway(
    group_adult.get_group('Hombre')['EDAD'],
    group_adult.get_group('Mujer')['EDAD'],
)
anova