# Load the inmuebles_bogota_res.csv dataset; the first CSV column becomes the index.
datos23 = pd.read_csv(
    'https://raw.githubusercontent.com/Wilsonsr/metodos-estadisticos/main/inmuebles_bogota_res.csv',
    index_col=0,
)
datos23
datos23.info()

# Distinct values of the bathroom-count and room-count columns.
datos23.mnrobanos.unique()
datos23.mnrocuartos.unique()

# Summary of the object-dtype columns only.
datos23.describe(include="object")

# Remove fully duplicated rows before modelling.
datos23 = datos23.drop_duplicates()
datos23

# Base-10 logarithm of the sale value, stored as a new column.
# (The original computed this identical assignment twice; once is enough.)
datos23["LOgVentas"] = np.log10(datos23.mvalorventa)
datos23
import pandas as pd

# Work on an independent copy: pd.DataFrame(frame) without copy=True can share
# the underlying data with `datos23`, so the columns added below could leak
# back into the original frame.
df2 = pd.DataFrame(datos23, copy=True)
df2

import sklearn
# Import label encoder
from sklearn import preprocessing

# Integer-encode the zone labels into a separate column.
label_encoder = preprocessing.LabelEncoder()
df2["Codificada2"] = label_encoder.fit_transform(df2["mzona"])
df2

# One-hot encode the zone. dtype=int keeps the dummies as 0/1 integers on every
# pandas version (pandas >= 2.0 would otherwise return booleans, which the
# formula API treats as categorical and which break purely numeric routines).
df2 = pd.get_dummies(df2, columns=["mzona"], dtype=int)
df2
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
df2.columns

# Scatter of the response (LOgVentas) against each candidate predictor.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and
# deprecated.
p = sns.pairplot(
    df2,
    x_vars=[
        "LOgmarea",
        "mzona_Occidental",
        "mnrogarajes",
        "mnrobanos",
        "mnrocuartos",
        "mzona_Centro",
        "mzona_Norte",
        "mzona_Noroccidente",
        "mzona_Sur",
    ],
    y_vars=["LOgVentas"],
    height=7,
    aspect=0.7,
)
import scipy.stats

# Critical F value: the 95th percentile (alpha = 0.05) of an F distribution
# with 18 numerator and 1546 denominator degrees of freedom.
scipy.stats.f(dfn=18, dfd=1546).ppf(1 - 0.05)
# Model 2: log sale value on raw area, garages, rooms and bathrooms.
modelo_multiple2 = smf.ols(
    formula="LOgVentas~ marea + mnrogarajes + mnrocuartos + mnrobanos ",
    data=df2,
).fit()
modelo_multiple2.summary()

# Model 3: log area, bathrooms, garages, rooms and six zone dummies.
# NOTE(review): if these six mzona_* columns are all of the zones, including
# every one of them together with the intercept is perfectly collinear — confirm.
modelo_multiple3 = smf.ols(
    formula=(
        "LOgVentas ~ LOgmarea +mnrobanos+ mnrogarajes + mnrocuartos "
        "+ mzona_Centro + mzona_Norte+mzona_Noroccidente+ mzona_Sur"
        "+ mzona_Occidental+ mzona_Chapinero"
    ),
    data=df2,
).fit()
modelo_multiple3.summary()

# Model 4: model 3 without bathrooms and without the Centro and Noroccidente dummies.
modelo_multiple4 = smf.ols(
    formula=(
        "LOgVentas ~ LOgmarea+ mnrogarajes + mnrocuartos "
        "+ mzona_Norte+ mzona_Sur+ mzona_Chapinero+ mzona_Occidental"
    ),
    data=df2,
).fit()
modelo_multiple4.summary()
# Residuals of model 4 and their mean.
residuales = modelo_multiple4.resid
residuales.mean()

# The ten most negative residuals.
residuales.sort_values().head(10)

# Inspect the rows labelled 719 and 6.
df2.loc[719, :]
df2.loc[[719, 6], :]
import matplotlib.pyplot as plt
# Residuals against their index label, with a horizontal reference line at zero.
# A single subplots() call provides the axes, and axhline spans the whole x
# range without building an array from the (possibly non-integer) index.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(x=residuales.index, y=residuales, alpha=0.5)
ax.axhline(0, color='darkorange', linestyle='-')
ax.set_title('Residuales')
plt.show()
%matplotlib inline
%config InlineBackend.figure_format ='retina'
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.stats.api as sms
# Global plotting defaults: seaborn dark grid and a 15 x 9 inch default figure.
sns.set_style('darkgrid')
sns.mpl.rcParams.update({'figure.figsize': (15.0, 9.0)})
def homoscedasticity_test(model):
    """Check the residuals of a fitted linear regression for constant variance.

    Draws two side-by-side panels (residuals vs. fitted values, and the square
    root of the absolute internally studentized residuals vs. fitted values),
    each with a LOWESS trend line, then prints the Breusch-Pagan and
    Goldfeld-Quandt test results.

    Args:
        model: fitted OLS model from statsmodels.
    """
    predicted = model.predict()
    errors = model.resid
    studentized = model.get_influence().resid_studentized_internal

    fig, (left, right) = plt.subplots(1, 2)

    # Left panel: raw residuals against fitted values.
    sns.regplot(
        x=predicted,
        y=errors,
        lowess=True,
        ax=left,
        line_kws={'color': 'red'},
    )
    left.set_title('Residuals vs Fitted', fontsize=16)
    left.set(xlabel='Fitted Values', ylabel='Residuals')

    # Right panel: scale-location plot.
    sns.regplot(
        x=predicted,
        y=np.sqrt(np.abs(studentized)),
        lowess=True,
        ax=right,
        line_kws={'color': 'red'},
    )
    right.set_title('Scale-Location', fontsize=16)
    right.set(xlabel='Fitted Values', ylabel='sqrt(abs(Residuals))')

    design = model.model.exog

    bp_values = sms.het_breuschpagan(errors, design)
    bp_test = pd.DataFrame(
        bp_values,
        columns=['value'],
        index=['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value'],
    )

    # The last element of the Goldfeld-Quandt result is dropped; only the
    # statistic and its p-value are tabulated.
    gq_values = sms.het_goldfeldquandt(errors, design)[:-1]
    gq_test = pd.DataFrame(
        gq_values,
        columns=['value'],
        index=['F statistic', 'p-value'],
    )

    for heading, table in (
        ('\n Breusch-Pagan test ----', bp_test),
        ('\n Goldfeld-Quandt test ----', gq_test),
    ):
        print(heading)
        print(table)
    print('\n Residuals plots ----')
# NOTE(review): this runs the diagnostics on modelo_multiple3, whereas
# `residuales` (used for the histogram below) comes from modelo_multiple4 —
# confirm which model is intended.
homoscedasticity_test(modelo_multiple3)

# Histogram of the residuals with a KDE overlay. sns.distplot is deprecated;
# histplot with stat="density" reproduces its density-scaled histogram.
p = sns.histplot(residuales, kde=True, stat="density")
p = plt.title('Normality of error terms/residuals')
import statsmodels.api as sm
# Q-Q plot of the residuals
# ==============================================================================
fig, ax = plt.subplots(figsize=(7, 4))
# fit=True and line='q' are passed to qqplot; alpha and lw are forwarded as
# plotting keyword arguments.
sm.qqplot(residuales, fit=True, line='q', alpha=0.4, lw=2, ax=ax)
ax.set_title('Gráfico Q-Q Residuales', fontsize=10, fontweight="bold")
ax.tick_params(labelsize=7)
from scipy import stats
# Shapiro-Wilk test
# ==============================================================================
shapiro_test = stats.shapiro(residuales)
shapiro_test

# Kolmogorov-Smirnov test against a normal distribution with the residuals' own
# mean and standard deviation. Comparing the raw residuals with the standard
# normal N(0, 1) would reject merely because their scale is not 1.
# NOTE: because the parameters are estimated from the same sample, the reported
# p-value is only approximate (conservative).
KM = stats.ks_1samp(
    residuales,
    stats.norm.cdf,
    args=(residuales.mean(), residuales.std(ddof=1)),
)
KM
import statsmodels.tsa.api as smt
# Autocorrelation plot of the residuals: 40 lags, alpha=0.05 for the confidence band.
acf = smt.graphics.plot_acf(
    x=residuales,
    alpha=0.05,
    lags=40,
)
acf.show()
df2

# Correlation heatmap. Only numeric/boolean columns are correlated: on
# pandas >= 2.0, DataFrame.corr() raises if any column holds strings.
plt.figure(figsize=(25, 25))
p = sns.heatmap(
    df2.select_dtypes(include=["number", "bool"]).corr(),
    annot=True,
    square=True,
)

# Call describe() — the bare attribute only displays the bound method.
df2.describe()
# Variance inflation factor: VIF = 1 / (1 - R^2).
# The original `(1/1-0.78)` evaluated as (1/1) - 0.78 = 0.22 because division
# binds tighter than subtraction; the denominator must be parenthesised.
# NOTE(review): this assumes 0.78 and 0.61 are R^2 values; if they are plain
# correlation coefficients they should be squared first — confirm.
VIf_area = 1 / (1 - 0.78)
VIf_garajes = 1 / (1 - 0.61)
VIf_garajes, VIf_area
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Predictors whose collinearity is being checked.
X = df2[['mnrogarajes', 'marea', 'mzona_Norte', 'mnrocuartos', 'mzona_Sur',
         'mzona_Centro', 'mzona_Chapinero', "mzona_Occidental"]]

# variance_inflation_factor regresses each column on all the others and does
# not add an intercept itself, so the design matrix needs an explicit constant;
# without it the VIFs are based on uncentred R^2 and are inflated.
# Casting to float also copes with boolean dummy columns.
X_design = add_constant(X.astype(float), has_constant="add")
design_values = X_design.values

# VIF dataframe (the constant itself is not reported)
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [
    variance_inflation_factor(design_values, X_design.columns.get_loc(col))
    for col in X.columns
]
print(vif_data)
Creamos una tabla con los valores de las características para predecir con el modelo seleccionado, modelo_multiple4.
LOgVentas ~ LOgmarea + mnrogarajes + mnrocuartos