import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
!pip install -r requirements.txt
Collecting xgboost
Downloading xgboost-1.7.4-py3-none-manylinux2014_x86_64.whl (193.6 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 193.6/193.6 MB 12.5 MB/s eta 0:00:00
Requirement already satisfied: scipy in /shared-libs/python3.9/py/lib/python3.9/site-packages (from xgboost->-r requirements.txt (line 1)) (1.9.3)
Requirement already satisfied: numpy in /shared-libs/python3.9/py/lib/python3.9/site-packages (from xgboost->-r requirements.txt (line 1)) (1.23.4)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.4
WARNING: You are using pip version 22.0.4; however, version 23.0.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# Load the training data, keeping the house Id as the DataFrame index.
train_data = pd.read_csv('./train.csv', index_col='Id')

# Drop every column that contains at least one missing value so the
# downstream pipeline needs no imputation.
# NOTE(review): this discards potentially informative columns; an imputer
# would retain them — confirm the trade-off is intentional.
train_data = train_data.dropna(axis=1)

# Separate the target from the predictors.
X = train_data.drop('SalePrice', axis=1)
y = train_data['SalePrice']

# Hold out 20% of the rows for validation; fixed seed for reproducibility.
# (train_size=0.8 already implies the 0.2 test split, so it is stated once.)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, random_state=0)

print(X.head())
MSSubClass MSZoning LotArea Street LotShape LandContour Utilities \
Id
1 60 RL 8450 Pave Reg Lvl AllPub
2 20 RL 9600 Pave Reg Lvl AllPub
3 60 RL 11250 Pave IR1 Lvl AllPub
4 70 RL 9550 Pave IR1 Lvl AllPub
5 60 RL 14260 Pave IR1 Lvl AllPub
LotConfig LandSlope Neighborhood ... OpenPorchSF EnclosedPorch 3SsnPorch \
Id ...
1 Inside Gtl CollgCr ... 61 0 0
2 FR2 Gtl Veenker ... 0 0 0
3 Inside Gtl CollgCr ... 42 0 0
4 Corner Gtl Crawfor ... 35 272 0
5 FR2 Gtl NoRidge ... 84 0 0
ScreenPorch PoolArea MiscVal MoSold YrSold SaleType SaleCondition
Id
1 0 0 0 2 2008 WD Normal
2 0 0 0 5 2007 WD Normal
3 0 0 0 9 2008 WD Normal
4 0 0 0 2 2006 WD Abnorml
5 0 0 0 12 2008 WD Normal
[5 rows x 60 columns]
# Candidate feature subsets kept from earlier experiments.
# (The pipeline further down is driven by feature_3, built from X.columns.)
features = [
    'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape',
    'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
    'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
]

# Second hand-picked subset: outdoor-area measurements plus lot descriptors.
features_1 = [
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleCondition',
    'MSSubClass', 'LotArea', 'Street', 'LotShape', 'LandContour',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
]

# Broader subset that also covers build quality, dates and room counts.
features_2 = [
    'MSSubClass', 'LotArea', 'Street', 'LotShape', 'LandContour',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt',
    'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond',
    'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
    'MoSold', 'YrSold', 'SaleCondition',
]
# Columns deliberately excluded from the model.
# NOTE(review): presumably excluded because they carry missing values in the
# test data (the train columns with NaNs were already dropped) — confirm.
_excluded_cols = {
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF',
    'Exterior1st', 'Exterior2nd', 'Functional', 'GarageArea', 'GarageCars',
    'KitchenQual', 'MSZoning', 'SaleType', 'TotalBsmtSF', 'Utilities',
}

# Preserve the original column order. The previous
# list(set(X.columns) - set([...])) produced an arbitrary, hash-seed-dependent
# ordering, so the pipeline's column layout changed from run to run.
feature_3 = [col for col in X.columns if col not in _excluded_cols]

refined_X = X[feature_3]
# Partition the refined feature set by dtype so the numeric and categorical
# groups can be routed to separate preprocessing branches.
numerical_cols = refined_X.select_dtypes(include="number").columns
categorical_cols = refined_X.select_dtypes(include="object").columns

print("Numerical cols: \n", numerical_cols)
print("\nCategorical cols: \n", categorical_cols)
Numerical cols:
Index(['OverallQual', 'WoodDeckSF', 'FullBath', 'MiscVal', 'MoSold',
'OverallCond', 'HalfBath', 'EnclosedPorch', 'YrSold', 'LotArea',
'GrLivArea', 'TotRmsAbvGrd', 'YearBuilt', 'KitchenAbvGr', '3SsnPorch',
'BedroomAbvGr', '2ndFlrSF', 'MSSubClass', '1stFlrSF', 'PoolArea',
'ScreenPorch', 'YearRemodAdd', 'OpenPorchSF', 'LowQualFinSF',
'Fireplaces'],
dtype='object')
Categorical cols:
Index(['LandContour', 'Heating', 'Neighborhood', 'HeatingQC', 'ExterCond',
'Street', 'ExterQual', 'Condition1', 'CentralAir', 'RoofMatl',
'RoofStyle', 'LotShape', 'PavedDrive', 'Foundation', 'LandSlope',
'HouseStyle', 'BldgType', 'Condition2', 'SaleCondition', 'LotConfig'],
dtype='object')
Modelling
from xgboost import XGBRegressor

# Numeric columns: standardize only. No imputer is needed because every
# training column containing missing values was dropped earlier.
numerical_transformer = StandardScaler()

# Categorical columns: one-hot encode; categories unseen during fit are
# encoded as all-zeros at predict time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ])

# Full pipeline: preprocessing followed by gradient-boosted trees.
# Fixed seed for reproducibility.
mlpipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(n_estimators=1000, random_state=0)),
])

# Fit on the training split and report validation error.
mlpipe.fit(X_train, y_train)
preds = mlpipe.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)
print("MAE:", mae)
MAE: 19980.732729559077
import pickle

# Serialize the fitted pipeline to disk, then read it straight back as a
# sanity check that the artifact round-trips.
with open('model.pkl', 'wb') as fh:
    pickle.dump(mlpipe, fh)

with open('model.pkl', 'rb') as fh:
    model = pickle.load(fh)

model
# Score the held-out test set and write a Kaggle-style submission file.
test_data = pd.read_csv('./test.csv', index_col='Id')

# Select the training feature set directly. The previous
# test_data.dropna(axis=1) removed every test column that contained a NaN,
# which could discard columns the fitted pipeline requires and make the
# selection below raise a KeyError.
# NOTE(review): any NaNs remaining in these columns are passed through to the
# pipeline (scaler/encoder/XGBoost) — confirm they are handled as intended.
X_test = test_data[feature_3]

preds_test = mlpipe.predict(X_test)

out = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test.astype(int)})
# DataFrame.to_csv returns None when given a path; do not rebind `out`,
# so the final expression below still displays the submission frame.
out.to_csv('submission.csv', index=False)
out