import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
# Install the project's dependencies from requirements.txt.
# The notebook did this with the IPython shell escape
# ``!pip install -r requirements.txt``, which is a SyntaxError in plain Python.
# Running pip as a module of the current interpreter works both in a notebook
# and in a script, and targets the environment this interpreter runs in.
import subprocess
import sys

_pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"],
    check=False,  # like the shell escape, a failed install does not abort the run
)
if _pip_result.returncode != 0:
    print("pip install exited with status", _pip_result.returncode, file=sys.stderr)
# Load the training table, using the 'Id' column as the index.
train_data = pd.read_csv('./train.csv', index_col='Id')
train_data  # notebook display of the raw frame (no effect when run as a script)

# Keep only the columns that contain no missing values.
train_data = train_data.dropna(axis='columns')

# Separate the target column from the predictors.
y = train_data['SalePrice']
X = train_data.drop(columns='SalePrice')

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

print(X.head())
X.shape  # notebook display of the predictor frame's dimensions
# Candidate feature subsets. None of these three lists is referenced again in
# the visible part of the file; the model below is built from `feature_3`.
features = [
    "MSSubClass",
    "MSZoning",
    "LotArea",
    "Street",
    "LotShape",
    "LandContour",
    "Utilities",
    "LotConfig",
    "LandSlope",
    "Neighborhood",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
    "MoSold",
    "YrSold",
    "SaleType",
    "SaleCondition",
]

features_1 = [
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
    "MoSold",
    "YrSold",
    "SaleCondition",
    "MSSubClass",
    "LotArea",
    "Street",
    "LotShape",
    "LandContour",
    "LotConfig",
    "LandSlope",
    "Neighborhood",
    "Condition1",
    "Condition2",
]

features_2 = [
    "MSSubClass",
    "LotArea",
    "Street",
    "LotShape",
    "LandContour",
    "LotConfig",
    "LandSlope",
    "Neighborhood",
    "Condition1",
    "Condition2",
    "BldgType",
    "HouseStyle",
    "OverallQual",
    "OverallCond",
    "YearBuilt",
    "YearRemodAdd",
    "RoofStyle",
    "RoofMatl",
    "ExterQual",
    "ExterCond",
    "Foundation",
    "Heating",
    "HeatingQC",
    "CentralAir",
    "Electrical",
    "1stFlrSF",
    "2ndFlrSF",
    "LowQualFinSF",
    "GrLivArea",
    "FullBath",
    "HalfBath",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    "PavedDrive",
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
    "MoSold",
    "YrSold",
    "SaleCondition",
]
# Columns deliberately left out of the model's feature set.
# NOTE(review): presumably these are the columns that contain missing values in
# test.csv (the test frame is filtered with dropna(axis=1) before feature_3 is
# applied to it) — confirm against the data.
excluded_cols = {
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtFullBath',
    'BsmtHalfBath',
    'BsmtUnfSF',
    'Exterior1st',
    'Exterior2nd',
    'Functional',
    'GarageArea',
    'GarageCars',
    'KitchenQual',
    'MSZoning',
    'SaleType',
    'TotalBsmtSF',
    'Utilities',
}

# Filter X's columns in their original order. Building this list from a set
# difference made its order depend on string hashing, which is randomized per
# interpreter run, so the feature order seen by the model was not reproducible.
feature_3 = [col for col in X.columns if col not in excluded_cols]
refined_X = X[feature_3]

# Split the selected features by dtype; each group gets its own preprocessing.
numerical_cols = refined_X.select_dtypes(include="number").columns
print("Numerical cols: \n", numerical_cols)
categorical_cols = refined_X.select_dtypes(include="object").columns
print("\nCategorical cols: \n", categorical_cols)
# --- Modelling ---
# (This heading was exported from the notebook as a bare identifier, which
# raises NameError when the file is run as a script; it is now a comment.)
from xgboost import XGBRegressor

# Numeric columns are standardised.
numerical_transformer = StandardScaler()

# Categorical columns are one-hot encoded. No imputer is used: columns with
# missing values were already dropped from the training frame.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer. Columns not listed here are
# dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ])

mlpipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(n_estimators=1000, random_state=0)),
])

# Fit and validate on exactly the column subset that is supplied at prediction
# time (feature_3), instead of relying on the preprocessor to silently discard
# the extra columns of the full frame. The transformers still see the same
# columns, so the data reaching the model is unchanged.
mlpipe.fit(X_train[feature_3], y_train)
preds = mlpipe.predict(X_valid[feature_3])
mae = mean_absolute_error(y_valid, preds)
print("MAE:", mae)
import pickle

model_path = 'model.pkl'

# Serialise the fitted pipeline to disk.
with open(model_path, mode='wb') as sink:
    pickle.dump(mlpipe, sink)

# Read the file straight back; `model` is the deserialised copy of the pipeline.
with open(model_path, mode='rb') as source:
    model = pickle.load(source)

model  # notebook display of the reloaded pipeline
# Score the test set and write the submission file.
test_data = pd.read_csv('./test.csv', index_col='Id')

# Drop every column that has a missing value in the test set, then keep the
# model's feature columns.
X_test = test_data.dropna(axis=1)
missing_features = [col for col in feature_3 if col not in X_test.columns]
if missing_features:
    # Same exception type the plain column selection would raise, but with a
    # message that points at the cause.
    raise KeyError(
        f"feature columns absent from test.csv or removed by dropna: {missing_features}"
    )
X_test = X_test[feature_3]

preds_test = mlpipe.predict(X_test)
preds_test  # notebook display of the raw predictions

# One row per test Id; predictions are cast to int for the submission.
out = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test.astype(int)})

# to_csv returns None when given a path, so its result must not be assigned
# back to `out` (the original did, leaving `out` as None).
out.to_csv('submission.csv', index=False)
out  # notebook display of the submission frame