import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
!pip install -r requirements.txt
Collecting xgboost
Downloading xgboost-1.7.4-py3-none-manylinux2014_x86_64.whl (193.6 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 193.6/193.6 MB 12.5 MB/s eta 0:00:00
Requirement already satisfied: scipy in /shared-libs/python3.9/py/lib/python3.9/site-packages (from xgboost->-r requirements.txt (line 1)) (1.9.3)
Requirement already satisfied: numpy in /shared-libs/python3.9/py/lib/python3.9/site-packages (from xgboost->-r requirements.txt (line 1)) (1.23.4)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.4
WARNING: You are using pip version 22.0.4; however, version 23.0.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# Load the training data, keeping the house Id as the DataFrame index.
train_data = pd.read_csv('./train.csv', index_col='Id')

# Drop every column that contains at least one missing value so the
# downstream pipeline needs no imputation.
# NOTE(review): this discards potentially informative columns; an imputer
# would retain them — confirm the trade-off is intentional.
train_data = train_data.dropna(axis=1)

# Separate the target from the predictors.
X = train_data.drop('SalePrice', axis=1)
y = train_data['SalePrice']

# Hold out 20% of the rows for validation; fixed seed for reproducibility.
# (train_size=0.8 already implies the 0.2 test split, so it is stated once.)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, random_state=0)

print(X.head())
MSSubClass MSZoning LotArea Street LotShape LandContour Utilities \
Id
1 60 RL 8450 Pave Reg Lvl AllPub
2 20 RL 9600 Pave Reg Lvl AllPub
3 60 RL 11250 Pave IR1 Lvl AllPub
4 70 RL 9550 Pave IR1 Lvl AllPub
5 60 RL 14260 Pave IR1 Lvl AllPub
LotConfig LandSlope Neighborhood ... OpenPorchSF EnclosedPorch 3SsnPorch \
Id ...
1 Inside Gtl CollgCr ... 61 0 0
2 FR2 Gtl Veenker ... 0 0 0
3 Inside Gtl CollgCr ... 42 0 0
4 Corner Gtl Crawfor ... 35 272 0
5 FR2 Gtl NoRidge ... 84 0 0
ScreenPorch PoolArea MiscVal MoSold YrSold SaleType SaleCondition
Id
1 0 0 0 2 2008 WD Normal
2 0 0 0 5 2007 WD Normal
3 0 0 0 9 2008 WD Normal
4 0 0 0 2 2006 WD Abnorml
5 0 0 0 12 2008 WD Normal
[5 rows x 60 columns]
# Candidate feature subsets kept from earlier experiments.
# (The pipeline further down is driven by feature_3, built from X.columns.)
features = [
    'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape',
    'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
    'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
]

# Second hand-picked subset: outdoor-area measurements plus lot descriptors.
features_1 = [
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleCondition',
    'MSSubClass', 'LotArea', 'Street', 'LotShape', 'LandContour',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
]

# Broader subset that also covers build quality, dates and room counts.
features_2 = [
    'MSSubClass', 'LotArea', 'Street', 'LotShape', 'LandContour',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt',
    'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond',
    'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
    'MoSold', 'YrSold', 'SaleCondition',
]
# Columns deliberately excluded from the model.
# NOTE(review): presumably excluded because they carry missing values in the
# test data (the train columns with NaNs were already dropped) — confirm.
_excluded_cols = {
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF',
    'Exterior1st', 'Exterior2nd', 'Functional', 'GarageArea', 'GarageCars',
    'KitchenQual', 'MSZoning', 'SaleType', 'TotalBsmtSF', 'Utilities',
}

# Preserve the original column order. The previous
# list(set(X.columns) - set([...])) produced an arbitrary, hash-seed-dependent
# ordering, so the pipeline's column layout changed from run to run.
feature_3 = [col for col in X.columns if col not in _excluded_cols]

refined_X = X[feature_3]
# Partition the refined feature set by dtype so the numeric and categorical
# groups can be routed to separate preprocessing branches.
numerical_cols = refined_X.select_dtypes(include="number").columns
categorical_cols = refined_X.select_dtypes(include="object").columns

print("Numerical cols: \n", numerical_cols)
print("\nCategorical cols: \n", categorical_cols)
Numerical cols:
Index(['OverallQual', 'WoodDeckSF', 'FullBath', 'MiscVal', 'MoSold',
'OverallCond', 'HalfBath', 'EnclosedPorch', 'YrSold', 'LotArea',
'GrLivArea', 'TotRmsAbvGrd', 'YearBuilt', 'KitchenAbvGr', '3SsnPorch',
'BedroomAbvGr', '2ndFlrSF', 'MSSubClass', '1stFlrSF', 'PoolArea',
'ScreenPorch', 'YearRemodAdd', 'OpenPorchSF', 'LowQualFinSF',
'Fireplaces'],
dtype='object')
Categorical cols:
Index(['LandContour', 'Heating', 'Neighborhood', 'HeatingQC', 'ExterCond',
'Street', 'ExterQual', 'Condition1', 'CentralAir', 'RoofMatl',
'RoofStyle', 'LotShape', 'PavedDrive', 'Foundation', 'LandSlope',
'HouseStyle', 'BldgType', 'Condition2', 'SaleCondition', 'LotConfig'],
dtype='object')
Modelling
from xgboost import XGBRegressor

# Numeric columns: standardize only. No imputer is needed because every
# training column containing missing values was dropped earlier.
numerical_transformer = StandardScaler()

# Categorical columns: one-hot encode; categories unseen during fit are
# encoded as all-zeros at predict time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ])

# Full pipeline: preprocessing followed by gradient-boosted trees.
# Fixed seed for reproducibility.
mlpipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(n_estimators=1000, random_state=0)),
])

# Fit on the training split and report validation error.
mlpipe.fit(X_train, y_train)
preds = mlpipe.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)
print("MAE:", mae)
MAE: 19980.732729559077
import pickle

# Serialize the fitted pipeline to disk, then read it straight back as a
# sanity check that the artifact round-trips.
with open('model.pkl', 'wb') as fh:
    pickle.dump(mlpipe, fh)

with open('model.pkl', 'rb') as fh:
    model = pickle.load(fh)

model
# Score the held-out test set and write a Kaggle-style submission file.
test_data = pd.read_csv('./test.csv', index_col='Id')

# Select the training feature set directly. The previous
# test_data.dropna(axis=1) removed every test column that contained a NaN,
# which could discard columns the fitted pipeline requires and make the
# selection below raise a KeyError.
# NOTE(review): any NaNs remaining in these columns are passed through to the
# pipeline (scaler/encoder/XGBoost) — confirm they are handled as intended.
X_test = test_data[feature_3]

preds_test = mlpipe.predict(X_test)

out = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test.astype(int)})
# DataFrame.to_csv returns None when given a path; do not rebind `out`,
# so the final expression below still displays the submission frame.
out.to_csv('submission.csv', index=False)
out