# Preview the first rows of `train` (head() defaults to five rows).
train.head()
Year_Factorint64
State_Factorobject
0
1
State_1
1
1
State_1
2
1
State_1
3
1
State_1
4
1
State_1
# Preview the first rows of `test` (head() defaults to five rows).
test.head()
Year_Factorint64
State_Factorobject
0
7
State_1
1
7
State_1
2
7
State_1
3
7
State_1
4
7
State_1
# Preview the first rows of `sample` (head() defaults to five rows).
sample.head()
idint64
site_euifloat64
0
75757
0
1
75758
0
2
75759
0
3
75760
0
4
75761
0
# Summary statistics for `train`: count, mean, std, min, quartiles and max
# per column.
train.describe()
Year_Factorfloat64
floor_areafloat64
count
75757
75757
mean
4.367754795
165983.8659
std
1.471441495
246875.7909
min
1
943
25%
3
62379
50%
5
91367
75%
6
166000
max
6
6385382
# Remove the "id" column from `train` before any further processing.
train = train.drop(columns=["id"])
# Frequency of each distinct value in these four columns.
train["Year_Factor"].value_counts()
train["State_Factor"].value_counts()
train["building_class"].value_counts()
train["facility_type"].value_counts()
from sklearn.preprocessing import StandardScaler
# Scaler instance; it is fitted on `train_val` further down.
scaler = StandardScaler()
# Column groups that are excluded from the frame handed to the scaler.
# NOTE(review): the previews show Year_Factor as int64 and State_Factor as
# object; the dtypes of building_class and facility_type are not shown here.
# Confirm that the nominal/ordinal labels match how these columns should
# actually be encoded.
nominal = ["Year_Factor", "State_Factor"]
ordinal = ["building_class", "facility_type"]
# Every remaining column of `train` once the two groups above are dropped.
train_val = train.drop(columns=nominal+ordinal)
# Preview the first rows of `train_val` (head() defaults to five rows).
train_val.head()
floor_areafloat64
year_builtfloat64
0
61242
1942
1
274000
1955
2
280025
1951
3
55325
1980
4
66000
1985
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Standardise the columns of `train_val`. `fit` alone only learns the
# per-column statistics and returns the scaler object itself, so
# `fit_transform` is required to obtain the scaled values.
scaled = scaler.fit_transform(train_val)

# `train_test_split` raises ValueError when called without any array. Given a
# single array it returns two row partitions (shuffled, 25% held out by
# default), which matches the two names unpacked here.
# NOTE(review): X and y therefore hold the training rows and the held-out
# rows, not features and target. No target column is separated anywhere in
# the visible code (presumably `site_eui`, as in the sample frame - confirm),
# so it must be split off from the features before `model` is fitted.
X, y = train_test_split(scaled)

# Unfitted linear regression estimator; `fit` is not called in this section.
model = LinearRegression()