import pandas as pd
# Load the housing dataset; the relative path is resolved against the current
# working directory.
housing = pd.read_csv("housing.csv")
# Preview the first rows. head is a method: without the call parentheses the
# expression only evaluates to the bound method and no preview is built.
housing.head()
# There are 20,640 instances in the dataset.
# info() prints a summary with per-column dtypes and non-null counts.
housing.info()
# ocean_proximity is categorical: list its categories and how many
# districts belong to each one.
housing["ocean_proximity"].value_counts()
# Let's see a histogram for every feature
%matplotlib inline
import matplotlib.pyplot as plt
#housing.hist(bins=50, figsize=(20,15))
#plt.show()
import numpy as np
# Derive an income category from median_income: dividing by 1.5 and rounding
# up yields a small number of discrete buckets.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Keep categories below 5 and collapse everything else into 5.0.
# The result is assigned back explicitly: calling where(..., inplace=True) on
# the selected column is a chained in-place operation that acts on a temporary
# object under pandas copy-on-write and would leave `housing` unchanged.
housing["income_cat"] = housing["income_cat"].where(
    housing["income_cat"] < 5, 5.0
)
from sklearn.model_selection import StratifiedShuffleSplit
# Single stratified 80/20 split, reproducible through the fixed random_state.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair;
# stratification is driven by the income_cat column.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# split() produces positional row indices, so rows are selected with iloc.
# Label-based loc would only pick the same rows while the DataFrame keeps its
# default 0..n-1 index.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]
# income_cat has served its purpose for the split; remove it in place from
# both subsets so they contain only the original columns again.
strat_train_set.drop(columns="income_cat", inplace=True)
strat_test_set.drop(columns="income_cat", inplace=True)
# Explore a copy so that nothing done here can alter the training set itself.
housing = strat_train_set.copy()

# Geographic scatter plot of the districts. The low alpha makes areas with a
# high density of data points stand out.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

# Same map, now showing housing prices: the marker size (option s) scales with
# the district's population, and the marker color (option c) encodes
# median_house_value through the predefined "jet" color map, which ranges from
# blue (low values) to red (high values).
marker_sizes = housing["population"] / 100
jet_cmap = plt.get_cmap("jet")
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=marker_sizes,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=jet_cmap,
    colorbar=True,
)
# Let's train a linear regression model using scikit-learn. The objective is to predict the median_house_value.
# Separate the target from the predictors, since the two do not necessarily
# get the same transformations. Both statements leave strat_train_set intact:
# the labels are an explicit copy and drop() returns a new DataFrame.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")  # predictors only
# Now let's create a preprocessing pipeline that will take care of missing values, the numerical attributes and the categorical attributes:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# Pipeline for the numerical attributes. Steps run in the order listed:
# missing values are filled with the column median, then features are
# standardized.
num_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)
# Now let's build the full pipeline, which sends the numerical attributes through num_pipeline and one-hot encodes the categorical attribute:
cat_attribs = ["ocean_proximity"]
# Every column other than the categorical one is treated as numerical.
housing_num = housing.drop(columns=cat_attribs)
num_attribs = list(housing_num)
# Route each group of columns to its own transformer.
column_steps = [
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
]
full_pipeline = ColumnTransformer(transformers=column_steps)
# Fit the transformers on the training predictors and transform them.
housing_prepared = full_pipeline.fit_transform(housing)
# Now we are ready to train a model. Let's start with a linear regression model:
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so creation and training are chained.
lin_reg = LinearRegression().fit(housing_prepared, housing_labels)
# Spot-check the fitted pipeline and model on the first few training
# instances, printing the predictions next to the true labels.
sample_size = 5
some_data = housing.iloc[:sample_size]
some_labels = housing_labels.iloc[:sample_size]
# transform() only: the pipeline was already fitted on the training set.
some_data_prepared = full_pipeline.transform(some_data)
sample_predictions = lin_reg.predict(some_data_prepared)
print("Predictions:", sample_predictions)
print("Labels:", list(some_labels))
# Let's measure this regression model's RMSE on the whole training set using Scikit-Learn's mean_squared_error function:
from sklearn.metrics import mean_squared_error
# Predict on the whole prepared training set and compare with the labels.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
# RMSE is the square root of the mean squared error.
lin_rmse = np.sqrt(lin_mse)
# Bare expression so that a notebook cell displays the value.
lin_rmse