EN-Real-Estate-California-market-analysis

import pandas as pd

# Load the housing dataset. The relative path means "housing.csv" is looked up
# in the current working directory.
housing = pd.read_csv("housing.csv")

# Preview the first rows. `head` is a method and has to be called; the bare
# attribute only evaluates to the bound-method object.
housing.head()

# Summary of columns, dtypes and non-null counts.
# There are 20,640 instances in the dataset.
housing.info()

# We can find out what categories exist and how many districts belong to each
# category.
housing["ocean_proximity"].value_counts()

# Let's see a histogram for every feature.
# NOTE: when running inside Jupyter, execute the `%matplotlib inline` magic
# first. It is IPython-only syntax, so it is kept as a comment here to leave
# this file valid Python.
import matplotlib.pyplot as plt

# The histogram calls are intentionally left disabled, as in the original:
# housing.hist(bins=50, figsize=(20, 15))
# plt.show()

import numpy as np

# Build an income category used only for stratified sampling: scale the median
# income down by 1.5 and round up to get discrete categories.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)

# Merge every category that is not below 5 into category 5.0.
# The result is assigned back explicitly instead of calling
# `where(..., inplace=True)` on the selected column: in-place mutation of a
# selected column is chained assignment and is not guaranteed to write through
# to `housing` (it does not under pandas Copy-on-Write).
housing["income_cat"] = housing["income_cat"].where(
    housing["income_cat"] < 5, 5.0
)

from sklearn.model_selection import StratifiedShuffleSplit

# Stratified 80/20 train/test split on the income category, with a fixed seed
# so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# split() yields positional indices, so select rows by position (`iloc`)
# rather than by label (`loc`); the two only coincide for a default RangeIndex.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

# Now we should remove the income_cat attribute so the data is back to its
# original state. Rebinding the names (instead of drop(..., inplace=True))
# avoids mutating frames that were derived from `housing`.
strat_train_set = strat_train_set.drop(columns="income_cat")
strat_test_set = strat_test_set.drop(columns="income_cat")

# Let's create a copy so we can play with it without harming the training set.
housing = strat_train_set.copy()

# Explore the training set geographically (the label we will predict later is
# median_house_value). A low alpha makes it easier to see the places where
# there is a high density of data points.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

# Now let's look at the housing prices. The radius of each circle represents
# the district's population (option s), and the color represents the price
# (option c). We use the predefined color map "jet" (option cmap), which
# ranges from blue (low prices) to red (high prices). The colormap is passed
# by name so this cell does not depend on a pyplot import.
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap="jet",
    colorbar=True,
)

# Let's train a regression model using scikit-learn. The objective is to
# predict median_house_value, which is a continuous target.
# First, we need to separate the predictors and the labels, since we don't
# necessarily want to apply the same transformations to the predictors and the
# target values (note that drop() creates a copy of the data and does not
# affect strat_train_set).
housing = strat_train_set.drop("median_house_value", axis=1)  # predictors only
housing_labels = strat_train_set["median_house_value"].copy()

# Now let's create a preprocessing pipeline that will take care of missing
# values, the numerical attributes and the categorical attributes.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# First, a simple pipeline for the numerical attributes: fill missing values
# with the column median, then standardize.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

# Every column except the categorical "ocean_proximity" is treated as
# numerical.
housing_num = housing.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Now combine both: numerical columns go through num_pipeline, the categorical
# column is one-hot encoded.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

# Now we are ready to train a model. Let's start with a linear regression
# model.
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# Let's try the full preprocessing pipeline on a few training instances.
# transform() (not fit_transform()) is used so the statistics already fitted
# on the training set are reused.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

# Let's measure this regression model's RMSE on the whole training set using
# Scikit-Learn's mean_squared_error function.
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
# RMSE is the square root of the MSE, which puts the error back in the units
# of the label.
lin_rmse = np.sqrt(lin_mse)

# Print explicitly: a bare expression is only displayed inside a notebook.
print("RMSE:", lin_rmse)