Lesson 15 - Feature Engineering

# Import NumPy and pandas under their conventional aliases.
import numpy as np
import pandas as pd

Run to view results

# Load the Ames, Iowa housing dataset.
# Source: https://www.kaggle.com/c/ames-housing-data
# The Kaggle link is the competition web page rather than a CSV file, so the
# data is read from a hosted raw copy instead.
url = 'https://raw.githubusercontent.com/austinlasseter/hosting_some_files/main/pandas_files/iowa.csv'
# Reuse `url` so the address is written in exactly one place.
df = pd.read_csv(url)

Run to view results

# Preview the first five rows of the dataset.
df.head(n=5)

Run to view results

# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

Run to view results

# Display the first five rows, passing the row count explicitly.
df.head(n=5)

Run to view results

# Raise the display limit to 200 rows so long outputs are not truncated.
pd.options.display.max_rows = 200

Run to view results

# Show the first record transposed, so every column appears as its own row.
df.head(1).transpose()

Run to view results

# Cap the number of rows pandas prints at 100.
pd.options.display.max_rows = 100

Run to view results

# Data types of the first five columns.
df.dtypes.head(n=5)

Run to view results

# Same result via positional slicing: the first five entries of the dtypes Series.
df.dtypes.iloc[:5]

Run to view results

# List every column label in the dataset.
df.columns

Run to view results

# Look at the first ten rows of the `BedroomAbvGr` column.

Run to view results

# First ten rows of two columns, selected by passing a list of labels.
df.loc[:, ['BedroomAbvGr', 'Fireplaces']].head(n=10)

Run to view results

# Look at the first ten rows of the `LotFrontage` column.
df['LotFrontage'].head(10)

Run to view results

# Frequency of each LotFrontage value; dropna=False keeps NaN in the tally.
df['LotFrontage'].value_counts(dropna=False)

Run to view results

# NaN is the most common value in this column. What is a NaN?
# NaN ("Not a Number") marks a cell whose value is missing.

Run to view results

# Inspect the Python type of NumPy's NaN constant.
type(np.nan)

Run to view results

# Count the rows where LotFrontage is missing.
df['LotFrontage'].isna().sum()

Run to view results

# Dropping missing data: record the (rows, columns) shape before any rows
# are removed, for comparison afterwards.
print(df.shape)

Run to view results

# Remove, in place, every row whose LotFrontage value is missing.
df.dropna(subset=['LotFrontage'], inplace=True)

Run to view results

# Print the current (rows, columns) shape of the dataset.
print(df.shape)

Run to view results

# Re-count the missing LotFrontage values.
df['LotFrontage'].isna().sum()

Run to view results

# The five columns with the most missing values, largest count first.
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .head(n=5)
)

Run to view results

# Mean of the LotFrontage column.
df['LotFrontage'].mean()

Run to view results

# Replace any missing LotFrontage values with the column mean.
# Assign the result back to the column: calling fillna(..., inplace=True) on
# df['LotFrontage'] is chained assignment, which pandas deprecates and which
# does not update df under Copy-on-Write.
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())

Run to view results

# Mean of the LotFrontage column.
df['LotFrontage'].mean()

Run to view results

# Count the missing values still present in LotFrontage.
df['LotFrontage'].isnull().sum()

Run to view results

# Build a smaller dataframe by passing a list of column labels inside the
# square brackets; .copy() is called on the selection before it is stored.
small_df = df[[
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'SalePrice',
]].copy()

Run to view results

# Preview the first five rows of the smaller dataframe.
small_df.head()

Run to view results

# Mean of the TotalBsmtSF column.
small_df['TotalBsmtSF'].mean()

Run to view results

# Mean of the 1stFlrSF column.
small_df['1stFlrSF'].mean()

Run to view results

# Combine the three square-footage columns into one figure per row, stored
# in a new 'TotalSquareFootage' column created with bracket syntax.
small_df['TotalSquareFootage'] = (
    small_df['TotalBsmtSF']
    + small_df['1stFlrSF']
    + small_df['2ndFlrSF']
)

Run to view results

# Preview the first five rows of the smaller dataframe.
small_df.head()

Run to view results

# Make another new column, 'Price PerSqFt' (note the space in the label), by
# dividing the sale price by the total square footage.
small_df['Price PerSqFt'] = small_df['SalePrice'] / small_df['TotalSquareFootage']
small_df.head()

Run to view results

# First five values of the 'Price PerSqFt' column.
small_df['Price PerSqFt'].head(n=5)

Run to view results

# --- Crosstabs ---
# Start by previewing the first five rows of the full dataset.
df.head(n=5)

Run to view results

# How often each category of LotShape occurs.
df['LotShape'].value_counts()

Run to view results

# How often each category of Alley occurs.
df['Alley'].value_counts()

Run to view results

# Cross-tabulate Alley (rows) against LotShape (columns).
pd.crosstab(index=df['Alley'], columns=df['LotShape'])

Run to view results

# Same crosstab with margins=True, which adds row and column totals.
pd.crosstab(
    index=df['Alley'],
    columns=df['LotShape'],
    margins=True,
)

Run to view results

# Crosstab as proportions, normalized over each row (normalize='index').
pd.crosstab(
    index=df['Alley'],
    columns=df['LotShape'],
    margins=True,
    normalize='index',
)

Run to view results

# Crosstab as proportions, normalized over each column (normalize='columns').
pd.crosstab(
    index=df['Alley'],
    columns=df['LotShape'],
    margins=True,
    normalize='columns',
)

Run to view results

# Display the Alley-by-LotShape crosstab as a vertical bar chart.
results = pd.crosstab(df['Alley'], df['LotShape'])
# Plot the table computed above instead of building it a second time; the
# trailing semicolon keeps the notebook from echoing the returned plot object.
results.plot(kind='bar');

Run to view results

# Swap the two variables (LotShape as rows, Alley as columns) and draw the
# crosstab as a horizontal bar chart.
results = pd.crosstab(index=df['LotShape'], columns=df['Alley'])
results.plot.barh()

Run to view results