Lesson 15 - Feature Engineering

# import pandas and numpy
import numpy as np
import pandas as pd

Run to view results

# Load the Ames, Iowa housing dataset into a DataFrame.
# Source: https://www.kaggle.com/c/ames-housing-data
df = pd.read_csv(
    'https://raw.githubusercontent.com/austinlasseter/hosting_some_files/main/pandas_files/iowa.csv'
)

Run to view results

# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

Run to view results

# Preview the first five rows of the dataset.
df.head(5)

Run to view results

# Allow up to 100 rows to be displayed before pandas truncates the output,
# then inspect the dtypes of the first five columns.
pd.options.display.max_rows = 100
df.dtypes.iloc[:5]

Run to view results

# First ten values of the `BedroomAbvGr` column.
df['BedroomAbvGr'].head(n=10)

Run to view results

# First ten values of the `LotFrontage` column.
df['LotFrontage'].head(n=10)

Run to view results

# Frequency of each LotFrontage value; dropna=False keeps NaN in the tally
# instead of silently excluding it.
df['LotFrontage'].value_counts(dropna=False)

Run to view results

# NaN is the most common value in this column. What is a NaN?
# It stands for "Not a Number".

Run to view results

# Ask Python which type numpy's NaN sentinel is.
type(np.nan)

Run to view results

# Count the rows where LotFrontage is missing.
df['LotFrontage'].isnull().sum()

Run to view results

# Remove, in place, every row whose LotFrontage value is missing.
df.dropna(subset=['LotFrontage'], inplace=True)

Run to view results

# Re-count missing values, this time per column across the whole DataFrame.
df.isnull().sum()

Run to view results

# fill missing values for lot frontage

Run to view results

# Build a smaller DataFrame from a handful of columns by selecting them with
# a list of column headers. The .copy() makes df_2 independent of df, so new
# columns can be added to it later without touching the original.
df_2 = df.loc[
    :,
    ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'SalePrice'],
].copy()

Run to view results

# Average of the TotalBsmtSF column.
df_2.loc[:, 'TotalBsmtSF'].mean()

Run to view results

# Average of the 1stFlrSF column.
df_2.loc[:, '1stFlrSF'].mean()

Run to view results

# Combine the three square-footage columns into a single
# 'TotalSquareFootage' column, created with bracket-assignment syntax.
df_2['TotalSquareFootage'] = (
    df_2['TotalBsmtSF']
    + df_2['1stFlrSF']
    + df_2['2ndFlrSF']
)
df_2.head(5)

Run to view results

# Add another new column, 'PricePerSqFt': the sale price divided by the
# total square footage.
df_2['PricePerSqFt'] = df_2['SalePrice'].div(df_2['TotalSquareFootage'])
df_2.head(5)

Run to view results

# Preview the first five rows of the full DataFrame again.
df.head(n=5)

Run to view results

# Cross-tabulate Street (rows) against HouseStyle (columns); margins=True
# adds the row and column totals.
pd.crosstab(
    index=df['Street'],
    columns=df['HouseStyle'],
    margins=True,
)

Run to view results

# Frequency of each category in the LotShape column.
df.loc[:, 'LotShape'].value_counts()

Run to view results

# Frequency of each category in the Street column.
df.loc[:, 'Street'].value_counts()

Run to view results

# Cross-tabulate Alley (rows) against Street (columns).
pd.crosstab(
    index=df['Alley'],
    columns=df['Street'],
)

Run to view results

# Same Alley x Street table, with row and column totals added via margins.
pd.crosstab(
    index=df['Alley'],
    columns=df['Street'],
    margins=True,
)

Run to view results

# Alley x Street table expressed as proportions, normalized over each row.
# The normalize argument is matched case-sensitively: pandas accepts
# 'index', 'columns' or 'all', so the value must be lowercase 'index'
# ('Index' raises "ValueError: Not a valid normalize argument").
pd.crosstab(
    df['Alley'],
    df['Street'],
    margins=True,
    normalize='index',
)

Run to view results

# Alley x Street table expressed as proportions, normalized over each column.
pd.crosstab(
    index=df['Alley'],
    columns=df['Street'],
    margins=True,
    normalize='columns',
)

Run to view results

# Draw the Alley x Street crosstab as a vertical bar chart.
pd.crosstab(df['Alley'], df['Street']).plot.bar()

Run to view results

# Same chart, flipped to horizontal bars.
pd.crosstab(df['Alley'], df['Street']).plot.barh()

Run to view results