# import pandas and numpy
import numpy as np
import pandas as pd
# Lets take a look at the Ames Iowa Housing Dataset:
# source: https://www.kaggle.com/c/ames-housing-data
url = 'https://raw.githubusercontent.com/austinlasseter/hosting_some_files/main/pandas_files/iowa.csv'
# iowa_df = pd.read_csv('https://www.kaggle.com/c/ames-housing-data')
df = pd.read_csv('https://raw.githubusercontent.com/austinlasseter/hosting_some_files/main/pandas_files/iowa.csv')
df.head()
# show the shape
df.shape
# show five rows
df.head(5)
# set options to 100 rows
pd.options.display.max_rows=200
df.head(1).T
pd.set_option('display.max_rows', 100)
df.dtypes.head()
df.dtypes[:5]
df.columns
# Look at the first ten rows of the `BedroomAbvGr` column.
df[['BedroomAbvGr', 'Fireplaces']].head(10)
# Look at the first ten rows of the `BedroomAbvGr` column.
df['LotFrontage'].head(10)
# show value counts for frontage
df['LotFrontage'].value_counts(dropna=False)
# NaN is the most common value in this column. What is a NaN
# NAN is the missing information in the specific cell.
# What is the datatype of NaN?
type(np.nan)
# how much missing data for lot frontage
df['LotFrontage'].isnull().sum()
# drop missing data
print(df.shape)
df.dropna(inplace=True , subset=['LotFrontage'])
print(df.shape)
# now how much missing data?
df['LotFrontage'].isnull().sum()
df.isnull().sum().sort_values(ascending=False).head()
df['LotFrontage'].mean()
df['LotFrontage'].fillna(df['LotFrontage'].mean(), inplace=True)
df['LotFrontage'].mean()
# fill missing values for lot frontage
df['LotFrontage'].isnull().sum()
# I can make a smaller dataframe with a few specific column headers
# by passing a list of column headers inside of the square brackets
small_df = df[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'SalePrice']].copy()
small_df.head()
# what is mean TotalBsmtSF
small_df['TotalBsmtSF'].mean()
# what is mean 1stFlrSF
small_df['1stFlrSF'].mean()
# Lets add up all of the square footage to get a single square footage column for the entire dataset
# Using bracket syntax to make a new 'TotalSquareFootage' column
small_df['TotalSquareFootage'] = small_df['TotalBsmtSF']+small_df['1stFlrSF']+small_df['2ndFlrSF']
small_df.head()
# Lets make a nother new column that is 'PricePerSqFt' by dividing the price by the square footage
small_df['Price PerSqFt'] = small_df['SalePrice'] / small_df['TotalSquareFootage']
small_df.head()
small_df['Price PerSqFt'].head()
## crosstabs
df.head()
# value counts of a categorical variable
df['LotShape'].value_counts()
# value counts of a categorical variable
df['Alley'].value_counts()
# crosstab of alley and lotshape
pd.crosstab(df['Alley'], df['LotShape'])
# margins
pd.crosstab(df['Alley'], df['LotShape'], margins=True)
# as proportions
pd.crosstab(df['Alley'], df['LotShape'], margins=True, normalize='index')
# as proportions
pd.crosstab(df['Alley'], df['LotShape'], margins=True, normalize='columns')
# display the crosstab as a bar chart
results = pd.crosstab(df['Alley'], df['LotShape'])
pd.crosstab(df['Alley'], df['LotShape']).plot(kind='bar');
# Flip that to horizontal
results = pd.crosstab(df['LotShape'], df['Alley'])
results.plot(kind='barh')