# import pandas and numpy
import numpy as np
import pandas as pd
# Explore the Ames, Iowa housing dataset.
# source: https://www.kaggle.com/c/ames-housing-data
IOWA_CSV_URL = 'https://raw.githubusercontent.com/austinlasseter/hosting_some_files/main/pandas_files/iowa.csv'
df = pd.read_csv(IOWA_CSV_URL)
# Dimensions of the frame as (rows, columns).
df.shape
# Preview the first five rows.
df.head()
# Let pandas display up to 100 rows before truncating its output.
pd.set_option('display.max_rows', 100)
# Data types of the first five columns.
df.dtypes.head()
# First ten values of the `BedroomAbvGr` column.
df['BedroomAbvGr'].iloc[:10]
# First ten values of the `LotFrontage` column.
lot_frontage = df['LotFrontage']
lot_frontage.iloc[:10]
# Frequency of every frontage value; dropna=False keeps NaN as its own row.
lot_frontage.value_counts(dropna=False)
# NaN is the most common value in this column. What is a NaN?
# It stands for "not a number" and is how pandas marks missing data.
# What is the datatype of NaN? (np.nan is a Python float.)
type(np.nan)
# How many LotFrontage values are missing?
lot_frontage.isna().sum()
# Drop the rows whose LotFrontage is missing, modifying df in place.
df.dropna(subset=['LotFrontage'], inplace=True)
# Missing-value count for every column after the drop.
df.isna().sum()
# Missing LotFrontage values are not filled here; those rows were dropped instead.
# Passing a list of column headers selects just those columns, giving a
# smaller dataframe; .copy() makes it independent of the original frame.
subset_columns = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'SalePrice']
df_2 = df.loc[:, subset_columns].copy()
# Mean basement square footage.
df_2['TotalBsmtSF'].mean()
# Mean first-floor square footage.
df_2['1stFlrSF'].mean()
# Combine basement, first-floor and second-floor areas into a single
# 'TotalSquareFootage' column (element-wise sum of the three columns).
df_2['TotalSquareFootage'] = (
    df_2['TotalBsmtSF']
    .add(df_2['1stFlrSF'])
    .add(df_2['2ndFlrSF'])
)
df_2.head()
# Add another new column, 'PricePerSqFt': sale price divided by total square footage.
df_2['PricePerSqFt'] = df_2['SalePrice'].div(df_2['TotalSquareFootage'])
df_2.head()
# Back to the full dataframe: preview the first five rows.
df.head()
# Counts of each Street / HouseStyle combination; margins=True adds 'All' totals.
pd.crosstab(df['Street'], df['HouseStyle'], margins=True)
# Value counts of a categorical variable: LotShape.
df['LotShape'].value_counts()
# Value counts of a categorical variable: Street.
df['Street'].value_counts()
# Crosstab of Alley (rows) against Street (columns).
alley_by_street = pd.crosstab(df['Alley'], df['Street'])
alley_by_street
# Same table with row and column totals.
pd.crosstab(df['Alley'], df['Street'], margins=True)
# As proportions of each row. The normalize value must be lowercase:
# 'Index' raises "ValueError: Not a valid normalize argument".
pd.crosstab(df['Alley'], df['Street'], margins=True, normalize='index')
# As proportions of each column.
pd.crosstab(df['Alley'], df['Street'], margins=True, normalize='columns')
# Display the Alley/Street crosstab as a vertical bar chart.
alley_by_street.plot(kind='bar')
# Flip that to horizontal bars.
alley_by_street.plot(kind='barh')