Lesson 13 - Aggregating Data

import pandas as pd

Run to view results

# Look in the "data" folder.
# The file is a gzipped CSV; read_csv infers the compression from the
# ".gz" extension by default.
df = pd.read_csv(
    "https://raw.githubusercontent.com/austinlasseter/hosting_some_files/main/pandas_files/rejections.csv.gz"
)
# Preview the first rows.
df.head()

Run to view results

# Number of rows in the dataset.
len(df)

Run to view results

# Show the dtype pandas assigned to each column.
df.dtypes

Run to view results

# Count the missing (null) values in every column.
missing_data = df.isnull().sum()
print(missing_data)

Run to view results

# Smallest and largest amount requested.
print(df['Amount Requested'].min())
print(df['Amount Requested'].max())

Run to view results

# Rows where the amount requested is exactly 0.
df.loc[df['Amount Requested'].eq(0)]

Run to view results

# Rows where the amount requested is 0 should be dropped, because they're
# not actually loaning any money.
df = df.drop(df[df['Amount Requested'] == 0].index)
# Check that the rows were dropped (should return no rows).
df[df['Amount Requested'] == 0]

Run to view results

# Rows whose amount requested equals the column maximum.
df.loc[df['Amount Requested'].eq(df['Amount Requested'].max())]

Run to view results

# Hmmm, $1.4M for a car seems kinda funky! Let's drop it too.
df = df.drop(df[df['Amount Requested'] == 1400000].index)
# Check that the rows were dropped (should return no rows).
df[df['Amount Requested'] == 1400000]

Run to view results

# Rows that have no value in the State column.
df.loc[df["State"].isna()]

Run to view results

# Drop rows whose State value is NaN.
df = df.dropna(subset=["State"])

Run to view results

# Average amount requested per state, rounded to two decimals.
(
    df.groupby(by=['State'])['Amount Requested']
    .mean()
    .round(2)
)

Run to view results

# Count how often each distinct value appears in "Employment Length".
df['Employment Length'].value_counts()

Run to view results

# Drop rows that have no loan title.
# Number of missing titles before the drop.
df['Loan Title'].isnull().sum()
df = df.dropna(subset=["Loan Title"])
# Number of missing titles after the drop (should be 0).
df['Loan Title'].isnull().sum()

Run to view results

# Convert each string in the "Loan Title" column to be fully uppercase.
# The vectorised .str accessor is used rather than a per-row apply/lambda.
df['Loan Title'] = df['Loan Title'].str.upper()
df.head()

Run to view results

# (Sidestep) look for rows that have the word "DEBT" in the loan title.
df[df['Loan Title'].str.contains('DEBT')]

Run to view results

# Count how many rows have the word "DEBT" in the loan title.
# Summing the boolean mask counts the True entries.
df['Loan Title'].str.contains('DEBT').sum()

Run to view results

# Work out what this is as a % of your total dataset:
# (rows containing "DEBT") / (all rows) * 100, rounded to a whole number.
print(
    round(
        (df['Loan Title'].str.contains('DEBT').sum() / df['Loan Title'].shape[0]) * 100,
        0,
    ),
    "%",
)

Run to view results

# See what words are being used: the 25 most common loan titles.
# value_counts() sorts by descending frequency, so head(25) is the top 25.
df['Loan Title'].value_counts().head(25)

Run to view results

# Build a rough "bag of words" from a random sample of loan titles and show
# the 25 most frequent words.
from random import sample

# Split every title into words, treating underscores as spaces.
# str.split() with no argument already ignores surrounding whitespace.
LoanTitleList = [x.replace("_", " ").split() for x in df['Loan Title']]

# Sample at most 10,000 titles: random.sample raises ValueError when asked
# for more items than the population holds.
bagowords = [
    item
    for sublist in sample(LoanTitleList, min(10000, len(LoanTitleList)))
    for item in sublist
]

words = pd.DataFrame(bagowords)
# value_counts() sorts by descending frequency, so head(25) is the top 25.
words.value_counts().head(25)

Run to view results

# Rows whose loan title contains all four keywords:
# DEBT, CONSOLIDATION, CREDIT and CARD.
loan_titles = df['Loan Title']
all_keywords_mask = loan_titles.str.contains('DEBT')
for keyword in ('CONSOLIDATION', 'CREDIT', 'CARD'):
    all_keywords_mask = all_keywords_mask & loan_titles.str.contains(keyword)
df[all_keywords_mask]

Run to view results