import pandas as pd
Run to view results
# Load the rejected-loan data from the gzipped CSV hosted on GitHub; with its
# default settings read_csv infers the gzip compression from the ".gz" suffix.
rejections_url = "https://raw.githubusercontent.com/austinlasseter/hosting_some_files/main/pandas_files/rejections.csv.gz"
df = pd.read_csv(rejections_url)
# Preview the first rows of the loaded frame.
df.head()
Run to view results
# Number of rows in the dataset.
len(df)
Run to view results
# Show the data type pandas assigned to each column.
df.dtypes
Run to view results
# Tally the null entries in every column and print the per-column totals.
missing_data = df.isna().sum()
print(missing_data)
Run to view results
# Print the smallest and largest requested amounts, one per line.
print(
    df['Amount Requested'].min(),
    df['Amount Requested'].max(),
    sep="\n",
)
Run to view results
# Inspect the applications whose requested amount is exactly zero.
df.loc[df['Amount Requested'].eq(0)]
Run to view results
# A request for $0 is not actually a loan, so remove those rows by their
# index labels.
df = df.drop(index=df.loc[df['Amount Requested'].eq(0)].index)
# Sanity check: this should now show an empty frame.
df.loc[df['Amount Requested'].eq(0)]
Run to view results
# Inspect the application(s) requesting the largest amount in the dataset.
df.loc[df['Amount Requested'].eq(df['Amount Requested'].max())]
Run to view results
# A $1.4M request for a car looks implausible, so treat it as an outlier and
# remove it as well.
df = df.drop(index=df.loc[df['Amount Requested'].eq(1_400_000)].index)
# Sanity check: this should now show an empty frame.
df.loc[df['Amount Requested'].eq(1_400_000)]
Run to view results
# Inspect the rows where the State value is missing.
df.loc[df["State"].isna()]
Run to view results
# Discard every row whose State value is missing.
df = df.dropna(axis="index", subset=["State"])
Run to view results
# Average requested amount per state, rounded to two decimal places.
(
    df.groupby(['State'])['Amount Requested']
    .agg('mean')
    .round(2)
)
Run to view results
# Count how many rows fall under each distinct "Employment Length" value
# (value_counts() leaves out missing values by default).
df['Employment Length'].value_counts()
Run to view results
# Count the missing loan titles before cleaning (the value of this bare
# expression is not stored anywhere).
df['Loan Title'].isna().sum()
# Remove every row that has no loan title.
df = df.dropna(axis="index", subset=["Loan Title"])
# Re-count to confirm that no missing titles remain.
df['Loan Title'].isna().sum()
Run to view results
# Convert every loan title to upper case. The vectorised .str.upper() accessor
# is used in place of a per-row apply(lambda x: x.upper()).
df['Loan Title'] = df['Loan Title'].str.upper()
# Preview the first rows to confirm the conversion.
df.head()
Run to view results
# (Side step) show the rows whose loan title contains the word "DEBT".
# The keyword is a plain substring, so regex matching is switched off.
df.loc[df['Loan Title'].str.contains('DEBT', regex=False)]
Run to view results
# Number of rows whose loan title contains the word "DEBT": summing the
# boolean mask counts its True entries.
df['Loan Title'].str.contains('DEBT', regex=False).sum()
Run to view results
# Express the "DEBT" titles as a percentage of all remaining rows, rounded to
# a whole number.
print(
    round(
        (df['Loan Title'].str.contains('DEBT').sum() / len(df['Loan Title'])) * 100,
        0,
    ),
    "%",
)
Run to view results
# List the 25 most common loan titles to see which words are being used.
df['Loan Title'].value_counts().head(25)
Run to view results
from random import sample

# Split every loan title into its individual words, treating underscores as
# word separators. str.split() with no argument already ignores leading,
# trailing and repeated whitespace, so no separate strip() is needed.
LoanTitleList = [title.replace("_", " ").split() for title in df['Loan Title']]

# Work on a random sample of at most 10,000 titles. Capping the sample size at
# the number of available titles avoids the ValueError that random.sample
# raises when asked for more items than the population contains.
sample_size = min(10000, len(LoanTitleList))
bagowords = [
    word
    for title_words in sample(LoanTitleList, sample_size)
    for word in title_words
]

# Show the 25 most frequent words in the sampled titles.
words = pd.DataFrame(bagowords)
words.value_counts().head(25)
Run to view results
# Show the rows whose loan title mentions all four keywords at once:
# DEBT, CONSOLIDATION, CREDIT and CARD.
df.loc[
    df['Loan Title'].str.contains('DEBT')
    & df['Loan Title'].str.contains('CONSOLIDATION')
    & df['Loan Title'].str.contains('CREDIT')
    & df['Loan Title'].str.contains('CARD')
]
Run to view results