import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Umaima Khurshid Ahmad -
# Show the working directory so the location of the data file can be checked.
# os.getcwd() replaces the IPython-only `%pwd` magic, which is a SyntaxError
# when this file is run as plain Python.
print(os.getcwd())
# Question 1: explore the general characteristics of the data as a whole --
# examine the means, standard deviations and other statistics of the numerical
# attributes, and show the distributions of the categorical attributes.
# The first CSV column is used as the index and "?" entries are read as NaN.
df = pd.read_csv("C:/Users/Umaima/Untitled Folder/bank_data.csv", index_col=0, na_values=["?"])
# Question 1 - Part a
df.shape
# Check what the data looks like.
df.head(5)
df.columns  # list the columns to see which ones can be used further
df.dtypes
# describe() summarises only the numeric columns by default, so the
# categorical ones (gender, region, married, car, pep, ...) are not shown.
# Use df.describe(include="all") to include them.
df.describe()
# Author's observations from the output: the mean age is 42.395000 and the
# mean income is 27524.031217, i.e. the centres of those two distributions.
# Count the missing values per column over the whole table; slicing the first
# five rows of df.isnull() would only reveal nulls in those five rows.
df.isnull().sum()
df.dropna(axis=0, inplace=True)  # drop every row that has a missing value
df.shape  # author's observation: same shape as before, so no rows were dropped
from pandas.plotting import scatter_matrix

# Describing a categorical variable: bar chart of how many customers fall in
# each PEP class.
pep_counts = df["pep"].value_counts()
pep_counts.plot.bar()
# Pairwise scatter plots of age and income, to see visually whether they might
# relate to opting for the PEP (an assumption to be checked, not a finding).
# NOTE(review): scatter_matrix appears to plot numeric columns only, so the
# "pep" column probably does not show up in the grid -- confirm.
scatter_columns = ["age", "income", "pep"]
scatter_matrix(
    df[scatter_columns],
    figsize=(14, 14),
    hist_kwds={"bins": 8},
    alpha=0.5,
    marker="o",
    s=50,
)
# Show four categorical attributes side by side in one 2x2 figure.
# Each subplot's Axes is captured from add_subplot() and handed to the pandas
# plot call via ax=; the previous code used an `ax1` that was never assigned,
# which raised NameError before anything was drawn.
plt.style.use('ggplot')  # set before any Axes exists so all panels share the style
fig = plt.figure(figsize=(10, 10))
# (subplot position, column, title, x label, y label)
categorical_panels = [
    (221, "region", "-- No of Region -- ", 'Region', 'Total'),  # top left
    (222, "married", "--No of Married -- ", 'Married/YES-NO', 'Total'),  # top right
    (223, "car", "-- No of Car Ownership --", 'Car Ownership', 'Count'),  # bottom left
    (224, "savings_acct", "-- Total savings acctount --", 'savings account', 'Count'),  # bottom right
]
for position, column, title, x_label, y_label in categorical_panels:
    ax1 = fig.add_subplot(position)
    df[column].value_counts().plot(kind='bar', grid=True, ax=ax1)
    # Label after plotting so the plot call cannot overwrite the axis labels.
    ax1.set_title(title)
    ax1.set_xlabel(x_label)
    ax1.set_ylabel(y_label)
fig.tight_layout()  # keep tick labels and titles of neighbouring panels apart
plt.show()
# Means of the numeric attributes, grouped by car ownership.
# numeric_only=True limits the aggregation to numeric columns; without it
# pandas 2.0 and later raise TypeError on the string-valued columns.
df.groupby("car").mean(numeric_only=True)
# Medians of the numeric attributes, grouped by car ownership.
df.groupby("car").median(numeric_only=True)
# Question 2: the hypothetical bank is particularly interested in customers
# who buy the PEP (Personal Equity Plan) product. Compare and contrast the
# customers who buy and don't buy the PEP, compute summaries (as in part 1)
# with respect to all other attributes, and discuss any significant
# differences between the two segments.
import matplotlib.pyplot as plt

# Bar chart of the mean and the maximum income per PEP group.
# This replaces two plt.bar() calls that drew one bar per customer: all bars
# of a group were stacked on the same x position, and the second call used a
# boolean mask as x coordinates, so neither chart summarised the data.
fig, ax = plt.subplots(figsize=(5, 5))
income_by_pep = df.groupby("pep")["income"].agg(["mean", "max"])
income_by_pep.plot(kind="bar", color=["maroon", "grey"], width=0.9, ax=ax)
ax.set_xlabel("pep")
ax.set_ylabel("income")
# Boolean mask selecting the customers who bought the PEP.
pep_YES = (df["pep"] == "YES")
# Boolean mask selecting the customers who did not buy the PEP.
pep_NO = (df["pep"] == "NO")
df[pep_YES].describe()
df[pep_NO].describe()
# Author's observations from the two summaries: customers with PEP = YES have
# a higher mean income than those with PEP = NO. The maximum income for
# PEP = NO is 61554.600000, while for PEP = YES it is a few thousand dollars
# higher, so PEP appears to be associated with income in some cases.
df
# Savings-account ownership within each PEP group. The working assumption is
# that customers with savings might have funds that could go into a PEP.
pep_yes = df.loc[df["pep"].eq("YES")]
pep_yes["savings_acct"].value_counts()
pep_NO = df.loc[df["pep"].eq("NO")]
pep_NO["savings_acct"].value_counts()
# Mortgage status among PEP buyers.
pep_yes["mortgage"].value_counts()
# Author's observation: people who buy the PEP mostly do not have a mortgage.
# NOTE(review): only the PEP = YES group is inspected here, so this is not a
# comparison against non-buyers -- confirm before relying on it.
# Question 3: standardise the income attribute with z-score normalisation,
# z = (x - mean) / std, using pandas' default Series.std().
income_mean = df["income"].mean()
income_std = df["income"].std()
income_z = df["income"].sub(income_mean).div(income_std)
income_z.head(5)
# Store the standardised values next to the original income column.
df["income-zscroe"] = income_z
df.head()
# Question 4: discretise the age attribute into 3 categories ("young",
# "mid-age" and "old") without changing the original age attribute.
# First look at the three equal-frequency intervals qcut picks on its own.
age_bins = pd.qcut(df.age, 3)
age_bins.head(10)
# With retbins=True qcut returns a pair: (labelled Series, array of bin edges).
age_bins = pd.qcut(df.age, [0, 0.33, 0.66, 1], labels=['young', 'mid-age', 'old'], retbins=True)
# Unpack the pair so the labels can be used as a column; the earlier attempt to
# concatenate `age_bins` itself failed because it is a tuple, not a Series.
age_group, age_bin_edges = age_bins
age_group
age_bin_edges
# assign() returns a new frame with the extra column, so `df` and its original
# age column stay untouched for the later min-max normalisation steps.
df_with_age_group = df.assign(age_group=age_group)
df_with_age_group.head()
# Question 5 (part 1): use min-max normalisation to map the numeric attributes
# (income, age, children) of the original table onto the range 0.0-1.0:
#     x_norm = (x - min) / (max - min)
# Min-max normalisation for income.
min_income = df["income"].min()
max_income = df["income"].max()
norm_sal = (df["income"] - min_income) / (max_income - min_income)
print(norm_sal.head(10))
# Min-max normalisation for age. It must use the age minimum and maximum;
# scaling age with the income bounds (as before) yields values outside 0-1.
min_age = df["age"].min()
max_age = df["age"].max()
norm_age = (df["age"] - min_age) / (max_age - min_age)
print(norm_age.head(10))
# Question 5 (continued): min-max normalisation of the numeric attributes
# (income, age, children) of the original table onto the range 0.0-1.0.
# Min-max normalisation for children.
min_children = df["children"].min()
max_children = df["children"].max()
children_range = max_children - min_children
norm_children = (df["children"] - min_children) / children_range
print(norm_children.head(10))
df.head(5)  # quick look at the table before the new columns are added
# Add the three min-max normalised series to the data frame as new columns.
for column_name, values in (
    ("norm_children", norm_children),
    ("norm_age", norm_age),
    ("norm_sal", norm_sal),
):
    df[column_name] = values
df
# New data frame holding only the normalised numeric attributes.
normalized_columns = ['norm_children', 'norm_age', 'norm_sal']
df_Numeric_Values = df[normalized_columns]
df_Numeric_Values
df_Numeric_Values.describe()
# Question 6: combine the normalised numbers from part 5 with dummy variables
# for the categorical attributes into one table. All attributes should be
# numeric and no attribute should be repeated. Save the result to
# bank_numeric.csv.
# Drop the raw numeric columns and the z-score column first: their min-max
# normalised versions are already in df, so keeping them would repeat
# attributes. errors="ignore" tolerates a column that is not present.
repeated_columns = ["age", "income", "children", "income-zscroe"]
# dtype=int writes the dummies as 0/1 rather than True/False on newer pandas.
tbl = pd.get_dummies(df.drop(columns=repeated_columns, errors="ignore"), dtype=int)
tbl.to_csv("bank_numeric.csv", float_format="%1.2f")
tbl.head(5)
# Question 7: using the standardised data set of the previous part, perform a
# basic correlation analysis among the attributes with a complete correlation
# matrix, trying at least two corr methods and comparing them.
# NOTE(review): the first CSV column was loaded as the index, presumably the
# customer id, so it is not among the correlated columns -- confirm.
corr_pearson = tbl.corr()  # default method
corr_spearman = tbl.corr(method="spearman")  # rank-based, for comparison
corr_pearson
corr_spearman
# Author's observations: there is a positive correlation between age and
# income, i.e. income tends to increase with age. People with children tend
# not to buy the PEP; one possible reading is that, even with a higher income,
# the extra expenses of children leave less to save and put into a PEP.
# Draw the matrix before calling show(); this call had been swallowed by the
# preceding comment line, so show() had nothing to display.
plt.matshow(corr_pearson)
plt.show()
corr_pearson.style.background_gradient(cmap='coolwarm')
# Scatter plot of the (non-normalised) income attribute against age, with
# axis labels, to judge whether the two variables seem correlated.
df.plot.scatter(
    x="age",
    y="income",
    title="Relation Between Age and Income",
    grid=True,
)
# Author's observation: they seem to be correlated -- income increases as age
# increases.
# Histogram of the (non-normalised) income attribute, using 10 bins.
# The Axes returned by subplots() is used for both drawing and labelling; the
# previous code labelled an `ax1` that was never assigned (NameError).
fig, ax1 = plt.subplots(figsize=(10, 4))
df["income"].plot(kind="hist", bins=10, grid=True, ax=ax1)
ax1.set_title("Income Distribution")
ax1.set_xlabel('Income')
ax1.set_ylabel('Frequency')
# Author's observation: the income histogram is right-skewed.
# Histogram of the (non-normalised) age attribute, using 10 bins.
# This section previously re-plotted income with 14 bins under Income labels.
fig, ax1 = plt.subplots(figsize=(10, 4))
df["age"].plot(kind="hist", bins=10, grid=True, ax=ax1)
ax1.set_title("Age Distribution")
ax1.set_xlabel('Age')
ax1.set_ylabel('Frequency')
# NOTE(review): the earlier "right-skewed" remark was made about the income
# plot; inspect the age histogram before describing its shape.
# Question 10: bar graph of the distribution of the region attribute.
plt.style.use('ggplot')  # set before the Axes is created so it is styled too
fig = plt.figure(figsize=(10, 10))
# Keep the Axes returned by add_subplot(); the previous code discarded it and
# then labelled an `ax1` that was never assigned (NameError).
ax1 = fig.add_subplot(221)
df.region.value_counts().plot(kind='bar', grid=True, ax=ax1)
# Label after plotting so the plot call cannot overwrite the axis labels.
ax1.set_title("-- No of Region -- ")
ax1.set_xlabel('Region')
ax1.set_ylabel('Total')
# Cross-tabulation of region against PEP: one row per region, one column per
# PEP value, each cell holding the number of customers.
cross_tab = pd.crosstab(df['region'], df['pep'])
cross_tab
# Draw first, then show; plt.show() does not take an Axes as its argument.
cross_tab.plot(kind="bar", grid=True, title="Cross Tabulation between PEP and Reigion")
plt.show()