import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
!pip install statsmodels
from scipy import stats
import statsmodels.api as sm
import pylab as py
Requirement already satisfied: statsmodels in /root/venv/lib/python3.7/site-packages (0.12.2)
Requirement already satisfied: numpy>=1.15 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Requirement already satisfied: patsy>=0.5 in /root/venv/lib/python3.7/site-packages (from statsmodels) (0.5.1)
Requirement already satisfied: pandas>=0.21 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.4)
Requirement already satisfied: scipy>=1.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.6.2)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.21->statsmodels) (2.8.1)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.21->statsmodels) (2021.1)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5->statsmodels) (1.15.0)
WARNING: You are using pip version 21.0.1; however, version 21.1.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
Task 1
#1.1
# Load the semicolon-separated data file.
df = pd.read_csv('processedClevelandData.csv', sep=";")

# Binarise the diagnosis column: values >= 1 become 1, values <= 0 become 0.
# Entries that satisfy neither comparison (e.g. NaN) are mapped to None.
df['num'] = np.where(df['num'] >= 1, 1, np.where(df['num'] <= 0, 0, None))

# Keep the first run of digits found in each 'chol' entry and store it as int.
# The pattern is a raw string so the backslash reaches the regex engine as
# written instead of relying on Python tolerating an invalid string escape.
df['chol'] = df['chol'].str.extract(r'(\d+)', expand=False).astype(int)

# Drop the two unnamed columns (presumably index columns left over from
# earlier CSV exports -- verify against the data file).
del df["Unnamed: 0"]
del df["Unnamed: 0.1"]

# Remove every row in which any cell is exactly the string '?'.
df = df[(df.astype(str) != '?').all(axis=1)]
print(df)

# Checking data information
# df.info()

# Summarise missing values per column. 'Percent' holds the fraction of null
# cells (0..1), not a value scaled to 100.
null_mask = df.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
id Age Sex cp trestbps chol fbs resetecg thalach exang \
0 1 63.0 1.0 1.0 145.0 233 1.0 2.0 150.0 0.0
1 2 67.0 1.0 4.0 160.0 286 0.0 2.0 108.0 1.0
2 3 67.0 1.0 4.0 120.0 229 0.0 2.0 129.0 1.0
3 4 37.0 1.0 3.0 130.0 250 0.0 0.0 187.0 0.0
4 5 41.0 0.0 2.0 130.0 204 0.0 2.0 172.0 0.0
.. ... ... ... ... ... ... ... ... ... ...
297 298 57.0 0.0 4.0 140.0 241 0.0 0.0 123.0 1.0
298 299 45.0 1.0 1.0 110.0 264 0.0 0.0 132.0 0.0
299 300 68.0 1.0 4.0 144.0 193 1.0 0.0 141.0 0.0
300 301 57.0 1.0 4.0 130.0 131 0.0 0.0 115.0 1.0
301 302 57.0 0.0 2.0 130.0 236 0.0 2.0 174.0 0.0
slope ca thal num
0 3.0 0.0 6.0 0
1 2.0 3.0 3.0 1
2 2.0 2.0 7.0 1
3 3.0 0.0 3.0 0
4 1.0 0.0 3.0 0
.. ... ... ... ..
297 2.0 0.0 7.0 1
298 2.0 0.0 7.0 1
299 2.0 2.0 7.0 1
300 2.0 1.0 7.0 1
301 2.0 1.0 3.0 1
[297 rows x 14 columns]
# Display the first five rows of the per-column missing-value summary.
missing_data.head(n=5)
#1.2
# Resting blood pressure ('trestbps') split by the binary diagnosis in 'num'.
lowheart = df.loc[df['num'] == 0, 'trestbps']
highheart = df.loc[df['num'] == 1, 'trestbps']

# Print mean, median and standard deviation for each diagnosis group.
# np.std is called with its default arguments.
for diagnosis, pressures in ((0, lowheart), (1, highheart)):
    print(f"For diagnosis of heart disease {diagnosis}, we get: ")
    print(f"mean: {np.mean(pressures)!s}")
    print(f"median: {np.median(pressures)!s}")
    print(f"standard deviation: {np.std(pressures)!s}")
For diagnosis of heart disease 0, we get:
mean: 129.175
median: 130.0
standard deviation: 16.322741038195762
For diagnosis of heart disease 1, we get:
mean: 134.63503649635035
median: 130.0
standard deviation: 18.82763767250515
#1.3
# Spearman and Kendall correlation matrices for four columns, each printed
# and drawn as a heatmap on its own row of a shared figure.
corr_columns = ['Age', 'trestbps', 'chol', 'thalach']
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)
ax1.set_title("spearman")
ax2.set_title("kendall")

sp = df[corr_columns].corr(method='spearman')
print("Spearmans correlation:")
sns.heatmap(sp, ax=ax1)
print(sp)
print("\n")

kl = df[corr_columns].corr(method='kendall')
print("Kendalls correlation:")
sns.heatmap(kl, ax=ax2)
print(kl)
Spearmans correlation:
Age trestbps chol thalach
Age 1.000000 0.299610 0.183440 -0.392571
trestbps 0.299610 1.000000 0.139193 -0.046782
chol 0.183440 0.139193 1.000000 -0.034758
thalach -0.392571 -0.046782 -0.034758 1.000000
Kendalls correlation:
Age trestbps chol thalach
Age 1.000000 0.211476 0.126714 -0.276616
trestbps 0.211476 1.000000 0.095369 -0.032675
chol 0.126714 0.095369 1.000000 -0.023465
thalach -0.276616 -0.032675 -0.023465 1.000000
#1.5
# Plot the 'thalach' values sorted ascending against their rank.
plt.figure()
plt.title("Maximum heart rate in ascending order")
sorted_thalach = np.sort(df['thalach'])  # np.sort returns a sorted copy
plt.plot(range(sorted_thalach.size), sorted_thalach)
plt.xlabel("index")
plt.ylabel("maximum heart rate")
plt.grid(axis='y')

# Kernel density estimate of 'thalach'. sns.displot is a figure-level function
# that creates its own figure, so no plt.figure() call precedes it; one would
# only leave an empty extra figure behind.
sns.displot(df, x="thalach", kind="kde")
plt.xlabel("maximum heart rate")
plt.title("Density plot of the maximum heart rate")
plt.grid(axis='x')
plt.show()
# 1.6
# Frequency of Sex
sex_counts = df['Sex'].value_counts()  # Generates counts.
# Previously recorded output: 206 Males and 97 Females
# NOTE(review): those counts sum to 303, but the cleaned frame printed in 1.1
# has 297 rows -- re-run and update the figures above.

fig, ax = plt.subplots()
sex_counts.plot(kind='bar', ax=ax)
plt.ylabel("No. of individuals")
plt.title("Frequency of Sex of Patient")

# value_counts() orders the bars by frequency, not by category value, so each
# tick label is derived from the category it belongs to instead of from its
# position. The coding 0 = female / 1 = male matches the filters used in the
# following cell. Unknown codes fall back to their raw value.
sex_names = {0: 'female', 1: 'male'}
labels = [sex_names.get(code, str(code)) for code in sex_counts.index]
ax.set_xticklabels(labels)
plt.show()
plt.figure()
# 1.6
# Frequency of major vessels colored by fluoroscopy in male patients.
# Split the frame by the 'Sex' code (0 -> `female`, 1 -> `male`).
female = df[df['Sex'] == 0]
female_values = female['ca'].value_counts()  # computed but not plotted here
male = df[df['Sex'] == 1]
male_values1 = male['ca'].value_counts()

# Bar chart of the male counts; note that `male_values` holds the object
# returned by .plot(), not the counts themselves.
male_values = male_values1.plot(kind='bar')
plt.ylabel("No. of individuals")
plt.xlabel("No. of major vessels")
plt.title("Number of major vessels colored in Male Patients")
male_values1
# 1.7
### trestbps;resting blood pressure (in mm Hg on admission to the hospital)
### chol;serum cholestoral in mg/dl
# Side-by-side box plots of the two columns on one wide figure.
num_cols = ['trestbps', 'chol']
plt.figure(figsize=(18, 9))
outlier_ax = plt.gca()
df[num_cols].boxplot(ax=outlier_ax)
outlier_ax.set_title("Detecting outliers", fontsize=20)
plt.show()
Task 2
# Histogram of Age for all rows, then for each value of the binary 'num'.
age_num0 = df.loc[df['num'] == 0, 'Age']
age_num1 = df.loc[df['num'] == 1, 'Age']
df[['Age']].hist()
age_num0.hist()
age_num1.hist()

# Q-Q plot of Age for the num == 1 group, with statsmodels' line="s" option.
fig = sm.qqplot(age_num1, line="s")
plt.show()
/root/venv/lib/python3.7/site-packages/statsmodels/graphics/gofplots.py:993: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "bo" (-> marker='o'). The keyword argument will take precedence.
ax.plot(x, y, fmt, **plot_style)
# Q-Q plot of Age for the num == 0 group, with statsmodels' line="s" option.
fig = sm.qqplot(df.loc[df['num'] == 0, 'Age'], line="s")
# Show the figure only after it has been created; calling plt.show() first
# leaves this plot undisplayed when the code runs as a plain script.
plt.show()
/root/venv/lib/python3.7/site-packages/statsmodels/graphics/gofplots.py:993: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "bo" (-> marker='o'). The keyword argument will take precedence.
ax.plot(x, y, fmt, **plot_style)
# Age values grouped by the binary 'num' column.
age_by_num = {label: df.loc[df['num'] == label, 'Age'] for label in (0, 1)}

# Normality test for each group, then an independent two-sample t-test
# comparing the groups (scipy defaults throughout).
stats.normaltest(age_by_num[1])
stats.normaltest(age_by_num[0])
stats.ttest_ind(age_by_num[0], age_by_num[1])
# Contingency table of 'Sex' (rows) against 'num' (columns), without margins,
# followed by Fisher's exact test on that table.
data_crosstab = pd.crosstab(index=df['Sex'], columns=df['num'], margins=False)
data_crosstab
stats.fisher_exact(data_crosstab)
# Chi-square test of independence between 'exang' and 'slope'.
# stats.chisquare expects observed and expected *frequencies*, so feeding it
# the two raw columns does not test their association; build the contingency
# table first and pass it to chi2_contingency, which returns
# (statistic, p-value, degrees of freedom, expected frequencies).
stats.chi2_contingency(pd.crosstab(df['exang'], df['slope']))