import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
# Load the BRFSS 2015 diabetes health-indicators CSV (path is relative to the
# current working directory) into the module-level frame used by the plots below.
diabetes_df = pd.read_csv("diabetes_012_health_indicators_BRFSS2015.csv")
# Quick inspection: first rows, per-column dtype / non-null summary, and the
# number of distinct values per column. The results of head() and nunique()
# are not stored, so they are only visible in an interactive session;
# info() prints its summary itself.
diabetes_df.head()
diabetes_df.info()
diabetes_df.nunique()
def plot_histograms():
    """Draw a separate histogram figure for BMI, MentHlth and PhysHlth.

    Reads the module-level ``diabetes_df``. Each selected column gets its own
    figure; the number of bins equals the number of distinct values found in
    that column.
    """
    wanted = {"BMI", "MentHlth", "PhysHlth"}
    # Walk the frame's columns (not the set) so figures follow column order.
    for column in (name for name in diabetes_df.columns if name in wanted):
        values = diabetes_df[column]
        axis = plt.figure().add_subplot()
        axis.hist(values, bins=values.nunique())
        axis.set_title(column + ' Histogram')
        axis.set_xlabel(column)
        axis.set_ylabel('Frequency')


plot_histograms()
def plot_pies():
    """Draw a 4x4 grid of pie charts, one per column not listed in ``excluded``.

    Reads the module-level ``diabetes_df``. Each pie shows the share of every
    distinct value in its column. Panels are filled down each grid column
    first, then across.

    Raises:
        IndexError: if more than 16 columns remain after the exclusion,
            because the grid has only 4x4 panels.
    """
    excluded = {"BMI", "MentHlth", "PhysHlth", "Age", "GenHlth", "Education", "Income"}
    pie_columns = [name for name in diabetes_df.columns if name not in excluded]

    fig, axs = plt.subplots(4, 4)
    for position, column in enumerate(pie_columns):
        grid_col, grid_row = divmod(position, 4)
        axis = axs[grid_row, grid_col]
        counts = diabetes_df[column].value_counts()
        # Take the labels from the same Series as the wedge sizes.
        # value_counts() orders by frequency while unique() orders by first
        # appearance, so mixing the two can attach labels to the wrong wedges.
        axis.pie(counts, autopct='%1.1f%%', labels=counts.index)
        axis.set_title(column)

    fig.set_figheight(15)
    fig.set_figwidth(15)
    # Open a fresh, empty current figure so later implicit pyplot calls do not
    # draw onto this grid.
    plt.figure()


plot_pies()
def plot_bars():
    """Draw a 2x2 grid of bar charts for Age, GenHlth, Education and Income.

    Reads the module-level ``diabetes_df``. Each chart shows how many rows
    carry each distinct value of the column. Panels are filled left to right,
    then top to bottom, in the frame's column order.
    """
    wanted = {"Age", "GenHlth", "Education", "Income"}
    bar_columns = [name for name in diabetes_df.columns if name in wanted]

    fig, axs = plt.subplots(2, 2)
    for position, column in enumerate(bar_columns):
        grid_row, grid_col = divmod(position, 2)
        axis = axs[grid_row, grid_col]
        # Category values and their counts come from one Series so each bar
        # sits at its own category. unique() (first-appearance order) and
        # value_counts() (frequency order) are not aligned with each other.
        counts = diabetes_df[column].value_counts().sort_index()
        axis.bar(counts.index, counts.values)
        axis.set_title(column)
        axis.set_xlabel(column)

    fig.set_figheight(15)
    fig.set_figwidth(15)
    # Open a fresh, empty current figure so later implicit pyplot calls do not
    # draw onto this grid.
    plt.figure()


plot_bars()
import matplotlib.pyplot as plt
def plot_correlations():
    """Render an annotated heatmap of the pairwise column correlations.

    Reads the module-level ``diabetes_df`` and draws onto a newly created
    15x5 inch, 300 dpi figure.
    """
    correlation_matrix = diabetes_df.corr()
    plt.subplots(figsize=(15, 5), dpi=300)
    # NOTE(review): seaborn's documented keyword is `linewidths`; confirm that
    # `linewidth` is forwarded to the underlying mesh as intended.
    heatmap_options = {
        "annot": True,
        "annot_kws": {"fontsize": 4},
        "fmt": "0.2f",
        "square": True,
        "linewidth": .3,
    }
    sns.heatmap(correlation_matrix, **heatmap_options)


plot_correlations()
# Box plot of the BMI column alone (selected as a one-column frame), no grid.
diabetes_df[["BMI"]].boxplot(grid=False, figsize=(10, 10))
import pandas as pd
import seaborn as sns
import numpy as np
pd.set_option("display.max_columns", None)
diabetes_df = pd.read_csv("diabetes_012_health_indicators_BRFSS2015.csv")

# Drop the prediabetes rows (class 1) and relabel class 2 as 1, leaving a
# binary target. .copy() makes the filtered frame independent, so the column
# assignment below does not write into a slice of the original frame.
diabetes_df = diabetes_df[diabetes_df["Diabetes_012"] != 1].copy()
diabetes_df["Diabetes_012"] = diabetes_df["Diabetes_012"].replace(2.0, 1.0)

# Separate features and target. diabetes_X / diabetes_y stay as the raw
# (unresampled, all-feature) data; only the train/test variables below are
# transformed.
diabetes_X, diabetes_y = diabetes_df.drop(columns="Diabetes_012"), diabetes_df.Diabetes_012

# Split into training and test sets BEFORE any fitted preprocessing, so that
# nothing learned from the test rows can leak into training.
diabetes_train_X, diabetes_test_X, diabetes_train_y, diabetes_test_y = train_test_split(
    diabetes_X, diabetes_y, test_size=0.2, random_state=42
)

# Oversampling: SMOTE is applied to the training part only. The test set keeps
# its real rows and real class balance, so the score reflects unseen data.
diabetes_train_X, diabetes_train_y = SMOTE(random_state=42).fit_resample(
    diabetes_train_X, diabetes_train_y
)

# Select the 10 best features: scores are fitted on the training data and the
# same column choice is then applied to the test data.
selector = SelectKBest(k=10)
diabetes_train_X = selector.fit_transform(diabetes_train_X, diabetes_train_y)
diabetes_test_X = selector.transform(diabetes_test_X)

# Scaling (standardization), fitted on the training data only.
sc = StandardScaler()
diabetes_train_X = sc.fit_transform(diabetes_train_X)
diabetes_test_X = sc.transform(diabetes_test_X)

# Normalization to the [0, 1] range seen in training; `sc` is rebound to the
# MinMaxScaler from here on.
# NOTE(review): min-max scaling after standardization should give the same
# values as min-max scaling alone — confirm whether both steps are needed.
sc = MinMaxScaler()
diabetes_train_X = sc.fit_transform(diabetes_train_X)
diabetes_test_X = sc.transform(diabetes_test_X)

# Baseline model: Gaussian naive Bayes, reported as mean accuracy on the test set.
gaussian = GaussianNB()
gaussian.fit(diabetes_train_X, diabetes_train_y)
print(gaussian.score(diabetes_test_X, diabetes_test_y))
# NOTE(review): looks like a leftover debug print — confirm before removing.
print("Its me mario")