import Pkg; Pkg.add(["CSV","CategoricalArrays",
"Chain", "DataFrames", "GLM", "Plots", "Random", "StatsPlots",
"Statistics","Interact", "Blink"])
using CSV
using CategoricalArrays
using Chain
using DataFrames
using GLM
using Plots
using Random
using StatsPlots
using Statistics
# Terminal-size hints used when printing tables (Julia's `displaysize` consults
# the LINES and COLUMNS environment variables).
# Keep printed output to about 20 rows, but allow very wide output so that
# data-frame columns are not cut off.
ENV["LINES"] = 20
ENV["COLUMNS"] = 1000
# Read the heart-disease data into a DataFrame, then look at its dimensions
# and per-column summary.
df_raw = CSV.read("/work/Data/Heart Disease Dataset.csv", DataFrame)
(nrow(df_raw), ncol(df_raw))
describe(df_raw)

# Working copy of the data in which the indicator-like columns (`sex`, `fbs`,
# `exang`, `target`) are stored as categorical vectors. `renamecols=false`
# keeps each transformed column under its source name instead of the
# auto-generated `<name>_categorical`.
df = select(
    df_raw,
    :age,
    :sex => categorical,
    Between(:cp, :chol),
    :fbs => categorical,
    :restecg,
    :thalach,
    :exang => categorical,
    Between(:oldpeak, :thal),
    :target => categorical;
    renamecols=false,
)
# Exploratory group summaries. Each pipeline is written with Chain.jl's
# `@chain`, which passes the result of every line as the first argument of the
# next call.

# Per-target means of a few columns, after dropping rows with missing values.
# This one runs on `df_raw`: `:sex` is categorical in `df`, and `mean` is not
# defined for categorical values.
@chain df_raw begin
    dropmissing
    groupby(:target)
    combine([:age, :sex, :chol, :restecg, :slope] .=> mean)
end

# Per-target means of every column of `df` whose element type is `Real`
# (the categorical columns are therefore skipped).
@chain df begin
    groupby(:target)
    combine(names(df, Real) .=> mean)
end

# Number of rows for each (target, sex) combination, in long form.
@chain df begin
    groupby([:target, :sex])
    combine(nrow)
end

# The same counts reshaped to wide form: one row per target value and one
# column per sex value.
@chain df begin
    groupby([:target, :sex])
    combine(nrow)
    unstack(:target, :sex, :nrow)
end

# Split the raw data by target and look at the first group.
gd = groupby(df_raw, :target)
gd[1]
# Reproducible 70/30 train/test split of the raw data. A seeded RNG is used so
# that repeated runs select the same rows. Indexing with `[rows, :]` copies, so
# adding a column to `test` below does not touch `df_raw`.
rng = MersenneTwister(1234)
shuffled_rows = shuffle(rng, 1:nrow(df_raw))
n_test = round(Int, 0.3 * nrow(df_raw))
test = df_raw[shuffled_rows[1:n_test], :]
train = df_raw[shuffled_rows[(n_test + 1):end], :]

# Probit model for the binary `target`, fitted on the training rows only so the
# test rows remain a genuine hold-out set. `cp` is part of the formula because
# the plot at the end shows the predicted probability as a function of `cp`.
probit = glm(
    @formula(target ~ cp + trestbps + age + chol + thalach + oldpeak + slope + ca),
    train,
    Binomial(),
    ProbitLink(),
)

# Predicted probabilities (with confidence bounds) for the held-out rows; the
# point prediction is stored next to the observed data.
test_pred = predict(probit, test, interval=:confidence)
test.predict = test_pred.prediction;

# Prediction grid: every observed `cp` value, with all other covariates held at
# their training-set mean.
covariates = [:trestbps, :age, :chol, :thalach, :oldpeak, :slope, :ca]
df_pred = DataFrame(cp=sort(unique(skipmissing(train.cp))))
for name in covariates
    df_pred[!, name] .= mean(skipmissing(train[!, name]))
end

# The three columns of `prob_pred` (prediction, lower, upper) become the three
# plotted series.
prob_pred = predict(probit, df_pred, interval=:confidence)
plot(df_pred.cp, Matrix(prob_pred), labels=["Predicted" "Lower" "Upper"],
    xlabel="cp", ylabel="Pr(target=1)")