# Install the extra packages this notebook needs:
# statsmodels (ANOVA), xlrd/openpyxl (Excel engines for pandas.read_excel).
! pip install statsmodels
! pip install xlrd
! pip install openpyxl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import t
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
# Load the Titanic passenger data from the Excel workbook into a DataFrame.
titanic = pd.read_excel('3_Titanic.xlsx')
EDA: Die Daten verstehen
# First look at the data: sample rows, summary statistics, and schema /
# non-null counts (the info() output below reveals missing Age/Cabin values
# and three junk columns from the spreadsheet: Unnamed: 11/13, Explanation).
titanic.head(10)
titanic.describe()
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 706 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Cabin 204 non-null object
10 Embarked 889 non-null object
11 Unnamed: 11 0 non-null float64
12 Explanation 12 non-null object
13 Unnamed: 13 12 non-null object
dtypes: float64(2), int64(5), object(7)
memory usage: 97.6+ KB
# Boxplot: how is age distributed within each passenger class?
age_class = titanic[['Age', 'Pclass']]
plt.style.use('ggplot')
plt.figure(figsize=(10, 10))
sns.boxplot(data=age_class, x='Pclass', y='Age', palette='YlGnBu')
plt.ylabel('Alter')
plt.xlabel('Klasse')
plt.show()
# Bar plot: mean survival rate per passenger class
# (barplot aggregates Survived with the mean by default).
class_survival = titanic[['Survived', 'Pclass']]
plt.style.use('ggplot')
plt.figure(figsize=(10, 10))
sns.barplot(data=class_survival, x='Pclass', y='Survived', palette='YlGnBu')
plt.ylabel('Anteil Überlebender')
plt.xlabel('Klasse')
plt.show()
# Count of passengers per sex.
# BUG FIX: the original used sns.barplot(x="Sex", y=titanic.index, ...),
# which plots the MEAN ROW INDEX per sex — not a count — although the
# y-label claims "Anzahl Personen". countplot shows the actual counts.
plt.style.use('ggplot')
plt.figure(figsize=(10, 10))
sns.countplot(x="Sex", data=titanic, palette='tab10', alpha=0.8)
plt.ylabel('Anzahl Personen')
plt.xlabel('Geschlecht')
plt.show()
# Bar plot: mean survival rate by sex (bar height = mean of Survived).
plt.style.use('ggplot')
plt.figure(figsize=(10, 10))
sns.barplot(data=titanic, x='Sex', y='Survived', palette='tab10', alpha=0.8)
plt.ylabel('Überlebende')
plt.xlabel('Geschlecht')
plt.show()
Kategorische Daten encoden und maschinell lesbar machen
# Encode the categorical columns as numbers.
# BUG FIX: `titanic['Sex'] = pd.get_dummies(titanic['Sex'])` assigns a
# multi-column dummy frame to a single column — it silently kept only the
# first dummy column in old pandas and raises a ValueError in pandas >= 1.x.
# The explicit comparisons below reproduce the observed behaviour exactly
# (female -> 1, male -> 0; Embarked: 'C' -> 1, everything else incl. NaN -> 0).
titanic['Sex'] = (titanic['Sex'] == 'female').astype(int)
# NOTE(review): collapsing Embarked to "embarked at Cherbourg yes/no"
# discards the Q/S distinction; a full one-hot encoding would keep more
# information — kept as-is to preserve the downstream feature matrix.
titanic['Embarked'] = (titanic['Embarked'] == 'C').astype(int)
print(titanic.head(10))
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3
5 6 0 3
6 7 0 1
7 8 0 3
8 9 1 3
9 10 1 2
Name Sex Age SibSp Parch \
0 Braund, Mr. Owen Harris 0 22.0 1 0
1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0
2 Heikkinen, Miss. Laina 1 26.0 0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0
4 Allen, Mr. William Henry 0 35.0 0 0
5 Moran, Mr. James 0 NaN 0 0
6 McCarthy, Mr. Timothy J 0 54.0 0 0
7 Palsson, Master. Gosta Leonard 0 2.0 3 1
8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 1 27.0 0 2
9 Nasser, Mrs. Nicholas (Adele Achem) 1 14.0 1 0
Ticket Cabin Embarked Unnamed: 11 Explanation \
0 A/5 21171 NaN 0 NaN Variable
1 PC 17599 C85 1 NaN PassengerId
2 STON/O2. 3101282 NaN 0 NaN Survived
3 113803 C123 0 NaN Pclass
4 373450 NaN 0 NaN Name
5 330877 NaN 0 NaN Sex
6 17463 E46 0 NaN Age
7 349909 NaN 0 NaN SibSp
8 347742 NaN 0 NaN Parch
9 237736 NaN 1 NaN Ticket
Unnamed: 13
0 Content
1 Nummer des Passagiers
2 = 1 falls überlebt, ansonsten = 0
3 Erste Klasse = 1; Zweite Klasse = 2; Dritte Kl...
4 Passagiername
5 Geschlecht: male/female
6 Alter
7 Anzahl der Geschwister an Bord
8 Anzahl an anderen Elternteilen und eigene Kind...
9 Ticketnummer
ANOVA und Hypothese
# Counts behind the hypothesis: survivors, passengers with a parent/child
# aboard, the overlap of both, and the female subset of that overlap.
titanic['Survived'].value_counts()
titanic['Parch'].value_counts()
parch_survivors = titanic.query('Survived >= 1 & Parch >= 1 ')
parch_survivors['Survived'].value_counts()
fem_parch_survivors = titanic.query('Survived >= 1 & Parch >= 1 & Sex == 1')
fem_parch_survivors['Survived'].value_counts()
# Derive the four counts from the data instead of hard-coding the numbers
# (the original literals 342/213/109/80 were copied from the outputs above).
werte = {'Überlebt': int((titanic['Survived'] >= 1).sum()),
         'Eltern/ Kind an Bord': int((titanic['Parch'] >= 1).sum()),
         'Eltern/ Kind + Überlebt': len(parch_survivors),
         'Eltern/ Kind + Überlebt + weiblich': len(fem_parch_survivors)}
# BUG FIX: DataFrame.append() was removed in pandas 2.0 — build the
# one-row frame directly instead.
hypothese = pd.DataFrame([werte], dtype=float)
# Visualise the four counts from the hypothesis table side by side.
plt.style.use('ggplot')
plt.figure(figsize=(10, 10))
sns.barplot(data=hypothese, palette='YlGnBu')
# Tilt the long German column labels so they stay readable.
plt.xticks(rotation=10)
plt.show()
# One-way ANOVA: does the number of parents/children aboard (Parch)
# explain variance in survival? Print the p-value column only.
survival_ols = ols('Survived ~ Parch', data=titanic).fit()
anova_table = sm.stats.anova_lm(survival_ols, typ=2)
print(anova_table['PR(>F)'])
Parch 0.014799
Residual NaN
Name: PR(>F), dtype: float64
Korrelationen zwischen den Datenpunkten
# Heatmap of pairwise correlations between the numeric columns.
plt.style.use('ggplot')
plt.figure(figsize=(10, 10))
# BUG FIX: since pandas 2.0, DataFrame.corr() raises on non-numeric columns
# (Name, Ticket, Cabin, ... are object dtype); restrict it explicitly.
corr_matrix = titanic.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, linewidths=1)
plt.show()
# Compare the two candidate imputation values for the missing ages.
print(titanic['Age'].mean())
print(titanic['Age'].median())
# Fill the 185 missing Age values with the mean age.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())
print(titanic.head(10))
print(titanic.info())
# Columns that go into the models (note: fixes the original's
# misspelled local name "feauture_cols").
feature_cols = ['Pclass', 'Age', 'Parch', 'Sex', 'SibSp', 'Embarked']
X = titanic[feature_cols]
print(X.shape)
# Target variable: did the passenger survive (1) or not (0)?
y = titanic['Survived']
print(y.shape)
(891, 6)
(891,)
Logistische Regression
# Baseline logistic regression.
# max_iter raised because lbfgs does not converge within the default 100
# iterations on this unscaled feature matrix (see the ConvergenceWarnings
# captured further below).
logistic_reg = LogisticRegression(max_iter=1000)
# Train on the full data set.
logistic_reg.fit(X, y)
pred_lr = logistic_reg.predict(X)
# NOTE(review): accuracy is computed on the training data itself, so it
# overestimates generalisation; the train/test split below is the honest
# evaluation.
print(accuracy_score(y, pred_lr))
0.8035914702581369
# Split into training and test data (80/20).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Pipeline with one step so GridSearchCV can address its hyper-parameters.
logreg = LogisticRegression(max_iter=1000)
pipe = Pipeline(steps=[('logistic_Reg', logreg)])
# Hyper-parameter grid; the best combination is used afterwards.
# BUG FIX: the original grid combined the penalties 'l1' and 'none' with the
# default lbfgs solver, which supports neither l1 nor C-tuning with 'none' —
# 25 of 75 fits failed (see the FitFailedWarning in the captured output).
# Each penalty is now paired with a solver that actually supports it.
werte_raster = [
    {'logistic_Reg__solver': ['liblinear'],
     'logistic_Reg__penalty': ['l1', 'l2'],
     'logistic_Reg__C': np.logspace(-4, 4, 5)},
    {'logistic_Reg__solver': ['lbfgs'],
     'logistic_Reg__penalty': ['l2'],
     'logistic_Reg__C': np.logspace(-4, 4, 5)},
]
# GridSearchCV with 5-fold cross-validation: the data is split into folds and
# one fold is held back per fit to measure performance. This lets us use the
# whole (small) training set without over-fitting or giving up generality.
model_lr = GridSearchCV(pipe, param_grid=werte_raster, cv=5, verbose=True)
model_lr.fit(X_train, y_train)
pred = model_lr.predict(X_test)
print(accuracy_score(y_test, pred))
Fitting 5 folds for each of 15 candidates, totalling 75 fits
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
0.8100558659217877
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1484: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
"Setting penalty='none' will ignore the C and l1_ratio parameters"
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/model_selection/_validation.py:372: FitFailedWarning:
25 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
File "/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/pipeline.py", line 394, in fit
self._final_estimator.fit(Xt, y, **fit_params_last_step)
File "/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
File "/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 449, in _check_solver
% (solver, penalty)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
warnings.warn(some_fits_failed_message, FitFailedWarning)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/model_selection/_search.py:972: UserWarning: One or more of the test scores are non-finite: [ nan 0.62219049 0.78508815 nan 0.73732887 0.78508815
nan 0.7878952 0.78508815 nan 0.78508815 0.78508815
nan 0.78508815 0.78508815]
category=UserWarning,
# Best hyper-parameter combination found by the grid search.
print(model_lr.best_params_)
{'logistic_Reg__C': 1.0, 'logistic_Reg__penalty': 'l2'}
Random Forest
# Random forest model with 100 estimators/trees.
schwarzwald = RandomForestClassifier(n_estimators=100)
# Train the model.
schwarzwald.fit(X_train, y_train)
# Evaluate on the held-out test data.
y_pred = schwarzwald.predict(X_test)
# BUG FIX: the original computed schwarzwald.score(X_train, y_train) —
# training-set accuracy (~0.93) — even though the comment claims a test-set
# check and y_pred went unused. Score the test predictions instead.
acc_random_forest = accuracy_score(y_test, y_pred)
print(acc_random_forest)
0.9283707865168539
Rose und Jack: werden sie überleben?
# Two fictional passengers, encoded like the training data
# (Sex: 1 = female, 0 = male; Embarked: 1 = Cherbourg, 0 = other).
rose = {'Pclass':1, 'Age':17, 'Parch':0, 'Sex':1, 'SibSp':1, 'Embarked':0}
jack = {'Pclass':3, 'Age':20, 'Parch':0, 'Sex':0, 'SibSp':0, 'Embarked':0}
# BUG FIX: DataFrame.append() was removed in pandas 2.0 — construct the
# frame from the two records directly, keeping the feature-column order.
fiktive_person = pd.DataFrame(
    [rose, jack],
    columns=['Pclass', 'Age', 'Parch', 'Sex', 'SibSp', 'Embarked'],
    dtype=float,
)
fiktive_person.head()
fiktive_person
# Predictions for Rose and for Jack. 1 = survives, 0 = does not survive.
roseandjack_prediction = schwarzwald.predict(fiktive_person)
print(roseandjack_prediction)
[1 0]
# Class probabilities per passenger: [P(does not survive), P(survives)].
wahrscheinlichkeit_jackrose = schwarzwald.predict_proba(fiktive_person)
print(wahrscheinlichkeit_jackrose)
[[0.03 0.97]
[1. 0. ]]
Wichtigkeit der Spalten
# Feature importances of the random forest, with the spread across the
# individual trees shown as error bars.
feature_names = ['Pclass', 'Age', 'Parch', 'Sex', 'SibSp', 'Embarked']
forest_importances = pd.Series(schwarzwald.feature_importances_,
                               index=feature_names)
# Standard deviation of each feature's importance over all trees.
per_tree = [tree.feature_importances_ for tree in schwarzwald.estimators_]
std = np.std(per_tree, axis=0)
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Wichtigkeit der jeweiligen Spalten")
ax.set_ylabel("%")
fig.tight_layout()
plt.show()