import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from pandas import DataFrame, Series
import seaborn as sns
import pandas as pd

# Read the admissions CSV into a dataframe, then preview its first five rows
df = pd.read_csv(filepath_or_buffer="admit_data.csv")
df.head(n=5)

```
   English (float64)  Math (float64)
0               35.0          54.625
1               26.5            68.5
2               41.0            57.0
3               21.5            42.0
4               46.0            84.0
```

# Show the raw column labels first (some of them contain spaces)
df.columns

# Replace the raw labels with space-free names by assigning a new list to df.columns
clean_names = ["English", "Math", "Outcome"]
df.columns = clean_names

# Derive a readable 'Admit' label ("No" / "Yes") from the numeric Outcome column
# using a lookup dictionary with .map
outcome_labels = {0: "No", 1: "Yes"}
df["Admit"] = df["Outcome"].map(outcome_labels)

# Use Seaborn's white grid background, then scatter English against Math,
# coloring each point by whether the student was admitted
sns.set_style("whitegrid")
sns.relplot(data=df, x="Math", y="English", hue="Admit")

# Build the arrays that are fed to scikit-learn.
#
# target: 1D array of outcomes, shape (n_samples,). It is deliberately NOT
#   reshaped into a column vector: scikit-learn expects a 1D y and emits a
#   DataConversionWarning for an (n_samples, 1) array, and a hard-coded
#   reshape to [100, 1] raises for any dataset without exactly 100 rows.
# x: 2D feature array, shape (n_samples, 2); column 0 is the Math score and
#   column 1 is the English score.
target = np.array(df["Outcome"].values)
math = np.array(df["Math"].values)
english = np.array(df["English"].values)
# column_stack places the two 1D score arrays side by side as columns,
# giving the same values as np.vstack((math, english)).T
x = np.column_stack((math, english))

# Divide the full feature array and target into a training part and a test
# part, holding out 25% of the rows for testing (75-25 split)
# The splitting helper comes from scikit-learn
from sklearn.model_selection import train_test_split

# Perform the split
x_train, x_test, y_train, y_test = train_test_split(x, target, test_size=0.25)

# Report how many samples ended up in each part
nTrain, nTest = len(x_train), len(x_test)
print(f"The length of nTrain is {nTrain}")
print(f"The length of nTest is {nTest}")

```
The length of nTrain is 75
The length of nTest is 25
```

# Train a logistic regression classifier on the training part and check how
# well it reproduces the training labels
# Build the model and fit it in one step (.fit returns the fitted estimator)
Model = LogisticRegression(solver="lbfgs").fit(x_train, y_train)
# .score(X, y) gives the mean accuracy as a fraction; convert it to a percent
# rounded to two decimals and print it
train_accuracy = round(100 * Model.score(x_train, y_train), 2)
print(f"The percent accuracy for the training score is {train_accuracy}")

```
The percent accuracy for the training score is 89.33
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:993: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
```

# Evaluate the fitted model on the held-out test part; .score takes the
# features and the true labels and returns the fraction classified correctly
test_accuracy = round(100 * Model.score(x_test, y_test), 2)
print(f"The percent accuracy for the test set is {test_accuracy}")

```
The percent accuracy for the test set is 84.0
```

# To plot the model's prediction for every data point, store the predictions
# in the dataframe first
# Run .predict on the complete feature array (training and test rows alike)
pred = Model.predict(x)
# Keep the numeric predictions as a new dataframe column
df["Prediction"] = pred

# Translate the numeric predictions into "No" / "Yes" labels with .map
prediction_labels = {0: "No", 1: "Yes"}
df["Predict Admit"] = df["Prediction"].map(prediction_labels)

# Draw the same English-vs-Math scatter twice: first colored by the model's
# prediction, then by the actual outcome, so the two can be compared
sns.relplot(data=df, x="Math", y="English", hue="Predict Admit").set(
    title="Predicted Admissions"
)
sns.relplot(data=df, x="Math", y="English", hue="Admit").set(
    title="Actual Admissions"
)