import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from pandas import DataFrame, Series
import seaborn as sns
import pandas as pd

# Read the admissions data set from CSV into a pandas DataFrame.
# The bare `df` on the last line makes a notebook render the table;
# it has no effect when the file is run as a plain script.
df = pd.read_csv(filepath_or_buffer='admit_data.csv')
df

```
        English        Math
dtype   float64        float64
range   16.0 - 61.0    42.0 - 86.0

0       35             54.625
1       26.5           68.5
2       41             57
3       21.5           42
4       46             84
5       39.375         78.5
6       33             56
7       41.875         81.125
8       30             67
9       31             55.5
```

# Inspect the raw column labels first -- some of them contain spaces.
df.columns

# Replace the labels with space-free names by assigning a list of new names
# to df.columns, then show the labels again to confirm the rename.
df.columns = [
    'English',
    'Math',
    'Outcome',
]
df.columns

# Derive a readable 'Admit' column from the numeric 'Outcome' column.
# Series.map with a lookup dict is the simplest tool here:
# 0 becomes "no" and 1 becomes "yes".
outcome_labels = {0: 'no', 1: 'yes'}
df['Admit'] = df['Outcome'].map(outcome_labels)

# Switch Seaborn to its white-grid background, then scatter Math (x axis)
# against English (y axis), coloring each student by the 'Admit' column.
sns.set_style('whitegrid')
sns.relplot(data=df, x='Math', y='English', hue='Admit')

# Build the 1D target array and the 2D feature array of shape (n_samples, 2).
#
# The target is the 0/1 'Outcome' column.  It is kept 1D, shape (n_samples,),
# which is the shape scikit-learn estimators expect for y.
target = np.array(df['Outcome'])
#
# 1D NumPy arrays of the math and english scores.  They are deliberately NOT
# reshaped into (n, 1) columns: np.vstack on two columns would pile them into
# a single (2n, 1) column instead of producing two rows.
math = np.array(df['Math'])
english = np.array(df['English'])
#
# np.vstack takes ONE argument (a tuple of arrays) and stacks them as rows,
# giving shape (2, n_samples).  Transposing with .T yields (n_samples, 2),
# where column 0 is Math and column 1 is English.
stack = np.vstack((math, english))
x = stack.T

# Sanity check: number of samples
print(len(math))

```
100
```

# Hold out 25% of the samples as a test set and keep 75% for training.
# The feature array and the target are split together so rows stay aligned.
# No random_state is passed, so the split differs from run to run.
#
# First import the splitter from scikit-learn
from sklearn.model_selection import train_test_split

# Split the data
xTrain, xTest, yTrain, yTest = train_test_split(x, target, test_size=0.25)

# Report how many samples ended up in each set
nTrain = len(xTrain)
nTest = len(xTest)
print(nTrain, nTest)

```
75 25
```

# Fit a logistic-regression classifier on the training set and report how
# well it reproduces the training labels.
#
# Create model
lr = LogisticRegression(solver='lbfgs')
#
# scikit-learn expects y with shape (n_samples,).  np.ravel flattens a
# column vector of shape (n_samples, 1) and leaves a 1D array unchanged,
# which avoids the DataConversionWarning raised for column-vector targets.
y_train_1d = np.ravel(yTrain)
#
# Fit with training set
lr.fit(xTrain, y_train_1d)
#
# Training score: .score(X_train, y_train) returns the fraction of correct
# predictions; convert to a percentage rounded to two decimals and print it.
accuracy = round(lr.score(xTrain, y_train_1d) * 100, 2)
print(f'percent accuracy = {accuracy}')

```
percent accuracy = 60.0
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:993: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
```

# Evaluate the fitted model on the held-out test set.  .score needs both the
# test features and the test labels; the result is shown as a percentage
# rounded to two decimals.
test_score = lr.score(xTest, yTest)
accuracy = round(test_score * 100, 2)
print(f'percent accuracy = {accuracy}')

```
percent accuracy = 60.0
```

# To plot the model's prediction for every data point, first store the
# predictions in the DataFrame.
#
# Run .predict over the full feature array (training and test rows alike)
pred = lr.predict(x)

# Store the predicted 0/1 labels as a new 'Prediction' column
df['Prediction'] = pred

# Translate the numeric predictions into "yes"/"no" labels with .map,
# mirroring how the 'Admit' column was built from 'Outcome'.
prediction_labels = {0: 'no', 1: 'yes'}
df['Predict Admit'] = df['Prediction'].map(prediction_labels)

# Draw two Math-vs-English scatter plots for comparison: the first is colored
# by the model's prediction, the second by the actual admission outcome.
sns.relplot(data=df, x='Math', y='English', hue='Predict Admit')
sns.relplot(data=df, x='Math', y='English', hue='Admit')