# Week 12 Practice Notebook

# Third-party imports for the notebook, one statement per line.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame, Series
from sklearn.linear_model import LogisticRegression

# Read the admissions CSV into a DataFrame.
df = pd.read_csv('admit_data.csv')

# Bare expression: shows the frame when this is run as a notebook cell.
df

# Inspect the exact column labels via .columns -- some of them contain
# spaces, so they need to be checked before being referenced by name.
df.columns

# Replace the column labels with space-free names by assigning a list of
# new names (in column order) to df.columns.
df.columns = ['English', 'Math', 'Outcome']

# Show the labels again to confirm the rename.
df.columns

# Add a readable 'Admit' column derived from the numeric 'Outcome' column.
# .map with a dict is the simplest tool here (same approach as used for
# the iris dataset): 0 -> "no", 1 -> "yes".
df['Admit'] = df.Outcome.map({0: 'no', 1: 'yes'})

# Use Seaborn's white background with grid lines for the plots.
sns.set_style('whitegrid')

# Scatter Math (x) against English (y), colouring each point by whether
# the student was actually admitted.
sns.relplot(data=df, x='Math', y='English', hue='Admit')

# Build the 1D target array and the 2D feature array for scikit-learn.
#
# Target: the Outcome column, kept 1D (one entry per student). It is not
# reshaped to a column vector; the classifier expects y with shape (n,).
target = df['Outcome'].to_numpy()

# Features: one 1D NumPy array of scores per subject. They must stay 1D
# here -- stacking (n, 1) columns vertically would give (2n, 1) instead of
# the intended two-row layout.
math = df['Math'].to_numpy()
english = df['English'].to_numpy()

# np.vstack takes ONE argument, so the arrays go inside a tuple.
# Two length-n 1D arrays stack to shape (2, n); .T transposes that to
# (n, 2): one row per student, columns [math, english].
# No row count is hard-coded, so this works for any number of students.
stack = np.vstack((math, english))
x = stack.T

# Sanity check: number of students (rows) in the data.
print(len(math))

# Split the full feature array and target into a training set and a test
# set, using a 75-25 split.
#
# First import the splitter from scikit-learn.
from sklearn.model_selection import train_test_split

# test_size=0.25 holds out a quarter of the rows for testing.
xTrain, xTest, yTrain, yTest = train_test_split(x, target, test_size=0.25)

# Print the number of rows in each set.
nTrain, nTest = len(xTrain), len(xTest)
print(nTrain, nTest)

# Fit a logistic-regression model on the training set and see how well it
# does on the data it was trained on.
#
# .fit returns the estimator itself, so creating and fitting are chained.
lr = LogisticRegression(solver='lbfgs').fit(xTrain, yTrain)

# .score(X, y) returns accuracy as a fraction; convert to percent and
# round to two decimals before printing.
accuracy = round(100 * lr.score(xTrain, yTrain), 2)
print(f'percent accuracy = {accuracy}')

# Evaluate the fitted model on the held-out test set. .score needs both
# the features and the true labels.
accuracy = round(100 * lr.score(xTest, yTest), 2)
print(f'percent accuracy = {accuracy}')

# To plot the model's prediction for every data point, first store the
# predictions in the DataFrame.
#
# Predict on ALL rows (training and test alike) with .predict.
pred = lr.predict(x)

# Attach the predictions as a new column.
df['Prediction'] = pred

# Add a readable "yes"/"no" version of the prediction, using .map with the
# same 0 -> "no", 1 -> "yes" dictionary as for the 'Admit' column.
df['Predict Admit'] = df.Prediction.map({0: 'no', 1: 'yes'})

# Plot Math vs English twice so the two scatterplots can be compared:
# first coloured by the model's prediction ...
sns.relplot(data=df, x='Math', y='English', hue='Predict Admit')

# ... then coloured by the actual admission outcome.
sns.relplot(data=df, x='Math', y='English', hue='Admit')