# Week 12 Practice Notebooks

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame, Series
from sklearn.linear_model import LogisticRegression

# Read the admissions CSV into a dataframe, then preview its first rows.
df = pd.read_csv("admit_data.csv")
df.head()

# Inspect the column labels exactly as loaded (some of them contain spaces).
df.columns

# Overwrite the column labels with space-free names; the list is assigned
# positionally, so it must follow the existing column order.
df.columns = ["English", "Math", "Outcome"]

# Add a human-readable 'Admit' column derived from the numeric outcome.
# .map with a dict translates each value: 0 -> "No", 1 -> "Yes".
outcome_labels = {0: "No", 1: "Yes"}
df["Admit"] = df["Outcome"].map(outcome_labels)

# Use a white grid background for the Seaborn figures.
sns.set_style("whitegrid")

# Scatter Math against English, with point colour showing whether the
# student was admitted.
sns.relplot(data=df, x="Math", y="English", hue="Admit")

# Build the model inputs: a 1D target array and a 2D data array with one row
# per student and two columns.

# Target: one 0/1 outcome per student, kept 1D. Reshaping it to a hard-coded
# (100, 1) column would fail for any other number of rows and makes
# scikit-learn emit a DataConversionWarning when the model is fitted.
target = df["Outcome"].to_numpy()

# Data: np.vstack takes a single argument (a tuple of arrays) and stacks them
# as rows, giving shape (2, n); .T transposes that to (n, 2) so the columns
# are (Math, English).
math = df["Math"].to_numpy()
english = df["English"].to_numpy()
x = np.vstack((math, english)).T

# Split the data array and the target into a training set and a test set,
# using a 75-25 split (test_size = .25).
# First import the splitting helper from scikit-learn.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, target, test_size=0.25)

# Report how many samples ended up in each set.
nTrain, nTest = len(x_train), len(x_test)
print(f"The length of nTrain is {nTrain}")
print(f"The length of nTest is {nTest}")

# Create the logistic-regression model and fit it on the training set
# (.fit returns the fitted estimator, so the two steps can be chained).
Model = LogisticRegression(solver="lbfgs").fit(x_train, y_train)

# Score the model on the data it was trained on and report the accuracy as a
# percentage rounded to two decimals.
train_accuracy = round(100 * Model.score(x_train, y_train), 2)
print(f"The percent accuracy for the training score is {train_accuracy}")

# Check how well the model does on the held-out test set. .score takes two
# arguments: the test data and the matching true outcomes.
test_accuracy = round(100 * Model.score(x_test, y_test), 2)
print(f"The percent accuracy for the test set is {test_accuracy}")

# To plot the prediction for every data point, run the fitted model over the
# full data array (not just one of the split sets) ...
pred = Model.predict(x)

# ... and store the predicted outcome as a new dataframe column.
df["Prediction"] = pred

# Add a readable label column for the prediction, translating the numeric
# prediction with .map the same way as the actual outcome: 0 -> "No", 1 -> "Yes".
prediction_labels = {0: "No", 1: "Yes"}
df["Predict Admit"] = df["Prediction"].map(prediction_labels)

# Plot Math vs English twice so the model's predictions can be compared with
# the actual outcomes: first coloured by predicted admission ...
predicted_grid = sns.relplot(data=df, x="Math", y="English", hue="Predict Admit")
predicted_grid.set(title="Predicted Admissions")

# ... then coloured by the actual admission outcome.
actual_grid = sns.relplot(data=df, x="Math", y="English", hue="Admit")
actual_grid.set(title="Actual Admissions")