# Week 10 Practice Notebooks

# Import libraries and DataFrame
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series

# Read the data from pokeman.csv into a DataFrame using pandas read_csv()
# Print out the first 6 lines of data using .head
df = pd.read_csv("pokeman.csv")
df.head(6)

# Print out the data types of all features using .dtypes (no parentheses)
df.dtypes

# Print out the column names using .columns
df.columns

# Create a pandas Series for the feature Speed; print out type
# The Series is stored in a variable so its type can actually be printed,
# then left as the last expression so the notebook still displays it.
speed_series = df["Speed"]
print(type(speed_series))
speed_series

# Create a NumPy array for the feature Speed (use .values); print out type
spd = df.Speed.values
print(type(spd))  # the exercise asks for the type, not only the values
print(spd)

# Make 1D NumPy arrays from the features Attack and Defense and do a
# scatter plot using matplotlib
Att = df.Attack.values
Def = df.Defense.values
plt.scatter(Att, Def)
plt.xlabel("Attack")
plt.ylabel("Defense")
plt.title("Attack vs. Defense")

# Create a new DataFrame "df_mod" which is the same as the original but with
# the "Type 2" feature dropped; print out to check
df_mod = df.drop(columns=["Type 2"])
df_mod

# Import libraries and DataFrame
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns

# Read the data into a DataFrame
# Print out the first 5 lines of data
# The section instructions say the first 6 lines of data, like the first
# section, so I'm going with 6 just in case!
df = pd.read_csv("pokeman.csv")
df.head(6)

# Add a white grid to the background of Seaborn plots using set_style
sns.set_style("whitegrid")  # Don't capitalize the W, it won't accept it!

# Make a scatter plot using Seaborn's relplot of Defense statistics (y-axis)
# vs Attack stats
# Have to be very specific, unlike with the previous scatter plot code
sns.relplot(x="Attack", y="Defense", data=df)

# Repeat plot in previous cell but use color to indicate Type 1 (hue = )
# Just add on the hue part to the code from the previous cell
sns.relplot(x="Attack", y="Defense", data=df, hue="Type 1")

# Make a category plot of Defense statistics vs Type 1 (non-numerical)
# Rotate labels on x-axis for readability using plt.xticks(rotation=-45)
# Category, so use catplot
# Wasn't sure what data should be on which axis so I went with what I
# thought made more sense
sns.catplot(x="Type 1", y="Defense", data=df, hue="Type 1")
plt.xticks(rotation=-45)

# Make a bar graph of Defense statistics for Type 1
# Use barplot, continue being specific when entering data!
sns.barplot(x="Type 1", y="Defense", data=df)
plt.xticks(rotation=-45)

# Make a violin plot of the Defense data for Type 1
sns.violinplot(x="Type 1", y="Defense", data=df)
plt.xticks(rotation=-45)

# Repeat the plot in the previous cell but change palette to 'prism' and
# change size
# violinplot draws onto the current matplotlib axes and has no `height`
# parameter (that belongs to figure-level functions such as catplot), which
# is why passing height=10 made no visible difference. The size is set on
# the figure instead, the same way the overlay cell below does it.
plt.figure(figsize=(10, 10))
sns.violinplot(x="Type 1", y="Defense", data=df, palette="prism")
plt.xticks(rotation=-45)

# Overlaying plots - overlay violin plot of Defense with actual points
# To do this:
# (1) increase figure size using plt.figure(figsize = (10,6));
# (2) create violin plot and set inner = None to get rid of the bars inside
#     the violin plot;
# (3) rotate x-axis labels for readability;
# (4) create swarmplot for points and set color='k' to draw the points in
#     black;
# (5) add title "Defense Data for Type 1"
# (1)
plt.figure(figsize=(10, 6))
# (2)
sns.violinplot(x="Type 1", y="Defense", data=df, inner=None)
# (3)
plt.xticks(rotation=-45)
# (4) Basically copy violinplot but change from violin to swarm
sns.swarmplot(x="Type 1", y="Defense", data=df, color="k")
# (5)
plt.title("Defense Data for Type 1")

# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns

# Import LinearRegression function from scikit-learn
from sklearn.linear_model import LinearRegression

# Read in data from file insurance.csv and create a DataFrame; print out
# some lines
df = pd.read_csv("insurance.csv")
df.head(6)

# Set background grid for Seaborn plots
sns.set_style("whitegrid")

# Create scatter plot of charges vs BMI with color indicating whether the
# patient is a smoker or not
# Don't capitalize "charges" or "smoker"; it won't work then because it
# doesn't exactly match the column names in insurance.csv
sns.relplot(x="bmi", y="charges", data=df, hue="smoker")

# Get data to use for linear regression
# Right now we see if there is a relationship between insurance charges
# and bmi
charges = df.charges.values
bmi = df.bmi.values

# Make bmi an n by 1 array and charges n by 1
n = len(bmi)
charges = np.reshape(charges, (n, 1))
bmi = np.reshape(bmi, (n, 1))

# Create model and fit data
lr = LinearRegression()
lr.fit(bmi, charges)

# Write out equation of line
# The target was reshaped to (n, 1) above, so intercept_ is indexed with [0]
# and coef_ with [0, 0] to pull out the scalar values.
lr_int = lr.intercept_[0]
lr_sl = lr.coef_[0, 0]
print("The intercept is", lr_int)
print("The slope is", lr_sl)

# Use regplot to plot data and line
sns.regplot(x="bmi", y="charges", data=df)

# Predict insurance costs for a person with BMI 31.7; round answer to the
# nearest cent
# Note that this value agrees with the plot above because when x=31.7, y is
# around 14,000
x_eval = np.array([31.7])
x_eval = np.reshape(x_eval, (1, 1))
# predict() is given a (1, 1) input and the model was fit on an (n, 1)
# target, so the single prediction is taken with [0, 0] and rounded to two
# decimals instead of printing the raw nested array.
predicted_cost = round(float(lr.predict(x_eval)[0, 0]), 2)
print("The predicted insurance costs for a person with BMI 31.7 is", predicted_cost)