# Week 10 Practice Notebooks

# Import libraries and DataFrame
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame, Series
from sklearn.linear_model import LinearRegression

# Read the data from pokeman.csv into a DataFrame using pandas read_csv().
df = pd.read_csv("pokeman.csv")

# Show the first 6 rows using .head (as the last expression of a notebook
# cell, the result is displayed automatically).
df.head(n=6)

# Show the data type of every feature using .dtypes (an attribute, so no
# parentheses). The CSV is loaded again here, which rebinds df.
df = pd.read_csv("pokeman.csv")
df.dtypes

# Show the column names using .columns (an attribute, so no parentheses).
# The CSV is loaded again here, which rebinds df.
df = pd.read_csv("pokeman.csv")
df.columns

# Create a pandas Series for the feature Speed; print out its type.
# Selecting a single column of a DataFrame yields a Series.
speed_series = df.Speed
print(type(speed_series))

# Display the Series itself (last expression of the notebook cell).
speed_series

# Create a NumPy array for the feature Speed (use .values); print out its type
# followed by the array contents.
speed = df["Speed"].values
print(type(speed))
print(speed)

# Make 1D NumPy arrays from the features Attack and Defense and do a scatter
# plot using matplotlib.
m = df.Attack.values   # x-axis data: Attack
a = df.Defense.values  # y-axis data: Defense

# 'ro' draws red circle markers with no connecting line, i.e. a scatter plot.
plt.plot(m, a, 'ro')

# Axis labels must match the plotted data: Attack is on x and Defense on y
# (these two labels used to be swapped).
plt.xlabel("Attack")
plt.ylabel("Defense")

# Create a new DataFrame "df_mod" which is the same as the original but with
# the "Type 2" feature dropped; print out to check.
# NOTE(review): assumes the CSV has a column named exactly "Type 2" -- confirm.
# drop() returns a new DataFrame, so df itself is left unchanged.
df_mod = df.drop(columns=['Type 2'])
print(df_mod.head())

# Import libraries and DataFrame
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame, Series

# Read the data into a DataFrame.
df = pd.read_csv("pokeman.csv")

# Show the first 5 rows (displayed as the last expression of the cell).
df.head(n=5)

# Add a white grid to the background of Seaborn plots using set_style.
sns.set_style(style="whitegrid")

# Scatter plot (Seaborn relplot) of Defense statistics (y-axis) vs Attack
# statistics (x-axis), with a title on the current axes.
sns.relplot(data=df, x="Attack", y="Defense")
plt.title("Defense vs Attack")

# Scatter plot of Defense vs Attack again, this time using color (hue=) to
# indicate each point's Type 1 category.
sns.relplot(data=df, x="Attack", y="Defense", hue="Type 1")
plt.title("Defense vs Attack")

# Category plot of Defense statistics vs Type 1 (a non-numerical feature).
sns.catplot(data=df, x="Type 1", y="Defense")

# Rotate the x-axis labels for readability.
plt.xticks(rotation=-45)

# Bar graph of Defense statistics for each Type 1 category.
sns.barplot(data=df, x="Type 1", y="Defense")

# Rotate the x-axis labels so the category names do not overlap.
plt.xticks(rotation=45)

# Violin plot of the Defense data for each Type 1 category.
sns.violinplot(data=df, x="Type 1", y="Defense")

# Violin plot of Defense for each Type 1 category, drawn on a larger
# 10 x 6 figure and colored with the 'prism' palette.
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x="Type 1", y="Defense", palette="prism")

# Overlaying plots - overlay violin plot of Defense with the actual points.
# To do this:
#   (1) increase figure size using plt.figure(figsize=(10, 6));
#   (2) create violin plot and set inner=None to get rid of the bars inside
#       the violins;
#   (3) rotate x-axis labels for readability;
#   (4) create swarmplot for points and set color='k' to draw them in black;
#   (5) add title "Defense Data for Type 1".
plt.figure(figsize=(10, 6))
sns.violinplot(x='Type 1', y='Defense', data=df, palette='prism', inner=None)
sns.swarmplot(x='Type 1', y='Defense', data=df, color='k')

# Step (3) was missing. It is applied after both layers are drawn so the
# rotation acts on the final tick labels.
plt.xticks(rotation=45)
plt.title('Defense Data for Type 1')

# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame, Series

# Import the LinearRegression estimator from scikit-learn
from sklearn.linear_model import LinearRegression

# Read in data from file insurance.csv and create a DataFrame; print out some
# lines.
df = pd.read_csv('insurance.csv')

# A bare df.head(5) in the middle of a cell is silently discarded, so print
# the first rows explicitly instead of dumping the whole frame.
print(df.head(5))

# Set background grid for Seaborn plots.
sns.set_style(style="whitegrid")

# Scatter plot of charges vs BMI, with color indicating whether the patient
# is a smoker or not.
sns.relplot(data=df, x="bmi", y="charges", hue="smoker")

# Get the data to use for linear regression: here we look for a relationship
# between insurance charges and bmi. Each column is pulled out as a NumPy
# array and printed.
bmi = df["bmi"].values
print("bmi", bmi)

charges = df["charges"].values
print("charges", charges)

# Make bmi an n by 1 array and charges n by 1.
n = len(bmi)
bmi = np.reshape(bmi, (n, 1))

# This must be a call: the earlier `np.reshape, (charges, (n,1))` had a stray
# comma and bound charges to a tuple (function, args) instead of reshaping.
charges = np.reshape(charges, (n, 1))

# Create model and fit data.

# Predictor: bmi as an (n, 1) column, the 2-D layout that fit() expects for X.
bmi = df.bmi.values
print('bmi', bmi)
n = len(bmi)
bmi = np.reshape(bmi, (n, 1))
print('BMI Reshaped', bmi)

# Target: charges as an (n, 1) column. np.reshape must actually be called;
# a stray comma after `np.reshape` previously produced a tuple, which made
# lr.fit() fail.
charges = df.charges.values
charges = np.reshape(charges, (n, 1))
print('charges', charges)

# Ordinary least-squares fit of charges on bmi.
lr = LinearRegression()
lr.fit(bmi, charges)

# Write out the equation of the fitted line.
print("intercept", lr.intercept_)
print("slope", lr.coef_)
print("The equation which fits the data in a linear regression sense is:")

# The [0] / [0, 0] indexing assumes the model was fit with a 2-D (n, 1)
# target, so intercept_ is 1-D and coef_ is 2-D.
# The predictor in this model is BMI (the label used to say "estriol", left
# over from a different exercise).
print(f"{round(lr.intercept_[0], 4)} + {round(lr.coef_[0, 0], 4)} times BMI")

# Use regplot to plot the data together with the fitted regression line.
sns.regplot(data=df, x="bmi", y="charges")

# Predict insurance costs for a person with BMI 31.7; round answer to the
# nearest cent.
#
# Sanity check from the original notes: on the regression plot, y is around
# 14,000 when x = 31.7, so the prediction should be close to that.
bmi_eval = np.reshape([31.7], (1, 1))  # predict() requires a 2D array

# Use a fresh name so the training target `charges` is not overwritten, and
# do not rescale: the model already predicts charges directly.
predicted_charges = lr.predict(bmi_eval)

# ravel() copes with predict() returning either shape (1,) or (1, 1).
cost = round(float(np.ravel(predicted_charges)[0]), 2)
print(f"predicted insurance charges for a BMI of 31.7 are ${cost:.2f}")