!pip install -r requirements.txt
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import seaborn as sns
from sklearn.linear_model import LinearRegression
from IPython.display import display
# Load the 'iris' dataset through seaborn; later cells treat `iris` as a
# pandas DataFrame.
iris = sns.load_dataset('iris')
# Print a greeting to confirm that the cell executed.
print("Hello, world!")
?pd.DataFrame
%quickref
%matplotlib inline
import matplotlib.pyplot as plt
import ipywidgets as widgets
from scipy.stats import norm, linregress
# Build a two-tab widget. Each tab holds an Output widget, and anything
# displayed inside that widget's `with` block is rendered in its tab.
out = [widgets.Output(), widgets.Output()]
tabs = widgets.Tab(children=[out[0], out[1]])
tabs.set_title(0, 'Linear regression')
tabs.set_title(1, 'Normal distribution')

with out[0]:
    # Fit a line to some random data: y is x plus Gaussian noise.
    x = np.random.uniform(size=30)
    y = x + np.random.normal(scale=0.1, size=30)
    # Only the slope and intercept are needed, so read them by name
    # instead of unpacking the unused fields into `_`.
    fit = linregress(x, y)
    slope, intercept = fit.slope, fit.intercept
    u = np.linspace(0, 1)
    # Plot the samples and the fitted line.
    fig1, axes1 = plt.subplots()
    axes1.scatter(x, y)
    axes1.plot(u, slope * u + intercept, 'k')
    # pyplot.show() has no figure parameter; calling it without
    # arguments renders the open figure into this Output widget.
    plt.show()

with out[1]:
    # Plot the probability density function (pdf) of the
    # standard normal distribution.
    x = np.linspace(-3.5, 3.5, num=100)
    p = norm.pdf(x)
    # Plot
    fig2, axes2 = plt.subplots()
    axes2.plot(x, p)
    plt.show()

display(tabs)
import pandas as pd
import seaborn as sns

# Fetch the iris dataset through seaborn.
iris = sns.load_dataset('iris')

# The result is a pandas DataFrame; print its type to confirm.
print('Type of "iris":', type(iris))

# Preview the first five rows (5 is also head()'s default).
iris.head(5)
# 1. Schema: each column label together with its dtype
print(iris.dtypes)

# 2. Mean of the petal-length column
print(iris.loc[:, 'petal_length'].mean())

# 3. Distinct species values present in the dataset
print(iris.loc[:, 'species'].unique())

# 4. Summary statistics of the data
iris.describe()
"""
IPython.display.display is a convenience function that works in Jupyter
Notebook (or, more generally, any IPython-based application) and shows
objects in a nicer way than print(). We'll use it in this notebook
to show some pandas DataFrames.
"""
from IPython.display import display

# Build one DataFrame per species of flower. Two equivalent filtering
# techniques are shown below; use whichever you find more readable.

# Technique 1: DataFrame.query with a filter expression
setosa = iris.query('species == "setosa"')
versicolor = iris.query('species == "versicolor"')

# Technique 2: select rows with a boolean mask
virginica = iris.loc[iris['species'] == 'virginica']

# Preview the first few rows of each per-species DataFrame.
for heading, frame in (
    ('Setosa data:', setosa),
    ('Versicolor data:', versicolor),
    ('Virginica data:', virginica),
):
    print(heading)
    display(frame.head())
"""
Get the first column.
Note: whenever we extract a single column of a pandas DataFrame,
we get back a pandas Series object. To turn it back into a DataFrame,
we add the line `first_column = pd.DataFrame(first_column)`.
"""
# Selecting a single column by position yields a Series ...
first_column = iris.iloc[:, 0]
# ... so wrap it to get a one-column DataFrame back.
first_column = pd.DataFrame(first_column)
print('First column:')
display(first_column.head())

# First through third columns (positions 0, 1 and 2)
first_through_third_columns = iris.iloc[:, :3]
print('First through third columns:')
display(first_through_third_columns.head())

# The 'species' column, selected by label and converted from a Series
# to a one-column DataFrame.
species = iris.loc[:, 'species'].to_frame()
print('Species column:')
display(species.head())

# Every column *except* 'species', chosen with a boolean mask over the
# column labels.
all_but_species = iris.loc[:, iris.columns != 'species']
print("All columns *except* species:")
display(all_but_species.head())
# Labels for the two columns of the table.
column_labels = ['A', 'B']

# Table contents; each inner list supplies one row (one value per label).
column_entries = [[1, 2], [4, 5], [7, 8]]

# Assemble the DataFrame from the rows and the column labels.
pd.DataFrame(data=column_entries, columns=column_labels)
import numpy as np

# 1. One-dimensional array holding the numbers 1, 2, 3
x = np.array([1, 2, 3])

# 2. 2 x 2 matrix: first row is [1, 2], second row is [3, 4]
x = np.array([[1, 2], [3, 4]])

# 3. The integers 0, 1, ..., 9 (same values as np.array(range(10)))
x = np.arange(0, 10)

# 4. 2 x 2 matrix with every entry set to zero
x = np.zeros(shape=(2, 2))

# 5. Total number of entries in the matrix, and its shape
num_items, matrix_shape = x.size, x.shape
### Constants: pi
print('π = %f' % np.pi)
print()

### Simple functions: sine, cosine, e^x, log, ...
for template, value in (
    ('sin(0) = %f', np.sin(0)),
    ('cos(0) = %f', np.cos(0)),
    ('e^1 = %f', np.exp(1)),
    ('ln(1) = %f', np.log(1)),
):
    print(template % value)
print()

### Minimums, maximums, sums...
x = np.array([1, 2, 3])
for template, value in (
    ('Min of [1,2,3] = %d', x.min()),
    ('Max of [1,2,3] = %d', x.max()),
    ('Sum of [1,2,3] = %d', x.sum()),
):
    print(template % value)
print()

### Random numbers: uniform distribution, normal distribution, ...
print('Random numbers:')
# The samples are drawn in the listed order (uniform, normal, Poisson).
for template, value in (
    ('Uniform([0,1]): %f', np.random.uniform(0, 1)),
    ('Normal(0,1): %f', np.random.normal(loc=0, scale=1)),
    ('Poisson(1): %f', np.random.poisson(1)),
):
    print(template % value)
# Two small vectors to demonstrate arithmetic on arrays.
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])

# A scalar operand is applied to every element of the array.
print('1 + [1,2,3] =', x + 1)
print('3 * [1,2,3] =', x * 3)

# Array-with-array operators act element by element.
print('[1,2,3] * [4,5,6] =', x * y)
print('[1,2,3] + [4,5,6] =', x + y)

# Dot product: the sum of the element-wise products.
print('Dot product of [1,2,3] and [4,5,6] =', np.dot(x, y))
import seaborn as sns

# Apply seaborn's default plot styling before drawing.
sns.set()

# Pair plot of the iris data, colored by the value of the 'species' column.
sns.pairplot(data=iris, hue="species")
from sklearn.linear_model import LinearRegression

# Keep only the irises of the species "setosa" in a DataFrame called `data`.
# Also try with 'setosa' replaced by 'versicolor' and 'virginica'.
data = iris.loc[iris['species'] == 'setosa']

# Split the data in two: the independent variables (sepal_length,
# sepal_width and petal_width) and the dependent variable (petal_length).
predictors = ['sepal_length', 'sepal_width', 'petal_width']
x = data[predictors]
y = data['petal_length']

# Create a scikit-learn LinearRegression object and fit it to the data.
lm = LinearRegression()
lm.fit(x, y)

# Report one fitted coefficient per independent variable: the label is
# left-aligned in 30 characters, the value is signed with three decimals.
for col, coef in zip(x.columns, lm.coef_):
    label = "Coefficient for " + col + ":"
    print("%-30s %+.3f" % (label, coef))
import statsmodels.api as sm

# Work with a single species.
# Also try with 'setosa' replaced by 'versicolor' and 'virginica'.
data = iris.loc[iris['species'] == 'setosa']

# Dependent variable first, then the three independent variables.
y = data['petal_length']
x = data[['sepal_length', 'sepal_width', 'petal_width']]

# Add a constant so that the linear model includes a y-intercept term.
x = sm.add_constant(x)

# Set up the ordinary-least-squares model and perform the regression.
lm = sm.OLS(endog=y, exog=x)
results = lm.fit()

# Print the results.
print(results.summary())
import statsmodels.formula.api as smf

df = iris[iris['species'] == 'setosa']

# Each entry pairs a model formula with the heading printed above its
# summary:
#   1. predict petal length from sepal_width, sepal_length, and petal_width
#   2. predict petal length from just sepal_length and petal_width
reports = (
    ('petal_length ~ sepal_width + sepal_length + petal_width',
     "Results (petal_length ~ sepal_width + sepal_length + petal_width)"),
    ('petal_length ~ sepal_length + petal_width',
     "Results (petal_length ~ sepal_length + petal_width)"),
)

for position, (formula, heading) in enumerate(reports):
    # Separate consecutive reports with a horizontal rule.
    if position > 0:
        print('-' * 80)
    model = smf.ols(formula=formula, data=df)
    results = model.fit()
    print(heading)
    print(results.summary())