Lecture 1: Intro to Data Science in Python

!pip install -r requirements.txt import pandas as pd import matplotlib.pyplot as plt import numpy as np import statsmodels.api as sm import seaborn as sns from sklearn.linear_model import LinearRegression from IPython.display import display iris = sns.load_dataset('iris')

# Print a greeting to the cell output.
print("Hello, world!")

# IPython help syntax: a leading `?` displays the documentation for pd.DataFrame.
?pd.DataFrame

# IPython magic that shows the IPython quick reference sheet.
%quickref

%matplotlib inline import matplotlib.pyplot as plt import ipywidgets as widgets from scipy.stats import norm, linregress out = [widgets.Output(), widgets.Output()] tabs = widgets.Tab(children=[out[0], out[1]]) tabs.set_title(0, 'Linear regression') tabs.set_title(1, 'Normal distribution') with out[0]: # Fit line to some random data x = np.random.uniform(size=30) y = x + np.random.normal(scale=0.1, size=30) slope, intercept, _, _, _ = linregress(x,y) u = np.linspace(0, 1) # Plot fig1, axes1 = plt.subplots() axes1.scatter(x, y) axes1.plot(u, slope * u + intercept, 'k') plt.show(fig1) with out[1]: # Plot the probability distribution function (pdf) of the # standard normal distribution. x = np.linspace(-3.5, 3.5, num=100) p = norm.pdf(x) # Plot fig2, axes2 = plt.subplots() axes2.plot(x, p) plt.show(fig2) display(tabs)

import pandas as pd
import seaborn as sns

iris = sns.load_dataset('iris')

# `iris` is stored as a pandas DataFrame
print('Type of "iris":', type(iris))

# Show the first few entries in this DataFrame
iris.head()

# 1. Column labels, and types of data in each column
print(iris.dtypes)

# 2. Calculate the average petal length
print(iris['petal_length'].mean())

# 3. Determine which iris species are in the dataset
print(iris['species'].unique())

# 4. Summary of the data
iris.describe()

# IPython.display is a convenience function that works in Jupyter Notebook
# (or, more generally, any IPython-based application) that will show objects
# in a nicer way than using print(). We'll use it in this notebook to show
# some pandas DataFrames.
from IPython.display import display

# Create a DataFrame for each species of flower. Two equivalent methods for
# creating these DataFrames are shown below; pick whichever you prefer.

# Method 1: "query" function
setosa = iris.query('species == "setosa"')
versicolor = iris.query('species == "versicolor"')

# Method 2: index into the DataFrame with a boolean mask
virginica = iris[iris['species'] == 'virginica']

# Show the first few entries of the DataFrame corresponding to each species
print('Setosa data:')
display(setosa.head())

print('Versicolor data:')
display(versicolor.head())

print('Virginica data:')
display(virginica.head())

# Get the first column.
#
# Note: whenever we extract a single column of a pandas DataFrame, we get
# back a pandas Series object. To turn it back into a DataFrame, we add the
# line `first_column = pd.DataFrame(first_column)`.
first_column = iris.iloc[:, 0]
first_column = pd.DataFrame(first_column)
print('First column:')
display(first_column.head())

# Get the first through third columns
first_through_third_columns = iris.iloc[:, 0:3]
print('First through third columns:')
display(first_through_third_columns.head())

# Get the 'species' column.
species = iris['species']
species = pd.DataFrame(species)
print('Species column:')
display(species.head())

# Get all columns *except* the species column, using a boolean mask over
# the column labels.
all_but_species = iris.iloc[:, iris.columns != 'species']
print("All columns *except* species:")
display(all_but_species.head())

# Build a small DataFrame from nested lists: each inner list becomes one
# row, and `columns` supplies the column labels.
column_labels = ['A', 'B']
column_entries = [
    [1, 2],
    [4, 5],
    [7, 8],
]
pd.DataFrame(column_entries, columns=column_labels)

import numpy as np

# 1. Create an array with the numbers [1, 2, 3]
x = np.array([1, 2, 3])

# 2. Create a 2 x 2 matrix with [1, 2] in the first row and [3, 4]
#    in the second row.
x = np.array([[1, 2], [3, 4]])

# 3. Create an array with the numbers 0, 1, ... , 9. Equivalent to
#    calling np.array(range(10))
x = np.arange(10)

# 4. Create a 2 x 2 matrix with zeros in all entries
x = np.zeros((2, 2))

# 5. Get the total number of items in the matrix, and the shape of
#    the matrix.
num_items = x.size
matrix_shape = x.shape

### Constants: pi
print('π = %f' % np.pi)
print()

### Simple functions: sine, cosine, e^x, log, ...
print('sin(0) = %f' % np.sin(0))
print('cos(0) = %f' % np.cos(0))
print('e^1 = %f' % np.exp(1))
print('ln(1) = %f' % np.log(1))
print()

### Minimums, maximums, sums...
x = np.array([1, 2, 3])
print('Min of [1,2,3] = %d' % x.min())
print('Max of [1,2,3] = %d' % x.max())
print('Sum of [1,2,3] = %d' % x.sum())
print()

### Random numbers: uniform distribution, normal distribution, ...
print('Random numbers:')
print('Uniform([0,1]): %f' % np.random.uniform(0, 1))
print('Normal(0,1): %f' % np.random.normal(loc=0, scale=1))
print('Poisson(1): %f' % np.random.poisson(1))

# Arithmetic on NumPy arrays is element-wise; a scalar operand is applied
# to every element.
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])

print('1 + [1,2,3] =', 1 + x)
print('3 * [1,2,3] =', 3 * x)
print('[1,2,3] * [4,5,6] =', x * y)
print('[1,2,3] + [4,5,6] =', x + y)
print('Dot product of [1,2,3] and [4,5,6] =', x.dot(y))

import seaborn as sns

# Switch to seaborn's default plot styling, then plot the pairwise
# relationships between the columns of `iris`, coloured by species.
sns.set()
sns.pairplot(iris, hue="species")

from sklearn.linear_model import LinearRegression

# Get all of the irises of the species "setosa" and place them in a pandas
# DataFrame called `data`. Also try with 'setosa' replaced by 'versicolor'
# and 'virginica'.
data = iris[iris['species'] == 'setosa']

# Split the data into two pieces: the independent variables (sepal_length,
# sepal_width, and petal_width), and the dependent variable (petal_length).
x = data[['sepal_length', 'sepal_width', 'petal_width']]
y = data['petal_length']

# Create a scikit-learn LinearRegression object and fit it to the data.
lm = LinearRegression()
lm.fit(x, y)

# Report one fitted coefficient per independent variable.
for (coef, col) in zip(lm.coef_, x.columns):
    print("%-30s %+.3f" % ("Coefficient for " + col + ":", coef))

import statsmodels.api as sm

# Also try with 'setosa' replaced by 'versicolor' and 'virginica'
data = iris[iris['species'] == 'setosa']

x = data[['sepal_length', 'sepal_width', 'petal_width']]
y = data['petal_length']

# Add y-intercept term to our linear model
x = sm.add_constant(x)

# Perform linear regression (note the argument order: dependent variable
# first, then the independent variables)
lm = sm.OLS(y, x)
results = lm.fit()

# Print the results
print(results.summary())

import statsmodels.formula.api as smf

df = iris[iris['species'] == 'setosa']

# Predict petal length from sepal_width, sepal_length, and petal_width
model = smf.ols(
    formula='petal_length ~ sepal_width + sepal_length + petal_width',
    data=df,
)
results = model.fit()
print("Results (petal_length ~ sepal_width + sepal_length + petal_width)")
print(results.summary())

print('-' * 80)

# Predict petal length from just sepal_length and petal_width
model = smf.ols(formula='petal_length ~ sepal_length + petal_width', data=df)
results = model.fit()
print("Results (petal_length ~ sepal_length + petal_width)")
print(results.summary())