import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Latex, Markdown
!pip install -U okpy
from client.api.notebook import Notebook
ok = Notebook('hw5.ok')
# Use this cell to examine the dataset, if you like.
!head -n 25 crawl.json
# This cell loads the JSON data into Python.
import json
with open("crawl.json", "r") as f:
crawl_json = json.load(f)
def json_description(crawl_json_records):
"""Produces information about a JSON object containing crawl data.
Args:
crawl_json_records (list): A list of JSON objects such as the
crawl_json variable above.
Returns:
2-tuple: An (int, set) pair. The integer is the number of records
in the given list. The set is a set (constructed with
Python's set() function) of strings. It contains all the
          field names that appear at the top level of any record
in crawl_json_records.
"""
...
# Feel free to erase the next line. It just demonstrates how
# to return a tuple. You'll have to replace the variable names
# with whatever you define in this function.
return (num_records, possible_fields)
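# A minimal sketch of the idea (not necessarily the intended solution), run on
# hypothetical toy records rather than crawl_json: len() gives the record
# count, and a set-union over each record's top-level .keys() collects the
# field names.
_toy_records = [{"url": "a", "checks": []}, {"url": "b", "updated_at": []}]
_toy_fields = set()
for _record in _toy_records:
    _toy_fields.update(_record.keys())
print(len(_toy_records), _toy_fields)  # 2 and {'url', 'checks', 'updated_at'} (set order may vary)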
_ = ok.grade('q01')
_ = ok.backup()
# Display results
n, keys = json_description(crawl_json)
print("Number of records:", n)
print("Keys in the records:", keys)
# Use this cell for your explorations.
q2_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q2_answer))
def make_crawls_dataframe(crawl_json_records):
"""Creates a Pandas DataFrame from the given list of JSON records.
The DataFrame corresponds to the following relation:
crawls(primary key (url, hour), updated)
Each hour in which a crawl happened for a page (regardless of
whether it found a change) should be represented. `updated` is
a boolean value indicating whether the check for that hour found
a change.
The result is sorted by URL in ascending order and **further**
sorted by hour in ascending order among the rows for each URL.
Args:
crawl_json_records (list): A list of JSON objects such as the
crawl_json variable above.
Returns:
DataFrame: A table whose schema (and sort order) is described
above.
"""
...
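# A hedged sketch of the general pattern only (the field names in the toy rows
# below are hypothetical, not the actual crawl.json schema): build one row per
# (url, hour), set a (url, hour) MultiIndex, and sort by url and then hour.
_toy_rows = [
    {"url": "b", "hour": 2, "updated": False},
    {"url": "a", "hour": 1, "updated": True},
    {"url": "b", "hour": 1, "updated": True},
]
_toy_crawls = (
    pd.DataFrame(_toy_rows)
    .set_index(["url", "hour"])
    .sort_index()
)
_toy_crawls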
# Run this cell before you continue.
crawls = make_crawls_dataframe(crawl_json)
_ = ok.grade('q03')
_ = ok.backup()
# Use this cell for your explorations. Write the schema
# in text in the string below.
q4_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q4_answer))
# Per-URL summary statistics of the hourly crawl checks.
updated_by_url = crawls['updated'].groupby(crawls.index.get_level_values('url'))
crawl_stats = pd.DataFrame({
    'number of crawls': updated_by_url.count(),
    'proportion of updates': updated_by_url.mean(),
    'number of updates': updated_by_url.sum(),
})
...
# Leave this for grading purposes
q5_p1_plot = plt.gcf()
...
# Leave this for grading purposes
q5_p2_plot = plt.gcf()
...
# Leave this for grading purposes
q5_p3_plot = plt.gcf()
q5_p4_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q5_p4_answer))
def display_points(points, xlim, title):
"""Displays a timeline with points on it.
Args:
points (ndarray): A list of floats in the range [xlim[0], xlim[1]],
each one a point to display in the timeline.
xlim (list-like): A list/tuple/array with 2 elements giving the
start and end of the timeline.
title (str): The title to display on the plot.
Example:
>>> # plot the points at [1,4,30,50]
>>> display_points([1,4,30,50], [0, 75], "Example")
"""
fig, ax = plt.subplots(1)
fig.set_size_inches(8, 1)
ax.scatter(points, np.repeat(0, len(points)), alpha=0.5)
ax.axhline(0, color="grey", zorder=-1, lw=.5)
ax.yaxis.set_visible(False)
ax.xaxis.set_visible(True)
ax.set_xlim(xlim)
ax.set_title("{} ({:d} total points)".format(title, len(points)))
plt.show()
display_points([1,4,30,50], [0, 75], "Example")
# This cell identifies a few categories of pages and
# associates a label in the 'Desc' column of crawl_stats
crawl_stats['Desc'] = "" # default blank description
# Normal pages have a moderate number of updates and
# were successfully crawled most times.
crawl_stats.loc[
(crawl_stats['proportion of updates'] > 0.1)
& (crawl_stats['proportion of updates'] < 0.9)
& (crawl_stats['number of crawls'] >= 700),
'Desc'] = 'Normal'
crawl_stats.loc[
(crawl_stats['proportion of updates'] < .1)
& (crawl_stats['number of crawls'] >= 700),
'Desc'] = 'Rare Update'
crawl_stats.loc[
(crawl_stats['proportion of updates'] > .9)
& (crawl_stats['number of crawls'] >= 700),
'Desc'] = 'Frequent Update'
crawl_stats.loc[
crawl_stats['number of crawls'] < 50,
'Desc'] = 'Few Crawls'
# Build a dataframe with a few examples from each type of webpage
num_of_each = 3
pages_to_display = pd.concat([
crawl_stats[crawl_stats['Desc'] == "Normal"].head(num_of_each),
crawl_stats[crawl_stats['Desc'] == 'Rare Update'].head(num_of_each),
crawl_stats[crawl_stats['Desc'] == 'Frequent Update'].head(num_of_each),
crawl_stats[crawl_stats['Desc'] == 'Few Crawls'].head(num_of_each),
])
# Construct the timeline point diagrams for each example
for (url, desc) in pages_to_display['Desc'].items():
crawls_for_site = crawls.loc[url]
display_points(
crawls_for_site[crawls_for_site['updated']].index,
[0, len(crawls_for_site)], desc)
# Use this cell for your explorations.
q6_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q6_answer))
def compute_q_q_pairs(url):
"""Computes lists of uniform-distribution-quantiles and data-quantiles for a page.
Args:
url (str): The URL of a page.
Returns:
2-tuple: A pair (AKA a 2-tuple). Both components are lists or arrays of
length n, where n is the number of positive checks for the page.
The first component contains quantiles of a uniform distribution,
and the second contains quantiles of the distribution of positive
check times for the page. The kth element of a component is the
(k+1)/(n+1)-th quantile of its distribution or dataset.
The first component of the pair contains quantiles of the uniform
distribution on [0, N], where N is the number of checks performed
for the page.
"""
...
obs_q = ...
pdf_q = ...
return (pdf_q, obs_q)
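# A hedged sketch of one way to produce the two quantile lists, assuming (as in
# the timeline plots above) that a page's positive check hours are the index
# values where 'updated' is True:
#   probs = (k+1)/(n+1) for k = 0..n-1
#   pdf_q = probs * N                         (quantiles of Uniform[0, N])
#   obs_q = np.percentile(positive_hours, 100 * probs)
# For example, with hypothetical values n = 4 positive checks out of N = 10 checks:
_probs = np.arange(1, 4 + 1) / (4 + 1)
print(_probs * 10)                                 # uniform quantiles on [0, 10]
print(np.percentile([2, 3, 7, 9], 100 * _probs))   # data quantiles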
_ = ok.grade('q07')
_ = ok.backup()
for url in pages_to_display[pages_to_display['Desc'] == "Normal"].index.values:
print(url)
(pdf_q, obs_q) = compute_q_q_pairs(url)
    sns.regplot(x=np.array(pdf_q), y=np.array(obs_q))
plt.xlabel("True uniform quantile")
plt.ylabel("Actual quantile")
plt.show()
# Use this cell for your explorations.
q7_p2_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q7_p2_answer))
url = 'http://a.hatena.ne.jp/Syako/simple'
n = np.count_nonzero(crawls.loc[url]['updated'])
...
# Leave this for grading purposes
q8_plot = plt.gcf()
# Use this cell for your explorations.
q8_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q8_answer))
q9_answer = r"""
Put your answer here and delete these two sentences. Some
steps are already filled in to get you started; the '...'
indicate where you need to fill in one or more lines.
**Step 1.** The probability of the data given $\lambda$ is:
$$L(\lambda) = e^{-(\lambda N)} \frac{(\lambda N)^{n}}{n!}$$
...
Therefore,
$$\lambda^{\text{MLE}} = ...$$
"""
display(Markdown(q9_answer))
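# A hedged sketch of the standard steps (not necessarily the exact write-up
# expected above): with L(lambda) = e^{-lambda N} (lambda N)^n / n!, the
# log-likelihood is  -lambda*N + n*log(lambda*N) - log(n!);  its derivative in
# lambda is  -N + n/lambda,  which is zero at  lambda_MLE = n / N.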
...
# Leave this at the end so we can grab the plot for grading
q10_plot = plt.gcf()
_ = ok.grade('q10')
_ = ok.backup()
def sample_poisson_process(rate, length):
"""Generates n points from Poisson(rate*length) and locates them
uniformly at random on [0, length].
Args:
rate (float): The average number of points per unit length.
length (float): The length of the line segment.
Returns:
ndarray: An array of points scattered randomly on [0, length].
The number of points has a Poisson(rate*length)
distribution.
"""
...
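# A minimal sketch (not necessarily the intended solution): draw the number of
# points from a Poisson(rate * length) distribution, then place that many
# points uniformly at random on [0, length].
def _sample_poisson_process_sketch(rate, length):
    num_points = np.random.poisson(rate * length)
    return np.random.uniform(0, length, num_points)

display_points(_sample_poisson_process_sketch(0.1, 100), [0, 100], "Sketch")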
_ = ok.grade('q11')
_ = ok.backup()
def snap_times(update_times, window_length, process_length):
"""Given a list of change times, produces a list of the windows that detected a
change (that is, where at least one change happened inside the window).
This has the effect of 'snapping' each change to the next hour (or whatever the
window_length is). For periods where more than one change happened, the output
will still only list the period once.
In other words, it produces a list of the positive checks given a list of true
change times.
Args:
update_times (ndarray): A list of times when changes happened for a page.
All times are between 0 and process_length.
window_length (float): The width of each window. (For example, if time
is being measured in hours, and an observation
happens once per hour, then this should be 1.)
        process_length (float): The last time any change could have happened.
Returns:
ndarray: A list of numbers, each one the right endpoint of a window where at
least one change happened."""
window_ends = np.arange(0, process_length, window_length) + window_length
num_windows = len(window_ends)
event_windows = np.floor(np.array(update_times) / window_length).astype(int)
events_by_window = np.bincount(event_windows, minlength=num_windows)
event_happened = events_by_window > 0
return window_ends[event_happened]
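# A small usage check of snap_times: changes at hours 0.5, 0.7, and 2.3, with
# hourly windows over a length-3 process, are detected by the windows ending at
# 1 and 3 (the two changes inside the first window collapse to a single check).
print(snap_times(np.array([0.5, 0.7, 2.3]), 1.0, 3.0))  # [1. 3.]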
# Use this cell for your explorations, then write your conclusions
# in the string below.
q12_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q12_answer))
def simulate_censored_rate_estimate(true_rate, num_visits):
"""Simulates updates to a page and visits by a web crawler, and
returns the proportion of visits in which an update was observed.
Args:
true_rate (float): The average number of updates per unit length
of time. (The units are irrelevant for the
purposes of this function, but you can imagine
that they are hours.)
num_visits (float): The number of visits made to the site. One
visit is made per unit length of time, so
this is also equal to the duration of time
simulated.
Returns:
float: the MLE for true_rate lambda, based on the number of
observed positive checks.
"""
# The skeleton here is provided for your convenience; you
# don't have to follow it.
draws = ...
windows = ...
return ...
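# A hedged sketch following the provided skeleton (not necessarily the intended
# solution, and assuming sample_poisson_process has been completed above):
# simulate true change times, snap them to hourly positive checks, and return
# the proportion of visits that observed a change.
def _simulate_censored_rate_estimate_sketch(true_rate, num_visits):
    draws = sample_poisson_process(true_rate, num_visits)
    windows = snap_times(draws, 1, num_visits)
    return len(windows) / num_visits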
def simulate_many_rate_estimates(true_rate, num_visits):
return np.mean([simulate_censored_rate_estimate(true_rate, num_visits)
for _ in range(1000)])
num_visits = 719
rates = list(np.arange(0, 4, .2))
estimates = [simulate_many_rate_estimates(r, num_visits) for r in rates]
plt.plot(rates, estimates, label = 'average estimate');
plt.plot([rates[0], rates[-1]], [rates[0], rates[-1]],
linestyle='dashed', color='g', label='true rate')
plt.xlabel("True rate of updates per hour")
plt.ylabel("Average estimate of the rate of updates per hour")
plt.legend();
_ = ok.grade('q13')
_ = ok.backup()
q14_p1_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q14_p1_answer))
q14_p2_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q14_p2_answer))
q15_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q15_answer))
# The probability that a Poisson(0.5) random variable is equal to 0
Prob_pois_half_equals_0 = ...
# The probability that a Poisson(0.5) random variable is greater than or equal to 1
Prob_pois_half_gte_1 = ...
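# A numerical cross-check, assuming scipy is available in this environment:
# P(Poisson(0.5) = 0) is poisson.pmf(0, 0.5), and P(Poisson(0.5) >= 1) is
# poisson.sf(0, 0.5) (the survival function, i.e. 1 - P(X <= 0)).
from scipy.stats import poisson
print(poisson.pmf(0, 0.5), poisson.sf(0, 0.5))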
_ = ok.grade('q15')
_ = ok.backup()
q16_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q16_answer))
...
_ = ok.grade('q17')
_ = ok.backup()
def simulate_change_rate_estimate(change_rate, num_observations, estimator):
"""Simulates hourly change observations for a website and produces an
estimate of the change rate.
Args:
change_rate (float): The hourly change rate of the website to be used in
the simulation.
num_observations (int): The number of observations (equivalently, the
number of hours) to simulate.
estimator (func): A function to apply to the simulated observations to
produce an estimate of the change rate. Has the same
signature as the function modified_mle below.
Returns:
float: The estimate produced by calling estimator on the simulated
list of positive checks.
"""
...
return
def modified_mle(positive_checks, num_observations):
"""Produces the modified MLE for a dataset.
Args:
positive_checks (ndarray): A list or array of the hours when a
change was observed.
num_observations (int): The number of hours in which we could
have observed a change.
Returns:
(float): The modified MLE.
"""
...
return
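# A hedged sketch of how these two pieces could fit together (not necessarily
# the intended solutions). The simulation reuses the helpers defined above; the
# estimator formula shown here, -log(1 - X/N), is an assumption based on the
# censored-observation model, since the actual derivation lives in an earlier
# question.
def _simulate_change_rate_estimate_sketch(change_rate, num_observations, estimator):
    change_times = sample_poisson_process(change_rate, num_observations)
    positive_checks = snap_times(change_times, 1, num_observations)
    return estimator(positive_checks, num_observations)

def _modified_mle_sketch(positive_checks, num_observations):
    proportion_positive = len(positive_checks) / num_observations
    # Diverges to infinity if every check was positive.
    return -np.log(1 - proportion_positive)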
_ = ok.grade('q18')
_ = ok.backup()
def plot_bootstrap_rate_estimates(page_url, num_simulations):
"""Simulates positive check observations for a website many times. For
each simulation, the page's change rate is estimated based on the simulated
positive checks. The estimation method is the modified_mle function
implemented above. Then this function produces a histogram that shows
the distribution of these estimated change rates.
When conducting each simulation, the change rate is taken to be the
estimated update rate for the page. *That* estimate is the modified
MLE computed from the actual positive check observations for that page.
The modified MLE computed from the actual observations for the page
is also displayed as a vertical red line at an appropriate horizontal
position on the histogram.
    Here is a diagram that might be helpful:

        (This happens 1 time)
        Positive check observations for this page
            ---modified_mle---> Estimate of change rate for this page

        (This happens 1000x, once per simulation)
        Estimate of change rate for this page
            ---simulation---> Simulated positive checks
            ---modified_mle---> Bootstrap estimate of change rate for this page

        1000 bootstrap estimates of change rate for this page ---> Histogram
        (with the estimate of change rate for this page displayed
         somewhere on it)
Args:
page_url (str): The URL of the page to simulate.
num_simulations (int): The number of simulations to run.
Returns:
None: This function doesn't return anything; it just makes a histogram
appear as described above.
"""
...
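# A hedged sketch (assuming modified_mle and simulate_change_rate_estimate have
# been completed above, and that a page's positive checks are the hours where
# 'updated' is True in crawls):
def _plot_bootstrap_rate_estimates_sketch(page_url, num_simulations):
    page_crawls = crawls.loc[page_url]
    positive_checks = page_crawls.index[page_crawls['updated'].values].values
    num_observations = len(page_crawls)
    # Estimate the change rate from the actual observations for this page.
    estimated_rate = modified_mle(positive_checks, num_observations)
    # Re-estimate the rate on each of num_simulations simulated datasets.
    bootstrap_estimates = [
        simulate_change_rate_estimate(estimated_rate, num_observations, modified_mle)
        for _ in range(num_simulations)]
    plt.hist(bootstrap_estimates, bins=30)
    plt.axvline(estimated_rate, color='red')
    plt.xlabel("Estimated change rate")
    plt.title(page_url)
    plt.show()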
# Run this cell to make plots for several pages using your
# function. Since no automatic tests are provided for this
# question, we suggest examining the plots to make sure they
# make sense to you.
many_updates_url = crawl_stats[np.logical_and(
    crawl_stats['number of crawls'] >= 700,
    np.logical_and(
        .2 <= crawl_stats['proportion of updates'],
        crawl_stats['proportion of updates'] <= .8))].index[0]
plot_bootstrap_rate_estimates(many_updates_url, 1000)
few_updates_url = crawl_stats[np.logical_and(
    crawl_stats['number of crawls'] >= 700,
    np.logical_and(
        .05 <= crawl_stats['proportion of updates'],
        crawl_stats['proportion of updates'] <= .15))].index[0]
plot_bootstrap_rate_estimates(few_updates_url, 1000)
def estimate_rmse(page_url, num_simulations):
"""Simulates update observations for a website many times. For each
simulation, the page's change rate is estimated based on the simulated
observations. (The estimation method is the modified MLE.) Then this
function produces an estimate of the RMSE of estimates of the change
rate for this page.
When conducting each simulation, the change rate is taken to be the
estimated change rate for the page. *That* estimate is the modified
MLE computed from the actual observations for the page.
We compute the modified MLE for each set of simulated observations. That
constitutes num_simulations estimates of the change rate.
Then we compute the RMSE of those estimates. The "true" change rate in
that calculation is taken to be the modified MLE computed from the
actual observations for the page.
Args:
page_url (str): The URL of the page to simulate.
num_simulations (int): The number of simulations to run.
Returns:
float: The estimated RMSE of the modified MLE for the given page,
based on num_simulations simulations.
"""
...
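# A hedged sketch along the same lines as the bootstrap-plot sketch above
# (again assuming modified_mle and simulate_change_rate_estimate have been
# completed): the RMSE is taken around the rate used to drive the simulations.
def _estimate_rmse_sketch(page_url, num_simulations):
    page_crawls = crawls.loc[page_url]
    positive_checks = page_crawls.index[page_crawls['updated'].values].values
    num_observations = len(page_crawls)
    estimated_rate = modified_mle(positive_checks, num_observations)
    bootstrap_estimates = np.array([
        simulate_change_rate_estimate(estimated_rate, num_observations, modified_mle)
        for _ in range(num_simulations)])
    return np.sqrt(np.mean((bootstrap_estimates - estimated_rate) ** 2))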
print(estimate_rmse(many_updates_url, 1000))
print(estimate_rmse(few_updates_url, 1000))
# After completing estimate_rmse above, estimate the RMSEs for all
# the pages here, and add them to crawl_stats as a new column named
# 'rmse'.
...
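# A hedged sketch of one way to fill in the column (the choice of 100
# simulations per page is arbitrary, and looping over every URL this way may
# take a while):
# crawl_stats['rmse'] = [estimate_rmse(url, 100) for url in crawl_stats.index]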
_ = ok.grade('q20')
_ = ok.backup()
...
# Feel free to use this cell to experiment. Then write your answer
# in the string below.
q22_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q22_answer))
_ = ok.grade_all()
_ = ok.submit()