import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Latex, Markdown
!pip install -U okpy
from client.api.notebook import Notebook
ok = Notebook('hw5.ok')
# Use this cell to examine the dataset, if you like.
!head -n 25 crawl.json
# This cell loads the JSON data into Python.
import json
with open("crawl.json", "r") as f:
crawl_json = json.load(f)
def json_description(crawl_json_records):
"""Produces information about a JSON object containing crawl data.
Args:
crawl_json_records (list): A list of JSON objects such as the
crawl_json variable above.
Returns:
2-tuple: An (int, set) pair. The integer is the number of records
in the given list. The set is a set (constructed with
Python's set() function) of strings. It contains all the
          field names that appear at the top level of any record
in crawl_json_records.
"""
...
# Feel free to erase the next line. It just demonstrates how
# to return a tuple. You'll have to replace the variable names
# with whatever you define in this function.
return (num_records, possible_fields)
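# A minimal sketch of the idea (not necessarily the intended solution), run on
# hypothetical toy records rather than crawl_json: len() gives the record
# count, and a set-union over each record's top-level .keys() collects the
# field names.
_toy_records = [{"url": "a", "checks": []}, {"url": "b", "updated_at": []}]
_toy_fields = set()
for _record in _toy_records:
    _toy_fields.update(_record.keys())
print(len(_toy_records), _toy_fields)  # 2 and {'url', 'checks', 'updated_at'} (set order may vary)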
_ = ok.grade('q01')
_ = ok.backup()
# Display results
n, keys = json_description(crawl_json)
print("Number of records:", n)
print("Keys in the records:", keys)
# Use this cell for your explorations.
q2_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q2_answer))
def make_crawls_dataframe(crawl_json_records):
"""Creates a Pandas DataFrame from the given list of JSON records.
The DataFrame corresponds to the following relation:
crawls(primary key (url, hour), updated)
Each hour in which a crawl happened for a page (regardless of
whether it found a change) should be represented. `updated` is
a boolean value indicating whether the check for that hour found
a change.
The result is sorted by URL in ascending order and **further**
sorted by hour in ascending order among the rows for each URL.
Args:
crawl_json_records (list): A list of JSON objects such as the
crawl_json variable above.
Returns:
DataFrame: A table whose schema (and sort order) is described
above.
"""
...
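# A hedged sketch of the general pattern only (the field names in the toy rows
# below are hypothetical, not the actual crawl.json schema): build one row per
# (url, hour), set a (url, hour) MultiIndex, and sort by url and then hour.
_toy_rows = [
    {"url": "b", "hour": 2, "updated": False},
    {"url": "a", "hour": 1, "updated": True},
    {"url": "b", "hour": 1, "updated": True},
]
_toy_crawls = (
    pd.DataFrame(_toy_rows)
    .set_index(["url", "hour"])
    .sort_index()
)
_toy_crawls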
# Run this cell before you continue.
crawls = make_crawls_dataframe(crawl_json)
_ = ok.grade('q03')
_ = ok.backup()
# Use this cell for your explorations. Write the schema
# in text in the string below.
q4_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q4_answer))
# Per-URL summary statistics of the hourly crawl checks.
updated_by_url = crawls['updated'].groupby(crawls.index.get_level_values('url'))
crawl_stats = pd.DataFrame({
    'number of crawls': updated_by_url.count(),
    'proportion of updates': updated_by_url.mean(),
    'number of updates': updated_by_url.sum(),
})
...
# Leave this for grading purposes
q5_p1_plot = plt.gcf()
...
# Leave this for grading purposes
q5_p2_plot = plt.gcf()
...
# Leave this for grading purposes
q5_p3_plot = plt.gcf()
q5_p4_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q5_p4_answer))
def display_points(points, xlim, title):
"""Displays a timeline with points on it.
Args:
points (ndarray): A list of floats in the range [xlim[0], xlim[1]],
each one a point to display in the timeline.
xlim (list-like): A list/tuple/array with 2 elements giving the
start and end of the timeline.
title (str): The title to display on the plot.
Example:
>>> # plot the points at [1,4,30,50]
>>> display_points([1,4,30,50], [0, 75], "Example")
"""
fig, ax = plt.subplots(1)
fig.set_size_inches(8, 1)
ax.scatter(points, np.repeat(0, len(points)), alpha=0.5)
ax.axhline(0, color="grey", zorder=-1, lw=.5)
ax.yaxis.set_visible(False)
ax.xaxis.set_visible(True)
ax.set_xlim(xlim)
ax.set_title("{} ({:d} total points)".format(title, len(points)))
plt.show()
display_points([1,4,30,50], [0, 75], "Example")
# This cell identifies a few categories of pages and
# associates a label in the 'Desc' column of crawl_stats
crawl_stats['Desc'] = "" # default blank description
# Normal pages have a moderate number of updates and
# were successfully crawled most times.
crawl_stats.loc[
(crawl_stats['proportion of updates'] > 0.1)
& (crawl_stats['proportion of updates'] < 0.9)
& (crawl_stats['number of crawls'] >= 700),
'Desc'] = 'Normal'
crawl_stats.loc[
(crawl_stats['proportion of updates'] < .1)
& (crawl_stats['number of crawls'] >= 700),
'Desc'] = 'Rare Update'
crawl_stats.loc[
(crawl_stats['proportion of updates'] > .9)
& (crawl_stats['number of crawls'] >= 700),
'Desc'] = 'Frequent Update'
crawl_stats.loc[
crawl_stats['number of crawls'] < 50,
'Desc'] = 'Few Crawls'
# Build a dataframe with a few examples from each type of webpage
num_of_each = 3
pages_to_display = pd.concat([
crawl_stats[crawl_stats['Desc'] == "Normal"].head(num_of_each),
crawl_stats[crawl_stats['Desc'] == 'Rare Update'].head(num_of_each),
crawl_stats[crawl_stats['Desc'] == 'Frequent Update'].head(num_of_each),
crawl_stats[crawl_stats['Desc'] == 'Few Crawls'].head(num_of_each),
])
# Construct the timeline point diagrams for each example
for (url, desc) in pages_to_display['Desc'].items():
crawls_for_site = crawls.loc[url]
display_points(
crawls_for_site[crawls_for_site['updated']].index,
[0, len(crawls_for_site)], desc)
# Use this cell for your explorations.
q6_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q6_answer))
def compute_q_q_pairs(url):
"""Computes lists of uniform-distribution-quantiles and data-quantiles for a page.
Args:
url (str): The URL of a page.
Returns:
2-tuple: A pair (AKA a 2-tuple). Both components are lists or arrays of
length n, where n is the number of positive checks for the page.
The first component contains quantiles of a uniform distribution,
and the second contains quantiles of the distribution of positive
check times for the page. The kth element of a component is the
(k+1)/(n+1)-th quantile of its distribution or dataset.
The first component of the pair contains quantiles of the uniform
distribution on [0, N], where N is the number of checks performed
for the page.
"""
...
obs_q = ...
pdf_q = ...
return (pdf_q, obs_q)
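# A hedged sketch of one way to produce the two quantile lists, assuming (as in
# the timeline plots above) that a page's positive check hours are the index
# values where 'updated' is True:
#   probs = (k+1)/(n+1) for k = 0..n-1
#   pdf_q = probs * N                         (quantiles of Uniform[0, N])
#   obs_q = np.percentile(positive_hours, 100 * probs)
# For example, with hypothetical values n = 4 positive checks out of N = 10 checks:
_probs = np.arange(1, 4 + 1) / (4 + 1)
print(_probs * 10)                                 # uniform quantiles on [0, 10]
print(np.percentile([2, 3, 7, 9], 100 * _probs))   # data quantiles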
_ = ok.grade('q07')
_ = ok.backup()
for url in pages_to_display[pages_to_display['Desc'] == "Normal"].index.values:
print(url)
(pdf_q, obs_q) = compute_q_q_pairs(url)
    sns.regplot(x=np.array(pdf_q), y=np.array(obs_q))
plt.xlabel("True uniform quantile")
plt.ylabel("Actual quantile")
plt.show()
# Use this cell for your explorations.
q7_p2_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q7_p2_answer))
url = 'http://a.hatena.ne.jp/Syako/simple'
n = np.count_nonzero(crawls.loc[url]['updated'])
...
# Leave this for grading purposes
q8_plot = plt.gcf()
# Use this cell for your explorations.
q8_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q8_answer))
q9_answer = r"""
Put your answer here and delete these two sentences. Some
steps are already filled in to get you started; the '...'
indicate where you need to fill in one or more lines.
**Step 1.** The probability of the data given $\lambda$ is:
$$L(\lambda) = e^{-(\lambda N)} \frac{(\lambda N)^{n}}{n!}$$
...
Therefore,
$$\lambda^{\text{MLE}} = ...$$
"""
display(Markdown(q9_answer))
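# A hedged sketch of the standard steps (not necessarily the exact write-up
# expected above): with L(lambda) = e^{-lambda N} (lambda N)^n / n!, the
# log-likelihood is  -lambda*N + n*log(lambda*N) - log(n!);  its derivative in
# lambda is  -N + n/lambda,  which is zero at  lambda_MLE = n / N.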
...
# Leave this at the end so we can grab the plot for grading
q10_plot = plt.gcf()
_ = ok.grade('q10')
_ = ok.backup()
def sample_poisson_process(rate, length):
"""Generates n points from Poisson(rate*length) and locates them
uniformly at random on [0, length].
Args:
rate (float): The average number of points per unit length.
length (float): The length of the line segment.
Returns:
ndarray: An array of points scattered randomly on [0, length].
The number of points has a Poisson(rate*length)
distribution.
"""
...
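# A minimal sketch (not necessarily the intended solution): draw the number of
# points from a Poisson(rate * length) distribution, then place that many
# points uniformly at random on [0, length].
def _sample_poisson_process_sketch(rate, length):
    num_points = np.random.poisson(rate * length)
    return np.random.uniform(0, length, num_points)

display_points(_sample_poisson_process_sketch(0.1, 100), [0, 100], "Sketch")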
_ = ok.grade('q11')
_ = ok.backup()
def snap_times(update_times, window_length, process_length):
"""Given a list of change times, produces a list of the windows that detected a
change (that is, where at least one change happened inside the window).
This has the effect of 'snapping' each change to the next hour (or whatever the
window_length is). For periods where more than one change happened, the output
will still only list the period once.
In other words, it produces a list of the positive checks given a list of true
change times.
Args:
update_times (ndarray): A list of times when changes happened for a page.
All times are between 0 and process_length.
window_length (float): The width of each window. (For example, if time
is being measured in hours, and an observation
happens once per hour, then this should be 1.)
        process_length (float): The last time any change could have happened.
Returns:
ndarray: A list of numbers, each one the right endpoint of a window where at
least one change happened."""
window_ends = np.arange(0, process_length, window_length) + window_length
num_windows = len(window_ends)
event_windows = np.floor(np.array(update_times) / window_length).astype(int)
events_by_window = np.bincount(event_windows, minlength=num_windows)
event_happened = events_by_window > 0
return window_ends[event_happened]
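# A small usage check of snap_times: changes at hours 0.5, 0.7, and 2.3, with
# hourly windows over a length-3 process, are detected by the windows ending at
# 1 and 3 (the two changes inside the first window collapse to a single check).
print(snap_times(np.array([0.5, 0.7, 2.3]), 1.0, 3.0))  # [1. 3.]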
# Use this cell for your explorations, then write your conclusions
# in the string below.
q12_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q12_answer))
def simulate_censored_rate_estimate(true_rate, num_visits):
"""Simulates updates to a page and visits by a web crawler, and
returns the proportion of visits in which an update was observed.
Args:
true_rate (float): The average number of updates per unit length
of time. (The units are irrelevant for the
purposes of this function, but you can imagine
that they are hours.)
num_visits (float): The number of visits made to the site. One
visit is made per unit length of time, so
this is also equal to the duration of time
simulated.
Returns:
float: the MLE for true_rate lambda, based on the number of
observed positive checks.
"""
# The skeleton here is provided for your convenience; you
# don't have to follow it.
draws = ...
windows = ...
return ...
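# A hedged sketch following the provided skeleton (not necessarily the intended
# solution, and assuming sample_poisson_process has been completed above):
# simulate true change times, snap them to hourly positive checks, and return
# the proportion of visits that observed a change.
def _simulate_censored_rate_estimate_sketch(true_rate, num_visits):
    draws = sample_poisson_process(true_rate, num_visits)
    windows = snap_times(draws, 1, num_visits)
    return len(windows) / num_visits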
def simulate_many_rate_estimates(true_rate, num_visits):
return np.mean([simulate_censored_rate_estimate(true_rate, num_visits)
for _ in range(1000)])
num_visits = 719
rates = list(np.arange(0, 4, .2))
estimates = [simulate_many_rate_estimates(r, num_visits) for r in rates]
plt.plot(rates, estimates, label = 'average estimate');
plt.plot([rates[0], rates[-1]], [rates[0], rates[-1]],
linestyle='dashed', color='g', label='true rate')
plt.xlabel("True rate of updates per hour")
plt.ylabel("Average estimate of the rate of updates per hour")
plt.legend();
_ = ok.grade('q13')
_ = ok.backup()
q14_p1_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q14_p1_answer))
q14_p2_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q14_p2_answer))
q15_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q15_answer))
# The probability that a Poisson(0.5) random variable is equal to 0
Prob_pois_half_equals_0 = ...
# The probability that a Poisson(0.5) random variable is greater than or equal to 1
Prob_pois_half_gte_1 = ...
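# A numerical cross-check, assuming scipy is available in this environment:
# P(Poisson(0.5) = 0) is poisson.pmf(0, 0.5), and P(Poisson(0.5) >= 1) is
# poisson.sf(0, 0.5) (the survival function, i.e. 1 - P(X <= 0)).
from scipy.stats import poisson
print(poisson.pmf(0, 0.5), poisson.sf(0, 0.5))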
_ = ok.grade('q15')
_ = ok.backup()
q16_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q16_answer))
...
_ = ok.grade('q17')
_ = ok.backup()
def simulate_change_rate_estimate(change_rate, num_observations, estimator):
"""Simulates hourly change observations for a website and produces an
estimate of the change rate.
Args:
change_rate (float): The hourly change rate of the website to be used in
the simulation.
num_observations (int): The number of observations (equivalently, the
number of hours) to simulate.
estimator (func): A function to apply to the simulated observations to
produce an estimate of the change rate. Has the same
signature as the function modified_mle below.
Returns:
float: The estimate produced by calling estimator on the simulated
list of positive checks.
"""
...
return
def modified_mle(positive_checks, num_observations):
"""Produces the modified MLE for a dataset.
Args:
positive_checks (ndarray): A list or array of the hours when a
change was observed.
num_observations (int): The number of hours in which we could
have observed a change.
Returns:
(float): The modified MLE.
"""
...
return
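# A hedged sketch of how these two pieces could fit together (not necessarily
# the intended solutions). The simulation reuses the helpers defined above; the
# estimator formula shown here, -log(1 - X/N), is an assumption based on the
# censored-observation model, since the actual derivation lives in an earlier
# question.
def _simulate_change_rate_estimate_sketch(change_rate, num_observations, estimator):
    change_times = sample_poisson_process(change_rate, num_observations)
    positive_checks = snap_times(change_times, 1, num_observations)
    return estimator(positive_checks, num_observations)

def _modified_mle_sketch(positive_checks, num_observations):
    proportion_positive = len(positive_checks) / num_observations
    # Diverges to infinity if every check was positive.
    return -np.log(1 - proportion_positive)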
_ = ok.grade('q18')
_ = ok.backup()
def plot_bootstrap_rate_estimates(page_url, num_simulations):
"""Simulates positive check observations for a website many times. For
each simulation, the page's change rate is estimated based on the simulated
positive checks. The estimation method is the modified_mle function
implemented above. Then this function produces a histogram that shows
the distribution of these estimated change rates.
When conducting each simulation, the change rate is taken to be the
estimated update rate for the page. *That* estimate is the modified
MLE computed from the actual positive check observations for that page.
The modified MLE computed from the actual observations for the page
is also displayed as a vertical red line at an appropriate horizontal
position on the histogram.
    Here is a diagram that might be helpful:

        (This happens 1 time)
        Positive check observations for this page
            ---modified_mle---> Estimate of change rate for this page

        (This happens 1000x, once per simulation)
        Estimate of change rate for this page
            ---simulation---> Simulated positive checks
            ---modified_mle---> Bootstrap estimate of change rate for this page

        1000 bootstrap estimates of change rate for this page ---> Histogram
        (with the estimate of change rate for this page displayed
         somewhere on it)
Args:
page_url (str): The URL of the page to simulate.
num_simulations (int): The number of simulations to run.
Returns:
None: This function doesn't return anything; it just makes a histogram
appear as described above.
"""
...
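# A hedged sketch (assuming modified_mle and simulate_change_rate_estimate have
# been completed above, and that a page's positive checks are the hours where
# 'updated' is True in crawls):
def _plot_bootstrap_rate_estimates_sketch(page_url, num_simulations):
    page_crawls = crawls.loc[page_url]
    positive_checks = page_crawls.index[page_crawls['updated'].values].values
    num_observations = len(page_crawls)
    # Estimate the change rate from the actual observations for this page.
    estimated_rate = modified_mle(positive_checks, num_observations)
    # Re-estimate the rate on each of num_simulations simulated datasets.
    bootstrap_estimates = [
        simulate_change_rate_estimate(estimated_rate, num_observations, modified_mle)
        for _ in range(num_simulations)]
    plt.hist(bootstrap_estimates, bins=30)
    plt.axvline(estimated_rate, color='red')
    plt.xlabel("Estimated change rate")
    plt.title(page_url)
    plt.show()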
# Run this cell to make plots for several pages using your
# function. Since no automatic tests are provided for this
# question, we suggest examining the plots to make sure they
# make sense to you.
many_updates_url = crawl_stats[np.logical_and(
    crawl_stats['number of crawls'] >= 700,
    np.logical_and(
        .2 <= crawl_stats['proportion of updates'],
        crawl_stats['proportion of updates'] <= .8))].index[0]
plot_bootstrap_rate_estimates(many_updates_url, 1000)
few_updates_url = crawl_stats[np.logical_and(
    crawl_stats['number of crawls'] >= 700,
    np.logical_and(
        .05 <= crawl_stats['proportion of updates'],
        crawl_stats['proportion of updates'] <= .15))].index[0]
plot_bootstrap_rate_estimates(few_updates_url, 1000)
def estimate_rmse(page_url, num_simulations):
"""Simulates update observations for a website many times. For each
simulation, the page's change rate is estimated based on the simulated
observations. (The estimation method is the modified MLE.) Then this
function produces an estimate of the RMSE of estimates of the change
rate for this page.
When conducting each simulation, the change rate is taken to be the
estimated change rate for the page. *That* estimate is the modified
MLE computed from the actual observations for the page.
We compute the modified MLE for each set of simulated observations. That
constitutes num_simulations estimates of the change rate.
Then we compute the RMSE of those estimates. The "true" change rate in
that calculation is taken to be the modified MLE computed from the
actual observations for the page.
Args:
page_url (str): The URL of the page to simulate.
num_simulations (int): The number of simulations to run.
Returns:
float: The estimated RMSE of the modified MLE for the given page,
based on num_simulations simulations.
"""
...
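# A hedged sketch along the same lines as the bootstrap-plot sketch above
# (again assuming modified_mle and simulate_change_rate_estimate have been
# completed): the RMSE is taken around the rate used to drive the simulations.
def _estimate_rmse_sketch(page_url, num_simulations):
    page_crawls = crawls.loc[page_url]
    positive_checks = page_crawls.index[page_crawls['updated'].values].values
    num_observations = len(page_crawls)
    estimated_rate = modified_mle(positive_checks, num_observations)
    bootstrap_estimates = np.array([
        simulate_change_rate_estimate(estimated_rate, num_observations, modified_mle)
        for _ in range(num_simulations)])
    return np.sqrt(np.mean((bootstrap_estimates - estimated_rate) ** 2))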
print(estimate_rmse(many_updates_url, 1000))
print(estimate_rmse(few_updates_url, 1000))
# After completing estimate_rmse above, estimate the RMSEs for all
# the pages here, and add them to crawl_stats as a new column named
# 'rmse'.
...
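# A hedged sketch of one way to fill in the column (the choice of 100
# simulations per page is arbitrary, and looping over every URL this way may
# take a while):
# crawl_stats['rmse'] = [estimate_rmse(url, 100) for url in crawl_stats.index]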
_ = ok.grade('q20')
_ = ok.backup()
...
# Feel free to use this cell to experiment. Then write your answer
# in the string below.
q22_answer = r"""
Put your answer here, replacing this text.
"""
display(Markdown(q22_answer))
_ = ok.grade_all()
_ = ok.submit()