import pandas_redshift as pr
import pandas as pd
import os
# analysis timeframe
# data prior 01/16 contained a bug, so only counting 2 weeks of January
# and multiplying figures x2 when publishing to the website
start_date = '2021-02-01'
end_date = '2021-11-30'
pr.connect_to_redshift(dbname = os.environ['PG_DATABASE'], host = os.environ['PG_HOST'], port = 5432, user = os.environ['PG_USER'], password=os.environ['PG_PASSWORD'])
total_expiring_domains = pr.redshift_to_pandas(f"""
SELECT count(distinct(domain_id)) as count
FROM scan_results
WHERE type='ssl' and response->>'not_after' between '{start_date}' and '{end_date}'
""")['count'][0]
print(f'Domains with SSL to be renewed: {total_expiring_domains}')
# "expirations" are detected and added to the DB 1 week ahead of the actual SSL expiration time:
expirations = pr.redshift_to_pandas(f"""
SELECT state, count(*) AS "count"
FROM expirations
WHERE expired_at BETWEEN '{start_date}' and '{end_date}'
GROUP BY state
""")
# but there were domains where SSL was renewed more than a week in advance - adding them as well
renewed_in_advance = total_expiring_domains - expirations['count'].sum()
df = expirations.append(pd.DataFrame([["renewed_in_advance", renewed_in_advance]], columns=["state", "count"])).set_index("state")
plot = df.plot.pie(y='count', figsize=(8, 8), labeldistance=None)
# we only care about the domains where SSL expiration was noticed and fixed
fixed = df.loc['fixed']['count']
print("SSL expired then got fixed on {:} domains ({:.1%} of {:} domains)".format(fixed, fixed / total_expiring_domains, total_expiring_domains))
SELECT sum(duration_sec::double precision * domains.daily_uniques / 3600 / 24) :: bigint as "affected_users"
FROM expirations
LEFT JOIN domains ON domain_id = domains.id
WHERE expired_at BETWEEN '2021-02-01' and '2021-11-30'
data = pr.redshift_to_pandas(f"""
SELECT alexa_rank, (duration_sec / 3600) as hours
FROM expirations
JOIN domains ON domain_id = domains.id
WHERE state = 'fixed' and expired_at between '{start_date}' and '{end_date}'
""")
data.plot.scatter(x='hours', y='alexa_rank')
more_than_hour_percentage = data[data['hours'] > 1].count() / data.count()
print("Around {:.0%} websites were down for > 1 hour".format(more_than_hour_percentage[0]))
more_than_day_percentage = data[data['hours'] > 24].count() / data.count()
print("Around {:.0%} websites were down for > 24 hours".format(more_than_day_percentage[0]))
days = pr.redshift_to_pandas(f"""
SELECT day, count(1) as cnt
FROM(
SELECT CASE WHEN extract(isodow from expired_at) < 5 THEN 'Mon-Thu' ELSE 'Fri-Sun' END as day
FROM expirations
JOIN domains ON domain_id = domains.id
WHERE state = 'fixed' and expired_at between '{start_date}' and '{end_date}'
AND duration_sec > 10 * 3600
) days
GROUP BY day;
""")
days.set_index('day').plot.pie(y='cnt', figsize=(8, 8), labeldistance=None)
data['alexa_rank'].plot.hist(bins=25)
data['alexa_rank'].where(lambda r : r < 50000).dropna().plot.hist(bins=10)