import pandas as pd
from datetime import datetime, timedelta
since_date = (datetime.now().date() - timedelta(days=90)).isoformat()
SQL
Saved to variable
dfSELECT
m.*,
SUM(count) AS total_downloads,
ARRAY_AGG(STRUCT(p.version,
count AS downloads)
ORDER BY
count DESC
LIMIT
100) AS version_downloads
FROM (
SELECT
file.project AS name,
file.version AS version,
COUNT(1) AS count
FROM
`the-psf.pypi.file_downloads`
WHERE
timestamp >= {{ since_date }}
GROUP BY
name,
version ) p
LEFT JOIN (
SELECT
name,
ARRAY_AGG(STRUCT(version, summary, author, author_email, maintainer, maintainer_email, license, home_page, upload_time) ORDER BY upload_time DESC LIMIT 1)[OFFSET(0)].*
FROM
`the-psf.pypi.distribution_metadata`
GROUP BY
name) m
ON
p.name = LOWER(m.name)
WHERE
m.name IS NOT NULL
GROUP BY
m.name, m.version, m.summary, m.author, m.author_email, m.maintainer, m.maintainer_email, m.license, m.home_page, m.upload_time
ORDER BY
total_downloads DESC
bins = {
'top 100': df.iloc[100].total_downloads,
'top 1000': df.iloc[1000].total_downloads,
'top 1%': df.iloc[int(len(df) / 100)].total_downloads,
'top 2%': df.iloc[2 * int(len(df) / 100)].total_downloads,
'top 3%': df.iloc[3 * int(len(df) / 100)].total_downloads,
'top 10%': df.iloc[10 * int(len(df) / 100)].total_downloads,
'top 50%': df.iloc[50 * int(len(df) / 100)].total_downloads,
}
bins