import json
import os
from google.oauth2 import service_account
from google.cloud import bigquery
# Build BigQuery credentials from the service-account JSON blob stored in the
# environment, then open a client pinned to that account's project.
service_account_info = json.loads(os.environ['BIGQUERY_DEEPNOTE_SERVICE_ACCOUNT'])
bq_credentials = service_account.Credentials.from_service_account_info(service_account_info)
bq_client = bigquery.Client(credentials=bq_credentials, project=bq_credentials.project_id)
# Build a (package, file_id, rating) table by joining distinct PyPI package
# names against packages parsed out of GitHub requirements.txt files:
# rating = 1 means that requirements.txt file mentions a real PyPI package.
# NOTE: `the-psf.pypi.distribution_metadata` must be backtick-quoted — the
# project ID contains a hyphen, which standard SQL rejects in an unquoted
# identifier (the other table references below already use backticks).
query = """
SELECT
package,
file_id,
ANY_VALUE(rating) AS rating
FROM (
SELECT
name
FROM
`the-psf.pypi.distribution_metadata`
GROUP BY
name) m
LEFT JOIN (
SELECT
package,
file_id,
1 AS rating
FROM (
SELECT
f.id AS file_id,
ARRAY(
SELECT
SPLIT(SPLIT(row_, '==')[
OFFSET
(0)], '>=')[
OFFSET
(0)]
FROM
UNNEST(SPLIT( c.content, '\n')) AS row_
WHERE
row_ NOT LIKE "#%"
GROUP BY
row_) AS requirements
FROM (
SELECT
id,
ANY_VALUE(path) AS path
FROM
`bigquery-public-data.github_repos.files`
GROUP BY
id) f
LEFT JOIN
`bigquery-public-data.github_repos.contents` c
ON
f.id = c.id
AND f.path LIKE "%requirements.txt"
AND c.content IS NOT NULL),
UNNEST(requirements) AS package) nested
ON
m.name = nested.package
WHERE
package IS NOT NULL
GROUP BY
package,
file_id
"""
query_job = bq_client.query(query)
df = query_job.to_dataframe()
df
# This is kind of what we're going for:
# >>> df = df.pivot(index='file_id', columns='package', values='rating')
# but it creates a huge matrix full of zeroes,
# so we opt for a sparse representation
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

# A dense pivot (file_id x package) would be a huge matrix of mostly zeroes,
# so encode both axes as categoricals and build a sparse CSR matrix instead:
# rows are packages, columns are files, values are the implicit ratings.
file_c = CategoricalDtype(sorted(df.file_id.unique()), ordered=False)
package_c = CategoricalDtype(sorted(df.package.unique()), ordered=False)

# Category codes double as the sparse matrix coordinates.
row = df.package.astype(package_c).cat.codes
col = df.file_id.astype(file_c).cat.codes
matrix_shape = (package_c.categories.size, file_c.categories.size)
sparse_matrix = csr_matrix((df["rating"], (row, col)), shape=matrix_shape)
sparse_matrix
import implicit
# Implicit-feedback ALS: learn 50-dimensional latent factors from the sparse
# package-by-file matrix built above (rating 1 = "this file requires package").
model = implicit.als.AlternatingLeastSquares(factors=50)
model.fit(sparse_matrix)
WARNING:root:OpenBLAS detected. Its highly recommend to set the environment variable 'export OPENBLAS_NUM_THREADS=1' to disable its internal multithreading
import pandas as pd
def similar_items(package_name, n_items=20):
    """Return the names of the n_items packages most similar to package_name.

    Looks up the package's row index in the categorical axis, asks the ALS
    model for its nearest neighbours, and maps the item indices back to names.
    """
    index = package_c.categories.get_loc(package_name)
    # model.similar_items is assumed to yield (item_index, score) pairs (older
    # implicit API — TODO confirm for the installed version). Building a
    # DataFrame from those pairs coerces the integer indices to float64
    # (mixed-dtype column), and a per-row .apply() is slow; map the indices
    # to package names directly instead.
    return [package_c.categories[item_index]
            for item_index, _score in model.similar_items(index, n_items)]
import json

# Dump one JSON object per line ("package" plus its recommended "items"),
# logging progress every 1000 packages.
with open('recommendations.json', 'w') as f:
    for i, package in enumerate(package_c.categories):
        record = {"package": package, "items": similar_items(package)}
        f.write(json.dumps(record) + "\n")
        if i % 1000 == 0:
            print(f"On {i}th row")
On 0th row
On 1000th row
On 2000th row
On 3000th row
On 4000th row
On 5000th row
On 6000th row
On 7000th row
On 8000th row
On 9000th row
On 10000th row
On 11000th row
On 12000th row
On 13000th row
On 14000th row
On 15000th row
On 16000th row
On 17000th row
On 18000th row
On 19000th row
# Fetch one summary per PyPI package name (ANY_VALUE picks an arbitrary
# summary among the package's releases).
query = """
SELECT
name,
ANY_VALUE(summary) AS summary
FROM
`the-psf.pypi.distribution_metadata`
GROUP BY
name
"""
query_job = bq_client.query(query)
summaries = query_job.to_dataframe()
# Map package name -> {'summary': ...} for O(1) lookups during the upload.
summaries_dict = summaries.set_index('name').to_dict('index')
from google.cloud import firestore
from google.api_core.exceptions import NotFound
# I'm going to sneakily re-use the BQ credentials, because I can
# (same service account, same project — it just needs Firestore access too).
firestore_client = firestore.Client(credentials=bq_credentials, project=bq_credentials.project_id)
# One document per package will live under this collection.
collection = firestore_client.collection("python_packages_v2")
# Stream the JSONL recommendations back in and upsert one Firestore document
# per package, attaching each recommended package's summary.
with open('recommendations.json', 'r') as f:
    for i, line in enumerate(f):
        recos = json.loads(line.strip())
        firestore_id = recos["package"]
        # Unknown packages get an empty summary rather than a KeyError.
        recos_with_summaries = [{
            "name": package,
            "summary": summaries_dict.get(package, {}).get('summary', '')}
            for package in recos["items"]
        ]
        data = {"recommendations": recos_with_summaries}
        # set(..., merge=True) upserts in a single round trip: it creates the
        # document when missing and merges the field when it exists, replacing
        # the racy two-call update()/except-NotFound/set() pattern. The
        # top-level array field is replaced wholesale either way.
        collection.document(firestore_id).set(data, merge=True)
        if i % 1000 == 0:
            print(f"On {i}th row")
On 0th row
On 1000th row
On 2000th row
On 3000th row
On 4000th row
On 5000th row
On 6000th row
On 7000th row
On 8000th row
On 9000th row
On 10000th row
On 11000th row
On 12000th row
On 13000th row
On 14000th row
On 15000th row
On 16000th row
On 17000th row
On 18000th row
On 19000th row