import os
import requests
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# we'll need to rate limit our requests when working with Notion
from ratelimiter import RateLimiter
# load the environment variables into global variables
INSIGHTS_ID = os.environ["INSIGHTS_ID"]
NOTION_KEY = os.environ["NOTION_KEY"]
# classes for interacting with notion API
class Client:
    def __init__(self, key):
        self.request = requests.Session()
        self.request.headers.update({
            "Authorization": f"Bearer {key}",
            "Notion-Version": "2021-08-16",
            "Content-Type": "application/json"
        })
        self.databases = Databases(self)
        self.pages = Pages(self)
class Databases:
    def __init__(self, client):
        self.client = client

    def query(self, database_id):
        response = self.client.request.post(f"https://api.notion.com/v1/databases/{database_id}/query")
        return response.json()
class Pages:
    def __init__(self, client):
        self.client = client

    def query_content(self, block_id, cursor=None):
        url = f"https://api.notion.com/v1/blocks/{block_id}/children?page_size=100"
        if cursor:
            url = f"{url}&start_cursor={cursor}"
        response = self.client.request.get(url)
        return response.json()
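# note: Databases.query above only returns the first page of results
# (Notion caps each response at 100 items). if the insights database grows
# past that, a paginated variant along these lines would be needed -- a
# minimal sketch (query_all_pages is a hypothetical helper), relying on the
# has_more / next_cursor fields the query endpoint returns
def query_all_pages(client, database_id):
    results, cursor = [], None
    while True:
        body = {"start_cursor": cursor} if cursor else {}
        response = client.request.post(
            f"https://api.notion.com/v1/databases/{database_id}/query",
            json=body
        )
        data = response.json()
        results.extend(data["results"])
        if not data["has_more"]:
            return results
        cursor = data["next_cursor"]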
client = Client(NOTION_KEY)
# fetch database pages
response = client.databases.query(INSIGHTS_ID)
pages = response["results"]
# fetch page content
# Notion's public API allows roughly three requests per second on average,
# so throttle the recursive fetch accordingly
@RateLimiter(max_calls=3, period=1)
def query_content_recursively(block_id, blocks, cursor=None):
    response = client.pages.query_content(block_id, cursor)
    for block in response["results"]:
        if block["has_children"]:
            # keep a parent block and its children together as a nested list
            blocks.append([block, query_content_recursively(block["id"], [])])
        else:
            blocks.append(block)
    # follow the cursor until Notion reports no further pages of children
    if response["next_cursor"]:
        return query_content_recursively(block_id, blocks, response["next_cursor"])
    return blocks
items = []
for page in pages:
items.append({
"page": page,
"blocks": query_content_recursively(page["id"], [])
})
# extract data from page content
def extract_title(page):
    return [t["plain_text"] for t in page["properties"]["Title"]["title"]]
def extract_text(blocks, output):
    for block in blocks:
        if isinstance(block, list):
            extract_text(block, output)
        elif block["type"] in ["heading_1", "heading_2", "heading_3", "paragraph", "callout", "quote", "bulleted_list_item", "numbered_list_item", "to_do", "toggle"]:
            for text in block[block["type"]]["text"]:
                output.append(text["plain_text"])
    return output
def extract_relations(page):
    relations = set()
    for prop in page["properties"].values():
        if prop["type"] == "relation":
            for relation in prop["relation"]:
                relations.add(relation["id"])
    return relations
def extract_mentions(blocks, mentions):
    for block in blocks:
        if isinstance(block, list):
            extract_mentions(block, mentions)
        elif block["type"] in ["heading_1", "heading_2", "heading_3", "paragraph", "callout", "quote", "bulleted_list_item", "numbered_list_item", "to_do", "toggle", "code"]:
            for text in block[block["type"]]["text"]:
                if text["type"] == "mention" and text["mention"]["type"] == "page":
                    mentions.add(text["mention"]["page"]["id"])
        elif block["type"] == "link_to_page":
            if block["link_to_page"]["type"] == "page_id":
                mentions.add(block["link_to_page"]["page_id"])
    return mentions
def extract_data(item):
    title = "".join(extract_title(item["page"]))
    text = " ".join(extract_text(item["blocks"], []))
    relations = extract_relations(item["page"])
    mentions = extract_mentions(item["blocks"], set())
    return {
        "title": title,
        "text": text,
        "links": list(relations.union(mentions))
    }
for item in items:
    data = extract_data(item)
    item["title"] = data["title"]
    item["text"] = data["text"]
    item["links"] = data["links"]
for item in items:
    item["merged_text"] = f'{item["title"]} {item["text"]}'
# convert extracted text content to a dataframe
df = pd.DataFrame([re.sub("(🔵|⚪️)", "", item["merged_text"]) for item in items], columns=["Text"])
# create cv, ignoring English stop words and terms that appear in more than 90% of pages
cv = CountVectorizer(max_df=0.9, stop_words="english", ngram_range=(1, 2))
# convert our text into a document-term matrix
dtm = cv.fit_transform(df["Text"])
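# quick shape check: dtm is a sparse (documents x terms) count matrix
dtm.shape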
# create lda object with appropriate config
lda = LatentDirichletAllocation(n_components=10, random_state=1)
# train our model
lda.fit(dtm)
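# optional sanity check (a sketch, not part of the original flow): sklearn
# exposes perplexity on the fitted model; lower is better, and it's one
# rough way to compare runs with different n_components values
print(lda.perplexity(dtm))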
# inspect the vocabulary the vectorizer learned
# (get_feature_names was deprecated in sklearn 1.0; use get_feature_names_out)
len(cv.get_feature_names_out())
cv.get_feature_names_out()
# topics x words: one row per topic, one column per vocabulary term
lda.components_.shape
for index, topic in enumerate(lda.components_):
    print(f"\nTop 5 words for Topic {index}")
    print([cv.get_feature_names_out()[i] for i in topic.argsort()[-5:]])
Top 5 words for Topic 0
['documents', 'topics', 'systems', 'intelligence', 'words']
Top 5 words for Topic 1
['new', 'create', 'component', 'information', 'actions']
Top 5 words for Topic 2
['set', 'vec', 'note', 'notes', 'number']
Top 5 words for Topic 3
['content', 'information', 'models', 'visual', 'concept']
Top 5 words for Topic 4
['words', 'useful', 'knowledge', 'content', 'information']
Top 5 words for Topic 5
['item', 'create', 'digests', 'good', 'items']
Top 5 words for Topic 6
['process', 'think', 'observation', 'change', 'notes']
Top 5 words for Topic 7
['character', 'number line', 'line', 'based', 'number']
Top 5 words for Topic 8
['commit', 'library', 'value', 'error', 'identity']
Top 5 words for Topic 9
['ai', 'length', 'actions', 'vector', 'need']
# apply lda to our notion pages
page_topics = lda.transform(dtm)
# create a new column for our topics
df["Topic"] = page_topics.argmax(axis=1)
# preview the dataframe
df.head()
   Text (object)                                                                                         Topic (int64)
0  The difference between two numbers is their distance away from each other Jarvis: Akshay: 1. Num…                7
1  Opposite numbers are those which are the same distance away from zero The easiest way to visualize…              0
2  Number Line s with Negative Numbers Jarvis: Negative Number Line s exist on a Number Line , but t…               7
3  Identity numbers don’t change the identity of the original value Identity Number s are those that…               8
4  All numbers are curried Jarvis: In Haskell and other Functional languages, functions always abstr…               3
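# the dataframe rows line up one-to-one with the items list, so (a rough
# sketch) we can group the original page titles by their assigned topic
for topic in sorted(df["Topic"].unique()):
    titles = [items[i]["title"] for i in df.index[df["Topic"] == topic]]
    print(f"Topic {topic}: {titles[:3]}")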