# spacy's language model has the word embeddings we'll need
!python -m spacy download en_core_web_lg
import os
import spacy
import requests
import re
import numpy as np
# Notion caps integrations at an average of 3 requests per second, so we'll rate limit our requests
from ratelimiter import RateLimiter
from sklearn.feature_extraction.text import CountVectorizer
# load the environment variables into global variables
INSIGHTS_ID = os.environ["INSIGHTS_ID"]
NOTION_KEY = os.environ["NOTION_KEY"]
# classes for interacting with the Notion API
class Client:
    def __init__(self, key):
        self.request = requests.Session()
        self.request.headers.update({
            "Authorization": f"Bearer {key}",
            "Notion-Version": "2021-08-16",
            "Content-Type": "application/json"
        })
        self.databases = Databases(self)
        self.pages = Pages(self)

class Databases:
    def __init__(self, client):
        self.client = client

    def query(self, database_id):
        # note: this fetches only the first page of results (Notion returns up to 100 per request)
        response = self.client.request.post(f"https://api.notion.com/v1/databases/{database_id}/query")
        return response.json()

class Pages:
    def __init__(self, client):
        self.client = client

    def query_content(self, block_id, cursor=None):
        url = f"https://api.notion.com/v1/blocks/{block_id}/children?page_size=100"
        if cursor:
            url = f"{url}&start_cursor={cursor}"
        response = self.client.request.get(url)
        return response.json()
client = Client(NOTION_KEY)
# fetch database pages
response = client.databases.query(INSIGHTS_ID)
pages = response["results"]
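# for reference, the query response is shaped roughly like this (abridged):
# {"object": "list", "results": [...], "next_cursor": None, "has_more": False}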
# fetch page content
@RateLimiter(max_calls=3, period=1)
def query_content_recursively(block_id, blocks, cursor=None):
    response = client.pages.query_content(block_id, cursor)
    for block in response["results"]:
        if block["has_children"]:
            blocks.append([block, query_content_recursively(block["id"], [])])
        else:
            blocks.append(block)
    if response["next_cursor"]:
        return query_content_recursively(block_id, blocks, response["next_cursor"])
    return blocks
items = []
for page in pages:
    items.append({
        "page": page,
        "blocks": query_content_recursively(page["id"], [])
    })
# extract data from page content
def extract_title(page):
    return [t["plain_text"] for t in page["properties"]["Title"]["title"]]

def extract_text(blocks, output):
    for block in blocks:
        if isinstance(block, list):
            extract_text(block, output)
        elif block["type"] in ["heading_1", "heading_2", "heading_3", "paragraph", "callout", "quote", "bulleted_list_item", "numbered_list_item", "to_do", "toggle"]:
            for text in block[block["type"]]["text"]:
                output.append(text["plain_text"])
    return output

def extract_relations(page):
    relations = set()
    for prop in page["properties"].values():
        if prop["type"] == "relation":
            for relation in prop["relation"]:
                relations.add(relation["id"])
    return relations

def extract_mentions(blocks, mentions):
    for block in blocks:
        if isinstance(block, list):
            extract_mentions(block, mentions)
        elif block["type"] in ["heading_1", "heading_2", "heading_3", "paragraph", "callout", "quote", "bulleted_list_item", "numbered_list_item", "to_do", "toggle", "code"]:
            for text in block[block["type"]]["text"]:
                if text["type"] == "mention" and text["mention"]["type"] == "page":
                    mentions.add(text["mention"]["page"]["id"])
        elif block["type"] == "link_to_page":
            if block["link_to_page"]["type"] == "page_id":
                mentions.add(block["link_to_page"]["page_id"])
    return mentions

def extract_data(item):
    title = "".join(extract_title(item["page"]))
    text = " ".join(extract_text(item["blocks"], []))
    relations = extract_relations(item["page"])
    mentions = extract_mentions(item["blocks"], set())
    return {
        "title": title,
        "text": text,
        "links": list(relations.union(mentions))
    }
for item in items:
    data = extract_data(item)
    item["title"] = data["title"]
    item["text"] = data["text"]
    item["links"] = data["links"]
def extract_keywords(item):
    # create a CountVectorizer with the necessary config
    cv = CountVectorizer(ngram_range=(1, 2), stop_words="english")
    # create a vocabulary (document-term matrix) from our example item
    cv.fit([item["title"]])
    # list the candidate keywords (get_feature_names() was removed in scikit-learn 1.2)
    return cv.get_feature_names_out()
# extract candidate keywords for the first item
all_candidates = extract_keywords(items[0])
all_candidates[:5]
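# a quick sanity check with a toy title (hypothetical input, not from the
# database): stop words are removed first, then unigrams and bigrams are
# built from what remains
toy_cv = CountVectorizer(ngram_range=(1, 2), stop_words="english")
toy_cv.fit(["the difference between two numbers"])
list(toy_cv.get_feature_names_out())  # ['difference', 'difference numbers', 'numbers']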
# load spacy
nlp = spacy.load("en_core_web_lg")
def get_nouns(item):
    # create a spacy doc from our example title
    doc = nlp(item["title"])
    # extract noun phrases from that document
    noun_phrases = set(chunk.text.strip().lower() for chunk in doc.noun_chunks)
    # extract the individual nouns as well
    nouns = set()
    for token in doc:
        if token.pos_ == "NOUN":
            nouns.add(token.text)
    # return the union of the nouns and the noun phrases
    return nouns.union(noun_phrases)
all_nouns = get_nouns(items[0])
all_nouns
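# a toy illustration (hypothetical sentence; exact chunks can vary by model
# version): noun_chunks yields base noun phrases, pos_ tags the nouns
toy_doc = nlp("the difference between two numbers")
[chunk.text for chunk in toy_doc.noun_chunks]  # e.g. ['the difference', 'two numbers']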
def find_candidates(all_candidates, all_nouns):
    # keep only the candidate keywords that are nouns or noun phrases
    return list(filter(lambda candidate: candidate in all_nouns, all_candidates))
candidates = find_candidates(all_candidates, all_nouns)
candidates[:10]
def create_document_embedding(item):
    # create a spacy document from the title and body text
    doc = nlp(f"{item['title']} {item['text']}")
    # extract the vector information; for en_core_web_lg, doc.vector is the
    # average of the token vectors
    return {
        "vector": doc.vector,
        "magnitude": doc.vector_norm
    }
document_embedding = create_document_embedding(items[0])
def create_word_embeddings(candidates):
    embeddings = []
    # create embeddings for each of the candidate keywords
    for keyword in candidates:
        doc = nlp(keyword)
        embeddings.append({
            "keyword": keyword,
            "magnitude": doc.vector_norm,
            "vector": doc.vector
        })
    return embeddings
word_embeddings = create_word_embeddings(candidates)
def cosine_similarity(a, b):
    # the dot product divided by the product of the magnitudes (https://en.wikipedia.org/wiki/Cosine_similarity)
    return np.dot(a["vector"], b["vector"]) / (a["magnitude"] * b["magnitude"])
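# a toy check (hypothetical vectors, not real embeddings): parallel vectors
# score 1.0, orthogonal vectors score 0.0
a = {"vector": np.array([1.0, 0.0]), "magnitude": 1.0}
b = {"vector": np.array([0.0, 1.0]), "magnitude": 1.0}
cosine_similarity(a, a), cosine_similarity(a, b)  # (1.0, 0.0)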
def select_best_keyword(document_embedding, word_embeddings):
    similarities = []
    # compute each keyword's similarity to the document
    for item in word_embeddings:
        similarities.append([item, cosine_similarity(document_embedding, item)])
    # sort by similarity, descending, and return the top keyword
    return sorted(similarities, key=lambda item: -item[1])[0][0]["keyword"]
keyword = select_best_keyword(document_embedding, word_embeddings)
keyword
def replace_title_keyword(keyword, item):
    # escape the keyword so regex metacharacters can't break the substitution
    return re.sub(re.escape(keyword), "_" * len(keyword), item["title"].lower())
print(replace_title_keyword(keyword, items[0]))
🔵 the __________ between two numbers is their distance away from each other
def obfuscate_node_title(item):
    all_candidates = extract_keywords(item)
    all_nouns = get_nouns(item)
    candidates = find_candidates(all_candidates, all_nouns)
    document_embedding = create_document_embedding(item)
    word_embeddings = create_word_embeddings(candidates)
    keyword = select_best_keyword(document_embedding, word_embeddings)
    return replace_title_keyword(keyword, item)
for item in items[:10]:
    title = obfuscate_node_title(item)
    print(title)
🔵 the __________ between two numbers is their distance away from each other
🔵 ________________ are those which are the same distance away from zero
🔵 ___________ s with negative numbers
🔵 ________________ don’t change the identity of the original value
🔵 all _______ are curried
🔵 number ____ follow an explicit hierarchy
🔵 claim-based ________ can improve the effectiveness and efficiency of knowledge synthesis
🔵 knowledge synthesis involves tight interaction between ________, synthesis, observation, and contextual notes
🔵 mindful notes provide the foundation for atomic note-______ techniques
🔵 ___________________ fails when notes aren’t adequately detailed or rigorously linked