# Construction 1: build a blank Spanish tokenizer and load the text corpus.
from spacy.tokenizer import Tokenizer
from spacy.lang.es import Spanish

nlp = Spanish()
# Create a blank Tokenizer backed only by the Spanish vocab
# (no pipeline components, no statistical model).
tokenizer = Tokenizer(nlp.vocab)

import os

path = 'txt/'
data = []    # raw text of each corpus file
clases = []  # class label per file; every file here is tagged 'UTH'

# Read the spam data: one document per file in `path`.
# latin-1 decodes any byte sequence, so it never raises UnicodeDecodeError.
for file in os.listdir(path):
    with open(path + file, encoding='latin-1') as f:
        data.append(f.read())
    clases.append('UTH')

len(data)
# Was `data[]` — a SyntaxError; inspect the first document instead.
data[0]
# Tokenize the first document and inspect the resulting Doc.
tokens = tokenizer(data[0])
len(tokens)

# Print every token (the loop body was missing its indent — IndentationError).
for token in tokens:
    print(token)

tokens[0].text

# A Span is a slice of the Doc; `.text` joins its tokens back into a string.
span = tokens[1:3]
span
span.text

# Surface form of every token in the document.
[t.text for t in tokens]
# Compare two short documents with Doc.similarity.
# NOTE(review): `nlp` is a blank Spanish pipeline, so it has no word
# vectors — spaCy will warn and the score is not meaningful; confirm
# whether a model with vectors (e.g. es_core_news_md) was intended.
apples = nlp("I like apples")
oranges = nlp("I like oranges")

# Similarity should be symmetric: both directions give the same score.
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)

for score in (apples_oranges, oranges_apples):
    print(score)
print(apples_oranges == oranges_apples)