import nltk
from nltk.corpus import senseval
nltk.download("senseval")
[nltk_data] Downloading package senseval to /root/nltk_data...
[nltk_data] Unzipping corpora/senseval.zip.
senseval.fileids()
senseval.instances('hard.pos')
from nltk.corpus import wordnet
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Unzipping corpora/wordnet.zip.
def senses(word):
    """
    Take a target word from senseval-2 and return the list of senses
    attested for that word in the corpus.
    """
    return list({instance.senses[0] for instance in senseval.instances(word)})
hardSenses = senses('hard.pos')
print(hardSenses)
['HARD2', 'HARD1', 'HARD3']
serveSenses = senses('serve.pos')
print(serveSenses)
['SERVE12', 'SERVE2', 'SERVE6', 'SERVE10']
def sense_instances(instances, sense):
    """
    Return the instances in `instances` whose (first) sense is `sense`.
    """
    return [instance for instance in instances if instance.senses[0] == sense]
nltk.FreqDist([i.senses[0] for i in senseval.instances('hard.pos')])
nltk.FreqDist([i.senses[0] for i in senseval.instances('serve.pos')])
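As a quick sanity check, the per-sense counts from sense_instances should line up with the FreqDist above; a minimal sketch (the variable name hard_instances is illustrative):
# Count instances per sense; these should match the FreqDist counts
hard_instances = senseval.instances('hard.pos')
for sense in senses('hard.pos'):
    print(sense, len(sense_instances(hard_instances, sense)))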
def tokens(word):
    """
    Take a target word from senseval-2 and return the list of POS tags
    observed for that word across all of its instances.
    """
    tags = set()
    for instance in senseval.instances(word):
        for element in instance.context:
            # context elements are (token, tag) pairs
            if element[0] == word.split(".")[0]:
                tags.add(element[1])
    return list(tags)
tokens('hard.pos')
tokens('serve.pos')
def check_format(word):
    """
    Return the instances of `word` whose attributes deviate from the
    expected layout ('word', 'position', 'context', 'senses').
    """
    expected = {'senses', 'context', 'position', 'word'}
    return [instance for instance in senseval.instances(word)
            if set(instance.__dict__.keys()) != expected]
print(check_format('hard.pos'))
print(check_format('serve.pos'))
[]
[]
instances_hard = senseval.instances('hard.pos')
inst_hard_1 = instances_hard[1]
inst_hard_1.context
from nltk.corpus import stopwords
import string
nltk.download("stopwords")
OTHER_WORDS = ["''", "'d", "'ll", "'m", "'re", "'s", "'t", "'ve", '--', '000', '1', '2', '3', '4', '5', '6', '8', '10', '15', '30', 'I', 'F', '``', 'also', "don'", 'n', 'one', 'said', 'say', 'says', 'u', 'us', 'hard', 'harder']
STOPWORDS_SET = set(stopwords.words('english')).union(set(string.punctuation), set(OTHER_WORDS))
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
def extract_vocab(instances, stopwords=STOPWORDS_SET, m=250):
    """
    Given a list of senseval instances, return the m most frequent words
    that appear in their contexts (i.e., the sentences containing the
    target word), in descending order of frequency.
    """
    return [w for w, f in extract_vocab_frequency(instances, stopwords, m)]
def extract_vocab_frequency(instances, stopwords=STOPWORDS_SET, m=250):
    """
    Given a list of senseval instances, return the m most frequent
    non-stopword words that appear in their contexts, as (word, count)
    pairs in descending order of total occurrences across all contexts.
    """
    word_dict_counter = {}
    for instance in instances:
        for word in instance.context:
            word_dict_counter[word[0]] = word_dict_counter.get(word[0], 0) + 1
    return [
        (k, v)
        for k, v in sorted(word_dict_counter.items(), key=lambda item: item[1], reverse=True)
        if k not in stopwords
    ][:m]
vocab_6 = extract_vocab_frequency(instances_hard, STOPWORDS_SET, m=6)
vocab_6
def wsd_caracteristicas_palabras_vecinas(instance, vocab, dist=2):
    """
    Build a featureset with one boolean key per vocabulary word: True if
    the word occurs anywhere in the instance's context, False otherwise.
    `dist` is unused; it is kept so all feature extractors share a signature.
    """
    context_words = [word_context[0] for word_context in instance.context]
    return {f"contains('{word}')": word in context_words for word in vocab}
wsd_caracteristicas_palabras_vecinas(inst_hard_1, vocab_6)
inst_hard_2737 = instances_hard[2737]
inst_hard_2737.context
def wsd_caracteristicas_colocacion(instance, vocab, dist=2):
    """
    Like wsd_caracteristicas_palabras_vecinas, but only look at a window
    of `dist` words on either side of the target word.
    """
    pos = instance.position  # index of the target word in the context
    start = max(0, pos - dist)
    end = pos + dist + 1  # slicing clamps at the end of the context
    window_words = [word_context[0] for word_context in instance.context[start:end]]
    return {f"contains('{word}')": word in window_words for word in vocab}
wsd_caracteristicas_colocacion(inst_hard_2737, vocab_6, 2)
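The start/end arithmetic is easiest to see on a toy context; a small illustration with made-up tokens (not corpus data):
# Toy example of the +/- dist window around position pos
toy_context = ['the', 'test', 'was', 'hard', 'for', 'everyone']
pos, dist = 3, 2
start, end = max(0, pos - dist), pos + dist + 1
print(toy_context[start:end])  # ['test', 'was', 'hard', 'for', 'everyone']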
import numpy as np
from nltk.classify import accuracy, NaiveBayesClassifier
from nltk import ConfusionMatrix
def wsd_clasificador(word, features, stopwords_list=STOPWORDS_SET, number=250, distance=2, errors=False, confusion_matrix=False):
    """
    This function takes as arguments:
        a target word from senseval-2;
        a feature extractor (wsd_caracteristicas_palabras_vecinas or wsd_caracteristicas_colocacion);
        a list of stopwords;
        a number (defaults to 250), which determines the number of most
            frequent context words wsd_caracteristicas_palabras_vecinas uses
            to classify examples;
        a distance (defaults to 2), which determines the size of the window
            for wsd_caracteristicas_colocacion;
        errors (defaults to False), which if set to True prints the errors;
        confusion_matrix (defaults to False), which if set to True prints a confusion matrix.

    Calling this function splits the senseval data for the word into a
    training set and a test set. The split is the same on every call because
    the argument to np.random.seed is fixed; removing it would make the
    training and test sets different each time a classifier is built. It then
    trains a Naive Bayes classifier on the training set to perform WSD on the
    word, using the given feature extractor (with number or distance where
    relevant), tests it on the test set, and prints its accuracy on that set.

    If errors==True, the classifier's errors over the test set are printed.
    For each error four things are recorded: (i) the example number within
    the test data (the index of the example in the test list); (ii) the
    sentence the target word appeared in, with the target upper-cased;
    (iii) the (incorrect) predicted label; and (iv) the gold label.

    If confusion_matrix==True, a confusion matrix is printed in which each
    cell [i, j] indicates how often label j was predicted when the correct
    label was i (so the diagonal entries are the correct predictions).
    """
    instances = senseval.instances(word)
    vocab = extract_vocab(instances, stopwords=stopwords_list, m=number)
    # Keep the instance alongside its featureset so errors can be reported later
    data = [(features(instance, vocab, distance), instance.senses[0], instance)
            for instance in instances]
    # Split into train and test set (fixed seed => reproducible split)
    np.random.seed(3000)
    np.random.shuffle(data)
    split = int(0.8 * len(data))
    train = [(featureset, label) for (featureset, label, _) in data[:split]]
    test = data[split:]
    # Train classifier
    classifier = NaiveBayesClassifier.train(train)
    # Test classifier
    acc = accuracy(classifier, [(featureset, label) for (featureset, label, _) in test])
    print(f'Accuracy: {acc}')
    if errors:
        error_list = []
        for item_number, (featureset, label, instance) in enumerate(test):
            guess = classifier.classify(featureset)
            if guess != label:
                word_list = [w for (w, tag) in instance.context]
                # Upper-case the target word so it stands out in the sentence
                word_list[instance.position] = word_list[instance.position].upper()
                sentence = ' '.join(word_list)
                error_list.append((item_number, sentence, guess, label))
        print(f'Errors: {len(error_list)}')
        for n, error in enumerate(error_list, start=1):
            print(n, *error)
    if confusion_matrix:
        good = [label for (featureset, label, _) in test]
        derived = [classifier.classify(featureset) for (featureset, _, _) in test]
        print('Confusion matrix:')
        print(ConfusionMatrix(good, derived))
    return classifier
clasificador_hard_vecinas = wsd_clasificador('hard.pos', wsd_caracteristicas_palabras_vecinas, number=250, errors=False, confusion_matrix=True)
Accuracy: 0.8650519031141869
Confusion matrix:
| H H H |
| A A A |
| R R R |
| D D D |
| 1 2 3 |
------+-------------+
HARD1 |<677> 16 8 |
HARD2 | 41 <51> 1 |
HARD3 | 49 2 <22>|
------+-------------+
(row = reference; col = test)
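Per-sense precision and recall can be read directly off the matrix; for example, for HARD2 (counts taken from the matrix above):
# HARD2 column: 16 + 51 + 2 predictions; HARD2 row: 41 + 51 + 1 gold cases
tp = 51        # predicted HARD2 and gold HARD2
fp = 16 + 2    # predicted HARD2 but gold HARD1/HARD3
fn = 41 + 1    # gold HARD2 but predicted HARD1/HARD3
print(f'precision: {tp / (tp + fp):.3f}')  # 0.739
print(f'recall:    {tp / (tp + fn):.3f}')  # 0.548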
clasificador_hard_colocacion = wsd_clasificador('hard.pos', wsd_caracteristicas_colocacion, distance=2, errors=False, confusion_matrix=True)
Accuracy: 0.8765859284890427
Confusion matrix:
| H H H |
| A A A |
| R R R |
| D D D |
| 1 2 3 |
------+-------------+
HARD1 |<682> 10 9 |
HARD2 | 33 <58> 2 |
HARD3 | 52 1 <20>|
------+-------------+
(row = reference; col = test)
clasificador_hard_colocacion = wsd_clasificador('hard.pos', wsd_caracteristicas_colocacion, distance=2, errors=True, confusion_matrix=True)
Accuracy: 0.8765859284890427
clasificador_serve_vecinas = wsd_clasificador('serve.pos', wsd_caracteristicas_palabras_vecinas, number=250, confusion_matrix=True)
Accuracy: 0.7454337899543378
Confusion matrix:
| S S |
| E E S S |
| R R E E |
| V V R R |
| E E V V |
| 1 1 E E |
| 0 2 2 6 |
--------+-----------------+
SERVE10 |<300> 7 48 3 |
SERVE12 | 10<192> 51 10 |
SERVE2 | 28 15<113> 14 |
SERVE6 | 5 14 18 <48>|
--------+-----------------+
(row = reference; col = test)
clasificador_serve_colocacion = wsd_clasificador('serve.pos', wsd_caracteristicas_colocacion, distance=2, errors=False, confusion_matrix=True)
Accuracy: 0.571917808219178
Confusion matrix:
| S S |
| E E S S |
| R R E E |
| V V R R |
| E E V V |
| 1 1 E E |
| 0 2 2 6 |
--------+-----------------+
SERVE10 |<273> 70 9 6 |
SERVE12 | 92<162> 6 3 |
SERVE2 | 88 37 <40> 5 |
SERVE6 | 35 21 3 <26>|
--------+-----------------+
(row = reference; col = test)
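Collecting the four accuracies reported above for a side-by-side comparison (values copied from the runs, not recomputed):
results = {
    ('hard.pos', 'vecinas'): 0.8650519031141869,
    ('hard.pos', 'colocacion'): 0.8765859284890427,
    ('serve.pos', 'vecinas'): 0.7454337899543378,
    ('serve.pos', 'colocacion'): 0.571917808219178,
}
for (word, feats), acc in sorted(results.items()):
    print(f'{word:10} {feats:12} {acc:.3f}')
The +/- 2 collocation window helps slightly for 'hard' but hurts badly for 'serve', which suggests the senses of 'serve' depend on context words well outside a two-word window.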