NLP Assignment02
Question 1
Q1.1
import pandas as pd
import numpy as np
%pip install openpyxl
Requirement already satisfied: openpyxl in /usr/local/lib/python3.7/site-packages (3.0.7)
Requirement already satisfied: et-xmlfile in /usr/local/lib/python3.7/site-packages (from openpyxl) (1.0.1)
Note: you may need to restart the kernel to use updated packages.
df_e = pd.read_excel("Diabetes-classification.xlsx", sheet_name='Emotions')
df_pj = pd.read_excel("Diabetes-classification.xlsx", sheet_name='Patient-journey')
print('Patient journey:',df_pj.columns, '\nEmotion:',df_e.columns)
Patient journey: Index(['No', 'discussion_text', 'Label', 'ModelName', 'TextId'], dtype='object')
Emotion: Index(['No', 'discussion_text', 'Label', 'ModelName', 'TextId'], dtype='object')
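Before training, a quick look at the label distribution of each sheet helps interpret the accuracies reported later (an optional sanity check, not part of the original notebook):
print('Emotion label counts:\n', df_e['Label'].value_counts())
print('Patient journey label counts:\n', df_pj['Label'].value_counts())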
Multinomial Naïve Bayes classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer
com_words = 20
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Package punkt is already up-to-date!
def get_document_features(document, word_features):
    """
    Converts a given document (a list of tokens) into a feature set.
    """
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features
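As a quick illustration (a toy example, not drawn from the dataset), the function returns one boolean 'contains(word)' entry per feature word:
example_doc = ['i', 'was', 'diagnosed', 'with', 'type', '2', 'diabetes']
example_words = ['diabetes', 'insulin', 'type']
print(get_document_features(example_doc, example_words))
#{'contains(diabetes)': True, 'contains(insulin)': False, 'contains(type)': True}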
First, computing MNB for the Emotions dataset
#Extracting a word-tokenized version of the discussion_text and Label columns.
e_dt = df_e.apply(lambda row: nltk.word_tokenize(row['discussion_text']), axis=1)
e_lab = df_e.apply(lambda row: nltk.word_tokenize(row['Label']), axis=1)
#Computing the frequency distribution of every word in the Emotions discussion texts, looping over each row and each word in that row
all_words_e = nltk.FreqDist(w.lower() for l in e_dt for w in l)
#Choosing the top n most common words to keep the computational cost down
most_freq_words_e = all_words_e.most_common(com_words)
#Building a list of [tokenized document, label] pairs
doc_lst_e = []
for i in range(len(e_dt)):
    doc_lst_e.append([e_dt[i], df_e.loc[i]['Label']])
#Extracting a list of the most common words
word_features_e = [word for (word, count) in most_freq_words_e]
#Checking which of the word_features each document contains
featuresets_e = [(get_document_features(d, word_features_e), l) for (d, l) in doc_lst_e]
#Splitting the data: the first 15% of the feature sets for training, the remaining 85% for testing
split_r_e = round(len(featuresets_e)*0.15)
train_set_e, test_set_e = featuresets_e[:split_r_e], featuresets_e[split_r_e:]
#Training the classifier on the first 15% of the feature sets
clf_e = nltk.NaiveBayesClassifier.train(train_set_e)
print(nltk.classify.accuracy(clf_e, test_set_e))
0.310632383599722
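Because the split above simply takes the first 15% of rows in spreadsheet order, a shuffled split may give a more representative estimate. A minimal sketch (not part of the original run; the seed is an arbitrary choice for reproducibility):
import random
random.seed(42)                 #assumed seed, purely for reproducibility
shuffled_e = featuresets_e[:]   #copy so the original order is kept intact
random.shuffle(shuffled_e)
split_shuf = round(len(shuffled_e)*0.15)
train_shuf, test_shuf = shuffled_e[:split_shuf], shuffled_e[split_shuf:]
clf_e_shuf = nltk.NaiveBayesClassifier.train(train_shuf)
print(nltk.classify.accuracy(clf_e_shuf, test_shuf))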
MNB for the Patient Journey dataset
pj_dt = df_pj.apply(lambda row: nltk.word_tokenize(row['discussion_text']), axis=1)
pj_lab = df_pj.apply(lambda row: nltk.word_tokenize(row['Label']), axis=1)
#Finding all words in the entire set of documents
all_words_pj = nltk.FreqDist(w.lower() for l in pj_dt for w in l)
#Computing top n most common words
most_freq_words_pj = all_words_pj.most_common(com_words)
#Making a list of the most frequent words
word_features_pj = [word for (word, count) in most_freq_words_pj]
#List of lists; first index: words in the document, second index: accompanying label
doc_lst_pj = []
for i in range(len(pj_dt)):
    doc_lst_pj.append([pj_dt[i], df_pj.loc[i]['Label']])
#Checking which of the most common words are present in each document
featuresets_pj = [(get_document_features(d, word_features_pj), l) for (d, l) in doc_lst_pj]
#Splitting into train and test, 15/85
split_r_pj = round(len(featuresets_pj)*0.15)
train_set_pj, test_set_pj = featuresets_pj[:split_r_pj], featuresets_pj[split_r_pj:]
#Training the classifier
clf_pj = nltk.NaiveBayesClassifier.train(train_set_pj)
#Printing the accuracy
print(nltk.classify.accuracy(clf_pj, test_set_pj))
0.2590759075907591
print('MNB on Emotion:',nltk.classify.accuracy(clf_e, test_set_e))
print('MNB on Patient Journey:',nltk.classify.accuracy(clf_pj, test_set_pj))
MNB on Emotion: 0.310632383599722
MNB on Patient Journey: 0.2590759075907591
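Accuracy alone hides how the classifiers behave per class. A hedged sketch (not in the original notebook) that builds per-class metrics for the NLTK models by classifying the test sets explicitly:
from sklearn.metrics import classification_report
def nltk_report(clf, test_set):
    #test_set is a list of (feature_dict, label) pairs
    y_true = [label for (feats, label) in test_set]
    y_pred = [clf.classify(feats) for (feats, label) in test_set]
    return classification_report(y_true, y_pred, zero_division=0)
print(nltk_report(clf_e, test_set_e))
print(nltk_report(clf_pj, test_set_pj))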
Logistic Regression Classifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
import sys
def get_most_informative_features(clf, vectorizer,
                                  label_names,
                                  max_number_informative_features):
    """
    Returns a formatted string with the features that have the highest and
    lowest coefficient values, per class.
    """
    output = []
    try:
        feature_names = vectorizer.get_feature_names()
        #Use the classifier's own class ordering, when available, so that the
        #label names line up with the rows of clf.coef_
        if hasattr(clf, 'classes_'):
            label_names = list(clf.classes_)
        label_index = len(label_names)
        if label_index == 2:
            label_index = 1
            print('features for binary classification!')
        for index in range(label_index):
            output.append('\n' + label_names[index] + ':\n')
            coefs_with_fns = sorted(zip(clf.coef_[index], feature_names))
            threshold = int(max_number_informative_features / 2)
            top = zip(coefs_with_fns[:threshold],
                      coefs_with_fns[:-(threshold + 1):-1])
            for (coef_1, fn_1), (coef_2, fn_2) in top:
                feat = "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2)
                output.append(feat)
    except:
        print("Unexpected error:", sys.exc_info()[0])
        raise
    return '\n'.join(output)
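Note that CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2. On newer versions, a small compatibility wrapper could be called instead of vectorizer.get_feature_names() inside the helper (a sketch, assuming either method may be present):
def vocab_names(vectorizer):
    #Prefer the newer API when it exists, otherwise fall back to the old one
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()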
Logistic Regression Classifier - Emotions:
#Tokenizing each label string and keeping the first token as the class name
lab_e_lst = []
for i in df_e['Label']:
    temp_tok = word_tokenize(i)
    lab_e_lst.append(temp_tok)
lab_e = []
for i in range(len(lab_e_lst)):
    lab_e.append(lab_e_lst[i][0])
u_lab_e = list(set(lab_e))
xTrain_e, xTest_e, yTrain_e, yTest_e = train_test_split(list(df_e['discussion_text']), lab_e)
pipe_clf = Pipeline(
[('vect', CountVectorizer(decode_error='ignore',
stop_words='english',
max_features=100)),
('tfidf', TfidfTransformer()),
('clf', LogisticRegression( penalty='l2', solver='lbfgs', max_iter = 1000,
dual=False, multi_class='multinomial', tol=1e-3)),
])
text_clf = pipe_clf.fit(xTrain_e,yTrain_e)
predictions = pipe_clf.predict(xTest_e)
print('accuracy :', accuracy_score(yTest_e, predictions))
acc_e = accuracy_score(yTest_e, predictions)
measures_info = metrics.classification_report(yTest_e,
predictions)
print('\n\n',measures_info)
vectorizer = pipe_clf.named_steps['vect']
clf = pipe_clf.named_steps['clf']
feature_names = vectorizer.get_feature_names()
imp_features_e = get_most_informative_features(clf, vectorizer,u_lab_e, 20)
#print('\nTop 20 most important features:\n', imp_features)
accuracy : 0.30814639905548996
precision recall f1-score support
Anger 0.00 0.00 0.00 20
Anticipation 0.27 0.24 0.25 249
Disgust 0.00 0.00 0.00 69
Fear 0.00 0.00 0.00 28
Joy 0.00 0.00 0.00 80
Sadness 0.20 0.01 0.02 78
Surprise 0.00 0.00 0.00 40
Trust 0.32 0.71 0.44 283
accuracy 0.31 847
macro avg 0.10 0.12 0.09 847
weighted avg 0.21 0.31 0.22 847
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
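The UndefinedMetricWarning above is raised because some classes never get predicted; it can be handled explicitly with the zero_division parameter the warning points to (a minimal sketch):
measures_info = metrics.classification_report(yTest_e, predictions, zero_division=0)
print(measures_info)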
Logistic Regression Classifier for Patient Journey
#Tokenizing each label string and keeping the first token as the class name
lab_pj_lst = []
for i in df_pj['Label']:
    temp_tok = word_tokenize(i)
    lab_pj_lst.append(temp_tok)
lab_pj = []
for i in range(len(lab_pj_lst)):
    lab_pj.append(lab_pj_lst[i][0])
u_lab_pj = list(set(lab_pj))
xTrain_pj, xTest_pj, yTrain_pj, yTest_pj = train_test_split(list(df_pj['discussion_text']), lab_pj)
text_clf = pipe_clf.fit(xTrain_pj,yTrain_pj)
predictions = pipe_clf.predict(xTest_pj)
print('accuracy :', accuracy_score(yTest_pj, predictions))
measures_info = metrics.classification_report(yTest_pj,
predictions)
print('\n\n',measures_info)
vectorizer_pj = pipe_clf.named_steps['vect']
#Using a new name so the NLTK classifier clf_pj from Q1.1 is not overwritten
lr_clf_pj = pipe_clf.named_steps['clf']
feature_names_pj = vectorizer_pj.get_feature_names()
imp_features_pj = get_most_informative_features(lr_clf_pj, vectorizer_pj, u_lab_pj, 20)
#print('\nTop 20 most important features:\n', imp_features_pj)
accuracy : 0.6610644257703081
precision recall f1-score support
Alternative 0.00 0.00 0.00 1
Clinical 0.44 0.24 0.31 50
Diagnosis 0.00 0.00 0.00 14
Living 0.70 0.93 0.79 230
Relatives 0.00 0.00 0.00 17
Undiagnosed 0.48 0.24 0.32 45
accuracy 0.66 357
macro avg 0.27 0.24 0.24 357
weighted avg 0.57 0.66 0.60 357
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
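Both reports show that minority classes are rarely predicted. One hedged variation (not part of the original run; whether it helps here is untested) is to re-fit the pipeline with class weighting, which LogisticRegression supports via class_weight='balanced':
pipe_clf_balanced = Pipeline(
    [('vect', CountVectorizer(decode_error='ignore', stop_words='english', max_features=100)),
     ('tfidf', TfidfTransformer()),
     ('clf', LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000,
                                multi_class='multinomial', tol=1e-3,
                                class_weight='balanced'))])
pipe_clf_balanced.fit(xTrain_pj, yTrain_pj)
print('accuracy with class_weight=balanced:',
      accuracy_score(yTest_pj, pipe_clf_balanced.predict(xTest_pj)))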
Q1.1 Conclusion
print('The accuracies of the Multinomial Naïve Bayes models for the Emotions and Patient Journey datasets are as follows:',
      '\n - Emotions: ', nltk.classify.accuracy(clf_e, test_set_e),
      '\n - Patient Journey: ', nltk.classify.accuracy(clf_pj, test_set_pj),
      '\n\nThe accuracies of the Logistic Regression models for the Emotions and Patient Journey datasets are as follows:',
      '\n - Emotions: ', 0.29634002361275086, #hard-coded value from a previous run of the Emotions LR cell (the split is random, so re-running gives a slightly different number)
      '\n - Patient Journey: ', accuracy_score(yTest_pj, predictions))
The accuracies of the Multinomial Naïve Bayes models for the Emotions and Patient Journey datasets are as follows:
 - Emotions: 0.310632383599722
 - Patient Journey: 0.2590759075907591
The accuracies of the Logistic Regression models for the Emotions and Patient Journey datasets are as follows:
 - Emotions: 0.29634002361275086
 - Patient Journey: 0.6638655462184874
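For an at-a-glance comparison, the four scores can also be collected into a small DataFrame (a sketch, assuming the classifiers and test splits from the cells above are still in memory; acc_e is the Logistic Regression accuracy computed for Emotions earlier):
results = pd.DataFrame(
    {'Emotions': [nltk.classify.accuracy(clf_e, test_set_e), acc_e],
     'Patient Journey': [nltk.classify.accuracy(clf_pj, test_set_pj),
                         accuracy_score(yTest_pj, predictions)]},
    index=['Multinomial NB', 'Logistic Regression'])
print(results)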
Q1.2
Multinomial Naïve Bayes models most important features
print('Emotions: ')
clf_e.show_most_informative_features(20)
print('\n\nPatient Journey:')
clf_pj.show_most_informative_features(20)
Emotions:
Most Informative Features
contains(to) = False Fear : Anger = 4.8 : 1.0
contains(i) = True Fear : Sadnes = 3.5 : 1.0
contains(with) = False Surpri : Anger = 3.3 : 1.0
contains(2) = False Anger : Fear = 3.1 : 1.0
contains(of) = False Joy : Anger = 2.9 : 1.0
contains(the) = False Fear : Antici = 2.8 : 1.0
contains(.) = False Surpri : Anger = 2.6 : 1.0
contains(it) = True Anger : Surpri = 2.6 : 1.0
contains(diabetes) = False Surpri : Disgus = 2.6 : 1.0
contains(and) = False Fear : Sadnes = 2.4 : 1.0
contains(that) = True Anger : Surpri = 2.3 : 1.0
contains(in) = False Fear : Anger = 2.3 : 1.0
contains(with) = True Anger : Surpri = 2.2 : 1.0
contains(my) = True Fear : Antici = 2.2 : 1.0
contains(it) = False Surpri : Anger = 2.2 : 1.0
contains(..) = False Anger : Disgus = 2.0 : 1.0
contains(to) = True Anger : Fear = 2.0 : 1.0
contains(in) = True Anger : Fear = 1.9 : 1.0
contains(for) = True Sadnes : Surpri = 1.9 : 1.0
contains(of) = True Anger : Joy = 1.8 : 1.0
Patient Journey:
Most Informative Features
contains(i) = True Diagno : Living = 11.6 : 1.0
contains(and) = False Diagno : Living = 10.3 : 1.0
contains(the) = False Diagno : Living = 8.7 : 1.0
contains(my) = True Relati : Undiag = 5.8 : 1.0
contains(diabetes) = False Altern : Living = 5.0 : 1.0
contains(in) = False Clinic : Living = 4.1 : 1.0
contains(the) = True Living : Diagno = 3.1 : 1.0
contains(for) = False Diagno : Altern = 3.0 : 1.0
contains(have) = True Altern : Living = 2.9 : 1.0
contains(a) = False Diagno : Living = 2.8 : 1.0
contains(of) = False Diagno : Living = 2.8 : 1.0
contains(it) = True Diagno : Relati = 2.8 : 1.0
contains(type) = False Diagno : Relati = 2.8 : 1.0
contains(2) = False Altern : Undiag = 2.7 : 1.0
contains(my) = False Undiag : Relati = 2.5 : 1.0
contains(have) = False Living : Altern = 2.5 : 1.0
contains(to) = False Clinic : Relati = 2.5 : 1.0
contains(it) = False Relati : Diagno = 2.5 : 1.0
contains(type) = True Relati : Diagno = 2.5 : 1.0
contains(.) = False Living : Undiag = 2.4 : 1.0
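The features above are dominated by stop words and punctuation ('to', 'i', 'the', '.'), so they say little about the classes themselves. A possible refinement (a sketch, assuming the NLTK stopwords corpus is available) is to filter such tokens before choosing word_features:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_set = set(stopwords.words('english'))
#Keep only alphabetic, non-stopword tokens when counting frequencies
filtered_words_e = nltk.FreqDist(
    w.lower() for l in e_dt for w in l
    if w.isalpha() and w.lower() not in stop_set)
word_features_e_filtered = [w for (w, c) in filtered_words_e.most_common(com_words)]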
Logistic Regression models' most important features:
print('\nTop 20 most important features:\n', imp_features_e)
Top 20 most important features:
Anger:
-0.9040 think 1.2486 did
-0.8845 doctor 1.0976 normal
-0.8657 life 0.9808 weeks
-0.7708 day 0.7795 weight
-0.7616 don 0.7315 help
-0.7506 insulin 0.7249 feel
-0.7194 medication 0.7134 foods
-0.6107 lot 0.6793 exercise
-0.5777 diet 0.6613 diabetic
-0.5680 really 0.6248 things
Joy:
-1.0660 weeks 0.9776 don
-0.9146 hi 0.8245 forum
-0.8577 obesity 0.6893 did
-0.8570 know 0.6776 diabetes
-0.7997 different 0.6342 ve
-0.7103 new 0.5270 use
-0.6380 foods 0.5218 day
-0.5870 carbs 0.4846 problems
-0.5565 dr 0.4437 months
-0.5267 control 0.4413 diet
Sadness:
-0.7562 feel 1.0858 diabetics
-0.7522 weight 0.8950 said
-0.6636 people 0.8316 lost
-0.6572 long 0.7976 insulin
-0.6563 way 0.7535 carbs
-0.6313 ago 0.7039 forum
-0.6095 overweight 0.6951 high
-0.6076 bg 0.6765 hi
-0.5955 don 0.6684 eating
-0.5587 diabetic 0.6650 months
Surprise:
-0.8113 diabetic 1.0564 bg
-0.7461 weeks 0.9674 lot
-0.7184 need 0.9609 better
-0.6896 risk 0.9427 know
-0.6840 metformin 0.9143 com
-0.6363 don 0.8763 t2
-0.6158 forum 0.8110 make
-0.6017 blood 0.7994 http
-0.5889 got 0.7897 started
-0.5843 just 0.7169 high
Anticipation:
-0.8739 com 1.0907 life
-0.8339 months 0.7875 higher
-0.8212 help 0.7438 doctor
-0.7984 carbs 0.7162 got
-0.7221 foods 0.7000 things
-0.7203 exercise 0.6934 metformin
-0.7049 diabetes 0.6841 lost
-0.6604 type 0.6560 told
-0.5050 meds 0.6264 day
-0.4630 diabetics 0.5415 think
Fear:
-0.7085 http 0.8805 com
-0.6477 help 0.8529 heart
-0.6090 things 0.8039 feel
-0.5606 months 0.7972 want
-0.5360 diabetics 0.7696 meds
-0.5200 hi 0.6322 obesity
-0.4957 control 0.6013 think
-0.4796 disease 0.5713 family
-0.4768 lot 0.5612 long
-0.4720 normal 0.5272 medication
Trust:
-1.1721 things 0.7669 way
-0.9829 say 0.7504 health
-0.9349 t2 0.7048 told
-0.8180 life 0.6559 weeks
-0.8048 diabetics 0.6276 healthy
-0.7579 did 0.5913 levels
-0.7482 know 0.5539 carbs
-0.7427 test 0.5435 getting
-0.7227 type 0.5017 doing
-0.7062 eating 0.4751 medication
Disgust:
-0.8031 lost 0.7776 diabetes
-0.7758 feel 0.6857 year
-0.6446 control 0.6245 don
-0.5778 does 0.6081 lot
-0.5187 bg 0.5649 http
-0.5092 better 0.5086 risk
-0.5046 overweight 0.4606 food
-0.4777 different 0.4536 diet
-0.4331 family 0.4357 type
-0.4046 blood 0.4287 new
vectorizer_pj = pipe_clf.named_steps['vect']
lr_clf_pj = pipe_clf.named_steps['clf']
feature_names_pj = vectorizer_pj.get_feature_names()
imp_features_pj = get_most_informative_features(lr_clf_pj, vectorizer_pj, u_lab_pj, 20)
print('\nTop 20 most important features:\n', imp_features_pj)
Top 20 most important features:
Alternative:
-0.5195 diagnosed 0.9551 help
-0.3414 know 0.9017 recently
-0.3343 type 0.7809 just
-0.3271 like 0.7016 medical
-0.3167 high 0.6549 loss
-0.2998 don 0.6046 read
-0.2777 diabetic 0.5381 dr
-0.2661 time 0.5270 weight
-0.2659 diet 0.4500 control
-0.2601 ago 0.4437 patients
Relatives:
-1.1350 diet 2.7993 metformin
-1.1321 resistance 1.9246 insulin
-1.0601 family 1.4874 surgery
-1.0075 higher 1.4326 new
-0.9663 risk 1.2699 taking
-0.8966 diabetes 1.1782 medication
-0.8612 fat 0.9682 months
-0.8509 carbs 0.7925 10
-0.8293 got 0.7294 medical
-0.7777 long 0.6897 doctor
Living:
-1.1237 years 2.9476 diagnosed
-0.7465 control 1.1717 got
-0.7441 insulin 1.1019 hello
-0.6075 health 0.7025 hi
-0.6073 eat 0.6314 doctor
-0.5443 metformin 0.5885 day
-0.5344 people 0.5698 blood
-0.5332 medication 0.5382 t2
-0.5028 exercise 0.5270 new
-0.4941 medical 0.4709 old
Clinical:
-1.4286 type 2.9211 diet
-1.0283 patients 2.1953 exercise
-0.9558 insulin 1.8990 eating
-0.8998 new 1.8828 eat
-0.8732 medical 1.5202 carbs
-0.8300 surgery 1.1762 carb
-0.7648 dr 1.0766 foods
-0.7630 test 1.0034 healthy
-0.6855 diagnosed 0.9072 low
-0.6831 doctor 0.8806 really
Undiagnosed:
-0.7943 risk 1.9334 family
-0.6442 metformin 1.5585 know
-0.6372 insulin 1.5034 diabetic
-0.6250 need 1.1699 type
-0.6198 taking 1.0712 don
-0.5227 http 0.5972 test
-0.5117 feel 0.5519 overweight
-0.4945 work 0.3540 diagnosis
-0.4640 sugars 0.3457 way
-0.4619 ve 0.3394 getting
Diagnosis:
-1.7549 diagnosed 1.5508 risk
-1.6533 metformin 1.5438 resistance
-1.4674 new 1.5435 obesity
-1.0886 exercise 1.3211 higher
-1.0064 medication 1.2872 does
-0.9731 hi 1.0634 type
-0.9179 eat 0.9941 fat
-0.9023 day 0.9556 symptoms
-0.8790 help 0.8060 like
-0.8789 recently 0.7890 body