import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt  # plotting
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
nRowsRead = 3000
df1 = pd.read_csv('WordDifficultySample.csv', delimiter=",", nrows = nRowsRead)
df1.dataframeName = 'WordDifficultySample.csv'
nRow, nCol = df1.shape
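# Report the sample size, mirroring the shape report used for df2 below.
print(f'There are {nRow} rows and {nCol} columns')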
# Graph
fig, ax = plt.subplots(figsize=(12, 6))
x = df1['Length']
y = df1['I_Zscore']
ax.scatter(x, y)
plt.title('Word Length Predicting Word Difficulty')
plt.xlabel('Word Length')
plt.ylabel('Word Difficulty')
plt.show()
# Fit a linear regression of word difficulty on word length
x = df1['Length'].values.reshape(-1, 1).astype('float32')
y = df1['I_Zscore'].values.reshape(-1, 1).astype('float32')
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
h = LinearRegression()
h.fit(X_train, y_train)
y_pred = h.predict(X_test)
fig, ax1 = plt.subplots(figsize=(12, 6))
ax1.scatter(X_test, y_test, s=8)
ax1.plot(X_test, y_pred, color='black', linewidth=2)
plt.show()
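# Not in the original: a quick fit check for the length model on the held-out
# split, using sklearn.metrics (r2_score, mean_squared_error).
from sklearn.metrics import mean_squared_error, r2_score
print('Length model R^2:', r2_score(y_test, y_pred))
print('Length model MSE:', mean_squared_error(y_test, y_pred))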
# Graph
fig, ax = plt.subplots(figsize=(12, 6))
x = df1['I_Mean_RT']
y = df1['I_Zscore']
ax.scatter(x, y)
plt.title('Average Reaction Time Predicting Difficulty')
plt.xlabel('Reaction Time')
plt.ylabel('Word Difficulty')
plt.show()
# Fit a linear regression of word difficulty on mean reaction time
x = df1['I_Mean_RT'].values.reshape(-1, 1).astype('float32')
y = df1['I_Zscore'].values.reshape(-1, 1).astype('float32')
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
j = LinearRegression()
j.fit(X_train, y_train)
y_pred = j.predict(X_test)
fig, ax1 = plt.subplots(figsize=(12, 6))
ax1.scatter(X_test, y_test, s=8)
ax1.plot(X_test, y_pred, color='black', linewidth=2)
plt.show()
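# Not in the original: the fitted RT model's slope and intercept, for reference.
print('RT model slope:', j.coef_[0][0], 'intercept:', j.intercept_[0])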
# Overlay the model's predictions on the actual difficulty scores
word_RT = df1['I_Mean_RT'].values.reshape(-1, 1).astype('float32')
RT_predictions = j.predict(word_RT)
plt.scatter(word_RT, df1['I_Zscore'])
plt.scatter(word_RT, RT_predictions)
plt.show()
def word_info(dataframe, index):
    # Difference between a word's actual mean RT and the value j predicts.
    # Note: j was fit to map I_Mean_RT -> I_Zscore, so feeding it a word
    # length here yields a z-score-scale prediction, not a reaction time.
    RT_actual = dataframe['I_Mean_RT'][index]
    reshaped_value = np.array(dataframe['Length'][index]).reshape(-1, 1).astype('float32')
    RT_prediction = j.predict(reshaped_value)
    difference = RT_actual - RT_prediction
    return difference
print(word_info(df1, 1), word_info(df1, 50))
# [[746.558]] [[1030.1982]]
def get_headline(dataframe, index):
    # Split a headline into its individual words.
    headline = dataframe['headline_text'][index].split(' ')
    return headline
nRowsRead = 3000
df2 = pd.read_csv('abcnews-date-text.csv', delimiter=",", nrows = nRowsRead)
df2.dataframeName = 'abcnews-date-text.csv'
nRow, nCol = df2.shape
print(f'There are {nRow} rows and {nCol} columns')
headlines = get_headline(df2, 0)
headlines
# There are 3000 rows and 2 columns
df2.head(3)
df2.sample(3)
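# clean_dataset is called below but never defined in the original; a minimal
# sketch of what it plausibly does (drop rows with missing or infinite values):
def clean_dataset(dataframe):
    dataframe.replace([np.inf, -np.inf], np.nan, inplace=True)  # infinities -> NaN
    dataframe.dropna(inplace=True)  # drop rows with any NaN
    return dataframe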
clean_dataset(df2)
def normalized_I_Zscore(values):
    # Min-max normalize values into the range [0, 1]
    min_val = min(values)
    max_val = max(values)
    normalized = [(x - min_val) / (max_val - min_val) for x in values]
    return normalized
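# Quick sanity check on synthetic values (not from the dataset):
assert normalized_I_Zscore([-2.0, 0.0, 2.0]) == [0.0, 0.5, 1.0]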
def hd_difficulty_score(hd_index, wds):
    # Headline difficulty: sum each word's I_Zscore if the word appears in
    # the difficulty sample; otherwise fall back to h's length-based estimate.
    total_score = 0
    article = get_headline(df2, hd_index)
    for word in article:
        if word in wds:
            df1_word_index = wds.index(word)
            total_score += df1['I_Zscore'][df1_word_index]
        else:
            z = np.array(float(len(word))).reshape(-1, 1)
            predicted_value = h.predict(z)[0][0]  # TODO: normalize to (0, 1)
            total_score += predicted_value
    return total_score
words = list(df1['Word'])
hd_difficulty_score(0, words)
df3 = pd.read_csv('WordDifficultySample.csv', delimiter=',', nrows=nRowsRead)  # same word-difficulty sample as df1
df3.head(3)
word_list = list(df3['Word'])
difficulty_scores = []
for _ in range(1000):  # number of headlines we want to analyze
    idx = np.random.randint(len(df2))  # pick a random headline (don't reuse j, the RT model)
    difficulty_scores.append(hd_difficulty_score(idx, word_list))
difficulty_scores = normalized_I_Zscore(difficulty_scores)
removed_zeros = np.ma.masked_equal(difficulty_scores, 0)  # mask the 0.0 that min-max scaling puts at the minimum
print('Mean:', np.mean(removed_zeros))
print('Std:', np.std(removed_zeros))
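# Not in the original: a quick look at the distribution of the normalized
# headline-difficulty scores, using the same plotting conventions as above.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(removed_zeros.compressed(), bins=30)
plt.title('Distribution of Normalized Headline Difficulty Scores')
plt.xlabel('Normalized Difficulty Score')
plt.ylabel('Count')
plt.show()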