## Extended code examples for ESSLLI2021 tutorial part 3 (and 4) ##

library(wordspace)
library(wordspaceEval)

M <- DSM_Vectors
nearest.neighbours(M, "walk_V")

######################################################################
## TASK 1: MULTIPLE CHOICE

# DATASET: TOEFL
head(TOEFL80)

eval.multiple.choice(TOEFL80, M)

# With details=TRUE we can look at how well we did on the individual items.
eval.multiple.choice(TOEFL80, M, details=TRUE)

eval.multiple.choice(TOEFL80, M)
eval.multiple.choice(TOEFL80, M, method="manhattan")  # manhattan is slightly better
eval.multiple.choice(TOEFL80, M, method="maximum")    # maximum a disaster :)

# Comparing distance to rank
eval.multiple.choice(TOEFL80, M)
eval.multiple.choice(TOEFL80, M, rank="fwd")  # unsurprisingly identical to distance, given that the ranking is the same
eval.multiple.choice(TOEFL80, M, rank="bwd")  # it is bwd rank that can make a difference here - and it does, but not for the better

# An interesting field of the details=TRUE output is "correct.rank",
# which tells us where, among the 4 choices, the correct one landed
# (1 means the item was predicted correctly).
# In a way, this is a more "forgiving" way of assessing performance,
# one where getting the right choice in position 2 is not as wrong
# as getting it in the last position.

# cosine
mean(eval.multiple.choice(TOEFL80, M, details=TRUE)$correct.rank)
# manhattan
mean(eval.multiple.choice(TOEFL80, M, method="manhattan", details=TRUE)$correct.rank)
# maximum
mean(eval.multiple.choice(TOEFL80, M, method="maximum", details=TRUE)$correct.rank)

######################################################################
## TASK 2: PREDICTION OF SIMILARITY RATINGS

# DATASET: RG65
head(RG65)

# Let us just have a look at a handful of items (from 1 to 61 in steps of 5)
RG65[seq(1, 61, 5), ]

eval.similarity.correlation(RG65, M, convert=FALSE)

# With details=TRUE we can look at the individual items
eval.similarity.correlation(RG65, M, details=TRUE, convert=FALSE)

# ... and plot the correlation between similarity/distance and rating
plot(eval.similarity.correlation(RG65, M, details=TRUE, convert=FALSE))

# DATASET: WordSim353
head(WordSim353)

eval.similarity.correlation(WordSim353, M, convert=FALSE)
plot(eval.similarity.correlation(WordSim353, M, details=TRUE, convert=FALSE))

# WordSim353 contains both similarity and relatedness ratings, indicated by the Boolean
# variables relatedness and similarity. We can use this information to conduct separate
# evaluations. E.g., looking at items annotated for relatedness
eval.similarity.correlation(subset(WordSim353, relatedness), M, convert=FALSE)
# versus items annotated for similarity
eval.similarity.correlation(subset(WordSim353, similarity), M, convert=FALSE)
# Yeah, our model likes similarity more, clearly...
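# A small extra sketch: the same similarity-vs-relatedness comparison can be run for several
# distance metrics in one go. This assumes that method= is passed through to pair.distances()
# here just as it was in the eval.multiple.choice() examples above; the output is a small
# matrix of Spearman's rho values (one column per metric, one row per subset).
metrics <- c("cosine", "euclidean", "manhattan")
sapply(metrics, function(m) c(
  similarity  = eval.similarity.correlation(subset(WordSim353, similarity),  M, convert=FALSE, method=m)$rho,
  relatedness = eval.similarity.correlation(subset(WordSim353, relatedness), M, convert=FALSE, method=m)$rho
))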
# BONUS: Note that you can use the output from details=TRUE for further analysis,
# e.g. with a regression model
result <- eval.similarity.correlation(WordSim353, M, convert=FALSE, details=TRUE)
summary(lm(distance ~ score + relatedness, data=result))
# The linear model confirms that higher ratings go with higher similarities
# (even if the column is called distance - remember we had convert=FALSE),
# and that items annotated as TRUE for relatedness have lower similarity scores.

######################################################################
# TASK 3: CONCEPT CLUSTERING

# DATASET: ESSLLI08_Nouns
head(ESSLLI08_Nouns)
ESSLLI08_Nouns[seq(1, 40, 5), ]

eval.clustering(ESSLLI08_Nouns, M)
eval.clustering(ESSLLI08_Nouns, M, details=TRUE)

# Note that the ESSLLI dataset has multiple annotations available, at different granularities
summary(ESSLLI08_Nouns)

# The eval.clustering function allows you to specify which column of your dataset
# is to be used as the gold-standard class for clustering
# 6 classes
eval.clustering(ESSLLI08_Nouns, M)
# 3 classes
eval.clustering(ESSLLI08_Nouns, M, class.name="class2")
# 2 classes
eval.clustering(ESSLLI08_Nouns, M, class.name="class3")
# clearly the task is easier with fewer classes to be predicted...

######################################################################
## ONE DATASET, MANY TASKS: LAZARIDOU 2013

# We talked about the Lazaridou2013 dataset in the lecture. Conveniently, it is part of wordspaceEval.
# Have a look at it:
View(Lazaridou2013)
# Read the documentation:
?Lazaridou2013
# And finally, get an idea of its size/scope with the summary function
summary(Lazaridou2013)

# This dataset wasn't developed with standard NLP tasks in mind. Yet, we can still construct
# such evaluation tasks from it and solve them using the 3 functions we learnt today.

## STEP 1: multiple choice ##

# To carry out multiple choice on Lazaridou2013, we need to create the distractors
# (remember? TOEFL has one target word, one candidate synonym, and 3 distractor words).
# Here we have pairs of stems and derived words
# (stem: drive, derived: driver; stem: happy, derived: unhappy).
# The base (stem) can be our input word, and the task for the model will be to find the derived word.

# How can we generate distractors? Idea: given a pair of base/derived words, we can take
# 3 random other derived words as distractors. Then the assumption will be: if my DSM is
# "good", it will be able to spot the relatedness between drive/driver, and assign it a
# higher similarity (or lower distance) than the other derived words.
# Of course it is not a perfect strategy (can you think of potential issues with it?
# one quick sanity check is sketched below, after we have drawn the distractors).
# But let's get started nevertheless :)

# We set a random seed to be able to replicate the results
# (this is customary when doing random shuffling)
set.seed(001)
candidates1 <- sample(Lazaridou2013$derived)
candidates1
candidates2 <- sample(Lazaridou2013$derived)
candidates2
candidates3 <- sample(Lazaridou2013$derived)
candidates3

# Have a look at ?eval.multiple.choice: the function lets you specify the column name of
# the target (for us it is "stem"), the column name of the correct choice (for us it is
# "derived"), and the common prefix of the column names holding the distractors.
# The default is distract*, which is good enough for us. We will have to add 3 columns
# to Lazaridou2013: distractor1, distractor2, distractor3
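# The promised sanity check on the permutation-based distractors: because sample() just
# permutes the "derived" column, a distractor can occasionally coincide with the correct
# derived word of the same item (a fixed point of the permutation), which makes that item
# unsolvable by construction.
sum(candidates1 == Lazaridou2013$derived)
sum(candidates2 == Lazaridou2013$derived)
sum(candidates3 == Lazaridou2013$derived)
# If these counts are > 0, you could re-draw with another seed, or simply keep in mind
# that a handful of items are "broken" by construction.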
# Since we're going to manipulate the data set, let us create a copy.
# (Footnote: this isn't strictly necessary. You can add to or otherwise modify a read-only
# data set loaded from a package; R will automatically create a copy in your workspace under
# the same name, which "shadows" the version from the wordspaceEval package. But this can
# become quite confusing, so it's better to make our own copy to begin with.)
MyLazaridou2013 <- Lazaridou2013
MyLazaridou2013$distractor1 <- candidates1
MyLazaridou2013$distractor2 <- candidates2
MyLazaridou2013$distractor3 <- candidates3

# Let us experiment with word2vec, as it is the largest model we have and state-of-the-art.
# We will lose a bit in performance because w2v contains inflected words (so, two vectors
# for dog and dogs), and we will only be using the singular (the stem) in this case.
# But we will gain in coverage, so we are good for now. Have fun experimenting with other
# models; watch out for coverage though!
load("models/GoogleNews300_wf200k.rda")
dsm <- GoogleNews300_wf200k

# And finally we can call multiple choice, making sure we indicate the right column names
eval.multiple.choice(MyLazaridou2013, dsm, target.name="stem", correct.name="derived",
                     distractor.name="distract")
# not bad... Further check: is the performance comparable in test vs. training?
# And can we use details=TRUE to investigate things a bit deeper?
Lazaridou2013_multiple <- eval.multiple.choice(MyLazaridou2013, dsm, target.name="stem",
                                               correct.name="derived", distractor.name="distract",
                                               details=TRUE)

# First of all, we can now have a look at the mistakes made by our model
# (some are missing items, marked with "Inf")
head(subset(Lazaridou2013_multiple, !correct))  # !correct matches items where the Boolean variable correct is FALSE

# What is also interesting is the "correct.dist" field, which tells us the distance between
# base and derived word. Now, wouldn't it be nice to know whether some affixes show
# smaller/bigger distances? We don't have this information here directly, but we can
# retrieve it from the main data frame, because details=TRUE has obviously kept the order!
# Let's also keep track of the part of speech of the derived word, and of the stem.
Lazaridou2013_multiple$affix <- Lazaridou2013$affix
Lazaridou2013_multiple$derivedPOS <- Lazaridou2013$derivedPOS
Lazaridou2013_multiple$stemPOS <- Lazaridou2013$stemPOS
head(Lazaridou2013_multiple)

# One more step: let us get rid of the Inf cases (remember that from now on we can no
# longer copy columns over from the original data frame, because we have altered the rows)
nrow(Lazaridou2013_multiple)
Lazaridou2013_multiple <- subset(Lazaridou2013_multiple, correct.dist != Inf)
nrow(Lazaridou2013_multiple)

# Now we have a lot of information to play with, and we can go wild ;)
# For example...
boxplot(correct.dist ~ derivedPOS, data=Lazaridou2013_multiple)
boxplot(correct.dist ~ paste(derivedPOS, stemPOS), data=Lazaridou2013_multiple)
boxplot(correct.dist ~ affix, data=Lazaridou2013_multiple)
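# A small extra sketch: besides eyeballing the boxplots, we can tabulate mean accuracy and
# mean base-derived distance per affix directly from the details data frame
# (aggregate() is base R; the affix column was added above).
aggregate(cbind(correct, correct.dist) ~ affix, data=Lazaridou2013_multiple, FUN=mean)
# The "correct" column is logical, so its mean is simply the per-affix accuracy.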
## STEP 2: prediction of similarity ratings and of quality ratings ##

# Remember: annotations are available only for the test set

# Base/derived relatedness score
eval.similarity.correlation(subset(Lazaridou2013, set=="test"), dsm, word1.name="stem",
                            word2.name="derived", score.name="relatedness_score")
# Not so good, and with these vectors we have little room for parameter manipulation.
# How about using rank? (Careful: this is computationally very heavy!)
eval.similarity.correlation(subset(Lazaridou2013, set=="test"), dsm, word1.name="stem",
                            word2.name="derived", score.name="relatedness_score", rank="fwd")
####                                            rho      p.value missing          r     r.lower    r.upper
#### subset(Lazaridou2013, set == "test") 0.2726184 8.405727e-17      65 0.01535270 -0.05004555 0.08061985

eval.similarity.correlation(subset(Lazaridou2013, set=="test"), dsm, word1.name="stem",
                            word2.name="derived", score.name="relatedness_score", rank="avg")
####                                            rho      p.value missing          r     r.lower    r.upper
#### subset(Lazaridou2013, set == "test") 0.2821681 6.176956e-18      65 0.02236349 -0.04304745 0.08758353

# Quality score of the derived vectors (which makes little sense for us, given that it is
# the quality of the vectors the authors used in their original study)
eval.similarity.correlation(subset(Lazaridou2013, set=="test"), dsm, word1.name="stem",
                            word2.name="derived", score.name="quality_score")

## STEP 3: categorization ##

# We can use the eval.clustering function to test how well affixes group together...
eval.clustering(Lazaridou2013, dsm, word.name="derived", class.name="affix")
# ... or whether we can categorize derived words into their part of speech (fewer classes, so it gets easier)
eval.clustering(Lazaridou2013, dsm, word.name="derived", class.name="derivedPOS")
# ... or whether we can categorize stems into their part of speech (slightly easier still,
# because stems tend to be more frequent and thus have better representations)
eval.clustering(Lazaridou2013, dsm, word.name="stem", class.name="stemPOS")
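# A final sketch: eval.clustering also accepts details=TRUE (we used it on ESSLLI08_Nouns
# above), which returns the per-word clustering results, so you can inspect which derived
# words end up in the "wrong" cluster for their part of speech. Check the column names of
# the per-item output with head() before analysing it further.
pos.clusters <- eval.clustering(Lazaridou2013, dsm, word.name="derived",
                                class.name="derivedPOS", details=TRUE)
head(pos.clusters)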