## Extended code examples for ESSLLI2021 tutorial part 3 (and 4) ##

library(wordspace)
library(wordspaceEval)

M <- DSM_Vectors
nearest.neighbours(M, "walk_V")

######################################################################
## TASK 1: MULTIPLE CHOICE

# DATASET: TOEFL
head(TOEFL80)

eval.multiple.choice(TOEFL80, M)

# With details=TRUE we can look at how well we did on the individual items.
eval.multiple.choice(TOEFL80, M, details=TRUE)

eval.multiple.choice(TOEFL80, M)
eval.multiple.choice(TOEFL80, M, method="manhattan")  # manhattan is slightly better
eval.multiple.choice(TOEFL80, M, method="maximum")    # maximum a disaster :)

# Comparing distance to rank
eval.multiple.choice(TOEFL80, M)
eval.multiple.choice(TOEFL80, M, rank="fwd")  # unsurprisingly identical to distance, given that the ranking is the same
eval.multiple.choice(TOEFL80, M, rank="bwd")  # it is bwd rank that can make a difference here - and it does, but not for the better

# An interesting field of the details=TRUE output is "correct.rank",
# which tells us where, among the 4 choices, the correct one landed
# (1 means the item was predicted correctly).
# In a way, this is a more "forgiving" way of assessing performance,
# one where getting the right choice in position 2 is not as wrong
# as getting it in the last position.

# cosine
mean(eval.multiple.choice(TOEFL80, M, details=TRUE)$correct.rank)
# manhattan
mean(eval.multiple.choice(TOEFL80, M, method="manhattan", details=TRUE)$correct.rank)
# maximum
mean(eval.multiple.choice(TOEFL80, M, method="maximum", details=TRUE)$correct.rank)

######################################################################
## TASK 2: PREDICTION OF SIMILARITY RATINGS

# DATASET: RG65
head(RG65)

# Let us just have a look at a handful of items (from 1 to 61 in steps of 5)
RG65[seq(1, 61, 5), ]

eval.similarity.correlation(RG65, M, convert=FALSE)

# With details=TRUE we can look at the individual items
eval.similarity.correlation(RG65, M, details=TRUE, convert=FALSE)

# ... and plot the correlation between similarity/distance and rating
plot(eval.similarity.correlation(RG65, M, details=TRUE, convert=FALSE))

# DATASET: WordSim353
head(WordSim353)

eval.similarity.correlation(WordSim353, M, convert=FALSE)
plot(eval.similarity.correlation(WordSim353, M, details=TRUE, convert=FALSE))

# WordSim353 contains both similarity and relatedness ratings, indicated by the Boolean
# variables relatedness and similarity. We can use this information to conduct separate
# evaluations. E.g., looking at items annotated for relatedness
eval.similarity.correlation(subset(WordSim353, relatedness), M, convert=FALSE)
# versus items annotated for similarity
eval.similarity.correlation(subset(WordSim353, similarity), M, convert=FALSE)
# Yeah, our model likes similarity more, clearly...
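# A small extra sketch: the same similarity-vs-relatedness comparison can be run for several
# distance metrics in one go. This assumes that method= is passed through to pair.distances()
# here just as it was in the eval.multiple.choice() examples above; the output is a small
# matrix of Spearman's rho values (one column per metric, one row per subset).
metrics <- c("cosine", "euclidean", "manhattan")
sapply(metrics, function(m) c(
  similarity  = eval.similarity.correlation(subset(WordSim353, similarity),  M, convert=FALSE, method=m)$rho,
  relatedness = eval.similarity.correlation(subset(WordSim353, relatedness), M, convert=FALSE, method=m)$rho
))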
# BONUS: Note that you can use the output from details=TRUE for further analysis,
# e.g. with a regression model
result <- eval.similarity.correlation(WordSim353, M, convert=FALSE, details=TRUE)
summary(lm(distance ~ score + relatedness, data=result))
# The linear model confirms that higher ratings go with higher similarities
# (even if the column is called distance - remember we had convert=FALSE),
# and that items annotated as TRUE for relatedness have lower similarity scores.

######################################################################
# TASK 3: CONCEPT CLUSTERING

# DATASET: ESSLLI08_Nouns
head(ESSLLI08_Nouns)
ESSLLI08_Nouns[seq(1, 40, 5), ]

eval.clustering(ESSLLI08_Nouns, M)
eval.clustering(ESSLLI08_Nouns, M, details=TRUE)

# Note that the ESSLLI dataset has multiple annotations available, at different granularities
summary(ESSLLI08_Nouns)

# The eval.clustering function allows you to specify which column of your dataset
# is to be used as the gold-standard class for clustering
# 6 classes
eval.clustering(ESSLLI08_Nouns, M)
# 3 classes
eval.clustering(ESSLLI08_Nouns, M, class.name="class2")
# 2 classes
eval.clustering(ESSLLI08_Nouns, M, class.name="class3")
# clearly the task is easier with fewer classes to be predicted...

######################################################################
## ONE DATASET, MANY TASKS: LAZARIDOU 2013

# We talked about the Lazaridou2013 dataset in the lecture. Conveniently, it is part of wordspaceEval.
# Have a look at it:
View(Lazaridou2013)
# Read the documentation:
?Lazaridou2013
# And finally, get an idea of its size/scope with the summary function
summary(Lazaridou2013)

# This dataset wasn't developed with standard NLP tasks in mind. Yet, we can still construct
# such evaluation tasks from it and solve them using the 3 functions we learnt today.

## STEP 1: multiple choice ##

# To carry out multiple choice on Lazaridou2013, we need to create the distractors
# (remember? TOEFL has one target word, one candidate synonym, and 3 distractor words).
# Here we have pairs of stems and derived words
# (stem: drive, derived: driver; stem: happy, derived: unhappy).
# The base (stem) can be our input word, and the task for the model will be to find the derived word.

# How can we generate distractors? Idea: given a pair of base/derived words, we can take
# 3 random other derived words as distractors. Then the assumption will be: if my DSM is
# "good", it will be able to spot the relatedness between drive/driver, and assign it a
# higher similarity (or lower distance) than the other derived words.
# Of course it is not a perfect strategy (can you think of potential issues with it?
# one quick sanity check is sketched below, after we have drawn the distractors).
# But let's get started nevertheless :)

# We set a random seed to be able to replicate the results
# (this is customary when doing random shuffling)
set.seed(001)
candidates1 <- sample(Lazaridou2013$derived)
candidates1
candidates2 <- sample(Lazaridou2013$derived)
candidates2
candidates3 <- sample(Lazaridou2013$derived)
candidates3

# Have a look at ?eval.multiple.choice: the function lets you specify the column name of
# the target (for us it is "stem"), the column name of the correct choice (for us it is
# "derived"), and the common prefix of the column names holding the distractors.
# The default is distract*, which is good enough for us. We will have to add 3 columns
# to Lazaridou2013: distractor1, distractor2, distractor3
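# The promised sanity check on the permutation-based distractors: because sample() just
# permutes the "derived" column, a distractor can occasionally coincide with the correct
# derived word of the same item (a fixed point of the permutation), which makes that item
# unsolvable by construction.
sum(candidates1 == Lazaridou2013$derived)
sum(candidates2 == Lazaridou2013$derived)
sum(candidates3 == Lazaridou2013$derived)
# If these counts are > 0, you could re-draw with another seed, or simply keep in mind
# that a handful of items are "broken" by construction.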
# Since we're going to manipulate the data set, let us create a copy.
# (Footnote: this isn't strictly necessary. You can add to or otherwise modify a read-only
# data set loaded from a package; R will automatically create a copy in your workspace under
# the same name, which "shadows" the version from the wordspaceEval package. But this can
# become quite confusing, so it's better to make our own copy to begin with.)
MyLazaridou2013 <- Lazaridou2013
MyLazaridou2013$distractor1 <- candidates1
MyLazaridou2013$distractor2 <- candidates2
MyLazaridou2013$distractor3 <- candidates3

# Let us experiment with word2vec, as it is the largest model we have and state-of-the-art.
# We will lose a bit in performance because w2v contains inflected words (so, two vectors
# for dog and dogs), and we will only be using the singular (the stem) in this case.
# But we will gain in coverage, so we are good for now. Have fun experimenting with other
# models; watch out for coverage though!
load("models/GoogleNews300_wf200k.rda")
dsm <- GoogleNews300_wf200k

# And finally we can call multiple choice, making sure we indicate the right column names
eval.multiple.choice(MyLazaridou2013, dsm, target.name="stem", correct.name="derived",
                     distractor.name="distract")
# not bad... Further check: is the performance comparable in test vs. training?
# And can we use details=TRUE to investigate things a bit deeper?
Lazaridou2013_multiple <- eval.multiple.choice(MyLazaridou2013, dsm, target.name="stem",
                                               correct.name="derived", distractor.name="distract",
                                               details=TRUE)

# First of all, we can now have a look at the mistakes made by our model
# (some are missing items, marked with "Inf")
head(subset(Lazaridou2013_multiple, !correct))  # !correct matches items where the Boolean variable correct is FALSE

# What is also interesting is the "correct.dist" field, which tells us the distance between
# base and derived word. Now, wouldn't it be nice to know whether some affixes show
# smaller/bigger distances? We don't have this information here directly, but we can
# retrieve it from the main data frame, because details=TRUE has obviously kept the order!
# Let's also keep track of the part of speech of the derived word, and of the stem.
Lazaridou2013_multiple$affix <- Lazaridou2013$affix
Lazaridou2013_multiple$derivedPOS <- Lazaridou2013$derivedPOS
Lazaridou2013_multiple$stemPOS <- Lazaridou2013$stemPOS
head(Lazaridou2013_multiple)

# One more step: let us get rid of the Inf cases (remember that from now on we can no
# longer copy columns over from the original data frame, because we have altered the rows)
nrow(Lazaridou2013_multiple)
Lazaridou2013_multiple <- subset(Lazaridou2013_multiple, correct.dist != Inf)
nrow(Lazaridou2013_multiple)

# Now we have a lot of information to play with, and we can go wild ;)
# For example...
boxplot(correct.dist ~ derivedPOS, data=Lazaridou2013_multiple)
boxplot(correct.dist ~ paste(derivedPOS, stemPOS), data=Lazaridou2013_multiple)
boxplot(correct.dist ~ affix, data=Lazaridou2013_multiple)
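# A small extra sketch: besides eyeballing the boxplots, we can tabulate mean accuracy and
# mean base-derived distance per affix directly from the details data frame
# (aggregate() is base R; the affix column was added above).
aggregate(cbind(correct, correct.dist) ~ affix, data=Lazaridou2013_multiple, FUN=mean)
# The "correct" column is logical, so its mean is simply the per-affix accuracy.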
## STEP 2: prediction of similarity ratings and of quality ratings ##

# Remember: annotations are available only for the test set

# Base/derived relatedness score
eval.similarity.correlation(subset(Lazaridou2013, set=="test"), dsm, word1.name="stem",
                            word2.name="derived", score.name="relatedness_score")
# Not so good, and with these vectors we have little room for parameter manipulation.
# How about using rank? (Careful: this is computationally very heavy!)
eval.similarity.correlation(subset(Lazaridou2013, set=="test"), dsm, word1.name="stem",
                            word2.name="derived", score.name="relatedness_score", rank="fwd")
####                                            rho      p.value missing          r     r.lower    r.upper
#### subset(Lazaridou2013, set == "test") 0.2726184 8.405727e-17      65 0.01535270 -0.05004555 0.08061985

eval.similarity.correlation(subset(Lazaridou2013, set=="test"), dsm, word1.name="stem",
                            word2.name="derived", score.name="relatedness_score", rank="avg")
####                                            rho      p.value missing          r     r.lower    r.upper
#### subset(Lazaridou2013, set == "test") 0.2821681 6.176956e-18      65 0.02236349 -0.04304745 0.08758353

# Quality score of the derived vectors (which makes little sense for us, given that it is
# the quality of the vectors the authors used in their original study)
eval.similarity.correlation(subset(Lazaridou2013, set=="test"), dsm, word1.name="stem",
                            word2.name="derived", score.name="quality_score")

## STEP 3: categorization ##

# We can use the eval.clustering function to test how well affixes group together...
eval.clustering(Lazaridou2013, dsm, word.name="derived", class.name="affix")
# ... or whether we can categorize derived words into their part of speech (fewer classes, so it gets easier)
eval.clustering(Lazaridou2013, dsm, word.name="derived", class.name="derivedPOS")
# ... or whether we can categorize stems into their part of speech (slightly easier still,
# because stems tend to be more frequent and thus have better representations)
eval.clustering(Lazaridou2013, dsm, word.name="stem", class.name="stemPOS")
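# A final sketch: eval.clustering also accepts details=TRUE (we used it on ESSLLI08_Nouns
# above), which returns the per-word clustering results, so you can inspect which derived
# words end up in the "wrong" cluster for their part of speech. Check the column names of
# the per-item output with head() before analysing it further.
pos.clusters <- eval.clustering(Lazaridou2013, dsm, word.name="derived",
                                class.name="derivedPOS", details=TRUE)
head(pos.clusters)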