##
## Example code for part 4: DS beyond NLP: Linguistic Theory
##
library(wordspace)
######################################################################
## PART 1: sentence vectors
# Work with the small toy term-term co-occurrence matrix shipped with wordspace
TT <- DSM_TermTermMatrix
print(TT)
# Sentence 1: Cats and dogs need their time (manually lowercased and lemmatized)
s1 <- "cat and dog need their time"
# Sentence 2: Time is the cause not the effect (manually lowercased and lemmatized)
s2 <- "time is the cause not the effect"
# A sentence vector is the centroid (average) of the vectors of its known words
print(context.vectors(TT, s1))
print(context.vectors(TT, s2))
# Sanity check: the 'breed' dimension of the s1 vector should equal the mean
# of the 'breed' entries of the three in-vocabulary words (cat, dog, time)
print(TT['cat', 'breed'])
print(TT['dog', 'breed'])
print(TT['time', 'breed'])
print((TT['cat', 'breed'] + TT['dog', 'breed'] + TT['time', 'breed']) / 3)
# It does :)
# context.vectors() also accepts a vector of sentences; see ?context.vectors
contexts <- context.vectors(TT, c(s1, s2))
contexts <- round(contexts, 2)  # round for readability
print(contexts)
# Give the rows meaningful names ...
rownames(contexts) <- c("s1", "s2")
# ... append the new sentence vectors to the matrix ...
TT <- rbind(TT, contexts)
# ... and inspect the result:
print(TT)
print(nearest.neighbours(TT, c("s1", "s2"), n = 6))
plot(dist.matrix(TT))
######################################################################
# PART 2: compositionality in DSMs
# Try vector addition and element-wise multiplication for adjective/noun
# composition. DSM_Vectors is convenient here because its lemmas carry POS
# tags (so "book_V" vs. "book_N").
head(DSM_Vectors)
# Vecchi et al. used "remarkable onion" as an example of a deviant AN;
# "remarkable_J" is not in DSM_Vectors, so we go for "verbose onion"
# instead, equally funny :)
# First extract the component vectors ...
verbose <- DSM_Vectors['verbose_J', ]
onion <- DSM_Vectors['onion_N', ]
# ... then build the composed representations
vo_add <- verbose + onion
vo_mult <- verbose * onion  # in R, * is the element-wise (Hadamard) product
# To compare them to the vectors in the main space, we have two options.
# Option 1: append both composed vectors to a copy of DSM_Vectors, leaving
# the main model untouched (rbind() names the new rows after the variables,
# i.e. "vo_add" and "vo_mult")
DSM_Vectors_up <- rbind(DSM_Vectors, vo_add, vo_mult)
# ... and look up nearest neighbours in the extended space
nearest.neighbours(DSM_Vectors_up, c("onion_N", "verbose_J", "vo_add", "vo_mult"), n = 10)
# Option 2: keep the composed vectors in a small separate matrix
composed_dsm <- dsm(rbind(vo_add, vo_mult))
# ... and search the main space via the M2 option of nearest.neighbours()
nearest.neighbours(composed_dsm, c("vo_add", "vo_mult"), n = 10, M2 = DSM_Vectors)
# Watch out: if you renormalize the composed-vectors matrix, apply the same
# normalization to the DSM_Vectors matrix you pass as M2.
# Try different input models and other AN examples (e.g. red {blood, wine, square}).
# If you wanted the head of the compound to count more in the output,
# which simple trick could you use?
######################################################################
## BONUS example: disambiguating "bank", based on the two example sentences
## from the lecture slides
bank_1 <- "The broker went to the bank to secure his cash"
bank_2 <- "The river bank was steep and dangerous"
# Load a "real sized" model: word2vec, which is based on inflected word forms
# and has a large vocabulary -- which makes it particularly suitable when we
# want to do only minimal preprocessing
load("/models/GoogleNews300_wf200k.rda")
# As usual, give the model a more "human" name
# NOTE(review): this masks the wordspace function dsm() for the rest of the
# session -- harmless here, but pick another name if you still need dsm()
dsm <- GoogleNews300_wf200k
# Little detour: check the coverage of our sentences in the model
## Sentence 1
# Whitespace tokenization: split the string into a vector of words
bank_1_list <- unlist(strsplit(bank_1, " "))
print(bank_1_list)
# Which tokens are in the model's vocabulary?
in_vocab_1 <- bank_1_list %in% rownames(dsm)
print(in_vocab_1)
# how many?
print(sum(in_vocab_1))
# how many, in proportion to sentence length?
print(sum(in_vocab_1) / length(bank_1_list))
# which ones?
print(bank_1_list[in_vocab_1])
## Sentence 2 (code variation)
bank_2_list <- unlist(strsplit(bank_2, " "))
in_vocab_2 <- bank_2_list %in% rownames(dsm)
print(sum(in_vocab_2) / length(bank_2_list))
# Negating the condition with ! shows directly which words are missing
print(bank_2_list[!in_vocab_2])
# Now build a matrix with the context vectors of our two sentences, using
# the word2vec representations as input
bank_candidates <- context.vectors(dsm, c(bank_1, bank_2))
rownames(bank_candidates) <- c("bank1", "bank2")
# Use the M2 option of the nearest-neighbour function (?nearest.neighbours)
# to find neighbours of each sentence vector in the full embedding matrix 'dsm'
nearest.neighbours(bank_candidates, "bank1", n = 10, M2 = dsm)
nearest.neighbours(bank_candidates, "bank2", n = 10, M2 = dsm)
# The "usual" semantic map for the neighbours of "bank" in word2vec ...
plot(nearest.neighbours(dsm, "bank", n = 10, dist.matrix = TRUE))
# ... and the semantic maps for our sentence vectors "bank1" and "bank2",
# with their nearest neighbours drawn from word2vec
plot(nearest.neighbours(bank_candidates, "bank1", n = 10, M2 = dsm, dist.matrix = TRUE))
plot(nearest.neighbours(bank_candidates, "bank2", n = 10, M2 = dsm, dist.matrix = TRUE))