##
## Bonus material for day 2:
## Illustration of matrix factorization techniques
##
library(wordspace)
######################################################################
## SVD matrix factorization as successive approximation
M <- DSM_TermTermMatrix # we use the unscaled term-term example matrix as an example
M # inspect the raw co-occurrence counts
## R has a built-in function for singular value decomposition (SVD), which can
## be used for smaller toy DSMs. For the huge sparse matrices of real-life DSMs
## the efficient sparsesvd() implementation used by dsm.projection() is needed.
M.svd <- svd(M) # returns a list with elements u, d, v so that M = u %*% diag(d) %*% t(v)
M.svd$u # matrix U of left singular vectors (as columns)
t(M.svd$v) # matrix V' of right singular vectors as rows
M.svd$d # vector of singular values, in decreasing order
diag(M.svd$d) # diagonal matrix Σ of the singular values
## Note that svd() drops the dimnames of M, so the singular vectors are unlabelled.
## Our helper function mk.svd() below adds the row and column labels back.
## Like svd(), but copies the dimnames of M onto the factor matrices, so the
## singular vectors are labelled with the original row and column names of M.
mk.svd <- function (M) {
  decomp <- svd(M)
  rownames(decomp$u) <- rownames(M) # left singular vectors correspond to rows of M
  rownames(decomp$v) <- colnames(M) # right singular vectors correspond to columns of M
  decomp
} # Tip: pressing Ctrl+Enter / Cmd+Enter on this line makes RStudio execute the entire definition
M.svd <- mk.svd(M) # recompute the SVD, this time with labelled singular vectors
M.svd$u # rows are now labelled with the target terms
## Matrix multiplication of U • Σ • V' reconstructs the original matrix M
M1 <- M.svd$u %*% diag(M.svd$d) %*% t(M.svd$v)
round(M1, 3) # ignore small floating-point rounding errors in the result
## We define another helper function for extracting a truncated SVD
## (first r dimensions) or individual singular components from the factorization.
## Reconstruct a low-rank approximation of a factorized matrix (in the list
## format returned by mk.svd() and mk.nmf()) from selected components only.
##   x      = factorization with elements u (left vectors), v (right vectors), d (weights)
##   dims   = indices of the components to include (e.g. 1:2, or a single index)
##   digits = if given, round the reconstructed matrix to this many decimal places
extract <- function(x, dims, digits=NULL) {
  U.part <- x$u[, dims, drop=FALSE] # drop=FALSE keeps matrix shape for a single component
  V.part <- x$v[, dims, drop=FALSE]
  ## compute Ur • Σr • Vr' as (Ur • Σr) • Vr'
  approx <- tcrossprod(scaleMargins(U.part, cols=x$d[dims]), V.part)
  if (!is.null(digits)) approx <- round(approx, digits)
  approx
}
## Compare truncated SVD for r = 1 ... 4 below with original matrix M
## - What general structure in M does the first component (r=1) capture?
## - Notice how the “correction” of the second component overshoots and produces negative cells.
## - Which components capture the co-occurrence patterns of reason, cause, effect?
## - Try the same procedure with the term-context example matrix!
extract(M.svd, 1, digits=1) # truncated SVD with r=1
extract(M.svd, 1:2, digits=1) # truncated SVD with r=2
extract(M.svd, 1:3, digits=1) # truncated SVD with r=3
extract(M.svd, 1:4, digits=1) # truncated SVD with r=4
extract(M.svd, 2, digits=1) # only the second singular component
extract(M.svd, 6, digits=1) # the last components only make small adjustments
## SVD-based projection to latent dimensions
round(crossprod(M.svd$u), 3) # U has orthonormal columns -> U' • U = inner products of columns = I
round(crossprod(M.svd$v), 3) # same for V
## Project M into the first two SVD dimensions (the two expressions below are equivalent)
M %*% M.svd$v[, 1:2] # M • Vr, with Vr = first two right singular vectors
scaleMargins(M.svd$u[, 1:2], cols=M.svd$d[1:2]) # Ur • Σr
dsm.projection(M, n=2, method="svd") # SVD is the default projection method
######################################################################
## Non-negative matrix factorization
## The R package 'NMF' implements several state-of-the-art algorithms for non-negative
## matrix factorization. Installing it requires an extra step:
## 1. install packages 'NMF' and 'BiocManager' via RStudio
## 2. execute this command in the R console: BiocManager::install("Biobase")
library(NMF)
## Most NMF algorithms factorize the input matrix M into
## W = a “tall” matrix (with a specified number of columns = the rank of the factorization)
## H = a “wide” matrix (with number of rows = rank of factorization)
result <- nmf(M, rank=2) # rank-2 NMF approximation of M, using default algorithm (Brunet 2004)
round(fitted(result), 1) # the approximated matrix W • H
W <- basis(result) # extract the “tall” factor W
H <- coef(result) # extract the “wide” factor H
round(W, 1) # the “tall” matrix, with r=2 columns
round(H, 2) # the “wide” matrix, with r=2 rows
round(W %*% H, 1) # M ~ W • H
## Mini-exercise:
## - Did you notice that H has much smaller values than W?
## - Re-run the steps above several times. You will find that W and H are
## different on each run. Can you guess why?
## - Try adding the argument seed="nndsvd" when calling the nmf() function.
## This uses a deterministic SVD-based initialisation for the algorithm.
## In the lecture NMF was described as a factorization M = U • Σ • V' in analogy to SVD,
## with U and V stochastic matrices (whose columns can be seen as probability distributions).
## The helper function below transforms W and H into such a factorization. Notice how "..."
## is used to pass through other arguments (such as method="..." or seed="...") to nmf().
## Convert an NMF fit of M into an SVD-like factorization M ~ U • Σ • V'
## with column-stochastic U and V (columns sum to 1) and component weights d,
## in the same list format as mk.svd(), so extract() works on the result.
## Extra arguments in "..." (e.g. method="...", seed="...") are passed on to nmf().
mk.nmf <- function (M, rank, ...) {
  fit <- nmf(M, rank, ...)
  W <- basis(fit) # the “tall” factor
  H <- coef(fit)  # the “wide” factor
  w.sums <- colNorms(W, method="manhattan") # column sums of W (entries are non-negative)
  h.sums <- rowNorms(H, method="manhattan") # row sums of H
  U <- scaleMargins(W, cols = 1/w.sums)     # normalise columns of W to sum 1
  V <- scaleMargins(t(H), cols = 1/h.sums)  # transpose H, then normalise columns to sum 1
  ## the normalisation constants are absorbed into the diagonal weight matrix Σ
  weights <- w.sums * h.sums
  ord <- order(weights, decreasing=TRUE) # put the heaviest component first
  invisible(list(u=U[, ord, drop=FALSE], v=V[, ord, drop=FALSE], d=weights[ord]))
}
## We can use this representation to study the successive approximation of M by the
## NMF components. Note that it is an approximation “from below” because all values are
## non-negative, i.e. the NMF cannot “overshoot” and then go back with the next component.
M.nmf <- mk.nmf(M, rank=3) # same list structure as mk.svd(), so extract() works here too
extract(M.nmf, 1, digits=1) # first component
extract(M.nmf, 2, digits=1) # second component
extract(M.nmf, 1:2, digits=1) # first two components
extract(M.nmf, 1:3, digits=1) # adding the third component
## Mini-exercise:
## - Can you work out what the contribution of the third component is?
## - Change the rank of the NMF. How does this affect the first two components?
## - Try other algorithms (method="...") and initializations (seed="...").
## Which gives the best approximation to the original co-occurrence matrix?
## - Of course, try everything once more with the term-context matrix.
## From the decomposition in the “standard” form M = U • Σ • V', we can obtain
## a dimensionality reduced version of M as a non-orthogonal projection.
## - How does this projection compare to the SVD-based projection above?
round(scaleMargins(M.nmf$u, cols=M.nmf$d), 1) # U • Σ = reduced coordinates for the target terms
######################################################################
## GloVe
## The 'text2vec' package advertises an implementation of GloVe embeddings (Pennington et al. 2014),
## but it's simply re-exported from 'rsparse' under a different name. Here we use 'rsparse' directly.
library(rsparse)
M.glove <- GloVe$new(rank=3, x_max=1000) # for this matrix, only a large x_max works
M.embed <- M.glove$fit_transform(t(M), n_iter=1000) # GloVe expects target terms as columns, hence t(M)
## the iterative algorithm takes fairly long to converge here, hence n_iter=1000
round(M.embed, 2) # do these embeddings look plausible to you?
## The GloVe implementation does not return a full matrix factorization, only the low-dimensional embeddings.
## Mini-exercise:
## - Read the GloVe paper and try different parameter settings (see ?GloVe for necessary arguments).
## - Try both the term-term and the term-document matrix (with different parameters, esp. x_max).
## - Try visualizing the embeddings (hint: plot(dist.matrix(..., method="euclidean"), method="sammon"))
## - Do the resulting embeddings seem to capture the meanings of the 7 nouns?
## - If you're courageous, try obtaining GloVe embeddings for a larger DSM obtained from DSM_VerbNounTriples_BNC,
## with a sufficiently large dimensionality (e.g. rank=50).