Open
Description
I think it would be fairly easy to add support for the lsa package to tidytext and broom. See example below.
# Put some docs in a vector
library("dplyr")
doc1 <- "pets dog cat ferret"
doc2 <- "sandwiches turkey ham"
doc3 <- "cat ferret cat bird"
doc4 <- "turkey beef sandwiches"
myvector <- c(doc1, doc2, doc3, doc4)
# One row per document. tibble() replaces the deprecated data_frame(),
# and the ids follow the number of documents instead of being hard-coded.
mydf <- tibble(id = seq_along(myvector), text = myvector)
# Create a corpus, tokenize it, and build a document-feature matrix
library("quanteda")
mycorpus <- corpus(mydf, text_field = "text")
mytokens <- tokens(mycorpus)
mydfm <- dfm(mytokens)
# Perform LSA
# lw_logtf(), gw_idf() and lsa() come from the lsa package, so it has to
# be attached here; no other library() call in this script provides them.
library("lsa")
mytdm <- convert(mydfm, to = "lsa")
# Weight the matrix before decomposition: lw_logtf() result times
# gw_idf() result, multiplied element-wise.
mytdm_weighted <- lw_logtf(mytdm) * gw_idf(mytdm)
# Keep two dimensions in the LSA space.
myLSAspace <- lsa(mytdm_weighted, dims = 2)
# Here's how broom::augment could add
# factor scores back to the original data frame.
doc_scores <- myLSAspace$dk
# Name the dimensions V1, V2, ... explicitly when the matrix carries no
# column names, rather than relying on the converter's fallback naming.
if (is.null(colnames(doc_scores))) {
  colnames(doc_scores) <- paste0("V", seq_len(ncol(doc_scores)))
}
# as_tibble() replaces the deprecated as_data_frame().
factor_scores <- as_tibble(doc_scores)
# Outer parentheses print the augmented data frame as well as assigning it.
(augmented <- bind_cols(mydf, factor_scores))
# Here's how tidytext::tidy could tidy the factor loadings
library("tidyverse")
# as.data.frame keeps the terms as row names until
# rownames_to_column moves them into a "term" column.
loadings_tidy <- as.data.frame(myLSAspace$tk) %>%
  rownames_to_column(var = "term") %>%
  # Reshape to one row per (term, factor) pair; pivot_longer() replaces
  # the superseded gather().
  pivot_longer(
    cols = starts_with("V"), # These go into "loading".
    names_to = "factor",
    values_to = "loading"
  ) %>%
  arrange(factor, desc(loading)) %>% # Sort
  select(factor, term, loading) %>% # Change var order to enhance readability.
  # Return a plain data.frame, as before, so print() shows every row.
  as.data.frame()
print(loadings_tidy)