Skip to content

Adding support for latent semantic analysis #46

Open
@BobMuenchen

Description

@BobMuenchen

I think it would be fairly easy to add support for the lsa package to tidytext and broom. See example below.

# Put some example documents in a character vector, one string per document
library("dplyr")
doc1 <- "pets dog cat ferret"
doc2 <- "sandwiches turkey ham"
doc3 <- "cat ferret cat bird"
doc4 <- "turkey beef sandwiches"
myvector <- c(doc1, doc2, doc3, doc4)
# tibble() replaces the deprecated data_frame(); dplyr re-exports it.
# id numbers the documents in the order they appear in myvector.
mydf <- tibble(id = seq_along(myvector), text = myvector)

# Build a quanteda corpus from the "text" column of mydf, split it into
# tokens, and turn the tokens into a document-feature matrix
library("quanteda")
mycorpus <- mydf %>% corpus(text_field = "text")
mytokens <- mycorpus %>% tokens()
mydfm <- mytokens %>% dfm()

# Perform LSA
# lw_logtf(), gw_idf() and lsa() come from the lsa package, so it has to be
# attached before they are called.
library("lsa")
# Convert the quanteda dfm into the matrix format the lsa package works with
mytdm <- convert(mydfm, to = "lsa")
# Weight the matrix: local log term-frequency weight times global
# inverse-document-frequency weight
mytdm_weighted <- lw_logtf(mytdm) * gw_idf(mytdm)
# Build the LSA space with two dimensions
myLSAspace <- lsa(mytdm_weighted, dims = 2)

# Here's how broom::augment could add
# factor scores back to the original data frame
# as_tibble() replaces the deprecated as_data_frame(). Going through
# as.data.frame() first gives default column names (V1, V2, ...) when the
# matrix has none -- presumably the case for $dk; confirm against lsa.
factor_scores <- as_tibble(as.data.frame(myLSAspace$dk))
# The outer parentheses print the result as well as assigning it
(augmented <- bind_cols(mydf, factor_scores))

# Here's how tidytext::tidy could tidy the factor loadings
library("tidyverse")
# as.data.frame is used to maintain row names until
# rownames_to_column can move them into the "term" column
loadings_tidy <- as.data.frame(myLSAspace$tk) %>%
  rownames_to_column(var = "term") %>%
  # pivot_longer() replaces the superseded gather(): the V* columns are
  # stacked into "loading", with the source column name kept in "factor";
  # term is left as an identifier column.
  pivot_longer(
    cols = starts_with("V"),
    names_to = "factor",
    values_to = "loading"
  ) %>%
  arrange(factor, desc(loading)) %>% # Sort
  select(factor, term, loading) %>% # Change var order to enhance readability.
  # Keep a plain data.frame, as before, so print() shows every row.
  as.data.frame()

print(loadings_tidy)

Metadata

Metadata

Assignees

No one assigned

    Labels

    feature: a feature request or enhancement

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions