Skip to content

Commit

Permalink
Show plotting + allow to return vocabulary + biterms when running the…
Browse files Browse the repository at this point in the history
… model
  • Loading branch information
jwijffels committed May 2, 2020
1 parent a87022c commit edd4c7a
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 17 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: BTM
Type: Package
Title: Biterm Topic Models for Short Text
Version: 0.3
Version: 0.3.1
Maintainer: Jan Wijffels <[email protected]>
Authors@R: c(
person('Jan', 'Wijffels', role = c('aut', 'cre', 'cph'), email = '[email protected]', comment = "R wrapper"),
Expand All @@ -21,5 +21,5 @@ LazyData: true
Imports: Rcpp, utils
Suggests: udpipe, data.table
LinkingTo: Rcpp
RoxygenNote: 6.1.1
RoxygenNote: 7.1.0
SystemRequirements: C++11
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
### CHANGES IN BTM VERSION 0.3.1

- Allow in BTM to return as well the biterms as the vocabulary using an extra argument called detailed. For compatibility with the textplot package.
- Added in the docs examples of plotting but put these in dontrun blocks to avoid package dependencies

### CHANGES IN BTM VERSION 0.3

- Allow to provide your own set of biterms to cluster upon
Expand Down
51 changes: 46 additions & 5 deletions R/btm.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#' Note that doc_id's which are not in \code{data} are not allowed, as well as terms (in term1 and term2) which are not also in \code{data}.
#' See the examples.\cr
#' If provided, the \code{window} argument is ignored and the \code{data} argument will only be used to calculate the background word frequency distribution.
#' @param detailed logical indicating to return detailed output containing as well the vocabulary and the biterms used to construct the model. Defaults to FALSE.
#' @note
#' A biterm is defined as a pair of words co-occurring in the same text window.
#' If you have as an example a document with sequence of words \code{'A B C B'}, and assuming the window size is set to 3,
Expand All @@ -56,6 +57,8 @@
#' \item{theta: a vector with the topic probability p(z) which is determinated by the overall proportions of biterms in it}
#' \item{phi: a matrix of dimension W x K with one row for each token in the data. This matrix contains the probability of the token given the topic P(w|z).
#' the rownames of the matrix indicate the token w}
#' \item{vocab: a data.frame with columns token and freq indicating the frequency of occurrence of the tokens in \code{data}. Only provided in case argument \code{detailed} is set to \code{TRUE}}
#' \item{biterms: the result of a call to \code{terms} with type set to biterms, containing all the biterms used in the model. Only provided in case argument \code{detailed} is set to \code{TRUE}}
#' }
#' @export
#' @seealso \code{\link{predict.BTM}}, \code{\link{terms.BTM}}, \code{\link{logLik.BTM}}
Expand Down Expand Up @@ -94,16 +97,39 @@
#' x <- subset(x, xpos %in% c("NN", "NNP", "NNS", "JJ"))
#' x <- x[, c("doc_id", "lemma")]
#' model <- BTM(x, k = 5, beta = 0.01, iter = 10, background = TRUE,
#' biterms = biterms, trace = 10)
#' biterms = biterms, trace = 10, detailed = TRUE)
#' model
#' terms(model)
#' bitermset <- terms(model, "biterms")
#' head(bitermset$biterms, 100)
#'
#' bitermset$n
#' sum(biterms$cooc)
#'
#'
#' \dontrun{
#' ##
#' ## Visualisation either using the textplot or the LDAvis package
#' ##
#' library(textplot)
#' library(ggraph)
#' library(concaveman)
#' plot(model, top_n = 4)
#'
#' library(LDAvis)
#' docsize <- table(x$doc_id)
#' scores <- predict(model, x)
#' scores <- scores[names(docsize), ]
#' json <- createJSON(
#' phi = t(model$phi),
#' theta = scores,
#' doc.length = as.integer(docsize),
#' vocab = model$vocabulary$token,
#' term.frequency = model$vocabulary$freq)
#' serVis(json)
#' }
BTM <- function(data, k = 5, alpha = 50/k, beta = 0.01, iter = 1000, window = 15, background = FALSE, trace = FALSE,
biterms){
biterms, detailed = FALSE){
trace <- as.integer(trace)
background <- as.integer(as.logical(background))
stopifnot(k >= 1)
Expand All @@ -122,7 +148,18 @@ BTM <- function(data, k = 5, alpha = 50/k, beta = 0.01, iter = 1000, window = 15
data <- data[!is.na(data$doc_id) & !is.na(data$token), ]
## Convert tokens to integer numbers which need to be pasted into a string separated by spaces
data$word <- factor(data$token)
vocabulary <- data.frame(id = seq_along(levels(data$word)) - 1L, token = levels(data$word), stringsAsFactors = FALSE)
if(detailed){
freq <- table(data$word)
freq <- as.data.frame(freq, responseName = "freq", stringsAsFactors = FALSE)
vocabulary <- data.frame(id = seq_along(levels(data$word)) - 1L,
token = levels(data$word),
freq = freq$freq[match(levels(data$word), freq$Var1)],
stringsAsFactors = FALSE)
}else{
vocabulary <- data.frame(id = seq_along(levels(data$word)) - 1L,
token = levels(data$word),
stringsAsFactors = FALSE)
}
data$word <- as.integer(data$word) - 1L

voc <- max(data$word) + 1
Expand Down Expand Up @@ -163,10 +200,14 @@ BTM <- function(data, k = 5, alpha = 50/k, beta = 0.01, iter = 1000, window = 15

## build the model
model <- btm(biterms = biterms, x = context, K = k, W = voc, alpha = alpha, beta = beta, iter = iter, win = window, background = background, trace = as.integer(trace))

## make sure integer numbers are back tokens again
rownames(model$phi) <- vocabulary$token
## also include vocabulary
class(model) <- "BTM"
if(detailed){
model$vocabulary <- vocabulary[c("token", "freq")]
model$biterms <- terms.BTM(model, type = "biterms")
}
model
}

Expand Down Expand Up @@ -315,7 +356,7 @@ terms.BTM <- function(x, type = c("tokens", "biterms"), threshold = 0, top_n = 5
#' x <- subset(x, xpos %in% c("NN", "NNP", "NNS"))
#' x <- x[, c("doc_id", "lemma")]
#'
#' model <- BTM(x, k = 5, iter = 5, trace = TRUE)
#' model <- BTM(x, k = 5, iter = 5, trace = TRUE, detailed = TRUE)
#' fit <- logLik(model)
#' fit$ll
logLik.BTM <- function(object, data = terms.BTM(object, type = 'biterms')$biterms, ...){
Expand Down
43 changes: 40 additions & 3 deletions man/BTM.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions man/logLik.BTM.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/predict.BTM.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/terms.BTM.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit edd4c7a

Please sign in to comment.