From 99a0ca80f983ae78371fa33166eb2760e0dec270 Mon Sep 17 00:00:00 2001 From: joran Date: Thu, 28 Mar 2024 19:01:45 -0600 Subject: [PATCH 1/4] spelling and grammar corrections --- R/clean_levels.R | 2 +- R/clean_names.R | 2 +- R/dummy_hash.R | 4 ++-- R/lda.R | 6 +++--- R/lemma.R | 2 +- R/ngram.R | 8 ++++---- R/pos_filter.R | 2 +- R/sequence_onehot.R | 4 ++-- R/show_tokens.R | 4 ++-- R/stem.R | 2 +- R/stopwords.R | 6 +++--- R/text_normalization.R | 2 +- R/textfeature.R | 2 +- R/texthash.R | 2 +- R/tf.R | 6 +++--- R/tfidf.R | 2 +- R/tokenfilter.R | 4 ++-- R/tokenize.R | 10 +++++----- R/tokenize_bpe.R | 2 +- R/tokenize_sentencepiece.R | 2 +- R/tokenize_wordpiece.R | 2 +- R/tokenmerge.R | 2 +- R/untokenize.R | 2 +- R/word_embeddings.R | 2 +- 24 files changed, 41 insertions(+), 41 deletions(-) diff --git a/R/clean_levels.R b/R/clean_levels.R index 6967bb25..b1920aa0 100644 --- a/R/clean_levels.R +++ b/R/clean_levels.R @@ -24,7 +24,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `orginal`, `value`, and `id`: #' #' \describe{ diff --git a/R/clean_names.R b/R/clean_names.R index b0faf14d..097d6e92 100644 --- a/R/clean_names.R +++ b/R/clean_names.R @@ -19,7 +19,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `value`, and `id`: #' #' \describe{ diff --git a/R/dummy_hash.R b/R/dummy_hash.R index 0109b695..3cfc8aad 100644 --- a/R/dummy_hash.R +++ b/R/dummy_hash.R @@ -36,7 +36,7 @@ #' The argument `num_terms` controls the number of indices that the hashing #' function will map to. This is the tuning parameter for this transformation. #' Since the hashing function can map two different tokens to the same index, -#' will a higher value of `num_terms` result in a lower chance of collision. +#' a higher value of `num_terms` will result in a lower chance of collision. #' #' @template details-prefix #' @@ -44,7 +44,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `value`, `num_terms`, `collapse`, and `id`: #' #' \describe{ diff --git a/R/lda.R b/R/lda.R index 97651691..c16cb311 100644 --- a/R/lda.R +++ b/R/lda.R @@ -9,10 +9,10 @@ #' @template args-trained #' @template args-columns #' @param lda_models A WarpLDA model object from the text2vec package. If left -#' to NULL, the default, will it train its model based on the training data. +#' to NULL, the default, it will train its model based on the training data. #' Look at the examples for how to fit a WarpLDA model. #' @param num_topics integer desired number of latent topics. -#' @param prefix A prefix for generated column names, default to "lda". +#' @param prefix A prefix for generated column names, defaults to "lda". 
#' @template args-keep_original_cols #' @template args-skip #' @template args-id @@ -21,7 +21,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `num_topics`, and `id`: #' #' \describe{ diff --git a/R/lemma.R b/R/lemma.R index cd0f2cb2..c537f8cb 100644 --- a/R/lemma.R +++ b/R/lemma.R @@ -23,7 +23,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms` and `id`: #' #' \describe{ diff --git a/R/ngram.R b/R/ngram.R index 3874ffa0..6fa8d1c0 100644 --- a/R/ngram.R +++ b/R/ngram.R @@ -23,14 +23,14 @@ #' @details #' #' The use of this step will leave the ordering of the tokens meaningless. If -#' `min_num_tokens < num_tokens` then the tokens order in increasing fashion -#' with respect to the number of tokens in the n-gram. If `min_num_tokens = 1` -#' and `num_tokens = 3` then the output contains all the 1-grams followed by all +#' `min_num_tokens < num_tokens` then the tokens will be ordered in increasing +#' fashion with respect to the number of tokens in the n-gram. If `min_num_tokens = 1` +#' and `num_tokens = 3` then the output will contain all the 1-grams followed by all #' the 2-grams followed by all the 3-grams. #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms` and `id`: #' #' \describe{ diff --git a/R/pos_filter.R b/R/pos_filter.R index 54aaba96..269188a7 100644 --- a/R/pos_filter.R +++ b/R/pos_filter.R @@ -25,7 +25,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms` and `id`: #' #' \describe{ diff --git a/R/sequence_onehot.R b/R/sequence_onehot.R index e4015ec4..3eadd07c 100644 --- a/R/sequence_onehot.R +++ b/R/sequence_onehot.R @@ -33,12 +33,12 @@ #' #' The string will be capped by the sequence_length argument, strings shorter #' then sequence_length will be padded with empty characters. The encoding will -#' assign a integer to each character in the vocabulary, and will encode +#' assign an integer to each character in the vocabulary, and will encode #' accordingly. Characters not in the vocabulary will be encoded as 0. #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `vocabulary`, `token`, and `id`: #' #' \describe{ diff --git a/R/show_tokens.R b/R/show_tokens.R index 4d90bb8e..bcbfb6d8 100644 --- a/R/show_tokens.R +++ b/R/show_tokens.R @@ -1,7 +1,7 @@ #' Show token output of recipe #' -#' Returns the tokens as a list of character vector of a recipe. This function -#' can be useful for diagnostics doing recipe construction but should not be +#' Returns the tokens as a list of character vectors of a recipe. This function +#' can be useful for diagnostics during recipe construction but should not be #' used in final recipe steps. Note that this function will both prep() and #' bake() the recipe it is used on. 
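As an illustrative aside to the `show_tokens()` description above (not part of the patch): a minimal usage sketch, assuming the `tate_text` data from modeldata and its `medium` column, which are used elsewhere in this series.

```r
library(recipes)
library(textrecipes)
library(modeldata)
data("tate_text")

# show_tokens() preps and bakes the recipe internally and returns the
# tokens of `medium` as a list of character vectors, for inspection only.
recipe(~ medium, data = tate_text) %>%
  step_tokenize(medium) %>%
  show_tokens(medium)
```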
#' diff --git a/R/stem.R b/R/stem.R index ff501174..c49b4bde 100644 --- a/R/stem.R +++ b/R/stem.R @@ -28,7 +28,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `is_custom_stemmer`, and `id`: #' #' \describe{ diff --git a/R/stopwords.R b/R/stopwords.R index 24f5ded6..a00b4eb8 100644 --- a/R/stopwords.R +++ b/R/stopwords.R @@ -23,18 +23,18 @@ #' #' @details #' -#' Stop words are words which sometimes are remove before natural language +#' Stop words are words which sometimes are removed before natural language #' processing tasks. While stop words usually refers to the most common words in #' the language there is no universal stop word list. #' #' The argument `custom_stopword_source` allows you to pass a character vector -#' to filter against. With the `keep` argument one can specify to keep the words +#' to filter against. With the `keep` argument one can specify words to keep #' instead of removing thus allowing you to select words with a combination of #' these two arguments. #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `value`, `keep`, and `id`: #' #' \describe{ diff --git a/R/text_normalization.R b/R/text_normalization.R index 5f08a5e7..9706d05f 100644 --- a/R/text_normalization.R +++ b/R/text_normalization.R @@ -21,7 +21,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `normalization_form`, and `id`: #' #' \describe{ diff --git a/R/textfeature.R b/R/textfeature.R index d949fc64..3c5aac53 100644 --- a/R/textfeature.R +++ b/R/textfeature.R @@ -29,7 +29,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `functions`, and `id`: #' #' \describe{ diff --git a/R/texthash.R b/R/texthash.R index b0f26f93..48a549fa 100644 --- a/R/texthash.R +++ b/R/texthash.R @@ -37,7 +37,7 @@ #' #' @details # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, value and `id`: #' #' \describe{ diff --git a/R/tf.R b/R/tf.R index 85071390..f1502632 100644 --- a/R/tf.R +++ b/R/tf.R @@ -33,12 +33,12 @@ #' issues. A good strategy is to start with a low token count and go up #' according to how much RAM you want to use. #' -#' Term frequency is a weight of how many times each token appear in each +#' Term frequency is a weight of how many times each token appears in each #' observation. There are different ways to calculate the weight and this step #' can do it in a couple of ways. Setting the argument `weight_scheme` to #' "binary" will result in a set of binary variables denoting if a token is #' present in the observation. "raw count" will count the times a token is -#' present in the observation. "term frequency" will divide the count with the +#' present in the observation. 
"term frequency" will divide the count by the #' total number of words in the document to limit the effect of the document #' length as longer documents tends to have the word present more times but not #' necessarily at a higher percentage. "log normalization" takes the log of 1 @@ -54,7 +54,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `value`, and `id`: #' #' \describe{ diff --git a/R/tfidf.R b/R/tfidf.R index 92b848f4..6c1e4c1f 100644 --- a/R/tfidf.R +++ b/R/tfidf.R @@ -51,7 +51,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `token`, `weight`, and `id`: #' #' \describe{ diff --git a/R/tokenfilter.R b/R/tokenfilter.R index b909c0c6..dc0e8eea 100644 --- a/R/tokenfilter.R +++ b/R/tokenfilter.R @@ -29,7 +29,7 @@ #' #' @details #' -#' This step allow you to limit the tokens you are looking at by filtering on +#' This step allows you to limit the tokens you are looking at by filtering on #' their occurrence in the corpus. You are able to exclude tokens if they appear #' too many times or too few times in the data. It can be specified as counts #' using `max_times` and `min_times` or as percentages by setting `percentage` @@ -44,7 +44,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `value`, and `id`: #' #' \describe{ diff --git a/R/tokenize.R b/R/tokenize.R index fabd1b37..1044d2bd 100644 --- a/R/tokenize.R +++ b/R/tokenize.R @@ -29,16 +29,16 @@ #' options(width = 55) #' ``` #' -#' Tokenization is the act of splitting a character string into smaller parts to +#' Tokenization is the act of splitting a character vector into smaller parts to #' be further analyzed. This step uses the `tokenizers` package which includes #' heuristics on how to to split the text into paragraphs tokens, word tokens, #' among others. `textrecipes` keeps the tokens as a [`token`][tokenlist()] #' variable and other steps will do their tasks on those [`token`][tokenlist()] -#' variable before transforming them back to numeric variables. +#' variables before transforming them back to numeric variables. #' -#' Working will `textrecipes` will almost always start by calling +#' Working with `textrecipes` will almost always start by calling #' `step_tokenize` followed by modifying and filtering steps. This is not always -#' the case as you sometimes want to do apply pre-tokenization steps, this can +#' the case as you sometimes want to apply pre-tokenization steps; this can #' be done with [recipes::step_mutate()]. 
#' #' # Engines @@ -182,7 +182,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `value`, and `id`: #' #' \describe{ diff --git a/R/tokenize_bpe.R b/R/tokenize_bpe.R index f2cd5a00..93714b43 100644 --- a/R/tokenize_bpe.R +++ b/R/tokenize_bpe.R @@ -23,7 +23,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms` and `id`: #' #' \describe{ diff --git a/R/tokenize_sentencepiece.R b/R/tokenize_sentencepiece.R index 5b93deea..4987b4fc 100644 --- a/R/tokenize_sentencepiece.R +++ b/R/tokenize_sentencepiece.R @@ -28,7 +28,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms` and `id`: #' #' \describe{ diff --git a/R/tokenize_wordpiece.R b/R/tokenize_wordpiece.R index 69c349d6..48d8996f 100644 --- a/R/tokenize_wordpiece.R +++ b/R/tokenize_wordpiece.R @@ -22,7 +22,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms` and `id`: #' #' \describe{ diff --git a/R/tokenmerge.R b/R/tokenmerge.R index fbbb04d4..fb95f30b 100644 --- a/R/tokenmerge.R +++ b/R/tokenmerge.R @@ -20,7 +20,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms` and `id`: #' #' \describe{ diff --git a/R/untokenize.R b/R/untokenize.R index 47b3a86a..024c440d 100644 --- a/R/untokenize.R +++ b/R/untokenize.R @@ -23,7 +23,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `value`, and `id`: #' #' \describe{ diff --git a/R/word_embeddings.R b/R/word_embeddings.R index a35e815f..3f9ba172 100644 --- a/R/word_embeddings.R +++ b/R/word_embeddings.R @@ -44,7 +44,7 @@ #' #' # Tidying #' -#' When you [`tidy()`][tidy.recipe()] this step, a tibble is retruned with +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with #' columns `terms`, `embedding_rows`, `aggregation`, and `id`: #' #' \describe{ From 9cf51c92e74eb24e501730bb2b4d00cf0a9d0656 Mon Sep 17 00:00:00 2001 From: joran Date: Thu, 28 Mar 2024 19:18:30 -0600 Subject: [PATCH 2/4] default to -> defaults to --- R/sequence_onehot.R | 2 +- R/textfeature.R | 4 ++-- R/tokenmerge.R | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/sequence_onehot.R b/R/sequence_onehot.R index 3eadd07c..ae75ecee 100644 --- a/R/sequence_onehot.R +++ b/R/sequence_onehot.R @@ -18,7 +18,7 @@ #' @param vocabulary A character vector, characters to be mapped to integers. #' Characters not in the vocabulary will be encoded as 0. Defaults to #' `letters`. -#' @param prefix A prefix for generated column names, default to "seq1hot". +#' @param prefix A prefix for generated column names, defaults to "seq1hot". 
#' @template args-keep_original_cols #' @template args-skip #' @template args-id diff --git a/R/textfeature.R b/R/textfeature.R index 3c5aac53..bc7ec9e1 100644 --- a/R/textfeature.R +++ b/R/textfeature.R @@ -9,8 +9,8 @@ #' @template args-trained #' @template args-columns #' @param extract_functions A named list of feature extracting functions. -#' default to `count_functions`. See details for more information. -#' @param prefix A prefix for generated column names, default to "textfeature". +#' Defaults to `count_functions`. See details for more information. +#' @param prefix A prefix for generated column names, defaults to "textfeature". #' @template args-keep_original_cols #' @template args-skip #' @template args-id diff --git a/R/tokenmerge.R b/R/tokenmerge.R index fb95f30b..2f4738ad 100644 --- a/R/tokenmerge.R +++ b/R/tokenmerge.R @@ -9,7 +9,7 @@ #' @template args-role_predictors #' @template args-trained #' @template args-columns -#' @param prefix A prefix for generated column names, default to "tokenmerge". +#' @param prefix A prefix for generated column names, defaults to "tokenmerge". #' @template args-keep_original_cols #' @template args-skip #' @template args-id From a484ce1d0549579f8a623005ec6854dd3bb9003a Mon Sep 17 00:00:00 2001 From: joran Date: Thu, 28 Mar 2024 19:19:17 -0600 Subject: [PATCH 3/4] typos and grammar --- vignettes/Working-with-n-grams.Rmd | 12 ++++++------ ...k---using-more-complex-recipes-involving-text.Rmd | 8 ++++---- vignettes/tokenlist.Rmd | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/vignettes/Working-with-n-grams.Rmd b/vignettes/Working-with-n-grams.Rmd index 141ad130..4564d9b9 100644 --- a/vignettes/Working-with-n-grams.Rmd +++ b/vignettes/Working-with-n-grams.Rmd @@ -26,7 +26,7 @@ If you want to use n-grams with textrecipes you have 2 options: Both of these methods come with pros and cons so it will be worthwhile for you to be aware of both. -before we get started let's make sure we are on the same page of what we mean when we are talking about n-grams. We normally tokenize our text into words, which we can do with `tokenize_words()` from the tokenizers package (this is the default engine and token for `step_tokenize()` in textrecipes) +Before we get started let's make sure we are on the same page of what we mean when we are talking about n-grams. We normally tokenize our text into words, which we can do with `tokenize_words()` from the tokenizers package (this is the default engine and token for `step_tokenize()` in textrecipes) ```{r} abc <- c( @@ -37,14 +37,14 @@ abc <- c( tokenize_words(abc) ``` -N-grams are a contiguous sequence of n tokens. So to get 2-gram (or bigrams as they are also called) we can use the `tokenize_ngrams()` function to get them +N-grams are a contiguous sequence of n tokens. So to get 2-grams (or bigrams as they are also called) we can use the `tokenize_ngrams()` function to get them ```{r} tokenize_ngrams(abc, n = 2) ``` Notice how the words appear in multiple n-grams as the window slides across them. -And why changing the `n` argument we can any kind of n-gram (notice how `n = 1` is the special case of tokenizing to words). +And by changing the `n` argument we can get any kind of n-gram (notice how `n = 1` is the special case of tokenizing to words). 
```{r} tokenize_ngrams(abc, n = 3) @@ -60,10 +60,10 @@ tokenize_ngrams(abc, n = 3, ngram_delim = "_") # Only using `step_tokenize()` -The first methods work by using n-gram `token` from one of the built-in engine in `step_tokenize()` to get a full list of available tokens type `?step_tokenize()` and go down to `Details`. +The first method works by using n-gram `token` from one of the built-in engines in `step_tokenize()`. To get a full list of available tokens type `?step_tokenize()` and go down to `Details`. We can use the `token="ngrams"` along with `engine = "tokenizers"`(the default) to tokenize to n-grams. We finish this `recipe()` with `step_tokenfilter()` and `step_tf()`. -The filtering doesn't do anything to the data of this size but it is a good practice to use `step_tokenfilter()` before using `step_tf()` or `step_tfidf()` to control the size of the resulting data.frame. +The filtering doesn't do anything to data of this size but it is a good practice to use `step_tokenfilter()` before using `step_tf()` or `step_tfidf()` to control the size of the resulting data.frame. ```{r} abc_tibble <- tibble(text = abc) @@ -134,7 +134,7 @@ Pros: Cons: -- Minimal flexibility, (`tokenizers::tokenize_ngrams()` don't let you control how the words are tokenized.) +- Minimal flexibility, (`tokenizers::tokenize_ngrams()` doesn't let you control how the words are tokenized.) - You are not able to tune the number of tokens in your n-gram # Using `step_tokenize()` and `step_ngram()` diff --git a/vignettes/cookbook---using-more-complex-recipes-involving-text.Rmd b/vignettes/cookbook---using-more-complex-recipes-involving-text.Rmd index e0b3bbb6..b14b2c5b 100644 --- a/vignettes/cookbook---using-more-complex-recipes-involving-text.Rmd +++ b/vignettes/cookbook---using-more-complex-recipes-involving-text.Rmd @@ -18,7 +18,7 @@ knitr::opts_chunk$set( Working to get textual data converted into numerical can be done in many different ways. The steps included in `textrecipes` should hopefully give you the flexibility to perform most of your desired text preprocessing tasks. This vignette will showcase examples that combine multiple steps. -This vignette will not do any modeling with the processed text as its purpose it to showcase the flexibility and modularity. Therefore the only packages needed will be `recipes` and `textrecipes`. Examples will be performed on the `tate_text` data-set which is packaged with `modeldata`. +This vignette will not do any modeling with the processed text as its purpose it to showcase flexibility and modularity. Therefore the only packages needed will be `recipes` and `textrecipes`. Examples will be performed on the `tate_text` data-set which is packaged with `modeldata`. ```{r, message=FALSE} library(recipes) @@ -29,7 +29,7 @@ data("tate_text") ## Counting select words -Sometimes it is enough to know the counts of a handful of specific words. This can be easily be achieved by using the arguments `custom_stopword_source` and `keep = TRUE` in `step_stopwords`. +Sometimes it is enough to know the counts of a handful of specific words. This can be easily achieved using the arguments `custom_stopword_source` and `keep = TRUE` in `step_stopwords`. ```{r} words <- c("or", "and", "on") @@ -91,9 +91,9 @@ bake(okc_obj, tate_text) %>% ## TF-IDF of ngrams of stemmed tokens -Sometimes fairly complicated computations. Here we would like the term frequency inverse document frequency (TF-IDF) of the most common 500 ngrams done on stemmed tokens. 
It is quite a handful and would seldom be included as a option in most other libraries. But the modularity of `textrecipes` makes this task fairly easy. +Sometimes fairly complicated computations are needed. Here we would like the term frequency inverse document frequency (TF-IDF) of the most common 500 ngrams done on stemmed tokens. It is quite a handful and would seldom be included as an option in most other libraries. But the modularity of `textrecipes` makes this task fairly easy. -First we will tokenize according to words, then stemming those words. We will then paste together the stemmed tokens using `step_untokenize` so we are back at string that we then tokenize again but this time using the ngram tokenizers. Lastly just filtering and tfidf as usual. +First we will tokenize according to words, then stem those words. We will then paste together the stemmed tokens using `step_untokenize` so we are back at strings that we then tokenize again but this time using the ngram tokenizers. Lastly just filtering and tfidf as usual. ```{r} okc_rec <- recipe(~., data = tate_text) %>% diff --git a/vignettes/tokenlist.Rmd b/vignettes/tokenlist.Rmd index 0e25922e..95358799 100644 --- a/vignettes/tokenlist.Rmd +++ b/vignettes/tokenlist.Rmd @@ -18,7 +18,7 @@ knitr::opts_chunk$set( library(textrecipes) ``` -**textrecipes** have been using lists of character vectors to carry around the tokens. A simple S3 vector class has been implemented with the **vctrs** package to handle that list of tokens, henceforth to be known as a `tokenlist`. +**textrecipes** has been using lists of character vectors to carry around the tokens. A simple S3 vector class has been implemented with the **vctrs** package to handle that list of tokens, henceforth to be known as a `tokenlist`. If you are only using this package for preprocessing then you most likely won't even notice that this change has happened. However if you are thinking of contributing to **textrecipes** then knowing about `tokenlist`s will be essential. @@ -33,4 +33,4 @@ This attribute is calculated automatically when using `tokenlist()`. If a functi Both the `lemma` and `pos` attribute are used in the same way. They default to `NULL` but can be filled depending on which engine is being used in `step_tokenize()`. The attribute is a list of characters in the exact shape and size as the tokenlist and should have a one-to-one relationship. -If a specific element is removed in the tokenlist then the corresponding element in `lemma` and `pos` should be removed. +If a specific element is removed from the tokenlist then the corresponding element in `lemma` and `pos` should be removed. 
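A small sketch of the class the tokenlist vignette describes (illustrative only, not part of the patch; it assumes `tokenlist()` is exported and accepts a plain list of character vectors as its first argument, as the vignette implies).

```r
library(textrecipes)

# A tokenlist wraps a list of character vectors; the unique tokens are
# computed automatically and carried along as an attribute for later steps.
tkn <- tokenlist(list(c("hello", "world"), c("hello", "there")))
tkn
```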
From 6f5fccdc19b4c8a20942d6b16c05a67e6787d152 Mon Sep 17 00:00:00 2001 From: joran Date: Thu, 28 Mar 2024 19:19:53 -0600 Subject: [PATCH 4/4] re-oxygenize --- man/show_tokens.Rd | 4 ++-- man/step_clean_levels.Rd | 2 +- man/step_clean_names.Rd | 2 +- man/step_dummy_hash.Rd | 4 ++-- man/step_lda.Rd | 6 +++--- man/step_lemma.Rd | 2 +- man/step_ngram.Rd | 8 ++++---- man/step_pos_filter.Rd | 2 +- man/step_sequence_onehot.Rd | 6 +++--- man/step_stem.Rd | 2 +- man/step_stopwords.Rd | 6 +++--- man/step_text_normalization.Rd | 2 +- man/step_textfeature.Rd | 6 +++--- man/step_texthash.Rd | 2 +- man/step_tf.Rd | 6 +++--- man/step_tfidf.Rd | 2 +- man/step_tokenfilter.Rd | 4 ++-- man/step_tokenize.Rd | 10 +++++----- man/step_tokenize_bpe.Rd | 2 +- man/step_tokenize_sentencepiece.Rd | 2 +- man/step_tokenize_wordpiece.Rd | 2 +- man/step_tokenmerge.Rd | 4 ++-- man/step_untokenize.Rd | 2 +- man/step_word_embeddings.Rd | 2 +- 24 files changed, 45 insertions(+), 45 deletions(-) diff --git a/man/show_tokens.Rd b/man/show_tokens.Rd index 78db5b13..4a6d3177 100644 --- a/man/show_tokens.Rd +++ b/man/show_tokens.Rd @@ -17,8 +17,8 @@ show_tokens(rec, var, n = 6L) A list of character vectors } \description{ -Returns the tokens as a list of character vector of a recipe. This function -can be useful for diagnostics doing recipe construction but should not be +Returns the tokens as a list of character vectors of a recipe. This function +can be useful for diagnostics during recipe construction but should not be used in final recipe steps. Note that this function will both prep() and bake() the recipe it is used on. } diff --git a/man/step_clean_levels.Rd b/man/step_clean_levels.Rd index 3cf85f6f..04ec607c 100644 --- a/man/step_clean_levels.Rd +++ b/man/step_clean_levels.Rd @@ -56,7 +56,7 @@ data to be processed contains novel levels (i.e., not contained in the training set), they are converted to missing. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{orginal}, \code{value}, and \code{id}: \describe{ diff --git a/man/step_clean_names.Rd b/man/step_clean_names.Rd index 3ea11ff9..efae42a2 100644 --- a/man/step_clean_names.Rd +++ b/man/step_clean_names.Rd @@ -50,7 +50,7 @@ clean variable names so the names consist only of letters, numbers, and the underscore. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{value}, and \code{id}: \describe{ diff --git a/man/step_dummy_hash.Rd b/man/step_dummy_hash.Rd index 83c81b08..09d42a53 100644 --- a/man/step_dummy_hash.Rd +++ b/man/step_dummy_hash.Rd @@ -85,7 +85,7 @@ using the MurmurHash3 method. The argument \code{num_terms} controls the number of indices that the hashing function will map to. This is the tuning parameter for this transformation. Since the hashing function can map two different tokens to the same index, -will a higher value of \code{num_terms} result in a lower chance of collision. +a higher value of \code{num_terms} will result in a lower chance of collision. The new components will have names that begin with \code{prefix}, then the name of the variable, followed by the tokens all separated by @@ -95,7 +95,7 @@ the name of the variable, followed by the tokens all separated by \code{hash001} - \code{hash101}. 
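For reference, a hedged sketch of the hashing step this page documents (not part of the patch; `tate_text` and its `artist` factor are assumptions, and the text2vec backend may need to be installed).

```r
library(recipes)
library(textrecipes)
library(modeldata)
data("tate_text")

# Hash the artist factor into 32 indicator-style columns; raising
# num_terms lowers the chance of two levels colliding on one index.
recipe(~ artist, data = tate_text) %>%
  step_dummy_hash(artist, num_terms = 32) %>%
  prep() %>%
  bake(new_data = NULL)
```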
} \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{value}, \code{num_terms}, \code{collapse}, and \code{id}: \describe{ diff --git a/man/step_lda.Rd b/man/step_lda.Rd index 20ae5ebf..6447c422 100644 --- a/man/step_lda.Rd +++ b/man/step_lda.Rd @@ -43,12 +43,12 @@ be populated (eventually) by the \code{terms} argument. This is \code{NULL} until the step is trained by \code{\link[recipes:prep]{recipes::prep.recipe()}}.} \item{lda_models}{A WarpLDA model object from the text2vec package. If left -to NULL, the default, will it train its model based on the training data. +to NULL, the default, it will train its model based on the training data. Look at the examples for how to fit a WarpLDA model.} \item{num_topics}{integer desired number of latent topics.} -\item{prefix}{A prefix for generated column names, default to "lda".} +\item{prefix}{A prefix for generated column names, defaults to "lda".} \item{keep_original_cols}{A logical to keep the original variables in the output. Defaults to \code{FALSE}.} @@ -70,7 +70,7 @@ to the sequence of existing steps (if any). lda dimension estimates of a text variable. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{num_topics}, and \code{id}: \describe{ diff --git a/man/step_lemma.Rd b/man/step_lemma.Rd index 398cf22c..9735a5a2 100644 --- a/man/step_lemma.Rd +++ b/man/step_lemma.Rd @@ -57,7 +57,7 @@ lemmatization. Currently using the \code{"spacyr"} engine in \code{\link[=step_t provides lemmatization and works well with \code{step_lemma}. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms} and \code{id}: \describe{ diff --git a/man/step_ngram.Rd b/man/step_ngram.Rd index 9bbfcbd3..08c443af 100644 --- a/man/step_ngram.Rd +++ b/man/step_ngram.Rd @@ -64,13 +64,13 @@ ngrams. } \details{ The use of this step will leave the ordering of the tokens meaningless. If -\code{min_num_tokens < num_tokens} then the tokens order in increasing fashion -with respect to the number of tokens in the n-gram. If \code{min_num_tokens = 1} -and \code{num_tokens = 3} then the output contains all the 1-grams followed by all +\code{min_num_tokens < num_tokens} then the tokens will be ordered in increasing +fashion with respect to the number of tokens in the n-gram. If \code{min_num_tokens = 1} +and \code{num_tokens = 3} then the output will contain all the 1-grams followed by all the 2-grams followed by all the 3-grams. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms} and \code{id}: \describe{ diff --git a/man/step_pos_filter.Rd b/man/step_pos_filter.Rd index 493bc213..f0f3ae3e 100644 --- a/man/step_pos_filter.Rd +++ b/man/step_pos_filter.Rd @@ -61,7 +61,7 @@ information look here \url{https://github.com/explosion/spaCy/blob/master/spacy/glossary.py}. 
} \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms} and \code{id}: \describe{ diff --git a/man/step_sequence_onehot.Rd b/man/step_sequence_onehot.Rd index 14d8135a..0259b0cf 100644 --- a/man/step_sequence_onehot.Rd +++ b/man/step_sequence_onehot.Rd @@ -58,7 +58,7 @@ Defaults too 'pre'.} Characters not in the vocabulary will be encoded as 0. Defaults to \code{letters}.} -\item{prefix}{A prefix for generated column names, default to "seq1hot".} +\item{prefix}{A prefix for generated column names, defaults to "seq1hot".} \item{keep_original_cols}{A logical to keep the original variables in the output. Defaults to \code{FALSE}.} @@ -82,11 +82,11 @@ take a string and do one hot encoding for each character by position. \details{ The string will be capped by the sequence_length argument, strings shorter then sequence_length will be padded with empty characters. The encoding will -assign a integer to each character in the vocabulary, and will encode +assign an integer to each character in the vocabulary, and will encode accordingly. Characters not in the vocabulary will be encoded as 0. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{vocabulary}, \code{token}, and \code{id}: \describe{ diff --git a/man/step_stem.Rd b/man/step_stem.Rd index b1e3430b..8b753583 100644 --- a/man/step_stem.Rd +++ b/man/step_stem.Rd @@ -66,7 +66,7 @@ Note that the stemming will only be done at the end of the word and will therefore not work reliably on ngrams or sentences. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{is_custom_stemmer}, and \code{id}: \describe{ diff --git a/man/step_stopwords.Rd b/man/step_stopwords.Rd index f946944a..afc6d9b6 100644 --- a/man/step_stopwords.Rd +++ b/man/step_stopwords.Rd @@ -66,17 +66,17 @@ to the sequence of existing steps (if any). filter a \code{\link[=tokenlist]{token}} variable for stop words. } \details{ -Stop words are words which sometimes are remove before natural language +Stop words are words which sometimes are removed before natural language processing tasks. While stop words usually refers to the most common words in the language there is no universal stop word list. The argument \code{custom_stopword_source} allows you to pass a character vector -to filter against. With the \code{keep} argument one can specify to keep the words +to filter against. With the \code{keep} argument one can specify words to keep instead of removing thus allowing you to select words with a combination of these two arguments. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{value}, \code{keep}, and \code{id}: \describe{ diff --git a/man/step_text_normalization.Rd b/man/step_text_normalization.Rd index b2c44a9a..0254a093 100644 --- a/man/step_text_normalization.Rd +++ b/man/step_text_normalization.Rd @@ -56,7 +56,7 @@ to the sequence of existing steps (if any). will perform Unicode Normalization on character variables. 
} \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{normalization_form}, and \code{id}: \describe{ diff --git a/man/step_textfeature.Rd b/man/step_textfeature.Rd index cb2f922e..a216f54d 100644 --- a/man/step_textfeature.Rd +++ b/man/step_textfeature.Rd @@ -38,9 +38,9 @@ be populated (eventually) by the \code{terms} argument. This is \code{NULL} until the step is trained by \code{\link[recipes:prep]{recipes::prep.recipe()}}.} \item{extract_functions}{A named list of feature extracting functions. -default to \code{count_functions}. See details for more information.} +Defaults to \code{count_functions}. See details for more information.} -\item{prefix}{A prefix for generated column names, default to "textfeature".} +\item{prefix}{A prefix for generated column names, defaults to "textfeature".} \item{keep_original_cols}{A logical to keep the original variables in the output. Defaults to \code{FALSE}.} @@ -71,7 +71,7 @@ as input and return a numeric vector of the same length, otherwise an error will be thrown. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{functions}, and \code{id}: \describe{ diff --git a/man/step_texthash.Rd b/man/step_texthash.Rd index 2342e35c..a9a7ea8c 100644 --- a/man/step_texthash.Rd +++ b/man/step_texthash.Rd @@ -88,7 +88,7 @@ the name of the variable, followed by the tokens all separated by \code{hash001} - \code{hash101}. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, value and \code{id}: \describe{ diff --git a/man/step_tf.Rd b/man/step_tf.Rd index 5dc5e1e0..c0a21373 100644 --- a/man/step_tf.Rd +++ b/man/step_tf.Rd @@ -84,12 +84,12 @@ limit the number of variables created, otherwise you might run into memory issues. A good strategy is to start with a low token count and go up according to how much RAM you want to use. -Term frequency is a weight of how many times each token appear in each +Term frequency is a weight of how many times each token appears in each observation. There are different ways to calculate the weight and this step can do it in a couple of ways. Setting the argument \code{weight_scheme} to "binary" will result in a set of binary variables denoting if a token is present in the observation. "raw count" will count the times a token is -present in the observation. "term frequency" will divide the count with the +present in the observation. "term frequency" will divide the count by the total number of words in the document to limit the effect of the document length as longer documents tends to have the word present more times but not necessarily at a higher percentage. "log normalization" takes the log of 1 @@ -107,7 +107,7 @@ the name of the variable, followed by the tokens all separated by \code{hash001} - \code{hash101}. 
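A hedged sketch of switching the weighting scheme described above (illustrative only, not part of the patch; the data set and column are assumptions).

```r
library(recipes)
library(textrecipes)
library(modeldata)
data("tate_text")

# Binary weighting: the tf columns record presence/absence rather than counts.
rec <- recipe(~ medium, data = tate_text) %>%
  step_tokenize(medium) %>%
  step_tokenfilter(medium, max_tokens = 20) %>%
  step_tf(medium, weight_scheme = "binary")

bake(prep(rec), new_data = NULL)
```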
} \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{value}, and \code{id}: \describe{ diff --git a/man/step_tfidf.Rd b/man/step_tfidf.Rd index 96372be8..fe8208cf 100644 --- a/man/step_tfidf.Rd +++ b/man/step_tfidf.Rd @@ -110,7 +110,7 @@ the name of the variable, followed by the tokens all separated by \code{hash001} - \code{hash101}. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{token}, \code{weight}, and \code{id}: \describe{ diff --git a/man/step_tokenfilter.Rd b/man/step_tokenfilter.Rd index 0936f2f0..48d82350 100644 --- a/man/step_tokenfilter.Rd +++ b/man/step_tokenfilter.Rd @@ -76,7 +76,7 @@ to the sequence of existing steps (if any). convert a \code{\link[=tokenlist]{token}} variable to be filtered based on frequency. } \details{ -This step allow you to limit the tokens you are looking at by filtering on +This step allows you to limit the tokens you are looking at by filtering on their occurrence in the corpus. You are able to exclude tokens if they appear too many times or too few times in the data. It can be specified as counts using \code{max_times} and \code{min_times} or as percentages by setting \code{percentage} @@ -90,7 +90,7 @@ It is strongly advised to filter before using \link{step_tf} or \link{step_tfidf limit the number of variables created. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{value}, and \code{id}: \describe{ diff --git a/man/step_tokenize.Rd b/man/step_tokenize.Rd index ee8dfadd..0da6c165 100644 --- a/man/step_tokenize.Rd +++ b/man/step_tokenize.Rd @@ -70,16 +70,16 @@ to the sequence of existing steps (if any). convert a character predictor into a \code{\link[=tokenlist]{token}} variable. } \details{ -Tokenization is the act of splitting a character string into smaller parts to +Tokenization is the act of splitting a character vector into smaller parts to be further analyzed. This step uses the \code{tokenizers} package which includes heuristics on how to to split the text into paragraphs tokens, word tokens, among others. \code{textrecipes} keeps the tokens as a \code{\link[=tokenlist]{token}} variable and other steps will do their tasks on those \code{\link[=tokenlist]{token}} -variable before transforming them back to numeric variables. +variables before transforming them back to numeric variables. -Working will \code{textrecipes} will almost always start by calling +Working with \code{textrecipes} will almost always start by calling \code{step_tokenize} followed by modifying and filtering steps. This is not always -the case as you sometimes want to do apply pre-tokenization steps, this can +the case as you sometimes want to apply pre-tokenization steps; this can be done with \code{\link[recipes:step_mutate]{recipes::step_mutate()}}. 
} \section{Engines}{ @@ -245,7 +245,7 @@ recipe(~ text, data = text_tibble) \%>\% } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{value}, and \code{id}: \describe{ diff --git a/man/step_tokenize_bpe.Rd b/man/step_tokenize_bpe.Rd index bf5bc8d2..cd6d1d07 100644 --- a/man/step_tokenize_bpe.Rd +++ b/man/step_tokenize_bpe.Rd @@ -62,7 +62,7 @@ convert a character predictor into a \code{\link[=tokenlist]{token}} variable us Byte Pair Encoding. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms} and \code{id}: \describe{ diff --git a/man/step_tokenize_sentencepiece.Rd b/man/step_tokenize_sentencepiece.Rd index 0f69c83c..ead33b8d 100644 --- a/man/step_tokenize_sentencepiece.Rd +++ b/man/step_tokenize_sentencepiece.Rd @@ -68,7 +68,7 @@ compiled code by setting \code{options = list(verbose = TRUE)}. This can reveal sentencepiece ran correctly or not. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms} and \code{id}: \describe{ diff --git a/man/step_tokenize_wordpiece.Rd b/man/step_tokenize_wordpiece.Rd index 3d7faf07..72cefe6a 100644 --- a/man/step_tokenize_wordpiece.Rd +++ b/man/step_tokenize_wordpiece.Rd @@ -61,7 +61,7 @@ will convert a character predictor into a \code{\link[=tokenlist]{token}} variab using WordPiece tokenization. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms} and \code{id}: \describe{ diff --git a/man/step_tokenmerge.Rd b/man/step_tokenmerge.Rd index 6a64b581..383dee36 100644 --- a/man/step_tokenmerge.Rd +++ b/man/step_tokenmerge.Rd @@ -37,7 +37,7 @@ preprocessing have been estimated.} be populated (eventually) by the \code{terms} argument. This is \code{NULL} until the step is trained by \code{\link[recipes:prep]{recipes::prep.recipe()}}.} -\item{prefix}{A prefix for generated column names, default to "tokenmerge".} +\item{prefix}{A prefix for generated column names, defaults to "tokenmerge".} \item{keep_original_cols}{A logical to keep the original variables in the output. Defaults to \code{FALSE}.} @@ -60,7 +60,7 @@ multiple \code{\link[=tokenlist]{token}} variables and combine them into one \code{\link[=tokenlist]{token}} variable. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms} and \code{id}: \describe{ diff --git a/man/step_untokenize.Rd b/man/step_untokenize.Rd index 6e4102a9..070f3ca4 100644 --- a/man/step_untokenize.Rd +++ b/man/step_untokenize.Rd @@ -59,7 +59,7 @@ vector. This step is calling \code{paste} internally to put the tokens back together to a character. 
} \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{value}, and \code{id}: \describe{ diff --git a/man/step_word_embeddings.Rd b/man/step_word_embeddings.Rd index b31279d7..f56c0b0a 100644 --- a/man/step_word_embeddings.Rd +++ b/man/step_word_embeddings.Rd @@ -92,7 +92,7 @@ tibble (usually something like "d7"). For example, using the default \code{wordembedding_d1}, \code{wordembedding_d1}, etc. } \section{Tidying}{ -When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is retruned with +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with columns \code{terms}, \code{embedding_rows}, \code{aggregation}, and \code{id}: \describe{