## Build script for the `hash_grady_pos` data set.

# requireNamespace() only checks whether pacman is installed; unlike require()
# it does not attach the package, which is all that is needed here because
# every pacman call below is namespace-qualified.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(tidyverse, textshape, textreadr, data.table, clipr)
pacman::p_load_current_gh('trinker/acc.roxygen2')

## read in the tar file; the value returned by download() is used below as the
## path of the local copy of the archive
loc <- 'http://www.dcs.shef.ac.uk/research/ilash/Moby/mpos.tar.Z' %>%
  download()

## untar the file into the directory that holds the downloaded archive
untar(loc, exdir = dirname(loc))


## read in the raw text files (the word list and the readme that holds the
## part of speech lookup key).
## The extraction regex used later in this script splits each word-list line
## on "×", which is the single byte 0xD7 in Latin-1. Declaring the encoding
## marks the strings accordingly so that the match does not depend on the
## locale of the R session (readLines() only marks, it does not re-encode).
mobyr <- readLines(
  file.path(dirname(loc), 'mpos/mobyposi.i'),
  encoding = 'latin1'
)
readme <- readLines(
  file.path(dirname(loc), 'mpos/readme'),
  encoding = 'latin1'
)

## Lookup table mapping each part of speech name (`pos`) to its one-character
## symbol (`tag`), parsed out of the readme text.
pos_key <- local({
  # keep only the lines that contain a tab, or that end in three or more
  # whitespace characters followed by a single capital letter
  key_lines <- grep("\t|\\s{3,}[A-Z]$", readme, value = TRUE)
  key_lines <- trimws(key_lines)

  # normalise the column gap: runs of 3+ whitespace become a tab, then any
  # repeated tabs are collapsed into exactly one
  key_lines <- stringi::stri_replace_all_regex(key_lines, '\\s{3,}', '\t')
  key_lines <- stringi::stri_replace_all_regex(key_lines, '(\t)+', '\t')

  # parse the cleaned lines as a header-less, tab-separated table
  key <- read.csv(
    text = key_lines,
    sep = "\t",
    header = FALSE,
    stringsAsFactors = FALSE
  )
  setNames(key, c('pos', 'tag'))
})

## create the words to parts of speech lexicon: one row per word and part of
## speech combination, keyed on `word`
hash_grady_pos <- mobyr %>%
  # tibble() replaces the deprecated data_frame()
  tibble(x = .) %>%
  # split each raw line at the "×" delimiter: the word in front of it, the
  # string of tag characters behind it (qualified so that an `extract` from
  # another attached package cannot mask the tidyr verb)
  tidyr::extract(x, c('word', 'tag'), '(^[^×]+?)×(.+$)') %>%
  mutate(
    word = tolower(word),
    # number of tag characters recorded for the word
    n_pos = nchar(tag),
    # zero-width split between adjacent characters, giving a list-column with
    # one element per tag character
    tag = stringi::stri_split_regex(tag, "(?=.)(?<=.)")
  ) %>%
  # expand the list-column to one row per tag; the column is named explicitly
  # because calling unnest() without columns is deprecated
  unnest(tag) %>%
  # translate the tag symbol into its part of speech name
  left_join(pos_key, by = 'tag') %>%
  # drop words containing any character outside the range space through "~"
  filter(!grepl("[^ -~]", word)) %>%
  mutate(space = grepl("\\s", word)) %>%
  select(word, pos, n_pos, space) %>%
  as.data.table()

# key the table on `word` so lookups such as hash_grady_pos['dog'] work
setkey(hash_grady_pos, 'word')


## flag the first row of every word as its primary part of speech
# unique() on a data.table compares all columns by default in current
# data.table releases (older releases de-duplicated on the key), so the
# column is named explicitly to get exactly one row per word.
uDT <- unique(hash_grady_pos, by = 'word')
hash_grady_pos[, primary := FALSE]
# join on the table's key; mult = "first" restricts the update to the first
# matching row of each word. The trailing [] prints the updated table.
hash_grady_pos[uDT, primary := TRUE, mult = "first"][]


## spot checks on the finished lookup: a keyed lookup of a single word, the
## rows of one part of speech, and the frequency of every part of speech
hash_grady_pos['dog']

hash_grady_pos[pos == 'Pronoun']
table(hash_grady_pos$pos)

## copy documentation snippets to the clipboard: first the captured output of
## acc.roxygen2::dat4rox(), then the part of speech names wrapped in \code{}
## and joined with commas
write_clip(capture.output(acc.roxygen2::dat4rox(hash_grady_pos)))
write_clip(
  paste0("\\code{", names(table(hash_grady_pos$pos)), "}", collapse = ", ")
)

# NOTE(review): presumably stores the object as a package data set; confirm
# against the pax documentation
pax::new_data(hash_grady_pos)


#' Grady Ward's Moby Parts of Speech
#'
#' A dataset containing a hash lookup of Grady Ward's parts of speech from the
#' Moby project. Words containing non-ASCII characters have been removed.
#'
#' @details
#' \itemize{
#'   \item word. The word.
#'   \item pos. The part of speech; one of: \code{Adjective}, \code{Adverb},
#'     \code{Conjunction}, \code{Definite Article}, \code{Interjection},
#'     \code{Noun}, \code{Noun Phrase}, \code{Plural}, \code{Preposition},
#'     \code{Pronoun}, \code{Verb (intransitive)}, \code{Verb (transitive)},
#'     or \code{Verb (usu participle)}. Note that the first part of speech for
#'     a word is its primary use; all other uses are secondary.
#'   \item n_pos. The number of parts of speech associated with a word. Useful
#'     for filtering.
#'   \item space. logical. If \code{TRUE} the word contains a space. Useful
#'     for filtering.
#'   \item primary. logical. If \code{TRUE} the row holds the word's primary
#'     part of speech.
#' }
#'
#' @docType data
#' @keywords datasets
#' @name hash_grady_pos
#' @usage data(hash_grady_pos)
#' @format A data frame with 250,892 rows and 5 variables
#' @source \url{http://icon.shef.ac.uk/Moby/mpos.html}
#' @references Moby Thesaurus List by Grady Ward: \url{http://icon.shef.ac.uk/Moby/mpos.html}
#' @examples
#' hash_grady_pos['dog']
#' hash_grady_pos[, .SD[1], by='word']
NULL