Here we show how to use LACS framework and our R packages to classify abstracts for a specific scientific literature topic. In this exercise we are going to model from GMPD and Zover database abstracts and classify GBIF abstracts.
library(tidyverse)
library(tidymodels)
library(text2vec)
library(ggplot2)
db_abstracts <- readr::read_csv("https://raw.githubusercontent.com/alrobles/abstractsHostParasites/main/data-raw/df_abstracts.csv")
#View(db_abstracts)
db_abstracts <- db_abstracts %>%
mutate(Label.obs = ifelse(class == "parasite", 1, 0))
train_test_split <- initial_split(db_abstracts, prop = 0.65)
db_abstracts_train <- training(train_test_split)
db_abstracts_test <- testing(train_test_split)
db_abstracts_train %>% count(class)
db_abstracts_test %>% count(class)
#db_abstracts <- db_abstracts %>%
#mutate(language = textcat(search_abstract))
prep_fun = function(x) {
# make text lower case
x = str_to_lower(x)
# remove only numbers
x = str_replace_all(x, "^\\d+", " ")
# remove non-alphanumeric symbols
x = str_replace_all(x, "[^[:alnum:]]", " ")
# collapse multiple spaces
x = str_replace_all(x, "\\s+", " ")
# remove english
ifelse(stri_enc_isascii(x), x, str_replace_all(x, "." , "") )
}
it_train = itoken(db_abstracts_train$abstract,
preprocessor = tolower,
tok_fun = word_tokenizer,
progressbar = FALSE)
it_test = itoken(db_abstracts_test$abstract,
preprocessor = tolower,
tok_fun = word_tokenizer,
progressbar = FALSE)
library(stringi)
v <- create_vocabulary(it_train,
ngram = c(1L, 4L),
stopwords = stopwords::stopwords())
v <- v %>% dplyr::filter(!grepl(pattern = "^[0-9]", term ))
v <- v %>% dplyr::filter(!grepl(pattern = "[0-9]$", term ))
# v <- v %>% dplyr::filter(!grepl(pattern = "^[0-9]", .data$term ))
v <- v %>% dplyr::filter(grepl(pattern = "^[a-z]", .data$term ))
v <- v %>% dplyr::filter(stringi::stri_enc_isascii(term))
v <- v %>% dplyr::filter(nchar(term) >= 4)
v <- v %>% dplyr::filter(!grepl(pattern = "^[0-9]", term ))
#vectorizer <- vocab_vectorizer(v)
pruned_vocab = prune_vocabulary(v,
term_count_min = 20
#,doc_proportion_max = 0.1
#,doc_proportion_min = 0.01
)
#pruned_vocab_pulearning <- pruned_vocab
#write_rds(pruned_vocab_pulearning, "pruned_vocab_pulearning.rds")
vectorizer = vocab_vectorizer(pruned_vocab)
dtm_train <- create_dtm(it_train, vectorizer)
dtm_test <- create_dtm(it_test, vectorizer)
tfidf = TfIdf$new()
dtm_df_tfidf_train <- fit_transform(dtm_train, tfidf)
dtm_df_tfidf_test <- fit_transform(dtm_test, tfidf)
dtm_df_train <- as.matrix(dtm_df_tfidf_train) %>% as_tibble()
dtm_df_test <- as.matrix(dtm_df_tfidf_test) %>% as_tibble()
dtm_df_train$class <- db_abstracts_train[['class']]
dtm_df_test$class <- db_abstracts_test[['class']]
# Split test/training sets
#library(tidymodels)
# source("PLUR.R")
# library(glmnet)
Prediction <-
PLUS(train_data = dtm_df_tfidf_train,
Label.obs = db_abstracts_train[["Label.obs"]],
Sample_use_time = 100,
l.rate = 1, qq = 0.15)
Coef_Matrix <- Prediction$coef1 %>% as.matrix()
yhat_pulearning_prob_vec <- predict(Prediction$fit.pi,
s = 'lambda.min',
newx=dtm_df_tfidf_test,
type="response") %>% as.numeric()
yhat_pulearning_class_vec <- predict(Prediction$fit.pi,
s = 'lambda.min',
newx=dtm_df_tfidf_test,
type="class") %>% as.numeric()
# Format test data and predictions for yardstick metrics
estimates_pulearning_tbl <- tibble(
truth = factor(db_abstracts_test[["Label.obs"]], levels = c(0, 1)),
estimate = factor(yhat_pulearning_class_vec),
class_prob = yhat_pulearning_prob_vec
)
estimates_pulearning_tbl %>% yardstick::conf_mat(truth, estimate)
estimates_pulearning_tbl %>% arrange(class_prob) %>%
mutate(id = row_number()) %>%
ggplot() + geom_point(aes(id, class_prob))
#estimates_keras_tbl %>% conf_mat(truth, estimate)
estimates_pulearning_tbl %>%
yardstick::f_meas(truth, estimate, beta = 1)
estimates_pulearning_tbl %>%
arrange(class_prob) %>%
pROC::roc_(response = "truth", predictor = "class_prob") %>%
plot()
pROC::auc(estimates_pulearning_tbl$truth, estimates_pulearning_tbl$class_prob,
partial.auc=c(1, .8), partial.auc.focus="se",
partial.auc.correct = TRUE)
# glmnet_classifier_pulearning <- Prediction$fit.pi
# write_rds(glmnet_classifier_pulearning, "glmnet_classifier_pulearning_7_9_22.rds")
Except where otherwise noted, content on this site is licensed under the CC-BY license.