Fully remove incomplete idioms from data
Change-Id: I6bbad3d66b012d6e904d151903bdb860df69bc62
diff --git a/R/idiomclassification_mk_pf.R b/R/idiomclassification_mk_pf.R
index e8c0f87..1310b6b 100644
--- a/R/idiomclassification_mk_pf.R
+++ b/R/idiomclassification_mk_pf.R
@@ -16,6 +16,9 @@
wl[!(wl %in% stopwords)]
}
+oringramme <- oringramme %>%
+ filter(CO_IDIOM < 2) # keep only two classes: 0 = no idiom, 1 = idiom; rows with CO_IDIOM >= 2 are dropped
+
ngramme <- oringramme %>%
add_column(NSTOPW = sapply(oringramme$tokens,function(x) length(deleteStopwords(tolower(unlist(strsplit(x," "))),stopwords)))) %>%
# select(-matches("CO_TOKEN.*"), -tokens) %>%
@@ -24,7 +27,7 @@
mutate(across(c("dice", "lfmd", "llr", "ld", "pmi"), ~ replace_na(.x, min(.x) - 1))) %>%
rename_at(syfeatures$innames, ~ syfeatures[syfeatures$innames==.x,]$synames ) %>%
mutate(across(everything(), ~ replace_na(.x, 0))) %>%
- mutate(CO_IDIOM = as.factor(if_else(CO_IDIOM !=1, 0, 1))) # just two classes: 0 no idiom, 1 idiom
+ mutate(CO_IDIOM = as.factor(if_else(CO_IDIOM !=1, "0", "1")))
covars <- c("CO_LL", "CO_Z", "CO_G", "CO_T", "CO_LOGDICE", "CO_PMI", "CO_MI3", "CO_DEREKO", "CO_SGT", "CO_WIN5_VEC","CO_WIN5_VEC_AUTOSEM")
syvars <- c(syfeaturenames$synames,"NSTOPW")