Add NA treatment for dataset 1 (NA in CO_IDIOM treated as 0) and Marc's improved axis labelling
Change-Id: I146fc9b096f1f8f8a66b51bef4c45bf17a0e68a8
diff --git a/R/idiomclassification_mk_pf.R b/R/idiomclassification_mk_pf.R
index 7521226..efca8b9 100644
--- a/R/idiomclassification_mk_pf.R
+++ b/R/idiomclassification_mk_pf.R
@@ -10,7 +10,8 @@
# Test
-ngramfile<-"gold03_anno_ml_synfeat_nstopw"
+ngramfile<-"gold03_anno_ml_synfeat_nstopw" # 2nd dataset
+# ngramfile <-"goldstandard01_anno_ml_synfeat_nstop1" # 1st dataset
setwd(dirname(rstudioapi::getSourceEditorContext()$path))
stopwords <- readLines(con = "../data/stopwords.txt",encoding="UTF-8")
@@ -23,6 +24,7 @@
}
oringramme <- oringramme %>%
+ mutate(CO_IDIOM = ifelse(is.na(CO_IDIOM),0,CO_IDIOM)) %>% # treat NAs as 0
filter(CO_IDIOM < 2) # just two classes: 0 no idiom, 1 idiom
# Reduce number of classes, treat null values, add NSTOPW, change names for SY features
@@ -114,7 +116,9 @@
cvalues %>%
select(c("cutoff", "Recall", "Precision", "F1", "Specificity", "Balanced Accuracy")) %>%
pivot_longer(!cutoff, names_to=c("measure")) %>%
- ggplot(aes(cutoff, value, colour=measure)) + geom_line()
+ ggplot(aes(cutoff, value, colour=measure)) + geom_line() +
+ scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
+ scale_y_continuous(breaks = scales::pretty_breaks(n = 10))
# Using estimates by random forest on entire dataset