Added NA treatment for dataset 1 and Marc's improved axis labelling.

Change-Id: I146fc9b096f1f8f8a66b51bef4c45bf17a0e68a8

commit: 3f97f36cc31b63cb90913ee7437ad33c6f4c7f6d [log] [tgz]
author: PeterFankhauserIDS <fankhauser@ids-mannheim.de> Tue Feb 23 10:28:07 2021 +0100
committer: PeterFankhauserIDS <fankhauser@ids-mannheim.de> Tue Feb 23 10:28:07 2021 +0100
tree: 1de64d1566705768b409f5fcb8c64ea4582a28e7
parent: 67a06bd6857fca000c02459a6840bcdcdd9c9bef [diff] [blame]
diff --git a/R/idiomclassification_mk_pf.R b/R/idiomclassification_mk_pf.R
index 7521226..efca8b9 100644
--- a/R/idiomclassification_mk_pf.R
+++ b/R/idiomclassification_mk_pf.R

@@ -10,7 +10,8 @@
 
 # Test
 
-ngramfile<-"gold03_anno_ml_synfeat_nstopw"
+ngramfile<-"gold03_anno_ml_synfeat_nstopw" # 2nd dataset
+# ngramfile <-"goldstandard01_anno_ml_synfeat_nstop1" # 1st dataset
 
 setwd(dirname(rstudioapi::getSourceEditorContext()$path))
 stopwords <- readLines(con = "../data/stopwords.txt",encoding="UTF-8")
@@ -23,6 +24,7 @@
 }
 
 oringramme <- oringramme %>%
+  mutate(CO_IDIOM = ifelse(is.na(CO_IDIOM),0,CO_IDIOM)) %>%  # treat NAs as 0
   filter(CO_IDIOM < 2)   # just two classes: 0 no idiom, 1 idiom
 
 # Reduce number of classes, treat null values, add NSTOPW, change names for SY features
@@ -114,7 +116,9 @@
 cvalues %>%
   select(c("cutoff", "Recall", "Precision", "F1", "Specificity", "Balanced Accuracy")) %>%
   pivot_longer(!cutoff, names_to=c("measure")) %>%
-  ggplot(aes(cutoff, value, colour=measure)) + geom_line()
+  ggplot(aes(cutoff, value, colour=measure)) + geom_line() +
+  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
+  scale_y_continuous(breaks = scales::pretty_breaks(n = 10))
 
 # Using estimates by random forest on entire dataset
commit	3f97f36cc31b63cb90913ee7437ad33c6f4c7f6d	[log] [tgz]
author	PeterFankhauserIDS <fankhauser@ids-mannheim.de>	Tue Feb 23 10:28:07 2021 +0100
committer	PeterFankhauserIDS <fankhauser@ids-mannheim.de>	Tue Feb 23 10:28:07 2021 +0100
tree	1de64d1566705768b409f5fcb8c64ea4582a28e7
parent	67a06bd6857fca000c02459a6840bcdcdd9c9bef [diff] [blame]