Explicitly name the factor levels "idiom" and "no_idiom"
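Note that this also implicitly changes the level order: as.factor() sorts
levels alphabetically, so "idiom" (the "event") now precedes "no_idiom"
(the "no event"), which follows the examples in ?caret::confusionMatrix.
The randomForest cutoff vector is swapped accordingly.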
Change-Id: Ia33396fc104c190ffbd37eb08cc86d126311742e
diff --git a/R/idiomclassification_mk_pf.R b/R/idiomclassification_mk_pf.R
index 7a591b7..af96b27 100644
--- a/R/idiomclassification_mk_pf.R
+++ b/R/idiomclassification_mk_pf.R
@@ -35,7 +35,7 @@
mutate(across(c("dice", "lfmd", "llr", "ld", "pmi"), ~ replace_na(.x, min(.x) - 1))) %>%
rename_at(syfeaturenames$innames, ~ syfeaturenames[syfeaturenames$innames==.x,]$synames ) %>%
mutate(across(everything(), ~ replace_na(.x, 0))) %>%
- mutate(CO_IDIOM = as.factor(if_else(CO_IDIOM !=1, "0", "1"))) # just two classes: 0 no idiom, 1 idiom
+ mutate(CO_IDIOM = as.factor(if_else(CO_IDIOM == 1, "idiom", "no_idiom"))) # just two classes: "idiom" (CO_IDIOM == 1) and "no_idiom" (everything else)
# Optional
write.table(ngramme,file=paste("../data/",ngramfile,"_cosy.csv",sep=""), sep = "\t", quote=F)
@@ -67,7 +67,7 @@
prediction_for_table <- predict(rf_classifier, test %>% select(-CO_IDIOM))
-res <- confusionMatrix(prediction_for_table, test$CO_IDIOM,positive= "1")
+res <- confusionMatrix(prediction_for_table, test$CO_IDIOM, positive= "idiom")
print(res)
# Sensitivity is recall of class 1
@@ -79,13 +79,13 @@
smoted.data <- SMOTE(fmla, subset(train, select = c("CO_IDIOM", vars)), perc.over = 1200, perc.under = 100)
rf_classifier = randomForest(fmla, smoted.data, importance=TRUE)
prediction_for_table <- predict(rf_classifier,test %>% select(-CO_IDIOM))
-res <- confusionMatrix(prediction_for_table,test$CO_IDIOM, positive = "1")
+res <- confusionMatrix(prediction_for_table,test$CO_IDIOM, positive = "idiom")
print(res)
cat("With SMOTE and detection task oriented cutoff for prediction\n")
-prediction_for_table <- predict(rf_classifier,test %>% select(-CO_IDIOM), cutoff = c(0.8, 0.2))
-res <- confusionMatrix(prediction_for_table,test$CO_IDIOM, positive = "1")
+prediction_for_table <- predict(rf_classifier,test %>% select(-CO_IDIOM), cutoff = c(0.2, 0.8))
+res <- confusionMatrix(prediction_for_table,test$CO_IDIOM, positive = "idiom")
print(res)
@@ -107,8 +107,8 @@
# ttest
-idioms<-ngramme %>% filter(CO_IDIOM==1)
-nonidioms<-ngramme %>% filter(CO_IDIOM!=1)
+idioms<-ngramme %>% filter(CO_IDIOM == "idiom")
+nonidioms<-ngramme %>% filter(CO_IDIOM != "idiom")
ttestPvalues<-sapply(vars,
function(sel) t.test(idioms[sel],nonidioms[sel])$p.value)
@@ -143,8 +143,8 @@
cbind(conf[,1:2]/i,(1-conf[,3]/i)*100),
c(100*diag(conf[,1:2])/colSums(conf[,1:2]),NA),
c(rowSums(conf[,1:2]/i),NA)),digits=2)
- colnames(conf1)<-c("0","1","rec")
- rownames(conf1)<-c("0","1","prec","sum")
+ colnames(conf1)<-c("1","0","rec")
+ rownames(conf1)<-c("1","0","prec","sum")
print(conf1)
}
featureRanks<-featureRanks/10