Show table comparing random forest results with and without SMOTE and prediction cutoff
Change-Id: I298effe9ef38e4099c1d58beb6d12d126af6b004
diff --git a/R/idiomclassification_mk_pf.R b/R/idiomclassification_mk_pf.R
index af96b27..f564231 100644
--- a/R/idiomclassification_mk_pf.R
+++ b/R/idiomclassification_mk_pf.R
@@ -58,7 +58,7 @@
train <- ngramme[trainRows,]
test <- ngramme[setdiff(1:nrow(ngramme),trainRows),]
-cat("Random Forest without SMOTE\n")
+cat("Random Forest\n")
rf_classifier = randomForest(fmla, train, importance=TRUE)
@@ -69,25 +69,38 @@
res <- confusionMatrix(prediction_for_table, test$CO_IDIOM, positive= "idiom")
print(res)
+collected_results <- bind_cols("rf" = res$byClass)
# Sensitivity is recall of class 1
# Pos Pred Value is precision
varImpPlot(rf_classifier)
-cat("With SMOTE resampled training data\n")
+cat("Random Forest with cutoff\n")
+prediction_for_table <- predict(rf_classifier,test %>% select(-CO_IDIOM), cutoff = c(0.2, 0.8))
+res <- confusionMatrix(prediction_for_table,test$CO_IDIOM, positive = "idiom")
+collected_results <- bind_cols(collected_results, "rf with cutoff" = res$byClass)
+print(res)
+cat("With SMOTE resampled training data\n")
smoted.data <- SMOTE(fmla, subset(train, select = c("CO_IDIOM", vars)), perc.over = 1200, perc.under = 100)
rf_classifier = randomForest(fmla, smoted.data, importance=TRUE)
prediction_for_table <- predict(rf_classifier,test %>% select(-CO_IDIOM))
res <- confusionMatrix(prediction_for_table,test$CO_IDIOM, positive = "idiom")
+collected_results <- bind_cols(collected_results, "rf with SMOTE" = res$byClass)
print(res)
-cat("With SMOTE and detection task oriented cutoff for prediction\n")
-
+cat("With SMOTE and cutoff\n")
prediction_for_table <- predict(rf_classifier,test %>% select(-CO_IDIOM), cutoff = c(0.2, 0.8))
res <- confusionMatrix(prediction_for_table,test$CO_IDIOM, positive = "idiom")
+collected_results <- bind_cols(collected_results, "rf with SMOTE and cutoff" = res$byClass)
print(res)
+collected_results <- collected_results %>%
+ round(3) %>%
+ add_column(measure = names(res$byClass)) %>%
+ column_to_rownames("measure")
+
+View(collected_results)
# Using estimates by random forest on entire dataset