Use consistent default parameters for randomForest training
This makes sure that the parameters are sane and consistent
across different training scenarios.
Also print the confusion-matrix results for both the plain and the
SMOTE-resampled runs, labelled accordingly.
Change-Id: I362baa9520ccf59669fa7b1fd9538f9283aa6c26
diff --git a/R/idiomclassification_mk_pf.R b/R/idiomclassification_mk_pf.R
index 8c6947b..473a577 100644
--- a/R/idiomclassification_mk_pf.R
+++ b/R/idiomclassification_mk_pf.R
@@ -56,7 +56,7 @@
train <- ngramme[trainRows,]
test <- ngramme[setdiff(1:nrow(ngramme),trainRows),]
-rf_classifier = randomForest(fmla, train, ntree=100, mtry=10, importance=TRUE)
+rf_classifier = randomForest(fmla, train, importance=TRUE)
# only SY features
# rf_classifier = randomForest(fmlasy, train, ntree=100, mtry=10, importance=TRUE)
@@ -66,7 +66,9 @@
# different cutoff for prediction
# prediction_for_table <- predict(rf_classifier, test %>% select(-CO_IDIOM), cutoff = c(0.8, 0.2))
-confusionMatrix(prediction_for_table, test$CO_IDIOM,positive= "1")
+res <- confusionMatrix(prediction_for_table, test$CO_IDIOM,positive= "1")
+cat("Without SMOTE")
+print(res)
# Sensitivity is recall of class 1
# Pos Pred Value is precision
@@ -75,14 +77,16 @@
# optional resampling with smote
smoted.data <- SMOTE(fmla, subset(train, select = c("CO_IDIOM", vars)), perc.over = 1200, perc.under = 100)
-rf_classifier = randomForest(fmla, smoted.data, ntree=100, mtry=10, importance=TRUE)
+rf_classifier = randomForest(fmla, smoted.data, importance=TRUE)
prediction_for_table <- predict(rf_classifier,test %>% select(-CO_IDIOM))
-confusionMatrix(prediction_for_table,test$CO_IDIOM, positive = "1")
+res <- confusionMatrix(prediction_for_table,test$CO_IDIOM, positive = "1")
+cat("With SMOTE")
+print(res)
# Using estimates by random forest on entire dataset
library(randomForest)
-rf_classifier_full = randomForest(fmla, data=ngramme, ntree=100, mtry=2, importance=TRUE)
+rf_classifier_full = randomForest(fmla, data=ngramme, importance=TRUE)
rf_classifier_full
# class.error is 1 - recall
varImpPlot(rf_classifier_full)
@@ -118,7 +122,7 @@
conf<-matrix(0,2,3)
featureRanks<-matrix(0,4,length(vars))
for (i in 1:10) {
- rfc =randomForest(fmla, data=ngramme, ntree=100, importance=TRUE)
+ rfc =randomForest(fmla, data=ngramme, importance=TRUE)
#rfc =randomForest(fmla, data=ngramme, ntree=100, importance=TRUE, cutoff=c(0.8,0.2))
errrate<-errrate+rfc$err.rate[100,1]
conf<-conf+rfc$confusion