Use consistent default parameters for randomForest training
This makes sure that the parameters are sane and consistent
across different training scenarios.
Also print the confusion-matrix results for both the plain and the
SMOTE-resampled runs, labelled accordingly.
Change-Id: I362baa9520ccf59669fa7b1fd9538f9283aa6c26
diff --git a/R/idiomclassification_mk_pf.R b/R/idiomclassification_mk_pf.R
index 8c6947b..473a577 100644
--- a/R/idiomclassification_mk_pf.R
+++ b/R/idiomclassification_mk_pf.R
@@ -56,7 +56,7 @@
train <- ngramme[trainRows,]
test <- ngramme[setdiff(1:nrow(ngramme),trainRows),]
-rf_classifier = randomForest(fmla, train, ntree=100, mtry=10, importance=TRUE)
+rf_classifier = randomForest(fmla, train, importance=TRUE)
# only SY features
# rf_classifier = randomForest(fmlasy, train, ntree=100, mtry=10, importance=TRUE)
@@ -66,7 +66,9 @@
# different cutoff for prediction
# prediction_for_table <- predict(rf_classifier, test %>% select(-CO_IDIOM), cutoff = c(0.8, 0.2))
-confusionMatrix(prediction_for_table, test$CO_IDIOM,positive= "1")
+res <- confusionMatrix(prediction_for_table, test$CO_IDIOM,positive= "1")
+cat("Without SMOTE")
+print(res)
# Sensitivity is recall of class 1
# Pos Pred Value is precision
@@ -75,14 +77,16 @@
# optional resampling with smote
smoted.data <- SMOTE(fmla, subset(train, select = c("CO_IDIOM", vars)), perc.over = 1200, perc.under = 100)
-rf_classifier = randomForest(fmla, smoted.data, ntree=100, mtry=10, importance=TRUE)
+rf_classifier = randomForest(fmla, smoted.data, importance=TRUE)
prediction_for_table <- predict(rf_classifier,test %>% select(-CO_IDIOM))
-confusionMatrix(prediction_for_table,test$CO_IDIOM, positive = "1")
+res <- confusionMatrix(prediction_for_table,test$CO_IDIOM, positive = "1")
+cat("With SMOTE")
+print(res)
# Using estimates by random forest on entire dataset
library(randomForest)
-rf_classifier_full = randomForest(fmla, data=ngramme, ntree=100, mtry=2, importance=TRUE)
+rf_classifier_full = randomForest(fmla, data=ngramme, importance=TRUE)
rf_classifier_full
# class.error is 1 - recall
varImpPlot(rf_classifier_full)
@@ -118,7 +122,7 @@
conf<-matrix(0,2,3)
featureRanks<-matrix(0,4,length(vars))
for (i in 1:10) {
- rfc =randomForest(fmla, data=ngramme, ntree=100, importance=TRUE)
+ rfc =randomForest(fmla, data=ngramme, importance=TRUE)
#rfc =randomForest(fmla, data=ngramme, ntree=100, importance=TRUE, cutoff=c(0.8,0.2))
errrate<-errrate+rfc$err.rate[100,1]
conf<-conf+rfc$confusion