Added readme.txt, derekovecs_apicall_syn_nlc, and some data files

Change-Id: I4ab1b30d6e96368198f5519c161c7c5a18d5b17e
diff --git a/R/derekovecs_apicall_syn_nlc.R b/R/derekovecs_apicall_syn_nlc.R
new file mode 100644
index 0000000..b896e5c
--- /dev/null
+++ b/R/derekovecs_apicall_syn_nlc.R
@@ -0,0 +1,218 @@
+# Get classic and w2v based collocates from DeReKoVecs API 
+#
+# Have a look at the web gui column header mouse overs to figure out what the columns mean:
+# http://corpora.ids-mannheim.de/openlab/derekovecs?word=triftiger&cutoff=500000&n=100&N=2000#tabs-3
+
+library(httr)
+library(tidyverse)
+
+stopwords <- readLines(con = "../data/stopwords.txt",encoding="UTF-8")
+
+# ngramme <- read.csv("../data/gold03_anno_ml.csv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")
+
+ngramme <- read.csv("../data/goldstandard01_anno_ml.tsv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")
+
+
+ngramme[, c(7:23)] <- sapply(ngramme[, c(7:23)], as.numeric)
+
+DeReKoVecsCall <- function(wordform = "Grund", cutoff = 500000, n = 100) {
+  params = list(word = wordform, cutoff = cutoff, n = n, json = 1)
+  response <- tryCatch({httr::GET(url = 'http://korap-worker-04:5673', query = params,timeout(10))},
+                       error=function(cond) return(NA))
+  if(!is.na(response)) {
+    content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)
+  }
+  else {
+    return(NA)
+  }
+}
+
+getW2VCollocates <- function(wordform = "Grund", ...) {
+  ret<-DeReKoVecsCall(wordform, ...)
+  if(!is.na(ret)) ret$collocators
+  else NA
+}
+
+getClassicCollocates <- function(wordform = "Grund") {
+  response <- tryCatch({httr::GET(url = 'http://korap-worker-04:5673/getClassicCollocators', query = list(w=wordform),timeout(10))},
+                       error=function(cond) return(NA))
+  if(!is.na(response)) {
+    content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)$collocates
+  }
+  else {
+    return(NA)
+  }
+}
+
+
+getBoth <- function(wordform = "Schmetterlinge") {
+  w2v <- getW2VCollocates(wordform)
+  classic <- getClassicCollocates(wordform)
+  if (length(w2v)>0 && length(classic>0) && !is.na(w2v) && !is.na(classic)) {
+    merge(classic,w2v,by="word",all=TRUE)
+  }
+  else {
+    return(NA)
+  }
+}
+
+getRanks <- function(collocates,NApenalty=1000) {
+  if (is.na(collocates)) {
+    return(NA)
+  }
+  both <<-
+    collocates %>%
+    arrange(desc(cprob)) %>%
+    mutate(w2v.rank = 1:nrow(.)) %>%
+    mutate(w2v.rank = ifelse(is.na(cprob), NApenalty, w2v.rank)) %>%
+    arrange(desc(prob)) %>%
+    mutate(w2v.rank1 = 1:nrow(.)) %>%
+    mutate(w2v.rank1 = ifelse(is.na(prob), NApenalty, w2v.rank1)) %>%
+    arrange(desc(ld)) %>%
+    mutate(classic.rank = 1:nrow(.)) %>%
+    mutate(classic.rank = ifelse(is.na(ld), NApenalty, classic.rank))
+  return(both)
+}
+
+# rankDiff with precalculated collocates
+
+rankDiff1 <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
+  if (is.na(both)) {
+    return(NA)
+  }
+  if (! collocate %in% both$word)
+    return(NA)
+  w2vRank <- both[both$word==collocate,]$w2v.rank
+  classicRank <- both[both$word==collocate,]$classic.rank
+  if (is.numeric(w2vRank) && is.numeric(classicRank)) {w2vRank - classicRank}
+  else {NA}
+}
+
+featurenames<-c("af", "dice","ld","lfmd","llfmd","llr","lnpmi" ,"md","npmi","pmi","rlfmd","rnpmi","average","cprob","max","overall","prob","w2v.rank","w2v.rank1", "classic.rank")
+
+getFeatures <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
+  if (is.na(both)) {
+    return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
+  }
+  if (! collocate %in% both$word)
+    return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
+  features<-both[both$word==collocate,]
+  w2vRank <- features$w2v.rank
+  classicRank <- features$classic.rank
+  rankDiff<-NA
+  if (is.numeric(w2vRank) && is.numeric(classicRank)) {rankDiff<-w2vRank - classicRank}
+  return(as.numeric(unlist(c(rankDiff,features[1,featurenames]))))
+}
+
+#w2v <- getW2VCollocates("triftiger")
+#classic <- getClassicCollocates("triftiger")
+#rankDiff("Schmetterlinge", "Bauch")
+
+firstup <- function(x) {
+  substr(x, 1, 1) <- toupper(substr(x, 1, 1))
+  x
+}
+
+# rankDiffCase with precalculated collocates
+
+rankDiffCase1<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
+  if (wordform=="" || collocate=="") {
+    return(NA)
+  }
+  ret<-rankDiff1(wordform,collocate,collocates1)
+  if (is.na(ret)) {
+    ret<- rankDiff1(firstup(wordform),collocate,collocates2)
+    if (is.na(ret)) {
+      ret <- rankDiff1(wordform, firstup(collocate),collocates1)
+      if (is.na(ret)) {
+        ret <- rankDiff1(firstup(wordform),firstup(collocate),collocates2)
+      }
+    }
+  }
+  return(ret)
+}
+
+getFeaturesCase<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
+  if (wordform=="" || collocate=="") {
+    return(NA)
+  }
+  ret1<-getFeatures(wordform,collocate,collocates1)
+  ret2<-getFeatures(firstup(wordform),collocate,collocates2)
+  ret3<-getFeatures(wordform,firstup(collocate),collocates1)
+  ret4<-getFeatures(firstup(wordform),firstup(collocate),collocates2)
+  ret<-rbind(ret1,ret2,ret3,ret4)
+  colnames(ret)<-c("rankDiff",featurenames)
+  colMeans(ret,na.rm=TRUE)
+}
+
+# get rid of some "offending" characters
+
+cleanUp<-function(str) {
+  ret<-str_replace_all(str,"[^[:alnum:]]"," ")
+  ret<-str_replace_all(ret,"[\n\r\t]"," ")
+  ret<-str_replace_all(ret,"[ ]+"," ")
+  ret<-str_replace_all(ret,"^ ","")
+  ret<-str_replace_all(ret, " $","")
+  ret
+}
+
+cleanUpV<-Vectorize(cleanUp)
+
+deleteStopwords =  function(wl, stopwords = NULL) {
+  wl[!(wl %in% stopwords)]
+}
+
+ngramme$tokens <- cleanUpV(paste(ngramme$CO_TOKEN1, ngramme$CO_TOKEN2,ngramme$CO_TOKEN3,ngramme$CO_TOKEN4,ngramme$CO_TOKEN5,ngramme$CO_TOKENS6,sep=" "))
+
+# retrieve collocates only once per token.
+
+avgRankDiffCaseLinear<-function(wordlist) {
+  wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
+  sum<-0
+  count<-0
+  if (length(wl)>1) {
+  for (i in 1:length(wl)) {
+    collocates1<-getRanks(getBoth(wl[i]))
+    collocates2<-getRanks(getBoth(firstup(wl[i])))
+    for(j in 1:length(wl)) {
+      if (i!=j) {
+        rd<-rankDiffCase1(wl[i],wl[j],collocates1,collocates2)
+        if (!is.na(rd)) {
+          sum=sum+rd
+          count=count+1
+        }
+      }
+    }
+  }
+  }
+  ret<-ifelse(count>0,sum/count,NA)
+  print(paste(wordlist, ": ", ret))
+  return(ret)
+}
+
+avgRankDiffCaseLinearV<-Vectorize(avgRankDiffCaseLinear)
+
+
+avgFeaturesCaseLinear<-function(wordlist) {
+  wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
+  ret<- data.frame(matrix(ncol = length(featurenames)+1, nrow = 0))
+  if (length(wl)>1) {
+    for (i in 1:length(wl)) {
+      collocates1<-getRanks(getBoth(wl[i]))
+      collocates2<-getRanks(getBoth(firstup(wl[i])))
+      for(j in 1:length(wl)) {
+        if (i!=j) {
+          ret<-rbind(ret,getFeaturesCase(wl[i],wl[j],collocates1,collocates2))
+        }
+      }
+    }
+  }
+  colnames(ret)<-c("rankDiff",featurenames)
+  mret<-colMeans(ret,na.rm=TRUE)
+  print(paste(c(wordlist, ": ", mret)))
+  return(mret)
+}
+
+ngramme[,c("rankDiff",featurenames)]<-t(sapply(ngramme$tokens,avgFeaturesCaseLinear))
+write.table(ngramme,file="../data/goldstandard01_anno_ml_synfeat_nstop1.tsv", sep = "\t",quote=F)
+