| # Get classic and w2v based collocates from DeReKoVecs API  | 
 | # | 
 | # Have a look at the web gui column header mouse overs to figure out what the columns mean: | 
 | # http://corpora.ids-mannheim.de/openlab/derekovecs?word=triftiger&cutoff=500000&n=100&N=2000#tabs-3 | 
 |  | 
 | library(httr) | 
 | library(tidyverse) | 
 |  | 
 | stopwords <- readLines(con = "../data/stopwords.txt",encoding="UTF-8") | 
 | derekovecs_server = 'https://corpora.ids-mannheim.de/openlab/derekovecs/' | 
 |  | 
 | # ngramme <- read.csv("../data/gold03_anno_ml.csv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8") | 
 |  | 
 | ngramme <- read.csv("../data/goldstandard01_anno_ml.tsv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8") | 
 |  | 
 |  | 
 | ngramme[, c(7:23)] <- sapply(ngramme[, c(7:23)], as.numeric) | 
 |  | 
 | DeReKoVecsCall <- function(wordform = "Grund", cutoff = 500000, n = 100) { | 
 |   params = list(word = wordform, cutoff = cutoff, n = n, json = 1) | 
 |   response <- tryCatch({httr::GET(url = derekovecs_server, query = params,timeout(10))}, | 
 |                        error=function(cond) return(NULL)) | 
 |   if(!is.null(response) && httr::status_code(response) == 200) { | 
 |     content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE) | 
 |   } | 
 |   else { | 
 |     return(NULL) | 
 |   } | 
 | } | 
 |  | 
 | getW2VCollocates <- function(wordform = "Grund", ...) { | 
 |   ret<-DeReKoVecsCall(wordform, ...) | 
 |   if(!is.null(ret)) ret$collocators | 
 |   else NULL | 
 | } | 
 |  | 
 | getClassicCollocates <- function(wordform = "Grund") { | 
 |   response <- tryCatch({httr::GET(url = paste0(derekovecs_server, 'getClassicCollocators'), query = list(w=wordform),timeout(10))}, | 
 |                        error=function(cond) return(NULL)) | 
 |   if(!is.null(response) && httr::status_code(response) == 200) { | 
 |     content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)$collocates | 
 |   } | 
 |   else { | 
 |     return(null) | 
 |   } | 
 | } | 
 |  | 
 |  | 
 | getBoth <- function(wordform = "Schmetterlinge") { | 
 |   w2v <- getW2VCollocates(wordform) | 
 |   classic <- getClassicCollocates(wordform) | 
 |   if (!is.null(w2v) && !is.null(classic)) { | 
 |     merge(classic,w2v,by="word",all=TRUE) | 
 |   } | 
 |   else { | 
 |     return(NA) | 
 |   } | 
 | } | 
 |  | 
 | getRanks <- function(collocates,NApenalty=1000) { | 
 |   if (is.null(collocates)) { | 
 |     return(NA) | 
 |   } | 
 |   both <<- | 
 |     collocates %>% | 
 |     arrange(desc(cprob)) %>% | 
 |     mutate(w2v.rank = 1:nrow(.)) %>% | 
 |     mutate(w2v.rank = ifelse(is.na(cprob), NApenalty, w2v.rank)) %>% | 
 |     arrange(desc(prob)) %>% | 
 |     mutate(w2v.rank1 = 1:nrow(.)) %>% | 
 |     mutate(w2v.rank1 = ifelse(is.na(prob), NApenalty, w2v.rank1)) %>% | 
 |     arrange(desc(ld)) %>% | 
 |     mutate(classic.rank = 1:nrow(.)) %>% | 
 |     mutate(classic.rank = ifelse(is.na(ld), NApenalty, classic.rank)) | 
 |   return(both) | 
 | } | 
 |  | 
 | # rankDiff with precalculated collocates | 
 |  | 
 | rankDiff1 <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) { | 
 |   if (is.na(both)) { | 
 |     return(NA) | 
 |   } | 
 |   if (! collocate %in% both$word) | 
 |     return(NA) | 
 |   w2vRank <- both[both$word==collocate,]$w2v.rank | 
 |   classicRank <- both[both$word==collocate,]$classic.rank | 
 |   if (is.numeric(w2vRank) && is.numeric(classicRank)) {w2vRank - classicRank} | 
 |   else {NA} | 
 | } | 
 |  | 
 | featurenames<-c("af", "dice","ld","lfmd","llfmd","llr","lnpmi" ,"md","npmi","pmi","rlfmd","rnpmi","average","cprob","max","overall","prob","w2v.rank","w2v.rank1", "classic.rank") | 
 |  | 
 | getFeatures <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) { | 
 |   if (is.na(both)) { | 
 |     return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1))) | 
 |   } | 
 |   if (! collocate %in% both$word) | 
 |     return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1))) | 
 |   features<-both[both$word==collocate,] | 
 |   w2vRank <- features$w2v.rank | 
 |   classicRank <- features$classic.rank | 
 |   rankDiff<-NA | 
 |   if (is.numeric(w2vRank) && is.numeric(classicRank)) {rankDiff<-w2vRank - classicRank} | 
 |   return(as.numeric(unlist(c(rankDiff,features[1,featurenames])))) | 
 | } | 
 |  | 
 | #w2v <- getW2VCollocates("triftiger") | 
 | #classic <- getClassicCollocates("triftiger") | 
 | #rankDiff("Schmetterlinge", "Bauch") | 
 |  | 
 | firstup <- function(x) { | 
 |   substr(x, 1, 1) <- toupper(substr(x, 1, 1)) | 
 |   x | 
 | } | 
 |  | 
 | # rankDiffCase with precalculated collocates | 
 |  | 
 | rankDiffCase1<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) { | 
 |   if (wordform=="" || collocate=="") { | 
 |     return(NA) | 
 |   } | 
 |   ret<-rankDiff1(wordform,collocate,collocates1) | 
 |   if (is.na(ret)) { | 
 |     ret<- rankDiff1(firstup(wordform),collocate,collocates2) | 
 |     if (is.na(ret)) { | 
 |       ret <- rankDiff1(wordform, firstup(collocate),collocates1) | 
 |       if (is.na(ret)) { | 
 |         ret <- rankDiff1(firstup(wordform),firstup(collocate),collocates2) | 
 |       } | 
 |     } | 
 |   } | 
 |   return(ret) | 
 | } | 
 |  | 
 | getFeaturesCase<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) { | 
 |   if (wordform=="" || collocate=="") { | 
 |     return(NA) | 
 |   } | 
 |   ret1<-getFeatures(wordform,collocate,collocates1) | 
 |   ret2<-getFeatures(firstup(wordform),collocate,collocates2) | 
 |   ret3<-getFeatures(wordform,firstup(collocate),collocates1) | 
 |   ret4<-getFeatures(firstup(wordform),firstup(collocate),collocates2) | 
 |   ret<-rbind(ret1,ret2,ret3,ret4) | 
 |   colnames(ret)<-c("rankDiff",featurenames) | 
 |   colMeans(ret,na.rm=TRUE) | 
 | } | 
 |  | 
 | # get rid of some "offending" characters | 
 |  | 
 | cleanUp<-function(str) { | 
 |   ret<-str_replace_all(str,"[^[:alnum:]]"," ") | 
 |   ret<-str_replace_all(ret,"[\n\r\t]"," ") | 
 |   ret<-str_replace_all(ret,"[ ]+"," ") | 
 |   ret<-str_replace_all(ret,"^ ","") | 
 |   ret<-str_replace_all(ret, " $","") | 
 |   ret | 
 | } | 
 |  | 
 | cleanUpV<-Vectorize(cleanUp) | 
 |  | 
 | deleteStopwords =  function(wl, stopwords = NULL) { | 
 |   wl[!(wl %in% stopwords)] | 
 | } | 
 |  | 
 | ngramme$tokens <- cleanUpV(paste(ngramme$CO_TOKEN1, ngramme$CO_TOKEN2,ngramme$CO_TOKEN3,ngramme$CO_TOKEN4,ngramme$CO_TOKEN5,ngramme$CO_TOKENS6,sep=" ")) | 
 |  | 
 | # retrieve collocates only once per token. | 
 |  | 
 | avgRankDiffCaseLinear<-function(wordlist) { | 
 |   wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords) | 
 |   sum<-0 | 
 |   count<-0 | 
 |   if (length(wl)>1) { | 
 |   for (i in 1:length(wl)) { | 
 |     collocates1<-getRanks(getBoth(wl[i])) | 
 |     collocates2<-getRanks(getBoth(firstup(wl[i]))) | 
 |     for(j in 1:length(wl)) { | 
 |       if (i!=j) { | 
 |         rd<-rankDiffCase1(wl[i],wl[j],collocates1,collocates2) | 
 |         if (!is.na(rd)) { | 
 |           sum=sum+rd | 
 |           count=count+1 | 
 |         } | 
 |       } | 
 |     } | 
 |   } | 
 |   } | 
 |   ret<-ifelse(count>0,sum/count,NA) | 
 |   print(paste(wordlist, ": ", ret)) | 
 |   return(ret) | 
 | } | 
 |  | 
 | avgRankDiffCaseLinearV<-Vectorize(avgRankDiffCaseLinear) | 
 |  | 
 |  | 
 | avgFeaturesCaseLinear<-function(wordlist) { | 
 |   wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords) | 
 |   ret<- data.frame(matrix(ncol = length(featurenames)+1, nrow = 0)) | 
 |   if (length(wl)>1) { | 
 |     for (i in 1:length(wl)) { | 
 |       collocates1<-getRanks(getBoth(wl[i])) | 
 |       collocates2<-getRanks(getBoth(firstup(wl[i]))) | 
 |       for(j in 1:length(wl)) { | 
 |         if (i!=j) { | 
 |           ret<-rbind(ret,getFeaturesCase(wl[i],wl[j],collocates1,collocates2)) | 
 |         } | 
 |       } | 
 |     } | 
 |   } | 
 |   colnames(ret)<-c("rankDiff",featurenames) | 
 |   mret<-colMeans(ret,na.rm=TRUE) | 
 |   print(paste(c(wordlist, ": ", mret))) | 
 |   return(mret) | 
 | } | 
 |  | 
 | ngramme[,c("rankDiff",featurenames)]<-t(sapply(ngramme$tokens,avgFeaturesCaseLinear)) | 
 | write.table(ngramme,file="../data/goldstandard01_anno_ml_synfeat_nstop1.tsv", sep = "\t",quote=F) | 
 |  |