R/derekovecs_apicall_syn_nlc.R - private/idioms - Gitiles

 # Get classic and w2v based collocates from DeReKoVecs API
 #
 # Have a look at the web gui column header mouse overs to figure out what the columns mean:
 # http://corpora.ids-mannheim.de/openlab/derekovecs?word=triftiger&cutoff=500000&n=100&N=2000#tabs-3

 library(httr)
 library(tidyverse)

 stopwords <- readLines(con = "../data/stopwords.txt",encoding="UTF-8")
 derekovecs_server = 'https://corpora.ids-mannheim.de/openlab/derekovecs/'

 # ngramme <- read.csv("../data/gold03_anno_ml.csv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")

 ngramme <- read.csv("../data/goldstandard01_anno_ml.tsv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")


 ngramme[, c(7:23)] <- sapply(ngramme[, c(7:23)], as.numeric)

 DeReKoVecsCall <- function(wordform = "Grund", cutoff = 500000, n = 100) {
   params = list(word = wordform, cutoff = cutoff, n = n, json = 1)
   response <- tryCatch({httr::GET(url = derekovecs_server, query = params,timeout(10))},
                        error=function(cond) return(NULL))
   if(!is.null(response) && httr::status_code(response) == 200) {
     content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)
   }
   else {
     return(NULL)
   }
 }

 getW2VCollocates <- function(wordform = "Grund", ...) {
   ret<-DeReKoVecsCall(wordform, ...)
   if(!is.null(ret)) ret$collocators
   else NULL
 }

 getClassicCollocates <- function(wordform = "Grund") {
   response <- tryCatch({httr::GET(url = paste0(derekovecs_server, 'getClassicCollocators'), query = list(w=wordform),timeout(10))},
                        error=function(cond) return(NULL))
   if(!is.null(response) && httr::status_code(response) == 200) {
     content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)$collocates
   }
   else {
     return(null)
   }
 }


 getBoth <- function(wordform = "Schmetterlinge") {
   w2v <- getW2VCollocates(wordform)
   classic <- getClassicCollocates(wordform)
   if (!is.null(w2v) && !is.null(classic)) {
     merge(classic,w2v,by="word",all=TRUE)
   }
   else {
     return(NA)
   }
 }

 getRanks <- function(collocates,NApenalty=1000) {
   if (is.null(collocates)) {
     return(NA)
   }
   both <<-
     collocates %>%
     arrange(desc(cprob)) %>%
     mutate(w2v.rank = 1:nrow(.)) %>%
     mutate(w2v.rank = ifelse(is.na(cprob), NApenalty, w2v.rank)) %>%
     arrange(desc(prob)) %>%
     mutate(w2v.rank1 = 1:nrow(.)) %>%
     mutate(w2v.rank1 = ifelse(is.na(prob), NApenalty, w2v.rank1)) %>%
     arrange(desc(ld)) %>%
     mutate(classic.rank = 1:nrow(.)) %>%
     mutate(classic.rank = ifelse(is.na(ld), NApenalty, classic.rank))
   return(both)
 }

 # rankDiff with precalculated collocates

 rankDiff1 <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
   if (is.na(both)) {
     return(NA)
   }
   if (! collocate %in% both$word)
     return(NA)
   w2vRank <- both[both$word==collocate,]$w2v.rank
   classicRank <- both[both$word==collocate,]$classic.rank
   if (is.numeric(w2vRank) && is.numeric(classicRank)) {w2vRank - classicRank}
   else {NA}
 }

 featurenames<-c("af", "dice","ld","lfmd","llfmd","llr","lnpmi" ,"md","npmi","pmi","rlfmd","rnpmi","average","cprob","max","overall","prob","w2v.rank","w2v.rank1", "classic.rank")

 getFeatures <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
   if (is.na(both)) {
     return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
   }
   if (! collocate %in% both$word)
     return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
   features<-both[both$word==collocate,]
   w2vRank <- features$w2v.rank
   classicRank <- features$classic.rank
   rankDiff<-NA
   if (is.numeric(w2vRank) && is.numeric(classicRank)) {rankDiff<-w2vRank - classicRank}
   return(as.numeric(unlist(c(rankDiff,features[1,featurenames]))))
 }

 #w2v <- getW2VCollocates("triftiger")
 #classic <- getClassicCollocates("triftiger")
 #rankDiff("Schmetterlinge", "Bauch")

 firstup <- function(x) {
   substr(x, 1, 1) <- toupper(substr(x, 1, 1))
   x
 }

 # rankDiffCase with precalculated collocates

 rankDiffCase1<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
   if (wordform=="" || collocate=="") {
     return(NA)
   }
   ret<-rankDiff1(wordform,collocate,collocates1)
   if (is.na(ret)) {
     ret<- rankDiff1(firstup(wordform),collocate,collocates2)
     if (is.na(ret)) {
       ret <- rankDiff1(wordform, firstup(collocate),collocates1)
       if (is.na(ret)) {
         ret <- rankDiff1(firstup(wordform),firstup(collocate),collocates2)
       }
     }
   }
   return(ret)
 }

 getFeaturesCase<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
   if (wordform=="" || collocate=="") {
     return(NA)
   }
   ret1<-getFeatures(wordform,collocate,collocates1)
   ret2<-getFeatures(firstup(wordform),collocate,collocates2)
   ret3<-getFeatures(wordform,firstup(collocate),collocates1)
   ret4<-getFeatures(firstup(wordform),firstup(collocate),collocates2)
   ret<-rbind(ret1,ret2,ret3,ret4)
   colnames(ret)<-c("rankDiff",featurenames)
   colMeans(ret,na.rm=TRUE)
 }

 # get rid of some "offending" characters

 cleanUp<-function(str) {
   ret<-str_replace_all(str,"[^[:alnum:]]"," ")
   ret<-str_replace_all(ret,"[\n\r\t]"," ")
   ret<-str_replace_all(ret,"[ ]+"," ")
   ret<-str_replace_all(ret,"^ ","")
   ret<-str_replace_all(ret, " $","")
   ret
 }

 cleanUpV<-Vectorize(cleanUp)

 deleteStopwords =  function(wl, stopwords = NULL) {
   wl[!(wl %in% stopwords)]
 }

 ngramme$tokens <- cleanUpV(paste(ngramme$CO_TOKEN1, ngramme$CO_TOKEN2,ngramme$CO_TOKEN3,ngramme$CO_TOKEN4,ngramme$CO_TOKEN5,ngramme$CO_TOKENS6,sep=" "))

 # retrieve collocates only once per token.

 avgRankDiffCaseLinear<-function(wordlist) {
   wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
   sum<-0
   count<-0
   if (length(wl)>1) {
   for (i in 1:length(wl)) {
     collocates1<-getRanks(getBoth(wl[i]))
     collocates2<-getRanks(getBoth(firstup(wl[i])))
     for(j in 1:length(wl)) {
       if (i!=j) {
         rd<-rankDiffCase1(wl[i],wl[j],collocates1,collocates2)
         if (!is.na(rd)) {
           sum=sum+rd
           count=count+1
         }
       }
     }
   }
   }
   ret<-ifelse(count>0,sum/count,NA)
   print(paste(wordlist, ": ", ret))
   return(ret)
 }

 avgRankDiffCaseLinearV<-Vectorize(avgRankDiffCaseLinear)


 avgFeaturesCaseLinear<-function(wordlist) {
   wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
   ret<- data.frame(matrix(ncol = length(featurenames)+1, nrow = 0))
   if (length(wl)>1) {
     for (i in 1:length(wl)) {
       collocates1<-getRanks(getBoth(wl[i]))
       collocates2<-getRanks(getBoth(firstup(wl[i])))
       for(j in 1:length(wl)) {
         if (i!=j) {
           ret<-rbind(ret,getFeaturesCase(wl[i],wl[j],collocates1,collocates2))
         }
       }
     }
   }
   colnames(ret)<-c("rankDiff",featurenames)
   mret<-colMeans(ret,na.rm=TRUE)
   print(paste(c(wordlist, ": ", mret)))
   return(mret)
 }

 ngramme[,c("rankDiff",featurenames)]<-t(sapply(ngramme$tokens,avgFeaturesCaseLinear))
 write.table(ngramme,file="../data/goldstandard01_anno_ml_synfeat_nstop1.tsv", sep = "\t",quote=F)
	# Get classic and w2v based collocates from DeReKoVecs API
	#
	# Have a look at the web gui column header mouse overs to figure out what the columns mean:
	# http://corpora.ids-mannheim.de/openlab/derekovecs?word=triftiger&cutoff=500000&n=100&N=2000#tabs-3

	library(httr)
	library(tidyverse)

	stopwords <- readLines(con = "../data/stopwords.txt",encoding="UTF-8")
	derekovecs_server = 'https://corpora.ids-mannheim.de/openlab/derekovecs/'

	# ngramme <- read.csv("../data/gold03_anno_ml.csv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")

	ngramme <- read.csv("../data/goldstandard01_anno_ml.tsv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")


	ngramme[, c(7:23)] <- sapply(ngramme[, c(7:23)], as.numeric)

	DeReKoVecsCall <- function(wordform = "Grund", cutoff = 500000, n = 100) {
	params = list(word = wordform, cutoff = cutoff, n = n, json = 1)
	response <- tryCatch({httr::GET(url = derekovecs_server, query = params,timeout(10))},
	error=function(cond) return(NULL))
	if(!is.null(response) && httr::status_code(response) == 200) {
	content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)
	}
	else {
	return(NULL)
	}
	}

	getW2VCollocates <- function(wordform = "Grund", ...) {
	ret<-DeReKoVecsCall(wordform, ...)
	if(!is.null(ret)) ret$collocators
	else NULL
	}

	getClassicCollocates <- function(wordform = "Grund") {
	response <- tryCatch({httr::GET(url = paste0(derekovecs_server, 'getClassicCollocators'), query = list(w=wordform),timeout(10))},
	error=function(cond) return(NULL))
	if(!is.null(response) && httr::status_code(response) == 200) {
	content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)$collocates
	}
	else {
	return(null)
	}
	}


	getBoth <- function(wordform = "Schmetterlinge") {
	w2v <- getW2VCollocates(wordform)
	classic <- getClassicCollocates(wordform)
	if (!is.null(w2v) && !is.null(classic)) {
	merge(classic,w2v,by="word",all=TRUE)
	}
	else {
	return(NA)
	}
	}

	getRanks <- function(collocates,NApenalty=1000) {
	if (is.null(collocates)) {
	return(NA)
	}
	both <<-
	collocates %>%
	arrange(desc(cprob)) %>%
	mutate(w2v.rank = 1:nrow(.)) %>%
	mutate(w2v.rank = ifelse(is.na(cprob), NApenalty, w2v.rank)) %>%
	arrange(desc(prob)) %>%
	mutate(w2v.rank1 = 1:nrow(.)) %>%
	mutate(w2v.rank1 = ifelse(is.na(prob), NApenalty, w2v.rank1)) %>%
	arrange(desc(ld)) %>%
	mutate(classic.rank = 1:nrow(.)) %>%
	mutate(classic.rank = ifelse(is.na(ld), NApenalty, classic.rank))
	return(both)
	}

	# rankDiff with precalculated collocates

	rankDiff1 <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
	if (is.na(both)) {
	return(NA)
	}
	if (! collocate %in% both$word)
	return(NA)
	w2vRank <- both[both$word==collocate,]$w2v.rank
	classicRank <- both[both$word==collocate,]$classic.rank
	if (is.numeric(w2vRank) && is.numeric(classicRank)) {w2vRank - classicRank}
	else {NA}
	}

	featurenames<-c("af", "dice","ld","lfmd","llfmd","llr","lnpmi" ,"md","npmi","pmi","rlfmd","rnpmi","average","cprob","max","overall","prob","w2v.rank","w2v.rank1", "classic.rank")

	getFeatures <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
	if (is.na(both)) {
	return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
	}
	if (! collocate %in% both$word)
	return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
	features<-both[both$word==collocate,]
	w2vRank <- features$w2v.rank
	classicRank <- features$classic.rank
	rankDiff<-NA
	if (is.numeric(w2vRank) && is.numeric(classicRank)) {rankDiff<-w2vRank - classicRank}
	return(as.numeric(unlist(c(rankDiff,features[1,featurenames]))))
	}

	#w2v <- getW2VCollocates("triftiger")
	#classic <- getClassicCollocates("triftiger")
	#rankDiff("Schmetterlinge", "Bauch")

	firstup <- function(x) {
	substr(x, 1, 1) <- toupper(substr(x, 1, 1))
	x
	}

	# rankDiffCase with precalculated collocates

	rankDiffCase1<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
	if (wordform=="" \|\| collocate=="") {
	return(NA)
	}
	ret<-rankDiff1(wordform,collocate,collocates1)
	if (is.na(ret)) {
	ret<- rankDiff1(firstup(wordform),collocate,collocates2)
	if (is.na(ret)) {
	ret <- rankDiff1(wordform, firstup(collocate),collocates1)
	if (is.na(ret)) {
	ret <- rankDiff1(firstup(wordform),firstup(collocate),collocates2)
	}
	}
	}
	return(ret)
	}

	getFeaturesCase<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
	if (wordform=="" \|\| collocate=="") {
	return(NA)
	}
	ret1<-getFeatures(wordform,collocate,collocates1)
	ret2<-getFeatures(firstup(wordform),collocate,collocates2)
	ret3<-getFeatures(wordform,firstup(collocate),collocates1)
	ret4<-getFeatures(firstup(wordform),firstup(collocate),collocates2)
	ret<-rbind(ret1,ret2,ret3,ret4)
	colnames(ret)<-c("rankDiff",featurenames)
	colMeans(ret,na.rm=TRUE)
	}

	# get rid of some "offending" characters

	cleanUp<-function(str) {
	ret<-str_replace_all(str,"[^[:alnum:]]"," ")
	ret<-str_replace_all(ret,"[\n\r\t]"," ")
	ret<-str_replace_all(ret,"[ ]+"," ")
	ret<-str_replace_all(ret,"^ ","")
	ret<-str_replace_all(ret, " $","")
	ret
	}

	cleanUpV<-Vectorize(cleanUp)

	deleteStopwords = function(wl, stopwords = NULL) {
	wl[!(wl %in% stopwords)]
	}

	ngramme$tokens <- cleanUpV(paste(ngramme$CO_TOKEN1, ngramme$CO_TOKEN2,ngramme$CO_TOKEN3,ngramme$CO_TOKEN4,ngramme$CO_TOKEN5,ngramme$CO_TOKENS6,sep=" "))

	# retrieve collocates only once per token.

	avgRankDiffCaseLinear<-function(wordlist) {
	wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
	sum<-0
	count<-0
	if (length(wl)>1) {
	for (i in 1:length(wl)) {
	collocates1<-getRanks(getBoth(wl[i]))
	collocates2<-getRanks(getBoth(firstup(wl[i])))
	for(j in 1:length(wl)) {
	if (i!=j) {
	rd<-rankDiffCase1(wl[i],wl[j],collocates1,collocates2)
	if (!is.na(rd)) {
	sum=sum+rd
	count=count+1
	}
	}
	}
	}
	}
	ret<-ifelse(count>0,sum/count,NA)
	print(paste(wordlist, ": ", ret))
	return(ret)
	}

	avgRankDiffCaseLinearV<-Vectorize(avgRankDiffCaseLinear)


	avgFeaturesCaseLinear<-function(wordlist) {
	wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
	ret<- data.frame(matrix(ncol = length(featurenames)+1, nrow = 0))
	if (length(wl)>1) {
	for (i in 1:length(wl)) {
	collocates1<-getRanks(getBoth(wl[i]))
	collocates2<-getRanks(getBoth(firstup(wl[i])))
	for(j in 1:length(wl)) {
	if (i!=j) {
	ret<-rbind(ret,getFeaturesCase(wl[i],wl[j],collocates1,collocates2))
	}
	}
	}
	}
	colnames(ret)<-c("rankDiff",featurenames)
	mret<-colMeans(ret,na.rm=TRUE)
	print(paste(c(wordlist, ": ", mret)))
	return(mret)
	}

	ngramme[,c("rankDiff",featurenames)]<-t(sapply(ngramme$tokens,avgFeaturesCaseLinear))
	write.table(ngramme,file="../data/goldstandard01_anno_ml_synfeat_nstop1.tsv", sep = "\t",quote=F)