blob: 23bd92a46871f4ed87fa3a5fee12b043c5e581d4 [file] [log] [blame]
# Get classic and w2v based collocates from DeReKoVecs API
#
# Have a look at the web gui column header mouse overs to figure out what the columns mean:
# http://corpora.ids-mannheim.de/openlab/derekovecs?word=triftiger&cutoff=500000&n=100&N=2000#tabs-3
library(httr)
library(tidyverse)
stopwords <- readLines(con = "../data/stopwords.txt",encoding="UTF-8")
derekovecs_server = 'https://corpora.ids-mannheim.de/openlab/derekovecs/'
# ngramme <- read.csv("../data/gold03_anno_ml.csv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")
ngramme <- read.csv("../data/goldstandard01_anno_ml.tsv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")
ngramme[, c(7:23)] <- sapply(ngramme[, c(7:23)], as.numeric)
DeReKoVecsCall <- function(wordform = "Grund", cutoff = 500000, n = 100) {
params = list(word = wordform, cutoff = cutoff, n = n, json = 1)
response <- tryCatch({httr::GET(url = derekovecs_server, query = params,timeout(10))},
error=function(cond) return(NULL))
if(!is.null(response) && httr::status_code(response) == 200) {
content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)
}
else {
return(NULL)
}
}
getW2VCollocates <- function(wordform = "Grund", ...) {
ret<-DeReKoVecsCall(wordform, ...)
if(!is.null(ret)) ret$collocators
else NULL
}
getClassicCollocates <- function(wordform = "Grund") {
response <- tryCatch({httr::GET(url = paste0(derekovecs_server, 'getClassicCollocators'), query = list(w=wordform),timeout(10))},
error=function(cond) return(NULL))
if(!is.null(response) && httr::status_code(response) == 200) {
content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)$collocates
}
else {
return(null)
}
}
getBoth <- function(wordform = "Schmetterlinge") {
w2v <- getW2VCollocates(wordform)
classic <- getClassicCollocates(wordform)
if (!is.null(w2v) && !is.null(classic)) {
merge(classic,w2v,by="word",all=TRUE)
}
else {
return(NA)
}
}
getRanks <- function(collocates,NApenalty=1000) {
if (is.null(collocates)) {
return(NA)
}
both <<-
collocates %>%
arrange(desc(cprob)) %>%
mutate(w2v.rank = 1:nrow(.)) %>%
mutate(w2v.rank = ifelse(is.na(cprob), NApenalty, w2v.rank)) %>%
arrange(desc(prob)) %>%
mutate(w2v.rank1 = 1:nrow(.)) %>%
mutate(w2v.rank1 = ifelse(is.na(prob), NApenalty, w2v.rank1)) %>%
arrange(desc(ld)) %>%
mutate(classic.rank = 1:nrow(.)) %>%
mutate(classic.rank = ifelse(is.na(ld), NApenalty, classic.rank))
return(both)
}
# rankDiff with precalculated collocates
rankDiff1 <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
if (is.na(both)) {
return(NA)
}
if (! collocate %in% both$word)
return(NA)
w2vRank <- both[both$word==collocate,]$w2v.rank
classicRank <- both[both$word==collocate,]$classic.rank
if (is.numeric(w2vRank) && is.numeric(classicRank)) {w2vRank - classicRank}
else {NA}
}
featurenames<-c("af", "dice","ld","lfmd","llfmd","llr","lnpmi" ,"md","npmi","pmi","rlfmd","rnpmi","average","cprob","max","overall","prob","w2v.rank","w2v.rank1", "classic.rank")
getFeatures <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
if (is.na(both)) {
return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
}
if (! collocate %in% both$word)
return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
features<-both[both$word==collocate,]
w2vRank <- features$w2v.rank
classicRank <- features$classic.rank
rankDiff<-NA
if (is.numeric(w2vRank) && is.numeric(classicRank)) {rankDiff<-w2vRank - classicRank}
return(as.numeric(unlist(c(rankDiff,features[1,featurenames]))))
}
#w2v <- getW2VCollocates("triftiger")
#classic <- getClassicCollocates("triftiger")
#rankDiff("Schmetterlinge", "Bauch")
firstup <- function(x) {
substr(x, 1, 1) <- toupper(substr(x, 1, 1))
x
}
# rankDiffCase with precalculated collocates
rankDiffCase1<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
if (wordform=="" || collocate=="") {
return(NA)
}
ret<-rankDiff1(wordform,collocate,collocates1)
if (is.na(ret)) {
ret<- rankDiff1(firstup(wordform),collocate,collocates2)
if (is.na(ret)) {
ret <- rankDiff1(wordform, firstup(collocate),collocates1)
if (is.na(ret)) {
ret <- rankDiff1(firstup(wordform),firstup(collocate),collocates2)
}
}
}
return(ret)
}
getFeaturesCase<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
if (wordform=="" || collocate=="") {
return(NA)
}
ret1<-getFeatures(wordform,collocate,collocates1)
ret2<-getFeatures(firstup(wordform),collocate,collocates2)
ret3<-getFeatures(wordform,firstup(collocate),collocates1)
ret4<-getFeatures(firstup(wordform),firstup(collocate),collocates2)
ret<-rbind(ret1,ret2,ret3,ret4)
colnames(ret)<-c("rankDiff",featurenames)
colMeans(ret,na.rm=TRUE)
}
# get rid of some "offending" characters
cleanUp<-function(str) {
ret<-str_replace_all(str,"[^[:alnum:]]"," ")
ret<-str_replace_all(ret,"[\n\r\t]"," ")
ret<-str_replace_all(ret,"[ ]+"," ")
ret<-str_replace_all(ret,"^ ","")
ret<-str_replace_all(ret, " $","")
ret
}
cleanUpV<-Vectorize(cleanUp)
deleteStopwords = function(wl, stopwords = NULL) {
wl[!(wl %in% stopwords)]
}
ngramme$tokens <- cleanUpV(paste(ngramme$CO_TOKEN1, ngramme$CO_TOKEN2,ngramme$CO_TOKEN3,ngramme$CO_TOKEN4,ngramme$CO_TOKEN5,ngramme$CO_TOKENS6,sep=" "))
# retrieve collocates only once per token.
avgRankDiffCaseLinear<-function(wordlist) {
wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
sum<-0
count<-0
if (length(wl)>1) {
for (i in 1:length(wl)) {
collocates1<-getRanks(getBoth(wl[i]))
collocates2<-getRanks(getBoth(firstup(wl[i])))
for(j in 1:length(wl)) {
if (i!=j) {
rd<-rankDiffCase1(wl[i],wl[j],collocates1,collocates2)
if (!is.na(rd)) {
sum=sum+rd
count=count+1
}
}
}
}
}
ret<-ifelse(count>0,sum/count,NA)
print(paste(wordlist, ": ", ret))
return(ret)
}
avgRankDiffCaseLinearV<-Vectorize(avgRankDiffCaseLinear)
avgFeaturesCaseLinear<-function(wordlist) {
wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
ret<- data.frame(matrix(ncol = length(featurenames)+1, nrow = 0))
if (length(wl)>1) {
for (i in 1:length(wl)) {
collocates1<-getRanks(getBoth(wl[i]))
collocates2<-getRanks(getBoth(firstup(wl[i])))
for(j in 1:length(wl)) {
if (i!=j) {
ret<-rbind(ret,getFeaturesCase(wl[i],wl[j],collocates1,collocates2))
}
}
}
}
colnames(ret)<-c("rankDiff",featurenames)
mret<-colMeans(ret,na.rm=TRUE)
print(paste(c(wordlist, ": ", mret)))
return(mret)
}
ngramme[,c("rankDiff",featurenames)]<-t(sapply(ngramme$tokens,avgFeaturesCaseLinear))
write.table(ngramme,file="../data/goldstandard01_anno_ml_synfeat_nstop1.tsv", sep = "\t",quote=F)