blob: b896e5cdcc8f4628e52a573e0bcadd80bccff401 [file] [log] [blame]
PeterFankhauserIDS7cad9b72021-02-20 15:47:14 +01001# Get classic and w2v based collocates from DeReKoVecs API
2#
3# Have a look at the web gui column header mouse overs to figure out what the columns mean:
4# http://corpora.ids-mannheim.de/openlab/derekovecs?word=triftiger&cutoff=500000&n=100&N=2000#tabs-3
5
6library(httr)
7library(tidyverse)
8
9stopwords <- readLines(con = "../data/stopwords.txt",encoding="UTF-8")
10
11# ngramme <- read.csv("../data/gold03_anno_ml.csv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")
12
13ngramme <- read.csv("../data/goldstandard01_anno_ml.tsv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")
14
15
16ngramme[, c(7:23)] <- sapply(ngramme[, c(7:23)], as.numeric)
17
18DeReKoVecsCall <- function(wordform = "Grund", cutoff = 500000, n = 100) {
19 params = list(word = wordform, cutoff = cutoff, n = n, json = 1)
20 response <- tryCatch({httr::GET(url = 'http://korap-worker-04:5673', query = params,timeout(10))},
21 error=function(cond) return(NA))
22 if(!is.na(response)) {
23 content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)
24 }
25 else {
26 return(NA)
27 }
28}
29
30getW2VCollocates <- function(wordform = "Grund", ...) {
31 ret<-DeReKoVecsCall(wordform, ...)
32 if(!is.na(ret)) ret$collocators
33 else NA
34}
35
36getClassicCollocates <- function(wordform = "Grund") {
37 response <- tryCatch({httr::GET(url = 'http://korap-worker-04:5673/getClassicCollocators', query = list(w=wordform),timeout(10))},
38 error=function(cond) return(NA))
39 if(!is.na(response)) {
40 content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)$collocates
41 }
42 else {
43 return(NA)
44 }
45}
46
47
48getBoth <- function(wordform = "Schmetterlinge") {
49 w2v <- getW2VCollocates(wordform)
50 classic <- getClassicCollocates(wordform)
51 if (length(w2v)>0 && length(classic>0) && !is.na(w2v) && !is.na(classic)) {
52 merge(classic,w2v,by="word",all=TRUE)
53 }
54 else {
55 return(NA)
56 }
57}
58
59getRanks <- function(collocates,NApenalty=1000) {
60 if (is.na(collocates)) {
61 return(NA)
62 }
63 both <<-
64 collocates %>%
65 arrange(desc(cprob)) %>%
66 mutate(w2v.rank = 1:nrow(.)) %>%
67 mutate(w2v.rank = ifelse(is.na(cprob), NApenalty, w2v.rank)) %>%
68 arrange(desc(prob)) %>%
69 mutate(w2v.rank1 = 1:nrow(.)) %>%
70 mutate(w2v.rank1 = ifelse(is.na(prob), NApenalty, w2v.rank1)) %>%
71 arrange(desc(ld)) %>%
72 mutate(classic.rank = 1:nrow(.)) %>%
73 mutate(classic.rank = ifelse(is.na(ld), NApenalty, classic.rank))
74 return(both)
75}
76
77# rankDiff with precalculated collocates
78
79rankDiff1 <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
80 if (is.na(both)) {
81 return(NA)
82 }
83 if (! collocate %in% both$word)
84 return(NA)
85 w2vRank <- both[both$word==collocate,]$w2v.rank
86 classicRank <- both[both$word==collocate,]$classic.rank
87 if (is.numeric(w2vRank) && is.numeric(classicRank)) {w2vRank - classicRank}
88 else {NA}
89}
90
91featurenames<-c("af", "dice","ld","lfmd","llfmd","llr","lnpmi" ,"md","npmi","pmi","rlfmd","rnpmi","average","cprob","max","overall","prob","w2v.rank","w2v.rank1", "classic.rank")
92
93getFeatures <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
94 if (is.na(both)) {
95 return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
96 }
97 if (! collocate %in% both$word)
98 return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
99 features<-both[both$word==collocate,]
100 w2vRank <- features$w2v.rank
101 classicRank <- features$classic.rank
102 rankDiff<-NA
103 if (is.numeric(w2vRank) && is.numeric(classicRank)) {rankDiff<-w2vRank - classicRank}
104 return(as.numeric(unlist(c(rankDiff,features[1,featurenames]))))
105}
106
107#w2v <- getW2VCollocates("triftiger")
108#classic <- getClassicCollocates("triftiger")
109#rankDiff("Schmetterlinge", "Bauch")
110
111firstup <- function(x) {
112 substr(x, 1, 1) <- toupper(substr(x, 1, 1))
113 x
114}
115
116# rankDiffCase with precalculated collocates
117
118rankDiffCase1<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
119 if (wordform=="" || collocate=="") {
120 return(NA)
121 }
122 ret<-rankDiff1(wordform,collocate,collocates1)
123 if (is.na(ret)) {
124 ret<- rankDiff1(firstup(wordform),collocate,collocates2)
125 if (is.na(ret)) {
126 ret <- rankDiff1(wordform, firstup(collocate),collocates1)
127 if (is.na(ret)) {
128 ret <- rankDiff1(firstup(wordform),firstup(collocate),collocates2)
129 }
130 }
131 }
132 return(ret)
133}
134
135getFeaturesCase<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
136 if (wordform=="" || collocate=="") {
137 return(NA)
138 }
139 ret1<-getFeatures(wordform,collocate,collocates1)
140 ret2<-getFeatures(firstup(wordform),collocate,collocates2)
141 ret3<-getFeatures(wordform,firstup(collocate),collocates1)
142 ret4<-getFeatures(firstup(wordform),firstup(collocate),collocates2)
143 ret<-rbind(ret1,ret2,ret3,ret4)
144 colnames(ret)<-c("rankDiff",featurenames)
145 colMeans(ret,na.rm=TRUE)
146}
147
148# get rid of some "offending" characters
149
150cleanUp<-function(str) {
151 ret<-str_replace_all(str,"[^[:alnum:]]"," ")
152 ret<-str_replace_all(ret,"[\n\r\t]"," ")
153 ret<-str_replace_all(ret,"[ ]+"," ")
154 ret<-str_replace_all(ret,"^ ","")
155 ret<-str_replace_all(ret, " $","")
156 ret
157}
158
159cleanUpV<-Vectorize(cleanUp)
160
161deleteStopwords = function(wl, stopwords = NULL) {
162 wl[!(wl %in% stopwords)]
163}
164
165ngramme$tokens <- cleanUpV(paste(ngramme$CO_TOKEN1, ngramme$CO_TOKEN2,ngramme$CO_TOKEN3,ngramme$CO_TOKEN4,ngramme$CO_TOKEN5,ngramme$CO_TOKENS6,sep=" "))
166
167# retrieve collocates only once per token.
168
169avgRankDiffCaseLinear<-function(wordlist) {
170 wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
171 sum<-0
172 count<-0
173 if (length(wl)>1) {
174 for (i in 1:length(wl)) {
175 collocates1<-getRanks(getBoth(wl[i]))
176 collocates2<-getRanks(getBoth(firstup(wl[i])))
177 for(j in 1:length(wl)) {
178 if (i!=j) {
179 rd<-rankDiffCase1(wl[i],wl[j],collocates1,collocates2)
180 if (!is.na(rd)) {
181 sum=sum+rd
182 count=count+1
183 }
184 }
185 }
186 }
187 }
188 ret<-ifelse(count>0,sum/count,NA)
189 print(paste(wordlist, ": ", ret))
190 return(ret)
191}
192
193avgRankDiffCaseLinearV<-Vectorize(avgRankDiffCaseLinear)
194
195
196avgFeaturesCaseLinear<-function(wordlist) {
197 wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
198 ret<- data.frame(matrix(ncol = length(featurenames)+1, nrow = 0))
199 if (length(wl)>1) {
200 for (i in 1:length(wl)) {
201 collocates1<-getRanks(getBoth(wl[i]))
202 collocates2<-getRanks(getBoth(firstup(wl[i])))
203 for(j in 1:length(wl)) {
204 if (i!=j) {
205 ret<-rbind(ret,getFeaturesCase(wl[i],wl[j],collocates1,collocates2))
206 }
207 }
208 }
209 }
210 colnames(ret)<-c("rankDiff",featurenames)
211 mret<-colMeans(ret,na.rm=TRUE)
212 print(paste(c(wordlist, ": ", mret)))
213 return(mret)
214}
215
216ngramme[,c("rankDiff",featurenames)]<-t(sapply(ngramme$tokens,avgFeaturesCaseLinear))
217write.table(ngramme,file="../data/goldstandard01_anno_ml_synfeat_nstop1.tsv", sep = "\t",quote=F)
218