blob: 93ab249d777f6bcb18fcd245b339ede0b3f69c9c [file] [log] [blame]
PeterFankhauserIDS7cad9b72021-02-20 15:47:14 +01001# Get classic and w2v based collocates from DeReKoVecs API
2#
3# Have a look at the web gui column header mouse overs to figure out what the columns mean:
4# http://corpora.ids-mannheim.de/openlab/derekovecs?word=triftiger&cutoff=500000&n=100&N=2000#tabs-3
5
6library(httr)
7library(tidyverse)
8
9stopwords <- readLines(con = "../data/stopwords.txt",encoding="UTF-8")
Marc Kupietzdb86ef92023-08-30 15:56:47 +020010derekovecs_server = 'https://corpora.ids-mannheim.de/openlab/derekovecs/'
PeterFankhauserIDS7cad9b72021-02-20 15:47:14 +010011
12# ngramme <- read.csv("../data/gold03_anno_ml.csv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")
13
14ngramme <- read.csv("../data/goldstandard01_anno_ml.tsv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")
15
16
17ngramme[, c(7:23)] <- sapply(ngramme[, c(7:23)], as.numeric)
18
19DeReKoVecsCall <- function(wordform = "Grund", cutoff = 500000, n = 100) {
20 params = list(word = wordform, cutoff = cutoff, n = n, json = 1)
Marc Kupietzdb86ef92023-08-30 15:56:47 +020021 response <- tryCatch({httr::GET(url = derekovecs_server, query = params,timeout(10))},
22 error=function(cond) return(NULL))
23 if(!is.null(response) && httr::status_code(response) == 200) {
PeterFankhauserIDS7cad9b72021-02-20 15:47:14 +010024 content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)
25 }
26 else {
27 return(NA)
28 }
29}
30
31getW2VCollocates <- function(wordform = "Grund", ...) {
32 ret<-DeReKoVecsCall(wordform, ...)
33 if(!is.na(ret)) ret$collocators
34 else NA
35}
36
37getClassicCollocates <- function(wordform = "Grund") {
Marc Kupietzdb86ef92023-08-30 15:56:47 +020038 response <- tryCatch({httr::GET(url = paste0(derekovecs_server, 'getClassicCollocators'), query = list(w=wordform),timeout(10))},
39 error=function(cond) return(NULL))
40 if(!is.null(response) && httr::status_code(response) == 200) {
PeterFankhauserIDS7cad9b72021-02-20 15:47:14 +010041 content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)$collocates
42 }
43 else {
44 return(NA)
45 }
46}
47
48
49getBoth <- function(wordform = "Schmetterlinge") {
50 w2v <- getW2VCollocates(wordform)
51 classic <- getClassicCollocates(wordform)
52 if (length(w2v)>0 && length(classic>0) && !is.na(w2v) && !is.na(classic)) {
53 merge(classic,w2v,by="word",all=TRUE)
54 }
55 else {
56 return(NA)
57 }
58}
59
60getRanks <- function(collocates,NApenalty=1000) {
61 if (is.na(collocates)) {
62 return(NA)
63 }
64 both <<-
65 collocates %>%
66 arrange(desc(cprob)) %>%
67 mutate(w2v.rank = 1:nrow(.)) %>%
68 mutate(w2v.rank = ifelse(is.na(cprob), NApenalty, w2v.rank)) %>%
69 arrange(desc(prob)) %>%
70 mutate(w2v.rank1 = 1:nrow(.)) %>%
71 mutate(w2v.rank1 = ifelse(is.na(prob), NApenalty, w2v.rank1)) %>%
72 arrange(desc(ld)) %>%
73 mutate(classic.rank = 1:nrow(.)) %>%
74 mutate(classic.rank = ifelse(is.na(ld), NApenalty, classic.rank))
75 return(both)
76}
77
78# rankDiff with precalculated collocates
79
80rankDiff1 <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
81 if (is.na(both)) {
82 return(NA)
83 }
84 if (! collocate %in% both$word)
85 return(NA)
86 w2vRank <- both[both$word==collocate,]$w2v.rank
87 classicRank <- both[both$word==collocate,]$classic.rank
88 if (is.numeric(w2vRank) && is.numeric(classicRank)) {w2vRank - classicRank}
89 else {NA}
90}
91
92featurenames<-c("af", "dice","ld","lfmd","llfmd","llr","lnpmi" ,"md","npmi","pmi","rlfmd","rnpmi","average","cprob","max","overall","prob","w2v.rank","w2v.rank1", "classic.rank")
93
94getFeatures <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
95 if (is.na(both)) {
96 return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
97 }
98 if (! collocate %in% both$word)
99 return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
100 features<-both[both$word==collocate,]
101 w2vRank <- features$w2v.rank
102 classicRank <- features$classic.rank
103 rankDiff<-NA
104 if (is.numeric(w2vRank) && is.numeric(classicRank)) {rankDiff<-w2vRank - classicRank}
105 return(as.numeric(unlist(c(rankDiff,features[1,featurenames]))))
106}
107
108#w2v <- getW2VCollocates("triftiger")
109#classic <- getClassicCollocates("triftiger")
110#rankDiff("Schmetterlinge", "Bauch")
111
112firstup <- function(x) {
113 substr(x, 1, 1) <- toupper(substr(x, 1, 1))
114 x
115}
116
117# rankDiffCase with precalculated collocates
118
119rankDiffCase1<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
120 if (wordform=="" || collocate=="") {
121 return(NA)
122 }
123 ret<-rankDiff1(wordform,collocate,collocates1)
124 if (is.na(ret)) {
125 ret<- rankDiff1(firstup(wordform),collocate,collocates2)
126 if (is.na(ret)) {
127 ret <- rankDiff1(wordform, firstup(collocate),collocates1)
128 if (is.na(ret)) {
129 ret <- rankDiff1(firstup(wordform),firstup(collocate),collocates2)
130 }
131 }
132 }
133 return(ret)
134}
135
136getFeaturesCase<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
137 if (wordform=="" || collocate=="") {
138 return(NA)
139 }
140 ret1<-getFeatures(wordform,collocate,collocates1)
141 ret2<-getFeatures(firstup(wordform),collocate,collocates2)
142 ret3<-getFeatures(wordform,firstup(collocate),collocates1)
143 ret4<-getFeatures(firstup(wordform),firstup(collocate),collocates2)
144 ret<-rbind(ret1,ret2,ret3,ret4)
145 colnames(ret)<-c("rankDiff",featurenames)
146 colMeans(ret,na.rm=TRUE)
147}
148
149# get rid of some "offending" characters
150
151cleanUp<-function(str) {
152 ret<-str_replace_all(str,"[^[:alnum:]]"," ")
153 ret<-str_replace_all(ret,"[\n\r\t]"," ")
154 ret<-str_replace_all(ret,"[ ]+"," ")
155 ret<-str_replace_all(ret,"^ ","")
156 ret<-str_replace_all(ret, " $","")
157 ret
158}
159
160cleanUpV<-Vectorize(cleanUp)
161
162deleteStopwords = function(wl, stopwords = NULL) {
163 wl[!(wl %in% stopwords)]
164}
165
166ngramme$tokens <- cleanUpV(paste(ngramme$CO_TOKEN1, ngramme$CO_TOKEN2,ngramme$CO_TOKEN3,ngramme$CO_TOKEN4,ngramme$CO_TOKEN5,ngramme$CO_TOKENS6,sep=" "))
167
168# retrieve collocates only once per token.
169
170avgRankDiffCaseLinear<-function(wordlist) {
171 wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
172 sum<-0
173 count<-0
174 if (length(wl)>1) {
175 for (i in 1:length(wl)) {
176 collocates1<-getRanks(getBoth(wl[i]))
177 collocates2<-getRanks(getBoth(firstup(wl[i])))
178 for(j in 1:length(wl)) {
179 if (i!=j) {
180 rd<-rankDiffCase1(wl[i],wl[j],collocates1,collocates2)
181 if (!is.na(rd)) {
182 sum=sum+rd
183 count=count+1
184 }
185 }
186 }
187 }
188 }
189 ret<-ifelse(count>0,sum/count,NA)
190 print(paste(wordlist, ": ", ret))
191 return(ret)
192}
193
194avgRankDiffCaseLinearV<-Vectorize(avgRankDiffCaseLinear)
195
196
197avgFeaturesCaseLinear<-function(wordlist) {
198 wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
199 ret<- data.frame(matrix(ncol = length(featurenames)+1, nrow = 0))
200 if (length(wl)>1) {
201 for (i in 1:length(wl)) {
202 collocates1<-getRanks(getBoth(wl[i]))
203 collocates2<-getRanks(getBoth(firstup(wl[i])))
204 for(j in 1:length(wl)) {
205 if (i!=j) {
206 ret<-rbind(ret,getFeaturesCase(wl[i],wl[j],collocates1,collocates2))
207 }
208 }
209 }
210 }
211 colnames(ret)<-c("rankDiff",featurenames)
212 mret<-colMeans(ret,na.rm=TRUE)
213 print(paste(c(wordlist, ": ", mret)))
214 return(mret)
215}
216
217ngramme[,c("rankDiff",featurenames)]<-t(sapply(ngramme$tokens,avgFeaturesCaseLinear))
218write.table(ngramme,file="../data/goldstandard01_anno_ml_synfeat_nstop1.tsv", sep = "\t",quote=F)
219