Blame - R/derekovecs_apicall_syn_nlc.R - private/idioms

blob: b896e5cdcc8f4628e52a573e0bcadd80bccff401 [file] [log] [blame]

PeterFankhauserIDS	7cad9b7	2021-02-20 15:47:14 +0100	[diff] [blame^]	1	# Get classic and w2v based collocates from DeReKoVecs API
				2	#
				3	# Have a look at the web gui column header mouse overs to figure out what the columns mean:
				4	# http://corpora.ids-mannheim.de/openlab/derekovecs?word=triftiger&cutoff=500000&n=100&N=2000#tabs-3
				5
				6	library(httr)
				7	library(tidyverse)
				8
				9	stopwords <- readLines(con = "../data/stopwords.txt",encoding="UTF-8")
				10
				11	# ngramme <- read.csv("../data/gold03_anno_ml.csv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")
				12
				13	ngramme <- read.csv("../data/goldstandard01_anno_ml.tsv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")
				14
				15
				16	ngramme[, c(7:23)] <- sapply(ngramme[, c(7:23)], as.numeric)
				17
				18	DeReKoVecsCall <- function(wordform = "Grund", cutoff = 500000, n = 100) {
				19	params = list(word = wordform, cutoff = cutoff, n = n, json = 1)
				20	response <- tryCatch({httr::GET(url = 'http://korap-worker-04:5673', query = params,timeout(10))},
				21	error=function(cond) return(NA))
				22	if(!is.na(response)) {
				23	content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)
				24	}
				25	else {
				26	return(NA)
				27	}
				28	}
				29
				30	getW2VCollocates <- function(wordform = "Grund", ...) {
				31	ret<-DeReKoVecsCall(wordform, ...)
				32	if(!is.na(ret)) ret$collocators
				33	else NA
				34	}
				35
				36	getClassicCollocates <- function(wordform = "Grund") {
				37	response <- tryCatch({httr::GET(url = 'http://korap-worker-04:5673/getClassicCollocators', query = list(w=wordform),timeout(10))},
				38	error=function(cond) return(NA))
				39	if(!is.na(response)) {
				40	content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)$collocates
				41	}
				42	else {
				43	return(NA)
				44	}
				45	}
				46
				47
				48	getBoth <- function(wordform = "Schmetterlinge") {
				49	w2v <- getW2VCollocates(wordform)
				50	classic <- getClassicCollocates(wordform)
				51	if (length(w2v)>0 && length(classic>0) && !is.na(w2v) && !is.na(classic)) {
				52	merge(classic,w2v,by="word",all=TRUE)
				53	}
				54	else {
				55	return(NA)
				56	}
				57	}
				58
				59	getRanks <- function(collocates,NApenalty=1000) {
				60	if (is.na(collocates)) {
				61	return(NA)
				62	}
				63	both <<-
				64	collocates %>%
				65	arrange(desc(cprob)) %>%
				66	mutate(w2v.rank = 1:nrow(.)) %>%
				67	mutate(w2v.rank = ifelse(is.na(cprob), NApenalty, w2v.rank)) %>%
				68	arrange(desc(prob)) %>%
				69	mutate(w2v.rank1 = 1:nrow(.)) %>%
				70	mutate(w2v.rank1 = ifelse(is.na(prob), NApenalty, w2v.rank1)) %>%
				71	arrange(desc(ld)) %>%
				72	mutate(classic.rank = 1:nrow(.)) %>%
				73	mutate(classic.rank = ifelse(is.na(ld), NApenalty, classic.rank))
				74	return(both)
				75	}
				76
				77	# rankDiff with precalculated collocates
				78
				79	rankDiff1 <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
				80	if (is.na(both)) {
				81	return(NA)
				82	}
				83	if (! collocate %in% both$word)
				84	return(NA)
				85	w2vRank <- both[both$word==collocate,]$w2v.rank
				86	classicRank <- both[both$word==collocate,]$classic.rank
				87	if (is.numeric(w2vRank) && is.numeric(classicRank)) {w2vRank - classicRank}
				88	else {NA}
				89	}
				90
				91	featurenames<-c("af", "dice","ld","lfmd","llfmd","llr","lnpmi" ,"md","npmi","pmi","rlfmd","rnpmi","average","cprob","max","overall","prob","w2v.rank","w2v.rank1", "classic.rank")
				92
				93	getFeatures <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
				94	if (is.na(both)) {
				95	return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
				96	}
				97	if (! collocate %in% both$word)
				98	return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
				99	features<-both[both$word==collocate,]
				100	w2vRank <- features$w2v.rank
				101	classicRank <- features$classic.rank
				102	rankDiff<-NA
				103	if (is.numeric(w2vRank) && is.numeric(classicRank)) {rankDiff<-w2vRank - classicRank}
				104	return(as.numeric(unlist(c(rankDiff,features[1,featurenames]))))
				105	}
				106
				107	#w2v <- getW2VCollocates("triftiger")
				108	#classic <- getClassicCollocates("triftiger")
				109	#rankDiff("Schmetterlinge", "Bauch")
				110
				111	firstup <- function(x) {
				112	substr(x, 1, 1) <- toupper(substr(x, 1, 1))
				113	x
				114	}
				115
				116	# rankDiffCase with precalculated collocates
				117
				118	rankDiffCase1<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
				119	if (wordform=="" \|\| collocate=="") {
				120	return(NA)
				121	}
				122	ret<-rankDiff1(wordform,collocate,collocates1)
				123	if (is.na(ret)) {
				124	ret<- rankDiff1(firstup(wordform),collocate,collocates2)
				125	if (is.na(ret)) {
				126	ret <- rankDiff1(wordform, firstup(collocate),collocates1)
				127	if (is.na(ret)) {
				128	ret <- rankDiff1(firstup(wordform),firstup(collocate),collocates2)
				129	}
				130	}
				131	}
				132	return(ret)
				133	}
				134
				135	getFeaturesCase<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
				136	if (wordform=="" \|\| collocate=="") {
				137	return(NA)
				138	}
				139	ret1<-getFeatures(wordform,collocate,collocates1)
				140	ret2<-getFeatures(firstup(wordform),collocate,collocates2)
				141	ret3<-getFeatures(wordform,firstup(collocate),collocates1)
				142	ret4<-getFeatures(firstup(wordform),firstup(collocate),collocates2)
				143	ret<-rbind(ret1,ret2,ret3,ret4)
				144	colnames(ret)<-c("rankDiff",featurenames)
				145	colMeans(ret,na.rm=TRUE)
				146	}
				147
				148	# get rid of some "offending" characters
				149
				150	cleanUp<-function(str) {
				151	ret<-str_replace_all(str,"[^[:alnum:]]"," ")
				152	ret<-str_replace_all(ret,"[\n\r\t]"," ")
				153	ret<-str_replace_all(ret,"[ ]+"," ")
				154	ret<-str_replace_all(ret,"^ ","")
				155	ret<-str_replace_all(ret, " $","")
				156	ret
				157	}
				158
				159	cleanUpV<-Vectorize(cleanUp)
				160
				161	deleteStopwords = function(wl, stopwords = NULL) {
				162	wl[!(wl %in% stopwords)]
				163	}
				164
				165	ngramme$tokens <- cleanUpV(paste(ngramme$CO_TOKEN1, ngramme$CO_TOKEN2,ngramme$CO_TOKEN3,ngramme$CO_TOKEN4,ngramme$CO_TOKEN5,ngramme$CO_TOKENS6,sep=" "))
				166
				167	# retrieve collocates only once per token.
				168
				169	avgRankDiffCaseLinear<-function(wordlist) {
				170	wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
				171	sum<-0
				172	count<-0
				173	if (length(wl)>1) {
				174	for (i in 1:length(wl)) {
				175	collocates1<-getRanks(getBoth(wl[i]))
				176	collocates2<-getRanks(getBoth(firstup(wl[i])))
				177	for(j in 1:length(wl)) {
				178	if (i!=j) {
				179	rd<-rankDiffCase1(wl[i],wl[j],collocates1,collocates2)
				180	if (!is.na(rd)) {
				181	sum=sum+rd
				182	count=count+1
				183	}
				184	}
				185	}
				186	}
				187	}
				188	ret<-ifelse(count>0,sum/count,NA)
				189	print(paste(wordlist, ": ", ret))
				190	return(ret)
				191	}
				192
				193	avgRankDiffCaseLinearV<-Vectorize(avgRankDiffCaseLinear)
				194
				195
				196	avgFeaturesCaseLinear<-function(wordlist) {
				197	wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
				198	ret<- data.frame(matrix(ncol = length(featurenames)+1, nrow = 0))
				199	if (length(wl)>1) {
				200	for (i in 1:length(wl)) {
				201	collocates1<-getRanks(getBoth(wl[i]))
				202	collocates2<-getRanks(getBoth(firstup(wl[i])))
				203	for(j in 1:length(wl)) {
				204	if (i!=j) {
				205	ret<-rbind(ret,getFeaturesCase(wl[i],wl[j],collocates1,collocates2))
				206	}
				207	}
				208	}
				209	}
				210	colnames(ret)<-c("rankDiff",featurenames)
				211	mret<-colMeans(ret,na.rm=TRUE)
				212	print(paste(c(wordlist, ": ", mret)))
				213	return(mret)
				214	}
				215
				216	ngramme[,c("rankDiff",featurenames)]<-t(sapply(ngramme$tokens,avgFeaturesCaseLinear))
				217	write.table(ngramme,file="../data/goldstandard01_anno_ml_synfeat_nstop1.tsv", sep = "\t",quote=F)
				218