Blame - R/derekovecs_apicall_syn_nlc.R - private/idioms

blob: 23bd92a46871f4ed87fa3a5fee12b043c5e581d4 [file] [log] [blame]

PeterFankhauserIDS	7cad9b7	2021-02-20 15:47:14 +0100	[diff] [blame]	1	# Get classic and w2v based collocates from DeReKoVecs API
				2	#
				3	# Have a look at the web gui column header mouse overs to figure out what the columns mean:
				4	# http://corpora.ids-mannheim.de/openlab/derekovecs?word=triftiger&cutoff=500000&n=100&N=2000#tabs-3
				5
				6	library(httr)
				7	library(tidyverse)
				8
				9	stopwords <- readLines(con = "../data/stopwords.txt",encoding="UTF-8")
Marc Kupietz	db86ef9	2023-08-30 15:56:47 +0200	[diff] [blame]	10	derekovecs_server = 'https://corpora.ids-mannheim.de/openlab/derekovecs/'
PeterFankhauserIDS	7cad9b7	2021-02-20 15:47:14 +0100	[diff] [blame]	11
				12	# ngramme <- read.csv("../data/gold03_anno_ml.csv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")
				13
				14	ngramme <- read.csv("../data/goldstandard01_anno_ml.tsv", quote="", header = TRUE, sep = "\t", dec=",", encoding="UTF-8")
				15
				16
				17	ngramme[, c(7:23)] <- sapply(ngramme[, c(7:23)], as.numeric)
				18
				19	DeReKoVecsCall <- function(wordform = "Grund", cutoff = 500000, n = 100) {
				20	params = list(word = wordform, cutoff = cutoff, n = n, json = 1)
Marc Kupietz	db86ef9	2023-08-30 15:56:47 +0200	[diff] [blame]	21	response <- tryCatch({httr::GET(url = derekovecs_server, query = params,timeout(10))},
				22	error=function(cond) return(NULL))
				23	if(!is.null(response) && httr::status_code(response) == 200) {
PeterFankhauserIDS	7cad9b7	2021-02-20 15:47:14 +0100	[diff] [blame]	24	content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)
				25	}
				26	else {
Marc Kupietz	078a75c	2023-08-30 16:08:00 +0200	[diff] [blame^]	27	return(NULL)
PeterFankhauserIDS	7cad9b7	2021-02-20 15:47:14 +0100	[diff] [blame]	28	}
				29	}
				30
				31	getW2VCollocates <- function(wordform = "Grund", ...) {
				32	ret<-DeReKoVecsCall(wordform, ...)
Marc Kupietz	078a75c	2023-08-30 16:08:00 +0200	[diff] [blame^]	33	if(!is.null(ret)) ret$collocators
				34	else NULL
PeterFankhauserIDS	7cad9b7	2021-02-20 15:47:14 +0100	[diff] [blame]	35	}
				36
				37	getClassicCollocates <- function(wordform = "Grund") {
Marc Kupietz	db86ef9	2023-08-30 15:56:47 +0200	[diff] [blame]	38	response <- tryCatch({httr::GET(url = paste0(derekovecs_server, 'getClassicCollocators'), query = list(w=wordform),timeout(10))},
				39	error=function(cond) return(NULL))
				40	if(!is.null(response) && httr::status_code(response) == 200) {
PeterFankhauserIDS	7cad9b7	2021-02-20 15:47:14 +0100	[diff] [blame]	41	content(response, as = 'parsed', type = 'application/json', simplifyDataFrame = TRUE)$collocates
				42	}
				43	else {
Marc Kupietz	078a75c	2023-08-30 16:08:00 +0200	[diff] [blame^]	44	return(null)
PeterFankhauserIDS	7cad9b7	2021-02-20 15:47:14 +0100	[diff] [blame]	45	}
				46	}
				47
				48
				49	getBoth <- function(wordform = "Schmetterlinge") {
				50	w2v <- getW2VCollocates(wordform)
				51	classic <- getClassicCollocates(wordform)
Marc Kupietz	078a75c	2023-08-30 16:08:00 +0200	[diff] [blame^]	52	if (!is.null(w2v) && !is.null(classic)) {
PeterFankhauserIDS	7cad9b7	2021-02-20 15:47:14 +0100	[diff] [blame]	53	merge(classic,w2v,by="word",all=TRUE)
				54	}
				55	else {
				56	return(NA)
				57	}
				58	}
				59
				60	getRanks <- function(collocates,NApenalty=1000) {
Marc Kupietz	078a75c	2023-08-30 16:08:00 +0200	[diff] [blame^]	61	if (is.null(collocates)) {
PeterFankhauserIDS	7cad9b7	2021-02-20 15:47:14 +0100	[diff] [blame]	62	return(NA)
				63	}
				64	both <<-
				65	collocates %>%
				66	arrange(desc(cprob)) %>%
				67	mutate(w2v.rank = 1:nrow(.)) %>%
				68	mutate(w2v.rank = ifelse(is.na(cprob), NApenalty, w2v.rank)) %>%
				69	arrange(desc(prob)) %>%
				70	mutate(w2v.rank1 = 1:nrow(.)) %>%
				71	mutate(w2v.rank1 = ifelse(is.na(prob), NApenalty, w2v.rank1)) %>%
				72	arrange(desc(ld)) %>%
				73	mutate(classic.rank = 1:nrow(.)) %>%
				74	mutate(classic.rank = ifelse(is.na(ld), NApenalty, classic.rank))
				75	return(both)
				76	}
				77
				78	# rankDiff with precalculated collocates
				79
				80	rankDiff1 <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
				81	if (is.na(both)) {
				82	return(NA)
				83	}
				84	if (! collocate %in% both$word)
				85	return(NA)
				86	w2vRank <- both[both$word==collocate,]$w2v.rank
				87	classicRank <- both[both$word==collocate,]$classic.rank
				88	if (is.numeric(w2vRank) && is.numeric(classicRank)) {w2vRank - classicRank}
				89	else {NA}
				90	}
				91
				92	featurenames<-c("af", "dice","ld","lfmd","llfmd","llr","lnpmi" ,"md","npmi","pmi","rlfmd","rnpmi","average","cprob","max","overall","prob","w2v.rank","w2v.rank1", "classic.rank")
				93
				94	getFeatures <- function(wordform = "Schmetterlinge", collocate = "Bauch", both) {
				95	if (is.na(both)) {
				96	return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
				97	}
				98	if (! collocate %in% both$word)
				99	return(as.numeric(rep(c(featurenames[0], NA), length(featurenames)+1)))
				100	features<-both[both$word==collocate,]
				101	w2vRank <- features$w2v.rank
				102	classicRank <- features$classic.rank
				103	rankDiff<-NA
				104	if (is.numeric(w2vRank) && is.numeric(classicRank)) {rankDiff<-w2vRank - classicRank}
				105	return(as.numeric(unlist(c(rankDiff,features[1,featurenames]))))
				106	}
				107
				108	#w2v <- getW2VCollocates("triftiger")
				109	#classic <- getClassicCollocates("triftiger")
				110	#rankDiff("Schmetterlinge", "Bauch")
				111
				112	firstup <- function(x) {
				113	substr(x, 1, 1) <- toupper(substr(x, 1, 1))
				114	x
				115	}
				116
				117	# rankDiffCase with precalculated collocates
				118
				119	rankDiffCase1<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
				120	if (wordform=="" \|\| collocate=="") {
				121	return(NA)
				122	}
				123	ret<-rankDiff1(wordform,collocate,collocates1)
				124	if (is.na(ret)) {
				125	ret<- rankDiff1(firstup(wordform),collocate,collocates2)
				126	if (is.na(ret)) {
				127	ret <- rankDiff1(wordform, firstup(collocate),collocates1)
				128	if (is.na(ret)) {
				129	ret <- rankDiff1(firstup(wordform),firstup(collocate),collocates2)
				130	}
				131	}
				132	}
				133	return(ret)
				134	}
				135
				136	getFeaturesCase<-function(wordform="Schmetterlinge",collocate="Bauch",collocates1,collocates2) {
				137	if (wordform=="" \|\| collocate=="") {
				138	return(NA)
				139	}
				140	ret1<-getFeatures(wordform,collocate,collocates1)
				141	ret2<-getFeatures(firstup(wordform),collocate,collocates2)
				142	ret3<-getFeatures(wordform,firstup(collocate),collocates1)
				143	ret4<-getFeatures(firstup(wordform),firstup(collocate),collocates2)
				144	ret<-rbind(ret1,ret2,ret3,ret4)
				145	colnames(ret)<-c("rankDiff",featurenames)
				146	colMeans(ret,na.rm=TRUE)
				147	}
				148
				149	# get rid of some "offending" characters
				150
				151	cleanUp<-function(str) {
				152	ret<-str_replace_all(str,"[^[:alnum:]]"," ")
				153	ret<-str_replace_all(ret,"[\n\r\t]"," ")
				154	ret<-str_replace_all(ret,"[ ]+"," ")
				155	ret<-str_replace_all(ret,"^ ","")
				156	ret<-str_replace_all(ret, " $","")
				157	ret
				158	}
				159
				160	cleanUpV<-Vectorize(cleanUp)
				161
				162	deleteStopwords = function(wl, stopwords = NULL) {
				163	wl[!(wl %in% stopwords)]
				164	}
				165
				166	ngramme$tokens <- cleanUpV(paste(ngramme$CO_TOKEN1, ngramme$CO_TOKEN2,ngramme$CO_TOKEN3,ngramme$CO_TOKEN4,ngramme$CO_TOKEN5,ngramme$CO_TOKENS6,sep=" "))
				167
				168	# retrieve collocates only once per token.
				169
				170	avgRankDiffCaseLinear<-function(wordlist) {
				171	wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
				172	sum<-0
				173	count<-0
				174	if (length(wl)>1) {
				175	for (i in 1:length(wl)) {
				176	collocates1<-getRanks(getBoth(wl[i]))
				177	collocates2<-getRanks(getBoth(firstup(wl[i])))
				178	for(j in 1:length(wl)) {
				179	if (i!=j) {
				180	rd<-rankDiffCase1(wl[i],wl[j],collocates1,collocates2)
				181	if (!is.na(rd)) {
				182	sum=sum+rd
				183	count=count+1
				184	}
				185	}
				186	}
				187	}
				188	}
				189	ret<-ifelse(count>0,sum/count,NA)
				190	print(paste(wordlist, ": ", ret))
				191	return(ret)
				192	}
				193
				194	avgRankDiffCaseLinearV<-Vectorize(avgRankDiffCaseLinear)
				195
				196
				197	avgFeaturesCaseLinear<-function(wordlist) {
				198	wl<-deleteStopwords(tolower(unlist(strsplit(wordlist," "))),stopwords)
				199	ret<- data.frame(matrix(ncol = length(featurenames)+1, nrow = 0))
				200	if (length(wl)>1) {
				201	for (i in 1:length(wl)) {
				202	collocates1<-getRanks(getBoth(wl[i]))
				203	collocates2<-getRanks(getBoth(firstup(wl[i])))
				204	for(j in 1:length(wl)) {
				205	if (i!=j) {
				206	ret<-rbind(ret,getFeaturesCase(wl[i],wl[j],collocates1,collocates2))
				207	}
				208	}
				209	}
				210	}
				211	colnames(ret)<-c("rankDiff",featurenames)
				212	mret<-colMeans(ret,na.rm=TRUE)
				213	print(paste(c(wordlist, ": ", mret)))
				214	return(mret)
				215	}
				216
				217	ngramme[,c("rankDiff",featurenames)]<-t(sapply(ngramme$tokens,avgFeaturesCaseLinear))
				218	write.table(ngramme,file="../data/goldstandard01_anno_ml_synfeat_nstop1.tsv", sep = "\t",quote=F)
				219