Import Change-Id: I3e94bfa960151d26d2420393133cdce0d1f69797

commit: 686c431329bb36421ac2142812cb3216df4bd105 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Fri Jun 23 15:41:44 2023 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Fri Jun 23 15:41:44 2023 +0200
tree: a0539d82a0eb5fbdb4a802f700c202fa53c76d14
diff --git a/NKJP.Rproj b/NKJP.Rproj
new file mode 100644
index 0000000..e83436a
--- /dev/null
+++ b/NKJP.Rproj

@@ -0,0 +1,16 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes

diff --git a/common.R b/common.R
new file mode 100644
index 0000000..7b3fca1
--- /dev/null
+++ b/common.R

@@ -0,0 +1,53 @@
+library(RKorAPClient)
+library(httr)
+library(httpuv)
+library(tidyverse)
+library(scales)
+library(idsThemeR)
+library(kableExtra)
+library(DT)
+
+# Connection object shared by the analyses that source this file; it points
+# at the KorAP instance reachable under the URL below.
+nkjp <- new(
+  "KorAPConnection",
+  KorAPUrl = "https://korap.ids-mannheim.de/instance/nkjp1m-sgjp"
+)
+#' Extract the distinct word tokens of a query string.
+#'
+#' The query is split at runs of spaces and punctuation characters (and at
+#' the literal HTML entity `&quot;`). Only pieces made of alphanumeric
+#' characters with at most one inner or trailing hyphen are kept.
+#'
+#' @param query Character vector holding the query text.
+#' @return Character vector of unique tokens, ordered by decreasing length;
+#'   tokens of equal length are in decreasing string order.
+wordsFromQuery <- function(query) {
+  # "\u00ab" is the left guillemet. The former "\uc2\uab" spelled out its
+  # UTF-8 bytes as two separate code points and thereby also split words at
+  # U+00C2.
+  separators <- "([! )(\u00ab,.:?\u201e\u201c\'\"]+|&quot;)"
+  v <- str_split(query, separators) %>%
+    unlist() %>%
+    unique()
+  # Drops the empty strings produced by leading/trailing separators as well
+  # as anything that still contains non-word characters.
+  v <- v[str_detect(v, '^[:alnum:]+-?[:alnum:]*$')]
+  v[order(nchar(v), v, decreasing = TRUE)]
+}
+
+#' Wrap every occurrence of the given substrings in `<b>` tags.
+#'
+#' @param string Character vector to mark up.
+#' @param substrings Character vector of alternatives. They are joined with
+#'   `|` and used as one regular expression, so regex metacharacters in them
+#'   are interpreted rather than matched literally.
+#' @return `string` with each match enclosed in `<b>...</b>`; `string` is
+#'   returned unchanged when no usable substring is supplied.
+highliteSubstrings <- function(string, substrings) {
+  # An empty alternative (or an altogether empty group) matches the empty
+  # string at every position and would scatter empty <b></b> pairs through
+  # the text; an NA entry would be pasted as the literal text "NA".
+  substrings <- substrings[!is.na(substrings) & nzchar(substrings)]
+  if (length(substrings) == 0) {
+    return(string)
+  }
+  what <- paste0('(', paste0(substrings, collapse = "|"), ')')
+  with <- '<b>\\1</b>'
+  str_replace_all(string, what, with)
+}
+
+#' Reduce a `<b>`-highlighted string to its highlighted parts.
+#'
+#' @param string Character vector containing `<b>...</b>` markup.
+#' @return `string` after four successive replacements: highlights separated
+#'   only by spaces are joined with one space, other tag-free text between
+#'   two highlights becomes " ... ", and the tag-free text before the first
+#'   `<b>` and after the last `</b>` is removed together with those tags.
+deleteFillers <- function(string) {
+  # Neighbouring highlights: keep a single blank between them.
+  joined <- str_replace_all(string, '</b> +<b>', ' ')
+  # Any remaining gap between two highlights is abbreviated.
+  abbreviated <- str_replace_all(joined, '</b>[^<]+<b>', ' ... ')
+  # Cut the lead-in up to and including the opening tag ...
+  without_lead <- str_replace_all(abbreviated, '^[^<]*<b>', '')
+  # ... and the tail starting at the closing tag.
+  str_replace_all(without_lead, '</b>[^<]*$', '')
+}
+
+#' Show the first collocates together with a shortened usage example.
+#'
+#' @param df Data frame with the columns `webUIRequestUrl`, `collocate`,
+#'   `example`, `logDice`, `pmi` and `ll`.
+#' @return A `datatable` widget of the first 50 rows (columns Collocate,
+#'   Example, logDice, pmi, ll) with the three scores rounded to 2 digits.
+show_table <- function(df) {
+  # Render each collocate as a link pointing to its `webUIRequestUrl`.
+  linked <- mutate(
+    df,
+    Collocate = sprintf('<a href="%s">%s</a>', webUIRequestUrl, collocate)
+  )
+  # Shorten the example: keep two word tokens before the <mark> span ...
+  left_cut <- mutate(
+    linked,
+    example = str_replace(example, ".*(\\W+\\w+\\W+\\w+\\W+<mark.*/mark>.*)", "\\1")
+  )
+  # ... and two word tokens after it; examples that do not match stay as is.
+  both_cut <- mutate(
+    left_cut,
+    example = str_replace(example, "(.*<mark.*/mark>\\W+\\w+\\W+\\w+).*", "\\1")
+  )
+  # Row-wise evaluation goes with the per-row highlighting that is disabled:
+  #    mutate(Example=highliteSubstrings(example, wordsFromQuery(query))) %>%
+  by_row <- rowwise(both_cut)
+  top_rows <- head(
+    select(mutate(by_row, Example = example), Collocate, Example, logDice, pmi, ll),
+    50
+  )
+  # escape = FALSE: cell contents are inserted as HTML, not as plain text.
+  widget <- datatable(top_rows, escape = FALSE)
+  formatRound(widget, columns = ~logDice + pmi + ll, digits = 2)
+}
+
+#' Show the first collocates with their association scores only.
+#'
+#' @param df Data frame with the columns `webUIRequestUrl`, `collocate`,
+#'   `logDice`, `pmi` and `ll`.
+#' @return A `datatable` widget of the first 50 rows (columns Collocate,
+#'   logDice, pmi, ll), 20 rows per page, scores rounded to 2 digits.
+show_simple_table <- function(df) {
+  # Render each collocate as a link pointing to its `webUIRequestUrl`.
+  linked <- mutate(
+    df,
+    Collocate = sprintf('<a href="%s">%s</a>', webUIRequestUrl, collocate)
+  )
+  top_rows <- head(select(linked, Collocate, logDice, pmi, ll), 50)
+  # escape = FALSE: cell contents are inserted as HTML, not as plain text.
+  widget <- datatable(
+    top_rows,
+    options = list(pageLength = 20),
+    escape = FALSE
+  )
+  formatRound(widget, columns = ~logDice + pmi + ll, digits = 2)
+}

diff --git a/lvc_identification.Rmd b/lvc_identification.Rmd
new file mode 100644
index 0000000..f16249a
--- /dev/null
+++ b/lvc_identification.Rmd

@@ -0,0 +1,88 @@
+---
+title: "Assembling EuReCo for Contrastive Research"
+subtitle: "The Polish Piece"
+author:
+    - name: Piotr Bański
+    - name: Nils Diewald
+    - name: Marc Kupietz
+    - name: Beata Trawiński
+affiliation:
+    address: IDS Mannheim
+column_numbers: 2
+contact:
+    name: Piotr Bański
+    department: Digital Linguistics
+    email: banski@ids-mannheim.de
+    website: "https://korap.ids-mannheim.de/instance/nkjp1m-sgjp"
+
+output: 
+  posterdown::posterdown_ids:
+    self_contained: false
+    keep_md: true
+
+bibliography: references.bib
+csl: "https://raw.githubusercontent.com/ICLC-10/Zotero/master/styles/ICLC-10.csl"
+---
+
+```{r setup, include=FALSE, echo=FALSE, message=FALSE, warning=FALSE}
+knitr::opts_chunk$set(dev = 'svg', echo = FALSE, warning = FALSE)
+source("common.R")
+```
+# Plain collocation analysis
+
+without restriction to NN/subst
+
+## da(wa)?ć
+
+```{r dac_simple, echo=TRUE}
+collocationAnalysis(
+  nkjp,
+      '[nkjp/l="da(wa)?ć"]',
+      leftContextSize = 5,
+      rightContextSize = 5,
+      minOccur = 5
+    ) %>%
+  show_simple_table()
+```
+
+# Identification of Light Verb Constructions
+using collocation analysis
+
+## da(wa)?ć
+
+```{r dac, echo=TRUE}
+collocationAnalysis(
+  nkjp,
+      'focus({[nkjp/l="da(wa)?ć"] []{,5}} [nkjp/p=subst])',
+      leftContextSize = 0,
+      rightContextSize = 1, # relative to { ... } in focus(),
+      minOccur = 5,
+      addExamples = TRUE
+    ) %>%
+  show_table()
+```
+
+## (Z)robić
+
+```{r robic, echo=TRUE}
+collocationAnalysis(
+  nkjp,
+    'focus({[nkjp/l="z?robić"] []{,5}} [nkjp/p=subst])',
+    leftContextSize = 0,
+    rightContextSize = 1, # relative to { ... } in focus(),
+    minOccur = 5,
+    addExamples = TRUE
+  ) %>%
+  show_table()
+```
+
+```{r brac, echo=TRUE}
+collocationAnalysis(
+  nkjp,
+      'focus({[nkjp/l="brać" | nkjp/l="wziąć"] []{,5}} [nkjp/p=subst])',
+      leftContextSize = 0,
+      rightContextSize = 1, # relative to { ... } in focus(),
+      minOccur = 5,
+      addExamples = TRUE
+    ) %>%
+  show_table()
+```

diff --git a/references.bib b/references.bib
new file mode 100644
index 0000000..b0c648f
--- /dev/null
+++ b/references.bib

@@ -0,0 +1,213 @@
+
+@book{greenbaum_comparing_1996,
+	address = {Oxford},
+	title = {Comparing {English} {Worldwide}: {The} {International} {Corpus} of {English}},
+	publisher = {Clarendon Press},
+	editor = {Greenbaum, Sidney},
+	year = {1996},
+}
+
+@book{teich_cross-linguistic_2003,
+	address = {Berlin},
+	title = {Cross-{Linguistic} {Variation} in {System} and {Text}: {A} {Methodology} for the {Investigation} of {Translations} and {Comparable} {Texts}},
+	publisher = {Mouton de Gruyter},
+	author = {Teich, Elke},
+	year = {2003},
+}
+
+@inproceedings{diewald_korap_2016,
+	address = {Portorož, Slovenia},
+	title = {{KorAP} {Architecture} ― {Diving} in the {Deep} {Sea} of {Corpus} {Data}},
+	url = {https://www.aclweb.org/anthology/L16-1569},
+	booktitle = {Proceedings of the {Tenth} {International} {Conference} on {Language} {Resources} and {Evaluation} ({LREC}'16)},
+	publisher = {European Language Resources Association (ELRA)},
+	author = {Diewald, Nils and Hanl, Michael and Margaretha, Eliza and Bingel, Joachim and Kupietz, Marc and Bański, Piotr and Witt, Andreas},
+	month = may,
+	year = {2016},
+	pages = {3586--3591},
+}
+
+@inproceedings{borin_korp_2012,
+	address = {Istanbul, Turkey},
+	title = {Korp — the corpus infrastructure of {Språkbanken}},
+	url = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/248_Paper.pdf},
+	booktitle = {Proceedings of the {Eighth} {International} {Conference} on {Language} {Resources} and {Evaluation} ({LREC}'12)},
+	publisher = {European Language Resources Association (ELRA)},
+	author = {Borin, Lars and Forsberg, Markus and Roxendal, Johan},
+	month = may,
+	year = {2012},
+	pages = {474--478},
+}
+
+@inproceedings{machalek_kontext_2020,
+	address = {Marseille, France},
+	title = {{KonText}: {Advanced} and {Flexible} {Corpus} {Query} {Interface}},
+	isbn = {979-10-95546-34-4},
+	url = {https://www.aclweb.org/anthology/2020.lrec-1.865},
+	language = {English},
+	booktitle = {Proceedings of the 12th {Language} {Resources} and {Evaluation} {Conference}},
+	publisher = {European Language Resources Association},
+	author = {Machálek, Tomáš},
+	month = may,
+	year = {2020},
+	pages = {7003--7008},
+}
+
+@inproceedings{kirk_ice_2017,
+	title = {From {ICE} to {ICC}: {The} new {International} {Comparable} {Corpus}},
+	url = {https://nbn-resolving.org/urn:nbn:de:bsz:mh39-62490},
+	booktitle = {Proceedings of the {Workshop} on {Challenges} in the {Management} of {Large} {Corpora} and {Big} {Data} and {Natural} {Language} {Processing} ({CMLC}-5+{BigNLP}) 2017},
+	publisher = {IDS},
+	author = {Kirk, John and Čermáková, Anna},
+	editor = {Bański, Piotr and Kupietz, Marc and Lüngen, Harald and Rayson, Paul and Biber, Hanno and Breiteneder, Evelyn and Clematide, Simon and Mariani, John and Stevenson, Mark and Sick, Theresa},
+	year = {2017},
+	pages = {7--12},
+}
+
+@article{kupietz_recent_2020,
+	series = {Corpora and {Language} in {Use}},
+	title = {Recent developments in the {European} {Reference} {Corpus} {EuReCo}},
+	journal = {Translating and Comparing Languages: Corpus-based Insights. Selected Proceedings of the Fifth Using Corpora in Contrastive and Translation Studies Conference. Louvain-la-Neuve: Presses universitaires de Louvain},
+	author = {Kupietz, Marc and Diewald, Nils and Trawiński, Beata and Cosma, Ruxandra and Cristea, Dan and Tufiş, Dan and Váradi, Tamás and Wöllstein, Angelika},
+	year = {2020},
+	pages = {257--273},
+}
+
+@inproceedings{nivre_universal_2020,
+	address = {Marseille, France},
+	title = {Universal {Dependencies} v2: {An} {Evergrowing} {Multilingual} {Treebank} {Collection}},
+	isbn = {979-10-95546-34-4},
+	url = {https://www.aclweb.org/anthology/2020.lrec-1.497},
+	language = {English},
+	booktitle = {Proceedings of the 12th {Language} {Resources} and {Evaluation} {Conference}},
+	publisher = {European Language Resources Association},
+	author = {Nivre, Joakim and de Marneffe, Marie-Catherine and Ginter, Filip and Hajič, Jan and Manning, Christopher D. and Pyysalo, Sampo and Schuster, Sebastian and Tyers, Francis and Zeman, Daniel},
+	month = may,
+	year = {2020},
+	pages = {4034--4043},
+}
+
+@article{cermakova_international_2021,
+	title = {The {International} {Comparable} {Corpus}: {Challenges} in building multilingual spoken and written comparable corpora},
+	volume = {9},
+	issn = {2243-4712},
+	url = {https://nbn-resolving.org/urn:nbn:de:bsz:mh39-105084},
+	doi = {10.32714/ricl.09.01.06},
+	abstract = {This paper reports on the efforts of twelve national teams in building the International Comparable Corpus (ICC; https://korpus.cz/icc) that will contain highly comparable datasets of spoken, written and electronic registers. The languages currently covered are Czech, Finnish, French, German, Irish, Italian, Norwegian, Polish, Slovak, Swedish and, more recently, Chinese, as well as English, which is considered to be the pivot language. The goal of the project is to provide much-needed data for contrastive corpus-based linguistics. The ICC corpus is committed to the idea of re-using existing multilingual resources as much as possible and the design is modelled, with various adjustments, on the International Corpus of English (ICE). As such, ICC will contain approximately the same balance of forty percent of written language and 60 percent of spoken language distributed across 27 different text types and contexts. A number of issues encountered by the project teams are discussed, ranging from copyright and data sustainability to technical advances in data distribution.},
+	language = {en},
+	number = {1},
+	journal = {Research in Corpus Linguistics: Special issue "Challenges of combining structured and unstructured data in corpus development"},
+	author = {Čermáková, Anna and Jantunen, Jarmo and Jauhiainen, Tommi and Kirk, John and Křen, Michal and Kupietz, Marc and Uí Dhonnchadha, Elaine},
+	editor = {Säily, Tanja and Tyrkkö, Jukka},
+	year = {2021},
+	note = {Place: Murcia
+Publisher: Spanish Association for Corpus Linguistics},
+	pages = {89--103},
+}
+
+@incollection{kupietz_building_2022,
+	address = {Berlin},
+	title = {Building paths to corpus data: {A} multi-level least effort and maximum return approach},
+	url = {https://doi.org/10.1515/9783110767377-007},
+	booktitle = {{CLARIN}. {The} {Infrastructure} for {Language} {Resources}.},
+	publisher = {De Gruyter},
+	author = {Kupietz, Marc and Diewald, Nils and Margaretha, Eliza},
+	editor = {Fišer, Darja and Witt, Andreas},
+	year = {2022},
+	note = {Section: number x},
+}
+
+@article{cermakova_be_nodate,
+	title = {‘{Be}’ verbs in a contrastive perspective: {The} case of být, be and være.},
+	journal = {Nordic Journal of English Studies},
+	author = {Čermáková, Anna and Ebeling, Jarle and Ebeling Oksefjell, Signe},
+}
+
+@incollection{kupietz_neue_2022,
+	address = {Bern},
+	series = {Jahrbuch für {Internationale} {Germanistik} - {Beihefte} - 6},
+	title = {Neue {Perspektiven} für kontrastive {Korpuslinguistik}: {Das} {Europäische} {Referenzkorpus} {EuReCo}},
+	isbn = {978-3-0343-3660-4},
+	abstract = {Dieser Beitrag beschreibt die Motivation und Ziele hinter der Initiative Europäisches Referenzkorpus EuReCo. Ausgehend von den Desiderata, die sich aufgrund der Defizite verfügbarer Forschungsdaten wie monolinguale Korpora, Parallelkorpora und Vergleichskorpora für den Sprachvergleich ergeben, werden die bisherigen und die laufenden Arbeiten im Rahmen von EuReCo präsentiert und anhand vergleichender deutsch-rumänischer Kookkurrenzanalysen neue Perspektiven für kontrastive Korpuslinguistik, die die EuReCo-Initiative öffnet, skizziert.},
+	booktitle = {Wege der {Germanistik} in transkultureller {Perspektive}. {Akten} des {XIV}. {Kongresses} der {Internationalen} {Vereinigung} für {Germanistik} ({IVG}) ({Bd}. 6)},
+	publisher = {Peter Lang},
+	author = {Kupietz, Marc and Trawiński, Beata},
+	editor = {Auteri, Laura and Barrale, Natascia and Di Bella, Arianna and Hoffmann, Sabine},
+	year = {2022},
+	keywords = {Kontrastive Linguistik, Korpus, Deutsch, Funktionsverbgefüge, Kookkurrenzanalyse, Korpuslinguistik, Rumänisch, Vergleichbare Korpora},
+	pages = {417--439},
+}
+
+@incollection{hardy_multi-dimensional_2015,
+	address = {London},
+	title = {Multi-{Dimensional} {Analysis} of {Academic} {Discourse}},
+	isbn = {978-1-137-43173-8},
+	url = {https://doi.org/10.1057/9781137431738_8},
+	abstract = {This chapter provides an overview of multi-dimensional (MD) analysis and important findings in this area of research. This approach to the study of language variation and discourse communities is then exemplified through a case study of an MD analysis of student writing from the Michigan Corpus of Upper-level Student Papers (MICUSP), which includes four different levels of discourse community members: final-year undergraduate students, and first-, second-, and third-year graduate students. Although variation of MICUSP has been investigated according to discipline (Hardy and Römer, 2013) and paper type (Hardy and Friginal, 2014), it has not been investigated according to writer level.},
+	booktitle = {Corpora and {Discourse} {Studies}: {Integrating} {Discourse} and {Corpora}},
+	publisher = {Palgrave Macmillan UK},
+	author = {Hardy, Jack A.},
+	editor = {Baker, Paul and McEnery, Tony},
+	year = {2015},
+	doi = {10.1057/9781137431738_8},
+	pages = {155--174},
+}
+
+@article{biber_spoken_1986,
+	title = {Spoken and {Written} {Textual} {Dimensions} in {English}: {Resolving} the {Contradictory} {Findings}},
+	volume = {62},
+	issn = {00978507, 15350665},
+	url = {http://www.jstor.org/stable/414678},
+	doi = {10.2307/414678},
+	abstract = {[Although similarities and differences between speech and writing have often been studied, contradictory claims concerning the linguistic relationship between the two modes are still common. These contradictions can arise from basing global conclusions on restricted methodologies-such as assigning undue weight to individual linguistic features, or to choice of particular text samples and text types. The present study uses a 'multi-feature/multi-dimension' approach, which includes a broad range of linguistic features and text types in a single quantitative analysis, to provide a global description of similarities and differences among spoken/written text types in English. The distribution of 41 linguistic features in 545 text samples of approximately 2000 words each is subjected to factor analysis (a multivariate statistical technique). Three underlying textual dimensions are identified: Interactive vs. Edited Text, Abstract vs. Situated Content, and Reported vs. Immediate Style. To demonstrate the value of the multi-feature/multi-dimension approach, the specific findings of earlier studies are reconciled within the model proposed here.]},
+	number = {2},
+	urldate = {2023-04-30},
+	journal = {Language},
+	author = {Biber, Douglas},
+	year = {1986},
+	note = {Publisher: Linguistic Society of America},
+	pages = {384--414},
+	file = {Spoken and Written Textual Dimensions in English\: Resolving the Contradictory Findings:/home/kupietz/Zotero/storage/938FXDXC/biber1986.pdf.pdf:application/pdf},
+}
+
+@inproceedings{straka_udpipe_2018,
+	address = {Brussels, Belgium},
+	title = {{UDPipe} 2.0 {Prototype} at {CoNLL} 2018 {UD} {Shared} {Task}},
+	url = {https://www.aclweb.org/anthology/K18-2020},
+	doi = {10.18653/v1/K18-2020},
+	booktitle = {Proceedings of the {CoNLL} 2018 {Shared} {Task}: {Multilingual} {Parsing} from {Raw} {Text} to {Universal} {Dependencies}},
+	publisher = {Association for Computational Linguistics},
+	author = {Straka, Milan},
+	month = oct,
+	year = {2018},
+	pages = {197--207},
+}
+
+@inproceedings{Kupietz:Diewald:Hanl:Margaretha:2016,
+	address = {Mannheim, Germany},
+	series = {Proceedings of the {Methodenmesse} im {Rahmen} der {Jahrestagung} des {Instituts} für {Deutsche} {Sprache}},
+	title = {Möglichkeiten der {Erforschung} grammatischer {Variation} mithilfe von {KorAP}, der neuen {Korpusanalyseplattform} des {IDS}},
+	copyright = {All rights reserved},
+	booktitle = {Grammatische {Variation}. {Empirische} {Zugänge} und theoretische {Modellierung}},
+	publisher = {De Gruyter},
+	author = {Kupietz, Marc and Diewald, Nils and Hanl, Michael and Margaretha, Eliza},
+	year = {2016},
+	pages = {319--329},
+	file = {Kupietz et al. - 2016 - Möglichkeiten der Erforschung grammatischer Variat.pdf:/home/kupietz/Zotero/storage/8K4AI4T9/Kupietz et al. - 2016 - Möglichkeiten der Erforschung grammatischer Variat.pdf:application/pdf},
+}
+
+@inproceedings{Banski:Fischer:Frick:Ketzan:Kupietz:Schnober:Schonefeld:Witt:2012,
+	address = {Istanbul, Turkey},
+	title = {The {New} {IDS} {Corpus} {Analysis} {Platform}: {Challenges} and {Prospects}},
+	shorttitle = {The {New} {IDS} {Corpus} {Analysis} {Platform}},
+	url = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf},
+	abstract = {The present article describes the first stage of the KorAP project, launched recently at the Institut für Deutsche Sprache (IDS) in Mannheim, Germany. The aim of this project is to develop an innovative corpus analysis platform to tackle the increasing demands of modern linguistic research. The platform will facilitate new linguistic findings by making it possible to manage and analyse primary data and annotations in the petabyte range, while at the same time allowing an undistorted view of the primary linguistic data, and thus fully satisfying the demands of a scientific tool. An additional important aim of the project is to make corpus data as openly accessible as possible in light of unavoidable legal restrictions, for instance through support for distributed virtual corpora, user-defined annotations and adaptable user interfaces, as well as interfaces and sandboxes for user-supplied analysis applications. We discuss our motivation for undertaking this endeavour and the challenges that face it. Next, we outline our software implementation plan and describe development to-date.},
+	urldate = {2022-04-12},
+	booktitle = {Proceedings of the {Eighth} {International} {Conference} on {Language} {Resources} and {Evaluation} ({LREC}'12)},
+	publisher = {European Language Resources Association (ELRA)},
+	author = {Bański, Piotr and Fischer, Peter M. and Frick, Elena and Ketzan, Erik and Kupietz, Marc and Schnober, Carsten and Schonefeld, Oliver and Witt, Andreas},
+	month = may,
+	year = {2012},
+	pages = {2905--2911},
+	file = {Full Text PDF:/home/kupietz/Zotero/storage/IC9U5T6F/Bański et al. - 2012 - The New IDS Corpus Analysis Platform Challenges a.pdf:application/pdf},
+}
commit	686c431329bb36421ac2142812cb3216df4bd105	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Fri Jun 23 15:41:44 2023 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Fri Jun 23 15:41:44 2023 +0200
tree	a0539d82a0eb5fbdb4a802f700c202fa53c76d14