blob: b09be9be6635bc6aa076cedd99babe05d50bd3f0 [file] [log] [blame]
Marc Kupietzbcde0b62023-06-14 14:22:35 +02001
% Edited volume: identified by its editor, there is no author field.
@book{greenbaum_comparing_1996,
  address   = {Oxford},
  title     = {Comparing {English} {Worldwide}: {The} {International} {Corpus} of {English}},
  publisher = {Clarendon Press},
  editor    = {Greenbaum, Sidney},
  year      = {1996},
}
9
% Single-author monograph.
@book{teich_cross-linguistic_2003,
  address   = {Berlin},
  title     = {Cross-{Linguistic} {Variation} in {System} and {Text}: {A} {Methodology} for the {Investigation} of {Translations} and {Comparable} {Texts}},
  publisher = {Mouton de Gruyter},
  author    = {Teich, Elke},
  year      = {2003},
}
17
% LREC 2016 paper. Exactly one address field is given: a repeated field is
% resolved differently (or rejected) by different BibTeX/Biber versions.
@inproceedings{diewald_korap_2016,
  address   = {Portorož / Paris},
  title     = {{KorAP} {Architecture} {Diving} in the {Deep} {Sea} of {Corpus} {Data}},
  url       = {https://aclanthology.org/L16-1569/},
  booktitle = {Proceedings of the {Tenth} {International} {Conference} on {Language} {Resources} and {Evaluation} ({LREC}'16)},
  publisher = {ELRA},
  author    = {Diewald, Nils and Hanl, Michael and Margaretha, Eliza and Bingel, Joachim and Kupietz, Marc and Bański, Piotr and Witt, Andreas},
  month     = may,
  year      = {2016},
  pages     = {3586--3591},
}
30
% LREC 2012 paper.
@inproceedings{borin_korp_2012,
  address   = {Istanbul, Turkey},
  title     = {Korp — the corpus infrastructure of {Språkbanken}},
  url       = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/248_Paper.pdf},
  booktitle = {Proceedings of the {Eighth} {International} {Conference} on {Language} {Resources} and {Evaluation} ({LREC}'12)},
  publisher = {European Language Resources Association (ELRA)},
  author    = {Borin, Lars and Forsberg, Markus and Roxendal, Johan},
  month     = may,
  year      = {2012},
  pages     = {474--478},
}
42
% LREC 2020 paper. The url uses the aclanthology.org form, matching the other
% ACL Anthology links in this file.
@inproceedings{machalek_kontext_2020,
  address   = {Marseille, France},
  title     = {{KonText}: {Advanced} and {Flexible} {Corpus} {Query} {Interface}},
  isbn      = {979-10-95546-34-4},
  url       = {https://aclanthology.org/2020.lrec-1.865/},
  language  = {English},
  booktitle = {Proceedings of the 12th {Language} {Resources} and {Evaluation} {Conference}},
  publisher = {European Language Resources Association},
  author    = {Machálek, Tomáš},
  month     = may,
  year      = {2020},
  pages     = {7003--7008},
}
56
% Workshop paper (CMLC-5+BigNLP 2017). The page range uses a plain en-dash
% (double hyphen) without surrounding spaces.
@inproceedings{kirk_ice_2017,
  title     = {From {ICE} to {ICC}: {The} new {International} {Comparable} {Corpus}},
  url       = {https://nbn-resolving.org/urn:nbn:de:bsz:mh39-62490},
  booktitle = {Proceedings of the {Workshop} on {Challenges} in the {Management} of {Large} {Corpora} and {Big} {Data} and {Natural} {Language} {Processing} ({CMLC}-5+{BigNLP}) 2017},
  publisher = {IDS},
  author    = {Kirk, John and Čermáková, Anna},
  year      = {2017},
  pages     = {7--12},
}
66
% Chapter in a proceedings volume: the book title, place of publication and
% publisher each have their own field instead of sharing one journal string.
% NOTE(review): the volume editors are not recorded here; add an editor field if known.
@incollection{kupietz_recent_2020,
  address   = {Louvain-la-Neuve},
  series    = {Corpora and {Language} in {Use}},
  title     = {Recent developments in the {European} {Reference} {Corpus} {EuReCo}},
  booktitle = {Translating and Comparing Languages: Corpus-based Insights. Selected Proceedings of the Fifth Using Corpora in Contrastive and Translation Studies Conference},
  publisher = {Presses universitaires de Louvain},
  author    = {Kupietz, Marc and Diewald, Nils and Trawiński, Beata and Cosma, Ruxandra and Cristea, Dan and Tufiş, Dan and Váradi, Tamás and Wöllstein, Angelika},
  year      = {2020},
  pages     = {257--273},
}
75
% LREC 2020 paper. The url uses the aclanthology.org form, matching the other
% ACL Anthology links in this file.
@inproceedings{nivre_universal_2020,
  address   = {Marseille, France},
  title     = {Universal {Dependencies} v2: {An} {Evergrowing} {Multilingual} {Treebank} {Collection}},
  isbn      = {979-10-95546-34-4},
  url       = {https://aclanthology.org/2020.lrec-1.497/},
  language  = {English},
  booktitle = {Proceedings of the 12th {Language} {Resources} and {Evaluation} {Conference}},
  publisher = {European Language Resources Association},
  author    = {Nivre, Joakim and de Marneffe, Marie-Catherine and Ginter, Filip and Hajič, Jan and Manning, Christopher D. and Pyysalo, Sampo and Schuster, Sebastian and Tyers, Francis and Zeman, Daniel},
  month     = may,
  year      = {2020},
  pages     = {4034--4043},
}
89
% Journal article in a special issue. The special-issue title is part of the
% journal string and is quoted with TeX quotes rather than straight double quotes.
@article{cermakova_international_2021,
  title    = {The {International} {Comparable} {Corpus}: {Challenges} in building multilingual spoken and written comparable corpora},
  volume   = {9},
  issn     = {2243-4712},
  url      = {https://nbn-resolving.org/urn:nbn:de:bsz:mh39-105084},
  doi      = {10.32714/ricl.09.01.06},
  abstract = {This paper reports on the efforts of twelve national teams in building the International Comparable Corpus (ICC; https://korpus.cz/icc) that will contain highly comparable datasets of spoken, written and electronic registers. The languages currently covered are Czech, Finnish, French, German, Irish, Italian, Norwegian, Polish, Slovak, Swedish and, more recently, Chinese, as well as English, which is considered to be the pivot language. The goal of the project is to provide much-needed data for contrastive corpus-based linguistics. The ICC corpus is committed to the idea of re-using existing multilingual resources as much as possible and the design is modelled, with various adjustments, on the International Corpus of English (ICE). As such, ICC will contain approximately the same balance of forty percent of written language and 60 percent of spoken language distributed across 27 different text types and contexts. A number of issues encountered by the project teams are discussed, ranging from copyright and data sustainability to technical advances in data distribution.},
  language = {en},
  number   = {1},
  journal  = {Research in Corpus Linguistics: Special issue ``Challenges of combining structured and unstructured data in corpus development''},
  author   = {Čermáková, Anna and Jantunen, Jarmo and Jauhiainen, Tommi and Kirk, John and Křen, Michal and Kupietz, Marc and Uí Dhonnchadha, Elaine},
  editor   = {Säily, Tanja and Tyrkkö, Jukka},
  year     = {2021},
  note     = {Place: Murcia
Publisher: Spanish Association for Corpus Linguistics},
  pages    = {89--103},
}
107
% Chapter in an edited volume. The bare DOI is given alongside the resolver URL.
% internal-note is not a standard field, so styles ignore it; it holds an
% export placeholder that should not be printed in the bibliography.
@incollection{kupietz_building_2022,
  address       = {Berlin},
  title         = {Building paths to corpus data: {A} multi-level least effort and maximum return approach},
  url           = {https://doi.org/10.1515/9783110767377-007},
  doi           = {10.1515/9783110767377-007},
  booktitle     = {{CLARIN}. {The} {Infrastructure} for {Language} {Resources}},
  publisher     = {De Gruyter},
  author        = {Kupietz, Marc and Diewald, Nils and Margaretha, Eliza},
  editor        = {Fišer, Darja and Witt, Andreas},
  year          = {2022},
  internal-note = {Section: number x},
}
119
% No year, volume or pages are recorded for this article yet (hence the key
% suffix); styles will warn about the missing year until it is added.
% NOTE(review): third author entered as "Ebeling, Signe Oksefjell" - confirm
% the name split against the publication.
@article{cermakova_be_nodate,
  title   = {‘{Be}’ verbs in a contrastive perspective: {The} case of být, be and være},
  journal = {Nordic Journal of English Studies},
  author  = {Čermáková, Anna and Ebeling, Jarle and Ebeling, Signe Oksefjell},
}
125
% Chapter in the IVG congress proceedings (German-language entry).
@incollection{kupietz_neue_2022,
  address   = {Bern},
  series    = {Jahrbuch für {Internationale} {Germanistik} - {Beihefte} - 6},
  title     = {Neue {Perspektiven} für kontrastive {Korpuslinguistik}: {Das} {Europäische} {Referenzkorpus} {EuReCo}},
  isbn      = {978-3-0343-3660-4},
  abstract  = {Dieser Beitrag beschreibt die Motivation und Ziele hinter der Initiative Europäisches Referenzkorpus EuReCo. Ausgehend von den Desiderata, die sich aufgrund der Defizite verfügbarer Forschungsdaten wie monolinguale Korpora, Parallelkorpora und Vergleichskorpora für den Sprachvergleich ergeben, werden die bisherigen und die laufenden Arbeiten im Rahmen von EuReCo präsentiert und anhand vergleichender deutsch-rumänischer Kookkurrenzanalysen neue Perspektiven für kontrastive Korpuslinguistik, die die EuReCo-Initiative öffnet, skizziert.},
  booktitle = {Wege der {Germanistik} in transkultureller {Perspektive}. {Akten} des {XIV}. {Kongresses} der {Internationalen} {Vereinigung} für {Germanistik} ({IVG}) ({Bd}. 6)},
  publisher = {Peter Lang},
  author    = {Kupietz, Marc and Trawiński, Beata},
  editor    = {Auteri, Laura and Barrale, Natascia and Di Bella, Arianna and Hoffmann, Sabine},
  year      = {2022},
  keywords  = {Kontrastive Linguistik, Korpus, Deutsch, Funktionsverbgefüge, Kookkurrenzanalyse, Korpuslinguistik, Rumänisch, Vergleichbare Korpora},
  pages     = {417--439},
}
140
% Chapter in an edited volume; the url is the resolver form of the doi.
@incollection{hardy_multi-dimensional_2015,
  address   = {London},
  title     = {Multi-{Dimensional} {Analysis} of {Academic} {Discourse}},
  isbn      = {978-1-137-43173-8},
  url       = {https://doi.org/10.1057/9781137431738_8},
  abstract  = {This chapter provides an overview of multi-dimensional (MD) analysis and important findings in this area of research. This approach to the study of language variation and discourse communities is then exemplified through a case study of an MD analysis of student writing from the Michigan Corpus of Upper-level Student Papers (MICUSP), which includes four different levels of discourse community members: final-year undergraduate students, and first-, second-, and third-year graduate students. Although variation of MICUSP has been investigated according to discipline (Hardy and Römer, 2013) and paper type (Hardy and Friginal, 2014), it has not been investigated according to writer level.},
  booktitle = {Corpora and {Discourse} {Studies}: {Integrating} {Discourse} and {Corpora}},
  publisher = {Palgrave Macmillan UK},
  author    = {Hardy, Jack A.},
  editor    = {Baker, Paul and McEnery, Tony},
  year      = {2015},
  doi       = {10.1057/9781137431738_8},
  pages     = {155--174},
}
155
% Journal article. Two ISSNs are listed, each in the hyphenated NNNN-NNNN form.
% The file field is a local attachment path and is not used by bibliography styles.
@article{biber_spoken_1986,
  title    = {Spoken and {Written} {Textual} {Dimensions} in {English}: {Resolving} the {Contradictory} {Findings}},
  volume   = {62},
  issn     = {0097-8507, 1535-0665},
  url      = {http://www.jstor.org/stable/414678},
  doi      = {10.2307/414678},
  abstract = {[Although similarities and differences between speech and writing have often been studied, contradictory claims concerning the linguistic relationship between the two modes are still common. These contradictions can arise from basing global conclusions on restricted methodologies-such as assigning undue weight to individual linguistic features, or to choice of particular text samples and text types. The present study uses a 'multi-feature/multi-dimension' approach, which includes a broad range of linguistic features and text types in a single quantitative analysis, to provide a global description of similarities and differences among spoken/written text types in English. The distribution of 41 linguistic features in 545 text samples of approximately 2000 words each is subjected to factor analysis (a multivariate statistical technique). Three underlying textual dimensions are identified: Interactive vs. Edited Text, Abstract vs. Situated Content, and Reported vs. Immediate Style. To demonstrate the value of the multi-feature/multi-dimension approach, the specific findings of earlier studies are reconciled within the model proposed here.]},
  number   = {2},
  urldate  = {2023-04-30},
  journal  = {Language},
  author   = {Biber, Douglas},
  year     = {1986},
  note     = {Publisher: Linguistic Society of America},
  pages    = {384--414},
  file     = {Spoken and Written Textual Dimensions in English\: Resolving the Contradictory Findings:/home/kupietz/Zotero/storage/938FXDXC/biber1986.pdf.pdf:application/pdf},
}
172
% CoNLL 2018 shared-task paper. The url uses the aclanthology.org form,
% matching the other ACL Anthology links in this file.
@inproceedings{straka_udpipe_2018,
  address   = {Brussels, Belgium},
  title     = {{UDPipe} 2.0 {Prototype} at {CoNLL} 2018 {UD} {Shared} {Task}},
  url       = {https://aclanthology.org/K18-2020/},
  doi       = {10.18653/v1/K18-2020},
  booktitle = {Proceedings of the {CoNLL} 2018 {Shared} {Task}: {Multilingual} {Parsing} from {Raw} {Text} to {Universal} {Dependencies}},
  publisher = {Association for Computational Linguistics},
  author    = {Straka, Milan},
  month     = oct,
  year      = {2018},
  pages     = {197--207},
}
185
% Contribution to a De Gruyter volume (German-language entry).
% The file field is a local attachment path and is not used by bibliography styles.
@inproceedings{Kupietz:Diewald:Hanl:Margaretha:2016,
  address   = {Mannheim, Germany},
  series    = {Proceedings of the {Methodenmesse} im {Rahmen} der {Jahrestagung} des {Instituts} für {Deutsche} {Sprache}},
  title     = {Möglichkeiten der {Erforschung} grammatischer {Variation} mithilfe von {KorAP}, der neuen {Korpusanalyseplattform} des {IDS}},
  copyright = {All rights reserved},
  booktitle = {Grammatische {Variation}. {Empirische} {Zugänge} und theoretische {Modellierung}},
  publisher = {De Gruyter},
  author    = {Kupietz, Marc and Diewald, Nils and Hanl, Michael and Margaretha, Eliza},
  year      = {2016},
  pages     = {319--329},
  file      = {Kupietz et al. - 2016 - Möglichkeiten der Erforschung grammatischer Variat.pdf:/home/kupietz/Zotero/storage/8K4AI4T9/Kupietz et al. - 2016 - Möglichkeiten der Erforschung grammatischer Variat.pdf:application/pdf},
}
198
% LREC 2012 paper.
% The file field is a local attachment path and is not used by bibliography styles.
@inproceedings{Banski:Fischer:Frick:Ketzan:Kupietz:Schnober:Schonefeld:Witt:2012,
  address    = {Istanbul, Turkey},
  title      = {The {New} {IDS} {Corpus} {Analysis} {Platform}: {Challenges} and {Prospects}},
  shorttitle = {The {New} {IDS} {Corpus} {Analysis} {Platform}},
  url        = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf},
  abstract   = {The present article describes the first stage of the KorAP project, launched recently at the Institut für Deutsche Sprache (IDS) in Mannheim, Germany. The aim of this project is to develop an innovative corpus analysis platform to tackle the increasing demands of modern linguistic research. The platform will facilitate new linguistic findings by making it possible to manage and analyse primary data and annotations in the petabyte range, while at the same time allowing an undistorted view of the primary linguistic data, and thus fully satisfying the demands of a scientific tool. An additional important aim of the project is to make corpus data as openly accessible as possible in light of unavoidable legal restrictions, for instance through support for distributed virtual corpora, user-defined annotations and adaptable user interfaces, as well as interfaces and sandboxes for user-supplied analysis applications. We discuss our motivation for undertaking this endeavour and the challenges that face it. Next, we outline our software implementation plan and describe development to-date.},
  urldate    = {2022-04-12},
  booktitle  = {Proceedings of the {Eighth} {International} {Conference} on {Language} {Resources} and {Evaluation} ({LREC}'12)},
  publisher  = {European Language Resources Association (ELRA)},
  author     = {Bański, Piotr and Fischer, Peter M. and Frick, Elena and Ketzan, Erik and Kupietz, Marc and Schnober, Carsten and Schonefeld, Oliver and Witt, Andreas},
  month      = may,
  year       = {2012},
  pages      = {2905--2911},
  file       = {Full Text PDF:/home/kupietz/Zotero/storage/IC9U5T6F/Bański et al. - 2012 - The New IDS Corpus Analysis Platform Challenges a.pdf:application/pdf},
}
Marc Kupietzf1dd9102023-06-26 20:36:08 +0200214
% LREC 2020 paper.
@inproceedings{kupietz_rkorapclient_2020,
  address   = {Marseille / Paris},
  title     = {{RKorAPClient}: {An} {R} {Package} for {Accessing} the {German} {Reference} {Corpus} {DeReKo} via {KorAP}},
  isbn      = {979-10-95546-34-4},
  url       = {https://aclanthology.org/2020.lrec-1.867/},
  language  = {English},
  booktitle = {Proceedings of the 12th {Language} {Resources} and {Evaluation} {Conference}},
  publisher = {ELRA},
  author    = {Kupietz, Marc and Diewald, Nils and Margaretha, Eliza},
  month     = may,
  year      = {2020},
  pages     = {7015--7021},
}
228
% Journal article in a special issue. Names are in "Last, First" form so that
% particles such as "van den" are parsed reliably.
% NOTE(review): language is set to en because title and abstract are English - confirm.
@article{MargarethaLuengen2014,
  author   = {Margaretha, Eliza and Lüngen, Harald},
  title    = {Building linguistic corpora from Wikipedia articles and discussions},
  journal  = {Journal of Language Technology and Computational Linguistics. Special issue on building and annotating corpora of computer-mediated communication. Issues and challenges at the interface between computational and corpus linguistics},
  volume   = {29},
  number   = {2},
  editor   = {Beißwenger, Michael and Storrer, Angelika and Oostdijk, Nelleke and van den Heuvel, Henk},
  url      = {https://nbn-resolving.org/urn:nbn:de:bsz:mh39-33306},
  pages    = {59--82},
  year     = {2014},
  abstract = {Wikipedia is a valuable resource, useful as a linguistic corpus or a dataset for many kinds of research. We built corpora from Wikipedia articles and talk pages in the I5 format, a TEI customisation used in the German Reference Corpus (Deutsches Referenzkorpus - DeReKo). Our approach is a two-stage conversion combining parsing using the Sweble parser, and transformation using XSLT stylesheets. The conversion approach is able to successfully generate rich and valid corpora regardless of languages. We also introduce a method to segment user contributions in talk pages into postings.},
  language = {en},
}