blob: cae2002b28ccf7f6f6392bdfe2d3d35ca513faec [file] [log] [blame]
Marc Kupietzbcde0b62023-06-14 14:22:35 +02001
@book{greenbaum_comparing_1996,
  address = {Oxford},
  title = {Comparing {English} {Worldwide}: {The} {International} {Corpus} of {English}},
  publisher = {Clarendon Press},
  editor = {Greenbaum, Sidney},
  year = {1996},
}
9
@book{teich_cross-linguistic_2003,
  address = {Berlin},
  title = {Cross-{Linguistic} {Variation} in {System} and {Text}: {A} {Methodology} for the {Investigation} of {Translations} and {Comparable} {Texts}},
  publisher = {Mouton de Gruyter},
  author = {Teich, Elke},
  year = {2003},
}
17
@inproceedings{diewald_korap_2016,
  address = {Portorož, Slovenia},
  title = {{KorAP} {Architecture} -- {Diving} in the {Deep} {Sea} of {Corpus} {Data}},
  url = {https://www.aclweb.org/anthology/L16-1569},
  booktitle = {Proceedings of the {Tenth} {International} {Conference} on {Language} {Resources} and {Evaluation} ({LREC}'16)},
  publisher = {European Language Resources Association (ELRA)},
  author = {Diewald, Nils and Hanl, Michael and Margaretha, Eliza and Bingel, Joachim and Kupietz, Marc and Bański, Piotr and Witt, Andreas},
  month = may,
  year = {2016},
  pages = {3586--3591},
}
29
@inproceedings{borin_korp_2012,
  address = {Istanbul, Turkey},
  title = {Korp — the corpus infrastructure of {Språkbanken}},
  url = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/248_Paper.pdf},
  booktitle = {Proceedings of the {Eighth} {International} {Conference} on {Language} {Resources} and {Evaluation} ({LREC}'12)},
  publisher = {European Language Resources Association (ELRA)},
  author = {Borin, Lars and Forsberg, Markus and Roxendal, Johan},
  month = may,
  year = {2012},
  pages = {474--478},
}
41
@inproceedings{machalek_kontext_2020,
  address = {Marseille, France},
  title = {{KonText}: {Advanced} and {Flexible} {Corpus} {Query} {Interface}},
  isbn = {979-10-95546-34-4},
  url = {https://www.aclweb.org/anthology/2020.lrec-1.865},
  language = {English},
  booktitle = {Proceedings of the 12th {Language} {Resources} and {Evaluation} {Conference}},
  publisher = {European Language Resources Association},
  author = {Machálek, Tomáš},
  month = may,
  year = {2020},
  pages = {7003--7008},
}
55
@inproceedings{kirk_ice_2017,
  title = {From {ICE} to {ICC}: {The} new {International} {Comparable} {Corpus}},
  url = {https://nbn-resolving.org/urn:nbn:de:bsz:mh39-62490},
  booktitle = {Proceedings of the {Workshop} on {Challenges} in the {Management} of {Large} {Corpora} and {Big} {Data} and {Natural} {Language} {Processing} ({CMLC}-5+{BigNLP}) 2017},
  publisher = {IDS},
  author = {Kirk, John and Čermáková, Anna},
  year = {2017},
  pages = {7--12},
}
65
% Chapter in a proceedings volume: the volume title, place and publisher are
% stored in booktitle/address/publisher rather than packed into a journal field.
% NOTE(review): the volume editors are not recorded here; add an editor field once confirmed.
@incollection{kupietz_recent_2020,
  address = {Louvain-la-Neuve},
  series = {Corpora and {Language} in {Use}},
  title = {Recent developments in the {European} {Reference} {Corpus} {EuReCo}},
  booktitle = {Translating and Comparing Languages: Corpus-based Insights. Selected Proceedings of the Fifth Using Corpora in Contrastive and Translation Studies Conference},
  publisher = {Presses universitaires de Louvain},
  author = {Kupietz, Marc and Diewald, Nils and Trawiński, Beata and Cosma, Ruxandra and Cristea, Dan and Tufiş, Dan and Váradi, Tamás and Wöllstein, Angelika},
  year = {2020},
  pages = {257--273},
}
74
@inproceedings{nivre_universal_2020,
  address = {Marseille, France},
  title = {Universal {Dependencies} v2: {An} {Evergrowing} {Multilingual} {Treebank} {Collection}},
  isbn = {979-10-95546-34-4},
  url = {https://www.aclweb.org/anthology/2020.lrec-1.497},
  language = {English},
  booktitle = {Proceedings of the 12th {Language} {Resources} and {Evaluation} {Conference}},
  publisher = {European Language Resources Association},
  author = {Nivre, Joakim and de Marneffe, Marie-Catherine and Ginter, Filip and Hajič, Jan and Manning, Christopher D. and Pyysalo, Sampo and Schuster, Sebastian and Tyers, Francis and Zeman, Daniel},
  month = may,
  year = {2020},
  pages = {4034--4043},
}
88
@article{cermakova_international_2021,
  title = {The {International} {Comparable} {Corpus}: {Challenges} in building multilingual spoken and written comparable corpora},
  volume = {9},
  issn = {2243-4712},
  url = {https://nbn-resolving.org/urn:nbn:de:bsz:mh39-105084},
  doi = {10.32714/ricl.09.01.06},
  abstract = {This paper reports on the efforts of twelve national teams in building the International Comparable Corpus (ICC; https://korpus.cz/icc) that will contain highly comparable datasets of spoken, written and electronic registers. The languages currently covered are Czech, Finnish, French, German, Irish, Italian, Norwegian, Polish, Slovak, Swedish and, more recently, Chinese, as well as English, which is considered to be the pivot language. The goal of the project is to provide much-needed data for contrastive corpus-based linguistics. The ICC corpus is committed to the idea of re-using existing multilingual resources as much as possible and the design is modelled, with various adjustments, on the International Corpus of English (ICE). As such, ICC will contain approximately the same balance of forty percent of written language and 60 percent of spoken language distributed across 27 different text types and contexts. A number of issues encountered by the project teams are discussed, ranging from copyright and data sustainability to technical advances in data distribution.},
  language = {en},
  number = {1},
  journal = {Research in Corpus Linguistics: Special issue "Challenges of combining structured and unstructured data in corpus development"},
  author = {Čermáková, Anna and Jantunen, Jarmo and Jauhiainen, Tommi and Kirk, John and Křen, Michal and Kupietz, Marc and Uí Dhonnchadha, Elaine},
  editor = {Säily, Tanja and Tyrkkö, Jukka},
  year = {2021},
  note = {Place: Murcia
Publisher: Spanish Association for Corpus Linguistics},
  pages = {89--103},
}
106
% The former placeholder note ("Section: number x") is kept in a non-printing
% field so that it no longer appears in the typeset bibliography.
@incollection{kupietz_building_2022,
  address = {Berlin},
  title = {Building paths to corpus data: {A} multi-level least effort and maximum return approach},
  url = {https://doi.org/10.1515/9783110767377-007},
  doi = {10.1515/9783110767377-007},
  booktitle = {{CLARIN}. {The} {Infrastructure} for {Language} {Resources}},
  publisher = {De Gruyter},
  author = {Kupietz, Marc and Diewald, Nils and Margaretha, Eliza},
  editor = {Fišer, Darja and Witt, Andreas},
  year = {2022},
  internal-note = {Section: number x},
}
118
% NOTE(review): year, volume, number and pages are missing from this entry;
% complete them once the publication details are known.
% NOTE(review): the third author's surname is recorded as "Ebeling Oksefjell" - verify the name order.
@article{cermakova_be_nodate,
  title = {‘{Be}’ verbs in a contrastive perspective: {The} case of být, be and være},
  journal = {Nordic Journal of English Studies},
  author = {Čermáková, Anna and Ebeling, Jarle and Ebeling Oksefjell, Signe},
}
124
@incollection{kupietz_neue_2022,
  address = {Bern},
  series = {Jahrbuch für {Internationale} {Germanistik} - {Beihefte} - 6},
  title = {Neue {Perspektiven} für kontrastive {Korpuslinguistik}: {Das} {Europäische} {Referenzkorpus} {EuReCo}},
  isbn = {978-3-0343-3660-4},
  abstract = {Dieser Beitrag beschreibt die Motivation und Ziele hinter der Initiative Europäisches Referenzkorpus EuReCo. Ausgehend von den Desiderata, die sich aufgrund der Defizite verfügbarer Forschungsdaten wie monolinguale Korpora, Parallelkorpora und Vergleichskorpora für den Sprachvergleich ergeben, werden die bisherigen und die laufenden Arbeiten im Rahmen von EuReCo präsentiert und anhand vergleichender deutsch-rumänischer Kookkurrenzanalysen neue Perspektiven für kontrastive Korpuslinguistik, die die EuReCo-Initiative öffnet, skizziert.},
  booktitle = {Wege der {Germanistik} in transkultureller {Perspektive}. {Akten} des {XIV}. {Kongresses} der {Internationalen} {Vereinigung} für {Germanistik} ({IVG}) ({Bd}. 6)},
  publisher = {Peter Lang},
  author = {Kupietz, Marc and Trawiński, Beata},
  editor = {Auteri, Laura and Barrale, Natascia and Di Bella, Arianna and Hoffmann, Sabine},
  year = {2022},
  keywords = {Kontrastive Linguistik, Korpus, Deutsch, Funktionsverbgefüge, Kookkurrenzanalyse, Korpuslinguistik, Rumänisch, Vergleichbare Korpora},
  pages = {417--439},
}
139
@incollection{hardy_multi-dimensional_2015,
  address = {London},
  title = {Multi-{Dimensional} {Analysis} of {Academic} {Discourse}},
  isbn = {978-1-137-43173-8},
  url = {https://doi.org/10.1057/9781137431738_8},
  abstract = {This chapter provides an overview of multi-dimensional (MD) analysis and important findings in this area of research. This approach to the study of language variation and discourse communities is then exemplified through a case study of an MD analysis of student writing from the Michigan Corpus of Upper-level Student Papers (MICUSP), which includes four different levels of discourse community members: final-year undergraduate students, and first-, second-, and third-year graduate students. Although variation of MICUSP has been investigated according to discipline (Hardy and Römer, 2013) and paper type (Hardy and Friginal, 2014), it has not been investigated according to writer level.},
  booktitle = {Corpora and {Discourse} {Studies}: {Integrating} {Discourse} and {Corpora}},
  publisher = {Palgrave Macmillan UK},
  author = {Hardy, Jack A.},
  editor = {Baker, Paul and McEnery, Tony},
  year = {2015},
  doi = {10.1057/9781137431738_8},
  pages = {155--174},
}
154
@article{biber_spoken_1986,
  title = {Spoken and {Written} {Textual} {Dimensions} in {English}: {Resolving} the {Contradictory} {Findings}},
  volume = {62},
  issn = {0097-8507, 1535-0665},
  url = {http://www.jstor.org/stable/414678},
  doi = {10.2307/414678},
  abstract = {[Although similarities and differences between speech and writing have often been studied, contradictory claims concerning the linguistic relationship between the two modes are still common. These contradictions can arise from basing global conclusions on restricted methodologies-such as assigning undue weight to individual linguistic features, or to choice of particular text samples and text types. The present study uses a 'multi-feature/multi-dimension' approach, which includes a broad range of linguistic features and text types in a single quantitative analysis, to provide a global description of similarities and differences among spoken/written text types in English. The distribution of 41 linguistic features in 545 text samples of approximately 2000 words each is subjected to factor analysis (a multivariate statistical technique). Three underlying textual dimensions are identified: Interactive vs. Edited Text, Abstract vs. Situated Content, and Reported vs. Immediate Style. To demonstrate the value of the multi-feature/multi-dimension approach, the specific findings of earlier studies are reconciled within the model proposed here.]},
  number = {2},
  urldate = {2023-04-30},
  journal = {Language},
  author = {Biber, Douglas},
  year = {1986},
  note = {Publisher: Linguistic Society of America},
  pages = {384--414},
  file = {Spoken and Written Textual Dimensions in English\: Resolving the Contradictory Findings:/home/kupietz/Zotero/storage/938FXDXC/biber1986.pdf.pdf:application/pdf},
}
171
@inproceedings{straka_udpipe_2018,
  address = {Brussels, Belgium},
  title = {{UDPipe} 2.0 {Prototype} at {CoNLL} 2018 {UD} {Shared} {Task}},
  url = {https://www.aclweb.org/anthology/K18-2020},
  doi = {10.18653/v1/K18-2020},
  booktitle = {Proceedings of the {CoNLL} 2018 {Shared} {Task}: {Multilingual} {Parsing} from {Raw} {Text} to {Universal} {Dependencies}},
  publisher = {Association for Computational Linguistics},
  author = {Straka, Milan},
  month = oct,
  year = {2018},
  pages = {197--207},
}
184
@inproceedings{Kupietz:Diewald:Hanl:Margaretha:2016,
  address = {Mannheim, Germany},
  series = {Proceedings of the {Methodenmesse} im {Rahmen} der {Jahrestagung} des {Instituts} für {Deutsche} {Sprache}},
  title = {Möglichkeiten der {Erforschung} grammatischer {Variation} mithilfe von {KorAP}, der neuen {Korpusanalyseplattform} des {IDS}},
  copyright = {All rights reserved},
  booktitle = {Grammatische {Variation}. {Empirische} {Zugänge} und theoretische {Modellierung}},
  publisher = {De Gruyter},
  author = {Kupietz, Marc and Diewald, Nils and Hanl, Michael and Margaretha, Eliza},
  year = {2016},
  pages = {319--329},
  file = {Kupietz et al. - 2016 - Möglichkeiten der Erforschung grammatischer Variat.pdf:/home/kupietz/Zotero/storage/8K4AI4T9/Kupietz et al. - 2016 - Möglichkeiten der Erforschung grammatischer Variat.pdf:application/pdf},
}
197
@inproceedings{Banski:Fischer:Frick:Ketzan:Kupietz:Schnober:Schonefeld:Witt:2012,
  address = {Istanbul, Turkey},
  title = {The {New} {IDS} {Corpus} {Analysis} {Platform}: {Challenges} and {Prospects}},
  shorttitle = {The {New} {IDS} {Corpus} {Analysis} {Platform}},
  url = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf},
  abstract = {The present article describes the first stage of the KorAP project, launched recently at the Institut für Deutsche Sprache (IDS) in Mannheim, Germany. The aim of this project is to develop an innovative corpus analysis platform to tackle the increasing demands of modern linguistic research. The platform will facilitate new linguistic findings by making it possible to manage and analyse primary data and annotations in the petabyte range, while at the same time allowing an undistorted view of the primary linguistic data, and thus fully satisfying the demands of a scientific tool. An additional important aim of the project is to make corpus data as openly accessible as possible in light of unavoidable legal restrictions, for instance through support for distributed virtual corpora, user-defined annotations and adaptable user interfaces, as well as interfaces and sandboxes for user-supplied analysis applications. We discuss our motivation for undertaking this endeavour and the challenges that face it. Next, we outline our software implementation plan and describe development to-date.},
  urldate = {2022-04-12},
  booktitle = {Proceedings of the {Eighth} {International} {Conference} on {Language} {Resources} and {Evaluation} ({LREC}'12)},
  publisher = {European Language Resources Association (ELRA)},
  author = {Bański, Piotr and Fischer, Peter M. and Frick, Elena and Ketzan, Erik and Kupietz, Marc and Schnober, Carsten and Schonefeld, Oliver and Witt, Andreas},
  month = may,
  year = {2012},
  pages = {2905--2911},
  file = {Full Text PDF:/home/kupietz/Zotero/storage/IC9U5T6F/Bański et al. - 2012 - The New IDS Corpus Analysis Platform Challenges a.pdf:application/pdf},
}
Marc Kupietzf1dd9102023-06-26 20:36:08 +0200213
@inproceedings{kupietz_rkorapclient_2020,
  address = {Marseille, France},
  title = {{RKorAPClient}: {An} {R} {Package} for {Accessing} the {German} {Reference} {Corpus} {DeReKo} via {KorAP}},
  isbn = {979-10-95546-34-4},
  url = {https://www.aclweb.org/anthology/2020.lrec-1.867},
  language = {English},
  booktitle = {Proceedings of the 12th {Language} {Resources} and {Evaluation} {Conference}},
  publisher = {European Language Resources Association},
  author = {Kupietz, Marc and Diewald, Nils and Margaretha, Eliza},
  month = may,
  year = {2020},
  pages = {7015--7021},
}
227
@article{MargarethaLuengen2014,
  author = {Margaretha, Eliza and Lüngen, Harald},
  title = {Building linguistic corpora from {Wikipedia} articles and discussions},
  journal = {Journal of Language Technology and Computational Linguistics. Special issue on building and annotating corpora of computer-mediated communication. Issues and challenges at the interface between computational and corpus linguistics},
  volume = {29},
  number = {2},
  editor = {Beißwenger, Michael and Storrer, Angelika and Oostdijk, Nelleke and van den Heuvel, Henk},
  url = {https://nbn-resolving.org/urn:nbn:de:bsz:mh39-33306},
  pages = {59--82},
  year = {2014},
  abstract = {Wikipedia is a valuable resource, useful as a lingustic corpus or a dataset for many kinds of research. We built corpora from Wikipedia articles and talk pages in the I5 format, a TEI customisation used in the German Reference Corpus (Deutsches Referenzkorpus - DeReKo). Our approach is a two-stage conversion combining parsing using the Sweble parser, and transformation using XSLT stylesheets. The conversion approach is able to successfully generate rich and valid corpora regardless of languages. We also introduce a method to segment user contributions in talk pages into postings.},
  language = {en},
}