Add CMLC 22 paper to README.md
Change-Id: I0124c951a27fd6e8a5d45bc9e327306b48958611
diff --git a/README.md b/README.md
index eff4da1..8b0e11d 100644
--- a/README.md
+++ b/README.md
@@ -16,13 +16,13 @@
```
## Run
-The command to build word embeddings is exactly the same as in the original version, except that we added type 5 for setting up a purely count based collocation database.
+The command to build word embeddings is mostly the same as in the original version, except that we added `-metadata-categories <num>` for specifying the number of metadata categories and `-type 5` for setting up a purely count-based collocation database.
The -type argument is a integer that defines the architecture to use. These are the possible parameters:
0 - cbow
1 - skipngram
2 - cwindow (see below)
-3 - structured skipngram(see below)
+3 - structured skipngram (see below)
4 - collobert's senna context window model (still experimental)
5 - build a collocation count database instead of word embeddings
@@ -58,6 +58,21 @@
location="Denver, Colorado",
}
+@inproceedings{fankhauser_count-based_2022,
+ address = {Paris},
+ title = {Count-based and predictive language models for exploring {DeReKo}},
+ isbn = {979-10-95546-83-2},
+ url = {http://www.lrec-conf.org/proceedings/lrec2022/workshops/CMLC10/pdf/2022.cmlc10-1.5.pdf},
+ abstract = {We present the use of count-based and predictive language models for exploring language use in the German Reference Corpus DeReKo. For collocation analysis along the syntagmatic axis we employ traditional association measures based on co-occurrence counts as well as predictive association measures derived from the output weights of skipgram word embeddings. For inspecting the semantic neighbourhood of words along the paradigmatic axis we visualize the high dimensional word embeddings in two dimensions using t-stochastic neighbourhood embeddings. Together, these visualizations provide a complementary, explorative approach to analysing very large corpora in addition to corpus querying. Moreover, we discuss count-based and predictive models w.r.t. scalability and maintainability in very large corpora.},
+ booktitle = {Proceedings of the {LREC} 2022 {Workshop} on {Challenges} in the {Management} of {Large} {Corpora} ({CMLC}-10 2022). {Marseille}, 20 {June} 2022},
+ publisher = {European Language Resources Association (ELRA)},
+ author = {Fankhauser, Peter and Kupietz, Marc},
+ editor = {Bański, Piotr and Barbaresi, Adrien and Clematide, Simon and Kupietz, Marc and Lüngen, Harald},
+ year = {2022},
+ keywords = {Korpus, Deutsch, Assoziationsmaß, collocation analysis, Deutsches Referenzkorpus (DeReKo), German Reference Corpus (DeReKo), Kollokation, language models, Paradigma, Syntagma, word embeddings},
+ pages = {27--31},
+}
+
@InProceedings{FankhauserKupietz2019,
author = {Peter Fankhauser and Marc Kupietz},
title = {Analyzing domain specific word embeddings for a large corpus of contemporary German},