Add CMLC 22 paper to README.md Change-Id: I0124c951a27fd6e8a5d45bc9e327306b48958611

commit: 9cb3e3302a689159bfc731c028c6d0b6dd7b7cc8 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Wed Jan 31 19:44:24 2024 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Wed Jan 31 19:44:24 2024 +0100
tree: f07bb0754237b1306bb01900a12a5e6d2d17fd23
parent: 3b8d2eff3d4154dba3289b26a24128fa83604ae9 [diff]
diff --git a/README.md b/README.md
index eff4da1..8b0e11d 100644
--- a/README.md
+++ b/README.md

@@ -16,13 +16,13 @@
 ```
 ## Run
 
-The command to build word embeddings is exactly the same as in the original version, except that we added type 5 for setting up a purely count based collocation database.
+The command to build word embeddings is mostly the same as in the original version, except that we added `-metadata-categories <num>` for specifying the number of metadata categories and `-type 5` for setting up a purely count-based collocation database.
 
 The -type argument is an integer that defines the architecture to use. These are the possible parameters:  
 0 - cbow  
 1 - skipngram  
 2 - cwindow (see below)  
-3 - structured skipngram(see below)  
+3 - structured skipngram (see below)  
 4 - collobert's senna context window model (still experimental)  
 5 - build a collocation count database instead of word embeddings
 
@@ -58,6 +58,21 @@
 location="Denver, Colorado",  
 }
 
+@inproceedings{fankhauser_count-based_2022,
+ address = {Paris},
+ title = {Count-based and predictive language models for exploring {DeReKo}},
+ isbn = {979-10-95546-83-2},
+ url = {http://www.lrec-conf.org/proceedings/lrec2022/workshops/CMLC10/pdf/2022.cmlc10-1.5.pdf},
+ abstract = {We present the use of count-based and predictive language models for exploring language use in the German Reference Corpus DeReKo. For collocation analysis along the syntagmatic axis we employ traditional association measures based on co-occurrence counts as well as predictive association measures derived from the output weights of skipgram word embeddings. For inspecting the semantic neighbourhood of words along the paradigmatic axis we visualize the high dimensional word embeddings in two dimensions using t-stochastic neighbourhood embeddings. Together, these visualizations provide a complementary, explorative approach to analysing very large corpora in addition to corpus querying. Moreover, we discuss count-based and predictive models w.r.t. scalability and maintainability in very large corpora.},
+ booktitle = {Proceedings of the {LREC} 2022 {Workshop} on {Challenges} in the {Management} of {Large} {Corpora} ({CMLC}-10 2022). {Marseille}, 20 {June} 2022},
+ publisher = {European Language Resources Association (ELRA)},
+ author = {Fankhauser, Peter and Kupietz, Marc},
+ editor = {Bański, Piotr and Barbaresi, Adrien and Clematide, Simon and Kupietz, Marc and Lüngen, Harald},
+ year = {2022},
+ keywords = {Korpus, Deutsch, Assoziationsmaß, collocation analysis, Deutsches Referenzkorpus (DeReKo), German Reference Corpus (DeReKo), Kollokation, language models, Paradigma, Syntagma, word embeddings},
+ pages = {27--31},
+}
+
 @InProceedings{FankhauserKupietz2019,
 author    = {Peter Fankhauser and Marc Kupietz},
 title     = {Analyzing domain specific word embeddings for a large corpus of contemporary German},
commit	9cb3e3302a689159bfc731c028c6d0b6dd7b7cc8	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Wed Jan 31 19:44:24 2024 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Wed Jan 31 19:44:24 2024 +0100
tree	f07bb0754237b1306bb01900a12a5e6d2d17fd23
parent	3b8d2eff3d4154dba3289b26a24128fa83604ae9 [diff]