Fix some false positives

Change-Id: Ifaec3023569865919c524dc4cd72a4a418949711

commit: d2b9279dbf25640587934d38e948d597f74f027d [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Tue Mar 10 08:11:06 2026 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Tue Mar 10 08:11:06 2026 +0100
tree: 45b06a5f4e9858f40ca956cb83ab757a2e64ba34
parent: b2068f42635a72d2c88b4f5663927e09cf8380be [diff]
diff --git a/src/index.js b/src/index.js
index 567e80e..cbc0953 100644
--- a/src/index.js
+++ b/src/index.js

@@ -147,11 +147,12 @@
   ['et',      neoPron('et')],
   // ex/ex  (all forms = ex)
   ['ex',      neoPron('ex')],
-  // hän/sim  (NOM=hän, GEN=sir, DAT=sim, ACC=sin)
+  // hän/sim  (NOM=hän, GEN=sir, DAT=sim, ACC=sin — ACC form not listed, see note)
+  // Note: ACC 'sin' is omitted — it frequently occurs in German texts as a Spanish
+  // loan word (e.g. 'Chili sin Carne') and would cause too many false positives.
   ['hän',     neoPron('hän')],
   ['sir',     neoPron('hän')],
   ['sim',     neoPron('hän')],
-  ['sin',     neoPron('hän')],
   // hen/hem  (NOM=hen, GEN=hens, DAT=hem, ACC=hen)
   ['hen',     neoPron('hen')],
   ['hens',    neoPron('hen')],
@@ -168,8 +169,9 @@
   ['inds',    neoPron('ind')],
   ['inde',    neoPron('ind')],
   // mensch/mensch  (NOM=mensch, GEN=menschs, DAT=mensch, ACC=mensch)
-  // Note: case-insensitive match means sentence-initial 'Mensch' (common noun)
-  // will also be tagged; acceptable in a gender-language–focused tagger.
+  // Note: capitalised 'Mensch' is only tagged as a neo-pronoun when the token is
+  // sentence-initial (tokenId === 1). Mid-sentence 'Mensch' is treated as the
+  // common German noun. See lookup logic in classifyToken().
   ['mensch',  neoPron('mensch')],
   ['menschs', neoPron('mensch')],
   // nin/nim  (NOM=nin, GEN=nims, DAT=nim, ACC=nin)
@@ -182,9 +184,7 @@
   ['ojm',     neoPron('oj')],
   ['ojn',     neoPron('oj')],
   // per/per  (all forms = per; GEN = pers)
-  // Note: 'per' also occurs as a German preposition (e.g. 'per E-Mail').
-  ['per',     neoPron('per')],
-  ['pers',    neoPron('per')],
+  // Excluded: 'per' is a common German preposition; 'pers' excluded together with it.
   // ser/sem  (NOM=ser, GEN=ses, DAT=sem, ACC=sen)
   ['ser',     neoPron('ser')],
   ['ses',     neoPron('ser')],
@@ -199,8 +199,8 @@
   ['zerm',    neoPron('zet')],
   ['zern',    neoPron('zet')],
   // */*  (Stern; all forms = *; GEN = *s)
-  ['*',       neoPron('*')],
-  ['*s',      neoPron('*')],
+  // Excluded: bare asterisk '*' causes too many false positives (e.g. list bullets,
+  // Genderstern markers in noun forms). Not included in the lexicon.
 ]);
 
 // ---------------------------------------------------------------------------
@@ -334,16 +334,29 @@
  * Returns an annotation object on success, or null if the token is not a
  * recognised gender-sensitive form.
  *
+ * @param {string} word     - surface form of the token
+ * @param {number} tokenId  - 1-based position of the token in its sentence
+ *
  * Annotation object shape:
  *   { lemma, upos, xpos, feats }
  */
-function classifyToken(word) {
+function classifyToken(word, tokenId) {
   let m;
 
   // ------------------------------------------------------------------
-  // 0. Neo-pronoun lexicon lookup (case-insensitive, exact form match)
+  // 0. Neo-pronoun lexicon lookup
   // ------------------------------------------------------------------
-  const entry = NEO_PRONOUN_FORMS.get(word.toLowerCase());
+  // To avoid false positives from capitalised abbreviations (EL, EM),
+  // title-case exclamations (Ey), or common nouns (Mensch mid-sentence),
+  // the lexicon is consulted (via the lowercased form) only when:
+  //   a) the token is already all-lowercase, OR
+  //   b) it is sentence-initial (tokenId === 1), where capitalisation is
+  //      purely orthographic (e.g. 'Sier trifft xier').
+  const lc = word.toLowerCase();
+  let entry;
+  if (word === lc || tokenId === 1) {
+    entry = NEO_PRONOUN_FORMS.get(lc);
+  }
   if (entry) return entry;
 
   // ------------------------------------------------------------------
@@ -558,7 +571,8 @@
   // 0:ID  1:FORM  2:LEMMA  3:UPOS  4:XPOS  5:FEATS  6:HEAD  7:DEPREL  8:DEPS  9:MISC
 
   const word = columns[1];
-  const annotation = classifyToken(word);
+  const tokenId = parseInt(columns[0], 10);
+  const annotation = classifyToken(word, tokenId);
 
   if (annotation) {
     // Replace lemma (col 2), UPOS (col 3), XPOS (col 4), FEATS (col 5)
commit	d2b9279dbf25640587934d38e948d597f74f027d	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Tue Mar 10 08:11:06 2026 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Tue Mar 10 08:11:06 2026 +0100
tree	45b06a5f4e9858f40ca956cb83ab757a2e64ba34
parent	b2068f42635a72d2c88b4f5663927e09cf8380be [diff]