Fix some false positives

Change-Id: Ifaec3023569865919c524dc4cd72a4a418949711

commit: d2b9279dbf25640587934d38e948d597f74f027d [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Tue Mar 10 08:11:06 2026 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Tue Mar 10 08:11:06 2026 +0100
tree: 45b06a5f4e9858f40ca956cb83ab757a2e64ba34
parent: b2068f42635a72d2c88b4f5663927e09cf8380be [diff]
diff --git a/src/index.js b/src/index.js
index 567e80e..cbc0953 100644
--- a/src/index.js
+++ b/src/index.js

@@ -147,11 +147,12 @@
   ['et',      neoPron('et')],
   // ex/ex  (all forms = ex)
   ['ex',      neoPron('ex')],
-  // hän/sim  (NOM=hän, GEN=sir, DAT=sim, ACC=sin)
+  // hän/sim  (NOM=hän, GEN=sir, DAT=sim, ACC=sin; ACC is not in the lexicon)
+  // Note: ACC 'sin' is omitted — it frequently occurs in German texts as a Spanish
+  // loan word (e.g. 'Chili sin Carne') and would cause too many false positives.
   ['hän',     neoPron('hän')],
   ['sir',     neoPron('hän')],
   ['sim',     neoPron('hän')],
-  ['sin',     neoPron('hän')],
   // hen/hem  (NOM=hen, GEN=hens, DAT=hem, ACC=hen)
   ['hen',     neoPron('hen')],
   ['hens',    neoPron('hen')],
@@ -168,8 +169,9 @@
   ['inds',    neoPron('ind')],
   ['inde',    neoPron('ind')],
   // mensch/mensch  (NOM=mensch, GEN=menschs, DAT=mensch, ACC=mensch)
-  // Note: case-insensitive match means sentence-initial 'Mensch' (common noun)
-  // will also be tagged; acceptable in a gender-language–focused tagger.
+  // Note: 'Mensch' (uppercase) is only tagged as a neo-pronoun when the token is
+  // sentence-initial (tokenId === 1). Mid-sentence 'Mensch' is treated as the
+  // common German noun. See lookup logic in classifyToken().
   ['mensch',  neoPron('mensch')],
   ['menschs', neoPron('mensch')],
   // nin/nim  (NOM=nin, GEN=nims, DAT=nim, ACC=nin)
@@ -182,9 +184,7 @@
   ['ojm',     neoPron('oj')],
   ['ojn',     neoPron('oj')],
   // per/per  (all forms = per; GEN = pers)
-  // Note: 'per' also occurs as a German preposition (e.g. 'per E-Mail').
-  ['per',     neoPron('per')],
-  ['pers',    neoPron('per')],
+  // Excluded: 'per' is a common German preposition; 'pers' excluded together with it.
   // ser/sem  (NOM=ser, GEN=ses, DAT=sem, ACC=sen)
   ['ser',     neoPron('ser')],
   ['ses',     neoPron('ser')],
@@ -199,8 +199,8 @@
   ['zerm',    neoPron('zet')],
   ['zern',    neoPron('zet')],
   // */*  (Stern; all forms = *; GEN = *s)
-  ['*',       neoPron('*')],
-  ['*s',      neoPron('*')],
+  // Excluded: bare asterisk '*' causes too many false positives (e.g. list bullets,
+  // Genderstern markers in noun forms). Not included in the lexicon.
 ]);
 
 // ---------------------------------------------------------------------------
@@ -334,16 +334,29 @@
  * Returns an annotation object on success, or null if the token is not a
  * recognised gender-sensitive form.
  *
+ * @param {string} word     - surface form of the token
+ * @param {number} tokenId  - 1-based position of the token in its sentence
+ *
  * Annotation object shape:
  *   { lemma, upos, xpos, feats }
  */
-function classifyToken(word) {
+function classifyToken(word, tokenId) {
   let m;
 
   // ------------------------------------------------------------------
-  // 0. Neo-pronoun lexicon lookup (case-insensitive, exact form match)
+  // 0. Neo-pronoun lexicon lookup
   // ------------------------------------------------------------------
-  const entry = NEO_PRONOUN_FORMS.get(word.toLowerCase());
+  // To avoid false positives from capitalised abbreviations (EL, EM),
+  // title-case exclamations (Ey), or common nouns (Mensch mid-sentence),
+  // we only perform a case-insensitive lookup when:
+  //   a) the token is already lowercase, OR
+  //   b) it is sentence-initial (tokenId === 1), where capitalisation is
+  //      purely orthographic (e.g. 'Sier trifft xier').
+  const lc = word.toLowerCase();
+  let entry;
+  if (word === lc || tokenId === 1) {
+    entry = NEO_PRONOUN_FORMS.get(lc);
+  }
   if (entry) return entry;
 
   // ------------------------------------------------------------------
@@ -558,7 +571,8 @@
   // 0:ID  1:FORM  2:LEMMA  3:UPOS  4:XPOS  5:FEATS  6:HEAD  7:DEPREL  8:DEPS  9:MISC
 
   const word = columns[1];
-  const annotation = classifyToken(word);
+  const tokenId = parseInt(columns[0], 10);
+  const annotation = classifyToken(word, tokenId);
 
   if (annotation) {
     // Replace lemma (col 2), UPOS (col 3), XPOS (col 4), FEATS (col 5)

diff --git a/test/data/gender.conllu b/test/data/gender.conllu
index 9319d7c..e764a9e 100644
--- a/test/data/gender.conllu
+++ b/test/data/gender.conllu

@@ -138,3 +138,77 @@
 7	die	_	DET	ART	_	_	_	_	_
 8	Hilfe	_	NOUN	NN	_	_	_	_	_
 
+# --- Regression tests: false-positive prevention --------------------------
+# Tokens *, per, sin (removed from the lexicon) and mid-sentence Y, EL, EM, Ey
+# (not all-lowercase, not sentence-initial) must NOT be tagged as neo-pronouns.
+
+# foundry = base
+# filename = TEST/gender/000013/base/tokens.xml
+# text_id = GENDER_TEST.000013
+# text = Hinweis auf * und Y sowie per Einschreiben
+1	Hinweis	_	NOUN	NN	_	_	_	_	_
+2	auf	_	ADP	APPR	_	_	_	_	_
+3	*	_	PUNCT	$(	_	_	_	_	_
+4	und	_	CCONJ	KON	_	_	_	_	_
+5	Y	_	NOUN	NN	_	_	_	_	_
+6	sowie	_	CCONJ	KON	_	_	_	_	_
+7	per	_	ADP	APPR	_	_	_	_	_
+8	Einschreiben	_	NOUN	NN	_	_	_	_	_
+
+# foundry = base
+# filename = TEST/gender/000014/base/tokens.xml
+# text_id = GENDER_TEST.000014
+# text = Verweise auf EL EM Ey sin im Text
+1	Verweise	_	NOUN	NN	_	_	_	_	_
+2	auf	_	ADP	APPR	_	_	_	_	_
+3	EL	_	NOUN	NN	_	_	_	_	_
+4	EM	_	NOUN	NN	_	_	_	_	_
+5	Ey	_	ITJ	ITJ	_	_	_	_	_
+6	sin	_	NOUN	NN	_	_	_	_	_
+7	im	_	ADP	APPRART	_	_	_	_	_
+8	Text	_	NOUN	NN	_	_	_	_	_
+
+# foundry = base
+# filename = TEST/gender/000015/base/tokens.xml
+# text_id = GENDER_TEST.000015
+# text = Jeder Mensch hat Würde
+1	Jeder	_	DET	PIAT	_	_	_	_	_
+2	Mensch	_	NOUN	NN	_	_	_	_	_
+3	hat	_	AUX	VAFIN	_	_	_	_	_
+4	Würde	_	NOUN	NN	_	_	_	_	_
+
+# --- Regression tests: correct neo-pronoun recognition -------------------
+# Lowercase forms and sentence-initial capitalised neo-pronouns must still
+# be tagged; only mid-sentence uppercase ambiguous forms are suppressed.
+
+# foundry = base
+# filename = TEST/gender/000016/base/tokens.xml
+# text_id = GENDER_TEST.000016
+# text = dankte el und em für ey und y
+1	dankte	_	VERB	VVFIN	_	_	_	_	_
+2	el	_	_	_	_	_	_	_	_
+3	und	_	CCONJ	KON	_	_	_	_	_
+4	em	_	_	_	_	_	_	_	_
+5	für	_	ADP	APPR	_	_	_	_	_
+6	ey	_	_	_	_	_	_	_	_
+7	und	_	CCONJ	KON	_	_	_	_	_
+8	y	_	_	_	_	_	_	_	_
+
+# foundry = base
+# filename = TEST/gender/000017/base/tokens.xml
+# text_id = GENDER_TEST.000017
+# text = mensch fragte und Mensch antwortete
+1	mensch	_	_	_	_	_	_	_	_
+2	fragte	_	VERB	VVFIN	_	_	_	_	_
+3	und	_	CCONJ	KON	_	_	_	_	_
+4	Mensch	_	NOUN	NN	_	_	_	_	_
+5	antwortete	_	VERB	VVFIN	_	_	_	_	_
+
+# foundry = base
+# filename = TEST/gender/000018/base/tokens.xml
+# text_id = GENDER_TEST.000018
+# text = Mensch traf xier
+1	Mensch	_	_	_	_	_	_	_	_
+2	traf	_	VERB	VVFIN	_	_	_	_	_
+3	xier	_	_	_	_	_	_	_	_
+

diff --git a/test/test.js b/test/test.js
index 31a39e2..b954e06 100644
--- a/test/test.js
+++ b/test/test.js

@@ -110,7 +110,7 @@
     const command = 'node src/index.js < test/data/gender.conllu';
     const stdout = execSync(command).toString();
     const foundry_count = (stdout.match(/# foundry = gender/g) || []).length;
-    expect(foundry_count).toBe(12);
+    expect(foundry_count).toBe(18);
   });
 
   test('Full mode: non-gender tokens pass through unchanged', () => {
@@ -135,16 +135,17 @@
       const featsAnnotated  = cols[5] !== '_';
       expect(lemmaAnnotated || featsAnnotated).toBe(true);
     });
-    // Count: 18 NOUN + 5 DET + 1 PRON (sie*er) + 5 neo-pronouns (sier,xier,oj,el,hen) = 29
-    expect(tokenLines.length).toBe(29);
+    // Count: 18 NOUN + 5 DET + 1 PRON (sie*er) + 5 neo-pronouns (sier,xier,oj,el,hen)
+    //       + 7 new neo-pronouns from sentences 16–18 (el,em,ey,y,mensch,Mensch,xier) = 36
+    expect(tokenLines.length).toBe(36);
   });
 
   test('Sparse mode: sentence headers are emitted for sentences with matches', () => {
     const command = 'node src/index.js -s < test/data/gender.conllu';
     const stdout = execSync(command).toString();
-    // All 12 test sentences have at least one gender form
+    // 12 original + 3 new sentences (16–18) have at least one gender form
     const text_id_count = (stdout.match(/# text_id = /g) || []).length;
-    expect(text_id_count).toBe(12);
+    expect(text_id_count).toBe(15);
   });
 
   test('Inline input: basic Genderstern annotation', () => {
@@ -185,4 +186,112 @@
     // jede (without gender marker): unchanged
     expect(stdout).toContain('jede\t_\tDET\tPIAT\tGender=Fem|Number=Sing');
   });
+
+  // ---------------------------------------------------------------------------
+  // Regression tests: false-positive tokens that must NOT be tagged
+  // ---------------------------------------------------------------------------
+
+  test('No false positives: *, Y, per, EL, EM, Ey, sin mid-sentence pass through unchanged', () => {
+    // Each of these appeared as spurious neo-pronoun matches in the original code.
+    // They must not receive a neo-pronoun annotation.
+    const testInput = `# foundry = base
+# text_id = fp-001
+# text = Hinweis auf * und Y sowie per Einschreiben
+1\tHinweis\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+2\tauf\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+3\t*\t_\tPUNCT\t$(\t_\t_\t_\t_\t_
+4\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+5\tY\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+6\tsowie\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+7\tper\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+8\tEinschreiben\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+
+# foundry = base
+# text_id = fp-002
+# text = Verweise auf EL EM Ey sin im Text
+1\tVerweise\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+2\tauf\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+3\tEL\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+4\tEM\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+5\tEy\t_\tITJ\tITJ\t_\t_\t_\t_\t_
+6\tsin\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+7\tim\t_\tADP\tAPPRART\t_\t_\t_\t_\t_
+8\tText\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+
+`;
+    const stdout = execSync('node src/index.js', { input: testInput }).toString();
+    // None of the false-positive tokens should receive a neo-pronoun annotation.
+    // A token passes through unchanged when its lemma column stays '_' and
+    // its upos/xpos/feats are not overwritten to PRON/PPER.
+    expect(stdout).toContain('3\t*\t_\tPUNCT');
+    expect(stdout).toContain('5\tY\t_\tNOUN');
+    expect(stdout).toContain('7\tper\t_\tADP');
+    expect(stdout).toContain('3\tEL\t_\tNOUN');
+    expect(stdout).toContain('4\tEM\t_\tNOUN');
+    expect(stdout).toContain('5\tEy\t_\tITJ');
+    expect(stdout).toContain('6\tsin\t_\tNOUN');
+  });
+
+  test('No false positive: Mensch mid-sentence must not be tagged as neo-pronoun', () => {
+    const testInput = `# foundry = base
+# text_id = fp-003
+# text = Jeder Mensch hat Würde
+1\tJeder\t_\tDET\tPIAT\t_\t_\t_\t_\t_
+2\tMensch\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+3\that\t_\tAUX\tVAFIN\t_\t_\t_\t_\t_
+4\tWürde\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+
+`;
+    const stdout = execSync('node src/index.js', { input: testInput }).toString();
+    // 'Mensch' at position 2 (not sentence-initial) must not be tagged.
+    expect(stdout).toContain('2\tMensch\t_\tNOUN\tNN\t_');
+  });
+
+  test('Neo-pronoun: lowercase el, em, ey, y mid-sentence are still tagged', () => {
+    const testInput = `# foundry = base
+# text_id = neo-lc-001
+# text = dankte el und em für ey und y
+1\tdankte\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+2\tel\t_\t_\t_\t_\t_\t_\t_\t_
+3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+4\tem\t_\t_\t_\t_\t_\t_\t_\t_
+5\tfür\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+6\tey\t_\t_\t_\t_\t_\t_\t_\t_
+7\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+8\ty\t_\t_\t_\t_\t_\t_\t_\t_
+
+`;
+    const stdout = execSync('node src/index.js', { input: testInput }).toString();
+    expect(stdout).toContain('el\tel\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
+    expect(stdout).toContain('em\tem\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
+    expect(stdout).toContain('ey\tey\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
+    expect(stdout).toContain('y\tY\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
+  });
+
+  test('Neo-pronoun: mensch lowercase and sentence-initial Mensch are tagged', () => {
+    const testInput = `# foundry = base
+# text_id = neo-mensch-001
+# text = mensch fragte und Mensch antwortete
+1\tmensch\t_\t_\t_\t_\t_\t_\t_\t_
+2\tfragte\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+4\tMensch\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+5\tantwortete\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+
+# foundry = base
+# text_id = neo-mensch-002
+# text = Mensch traf xier
+1\tMensch\t_\t_\t_\t_\t_\t_\t_\t_
+2\ttraf\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+3\txier\t_\t_\t_\t_\t_\t_\t_\t_
+
+`;
+    const stdout = execSync('node src/index.js', { input: testInput }).toString();
+    // lowercase 'mensch' → neo-pronoun
+    expect(stdout).toContain('1\tmensch\tmensch\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
+    // 'Mensch' mid-sentence (position 4) → unchanged common noun
+    expect(stdout).toContain('4\tMensch\t_\tNOUN\tNN\t_');
+    // sentence-initial 'Mensch' (position 1) → neo-pronoun
+    expect(stdout).toContain('1\tMensch\tmensch\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
+  });
 });
commit	d2b9279dbf25640587934d38e948d597f74f027d	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Tue Mar 10 08:11:06 2026 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Tue Mar 10 08:11:06 2026 +0100
tree	45b06a5f4e9858f40ca956cb83ab757a2e64ba34
parent	b2068f42635a72d2c88b4f5663927e09cf8380be [diff]