Fix neo-pronoun false positives for ambiguous tokens

Change-Id: Ifaec3023569865919c524dc4cd72a4a418949711
diff --git a/test/data/gender.conllu b/test/data/gender.conllu
index 9319d7c..e764a9e 100644
--- a/test/data/gender.conllu
+++ b/test/data/gender.conllu
@@ -138,3 +138,77 @@
 7	die	_	DET	ART	_	_	_	_	_
 8	Hilfe	_	NOUN	NN	_	_	_	_	_
 
+# --- Regression tests: false-positive prevention --------------------------
+# Tokens *, Y, per, EL, EM, Ey, sin and mid-sentence Mensch must NOT be tagged
+# as neo-pronouns: they are mid-sentence uppercase forms or known false positives.
+
+# foundry = base
+# filename = TEST/gender/000013/base/tokens.xml
+# text_id = GENDER_TEST.000013
+# text = Hinweis auf * und Y sowie per Einschreiben
+1	Hinweis	_	NOUN	NN	_	_	_	_	_
+2	auf	_	ADP	APPR	_	_	_	_	_
+3	*	_	PUNCT	$(	_	_	_	_	_
+4	und	_	CCONJ	KON	_	_	_	_	_
+5	Y	_	NOUN	NN	_	_	_	_	_
+6	sowie	_	CCONJ	KON	_	_	_	_	_
+7	per	_	ADP	APPR	_	_	_	_	_
+8	Einschreiben	_	NOUN	NN	_	_	_	_	_
+
+# foundry = base
+# filename = TEST/gender/000014/base/tokens.xml
+# text_id = GENDER_TEST.000014
+# text = Verweise auf EL EM Ey sin im Text
+1	Verweise	_	NOUN	NN	_	_	_	_	_
+2	auf	_	ADP	APPR	_	_	_	_	_
+3	EL	_	NOUN	NN	_	_	_	_	_
+4	EM	_	NOUN	NN	_	_	_	_	_
+5	Ey	_	ITJ	ITJ	_	_	_	_	_
+6	sin	_	NOUN	NN	_	_	_	_	_
+7	im	_	ADP	APPRART	_	_	_	_	_
+8	Text	_	NOUN	NN	_	_	_	_	_
+
+# foundry = base
+# filename = TEST/gender/000015/base/tokens.xml
+# text_id = GENDER_TEST.000015
+# text = Jeder Mensch hat Würde
+1	Jeder	_	DET	PIAT	_	_	_	_	_
+2	Mensch	_	NOUN	NN	_	_	_	_	_
+3	hat	_	AUX	VAFIN	_	_	_	_	_
+4	Würde	_	NOUN	NN	_	_	_	_	_
+
+# --- Regression tests: correct neo-pronoun recognition -------------------
+# Lowercase forms and sentence-initial capitalised neo-pronouns must still
+# be tagged; only mid-sentence uppercase ambiguous forms are suppressed.
+
+# foundry = base
+# filename = TEST/gender/000016/base/tokens.xml
+# text_id = GENDER_TEST.000016
+# text = dankte el und em für ey und y
+1	dankte	_	VERB	VVFIN	_	_	_	_	_
+2	el	_	_	_	_	_	_	_	_
+3	und	_	CCONJ	KON	_	_	_	_	_
+4	em	_	_	_	_	_	_	_	_
+5	für	_	ADP	APPR	_	_	_	_	_
+6	ey	_	_	_	_	_	_	_	_
+7	und	_	CCONJ	KON	_	_	_	_	_
+8	y	_	_	_	_	_	_	_	_
+
+# foundry = base
+# filename = TEST/gender/000017/base/tokens.xml
+# text_id = GENDER_TEST.000017
+# text = mensch fragte und Mensch antwortete
+1	mensch	_	_	_	_	_	_	_	_
+2	fragte	_	VERB	VVFIN	_	_	_	_	_
+3	und	_	CCONJ	KON	_	_	_	_	_
+4	Mensch	_	NOUN	NN	_	_	_	_	_
+5	antwortete	_	VERB	VVFIN	_	_	_	_	_
+
+# foundry = base
+# filename = TEST/gender/000018/base/tokens.xml
+# text_id = GENDER_TEST.000018
+# text = Mensch traf xier
+1	Mensch	_	_	_	_	_	_	_	_
+2	traf	_	VERB	VVFIN	_	_	_	_	_
+3	xier	_	_	_	_	_	_	_	_
+
diff --git a/test/test.js b/test/test.js
index 31a39e2..b954e06 100644
--- a/test/test.js
+++ b/test/test.js
@@ -110,7 +110,7 @@
     const command = 'node src/index.js < test/data/gender.conllu';
     const stdout = execSync(command).toString();
     const foundry_count = (stdout.match(/# foundry = gender/g) || []).length;
-    expect(foundry_count).toBe(12);
+    expect(foundry_count).toBe(18);
   });
 
   test('Full mode: non-gender tokens pass through unchanged', () => {
@@ -135,16 +135,17 @@
       const featsAnnotated  = cols[5] !== '_';
       expect(lemmaAnnotated || featsAnnotated).toBe(true);
     });
-    // Count: 18 NOUN + 5 DET + 1 PRON (sie*er) + 5 neo-pronouns (sier,xier,oj,el,hen) = 29
-    expect(tokenLines.length).toBe(29);
+    // Count: 18 NOUN + 5 DET + 1 PRON (sie*er) + 5 neo-pronouns (sier,xier,oj,el,hen)
+    //       + 7 new neo-pronouns from sentences 16–18 (el,em,ey,y,mensch,Mensch,xier) = 36
+    expect(tokenLines.length).toBe(36);
   });
 
   test('Sparse mode: sentence headers are emitted for sentences with matches', () => {
     const command = 'node src/index.js -s < test/data/gender.conllu';
     const stdout = execSync(command).toString();
-    // All 12 test sentences have at least one gender form
+    // 12 original + 3 new sentences (16–18) have at least one gender form
     const text_id_count = (stdout.match(/# text_id = /g) || []).length;
-    expect(text_id_count).toBe(12);
+    expect(text_id_count).toBe(15);
   });
 
   test('Inline input: basic Genderstern annotation', () => {
@@ -185,4 +186,112 @@
     // jede (without gender marker): unchanged
     expect(stdout).toContain('jede\t_\tDET\tPIAT\tGender=Fem|Number=Sing');
   });
+
+  // ---------------------------------------------------------------------------
+  // Regression tests: false-positive tokens that must NOT be tagged
+  // ---------------------------------------------------------------------------
+
+  test('No false positives: *, Y, per, EL, EM, Ey, sin mid-sentence pass through unchanged', () => {
+    // Regression guard: each of these tokens used to be matched as a neo-pronoun.
+    // Two sentences place all seven forms in mid-sentence position.
+    const testInput = `# foundry = base
+# text_id = fp-001
+# text = Hinweis auf * und Y sowie per Einschreiben
+1\tHinweis\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+2\tauf\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+3\t*\t_\tPUNCT\t$(\t_\t_\t_\t_\t_
+4\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+5\tY\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+6\tsowie\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+7\tper\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+8\tEinschreiben\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+
+# foundry = base
+# text_id = fp-002
+# text = Verweise auf EL EM Ey sin im Text
+1\tVerweise\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+2\tauf\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+3\tEL\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+4\tEM\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+5\tEy\t_\tITJ\tITJ\t_\t_\t_\t_\t_
+6\tsin\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+7\tim\t_\tADP\tAPPRART\t_\t_\t_\t_\t_
+8\tText\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+
+`;
+    const stdout = execSync('node src/index.js', { input: testInput }).toString();
+    // Every expectation spans ID through FEATS: an untouched token keeps its '_'
+    // lemma, its original upos/xpos and its '_' feats, so any rewrite towards
+    // PRON/PPER or added pronoun features makes the match fail.
+    expect(stdout).toContain('3\t*\t_\tPUNCT\t$(\t_');
+    expect(stdout).toContain('5\tY\t_\tNOUN\tNN\t_');
+    expect(stdout).toContain('7\tper\t_\tADP\tAPPR\t_');
+    expect(stdout).toContain('3\tEL\t_\tNOUN\tNN\t_');
+    expect(stdout).toContain('4\tEM\t_\tNOUN\tNN\t_');
+    expect(stdout).toContain('5\tEy\t_\tITJ\tITJ\t_');
+    expect(stdout).toContain('6\tsin\t_\tNOUN\tNN\t_');
+  });
+
+  test('No false positive: Mensch mid-sentence must not be tagged as neo-pronoun', () => {
+    const conllu = `# foundry = base
+# text_id = fp-003
+# text = Jeder Mensch hat Würde
+1\tJeder\t_\tDET\tPIAT\t_\t_\t_\t_\t_
+2\tMensch\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+3\that\t_\tAUX\tVAFIN\t_\t_\t_\t_\t_
+4\tWürde\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+
+`;
+    const output = execSync('node src/index.js', { input: conllu }).toString();
+    // Token 2 is not sentence-initial, so the common noun has to come out untouched.
+    expect(output).toContain('2\tMensch\t_\tNOUN\tNN\t_');
+  });
+
+  test('Neo-pronoun: lowercase el, em, ey, y mid-sentence are still tagged', () => {
+    const testInput = `# foundry = base
+# text_id = neo-lc-001
+# text = dankte el und em für ey und y
+1\tdankte\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+2\tel\t_\t_\t_\t_\t_\t_\t_\t_
+3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+4\tem\t_\t_\t_\t_\t_\t_\t_\t_
+5\tfür\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+6\tey\t_\t_\t_\t_\t_\t_\t_\t_
+7\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+8\ty\t_\t_\t_\t_\t_\t_\t_\t_
+
+`;
+    const stdout = execSync('node src/index.js', { input: testInput }).toString();
+    // Expectations start at the token ID so each one pins exactly one output row.
+    for (const row of ['2\tel\tel', '4\tem\tem', '6\tey\tey', '8\ty\tY']) {
+      expect(stdout).toContain(`${row}\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs`);
+    }
+  });
+
+  test('Neo-pronoun: mensch lowercase and sentence-initial Mensch are tagged', () => {
+    const conllu = `# foundry = base
+# text_id = neo-mensch-001
+# text = mensch fragte und Mensch antwortete
+1\tmensch\t_\t_\t_\t_\t_\t_\t_\t_
+2\tfragte\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+4\tMensch\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+5\tantwortete\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+
+# foundry = base
+# text_id = neo-mensch-002
+# text = Mensch traf xier
+1\tMensch\t_\t_\t_\t_\t_\t_\t_\t_
+2\ttraf\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+3\txier\t_\t_\t_\t_\t_\t_\t_\t_
+
+`;
+    const output = execSync('node src/index.js', { input: conllu }).toString();
+    // First sentence, token 4: capitalised 'Mensch' away from the start stays a noun.
+    expect(output).toContain('4\tMensch\t_\tNOUN\tNN\t_');
+    // First sentence, token 1: lowercase 'mensch' is annotated as a neo-pronoun.
+    expect(output).toContain('1\tmensch\tmensch\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
+    // Second sentence, token 1: sentence-initial 'Mensch' is annotated as well.
+    expect(output).toContain('1\tMensch\tmensch\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
+  });
 });