Fix neo-pronoun false positives (*, Y, per, EL, EM, Ey, sin, Mensch)
Change-Id: Ifaec3023569865919c524dc4cd72a4a418949711
diff --git a/test/data/gender.conllu b/test/data/gender.conllu
index 9319d7c..e764a9e 100644
--- a/test/data/gender.conllu
+++ b/test/data/gender.conllu
@@ -138,3 +138,77 @@
7 die _ DET ART _ _ _ _ _
8 Hilfe _ NOUN NN _ _ _ _ _
+# --- Regression tests: false-positive prevention --------------------------
+# Tokens *, Y, per, EL, EM, Ey, sin and mid-sentence capitalised Mensch are
+# known false positives and must NOT be tagged as neo-pronouns.
+
+# foundry = base
+# filename = TEST/gender/000013/base/tokens.xml
+# text_id = GENDER_TEST.000013
+# text = Hinweis auf * und Y sowie per Einschreiben
+1 Hinweis _ NOUN NN _ _ _ _ _
+2 auf _ ADP APPR _ _ _ _ _
+3 * _ PUNCT $( _ _ _ _ _
+4 und _ CCONJ KON _ _ _ _ _
+5 Y _ NOUN NN _ _ _ _ _
+6 sowie _ CCONJ KON _ _ _ _ _
+7 per _ ADP APPR _ _ _ _ _
+8 Einschreiben _ NOUN NN _ _ _ _ _
+
+# foundry = base
+# filename = TEST/gender/000014/base/tokens.xml
+# text_id = GENDER_TEST.000014
+# text = Verweise auf EL EM Ey sin im Text
+1 Verweise _ NOUN NN _ _ _ _ _
+2 auf _ ADP APPR _ _ _ _ _
+3 EL _ NOUN NN _ _ _ _ _
+4 EM _ NOUN NN _ _ _ _ _
+5 Ey _ ITJ ITJ _ _ _ _ _
+6 sin _ NOUN NN _ _ _ _ _
+7 im _ ADP APPRART _ _ _ _ _
+8 Text _ NOUN NN _ _ _ _ _
+
+# foundry = base
+# filename = TEST/gender/000015/base/tokens.xml
+# text_id = GENDER_TEST.000015
+# text = Jeder Mensch hat Würde
+1 Jeder _ DET PIAT _ _ _ _ _
+2 Mensch _ NOUN NN _ _ _ _ _
+3 hat _ AUX VAFIN _ _ _ _ _
+4 Würde _ NOUN NN _ _ _ _ _
+
+# --- Regression tests: correct neo-pronoun recognition -------------------
+# Lowercase forms and sentence-initial capitalised neo-pronouns must still
+# be tagged; only mid-sentence uppercase ambiguous forms are suppressed.
+
+# foundry = base
+# filename = TEST/gender/000016/base/tokens.xml
+# text_id = GENDER_TEST.000016
+# text = dankte el und em für ey und y
+1 dankte _ VERB VVFIN _ _ _ _ _
+2 el _ _ _ _ _ _ _ _
+3 und _ CCONJ KON _ _ _ _ _
+4 em _ _ _ _ _ _ _ _
+5 für _ ADP APPR _ _ _ _ _
+6 ey _ _ _ _ _ _ _ _
+7 und _ CCONJ KON _ _ _ _ _
+8 y _ _ _ _ _ _ _ _
+
+# foundry = base
+# filename = TEST/gender/000017/base/tokens.xml
+# text_id = GENDER_TEST.000017
+# text = mensch fragte und Mensch antwortete
+1 mensch _ _ _ _ _ _ _ _
+2 fragte _ VERB VVFIN _ _ _ _ _
+3 und _ CCONJ KON _ _ _ _ _
+4 Mensch _ NOUN NN _ _ _ _ _
+5 antwortete _ VERB VVFIN _ _ _ _ _
+
+# foundry = base
+# filename = TEST/gender/000018/base/tokens.xml
+# text_id = GENDER_TEST.000018
+# text = Mensch traf xier
+1 Mensch _ _ _ _ _ _ _ _
+2 traf _ VERB VVFIN _ _ _ _ _
+3 xier _ _ _ _ _ _ _ _
+
diff --git a/test/test.js b/test/test.js
index 31a39e2..b954e06 100644
--- a/test/test.js
+++ b/test/test.js
@@ -110,7 +110,7 @@
const command = 'node src/index.js < test/data/gender.conllu';
const stdout = execSync(command).toString();
const foundry_count = (stdout.match(/# foundry = gender/g) || []).length;
- expect(foundry_count).toBe(12);
+ expect(foundry_count).toBe(18);
});
test('Full mode: non-gender tokens pass through unchanged', () => {
@@ -135,16 +135,17 @@
const featsAnnotated = cols[5] !== '_';
expect(lemmaAnnotated || featsAnnotated).toBe(true);
});
- // Count: 18 NOUN + 5 DET + 1 PRON (sie*er) + 5 neo-pronouns (sier,xier,oj,el,hen) = 29
- expect(tokenLines.length).toBe(29);
+ // Count: 18 NOUN + 5 DET + 1 PRON (sie*er) + 5 neo-pronouns (sier,xier,oj,el,hen)
+ // + 7 new neo-pronouns from sentences 16–18 (el,em,ey,y,mensch,Mensch,xier) = 36
+ expect(tokenLines.length).toBe(36);
});
test('Sparse mode: sentence headers are emitted for sentences with matches', () => {
const command = 'node src/index.js -s < test/data/gender.conllu';
const stdout = execSync(command).toString();
- // All 12 test sentences have at least one gender form
+ // 12 original + 3 new sentences (16–18) have at least one gender form
const text_id_count = (stdout.match(/# text_id = /g) || []).length;
- expect(text_id_count).toBe(12);
+ expect(text_id_count).toBe(15);
});
test('Inline input: basic Genderstern annotation', () => {
@@ -185,4 +186,112 @@
// jede (without gender marker): unchanged
expect(stdout).toContain('jede\t_\tDET\tPIAT\tGender=Fem|Number=Sing');
});
+
+ // ---------------------------------------------------------------------------
+ // Regression tests: false-positive tokens that must NOT be tagged
+ // ---------------------------------------------------------------------------
+
+ test('No false positives: *, Y, per, EL, EM, Ey, sin mid-sentence pass through unchanged', () => {
+   // Regression guard: each of these tokens used to be matched as a neo-pronoun.
+   // Both sentences below must come back with those tokens untouched.
+   const testInput = `# foundry = base
+# text_id = fp-001
+# text = Hinweis auf * und Y sowie per Einschreiben
+1\tHinweis\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+2\tauf\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+3\t*\t_\tPUNCT\t$(\t_\t_\t_\t_\t_
+4\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+5\tY\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+6\tsowie\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+7\tper\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+8\tEinschreiben\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+
+# foundry = base
+# text_id = fp-002
+# text = Verweise auf EL EM Ey sin im Text
+1\tVerweise\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+2\tauf\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+3\tEL\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+4\tEM\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+5\tEy\t_\tITJ\tITJ\t_\t_\t_\t_\t_
+6\tsin\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+7\tim\t_\tADP\tAPPRART\t_\t_\t_\t_\t_
+8\tText\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+
+`;
+   const stdout = execSync('node src/index.js', { input: testInput }).toString();
+   // Pin every column up to and including FEATS (lemma, UPOS, XPOS, FEATS), as the
+   // 'Mensch mid-sentence' test does: a PRON/PPER retag changes UPOS/XPOS, and a
+   // feats-only annotation would otherwise slip past a check that stops at UPOS.
+   expect(stdout).toContain('3\t*\t_\tPUNCT\t$(\t_');
+   expect(stdout).toContain('5\tY\t_\tNOUN\tNN\t_');
+   expect(stdout).toContain('7\tper\t_\tADP\tAPPR\t_');
+   expect(stdout).toContain('3\tEL\t_\tNOUN\tNN\t_');
+   expect(stdout).toContain('4\tEM\t_\tNOUN\tNN\t_');
+   expect(stdout).toContain('5\tEy\t_\tITJ\tITJ\t_');
+   expect(stdout).toContain('6\tsin\t_\tNOUN\tNN\t_');
+ });
+
+ test('No false positive: Mensch mid-sentence must not be tagged as neo-pronoun', () => {
+   const conllu = `# foundry = base
+# text_id = fp-003
+# text = Jeder Mensch hat Würde
+1\tJeder\t_\tDET\tPIAT\t_\t_\t_\t_\t_
+2\tMensch\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+3\that\t_\tAUX\tVAFIN\t_\t_\t_\t_\t_
+4\tWürde\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+
+`;
+   const annotated = execSync('node src/index.js', { input: conllu }).toString();
+   // Token 2 is a non-initial capitalised 'Mensch': its NOUN/NN analysis must survive.
+   expect(annotated).toContain('2\tMensch\t_\tNOUN\tNN\t_');
+ });
+
+ test('Neo-pronoun: lowercase el, em, ey, y mid-sentence are still tagged', () => {
+   const conllu = `# foundry = base
+# text_id = neo-lc-001
+# text = dankte el und em für ey und y
+1\tdankte\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+2\tel\t_\t_\t_\t_\t_\t_\t_\t_
+3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+4\tem\t_\t_\t_\t_\t_\t_\t_\t_
+5\tfür\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+6\tey\t_\t_\t_\t_\t_\t_\t_\t_
+7\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+8\ty\t_\t_\t_\t_\t_\t_\t_\t_
+
+`;
+   const annotated = execSync('node src/index.js', { input: conllu }).toString();
+   const neoPronounTag = 'PRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs';
+   for (const [form, lemma] of [['el', 'el'], ['em', 'em'], ['ey', 'ey'], ['y', 'Y']]) {
+     expect(annotated).toContain(`${form}\t${lemma}\t${neoPronounTag}`);
+   }
+ });
+
+ test('Neo-pronoun: mensch lowercase and sentence-initial Mensch are tagged', () => {
+   const conllu = `# foundry = base
+# text_id = neo-mensch-001
+# text = mensch fragte und Mensch antwortete
+1\tmensch\t_\t_\t_\t_\t_\t_\t_\t_
+2\tfragte\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+4\tMensch\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+5\tantwortete\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+
+# foundry = base
+# text_id = neo-mensch-002
+# text = Mensch traf xier
+1\tMensch\t_\t_\t_\t_\t_\t_\t_\t_
+2\ttraf\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+3\txier\t_\t_\t_\t_\t_\t_\t_\t_
+
+`;
+   const annotated = execSync('node src/index.js', { input: conllu }).toString();
+   const neoPronounTag = '\tmensch\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs';
+   // Lowercase 'mensch' (first sentence, token 1) and sentence-initial 'Mensch' (second
+   // sentence, token 1) are neo-pronouns; mid-sentence 'Mensch' (token 4) stays a noun.
+   expect(annotated).toContain(`1\tmensch${neoPronounTag}`);
+   expect(annotated).toContain('4\tMensch\t_\tNOUN\tNN\t_');
+   expect(annotated).toContain(`1\tMensch${neoPronounTag}`);
+ });
});