Fix neo-pronoun false positives (*, Y, per, EL, EM, Ey, sin, mid-sentence Mensch)
Change-Id: Ifaec3023569865919c524dc4cd72a4a418949711
diff --git a/test/test.js b/test/test.js
index 31a39e2..b954e06 100644
--- a/test/test.js
+++ b/test/test.js
@@ -110,7 +110,7 @@
const command = 'node src/index.js < test/data/gender.conllu';
const stdout = execSync(command).toString();
const foundry_count = (stdout.match(/# foundry = gender/g) || []).length;
- expect(foundry_count).toBe(12);
+ expect(foundry_count).toBe(18);
});
test('Full mode: non-gender tokens pass through unchanged', () => {
@@ -135,16 +135,17 @@
const featsAnnotated = cols[5] !== '_';
expect(lemmaAnnotated || featsAnnotated).toBe(true);
});
- // Count: 18 NOUN + 5 DET + 1 PRON (sie*er) + 5 neo-pronouns (sier,xier,oj,el,hen) = 29
- expect(tokenLines.length).toBe(29);
+ // Count: 18 NOUN + 5 DET + 1 PRON (sie*er) + 5 neo-pronouns (sier,xier,oj,el,hen)
+ // + 7 new neo-pronouns from sentences 16–18 (el,em,ey,y,mensch,Mensch,xier) = 36
+ expect(tokenLines.length).toBe(36);
});
test('Sparse mode: sentence headers are emitted for sentences with matches', () => {
const command = 'node src/index.js -s < test/data/gender.conllu';
const stdout = execSync(command).toString();
- // All 12 test sentences have at least one gender form
+ // 12 original + 3 new sentences (16–18) have at least one gender form
const text_id_count = (stdout.match(/# text_id = /g) || []).length;
- expect(text_id_count).toBe(12);
+ expect(text_id_count).toBe(15);
});
test('Inline input: basic Genderstern annotation', () => {
@@ -185,4 +186,112 @@
// jede (without gender marker): unchanged
expect(stdout).toContain('jede\t_\tDET\tPIAT\tGender=Fem|Number=Sing');
});
+
+ // ---------------------------------------------------------------------------
+ // Regression tests: false-positive tokens that must NOT be tagged
+ // ---------------------------------------------------------------------------
+
+ test('No false positives: *, Y, per, EL, EM, Ey, sin mid-sentence pass through unchanged', () => {
+ // Regression guard: each of these forms used to be matched as a neo-pronoun.
+ // None of them may receive a neo-pronoun annotation.
+ const testInput = `# foundry = base
+# text_id = fp-001
+# text = Hinweis auf * und Y sowie per Einschreiben
+1\tHinweis\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+2\tauf\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+3\t*\t_\tPUNCT\t$(\t_\t_\t_\t_\t_
+4\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+5\tY\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+6\tsowie\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+7\tper\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+8\tEinschreiben\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+
+# foundry = base
+# text_id = fp-002
+# text = Verweise auf EL EM Ey sin im Text
+1\tVerweise\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+2\tauf\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+3\tEL\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+4\tEM\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+5\tEy\t_\tITJ\tITJ\t_\t_\t_\t_\t_
+6\tsin\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+7\tim\t_\tADP\tAPPRART\t_\t_\t_\t_\t_
+8\tText\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+
+`;
+ const stdout = execSync('node src/index.js', { input: testInput }).toString();
+ // Each candidate row is checked from the token ID through the FEATS column:
+ // the lemma must stay '_' and upos/xpos/feats must keep their input values
+ // instead of being overwritten with a PRON/PPER neo-pronoun annotation.
+ expect(stdout).toContain('3\t*\t_\tPUNCT\t$(\t_');
+ expect(stdout).toContain('5\tY\t_\tNOUN\tNN\t_');
+ expect(stdout).toContain('7\tper\t_\tADP\tAPPR\t_');
+ expect(stdout).toContain('3\tEL\t_\tNOUN\tNN\t_');
+ expect(stdout).toContain('4\tEM\t_\tNOUN\tNN\t_');
+ expect(stdout).toContain('5\tEy\t_\tITJ\tITJ\t_');
+ expect(stdout).toContain('6\tsin\t_\tNOUN\tNN\t_');
+ });
+
+ test('No false positive: Mensch mid-sentence must not be tagged as neo-pronoun', () => {
+ const testInput = `# foundry = base
+# text_id = fp-003
+# text = Jeder Mensch hat Würde
+1\tJeder\t_\tDET\tPIAT\t_\t_\t_\t_\t_
+2\tMensch\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+3\that\t_\tAUX\tVAFIN\t_\t_\t_\t_\t_
+4\tWürde\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+
+`;
+ const stdout = execSync('node src/index.js', { input: testInput }).toString();
+ // 'Mensch' is token 2 here (not sentence-initial): its row must still show lemma '_', NOUN/NN and feats '_'.
+ expect(stdout).toContain('2\tMensch\t_\tNOUN\tNN\t_');
+ });
+
+ test('Neo-pronoun: lowercase el, em, ey, y mid-sentence are still tagged', () => { // positive control for the lowercase forms
+ const testInput = `# foundry = base
+# text_id = neo-lc-001
+# text = dankte el und em für ey und y
+1\tdankte\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+2\tel\t_\t_\t_\t_\t_\t_\t_\t_
+3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+4\tem\t_\t_\t_\t_\t_\t_\t_\t_
+5\tfür\t_\tADP\tAPPR\t_\t_\t_\t_\t_
+6\tey\t_\t_\t_\t_\t_\t_\t_\t_
+7\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+8\ty\t_\t_\t_\t_\t_\t_\t_\t_
+
+`;
+ const stdout = execSync('node src/index.js', { input: testInput }).toString(); // fixture is passed via the `input` option
+ expect(stdout).toContain('2\tel\tel\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs'); // token 2, anchored by its ID
+ expect(stdout).toContain('4\tem\tem\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs'); // token 4
+ expect(stdout).toContain('6\tey\tey\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs'); // token 6
+ expect(stdout).toContain('8\ty\tY\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs'); // token 8; expected lemma is uppercase 'Y'
+ });
+
+ test('Neo-pronoun: mensch lowercase and sentence-initial Mensch are tagged', () => {
+ const testInput = `# foundry = base
+# text_id = neo-mensch-001
+# text = mensch fragte und Mensch antwortete
+1\tmensch\t_\t_\t_\t_\t_\t_\t_\t_
+2\tfragte\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
+4\tMensch\t_\tNOUN\tNN\t_\t_\t_\t_\t_
+5\tantwortete\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+
+# foundry = base
+# text_id = neo-mensch-002
+# text = Mensch traf xier
+1\tMensch\t_\t_\t_\t_\t_\t_\t_\t_
+2\ttraf\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
+3\txier\t_\t_\t_\t_\t_\t_\t_\t_
+
+`;
+ const stdout = execSync('node src/index.js', { input: testInput }).toString();
+ // Sentence neo-mensch-001, token 1: lowercase 'mensch' → tagged as neo-pronoun
+ expect(stdout).toContain('1\tmensch\tmensch\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
+ // Sentence neo-mensch-001, token 4: capitalised 'Mensch' mid-sentence → NOUN/NN row left as-is
+ expect(stdout).toContain('4\tMensch\t_\tNOUN\tNN\t_');
+ // Sentence neo-mensch-002, token 1: sentence-initial 'Mensch' → neo-pronoun with lowercase lemma 'mensch'
+ expect(stdout).toContain('1\tMensch\tmensch\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
+ });
});