Fix more false positives (FP): stop matching 'they'/'their'/'them' and 'et' as neo-pronouns
Change-Id: I1f8045b4f93f64a7c8597dfbc35a1744af3aaf20
diff --git a/src/index.js b/src/index.js
index cbc0953..47be79e 100644
--- a/src/index.js
+++ b/src/index.js
@@ -124,9 +124,9 @@
['eys', neoPron('ey')],
['emm', neoPron('ey')],
// they/them (NOM=they, GEN=their, DAT=them, ACC=them)
- ['they', neoPron('they')],
- ['their', neoPron('they')],
- ['them', neoPron('they')],
+ // Excluded: 'they', 'their' and 'them' occur too frequently in English
+ // quotations within German texts; matching them as neo-pronouns would
+ // cause many false positives.
// ---- Neuer Stamm (new-stem pronouns) -----------------------------------
// el/em (NOM=el, GEN=ems, DAT=em, ACC=en)
@@ -143,8 +143,9 @@
// 'ens' takes priority as NOM of ens-paradigm (also GEN of en/em)
['ens', neoPron('ens')],
// et/siem (NOM=et, GEN=sier, DAT=siem, ACC=sien)
- // oblique forms 'sier'/'siem'/'sien' already mapped to sier-paradigm above
- ['et', neoPron('et')],
+ // Excluded: 'et' is ubiquitous in academic German texts as part of the
+ // Latin abbreviation 'et al.' and would cause pervasive false positives.
+ // The oblique forms sier/siem/sien are still captured via the sier-paradigm.
// ex/ex (all forms = ex)
['ex', neoPron('ex')],
// hän/sim (NOM=hän, GEN=sir, DAT=sim, ACC=sim)
diff --git a/test/data/gender.conllu b/test/data/gender.conllu
index e764a9e..87ab0e6 100644
--- a/test/data/gender.conllu
+++ b/test/data/gender.conllu
@@ -212,3 +212,27 @@
2 traf _ VERB VVFIN _ _ _ _ _
3 xier _ _ _ _ _ _ _ _
+# foundry = base
+# filename = TEST/gender/000019/base/tokens.xml
+# text_id = GENDER_TEST.000019
+# text = Müller et al. berichten über neue Befunde
+1 Müller _ NOUN NN _ _ _ _ _
+2 et _ PART PTKA _ _ _ _ _
+3 al. _ NOUN NN _ _ _ _ _
+4 berichten _ VERB VVFIN _ _ _ _ _
+5 über _ ADP APPR _ _ _ _ _
+6 neue _ ADJ ADJA _ _ _ _ _
+7 Befunde _ NOUN NN _ _ _ _ _
+
+# foundry = base
+# filename = TEST/gender/000020/base/tokens.xml
+# text_id = GENDER_TEST.000020
+# text = "their results confirmed the hypothesis"
+1 " _ PUNCT $( _ _ _ _ _
+2 their _ PRON PPER _ _ _ _ _
+3 results _ NOUN NN _ _ _ _ _
+4 confirmed _ VERB VVFIN _ _ _ _ _
+5 the _ DET ART _ _ _ _ _
+6 hypothesis _ NOUN NN _ _ _ _ _
+7 " _ PUNCT $( _ _ _ _ _
+
diff --git a/test/test.js b/test/test.js
index b954e06..66cf37c 100644
--- a/test/test.js
+++ b/test/test.js
@@ -110,7 +110,7 @@
const command = 'node src/index.js < test/data/gender.conllu';
const stdout = execSync(command).toString();
const foundry_count = (stdout.match(/# foundry = gender/g) || []).length;
- expect(foundry_count).toBe(18);
+ expect(foundry_count).toBe(20);
});
test('Full mode: non-gender tokens pass through unchanged', () => {
@@ -232,6 +232,15 @@
expect(stdout).toContain('6\tsin\t_\tNOUN');
});
+ test('No false positives from gender.conllu: et (in et al.) and their (English) pass through unchanged', () => {
+ const command = 'node src/index.js < test/data/gender.conllu';
+ const stdout = execSync(command).toString();
+ // et in "Müller et al." must not be tagged
+ expect(stdout).toContain('2\tet\t_\tPART');
+ // their in English quotation must not be tagged
+ expect(stdout).toContain('2\ttheir\t_\tPRON');
+ });
+
test('No false positive: Mensch mid-sentence must not be tagged as neo-pronoun', () => {
const testInput = `# foundry = base
# text_id = fp-003