Fix more FP (false positives)

Change-Id: I1f8045b4f93f64a7c8597dfbc35a1744af3aaf20

commit: acf31208f105ec986c72d830573d6d12f17b7cf3 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Tue Mar 10 09:24:28 2026 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Tue Mar 10 09:24:57 2026 +0100
tree: d8c66d7139efbd31c07984549a2c30d16cc44ab4
parent: d2b9279dbf25640587934d38e948d597f74f027d [diff]
diff --git a/src/index.js b/src/index.js
index cbc0953..47be79e 100644
--- a/src/index.js
+++ b/src/index.js

@@ -124,9 +124,9 @@
   ['eys',     neoPron('ey')],
   ['emm',     neoPron('ey')],
   // they/them  (NOM=they, GEN=their, DAT=them, ACC=them)
-  ['they',    neoPron('they')],
-  ['their',   neoPron('they')],
-  ['them',    neoPron('they')],
+  // Excluded: 'they' and 'them' are too frequent in English quotations within
+  // German texts and would cause many false positives.
+  // 'their' is likewise excluded for the same reason.
 
   // ---- Neuer Stamm (new-stem pronouns) -----------------------------------
   // el/em  (NOM=el, GEN=ems, DAT=em, ACC=en)
@@ -143,8 +143,9 @@
   // 'ens' takes priority as NOM of ens-paradigm (also GEN of en/em)
   ['ens',     neoPron('ens')],
   // et/siem  (NOM=et, GEN=sier, DAT=siem, ACC=sien)
-  // oblique forms 'sier'/'siem'/'sien' already mapped to sier-paradigm above
-  ['et',      neoPron('et')],
+  // Excluded: 'et' is ubiquitous in academic German texts as the Latin word
+  // for 'and' in the abbreviation 'et al.' and would cause pervasive false positives.
+  // The oblique forms sier/siem/sien are still captured via the sier-paradigm.
   // ex/ex  (all forms = ex)
   ['ex',      neoPron('ex')],
   // hän/sim  (NOM=hän, GEN=sir, DAT=sim, ACC=sim)

diff --git a/test/data/gender.conllu b/test/data/gender.conllu
index e764a9e..87ab0e6 100644
--- a/test/data/gender.conllu
+++ b/test/data/gender.conllu

@@ -212,3 +212,27 @@
 2	traf	_	VERB	VVFIN	_	_	_	_	_
 3	xier	_	_	_	_	_	_	_	_
 
+# foundry = base
+# filename = TEST/gender/000019/base/tokens.xml
+# text_id = GENDER_TEST.000019
+# text = Müller et al. berichten über neue Befunde
+1	Müller	_	NOUN	NN	_	_	_	_	_
+2	et	_	PART	PTKA	_	_	_	_	_
+3	al.	_	NOUN	NN	_	_	_	_	_
+4	berichten	_	VERB	VVFIN	_	_	_	_	_
+5	über	_	ADP	APPR	_	_	_	_	_
+6	neue	_	ADJ	ADJA	_	_	_	_	_
+7	Befunde	_	NOUN	NN	_	_	_	_	_
+
+# foundry = base
+# filename = TEST/gender/000020/base/tokens.xml
+# text_id = GENDER_TEST.000020
+# text = "their results confirmed the hypothesis"
+1	"	_	PUNCT	$(	_	_	_	_	_
+2	their	_	PRON	PPER	_	_	_	_	_
+3	results	_	NOUN	NN	_	_	_	_	_
+4	confirmed	_	VERB	VVFIN	_	_	_	_	_
+5	the	_	DET	ART	_	_	_	_	_
+6	hypothesis	_	NOUN	NN	_	_	_	_	_
+7	"	_	PUNCT	$(	_	_	_	_	_
+

diff --git a/test/test.js b/test/test.js
index b954e06..66cf37c 100644
--- a/test/test.js
+++ b/test/test.js

@@ -110,7 +110,7 @@
     const command = 'node src/index.js < test/data/gender.conllu';
     const stdout = execSync(command).toString();
     const foundry_count = (stdout.match(/# foundry = gender/g) || []).length;
-    expect(foundry_count).toBe(18);
+    expect(foundry_count).toBe(20);
   });
 
   test('Full mode: non-gender tokens pass through unchanged', () => {
@@ -232,6 +232,15 @@
     expect(stdout).toContain('6\tsin\t_\tNOUN');
   });
 
+  test('No false positives from gender.conllu: et (in et al.) and their (English) pass through unchanged', () => {
+    const command = 'node src/index.js < test/data/gender.conllu';
+    const stdout = execSync(command).toString();
+    // et in "Müller et al." must not be tagged
+    expect(stdout).toContain('2\tet\t_\tPART');
+    // their in English quotation must not be tagged
+    expect(stdout).toContain('2\ttheir\t_\tPRON');
+  });
+
   test('No false positive: Mensch mid-sentence must not be tagged as neo-pronoun', () => {
     const testInput = `# foundry = base
 # text_id = fp-003
commit	acf31208f105ec986c72d830573d6d12f17b7cf3	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Tue Mar 10 09:24:28 2026 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Tue Mar 10 09:24:57 2026 +0100
tree	d8c66d7139efbd31c07984549a2c30d16cc44ab4
parent	d2b9279dbf25640587934d38e948d597f74f027d [diff]