// Source blob: b954e06948dd8d650c3cf3c4ac5c8a947a7d70d6
const { execSync } = require('child_process');

/**
 * End-to-end tests for the conllu-gender command line tool (src/index.js).
 *
 * Every test feeds CoNLL-U text to the CLI on stdin and checks the text it
 * writes to stdout. Two kinds of input are used:
 *   - the shared fixture file test/data/gender.conllu (full and sparse mode),
 *   - small inline documents built inside the individual tests.
 */
describe('conllu-gender', () => {
  // Shell commands that run the CLI on the shared fixture file.
  const FULL_MODE_COMMAND = 'node src/index.js < test/data/gender.conllu';
  const SPARSE_MODE_COMMAND = 'node src/index.js -s < test/data/gender.conllu';

  // stdout of each fixture command, keyed by the command string.
  const fixtureOutputs = new Map();

  /**
   * Runs a fixture command and returns its stdout as a string.
   *
   * Many tests inspect the output of the very same command, so the result is
   * memoized and the CLI is spawned only once per distinct command. A failing
   * run throws before anything is cached, so every test that needs the output
   * still fails individually.
   *
   * @param {string} command - Shell command to execute.
   * @returns {string} Captured stdout of the command.
   */
  function runFixture(command) {
    if (!fixtureOutputs.has(command)) {
      fixtureOutputs.set(command, execSync(command).toString());
    }
    return fixtureOutputs.get(command);
  }

  /**
   * Pipes an inline CoNLL-U document to the CLI (full mode) and returns stdout.
   *
   * @param {string[][]} sentences - One array of lines (comment lines followed
   *   by token lines) per sentence. Each sentence is terminated by a blank
   *   line in the generated input.
   * @returns {string} Captured stdout of the CLI.
   */
  function annotateInline(sentences) {
    const input = sentences.map((lines) => `${lines.join('\n')}\n\n`).join('');
    return execSync('node src/index.js', { input }).toString();
  }

  /**
   * Counts the matches of a global regular expression in a text.
   *
   * @param {string} text - Text to search.
   * @param {RegExp} pattern - Pattern carrying the `g` flag.
   * @returns {number} Number of matches, 0 when there is none.
   */
  function countMatches(text, pattern) {
    return (text.match(pattern) || []).length;
  }

  test('Full mode: all gender-sensitive nouns (Genderstern singular)', () => {
    const stdout = runFixture(FULL_MODE_COMMAND);

    // Genderstern singular noun: Bürger*in → lemma Bürger*in, NOUN NN Gender=Fem,Masc,NonBin|Number=Sing
    expect(stdout).toContain('Bürger*in\tBürger*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
    // Long compound: Jugendpsychiater*in
    expect(stdout).toContain('Jugendpsychiater*in\tJugendpsychiater*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
    // Compound with umlaut base: Generalstaatsanwält*in
    expect(stdout).toContain('Generalstaatsanwält*in\tGeneralstaatsanwält*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
    // Nachfolger*in
    expect(stdout).toContain('Nachfolger*in\tNachfolger*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
    // Long compound without umlaut: Antragssteller*in
    expect(stdout).toContain('Antragssteller*in\tAntragssteller*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
  });

  test('Full mode: Genderstern plural nouns → lemma uses singular, Number=Plur', () => {
    const stdout = runFixture(FULL_MODE_COMMAND);
    // Fachärzt*innen → lemma Fachärzt*in
    expect(stdout).toContain('Fachärzt*innen\tFachärzt*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
  });

  test('Full mode: Doppelpunkt nouns', () => {
    const stdout = runFixture(FULL_MODE_COMMAND);
    // Doppelpunkt singular
    expect(stdout).toContain('Anhänger:in\tAnhänger:in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
    expect(stdout).toContain('Wirt:in\tWirt:in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
    // Doppelpunkt plural
    expect(stdout).toContain('Lehrer:innen\tLehrer:in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
    // Long compound with umlaut base
    expect(stdout).toContain('Menschenrechtsanwält:innen\tMenschenrechtsanwält:in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
  });

  test('Full mode: Unterstrich plural nouns', () => {
    const stdout = runFixture(FULL_MODE_COMMAND);
    expect(stdout).toContain('Autor_innen\tAutor_in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
  });

  test('Full mode: Binnen-I nouns (binary intended → Gender=Masc,Fem)', () => {
    const stdout = runFixture(FULL_MODE_COMMAND);
    // Binnen-I singular: ZeugIn → Gender=Masc,Fem|Number=Sing
    expect(stdout).toContain('ZeugIn\tZeugIn\tNOUN\tNN\tGender=Masc,Fem|Number=Sing');
    // Binnen-I singular: SchülerIn → Gender=Masc,Fem|Number=Sing
    expect(stdout).toContain('SchülerIn\tSchülerIn\tNOUN\tNN\tGender=Masc,Fem|Number=Sing');
    // Binnen-I plural: LehrerInnen → lemma LehrerIn, Number=Plur
    expect(stdout).toContain('LehrerInnen\tLehrerIn\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
  });

  test('Full mode: Klammern plural nouns (binary intended)', () => {
    const stdout = runFixture(FULL_MODE_COMMAND);
    expect(stdout).toContain('Schüler(innen)\tSchüler(in)\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
  });

  test('Full mode: Schrägstrich nouns', () => {
    const stdout = runFixture(FULL_MODE_COMMAND);
    // Simple slash
    expect(stdout).toContain('Autor/innen\tAutor/in\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
    // Slash with Ergänzungsstrich (/-innen)
    expect(stdout).toContain('Autor/-innen\tAutor/in\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
    expect(stdout).toContain('Spieler/-innen\tSpieler/in\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
  });

  test('Full mode: gendered determiners/pronouns (non-binary intended)', () => {
    const stdout = runFixture(FULL_MODE_COMMAND);
    // jede*r → DET PIAT Gender=Fem,Masc,NonBin, lemma jede*r
    expect(stdout).toContain('jede*r\tjede*r\tDET\tPIAT\tGender=Fem,Masc,NonBin');
    const jederCount = countMatches(stdout, /jede\*r\tjede\*r/g);
    expect(jederCount).toBe(2); // appears in two sentences
    // eine*n → DET ART Gender=Fem,Masc,NonBin (non-binary via *)
    expect(stdout).toContain('eine*n\teine*n\tDET\tART\tGender=Fem,Masc,NonBin');
    // jede:r → DET PIAT Gender=Fem,Masc,NonBin
    expect(stdout).toContain('jede:r\tjede:r\tDET\tPIAT\tGender=Fem,Masc,NonBin');
    // die*der → DET ART Gender=Fem,Masc,NonBin (* marker → inclusive even for merged forms)
    expect(stdout).toContain('die*der\tdie*der\tDET\tART\tGender=Fem,Masc,NonBin');
  });

  test('Full mode: neo-pronoun (sie*er → PRON PPER Gender=Fem,Masc,NonBin)', () => {
    const stdout = runFixture(FULL_MODE_COMMAND);
    expect(stdout).toContain('sie*er\tsie*er\tPRON\tPPER\tGender=Fem,Masc,NonBin');
  });

  test('Full mode: neo-pronoun lexicon forms (PRON PPER Gender=Fem,Masc,NonBin|PronType=Prs)', () => {
    const stdout = runFixture(FULL_MODE_COMMAND);
    // Nominative forms of the sier-, xier-, oj-, el- and hen-paradigms;
    // for each of them the lemma equals the surface form.
    for (const form of ['sier', 'xier', 'oj', 'el', 'hen']) {
      expect(stdout).toContain(`${form}\t${form}\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs`);
    }
  });

  test('Full mode: foundry comment changed to gender', () => {
    const stdout = runFixture(FULL_MODE_COMMAND);
    const foundryCount = countMatches(stdout, /# foundry = gender/g);
    expect(foundryCount).toBe(18);
  });

  test('Full mode: non-gender tokens pass through unchanged', () => {
    const stdout = runFixture(FULL_MODE_COMMAND);
    // Regular nouns are passed through, lemma stays _
    expect(stdout).toContain('Umbenennung\t_\tNOUN\tNN');
    expect(stdout).toContain('Ideen\t_\tNOUN\tNN');
  });

  test('Sparse mode: only annotated tokens are emitted', () => {
    const stdout = runFixture(SPARSE_MODE_COMMAND);
    // Token lines start with a numeric ID followed by a tab.
    const tokenLines = stdout.split('\n').filter((line) => /^\d+\t/.test(line));
    // Every token line must carry a gender annotation:
    // either the lemma column (index 2) is non-underscore (noun/pron was annotated)
    // or the features column (index 5) is non-underscore (det was annotated)
    for (const line of tokenLines) {
      const cols = line.split('\t');
      const lemmaAnnotated = cols[2] !== '_';
      const featsAnnotated = cols[5] !== '_';
      expect(lemmaAnnotated || featsAnnotated).toBe(true);
    }
    // Count: 18 NOUN + 5 DET + 1 PRON (sie*er) + 5 neo-pronouns (sier,xier,oj,el,hen)
    // + 7 new neo-pronouns from sentences 16–18 (el,em,ey,y,mensch,Mensch,xier) = 36
    expect(tokenLines.length).toBe(36);
  });

  test('Sparse mode: sentence headers are emitted for sentences with matches', () => {
    const stdout = runFixture(SPARSE_MODE_COMMAND);
    // 12 original + 3 new sentences (16–18) have at least one gender form
    const textIdCount = countMatches(stdout, /# text_id = /g);
    expect(textIdCount).toBe(15);
  });

  test('Inline input: basic Genderstern annotation', () => {
    const stdout = annotateInline([
      [
        '# foundry = base',
        '# text_id = inline-001',
        '# text = Die Lehrerin und Lehrer*innen kamen',
        '1\tDie\t_\tDET\tART\t_\t_\t_\t_\t_',
        '2\tLehrerin\t_\tNOUN\tNN\t_\t_\t_\t_\t_',
        '3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_',
        '4\tLehrer*innen\t_\t_\t_\t_\t_\t_\t_\t_',
        '5\tkamen\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_',
      ],
    ]);
    expect(stdout).toContain('Lehrer*innen\tLehrer*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
    // Regular noun 'Lehrerin' must not be incorrectly tagged
    expect(stdout).toContain('Lehrerin\t_\tNOUN\tNN\t_');
  });

  test('Inline input: existing POS/lemma/feats are replaced for gender forms', () => {
    const stdout = annotateInline([
      [
        '# foundry = base',
        '# text_id = inline-002',
        '# text = jede Ärztin und jede*r Arzt*in',
        '1\tjede\t_\tDET\tPIAT\tGender=Fem|Number=Sing\t_\t_\t_\t_',
        '2\tÄrztin\tÄrztin\tNOUN\tNN\tGender=Fem|Number=Sing\t_\t_\t_\t_',
        '3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_',
        '4\tjede*r\t_\tDET\tPIAT\t_\t_\t_\t_\t_',
        '5\tArzt*in\t_\t_\t_\t_\t_\t_\t_\t_',
      ],
    ]);
    // jede*r: missing lemma and feats should be filled in
    expect(stdout).toContain('jede*r\tjede*r\tDET\tPIAT\tGender=Fem,Masc,NonBin');
    // Arzt*in: all annotation columns are empty in the input and must be filled in
    expect(stdout).toContain('Arzt*in\tArzt*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
    // Ärztin (regular moviertes Femininum, no gender marker): unchanged
    expect(stdout).toContain('Ärztin\tÄrztin\tNOUN\tNN\tGender=Fem|Number=Sing');
    // jede (without gender marker): unchanged
    expect(stdout).toContain('jede\t_\tDET\tPIAT\tGender=Fem|Number=Sing');
  });

  // ---------------------------------------------------------------------------
  // Regression tests: false-positive tokens that must NOT be tagged
  // ---------------------------------------------------------------------------

  test('No false positives: *, Y, per, EL, EM, Ey, sin mid-sentence pass through unchanged', () => {
    // Each of these appeared as spurious neo-pronoun matches in the original code.
    // They must not receive a neo-pronoun annotation.
    const stdout = annotateInline([
      [
        '# foundry = base',
        '# text_id = fp-001',
        '# text = Hinweis auf * und Y sowie per Einschreiben',
        '1\tHinweis\t_\tNOUN\tNN\t_\t_\t_\t_\t_',
        '2\tauf\t_\tADP\tAPPR\t_\t_\t_\t_\t_',
        '3\t*\t_\tPUNCT\t$(\t_\t_\t_\t_\t_',
        '4\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_',
        '5\tY\t_\tNOUN\tNN\t_\t_\t_\t_\t_',
        '6\tsowie\t_\tCCONJ\tKON\t_\t_\t_\t_\t_',
        '7\tper\t_\tADP\tAPPR\t_\t_\t_\t_\t_',
        '8\tEinschreiben\t_\tNOUN\tNN\t_\t_\t_\t_\t_',
      ],
      [
        '# foundry = base',
        '# text_id = fp-002',
        '# text = Verweise auf EL EM Ey sin im Text',
        '1\tVerweise\t_\tNOUN\tNN\t_\t_\t_\t_\t_',
        '2\tauf\t_\tADP\tAPPR\t_\t_\t_\t_\t_',
        '3\tEL\t_\tNOUN\tNN\t_\t_\t_\t_\t_',
        '4\tEM\t_\tNOUN\tNN\t_\t_\t_\t_\t_',
        '5\tEy\t_\tITJ\tITJ\t_\t_\t_\t_\t_',
        '6\tsin\t_\tNOUN\tNN\t_\t_\t_\t_\t_',
        '7\tim\t_\tADP\tAPPRART\t_\t_\t_\t_\t_',
        '8\tText\t_\tNOUN\tNN\t_\t_\t_\t_\t_',
      ],
    ]);
    // None of the false-positive tokens should receive a neo-pronoun annotation.
    // The assertions check that the lemma column stays '_' and that the
    // original UPOS is kept (i.e. it was not overwritten with PRON).
    expect(stdout).toContain('3\t*\t_\tPUNCT');
    expect(stdout).toContain('5\tY\t_\tNOUN');
    expect(stdout).toContain('7\tper\t_\tADP');
    expect(stdout).toContain('3\tEL\t_\tNOUN');
    expect(stdout).toContain('4\tEM\t_\tNOUN');
    expect(stdout).toContain('5\tEy\t_\tITJ');
    expect(stdout).toContain('6\tsin\t_\tNOUN');
  });

  test('No false positive: Mensch mid-sentence must not be tagged as neo-pronoun', () => {
    const stdout = annotateInline([
      [
        '# foundry = base',
        '# text_id = fp-003',
        '# text = Jeder Mensch hat Würde',
        '1\tJeder\t_\tDET\tPIAT\t_\t_\t_\t_\t_',
        '2\tMensch\t_\tNOUN\tNN\t_\t_\t_\t_\t_',
        '3\that\t_\tAUX\tVAFIN\t_\t_\t_\t_\t_',
        '4\tWürde\t_\tNOUN\tNN\t_\t_\t_\t_\t_',
      ],
    ]);
    // 'Mensch' at position 2 (not sentence-initial) must not be tagged.
    expect(stdout).toContain('2\tMensch\t_\tNOUN\tNN\t_');
  });

  test('Neo-pronoun: lowercase el, em, ey, y mid-sentence are still tagged', () => {
    const stdout = annotateInline([
      [
        '# foundry = base',
        '# text_id = neo-lc-001',
        '# text = dankte el und em für ey und y',
        '1\tdankte\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_',
        '2\tel\t_\t_\t_\t_\t_\t_\t_\t_',
        '3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_',
        '4\tem\t_\t_\t_\t_\t_\t_\t_\t_',
        '5\tfür\t_\tADP\tAPPR\t_\t_\t_\t_\t_',
        '6\tey\t_\t_\t_\t_\t_\t_\t_\t_',
        '7\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_',
        '8\ty\t_\t_\t_\t_\t_\t_\t_\t_',
      ],
    ]);
    expect(stdout).toContain('el\tel\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
    expect(stdout).toContain('em\tem\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
    expect(stdout).toContain('ey\tey\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
    // NOTE(review): the expected lemma for 'y' is uppercase 'Y', unlike the
    // other forms above — confirm this matches the neo-pronoun lexicon.
    expect(stdout).toContain('y\tY\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
  });

  test('Neo-pronoun: mensch lowercase and sentence-initial Mensch are tagged', () => {
    const stdout = annotateInline([
      [
        '# foundry = base',
        '# text_id = neo-mensch-001',
        '# text = mensch fragte und Mensch antwortete',
        '1\tmensch\t_\t_\t_\t_\t_\t_\t_\t_',
        '2\tfragte\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_',
        '3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_',
        '4\tMensch\t_\tNOUN\tNN\t_\t_\t_\t_\t_',
        '5\tantwortete\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_',
      ],
      [
        '# foundry = base',
        '# text_id = neo-mensch-002',
        '# text = Mensch traf xier',
        '1\tMensch\t_\t_\t_\t_\t_\t_\t_\t_',
        '2\ttraf\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_',
        '3\txier\t_\t_\t_\t_\t_\t_\t_\t_',
      ],
    ]);
    // lowercase 'mensch' → neo-pronoun
    expect(stdout).toContain('1\tmensch\tmensch\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
    // 'Mensch' mid-sentence (position 4) → unchanged common noun
    expect(stdout).toContain('4\tMensch\t_\tNOUN\tNN\t_');
    // sentence-initial 'Mensch' (position 1) → neo-pronoun with lowercase lemma
    expect(stdout).toContain('1\tMensch\tmensch\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
  });
});