// blob: 63670492c88a584b03a1941c9d88848203f27d16
const { execSync } = require('child_process');

// Shell commands that pipe the shared fixture file through the CLI.
// "-s" is the flag the tests below refer to as sparse mode.
const FULL_MODE_COMMAND = 'node src/index.js < test/data/gender.conllu';
const SPARSE_MODE_COMMAND = 'node src/index.js -s < test/data/gender.conllu';

// stdout per shell command, filled lazily so that every distinct command
// spawns at most one child process per run of this file instead of one per test.
// NOTE(review): assumes the CLI output is deterministic for a given input — confirm.
const stdoutByCommand = new Map();

/**
 * Runs a shell command and returns its stdout as a string, reusing the
 * result of an earlier successful run of the very same command.
 *
 * @param {string} command - Shell command line passed to `execSync`.
 * @returns {string} The command's stdout.
 * @throws {Error} Whatever `execSync` throws; nothing is cached in that
 *   case, so a later test runs the command again.
 */
function runCached(command) {
  if (!stdoutByCommand.has(command)) {
    stdoutByCommand.set(command, execSync(command).toString());
  }
  return stdoutByCommand.get(command);
}

/**
 * Counts the matches of a global regular expression in a text.
 *
 * @param {string} text - Text to search.
 * @param {RegExp} pattern - Pattern carrying the `g` flag.
 * @returns {number} Number of matches, 0 when there is none.
 */
function countMatches(text, pattern) {
  return (text.match(pattern) || []).length;
}

// End-to-end tests: each test runs `node src/index.js` on CoNLL-U input and
// checks the tab-separated token lines (and comment lines) found in stdout.
describe('conllu-gender', () => {
  test('Full mode: all gender-sensitive nouns (Genderstern singular)', () => {
    const stdout = runCached(FULL_MODE_COMMAND);

    // Genderstern singular noun: Bürger*in → lemma Bürger*in, NOUN NN Gender=NonBin|Number=Sing
    expect(stdout).toContain('Bürger*in\tBürger*in\tNOUN\tNN\tGender=NonBin|Number=Sing');
    // Long compound: Jugendpsychiater*in
    expect(stdout).toContain('Jugendpsychiater*in\tJugendpsychiater*in\tNOUN\tNN\tGender=NonBin|Number=Sing');
    // Compound with umlaut base: Generalstaatsanwält*in
    expect(stdout).toContain('Generalstaatsanwält*in\tGeneralstaatsanwält*in\tNOUN\tNN\tGender=NonBin|Number=Sing');
    // Nachfolger*in
    expect(stdout).toContain('Nachfolger*in\tNachfolger*in\tNOUN\tNN\tGender=NonBin|Number=Sing');
    // Long compound without umlaut: Antragssteller*in
    expect(stdout).toContain('Antragssteller*in\tAntragssteller*in\tNOUN\tNN\tGender=NonBin|Number=Sing');
  });

  test('Full mode: Genderstern plural nouns → lemma uses singular, Number=Plur', () => {
    const stdout = runCached(FULL_MODE_COMMAND);
    // Fachärzt*innen → lemma Fachärzt*in
    expect(stdout).toContain('Fachärzt*innen\tFachärzt*in\tNOUN\tNN\tGender=NonBin|Number=Plur');
  });

  test('Full mode: Doppelpunkt nouns', () => {
    const stdout = runCached(FULL_MODE_COMMAND);
    // Doppelpunkt singular
    expect(stdout).toContain('Anhänger:in\tAnhänger:in\tNOUN\tNN\tGender=NonBin|Number=Sing');
    expect(stdout).toContain('Wirt:in\tWirt:in\tNOUN\tNN\tGender=NonBin|Number=Sing');
    // Doppelpunkt plural
    expect(stdout).toContain('Lehrer:innen\tLehrer:in\tNOUN\tNN\tGender=NonBin|Number=Plur');
    // Long compound with umlaut base
    expect(stdout).toContain('Menschenrechtsanwält:innen\tMenschenrechtsanwält:in\tNOUN\tNN\tGender=NonBin|Number=Plur');
  });

  test('Full mode: Unterstrich plural nouns', () => {
    const stdout = runCached(FULL_MODE_COMMAND);
    expect(stdout).toContain('Autor_innen\tAutor_in\tNOUN\tNN\tGender=NonBin|Number=Plur');
  });

  test('Full mode: Binnen-I nouns (binary intended → Gender=Masc,Fem)', () => {
    const stdout = runCached(FULL_MODE_COMMAND);
    // Binnen-I singular: ZeugIn → Gender=Masc,Fem|Number=Sing
    expect(stdout).toContain('ZeugIn\tZeugIn\tNOUN\tNN\tGender=Masc,Fem|Number=Sing');
    // Binnen-I singular: SchülerIn → Gender=Masc,Fem|Number=Sing
    expect(stdout).toContain('SchülerIn\tSchülerIn\tNOUN\tNN\tGender=Masc,Fem|Number=Sing');
    // Binnen-I plural: LehrerInnen → lemma LehrerIn, Number=Plur
    expect(stdout).toContain('LehrerInnen\tLehrerIn\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
  });

  test('Full mode: Klammern plural nouns (binary intended)', () => {
    const stdout = runCached(FULL_MODE_COMMAND);
    expect(stdout).toContain('Schüler(innen)\tSchüler(in)\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
  });

  test('Full mode: Schrägstrich nouns', () => {
    const stdout = runCached(FULL_MODE_COMMAND);
    // Simple slash
    expect(stdout).toContain('Autor/innen\tAutor/in\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
    // Slash with Ergänzungsstrich (/-innen): the hyphen is dropped in the lemma
    expect(stdout).toContain('Autor/-innen\tAutor/in\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
    expect(stdout).toContain('Spieler/-innen\tSpieler/in\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
  });

  test('Full mode: gendered determiners/pronouns (non-binary intended)', () => {
    const stdout = runCached(FULL_MODE_COMMAND);
    // jede*r → DET PIAT Gender=NonBin, lemma jede*r
    expect(stdout).toContain('jede*r\tjede*r\tDET\tPIAT\tGender=NonBin');
    const jederCount = countMatches(stdout, /jede\*r\tjede\*r/g);
    expect(jederCount).toBe(2); // appears in two sentences
    // eine*n → DET ART Gender=NonBin (non-binary via *)
    expect(stdout).toContain('eine*n\teine*n\tDET\tART\tGender=NonBin');
    // jede:r → DET PIAT Gender=NonBin
    expect(stdout).toContain('jede:r\tjede:r\tDET\tPIAT\tGender=NonBin');
    // die*der → DET ART Gender=NonBin (* marker → NonBin even for merged forms)
    expect(stdout).toContain('die*der\tdie*der\tDET\tART\tGender=NonBin');
  });

  test('Full mode: neo-pronoun (sie*er → PRON PPER Gender=NonBin)', () => {
    const stdout = runCached(FULL_MODE_COMMAND);
    expect(stdout).toContain('sie*er\tsie*er\tPRON\tPPER\tGender=NonBin');
  });

  test('Full mode: foundry comment changed to gender', () => {
    const stdout = runCached(FULL_MODE_COMMAND);
    const foundryCount = countMatches(stdout, /# foundry = gender/g);
    expect(foundryCount).toBe(10);
  });

  test('Full mode: non-gender tokens pass through unchanged', () => {
    const stdout = runCached(FULL_MODE_COMMAND);
    // Regular nouns are passed through, lemma stays _
    expect(stdout).toContain('Umbenennung\t_\tNOUN\tNN');
    expect(stdout).toContain('Ideen\t_\tNOUN\tNN');
  });

  test('Sparse mode: only annotated tokens are emitted', () => {
    const stdout = runCached(SPARSE_MODE_COMMAND);
    // Token lines start with a numeric ID followed by a tab.
    const tokenLines = stdout.split('\n').filter((line) => /^\d+\t/.test(line));
    // Every token line must carry a gender annotation:
    // either the lemma column (zero-based index 2) is non-underscore (noun/pron was annotated)
    // or the features column (zero-based index 5) is non-underscore (det was annotated)
    tokenLines.forEach((line) => {
      const cols = line.split('\t');
      const lemmaAnnotated = cols[2] !== '_';
      const featsAnnotated = cols[5] !== '_';
      expect(lemmaAnnotated || featsAnnotated).toBe(true);
    });
    // Count: 18 NOUN + 5 DET + 1 PRON = 24 annotated tokens
    expect(tokenLines.length).toBe(24);
  });

  test('Sparse mode: sentence headers are emitted for sentences with matches', () => {
    const stdout = runCached(SPARSE_MODE_COMMAND);
    // All 10 test sentences have at least one gender form
    const textIdCount = countMatches(stdout, /# text_id = /g);
    expect(textIdCount).toBe(10);
  });

  test('Inline input: basic Genderstern annotation', () => {
    // Fixture lines stay at column 0: leading whitespace would become part of the input.
    const testInput = `# foundry = base
# text_id = inline-001
# text = Die Lehrerin und Lehrer*innen kamen
1\tDie\t_\tDET\tART\t_\t_\t_\t_\t_
2\tLehrerin\t_\tNOUN\tNN\t_\t_\t_\t_\t_
3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
4\tLehrer*innen\t_\t_\t_\t_\t_\t_\t_\t_
5\tkamen\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_

`;
    // The input is fed to the CLI on stdin instead of being redirected from a file.
    const stdout = execSync('node src/index.js', { input: testInput }).toString();
    expect(stdout).toContain('Lehrer*innen\tLehrer*in\tNOUN\tNN\tGender=NonBin|Number=Plur');
    // Regular noun 'Lehrerin' must not be incorrectly tagged
    expect(stdout).toContain('Lehrerin\t_\tNOUN\tNN\t_');
  });

  test('Inline input: existing POS/lemma/feats are replaced for gender forms', () => {
    // Fixture lines stay at column 0: leading whitespace would become part of the input.
    const testInput = `# foundry = base
# text_id = inline-002
# text = jede Ärztin und jede*r Arzt*in
1\tjede\t_\tDET\tPIAT\tGender=Fem|Number=Sing\t_\t_\t_\t_
2\tÄrztin\tÄrztin\tNOUN\tNN\tGender=Fem|Number=Sing\t_\t_\t_\t_
3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
4\tjede*r\t_\tDET\tPIAT\t_\t_\t_\t_\t_
5\tArzt*in\t_\t_\t_\t_\t_\t_\t_\t_

`;
    const stdout = execSync('node src/index.js', { input: testInput }).toString();
    // jede*r: missing lemma and feats should be filled in
    expect(stdout).toContain('jede*r\tjede*r\tDET\tPIAT\tGender=NonBin');
    // Arzt*in (written without umlaut): lemma, POS and feats are all missing in the input
    expect(stdout).toContain('Arzt*in\tArzt*in\tNOUN\tNN\tGender=NonBin|Number=Sing');
    // Ärztin (regular moviertes Femininum, no gender marker): unchanged
    expect(stdout).toContain('Ärztin\tÄrztin\tNOUN\tNN\tGender=Fem|Number=Sing');
    // jede (without gender marker): unchanged
    expect(stdout).toContain('jede\t_\tDET\tPIAT\tGender=Fem|Number=Sing');
  });
});