| Marc Kupietz | b777f9d | 2026-03-07 09:26:20 +0100 | [diff] [blame] | 1 | const { execSync } = require('child_process'); |
| 2 | |
describe('conllu-gender', () => {
  // Shell commands that pipe the shared fixture file through the CLI.
  const FULL_MODE = 'node src/index.js < test/data/gender.conllu';
  const SPARSE_MODE = 'node src/index.js -s < test/data/gender.conllu';

  // stdout of fixture-based runs, keyed by the exact shell command.
  const fixtureOutput = new Map();

  /**
   * Runs a fixture-based shell command and returns its stdout as a string.
   *
   * The result is memoized per command so the many tests that inspect the same
   * run do not each spawn a new process. This assumes the CLI output depends
   * only on its arguments and stdin. A run that throws is not cached, so every
   * test that needs it retries and fails on its own.
   *
   * @param {string} command - Shell command line to execute.
   * @returns {string} Captured stdout of the command.
   */
  function runOnFixture(command) {
    if (!fixtureOutput.has(command)) {
      fixtureOutput.set(command, execSync(command).toString());
    }
    return fixtureOutput.get(command);
  }

  /**
   * Feeds CoNLL-U text to the CLI on stdin (full mode) and returns its stdout.
   *
   * @param {string} conllu - CoNLL-U input passed to the process as stdin.
   * @returns {string} Captured stdout of the CLI.
   */
  function annotate(conllu) {
    return execSync('node src/index.js', { input: conllu }).toString();
  }

  /**
   * Counts the matches of a global regular expression in a text.
   *
   * @param {string} text - Text to search.
   * @param {RegExp} pattern - Pattern to count; must carry the `g` flag.
   * @returns {number} Number of matches, 0 when there are none.
   */
  function countMatches(text, pattern) {
    return (text.match(pattern) || []).length;
  }

  test('Full mode: all gender-sensitive nouns (Genderstern singular)', () => {
    const stdout = runOnFixture(FULL_MODE);

    // Genderstern singular noun: Bürger*in → lemma Bürger*in, NOUN NN Gender=Fem,Masc,NonBin|Number=Sing
    expect(stdout).toContain('Bürger*in\tBürger*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
    // Long compound: Jugendpsychiater*in
    expect(stdout).toContain('Jugendpsychiater*in\tJugendpsychiater*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
    // Compound with umlaut base: Generalstaatsanwält*in
    expect(stdout).toContain('Generalstaatsanwält*in\tGeneralstaatsanwält*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
    // Nachfolger*in
    expect(stdout).toContain('Nachfolger*in\tNachfolger*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
    // Long compound without umlaut: Antragssteller*in
    expect(stdout).toContain('Antragssteller*in\tAntragssteller*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
  });

  test('Full mode: Genderstern plural nouns → lemma uses singular, Number=Plur', () => {
    const stdout = runOnFixture(FULL_MODE);
    // Fachärzt*innen → lemma Fachärzt*in
    expect(stdout).toContain('Fachärzt*innen\tFachärzt*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
  });

  test('Full mode: Doppelpunkt nouns', () => {
    const stdout = runOnFixture(FULL_MODE);
    // Doppelpunkt singular
    expect(stdout).toContain('Anhänger:in\tAnhänger:in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
    expect(stdout).toContain('Wirt:in\tWirt:in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
    // Doppelpunkt plural
    expect(stdout).toContain('Lehrer:innen\tLehrer:in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
    // Long compound with umlaut base
    expect(stdout).toContain('Menschenrechtsanwält:innen\tMenschenrechtsanwält:in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
  });

  test('Full mode: Unterstrich plural nouns', () => {
    const stdout = runOnFixture(FULL_MODE);
    expect(stdout).toContain('Autor_innen\tAutor_in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
  });

  test('Full mode: Binnen-I nouns (binary intended → Gender=Masc,Fem)', () => {
    const stdout = runOnFixture(FULL_MODE);
    // Binnen-I singular: ZeugIn → Gender=Masc,Fem|Number=Sing
    expect(stdout).toContain('ZeugIn\tZeugIn\tNOUN\tNN\tGender=Masc,Fem|Number=Sing');
    // Binnen-I singular: SchülerIn → Gender=Masc,Fem|Number=Sing
    expect(stdout).toContain('SchülerIn\tSchülerIn\tNOUN\tNN\tGender=Masc,Fem|Number=Sing');
    // Binnen-I plural: LehrerInnen → lemma LehrerIn, Number=Plur
    expect(stdout).toContain('LehrerInnen\tLehrerIn\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
  });

  test('Full mode: Klammern plural nouns (binary intended)', () => {
    const stdout = runOnFixture(FULL_MODE);
    expect(stdout).toContain('Schüler(innen)\tSchüler(in)\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
  });

  test('Full mode: Schrägstrich nouns', () => {
    const stdout = runOnFixture(FULL_MODE);
    // Simple slash
    expect(stdout).toContain('Autor/innen\tAutor/in\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
    // Slash with Ergänzungsstrich (/-innen): the lemma drops the hyphen
    expect(stdout).toContain('Autor/-innen\tAutor/in\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
    expect(stdout).toContain('Spieler/-innen\tSpieler/in\tNOUN\tNN\tGender=Masc,Fem|Number=Plur');
  });

  test('Full mode: gendered determiners/pronouns (non-binary intended)', () => {
    const stdout = runOnFixture(FULL_MODE);
    // jede*r → DET PIAT Gender=Fem,Masc,NonBin, lemma jede*r
    expect(stdout).toContain('jede*r\tjede*r\tDET\tPIAT\tGender=Fem,Masc,NonBin');
    const jederCount = countMatches(stdout, /jede\*r\tjede\*r/g);
    expect(jederCount).toBe(2); // appears in two sentences
    // eine*n → DET ART Gender=Fem,Masc,NonBin (non-binary via *)
    expect(stdout).toContain('eine*n\teine*n\tDET\tART\tGender=Fem,Masc,NonBin');
    // jede:r → DET PIAT Gender=Fem,Masc,NonBin
    expect(stdout).toContain('jede:r\tjede:r\tDET\tPIAT\tGender=Fem,Masc,NonBin');
    // die*der → DET ART Gender=Fem,Masc,NonBin (* marker → inclusive even for merged forms)
    expect(stdout).toContain('die*der\tdie*der\tDET\tART\tGender=Fem,Masc,NonBin');
  });

  test('Full mode: neo-pronoun (sie*er → PRON PPER Gender=Fem,Masc,NonBin)', () => {
    const stdout = runOnFixture(FULL_MODE);
    expect(stdout).toContain('sie*er\tsie*er\tPRON\tPPER\tGender=Fem,Masc,NonBin');
  });

  test('Full mode: neo-pronoun lexicon forms (PRON PPER Gender=Fem,Masc,NonBin|PronType=Prs)', () => {
    const stdout = runOnFixture(FULL_MODE);
    // sier: NOM of sier-paradigm
    expect(stdout).toContain('sier\tsier\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
    // xier: NOM of xier-paradigm
    expect(stdout).toContain('xier\txier\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
    // oj: NOM of oj-paradigm
    expect(stdout).toContain('oj\toj\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
    // el: NOM of el-paradigm
    expect(stdout).toContain('el\tel\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
    // hen: NOM of hen-paradigm
    expect(stdout).toContain('hen\then\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
  });

  test('Full mode: foundry comment changed to gender', () => {
    const stdout = runOnFixture(FULL_MODE);
    // The expected count is tied to the contents of test/data/gender.conllu.
    const foundryCount = countMatches(stdout, /# foundry = gender/g);
    expect(foundryCount).toBe(20);
  });

  test('Full mode: non-gender tokens pass through unchanged', () => {
    const stdout = runOnFixture(FULL_MODE);
    // Regular nouns are passed through, lemma stays _
    expect(stdout).toContain('Umbenennung\t_\tNOUN\tNN');
    expect(stdout).toContain('Ideen\t_\tNOUN\tNN');
  });

  test('Sparse mode: only annotated tokens are emitted', () => {
    const stdout = runOnFixture(SPARSE_MODE);
    // Token lines start with a numeric ID followed by a tab.
    const tokenLines = stdout.split('\n').filter((line) => /^\d+\t/.test(line));
    // Every token line must carry a gender annotation:
    // either the lemma column (0-based index 2) is non-underscore (noun/pron was annotated)
    // or the features column (0-based index 5) is non-underscore (det was annotated)
    tokenLines.forEach((line) => {
      const cols = line.split('\t');
      const lemmaAnnotated = cols[2] !== '_';
      const featsAnnotated = cols[5] !== '_';
      expect(lemmaAnnotated || featsAnnotated).toBe(true);
    });
    // Count: 18 NOUN + 5 DET + 1 PRON (sie*er) + 5 neo-pronouns (sier,xier,oj,el,hen)
    // + 7 new neo-pronouns from sentences 16–18 (el,em,ey,y,mensch,Mensch,xier) = 36
    expect(tokenLines.length).toBe(36);
  });

  test('Sparse mode: sentence headers are emitted for sentences with matches', () => {
    const stdout = runOnFixture(SPARSE_MODE);
    // 12 original + 3 new sentences (16–18) have at least one gender form
    const textIdCount = countMatches(stdout, /# text_id = /g);
    expect(textIdCount).toBe(15);
  });

  test('Inline input: basic Genderstern annotation', () => {
    const stdout = annotate(`# foundry = base
# text_id = inline-001
# text = Die Lehrerin und Lehrer*innen kamen
1\tDie\t_\tDET\tART\t_\t_\t_\t_\t_
2\tLehrerin\t_\tNOUN\tNN\t_\t_\t_\t_\t_
3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
4\tLehrer*innen\t_\t_\t_\t_\t_\t_\t_\t_
5\tkamen\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_

`);
    expect(stdout).toContain('Lehrer*innen\tLehrer*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
    // Regular noun 'Lehrerin' must not be incorrectly tagged
    expect(stdout).toContain('Lehrerin\t_\tNOUN\tNN\t_');
  });

  test('Inline input: existing POS/lemma/feats are replaced for gender forms', () => {
    const stdout = annotate(`# foundry = base
# text_id = inline-002
# text = jede Ärztin und jede*r Arzt*in
1\tjede\t_\tDET\tPIAT\tGender=Fem|Number=Sing\t_\t_\t_\t_
2\tÄrztin\tÄrztin\tNOUN\tNN\tGender=Fem|Number=Sing\t_\t_\t_\t_
3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
4\tjede*r\t_\tDET\tPIAT\t_\t_\t_\t_\t_
5\tArzt*in\t_\t_\t_\t_\t_\t_\t_\t_

`);
    // jede*r: input has DET/PIAT but no lemma or feats; both should be filled in
    expect(stdout).toContain('jede*r\tjede*r\tDET\tPIAT\tGender=Fem,Masc,NonBin');
    // Arzt*in: all annotation columns are empty in the input
    expect(stdout).toContain('Arzt*in\tArzt*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
    // Ärztin (regular moviertes Femininum, no gender marker): unchanged
    expect(stdout).toContain('Ärztin\tÄrztin\tNOUN\tNN\tGender=Fem|Number=Sing');
    // jede (without gender marker): unchanged
    expect(stdout).toContain('jede\t_\tDET\tPIAT\tGender=Fem|Number=Sing');
  });

  // ---------------------------------------------------------------------------
  // Regression tests: false-positive tokens that must NOT be tagged
  // ---------------------------------------------------------------------------

  test('No false positives: *, Y, per, EL, EM, Ey, sin mid-sentence pass through unchanged', () => {
    // Each of these appeared as spurious neo-pronoun matches in the original code.
    // They must not receive a neo-pronoun annotation.
    const stdout = annotate(`# foundry = base
# text_id = fp-001
# text = Hinweis auf * und Y sowie per Einschreiben
1\tHinweis\t_\tNOUN\tNN\t_\t_\t_\t_\t_
2\tauf\t_\tADP\tAPPR\t_\t_\t_\t_\t_
3\t*\t_\tPUNCT\t$(\t_\t_\t_\t_\t_
4\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
5\tY\t_\tNOUN\tNN\t_\t_\t_\t_\t_
6\tsowie\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
7\tper\t_\tADP\tAPPR\t_\t_\t_\t_\t_
8\tEinschreiben\t_\tNOUN\tNN\t_\t_\t_\t_\t_

# foundry = base
# text_id = fp-002
# text = Verweise auf EL EM Ey sin im Text
1\tVerweise\t_\tNOUN\tNN\t_\t_\t_\t_\t_
2\tauf\t_\tADP\tAPPR\t_\t_\t_\t_\t_
3\tEL\t_\tNOUN\tNN\t_\t_\t_\t_\t_
4\tEM\t_\tNOUN\tNN\t_\t_\t_\t_\t_
5\tEy\t_\tITJ\tITJ\t_\t_\t_\t_\t_
6\tsin\t_\tNOUN\tNN\t_\t_\t_\t_\t_
7\tim\t_\tADP\tAPPRART\t_\t_\t_\t_\t_
8\tText\t_\tNOUN\tNN\t_\t_\t_\t_\t_

`);
    // None of the false-positive tokens should receive a neo-pronoun annotation.
    // Each check pins the token's ID, form, untouched lemma ('_') and original
    // upos, so a rewrite to PRON would make it fail.
    expect(stdout).toContain('3\t*\t_\tPUNCT');
    expect(stdout).toContain('5\tY\t_\tNOUN');
    expect(stdout).toContain('7\tper\t_\tADP');
    expect(stdout).toContain('3\tEL\t_\tNOUN');
    expect(stdout).toContain('4\tEM\t_\tNOUN');
    expect(stdout).toContain('5\tEy\t_\tITJ');
    expect(stdout).toContain('6\tsin\t_\tNOUN');
  });

  test('No false positives from gender.conllu: et (in et al.) and their (English) pass through unchanged', () => {
    const stdout = runOnFixture(FULL_MODE);
    // et in "Müller et al." must not be tagged
    expect(stdout).toContain('2\tet\t_\tPART');
    // their in English quotation must not be tagged
    expect(stdout).toContain('2\ttheir\t_\tPRON');
  });

  test('No false positive: Mensch mid-sentence must not be tagged as neo-pronoun', () => {
    const stdout = annotate(`# foundry = base
# text_id = fp-003
# text = Jeder Mensch hat Würde
1\tJeder\t_\tDET\tPIAT\t_\t_\t_\t_\t_
2\tMensch\t_\tNOUN\tNN\t_\t_\t_\t_\t_
3\that\t_\tAUX\tVAFIN\t_\t_\t_\t_\t_
4\tWürde\t_\tNOUN\tNN\t_\t_\t_\t_\t_

`);
    // 'Mensch' at position 2 (not sentence-initial) must not be tagged.
    expect(stdout).toContain('2\tMensch\t_\tNOUN\tNN\t_');
  });

  test('Neo-pronoun: lowercase el, em, ey, y mid-sentence are still tagged', () => {
    const stdout = annotate(`# foundry = base
# text_id = neo-lc-001
# text = dankte el und em für ey und y
1\tdankte\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
2\tel\t_\t_\t_\t_\t_\t_\t_\t_
3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
4\tem\t_\t_\t_\t_\t_\t_\t_\t_
5\tfür\t_\tADP\tAPPR\t_\t_\t_\t_\t_
6\tey\t_\t_\t_\t_\t_\t_\t_\t_
7\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
8\ty\t_\t_\t_\t_\t_\t_\t_\t_

`);
    expect(stdout).toContain('el\tel\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
    expect(stdout).toContain('em\tem\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
    expect(stdout).toContain('ey\tey\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
    // NOTE(review): the lemma for 'y' is asserted as capital 'Y', unlike the
    // other forms whose lemma equals the token; confirm this is intentional.
    expect(stdout).toContain('y\tY\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
  });

  test('Neo-pronoun: mensch lowercase and sentence-initial Mensch are tagged', () => {
    const stdout = annotate(`# foundry = base
# text_id = neo-mensch-001
# text = mensch fragte und Mensch antwortete
1\tmensch\t_\t_\t_\t_\t_\t_\t_\t_
2\tfragte\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_
4\tMensch\t_\tNOUN\tNN\t_\t_\t_\t_\t_
5\tantwortete\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_

# foundry = base
# text_id = neo-mensch-002
# text = Mensch traf xier
1\tMensch\t_\t_\t_\t_\t_\t_\t_\t_
2\ttraf\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_
3\txier\t_\t_\t_\t_\t_\t_\t_\t_

`);
    // lowercase 'mensch' → neo-pronoun
    expect(stdout).toContain('1\tmensch\tmensch\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
    // 'Mensch' mid-sentence (position 4) → unchanged common noun
    expect(stdout).toContain('4\tMensch\t_\tNOUN\tNN\t_');
    // sentence-initial 'Mensch' (position 1) → neo-pronoun with lowercase lemma
    expect(stdout).toContain('1\tMensch\tmensch\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
  });
});