| const { execSync } = require('child_process'); |
| |
/**
 * End-to-end tests for the gender annotator CLI (`src/index.js`).
 *
 * Every test feeds CoNLL-U text to the CLI on stdin and asserts on the
 * tab-separated token lines printed on stdout. The fragments checked have the
 * shape FORM \t LEMMA \t UPOS \t XPOS \t FEATS.
 *
 * The commands use relative paths, so the suite has to be started from the
 * directory that contains `src/` and `test/`.
 */
describe('conllu-gender', () => {
  // Shell commands that pipe the shared fixture file through the CLI.
  const FULL_MODE_COMMAND = 'node src/index.js < test/data/gender.conllu';
  const SPARSE_MODE_COMMAND = 'node src/index.js -s < test/data/gender.conllu';

  // stdout of every command that has already completed successfully.
  const outputByCommand = new Map();

  /**
   * Runs a shell command and returns its stdout as a string.
   *
   * Results are memoized per command, so the CLI is spawned once per distinct
   * command instead of once per test. This relies on the CLI output being
   * deterministic for a fixed input. A run that throws is not cached, so each
   * test that needs that output fails with the underlying error itself.
   *
   * @param {string} command - Shell command line to execute.
   * @returns {string} Captured stdout of the command.
   */
  const run = (command) => {
    if (!outputByCommand.has(command)) {
      outputByCommand.set(command, execSync(command).toString());
    }
    return outputByCommand.get(command);
  };

  /**
   * Sends CoNLL-U text to the CLI (full mode) on stdin.
   *
   * @param {string} input - CoNLL-U document.
   * @returns {string} Captured stdout of the CLI.
   */
  const annotate = (input) => execSync('node src/index.js', { input }).toString();

  /**
   * Builds a CoNLL-U sentence block: the given lines joined by newlines,
   * followed by the empty line that terminates the sentence.
   *
   * @param {string[]} lines - Comment and token lines, in order.
   * @returns {string} The sentence block.
   */
  const sentence = (lines) => `${lines.join('\n')}\n\n`;

  /**
   * Counts the matches of a global regular expression in a text.
   *
   * @param {string} text - Text to search.
   * @param {RegExp} pattern - Pattern carrying the `g` flag.
   * @returns {number} Number of matches (0 when there are none).
   */
  const countMatches = (text, pattern) => (text.match(pattern) || []).length;

  /**
   * Asserts that every fragment occurs in the output, in the order given.
   *
   * @param {string} stdout - CLI output.
   * @param {string[]} fragments - Expected substrings.
   */
  const expectAllContained = (stdout, fragments) => {
    for (const fragment of fragments) {
      expect(stdout).toContain(fragment);
    }
  };

  test('Full mode: all gender-sensitive nouns (Genderstern singular)', () => {
    expectAllContained(run(FULL_MODE_COMMAND), [
      // Bürger*in → lemma Bürger*in, NOUN NN Gender=NonBin|Number=Sing
      'Bürger*in\tBürger*in\tNOUN\tNN\tGender=NonBin|Number=Sing',
      // Long compound
      'Jugendpsychiater*in\tJugendpsychiater*in\tNOUN\tNN\tGender=NonBin|Number=Sing',
      // Compound whose base carries an umlaut
      'Generalstaatsanwält*in\tGeneralstaatsanwält*in\tNOUN\tNN\tGender=NonBin|Number=Sing',
      'Nachfolger*in\tNachfolger*in\tNOUN\tNN\tGender=NonBin|Number=Sing',
      // Long compound without umlaut
      'Antragssteller*in\tAntragssteller*in\tNOUN\tNN\tGender=NonBin|Number=Sing',
    ]);
  });

  test('Full mode: Genderstern plural nouns → lemma uses singular, Number=Plur', () => {
    // Fachärzt*innen → lemma Fachärzt*in
    expect(run(FULL_MODE_COMMAND)).toContain(
      'Fachärzt*innen\tFachärzt*in\tNOUN\tNN\tGender=NonBin|Number=Plur',
    );
  });

  test('Full mode: Doppelpunkt nouns', () => {
    expectAllContained(run(FULL_MODE_COMMAND), [
      // Singular
      'Anhänger:in\tAnhänger:in\tNOUN\tNN\tGender=NonBin|Number=Sing',
      'Wirt:in\tWirt:in\tNOUN\tNN\tGender=NonBin|Number=Sing',
      // Plural
      'Lehrer:innen\tLehrer:in\tNOUN\tNN\tGender=NonBin|Number=Plur',
      // Long compound whose base carries an umlaut
      'Menschenrechtsanwält:innen\tMenschenrechtsanwält:in\tNOUN\tNN\tGender=NonBin|Number=Plur',
    ]);
  });

  test('Full mode: Unterstrich plural nouns', () => {
    expect(run(FULL_MODE_COMMAND)).toContain(
      'Autor_innen\tAutor_in\tNOUN\tNN\tGender=NonBin|Number=Plur',
    );
  });

  test('Full mode: Binnen-I nouns (binary intended → Gender=Masc,Fem)', () => {
    expectAllContained(run(FULL_MODE_COMMAND), [
      // Singular: ZeugIn → Gender=Masc,Fem|Number=Sing
      'ZeugIn\tZeugIn\tNOUN\tNN\tGender=Masc,Fem|Number=Sing',
      // Singular: SchülerIn → Gender=Masc,Fem|Number=Sing
      'SchülerIn\tSchülerIn\tNOUN\tNN\tGender=Masc,Fem|Number=Sing',
      // Plural: LehrerInnen → lemma LehrerIn, Number=Plur
      'LehrerInnen\tLehrerIn\tNOUN\tNN\tGender=Masc,Fem|Number=Plur',
    ]);
  });

  test('Full mode: Klammern plural nouns (binary intended)', () => {
    expect(run(FULL_MODE_COMMAND)).toContain(
      'Schüler(innen)\tSchüler(in)\tNOUN\tNN\tGender=Masc,Fem|Number=Plur',
    );
  });

  test('Full mode: Schrägstrich nouns', () => {
    expectAllContained(run(FULL_MODE_COMMAND), [
      // Simple slash
      'Autor/innen\tAutor/in\tNOUN\tNN\tGender=Masc,Fem|Number=Plur',
      // Slash with Ergänzungsstrich (/-innen): the hyphen is dropped in the lemma
      'Autor/-innen\tAutor/in\tNOUN\tNN\tGender=Masc,Fem|Number=Plur',
      'Spieler/-innen\tSpieler/in\tNOUN\tNN\tGender=Masc,Fem|Number=Plur',
    ]);
  });

  test('Full mode: gendered determiners/pronouns (non-binary intended)', () => {
    const stdout = run(FULL_MODE_COMMAND);
    // jede*r → DET PIAT Gender=NonBin, lemma jede*r
    expect(stdout).toContain('jede*r\tjede*r\tDET\tPIAT\tGender=NonBin');
    // jede*r appears in two sentences of the fixture
    expect(countMatches(stdout, /jede\*r\tjede\*r/g)).toBe(2);
    expectAllContained(stdout, [
      // eine*n → DET ART Gender=NonBin (non-binary via *)
      'eine*n\teine*n\tDET\tART\tGender=NonBin',
      // jede:r → DET PIAT Gender=NonBin
      'jede:r\tjede:r\tDET\tPIAT\tGender=NonBin',
      // die*der → DET ART Gender=NonBin (* marker → NonBin even for merged forms)
      'die*der\tdie*der\tDET\tART\tGender=NonBin',
    ]);
  });

  test('Full mode: neo-pronoun (sie*er → PRON PPER Gender=NonBin)', () => {
    expect(run(FULL_MODE_COMMAND)).toContain('sie*er\tsie*er\tPRON\tPPER\tGender=NonBin');
  });

  test('Full mode: foundry comment changed to gender', () => {
    const foundryCount = countMatches(run(FULL_MODE_COMMAND), /# foundry = gender/g);
    expect(foundryCount).toBe(10);
  });

  test('Full mode: non-gender tokens pass through unchanged', () => {
    // Regular nouns are passed through, lemma stays _
    expectAllContained(run(FULL_MODE_COMMAND), [
      'Umbenennung\t_\tNOUN\tNN',
      'Ideen\t_\tNOUN\tNN',
    ]);
  });

  test('Sparse mode: only annotated tokens are emitted', () => {
    // Token lines start with a numeric ID followed by a tab.
    const tokenLines = run(SPARSE_MODE_COMMAND)
      .split('\n')
      .filter((line) => /^\d+\t/.test(line));
    // Every token line must carry a gender annotation:
    // either the lemma column (index 2) is non-underscore (noun/pron was annotated)
    // or the features column (index 5) is non-underscore (det was annotated)
    for (const line of tokenLines) {
      const cols = line.split('\t');
      const lemmaAnnotated = cols[2] !== '_';
      const featsAnnotated = cols[5] !== '_';
      expect(lemmaAnnotated || featsAnnotated).toBe(true);
    }
    // Count: 18 NOUN + 5 DET + 1 PRON = 24 annotated tokens
    expect(tokenLines.length).toBe(24);
  });

  test('Sparse mode: sentence headers are emitted for sentences with matches', () => {
    // All 10 test sentences have at least one gender form
    const textIdCount = countMatches(run(SPARSE_MODE_COMMAND), /# text_id = /g);
    expect(textIdCount).toBe(10);
  });

  test('Inline input: basic Genderstern annotation', () => {
    const testInput = sentence([
      '# foundry = base',
      '# text_id = inline-001',
      '# text = Die Lehrerin und Lehrer*innen kamen',
      '1\tDie\t_\tDET\tART\t_\t_\t_\t_\t_',
      '2\tLehrerin\t_\tNOUN\tNN\t_\t_\t_\t_\t_',
      '3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_',
      '4\tLehrer*innen\t_\t_\t_\t_\t_\t_\t_\t_',
      '5\tkamen\t_\tVERB\tVVFIN\t_\t_\t_\t_\t_',
    ]);
    const stdout = annotate(testInput);
    expect(stdout).toContain('Lehrer*innen\tLehrer*in\tNOUN\tNN\tGender=NonBin|Number=Plur');
    // Regular noun 'Lehrerin' must not be incorrectly tagged
    expect(stdout).toContain('Lehrerin\t_\tNOUN\tNN\t_');
  });

  // NOTE: despite the title, the gender forms in this input carry no lemma or
  // feats to replace. The assertions cover two things: missing columns of
  // gender forms are filled in, and already-annotated tokens without a gender
  // marker are left untouched.
  test('Inline input: existing POS/lemma/feats are replaced for gender forms', () => {
    const testInput = sentence([
      '# foundry = base',
      '# text_id = inline-002',
      '# text = jede Ärztin und jede*r Arzt*in',
      '1\tjede\t_\tDET\tPIAT\tGender=Fem|Number=Sing\t_\t_\t_\t_',
      '2\tÄrztin\tÄrztin\tNOUN\tNN\tGender=Fem|Number=Sing\t_\t_\t_\t_',
      '3\tund\t_\tCCONJ\tKON\t_\t_\t_\t_\t_',
      '4\tjede*r\t_\tDET\tPIAT\t_\t_\t_\t_\t_',
      '5\tArzt*in\t_\t_\t_\t_\t_\t_\t_\t_',
    ]);
    const stdout = annotate(testInput);
    expectAllContained(stdout, [
      // jede*r: missing lemma and feats are filled in
      'jede*r\tjede*r\tDET\tPIAT\tGender=NonBin',
      // Arzt*in: lemma, POS and feats are all missing in the input
      'Arzt*in\tArzt*in\tNOUN\tNN\tGender=NonBin|Number=Sing',
      // Ärztin (regular moviertes Femininum, no gender marker): unchanged
      'Ärztin\tÄrztin\tNOUN\tNN\tGender=Fem|Number=Sing',
      // jede (without gender marker): unchanged
      'jede\t_\tDET\tPIAT\tGender=Fem|Number=Sing',
    ]);
  });
});