Use Gender=Fem,Masc,NonBin for non-binary gender markers (*, :, _) and neo-pronouns
Change-Id: I457814066595ec5b054de85f5cd068d1005fe65a
diff --git a/Readme.md b/Readme.md
index 277ad28..bf80b22 100644
--- a/Readme.md
+++ b/Readme.md
@@ -10,9 +10,9 @@
| Type | Examples | Intent | `Gender` feature |
|---|---|---|---|
-| Genderstern `*` | `Lehrer*in`, `Bürger*innen` | non-binary | `NonBin` |
-| Doppelpunkt `:` | `Lehrer:in`, `Bürger:innen` | non-binary | `NonBin` |
-| Unterstrich `_` | `Lehrer_in`, `Bürger_innen` | non-binary | `NonBin` |
+| Genderstern `*` | `Lehrer*in`, `Bürger*innen` | non-binary | `Fem,Masc,NonBin` |
+| Doppelpunkt `:` | `Lehrer:in`, `Bürger:innen` | non-binary | `Fem,Masc,NonBin` |
+| Unterstrich `_` | `Lehrer_in`, `Bürger_innen` | non-binary | `Fem,Masc,NonBin` |
| Binnen-I `I` | `LehrerIn`, `LehrerInnen` | binary | `Masc,Fem` |
| Klammern `()` | `Lehrer(in)`, `Lehrer(innen)` | binary | `Masc,Fem` |
| Schrägstrich `/` | `Lehrer/in`, `Lehrer/-innen` | binary | `Masc,Fem` |
@@ -23,11 +23,11 @@
| Surface form | Lemma | FEATS |
|---|---|---|
-| `Lehrer*in` | `Lehrer*in` | `Gender=NonBin\|Number=Sing` |
-| `Lehrer*innen` | `Lehrer*in` | `Gender=NonBin\|Number=Plur` |
-| `Lehrer:in` | `Lehrer:in` | `Gender=NonBin\|Number=Sing` |
-| `Lehrer:innen` | `Lehrer:in` | `Gender=NonBin\|Number=Plur` |
-| `Lehrer_in` | `Lehrer_in` | `Gender=NonBin\|Number=Sing` |
+| `Lehrer*in` | `Lehrer*in` | `Gender=Fem,Masc,NonBin\|Number=Sing` |
+| `Lehrer*innen` | `Lehrer*in` | `Gender=Fem,Masc,NonBin\|Number=Plur` |
+| `Lehrer:in` | `Lehrer:in` | `Gender=Fem,Masc,NonBin\|Number=Sing` |
+| `Lehrer:innen` | `Lehrer:in` | `Gender=Fem,Masc,NonBin\|Number=Plur` |
+| `Lehrer_in` | `Lehrer_in` | `Gender=Fem,Masc,NonBin\|Number=Sing` |
| `LehrerIn` | `LehrerIn` | `Gender=Masc,Fem\|Number=Sing` |
| `LehrerInnen` | `LehrerIn` | `Gender=Masc,Fem\|Number=Plur` |
| `Lehrer(in)` | `Lehrer(in)` | `Gender=Masc,Fem\|Number=Sing` |
@@ -42,24 +42,24 @@
| Surface form | Lemma | FEATS |
|---|---|---|
-| `jede*r` | `jede*r` | `Gender=NonBin` |
-| `jede:r` | `jede:r` | `Gender=NonBin` |
-| `eine*n` | `eine*n` | `Gender=NonBin` |
-| `kein_e` | `kein_e` | `Gender=NonBin` |
+| `jede*r` | `jede*r` | `Gender=Fem,Masc,NonBin` |
+| `jede:r` | `jede:r` | `Gender=Fem,Masc,NonBin` |
+| `eine*n` | `eine*n` | `Gender=Fem,Masc,NonBin` |
+| `kein_e` | `kein_e` | `Gender=Fem,Masc,NonBin` |
| `die/der` | `die/der` | `Gender=Masc,Fem` |
-Non-binary markers (`*`, `:`, `_`) yield `Gender=NonBin`; Schrägstrich (`/`) yields `Gender=Masc,Fem`.
+Non-binary markers (`*`, `:`, `_`) yield `Gender=Fem,Masc,NonBin`; Schrägstrich (`/`) yields `Gender=Masc,Fem`.
### Pronouns / Neo-pronouns (`PRON` / `PPER`)
-**Merged pronoun pairs** with gender markers receive `Gender=NonBin`:
+**Merged pronoun pairs** joined by a non-binary marker (`*`, `:`, `_`) receive `Gender=Fem,Masc,NonBin`:
| Surface form | Lemma | FEATS |
|---|---|---|
-| `sie*er` | `sie*er` | `Gender=NonBin` |
-| `er:sie` | `er:sie` | `Gender=NonBin` |
+| `sie*er` | `sie*er` | `Gender=Fem,Masc,NonBin` |
+| `er:sie` | `er:sie` | `Gender=Fem,Masc,NonBin` |
-**Lexicon-based neo-pronouns** are matched by exact surface form (case-insensitive) against a built-in lexicon sourced from [pronomen.net](https://pronomen.net/beliebige:neopronomen). All forms are tagged `PRON PPER` with `Gender=NonBin|PronType=Prs`; the lemma is the nominative form.
+**Lexicon-based neo-pronouns** are matched by exact surface form (case-insensitive) against a built-in lexicon sourced from [pronomen.net](https://pronomen.net/beliebige:neopronomen). All forms are tagged `PRON PPER` with `Gender=Fem,Masc,NonBin|PronType=Prs`; the lemma is the nominative form.
#### Verschmelzung (blend pronouns)
diff --git a/src/index.js b/src/index.js
index 38f21b5..567e80e 100644
--- a/src/index.js
+++ b/src/index.js
@@ -72,7 +72,7 @@
// Maps lowercased surface form → { lemma, upos, xpos, feats }.
//
// Lemma: nominative form as listed on pronomen.net.
-// UPOS: PRON | XPOS: PPER | FEATS: Gender=NonBin|PronType=Prs
+// UPOS: PRON | XPOS: PPER | FEATS: Gender=Fem,Masc,NonBin|PronType=Prs
//
// Excluded (too ambiguous with standard German words):
// 'dem' – dative definite article / demonstrative pronoun
@@ -91,7 +91,7 @@
// ---------------------------------------------------------------------------
function neoPron(lemma) {
- return { lemma, upos: 'PRON', xpos: 'PPER', feats: 'Gender=NonBin|PronType=Prs' };
+ return { lemma, upos: 'PRON', xpos: 'PPER', feats: 'Gender=Fem,Masc,NonBin|PronType=Prs' };
}
const NEO_PRONOUN_FORMS = new Map([
@@ -261,7 +261,7 @@
const feats = [];
if (genderIsNonBinary) {
- feats.push('Gender=NonBin');
+ feats.push('Gender=Fem,Masc,NonBin');
} else if (genderIsBinary) {
feats.push('Gender=Masc,Fem');
}
@@ -449,7 +449,7 @@
lemma: detBase + marker + ending,
upos: 'DET',
xpos: inferDetXpos(detBase),
- feats: 'Gender=NonBin',
+ feats: 'Gender=Fem,Masc,NonBin',
};
}
@@ -476,7 +476,7 @@
upos: 'PRON',
xpos: inferPronXpos(pron1),
feats: markerType === 'star' || markerType === 'colon' || markerType === 'underscore'
- ? 'Gender=NonBin' : 'Gender=Masc,Fem',
+ ? 'Gender=Fem,Masc,NonBin' : 'Gender=Masc,Fem',
};
}
diff --git a/test/test.js b/test/test.js
index 94391a3..31a39e2 100644
--- a/test/test.js
+++ b/test/test.js
@@ -6,41 +6,41 @@
const command = 'node src/index.js < test/data/gender.conllu';
const stdout = execSync(command).toString();
- // Genderstern singular noun: Bürger*in → lemma Bürger*in, NOUN NN Gender=NonBin|Number=Sing
- expect(stdout).toContain('Bürger*in\tBürger*in\tNOUN\tNN\tGender=NonBin|Number=Sing');
+ // Genderstern singular noun: Bürger*in → lemma Bürger*in, NOUN NN Gender=Fem,Masc,NonBin|Number=Sing
+ expect(stdout).toContain('Bürger*in\tBürger*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
// Long compound: Jugendpsychiater*in
- expect(stdout).toContain('Jugendpsychiater*in\tJugendpsychiater*in\tNOUN\tNN\tGender=NonBin|Number=Sing');
+ expect(stdout).toContain('Jugendpsychiater*in\tJugendpsychiater*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
// Compound with umlaut base: Generalstaatsanwält*in
- expect(stdout).toContain('Generalstaatsanwält*in\tGeneralstaatsanwält*in\tNOUN\tNN\tGender=NonBin|Number=Sing');
+ expect(stdout).toContain('Generalstaatsanwält*in\tGeneralstaatsanwält*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
// Nachfolger*in
- expect(stdout).toContain('Nachfolger*in\tNachfolger*in\tNOUN\tNN\tGender=NonBin|Number=Sing');
+ expect(stdout).toContain('Nachfolger*in\tNachfolger*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
// With umlaut base: Antragssteller*in (no umlaut but long compound)
- expect(stdout).toContain('Antragssteller*in\tAntragssteller*in\tNOUN\tNN\tGender=NonBin|Number=Sing');
+ expect(stdout).toContain('Antragssteller*in\tAntragssteller*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
});
test('Full mode: Genderstern plural nouns → lemma uses singular, Number=Plur', () => {
const command = 'node src/index.js < test/data/gender.conllu';
const stdout = execSync(command).toString();
// Fachärzt*innen → lemma Fachärzt*in
- expect(stdout).toContain('Fachärzt*innen\tFachärzt*in\tNOUN\tNN\tGender=NonBin|Number=Plur');
+ expect(stdout).toContain('Fachärzt*innen\tFachärzt*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
});
test('Full mode: Doppelpunkt nouns', () => {
const command = 'node src/index.js < test/data/gender.conllu';
const stdout = execSync(command).toString();
// Doppelpunkt singular
- expect(stdout).toContain('Anhänger:in\tAnhänger:in\tNOUN\tNN\tGender=NonBin|Number=Sing');
- expect(stdout).toContain('Wirt:in\tWirt:in\tNOUN\tNN\tGender=NonBin|Number=Sing');
+ expect(stdout).toContain('Anhänger:in\tAnhänger:in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
+ expect(stdout).toContain('Wirt:in\tWirt:in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
// Doppelpunkt plural
- expect(stdout).toContain('Lehrer:innen\tLehrer:in\tNOUN\tNN\tGender=NonBin|Number=Plur');
+ expect(stdout).toContain('Lehrer:innen\tLehrer:in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
// Long compound with umlaut base
- expect(stdout).toContain('Menschenrechtsanwält:innen\tMenschenrechtsanwält:in\tNOUN\tNN\tGender=NonBin|Number=Plur');
+ expect(stdout).toContain('Menschenrechtsanwält:innen\tMenschenrechtsanwält:in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
});
test('Full mode: Unterstrich plural nouns', () => {
const command = 'node src/index.js < test/data/gender.conllu';
const stdout = execSync(command).toString();
- expect(stdout).toContain('Autor_innen\tAutor_in\tNOUN\tNN\tGender=NonBin|Number=Plur');
+ expect(stdout).toContain('Autor_innen\tAutor_in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
});
test('Full mode: Binnen-I nouns (binary intended → Gender=Masc,Fem)', () => {
@@ -73,37 +73,37 @@
test('Full mode: gendered determiners/pronouns (non-binary intended)', () => {
const command = 'node src/index.js < test/data/gender.conllu';
const stdout = execSync(command).toString();
- // jede*r → DET PIAT Gender=NonBin, lemma jede*r
- expect(stdout).toContain('jede*r\tjede*r\tDET\tPIAT\tGender=NonBin');
+ // jede*r → DET PIAT Gender=Fem,Masc,NonBin, lemma jede*r
+ expect(stdout).toContain('jede*r\tjede*r\tDET\tPIAT\tGender=Fem,Masc,NonBin');
const jeder_count = (stdout.match(/jede\*r\tjede\*r/g) || []).length;
expect(jeder_count).toBe(2); // appears in two sentences
- // eine*n → DET ART Gender=NonBin (non-binary via *)
- expect(stdout).toContain('eine*n\teine*n\tDET\tART\tGender=NonBin');
- // jede:r → DET PIAT Gender=NonBin
- expect(stdout).toContain('jede:r\tjede:r\tDET\tPIAT\tGender=NonBin');
- // die*der → DET ART Gender=NonBin (* marker → NonBin even for merged forms)
- expect(stdout).toContain('die*der\tdie*der\tDET\tART\tGender=NonBin');
+ // eine*n → DET ART Gender=Fem,Masc,NonBin (non-binary via *)
+ expect(stdout).toContain('eine*n\teine*n\tDET\tART\tGender=Fem,Masc,NonBin');
+ // jede:r → DET PIAT Gender=Fem,Masc,NonBin
+ expect(stdout).toContain('jede:r\tjede:r\tDET\tPIAT\tGender=Fem,Masc,NonBin');
+ // die*der → DET ART Gender=Fem,Masc,NonBin (* marker → inclusive even for merged forms)
+ expect(stdout).toContain('die*der\tdie*der\tDET\tART\tGender=Fem,Masc,NonBin');
});
- test('Full mode: neo-pronoun (sie*er → PRON PPER Gender=NonBin)', () => {
+ test('Full mode: neo-pronoun (sie*er → PRON PPER Gender=Fem,Masc,NonBin)', () => {
const command = 'node src/index.js < test/data/gender.conllu';
const stdout = execSync(command).toString();
- expect(stdout).toContain('sie*er\tsie*er\tPRON\tPPER\tGender=NonBin');
+ expect(stdout).toContain('sie*er\tsie*er\tPRON\tPPER\tGender=Fem,Masc,NonBin');
});
- test('Full mode: neo-pronoun lexicon forms (PRON PPER Gender=NonBin|PronType=Prs)', () => {
+ test('Full mode: neo-pronoun lexicon forms (PRON PPER Gender=Fem,Masc,NonBin|PronType=Prs)', () => {
const command = 'node src/index.js < test/data/gender.conllu';
const stdout = execSync(command).toString();
// sier: NOM of sier-paradigm
- expect(stdout).toContain('sier\tsier\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+ expect(stdout).toContain('sier\tsier\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
// xier: NOM of xier-paradigm
- expect(stdout).toContain('xier\txier\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+ expect(stdout).toContain('xier\txier\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
// oj: NOM of oj-paradigm
- expect(stdout).toContain('oj\toj\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+ expect(stdout).toContain('oj\toj\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
// el: NOM of el-paradigm
- expect(stdout).toContain('el\tel\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+ expect(stdout).toContain('el\tel\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
// hen: NOM of hen-paradigm
- expect(stdout).toContain('hen\then\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+ expect(stdout).toContain('hen\then\tPRON\tPPER\tGender=Fem,Masc,NonBin|PronType=Prs');
});
test('Full mode: foundry comment changed to gender', () => {
@@ -159,7 +159,7 @@
`;
const stdout = execSync('node src/index.js', { input: testInput }).toString();
- expect(stdout).toContain('Lehrer*innen\tLehrer*in\tNOUN\tNN\tGender=NonBin|Number=Plur');
+ expect(stdout).toContain('Lehrer*innen\tLehrer*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Plur');
// Regular noun 'Lehrerin' must not be incorrectly tagged
expect(stdout).toContain('Lehrerin\t_\tNOUN\tNN\t_');
});
@@ -177,9 +177,9 @@
`;
const stdout = execSync('node src/index.js', { input: testInput }).toString();
// jede*r: missing feats should be filled in
- expect(stdout).toContain('jede*r\tjede*r\tDET\tPIAT\tGender=NonBin');
+ expect(stdout).toContain('jede*r\tjede*r\tDET\tPIAT\tGender=Fem,Masc,NonBin');
// Arzt*in: umlaut base, missing everything
- expect(stdout).toContain('Arzt*in\tArzt*in\tNOUN\tNN\tGender=NonBin|Number=Sing');
+ expect(stdout).toContain('Arzt*in\tArzt*in\tNOUN\tNN\tGender=Fem,Masc,NonBin|Number=Sing');
// Ärztin (regular moviertes Femininum, no gender marker): unchanged
expect(stdout).toContain('Ärztin\tÄrztin\tNOUN\tNN\tGender=Fem|Number=Sing');
// jede (without gender marker): unchanged