Add neo pronoun support
diff --git a/Readme.md b/Readme.md
index a9f25c5..86f6df0 100644
--- a/Readme.md
+++ b/Readme.md
@@ -56,19 +56,68 @@
### Pronouns / Neo-pronouns (`PRON` / `PPER`)
-Merged pronoun pairs with gender markers are annotated:
+**Merged pronoun pairs** with gender markers receive `Gender=NonBin`:
| Surface form | Lemma | FEATS |
|---|---|---|
| `sie*er` | `sie*er` | `Gender=NonBin` |
| `er:sie` | `er:sie` | `Gender=NonBin` |
+**Lexicon-based neo-pronouns** are matched by exact surface form (case-insensitive) against a built-in lexicon sourced from [pronomen.net](https://pronomen.net/beliebige:neopronomen). All forms are tagged `PRON PPER` with `Gender=NonBin|PronType=Prs`; the lemma is the nominative form.
+
+#### Verschmelzung (blend pronouns)
+
+| Paradigm (NOM/DAT) | NOM | GEN | DAT | ACC |
+|---|---|---|---|---|
+| sier/siem | `sier` | `sies` | `siem` | `sien` |
+| xier/xiem | `xier` | `xies` | `xiem` | `xien` |
+| ersie/ihmihr | `ersie` | `seinihr` | `ihmihr` | `ihnsie` |
+
+#### They-ähnlich
+
+| Paradigm | NOM | GEN | DAT | ACC |
+|---|---|---|---|---|
+| dej/denen/dej | `dej` | _(deren)_ | _(denen)_ | `dej` |
+| dey/denen/dem | `dey` | _(deren)_ | _(denen)_ | _(dem)_ |
+| dey/denen/demm | `dey` | _(deren)_ | _(denen)_ | `demm` |
+| ey/emm | `ey` | `eys` | `emm` | `emm` |
+| they/them | `they` | `their` | `them` | `them` |
+
+Forms in italics are excluded from the lexicon because they are homonymous with standard German words (`deren`, `denen`, `dem`).
+
+#### Neuer Stamm (new-stem pronouns)
+
+| Paradigm | NOM | GEN | DAT | ACC |
+|---|---|---|---|---|
+| el/em | `el` | `ems` | `em` | `en` |
+| em/em | `em` | `ems` | `em` | `em` |
+| en/en | `en` | `enses` | `en` | `en` |
+| en/em | `en` | `ens` | `em` | `en` |
+| ens/ens | `ens` | `ens` | `ens` | `ens` |
+| et/siem | `et` | `sier` | `siem` | `sien` |
+| ex/ex | `ex` | `ex` | `ex` | `ex` |
+| hän/sim | `hän` | `sir` | `sim` | `sin` |
+| hen/hem | `hen` | `hens` | `hem` | `hen` |
+| hie/hiem | `hie` | `hein` | `hiem` | `hie` |
+| iks/iks | `iks` | `ikses` | `iks` | `iks` |
+| ind/inde | `ind` | `inds` | `inde` | `ind` |
+| mensch/mensch | `mensch` | `menschs` | `mensch` | `mensch` |
+| nin/nim | `nin` | `nims` | `nim` | `nin` |
+| oj/ojm | `oj` | `juj` | `ojm` | `ojn` |
+| per/per | `per` | `pers` | `per` | `per` |
+| ser/sem | `ser` | `ses` | `sem` | `sen` |
+| Y/Y | `Y` | `Ys` | `Y` | `Y` |
+| zet/zerm | `zet` | `zets` | `zerm` | `zern` |
+| \*/\* (Stern) | `*` | `*s` | `*` | `*` |
+
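+Note: oblique forms of `et/siem` (`sier`, `siem`, `sien`) are shared with the `sier` paradigm and annotated with lemma `sier`. Likewise, `em`, `ems`, `en` and `ens` occur in more than one paradigm; each is listed once in the lexicon and receives the lemma `em`, `em`, `en` and `ens` respectively, regardless of the paradigm it was used in.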
+
### Known limitations
- **Binnen-I with non-final capital** (e.g. `jedEn`, `jedEr`): these forms embed the capital letter at a non-final position; detection requires morphological analysis beyond simple pattern matching and is not currently supported.
- **Gendered adjectives** (e.g. `begeisterte*n`): not yet annotated (occur in ~5 % of gendered NP elements per Ochs 2026, §7.3.2).
- **Inflected case suffixes** on gendered nouns (e.g. genitive `Lehrers*in`, dative plural extra marking): rare and not detected.
-- **Completely novel neo-pronouns** (e.g. `dier`, `xier`) that do not follow a known pattern cannot be detected by regular expressions.
+- **Ambiguous neo-pronoun forms**: `dem`, `deren`, `denen` are excluded from the neo-pronoun lexicon as they are indistinguishable from standard German determiners/pronouns without syntactic context. `per` is included despite its use as a preposition.
## Usage
@@ -121,6 +170,6 @@
## References
-Ochs, S. & Rüdiger, J. O. (2025). Of stars and colons: A corpus-based analysis of gender-inclusive orthographies in German press texts. In D. Schmitz, S. D. Stein & V. Schneider (Eds.), *Linguistic Intersections of Language and Gender* (pp. 31–62). Düsseldorf: düsseldorf university press. https://doi.org/10.1515/9783111388694-003
+Ochs, Samira/Rüdiger, Jan Oliver (2025): Of stars and colons: A corpus-based analysis of gender-inclusive orthographies in German press texts. In: Schmitz, Dominic/Stein, Simon David/Schneider, Viktoria (eds.): Linguistic intersections of language and gender. Of gender bias and gender fairness. Berlin/Boston: De Gruyter, pp. 31–62. https://doi.org/10.1515/9783111388694.
-Ochs, S. (2026). Die morphosyntaktische Integration neuer Gendersuffixe: Eine korpusbasierte Analyse deutschsprachiger Pressetexte. *Gender Linguistics*, 2. https://doi.org/10.65020/0619d927
+Ochs, Samira (2026): Die morphosyntaktische Integration neuer Gendersuffixe: Eine korpusbasierte Analyse deutschsprachiger Pressetexte. *Gender Linguistics*, 2. https://doi.org/10.65020/0619d927
diff --git a/src/index.js b/src/index.js
index e2f22aa..38f21b5 100644
--- a/src/index.js
+++ b/src/index.js
@@ -68,6 +68,142 @@
const neopronGenderStarPairRegex = /^(sie|er|ihr|ihn?|ihm?|dich|sich|mich|mir|uns|euch|ihnen|seinen?|ihrem?|deren?|denen)([*:_])(sie|er|ihr|ihn?|ihm?|dich|sich|mich|mir|uns|euch|ihnen|seinen?|ihrem?|deren?|denen)$/i;
// ---------------------------------------------------------------------------
+// Neo-pronoun lexicon (source: pronomen.net/beliebige:neopronomen)
+// Maps lowercased surface form → { lemma, upos, xpos, feats }.
+//
+// Lemma: nominative form as listed on pronomen.net.
+// UPOS: PRON | XPOS: PPER | FEATS: Gender=NonBin|PronType=Prs
+//
+// Excluded (too ambiguous with standard German words):
+// 'dem' – dative definite article / demonstrative pronoun
+// 'deren' – relative/demonstrative genitive pronoun
+// 'denen' – relative/demonstrative dative pronoun
+//
+// Kept in the lexicon despite ambiguity:
+// 'per' – also a common German preposition (e.g. 'per E-Mail')
+// 'pers' – GEN of per/per; kept together with 'per'
+//
+// Shared/ambiguous oblique forms:
+// 'sier','siem','sien' – NOM/DAT/ACC of sier-paradigm; also GEN/DAT/ACC of
+// et/siem-paradigm (both annotated with lemma 'sier')
+// 'em' – NOM of em/em-paradigm; also DAT of el/em and en/em
+// 'ems' – GEN of both el/em and em/em (annotated as lemma 'em')
+// 'en' – NOM/DAT/ACC of en/en; NOM/ACC of en/em; also ACC of el/em (lemma 'en')
+// 'ens' – GEN of en/em; also all forms of ens/ens (lemma 'ens')
+// ---------------------------------------------------------------------------
+
+/**
+ * Builds the annotation record stored for a neo-pronoun lexicon entry.
+ *
+ * @param {string} lemma - Lemma to store in the record.
+ * @returns {{lemma: string, upos: string, xpos: string, feats: string}}
+ *   A new object carrying the given lemma together with the fixed values
+ *   UPOS 'PRON', XPOS 'PPER' and FEATS 'Gender=NonBin|PronType=Prs'.
+ */
+function neoPron(lemma) {
+ return { lemma, upos: 'PRON', xpos: 'PPER', feats: 'Gender=NonBin|PronType=Prs' };
+}
+
+// Lexicon of neo-pronoun surface forms, keyed by the lowercased form. Each
+// value is the record produced by neoPron(lemma). A form that belongs to
+// several paradigms appears exactly once and therefore carries a single
+// lemma; the inline notes say which paradigm wins.
+const NEO_PRONOUN_FORMS = new Map([
+ // ---- Verschmelzung (blend pronouns) ------------------------------------
+ // sier/siem (NOM=sier, GEN=sies, DAT=siem, ACC=sien)
+ ['sier', neoPron('sier')],
+ ['sies', neoPron('sier')],
+ ['siem', neoPron('sier')],
+ ['sien', neoPron('sier')],
+ // xier/xiem (NOM=xier, GEN=xies, DAT=xiem, ACC=xien)
+ ['xier', neoPron('xier')],
+ ['xies', neoPron('xier')],
+ ['xiem', neoPron('xier')],
+ ['xien', neoPron('xier')],
+ // ersie/ihmihr (NOM=ersie, GEN=seinihr, DAT=ihmihr, ACC=ihnsie)
+ ['ersie', neoPron('ersie')],
+ ['seinihr', neoPron('ersie')],
+ ['ihmihr', neoPron('ersie')],
+ ['ihnsie', neoPron('ersie')],
+
+ // ---- They-ähnlich (they-like pronouns) ---------------------------------
+ // dej/denen/dej (NOM=dej, GEN=deren, DAT=denen, ACC=dej)
+ // 'deren' and 'denen' omitted (overlap with standard German pronouns)
+ ['dej', neoPron('dej')],
+ // dey/denen/dem and dey/denen/demm (NOM=dey; 'dem' excluded)
+ ['dey', neoPron('dey')],
+ ['demm', neoPron('dey')], // ACC of dey/denen/demm
+ // ey/emm (NOM=ey, GEN=eys, DAT=emm, ACC=emm)
+ ['ey', neoPron('ey')],
+ ['eys', neoPron('ey')],
+ ['emm', neoPron('ey')],
+ // they/them (NOM=they, GEN=their, DAT=them, ACC=them)
+ ['they', neoPron('they')],
+ ['their', neoPron('they')],
+ ['them', neoPron('they')],
+
+ // ---- Neuer Stamm (new-stem pronouns) -----------------------------------
+ // el/em (NOM=el, GEN=ems, DAT=em, ACC=en)
+ // 'ems' mapped to 'em'-paradigm below; 'em'/'en' mapped to their own NOM paradigms
+ ['el', neoPron('el')],
+ // em/em (NOM=em, GEN=ems, DAT=em, ACC=em)
+ ['em', neoPron('em')],
+ ['ems', neoPron('em')], // GEN shared with el/em paradigm
+ // en/en (NOM=en, GEN=enses, DAT=en, ACC=en)
+ // en/em (NOM=en, GEN=ens, DAT=em, ACC=en) — DAT 'em' mapped to em-paradigm
+ ['en', neoPron('en')],
+ ['enses', neoPron('en')],
+ // ens/ens (NOM=ens, GEN=ens, DAT=ens, ACC=ens)
+ // 'ens' takes priority as NOM of ens-paradigm (also GEN of en/em)
+ ['ens', neoPron('ens')],
+ // et/siem (NOM=et, GEN=sier, DAT=siem, ACC=sien)
+ // oblique forms 'sier'/'siem'/'sien' already mapped to sier-paradigm above
+ ['et', neoPron('et')],
+ // ex/ex (all forms = ex)
+ ['ex', neoPron('ex')],
+ // hän/sim (NOM=hän, GEN=sir, DAT=sim, ACC=sin)
+ ['hän', neoPron('hän')],
+ ['sir', neoPron('hän')],
+ ['sim', neoPron('hän')],
+ ['sin', neoPron('hän')],
+ // hen/hem (NOM=hen, GEN=hens, DAT=hem, ACC=hen)
+ ['hen', neoPron('hen')],
+ ['hens', neoPron('hen')],
+ ['hem', neoPron('hen')],
+ // hie/hiem (NOM=hie, GEN=hein, DAT=hiem, ACC=hie)
+ ['hie', neoPron('hie')],
+ ['hein', neoPron('hie')],
+ ['hiem', neoPron('hie')],
+ // iks/iks (NOM=iks, GEN=ikses, DAT=iks, ACC=iks)
+ ['iks', neoPron('iks')],
+ ['ikses', neoPron('iks')],
+ // ind/inde (NOM=ind, GEN=inds, DAT=inde, ACC=ind)
+ ['ind', neoPron('ind')],
+ ['inds', neoPron('ind')],
+ ['inde', neoPron('ind')],
+ // mensch/mensch (NOM=mensch, GEN=menschs, DAT=mensch, ACC=mensch)
+ // Note: the lookup lowercases the token, so the common noun 'Mensch'
+ // (always capitalised in German, not only sentence-initially) is tagged
+ // as well; accepted trade-off in a gender-language–focused tagger.
+ ['mensch', neoPron('mensch')],
+ ['menschs', neoPron('mensch')],
+ // nin/nim (NOM=nin, GEN=nims, DAT=nim, ACC=nin)
+ ['nin', neoPron('nin')],
+ ['nims', neoPron('nin')],
+ ['nim', neoPron('nin')],
+ // oj/ojm (NOM=oj, GEN=juj, DAT=ojm, ACC=ojn)
+ ['oj', neoPron('oj')],
+ ['juj', neoPron('oj')],
+ ['ojm', neoPron('oj')],
+ ['ojn', neoPron('oj')],
+ // per/per (all forms = per; GEN = pers)
+ // Note: 'per' also occurs as a German preposition (e.g. 'per E-Mail').
+ ['per', neoPron('per')],
+ ['pers', neoPron('per')],
+ // ser/sem (NOM=ser, GEN=ses, DAT=sem, ACC=sen)
+ ['ser', neoPron('ser')],
+ ['ses', neoPron('ser')],
+ ['sem', neoPron('ser')],
+ ['sen', neoPron('ser')],
+ // Y/Y (all forms = Y; GEN = Ys) — stored lowercase; lemma retains uppercase 'Y'
+ ['y', neoPron('Y')],
+ ['ys', neoPron('Y')],
+ // zet/zerm (NOM=zet, GEN=zets, DAT=zerm, ACC=zern)
+ ['zet', neoPron('zet')],
+ ['zets', neoPron('zet')],
+ ['zerm', neoPron('zet')],
+ ['zern', neoPron('zet')],
+ // */* (Stern; all forms = *; GEN = *s)
+ // NOTE(review): with exact-form matching, any stand-alone '*' token would
+ // match this entry regardless of its function — confirm this is intended.
+ ['*', neoPron('*')],
+ ['*s', neoPron('*')],
+]);
+
+// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
@@ -205,6 +341,12 @@
let m;
// ------------------------------------------------------------------
+ // 0. Neo-pronoun lexicon lookup (case-insensitive, exact form match)
+ // ------------------------------------------------------------------
+ const entry = NEO_PRONOUN_FORMS.get(word.toLowerCase());
+ if (entry) return entry;
+
+ // ------------------------------------------------------------------
// 1. Gender-sensitive NOUNS
// ------------------------------------------------------------------
diff --git a/test/data/gender.conllu b/test/data/gender.conllu
index bc3c76c..9319d7c 100644
--- a/test/data/gender.conllu
+++ b/test/data/gender.conllu
@@ -115,3 +115,26 @@
3 Spieler/-innen _ _ _ _ _ _ _ _
4 lasen _ VERB VVFIN _ _ _ _ _
+# foundry = base
+# filename = TEST/gender/000011/base/tokens.xml
+# text_id = GENDER_TEST.000011
+# text = sier trifft xier jeden Tag
+1 sier _ _ _ _ _ _ _ _
+2 trifft _ VERB VVFIN _ _ _ _ _
+3 xier _ _ _ _ _ _ _ _
+4 jeden _ DET PIAT _ _ _ _ _
+5 Tag _ NOUN NN _ _ _ _ _
+
+# foundry = base
+# filename = TEST/gender/000012/base/tokens.xml
+# text_id = GENDER_TEST.000012
+# text = oj dankte el und hen für die Hilfe
+1 oj _ _ _ _ _ _ _ _
+2 dankte _ VERB VVFIN _ _ _ _ _
+3 el _ _ _ _ _ _ _ _
+4 und _ CCONJ KON _ _ _ _ _
+5 hen _ _ _ _ _ _ _ _
+6 für _ ADP APPR _ _ _ _ _
+7 die _ DET ART _ _ _ _ _
+8 Hilfe _ NOUN NN _ _ _ _ _
+
diff --git a/test/test.js b/test/test.js
index 6367049..94391a3 100644
--- a/test/test.js
+++ b/test/test.js
@@ -91,11 +91,26 @@
expect(stdout).toContain('sie*er\tsie*er\tPRON\tPPER\tGender=NonBin');
});
+ // Pipes the CoNLL-U fixture through src/index.js and checks that stdout
+ // contains the tab-separated FORM/LEMMA/UPOS/XPOS/FEATS columns expected for
+ // each neo-pronoun token. These are substring checks on the whole output,
+ // not per-line comparisons.
+ test('Full mode: neo-pronoun lexicon forms (PRON PPER Gender=NonBin|PronType=Prs)', () => {
+ const command = 'node src/index.js < test/data/gender.conllu';
+ const stdout = execSync(command).toString();
+ // sier: NOM of sier-paradigm
+ expect(stdout).toContain('sier\tsier\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+ // xier: NOM of xier-paradigm
+ expect(stdout).toContain('xier\txier\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+ // oj: NOM of oj-paradigm
+ expect(stdout).toContain('oj\toj\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+ // el: NOM of el-paradigm
+ expect(stdout).toContain('el\tel\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+ // hen: NOM of hen-paradigm
+ expect(stdout).toContain('hen\then\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+ });
+
test('Full mode: foundry comment changed to gender', () => {
const command = 'node src/index.js < test/data/gender.conllu';
const stdout = execSync(command).toString();
const foundry_count = (stdout.match(/# foundry = gender/g) || []).length;
- expect(foundry_count).toBe(10);
+ expect(foundry_count).toBe(12);
});
test('Full mode: non-gender tokens pass through unchanged', () => {
@@ -120,16 +135,16 @@
const featsAnnotated = cols[5] !== '_';
expect(lemmaAnnotated || featsAnnotated).toBe(true);
});
- // Count: 18 NOUN + 5 DET + 1 PRON = 24 annotated tokens
- expect(tokenLines.length).toBe(24);
+ // Count: 18 NOUN + 5 DET + 1 PRON (sie*er) + 5 neo-pronouns (sier,xier,oj,el,hen) = 29
+ expect(tokenLines.length).toBe(29);
});
test('Sparse mode: sentence headers are emitted for sentences with matches', () => {
const command = 'node src/index.js -s < test/data/gender.conllu';
const stdout = execSync(command).toString();
- // All 10 test sentences have at least one gender form
+ // All 12 test sentences have at least one gender form
const text_id_count = (stdout.match(/# text_id = /g) || []).length;
- expect(text_id_count).toBe(10);
+ expect(text_id_count).toBe(12);
});
test('Inline input: basic Genderstern annotation', () => {