Add neo pronoun support

commit: 1a9f16e55d84ba53674405c98c75b2bfd7230e0b [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Sat Mar 07 09:50:55 2026 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Sat Mar 07 10:23:48 2026 +0100
tree: f5089d5cbd8cf21d138dc6c93a1c1bc4887a070e
parent: b777f9d80ce8daace4b5b41cccee1fea684d2e95 [diff]
diff --git a/Readme.md b/Readme.md
index a9f25c5..86f6df0 100644
--- a/Readme.md
+++ b/Readme.md

@@ -56,19 +56,68 @@
 
 ### Pronouns / Neo-pronouns (`PRON` / `PPER`)
 
-Merged pronoun pairs with gender markers are annotated:
+**Merged pronoun pairs** with gender markers receive `Gender=NonBin`:
 
 | Surface form | Lemma | FEATS |
 |---|---|---|
 | `sie*er` | `sie*er` | `Gender=NonBin` |
 | `er:sie` | `er:sie` | `Gender=NonBin` |
 
+**Lexicon-based neo-pronouns** are matched by exact surface form (case-insensitive) against a built-in lexicon sourced from [pronomen.net](https://pronomen.net/beliebige:neopronomen). All forms are tagged `PRON PPER` with `Gender=NonBin|PronType=Prs`; the lemma is the nominative form.
+
+#### Verschmelzung (blend pronouns)
+
+| Paradigm (NOM/DAT) | NOM | GEN | DAT | ACC |
+|---|---|---|---|---|
+| sier/siem | `sier` | `sies` | `siem` | `sien` |
+| xier/xiem | `xier` | `xies` | `xiem` | `xien` |
+| ersie/ihmihr | `ersie` | `seinihr` | `ihmihr` | `ihnsie` |
+
+#### They-ähnlich
+
+| Paradigm | NOM | GEN | DAT | ACC |
+|---|---|---|---|---|
+| dej/denen/dej | `dej` | _(deren)_ | _(denen)_ | `dej` |
+| dey/denen/dem | `dey` | _(deren)_ | _(denen)_ | _(dem)_ |
+| dey/denen/demm | `dey` | _(deren)_ | _(denen)_ | `demm` |
+| ey/emm | `ey` | `eys` | `emm` | `emm` |
+| they/them | `they` | `their` | `them` | `them` |
+
+Forms in italics are excluded from the lexicon because they are homonymous with standard German words (`deren`, `denen`, `dem`).
+
+#### Neuer Stamm (new-stem pronouns)
+
+| Paradigm | NOM | GEN | DAT | ACC |
+|---|---|---|---|---|
+| el/em | `el` | `ems` | `em` | `en` |
+| em/em | `em` | `ems` | `em` | `em` |
+| en/en | `en` | `enses` | `en` | `en` |
+| en/em | `en` | `ens` | `em` | `en` |
+| ens/ens | `ens` | `ens` | `ens` | `ens` |
+| et/siem | `et` | `sier` | `siem` | `sien` |
+| ex/ex | `ex` | `ex` | `ex` | `ex` |
+| hän/sim | `hän` | `sir` | `sim` | `sin` |
+| hen/hem | `hen` | `hens` | `hem` | `hen` |
+| hie/hiem | `hie` | `hein` | `hiem` | `hie` |
+| iks/iks | `iks` | `ikses` | `iks` | `iks` |
+| ind/inde | `ind` | `inds` | `inde` | `ind` |
+| mensch/mensch | `mensch` | `menschs` | `mensch` | `mensch` |
+| nin/nim | `nin` | `nims` | `nim` | `nin` |
+| oj/ojm | `oj` | `juj` | `ojm` | `ojn` |
+| per/per | `per` | `pers` | `per` | `per` |
+| ser/sem | `ser` | `ses` | `sem` | `sen` |
+| Y/Y | `Y` | `Ys` | `Y` | `Y` |
+| zet/zerm | `zet` | `zets` | `zerm` | `zern` |
+| \*/\* (Stern) | `*` | `*s` | `*` | `*` |
+
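+Note: oblique forms of `et/siem` (`sier`, `siem`, `sien`) are shared with the `sier` paradigm and annotated with lemma `sier`. Likewise, `em` and `ems` are always annotated with lemma `em`, `en` with lemma `en`, and `ens` with lemma `ens`, even where they are oblique forms of another paradigm (e.g. `el/em`, `en/em`).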
+
 ### Known limitations
 
 - **Binnen-I with non-final capital** (e.g. `jedEn`, `jedEr`): these forms embed the capital letter at a non-final position; detection requires morphological analysis beyond simple pattern matching and is not currently supported.
 - **Gendered adjectives** (e.g. `begeisterte*n`): not yet annotated (occur in ~5 % of gendered NP elements per Ochs 2026, §7.3.2).
 - **Inflected case suffixes** on gendered nouns (e.g. genitive `Lehrers*in`, dative plural extra marking): rare and not detected.
-- **Completely novel neo-pronouns** (e.g. `dier`, `xier`) that do not follow a known pattern cannot be detected by regular expressions.
+- **Ambiguous neo-pronoun forms**: `dem`, `deren`, `denen` are excluded from the neo-pronoun lexicon as they are indistinguishable from standard German determiners/pronouns without syntactic context. `per` is included despite its use as a preposition.
 
 ## Usage
 
@@ -121,6 +170,6 @@
 
 ## References
 
-Ochs, S. & Rüdiger, J. O. (2025). Of stars and colons: A corpus-based analysis of gender-inclusive orthographies in German press texts. In D. Schmitz, S. D. Stein & V. Schneider (Eds.), *Linguistic Intersections of Language and Gender* (pp. 31–62). Düsseldorf: düsseldorf university press. https://doi.org/10.1515/9783111388694-003
+Ochs, Samira/Rüdiger, Jan Oliver (2025): Of stars and colons: A corpus-based analysis of gender-inclusive orthographies in German press texts. In: Schmitz, Dominic/Stein, Simon David/Schneider, Viktoria (eds.): Linguistic intersections of language and gender. Of gender bias and gender fairness. Berlin/Boston: De Gruyter, pp. 31–62. https://doi.org/10.1515/9783111388694.
 
-Ochs, S. (2026). Die morphosyntaktische Integration neuer Gendersuffixe: Eine korpusbasierte Analyse deutschsprachiger Pressetexte. *Gender Linguistics*, 2. https://doi.org/10.65020/0619d927
+Ochs, Samira (2026). Die morphosyntaktische Integration neuer Gendersuffixe: Eine korpusbasierte Analyse deutschsprachiger Pressetexte. *Gender Linguistics*, 2. https://doi.org/10.65020/0619d927

diff --git a/src/index.js b/src/index.js
index e2f22aa..38f21b5 100644
--- a/src/index.js
+++ b/src/index.js

@@ -68,6 +68,142 @@
 const neopronGenderStarPairRegex = /^(sie|er|ihr|ihn?|ihm?|dich|sich|mich|mir|uns|euch|ihnen|seinen?|ihrem?|deren?|denen)([*:_])(sie|er|ihr|ihn?|ihm?|dich|sich|mich|mir|uns|euch|ihnen|seinen?|ihrem?|deren?|denen)$/i;
 
 // ---------------------------------------------------------------------------
+// Neo-pronoun lexicon  (source: pronomen.net/beliebige:neopronomen)
+// Maps lowercased surface form → { lemma, upos, xpos, feats }.
+//
+// Lemma: nominative form as listed on pronomen.net.
+// UPOS:  PRON  |  XPOS: PPER  |  FEATS: Gender=NonBin|PronType=Prs
+//
+// Excluded (too ambiguous with standard German words):
+//   'dem'   – dative definite article / demonstrative pronoun
+//   'deren' – relative/demonstrative genitive pronoun
+//   'denen' – relative/demonstrative dative pronoun
+// Included despite ambiguity (these entries ARE present in the map below):
+//   'per', 'pers' – 'per' is also a common German preposition (e.g. 'per E-Mail')
+//
+// Shared/ambiguous oblique forms:
+//   'sier','siem','sien' – NOM/DAT/ACC of sier-paradigm; also GEN/DAT/ACC of
+//                          et/siem-paradigm (both annotated with lemma 'sier')
+//   'em'   – NOM of em/em-paradigm; also DAT of el/em and en/em
+//   'ems'  – GEN of both el/em and em/em (annotated as lemma 'em')
+//   'en'   – NOM/DAT/ACC of en/en; NOM/ACC of en/em; ACC of el/em (lemma 'en')
+//   'ens'  – GEN of en/em; also all forms of ens/ens (lemma 'ens')
+// ---------------------------------------------------------------------------
+
+function neoPron(lemma) {
+  return { lemma, upos: 'PRON', xpos: 'PPER', feats: 'Gender=NonBin|PronType=Prs' };
+}
+
+const NEO_PRONOUN_FORMS = new Map([
+  // ---- Verschmelzung (blend pronouns) ------------------------------------
+  // sier/siem  (NOM=sier, GEN=sies, DAT=siem, ACC=sien)
+  ['sier',    neoPron('sier')],
+  ['sies',    neoPron('sier')],
+  ['siem',    neoPron('sier')],
+  ['sien',    neoPron('sier')],
+  // xier/xiem  (NOM=xier, GEN=xies, DAT=xiem, ACC=xien)
+  ['xier',    neoPron('xier')],
+  ['xies',    neoPron('xier')],
+  ['xiem',    neoPron('xier')],
+  ['xien',    neoPron('xier')],
+  // ersie/ihmihr  (NOM=ersie, GEN=seinihr, DAT=ihmihr, ACC=ihnsie)
+  ['ersie',   neoPron('ersie')],
+  ['seinihr', neoPron('ersie')],
+  ['ihmihr',  neoPron('ersie')],
+  ['ihnsie',  neoPron('ersie')],
+
+  // ---- They-ähnlich (they-like pronouns) ---------------------------------
+  // dej/denen/dej  (NOM=dej, GEN=deren, DAT=denen, ACC=dej)
+  // 'deren' and 'denen' omitted (overlap with standard German pronouns)
+  ['dej',     neoPron('dej')],
+  // dey/denen/dem and dey/denen/demm  (NOM=dey; 'dem' excluded)
+  ['dey',     neoPron('dey')],
+  ['demm',    neoPron('dey')],   // ACC of dey/denen/demm
+  // ey/emm  (NOM=ey, GEN=eys, DAT=emm, ACC=emm)
+  ['ey',      neoPron('ey')],
+  ['eys',     neoPron('ey')],
+  ['emm',     neoPron('ey')],
+  // they/them  (NOM=they, GEN=their, DAT=them, ACC=them)
+  ['they',    neoPron('they')],
+  ['their',   neoPron('they')],
+  ['them',    neoPron('they')],
+
+  // ---- Neuer Stamm (new-stem pronouns) -----------------------------------
+  // el/em  (NOM=el, GEN=ems, DAT=em, ACC=en)
+  // 'ems' mapped to 'em'-paradigm below; 'em'/'en' mapped to their own NOM paradigms
+  ['el',      neoPron('el')],
+  // em/em  (NOM=em, GEN=ems, DAT=em, ACC=em)
+  ['em',      neoPron('em')],
+  ['ems',     neoPron('em')],   // GEN shared with el/em paradigm
+  // en/en  (NOM=en, GEN=enses, DAT=en, ACC=en)
+  // en/em  (NOM=en, GEN=ens, DAT=em, ACC=en)  — DAT 'em' mapped to em-paradigm
+  ['en',      neoPron('en')],
+  ['enses',   neoPron('en')],
+  // ens/ens  (NOM=ens, GEN=ens, DAT=ens, ACC=ens)
+  // 'ens' takes priority as NOM of ens-paradigm (also GEN of en/em)
+  ['ens',     neoPron('ens')],
+  // et/siem  (NOM=et, GEN=sier, DAT=siem, ACC=sien)
+  // oblique forms 'sier'/'siem'/'sien' already mapped to sier-paradigm above
+  ['et',      neoPron('et')],
+  // ex/ex  (all forms = ex)
+  ['ex',      neoPron('ex')],
+  // hän/sim  (NOM=hän, GEN=sir, DAT=sim, ACC=sin)
+  ['hän',     neoPron('hän')],
+  ['sir',     neoPron('hän')],
+  ['sim',     neoPron('hän')],
+  ['sin',     neoPron('hän')],
+  // hen/hem  (NOM=hen, GEN=hens, DAT=hem, ACC=hen)
+  ['hen',     neoPron('hen')],
+  ['hens',    neoPron('hen')],
+  ['hem',     neoPron('hen')],
+  // hie/hiem  (NOM=hie, GEN=hein, DAT=hiem, ACC=hie)
+  ['hie',     neoPron('hie')],
+  ['hein',    neoPron('hie')],
+  ['hiem',    neoPron('hie')],
+  // iks/iks  (NOM=iks, GEN=ikses, DAT=iks, ACC=iks)
+  ['iks',     neoPron('iks')],
+  ['ikses',   neoPron('iks')],
+  // ind/inde  (NOM=ind, GEN=inds, DAT=inde, ACC=ind)
+  ['ind',     neoPron('ind')],
+  ['inds',    neoPron('ind')],
+  ['inde',    neoPron('ind')],
+  // mensch/mensch  (NOM=mensch, GEN=menschs, DAT=mensch, ACC=mensch)
+  // Note: case-insensitive match means the common noun 'Mensch' (always capitalized
+  // in German) will also be tagged; acceptable in a gender-language–focused tagger.
+  ['mensch',  neoPron('mensch')],
+  ['menschs', neoPron('mensch')],
+  // nin/nim  (NOM=nin, GEN=nims, DAT=nim, ACC=nin)
+  ['nin',     neoPron('nin')],
+  ['nims',    neoPron('nin')],
+  ['nim',     neoPron('nin')],
+  // oj/ojm  (NOM=oj, GEN=juj, DAT=ojm, ACC=ojn)
+  ['oj',      neoPron('oj')],
+  ['juj',     neoPron('oj')],
+  ['ojm',     neoPron('oj')],
+  ['ojn',     neoPron('oj')],
+  // per/per  (all forms = per; GEN = pers)
+  // Note: 'per' also occurs as a German preposition (e.g. 'per E-Mail').
+  ['per',     neoPron('per')],
+  ['pers',    neoPron('per')],
+  // ser/sem  (NOM=ser, GEN=ses, DAT=sem, ACC=sen)
+  ['ser',     neoPron('ser')],
+  ['ses',     neoPron('ser')],
+  ['sem',     neoPron('ser')],
+  ['sen',     neoPron('ser')],
+  // Y/Y  (all forms = Y; GEN = Ys) — stored lowercase; lemma retains uppercase 'Y'
+  ['y',       neoPron('Y')],
+  ['ys',      neoPron('Y')],
+  // zet/zerm  (NOM=zet, GEN=zets, DAT=zerm, ACC=zern)
+  ['zet',     neoPron('zet')],
+  ['zets',    neoPron('zet')],
+  ['zerm',    neoPron('zet')],
+  ['zern',    neoPron('zet')],
+  // */*  (Stern; all forms = *; GEN = *s)
+  ['*',       neoPron('*')],
+  ['*s',      neoPron('*')],
+]);
+
+// ---------------------------------------------------------------------------
 // Helpers
 // ---------------------------------------------------------------------------
 
@@ -205,6 +341,12 @@
   let m;
 
   // ------------------------------------------------------------------
+  // 0. Neo-pronoun lexicon lookup (case-insensitive, exact form match)
+  // ------------------------------------------------------------------
+  const entry = NEO_PRONOUN_FORMS.get(word.toLowerCase());
+  if (entry) return entry;
+
+  // ------------------------------------------------------------------
   // 1. Gender-sensitive NOUNS
   // ------------------------------------------------------------------
 

diff --git a/test/data/gender.conllu b/test/data/gender.conllu
index bc3c76c..9319d7c 100644
--- a/test/data/gender.conllu
+++ b/test/data/gender.conllu

@@ -115,3 +115,26 @@
 3	Spieler/-innen	_	_	_	_	_	_	_	_
 4	lasen	_	VERB	VVFIN	_	_	_	_	_
 
+# foundry = base
+# filename = TEST/gender/000011/base/tokens.xml
+# text_id = GENDER_TEST.000011
+# text = sier trifft xier jeden Tag
+1	sier	_	_	_	_	_	_	_	_
+2	trifft	_	VERB	VVFIN	_	_	_	_	_
+3	xier	_	_	_	_	_	_	_	_
+4	jeden	_	DET	PIAT	_	_	_	_	_
+5	Tag	_	NOUN	NN	_	_	_	_	_
+
+# foundry = base
+# filename = TEST/gender/000012/base/tokens.xml
+# text_id = GENDER_TEST.000012
+# text = oj dankte el und hen für die Hilfe
+1	oj	_	_	_	_	_	_	_	_
+2	dankte	_	VERB	VVFIN	_	_	_	_	_
+3	el	_	_	_	_	_	_	_	_
+4	und	_	CCONJ	KON	_	_	_	_	_
+5	hen	_	_	_	_	_	_	_	_
+6	für	_	ADP	APPR	_	_	_	_	_
+7	die	_	DET	ART	_	_	_	_	_
+8	Hilfe	_	NOUN	NN	_	_	_	_	_
+

diff --git a/test/test.js b/test/test.js
index 6367049..94391a3 100644
--- a/test/test.js
+++ b/test/test.js

@@ -91,11 +91,26 @@
     expect(stdout).toContain('sie*er\tsie*er\tPRON\tPPER\tGender=NonBin');
   });
 
+  test('Full mode: neo-pronoun lexicon forms (PRON PPER Gender=NonBin|PronType=Prs)', () => {
+    const command = 'node src/index.js < test/data/gender.conllu';
+    const stdout = execSync(command).toString();
+    // sier: NOM of sier-paradigm
+    expect(stdout).toContain('sier\tsier\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+    // xier: NOM of xier-paradigm
+    expect(stdout).toContain('xier\txier\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+    // oj: NOM of oj-paradigm
+    expect(stdout).toContain('oj\toj\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+    // el: NOM of el-paradigm
+    expect(stdout).toContain('el\tel\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+    // hen: NOM of hen-paradigm
+    expect(stdout).toContain('hen\then\tPRON\tPPER\tGender=NonBin|PronType=Prs');
+  });
+
   test('Full mode: foundry comment changed to gender', () => {
     const command = 'node src/index.js < test/data/gender.conllu';
     const stdout = execSync(command).toString();
     const foundry_count = (stdout.match(/# foundry = gender/g) || []).length;
-    expect(foundry_count).toBe(10);
+    expect(foundry_count).toBe(12);
   });
 
   test('Full mode: non-gender tokens pass through unchanged', () => {
@@ -120,16 +135,16 @@
       const featsAnnotated  = cols[5] !== '_';
       expect(lemmaAnnotated || featsAnnotated).toBe(true);
     });
-    // Count: 18 NOUN + 5 DET + 1 PRON = 24 annotated tokens
-    expect(tokenLines.length).toBe(24);
+    // Count: 18 NOUN + 5 DET + 1 PRON (sie*er) + 5 neo-pronouns (sier,xier,oj,el,hen) = 29
+    expect(tokenLines.length).toBe(29);
   });
 
   test('Sparse mode: sentence headers are emitted for sentences with matches', () => {
     const command = 'node src/index.js -s < test/data/gender.conllu';
     const stdout = execSync(command).toString();
-    // All 10 test sentences have at least one gender form
+    // All 12 test sentences have at least one gender form
     const text_id_count = (stdout.match(/# text_id = /g) || []).length;
-    expect(text_id_count).toBe(10);
+    expect(text_id_count).toBe(12);
   });
 
   test('Inline input: basic Genderstern annotation', () => {
commit	1a9f16e55d84ba53674405c98c75b2bfd7230e0b	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Mar 07 09:50:55 2026 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Mar 07 10:23:48 2026 +0100
tree	f5089d5cbd8cf21d138dc6c93a1c1bc4887a070e
parent	b777f9d80ce8daace4b5b41cccee1fea684d2e95 [diff]