Blame - src/index.js - KorAP/conllu-gender - Gitiles

blob: 47be79ed102a683209ccb8a7a0ba9d9e136a58e8 [file] [log] [blame]

Marc Kupietz	b777f9d	2026-03-07 09:26:20 +0100	[diff] [blame]	1	#!/usr/bin/env node
				2
				3	// conllu-gender
				4	// Reads CoNLL-U format from stdin and annotates German gender-sensitive personal
				5	// nouns, gendered determiners/pronouns, and neo-pronouns with correct POS (UPOS
				6	// and XPOS/STTS), lemma, and morphological features.
				7	//
				8	// Based on the morphosyntactic analysis in:
				9	// Ochs, S. (2026). Die morphosyntaktische Integration neuer Gendersuffixe:
				10	// Eine korpusbasierte Analyse deutschsprachiger Pressetexte.
				11	// Gender Linguistics, 2. doi: https://doi.org/10.65020/0619d927
				12	//
				13	// Gender marker types (following Ochs & Rüdiger 2025):
				14	// Non-binary intended: Genderstern (*), Doppelpunkt (:), Unterstrich (_)
				15	// Binary intended: Binnen-I (I), Klammern (()), Schrägstrich (/)
				16
				17	// ---------------------------------------------------------------------------
				18	// Regex patterns for gender-sensitive NOUNS
				19	// ---------------------------------------------------------------------------
				20	// Each regex captures: (base, marker, suffix)
				21	// suffix is either 'in' (singular) or 'innen' (plural)
				22
				23	// Genderstern: Lehrerin, Bürgerinnen, Ärzt*innen
				24	const nounGenderStarRegex = /^(.+)\*(in(?:nen)?)$/i;
				25	// Doppelpunkt: Lehrer:in, Bürger:innen
				26	const nounGenderColonRegex = /^(.+):(in(?:nen)?)$/i;
				27	// Unterstrich: Lehrer_in, Bürger_innen
				28	const nounGenderUnderscoreRegex = /^(.+)_(in(?:nen)?)$/i;
				29	// Binnen-I: LehrerIn, LehrerInnen (case-sensitive – the I is uppercase)
				30	// The base must end in a lowercase letter to avoid matching regular proper nouns
				31	// that start a sentence. We require at least one lowercase letter before the I.
				32	const nounBinnenIRegex = /^([A-ZÄÖÜ][a-zäöüß].*?[a-zäöüß])(In(?:nen)?)$/;
				33	// Klammern: Lehrer(in), Lehrer(innen)
				34	const nounKlammernRegex = /^(.+)\((in(?:nen)?)\)$/i;
				35	// Schrägstrich: Lehrer/in, Lehrer/innen, Lehrer/-in, Lehrer/-innen
				36	const nounSchraegstrichRegex = /^(.+)\/-?(in(?:nen)?)$/i;
				37
				38	// ---------------------------------------------------------------------------
				39	// Regex patterns for gender-sensitive DETERMINERS / PRONOUNS
				40	// (jede*r, ein*e, der*die, des*r, eines*r, etc.)
				41	// ---------------------------------------------------------------------------
				42	// Inflected forms of articles, indefinite articles, and pronouns with gender
				43	// markers. Non-binary intended markers (*, :, _) are the most common.
				44	// We match: any known determiner/pronoun stem + gender_marker + ending
				45
				46	// Gendered forms like: jede*r, jede:r, jede_r, kein*e, kein:e, ein*e, ein:e,
				47	// ein_e, der*die, die*der, des*r, des*der, dem*der, den*die, etc.
				48	// Strategy: match known Determiner/Pronoun base forms followed by gender marker
				49	// and a short inflectional ending.
				50
				51	// Combined pattern: known pronoun/det base + non-binary marker + short ending
				52	// This covers forms documented in Ochs (2026) §7.3.2–7.3.4
				53	const detNonBinaryRegex = /^(jede[mn]?\|jede[rs]?\|keine?[mrns]?\|eine?[mrns]?\|de[mrns]\|die\|das\|de[rs]\|dem\|den\|aller?\|manche[mrns]?\|solche[mrns]?\|welche[mrns]?\|irgendeine[mrns]?)([*:_])([a-zäöürs]{1,3})$/i;
				54
				55	// Binnen-I variants of determiners: einE, jedeR, jedeN, JedeR, etc.
				56	// Base (lowercase or title-case) + uppercase inflection letter(s)
				57	const detBinnenIRegex = /^(jede[mn]?\|keine?[mrns]?\|eine?[mrns]?\|alle?\|manche?\|solche?\|welche?)([RNSEM]{1,2})$/;
				58
				59	// Doppelform determiners merged with Schrägstrich (the only binary-intended merge
				60	// character for articles per Ochs 2026): ein/e, die/der, einen/r, etc.
				61	// Non-binary markers (*, :, _) are handled by detNonBinaryRegex with Gender=NonBin.
				62	const detDoppelformRegex = /^(der\|die\|das\|dem\|den\|des\|ein\|eine\|einen\|einem\|einer\|eines)\/(der\|die\|das\|dem\|den\|des\|ein\|eine\|einen\|einem\|einer\|eines\|[rns])$/i;
				63
				64	// ---------------------------------------------------------------------------
				65	// Neo-pronouns (new gender-neutral pronouns in German)
				66	// ---------------------------------------------------------------------------
				67	// Gendered-star pronoun pairs (sieer, ersie, ihr*sein, etc.)
				68	const neopronGenderStarPairRegex = /^(sie\|er\|ihr\|ihn?\|ihm?\|dich\|sich\|mich\|mir\|uns\|euch\|ihnen\|seinen?\|ihrem?\|deren?\|denen)([*:_])(sie\|er\|ihr\|ihn?\|ihm?\|dich\|sich\|mich\|mir\|uns\|euch\|ihnen\|seinen?\|ihrem?\|deren?\|denen)$/i;
				69
				70	// ---------------------------------------------------------------------------
Marc Kupietz	1a9f16e	2026-03-07 09:50:55 +0100	[diff] [blame]	71	// Neo-pronoun lexicon (source: pronomen.net/beliebige:neopronomen)
				72	// Maps lowercased surface form → { lemma, upos, xpos, feats }.
				73	//
				74	// Lemma: nominative form as listed on pronomen.net.
Marc Kupietz	b2068f4	2026-03-07 21:58:05 +0100	[diff] [blame]	75	// UPOS: PRON | XPOS: PPER | FEATS: Gender=Fem,Masc,NonBin|PronType=Prs
Marc Kupietz	1a9f16e	2026-03-07 09:50:55 +0100	[diff] [blame]	76	//
				77	// Excluded (too ambiguous with standard German words):
				78	// 'dem' – dative definite article / demonstrative pronoun
				79	// 'deren' – relative/demonstrative genitive pronoun
				80	// 'denen' – relative/demonstrative dative pronoun
				81	// 'per' – common German preposition
				82	// 'pers' – excluded together with 'per'
				83	//
				84	// Shared/ambiguous oblique forms:
				85	// 'sier','siem','sien' – NOM/DAT/ACC of sier-paradigm; also GEN/DAT/ACC of
				86	// et/siem-paradigm (both annotated with lemma 'sier')
				87	// 'em' – NOM of em/em-paradigm; also DAT of el/em and en/em
				88	// 'ems' – GEN of both el/em and em/em (annotated as lemma 'em')
				89	// 'en' – NOM/ACC/DAT of en/en; NOM/ACC of en/em (lemma 'en')
				90	// 'ens' – GEN of en/em; also all forms of ens/ens (lemma 'ens')
				91	// ---------------------------------------------------------------------------
				92
				93	function neoPron(lemma) {
Marc Kupietz	b2068f4	2026-03-07 21:58:05 +0100	[diff] [blame]	94	return { lemma, upos: 'PRON', xpos: 'PPER', feats: 'Gender=Fem,Masc,NonBin\|PronType=Prs' };
Marc Kupietz	1a9f16e	2026-03-07 09:50:55 +0100	[diff] [blame]	95	}
				96
				97	const NEO_PRONOUN_FORMS = new Map([
				98	// ---- Verschmelzung (blend pronouns) ------------------------------------
				99	// sier/siem (NOM=sier, GEN=sies, DAT=siem, ACC=sien)
				100	['sier', neoPron('sier')],
				101	['sies', neoPron('sier')],
				102	['siem', neoPron('sier')],
				103	['sien', neoPron('sier')],
				104	// xier/xiem (NOM=xier, GEN=xies, DAT=xiem, ACC=xien)
				105	['xier', neoPron('xier')],
				106	['xies', neoPron('xier')],
				107	['xiem', neoPron('xier')],
				108	['xien', neoPron('xier')],
				109	// ersie/ihmihr (NOM=ersie, GEN=seinihr, DAT=ihmihr, ACC=ihnsie)
				110	['ersie', neoPron('ersie')],
				111	['seinihr', neoPron('ersie')],
				112	['ihmihr', neoPron('ersie')],
				113	['ihnsie', neoPron('ersie')],
				114
				115	// ---- They-ähnlich (they-like pronouns) ---------------------------------
				116	// dej/denen/dej (NOM=dej, GEN=deren, DAT=denen, ACC=dej)
				117	// 'deren' and 'denen' omitted (overlap with standard German pronouns)
				118	['dej', neoPron('dej')],
				119	// dey/denen/dem and dey/denen/demm (NOM=dey; 'dem' excluded)
				120	['dey', neoPron('dey')],
				121	['demm', neoPron('dey')], // ACC of dey/denen/demm
				122	// ey/emm (NOM=ey, GEN=eys, DAT=emm, ACC=emm)
				123	['ey', neoPron('ey')],
				124	['eys', neoPron('ey')],
				125	['emm', neoPron('ey')],
				126	// they/them (NOM=they, GEN=their, DAT=them, ACC=them)
Marc Kupietz	acf3120	2026-03-10 09:24:28 +0100	[diff] [blame]	127	// Excluded: 'they' and 'them' are too frequent in English quotations within
				128	// German texts and would cause many false positives.
				129	// 'their' is likewise excluded for the same reason.
Marc Kupietz	1a9f16e	2026-03-07 09:50:55 +0100	[diff] [blame]	130
				131	// ---- Neuer Stamm (new-stem pronouns) -----------------------------------
				132	// el/em (NOM=el, GEN=ems, DAT=em, ACC=en)
				133	// 'ems' mapped to 'em'-paradigm below; 'em'/'en' mapped to their own NOM paradigms
				134	['el', neoPron('el')],
				135	// em/em (NOM=em, GEN=ems, DAT=em, ACC=em)
				136	['em', neoPron('em')],
				137	['ems', neoPron('em')], // GEN shared with el/em paradigm
				138	// en/en (NOM=en, GEN=enses, DAT=en, ACC=en)
				139	// en/em (NOM=en, GEN=ens, DAT=em, ACC=en) — DAT 'em' mapped to em-paradigm
				140	['en', neoPron('en')],
				141	['enses', neoPron('en')],
				142	// ens/ens (NOM=ens, GEN=ens, DAT=ens, ACC=ens)
				143	// 'ens' takes priority as NOM of ens-paradigm (also GEN of en/em)
				144	['ens', neoPron('ens')],
				145	// et/siem (NOM=et, GEN=sier, DAT=siem, ACC=sien)
Marc Kupietz	acf3120	2026-03-10 09:24:28 +0100	[diff] [blame]	146	// Excluded: 'et' is omitted — it is ubiquitous in academic German texts as the
				147	// Latin abbreviation in 'et al.' and would cause pervasive false positives.
				148	// The oblique forms sier/siem/sien are still captured via the sier-paradigm.
Marc Kupietz	1a9f16e	2026-03-07 09:50:55 +0100	[diff] [blame]	149	// ex/ex (all forms = ex)
				150	['ex', neoPron('ex')],
Marc Kupietz	d2b9279	2026-03-10 08:11:06 +0100	[diff] [blame]	151	// hän/sim (NOM=hän, GEN=sir, DAT=sim, ACC=sim)
				152	// Note: ACC 'sin' is omitted — it frequently occurs in German texts as a Spanish
				153	// loan word (e.g. 'Chili sin Carne') and would cause too many false positives.
Marc Kupietz	1a9f16e	2026-03-07 09:50:55 +0100	[diff] [blame]	154	['hän', neoPron('hän')],
				155	['sir', neoPron('hän')],
				156	['sim', neoPron('hän')],
Marc Kupietz	1a9f16e	2026-03-07 09:50:55 +0100	[diff] [blame]	157	// hen/hem (NOM=hen, GEN=hens, DAT=hem, ACC=hen)
				158	['hen', neoPron('hen')],
				159	['hens', neoPron('hen')],
				160	['hem', neoPron('hen')],
				161	// hie/hiem (NOM=hie, GEN=hein, DAT=hiem, ACC=hie)
				162	['hie', neoPron('hie')],
				163	['hein', neoPron('hie')],
				164	['hiem', neoPron('hie')],
				165	// iks/iks (NOM=iks, GEN=ikses, DAT=iks, ACC=iks)
				166	['iks', neoPron('iks')],
				167	['ikses', neoPron('iks')],
				168	// ind/inde (NOM=ind, GEN=inds, DAT=inde, ACC=ind)
				169	['ind', neoPron('ind')],
				170	['inds', neoPron('ind')],
				171	['inde', neoPron('ind')],
				172	// mensch/mensch (NOM=mensch, GEN=menschs, DAT=mensch, ACC=mensch)
Marc Kupietz	d2b9279	2026-03-10 08:11:06 +0100	[diff] [blame]	173	// Note: 'Mensch' (uppercase) is only tagged as a neo-pronoun when the token is
				174	// sentence-initial (tokenId === 1). Mid-sentence 'Mensch' is treated as the
				175	// common German noun. See lookup logic in classifyToken().
Marc Kupietz	1a9f16e	2026-03-07 09:50:55 +0100	[diff] [blame]	176	['mensch', neoPron('mensch')],
				177	['menschs', neoPron('mensch')],
				178	// nin/nim (NOM=nin, GEN=nims, DAT=nim, ACC=nin)
				179	['nin', neoPron('nin')],
				180	['nims', neoPron('nin')],
				181	['nim', neoPron('nin')],
				182	// oj/ojm (NOM=oj, GEN=juj, DAT=ojm, ACC=ojn)
				183	['oj', neoPron('oj')],
				184	['juj', neoPron('oj')],
				185	['ojm', neoPron('oj')],
				186	['ojn', neoPron('oj')],
				187	// per/per (all forms = per; GEN = pers)
Marc Kupietz	d2b9279	2026-03-10 08:11:06 +0100	[diff] [blame]	188	// Excluded: 'per' is a common German preposition; 'pers' excluded together with it.
Marc Kupietz	1a9f16e	2026-03-07 09:50:55 +0100	[diff] [blame]	189	// ser/sem (NOM=ser, GEN=ses, DAT=sem, ACC=sen)
				190	['ser', neoPron('ser')],
				191	['ses', neoPron('ser')],
				192	['sem', neoPron('ser')],
				193	['sen', neoPron('ser')],
				194	// Y/Y (all forms = Y; GEN = Ys) — stored lowercase; lemma retains uppercase 'Y'
				195	['y', neoPron('Y')],
				196	['ys', neoPron('Y')],
				197	// zet/zerm (NOM=zet, GEN=zets, DAT=zerm, ACC=zern)
				198	['zet', neoPron('zet')],
				199	['zets', neoPron('zet')],
				200	['zerm', neoPron('zet')],
				201	['zern', neoPron('zet')],
				202	// / (Stern; all forms = ; GEN = s)
Marc Kupietz	d2b9279	2026-03-10 08:11:06 +0100	[diff] [blame]	203	// Excluded: bare asterisk '*' causes too many false positives (e.g. list bullets,
				204	// Genderstern markers in noun forms). Not included in the lexicon.
Marc Kupietz	1a9f16e	2026-03-07 09:50:55 +0100	[diff] [blame]	205	]);
				206
				207	// ---------------------------------------------------------------------------
Marc Kupietz	b777f9d	2026-03-07 09:26:20 +0100	[diff] [blame]	208	// Helpers
				209	// ---------------------------------------------------------------------------
				210
				211	/**
				212	* Determine if a suffix string represents singular or plural.
				213	* 'in' (length 2) → Sing
				214	* 'innen' (length 5) → Plur
				215	* Works case-insensitively (In / Innen for Binnen-I forms).
				216	*/
				217	function getNumber(suffix) {
				218	return /^innen$/i.test(suffix) ? 'Plur' : 'Sing';
				219	}
				220
				221	/**
				222	* Build the canonical lemma for a gendered noun.
				223	* The lemma is always the nominative singular form, preserving the original
				224	* gender marker. This follows the convention that the lemma reflects the
				225	* citation form of the gendered derivate (Ochs 2026 §2).
				226	*
				227	* @param {string} base - derivation base (before the gender marker)
				228	* @param {string} marker - gender marker character(s), e.g. '*', ':', '_', 'I',
				229	* '(in)', '/in', etc.
				230	* @param {string} markerType - 'star'\|'colon'\|'underscore'\|'binnenI'\|
				231	* 'klammern'\|'schraegstrich'
				232	*/
				233	function buildNounLemma(base, marker, markerType) {
				234	switch (markerType) {
				235	case 'star': return base + '*in';
				236	case 'colon': return base + ':in';
				237	case 'underscore': return base + '_in';
				238	case 'binnenI': return base + 'In';
				239	case 'klammern': return base + '(in)';
				240	case 'schraegstrich':return base + '/in';
				241	default: return base + marker + 'in';
				242	}
				243	}
				244
				245	/**
				246	* Build the morphological features string for a gendered noun token.
				247	* Per CoNLL-U conventions, features are sorted alphabetically by feature name.
				248	*
				249	* Gender values used (extending standard UD practice for German):
				250	* NonBin – non-binary intended forms (*, :, _)
				251	* Masc,Fem – binary inclusive forms (I, (), /)
				252	*
				253	* Case is not set here because it cannot be determined from surface form alone
				254	* for the vast majority of gendered noun tokens (Ochs 2026 §7.1).
				255	*
				256	* @param {string} number - 'Sing' \| 'Plur'
				257	* @param {string} markerType - see buildNounLemma
				258	*/
				259	function buildNounFeatures(number, markerType) {
				260	const genderIsNonBinary = ['star', 'colon', 'underscore'].includes(markerType);
				261	const genderIsBinary = ['binnenI', 'klammern', 'schraegstrich'].includes(markerType);
				262
				263	const feats = [];
				264	if (genderIsNonBinary) {
Marc Kupietz	b2068f4	2026-03-07 21:58:05 +0100	[diff] [blame]	265	feats.push('Gender=Fem,Masc,NonBin');
Marc Kupietz	b777f9d	2026-03-07 09:26:20 +0100	[diff] [blame]	266	} else if (genderIsBinary) {
				267	feats.push('Gender=Masc,Fem');
				268	}
				269	feats.push('Number=' + number);
				270	return feats.join('\|');
				271	}
				272
				273	// ---------------------------------------------------------------------------
				274	// Command-line interface (mirrors conllu-cmc)
				275	// ---------------------------------------------------------------------------
				276
				277	const optionDefinitions = [
				278	{ name: 'sparse', alias: 's', type: Boolean,
				279	description: 'Print only the tokens that received new annotations.' },
				280	{ name: 'help', alias: 'h', type: Boolean,
				281	description: 'Print this usage guide.' },
				282	];
				283
				284	const sections = [
				285	{
				286	header: 'conllu-gender',
				287	content: 'Reads CoNLL-U format from stdin and annotates German gender-sensitive ' +
				288	'personal nouns, gendered determiners/pronouns, and neo-pronouns with ' +
				289	'correct POS, lemma, and morphological features. Writes CoNLL-U to stdout.'
				290	},
				291	{
				292	header: 'Synopsis',
				293	content: '$ conllu-gender [-s] < input.conllu > output.conllu'
				294	},
				295	{
				296	header: 'Options',
				297	optionList: optionDefinitions
				298	}
				299	];
				300
				301	const getUsage = require('command-line-usage');
				302	const commandLineArgs = require('command-line-args');
				303
				304	var options;
				305	try {
				306	options = commandLineArgs(optionDefinitions);
				307	} catch (e) {
				308	console.error(e.message);
				309	options = { help: true };
				310	}
				311
				312	if (options.help) {
				313	const usage = getUsage(sections);
				314	console.log(usage);
				315	process.exit(0);
				316	}
				317
				318	// ---------------------------------------------------------------------------
				319	// CoNLL-U processing
				320	// ---------------------------------------------------------------------------
				321
				322	const readline = require('readline');
				323	global.header = '';
				324	global.fileheader = '';
				325	global.standalone = false;
				326
				327	const rl = readline.createInterface({
				328	input: process.stdin,
				329	output: process.stdout,
				330	terminal: false,
				331	});
				332
				333	/**
				334	* Attempt to annotate a single CoNLL-U token (word form).
				335	* Returns an annotation object on success, or null if the token is not a
				336	* recognised gender-sensitive form.
				337	*
Marc Kupietz	d2b9279	2026-03-10 08:11:06 +0100	[diff] [blame]	338	* @param {string} word - surface form of the token
				339	* @param {number} tokenId - 1-based position of the token in its sentence
				340	*
Marc Kupietz	b777f9d	2026-03-07 09:26:20 +0100	[diff] [blame]	341	* Annotation object shape:
				342	* { lemma, upos, xpos, feats }
				343	*/
Marc Kupietz	d2b9279	2026-03-10 08:11:06 +0100	[diff] [blame]	344	function classifyToken(word, tokenId) {
Marc Kupietz	b777f9d	2026-03-07 09:26:20 +0100	[diff] [blame]	345	let m;
				346
				347	// ------------------------------------------------------------------
Marc Kupietz	d2b9279	2026-03-10 08:11:06 +0100	[diff] [blame]	348	// 0. Neo-pronoun lexicon lookup
Marc Kupietz	1a9f16e	2026-03-07 09:50:55 +0100	[diff] [blame]	349	// ------------------------------------------------------------------
Marc Kupietz	d2b9279	2026-03-10 08:11:06 +0100	[diff] [blame]	350	// To avoid false positives from capitalised abbreviations (EL, EM),
				351	// title-case exclamations (Ey), or common nouns (Mensch mid-sentence),
				352	// we only perform a case-insensitive lookup when:
				353	// a) the token is already lowercase, OR
				354	// b) it is sentence-initial (tokenId === 1), where capitalisation is
				355	// purely orthographic (e.g. 'Sier trifft xier').
				356	const lc = word.toLowerCase();
				357	let entry;
				358	if (word === lc \|\| tokenId === 1) {
				359	entry = NEO_PRONOUN_FORMS.get(lc);
				360	}
Marc Kupietz	1a9f16e	2026-03-07 09:50:55 +0100	[diff] [blame]	361	if (entry) return entry;
				362
				363	// ------------------------------------------------------------------
Marc Kupietz	b777f9d	2026-03-07 09:26:20 +0100	[diff] [blame]	364	// 1. Gender-sensitive NOUNS
				365	// ------------------------------------------------------------------
				366
				367	// Genderstern (non-binary intended)
				368	if ((m = nounGenderStarRegex.exec(word))) {
				369	const [, base, suffix] = m;
				370	const number = getNumber(suffix);
				371	return {
				372	lemma: buildNounLemma(base, '*', 'star'),
				373	upos: 'NOUN',
				374	xpos: 'NN',
				375	feats: buildNounFeatures(number, 'star'),
				376	};
				377	}
				378
				379	// Doppelpunkt (non-binary intended)
				380	if ((m = nounGenderColonRegex.exec(word))) {
				381	const [, base, suffix] = m;
				382	const number = getNumber(suffix);
				383	return {
				384	lemma: buildNounLemma(base, ':', 'colon'),
				385	upos: 'NOUN',
				386	xpos: 'NN',
				387	feats: buildNounFeatures(number, 'colon'),
				388	};
				389	}
				390
				391	// Unterstrich (non-binary intended)
				392	if ((m = nounGenderUnderscoreRegex.exec(word))) {
				393	const [, base, suffix] = m;
				394	const number = getNumber(suffix);
				395	return {
				396	lemma: buildNounLemma(base, '_', 'underscore'),
				397	upos: 'NOUN',
				398	xpos: 'NN',
				399	feats: buildNounFeatures(number, 'underscore'),
				400	};
				401	}
				402
				403	// Schrägstrich (binary intended) – before Binnen-I to avoid false matches
				404	if ((m = nounSchraegstrichRegex.exec(word))) {
				405	const [, base, suffix] = m;
				406	const number = getNumber(suffix);
				407	return {
				408	lemma: buildNounLemma(base, '/', 'schraegstrich'),
				409	upos: 'NOUN',
				410	xpos: 'NN',
				411	feats: buildNounFeatures(number, 'schraegstrich'),
				412	};
				413	}
				414
				415	// Klammern (binary intended)
				416	if ((m = nounKlammernRegex.exec(word))) {
				417	const [, base, suffix] = m;
				418	const number = getNumber(suffix);
				419	return {
				420	lemma: buildNounLemma(base, '()', 'klammern'),
				421	upos: 'NOUN',
				422	xpos: 'NN',
				423	feats: buildNounFeatures(number, 'klammern'),
				424	};
				425	}
				426
				427	// Binnen-I (binary intended) – requires at least one lowercase letter before
				428	// the I to distinguish from sentence-initial capitalisation
				429	if ((m = nounBinnenIRegex.exec(word))) {
				430	const [, base, suffix] = m;
				431	const number = getNumber(suffix);
				432	return {
				433	lemma: buildNounLemma(base, 'I', 'binnenI'),
				434	upos: 'NOUN',
				435	xpos: 'NN',
				436	feats: buildNounFeatures(number, 'binnenI'),
				437	};
				438	}
				439
				440	// ------------------------------------------------------------------
				441	// 2. Gender-sensitive DETERMINERS / PRONOUNS
				442	// ------------------------------------------------------------------
				443
				444	// Doppelform determiners merged with gender marker (derdie, desr, etc.)
				445	// Checked before detNonBinaryRegex because die*der is a Doppelform, not purely
				446	// non-binary intended, and should receive Gender=Masc,Fem features.
				447	if ((m = detDoppelformRegex.exec(word))) {
				448	const [fullMatch, form1] = m;
				449	return {
				450	lemma: fullMatch,
				451	upos: 'DET',
				452	xpos: inferDetXpos(form1),
				453	feats: 'Gender=Masc,Fem',
				454	};
				455	}
				456
				457	// Non-binary marker determiners (jede*r, ein:e, kein_e, etc.)
				458	if ((m = detNonBinaryRegex.exec(word))) {
				459	const [, detBase, marker, ending] = m;
				460	// Preserve full base + marker + ending as lemma (no stripping needed;
				461	// gendered determiners have no established uninflected citation form).
				462	return {
				463	lemma: detBase + marker + ending,
				464	upos: 'DET',
				465	xpos: inferDetXpos(detBase),
Marc Kupietz	b2068f4	2026-03-07 21:58:05 +0100	[diff] [blame]	466	feats: 'Gender=Fem,Masc,NonBin',
Marc Kupietz	b777f9d	2026-03-07 09:26:20 +0100	[diff] [blame]	467	};
				468	}
				469
				470	// Binnen-I determiners (einE, JedeR, jedeN, etc.)
				471	if ((m = detBinnenIRegex.exec(word))) {
				472	const [, detBase, endings] = m;
				473	return {
				474	lemma: detBase + endings,
				475	upos: 'DET',
				476	xpos: inferDetXpos(detBase),
				477	feats: 'Gender=Masc,Fem',
				478	};
				479	}
				480
				481	// ------------------------------------------------------------------
				482	// 3. Neo-pronouns / gendered pronoun pairs
				483	// ------------------------------------------------------------------
				484
				485	if ((m = neopronGenderStarPairRegex.exec(word))) {
				486	const [fullMatch, pron1, marker, pron2] = m;
				487	const markerType = marker === '*' ? 'star' : marker === ':' ? 'colon' : 'underscore';
				488	return {
				489	lemma: fullMatch,
				490	upos: 'PRON',
				491	xpos: inferPronXpos(pron1),
				492	feats: markerType === 'star' \|\| markerType === 'colon' \|\| markerType === 'underscore'
Marc Kupietz	b2068f4	2026-03-07 21:58:05 +0100	[diff] [blame]	493	? 'Gender=Fem,Masc,NonBin' : 'Gender=Masc,Fem',
Marc Kupietz	b777f9d	2026-03-07 09:26:20 +0100	[diff] [blame]	494	};
				495	}
				496
				497	return null;
				498	}
				499
				500	/**
				501	* Infer STTS XPOS tag for a determiner/article base.
				502	*/
				503	function inferDetXpos(base) {
				504	const b = base.toLowerCase();
				505	if (/^(der\|die\|das\|de[mrns])/.test(b)) return 'ART';
				506	if (/^(ein\|eine\|einen\|einem\|einer\|eines\|kein\|keine\|keinen\|keinem\|keiner\|keines)/.test(b)) return 'ART';
				507	if (/^(jede\|jeder\|jeden\|jedem\|jedes\|jedem)/.test(b)) return 'PIAT';
				508	if (/^(alle\|aller\|allen\|alles\|allem)/.test(b)) return 'PIAT';
				509	if (/^(manche\|mancher\|manchen\|manchem\|manches)/.test(b)) return 'PIAT';
				510	if (/^(solche\|solcher\|solchen\|solchem\|solches)/.test(b)) return 'PIAT';
				511	if (/^(welche\|welcher\|welchen\|welchem\|welches)/.test(b)) return 'PWAT';
				512	if (/^(irgend)/.test(b)) return 'PIAT';
				513	return 'ART';
				514	}
				515
				516	/**
				517	* Infer STTS XPOS tag for a personal pronoun base.
				518	*/
				519	function inferPronXpos(base) {
				520	const b = base.toLowerCase();
				521	if (/^(ich\|du\|er\|sie\|es\|wir\|ihr\|sie\|mich\|mir\|dich\|dir\|sich\|ihn\|ihm\|uns\|euch)$/.test(b)) return 'PPER';
				522	return 'PPER';
				523	}
				524
				525	// ---------------------------------------------------------------------------
				526	// Main line-by-line processing loop (mirrors conllu-cmc approach)
				527	// ---------------------------------------------------------------------------
				528
				529	function parseConllu(line) {
				530	// Handle foundry comment: change to 'gender'
				531	if (line.match('#\\s*foundry')) {
				532	if (line.match('=\\s*base')) {
				533	if (options.sparse) {
				534	global.standalone = true;
				535	}
				536	process.stdout.write('# foundry = gender\n');
				537	} else {
				538	process.stdout.write(`${line}\n`);
				539	}
				540	return;
				541	}
				542
				543	if (global.standalone) {
				544	if (line.match('^#\\s*filename')) {
				545	global.fileheader = `${line}\n`;
				546	return;
				547	} else if (line.match('^#\\s*text_id')) {
				548	global.fileheader += `${line}\n`;
				549	return;
				550	} else if (line.match('^#\\s*eo[ft]')) {
				551	process.stdout.write(`${line}\n`);
				552	return;
				553	} else if (line.match('^#')) {
				554	global.header += `${line}\n`;
				555	return;
				556	} else if (line.trim().match('^$')) {
				557	if (global.header === '') {
				558	process.stdout.write('\n');
				559	}
				560	global.header = '';
				561	return;
				562	}
				563	} else {
				564	if (!line.match('^\\d+')) {
				565	process.stdout.write(`${line}\n`);
				566	return;
				567	}
				568	}
				569
				570	const columns = line.trim().split('\t');
				571	// CoNLL-U columns (0-indexed):
				572	// 0:ID 1:FORM 2:LEMMA 3:UPOS 4:XPOS 5:FEATS 6:HEAD 7:DEPREL 8:DEPS 9:MISC
				573
				574	const word = columns[1];
Marc Kupietz	d2b9279	2026-03-10 08:11:06 +0100	[diff] [blame]	575	const tokenId = parseInt(columns[0], 10);
				576	const annotation = classifyToken(word, tokenId);
Marc Kupietz	b777f9d	2026-03-07 09:26:20 +0100	[diff] [blame]	577
				578	if (annotation) {
				579	// Replace lemma (col 2), UPOS (col 3), XPOS (col 4), FEATS (col 5)
				580	columns[2] = annotation.lemma;
				581	columns[3] = annotation.upos;
				582	columns[4] = annotation.xpos;
				583	columns[5] = annotation.feats;
				584
				585	if (global.standalone) {
				586	process.stdout.write(global.fileheader);
				587	process.stdout.write(global.header);
				588	global.header = global.fileheader = '';
				589	}
				590	process.stdout.write(columns.join('\t') + '\n');
				591	} else if (!global.standalone) {
				592	process.stdout.write(`${line}\n`);
				593	}
				594	}
				595
				596	rl.on('line', parseConllu);
				597	rl.on('close', () => process.exit(0));