#!/usr/bin/env node
// Source: KorAP/conllu-gender, src/index.js
// (Gitiles blob e2f22aacbb625a9dae4d02792e986df542b6c4bb;
// blame: Marc Kupietz, commit b777f9d, 2026-03-07 09:26:20 +0100)

// conllu-gender
// Reads CoNLL-U format from stdin and annotates German gender-sensitive personal
// nouns, gendered determiners/pronouns, and neo-pronouns with correct POS (UPOS
// and XPOS/STTS), lemma, and morphological features.
//
// Based on the morphosyntactic analysis in:
// Ochs, S. (2026). Die morphosyntaktische Integration neuer Gendersuffixe:
// Eine korpusbasierte Analyse deutschsprachiger Pressetexte.
// Gender Linguistics, 2. doi: https://doi.org/10.65020/0619d927
//
// Gender marker types (following Ochs & Rüdiger 2025):
// Non-binary intended: Genderstern (*), Doppelpunkt (:), Unterstrich (_)
// Binary intended: Binnen-I (I), Klammern (()), Schrägstrich (/)
				16
				17	// ---------------------------------------------------------------------------
				18	// Regex patterns for gender-sensitive NOUNS
				19	// ---------------------------------------------------------------------------
				20	// Each regex captures: (base, marker, suffix)
				21	// suffix is either 'in' (singular) or 'innen' (plural)
				22
				23	// Genderstern: Lehrerin, Bürgerinnen, Ärzt*innen
				24	const nounGenderStarRegex = /^(.+)\*(in(?:nen)?)$/i;
				25	// Doppelpunkt: Lehrer:in, Bürger:innen
				26	const nounGenderColonRegex = /^(.+):(in(?:nen)?)$/i;
				27	// Unterstrich: Lehrer_in, Bürger_innen
				28	const nounGenderUnderscoreRegex = /^(.+)_(in(?:nen)?)$/i;
				29	// Binnen-I: LehrerIn, LehrerInnen (case-sensitive – the I is uppercase)
				30	// The base must end in a lowercase letter to avoid matching regular proper nouns
				31	// that start a sentence. We require at least one lowercase letter before the I.
				32	const nounBinnenIRegex = /^([A-ZÄÖÜ][a-zäöüß].*?[a-zäöüß])(In(?:nen)?)$/;
				33	// Klammern: Lehrer(in), Lehrer(innen)
				34	const nounKlammernRegex = /^(.+)\((in(?:nen)?)\)$/i;
				35	// Schrägstrich: Lehrer/in, Lehrer/innen, Lehrer/-in, Lehrer/-innen
				36	const nounSchraegstrichRegex = /^(.+)\/-?(in(?:nen)?)$/i;
				37
				38	// ---------------------------------------------------------------------------
				39	// Regex patterns for gender-sensitive DETERMINERS / PRONOUNS
				40	// (jeder, eine, derdie, desr, eines*r, etc.)
				41	// ---------------------------------------------------------------------------
				42	// Inflected forms of articles, indefinite articles, and pronouns with gender
				43	// markers. Non-binary intended markers (*, :, _) are the most common.
				44	// We match: any known determiner/pronoun stem + gender_marker + ending
				45
				46	// Gendered forms like: jeder, jede:r, jede_r, keine, kein:e, ein*e, ein:e,
				47	// ein_e, derdie, dieder, desr, desder, demder, dendie, etc.
				48	// Strategy: match known Determiner/Pronoun base forms followed by gender marker
				49	// and a short inflectional ending.
				50
				51	// Combined pattern: known pronoun/det base + non-binary marker + short ending
				52	// This covers forms documented in Ochs (2026) §7.3.2–7.3.4
				53	const detNonBinaryRegex = /^(jede[mn]?\|jede[rs]?\|keine?[mrns]?\|eine?[mrns]?\|de[mrns]\|die\|das\|de[rs]\|dem\|den\|aller?\|manche[mrns]?\|solche[mrns]?\|welche[mrns]?\|irgendeine[mrns]?)([*:_])([a-zäöürs]{1,3})$/i;
				54
				55	// Binnen-I variants of determiners: einE, jedeR, jedeN, JedeR, etc.
				56	// Base (lowercase or title-case) + uppercase inflection letter(s)
				57	const detBinnenIRegex = /^(jede[mn]?\|keine?[mrns]?\|eine?[mrns]?\|alle?\|manche?\|solche?\|welche?)([RNSEM]{1,2})$/;
				58
				59	// Doppelform determiners merged with Schrägstrich (the only binary-intended merge
				60	// character for articles per Ochs 2026): ein/e, die/der, einen/r, etc.
				61	// Non-binary markers (*, :, _) are handled by detNonBinaryRegex with Gender=NonBin.
				62	const detDoppelformRegex = /^(der\|die\|das\|dem\|den\|des\|ein\|eine\|einen\|einem\|einer\|eines)\/(der\|die\|das\|dem\|den\|des\|ein\|eine\|einen\|einem\|einer\|eines\|[rns])$/i;
				63
				64	// ---------------------------------------------------------------------------
				65	// Neo-pronouns (new gender-neutral pronouns in German)
				66	// ---------------------------------------------------------------------------
				67	// Gendered-star pronoun pairs (sieer, ersie, ihr*sein, etc.)
				68	const neopronGenderStarPairRegex = /^(sie\|er\|ihr\|ihn?\|ihm?\|dich\|sich\|mich\|mir\|uns\|euch\|ihnen\|seinen?\|ihrem?\|deren?\|denen)([*:_])(sie\|er\|ihr\|ihn?\|ihm?\|dich\|sich\|mich\|mir\|uns\|euch\|ihnen\|seinen?\|ihrem?\|deren?\|denen)$/i;
				69
				70	// ---------------------------------------------------------------------------
				71	// Helpers
				72	// ---------------------------------------------------------------------------
				73
				74	/**
				75	* Determine if a suffix string represents singular or plural.
				76	* 'in' (length 2) → Sing
				77	* 'innen' (length 5) → Plur
				78	* Works case-insensitively (In / Innen for Binnen-I forms).
				79	*/
				80	function getNumber(suffix) {
				81	return /^innen$/i.test(suffix) ? 'Plur' : 'Sing';
				82	}
				83
				84	/**
				85	* Build the canonical lemma for a gendered noun.
				86	* The lemma is always the nominative singular form, preserving the original
				87	* gender marker. This follows the convention that the lemma reflects the
				88	* citation form of the gendered derivate (Ochs 2026 §2).
				89	*
				90	* @param {string} base - derivation base (before the gender marker)
				91	* @param {string} marker - gender marker character(s), e.g. '*', ':', '_', 'I',
				92	* '(in)', '/in', etc.
				93	* @param {string} markerType - 'star'\|'colon'\|'underscore'\|'binnenI'\|
				94	* 'klammern'\|'schraegstrich'
				95	*/
				96	function buildNounLemma(base, marker, markerType) {
				97	switch (markerType) {
				98	case 'star': return base + '*in';
				99	case 'colon': return base + ':in';
				100	case 'underscore': return base + '_in';
				101	case 'binnenI': return base + 'In';
				102	case 'klammern': return base + '(in)';
				103	case 'schraegstrich':return base + '/in';
				104	default: return base + marker + 'in';
				105	}
				106	}
				107
				108	/**
				109	* Build the morphological features string for a gendered noun token.
				110	* Per CoNLL-U conventions, features are sorted alphabetically by feature name.
				111	*
				112	* Gender values used (extending standard UD practice for German):
				113	* NonBin – non-binary intended forms (*, :, _)
				114	* Masc,Fem – binary inclusive forms (I, (), /)
				115	*
				116	* Case is not set here because it cannot be determined from surface form alone
				117	* for the vast majority of gendered noun tokens (Ochs 2026 §7.1).
				118	*
				119	* @param {string} number - 'Sing' \| 'Plur'
				120	* @param {string} markerType - see buildNounLemma
				121	*/
				122	function buildNounFeatures(number, markerType) {
				123	const genderIsNonBinary = ['star', 'colon', 'underscore'].includes(markerType);
				124	const genderIsBinary = ['binnenI', 'klammern', 'schraegstrich'].includes(markerType);
				125
				126	const feats = [];
				127	if (genderIsNonBinary) {
				128	feats.push('Gender=NonBin');
				129	} else if (genderIsBinary) {
				130	feats.push('Gender=Masc,Fem');
				131	}
				132	feats.push('Number=' + number);
				133	return feats.join('\|');
				134	}
				135
				136	// ---------------------------------------------------------------------------
				137	// Command-line interface (mirrors conllu-cmc)
				138	// ---------------------------------------------------------------------------
				139
				140	const optionDefinitions = [
				141	{ name: 'sparse', alias: 's', type: Boolean,
				142	description: 'Print only the tokens that received new annotations.' },
				143	{ name: 'help', alias: 'h', type: Boolean,
				144	description: 'Print this usage guide.' },
				145	];
				146
				147	const sections = [
				148	{
				149	header: 'conllu-gender',
				150	content: 'Reads CoNLL-U format from stdin and annotates German gender-sensitive ' +
				151	'personal nouns, gendered determiners/pronouns, and neo-pronouns with ' +
				152	'correct POS, lemma, and morphological features. Writes CoNLL-U to stdout.'
				153	},
				154	{
				155	header: 'Synopsis',
				156	content: '$ conllu-gender [-s] < input.conllu > output.conllu'
				157	},
				158	{
				159	header: 'Options',
				160	optionList: optionDefinitions
				161	}
				162	];
				163
				164	const getUsage = require('command-line-usage');
				165	const commandLineArgs = require('command-line-args');
				166
				167	var options;
				168	try {
				169	options = commandLineArgs(optionDefinitions);
				170	} catch (e) {
				171	console.error(e.message);
				172	options = { help: true };
				173	}
				174
				175	if (options.help) {
				176	const usage = getUsage(sections);
				177	console.log(usage);
				178	process.exit(0);
				179	}
				180
				181	// ---------------------------------------------------------------------------
				182	// CoNLL-U processing
				183	// ---------------------------------------------------------------------------
				184
				185	const readline = require('readline');
				186	global.header = '';
				187	global.fileheader = '';
				188	global.standalone = false;
				189
				190	const rl = readline.createInterface({
				191	input: process.stdin,
				192	output: process.stdout,
				193	terminal: false,
				194	});
				195
				196	/**
				197	* Attempt to annotate a single CoNLL-U token (word form).
				198	* Returns an annotation object on success, or null if the token is not a
				199	* recognised gender-sensitive form.
				200	*
				201	* Annotation object shape:
				202	* { lemma, upos, xpos, feats }
				203	*/
				204	function classifyToken(word) {
				205	let m;
				206
				207	// ------------------------------------------------------------------
				208	// 1. Gender-sensitive NOUNS
				209	// ------------------------------------------------------------------
				210
				211	// Genderstern (non-binary intended)
				212	if ((m = nounGenderStarRegex.exec(word))) {
				213	const [, base, suffix] = m;
				214	const number = getNumber(suffix);
				215	return {
				216	lemma: buildNounLemma(base, '*', 'star'),
				217	upos: 'NOUN',
				218	xpos: 'NN',
				219	feats: buildNounFeatures(number, 'star'),
				220	};
				221	}
				222
				223	// Doppelpunkt (non-binary intended)
				224	if ((m = nounGenderColonRegex.exec(word))) {
				225	const [, base, suffix] = m;
				226	const number = getNumber(suffix);
				227	return {
				228	lemma: buildNounLemma(base, ':', 'colon'),
				229	upos: 'NOUN',
				230	xpos: 'NN',
				231	feats: buildNounFeatures(number, 'colon'),
				232	};
				233	}
				234
				235	// Unterstrich (non-binary intended)
				236	if ((m = nounGenderUnderscoreRegex.exec(word))) {
				237	const [, base, suffix] = m;
				238	const number = getNumber(suffix);
				239	return {
				240	lemma: buildNounLemma(base, '_', 'underscore'),
				241	upos: 'NOUN',
				242	xpos: 'NN',
				243	feats: buildNounFeatures(number, 'underscore'),
				244	};
				245	}
				246
				247	// Schrägstrich (binary intended) – before Binnen-I to avoid false matches
				248	if ((m = nounSchraegstrichRegex.exec(word))) {
				249	const [, base, suffix] = m;
				250	const number = getNumber(suffix);
				251	return {
				252	lemma: buildNounLemma(base, '/', 'schraegstrich'),
				253	upos: 'NOUN',
				254	xpos: 'NN',
				255	feats: buildNounFeatures(number, 'schraegstrich'),
				256	};
				257	}
				258
				259	// Klammern (binary intended)
				260	if ((m = nounKlammernRegex.exec(word))) {
				261	const [, base, suffix] = m;
				262	const number = getNumber(suffix);
				263	return {
				264	lemma: buildNounLemma(base, '()', 'klammern'),
				265	upos: 'NOUN',
				266	xpos: 'NN',
				267	feats: buildNounFeatures(number, 'klammern'),
				268	};
				269	}
				270
				271	// Binnen-I (binary intended) – requires at least one lowercase letter before
				272	// the I to distinguish from sentence-initial capitalisation
				273	if ((m = nounBinnenIRegex.exec(word))) {
				274	const [, base, suffix] = m;
				275	const number = getNumber(suffix);
				276	return {
				277	lemma: buildNounLemma(base, 'I', 'binnenI'),
				278	upos: 'NOUN',
				279	xpos: 'NN',
				280	feats: buildNounFeatures(number, 'binnenI'),
				281	};
				282	}
				283
				284	// ------------------------------------------------------------------
				285	// 2. Gender-sensitive DETERMINERS / PRONOUNS
				286	// ------------------------------------------------------------------
				287
				288	// Doppelform determiners merged with gender marker (derdie, desr, etc.)
				289	// Checked before detNonBinaryRegex because die*der is a Doppelform, not purely
				290	// non-binary intended, and should receive Gender=Masc,Fem features.
				291	if ((m = detDoppelformRegex.exec(word))) {
				292	const [fullMatch, form1] = m;
				293	return {
				294	lemma: fullMatch,
				295	upos: 'DET',
				296	xpos: inferDetXpos(form1),
				297	feats: 'Gender=Masc,Fem',
				298	};
				299	}
				300
				301	// Non-binary marker determiners (jede*r, ein:e, kein_e, etc.)
				302	if ((m = detNonBinaryRegex.exec(word))) {
				303	const [, detBase, marker, ending] = m;
				304	// Preserve full base + marker + ending as lemma (no stripping needed;
				305	// gendered determiners have no established uninflected citation form).
				306	return {
				307	lemma: detBase + marker + ending,
				308	upos: 'DET',
				309	xpos: inferDetXpos(detBase),
				310	feats: 'Gender=NonBin',
				311	};
				312	}
				313
				314	// Binnen-I determiners (einE, JedeR, jedeN, etc.)
				315	if ((m = detBinnenIRegex.exec(word))) {
				316	const [, detBase, endings] = m;
				317	return {
				318	lemma: detBase + endings,
				319	upos: 'DET',
				320	xpos: inferDetXpos(detBase),
				321	feats: 'Gender=Masc,Fem',
				322	};
				323	}
				324
				325	// ------------------------------------------------------------------
				326	// 3. Neo-pronouns / gendered pronoun pairs
				327	// ------------------------------------------------------------------
				328
				329	if ((m = neopronGenderStarPairRegex.exec(word))) {
				330	const [fullMatch, pron1, marker, pron2] = m;
				331	const markerType = marker === '*' ? 'star' : marker === ':' ? 'colon' : 'underscore';
				332	return {
				333	lemma: fullMatch,
				334	upos: 'PRON',
				335	xpos: inferPronXpos(pron1),
				336	feats: markerType === 'star' \|\| markerType === 'colon' \|\| markerType === 'underscore'
				337	? 'Gender=NonBin' : 'Gender=Masc,Fem',
				338	};
				339	}
				340
				341	return null;
				342	}
				343
				344	/**
				345	* Infer STTS XPOS tag for a determiner/article base.
				346	*/
				347	function inferDetXpos(base) {
				348	const b = base.toLowerCase();
				349	if (/^(der\|die\|das\|de[mrns])/.test(b)) return 'ART';
				350	if (/^(ein\|eine\|einen\|einem\|einer\|eines\|kein\|keine\|keinen\|keinem\|keiner\|keines)/.test(b)) return 'ART';
				351	if (/^(jede\|jeder\|jeden\|jedem\|jedes\|jedem)/.test(b)) return 'PIAT';
				352	if (/^(alle\|aller\|allen\|alles\|allem)/.test(b)) return 'PIAT';
				353	if (/^(manche\|mancher\|manchen\|manchem\|manches)/.test(b)) return 'PIAT';
				354	if (/^(solche\|solcher\|solchen\|solchem\|solches)/.test(b)) return 'PIAT';
				355	if (/^(welche\|welcher\|welchen\|welchem\|welches)/.test(b)) return 'PWAT';
				356	if (/^(irgend)/.test(b)) return 'PIAT';
				357	return 'ART';
				358	}
				359
				360	/**
				361	* Infer STTS XPOS tag for a personal pronoun base.
				362	*/
				363	function inferPronXpos(base) {
				364	const b = base.toLowerCase();
				365	if (/^(ich\|du\|er\|sie\|es\|wir\|ihr\|sie\|mich\|mir\|dich\|dir\|sich\|ihn\|ihm\|uns\|euch)$/.test(b)) return 'PPER';
				366	return 'PPER';
				367	}
				368
				369	// ---------------------------------------------------------------------------
				370	// Main line-by-line processing loop (mirrors conllu-cmc approach)
				371	// ---------------------------------------------------------------------------
				372
				373	function parseConllu(line) {
				374	// Handle foundry comment: change to 'gender'
				375	if (line.match('#\\s*foundry')) {
				376	if (line.match('=\\s*base')) {
				377	if (options.sparse) {
				378	global.standalone = true;
				379	}
				380	process.stdout.write('# foundry = gender\n');
				381	} else {
				382	process.stdout.write(`${line}\n`);
				383	}
				384	return;
				385	}
				386
				387	if (global.standalone) {
				388	if (line.match('^#\\s*filename')) {
				389	global.fileheader = `${line}\n`;
				390	return;
				391	} else if (line.match('^#\\s*text_id')) {
				392	global.fileheader += `${line}\n`;
				393	return;
				394	} else if (line.match('^#\\s*eo[ft]')) {
				395	process.stdout.write(`${line}\n`);
				396	return;
				397	} else if (line.match('^#')) {
				398	global.header += `${line}\n`;
				399	return;
				400	} else if (line.trim().match('^$')) {
				401	if (global.header === '') {
				402	process.stdout.write('\n');
				403	}
				404	global.header = '';
				405	return;
				406	}
				407	} else {
				408	if (!line.match('^\\d+')) {
				409	process.stdout.write(`${line}\n`);
				410	return;
				411	}
				412	}
				413
				414	const columns = line.trim().split('\t');
				415	// CoNLL-U columns (0-indexed):
				416	// 0:ID 1:FORM 2:LEMMA 3:UPOS 4:XPOS 5:FEATS 6:HEAD 7:DEPREL 8:DEPS 9:MISC
				417
				418	const word = columns[1];
				419	const annotation = classifyToken(word);
				420
				421	if (annotation) {
				422	// Replace lemma (col 2), UPOS (col 3), XPOS (col 4), FEATS (col 5)
				423	columns[2] = annotation.lemma;
				424	columns[3] = annotation.upos;
				425	columns[4] = annotation.xpos;
				426	columns[5] = annotation.feats;
				427
				428	if (global.standalone) {
				429	process.stdout.write(global.fileheader);
				430	process.stdout.write(global.header);
				431	global.header = global.fileheader = '';
				432	}
				433	process.stdout.write(columns.join('\t') + '\n');
				434	} else if (!global.standalone) {
				435	process.stdout.write(`${line}\n`);
				436	}
				437	}
				438
				439	rl.on('line', parseConllu);
				440	rl.on('close', () => process.exit(0));