Blame - scripts/update_emoji_db.js - KorAP/conllu-cmc-docker - Gitiles

blob: 1a823396b2ea1e193f6f6d396f7a60c0e99678a2 [file] [log] [blame]

Marc Kupietz	30634ff	2025-12-18 11:39:03 +0100	[diff] [blame^]	1	const https = require('https');
				2	const fs = require('fs');
				3	const path = require('path');
				4
				// Source: the Unicode Consortium's emoji test file. The parsed result is
				// written as pretty-printed JSON to ../src/emoji_data.json relative to this
				// script's directory.
				const url = 'https://www.unicode.org/Public/UCD/latest/emoji/emoji-test.txt';
				const outputPath = path.join(__dirname, '../src/emoji_data.json');

				console.log(`Downloading ${url}...`);

				https.get(url, (res) => {
				  // Only a 200 response carries the emoji list. Parsing a redirect or an
				  // error page would overwrite the existing database with useless output.
				  if (res.statusCode !== 200) {
				    res.resume(); // discard the body so the socket is released
				    console.error(`Error downloading file: unexpected HTTP status ${res.statusCode}`);
				    process.exit(1);
				  }

				  // Decode at the stream level. Without this each chunk is a Buffer that is
				  // stringified on its own, so a multi-byte UTF-8 character that straddles a
				  // chunk boundary would be turned into replacement characters.
				  res.setEncoding('utf8');

				  let data = '';

				  res.on('data', (chunk) => {
				    data += chunk;
				  });

				  res.on('end', () => {
				    console.log('Download complete. Parsing data...');
				    const emojiData = parseEmojiData(data);
				    fs.writeFileSync(outputPath, JSON.stringify(emojiData, null, 2));
				    console.log(`Wrote ${Object.keys(emojiData).length} emojis to ${outputPath}`);
				  });

				  // A failure while the body is streaming is reported on the response,
				  // not on the request object.
				  res.on('error', (err) => {
				    console.error(`Error downloading file: ${err.message}`);
				    process.exit(1);
				  });
				}).on('error', (err) => {
				  console.error(`Error downloading file: ${err.message}`);
				  process.exit(1);
				});
				28
				/**
				 * Parse the contents of Unicode's emoji-test.txt into a lookup table.
				 *
				 * Recognised lines:
				 *   "# group: NAME"      - sets the group for the entries that follow
				 *   "# subgroup: NAME"   - sets the subgroup for the entries that follow
				 *   "CODE POINTS ; status # EMOJI Eversion name" - one emoji entry
				 * Blank lines and every other "#" comment line are ignored.
				 *
				 * @param {string} text - Full text of the file.
				 * @returns {Object<string, {g: string, s: string, q: string, v: string, n: string}>}
				 *   Map keyed by the emoji string (built from the code points), holding the
				 *   normalized group (g), subgroup (s), qualification status (q), the
				 *   version as written, e.g. "E1.0" (v), and the normalized name (n).
				 *   v and n are empty strings when the comment part is missing or has no
				 *   recognisable version token; a warning is logged in that case.
				 */
				function parseEmojiData(text) {
				  const result = {};
				  let currentGroup = '';
				  let currentSubgroup = '';

				  for (const rawLine of text.split('\n')) {
				    // Trimming also removes a trailing "\r" from CRLF input.
				    const line = rawLine.trim();
				    if (!line) continue;

				    if (line.startsWith('# group:')) {
				      currentGroup = normalize(line.substring(8).trim());
				      continue;
				    }

				    if (line.startsWith('# subgroup:')) {
				      currentSubgroup = normalize(line.substring(11).trim());
				      continue;
				    }

				    // Any other comment line carries no data.
				    if (line.startsWith('#')) continue;

				    // Data line: code_points ; status # emoji version name
				    // Example: 1F607 ; fully-qualified # 😇 E1.0 smiling face with halo
				    const semicolonAt = line.indexOf(';');
				    if (semicolonAt === -1) continue;

				    // Split at the FIRST '#' only: the keycap-# emoji and its name contain
				    // '#' themselves ("# #️⃣ E0.6 keycap: #"), so splitting on every '#'
				    // would throw the version and name away.
				    const afterSemicolon = line.slice(semicolonAt + 1);
				    const hashAt = afterSemicolon.indexOf('#');
				    const statusText = hashAt === -1 ? afterSemicolon : afterSemicolon.slice(0, hashAt);
				    const commentPart = hashAt === -1 ? '' : afterSemicolon.slice(hashAt + 1).trim();
				    const status = normalize(statusText.trim());

				    // Build the key from the code points rather than from the comment text.
				    // Split on whitespace runs so repeated spaces do not yield empty tokens.
				    const codePoints = line.slice(0, semicolonAt).trim().split(/\s+/);
				    if (!codePoints.every((cp) => /^[0-9A-Fa-f]+$/.test(cp))) {
				      console.warn(`Could not parse code points for line: ${line}`);
				      continue;
				    }
				    const emojiKey = String.fromCodePoint(...codePoints.map((cp) => parseInt(cp, 16)));

				    // Comment format: [Emoji] E[Version] [Name]
				    const versionMatch = /E(\d+\.\d+)/.exec(commentPart);
				    let version = '';
				    let name = '';

				    if (versionMatch) {
				      version = versionMatch[0];
				      // The name is everything after the matched version token.
				      const nameStart = versionMatch.index + version.length;
				      name = normalize(commentPart.slice(nameStart).trim());
				    } else {
				      // Not expected with the standard file; keep the entry with empty v/n.
				      console.warn(`Could not parse version for line: ${line}`);
				    }

				    result[emojiKey] = {
				      g: currentGroup,
				      s: currentSubgroup,
				      q: status,
				      v: version,
				      n: name,
				    };
				  }

				  return result;
				}
				100
				/**
				 * Turn a label into a lowercase, underscore-separated token.
				 *
				 * The string is lowercased, then every run of spaces and/or hyphens is
				 * collapsed into a single underscore. All other characters are kept.
				 *
				 * @param {string} str - Label to convert.
				 * @returns {string} The converted label.
				 */
				function normalize(str) {
				  const lowered = str.toLowerCase();
				  const separatorRun = /[ \-]+/g;
				  return lowered.replace(separatorRun, '_');
				}