// scripts/update_emoji_db.js - KorAP/conllu-cmc-docker - Gitiles

 const https = require('https');
 const fs = require('fs');
 const path = require('path');

 // Source of truth: the emoji test file of the latest published Unicode version.
 const url = 'https://www.unicode.org/Public/UCD/latest/emoji/emoji-test.txt';
 // Generated lookup table, written relative to this script's directory.
 const outputPath = path.join(__dirname, '../src/emoji_data.json');

 console.log(`Downloading ${url}...`);

 // Download the file, parse it with parseEmojiData and write the result as
 // pretty-printed JSON to outputPath. Any failure exits with status 1.
 https.get(url, (res) => {
     // https.get neither follows redirects nor fails on HTTP error statuses, so
     // without this check an error/redirect page would be parsed as emoji data
     // and the output file would be overwritten with garbage.
     if (res.statusCode !== 200) {
         console.error(`Error downloading file: unexpected HTTP status ${res.statusCode}`);
         process.exit(1);
     }

     // Decode on the stream rather than per chunk: concatenating raw Buffer
     // chunks to a string decodes each chunk independently and corrupts any
     // multi-byte UTF-8 sequence that straddles a chunk boundary.
     res.setEncoding('utf8');

     let data = '';

     res.on('data', (chunk) => {
         data += chunk;
     });

     // A connection dropped mid-body is reported on the response stream, not
     // on the request object.
     res.on('error', (err) => {
         console.error('Error downloading file: ' + err.message);
         process.exit(1);
     });

     res.on('end', () => {
         console.log('Download complete. Parsing data...');
         const emojiData = parseEmojiData(data);
         fs.writeFileSync(outputPath, JSON.stringify(emojiData, null, 2));
         console.log(`Wrote ${Object.keys(emojiData).length} emojis to ${outputPath}`);
     });

 }).on('error', (err) => {
     console.error('Error downloading file: ' + err.message);
     process.exit(1);
 });

 /**
  * Parse the text of Unicode's emoji-test.txt into a lookup table keyed by
  * the emoji string.
  *
  * Data lines look like
  *   1F607 ; fully-qualified # 😇 E1.0 smiling face with halo
  * and are attributed to the most recent "# group:" / "# subgroup:" header.
  * Blank lines, other comment lines and lines without a ';' are ignored.
  *
  * @param {string} text - Full contents of emoji-test.txt.
  * @returns {Object<string, {g: string, s: string, q: string, v: string, n: string}>}
  *   Map from emoji to its normalized group (g), subgroup (s), qualification
  *   status (q), version such as "E1.0" (v) and normalized name (n). If the
  *   same emoji occurs more than once, the last occurrence wins. Entries whose
  *   version cannot be found keep empty v and n and trigger a warning.
  */
 function parseEmojiData(text) {
     const result = {};
     let currentGroup = '';
     let currentSubgroup = '';

     for (const line of text.split('\n')) {
         // Skip empty lines
         if (!line.trim()) continue;

         // Group header: everything after the 8-character "# group:" prefix
         if (line.startsWith('# group:')) {
             currentGroup = normalize(line.substring(8).trim());
             continue;
         }

         // Subgroup header: everything after the 11-character "# subgroup:" prefix
         if (line.startsWith('# subgroup:')) {
             currentSubgroup = normalize(line.substring(11).trim());
             continue;
         }

         // Any other comment line carries no data
         if (line.startsWith('#')) continue;

         // Only the FIRST ';' and the FIRST '#' after it are delimiters. Names
         // may themselves contain '#' (e.g. "keycap: #"), so splitting on every
         // '#' would cut the comment short and lose the version and name.
         const semicolonIndex = line.indexOf(';');
         if (semicolonIndex === -1) continue;

         const afterSemicolon = line.substring(semicolonIndex + 1);
         const hashIndex = afterSemicolon.indexOf('#');
         const statusText = hashIndex === -1
             ? afterSemicolon
             : afterSemicolon.substring(0, hashIndex);
         // A line without a trailing comment yields an empty comment instead of
         // crashing; it is reported by the version warning below.
         const commentPart = hashIndex === -1
             ? ''
             : afterSemicolon.substring(hashIndex + 1).trim();
         const status = normalize(statusText.trim()); // e.g., fully_qualified

         // Build the key from the code points rather than from the comment:
         // the hex list is the authoritative form of the sequence.
         const codePoints = line
             .substring(0, semicolonIndex)
             .trim()
             .split(/\s+/)
             .map((cp) => parseInt(cp, 16));
         const isValidSequence = codePoints.every(
             (cp) => Number.isInteger(cp) && cp >= 0 && cp <= 0x10FFFF
         );
         if (!isValidSequence) {
             // String.fromCodePoint would throw a RangeError and abort the run.
             console.warn(`Could not parse code points for line: ${line}`);
             continue;
         }
         const emojiKey = String.fromCodePoint(...codePoints);

         // Comment format: [Emoji] E[Version] [Name]
         const versionMatch = commentPart.match(/E(\d+\.\d+)/);
         let version = '';
         let name = '';

         if (versionMatch) {
             version = versionMatch[0];
             // Name is everything after the version token
             const nameIndex = versionMatch.index + version.length;
             name = normalize(commentPart.substring(nameIndex).trim());
         } else {
             // Fallback if regex fails (shouldn't happen with standard file)
             console.warn(`Could not parse version for line: ${line}`);
         }

         result[emojiKey] = {
             g: currentGroup,
             s: currentSubgroup,
             q: status,
             v: version,
             n: name
         };
     }
     return result;
 }

 /**
  * Turn a label into a lowercase, underscore-separated identifier.
  *
  * Every run of spaces and/or hyphens collapses into a single underscore;
  * all other characters are kept as they are after lowercasing.
  *
  * @param {string} str - Label to convert, e.g. "fully-qualified".
  * @returns {string} Converted label, e.g. "fully_qualified".
  */
 function normalize(str) {
     const separatorRun = /[ \-]+/g;
     const lowered = str.toLowerCase();
     // Splitting on the separator runs and re-joining replaces each run once.
     return lowered.split(separatorRun).join('_');
 }
	const https = require('https');
	const fs = require('fs');
	const path = require('path');

	// Source of truth: the emoji test file of the latest published Unicode version.
	const url = 'https://www.unicode.org/Public/UCD/latest/emoji/emoji-test.txt';
	// Generated lookup table, written relative to this script's directory.
	const outputPath = path.join(__dirname, '../src/emoji_data.json');

	console.log(`Downloading ${url}...`);

	// Download the file, parse it with parseEmojiData and write the result as
	// pretty-printed JSON to outputPath. Any failure exits with status 1.
	https.get(url, (res) => {
		// https.get neither follows redirects nor fails on HTTP error statuses,
		// so without this check an error/redirect page would be parsed as emoji
		// data and the output file would be overwritten with garbage.
		if (res.statusCode !== 200) {
			console.error(`Error downloading file: unexpected HTTP status ${res.statusCode}`);
			process.exit(1);
		}

		// Decode on the stream rather than per chunk: concatenating raw Buffer
		// chunks to a string decodes each chunk independently and corrupts any
		// multi-byte UTF-8 sequence that straddles a chunk boundary.
		res.setEncoding('utf8');

		let data = '';

		res.on('data', (chunk) => {
			data += chunk;
		});

		// A connection dropped mid-body is reported on the response stream,
		// not on the request object.
		res.on('error', (err) => {
			console.error('Error downloading file: ' + err.message);
			process.exit(1);
		});

		res.on('end', () => {
			console.log('Download complete. Parsing data...');
			const emojiData = parseEmojiData(data);
			fs.writeFileSync(outputPath, JSON.stringify(emojiData, null, 2));
			console.log(`Wrote ${Object.keys(emojiData).length} emojis to ${outputPath}`);
		});

	}).on('error', (err) => {
		console.error('Error downloading file: ' + err.message);
		process.exit(1);
	});

	/**
	 * Parse the text of Unicode's emoji-test.txt into a lookup table keyed by
	 * the emoji string.
	 *
	 * Data lines look like
	 *   1F607 ; fully-qualified # 😇 E1.0 smiling face with halo
	 * and are attributed to the most recent "# group:" / "# subgroup:" header.
	 * Blank lines, other comment lines and lines without a ';' are ignored.
	 *
	 * @param {string} text - Full contents of emoji-test.txt.
	 * @returns {Object<string, {g: string, s: string, q: string, v: string, n: string}>}
	 *   Map from emoji to its normalized group (g), subgroup (s), qualification
	 *   status (q), version such as "E1.0" (v) and normalized name (n). If the
	 *   same emoji occurs more than once, the last occurrence wins. Entries
	 *   whose version cannot be found keep empty v and n and trigger a warning.
	 */
	function parseEmojiData(text) {
		const result = {};
		let currentGroup = '';
		let currentSubgroup = '';

		for (const line of text.split('\n')) {
			// Skip empty lines
			if (!line.trim()) continue;

			// Group header: everything after the 8-character "# group:" prefix
			if (line.startsWith('# group:')) {
				currentGroup = normalize(line.substring(8).trim());
				continue;
			}

			// Subgroup header: everything after the 11-character "# subgroup:" prefix
			if (line.startsWith('# subgroup:')) {
				currentSubgroup = normalize(line.substring(11).trim());
				continue;
			}

			// Any other comment line carries no data
			if (line.startsWith('#')) continue;

			// Only the FIRST ';' and the FIRST '#' after it are delimiters.
			// Names may themselves contain '#' (e.g. "keycap: #"), so splitting
			// on every '#' would cut the comment short and lose version and name.
			const semicolonIndex = line.indexOf(';');
			if (semicolonIndex === -1) continue;

			const afterSemicolon = line.substring(semicolonIndex + 1);
			const hashIndex = afterSemicolon.indexOf('#');
			const statusText = hashIndex === -1
				? afterSemicolon
				: afterSemicolon.substring(0, hashIndex);
			// A line without a trailing comment yields an empty comment instead
			// of crashing; it is reported by the version warning below.
			const commentPart = hashIndex === -1
				? ''
				: afterSemicolon.substring(hashIndex + 1).trim();
			const status = normalize(statusText.trim()); // e.g., fully_qualified

			// Build the key from the code points rather than from the comment:
			// the hex list is the authoritative form of the sequence.
			const codePoints = line
				.substring(0, semicolonIndex)
				.trim()
				.split(/\s+/)
				.map((cp) => parseInt(cp, 16));
			const isValidSequence = codePoints.every(
				(cp) => Number.isInteger(cp) && cp >= 0 && cp <= 0x10FFFF
			);
			if (!isValidSequence) {
				// String.fromCodePoint would throw a RangeError and abort the run.
				console.warn(`Could not parse code points for line: ${line}`);
				continue;
			}
			const emojiKey = String.fromCodePoint(...codePoints);

			// Comment format: [Emoji] E[Version] [Name]
			const versionMatch = commentPart.match(/E(\d+\.\d+)/);
			let version = '';
			let name = '';

			if (versionMatch) {
				version = versionMatch[0];
				// Name is everything after the version token
				const nameIndex = versionMatch.index + version.length;
				name = normalize(commentPart.substring(nameIndex).trim());
			} else {
				// Fallback if regex fails (shouldn't happen with standard file)
				console.warn(`Could not parse version for line: ${line}`);
			}

			result[emojiKey] = {
				g: currentGroup,
				s: currentSubgroup,
				q: status,
				v: version,
				n: name
			};
		}
		return result;
	}

	/**
	 * Turn a label into a lowercase, underscore-separated identifier.
	 *
	 * Every run of spaces and/or hyphens collapses into a single underscore;
	 * all other characters are kept as they are after lowercasing.
	 *
	 * @param {string} str - Label to convert, e.g. "fully-qualified".
	 * @returns {string} Converted label, e.g. "fully_qualified".
	 */
	function normalize(str) {
		const separatorRun = /[ \-]+/g;
		const lowered = str.toLowerCase();
		// Splitting on the separator runs and re-joining replaces each run once.
		return lowered.split(separatorRun).join('_');
	}