blob: 9e0b5e312493594934b1c041ef3597fb68160e21 [file] [log] [blame]
Denis Logvinenkobabebbd2021-11-08 23:15:13 +01001package de.ids_mannheim.korap.openthesaurus;
2
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
Denis Logvinenkobabebbd2021-11-08 23:15:13 +01008
/**
 * Builds an in-memory synonym dictionary from the OpenThesaurus plain-text export.
 *
 * <p>Each data line of the export is one synset: words separated by {@code ;}, optionally
 * decorated with parenthesised annotations such as {@code (ugs.)}. The resulting dictionary
 * maps every word to the list of synsets (minus the word itself) it appears in.
 *
 * <p>Usage example:
 * <pre>{@code
 * InputStream in = ThesaurusDictionary.class.getResourceAsStream("/static/openthesaurus.txt");
 * Map<String, List<List<String>>> dict = ThesaurusDictionary.createThesaurusDictionary(in);
 * System.out.println(dict.get("laufen"));
 * }</pre>
 */
public class ThesaurusDictionary {

    // Optional leading "(...)" groups, the word itself, optional trailing "(...)" groups.
    // The word group is "any non-parenthesis chars, ending in a char that is neither a
    // parenthesis nor a space", so that one-character words (e.g. "K") are matched too.
    static final String regex = "(\\(.+\\) *)*([^\\(\\)]*[^\\(\\) ])( *\\(.+\\))*";

    // Compiled once because it is reused for every word of every line.
    static final Pattern withoutParenthesesPattern = Pattern.compile(regex);

    /**
     * Extracts the word from an entry that may carry parenthesised annotations.
     * The OpenThesaurus file contains not only synonyms but also explanations and
     * usage labels, e.g. {@code "(sich etwas) versagen (geh.)"}. Those are useless for
     * querying and are removed together with the spaces that separate them from the word.
     *
     * @param str entry to clean
     * @return the entry without leading/trailing parenthesised parts, or {@code str}
     *         unchanged if it does not have the shape "annotations, word, annotations"
     *         (for example when an annotation sits in the middle of the entry)
     */
    static String cleanString(String str) {
        Matcher withoutParentheses = withoutParenthesesPattern.matcher(str);
        if (withoutParentheses.matches()) {
            return withoutParentheses.group(2);
        }
        return str;
    }

    /**
     * Reads the OpenThesaurus data and keeps only the synset lines:
     * <ol>
     *   <li>reading stops at the first line starting with {@code "=="}, which marks the
     *       beginning of the licensing section;</li>
     *   <li>lines starting with {@code "#"} are comments and are dropped.</li>
     * </ol>
     * The returned stream is lazy: nothing is read until a terminal operation runs, and
     * read errors at that point surface as {@link UncheckedIOException}.
     *
     * @param thesaurusInputStream stream over the OpenThesaurus data file
     * @return stream of the relevant lines of the data file
     * @throws IOException declared for API compatibility; not thrown by the current
     *         implementation
     */
    public static Stream<String> getAndFilterLines(InputStream thesaurusInputStream) throws IOException {
        // Decode explicitly as UTF-8 instead of the platform default so that umlauts
        // are read identically on every machine.
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(thesaurusInputStream, StandardCharsets.UTF_8));
        return reader.lines()
                .takeWhile(line -> !line.startsWith("=="))
                .filter(line -> !line.startsWith("#"));
    }

    /**
     * Turns one synset line into a dictionary in which every word of the line is a key
     * and all the other words of the line are its synonyms.
     *
     * <p>Annotations are stripped via {@link #cleanString(String)}. Entries that are blank
     * or that clean to an already seen word are ignored, so a word is never listed as its
     * own synonym and synonym lists contain no duplicates.
     *
     * @param line a single line of the OpenThesaurus file, words separated by {@code ;}
     * @return map from each word to the other words of the line, in line order; empty if
     *         the line holds fewer than two distinct words
     */
    public static Map<String, List<String>> processLine(String line) {
        Map<String, List<String>> dictFromLine = new HashMap<>();

        // Clean every entry exactly once; LinkedHashSet keeps the order of first
        // appearance while dropping repeated words.
        Set<String> words = new LinkedHashSet<>();
        for (String entry : line.split(";")) {
            String word = cleanString(entry);
            if (!word.trim().isEmpty()) {
                words.add(word);
            }
        }

        // A lone word has no synonyms, so there is nothing to record.
        if (words.size() < 2) {
            return dictFromLine;
        }

        for (String key : words) {
            List<String> synonyms = new ArrayList<>(words.size() - 1);
            for (String other : words) {
                if (!other.equals(key)) {
                    synonyms.add(other);
                }
            }
            dictFromLine.put(key, synonyms);
        }
        return dictFromLine;
    }

    /**
     * Runs the whole pipeline (read, filter, split into synsets) and returns a dictionary
     * that can be queried with a word to obtain its synsets.
     *
     * <p>The stream is read to the end of the data section but is not closed here; the
     * caller that opened it remains responsible for closing it.
     *
     * @param thesaurusFileStream stream over the OpenThesaurus data file
     * @return map from a word to the list of its synsets, each synset being the list of
     *         the word's synonyms on one line; never {@code null}
     * @throws NullPointerException if {@code thesaurusFileStream} is {@code null}
     * @throws UncheckedIOException if the data cannot be read
     */
    public static Map<String, List<List<String>>> createThesaurusDictionary(InputStream thesaurusFileStream) {
        Objects.requireNonNull(thesaurusFileStream, "thesaurusFileStream must not be null");
        try {
            // Processed sequentially: the source is a single reader, and takeWhile is
            // documented as expensive on ordered parallel pipelines.
            return getAndFilterLines(thesaurusFileStream)
                    .flatMap(line -> processLine(line).entrySet().stream())
                    .collect(Collectors.groupingBy(Map.Entry::getKey,
                            Collectors.mapping(Map.Entry::getValue, Collectors.toList())));
        }
        catch (IOException e) {
            throw new UncheckedIOException("Cannot read OpenThesaurus data", e);
        }
    }

    /**
     * Convenience overload for when a file path to the OpenThesaurus data is given
     * (the alternative being the bundled data obtained via {@code getResourceAsStream}).
     * The file is opened, read completely and closed again.
     *
     * @param thesaurusPath path to the OpenThesaurus data file
     * @return map from a word to the list of its synsets, each synset being the list of
     *         the word's synonyms on one line; never {@code null}
     * @throws UncheckedIOException if the file cannot be opened or read
     */
    public static Map<String, List<List<String>>> createThesaurusDictionary(String thesaurusPath) {
        try (InputStream fileStream = new FileInputStream(thesaurusPath)) {
            return createThesaurusDictionary(fileStream);
        }
        catch (IOException e) {
            throw new UncheckedIOException(
                    "Cannot read OpenThesaurus data from " + thesaurusPath, e);
        }
    }
}