Blame - src/main/java/de/ids_mannheim/korap/openthesaurus/ThesaurusDictionary.java - KorAP/Kalamar-Plugin-OpenThesaurus

Denis Logvinenko

babebbd

2021-11-08 23:15:13 +0100

[diff] [blame]

1

package de.ids_mannheim.korap.openthesaurus;

import java.io.*;

import java.util.*;

import java.util.regex.*;

Denis Logvinenko

4d9ffdf

2021-11-22 21:00:18 +0100

[diff] [blame^]

6

import java.util.stream.Collectors;

7

import java.util.stream.Stream;

Denis Logvinenko

babebbd

2021-11-08 23:15:13 +0100

[diff] [blame]

8

9

/**
 * Builds an in-memory synonym dictionary from an OpenThesaurus text export.
 *
 * <p>Each data line of the export holds one synset, with its words separated
 * by {@code ;}. The resulting dictionary maps every word to the list of
 * synsets it occurs in, where each synset is the list of the other words
 * found on the same line.
 */
public class ThesaurusDictionary {

    // The pattern is compiled once here for the sake of efficiency (it gets reused for every word).
    // Field names are kept as they were, since other classes in the package may refer to them.
    static final String regex = "(\\(.+\\) *)*([^\\(\\)]+[^\\(\\ )])( *\\(.+\\))*";
    static final Pattern withoutParenthesesPattern = Pattern.compile(regex);

    /**
     * Extracts the word without parenthesized parts from a string. The text file
     * provided by OpenThesaurus contains not only synonyms on a line, but also some
     * explanations/annotations, e.g. "(sich etwas) versagen (geh.)".
     * Those are useless for querying and are, therefore, removed together
     * with the surrounding spaces.
     *
     * @param str string to extract a substring from
     * @return the substring without the leading/trailing parenthesized parts, or
     *         {@code str} itself if it does not have the expected shape
     */
    static String cleanString(String str) {
        Matcher withoutParentheses = withoutParenthesesPattern.matcher(str);
        // group(2) is the part between the optional leading and trailing "(...)" annotations
        return withoutParentheses.matches() ? withoutParentheses.group(2) : str;
    }

    /**
     * Reads in the text file with OpenThesaurus data and filters it as follows:
     * 1) Reads the lines only up to the first line starting with "==". This is the
     *    beginning of the licensing section, which should be removed.
     * 2) Ignores all lines which start with a "#", as those are comments.
     *
     * <p>The data is decoded as UTF-8 regardless of the platform default charset,
     * so that umlauts are read correctly everywhere. The returned stream is lazy:
     * the input stream is read while the stream is being consumed, and it is not
     * closed by this method.
     *
     * @param thesaurusInputStream InputStream with the OpenThesaurus data
     * @return {@code Stream<String>} with the relevant lines from the OpenThesaurus data
     * @throws IOException declared for backward compatibility; read failures that
     *         occur while the stream is consumed surface as {@link UncheckedIOException}
     * @throws NullPointerException if {@code thesaurusInputStream} is {@code null}
     */
    public static Stream<String> getAndFilterLines(InputStream thesaurusInputStream) throws IOException {
        Objects.requireNonNull(thesaurusInputStream, "thesaurusInputStream must not be null");
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(thesaurusInputStream, java.nio.charset.StandardCharsets.UTF_8));
        return reader.lines()
                .takeWhile(line -> !line.startsWith("=="))
                .filter(line -> !line.startsWith("#"));
    }

    /**
     * Processes a single line of text from the OpenThesaurus data file,
     * creating a dictionary where every word on the line appears as a
     * key, having all the other words of the line as its list of values.
     * Explanations and usage notes such as "(ugs.)" are stripped from the
     * words via {@link #cleanString(String)}.
     *
     * @param line a single line of the OpenThesaurus file
     * @return map with every word of the line as a key and its neighbors,
     *         in line order, as the value; empty if the line holds fewer
     *         than two words
     */
    public static Map<String, List<String>> processLine(String line) {
        String[] rawWords = line.split(";");
        Map<String, List<String>> dictFromLine = new HashMap<>();
        if (rawWords.length < 2) {
            // a single word has no synonyms on this line
            return dictFromLine;
        }

        // clean every word exactly once instead of once per (key, synonym) pair
        List<String> words = new ArrayList<>(rawWords.length);
        for (String rawWord : rawWords) {
            words.add(cleanString(rawWord));
        }

        for (int i = 0; i < words.size(); i++) {
            // all words of the line except the one at position i
            List<String> synonyms = new ArrayList<>(words.size() - 1);
            synonyms.addAll(words.subList(0, i));
            synonyms.addAll(words.subList(i + 1, words.size()));
            dictFromLine.put(words.get(i), synonyms);
        }
        return dictFromLine;
    }

    /**
     * Combines all the above into a pipeline and returns a dictionary which
     * can be queried for a word to get the list of its synsets.
     *
     * <p>The pipeline runs sequentially: the source is a line reader followed by
     * {@code takeWhile}, which does not benefit from parallel execution, and the
     * result is the same either way. The given stream is not closed here; it
     * remains owned by the caller.
     *
     * @param thesaurusFileStream InputStream with the OpenThesaurus data
     * @return map with a word as a key and a List of Lists as value
     *         (each inner list represents a synset); never {@code null}
     * @throws UncheckedIOException if the data cannot be read
     * @throws NullPointerException if {@code thesaurusFileStream} is {@code null}
     */
    public static Map<String, List<List<String>>> createThesaurusDictionary(InputStream thesaurusFileStream) {
        try {
            return getAndFilterLines(thesaurusFileStream)
                    .flatMap(line -> processLine(line).entrySet().stream())
                    .collect(Collectors.groupingBy(Map.Entry::getKey,
                            Collectors.mapping(Map.Entry::getValue, Collectors.toList())));
        } catch (IOException e) {
            // report the failure to the caller instead of handing back a null dictionary
            throw new UncheckedIOException("Could not read OpenThesaurus data", e);
        }
    }

    /**
     * Overloads the InputStream version of the method for the case that a filepath
     * String to the OpenThesaurus data is provided (the alternative is when the default
     * cached data from resources/static are used, obtained via "getResourceAsStream").
     * The file is opened, read completely and closed again by this method.
     *
     * @param thesaurusPath filepath to the OpenThesaurus data file
     * @return map with a word as a key and a List of Lists as value
     *         (each inner list represents a synset); never {@code null}
     * @throws UncheckedIOException if the file does not exist or cannot be read
     */
    public static Map<String, List<List<String>>> createThesaurusDictionary(String thesaurusPath) {
        try (InputStream fileStream = new FileInputStream(thesaurusPath)) {
            return createThesaurusDictionary(fileStream);
        } catch (IOException e) {
            throw new UncheckedIOException("Could not read OpenThesaurus data from " + thesaurusPath, e);
        }
    }
}