package datok

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)
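
// NOTE: The helpers ttokenize and ttokenizeStr used throughout this file are
// assumed to be defined in a sibling test file of this package; judging by
// their usage below, ttokenize transduces a string with the given tokenizer
// and returns the tokens as a slice, while ttokenizeStr returns them joined
// by newlines.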
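
// s is a shared sample text covering abbreviations, hosts, URLs, email
// addresses, numbers, dates, omission words, and contractions; it is reused
// by the equivalence test and the benchmark below.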
var s string = `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`
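
// mat caches the full tokenizer matrix (testdata/tokenizer.matok); it is
// loaded lazily by the first test that needs it and shared across tests.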
var mat *MatrixTokenizer
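
// TestMatrixFullTokenizer builds a matrix tokenizer from a simple foma FST
// (testdata/simpletok.fst) and checks basic token splitting.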
func TestMatrixFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 11)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])
	assert.Equal("!", tokens[7])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal(7, len(tokens))
}

func TestMatrixSimpleString(t *testing.T) {
	assert := assert.New(t)
	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	mat := tok.ToMatrix()

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "ibauamt")
	assert.Equal("i", tokens[0])
	assert.Equal("bauamt", tokens[1])

	tokens = ttokenize(mat, w, "ibbauamt")
	assert.Equal("i", tokens[0])

	assert.Equal("b", tokens[1])
	assert.Equal("bauamt", tokens[2])

	tokens = ttokenize(mat, w, "bau")
	assert.Equal("bau", tokens[0])

	tokens = ttokenize(mat, w, "baum")
	assert.Equal("bau", tokens[0])
	assert.Equal("m", tokens[1])

	tokens = ttokenize(mat, w, "baudibauamt")
	assert.Equal("bau", tokens[0])
	assert.Equal("d", tokens[1])
	assert.Equal("i", tokens[2])
	assert.Equal("bauamt", tokens[3])
}

func TestMatrixReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	assert.Equal(ttokenizeStr(mat, "bau"), "bau")
	assert.Equal(ttokenizeStr(mat, "bad"), "bad")
	assert.Equal(ttokenizeStr(mat, "wald gehen"), "wald\ngehen")
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := mat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(230), n)
	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	assert.Equal(mat.array, mat2.array)
	assert.Equal(ttokenizeStr(mat2, "bau"), "bau")
	assert.Equal(ttokenizeStr(mat2, "bad"), "bad")
	assert.Equal(ttokenizeStr(mat2, "wald gehen"), "wald\ngehen")
}

func TestMatrixIgnorableMCS(t *testing.T) {
	assert := assert.New(t)

	// This test relies on final states. That's why it
	// no longer works correctly.

	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	mat := tok.ToMatrix()
	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// Only unambiguous when transducing strictly greedily!
	assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>a", tokens[2])
	assert.Equal(6, len(tokens))
}

func xTestMatrixReadWriteFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	_, err := mat.WriteTo(buf)
	assert.Nil(err)
	w.Reset()
	// assert.Equal(int64(248), n)

	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	// assert.Equal(mat.array, mat2.array)

	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())
}

func TestMatrixFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(9, len(tokens))

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}
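
// TestMatrixFullTokenizerMatrixSentenceSplitter checks sentence boundary
// detection; sentences are separated by empty lines in the output.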
func TestMatrixFullTokenizerMatrixSentenceSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("\n", sentences[2])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"Alter!\", sagte er: \"Komm nicht wieder!\" Geh!!! \"Lass!\" Dann ging er."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 5)
	assert.Equal("\"\nAlter\n!\n\"\n,\nsagte\ner\n:\n\"\nKomm\nnicht\nwieder\n!\n\"", sentences[0])
	assert.Equal("Geh\n!!!", sentences[1])
	assert.Equal("\"\nLass\n!\n\"", sentences[2])
	assert.Equal("Dann\nging\ner\n.", sentences[3])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("\"\nAusschalten\n!!!\n\"\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("\"\nHast\nDu\nnicht\ngehört\n???\n\"", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("“Ausschalten!!!”, sagte er. «Hast Du nicht gehört???»"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("“\nAusschalten\n!!!\n”\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("«\nHast\nDu\nnicht\ngehört\n???\n»", sentences[1])

	text := `»Meinetwegen. Denkst du, daß ich darauf warte? Das fehlte noch.
Übrigens, ich kriege schon einen und vielleicht bald. Da ist mir nicht
bange. Neulich erst hat mir der kleine Ventivegni von drüben gesagt:
'Fräulein Effi, was gilt die Wette, wir sind hier noch in diesem Jahre
zu Polterabend und Hochzeit.'«

»Und was sagtest du da?«`

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 8)
	assert.Equal("Neulich\nerst\nhat\nmir\nder\nkleine\nVentivegni\nvon\ndrüben\ngesagt\n:\n'\nFräulein\nEffi\n,\nwas\ngilt\ndie\nWette\n,\nwir\nsind\nhier\nnoch\nin\ndiesem\nJahre\nzu\nPolterabend\nund\nHochzeit\n.\n'\n«", sentences[5])
	assert.Equal("»\nUnd\nwas\nsagtest\ndu\nda\n?\n«", sentences[6])

	text = `»Nun, gib dich zufrieden, ich fange schon an ... Also Baron
Innstetten!`

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("»\nNun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
	assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])

	// Check parentheses at the end of sentences.
	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("(Er ging.) Und kam (später)."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("(\nEr\nging\n.\n)", sentences[0])
	assert.Equal("Und\nkam\n(\nspäter\n)\n.", sentences[1])
}

func TestMatrixFullTokenizerMatrixSentenceSplitterBug1(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	text := `Wüllersdorf war aufgestanden. »Ich finde es furchtbar, daß Sie recht haben, aber Sie haben recht. Ich quäle Sie nicht länger mit meinem 'Muß es sein?'. Die Welt ist einmal, wie sie ist, und die Dinge verlaufen nicht, wie wir wollen, sondern wie die andern wollen. Das mit dem 'Gottesgericht', wie manche hochtrabend versichern, ist freilich ein Unsinn, nichts davon, umgekehrt, unser Ehrenkultus ist ein Götzendienst, aber wir müssen uns ihm unterwerfen, solange der Götze gilt.«`

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 6)
	assert.Equal("Wüllersdorf\nwar\naufgestanden\n.", sentences[0])
	assert.Equal("»\nIch\nfinde\nes\nfurchtbar\n,\ndaß\nSie\nrecht\nhaben\n,\naber\nSie\nhaben\nrecht\n.", sentences[1])
	assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.", sentences[2])
	assert.Equal("Die\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[3])
	assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[4])
}
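
// TestMatrixFullTokenizerTokenSplitter exercises token-level rules:
// abbreviations, hosts, URLs, emails, numbers, dates, omissions, and more.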
func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(mat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(mat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(mat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(mat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(mat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(mat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(mat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(mat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(mat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(mat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(mat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(mat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(mat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(mat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(mat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(mat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(mat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(mat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(mat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(mat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(mat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(mat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(mat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(mat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(mat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(mat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(mat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(mat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1

	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(mat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(mat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(mat, w, "\"Das ist von C&A!\"")
	assert.Equal(tokens[0], "\"")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "\"")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(mat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(mat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(mat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(mat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(mat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOutputArchive
	tokens = ttokenize(mat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(mat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(mat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(mat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	tokens = ttokenize(mat, w, "Es ist gleich 2:30 Uhr.")
	assert.Equal("Es", tokens[0])
	assert.Equal("ist", tokens[1])
	assert.Equal("gleich", tokens[2])
	assert.Equal("2:30", tokens[3])
	assert.Equal("Uhr", tokens[4])
	assert.Equal(".", tokens[5])
	assert.Equal(6, len(tokens))

	tokens = ttokenize(mat, w, "Sie schwamm die Strecke in 00:00:57,34 00:57,341 0:57 Stunden.")
	assert.Equal("Sie", tokens[0])
	assert.Equal("schwamm", tokens[1])
	assert.Equal("die", tokens[2])
	assert.Equal("Strecke", tokens[3])
	assert.Equal("in", tokens[4])
	assert.Equal("00:00:57,34", tokens[5])
	assert.Equal("00:57,341", tokens[6])
	assert.Equal("0:57", tokens[7])
	assert.Equal("Stunden", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	// waste example
	tokens = ttokenize(mat, w, "Am 24.1.1806 feierte E. T. A. Hoffmann seinen 30. Geburtstag.")
	assert.Equal(tokens[0], "Am")
	assert.Equal(tokens[1], "24.1.1806")
	assert.Equal(tokens[2], "feierte")
	assert.Equal(tokens[3], "E.")
	assert.Equal(tokens[4], "T.")
	assert.Equal(tokens[5], "A.")
	assert.Equal(tokens[6], "Hoffmann")
	assert.Equal(tokens[7], "seinen")
	assert.Equal(tokens[8], "30.")
	assert.Equal(tokens[9], "Geburtstag")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// IP test
	tokens = ttokenize(mat, w, "Meine IP ist 192.178.168.55.")
	assert.Equal(tokens[0], "Meine")
	assert.Equal(tokens[1], "IP")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "192.178.168.55")
	assert.Equal(tokens[4], ".")
	assert.Equal(5, len(tokens))

	// XML entities
	tokens = ttokenize(mat, w, "Das ist&nbsp;1:30 Stunden&amp;20 Minuten zu spät &gt;.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "&nbsp;")
	assert.Equal(tokens[3], "1:30")
	assert.Equal(tokens[4], "Stunden")
	assert.Equal(tokens[5], "&amp;")
	assert.Equal(tokens[6], "20")
	assert.Equal(tokens[7], "Minuten")
	assert.Equal(tokens[8], "zu")
	assert.Equal(tokens[9], "spät")
	assert.Equal(tokens[10], "&gt;")
	assert.Equal(tokens[11], ".")
	assert.Equal(12, len(tokens))

	// Plusampersand compounds (1)
	tokens = ttokenize(mat, w, "Die 2G+-Regel soll weitere Covid-19-Erkrankungen reduzieren.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "2G+-Regel")
	assert.Equal(tokens[2], "soll")
	assert.Equal(tokens[3], "weitere")
	assert.Equal(tokens[4], "Covid-19-Erkrankungen")
	assert.Equal(tokens[5], "reduzieren")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	// Plusampersand compounds (2)
	tokens = ttokenize(mat, w, "Der Neu-C++-Programmierer.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Neu-C++-Programmierer")
	assert.Equal(tokens[2], ".")
	assert.Equal(3, len(tokens))

	// z.B.
	tokens = ttokenize(mat, w, "Dies sind z.B. zwei Wörter - z. B. auch.")
	assert.Equal(tokens[0], "Dies")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "z.")
	assert.Equal(tokens[3], "B.")
	assert.Equal(tokens[4], "zwei")
	assert.Equal(tokens[5], "Wörter")
	assert.Equal(tokens[6], "-")
	assert.Equal(tokens[7], "z.")
	assert.Equal(tokens[8], "B.")
	assert.Equal(tokens[9], "auch")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// Single quote handling
	tokens = ttokenize(mat, w, "Es heißt 'Leitungssportteams' und nicht anders.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "heißt")
	assert.Equal(tokens[2], "'")
	assert.Equal(tokens[3], "Leitungssportteams")
	assert.Equal(tokens[4], "'")
	assert.Equal(tokens[5], "und")
	assert.Equal(tokens[6], "nicht")
	assert.Equal(tokens[7], "anders")
	assert.Equal(tokens[8], ".")
	assert.Equal(9, len(tokens))

	// Apostrophe handling
	tokens = ttokenize(mat, w, "Das ist Nils’ Einkaufskorb bei McDonald's.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "Nils’")
	assert.Equal(tokens[3], "Einkaufskorb")
	assert.Equal(tokens[4], "bei")
	assert.Equal(tokens[5], "McDonald's")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}

func TestMatrixFullTokenizerXML(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	tokens = ttokenize(mat, w, "das<br class=\"br\" />ging.")
	assert.Equal("das", tokens[0])
	assert.Equal("<br class=\"br\" />", tokens[1])
	assert.Equal("ging", tokens[2])
	assert.Equal(".", tokens[3])
	assert.Equal(4, len(tokens))
}
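
// TestMatokDatokEquivalence verifies that the double array (.datok) and the
// matrix (.matok) representations produce identical output for the sample
// text s.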
func TestMatokDatokEquivalence(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}
	dat := LoadDatokFile("testdata/tokenizer.datok")

	r := strings.NewReader(s)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	// Transduce with double array representation
	dat.Transduce(r, w)

	datStr := w.String()

	r.Reset(s)
	w.Reset()

	// Transduce with matrix representation
	mat.Transduce(r, w)

	matStr := w.String()

	assert.Equal(datStr, matStr)
}

func TestMatrixFullTokenizerCallbackTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))

	matStr := w.String()

	assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
}

func TestMatrixFullTokenizerTextTreatment(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
}

func TestMatrixFullTokenizerLongText(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	text := `The Project Gutenberg EBook of Effi Briest, by Theodor Fontane

Copyright laws are changing all over the world. Be sure to check the
copyright laws for your country before downloading or redistributing
this or any other Project Gutenberg eBook.

This header should be the first thing seen when viewing this Project
Gutenberg file. Please do not remove it. Do not change or edit the
header without written permission.

Please read the "legal small print," and other information about the
eBook and Project Gutenberg at the bottom of this file. Included is
important information about your specific rights and restrictions in
how the file may be used. You can also find out about how to make a
donation to Project Gutenberg, and how to get involved.


**Welcome To The World of Free Plain Vanilla Electronic Texts**

**eBooks Readable By Both Humans and By Computers, Since 1971**

*****These eBooks Were Prepared By Thousands of Volunteers!*****


Title: Effi Briest

Author: Theodor Fontane

Release Date: March, 2004 [EBook #5323]
`

	assert.True(mat.Transduce(strings.NewReader(text), w))

	assert.True(strings.Contains(w.String(), "Release"))
}
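
// TestMatrixTrimming checks that leading whitespace is not reflected in the
// token output.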
func TestMatrixTrimming(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader(" Erste."), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\n", matStr)
}
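
// BenchmarkMatrixTransduce measures matrix transduction over the sample
// text s.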
func BenchmarkMatrixTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	r := strings.NewReader(s)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := mat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}
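
// ExampleMatrixTokenizer_Transduce is a minimal usage sketch (not part of the
// original test suite); it assumes testdata/tokenizer.matok is available, as
// in the tests above. Transduce writes one token per line; an empty line ends
// a sentence and further empty lines end the text, but the example framework
// trims trailing blank lines, so only the tokens are compared here. The
// expected output matches the assertion in the sentence splitter test.
func ExampleMatrixTokenizer_Transduce() {
	mat := LoadMatrixFile("testdata/tokenizer.matok")
	w := bytes.NewBuffer(make([]byte, 0, 2048))
	mat.Transduce(strings.NewReader("Der alte Mann."), w)
	fmt.Print(w.String())
	// Output:
	// Der
	// alte
	// Mann
	// .
}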