| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 1 | package datokenizer | 
|  | 2 |  | 
|  | 3 | import ( | 
| Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 4 | "bytes" | 
| Akron | bd40680 | 2021-08-11 18:39:13 +0200 | [diff] [blame] | 5 | "fmt" | 
|  | 6 | "os" | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 7 | "regexp" | 
| Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 8 | "strings" | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 9 | "testing" | 
|  | 10 |  | 
|  | 11 | "github.com/stretchr/testify/assert" | 
|  | 12 | ) | 
|  | 13 |  | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 14 | func tmatch(dat *DaTokenizer, s string) bool { | 
|  | 15 | b := make([]byte, 0, 2048) | 
|  | 16 | w := bytes.NewBuffer(b) | 
|  | 17 | return dat.Transduce(strings.NewReader(s), w) | 
|  | 18 | } | 
|  | 19 |  | 
|  | 20 | func ttokenize(dat *DaTokenizer, w *bytes.Buffer, str string) []string { | 
|  | 21 | w.Reset() | 
|  | 22 | ok := dat.Transduce(strings.NewReader(str), w) | 
|  | 23 | if !ok { | 
|  | 24 | return []string{} | 
|  | 25 | } | 
|  | 26 | obj := regexp.MustCompile("\n+") | 
|  | 27 |  | 
|  | 28 | tokens := obj.Split(w.String(), -1) | 
|  | 29 | return tokens[:len(tokens)-1] | 
|  | 30 | } | 
|  | 31 |  | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 32 | func TestSimpleString(t *testing.T) { | 
|  | 33 | assert := assert.New(t) | 
|  | 34 |  | 
|  | 35 | // bau | bauamt | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame] | 36 | tok := LoadFomaFile("testdata/bauamt.fst") | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 37 | dat := tok.ToDoubleArray() | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 38 | assert.True(tmatch(dat, "bau")) | 
|  | 39 | assert.True(tmatch(dat, "bauamt")) | 
|  | 40 | assert.False(tmatch(dat, "baum")) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 41 | } | 
| Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 42 |  | 
|  | 43 | func TestSimpleBranches(t *testing.T) { | 
|  | 44 | assert := assert.New(t) | 
|  | 45 |  | 
|  | 46 | // (bau | wahl) (amt | en) | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame] | 47 | tok := LoadFomaFile("testdata/wahlamt.fst") | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 48 | dat := tok.ToDoubleArray() | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 49 | assert.False(tmatch(dat, "bau")) | 
|  | 50 | assert.True(tmatch(dat, "bauamt")) | 
|  | 51 | assert.True(tmatch(dat, "wahlamt")) | 
|  | 52 | assert.True(tmatch(dat, "bauen")) | 
|  | 53 | assert.True(tmatch(dat, "wahlen")) | 
|  | 54 | assert.False(tmatch(dat, "baum")) | 
| Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 55 | } | 
| Akron | 730a79c | 2021-08-03 11:05:29 +0200 | [diff] [blame] | 56 |  | 
|  | 57 | func TestSimpleTokenizer(t *testing.T) { | 
|  | 58 | assert := assert.New(t) | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame] | 59 | tok := LoadFomaFile("testdata/simpletok.fst") | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 60 | dat := tok.ToDoubleArray() | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 61 | assert.True(tmatch(dat, "bau")) | 
|  | 62 | assert.True(tmatch(dat, "bad")) | 
|  | 63 | assert.True(tmatch(dat, "wald gehen")) | 
| Akron | 730a79c | 2021-08-03 11:05:29 +0200 | [diff] [blame] | 64 | } | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 65 |  | 
| Akron | 068874c | 2021-08-04 15:19:56 +0200 | [diff] [blame] | 66 | func TestSimpleTokenizerTransduce(t *testing.T) { | 
| Akron | 84d68e6 | 2021-08-04 17:06:52 +0200 | [diff] [blame] | 67 | assert := assert.New(t) | 
|  | 68 | tok := LoadFomaFile("testdata/simpletok.fst") | 
| Akron | 84d68e6 | 2021-08-04 17:06:52 +0200 | [diff] [blame] | 69 | dat := tok.ToDoubleArray() | 
| Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 70 |  | 
|  | 71 | r := strings.NewReader("  wald   gehen Da kann\t man was \"erleben\"!") | 
|  | 72 | b := make([]byte, 0, 2048) | 
|  | 73 | w := bytes.NewBuffer(b) | 
| Akron | 524c543 | 2021-08-05 14:14:27 +0200 | [diff] [blame] | 74 | var tokens []string | 
| Akron | 524c543 | 2021-08-05 14:14:27 +0200 | [diff] [blame] | 75 | dat.Transduce(r, w) | 
|  | 76 | tokens = strings.Split(w.String(), "\n") | 
| Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 77 | assert.Equal("wald", tokens[0]) | 
|  | 78 | assert.Equal("gehen", tokens[1]) | 
|  | 79 | assert.Equal("Da", tokens[2]) | 
|  | 80 | assert.Equal("kann", tokens[3]) | 
|  | 81 | assert.Equal("man", tokens[4]) | 
|  | 82 | assert.Equal("was", tokens[5]) | 
|  | 83 | assert.Equal("\"erleben\"", tokens[6]) | 
|  | 84 |  | 
| Akron | 524c543 | 2021-08-05 14:14:27 +0200 | [diff] [blame] | 85 | r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!") | 
|  | 86 | w.Reset() | 
|  | 87 | dat.Transduce(r, w) | 
|  | 88 | tokens = strings.Split(w.String(), "\n") | 
|  | 89 | assert.Equal("In", tokens[0]) | 
|  | 90 | assert.Equal("den", tokens[1]) | 
|  | 91 | assert.Equal("Wald", tokens[2]) | 
|  | 92 | assert.Equal("gehen", tokens[3]) | 
|  | 93 | assert.Equal("?", tokens[4]) | 
|  | 94 | assert.Equal("--", tokens[5]) | 
| Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 95 |  | 
| Akron | 524c543 | 2021-08-05 14:14:27 +0200 | [diff] [blame] | 96 | r = strings.NewReader(" g? -- D") | 
|  | 97 | w.Reset() | 
|  | 98 | dat.Transduce(r, w) | 
|  | 99 | tokens = strings.Split(w.String(), "\n") | 
|  | 100 | assert.Equal("g", tokens[0]) | 
|  | 101 | assert.Equal("?", tokens[1]) | 
|  | 102 | assert.Equal("--", tokens[2]) | 
|  | 103 | assert.Equal("D", tokens[3]) | 
|  | 104 | assert.Equal("", tokens[4]) | 
| Akron | 6e70dc8 | 2021-08-11 11:33:18 +0200 | [diff] [blame] | 105 | assert.Equal("", tokens[5]) | 
|  | 106 | assert.Equal(6, len(tokens)) | 
| Akron | 068874c | 2021-08-04 15:19:56 +0200 | [diff] [blame] | 107 | } | 
|  | 108 |  | 
| Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 109 | func TestReadWriteTokenizer(t *testing.T) { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 110 | assert := assert.New(t) | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame] | 111 | tok := LoadFomaFile("testdata/simpletok.fst") | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 112 | dat := tok.ToDoubleArray() | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 113 | assert.True(tmatch(dat, "bau")) | 
|  | 114 | assert.True(tmatch(dat, "bad")) | 
|  | 115 | assert.True(tmatch(dat, "wald gehen")) | 
| Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 116 |  | 
| Akron | 03a3c61 | 2021-08-04 11:51:27 +0200 | [diff] [blame] | 117 | assert.True(dat.LoadFactor() >= 70) | 
| Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 118 |  | 
| Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 119 | b := make([]byte, 0, 1024) | 
| Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 120 | buf := bytes.NewBuffer(b) | 
|  | 121 | n, err := dat.WriteTo(buf) | 
|  | 122 | assert.Nil(err) | 
| Akron | 03c92fe | 2021-08-09 14:07:57 +0200 | [diff] [blame] | 123 | assert.Equal(int64(224), n) | 
| Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 124 |  | 
|  | 125 | dat2 := ParseDatok(buf) | 
|  | 126 | assert.NotNil(dat2) | 
|  | 127 | assert.Equal(dat.array, dat2.array) | 
|  | 128 | assert.Equal(dat.sigma, dat2.sigma) | 
|  | 129 | assert.Equal(dat.epsilon, dat2.epsilon) | 
|  | 130 | assert.Equal(dat.unknown, dat2.unknown) | 
|  | 131 | assert.Equal(dat.identity, dat2.identity) | 
|  | 132 | assert.Equal(dat.final, dat2.final) | 
|  | 133 | assert.Equal(dat.LoadFactor(), dat2.LoadFactor()) | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 134 | assert.True(tmatch(dat2, "bau")) | 
|  | 135 | assert.True(tmatch(dat2, "bad")) | 
|  | 136 | assert.True(tmatch(dat2, "wald gehen")) | 
| Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 137 | } | 
|  | 138 |  | 
|  | 139 | func TestFullTokenizer(t *testing.T) { | 
| Akron | 3a063ef | 2021-08-05 19:36:35 +0200 | [diff] [blame] | 140 | assert := assert.New(t) | 
| Akron | 3a063ef | 2021-08-05 19:36:35 +0200 | [diff] [blame] | 141 | dat := LoadDatokFile("testdata/tokenizer.datok") | 
|  | 142 | assert.NotNil(dat) | 
|  | 143 | assert.True(dat.LoadFactor() >= 70) | 
|  | 144 | assert.Equal(dat.epsilon, 1) | 
|  | 145 | assert.Equal(dat.unknown, 2) | 
|  | 146 | assert.Equal(dat.identity, 3) | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 147 | assert.Equal(dat.final, 137) | 
|  | 148 | assert.Equal(len(dat.sigma), 132) | 
| Akron | f1a1650 | 2021-08-16 15:24:38 +0200 | [diff] [blame] | 149 | assert.True(len(dat.array) > 3600000) | 
|  | 150 | assert.True(dat.maxSize > 3600000) | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 151 | assert.True(tmatch(dat, "bau")) | 
|  | 152 | assert.True(tmatch(dat, "bad")) | 
|  | 153 | assert.True(tmatch(dat, "wald gehen")) | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 154 | } | 
| Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 155 |  | 
| Akron | a0bded5 | 2021-08-11 15:48:02 +0200 | [diff] [blame] | 156 | func XTestFullTokenizerBuild(t *testing.T) { | 
|  | 157 | assert := assert.New(t) | 
|  | 158 | tok := LoadFomaFile("testdata/tokenizer.fst") | 
|  | 159 | dat := tok.ToDoubleArray() | 
|  | 160 | n, err := dat.Save("testdata/tokenizer.datok") | 
|  | 161 | assert.Nil(err) | 
|  | 162 | assert.True(n > 500) | 
|  | 163 | } | 
|  | 164 |  | 
| Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 165 | func TestFullTokenizerTransduce(t *testing.T) { | 
| Akron | 3610f10 | 2021-08-08 14:13:25 +0200 | [diff] [blame] | 166 | assert := assert.New(t) | 
|  | 167 |  | 
| Akron | a0bded5 | 2021-08-11 15:48:02 +0200 | [diff] [blame] | 168 | dat := LoadDatokFile("testdata/tokenizer.datok") | 
| Akron | 3610f10 | 2021-08-08 14:13:25 +0200 | [diff] [blame] | 169 | assert.NotNil(dat) | 
|  | 170 |  | 
| Akron | 3610f10 | 2021-08-08 14:13:25 +0200 | [diff] [blame] | 171 | b := make([]byte, 0, 2048) | 
|  | 172 | w := bytes.NewBuffer(b) | 
|  | 173 | var tokens []string | 
|  | 174 |  | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 175 | assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w)) | 
| Akron | 3610f10 | 2021-08-08 14:13:25 +0200 | [diff] [blame] | 176 |  | 
|  | 177 | tokens = strings.Split(w.String(), "\n") | 
| Akron | 1594cb8 | 2021-08-11 11:14:56 +0200 | [diff] [blame] | 178 | assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String()) | 
| Akron | 3610f10 | 2021-08-08 14:13:25 +0200 | [diff] [blame] | 179 | assert.Equal("tra", tokens[0]) | 
|  | 180 | assert.Equal(".", tokens[1]) | 
| Akron | b4bbb47 | 2021-08-09 11:49:38 +0200 | [diff] [blame] | 181 | assert.Equal("", tokens[2]) | 
| Akron | c5d8d43 | 2021-08-10 16:48:44 +0200 | [diff] [blame] | 182 | assert.Equal("u", tokens[3]) | 
| Akron | b4bbb47 | 2021-08-09 11:49:38 +0200 | [diff] [blame] | 183 | assert.Equal("Du", tokens[4]) | 
|  | 184 | assert.Equal("?", tokens[5]) | 
| Akron | 3610f10 | 2021-08-08 14:13:25 +0200 | [diff] [blame] | 185 | assert.Equal("", tokens[6]) | 
| Akron | 1594cb8 | 2021-08-11 11:14:56 +0200 | [diff] [blame] | 186 | assert.Equal("", tokens[7]) | 
|  | 187 | assert.Equal(8, len(tokens)) | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 188 |  | 
|  | 189 | w.Reset() | 
|  | 190 | assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w)) | 
|  | 191 | assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String()) | 
| Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 192 | } | 
| Akron | b7e1f13 | 2021-08-10 11:52:31 +0200 | [diff] [blame] | 193 |  | 
|  | 194 | func TestFullTokenizerSentenceSplitter(t *testing.T) { | 
|  | 195 | assert := assert.New(t) | 
|  | 196 | dat := LoadDatokFile("testdata/tokenizer.datok") | 
|  | 197 | assert.NotNil(dat) | 
|  | 198 |  | 
|  | 199 | b := make([]byte, 0, 2048) | 
|  | 200 | w := bytes.NewBuffer(b) | 
|  | 201 | var sentences []string | 
|  | 202 |  | 
|  | 203 | // testSentSplitterSimple | 
| Akron | 1594cb8 | 2021-08-11 11:14:56 +0200 | [diff] [blame] | 204 | assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w)) | 
|  | 205 | sentences = strings.Split(w.String(), "\n\n") | 
|  | 206 |  | 
|  | 207 | assert.Equal("Der\nalte\nMann\n.\n\n", w.String()) | 
|  | 208 | assert.Equal("Der\nalte\nMann\n.", sentences[0]) | 
|  | 209 | assert.Equal("", sentences[1]) | 
|  | 210 | assert.Equal(len(sentences), 2) | 
|  | 211 |  | 
|  | 212 | w.Reset() | 
|  | 213 | assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w)) | 
|  | 214 | sentences = strings.Split(w.String(), "\n\n") | 
|  | 215 | assert.Equal(len(sentences), 2) | 
|  | 216 | assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0]) | 
|  | 217 | assert.Equal("", sentences[1]) | 
|  | 218 |  | 
|  | 219 | w.Reset() | 
|  | 220 | assert.True(dat.Transduce(strings.NewReader(""), w)) | 
| Akron | b7e1f13 | 2021-08-10 11:52:31 +0200 | [diff] [blame] | 221 | sentences = strings.Split(w.String(), "\n\n") | 
|  | 222 | assert.Equal(len(sentences), 1) | 
| Akron | 6e70dc8 | 2021-08-11 11:33:18 +0200 | [diff] [blame] | 223 | assert.Equal("\n", sentences[0]) | 
| Akron | 1594cb8 | 2021-08-11 11:14:56 +0200 | [diff] [blame] | 224 |  | 
|  | 225 | w.Reset() | 
|  | 226 | assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w)) | 
|  | 227 | sentences = strings.Split(w.String(), "\n\n") | 
|  | 228 | assert.Equal(len(sentences), 2) | 
|  | 229 |  | 
|  | 230 | w.Reset() | 
|  | 231 | assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w)) | 
|  | 232 | sentences = strings.Split(w.String(), "\n\n") | 
|  | 233 | assert.Equal(len(sentences), 2) | 
|  | 234 |  | 
| Akron | 6e70dc8 | 2021-08-11 11:33:18 +0200 | [diff] [blame] | 235 | w.Reset() | 
|  | 236 | assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w)) | 
|  | 237 | sentences = strings.Split(w.String(), "\n\n") | 
|  | 238 | assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0]) | 
|  | 239 | assert.Equal("", sentences[1]) | 
|  | 240 | assert.Equal(len(sentences), 2) | 
| Akron | 1594cb8 | 2021-08-11 11:14:56 +0200 | [diff] [blame] | 241 |  | 
| Akron | 6e70dc8 | 2021-08-11 11:33:18 +0200 | [diff] [blame] | 242 | w.Reset() | 
|  | 243 | assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w)) | 
|  | 244 | sentences = strings.Split(w.String(), "\n\n") | 
|  | 245 | assert.Equal("", sentences[1]) | 
|  | 246 | assert.Equal(len(sentences), 2) | 
| Akron | 1594cb8 | 2021-08-11 11:14:56 +0200 | [diff] [blame] | 247 |  | 
| Akron | 6e70dc8 | 2021-08-11 11:33:18 +0200 | [diff] [blame] | 248 | w.Reset() | 
|  | 249 | assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w)) | 
|  | 250 | sentences = strings.Split(w.String(), "\n\n") | 
|  | 251 | assert.Equal(len(sentences), 2) | 
| Akron | 1594cb8 | 2021-08-11 11:14:56 +0200 | [diff] [blame] | 252 |  | 
| Akron | 6e70dc8 | 2021-08-11 11:33:18 +0200 | [diff] [blame] | 253 | w.Reset() | 
|  | 254 | assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w)) | 
|  | 255 | sentences = strings.Split(w.String(), "\n\n") | 
|  | 256 | assert.Equal(len(sentences), 2) | 
| Akron | 1594cb8 | 2021-08-11 11:14:56 +0200 | [diff] [blame] | 257 |  | 
| Akron | 6e70dc8 | 2021-08-11 11:33:18 +0200 | [diff] [blame] | 258 | w.Reset() | 
|  | 259 | assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w)) | 
|  | 260 | sentences = strings.Split(w.String(), "\n\n") | 
|  | 261 | assert.Equal(len(sentences), 2) | 
|  | 262 | assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0]) | 
|  | 263 | assert.Equal("", sentences[1]) | 
| Akron | 1594cb8 | 2021-08-11 11:14:56 +0200 | [diff] [blame] | 264 |  | 
| Akron | 6e70dc8 | 2021-08-11 11:33:18 +0200 | [diff] [blame] | 265 | w.Reset() | 
|  | 266 | assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w)) | 
|  | 267 | sentences = strings.Split(w.String(), "\n\n") | 
|  | 268 | assert.Equal(len(sentences), 3) | 
|  | 269 | assert.Equal("Ausschalten\n!!!", sentences[0]) | 
|  | 270 | assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1]) | 
|  | 271 | assert.Equal("", sentences[2]) | 
| Akron | 1594cb8 | 2021-08-11 11:14:56 +0200 | [diff] [blame] | 272 |  | 
| Akron | 4af79f1 | 2021-08-11 14:48:17 +0200 | [diff] [blame] | 273 | w.Reset() | 
|  | 274 | assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w)) | 
|  | 275 | sentences = strings.Split(w.String(), "\n\n") | 
|  | 276 | assert.Equal(len(sentences), 2) | 
| Akron | 1594cb8 | 2021-08-11 11:14:56 +0200 | [diff] [blame] | 277 |  | 
|  | 278 | /* | 
|  | 279 | Test: | 
|  | 280 | "\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w)) | 
|  | 281 | */ | 
| Akron | b7e1f13 | 2021-08-10 11:52:31 +0200 | [diff] [blame] | 282 | } | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 283 |  | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 284 | func TestFullTokenizerTokenSplitter(t *testing.T) { | 
|  | 285 | assert := assert.New(t) | 
|  | 286 | dat := LoadDatokFile("testdata/tokenizer.datok") | 
|  | 287 | assert.NotNil(dat) | 
|  | 288 |  | 
|  | 289 | b := make([]byte, 0, 2048) | 
|  | 290 | w := bytes.NewBuffer(b) | 
|  | 291 | var tokens []string | 
|  | 292 |  | 
|  | 293 | // testTokenizerSimple | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 294 | tokens = ttokenize(dat, w, "Der alte Mann") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 295 | assert.Equal(tokens[0], "Der") | 
|  | 296 | assert.Equal(tokens[1], "alte") | 
|  | 297 | assert.Equal(tokens[2], "Mann") | 
|  | 298 | assert.Equal(len(tokens), 3) | 
|  | 299 |  | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 300 | tokens = ttokenize(dat, w, "Der alte Mann.") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 301 | assert.Equal(tokens[0], "Der") | 
|  | 302 | assert.Equal(tokens[1], "alte") | 
|  | 303 | assert.Equal(tokens[2], "Mann") | 
|  | 304 | assert.Equal(tokens[3], ".") | 
|  | 305 | assert.Equal(len(tokens), 4) | 
|  | 306 |  | 
|  | 307 | // testTokenizerAbbr | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 308 | tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 309 | assert.Equal(tokens[0], "Der") | 
|  | 310 | assert.Equal(tokens[1], "Vorsitzende") | 
|  | 311 | assert.Equal(tokens[2], "der") | 
|  | 312 | assert.Equal(tokens[3], "F.D.P.") | 
|  | 313 | assert.Equal(tokens[4], "hat") | 
|  | 314 | assert.Equal(tokens[5], "gewählt") | 
|  | 315 | assert.Equal(len(tokens), 6) | 
|  | 316 | // Ignored in KorAP-Tokenizer | 
|  | 317 |  | 
|  | 318 | // testTokenizerHost1 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 319 | tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 320 | assert.Equal(tokens[0], "Gefunden") | 
|  | 321 | assert.Equal(tokens[1], "auf") | 
|  | 322 | assert.Equal(tokens[2], "wikipedia.org") | 
|  | 323 | assert.Equal(len(tokens), 3) | 
|  | 324 |  | 
|  | 325 | // testTokenizerWwwHost | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 326 | tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 327 | assert.Equal("Gefunden", tokens[0]) | 
|  | 328 | assert.Equal("auf", tokens[1]) | 
|  | 329 | assert.Equal("www.wikipedia.org", tokens[2]) | 
|  | 330 | assert.Equal(3, len(tokens)) | 
|  | 331 |  | 
|  | 332 | // testTokenizerWwwUrl | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 333 | tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 334 | assert.Equal("www.info.biz/info", tokens[3]) | 
|  | 335 |  | 
|  | 336 | // testTokenizerFtpHost | 
|  | 337 | /* | 
|  | 338 | tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden") | 
|  | 339 | assert.Equal("Kann", tokens[0]) | 
|  | 340 | assert.Equal("von", tokens[1]) | 
|  | 341 | assert.Equal("ftp.download.org", tokens[2]) | 
|  | 342 | assert.Equal(5, len(tokens)) | 
|  | 343 | // Ignored in KorAP-Tokenizer | 
|  | 344 | */ | 
|  | 345 |  | 
|  | 346 | // testTokenizerDash | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 347 | tokens = ttokenize(dat, w, "Das war -- spitze") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 348 | assert.Equal(tokens[0], "Das") | 
|  | 349 | assert.Equal(tokens[1], "war") | 
|  | 350 | assert.Equal(tokens[2], "--") | 
|  | 351 | assert.Equal(tokens[3], "spitze") | 
|  | 352 | assert.Equal(len(tokens), 4) | 
|  | 353 |  | 
|  | 354 | // testTokenizerEmail1 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 355 | tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 356 | assert.Equal(tokens[0], "Ich") | 
|  | 357 | assert.Equal(tokens[1], "bin") | 
|  | 358 | assert.Equal(tokens[2], "unter") | 
|  | 359 | assert.Equal(tokens[3], "korap@ids-mannheim.de") | 
|  | 360 | assert.Equal(tokens[4], "erreichbar") | 
|  | 361 | assert.Equal(tokens[5], ".") | 
|  | 362 | assert.Equal(len(tokens), 6) | 
|  | 363 |  | 
|  | 364 | // testTokenizerEmail2 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 365 | tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 366 | assert.Equal(tokens[0], "Oder") | 
|  | 367 | assert.Equal(tokens[1], "unter") | 
|  | 368 | assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de") | 
|  | 369 | assert.Equal(tokens[3], ".") | 
|  | 370 | assert.Equal(len(tokens), 4) | 
|  | 371 |  | 
|  | 372 | // testTokenizerEmail3 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 373 | tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 374 | assert.Equal(tokens[0], "Oder") | 
|  | 375 | assert.Equal(tokens[1], "unter") | 
|  | 376 | assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de") | 
|  | 377 | assert.Equal(tokens[3], ".") | 
|  | 378 | assert.Equal(len(tokens), 4) | 
|  | 379 | // Ignored in KorAP-Tokenizer | 
|  | 380 |  | 
|  | 381 | // testTokenizerDoNotAcceptQuotedEmailNames | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 382 | tokens = ttokenize(dat, w, "\"John Doe\"@xx.com") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 383 | assert.Equal("\"", tokens[0]) | 
|  | 384 | assert.Equal("John", tokens[1]) | 
|  | 385 | assert.Equal("Doe", tokens[2]) | 
|  | 386 | assert.Equal("\"", tokens[3]) | 
|  | 387 | assert.Equal("@xx", tokens[4]) | 
|  | 388 | assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here! | 
|  | 389 | assert.Equal("com", tokens[6]) | 
|  | 390 | assert.Equal(7, len(tokens)) | 
|  | 391 |  | 
|  | 392 | // testTokenizerTwitter | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 393 | tokens = ttokenize(dat, w, "Folgt @korap und #korap") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 394 | assert.Equal(tokens[0], "Folgt") | 
|  | 395 | assert.Equal(tokens[1], "@korap") | 
|  | 396 | assert.Equal(tokens[2], "und") | 
|  | 397 | assert.Equal(tokens[3], "#korap") | 
|  | 398 | assert.Equal(len(tokens), 4) | 
|  | 399 |  | 
|  | 400 | // testTokenizerWeb1 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 401 | tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 402 | assert.Equal(tokens[0], "Unsere") | 
|  | 403 | assert.Equal(tokens[1], "Website") | 
|  | 404 | assert.Equal(tokens[2], "ist") | 
|  | 405 | assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum") | 
|  | 406 | assert.Equal(len(tokens), 4) | 
|  | 407 |  | 
|  | 408 | // testTokenizerWeb2 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 409 | tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 410 | assert.Equal(tokens[0], "Wir") | 
|  | 411 | assert.Equal(tokens[1], "sind") | 
|  | 412 | assert.Equal(tokens[2], "auch") | 
|  | 413 | assert.Equal(tokens[3], "im") | 
|  | 414 | assert.Equal(tokens[4], "Internet") | 
|  | 415 | assert.Equal(tokens[5], "(") | 
|  | 416 | assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum") | 
|  | 417 | assert.Equal(tokens[7], ")") | 
|  | 418 | assert.Equal(len(tokens), 8) | 
|  | 419 | // Ignored in KorAP-Tokenizer | 
|  | 420 |  | 
|  | 421 | // testTokenizerWeb3 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 422 | tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 423 | assert.Equal(tokens[0], "Die") | 
|  | 424 | assert.Equal(tokens[1], "Adresse") | 
|  | 425 | assert.Equal(tokens[2], "ist") | 
|  | 426 | assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum") | 
|  | 427 | assert.Equal(tokens[4], ".") | 
|  | 428 | assert.Equal(len(tokens), 5) | 
|  | 429 | // Ignored in KorAP-Tokenizer | 
|  | 430 |  | 
|  | 431 | // testTokenizerServer | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 432 | tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 433 | assert.Equal(tokens[0], "Unser") | 
|  | 434 | assert.Equal(tokens[1], "Server") | 
|  | 435 | assert.Equal(tokens[2], "ist") | 
|  | 436 | assert.Equal(tokens[3], "10.0.10.51") | 
|  | 437 | assert.Equal(tokens[4], ".") | 
|  | 438 | assert.Equal(len(tokens), 5) | 
|  | 439 |  | 
|  | 440 | // testTokenizerNum | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 441 | tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 442 | assert.Equal(tokens[0], "Zu") | 
|  | 443 | assert.Equal(tokens[1], "50,4%") | 
|  | 444 | assert.Equal(tokens[2], "ist") | 
|  | 445 | assert.Equal(tokens[3], "es") | 
|  | 446 | assert.Equal(tokens[4], "sicher") | 
|  | 447 | assert.Equal(len(tokens), 5) | 
|  | 448 | // Differs from KorAP-Tokenizer | 
|  | 449 |  | 
|  | 450 | // testTokenizerDate | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 451 | tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 452 | assert.Equal(tokens[0], "Der") | 
|  | 453 | assert.Equal(tokens[1], "Termin") | 
|  | 454 | assert.Equal(tokens[2], "ist") | 
|  | 455 | assert.Equal(tokens[3], "am") | 
|  | 456 | assert.Equal(tokens[4], "5.9.2018") | 
|  | 457 | assert.Equal(len(tokens), 5) | 
|  | 458 |  | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 459 | tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 460 | assert.Equal(tokens[0], "Der") | 
|  | 461 | assert.Equal(tokens[1], "Termin") | 
|  | 462 | assert.Equal(tokens[2], "ist") | 
|  | 463 | assert.Equal(tokens[3], "am") | 
|  | 464 | assert.Equal(tokens[4], "5/9/2018") | 
|  | 465 | assert.Equal(len(tokens), 5) | 
|  | 466 |  | 
|  | 467 | // testTokenizerDateRange | 
|  | 468 | /* | 
|  | 469 | tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018") | 
|  | 470 | assert.Equal(tokens[0], "Der") | 
|  | 471 | assert.Equal(tokens[1], "Termin") | 
|  | 472 | assert.Equal(tokens[2], "war") | 
|  | 473 | assert.Equal(tokens[3], "vom") | 
|  | 474 | assert.Equal(tokens[4], "4.") | 
|  | 475 | assert.Equal(tokens[5], "-") | 
|  | 476 | assert.Equal(tokens[6], "5.9.2018") | 
|  | 477 | assert.Equal(len(tokens), 7) | 
|  | 478 | // Ignored in KorAP-Tokenizer | 
|  | 479 | */ | 
|  | 480 |  | 
|  | 481 | // testTokenizerEmoji1 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 482 | tokens = ttokenize(dat, w, "Das ist toll! ;)") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 483 | assert.Equal(tokens[0], "Das") | 
|  | 484 | assert.Equal(tokens[1], "ist") | 
|  | 485 | assert.Equal(tokens[2], "toll") | 
|  | 486 | assert.Equal(tokens[3], "!") | 
|  | 487 | assert.Equal(tokens[4], ";)") | 
|  | 488 | assert.Equal(len(tokens), 5) | 
|  | 489 |  | 
|  | 490 | // testTokenizerRef1 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 491 | tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 492 | assert.Equal(tokens[0], "Kupietz") | 
|  | 493 | assert.Equal(tokens[1], "und") | 
|  | 494 | assert.Equal(tokens[2], "Schmidt") | 
|  | 495 | assert.Equal(tokens[3], "(2018)") | 
|  | 496 | assert.Equal(tokens[4], ":") | 
|  | 497 | assert.Equal(tokens[5], "Korpuslinguistik") | 
|  | 498 | assert.Equal(len(tokens), 6) | 
|  | 499 | // Differs from KorAP-Tokenizer! | 
|  | 500 |  | 
|  | 501 | // testTokenizerRef2 () { | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 502 | tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 503 | assert.Equal(tokens[0], "Kupietz") | 
|  | 504 | assert.Equal(tokens[1], "und") | 
|  | 505 | assert.Equal(tokens[2], "Schmidt") | 
|  | 506 | assert.Equal(tokens[3], "[2018]") | 
|  | 507 | assert.Equal(tokens[4], ":") | 
|  | 508 | assert.Equal(tokens[5], "Korpuslinguistik") | 
|  | 509 | assert.Equal(len(tokens), 6) | 
|  | 510 | // Differs from KorAP-Tokenizer! | 
|  | 511 |  | 
|  | 512 | // testTokenizerOmission1 () { | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 513 | tokens = ttokenize(dat, w, "Er ist ein A****loch!") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 514 | assert.Equal(tokens[0], "Er") | 
|  | 515 | assert.Equal(tokens[1], "ist") | 
|  | 516 | assert.Equal(tokens[2], "ein") | 
|  | 517 | assert.Equal(tokens[3], "A****loch") | 
|  | 518 | assert.Equal(tokens[4], "!") | 
|  | 519 | assert.Equal(len(tokens), 5) | 
|  | 520 |  | 
|  | 521 | // testTokenizerOmission2 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 522 | tokens = ttokenize(dat, w, "F*ck!") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 523 | assert.Equal(tokens[0], "F*ck") | 
|  | 524 | assert.Equal(tokens[1], "!") | 
|  | 525 | assert.Equal(len(tokens), 2) | 
|  | 526 |  | 
|  | 527 | // testTokenizerOmission3 () { | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 528 | tokens = ttokenize(dat, w, "Dieses verf***** Kleid!") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 529 | assert.Equal(tokens[0], "Dieses") | 
|  | 530 | assert.Equal(tokens[1], "verf*****") | 
|  | 531 | assert.Equal(tokens[2], "Kleid") | 
|  | 532 | assert.Equal(tokens[3], "!") | 
|  | 533 | assert.Equal(len(tokens), 4) | 
|  | 534 |  | 
|  | 535 | // Probably interpreted as HOST | 
|  | 536 | // testTokenizerFileExtension1 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 537 | tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 538 | assert.Equal(tokens[0], "Ich") | 
|  | 539 | assert.Equal(tokens[1], "habe") | 
|  | 540 | assert.Equal(tokens[2], "die") | 
|  | 541 | assert.Equal(tokens[3], "readme.txt") | 
|  | 542 | assert.Equal(tokens[4], "heruntergeladen") | 
|  | 543 | assert.Equal(len(tokens), 5) | 
|  | 544 |  | 
|  | 545 | // Probably interpreted as HOST | 
|  | 546 | // testTokenizerFileExtension2 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 547 | tokens = ttokenize(dat, w, "Nimm die README.TXT!") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 548 | assert.Equal(tokens[0], "Nimm") | 
|  | 549 | assert.Equal(tokens[1], "die") | 
|  | 550 | assert.Equal(tokens[2], "README.TXT") | 
|  | 551 | assert.Equal(tokens[3], "!") | 
|  | 552 | assert.Equal(len(tokens), 4) | 
|  | 553 |  | 
|  | 554 | // Probably interpreted as HOST | 
|  | 555 | // testTokenizerFileExtension3 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 556 | tokens = ttokenize(dat, w, "Zeig mir profile.jpeg") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 557 | assert.Equal(tokens[0], "Zeig") | 
|  | 558 | assert.Equal(tokens[1], "mir") | 
|  | 559 | assert.Equal(tokens[2], "profile.jpeg") | 
|  | 560 | assert.Equal(len(tokens), 3) | 
|  | 561 |  | 
|  | 562 | // testTokenizerFile1 | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 563 |  | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 564 | tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx") | 
| Akron | e8837b5 | 2021-08-11 17:29:58 +0200 | [diff] [blame] | 565 | assert.Equal(tokens[0], "Zeig") | 
|  | 566 | assert.Equal(tokens[1], "mir") | 
|  | 567 | assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx") | 
|  | 568 | assert.Equal(len(tokens), 3) | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 569 |  | 
| Akron | e8837b5 | 2021-08-11 17:29:58 +0200 | [diff] [blame] | 570 | // testTokenizerFile2 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 571 | tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx") | 
| Akron | e8837b5 | 2021-08-11 17:29:58 +0200 | [diff] [blame] | 572 | assert.Equal(tokens[0], "Gehe") | 
|  | 573 | assert.Equal(tokens[1], "zu") | 
|  | 574 | assert.Equal(tokens[2], "/Dokumente/profile.docx") | 
|  | 575 | assert.Equal(len(tokens), 3) | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 576 |  | 
| Akron | e8837b5 | 2021-08-11 17:29:58 +0200 | [diff] [blame] | 577 | // testTokenizerFile3 | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 578 | tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg") | 
| Akron | e8837b5 | 2021-08-11 17:29:58 +0200 | [diff] [blame] | 579 | assert.Equal(tokens[0], "Zeig") | 
|  | 580 | assert.Equal(tokens[1], "mir") | 
|  | 581 | assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg") | 
|  | 582 | assert.Equal(len(tokens), 3) | 
|  | 583 | // Ignored in KorAP-Tokenizer | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 584 |  | 
| Akron | fd92d7e | 2021-08-11 16:31:43 +0200 | [diff] [blame] | 585 | // testTokenizerPunct | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 586 | tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.") | 
| Akron | fd92d7e | 2021-08-11 16:31:43 +0200 | [diff] [blame] | 587 | assert.Equal(tokens[0], "Er") | 
|  | 588 | assert.Equal(tokens[1], "sagte") | 
|  | 589 | assert.Equal(tokens[2], ":") | 
|  | 590 | assert.Equal(tokens[3], "\"") | 
|  | 591 | assert.Equal(tokens[4], "Es") | 
|  | 592 | assert.Equal(tokens[5], "geht") | 
|  | 593 | assert.Equal(tokens[6], "mir") | 
|  | 594 | assert.Equal(tokens[7], "gut") | 
|  | 595 | assert.Equal(tokens[8], "!") | 
|  | 596 | assert.Equal(tokens[9], "\"") | 
|  | 597 | assert.Equal(tokens[10], ",") | 
|  | 598 | assert.Equal(tokens[11], "daraufhin") | 
|  | 599 | assert.Equal(tokens[12], "ging") | 
|  | 600 | assert.Equal(tokens[13], "er") | 
|  | 601 | assert.Equal(tokens[14], ".") | 
|  | 602 | assert.Equal(len(tokens), 15) | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 603 |  | 
|  | 604 | // testTokenizerPlusAmpersand | 
	tokens = ttokenize(dat, w, "\"Das ist von C&A!\"")
	assert.Equal(tokens[0], "\"")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "\"")
	assert.Equal(len(tokens), 7)
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 614 |  | 
|  | 615 | // testTokenizerLongEnd | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 616 | tokens = ttokenize(dat, w, "Siehst Du?!!?") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 617 | assert.Equal(tokens[0], "Siehst") | 
|  | 618 | assert.Equal(tokens[1], "Du") | 
|  | 619 | assert.Equal(tokens[2], "?!!?") | 
|  | 620 | assert.Equal(len(tokens), 3) | 
|  | 621 |  | 
|  | 622 | // testTokenizerIrishO | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 623 | tokens = ttokenize(dat, w, "Peter O'Toole") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 624 | assert.Equal(tokens[0], "Peter") | 
|  | 625 | assert.Equal(tokens[1], "O'Toole") | 
|  | 626 | assert.Equal(len(tokens), 2) | 
|  | 627 |  | 
|  | 628 | // testTokenizerAbr | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 629 | tokens = ttokenize(dat, w, "Früher bzw. später ...") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 630 | assert.Equal(tokens[0], "Früher") | 
|  | 631 | assert.Equal(tokens[1], "bzw.") | 
|  | 632 | assert.Equal(tokens[2], "später") | 
|  | 633 | assert.Equal(tokens[3], "...") | 
|  | 634 | assert.Equal(len(tokens), 4) | 
|  | 635 |  | 
|  | 636 | // testTokenizerUppercaseRule | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 637 | tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 638 | assert.Equal(tokens[0], "Es") | 
|  | 639 | assert.Equal(tokens[1], "war") | 
|  | 640 | assert.Equal(tokens[2], "spät") | 
|  | 641 | assert.Equal(tokens[3], ".") | 
|  | 642 | assert.Equal(tokens[4], "Morgen") | 
|  | 643 | assert.Equal(tokens[5], "ist") | 
|  | 644 | assert.Equal(tokens[6], "es") | 
|  | 645 | assert.Equal(tokens[7], "früh") | 
|  | 646 | assert.Equal(tokens[8], ".") | 
|  | 647 | assert.Equal(len(tokens), 9) | 
|  | 648 | // Ignored in KorAP-Tokenizer | 
|  | 649 |  | 
|  | 650 | // testTokenizerOrd | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 651 | tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!") | 
| Akron | a0bded5 | 2021-08-11 15:48:02 +0200 | [diff] [blame] | 652 | assert.Equal(tokens[0], "Sie") | 
|  | 653 | assert.Equal(tokens[1], "erreichte") | 
|  | 654 | assert.Equal(tokens[2], "den") | 
|  | 655 | assert.Equal(tokens[3], "1.") | 
|  | 656 | assert.Equal(tokens[4], "Platz") | 
|  | 657 | assert.Equal(tokens[5], "!") | 
|  | 658 | assert.Equal(len(tokens), 6) | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 659 |  | 
|  | 660 | // testNoZipOuputArchive | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 661 | tokens = ttokenize(dat, w, "Archive:  Ich bin kein zip\n") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 662 | assert.Equal(tokens[0], "Archive") | 
|  | 663 | assert.Equal(tokens[1], ":") | 
|  | 664 | assert.Equal(tokens[2], "Ich") | 
|  | 665 | assert.Equal(tokens[3], "bin") | 
|  | 666 | assert.Equal(tokens[4], "kein") | 
|  | 667 | assert.Equal(tokens[5], "zip") | 
|  | 668 | assert.Equal(6, len(tokens)) | 
|  | 669 |  | 
|  | 670 | // testTokenizerStrasse | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 671 | tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?") | 
| Akron | 4af79f1 | 2021-08-11 14:48:17 +0200 | [diff] [blame] | 672 | assert.Equal(tokens[4], "Weststr.") | 
|  | 673 | assert.Equal(8, len(tokens)) | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 674 |  | 
|  | 675 | // germanTokenizerKnowsGermanOmissionWords | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 676 | tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 677 | assert.Equal("D'dorf", tokens[0]) | 
|  | 678 | assert.Equal("Ku'damm", tokens[1]) | 
|  | 679 | assert.Equal("Lu'hafen", tokens[2]) | 
|  | 680 | assert.Equal("M'gladbach", tokens[3]) | 
|  | 681 | assert.Equal("W'schaft", tokens[4]) | 
|  | 682 | assert.Equal(5, len(tokens)) | 
|  | 683 |  | 
|  | 684 | // germanTokenizerDoesNOTSeparateGermanContractions | 
| Akron | ec835ad | 2021-08-11 18:23:22 +0200 | [diff] [blame] | 685 | tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste") | 
| Akron | 03ca425 | 2021-08-11 13:32:53 +0200 | [diff] [blame] | 686 | assert.Equal("mach's", tokens[0]) | 
|  | 687 | assert.Equal("macht's", tokens[1]) | 
|  | 688 | assert.Equal("was'n", tokens[2]) | 
|  | 689 | assert.Equal("ist's", tokens[3]) | 
|  | 690 | assert.Equal("haste", tokens[4]) | 
|  | 691 | assert.Equal("willste", tokens[5]) | 
|  | 692 | assert.Equal("kannste", tokens[6]) | 
|  | 693 | assert.Equal("biste", tokens[7]) | 
|  | 694 | assert.Equal("kriegste", tokens[8]) | 
|  | 695 | assert.Equal(9, len(tokens)) | 
|  | 696 |  | 
|  | 697 | /* | 
|  | 698 | @Test | 
|  | 699 | public void englishTokenizerSeparatesEnglishContractionsAndClitics () { | 
|  | 700 | DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en(); | 
|  | 701 | tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't") | 
|  | 702 | assert.Equal("'ve", tokens[1]); | 
|  | 703 | assert.Equal("'ll", tokens[3]); | 
|  | 704 | assert.Equal("'d", tokens[5]); | 
|  | 705 | assert.Equal("'m", tokens[7]); | 
|  | 706 | assert.Equal("'re", tokens[9]); | 
|  | 707 | assert.Equal("'s", tokens[11]); | 
|  | 708 | assert.Equal("is", tokens[12]); | 
|  | 709 | assert.Equal("n't", tokens[13]); | 
|  | 710 | assert.Equal(14, len(tokens)); | 
|  | 711 | } | 
|  | 712 |  | 
|  | 713 | @Test | 
|  | 714 | public void frenchTokenizerKnowsFrenchAbbreviations () { | 
|  | 715 | DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr(); | 
|  | 716 | tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.") | 
|  | 717 | assert.Equal("Approx.", tokens[0]); | 
|  | 718 | assert.Equal("juill.", tokens[2]); | 
|  | 719 | assert.Equal("prof.", tokens[5]); | 
|  | 720 | assert.Equal("exerc.", tokens[15]); | 
|  | 721 | assert.Equal("no.", tokens[16]); | 
|  | 722 | assert.Equal("pp.", tokens[21]); | 
|  | 723 | } | 
|  | 724 |  | 
|  | 725 | @Test | 
|  | 726 | public void frenchTokenizerKnowsFrenchContractions () { | 
|  | 727 | DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr(); | 
|  | 728 | tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île") | 
|  | 729 | assert.Equal("J'", tokens[0]); | 
|  | 730 | assert.Equal("j'", tokens[2]); | 
|  | 731 | assert.Equal("qu'", tokens[4]); | 
|  | 732 | assert.Equal("d'", tokens[6]); | 
|  | 733 | assert.Equal("jusqu'", tokens[8]); | 
|  | 734 | assert.Equal("Aujourd'hui", tokens[10]); | 
|  | 735 | assert.Equal("D'", tokens[11]); // ’ | 
|  | 736 | assert.Equal("Quelqu'un", tokens[13]); // ’ | 
|  | 737 | assert.Equal("Presqu'île", tokens[14]); // ’ | 
|  | 738 | } | 
|  | 739 |  | 
|  | 740 | @Test | 
|  | 741 | public void frenchTokenizerKnowsFrenchClitics () { | 
|  | 742 | DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr(); | 
|  | 743 | tokens = tokenize(dat, w, "suis-je sont-elles ") | 
|  | 744 | assert.Equal("suis", tokens[0]); | 
|  | 745 | assert.Equal("-je", tokens[1]); | 
|  | 746 | assert.Equal("sont", tokens[2]); | 
|  | 747 | assert.Equal("-elles", tokens[3]); | 
|  | 748 | } | 
|  | 749 |  | 
|  | 750 | @Test | 
|  | 751 | public void testEnglishTokenizerScienceAbbreviations () { | 
|  | 752 | DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en(); | 
|  | 753 | tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.") | 
|  | 754 | assert.Equal("Approx.", tokens[0]); | 
|  | 755 | assert.Equal("in", tokens[1]); | 
|  | 756 | assert.Equal("Sept.", tokens[2]); | 
|  | 757 | assert.Equal("1954", tokens[3]); | 
|  | 758 | assert.Equal(",", tokens[4]); | 
|  | 759 | assert.Equal("Assoc.", tokens[5]); | 
|  | 760 | assert.Equal("Prof.", tokens[6]); | 
|  | 761 | assert.Equal("Dr.", tokens[7]); | 
|  | 762 | assert.Equal("R.", tokens[8]); | 
|  | 763 | assert.Equal("J.", tokens[9]); | 
|  | 764 | assert.Equal("Ewing", tokens[10]); | 
|  | 765 | assert.Equal("reviewed", tokens[11]); | 
|  | 766 | assert.Equal("articles", tokens[12]); | 
|  | 767 | assert.Equal("on", tokens[13]); | 
|  | 768 | assert.Equal("Enzymol.", tokens[14]); | 
|  | 769 | assert.Equal("Bacteriol.", tokens[15]); | 
|  | 770 | assert.Equal("effects", tokens[16]); | 
|  | 771 | assert.Equal("later", tokens[17]); | 
|  | 772 | assert.Equal("published", tokens[18]); | 
|  | 773 | assert.Equal("in", tokens[19]); | 
|  | 774 | assert.Equal("Nutr.", tokens[20]); | 
|  | 775 | assert.Equal("Rheumatol.", tokens[21]); | 
|  | 776 | assert.Equal("No.", tokens[22]); | 
|  | 777 | assert.Equal("12", tokens[23]); | 
|  | 778 | assert.Equal("and", tokens[24]); | 
|  | 779 | assert.Equal("Nº.", tokens[25]); | 
|  | 780 | assert.Equal("13.", tokens[26]); | 
|  | 781 | assert.Equal(",", tokens[27]); | 
|  | 782 | assert.Equal("pp.", tokens[28]); | 
|  | 783 | assert.Equal("17-18", tokens[29]); | 
|  | 784 | assert.Equal(".", tokens[30]); | 
|  | 785 | } | 
|  | 786 |  | 
|  | 787 | @Test | 
|  | 788 | public void englishTokenizerCanGuessWhetherIIsAbbrev () { | 
|  | 789 | DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en(); | 
|  | 790 | tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.") | 
|  | 791 | assert.Equal("I.", tokens[1]); | 
|  | 792 | assert.Equal("I", tokens[8]); | 
|  | 793 | assert.Equal(".", tokens[9]); | 
|  | 794 | assert.Equal("I", tokens[12]); | 
|  | 795 | assert.Equal(".", tokens[13]); | 
|  | 796 | } | 
|  | 797 |  | 
|  | 798 | @Test | 
|  | 799 | public void testZipOuputArchive () { | 
|  | 800 |  | 
|  | 801 | final ByteArrayOutputStream clearOut = new ByteArrayOutputStream(); | 
|  | 802 | System.setOut(new PrintStream(clearOut)); | 
|  | 803 | tokens = tokenize(dat, w, "Archive:  ich/bin/ein.zip\n") | 
|  | 804 | assert.Equal(0, len(tokens)); | 
|  | 805 | } | 
|  | 806 | */ | 
|  | 807 | /* | 
|  | 808 |  | 
|  | 809 | @Test | 
|  | 810 | public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException { | 
|  | 811 | DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder() | 
|  | 812 | .tokenizerClassName(DerekoDfaTokenizer_de.class.getName()) | 
|  | 813 | .printOffsets(true) | 
|  | 814 | .build(); | 
|  | 815 | Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n"); | 
|  | 816 | assert.Equal("Text1", tokens[0].getType()); | 
|  | 817 | assert.Equal(len(tokens), 9 ); | 
|  | 818 | } | 
|  | 819 | */ | 
|  | 820 | } | 
| Akron | bd40680 | 2021-08-11 18:39:13 +0200 | [diff] [blame] | 821 |  | 
|  | 822 | func BenchmarkTransduce(b *testing.B) { | 
|  | 823 | bu := make([]byte, 0, 2048) | 
|  | 824 | w := bytes.NewBuffer(bu) | 
|  | 825 |  | 
|  | 826 | s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar. | 
|  | 827 | Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher. | 
|  | 828 | Der Termin ist am 5.9.2018. | 
|  | 829 | Ich habe die readme.txt heruntergeladen. | 
|  | 830 | Ausschalten!!! Hast Du nicht gehört??? | 
|  | 831 | Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen. | 
|  | 832 | Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz! | 
|  | 833 | Archive:  Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft. | 
|  | 834 | Mach's macht's was'n ist's haste willste kannste biste kriegste.` | 
|  | 835 | r := strings.NewReader(s) | 
|  | 836 |  | 
|  | 837 | dat := LoadDatokFile("testdata/tokenizer.datok") | 
|  | 838 |  | 
|  | 839 | for i := 0; i < b.N; i++ { | 
|  | 840 | w.Reset() | 
|  | 841 | r.Reset(s) | 
|  | 842 | ok := dat.Transduce(r, w) | 
|  | 843 | if !ok { | 
|  | 844 | fmt.Println("Fail!") | 
|  | 845 | fmt.Println(w.String()) | 
|  | 846 | os.Exit(1) | 
|  | 847 | } | 
|  | 848 | } | 
| Akron | bd40680 | 2021-08-11 18:39:13 +0200 | [diff] [blame] | 849 | } | 
| Akron | bb4aac5 | 2021-08-13 00:52:27 +0200 | [diff] [blame] | 850 |  | 
// This test is deprecated as the datok file changes over time
// (the leading X excludes it from benchmark runs; kept for reference).
func XBenchmarkLoadDatokFile(b *testing.B) {
	for i := 0; i < b.N; i++ {
		// Deserialize the full double-array tokenizer from disk on
		// every iteration to measure loading cost.
		dat := LoadDatokFile("testdata/tokenizer.datok")
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}
|  | 861 |  | 
| Akron | 6f1c16c | 2021-08-17 10:45:42 +0200 | [diff] [blame^] | 862 | func BenchmarkToDoubleArray(b *testing.B) { | 
|  | 863 | tok := LoadFomaFile("testdata/simple_bench.fst") | 
|  | 864 | for i := 0; i < b.N; i++ { | 
|  | 865 | dat := tok.ToDoubleArray() | 
|  | 866 | if dat == nil { | 
|  | 867 | fmt.Println("Fail!") | 
|  | 868 | os.Exit(1) | 
|  | 869 | } | 
|  | 870 | } | 
|  | 871 | } | 
|  | 872 |  | 
| Akron | bb4aac5 | 2021-08-13 00:52:27 +0200 | [diff] [blame] | 873 | // 2021-08-11 (go 1.16) | 
|  | 874 | // go test -bench=. -test.benchmem | 
|  | 875 | //   BenchmarkTransduce-4         19069             60609 ns/op           11048 B/op        137 allocs/op | 
| Akron | f1a1650 | 2021-08-16 15:24:38 +0200 | [diff] [blame] | 876 | // 2021-08-12 (go 1.16) | 
| Akron | bb4aac5 | 2021-08-13 00:52:27 +0200 | [diff] [blame] | 877 | //   BenchmarkTransduce-4         20833             55241 ns/op            9676 B/op          3 allocs/op | 
|  | 878 | //   BenchmarkLoadDatokFile-4         4         258418169 ns/op        29916470 B/op       5697 allocs/op | 
|  | 879 | //   BenchmarkTransduce-4         19430             58133 ns/op           18696 B/op          3 allocs/op | 
|  | 880 | //   BenchmarkLoadDatokFile-4         8         139071939 ns/op       203158377 B/op       5742 allocs/op | 
| Akron | f1a1650 | 2021-08-16 15:24:38 +0200 | [diff] [blame] | 881 | // 2021-08-16 | 
|  | 882 | //   BenchmarkTransduce-4               22251             49989 ns/op           17370 B/op          3 allocs/op | 
|  | 883 | //   BenchmarkLoadDatokFile-4               8         138937532 ns/op        203158327 B/op      5742 allocs/op | 
|  | 884 | //   BenchmarkTransduce-4               22005             48665 ns/op           17472 B/op          3 allocs/op | 
|  | 885 | //   BenchmarkLoadDatokFile-4               7         143143934 ns/op        203158450 B/op      5743 allocs/op | 
| Akron | ea46e8a | 2021-08-17 00:36:31 +0200 | [diff] [blame] | 886 | //   BenchmarkTransduce-4               34939             34363 ns/op           14056 B/op          3 allocs/op | 
|  | 887 | //   BenchmarkLoadDatokFile-4               7         149511609 ns/op        203217193 B/op      5915 allocs/op | 
| Akron | 6f1c16c | 2021-08-17 10:45:42 +0200 | [diff] [blame^] | 888 | // 2021-08-17 | 
|  | 889 | //   BenchmarkTransduce-4               31204             32678 ns/op           14752 B/op          3 allocs/op | 
|  | 890 | //   BenchmarkToDoubleArray-4           44138             26850 ns/op           10704 B/op         29 allocs/op |