Introduce English clitics
Change-Id: Ib943a96fa097a71c77cd878c71392e5c5139315a
diff --git a/Makefile b/Makefile
index e8eddc5..05981c6 100644
--- a/Makefile
+++ b/Makefile
@@ -16,18 +16,27 @@
-e "save stack ../testdata/tokenizer_en.fst" -q -s && \
cd ..
-buildmatok: buildfoma build
+buildmatok_de: buildfoma_de build
./bin/datok convert -i ./testdata/tokenizer.fst -o ./testdata/tokenizer.matok
buildmatok_en: buildfoma_en build
./bin/datok convert -i ./testdata/tokenizer_en.fst -o ./testdata/tokenizer_en.matok
-builddatok: buildfoma build
+builddatok: buildfoma_de build
./bin/datok convert -i ./testdata/tokenizer.fst -o ./testdata/tokenizer.datok -d
+builddatok_en: buildfoma_en build
+ ./bin/datok convert -i ./testdata/tokenizer_en.fst -o ./testdata/tokenizer_en.datok -d
+
test:
go test ./...
+test_clitic:
+ foma -e "source testdata/clitic_test.xfst" \
+ -e "save stack testdata/clitic_test.fst" -q -s && \
+ ./bin/datok convert -i ./testdata/clitic_test.fst -o ./testdata/clitic_test.matok && \
+ go test ./... -timeout 30s -run ^TestMatrixCliticRule$
+
build:
go build -v -o ./bin/datok ./cmd/datok.go
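
Usage note for the new buildmatok_en target: the generated ./testdata/tokenizer_en.matok can also be used programmatically. A minimal sketch, assuming the module path github.com/KorAP/datok and a LoadTokenizerFile constructor whose result offers a Transduce(io.Reader, io.Writer) method (adjust to the actual datok API if it differs):

package main

import (
	"os"

	"github.com/KorAP/datok"
)

func main() {
	// Load the matrix representation built by `make buildmatok_en`.
	tok := datok.LoadTokenizerFile("testdata/tokenizer_en.matok")
	if tok == nil {
		panic("unable to load testdata/tokenizer_en.matok")
	}

	// Read raw text from stdin; tokens are expected one per line on stdout,
	// with an empty line at sentence ends.
	tok.Transduce(os.Stdin, os.Stdout)
}
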
diff --git a/Readme.md b/Readme.md
index ed10aec..3f9683b 100644
--- a/Readme.md
+++ b/Readme.md
@@ -190,6 +190,8 @@
and [Marc Kupietz](https://github.com/KorAP/KorAP-Tokenizer)
(published under the Apache License).
+The English clitics list is based on Zwicky & Pullum (1983).
+
The foma parser is based on
[*foma2js*](https://github.com/mhulden/foma),
written by Mans Hulden (published under the Apache License).
@@ -216,3 +218,7 @@
Kanda, Shunsuke, Yuma Fujita, Kazuhiro Morita & Masao Fuketa (2018):
*Practical rearrangement methods for dynamic double-array dictionaries*.
Software: Practice and Experience (SPE), 48(1), pp. 65–83.
+
+Zwicky, Arnold M. & Geoffrey K. Pullum (1983):
+*Cliticization vs. Inflection: English N’T*.
+Language, 59, pp. 502–513.
diff --git a/datok.go b/datok.go
index f19ce5a..88975ea 100644
--- a/datok.go
+++ b/datok.go
@@ -169,6 +169,7 @@
dat.array[t1].setCheck(t)
// Set maxSize
+	// i.e. keep maxSize at the highest array position used so far
if dat.maxSize < int(t1) {
dat.maxSize = int(t1)
}
diff --git a/matrix_test.go b/matrix_test.go
index 40ddb8d..9d2c674 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -125,12 +125,28 @@
tws := NewTokenWriter(w, TOKENS|SENTENCES)
assert.True(mat.TransduceTokenWriter(
strings.NewReader(exstring), tws),
)
tws.Flush()
matStr := w.String()
- assert.Equal("dead\n.\n\n\n\n\n\n\n", matStr)
+ assert.Equal("dead\n.\n\n\n", matStr)
+
+ tokens = ttokenize(mat, w, "they're")
+ assert.Equal("they", tokens[0])
+ assert.Equal("'re", tokens[1])
+
+ tokens = ttokenize(mat, w, "they're They're their don't wouldn't")
+ assert.Equal("they", tokens[0])
+ assert.Equal("'re", tokens[1])
+ assert.Equal("They", tokens[2])
+ assert.Equal("'re", tokens[3])
+ assert.Equal("their", tokens[4])
+ assert.Equal("do", tokens[5])
+ assert.Equal("n't", tokens[6])
+ assert.Equal("would", tokens[7])
+ assert.Equal("n't", tokens[8])
}
func TestMatrixReadWriteTokenizer(t *testing.T) {
@@ -1051,23 +1067,23 @@
assert.Equal("I", tokens[12])
assert.Equal(".", tokens[13])
+ // englishTokenizerSeparatesEnglishContractionsAndClitics
+ tokens = ttokenize(mat_en, w, "I've we'll you'd I'm we're Peter's isn't who'll've")
+ assert.Equal("I", tokens[0])
+ assert.Equal("'ve", tokens[1])
+ assert.Equal("'ll", tokens[3])
+ assert.Equal("'d", tokens[5])
+ assert.Equal("'m", tokens[7])
+ assert.Equal("'re", tokens[9])
+ assert.Equal("'s", tokens[11])
+ assert.Equal("is", tokens[12])
+ assert.Equal("n't", tokens[13])
+ assert.Equal("who", tokens[14])
+ assert.Equal("'ll", tokens[15])
+ assert.Equal("'ve", tokens[16])
+ assert.Equal(17, len(tokens))
/*
@Test
- public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
- DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
- tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
- assert.Equal("'ve", tokens[1]);
- assert.Equal("'ll", tokens[3]);
- assert.Equal("'d", tokens[5]);
- assert.Equal("'m", tokens[7]);
- assert.Equal("'re", tokens[9]);
- assert.Equal("'s", tokens[11]);
- assert.Equal("is", tokens[12]);
- assert.Equal("n't", tokens[13]);
- assert.Equal(14, len(tokens));
- }
-
- @Test
public void frenchTokenizerKnowsFrenchAbbreviations () {
DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
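
The Zwicky & Pullum host list in src/en/clitics.xfst suggests a few more negative-clitic cases that could be covered alongside the asserts above. A sketch for matrix_test.go, reusing its ttokenize helper and assuming a LoadMatrixFile loader as used elsewhere in the test suite; the expected splits are inferred from the grammar and would still need to be verified against the compiled transducer:

func TestMatrixNegativeCliticsEN(t *testing.T) {
	assert := assert.New(t)
	w := bytes.NewBuffer(nil)

	// Loader and helper names follow the existing tests in this file;
	// adjust if they differ.
	mat_en := LoadMatrixFile("testdata/tokenizer_en.matok")
	assert.NotNil(mat_en)

	// Splits inferred from CliticsNTPart/CliticsNT in src/en/clitics.xfst.
	tokens := ttokenize(mat_en, w, "can't won't shan't ain't mightn't")
	assert.Equal("ca", tokens[0])
	assert.Equal("n't", tokens[1])
	assert.Equal("wo", tokens[2])
	assert.Equal("n't", tokens[3])
	assert.Equal("sha", tokens[4])
	assert.Equal("n't", tokens[5])
	assert.Equal("ai", tokens[6])
	assert.Equal("n't", tokens[7])
	assert.Equal("might", tokens[8])
	assert.Equal("n't", tokens[9])
	assert.Equal(10, len(tokens))
}
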
diff --git a/src/en/clitics.xfst b/src/en/clitics.xfst
new file mode 100644
index 0000000..ab919b5
--- /dev/null
+++ b/src/en/clitics.xfst
@@ -0,0 +1,4 @@
+define Clitics [ Apos [[{ll}|d|{ve}|s|{re}|(e)m|n] .o. Caseinsensitive] ];
+# Following https://web.stanford.edu/~zwicky/ZPCliticsInfl.pdf
+define CliticsNTPart [[{do}({es})|{did}|{have}|{has}|{had}|{ca}|{could}|{might}|{sha}|{should}|{wo}|{would}|{dare}|{must}|{need}|{ought}|{are}|{is}|{was}|{were}|{ai}] {n}] .o. Caseinsensitive;
+define CliticsNT [CliticsNTPart "'" ["t"|"T"]];
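
For readers who do not work with xfst daily: these definitions enumerate the word-final apostrophe clitics and the auxiliary hosts that take "n't", and the tokenizer uses them to split host and clitic into separate tokens. A rough Go/regexp rendering of that split, for illustration only; datok compiles the xfst source into a finite-state transducer and does not use regular expressions at runtime, and only the ASCII apostrophe is handled here while Apos in the real grammar may cover further apostrophe characters:

package main

import (
	"fmt"
	"regexp"
)

var (
	// Apos ['ll|'d|'ve|'s|'re|'m|'em|'n], case-insensitive, word-final.
	clitic = regexp.MustCompile(`(?i)'(ll|d|ve|s|re|em|m|n)$`)

	// Hosts from the Zwicky & Pullum list followed by "n't". CliticsNT keeps
	// the "n" on the host side ("wouldn" + "'t"), but the final rewrite rule
	// in tokenizer.xfst splits before "n't", which is what is modelled here.
	cliticNT = regexp.MustCompile(`(?i)^(do(es)?|did|have|has|had|ca|could|might|sha|should|wo|would|dare|must|need|ought|are|is|was|were|ai)(n't)$`)
)

// splitClitic splits a single whitespace-delimited word into host and clitic.
func splitClitic(w string) []string {
	if m := cliticNT.FindStringSubmatch(w); m != nil {
		return []string{m[1], m[3]}
	}
	if loc := clitic.FindStringIndex(w); loc != nil && loc[0] > 0 {
		return []string{w[:loc[0]], w[loc[0]:]}
	}
	return []string{w}
}

func main() {
	for _, w := range []string{"they're", "wouldn't", "Peter's", "their"} {
		fmt.Println(w, "->", splitClitic(w))
	}
	// they're -> [they 're]
	// wouldn't -> [would n't]
	// Peter's -> [Peter 's]
	// their -> [their]
}
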
diff --git a/src/en/tokenizer.xfst b/src/en/tokenizer.xfst
index adcdac8..259fa60 100644
--- a/src/en/tokenizer.xfst
+++ b/src/en/tokenizer.xfst
@@ -89,10 +89,10 @@
define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä - è - é - ú - á - â - ê - î - ô - û];
-! Irrelevant because of the more general rule that follows
-! define Clitics [ Apos [{ll}|d|{ve}|s|{re}|m|n|{em}] .o. Caseinsensitive ] | ["n" Apos "t"] .o. Caseinsensitive ];
+source en/clitics.xfst
-define Word Char+ ([Apos|Asterisk] Char+)*;
+# define Word Char+ (Apos Char+)*;
+define Word [[ Char+ | Clitics ] - CliticsNTPart | CliticsNT];
define Plusampersand @txt"de/plusampersand.txt";
define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;
@@ -121,7 +121,11 @@
File @-> ... NLout,
Domain @-> ... NLout,
[Emoticons|Arrows] @-> ... NLout
-] .o. ["I" @-> ... NLout \/ NonAbbrI [WS | NLout ]+ _ ];
+]
+! I as a separate token
+.o. ["I" @-> ... NLout \/ NonAbbrI [WS | NLout ]+ _ ]
+! Negative clitics are tokens
+.o. [ {n't} ->@ NLout ... \/ NLout ];
source all/allsentencesplit.xfst
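
For quick reference, the concrete splits asserted by the tests in this change for the composed rules above, collected as a small Go table (values copied from matrix_test.go; "their" is included as a form that must stay whole):

// Token splits asserted in matrix_test.go for the English clitic rules.
var attestedSplits = map[string][]string{
	"they're":  {"they", "'re"},
	"don't":    {"do", "n't"},
	"wouldn't": {"would", "n't"},
	"isn't":    {"is", "n't"},
	"I've":     {"I", "'ve"},
	"their":    {"their"}, // possessive determiner, not a clitic host
}
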
diff --git a/testdata/clitic_test.fst b/testdata/clitic_test.fst
index b373352..c6fe6aa 100644
--- a/testdata/clitic_test.fst
+++ b/testdata/clitic_test.fst
Binary files differ
diff --git a/testdata/clitic_test.matok b/testdata/clitic_test.matok
index 9cc4971..e58b29d 100644
--- a/testdata/clitic_test.matok
+++ b/testdata/clitic_test.matok
Binary files differ
diff --git a/testdata/clitic_test.xfst b/testdata/clitic_test.xfst
index 1b129aa..de6788f 100644
--- a/testdata/clitic_test.xfst
+++ b/testdata/clitic_test.xfst
@@ -1,13 +1,21 @@
define TB "@_TOKEN_BOUND_@";
+# define TB "_";
define WS [" "|"\u000a"|"\u0009"];
define PUNCT ["."|"?"|"!"];
-define Char \[WS|PUNCT];
-define Word Char+;
+define Char \[WS|PUNCT|"'"];
+define Clitic ["'" [{ll}|{d}|{ve}|{s}|{re}|"m"|"n"|"t"]];
+# Following https://web.stanford.edu/~zwicky/ZPCliticsInfl.pdf
+define CliticNTPart [[{do}({es})|{did}|{have}|{has}|{had}|{ca}|{could}|{might}|{sha}|{should}|{wo}|{would}|{dare}|{must}|{need}|{ought}|{are}|{is}|{was}|{were}|{ai}] {n}];
+define CliticNT [CliticNTPart "'" {t}];
+
+
+define Word [ [ Clitic | Char+] - CliticNTPart | CliticNT];
! Compose token boundaries
-define Tokenizer [[Word|PUNCT] @-> ... TB] .o.
+define Tokenizer [[Word|PUNCT|CliticNT] @-> ... TB] .o.
! Compose Whitespace ignorance
[WS+ @-> 0] .o.
! Compose sentence ends
[[PUNCT+] @-> ... TB \/ TB _ ];
-read regex Tokenizer .o. [{'re} ->@ TB ... ];
+read regex Tokenizer .o. [{n't} ->@ TB ... \/ TB ];
+
diff --git a/testdata/tokenizer_en.fst b/testdata/tokenizer_en.fst
index a9411cb..011934a 100644
--- a/testdata/tokenizer_en.fst
+++ b/testdata/tokenizer_en.fst
Binary files differ
diff --git a/testdata/tokenizer_en.matok b/testdata/tokenizer_en.matok
index 20ec553..3ec42a5 100644
--- a/testdata/tokenizer_en.matok
+++ b/testdata/tokenizer_en.matok
Binary files differ