Introduce English clitics
Change-Id: Ib943a96fa097a71c77cd878c71392e5c5139315a
diff --git a/Makefile b/Makefile
index e8eddc5..05981c6 100644
--- a/Makefile
+++ b/Makefile
@@ -16,18 +16,27 @@
-e "save stack ../testdata/tokenizer_en.fst" -q -s && \
cd ..
-buildmatok: buildfoma build
+buildmatok_de: buildfoma_de build
./bin/datok convert -i ./testdata/tokenizer.fst -o ./testdata/tokenizer.matok
buildmatok_en: buildfoma_en build
./bin/datok convert -i ./testdata/tokenizer_en.fst -o ./testdata/tokenizer_en.matok
-builddatok: buildfoma build
+builddatok: buildfoma_de build
./bin/datok convert -i ./testdata/tokenizer.fst -o ./testdata/tokenizer.datok -d
+builddatok_en: buildfoma_en build
+ ./bin/datok convert -i ./testdata/tokenizer_en.fst -o ./testdata/tokenizer_en.datok -d
+
test:
go test ./...
+test_clitic:
+ foma -e "source testdata/clitic_test.xfst" \
+ -e "save stack testdata/clitic_test.fst" -q -s && \
+ ./bin/datok convert -i ./testdata/clitic_test.fst -o ./testdata/clitic_test.matok && \
+ go test ./... -timeout 30s -run ^TestMatrixCliticRule$
+
build:
go build -v -o ./bin/datok ./cmd/datok.go
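
Usage note for the new buildmatok_en target: the generated ./testdata/tokenizer_en.matok can also be used programmatically. A minimal sketch, assuming the module path github.com/KorAP/datok and a LoadTokenizerFile constructor whose result offers a Transduce(io.Reader, io.Writer) method (adjust to the actual datok API if it differs):

package main

import (
	"os"

	"github.com/KorAP/datok"
)

func main() {
	// Load the matrix representation built by `make buildmatok_en`.
	tok := datok.LoadTokenizerFile("testdata/tokenizer_en.matok")
	if tok == nil {
		panic("unable to load testdata/tokenizer_en.matok")
	}

	// Read raw text from stdin; tokens are expected one per line on stdout,
	// with an empty line at sentence ends.
	tok.Transduce(os.Stdin, os.Stdout)
}
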
diff --git a/Readme.md b/Readme.md
index ed10aec..3f9683b 100644
--- a/Readme.md
+++ b/Readme.md
@@ -190,6 +190,8 @@
and [Marc Kupietz](https://github.com/KorAP/KorAP-Tokenizer)
(published under the Apache License).
+The English clitics list is based on Zwicky & Pullum (1983).
+
The foma parser is based on
[*foma2js*](https://github.com/mhulden/foma),
written by Mans Hulden (published under the Apache License).
@@ -216,3 +218,7 @@
Kanda, Shunsuke, Yuma Fujita, Kazuhiro Morita & Masao Fuketa (2018):
*Practical rearrangement methods for dynamic double-array dictionaries*.
Software: Practice and Experience (SPE), 48(1), pp. 65–83.
+
+Zwicky, Arnold M. & Geoffrey K. Pullum (1983):
+*Cliticization vs. Inflection: English N’T*.
+Language, 59, pp. 502–513.
diff --git a/datok.go b/datok.go
index f19ce5a..88975ea 100644
--- a/datok.go
+++ b/datok.go
@@ -169,6 +169,7 @@
dat.array[t1].setCheck(t)
// Set maxSize
+	// i.e. keep maxSize at the highest array position used so far
if dat.maxSize < int(t1) {
dat.maxSize = int(t1)
}
diff --git a/matrix_test.go b/matrix_test.go
index 40ddb8d..9d2c674 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -125,12 +125,28 @@
tws := NewTokenWriter(w, TOKENS|SENTENCES)
assert.True(mat.TransduceTokenWriter(
strings.NewReader(exstring), tws),
)
tws.Flush()
matStr := w.String()
- assert.Equal("dead\n.\n\n\n\n\n\n\n", matStr)
+ assert.Equal("dead\n.\n\n\n", matStr)
+
+ tokens = ttokenize(mat, w, "they're")
+ assert.Equal("they", tokens[0])
+ assert.Equal("'re", tokens[1])
+
+ tokens = ttokenize(mat, w, "they're They're their don't wouldn't")
+ assert.Equal("they", tokens[0])
+ assert.Equal("'re", tokens[1])
+ assert.Equal("They", tokens[2])
+ assert.Equal("'re", tokens[3])
+ assert.Equal("their", tokens[4])
+ assert.Equal("do", tokens[5])
+ assert.Equal("n't", tokens[6])
+ assert.Equal("would", tokens[7])
+ assert.Equal("n't", tokens[8])
}
func TestMatrixReadWriteTokenizer(t *testing.T) {
@@ -1051,23 +1067,23 @@
assert.Equal("I", tokens[12])
assert.Equal(".", tokens[13])
+ // englishTokenizerSeparatesEnglishContractionsAndClitics
+ tokens = ttokenize(mat_en, w, "I've we'll you'd I'm we're Peter's isn't who'll've")
+ assert.Equal("I", tokens[0])
+ assert.Equal("'ve", tokens[1])
+ assert.Equal("'ll", tokens[3])
+ assert.Equal("'d", tokens[5])
+ assert.Equal("'m", tokens[7])
+ assert.Equal("'re", tokens[9])
+ assert.Equal("'s", tokens[11])
+ assert.Equal("is", tokens[12])
+ assert.Equal("n't", tokens[13])
+ assert.Equal("who", tokens[14])
+ assert.Equal("'ll", tokens[15])
+ assert.Equal("'ve", tokens[16])
+ assert.Equal(17, len(tokens))
/*
@Test
- public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
- DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
- tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
- assert.Equal("'ve", tokens[1]);
- assert.Equal("'ll", tokens[3]);
- assert.Equal("'d", tokens[5]);
- assert.Equal("'m", tokens[7]);
- assert.Equal("'re", tokens[9]);
- assert.Equal("'s", tokens[11]);
- assert.Equal("is", tokens[12]);
- assert.Equal("n't", tokens[13]);
- assert.Equal(14, len(tokens));
- }
-
- @Test
public void frenchTokenizerKnowsFrenchAbbreviations () {
DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
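
The Zwicky & Pullum host list in src/en/clitics.xfst suggests a few more negative-clitic cases that could be covered alongside the asserts above. A sketch for matrix_test.go, reusing its ttokenize helper and assuming a LoadMatrixFile loader as used elsewhere in the test suite; the expected splits are inferred from the grammar and would still need to be verified against the compiled transducer:

func TestMatrixNegativeCliticsEN(t *testing.T) {
	assert := assert.New(t)
	w := bytes.NewBuffer(nil)

	// Loader and helper names follow the existing tests in this file;
	// adjust if they differ.
	mat_en := LoadMatrixFile("testdata/tokenizer_en.matok")
	assert.NotNil(mat_en)

	// Splits inferred from CliticsNTPart/CliticsNT in src/en/clitics.xfst.
	tokens := ttokenize(mat_en, w, "can't won't shan't ain't mightn't")
	assert.Equal("ca", tokens[0])
	assert.Equal("n't", tokens[1])
	assert.Equal("wo", tokens[2])
	assert.Equal("n't", tokens[3])
	assert.Equal("sha", tokens[4])
	assert.Equal("n't", tokens[5])
	assert.Equal("ai", tokens[6])
	assert.Equal("n't", tokens[7])
	assert.Equal("might", tokens[8])
	assert.Equal("n't", tokens[9])
	assert.Equal(10, len(tokens))
}
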
diff --git a/src/en/clitics.xfst b/src/en/clitics.xfst
new file mode 100644
index 0000000..ab919b5
--- /dev/null
+++ b/src/en/clitics.xfst
@@ -0,0 +1,4 @@
+define Clitics [ Apos [[{ll}|d|{ve}|s|{re}|(e)m|n] .o. Caseinsensitive] ];
+# Following https://web.stanford.edu/~zwicky/ZPCliticsInfl.pdf
+define CliticsNTPart [[{do}({es})|{did}|{have}|{has}|{had}|{ca}|{could}|{might}|{sha}|{should}|{wo}|{would}|{dare}|{must}|{need}|{ought}|{are}|{is}|{was}|{were}|{ai}] {n}] .o. Caseinsensitive;
+define CliticsNT [CliticsNTPart "'" ["t"|"T"]];
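
For readers who do not work with xfst daily: these definitions enumerate the word-final apostrophe clitics and the auxiliary hosts that take "n't", and the tokenizer uses them to split host and clitic into separate tokens. A rough Go/regexp rendering of that split, for illustration only; datok compiles the xfst source into a finite-state transducer and does not use regular expressions at runtime, and only the ASCII apostrophe is handled here while Apos in the real grammar may cover further apostrophe characters:

package main

import (
	"fmt"
	"regexp"
)

var (
	// Apos ['ll|'d|'ve|'s|'re|'m|'em|'n], case-insensitive, word-final.
	clitic = regexp.MustCompile(`(?i)'(ll|d|ve|s|re|em|m|n)$`)

	// Hosts from the Zwicky & Pullum list followed by "n't". CliticsNT keeps
	// the "n" on the host side ("wouldn" + "'t"), but the final rewrite rule
	// in tokenizer.xfst splits before "n't", which is what is modelled here.
	cliticNT = regexp.MustCompile(`(?i)^(do(es)?|did|have|has|had|ca|could|might|sha|should|wo|would|dare|must|need|ought|are|is|was|were|ai)(n't)$`)
)

// splitClitic splits a single whitespace-delimited word into host and clitic.
func splitClitic(w string) []string {
	if m := cliticNT.FindStringSubmatch(w); m != nil {
		return []string{m[1], m[3]}
	}
	if loc := clitic.FindStringIndex(w); loc != nil && loc[0] > 0 {
		return []string{w[:loc[0]], w[loc[0]:]}
	}
	return []string{w}
}

func main() {
	for _, w := range []string{"they're", "wouldn't", "Peter's", "their"} {
		fmt.Println(w, "->", splitClitic(w))
	}
	// they're -> [they 're]
	// wouldn't -> [would n't]
	// Peter's -> [Peter 's]
	// their -> [their]
}
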
diff --git a/src/en/tokenizer.xfst b/src/en/tokenizer.xfst
index adcdac8..259fa60 100644
--- a/src/en/tokenizer.xfst
+++ b/src/en/tokenizer.xfst
@@ -89,10 +89,10 @@
define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä - è - é - ú - á - â - ê - î - ô - û];
-! Irrelevant because of the more general rule that follows
-! define Clitics [ Apos [{ll}|d|{ve}|s|{re}|m|n|{em}] .o. Caseinsensitive ] | ["n" Apos "t"] .o. Caseinsensitive ];
+source en/clitics.xfst
-define Word Char+ ([Apos|Asterisk] Char+)*;
+# define Word Char+ (Apos Char+)*;
+define Word [[ Char+ | Clitics ] - CliticsNTPart | CliticsNT];
define Plusampersand @txt"de/plusampersand.txt";
define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;
@@ -121,7 +121,11 @@
File @-> ... NLout,
Domain @-> ... NLout,
[Emoticons|Arrows] @-> ... NLout
-] .o. ["I" @-> ... NLout \/ NonAbbrI [WS | NLout ]+ _ ];
+]
+! I as a separate token
+.o. ["I" @-> ... NLout \/ NonAbbrI [WS | NLout ]+ _ ]
+! Negative clitics are tokens
+.o. [ {n't} ->@ NLout ... \/ NLout ];
source all/allsentencesplit.xfst
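
For quick reference, the concrete splits asserted by the tests in this change for the composed rules above, collected as a small Go table (values copied from matrix_test.go; "their" is included as a form that must stay whole):

// Token splits asserted in matrix_test.go for the English clitic rules.
var attestedSplits = map[string][]string{
	"they're":  {"they", "'re"},
	"don't":    {"do", "n't"},
	"wouldn't": {"would", "n't"},
	"isn't":    {"is", "n't"},
	"I've":     {"I", "'ve"},
	"their":    {"their"}, // possessive determiner, not a clitic host
}
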
diff --git a/testdata/clitic_test.fst b/testdata/clitic_test.fst
index b373352..c6fe6aa 100644
--- a/testdata/clitic_test.fst
+++ b/testdata/clitic_test.fst
Binary files differ
diff --git a/testdata/clitic_test.matok b/testdata/clitic_test.matok
index 9cc4971..e58b29d 100644
--- a/testdata/clitic_test.matok
+++ b/testdata/clitic_test.matok
Binary files differ
diff --git a/testdata/clitic_test.xfst b/testdata/clitic_test.xfst
index 1b129aa..de6788f 100644
--- a/testdata/clitic_test.xfst
+++ b/testdata/clitic_test.xfst
@@ -1,13 +1,21 @@
define TB "@_TOKEN_BOUND_@";
+# define TB "_";
define WS [" "|"\u000a"|"\u0009"];
define PUNCT ["."|"?"|"!"];
-define Char \[WS|PUNCT];
-define Word Char+;
+define Char \[WS|PUNCT|"'"];
+define Clitic ["'" [{ll}|{d}|{ve}|{s}|{re}|"m"|"n"|"t"]];
+# Following https://web.stanford.edu/~zwicky/ZPCliticsInfl.pdf
+define CliticNTPart [[{do}({es})|{did}|{have}|{has}|{had}|{ca}|{could}|{might}|{sha}|{should}|{wo}|{would}|{dare}|{must}|{need}|{ought}|{are}|{is}|{was}|{were}|{ai}] {n}];
+define CliticNT [CliticNTPart "'" {t}];
+
+
+define Word [ [ Clitic | Char+] - CliticNTPart | CliticNT];
! Compose token boundaries
-define Tokenizer [[Word|PUNCT] @-> ... TB] .o.
+define Tokenizer [[Word|PUNCT|CliticNT] @-> ... TB] .o.
! Compose Whitespace ignorance
[WS+ @-> 0] .o.
! Compose sentence ends
[[PUNCT+] @-> ... TB \/ TB _ ];
-read regex Tokenizer .o. [{'re} ->@ TB ... ];
+read regex Tokenizer .o. [{n't} ->@ TB ... \/ TB ];
+
diff --git a/testdata/tokenizer_en.fst b/testdata/tokenizer_en.fst
index a9411cb..011934a 100644
--- a/testdata/tokenizer_en.fst
+++ b/testdata/tokenizer_en.fst
Binary files differ
diff --git a/testdata/tokenizer_en.matok b/testdata/tokenizer_en.matok
index 20ec553..3ec42a5 100644
--- a/testdata/tokenizer_en.matok
+++ b/testdata/tokenizer_en.matok
Binary files differ