Introduce English tokenizer
Change-Id: I5b60d9a4de8db3c5730957335fc674adb4fccf0f
diff --git a/src/en/abbrv.txt b/src/en/abbrv.txt
new file mode 100644
index 0000000..3aaf94e
--- /dev/null
+++ b/src/en/abbrv.txt
@@ -0,0 +1,346 @@
+abt
+adfsd
+agg
+Agg
+AkH
+Analg
+Androl
+Anm
+Anniv
+anniv
+Apollod
+Appr
+appr
+approx
+Approx
+apptd
+Appx
+Aprox
+aprox
+apx
+Assoc
+assoc
+asst
+Auk
+Ave
+avg
+Avg
+Bacteriol
+Balf
+bap
+bef
+Benth
+Bioeng
+Bip
+Blg
+Blvd
+Boiss
+Bojan
+Bonpl
+Burm
+BVN
+ca
+Cardiol
+cca
+Chemother
+Chevr
+cho
+choreo
+chpt
+cir
+Cir
+Cist
+Cmnd
+Cor
+cr
+Cuatrec
+Cunn
+Cy
+Dall
+Davidsz
+defns
+dep
+dept
+Dept
+Dermatol
+Desf
+Desv
+Deut
+dis
+Dis
+disamb
+Disamb
+Disord
+disord
+Dispos
+dispos
+dolore
+dr
+Dr
+DR
+Eckl
+Ed
+ed
+Eds
+eds
+Ehrend
+elev
+Elysa
+Emph
+emph
+Engelm
+Enzymol
+Ep
+EpGuides
+Eph
+Eps
+eq
+Eq
+Eqs
+equiv
+Equiv
+Ericka
+esp
+Esp
+estb
+estd
+Estd
+et
+Et
+Ethn
+exh
+Exod
+Ezek
+Fairm
+fig
+Fig
+figs
+Figs
+fl
+fn
+fol
+foll
+fols
+frg
+frr
+Genet
+Gesch
+Gracch
+Graec
+Grav
+gs
+Guill
+Gyll
+habuit
+Haematol
+Haemost
+Heb
+Hematol
+Henn
+Hepatol
+Hered
+Hertig
+hlm
+Hochst
+Holmiae
+hrsg
+Humb
+Hwy
+Hyg
+Hypertens
+Ikirun
+Immun
+immunol
+Immunol
+Instrum
+Iss
+Jaub
+JBNHS
+jct
+Jv
+Kelloff
+KES
+Kfz
+Korch
+Kyal
+laen
+Lehm
+Lett
+Lindl
+Loc
+loc
+Lond
+Macc
+mag
+Mag
+maint
+Margalit
+Masc
+masc
+Max
+max
+Meisn
+Microlep
+Mildbr
+Min
+min
+Misc
+misc
+Mitja
+Monit
+Movt
+mr
+Mr
+MR
+mrs
+Mrs
+MRS
+ms
+Ms
+MS
+Naturalist
+Navig
+Neg
+neg
+Neh
+Neof
+Neurophysiol
+neut
+ngupil
+NJCL
+nº
+No
+Nº
+Nohlen
+Noordel
+nos
+Nos
+nr
+Nr
+núm
+Núm
+Nutr
+Oberw
+Oliv
+Oncol
+op
+Op
+Ophthalmol
+osth
+Pag
+pagg
+pags
+paragr
+Pathog
+Pathol
+Pav
+pbk
+pct
+Periodontol
+pers
+Perspect
+Pes
+pg
+Pg
+pgs
+Pgs
+Pipo
+Planch
+Pls
+Plz
+Poepp
+Polyb
+pos
+potest
+poz
+pp
+Pp
+prec
+pref
+prof
+Prof
+PROF
+PSl
+Psychopharmacol
+puto
+Qld
+qtd
+Qtd
+Rafiuddin
+Rchb
+Rd
+Rheumatol
+Rhif
+ric
+Ridl
+Rodr
+Roem
+Romagn
+Rp
+Rptr
+Rs
+Rte
+Rul
+Sacc
+Sadayakko
+Salisb
+Schltdl
+Schltr
+Schnepf
+Schoenh
+Schrad
+Schum
+Schumach
+Scler
+Scol
+Seidenf
+Sep
+shaadi
+Shab
+shd
+Sln
+Sm
+spol
+sq
+ssp
+St
+Standl
+Stat
+Steril
+Str
+Strab
+subd
+subsp
+supp
+Supp
+Tas
+Teijsm
+tel
+Ther
+Thess
+Thm
+tj
+Tms
+Torr
+Ulmus
+USFWS
+usu
+var
+Verm
+Virol
+vs
+vz
+Vz
+Waldst
+Welw
+Wendl
+Wochenschr
+WoO
+wz
+Wz
+xl
+xliv
+Yeb
+Zaven
+Zeb
+zm
diff --git a/src/en/tokenizer.xfst b/src/en/tokenizer.xfst
new file mode 100644
index 0000000..6b3cc78
--- /dev/null
+++ b/src/en/tokenizer.xfst
@@ -0,0 +1,124 @@
+source all/allpref.xfst
+! Caseinsensitive: parallel optional rewrite rules that let each lowercase letter also surface as its uppercase form (ß as the sequence SS).
+define Caseinsensitive [
+a (->) A,
+b (->) B,
+c (->) C,
+d (->) D,
+e (->) E,
+f (->) F,
+g (->) G,
+h (->) H,
+i (->) I,
+j (->) J,
+k (->) K,
+l (->) L,
+m (->) M,
+n (->) N,
+o (->) O,
+p (->) P,
+q (->) Q,
+r (->) R,
+s (->) S,
+t (->) T,
+u (->) U,
+v (->) V,
+w (->) W,
+x (->) X,
+y (->) Y,
+z (->) Z,
+ö (->) Ö,
+ü (->) Ü,
+ä (->) Ä,
+è (->) È,
+é (->) É,
+ú (->) Ú,
+á (->) Á,
+â (->) Â,
+ê (->) Ê,
+î (->) Î,
+ô (->) Ô,
+û (->) Û,
+ß (->) {SS}
+];
+
+! CapitalCaseinsensitive: the same optional uppercasing as Caseinsensitive, but the `|| .#. _` context limits it to the symbol right after the string start. The rule list is repeated because foma complains when this reuses the above definition.
+define CapitalCaseinsensitive [
+a (->) A,
+b (->) B,
+c (->) C,
+d (->) D,
+e (->) E,
+f (->) F,
+g (->) G,
+h (->) H,
+i (->) I,
+j (->) J,
+k (->) K,
+l (->) L,
+m (->) M,
+n (->) N,
+o (->) O,
+p (->) P,
+q (->) Q,
+r (->) R,
+s (->) S,
+t (->) T,
+u (->) U,
+v (->) V,
+w (->) W,
+x (->) X,
+y (->) Y,
+z (->) Z,
+ö (->) Ö,
+ü (->) Ü,
+ä (->) Ä,
+è (->) È,
+é (->) É,
+ú (->) Ú,
+á (->) Á,
+â (->) Â,
+ê (->) Ê,
+î (->) Î,
+ô (->) Ô,
+û (->) Û,
+ß (->) {SS}
+|| .#. _ ];
+! Letter: AsciiLetter (defined outside the visible part of this file) or one of the listed accented letters/ß, composed with Caseinsensitive.
+define Letter [ [ AsciiLetter | ö | ü | ä | è | é | ú | á | â | ê | î | ô | û | ß ] .o. Caseinsensitive ];
+! NotSmallCaps: any single symbol other than lowercase a-z and the listed lowercase accented letters. NOTE(review): ß is not subtracted here - confirm that is intended.
+define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä - è - é - ú - á - â - ê - î - ô - û];
+
+! Irrelevant because of the more general rule that follows
+! define Clitics [ Apos [{ll}|d|{ve}|s|{re}|m|n|{em}] .o. Caseinsensitive ] | ["n" Apos "t"] .o. Caseinsensitive ];
+! Word: one or more Char, optionally continued by further Char runs that are each introduced by Apos or Asterisk.
+define Word Char+ ([Apos|Asterisk] Char+)*;
+! Redefine Word: Plusampersand entries and words may be chained with Dash. NOTE(review): the list is loaded from de/, not en/ - confirm it is meant to be shared.
+define Plusampersand @txt"de/plusampersand.txt";
+define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;
+
+! Abbreviations and Initials: an en/abbrv.txt entry, a single Letter or a month abbreviation, optionally with its first letter uppercased, followed by a literal period (%.)
+define Months [{Jan}|{Feb}|{Mar}|{Apr}|{Jun}|{Jul}|{Aug}|{Sep}(t)|{Oct}|{Nov}|{Dec}];
+define Abbr [ [ @txt"en/abbrv.txt" | Letter | Months ] .o. CapitalCaseinsensitive ] %.;
+
+source all/allpost.xfst
+
+echo - Compile Real Token
+! RealToken: union of all token classes; apart from Abbr and Word the names are defined outside the visible part of this file.
+define RealToken [Punct|Emdash|Abbr|Word|SNS|AcronymDep|Ord|Num|Years|Times|XMLEntities|Omission];
+
+echo - Introduce Token splitter
+! Token: parallel left-to-right longest-match rules; `...` keeps each match in place and NLout is inserted directly after it.
+define Token [
+ RealToken @-> ... NLout,
+ XML @-> ... NLout,
+ URL @-> ... NLout,
+ Email @-> ... NLout,
+ File @-> ... NLout,
+ Domain @-> ... NLout,
+ [Emoticons|Arrows] @-> ... NLout
+];
+
+source all/allsentencesplit.xfst
+
+! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
\ No newline at end of file