blob: 1c1a7810e84bb6164d46494d2d579209d0d4a973 [file] [log] [blame]
Akron8e1d69b2021-08-12 17:38:49 +02001package main
2
import (
	"fmt"
	"io"
	"log"
	"os"

	datok "github.com/KorAP/datok"
	"github.com/alecthomas/kong"
)
13
Akron54ed7e72022-01-04 12:05:00 +010014// TODO:
15// - Support version information based on
16// https://blog.carlmjohnson.net/post/2021/golang-118-minor-features/
17
// cli declares the command line interface, parsed by kong.
// It exposes two subcommands: "convert" (foma FST -> tokenizer file)
// and "tokenize" (run a tokenizer over a file or STDIN).
var cli struct {
	// Convert turns a compiled foma FST file into the native
	// tokenizer representation: Matrix by default, Double Array with -d.
	Convert struct {
		Foma        string `kong:"required,short='i',help='The Foma FST file'"`
		Tokenizer   string `kong:"required,short='o',help='The Tokenizer file'"`
		DoubleArray bool   `kong:"optional,short='d',help='Convert to Double Array instead of Matrix representation'"`
	} `kong:"cmd, help='Convert a compiled foma FST file to a Matrix or Double Array tokenizer'"`
	// Tokenize streams an input text through a loaded tokenizer.
	// The boolean flags are translated to datok.Bits in main.
	Tokenize struct {
		Tokenizer         string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
		Input             string `kong:"required,arg='',type='existingfile',help='Input file to tokenize (use - for STDIN)'"`
		Tokens            bool   `kong:"optional,negatable,default=true,help='Print token surfaces (defaults to ${default})'"`
		Sentences         bool   `kong:"optional,negatable,default=true,help='Print sentence boundaries (defaults to ${default})'"`
		TokenPositions    bool   `kong:"optional,default=false,short='p',help='Print token offsets (defaults to ${default})'"`
		SentencePositions bool   `kong:"optional,default=false,help='Print sentence offsets (defaults to ${default})'"`
		NewlineAfterEOT   bool   `kong:"optional,default=false,help='Ignore newline after EOT (defaults to ${default})'"`
	} `kong:"cmd, help='Tokenize a text'"`
}
34
35// Main method for command line handling
36func main() {
37
38 // Parse command line parameters
39 parser := kong.Must(
40 &cli,
41 kong.Name("datok"),
Akron941f2152021-09-26 15:14:25 +020042 kong.Description("FSA based tokenizer"),
Akron8e1d69b2021-08-12 17:38:49 +020043 kong.UsageOnError(),
44 )
45
Akron7e269d42021-08-12 23:18:05 +020046 ctx, err := parser.Parse(os.Args[1:])
Akron8e1d69b2021-08-12 17:38:49 +020047
48 parser.FatalIfErrorf(err)
49
Akron7e269d42021-08-12 23:18:05 +020050 if ctx.Command() == "convert" {
51 tok := datok.LoadFomaFile(cli.Convert.Foma)
52 if tok == nil {
Akron527c10c2021-08-13 01:45:18 +020053 log.Fatalln("Unable to load foma file")
Akron7e269d42021-08-12 23:18:05 +020054 }
Akron941f2152021-09-26 15:14:25 +020055 if cli.Convert.DoubleArray {
56 dat := tok.ToDoubleArray()
57 _, err := dat.Save(cli.Convert.Tokenizer)
58 if err != nil {
59 log.Fatalln(err)
60 }
61 } else {
62 mat := tok.ToMatrix()
63 _, err := mat.Save(cli.Convert.Tokenizer)
64 if err != nil {
65 log.Fatalln(err)
66 }
Akron7e269d42021-08-12 23:18:05 +020067 }
68 fmt.Println("File successfully converted.")
69 os.Exit(0)
70 }
71
Akron941f2152021-09-26 15:14:25 +020072 // Load the Datok or Matrix file
73 dat := datok.LoadTokenizerFile(cli.Tokenize.Tokenizer)
Akron8e1d69b2021-08-12 17:38:49 +020074
75 // Unable to load the datok file
76 if dat == nil {
Akron941f2152021-09-26 15:14:25 +020077 log.Fatalln("Unable to load file")
Akron8e1d69b2021-08-12 17:38:49 +020078 os.Exit(1)
79 }
80
Akron0f087ea2021-10-27 19:40:15 +020081 // Create flags parameter based on command line parameters
82 var flags datok.Bits
83 if cli.Tokenize.Tokens {
84 flags |= datok.TOKENS
85 }
86
87 if cli.Tokenize.TokenPositions {
88 flags |= datok.TOKEN_POS
89 }
90
91 if cli.Tokenize.Sentences {
92 flags |= datok.SENTENCES
93 }
94
95 if cli.Tokenize.SentencePositions {
96 flags |= datok.SENTENCE_POS
97 }
98
99 if cli.Tokenize.NewlineAfterEOT {
100 flags |= datok.NEWLINE_AFTER_EOT
101 }
102
Akron4f6b28c2021-10-25 00:52:03 +0200103 // Create token writer based on the options defined
Akron96fdc9b2021-10-27 21:11:17 +0200104 tw := datok.NewTokenWriter(os.Stdout, flags)
Akron274600e2021-11-03 20:09:06 +0100105 defer os.Stdout.Close()
Akron4f6b28c2021-10-25 00:52:03 +0200106
Akron15bb13d2021-10-30 11:57:41 +0200107 var r io.Reader
108
Akron8e1d69b2021-08-12 17:38:49 +0200109 // Program is running in a pipe
Akron15bb13d2021-10-30 11:57:41 +0200110 if cli.Tokenize.Input == "-" {
111 fileInfo, _ := os.Stdin.Stat()
112 if fileInfo.Mode()&os.ModeCharDevice == 0 {
113 r = os.Stdin
Akron274600e2021-11-03 20:09:06 +0100114 defer os.Stdin.Close()
Akron15bb13d2021-10-30 11:57:41 +0200115 } else {
116 log.Fatalln("Unable to read from STDIN")
117 os.Exit(1)
118 return
119 }
120 } else {
121 f, err := os.Open(cli.Tokenize.Input)
122 if err != nil {
123 log.Fatalln(err)
124 os.Exit(1)
125 return
126 }
127 defer f.Close()
128 r = f
Akron8e1d69b2021-08-12 17:38:49 +0200129 }
Akron15bb13d2021-10-30 11:57:41 +0200130
131 dat.TransduceTokenWriter(r, tw)
132 tw.Flush()
Akron8e1d69b2021-08-12 17:38:49 +0200133}