blob: d97107b77f4815384e158a7d27ddfdbcefe3ab2b [file] [log] [blame]
Akron8e1d69b2021-08-12 17:38:49 +02001package main
2
3import (
Akron7e269d42021-08-12 23:18:05 +02004 "fmt"
Akron15bb13d2021-10-30 11:57:41 +02005 "io"
Akron8e1d69b2021-08-12 17:38:49 +02006 "os"
7
Akron527c10c2021-08-13 01:45:18 +02008 "log"
9
Akron7f1097f2021-09-21 16:00:29 +020010 datok "github.com/KorAP/datok"
Akron8e1d69b2021-08-12 17:38:49 +020011 "github.com/alecthomas/kong"
12)
13
// TODO:
//   - Support version information based on
//     https://blog.carlmjohnson.net/post/2021/golang-118-minor-features/
17
// cli describes the command line interface. The `kong` struct tags declare
// two subcommands ("cmd"), each with its own flags and arguments; the
// fields are filled in when the command line is parsed.
var cli struct {
	// Convert holds the options of the convert subcommand.
	Convert struct {
		Foma        string `kong:"required,short='i',help='The Foma FST file'"`
		Tokenizer   string `kong:"required,short='o',help='The Tokenizer file'"`
		DoubleArray bool   `kong:"optional,short='d',help='Convert to Double Array instead of Matrix representation'"`
	} `kong:"cmd, help='Convert a compiled foma FST file to a Matrix or Double Array tokenizer'"`

	// Tokenize holds the options of the tokenize subcommand. Input is a
	// positional argument; "-" selects STDIN (see the help text).
	Tokenize struct {
		Tokenizer         string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
		Input             string `kong:"required,arg='',type='existingfile',help='Input file to tokenize (use - for STDIN)'"`
		Tokens            bool   `kong:"optional,negatable,default=true,help='Print token surfaces (defaults to ${default})'"`
		Sentences         bool   `kong:"optional,negatable,default=true,help='Print sentence boundaries (defaults to ${default})'"`
		TokenPositions    bool   `kong:"optional,default=false,short='p',help='Print token offsets (defaults to ${default})'"`
		SentencePositions bool   `kong:"optional,default=false,help='Print sentence offsets (defaults to ${default})'"`
		NewlineAfterEOT   bool   `kong:"optional,default=false,help='Ignore newline after EOT (defaults to ${default})'"`
	} `kong:"cmd, help='Tokenize a text'"`
}
34
35// Main method for command line handling
36func main() {
37
38 // Parse command line parameters
39 parser := kong.Must(
40 &cli,
41 kong.Name("datok"),
Akron941f2152021-09-26 15:14:25 +020042 kong.Description("FSA based tokenizer"),
Akron8e1d69b2021-08-12 17:38:49 +020043 kong.UsageOnError(),
44 )
45
Akron7e269d42021-08-12 23:18:05 +020046 ctx, err := parser.Parse(os.Args[1:])
Akron8e1d69b2021-08-12 17:38:49 +020047
48 parser.FatalIfErrorf(err)
49
Akron7e269d42021-08-12 23:18:05 +020050 if ctx.Command() == "convert" {
51 tok := datok.LoadFomaFile(cli.Convert.Foma)
52 if tok == nil {
Akron527c10c2021-08-13 01:45:18 +020053 log.Fatalln("Unable to load foma file")
Akron7e269d42021-08-12 23:18:05 +020054 }
Akron941f2152021-09-26 15:14:25 +020055 if cli.Convert.DoubleArray {
56 dat := tok.ToDoubleArray()
Akron6a4ce182022-04-18 21:15:06 +020057 fmt.Println("Load factor", dat.LoadFactor())
Akron941f2152021-09-26 15:14:25 +020058 _, err := dat.Save(cli.Convert.Tokenizer)
59 if err != nil {
60 log.Fatalln(err)
61 }
62 } else {
63 mat := tok.ToMatrix()
64 _, err := mat.Save(cli.Convert.Tokenizer)
65 if err != nil {
66 log.Fatalln(err)
67 }
Akron7e269d42021-08-12 23:18:05 +020068 }
69 fmt.Println("File successfully converted.")
70 os.Exit(0)
71 }
72
Akron941f2152021-09-26 15:14:25 +020073 // Load the Datok or Matrix file
74 dat := datok.LoadTokenizerFile(cli.Tokenize.Tokenizer)
Akron8e1d69b2021-08-12 17:38:49 +020075
76 // Unable to load the datok file
77 if dat == nil {
Akron941f2152021-09-26 15:14:25 +020078 log.Fatalln("Unable to load file")
Akron8e1d69b2021-08-12 17:38:49 +020079 os.Exit(1)
80 }
81
Akron0f087ea2021-10-27 19:40:15 +020082 // Create flags parameter based on command line parameters
83 var flags datok.Bits
84 if cli.Tokenize.Tokens {
85 flags |= datok.TOKENS
86 }
87
88 if cli.Tokenize.TokenPositions {
89 flags |= datok.TOKEN_POS
90 }
91
92 if cli.Tokenize.Sentences {
93 flags |= datok.SENTENCES
94 }
95
96 if cli.Tokenize.SentencePositions {
97 flags |= datok.SENTENCE_POS
98 }
99
100 if cli.Tokenize.NewlineAfterEOT {
101 flags |= datok.NEWLINE_AFTER_EOT
102 }
103
Akron4f6b28c2021-10-25 00:52:03 +0200104 // Create token writer based on the options defined
Akron96fdc9b2021-10-27 21:11:17 +0200105 tw := datok.NewTokenWriter(os.Stdout, flags)
Akron274600e2021-11-03 20:09:06 +0100106 defer os.Stdout.Close()
Akron4f6b28c2021-10-25 00:52:03 +0200107
Akron15bb13d2021-10-30 11:57:41 +0200108 var r io.Reader
109
Akron8e1d69b2021-08-12 17:38:49 +0200110 // Program is running in a pipe
Akron15bb13d2021-10-30 11:57:41 +0200111 if cli.Tokenize.Input == "-" {
112 fileInfo, _ := os.Stdin.Stat()
113 if fileInfo.Mode()&os.ModeCharDevice == 0 {
114 r = os.Stdin
Akron274600e2021-11-03 20:09:06 +0100115 defer os.Stdin.Close()
Akron15bb13d2021-10-30 11:57:41 +0200116 } else {
117 log.Fatalln("Unable to read from STDIN")
118 os.Exit(1)
119 return
120 }
121 } else {
122 f, err := os.Open(cli.Tokenize.Input)
123 if err != nil {
124 log.Fatalln(err)
125 os.Exit(1)
126 return
127 }
128 defer f.Close()
129 r = f
Akron8e1d69b2021-08-12 17:38:49 +0200130 }
Akron15bb13d2021-10-30 11:57:41 +0200131
132 dat.TransduceTokenWriter(r, tw)
133 tw.Flush()
Akron8e1d69b2021-08-12 17:38:49 +0200134}