blob: 08b4160aea676a3842a7f11ff2238a9138ee5859 [file] [log] [blame]
Akron8e1d69b2021-08-12 17:38:49 +02001package main
2
3import (
Akron7e269d42021-08-12 23:18:05 +02004 "fmt"
Akron8e1d69b2021-08-12 17:38:49 +02005 "os"
6
Akron527c10c2021-08-13 01:45:18 +02007 "log"
8
Akron7f1097f2021-09-21 16:00:29 +02009 datok "github.com/KorAP/datok"
Akron8e1d69b2021-08-12 17:38:49 +020010 "github.com/alecthomas/kong"
11)
12
// cli describes the command line interface. It is handed to kong in main,
// which fills it in from os.Args based on the `kong` struct tags. Each nested
// struct is tagged `cmd`, i.e. it is one subcommand with its own flags.
var cli struct {
	// Convert holds the options of the "convert" subcommand: the foma input
	// file, the tokenizer output file and the choice of output representation.
	Convert struct {
		Foma        string `kong:"required,short='i',help='The Foma file'"`
		Tokenizer   string `kong:"required,short='o',help='The Tokenizer file'"`
		DoubleArray bool   `kong:"optional,short='d',help='Convert to Double Array instead of Matrix representation'"`
	} `kong:"cmd, help='Convert a foma file to a Matrix or Double Array tokenizer'"`

	// Tokenize holds the options of the "tokenize" subcommand: the tokenizer
	// file to load and the switches that main translates into datok.Bits flags.
	Tokenize struct {
		Tokenizer         string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
		Tokens            bool   `kong:"optional,negatable,default=true,help='Print token surfaces'"`
		Sentences         bool   `kong:"optional,negatable,default=true,help='Print sentence boundaries'"`
		TokenPositions    bool   `kong:"optional,negatable,default=false,short='p',help='Print token offsets'"`
		SentencePositions bool   `kong:"optional,negatable,default=false,help='Print sentence offsets'"`
		NewlineAfterEOT   bool   `kong:"optional,negatable,default=false,help='Ignore newline after EOT'"`
	} `kong:"cmd, help='Tokenize a text'"`
}
28
29// Main method for command line handling
30func main() {
31
32 // Parse command line parameters
33 parser := kong.Must(
34 &cli,
35 kong.Name("datok"),
Akron941f2152021-09-26 15:14:25 +020036 kong.Description("FSA based tokenizer"),
Akron8e1d69b2021-08-12 17:38:49 +020037 kong.UsageOnError(),
38 )
39
Akron7e269d42021-08-12 23:18:05 +020040 ctx, err := parser.Parse(os.Args[1:])
Akron8e1d69b2021-08-12 17:38:49 +020041
42 parser.FatalIfErrorf(err)
43
Akron7e269d42021-08-12 23:18:05 +020044 if ctx.Command() == "convert" {
45 tok := datok.LoadFomaFile(cli.Convert.Foma)
46 if tok == nil {
Akron527c10c2021-08-13 01:45:18 +020047 log.Fatalln("Unable to load foma file")
Akron7e269d42021-08-12 23:18:05 +020048 }
Akron941f2152021-09-26 15:14:25 +020049 if cli.Convert.DoubleArray {
50 dat := tok.ToDoubleArray()
51 _, err := dat.Save(cli.Convert.Tokenizer)
52 if err != nil {
53 log.Fatalln(err)
54 }
55 } else {
56 mat := tok.ToMatrix()
57 _, err := mat.Save(cli.Convert.Tokenizer)
58 if err != nil {
59 log.Fatalln(err)
60 }
Akron7e269d42021-08-12 23:18:05 +020061 }
62 fmt.Println("File successfully converted.")
63 os.Exit(0)
64 }
65
Akron941f2152021-09-26 15:14:25 +020066 // Load the Datok or Matrix file
67 dat := datok.LoadTokenizerFile(cli.Tokenize.Tokenizer)
Akron8e1d69b2021-08-12 17:38:49 +020068
69 // Unable to load the datok file
70 if dat == nil {
Akron941f2152021-09-26 15:14:25 +020071 log.Fatalln("Unable to load file")
Akron8e1d69b2021-08-12 17:38:49 +020072 os.Exit(1)
73 }
74
Akron0f087ea2021-10-27 19:40:15 +020075 // Create flags parameter based on command line parameters
76 var flags datok.Bits
77 if cli.Tokenize.Tokens {
78 flags |= datok.TOKENS
79 }
80
81 if cli.Tokenize.TokenPositions {
82 flags |= datok.TOKEN_POS
83 }
84
85 if cli.Tokenize.Sentences {
86 flags |= datok.SENTENCES
87 }
88
89 if cli.Tokenize.SentencePositions {
90 flags |= datok.SENTENCE_POS
91 }
92
93 if cli.Tokenize.NewlineAfterEOT {
94 flags |= datok.NEWLINE_AFTER_EOT
95 }
96
Akron4f6b28c2021-10-25 00:52:03 +020097 // Create token writer based on the options defined
Akron0f087ea2021-10-27 19:40:15 +020098 tw := datok.NewTokenWriterFromOptions(os.Stdout, flags)
Akron4f6b28c2021-10-25 00:52:03 +020099
Akron8e1d69b2021-08-12 17:38:49 +0200100 // Program is running in a pipe
101 fileInfo, _ := os.Stdin.Stat()
102 if fileInfo.Mode()&os.ModeCharDevice == 0 {
Akron4f6b28c2021-10-25 00:52:03 +0200103 dat.TransduceTokenWriter(os.Stdin, tw)
104 tw.Flush()
Akron8e1d69b2021-08-12 17:38:49 +0200105 }
106}