package main

import (
	"fmt"
	"log"
	"os"

	datok "github.com/KorAP/datok"
	"github.com/alecthomas/kong"
)

var cli struct {
	Convert struct {
		Foma        string `kong:"required,short='i',help='The Foma file'"`
		Tokenizer   string `kong:"required,short='o',help='The Tokenizer file'"`
		DoubleArray bool   `kong:"optional,short='d',help='Convert to Double Array instead of Matrix representation'"`
	} `kong:"cmd, help='Convert a foma file to a Matrix or Double Array tokenizer'"`
	Tokenize struct {
		Tokenizer       string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
		Positions       bool   `kong:"optional,negatable,default=false,short='p',help='Print token offsets'"`
		Tokens          bool   `kong:"optional,negatable,default=true,help='Print token surfaces'"`
		NewlineAfterEOT bool   `kong:"optional,negatable,help='Ignore newline after EOT'"`
	} `kong:"cmd, help='Tokenize a text'"`
}
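
// Example invocations (a sketch only; the subcommands and flags follow the
// cli struct above, while file names like mytokenizer.fst are hypothetical):
//
//	datok convert -i mytokenizer.fst -o mytokenizer.tok
//	datok convert -i mytokenizer.fst -o mytokenizer.tok -d
//	cat corpus.txt | datok tokenize -t mytokenizer.tok -p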

// Main method for command line handling
func main() {

	// Parse command line parameters
	parser := kong.Must(
		&cli,
		kong.Name("datok"),
		kong.Description("FSA based tokenizer"),
		kong.UsageOnError(),
	)

	ctx, err := parser.Parse(os.Args[1:])

	parser.FatalIfErrorf(err)

	// Convert the foma file to a double array or matrix tokenizer and exit
	if ctx.Command() == "convert" {
		tok := datok.LoadFomaFile(cli.Convert.Foma)
		if tok == nil {
			log.Fatalln("Unable to load foma file")
		}
		if cli.Convert.DoubleArray {
			dat := tok.ToDoubleArray()
			_, err := dat.Save(cli.Convert.Tokenizer)
			if err != nil {
				log.Fatalln(err)
			}
		} else {
			mat := tok.ToMatrix()
			_, err := mat.Save(cli.Convert.Tokenizer)
			if err != nil {
				log.Fatalln(err)
			}
		}
		fmt.Println("File successfully converted.")
		os.Exit(0)
	}

	// Load the Datok (double array) or Matrix tokenizer file
	dat := datok.LoadTokenizerFile(cli.Tokenize.Tokenizer)

	// Bail out if the tokenizer file could not be loaded
	// (log.Fatalln already exits with status 1)
	if dat == nil {
		log.Fatalln("Unable to load file")
	}

	// Create token writer based on the options defined
	tw := datok.NewTokenWriterFromOptions(
		os.Stdout,
		cli.Tokenize.Positions,
		cli.Tokenize.NewlineAfterEOT,
	)

	// Only tokenize if the program is running in a pipe,
	// i.e. stdin is not attached to a terminal
	fileInfo, _ := os.Stdin.Stat()
	if fileInfo.Mode()&os.ModeCharDevice == 0 {
		dat.TransduceTokenWriter(os.Stdin, tw)
		tw.Flush()
	}
}