package main

import (
	"fmt"
	"log"
	"os"

	datok "github.com/KorAP/datok"
	"github.com/alecthomas/kong"
)

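// Command line interface declared for the kong parser.
// Two subcommands are supported: "convert" turns a foma file into
// a matrix or double array tokenizer file, and "tokenize" applies
// such a tokenizer to text read from standard input.
//
// Illustrative invocations (file names are examples only):
//
//	datok convert -i tokenizer.fst -o tokenizer.matok
//	echo "This is a sentence." | datok tokenize -t tokenizer.matok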
var cli struct {
	Convert struct {
		Foma        string `kong:"required,short='i',help='The Foma file'"`
		Tokenizer   string `kong:"required,short='o',help='The Tokenizer file'"`
		DoubleArray bool   `kong:"optional,short='d',help='Convert to Double Array instead of Matrix representation'"`
	} `kong:"cmd, help='Convert a foma file to a Matrix or Double Array tokenizer'"`
	Tokenize struct {
		Tokenizer string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
		Positions bool   `kong:"optional,negatable,default=false,short='p',help='Print token offsets'"`
		Tokens    bool   `kong:"optional,negatable,default=true,help='Print token surfaces'"`
	} `kong:"cmd, help='Tokenize a text'"`
}

// Main method for command line handling
func main() {

	// Parse command line parameters
	parser := kong.Must(
		&cli,
		kong.Name("datok"),
		kong.Description("FSA based tokenizer"),
		kong.UsageOnError(),
	)

	ctx, err := parser.Parse(os.Args[1:])

	parser.FatalIfErrorf(err)

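	// Conversion mode: read the foma file and write it back as a
	// double array or matrix tokenizer, depending on the -d flag.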
	if ctx.Command() == "convert" {
		tok := datok.LoadFomaFile(cli.Convert.Foma)
		if tok == nil {
			log.Fatalln("Unable to load foma file")
		}
		if cli.Convert.DoubleArray {
			dat := tok.ToDoubleArray()
			_, err := dat.Save(cli.Convert.Tokenizer)
			if err != nil {
				log.Fatalln(err)
			}
		} else {
			mat := tok.ToMatrix()
			_, err := mat.Save(cli.Convert.Tokenizer)
			if err != nil {
				log.Fatalln(err)
			}
		}
		fmt.Println("File successfully converted.")
		os.Exit(0)
	}

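	// At this point the command can only be "tokenize",
	// since the convert branch above exits the program.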
	// Load the double array or matrix tokenizer file
	dat := datok.LoadTokenizerFile(cli.Tokenize.Tokenizer)

	// Abort if the tokenizer file could not be loaded
	// (log.Fatalln already exits with status 1)
	if dat == nil {
		log.Fatalln("Unable to load tokenizer file")
	}

	// Create a token writer based on the given options
	tw := datok.NewTokenWriterFromOptions(os.Stdout, cli.Tokenize.Positions)

	// Only tokenize if the program receives piped input,
	// i.e. stdin is not a terminal
	fileInfo, err := os.Stdin.Stat()
	if err == nil && fileInfo.Mode()&os.ModeCharDevice == 0 {
		dat.TransduceTokenWriter(os.Stdin, tw)
		tw.Flush()
	}
}