blob: e2a4efb05803b9f8695cd7f97b478c34e7342d6a [file] [log] [blame]
Akron8e1d69b2021-08-12 17:38:49 +02001package main
2
3import (
Akron7e269d42021-08-12 23:18:05 +02004 "fmt"
Akron15bb13d2021-10-30 11:57:41 +02005 "io"
Akron8e1d69b2021-08-12 17:38:49 +02006 "os"
7
Akron527c10c2021-08-13 01:45:18 +02008 "log"
9
Akron7f1097f2021-09-21 16:00:29 +020010 datok "github.com/KorAP/datok"
Akron8e1d69b2021-08-12 17:38:49 +020011 "github.com/alecthomas/kong"
12)
13
// cli declares the command line interface, parsed by kong.
// It exposes two subcommands: "convert" compiles a foma FST file
// into a tokenizer file, and "tokenize" applies a tokenizer to a text.
var cli struct {
	// Convert holds the options of the "convert" subcommand.
	Convert struct {
		Foma      string `kong:"required,short='i',help='The Foma FST file'"`
		Tokenizer string `kong:"required,short='o',help='The Tokenizer file'"`
		// Target representation: Double Array when set, Matrix otherwise.
		DoubleArray bool `kong:"optional,short='d',help='Convert to Double Array instead of Matrix representation'"`
	} `kong:"cmd, help='Convert a foma file to a Matrix or Double Array tokenizer'"`
	// Tokenize holds the options of the "tokenize" subcommand.
	Tokenize struct {
		Tokenizer string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
		Input     string `kong:"required,arg='',type='existingfile',help='Input file to tokenize (use - for STDIN)'"`
		// Output flags; Tokens and Sentences are negatable and default to true.
		Tokens            bool `kong:"optional,negatable,default=true,help='Print token surfaces (defaults to ${default})'"`
		Sentences         bool `kong:"optional,negatable,default=true,help='Print sentence boundaries (defaults to ${default})'"`
		TokenPositions    bool `kong:"optional,default=false,short='p',help='Print token offsets (defaults to ${default})'"`
		SentencePositions bool `kong:"optional,default=false,help='Print sentence offsets (defaults to ${default})'"`
		NewlineAfterEOT   bool `kong:"optional,default=false,help='Ignore newline after EOT (defaults to ${default})'"`
	} `kong:"cmd, help='Tokenize a text'"`
}
30
31// Main method for command line handling
32func main() {
33
34 // Parse command line parameters
35 parser := kong.Must(
36 &cli,
37 kong.Name("datok"),
Akron941f2152021-09-26 15:14:25 +020038 kong.Description("FSA based tokenizer"),
Akron8e1d69b2021-08-12 17:38:49 +020039 kong.UsageOnError(),
40 )
41
Akron7e269d42021-08-12 23:18:05 +020042 ctx, err := parser.Parse(os.Args[1:])
Akron8e1d69b2021-08-12 17:38:49 +020043
44 parser.FatalIfErrorf(err)
45
Akron7e269d42021-08-12 23:18:05 +020046 if ctx.Command() == "convert" {
47 tok := datok.LoadFomaFile(cli.Convert.Foma)
48 if tok == nil {
Akron527c10c2021-08-13 01:45:18 +020049 log.Fatalln("Unable to load foma file")
Akron7e269d42021-08-12 23:18:05 +020050 }
Akron941f2152021-09-26 15:14:25 +020051 if cli.Convert.DoubleArray {
52 dat := tok.ToDoubleArray()
53 _, err := dat.Save(cli.Convert.Tokenizer)
54 if err != nil {
55 log.Fatalln(err)
56 }
57 } else {
58 mat := tok.ToMatrix()
59 _, err := mat.Save(cli.Convert.Tokenizer)
60 if err != nil {
61 log.Fatalln(err)
62 }
Akron7e269d42021-08-12 23:18:05 +020063 }
64 fmt.Println("File successfully converted.")
65 os.Exit(0)
66 }
67
Akron941f2152021-09-26 15:14:25 +020068 // Load the Datok or Matrix file
69 dat := datok.LoadTokenizerFile(cli.Tokenize.Tokenizer)
Akron8e1d69b2021-08-12 17:38:49 +020070
71 // Unable to load the datok file
72 if dat == nil {
Akron941f2152021-09-26 15:14:25 +020073 log.Fatalln("Unable to load file")
Akron8e1d69b2021-08-12 17:38:49 +020074 os.Exit(1)
75 }
76
Akron0f087ea2021-10-27 19:40:15 +020077 // Create flags parameter based on command line parameters
78 var flags datok.Bits
79 if cli.Tokenize.Tokens {
80 flags |= datok.TOKENS
81 }
82
83 if cli.Tokenize.TokenPositions {
84 flags |= datok.TOKEN_POS
85 }
86
87 if cli.Tokenize.Sentences {
88 flags |= datok.SENTENCES
89 }
90
91 if cli.Tokenize.SentencePositions {
92 flags |= datok.SENTENCE_POS
93 }
94
95 if cli.Tokenize.NewlineAfterEOT {
96 flags |= datok.NEWLINE_AFTER_EOT
97 }
98
Akron4f6b28c2021-10-25 00:52:03 +020099 // Create token writer based on the options defined
Akron96fdc9b2021-10-27 21:11:17 +0200100 tw := datok.NewTokenWriter(os.Stdout, flags)
Akron4f6b28c2021-10-25 00:52:03 +0200101
Akron15bb13d2021-10-30 11:57:41 +0200102 var r io.Reader
103
Akron8e1d69b2021-08-12 17:38:49 +0200104 // Program is running in a pipe
Akron15bb13d2021-10-30 11:57:41 +0200105 if cli.Tokenize.Input == "-" {
106 fileInfo, _ := os.Stdin.Stat()
107 if fileInfo.Mode()&os.ModeCharDevice == 0 {
108 r = os.Stdin
109 } else {
110 log.Fatalln("Unable to read from STDIN")
111 os.Exit(1)
112 return
113 }
114 } else {
115 f, err := os.Open(cli.Tokenize.Input)
116 if err != nil {
117 log.Fatalln(err)
118 os.Exit(1)
119 return
120 }
121 defer f.Close()
122 r = f
Akron8e1d69b2021-08-12 17:38:49 +0200123 }
Akron15bb13d2021-10-30 11:57:41 +0200124
125 dat.TransduceTokenWriter(r, tw)
126 tw.Flush()
Akron8e1d69b2021-08-12 17:38:49 +0200127}