blob: 1c9d99b9e87d9f91f9bdbebeaaa27bda31c59bc9 [file] [log] [blame]
Akron8e1d69b2021-08-12 17:38:49 +02001package main
2
3import (
Akron7e269d42021-08-12 23:18:05 +02004 "fmt"
Akron15bb13d2021-10-30 11:57:41 +02005 "io"
Akron8e1d69b2021-08-12 17:38:49 +02006 "os"
7
Akron527c10c2021-08-13 01:45:18 +02008 "log"
9
Akron7f1097f2021-09-21 16:00:29 +020010 datok "github.com/KorAP/datok"
Akron8e1d69b2021-08-12 17:38:49 +020011 "github.com/alecthomas/kong"
12)
13
// cli declares the command line interface, parsed by kong in main().
// It exposes two subcommands:
//
//   - convert:  compile a foma FST file into a Matrix (default) or
//     Double Array tokenizer file.
//   - tokenize: run a compiled tokenizer over an input file or STDIN
//     ("-"), with flags selecting which annotations are printed.
var cli struct {
	Convert struct {
		// Foma is the compiled foma FST input file (-i).
		Foma string `kong:"required,short='i',help='The Foma FST file'"`
		// Tokenizer is the output file to write (-o).
		Tokenizer string `kong:"required,short='o',help='The Tokenizer file'"`
		// DoubleArray selects the Double Array target format; the
		// Matrix representation is the default when unset (-d).
		DoubleArray bool `kong:"optional,short='d',help='Convert to Double Array instead of Matrix representation'"`
	} `kong:"cmd, help='Convert a compiled foma FST file to a Matrix or Double Array tokenizer'"`
	Tokenize struct {
		// Tokenizer is the compiled tokenizer file to load (-t).
		Tokenizer string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
		// Input is the positional input file; "-" means STDIN.
		Input string `kong:"required,arg='',type='existingfile',help='Input file to tokenize (use - for STDIN)'"`
		// The following booleans each map to one datok.Bits output
		// flag in main(); Tokens and Sentences default to on and are
		// negatable (--no-tokens, --no-sentences).
		Tokens            bool `kong:"optional,negatable,default=true,help='Print token surfaces (defaults to ${default})'"`
		Sentences         bool `kong:"optional,negatable,default=true,help='Print sentence boundaries (defaults to ${default})'"`
		TokenPositions    bool `kong:"optional,default=false,short='p',help='Print token offsets (defaults to ${default})'"`
		SentencePositions bool `kong:"optional,default=false,help='Print sentence offsets (defaults to ${default})'"`
		NewlineAfterEOT   bool `kong:"optional,default=false,help='Ignore newline after EOT (defaults to ${default})'"`
	} `kong:"cmd, help='Tokenize a text'"`
}
30
31// Main method for command line handling
32func main() {
33
34 // Parse command line parameters
35 parser := kong.Must(
36 &cli,
37 kong.Name("datok"),
Akron941f2152021-09-26 15:14:25 +020038 kong.Description("FSA based tokenizer"),
Akron8e1d69b2021-08-12 17:38:49 +020039 kong.UsageOnError(),
40 )
41
Akron7e269d42021-08-12 23:18:05 +020042 ctx, err := parser.Parse(os.Args[1:])
Akron8e1d69b2021-08-12 17:38:49 +020043
44 parser.FatalIfErrorf(err)
45
Akron7e269d42021-08-12 23:18:05 +020046 if ctx.Command() == "convert" {
47 tok := datok.LoadFomaFile(cli.Convert.Foma)
48 if tok == nil {
Akron527c10c2021-08-13 01:45:18 +020049 log.Fatalln("Unable to load foma file")
Akron7e269d42021-08-12 23:18:05 +020050 }
Akron941f2152021-09-26 15:14:25 +020051 if cli.Convert.DoubleArray {
52 dat := tok.ToDoubleArray()
53 _, err := dat.Save(cli.Convert.Tokenizer)
54 if err != nil {
55 log.Fatalln(err)
56 }
57 } else {
58 mat := tok.ToMatrix()
59 _, err := mat.Save(cli.Convert.Tokenizer)
60 if err != nil {
61 log.Fatalln(err)
62 }
Akron7e269d42021-08-12 23:18:05 +020063 }
64 fmt.Println("File successfully converted.")
65 os.Exit(0)
66 }
67
Akron941f2152021-09-26 15:14:25 +020068 // Load the Datok or Matrix file
69 dat := datok.LoadTokenizerFile(cli.Tokenize.Tokenizer)
Akron8e1d69b2021-08-12 17:38:49 +020070
71 // Unable to load the datok file
72 if dat == nil {
Akron941f2152021-09-26 15:14:25 +020073 log.Fatalln("Unable to load file")
Akron8e1d69b2021-08-12 17:38:49 +020074 os.Exit(1)
75 }
76
Akron0f087ea2021-10-27 19:40:15 +020077 // Create flags parameter based on command line parameters
78 var flags datok.Bits
79 if cli.Tokenize.Tokens {
80 flags |= datok.TOKENS
81 }
82
83 if cli.Tokenize.TokenPositions {
84 flags |= datok.TOKEN_POS
85 }
86
87 if cli.Tokenize.Sentences {
88 flags |= datok.SENTENCES
89 }
90
91 if cli.Tokenize.SentencePositions {
92 flags |= datok.SENTENCE_POS
93 }
94
95 if cli.Tokenize.NewlineAfterEOT {
96 flags |= datok.NEWLINE_AFTER_EOT
97 }
98
Akron4f6b28c2021-10-25 00:52:03 +020099 // Create token writer based on the options defined
Akron96fdc9b2021-10-27 21:11:17 +0200100 tw := datok.NewTokenWriter(os.Stdout, flags)
Akron274600e2021-11-03 20:09:06 +0100101 defer os.Stdout.Close()
Akron4f6b28c2021-10-25 00:52:03 +0200102
Akron15bb13d2021-10-30 11:57:41 +0200103 var r io.Reader
104
Akron8e1d69b2021-08-12 17:38:49 +0200105 // Program is running in a pipe
Akron15bb13d2021-10-30 11:57:41 +0200106 if cli.Tokenize.Input == "-" {
107 fileInfo, _ := os.Stdin.Stat()
108 if fileInfo.Mode()&os.ModeCharDevice == 0 {
109 r = os.Stdin
Akron274600e2021-11-03 20:09:06 +0100110 defer os.Stdin.Close()
Akron15bb13d2021-10-30 11:57:41 +0200111 } else {
112 log.Fatalln("Unable to read from STDIN")
113 os.Exit(1)
114 return
115 }
116 } else {
117 f, err := os.Open(cli.Tokenize.Input)
118 if err != nil {
119 log.Fatalln(err)
120 os.Exit(1)
121 return
122 }
123 defer f.Close()
124 r = f
Akron8e1d69b2021-08-12 17:38:49 +0200125 }
Akron15bb13d2021-10-30 11:57:41 +0200126
127 dat.TransduceTokenWriter(r, tw)
128 tw.Flush()
Akron8e1d69b2021-08-12 17:38:49 +0200129}