package main

import (
	"fmt"
	"log"
	"os"

	datok "github.com/KorAP/datok"
	"github.com/alecthomas/kong"
)

// The command line interface is declared as a tagged struct and parsed by
// kong: the tags below define the "convert" and "tokenize" subcommands and
// their flags.
var cli struct {
	Convert struct {
		Foma        string `kong:"required,short='i',help='The Foma file'"`
		Tokenizer   string `kong:"required,short='o',help='The Tokenizer file'"`
		DoubleArray bool   `kong:"optional,short='d',help='Convert to Double Array instead of Matrix representation'"`
	} `kong:"cmd, help='Convert a foma file to a Matrix or Double Array tokenizer'"`
	Tokenize struct {
		Tokenizer       string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
		Positions       bool   `kong:"optional,negatable,default=false,short='p',help='Print token offsets'"`
		Tokens          bool   `kong:"optional,negatable,default=true,help='Print token surfaces'"`
		NewlineAfterEOT bool   `kong:"optional,negatable,help='Ignore newline after EOT'"`
	} `kong:"cmd, help='Tokenize a text'"`
}
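
// Example invocations (the file names are placeholders only):
//
//	datok convert -i mytokenizer.fst -o mytokenizer.matok
//	cat corpus.txt | datok tokenize -t mytokenizer.matok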

// Main function for command line handling
func main() {

	// Parse command line parameters
	parser := kong.Must(
		&cli,
		kong.Name("datok"),
		kong.Description("FSA-based tokenizer"),
		kong.UsageOnError(),
	)

	ctx, err := parser.Parse(os.Args[1:])

	parser.FatalIfErrorf(err)

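	// kong exposes the matched subcommand via ctx.Command():
	// the convert branch is handled here and exits; otherwise
	// the tokenize branch below runs.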
	if ctx.Command() == "convert" {
		tok := datok.LoadFomaFile(cli.Convert.Foma)
		if tok == nil {
			log.Fatalln("Unable to load foma file")
		}
		if cli.Convert.DoubleArray {
			dat := tok.ToDoubleArray()
			_, err := dat.Save(cli.Convert.Tokenizer)
			if err != nil {
				log.Fatalln(err)
			}
		} else {
			mat := tok.ToMatrix()
			_, err := mat.Save(cli.Convert.Tokenizer)
			if err != nil {
				log.Fatalln(err)
			}
		}
		fmt.Println("File successfully converted.")
		os.Exit(0)
	}

	// Load the Double Array or Matrix tokenizer file
	dat := datok.LoadTokenizerFile(cli.Tokenize.Tokenizer)

	// Abort if the tokenizer file could not be loaded
	// (log.Fatalln already exits with a non-zero status)
	if dat == nil {
		log.Fatalln("Unable to load tokenizer file")
	}

	// Create token writer based on the options defined
	tw := datok.NewTokenWriterFromOptions(
		os.Stdout,
		cli.Tokenize.Positions,
		cli.Tokenize.Tokens,
		cli.Tokenize.NewlineAfterEOT,
	)

	// Only transduce if the program is reading from a pipe or a
	// redirected file, i.e. if stdin is not a character device
	fileInfo, _ := os.Stdin.Stat()
	if fileInfo.Mode()&os.ModeCharDevice == 0 {
		dat.TransduceTokenWriter(os.Stdin, tw)
		tw.Flush()
	}
}