Akron | 8e1d69b | 2021-08-12 17:38:49 +0200 | [diff] [blame] | 1 | package main |
| 2 | |
| 3 | import ( |
Akron | 7e269d4 | 2021-08-12 23:18:05 +0200 | [diff] [blame] | 4 | "fmt" |
Akron | 15bb13d | 2021-10-30 11:57:41 +0200 | [diff] [blame] | 5 | "io" |
Akron | 8e1d69b | 2021-08-12 17:38:49 +0200 | [diff] [blame] | 6 | "os" |
| 7 | |
Akron | 527c10c | 2021-08-13 01:45:18 +0200 | [diff] [blame] | 8 | "log" |
| 9 | |
Akron | 7f1097f | 2021-09-21 16:00:29 +0200 | [diff] [blame] | 10 | datok "github.com/KorAP/datok" |
Akron | 8e1d69b | 2021-08-12 17:38:49 +0200 | [diff] [blame] | 11 | "github.com/alecthomas/kong" |
| 12 | ) |
| 13 | |
Akron | 54ed7e7 | 2022-01-04 12:05:00 +0100 | [diff] [blame] | 14 | // TODO: |
| 15 | // - Support version information based on |
| 16 | // https://blog.carlmjohnson.net/post/2021/golang-118-minor-features/ |
| 17 | |
// convertOptions holds the parameters of the "convert" command, which
// turns a compiled foma FST file into a tokenizer file.
type convertOptions struct {
	Foma        string `kong:"required,short='i',help='The Foma FST file'"`
	Tokenizer   string `kong:"required,short='o',help='The Tokenizer file'"`
	DoubleArray bool   `kong:"optional,short='d',help='Convert to Double Array instead of Matrix representation'"`
}

// tokenizeOptions holds the parameters of the "tokenize" command: the
// tokenizer file to use, the input to read, and the switches selecting
// what is printed.
type tokenizeOptions struct {
	Tokenizer         string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
	Input             string `kong:"required,arg='',type='existingfile',help='Input file to tokenize (use - for STDIN)'"`
	Tokens            bool   `kong:"optional,negatable,default=true,help='Print token surfaces (defaults to ${default})'"`
	Sentences         bool   `kong:"optional,negatable,default=true,help='Print sentence boundaries (defaults to ${default})'"`
	TokenPositions    bool   `kong:"optional,default=false,short='p',help='Print token offsets (defaults to ${default})'"`
	SentencePositions bool   `kong:"optional,default=false,help='Print sentence offsets (defaults to ${default})'"`
	NewlineAfterEOT   bool   `kong:"optional,default=false,help='Ignore newline after EOT (defaults to ${default})'"`
}

// cli is the command line definition that kong fills in while parsing;
// each field is one subcommand.
var cli struct {
	Convert  convertOptions  `kong:"cmd, help='Convert a compiled foma FST file to a Matrix or Double Array tokenizer'"`
	Tokenize tokenizeOptions `kong:"cmd, help='Tokenize a text'"`
}
| 34 | |
| 35 | // Main method for command line handling |
| 36 | func main() { |
| 37 | |
| 38 | // Parse command line parameters |
| 39 | parser := kong.Must( |
| 40 | &cli, |
| 41 | kong.Name("datok"), |
Akron | 941f215 | 2021-09-26 15:14:25 +0200 | [diff] [blame] | 42 | kong.Description("FSA based tokenizer"), |
Akron | 8e1d69b | 2021-08-12 17:38:49 +0200 | [diff] [blame] | 43 | kong.UsageOnError(), |
| 44 | ) |
| 45 | |
Akron | 7e269d4 | 2021-08-12 23:18:05 +0200 | [diff] [blame] | 46 | ctx, err := parser.Parse(os.Args[1:]) |
Akron | 8e1d69b | 2021-08-12 17:38:49 +0200 | [diff] [blame] | 47 | |
| 48 | parser.FatalIfErrorf(err) |
| 49 | |
Akron | 7e269d4 | 2021-08-12 23:18:05 +0200 | [diff] [blame] | 50 | if ctx.Command() == "convert" { |
| 51 | tok := datok.LoadFomaFile(cli.Convert.Foma) |
| 52 | if tok == nil { |
Akron | 527c10c | 2021-08-13 01:45:18 +0200 | [diff] [blame] | 53 | log.Fatalln("Unable to load foma file") |
Akron | 7e269d4 | 2021-08-12 23:18:05 +0200 | [diff] [blame] | 54 | } |
Akron | 941f215 | 2021-09-26 15:14:25 +0200 | [diff] [blame] | 55 | if cli.Convert.DoubleArray { |
| 56 | dat := tok.ToDoubleArray() |
Akron | 6a4ce18 | 2022-04-18 21:15:06 +0200 | [diff] [blame] | 57 | fmt.Println("Load factor", dat.LoadFactor()) |
Akron | 941f215 | 2021-09-26 15:14:25 +0200 | [diff] [blame] | 58 | _, err := dat.Save(cli.Convert.Tokenizer) |
| 59 | if err != nil { |
| 60 | log.Fatalln(err) |
| 61 | } |
| 62 | } else { |
| 63 | mat := tok.ToMatrix() |
| 64 | _, err := mat.Save(cli.Convert.Tokenizer) |
| 65 | if err != nil { |
| 66 | log.Fatalln(err) |
| 67 | } |
Akron | 7e269d4 | 2021-08-12 23:18:05 +0200 | [diff] [blame] | 68 | } |
| 69 | fmt.Println("File successfully converted.") |
| 70 | os.Exit(0) |
| 71 | } |
| 72 | |
Akron | 941f215 | 2021-09-26 15:14:25 +0200 | [diff] [blame] | 73 | // Load the Datok or Matrix file |
| 74 | dat := datok.LoadTokenizerFile(cli.Tokenize.Tokenizer) |
Akron | 8e1d69b | 2021-08-12 17:38:49 +0200 | [diff] [blame] | 75 | |
| 76 | // Unable to load the datok file |
| 77 | if dat == nil { |
Akron | 941f215 | 2021-09-26 15:14:25 +0200 | [diff] [blame] | 78 | log.Fatalln("Unable to load file") |
Akron | 8e1d69b | 2021-08-12 17:38:49 +0200 | [diff] [blame] | 79 | os.Exit(1) |
| 80 | } |
| 81 | |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 82 | // Create flags parameter based on command line parameters |
| 83 | var flags datok.Bits |
| 84 | if cli.Tokenize.Tokens { |
| 85 | flags |= datok.TOKENS |
| 86 | } |
| 87 | |
| 88 | if cli.Tokenize.TokenPositions { |
| 89 | flags |= datok.TOKEN_POS |
| 90 | } |
| 91 | |
| 92 | if cli.Tokenize.Sentences { |
| 93 | flags |= datok.SENTENCES |
| 94 | } |
| 95 | |
| 96 | if cli.Tokenize.SentencePositions { |
| 97 | flags |= datok.SENTENCE_POS |
| 98 | } |
| 99 | |
| 100 | if cli.Tokenize.NewlineAfterEOT { |
| 101 | flags |= datok.NEWLINE_AFTER_EOT |
| 102 | } |
| 103 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 104 | // Create token writer based on the options defined |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 105 | tw := datok.NewTokenWriter(os.Stdout, flags) |
Akron | 274600e | 2021-11-03 20:09:06 +0100 | [diff] [blame] | 106 | defer os.Stdout.Close() |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 107 | |
Akron | 15bb13d | 2021-10-30 11:57:41 +0200 | [diff] [blame] | 108 | var r io.Reader |
| 109 | |
Akron | 8e1d69b | 2021-08-12 17:38:49 +0200 | [diff] [blame] | 110 | // Program is running in a pipe |
Akron | 15bb13d | 2021-10-30 11:57:41 +0200 | [diff] [blame] | 111 | if cli.Tokenize.Input == "-" { |
| 112 | fileInfo, _ := os.Stdin.Stat() |
| 113 | if fileInfo.Mode()&os.ModeCharDevice == 0 { |
| 114 | r = os.Stdin |
Akron | 274600e | 2021-11-03 20:09:06 +0100 | [diff] [blame] | 115 | defer os.Stdin.Close() |
Akron | 15bb13d | 2021-10-30 11:57:41 +0200 | [diff] [blame] | 116 | } else { |
| 117 | log.Fatalln("Unable to read from STDIN") |
| 118 | os.Exit(1) |
| 119 | return |
| 120 | } |
| 121 | } else { |
| 122 | f, err := os.Open(cli.Tokenize.Input) |
| 123 | if err != nil { |
| 124 | log.Fatalln(err) |
| 125 | os.Exit(1) |
| 126 | return |
| 127 | } |
| 128 | defer f.Close() |
| 129 | r = f |
Akron | 8e1d69b | 2021-08-12 17:38:49 +0200 | [diff] [blame] | 130 | } |
Akron | 15bb13d | 2021-10-30 11:57:41 +0200 | [diff] [blame] | 131 | |
| 132 | dat.TransduceTokenWriter(r, tw) |
| 133 | tw.Flush() |
Akron | 8e1d69b | 2021-08-12 17:38:49 +0200 | [diff] [blame] | 134 | } |