"""Tokenize a German text file with spaCy's rule-based tokenizer.

Usage: python tokenize.py <path-to-text-file>

Prints one token per line to stdout. Uses the blank German pipeline,
so only the tokenizer runs — no tagging, parsing, or NER.
"""

import sys

from spacy.lang.de import German

# Blank German pipeline: fast to construct, no statistical models loaded.
nlp = German()

# The tokenizer that comes with the blank pipeline already carries the
# default German punctuation rules and tokenizer exceptions.
# (Original comment wrongly said "English" — the pipeline is German.)
tokenizer = nlp.tokenizer

# Fail with a usage message instead of a bare IndexError when no
# input path is given.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} <textfile>")

# Pin the encoding: the platform default is not UTF-8 everywhere,
# and German text contains non-ASCII characters (umlauts, eszett).
with open(sys.argv[1], 'r', encoding='utf-8') as f:
    contents = f.read()

# Tokenize the whole file at once; spaCy returns a Doc, which
# iterates over its tokens.
tokens = tokenizer(contents)

for t in tokens:
    print(t)