| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 1 | package KorAP::XML::TEI::Tokenizer::Conservative; |
| Akron | 7501ca0 | 2020-08-01 21:05:25 +0200 | [diff] [blame] | 2 | use base 'KorAP::XML::TEI::Annotations'; |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 3 | use strict; |
| 4 | use warnings; |
| 5 | |
| 6 | # This tokenizer was originally written by cschnober. |
| 7 | |
| 8 | # Tokenize string "conservatively" and return an array |
| 9 | # with character boundaries. |
| 10 | sub tokenize { |
| Akron | 190d022 | 2020-07-25 22:44:33 +0200 | [diff] [blame] | 11 | my ($self, $txt) = @_; |
| Peter Harders | 994aff7 | 2020-07-25 09:53:35 +0200 | [diff] [blame] | 12 | |
| Akron | 190d022 | 2020-07-25 22:44:33 +0200 | [diff] [blame] | 13 | # Replace MBCs with single bytes |
| 14 | $txt =~ s/\p{Punct}/./g; |
| 15 | $txt =~ s/\s/~/g; |
| 16 | $txt =~ s/[^\.\~]/_/g; |
| 17 | utf8::downgrade($txt); |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 18 | |
| 19 | # Iterate over the whole string |
| Akron | 190d022 | 2020-07-25 22:44:33 +0200 | [diff] [blame] | 20 | while ($txt =~ /(\.*) |
| 21 | (_+(?:\.+_+)*)? |
| 22 | (\.*) |
| 23 | \~?/gx) { |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 24 | |
| 25 | # Punctuation preceding a token |
| Peter Harders | b122717 | 2020-07-21 02:12:10 +0200 | [diff] [blame] | 26 | $self->_add_surroundings($txt, $-[1], $+[1], 1) if $1; |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 27 | |
| 28 | # Token sequence |
| Peter Harders | b122717 | 2020-07-21 02:12:10 +0200 | [diff] [blame] | 29 | push @$self, ($-[2], $+[2]) if $2; # from and to |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 30 | |
| 31 | # Punctuation following a token |
| Peter Harders | b122717 | 2020-07-21 02:12:10 +0200 | [diff] [blame] | 32 | $self->_add_surroundings($txt, $-[3], $+[3]) if $3; |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 33 | }; |
| 34 | |
| Akron | edee6e5 | 2020-07-27 14:15:11 +0200 | [diff] [blame] | 35 | return $self; |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 36 | }; |
| 37 | |
| 38 | |
| Peter Harders | 854a115 | 2020-07-22 22:48:02 +0200 | [diff] [blame] | 39 | # Check if surrounding characters justify tokenization of Punctuation |
| 40 | # (in that case $pr is set) |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 41 | sub _add_surroundings { |
| Peter Harders | b122717 | 2020-07-21 02:12:10 +0200 | [diff] [blame] | 42 | my ($self, $txt, $p1, $p2, $preceding) = @_; |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 43 | |
| Peter Harders | 854a115 | 2020-07-22 22:48:02 +0200 | [diff] [blame] | 44 | my $pr; # "print" (tokenize) punctuation character (if one of the below tests justified it) |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 45 | |
| Peter Harders | 854a115 | 2020-07-22 22:48:02 +0200 | [diff] [blame] | 46 | if ($p2 == $p1+1) { # single punctuation character |
| Akron | 190d022 | 2020-07-25 22:44:33 +0200 | [diff] [blame] | 47 | my $char; |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 48 | |
| 49 | # Variant for preceding characters |
| 50 | if ($preceding) { |
| Peter Harders | 854a115 | 2020-07-22 22:48:02 +0200 | [diff] [blame] | 51 | |
| 52 | $pr = 1; # the first punctuation character should always be tokenized |
| Peter Harders | 854a115 | 2020-07-22 22:48:02 +0200 | [diff] [blame] | 53 | |
| 54 | # Punctuation character doesn't start at first position |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 55 | if ($p1 != 0) { |
| Akron | 190d022 | 2020-07-25 22:44:33 +0200 | [diff] [blame] | 56 | |
| Peter Harders | 854a115 | 2020-07-22 22:48:02 +0200 | [diff] [blame] | 57 | # Check char before punctuation char |
| Akron | 190d022 | 2020-07-25 22:44:33 +0200 | [diff] [blame] | 58 | $char = substr( $txt, $p1-1, 1 ); |
| 59 | $pr = ($char eq '.' || $char eq '~') ? 1 : 0; |
| Peter Harders | 854a115 | 2020-07-22 22:48:02 +0200 | [diff] [blame] | 60 | } |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 61 | } |
| 62 | |
| 63 | else { |
| Peter Harders | 854a115 | 2020-07-22 22:48:02 +0200 | [diff] [blame] | 64 | # Check char after punctuation char |
| Akron | 190d022 | 2020-07-25 22:44:33 +0200 | [diff] [blame] | 65 | $char = substr( $txt, $p2, 1 ); |
| 66 | |
| 67 | # The last punctuation character should always be tokenized |
| 68 | $pr = (!$char || $char eq '.' || $char eq '~') ? 1 : 0; |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 69 | |
| Peter Harders | 854a115 | 2020-07-22 22:48:02 +0200 | [diff] [blame] | 70 | # Check char before punctuation char |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 71 | unless ($pr) { |
| Akron | 190d022 | 2020-07-25 22:44:33 +0200 | [diff] [blame] | 72 | $char = substr ( $txt, $p1-1, 1); |
| 73 | $pr = ($char eq '.' || $char eq '~' ) ? 1 : 0; |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 74 | }; |
| 75 | }; |
| 76 | |
| Peter Harders | 854a115 | 2020-07-22 22:48:02 +0200 | [diff] [blame] | 77 | # tokenize punctuation char (because it was justified) |
| Peter Harders | b122717 | 2020-07-21 02:12:10 +0200 | [diff] [blame] | 78 | push @$self, ($p1, $p2) if $pr; # from and to |
| Peter Harders | 854a115 | 2020-07-22 22:48:02 +0200 | [diff] [blame] | 79 | |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 80 | return; |
| 81 | }; |
| 82 | |
| 83 | # Iterate over all single punctuation symbols |
| 84 | for (my $i = $p1; $i < $p2; $i++ ){ |
| Peter Harders | b122717 | 2020-07-21 02:12:10 +0200 | [diff] [blame] | 85 | push @$self, $i, $i+1; # from and to |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 86 | }; |
| 87 | }; |
| 88 | |
| 89 | |
| Akron | 91705d7 | 2021-02-19 10:59:45 +0100 | [diff] [blame] | 90 | # Name of the tokenizer file |
| 91 | sub name { |
| 92 | 'tokens_conservative'; |
| 93 | }; |
| 94 | |
| 95 | |
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 96 | 1; |