blob: f08f430ca654a0402f6c27995ddbe2d87e10ed6e [file] [log] [blame]
Akrond9627472020-07-09 16:53:09 +02001package KorAP::XML::TEI::Tokenizer::Conservative;
Akron7501ca02020-08-01 21:05:25 +02002use base 'KorAP::XML::TEI::Annotations';
Akrond9627472020-07-09 16:53:09 +02003use strict;
4use warnings;
5
6# This tokenizer was originally written by cschnober.
7
# Tokenize a string "conservatively" and record the character
# boundaries of all tokens.
#
# Parameters:
#   $self - array reference (the tokenizer object); the start and end
#           offset of every token are pushed onto it as two
#           consecutive elements.
#   $txt  - the text to tokenize. Only a private copy is modified.
#
# Returns $self.
sub tokenize {
  my ($self, $txt) = @_;

  # Nothing to tokenize
  return $self unless defined $txt;

  # Replace every character by one of three single-byte placeholders:
  #   '.' for punctuation, '~' for whitespace, '_' for anything else.
  # The whitespace placeholder is introduced last: "~" is a symbol
  # and therefore not matched by \p{Punct}, so a literal tilde in
  # the text would otherwise be mistaken for whitespace instead of
  # being treated like any other symbol character.
  $txt =~ s/\p{Punct}/./g;
  $txt =~ s/[^.\s]/_/g;
  $txt =~ s/\s/~/g;

  # Only ASCII placeholders are left, so this cannot fail
  utf8::downgrade($txt);

  # Iterate over the whole string. Each match consists of
  #   (1) punctuation preceding a token,
  #   (2) the token, which may contain punctuation internally,
  #   (3) punctuation following a token,
  # and one optional whitespace character.
  while ($txt =~ /(\.*)
                  (_+(?:\.+_+)*)?
                  (\.*)
                  \~?/gx) {

    # Punctuation preceding a token
    if (length $1) {
      $self->_add_surroundings($txt, $-[1], $+[1], 1);
    };

    # Token sequence (from and to)
    if (defined $2) {
      push @$self, $-[2], $+[2];
    };

    # Punctuation following a token
    if (length $3) {
      $self->_add_surroundings($txt, $-[3], $+[3]);
    };
  };

  return $self;
};
37
38
# Add the tokens for a run of punctuation characters.
#
# A run of several punctuation characters is split into one token
# per character. A single punctuation character is only tokenized
# if its surroundings justify it: a neighbouring punctuation ('.')
# or whitespace ('~') placeholder, or the start or end of the text.
#
# Parameters:
#   $self      - array reference; start and end offset of every
#                token are pushed onto it.
#   $txt       - the text; the checks below only look for the
#                placeholders '.' and '~'.
#   $p1        - start offset of the punctuation run.
#   $p2        - end offset (exclusive) of the punctuation run.
#   $preceding - true if the run precedes a token, false if it
#                follows one.
#
# Returns nothing.
sub _add_surroundings {
  my ($self, $txt, $p1, $p2, $preceding) = @_;

  # Not a single punctuation character:
  # every character of the run is a token of its own (from and to)
  if ($p2 != $p1 + 1) {
    push @$self, $_, $_ + 1 for $p1 .. $p2 - 1;
    return;
  };

  # Check if the character at the given position justifies the
  # tokenization. Positions outside of the text count as boundaries.
  # They are tested explicitly, because a negative offset would make
  # substr() count from the end of the text and an empty-string
  # truth test would confuse a literal "0" with the end of the text.
  my $justifies = sub {
    my ($pos) = @_;
    return 1 if $pos < 0 || $pos >= length $txt;
    my $char = substr($txt, $pos, 1);
    return ($char eq '.' || $char eq '~') ? 1 : 0;
  };

  # "print" (tokenize) the punctuation character?
  my $pr;

  # Variant for preceding characters:
  # only the character before the punctuation character is relevant
  if ($preceding) {
    $pr = $justifies->($p1 - 1);
  }

  # Variant for following characters:
  # check the character after the punctuation character first
  # and the character before it otherwise
  else {
    $pr = $justifies->($p2) || $justifies->($p1 - 1);
  };

  # Tokenize punctuation char (because it was justified)
  push @$self, $p1, $p2 if $pr; # from and to

  return;
};
88
89
# Name of the tokenizer file
sub name {
  return 'tokens_conservative';
};
94
95
Akrond9627472020-07-09 16:53:09 +0200961;