Turn Def parser into a package

Change-Id: I2afb9d79f3f4a9c8bb9afce728c162af0507bf31
diff --git a/lib/KorAP/Def.pm b/lib/KorAP/Def.pm
new file mode 100644
index 0000000..5a4591d
--- /dev/null
+++ b/lib/KorAP/Def.pm
@@ -0,0 +1,310 @@
+package KorAP::Def;
+use KorAP::VirtualCorpus::Group;
+use strict;
+use warnings;
+
+sub new {
+  my $class = shift;
+  my $file = shift;
+
+  my $self = {};
+
+  if (ref $file && ref $file eq 'GLOB') {
+    $self->{file} = '';
+    $self->{fh} = $file;
+  }
+
+  elsif ($file =~ /\.def\.bz2$/) {
+    $self->{file} = $file;
+    # $self->{fh} = $file;
+    # TODO
+  }
+
+  elsif (-f $file) {
+    $self->{file} = $file;
+ 
+    # Open def file
+    if (!open($self->{fh}, '<' . $file)) {
+      warn $ARGV[0] . " can't be opened";
+      exit(0);
+    };
+  }; # or guess
+
+  return bless $self, $class;
+};
+
+sub parse {
+  my $self = shift;
+
+  # Initial VC group
+  my $vc;
+
+  # Create an intensional and an extensional VC
+  my $vc_ext = KorAP::VirtualCorpus::Group->new;
+  my $vc_int = KorAP::VirtualCorpus::Group->new;
+
+  # Load ext initially
+  $$vc = $vc_ext;
+
+  # Collect all virtual corpora
+  my %all_vcs;
+
+  my $frozen = 0;
+
+  # Iterate over the whole list
+  while (!eof($self->{fh})) {
+    my $line = readline($self->{fh});
+    chomp $line;
+
+    # Skip empty lines
+    if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
+      # empty
+      next;
+    };
+
+    my ($key, $value, $desc);
+
+    # Line-Type: <e>c</a>
+    if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
+      $key = $1;
+      $value = $2 // undef;
+    }
+
+    # Line-Type: <e>c
+    elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
+      $key = $1;
+      $value = $2;
+    }
+
+    # Get text sigles
+    elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
+      $key = 'text';
+      $value = $line;
+    }
+
+    # Get doc sigles
+    elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
+      $key = 'doc';
+      $value = $1;
+    }
+
+    # Get corpus sigles
+    elsif ($line !~ m!(?:\/|\s)!) {
+      $key = 'corpus';
+      $value = $line;
+    }
+
+    # Not known
+    else {
+      warn _shorten($line) . q! isn't a valid VC definition!;
+      next;
+    };
+
+    # Add text field
+    if ($key eq 'text') {
+
+      # Convert C2 sigle to KorAP form
+      $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
+      ${$vc}->union_field(textSigle => $value);
+    }
+
+    # Add doc field
+    elsif ($key eq 'doc') {
+      ${$vc}->union_field(docSigle => $value);
+    }
+
+    # Add corpus field
+    elsif ($key eq 'corpus') {
+      ${$vc}->union_field(corpusSigle => $value);
+    }
+
+    # Add corpus field
+    elsif ($key eq 'cn') {
+      # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997'
+      if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
+        ${$vc}->union_field(corpusSigle => $1);
+      };
+    }
+
+    # Mark the vc as frozen
+    # This means that an extended VC area is expected
+    elsif ($key eq 'frozen') {
+      $frozen = 1;
+    }
+
+    # Start/End intended VC area
+    elsif ($key eq 'intended') {
+      if ($value eq 'start') {
+        $$vc = $vc_int;
+      }
+      elsif ($value ne 'end') {
+        warn 'Unknown intension value ' . $value;
+      };
+    }
+
+    # Start/End extended VC area
+    elsif ($key eq 'extended') {
+      if ($value eq 'start') {
+        $$vc = $vc_ext;
+      }
+      elsif ($value ne 'end') {
+        warn 'Unknown extension value ' . $value;
+      };
+    }
+
+    # Set VC name
+    elsif ($key eq 'name') {
+      # "Name des virt. Korpus, der angezeigt wird.
+      # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>,
+      # <add>, <sub>"
+
+      # No global name defined yet
+      if ($$vc && !$$vc->name) {
+        $vc_ext->name($value);
+        $vc_int->name($value);
+        next;
+      };
+
+      ${$vc} = KorAP::VirtualCorpus::Group->new;
+      ${$vc}->name($value);
+    }
+
+    # End VC def
+    elsif ($key eq 'end') {
+      $all_vcs{${$vc}->name} = $$vc;
+      # $vc = undef;
+    }
+
+    # Add VC definition
+    elsif ($key eq 'add') {
+      unless (defined $all_vcs{$value}) {
+        #       warn 'VC ' . $value . ' not defined';
+        # exit(1);
+        next;
+      };
+
+      $$vc->union($all_vcs{$value}->clone->to_koral);
+    }
+
+    # AND definition
+    elsif ($key eq 'and') {
+      unless (defined $all_vcs{$value}) {
+        #       warn 'VC ' . $value . ' not defined';
+        # exit(1);
+        next;
+      };
+
+      $$vc->joint($all_vcs{$value}->clone->to_koral);
+    }
+
+    # Source of the corpus
+    elsif ($key eq 'ql') {
+      # Quellenname, z.B. "Neue Zürcher Zeitung"
+      $$vc->union_field(corpusTitle => $value);
+    }
+
+    elsif ($key eq 'sub') {
+      # "Sub" is the difference - it is the "and not" operation.
+      warn $key . ' is not yet supported';
+    }
+
+    elsif ($key eq 'co') {
+      # Country,	z.B. DE für Text in Deutschland erschienen
+      warn $key . ' is not yet supported';
+    }
+
+    elsif ($key eq 'tl') {
+      # Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern]
+      warn $key . ' is not yet supported';
+    }
+
+    elsif ($key eq 'ts') {
+      # Textsorte, 	z.B. "Bericht"
+      warn $key . ' is not yet supported';
+    }
+
+    elsif ($key eq 'th') {
+      # Thema, z.B. "Sport - Fußball"
+      warn $key . ' is not yet supported';
+    }
+
+    elsif ($key eq 'red') {
+      # Reduktionsfaktor
+      # Wert zw. 1-99%: virt. Korpus wird auf diesen Wert
+      # reduziert. Modus: feste Reduzierung, nicht variabel.
+      warn $key . ' is not yet supported';
+    }
+
+    elsif ($key eq 'thprob') {
+      # ThemaProbability
+      # Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem
+      # Zuverläßigkeitswert ein Thema übernommen wird
+    }
+
+    # Add reduction value as a comment
+    elsif ($key eq 'redabs') {
+      # "red. Anz. Texte
+      # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte"
+      $$vc->comment('redabs:' . $value);
+      warn $key . ' is not yet supported';
+    }
+
+    # Add reduction value as a comment
+    elsif ($key eq 'date') {
+      # Supports two pattern schemes:
+      # m1=Year1/Month1 bis Year2/Month2
+      #   Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12"
+
+      # Schema 1
+      if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) {
+        my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4);
+        if ($m1 < 10) {
+          $m1 = '0' . (0+$m1);
+        };
+        if ($m2 < 10) {
+          $m2 = '0' . (0+$m2);
+        };
+        $$vc->from($y1, $m1);
+        $$vc->to($y2, $m2);
+      }
+
+      # Scheme 2
+      elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) {
+        # m2=Year1-Year2 und Month1-Month2
+        #   Datumsbereich Schema 2: z.B. "1990-2000 und 06-06"
+
+        warn 'Second date scheme not yet supported!'
+      }
+
+      else {
+        warn 'Unknown date scheme ' . $value;
+      };
+    }
+
+    # Unknown
+    else {
+      warn $key . ' is an unknown field';
+    };
+  };
+  $self->{vc} = $vc;
+
+  close($self->{fh});
+};
+
+sub to_string {
+  return ${shift->{vc}}->to_string;
+};
+
+# Shorten long strings for logging
+sub _shorten ($) {
+  my $line = shift;
+  if (length($line) < 20) {
+    return $line;
+  }
+  else {
+    return substr($line,0,17) . '...';
+  };
+};
+
+
+1;
diff --git a/script/cosmasvc2koralquery b/script/cosmasvc2koralquery
index 85986a9..c0b3ed2 100755
--- a/script/cosmasvc2koralquery
+++ b/script/cosmasvc2koralquery
@@ -1,15 +1,17 @@
 #!/usr/bin/env perl
 use strict;
 use warnings;
+use KorAP::Def;
 use lib 'lib';
-use KorAP::VirtualCorpus::Group;
 
 # 2020-05-20
 #   Preliminary support for C2 def-files.
 # 2020-05-29
 #   Introduce optimizable object system.
+# 2024-07-17
+#   Add KorAP::Def.
 
-our $VERSION = 0.1;
+our $VERSION = 0.2;
 
 our @ARGV;
 
@@ -25,281 +27,15 @@
 exit 0;
 };
 
-
-# Shorten long strings for logging
-sub _shorten ($) {
-  my $line = shift;
-  if (length($line) < 20) {
-    return $line;
-  }
-  else {
-    return substr($line,0,17) . '...';
-  };
-};
-
-
-my $fh;
+my $def_parser;
 if ($ARGV[0] eq '-') {
-  $fh = *STDIN;
-} elsif (!open($fh, '<' . $ARGV[0])) {
-  warn $ARGV[0] . " can't be opened";
-  exit(0);
+  $def_parser = KorAP::Def->new(\*STDIN);
+}
+else {
+  $def_parser = KorAP::Def->new($ARGV[0]);
 };
 
-# Initial VC group
-my $vc;
-
-# Create an intensional and an extensional VC
-my $vc_ext = KorAP::VirtualCorpus::Group->new;
-my $vc_int = KorAP::VirtualCorpus::Group->new;
-
-# Load ext initially
-$$vc = $vc_ext;
-
-# Collect all virtual corpora
-my %all_vcs;
-
-my $frozen = 0;
-
-# Iterate over the whole list
-while (!eof $fh) {
-  my $line = readline($fh);
-  chomp $line;
-
-
-  # Skip empty lines
-  if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
-    # empty
-    next;
-  };
-
-  my ($key, $value, $desc);
-
-  # Line-Type: <e>c</a>
-  if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
-    $key = $1;
-    $value = $2 // undef;
-  }
-
-  # Line-Type: <e>c
-  elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
-    $key = $1;
-    $value = $2;
-  }
-
-  # Get text sigles
-  elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
-    $key = 'text';
-    $value = $line;
-  }
-
-  # Get doc sigles
-  elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
-    $key = 'doc';
-    $value = $1;
-  }
-
-  # Get corpus sigles
-  elsif ($line !~ m!(?:\/|\s)!) {
-    $key = 'corpus';
-    $value = $line;
-  }
-
-  # Not known
-  else {
-    warn _shorten($line) . q! isn't a valid VC definition!;
-    next;
-  };
-
-  # Add text field
-  if ($key eq 'text') {
-
-    # Convert C2 sigle to KorAP form
-    $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
-    ${$vc}->union_field(textSigle => $value);
-  }
-
-  # Add doc field
-  elsif ($key eq 'doc') {
-    ${$vc}->union_field(docSigle => $value);
-  }
-
-  # Add corpus field
-  elsif ($key eq 'corpus') {
-    ${$vc}->union_field(corpusSigle => $value);
-  }
-
-  # Add corpus field
-  elsif ($key eq 'cn') {
-    # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997'
-    if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
-      ${$vc}->union_field(corpusSigle => $1);
-    };
-  }
-
-  # Mark the vc as frozen
-  # This means that an extended VC area is expected
-  elsif ($key eq 'frozen') {
-    $frozen = 1;
-  }
-
-  # Start/End intended VC area
-  elsif ($key eq 'intended') {
-    if ($value eq 'start') {
-      $$vc = $vc_int;
-    }
-    elsif ($value ne 'end') {
-      warn 'Unknown intension value ' . $value;
-    };
-  }
-
-  # Start/End extended VC area
-  elsif ($key eq 'extended') {
-    if ($value eq 'start') {
-      $$vc = $vc_ext;
-    }
-    elsif ($value ne 'end') {
-      warn 'Unknown extension value ' . $value;
-    };
-  }
-
-  # Set VC name
-  elsif ($key eq 'name') {
-    # "Name des virt. Korpus, der angezeigt wird.
-    # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>,
-    # <add>, <sub>"
-
-    # No global name defined yet
-    if ($$vc && !$$vc->name) {
-      $vc_ext->name($value);
-      $vc_int->name($value);
-      next;
-    };
-
-    ${$vc} = KorAP::VirtualCorpus::Group->new;
-    ${$vc}->name($value);
-  }
-
-  # End VC def
-  elsif ($key eq 'end') {
-    $all_vcs{${$vc}->name} = $$vc;
-    # $vc = undef;
-  }
-
-  # Add VC definition
-  elsif ($key eq 'add') {
-    unless (defined $all_vcs{$value}) {
-      #       warn 'VC ' . $value . ' not defined';
-      # exit(1);
-      next;
-    };
-
-    $$vc->union($all_vcs{$value}->clone->to_koral);
-  }
-
-  # AND definition
-  elsif ($key eq 'and') {
-    unless (defined $all_vcs{$value}) {
-      #       warn 'VC ' . $value . ' not defined';
-      # exit(1);
-      next;
-    };
-
-    $$vc->joint($all_vcs{$value}->clone->to_koral);
-  }
-
-  # Source of the corpus
-  elsif ($key eq 'ql') {
-    # Quellenname, z.B. "Neue Zürcher Zeitung"
-    $$vc->union_field(corpusTitle => $value);
-  }
-
-  elsif ($key eq 'sub') {
-    # "Sub" is the difference - it is the "and not" operation.
-    warn $key . ' is not yet supported';
-  }
-
-  elsif ($key eq 'co') {
-    # Country,	z.B. DE für Text in Deutschland erschienen
-    warn $key . ' is not yet supported';
-  }
-
-  elsif ($key eq 'tl') {
-    # Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern]
-    warn $key . ' is not yet supported';
-  }
-
-  elsif ($key eq 'ts') {
-    # Textsorte, 	z.B. "Bericht"
-    warn $key . ' is not yet supported';
-  }
-
-  elsif ($key eq 'th') {
-    # Thema, z.B. "Sport - Fußball"
-    warn $key . ' is not yet supported';
-  }
-
-  elsif ($key eq 'red') {
-    # Reduktionsfaktor
-    # Wert zw. 1-99%: virt. Korpus wird auf diesen Wert
-    # reduziert. Modus: feste Reduzierung, nicht variabel.
-    warn $key . ' is not yet supported';
-  }
-
-  elsif ($key eq 'thprob') {
-    # ThemaProbability
-    # Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem
-    # Zuverläßigkeitswert ein Thema übernommen wird
-  }
-
-
-  # Add reduction value as a comment
-  elsif ($key eq 'redabs') {
-    # "red. Anz. Texte
-    # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte"
-    $$vc->comment('redabs:' . $value);
-    warn $key . ' is not yet supported';
-  }
-
-  # Add reduction value as a comment
-  elsif ($key eq 'date') {
-    # Supports two pattern schemes:
-    # m1=Year1/Month1 bis Year2/Month2
-    #   Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12"
-
-    # Schema 1
-    if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) {
-      my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4);
-      if ($m1 < 10) {
-        $m1 = '0' . (0+$m1);
-      };
-      if ($m2 < 10) {
-        $m2 = '0' . (0+$m2);
-      };
-      $$vc->from($y1, $m1);
-      $$vc->to($y2, $m2);
-    }
-
-    # Scheme 2
-    elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) {
-      # m2=Year1-Year2 und Month1-Month2
-      #   Datumsbereich Schema 2: z.B. "1990-2000 und 06-06"
-
-      warn 'Second date scheme not yet supported!'
-    }
-
-    else {
-      warn 'Unknown date scheme ' . $value;
-    };
-  }
-
-  # Unknown
-  else {
-    warn $key . ' is an unknown field';
-  };
-};
-
-close($fh);
+$def_parser->parse;
 
 # Stringify current (extended?) virtual corpus
-print $$vc->to_string;
+print $def_parser->to_string;