Add w2v->compare_to parameter and use for count-based delta calcs
Change-Id: Ic53cbc5b3984659b3ebfa6c8ad30fd42850a9fc9
diff --git a/Changelog.md b/Changelog.md
index 506fcf4..750e924 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -1,5 +1,8 @@
# Changelog
+- added `w2v.compare_to` configuration option that specifies a derekovecs
+ instance to compare results against (currently only the count-based LogDice delta is supported)
+
## [0.93.2] - 2024-11-15
- fixed calculation of total token count by using collocatordb 1.3.2
diff --git a/example.conf b/example.conf
index 4991015..ddc1d2f 100644
--- a/example.conf
+++ b/example.conf
@@ -14,6 +14,7 @@
w2v => {
vecs => "example-models/wpd19_10000/wpd19_10000.vecs",
+ # compare_to => "https://corpora.ids-mannheim.de/openlab/derekovecs", # base URL of the derekovecs instance to compare results to ("/getClassicCollocators?w=..." is appended)
# korap_url => "https://korap.ids-mannheim.de"
},
diff --git a/lib/IDS/DeReKoVecs/Read.pm b/lib/IDS/DeReKoVecs/Read.pm
index fb11fe9..6e2acfc 100644
--- a/lib/IDS/DeReKoVecs/Read.pm
+++ b/lib/IDS/DeReKoVecs/Read.pm
@@ -43,15 +43,18 @@
}
sub getClassicCollocatorsCached {
- my ($c, $word) = @_;
+ my ($c, $word, $compare_to) = @_;
my $s2 = "";
if($word > $mergedEnd) {
$word-=$mergedEnd;
}
- if($opt_p >= 5000 && $opt_p < 5600) { # German non-reference
- open PIPE, "GET http://corpora.ids-mannheim.de/openlab/derekovecs/getClassicCollocators?w=$word |";
+ my $pipe;
+ if($compare_to ne "") {
+ $c->app->log->info("comparing syn neighbours to: $compare_to/getClassicCollocators?w=$word");
+ open $pipe, "lwp-request $compare_to/getClassicCollocators?w=$word |";
}
+
if($opt_C || !$cccache{$word}) {
$c->app->log->info("Getting classic collocates of $word.");
$cccache{$word} = getClassicCollocators($word);
@@ -60,11 +63,12 @@
} else {
$c->app->log->info("Getting classic collocates for $word from cache.");
}
- if($opt_p >= 5000 && $opt_p < 5600) { # German non-reference
- while(<PIPE>) {
+
+ if(defined($pipe)) {
+ while(<$pipe>) {
$s2 .= $_;
}
- close(PIPE);
+ close($pipe);
}
if(length($s2) > 2000) {
diff --git a/script/derekovecs-server b/script/derekovecs-server
index 372e472..3811858 100755
--- a/script/derekovecs-server
+++ b/script/derekovecs-server
@@ -25,6 +25,8 @@
}
my $DEFAULT_NET = app->config->{w2v}->{net} // $DEFAULT_NET_NAME;
my $DOWNTIME_CALENDAR_URL = app->config->{downtime_calendar_url} // '';
+my $COMPARE_TO = app->config->{w2v}->{compare_to} // '';
+
app->static->paths->[0] = getcwd;
plugin 'Piwik';
@@ -184,12 +186,14 @@
any '*/getClassicCollocators' => sub {
my $self = shift;
- $self->render(data => getClassicCollocatorsCached($self, getWord($self->param("w") ? $self->param("w") : $self->req->json)), format=>'json');
+ $self->render(data => getClassicCollocatorsCached($self, getWord($self->param("w") ? $self->param("w",) : $self->req->json),
+ $COMPARE_TO), format=>'json');
} => 'getClassicCollocators1';
any '/getClassicCollocators' => sub {
my $self = shift;
- $self->render(data => getClassicCollocatorsCached($self, getWord($self->param("w") ? $self->param("w") : $self->req->json)), format=>'json');
+ $self->render(data => getClassicCollocatorsCached($self, getWord($self->param("w") ? $self->param("w") : $self->req->json),
+ $COMPARE_TO), format=>'json');
} => 'getClassicCollocators';
any '/getBiggestVocabDistances' => sub {