collocatordb: use Evert's (2004) ll-function instead of Dunning's (1993)
diff --git a/collocatordb.cc b/collocatordb.cc
index 9afe027..cb3806e 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -135,6 +135,20 @@
       return log2((double)f12 * f12 /  ((double) total * window_size * window_size * f1 * f2)) + log2((double) f12 / window_size / total);
   }
 
+  // Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714. 
+  // Free PDF available from http://purl.org/stefan.evert/PUB/Evert2004phd.pdf
+  static inline double ca_ll(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) { // log-likelihood score: 2 * sum over the four table cells of o * log(o / e); the call site in this patch passes (_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, 1)
+    double
+      r1 = (double) w1 * window_size, // first row total: w1 scaled by window_size
+      r2 = (double) n - r1, // second row total: what remains of n
+      c1 = w2, // first column total
+      c2 = n - c1, // second column total: what remains of n
+      o11 = w12,          o12 = r1 - o11, // observed cells, first row
+      o21 = c1 - w12,     o22 = r2 - o21, // observed cells, second row
+      e11 = r1 * c1 / n,  e12 = r1 * c2 / n, // expected cells, first row: row total * column total / n (NOTE(review): n is not checked for zero here)
+      e21 = r2 * c1 / n,  e22 = r2 * c2 / n; // expected cells, second row
+    return (2 * ( (o11>0? o11 * log(o11/e11):0) + (o12>0? o12 * log(o12/e12):0) + (o21>0? o21 * log(o21/e21):0) + (o22>0? o22 * log(o22/e22):0))); // a cell whose observed count is not positive contributes 0, so log() is only evaluated for positive observed counts
+  }
 
   class CountMergeOperator : public AssociativeMergeOperator {
   public:
@@ -495,48 +509,6 @@
     std::cout << "ready dumping\n";
   }
 
-  double calculateLLR(uint64_t f_X_, uint64_t uintN, uint64_t f_X_Y_, uint64_t f_Y_) {
-    double f_e_, f_o_;
-    double A=0.0, B=0.0, C=0.0, D=0.0, N=0.0;
-    double LLR=0.0, statVal=0.0, minusDiffCoeff=0.0;
-    double BlogB=0.0, ClogC=0.0;
-
-    N = (double)uintN;
-    A = (double)f_X_Y_;
-    B = (double)f_X_   -A;
-    C = (double)f_Y_   -A;
-    D = (double)N      -A-B-C;;
-
-    if (B > 0.) BlogB = B*log(B);
-    if (C > 0.) ClogC = C*log(C);
-
-    if ((A>0.) && (D>0.) && (N>0.))	{
-      f_e_ = (double)f_X_  /(double)N;
-      f_o_ = (double)f_X_Y_/(double)f_Y_;
-
-      minusDiffCoeff =
-        (	f_X_==0 ?   (double)((-1)*f_X_Y_) :
-          (	f_X_Y_==0 ? (double)((+1)*f_X_) :
-            (f_e_-f_o_)/(f_e_+f_o_)
-            )
-          );
-
-      /* log likelihood ratio */
-      LLR     =  2*( A*log(A)
-                     +BlogB
-                     +ClogC
-                     +D*log(D)
-                     -(A+B)*log(A+B)
-                     -(A+C)*log(A+C)
-                     -(B+D)*log(B+D)
-                     -(C+D)*log(C+D)
-                     +N*log(N)
-                     );
-    }
-    return(minusDiffCoeff > 0 ? 0 : (statVal=LLR));
-  }
-
-	
 	bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
 	bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
 	bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { return lhs.llr > rhs.llr; }
@@ -619,7 +591,7 @@
       if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
       if (w2 != last_w2) {
         if(maxv >= min_cooccur) {
-          double llr = calculateLLR(_vocab[w1].freq, total, maxv, _vocab[last_w2].freq);
+          double llr = ca_ll(_vocab[w1].freq, _vocab[last_w2].freq,  maxv, total, 1);
           if(first)
             first = false;
           else