Author Topic: Bugs in vector.Sn()?  (Read 5147 times)

Offline mschuster

  • PTeam Member
  • PixInsight Jedi
  • *****
  • Posts: 1087
Bugs in vector.Sn()?
« on: 2014 December 15 20:08:45 »
Hi Juan,

Win 7, 1123:

I am seeing bad values returned by vector.Sn() for small vectors (i.e. lengths up to 64).

The script has both brute force and efficient PJSR implementations of unnormalized Sn for comparison purposes.

The ratio of the length 2 and 4 tests equals a "partial" Sn normalization factor. The additional factor of 1.1926 is missing per the Croux/Rousseeuw time-efficient algorithms paper.

But for the other cases I can't explain the ratio.

On many 64 length cases, I am getting from vector.Sn() what appear to be poor results.

Mike

Code: [Select]
array = [0.533087, -1.02232]
vector.Sn(): 1.1556674009999999
SnSlow(array): 1.5554069999999998
vector.Sn() / SnSlow(array): 0.743
SnFast(array): 1.5554069999999998

array = [0.533087, -1.02232, -0.855217]
vector.Sn(): 2.569750704
SnSlow(array): 0.1671029999999999
vector.Sn() / SnSlow(array): 15.378243981257079
SnFast(array): 0.1671029999999999

array = [0.533087, -1.02232, -0.855217, -0.364599]
vector.Sn(): 0.6274658339999999
SnSlow(array): 0.6577209999999999
vector.Sn() / SnSlow(array): 0.954
SnFast(array): 0.6577209999999999

array = [...]
vector.Sn(): 0.7659920000000001
SnSlow(array): 0.760739
vector.Sn() / SnSlow(array): 1.0069051277770695
SnFast(array): 0.760739

Code: [Select]
function sortCompare(a, b) {
   return a < b ? -1 : a > b ? 1 : 0;
}

// brute force O(n^2)
function SnSlow(a) {
   var n = a.length;
   var a1 = new Array(n);
   var a2 = new Array(n);
   for (var i = 0; i != n; ++i) {
      for (var j = 0; j != n; ++j) {
         a1[j] = Math.abs(a[i] - a[j]);
      }
      a1.sort(sortCompare);
      a2[i] = a1[Math.floor(n / 2)]; // high median
   }
   a2.sort(sortCompare);
   return a2[Math.floor((n + 1) / 2) - 1]; // low median
}

// efficient O(n log n)
function SnFast(x1) { // ?
   var x = x1.slice(0, x1.length);
   x.sort(sortCompare);

   var n = x.length;
   if (n == 0) {
      return 0;
   }

   var y = new Array(n);

   y[0] = x[Math.floor(n / 2)] - x[0];

   for (var i = 2; i <= Math.floor((n + 1) / 2); ++i) {
      var nA = i - 1;
      var nB = n - i;
      var diff = nB - nA;
      var leftA = 1;
      var leftB = 1;
      var rightA = nB;
      var rightB = nB;
      var Amin = Math.floor(diff / 2) + 1;
      var Amax = Math.floor(diff / 2) + nA;

      while (true) {
         if (leftA < rightA) {
            var length = rightA - leftA + 1;
            var even = 1 - Math.mod(length, 2);
            var half = Math.floor((length - 1) / 2);
            var tryA = leftA + half;
            var tryB = leftB + half;

            if (tryA < Amin) {
               rightB = tryB;
               leftA = tryA + even;
            }
            else {
               if (tryA > Amax) {
                  rightA = tryA;
                  leftB = tryB + even;
               }
               else {
                  var medA = x[i - 1] - x[i - tryA + Amin - 1 - 1];
                  var medB = x[tryB + i - 1] - x[i - 1];
                  if (medA >= medB) {
                     rightA = tryA;
                     leftB = tryB + even;
                  }
                  else {
                     rightB = tryB;
                     leftA = tryA + even;
                  }
               }
            }
         }
         else {
            if (leftA > Amax) {
               y[i - 1] = x[leftB + i - 1] - x[i - 1];
            }
            else {
               var medA = x[i - 1] - x[i - leftA + Amin - 1 - 1];
               var medB = x[leftB + i - 1] - x[i - 1];
               y[i - 1] = Math.min(medA, medB);
            }
            break;
         }
      }
   }

   for (var i = Math.floor((n + 1) / 2) + 1; i <= n - 1; ++i) {
      var nA = n - i;
      var nB = i - 1;
      var diff = nB - nA;
      var leftA = 1;
      var leftB = 1;
      var rightA = nB;
      var rightB = nB;
      var Amin = Math.floor(diff / 2) + 1;
      var Amax = Math.floor(diff / 2) + nA;

      while (true) {
         if (leftA < rightA) {
            var length = rightA - leftA + 1;
            var even = 1 - Math.mod(length, 2);
            var half = Math.floor((length - 1) / 2);
            var tryA = leftA + half;
            var tryB = leftB + half;

            if (tryA < Amin) {
               rightB = tryB;
               leftA = tryA + even;
            }
            else {
               if (tryA > Amax) {
                  rightA = tryA;
                  leftB = tryB + even;
               }
               else {
                  var medA = x[i + tryA - Amin + 1 - 1] - x[i - 1];
                  var medB = x[i - 1] - x[i - tryB - 1];
                  if (medA >= medB) {
                     rightA = tryA;
                     leftB = tryB + even;
                  }
                  else {
                     rightB = tryB;
                     leftA = tryA + even;
                  }
               }
            }
         }
         else {
            if (leftA > Amax) {
               y[i - 1] = x[i - 1] - x[i - leftB - 1];
            }
            else {
               var medA = x[i + leftA - Amin + 1 - 1] - x[i - 1];
               var medB = x[i - 1] - x[i - leftB - 1];
               y[i - 1] = Math.min(medA, medB);
            }
            break;
         }
      }
   }

   y[n - 1] = x[n - 1] - x[Math.floor((n + 1) / 2) - 1];

   y.sort(sortCompare);

   return y[Math.floor((n + 1) / 2) - 1];
}

console.writeln();
var array = [0.533087, -1.02232]; //, -0.855217]; // , -0.364599];
console.write("array = [");
for (var i = 0; i != array.length -1; ++i) {
   console.write(array[i], ", ");
}
console.writeln(array[array.length - 1], "]");
var vector = new Vector(array.length);
vector.assign(array, 0, array.length);
console.writeln("vector.Sn(): ", vector.Sn());
console.writeln("SnSlow(array): ", SnSlow(array));
console.writeln("vector.Sn() / SnSlow(array): ", vector.Sn() / SnSlow(array));
console.writeln("SnFast(array): ", SnFast(array));


console.writeln();
var array = [0.533087, -1.02232, -0.855217]; // , -0.364599];
console.write("array = [");
for (var i = 0; i != array.length -1; ++i) {
   console.write(array[i], ", ");
}
console.writeln(array[array.length - 1], "]");
var vector = new Vector(array.length);
vector.assign(array, 0, array.length);
console.writeln("vector.Sn(): ", vector.Sn());
console.writeln("SnSlow(array): ", SnSlow(array));
console.writeln("vector.Sn() / SnSlow(array): ", vector.Sn() / SnSlow(array));
console.writeln("SnFast(array): ", SnFast(array));


console.writeln();
var array = [0.533087, -1.02232, -0.855217, -0.364599];
console.write("array = [");
for (var i = 0; i != array.length -1; ++i) {
   console.write(array[i], ", ");
}
console.writeln(array[array.length - 1], "]");
var vector = new Vector(array.length);
vector.assign(array, 0, array.length);
console.writeln("vector.Sn(): ", vector.Sn());
console.writeln("SnSlow(array): ", SnSlow(array));
console.writeln("vector.Sn() / SnSlow(array): ", vector.Sn() / SnSlow(array));
console.writeln("SnFast(array): ", SnFast(array));

console.writeln();
var array = [0.533087, -1.02232, -0.855217, -0.364599, 1.46429, 0.756261,
2.17472, -0.0421965, 0.627769, -0.655922, 0.256595, -0.838173,
0.642002, -1.42016, 0.460509, 0.160264, 0.132564, -0.722706,
0.347617, 0.698298, -1.64382, 0.254763, 0.771643, -2.38453,
-0.810664, -1.07687, 0.191627, -0.676371, -0.818345, 1.37031,
0.140143, -1.12645, 0.314962, 0.84163, 0.0156415, -0.584815, 0.36632,
-1.75322, 0.417046, 0.445999, 2.18433, 1.19637, -0.108828, 0.219621,
0.651911, -0.999894, -0.993649, -1.28664, -0.455387, -0.234295,
-0.12633, -0.238317, -1.08056, -0.393563, 0.108105, -0.169317,
-0.12739, 0.171254, 0.605376, 0.619183, 0.782824, 1.39034, 0.911639,
-1.25819];
console.writeln("array = [...]");
var vector = new Vector(array.length);
vector.assign(array, 0, array.length);
console.writeln("vector.Sn(): ", vector.Sn());
console.writeln("SnSlow(array): ", SnSlow(array));
console.writeln("vector.Sn() / SnSlow(array): ", vector.Sn() / SnSlow(array));
console.writeln("SnFast(array): ", SnFast(array));
« Last Edit: 2014 December 15 21:34:48 by mschuster »

Offline Juan Conejero

  • PTeam Member
  • PixInsight Jedi Grand Master
  • ********
  • Posts: 7111
    • http://pixinsight.com/
Re: Bugs in vector.Sn()?
« Reply #1 on: 2014 December 16 01:54:18 »
Hi Mike,

Quote
I am seeing bad values returned by vector.Sn() for small vectors (i.e. lengths up to 64).

Actually, those results should be better :) My implementation in PCL (which PJSR uses) applies finite sample corrections for the Sn and Qn estimators. See lines 2716 and 2931 in pcl/Math.h:

https://github.com/PixInsight/PCL/blob/master/include/pcl/Math.h

These corrections are negligible for large vectors, but can be very significant for relatively small vector lengths. See the implementations in C. Croux and P. J. Rousseeuw (1992) Time-Efficient Algorithms for Two Highly Robust Estimators of Scale, Computational Statistics, Vol. 1

Also take into account that there is a very small error (approx. 0.001) in the constant 2.2219 typically used to normalize the result of Qn. See the comments in the documentation for the R implementation:

http://www.inside-r.org/packages/cran/robustbase/docs/Qn

Finally, there is also an error in the author's implementation of Qn; see line 2857 of our implementation in Math.h.

Quote
The additional factor of 1.1926 is missing per the Croux/Rousseeuw time-efficient algorithms paper.

This is by design. All scale estimators are computed unnormalized in PCL/PJSR, so you have to multiply each estimator by the appropriate constant. The reason for this is that one might want to normalize a scale estimator with respect to any distribution, not just a normal distribution.
Juan Conejero
PixInsight Development Team
http://pixinsight.com/

Offline Juan Conejero

  • PTeam Member
  • PixInsight Jedi Grand Master
  • ********
  • Posts: 7111
    • http://pixinsight.com/
Re: Bugs in vector.Sn()?
« Reply #2 on: 2014 December 16 02:06:08 »
Quote
array = [0.533087, -1.02232, -0.855217]
vector.Sn(): 2.569750704
SnSlow(array): 0.1671029999999999
vector.Sn() / SnSlow(array): 15.378243981257079
SnFast(array): 0.1671029999999999

Anyway, this evidently looks like a bug, since the value 2.56... is wrong. I am investigating this problem. Thank you for detecting it!
Juan Conejero
PixInsight Development Team
http://pixinsight.com/

Offline Juan Conejero

  • PTeam Member
  • PixInsight Jedi Grand Master
  • ********
  • Posts: 7111
    • http://pixinsight.com/
Re: Bugs in vector.Sn()?
« Reply #3 on: 2014 December 16 03:10:32 »
Ok, I have found the bug. It is in line 2733 of pcl/Math.h:

https://github.com/PixInsight/PCL/blob/master/include/pcl/Math.h

The current code is:

double sn = cn * double( *pcl::Select( a2, a2+n, nh ) );

and should be:

double sn = cn * double( *pcl::Select( a2, a2+n, nh-1 ) );

This error has gone unnoticed because it is irrelevant for reasonably sized vectors and images. However, it can be critical for small vector lengths.

This bug affects all implementations of the Sn estimator in the core application, which includes PJSR and internal view property calculation routines, which are called by the Statistics tool. To fix it, we have to release a new version of the whole platform. However, since I think this bug is of little practical importance, we'll wait until January, when we'll release a new version with an implementation of (hopefully) the final XISF specification and many more features.

In the meanwhile, here is a JavaScript routine that you can use to replace Vector.Sn() with a new Vector.Sn1():

Code: [Select]
/*!
 * Returns the Sn scale estimator of Rousseeuw and Croux:
 *
 * Sn = c * low_median( high_median( |x_i - x_j| ) )
 *
 * where low_median() is the order statistic of rank (n + 1)/2, and
 * high_median() is the order statistic of rank n/2 + 1.
 *
 * For vectors with less than two components, this function returns zero.
 *
 * The constant c = 1.1926 must be used to make the Sn estimator converge to
 * the standard deviation of a pure normal distribution. However, this
 * implementation does not apply it (it uses c=1 implicitly), for
 * consistency with other implementations of scale estimators.
 *
 * This implementation includes finite sample corrections, which can be
 * significant for relatively small vector lengths.
 *
 * \b References
 *
 * P.J. Rousseeuw and C. Croux (1993), <em>Alternatives to the Median Absolute
 * Deviation,</em> Journal of the American Statistical Association, Vol. 88,
 * pp. 1273-1283.
 *
 * C. Croux and P.J. Rousseeuw (1992), <em>Time-Efficient Algorithms for Two
 * Highly Robust Estimators of Scale, Computational Statistics, Vol. 1.
 */
Vector.prototype.Sn1 = function()
{
   var n = this.length;
   if ( n < 2 )
      return 0;

   var x = this.toArray();
   x.sort( function( a, b ) { return a - b; } );

   var a2 = new Array( n );
   a2[0] = x[n >> 1] - x[0];

   var nh = (n + 1) >> 1;

   for ( var i = 2; i <= nh; ++i )
   {
      var nA = i - 1;
      var nB = n - i;
      var diff = nB - nA;
      var leftA = 1;
      var leftB = 1;
      var rightA = nB;
      var Amin = (diff >> 1) + 1;
      var Amax = (diff >> 1) + nA;

      while ( leftA < rightA )
      {
         var length = rightA - leftA + 1;
         var even = (length & 1) == 0;
         var half = (length - 1) >> 1;
         var tryA = leftA + half;
         var tryB = leftB + half;

         if ( tryA < Amin )
            leftA = tryA + even;
         else
         {
            if ( tryA > Amax )
            {
               rightA = tryA;
               leftB = tryB + even;
            }
            else
            {
               var medA = x[i-1] - x[i-2 - tryA + Amin];
               var medB = x[tryB + i-1] - x[i-1];
               if ( medA >= medB )
               {
                  rightA = tryA;
                  leftB = tryB + even;
               }
               else
                  leftA = tryA + even;
            }
         }
      }

      if ( leftA > Amax )
         a2[i-1] = x[leftB + i-1] - x[i-1];
      else
      {
         var medA = x[i-1] - x[i-2 - leftA + Amin];
         var medB = x[leftB + i-1] - x[i-1];
         a2[i-1] = Math.min( medA, medB );
      }
   }

   for ( var i = nh + 1; i < n; ++i )
   {
      var nA = n - i;
      var nB = i - 1;
      var diff = nB - nA;
      var leftA = 1;
      var leftB = 1;
      var rightA = nB;
      var Amin = (diff >> 1) + 1;
      var Amax = (diff >> 1) + nA;

      while ( leftA < rightA )
      {
         var length = rightA - leftA + 1;
         var even = (length & 1) == 0;
         var half = (length - 1) >> 1;
         var tryA = leftA + half;
         var tryB = leftB + half;

         if ( tryA < Amin)
            leftA = tryA + even;
         else
         {
            if ( tryA > Amax )
            {
               rightA = tryA;
               leftB = tryB + even;
            }
            else
            {
               var medA = x[i + tryA - Amin] - x[i-1];
               var medB = x[i-1] - x[i-1 - tryB];
               if ( medA >= medB )
               {
                  rightA = tryA;
                  leftB = tryB + even;
               }
               else
                  leftA = tryA + even;
            }
         }
      }

      if ( leftA > Amax )
         a2[i-1] = x[i-1] - x[i-1 - leftB];
      else
      {
         var medA = x[i + leftA - Amin] - x[i-1];
         var medB = x[i-1] - x[i-1 - leftB];
         a2[i-1] = Math.min( medA, medB );
      }
   }

   a2[n-1] = x[n-1] - x[nh-1];

   /*
    * Correction for a finite sample
    */
   var cn;
   switch ( n )
   {
   case  2: cn = 0.743; break;
   case  3: cn = 1.851; break;
   case  4: cn = 0.954; break;
   case  5: cn = 1.351; break;
   case  6: cn = 0.993; break;
   case  7: cn = 1.198; break;
   case  8: cn = 1.005; break;
   case  9: cn = 1.131; break;
   default: cn = (n & 1) ? n/(n - 0.9) : 1.0; break;
   }

   return cn * Math.select( nh-1, a2 );
};

It includes finite sample corrections and provides correct results for all vector lengths.

Again, thank you for finding this error.
Juan Conejero
PixInsight Development Team
http://pixinsight.com/

Offline mschuster

  • PTeam Member
  • PixInsight Jedi
  • *****
  • Posts: 1087
Re: Bugs in vector.Sn()?
« Reply #4 on: 2014 December 16 09:35:46 »
Thanks Juan,

I had assumed "finite sample correction" was part of "normalization", and that unnormalized Sn did not do it. Your convention is fine however.

The "-1" bug fix noticeably improves length 64 results.

Note: changing your Sn1 prototype to a simple function with an array argument gives significantly better performance when an array is already available (i.e. use .slice rather than .toArray to obtain a copy for sorting purposes).

Mike