ini/trakem2/vector/Editions.java

   1 /**
   2
   3 TrakEM2 plugin for ImageJ(C).
   4 Copyright (C) 2007-2009 Albert Cardona.
   5
   6 This program is free software; you can redistribute it and/or
   7 modify it under the terms of the GNU General Public License
   8 as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.txt )
   9
  10 This program is distributed in the hope that it will be useful,
  11 but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 GNU General Public License for more details.
  14
  15 You should have received a copy of the GNU General Public License
  16 along with this program; if not, write to the Free Software
  17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  18
  19 You may contact Albert Cardona at acardona at ini.phys.ethz.ch
  20 Institute of Neuroinformatics, University of Zurich / ETH, Switzerland.
  21 **/
  22
  23 package ini.trakem2.vector;
  24
  25 import java.util.*;
  26 import javax.vecmath.Point3f;
  27 import java.util.Collections;
  28 import ini.trakem2.utils.Utils;
  29 import ini.trakem2.utils.IJError;
  30
  31 /** To extract and represent the sequence of editions that convert any N-dimensional vector string to any other of the same number of dimensions. */
  32 public class Editions {
  33
  34         static public final int DELETION = 1;
  35         static public final int INSERTION = 2;
  36         static public final int MUTATION = 3;
  37
  38         protected VectorString vs1;
  39         protected VectorString vs2;
  40         protected double delta;
  41         protected boolean closed;
  42
  43         /** In the form [length][3] */
  44         protected int[][] editions;
  45         /** Levenshtein's distance between vs1 and vs2.*/
  46         public double distance;
  47
  48         public Editions(final VectorString vs1, final VectorString vs2, final double delta, final boolean closed) {
  49                 this.vs1 = vs1;
  50                 this.vs2 = vs2;
  51                 this.delta = delta;
  52                 this.closed = closed;
  53                 init();
  54         }
  55
  56         public double getDistance() { return distance; }
  57         public int length() { return editions.length; }
  58         public int[][] getEditions() { return editions; }
  59         public VectorString getVS1() { return vs1; }
  60         public VectorString getVS2() { return vs2; }
  61
  62         /** A mutation is considered an equal or near equal, and thus does not count. Only deletions and insertions count towards scoring the similarity.
  63          *
  64          * @param skip_ends enables ignoring sequences in the beginning and ending if they are insertions or deletions.
  65          * @param max_mut indicates the maximum length of a contiguous sequence of mutations to be ignored when skipping insertions and deletions at beginning and end.
  66          * @param min_chunk indicates the minimal proportion of the string that should remain between the found start and end, for vs1. The function will return the regular similarity if the chunk is too small.
  67          *
  68          */
  69         public double getSimilarity(boolean skip_ends, final int max_mut, final float min_chunk) {
  70                 return getSimilarity(getStartEndSkip(skip_ends, max_mut, min_chunk));
  71         }
  72
  73         private double getSimilarity(final int[] g) {
  74                 int i_start = g[0];
  75                 int i_end = g[1];
  76                 boolean skip_ends = 1 == g[2];
  77
  78                 int non_mut = 0;
  79
  80                 if (skip_ends) {
  81                         // count non mutations
  82                         for (int i=i_start; i<=i_end; i++) {
  83                                 if (MUTATION != editions[i][0]) non_mut++;
  84                         }
  85
  86                         // compute proper segment lengths, inlined
  87                         final double sim = 1.0 - ( (double)non_mut / Math.max( editions[i_end][1] - editions[i_start][1] + 1, editions[i_end][2] - editions[i_start][2] + 1) );
  88
  89                         //if (sim > 0.7) Utils.log2("similarity: non_mut, len1, len2, i_start, i_end : " + non_mut + ", " + (editions[i_end][1] - editions[i_start][1] + 1) + ", " + (editions[i_end][2] - editions[i_start][2] + 1) + ", " + i_start + "," + i_end + "   " + Utils.cutNumber(sim * 100, 2) + " %");
  90
  91                         return sim;
  92                 } else {
  93                         for (int i=0; i<editions.length; i++) {
  94                                 if (MUTATION != editions[i][0]) non_mut++;
  95                         }
  96                         /*
  97                          * If the max_len is smaller than the number of non-mutations, then a NEGATIVE similarity value is returned,
  98                          * but it's ok. All it means is that it's not similar at all.
  99                         int max_len = Math.max(vs1.length(), vs2.length());
 100                         Utils.log2("non_mut: " + non_mut + "  total: " + editions.length + "  max length: " + max_len + (non_mut > max_len ? "  WARNING!" : ""));
 101                         */
 102                         return 1.0 - ( (double)non_mut / Math.max(vs1.length(), vs2.length()) );
 103                 }
 104         }
 105
 106         public double getSimilarity() {
 107                 return getSimilarity(false, 0, 1);
 108         }
 109
 110         public double getSimilarity2() {
 111                 return getSimilarity2(false, 0, 1);
 112         }
 113
 114         /** Returns the number of mutations / max(len(vs1), len(vs2)) : 1.0 means all are mutations and the sequences have the same lengths.*/
 115         public double getSimilarity2(boolean skip_ends, final int max_mut, final float min_chunk) {
 116
 117                 int[] g = getStartEndSkip(skip_ends, max_mut, min_chunk);
 118                 int i_start = g[0];
 119                 int i_end = g[1];
 120                 skip_ends = 1 == g[2];
 121
 122                 int mut = 0;
 123
 124                 if (skip_ends) {
 125                         // count non mutations
 126                         for (int i=i_start; i<i_end; i++) {
 127                                 if (MUTATION == editions[i][0]) mut++;
 128                         }
 129
 130                         // compute proper segment lengths, inlined
 131                         double sim = (double)mut / Math.max( editions[i_end][1] - editions[i_start][1] + 1, editions[i_end][2] - editions[i_start][2] + 1);
 132
 133                         //if (sim > 0.7) Utils.log2("similarity: mut, len1, len2, i_start, i_end : " + mut + ", " + (editions[i_end][1] - editions[i_start][1] + 1) + ", " + (editions[i_end][2] - editions[i_start][2] + 1) + ", " + i_start + "," + i_end + "   " + Utils.cutNumber(sim * 100, 2) + " %");
 134
 135                         return sim;
 136
 137                 } else {
 138                         for (int i=0; i<editions.length; i++) {
 139                                 if (MUTATION == editions[i][0]) mut++;
 140                         }
 141                         return (double)mut / Math.max(vs1.length(), vs2.length());
 142                 }
 143         }
 144
 145
 146         /** Returns starting and ending indices, both inclusive. */
 147         private final int[] getStartEndSkip(boolean skip_ends, final int max_mut, final float min_chunk) {
 148                 int i_start = 0;
 149                 int i_end = editions.length -1;
 150                 if (skip_ends) {
 151                         // 1 - find start:
 152                         int len_mut_seq = 0;
 153                         // break when the found sequence of continuous mutations is larger than max_mut
 154                         for (int i=0; i<editions.length; i++) {
 155                                 if (MUTATION == editions[i][0]) {
 156                                         len_mut_seq++;
 157                                 } else {
 158                                         // reset
 159                                         len_mut_seq = 0;
 160                                 }
 161                                 if (len_mut_seq > max_mut) {
 162                                         i_start = i - len_mut_seq +1;
 163                                         break;
 164                                 }
 165                         }
 166                         // find end
 167                         len_mut_seq = 0;
 168                         for (int i=i_end; i>-1; i--) {
 169                                 if (MUTATION == editions[i][0]) {
 170                                         len_mut_seq++;
 171                                 } else {
 172                                         // reset
 173                                         len_mut_seq = 0;
 174                                 }
 175                                 if (len_mut_seq > max_mut) {
 176                                         i_end = i;
 177                                         break;
 178                                 }
 179                         }
 180                         // determine if remaining chunk is larger than required min_chunk
 181                         skip_ends = ((float)(editions[i_end][1] - editions[i_start][1] + 1) / vs1.length()) >= min_chunk;
 182                 }
 183                 return new int[]{i_start, i_end, skip_ends ? 1 : 0};
 184         }
 185
 186         /** Returns the distance between all points involved in a mutation; if average is false, then it returns the cummulative. Returns Double.MAX_VALUE if no mutations are found. */
 187         public double getPhysicalDistance(boolean skip_ends, final int max_mut, final float min_chunk, boolean average) {
 188                 return getPhysicalDistance(getStartEndSkip(skip_ends, max_mut, min_chunk), average);
 189         }
 190
 191         private double getPhysicalDistance(final int[] g, final boolean average) {
 192                 int i_start = g[0];
 193                 int i_end = g[1];
 194                 boolean skip_ends = 1 == g[2];
 195
 196                 double dist = 0;
 197                 int len = 0;
 198                 int i = 0;
 199                 final int len1 = vs1.length();
 200                 final int len2 = vs2.length();
 201                 try {
 202                         for (i=i_start; i<=i_end; i++) {
 203                                 if (MUTATION != editions[i][0]) continue;
 204                                 int k1 = editions[i][1];
 205                                 int k2 = editions[i][2];
 206                                 if (len1 == k1 || len2 == k2) continue; // LAST point will fail in some occasions, needs fixing
 207                                 dist += vs1.distance(k1, vs2, k2);
 208                                 len++;
 209                         }
 210                         if (0 == len) return Double.MAX_VALUE;
 211                         if (average) return dist / len; // can len be zero ?
 212                         return dist;
 213                 } catch (Exception e) {
 214                         IJError.print(e);
 215                         Utils.log2("ERROR in getPhysicalDistance: i,len  j,len : " + editions[i][1] + ", " + vs1.length() + "    " + editions[i][2] + ", " + vs2.length());
 216                         return Double.MAX_VALUE;
 217                 }
 218         }
 219
 220         public double getStdDev(final boolean skip_ends, final int max_mut, final float min_chunk) {
 221                 return getStdDev(getStartEndSkip(skip_ends, max_mut, min_chunk));
 222         }
 223
 224         /** Returns the standard deviation of the distances between all points involved in a mutation. */
 225         private double getStdDev(final int[] g) {
 226                 int i_start = g[0];
 227                 int i_end = g[1];
 228                 boolean skip_ends = 1 == g[2];
 229
 230                 double dist = 0;
 231                 int i = 0;
 232                 final int len1 = vs1.length();
 233                 final int len2 = vs2.length();
 234                 final ArrayList<Double> mut = new ArrayList<Double>(); // why not ArrayList<double> ? STUPID JAVA
 235                 try {
 236                         for (i=i_start; i<=i_end; i++) {
 237                                 if (MUTATION != editions[i][0]) continue;
 238                                 int k1 = editions[i][1];
 239                                 int k2 = editions[i][2];
 240                                 if (len1 == k1 || len2 == k2) continue; // LAST point will fail in some occasions, needs fixing
 241                                 double d = vs1.distance(k1, vs2, k2);
 242                                 dist += d;
 243                                 mut.add(d);
 244                         }
 245                         if (0 == mut.size()) return Double.MAX_VALUE;
 246                         Double[] di = new Double[mut.size()];
 247                         mut.toArray(di);
 248                         double average = dist / di.length; // can length be zero ?
 249                         double std = 0;
 250                         for (int k=0; k<di.length; k++) {
 251                                 std += Math.pow(di[k].doubleValue() - average, 2);
 252                         }
 253                         return Math.sqrt(std / di.length);
 254
 255                 } catch (Exception e) {
 256                         IJError.print(e);
 257                         Utils.log2("ERROR in getPhysicalDistance: i,len  j,len : " + editions[i][1] + ", " + vs1.length() + "    " + editions[i][2] + ", " + vs2.length());
 258                         return Double.MAX_VALUE;
 259                 }
 260         }
 261
 262         /** Returns {average distance, cummulative distance, stdDev, median, prop_mut} which are:
 263          *
 264          * [0] - average distance: the average physical distance between mutation pairs
 265          * [1] - cummulative distance: the sum of the distances between mutation pairs
 266          * [2] - stdDev: of the physical distances between mutation pairs relative to the average
 267          * [3] - median: the average medial physical distance between mutation pairs, more robust than the average to extreme values
 268          * [4] - prop_mut: the proportion of mutation pairs relative to the length of the queried sequence vs1.
 269          * [5] - Levenshtein's distance
 270          * [6] - Similarity (@see getSimilarity)
 271          */
 272         public double[] getStatistics(final boolean skip_ends, final int max_mut, final float min_chunk, final boolean score_mut_only) {
 273                 return getStatistics(getStartEndSkip(skip_ends, max_mut, min_chunk), score_mut_only);
 274         }
 275
 276         private double[] getStatistics(final int[] g, final boolean score_mut_only) {
 277                 int i_start = g[0];
 278                 int i_end = g[1];
 279                 boolean skip_ends = 1 == g[2];
 280
 281                 int i = 0;
 282                 final int len1 = vs1.length();
 283                 final int len2 = vs2.length();
 284                 final ArrayList<Double> dist = new ArrayList<Double>(); // why not ArrayList<double> ? STUPID JAVA
 285                 final double[] pack = new double[9];
 286
 287                 Arrays.fill(pack, Double.MAX_VALUE);
 288                 pack[4] = 0;
 289
 290                 int n_mutation = 0;
 291
 292                 double c_dist = 0;     // cummulative distance of pairs
 293                 double c_dist_mut = 0; // cummulative distance of mutation pairs
 294
 295                 try {
 296                         for (i=i_start; i<=i_end; i++) {
 297                                 if (MUTATION == editions[i][0]) n_mutation++;
 298                                 else if (score_mut_only) continue; // not a mutation, so continue because we want only mutations.
 299                                 //if (score_mut_only && MUTATION != editions[i][0]) continue;
 300                                 final int k1 = editions[i][1];
 301                                 final int k2 = editions[i][2];
 302                                 if (len1 == k1 || len2 == k2) continue; // LAST point will fail in some occasions, needs fixing
 303                                 final double d = vs1.distance(k1, vs2, k2);
 304                                 c_dist += d;
 305                                 if (MUTATION == editions[i][0]) c_dist_mut += d; // will be the same as c_dist when 'score_mut_only' is true
 306                                 dist.add(d);
 307                         }
 308
 309                         // Need at least one value to make any sense of the data
 310                         if (0 == dist.size()) return pack;
 311
 312                         final Double[] di = new Double[dist.size()];
 313                         dist.toArray(di);
 314
 315                         final double average = c_dist / di.length;
 316
 317                         double std = 0;
 318                         for (int k=0; k<di.length; k++) {
 319                                 std += Math.pow(di[k].doubleValue() - average, 2);
 320                         }
 321                         std = Math.sqrt(std / di.length);
 322
 323                         Collections.sort(dist);
 324                         final double median = dist.get(di.length/2);
 325
 326                         final double prop_mut = ((double)n_mutation) / (i_end - i_start); // i_end is non-inclusive
 327
 328                         // the max physical length of the measured part of the sequences
 329                         final double phys_len = Math.max(editions[i_end][1] - editions[i_start][1] + 1,
 330                                                          editions[i_end][2] - editions[i_start][2] + 1)
 331                                                 * vs1.getDelta(); // delta is the same for both
 332
 333                         pack[0] = average;
 334                         pack[1] = c_dist;
 335                         pack[2] = std;
 336                         pack[3] = median;
 337                         pack[4] = prop_mut;
 338                         pack[5] = this.distance;
 339                         pack[6] = getSimilarity(g);
 340                         // proximity: Unitless value indicating proximity:
 341                         pack[7] = ((double)c_dist) / phys_len;
 342                         // proximity_mut: Unitless value indicating proximity between mutation pairs only:
 343                         pack[8] = score_mut_only ? pack[7] : ((double)c_dist_mut) / phys_len;
 344
 345                         // When one does the proximity with the length of the query sequence only and not the max of both, then shorter ref sequences will score better.
 346
 347                 } catch (Exception e) {
 348                         IJError.print(e);
 349                         Utils.log2("ERROR in getStatistics: i,len  j,len : " + editions[i][1] + ", " + vs1.length() + "    " + editions[i][2] + ", " + vs2.length());
 350                 }
 351
 352                 return pack;
 353         }
 354
 355         final private void init() {
 356                 final boolean with_source = (vs1 instanceof VectorString3D && vs2 instanceof VectorString3D) ?
 357                         null != ((VectorString3D)vs1).getSource() && null != ((VectorString3D)vs2).getSource()
 358                       : false;
 359                 //Utils.log2("Editions.init() : With source: " + with_source);
 360                 // equalize point interdistance in both strings of vectors and create the actual vectors
 361                 vs1.resample(delta, with_source);
 362                 vs2.resample(delta, with_source);
 363                 // fetch the optimal matrix
 364                 final double[][] matrix = findMinimumEditDistance();
 365
 366                 final int n = vs1.length();
 367                 final int m = vs2.length();
 368
 369                 this.distance = matrix[n][m];
 370
 371                 // let's see the matrix
 372                 /*
 373                 final StringBuffer sb = new StringBuffer();
 374                 for (int f=0; f<n+1; f++) {
 375                         for (int g=0; g<m+1; g++) {
 376                                 sb.append(matrix[f][g]).append('\t');
 377                         }
 378                         sb.append('\n');
 379                 }
 380                 Utils.saveToFile(new java.io.File("/home/albert/Desktop/matrix.txt"), sb.toString());
 381                 */
 382
 383
 384                 final int initial_length = (int)Math.sqrt((n*n) + (m*m));
 385                 int i = 0;
 386                 int ed_length = initial_length;
 387                 editions = new int[ed_length][3];
 388                 int next = 0;
 389                 i = n;
 390                 int j = m;
 391                 int k;
 392                 final double error = 0.0000001; // for floating-point math
 393                 while (0 != i && 0 != j) { // the matrix is n+1,m+1 in size
 394                         // check editions array
 395                         if (next == ed_length) {
 396                                 // enlarge editions array
 397                                 this.editions = resizeAndFillEditionsCopy(editions, ed_length, ed_length + 20);
 398                                 ed_length += 20;
 399                         }
 400                         // find next i, j and the type of transform:
 401                         if (error > Math.abs(matrix[i][j] - matrix[i-1][j] - delta)) {
 402                                 // a deletion:
 403                                 editions[next][0] = DELETION;
 404                                 editions[next][1] = i;
 405                                 editions[next][2] = j;
 406                                 i = i-1;
 407                         } else if (error > Math.abs(matrix[i][j] - matrix[i][j-1] - delta)) {
 408                                 // an insertion:
 409                                 editions[next][0] = INSERTION;
 410                                 editions[next][1] = i;
 411                                 editions[next][2] = j;
 412                                 j = j-1;
 413                         } else {
 414                                 // a mutation (a trace between two elements of the string of vectors)
 415                                 editions[next][0] = MUTATION;
 416                                 editions[next][1] = i;
 417                                 editions[next][2] = j;
 418                                 i = i-1;
 419                                 j = j-1;
 420                         }
 421                         //prepare next
 422                         next++;
 423                 }
 424                 // add unnoticed insertions/deletions. Happens when 'i' and 'j' don't reach the zero value at the same time.
 425                 if (0 != j) {
 426                         for (k=j; k>-1; k--) {
 427                                 if (next == ed_length) {
 428                                         //enlarge editions array
 429                                         editions = resizeAndFillEditionsCopy(editions, ed_length, ed_length + 20);
 430                                         ed_length += 20;
 431                                 }
 432                                 editions[next][0] = INSERTION; // insertion
 433                                 editions[next][1] = 0; // the 'i'
 434                                 editions[next][2] = k; // the 'j'
 435                                 next++;
 436                         }
 437                 }
 438                 if (0 != i) {
 439                         for (k=i; k>-1; k--) {
 440                                 if (next == ed_length) {
 441                                         //enlarge editions array
 442                                         editions = resizeAndFillEditionsCopy(editions, ed_length, ed_length + 20);
 443                                         ed_length += 20;
 444                                 }
 445                                 editions[next][0] = DELETION; // deletion
 446                                 editions[next][1] = k; // the 'i'
 447                                 editions[next][2] = 0; // the 'j'
 448                                 next++;
 449                         }
 450                 }
 451                 // reverse and resize editions array, and DO NOT slice out the last element.
 452                 if (next != ed_length) {
 453                         // allocate a new editions array  ('next' is the length now, since it is the index of the element after the last element)
 454                         int[][] editions2 = new int[next][3];
 455                         for (i=0, j=next-1; i<next; i++, j--) {
 456                                 editions2[i][0] = editions[j][0];
 457                                 editions2[i][1] = editions[j][1];
 458                                 editions2[i][2] = editions[j][2];
 459                         }
 460                         editions = editions2;
 461                 } else {
 462                         //simply reorder in place
 463                         int[] temp;
 464                         for (i=0; i<next; i++) {
 465                                 temp = editions[i];
 466                                 editions[i] = editions[next -1 -i];
 467                                 editions[next -1 -i] = temp;
 468                         }
 469                 }
 470         }
 471
 472         static private final int[][] resizeAndFillEditionsCopy(final int[][] editions, final int ed_length, final int new_length) {
 473                 // check
 474                 if (new_length <= ed_length) return editions;
 475                 // create a copy
 476                 final int[][] editions2 = new int[new_length][3];
 477                 // fill with previous values
 478                 for (int k=0; k<ed_length; k++) {
 479                         editions2[k][0] = editions[k][0];
 480                         editions2[k][1] = editions[k][1];
 481                         editions2[k][2] = editions[k][2];
 482                 }
 483                 return editions2;
 484         }
 485
 486         /** Convenient tuple to store the statting index and the matrix for that index.*/
 487         private class MinDist {
 488                 int min_j;
 489                 double min_dist;
 490                 double[][] matrix;
 491                 double[][] matrix_e;
 492         }
 493
 494         private double[][] findMinimumEditDistance() {
 495                 int j, i;
 496                 int min_j = 0;
 497                 final int n = vs1.length();
 498                 final int m = vs2.length();
 499                 // allocate the first matrix
 500                 final double[][] matrix1 = new double[n+1][m+1];
 501                 // make the first element be i*delta
 502                 for (i=0; i < n +1; i++) {
 503                         matrix1[i][0] = i * delta;
 504                 }
 505                 // fill first column
 506                 for (j=0; j < m + 1; j++) {
 507                         matrix1[0][j] = j * delta;
 508                 }
 509                 // return the matrix made matching point 0 of both curves, if the curve is open.
 510                 if (!closed) {
 511                         return findEditMatrix(0, matrix1);
 512                 }
 513
 514                 // else try every point in the second curve to see which one is the best possible match.
 515
 516                 // allocate the second matrix
 517                 final double[][] matrix2 = new double[n+1][m+1];
 518                 /// fill top row values for the second matrix
 519                 for (i=0; i < n +1; i++) {
 520                         matrix2[i][0] = i * delta;
 521                 }
 522                 // make the first j row be j * delta
 523                 for (j=0; j < m + 1; j++) {
 524                         matrix2[0][j] = j * delta;
 525                 }
 526
 527                 // the current, editable
 528                 double[][] matrix_e = matrix1;
 529
 530
 531                 // the one to keep
 532                 double[][] matrix = null;
 533
 534                 // The algorithm to find the starting point, based on vector string distance:
 535                 /*
 536                 // find the minimum distance
 537                 for (j=0; j<m; j++) {
 538                         // get the distance starting at 0 in p1 and at j in p2:
 539                         matrix_e = findEditMatrix(j, matrix_e);
 540
 541                         // last element of the matrix is the string distance between p1 and p2
 542                         if (matrix_e[n][m] < min_dist) {
 543                                 // record values
 544                                 min_dist = matrix_e[n][m];
 545                                 min_j = j;
 546                                 // record new one
 547                                 matrix = matrix_e;
 548                                 // swap editable matrix
 549                                 if (matrix1 == matrix_e) { // compare the addresses of the arrays.
 550                                         matrix_e = matrix2;
 551                                 } else {
 552                                         matrix_e = matrix1;
 553                                 }
 554                         }
 555                         // else nothing needs to be needed, the current editable matrix remains the same.
 556                 }
 557                 */
 558
 559                 // A 'divide and conquer' approach: much faster, based on the fact that the distances always make a valey when plotted
 560                 // Find the value of one every 10% of points. Then find those intervals with the lowest starting and ending values, and then look for 50% intervals inside those, and so on, until locking into the lowest value. It will save about 80% or more of all computations.
 561                 MinDist min_data = new MinDist();
 562                 min_data.min_j = -1;
 563                 min_data.min_dist = Double.MAX_VALUE;
 564                 min_data.matrix = matrix2;
 565                 min_data.matrix_e = matrix1;
 566
 567                 min_data = findMinDist(0, m-1, (int)Math.ceil(m * 0.1), min_data);
 568
 569                 min_j = min_data.min_j;
 570                 matrix = min_data.matrix;
 571                 matrix_e = min_data.matrix_e;
 572
 573                 // free the current editable matrix. The other one is returned below and will be free elsewhere after usage.
 574                 if (matrix != matrix_e) {
 575                         matrix_e = null;
 576                 } else {
 577                         System.out.println("\nERROR: matrix is matrix_e unexpectedly");
 578                 }
 579
 580
 581                 // Reorder the second array, so that min_j is index zero (i.e. simply making both curves start at points that are closest to each other in terms of curve similarity).
 582                 if (0 != min_j) {
 583                         vs2.reorder(min_j);
 584                 }
 585
 586                 // return the matrix
 587                 return matrix;
 588         }
 589
 590         /** Returns the same instance of MinDist given as a parameter (so it has to be non-null). */
 591         private MinDist findMinDist(int first, int last, int interval_length, final MinDist result) {
 592                 double[][] matrix = result.matrix;
 593                 double[][] matrix_e = result.matrix_e;
 594                 double[][] matrix1 = matrix_e;
 595                 double[][] matrix2 = matrix;
 596
 597                 // the iterator over p2
 598                 int j;
 599
 600                 // size of the interval to explore
 601                 int k = 0;
 602                 int length;
 603                 if (last < first) {
 604                         length = vs2.length() - first + last;
 605                 } else {
 606                         length = last - first + 1;
 607                 }
 608
 609                 // gather data
 610                 final int n = vs1.length();
 611                 final int m = vs2.length();
 612                 int min_j = result.min_j;
 613                 double min_dist = result.min_dist;
 614
 615                 while (k < length) {
 616                         j = first + k;
 617                         // correct circular array
 618                         if (j >= m) {
 619                                 j = j - m;
 620                         }
 621                         // don't do some twice: TODO this setup does not save the case when the computation was done not in the previous iteration but before.
 622                         if (j == result.min_j) {
 623                                 matrix_e = result.matrix_e;
 624                         } else {
 625                                 matrix_e = findEditMatrix(j, matrix_e);
 626                         }
 627                         if (matrix_e[n][m] < min_dist) {
 628                                 // record values
 629                                 min_j = j;
 630                                 matrix = matrix_e;
 631                                 min_dist = matrix[n][m];
 632                                 // swap editable matrix
 633                                 if (matrix_e == matrix1) { // compare the addresses of the arrays.
 634                                         matrix_e = matrix2;
 635                                 } else {
 636                                         matrix_e = matrix1;
 637                                 }
 638                         }
 639                         // advance iterator
 640                         if (length -1 != k && k + interval_length >= length) {
 641                                 // do the last one (and then finish)
 642                                 k = length -1;
 643                         } else {
 644                                 k += interval_length;
 645                         }
 646                 }
 647
 648                 // pack result:
 649                 result.min_j = min_j;
 650                 result.min_dist = min_dist;
 651                 result.matrix = matrix;
 652                 result.matrix_e = matrix_e;
 653
 654                 if (1 == interval_length) {
 655                         // done!
 656                         return result;
 657                 } else {
 658                         first = min_j - (interval_length -1);
 659                         last = min_j + (interval_length -1);
 660                         // correct first:
 661                         if (first < 0) {
 662                                 first = m + first; // a '+' so it is substracted from vs2.length
 663                         }
 664                         // correct last:
 665                         if (last >= m) {
 666                                 last -= m;
 667                         }
 668                         // recurse, with half the interval length
 669                         interval_length = (int)Math.ceil(interval_length / 2.0f);
 670                         return findMinDist(first, last, interval_length, result);
 671                 }
 672         }
 673
 674         /** Generate the matrix between vs1 and vs2. The lower right value of the matrix is the Levenshtein's distance between the two strings of vectors. @param delta is the desired point interdistance. @param first is the first index of vs2  to be matched with index zero of vs1. @param matrix is optional, for recyclying. */
 675         private double[][] findEditMatrix(final int first, double[][] matrix_) {
 676
 677                 final int n = vs1.length();
 678                 final int m = vs2.length();
 679
 680                 if (null == matrix_) matrix_ =  new double[n+1][m+1];
 681                 final double[][] matrix = matrix_;
 682
 683                 int i=0, j=0;
 684                 for (; i < n +1; i++) {
 685                         matrix[i][0] = i * delta;
 686                 }
 687                 for (; j < m +1; j++) {
 688                         matrix[0][j] = j * delta;
 689                 }
 690                 // as optimized in the findEditMatrix in CurveMorphing_just_C.c
 691                 double[] mati;
 692                 double[] mat1;
 693                 double fun1, fun2, fun3;
 694                 double dx, dy;
 695                 for (i=1; i < n +1; i++) {
 696                         mati = matrix[i];
 697                         mat1 = matrix[i-1];
 698                         for (j=1; j < m +1; j++) {
 699                                 // cost deletion:
 700                                 fun1 = mat1[j] + delta; // matrix[i-1][j] + delta
 701                                 // cost insertion:
 702                                 fun2 = mati[j-1] + delta; // matrix[i][j-1] + delta
 703                                 // cost mutation:
 704                                 if (i == n || j == m) {
 705                                         fun3 = mat1[j-1]; // matrix[i-1][j-1]
 706                                 } else {
 707                                         //dx = v_x1[i] - v_x2[j];
 708                                         //dy = v_y1[i] - v_y2[j];
 709                                         fun3 = mat1[j-1] + vs1.getDiffVectorLength(i, j, vs2); //  Math.sqrt(dx*dx + dy*dy); // the vector length is the hypothenusa.
 710                                 }
 711                                 // insert the lowest value in the matrix.
 712                                 // since most are mutations, start with fun3:
 713                                 if (fun3 <= fun1 && fun3 <= fun2) {
 714                                         mati[j] = fun3;//matrix[i][jj] = fun3;
 715                                 } else if (fun1 <= fun2 && fun1 <= fun3) {
 716                                         mati[j] = fun1;//matrix[i][jj] = fun1;
 717                                 } else {
 718                                         mati[j] = fun2;//matrix[i][jj] = fun2;
 719                                 }
 720                         }
 721                 }
 722
 723                 return matrix;
 724         }
 725
 726         /** Get the sequence of editions and matches in three lines, like:
 727          *  vs1: 1 2 3 4 5 6     7 8 9
 728          *       M M D M M M I I M M M
 729          *  vs2: 1 2   3 4   5 6 7 8 9
 730          *
 731          *  With the given separator (defaults to tab if null)
 732          */
 733         public String prettyPrint(String separator) {
 734                 if (null == editions) return null;
 735                 if (null == separator) separator = "\t";
 736                 final StringBuffer s1 = new StringBuffer(editions.length*2 + 5);
 737                 final StringBuffer se = new StringBuffer(editions.length*2 + 5);
 738                 final StringBuffer s2 = new StringBuffer(editions.length*2 + 5);
 739                 s1.append("vs1:");
 740                 se.append("    ");
 741                 s2.append("vs2:");
 742                 for (int i=0; i<editions.length; i++) {
 743                         switch (editions[i][0]) {
 744                                 case MUTATION:
 745                                         s1.append(separator).append(editions[i][1] + 1);
 746                                         se.append(separator).append('M');
 747                                         s2.append(separator).append(editions[i][2] + 1);
 748                                         break;
 749                                 case DELETION:
 750                                         s1.append(separator).append(editions[i][1] + 1);
 751                                         se.append(separator).append('D');
 752                                         s2.append(separator).append(' ');
 753                                         break;
 754                                 case INSERTION:
 755                                         s1.append(separator).append(' ');
 756                                         se.append(separator).append('I');
 757                                         s2.append(separator).append(editions[i][2] + 1);
 758                                         break;
 759                         }
 760                 }
 761                 s1.append('\n');
 762                 se.append('\n');
 763                 s2.append('\n');
 764                 return s1.append(se).append(s2).toString();
 765         }
 766
 767         static public class Chunk {
 768                 int i_start, i_end;
 769                 Chunk(int i_start, int i_end) {
 770                         this.i_start = i_start;
 771                         this.i_end = i_end;
 772                 }
 773                 int length() { return i_end - i_start + 1; }
 774         }
 775
 776         /*
 777         private class ChunkComparator implements Comparator {
 778                 public int compare(Object o1, Object o2) {
 779                         Chunck c1 = (Chunck)o1;
 780                         Chunck c2 = (Chunck)o2;
 781                         double val = c1.length() - c2.length();
 782                         if (val < 0) return -1; // c1 is shorter
 783                         if (val > 0) return 1; // c1 is longer
 784                         return 0; // equally long
 785                 }
 786         }
 787         */
 788
 789         public Chunk findLargestMutationChunk(final int max_non_mut) {
 790                 // find the longest chunk of contiguous mutations
 791                 // with no interruptions (insertions or deletions) longer than max_non_mut
 792                 final ArrayList<Chunk> chunks = new ArrayList<Chunk>();
 793
 794                 // search for chunks of consecutive mutations, with any number of gaps of maximum length max_non_mut
 795                 Chunk cur = new Chunk(0, 0);
 796                 boolean reached_mut = false;
 797                 int non_mut = 0;
 798
 799                 for (int i=0; i<editions.length; i++) {
 800                         if (MUTATION == editions[i][0]) {
 801                                 cur.i_end = i;
 802                                 if (!reached_mut) reached_mut = true;
 803                         } else {
 804                                 if (!reached_mut) cur.i_start = i; // move forward until finding the first mutation
 805                                 else non_mut++; // if it has reached some mut, count non-muts
 806                                 if (non_mut > max_non_mut || editions.length -1 == i) {
 807                                         // break current chain, try to add it if long enough
 808                                         cur.i_end = i;
 809                                         if (0 != chunks.size()) {
 810                                                 // else, check if its equally long or longer than any existent one
 811                                                 final Chunk[] c = new Chunk[chunks.size()];
 812                                                 chunks.toArray(c);
 813                                                 boolean add = false;
 814                                                 final int len = cur.length();
 815                                                 for (int k=c.length-1; k>-1; k--) {
 816                                                         int clen = c[k].length();
 817                                                         if (len > clen) {
 818                                                                 chunks.remove(c[k]); // remove all found that are shorter
 819                                                         }
 820                                                         if (!add && len >= clen) {
 821                                                                 add = true;
 822                                                         }
 823                                                 }
 824                                                 if (add) chunks.add(cur);
 825                                         } else {
 826                                                 // if none yet, just add it
 827                                                 chunks.add(cur);
 828                                         }
 829                                         // create new current
 830                                         cur = new Chunk(i, i);
 831                                         reached_mut =false;
 832                                         non_mut = 0;
 833                                 }
 834                         }
 835                 }
 836
 837                 if (0 == chunks.size()) {
 838                         Utils.log2("No chunks found.");
 839                         return null;
 840                 }
 841
 842                 // select a chunk
 843                 Chunk chunk = null;
 844                 if (1 == chunks.size()) chunk = chunks.get(0);
 845                 else {
 846                         // All added chunks have the same length (otherwise would have been deleted)
 847                         // Find the one with the smallest cummulative physical distance
 848                         double[] dist = new double[chunks.size()];
 849                         int next = 0;
 850                         Hashtable<Chunk,Double> ht = new Hashtable<Chunk,Double>();
 851                         for (Chunk c : chunks) {
 852                                 dist[next] = getPhysicalDistance(new int[]{c.i_start, c.i_end, max_non_mut}, false);
 853                                 ht.put(c, dist[next]);
 854                                 next++;
 855                         }
 856                         Arrays.sort(dist);
 857                         for (Iterator it = ht.entrySet().iterator(); it.hasNext(); ) {
 858                                 Map.Entry entry = (Map.Entry)it.next();
 859                                 if ( ((Double)entry.getValue()).doubleValue() == dist[0]) {
 860                                         chunk = (Chunk)entry.getKey();
 861                                         break;
 862                                 }
 863                         }
 864                 }
 865
 866                 return chunk;
 867         }
 868
 869         /** Find the longest chunk of mutations (which can include chunks of up to max_non_mut of non-mutations),
 870          *  then take the center point and split both vector strings there, perform matching towards the ends,
 871          *  and assemble a new Editions object.
 872          */
 873         public Editions recreateFromCenter(final int max_non_mut) throws Exception {
 874
 875                 final Chunk chunk = findLargestMutationChunk(max_non_mut);
 876
 877                 // from the midpoint, create two vector strings and reverse them, and compute editions for them
 878                 // (so: get new edition sequence from chunk's midpoint to zero)
 879
 880                 final int midpoint = (chunk.i_start + chunk.i_end) / 2;
 881                 if (0 == midpoint) return null;
 882
 883                 VectorString3D firsthalf1 = (VectorString3D)vs1.subVectorString(editions[midpoint][1], 0); // already reversed, by giving indices in reverse order
 884                 VectorString3D firsthalf2 = (VectorString3D)vs2.subVectorString(editions[midpoint][2], 0); // already reversed, by giving indices in reverse order
 885
 886                 final Editions ed = new Editions(firsthalf1, firsthalf2, this.delta, this.closed);
 887                 // compose new editions array from the new one and the second half of this
 888                 int[][] mushup = new int[ed.editions.length + this.editions.length - midpoint][3]; // not +1 after midpoint to not include the midpoint, which was included in the new Editions.
 889                 for (int i=0; i<=midpoint/* same as i<ed.editions.length*/; i++) {
 890                         mushup[midpoint -i] = ed.editions[i];
 891                 }
 892                 for (int i=midpoint+1; i<this.editions.length; i++) {
 893                         mushup[i] = this.editions[i];
 894                 }
 895                 // Levenshtein's distance should be recomputed ... but I don't know how, considering the above mush up
 896                 ed.vs1 = this.vs1;
 897                 ed.vs2 = this.vs2;
 898                 ed.distance = this.distance;
 899                 ed.editions = mushup;
 900
 901                 return ed;
 902         }
 903
 904 }