Major cleanup of Utils class.
[trakem2.git] / ini / trakem2 / vector / Editions.java
blob72f1b02f7132dec98bd799bf88f60afffc29ab39
1 /**
3 TrakEM2 plugin for ImageJ(C).
4 Copyright (C) 2007 Albert Cardona.
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License
8 as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.txt )
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 You may contact Albert Cardona at acardona at ini.phys.ethz.ch
20 Institute of Neuroinformatics, University of Zurich / ETH, Switzerland.
21 **/
23 package ini.trakem2.vector;
25 import java.util.*;
26 import javax.vecmath.Point3f;
27 import java.util.Collections;
28 import ini.trakem2.utils.Utils;
29 import ini.trakem2.utils.IJError;
31 /** To extract and represent the sequence of editions that convert any N-dimensional vector string to any other of the same number of dimensions. */
32 public class Editions {
34 static public final int DELETION = 1;
35 static public final int INSERTION = 2;
36 static public final int MUTATION = 3;
38 protected VectorString vs1;
39 protected VectorString vs2;
40 protected double delta;
41 protected boolean closed;
43 /** In the form [length][3] */
44 protected int[][] editions;
45 /** Levenshtein's distance between vs1 and vs2.*/
46 public double distance;
48 public Editions(final VectorString vs1, final VectorString vs2, final double delta, final boolean closed) {
49 this.vs1 = vs1;
50 this.vs2 = vs2;
51 this.delta = delta;
52 this.closed = closed;
53 init();
56 public double getDistance() { return distance; }
57 public int length() { return editions.length; }
58 public int[][] getEditions() { return editions; }
59 public VectorString getVS1() { return vs1; }
60 public VectorString getVS2() { return vs2; }
62 /** A mutation is considered an equal or near equal, and thus does not count. Only deletions and insertions count towards scoring the similarity.
64 * @param skip_ends enables ignoring sequences in the beginning and ending if they are insertions or deletions.
65 * @param max_mut indicates the maximum length of a contiguous sequence of mutations to be ignored when skipping insertions and deletions at beginning and end.
66 * @param min_chunk indicates the minimal proportion of the string that should remain between the found start and end, for vs1. The function will return the regular similarity if the chunk is too small.
69 public double getSimilarity(boolean skip_ends, final int max_mut, final float min_chunk) {
70 return getSimilarity(getStartEndSkip(skip_ends, max_mut, min_chunk));
73 private double getSimilarity(final int[] g) {
74 int i_start = g[0];
75 int i_end = g[1];
76 boolean skip_ends = 1 == g[2];
78 int non_mut = 0;
80 if (skip_ends) {
81 // count non mutations
82 for (int i=i_start; i<=i_end; i++) {
83 if (MUTATION != editions[i][0]) non_mut++;
86 // compute proper segment lengths, inlined
87 final double sim = 1.0 - ( (double)non_mut / Math.max( editions[i_end][1] - editions[i_start][1] + 1, editions[i_end][2] - editions[i_start][2] + 1) );
89 //if (sim > 0.7) Utils.log2("similarity: non_mut, len1, len2, i_start, i_end : " + non_mut + ", " + (editions[i_end][1] - editions[i_start][1] + 1) + ", " + (editions[i_end][2] - editions[i_start][2] + 1) + ", " + i_start + "," + i_end + " " + Utils.cutNumber(sim * 100, 2) + " %");
91 return sim;
92 } else {
93 for (int i=0; i<editions.length; i++) {
94 if (MUTATION != editions[i][0]) non_mut++;
97 * If the max_len is smaller than the number of non-mutations, then a NEGATIVE similarity value is returned,
98 * but it's ok. All it means is that it's not similar at all.
99 int max_len = Math.max(vs1.length(), vs2.length());
100 Utils.log2("non_mut: " + non_mut + " total: " + editions.length + " max length: " + max_len + (non_mut > max_len ? " WARNING!" : ""));
102 return 1.0 - ( (double)non_mut / Math.max(vs1.length(), vs2.length()) );
106 public double getSimilarity() {
107 return getSimilarity(false, 0, 1);
110 public double getSimilarity2() {
111 return getSimilarity2(false, 0, 1);
114 /** Returns the number of mutations / max(len(vs1), len(vs2)) : 1.0 means all are mutations and the sequences have the same lengths.*/
115 public double getSimilarity2(boolean skip_ends, final int max_mut, final float min_chunk) {
117 int[] g = getStartEndSkip(skip_ends, max_mut, min_chunk);
118 int i_start = g[0];
119 int i_end = g[1];
120 skip_ends = 1 == g[2];
122 int mut = 0;
124 if (skip_ends) {
125 // count non mutations
126 for (int i=i_start; i<i_end; i++) {
127 if (MUTATION == editions[i][0]) mut++;
130 // compute proper segment lengths, inlined
131 double sim = (double)mut / Math.max( editions[i_end][1] - editions[i_start][1] + 1, editions[i_end][2] - editions[i_start][2] + 1);
133 //if (sim > 0.7) Utils.log2("similarity: mut, len1, len2, i_start, i_end : " + mut + ", " + (editions[i_end][1] - editions[i_start][1] + 1) + ", " + (editions[i_end][2] - editions[i_start][2] + 1) + ", " + i_start + "," + i_end + " " + Utils.cutNumber(sim * 100, 2) + " %");
135 return sim;
137 } else {
138 for (int i=0; i<editions.length; i++) {
139 if (MUTATION == editions[i][0]) mut++;
141 return (double)mut / Math.max(vs1.length(), vs2.length());
146 /** Returns starting and ending indices, both inclusive. */
147 private final int[] getStartEndSkip(boolean skip_ends, final int max_mut, final float min_chunk) {
148 int i_start = 0;
149 int i_end = editions.length -1;
150 if (skip_ends) {
151 // 1 - find start:
152 int len_mut_seq = 0;
153 // break when the found sequence of continuous mutations is larger than max_mut
154 for (int i=0; i<editions.length; i++) {
155 if (MUTATION == editions[i][0]) {
156 len_mut_seq++;
157 } else {
158 // reset
159 len_mut_seq = 0;
161 if (len_mut_seq > max_mut) {
162 i_start = i - len_mut_seq +1;
163 break;
166 // find end
167 len_mut_seq = 0;
168 for (int i=i_end; i>-1; i--) {
169 if (MUTATION == editions[i][0]) {
170 len_mut_seq++;
171 } else {
172 // reset
173 len_mut_seq = 0;
175 if (len_mut_seq > max_mut) {
176 i_end = i;
177 break;
180 // determine if remaining chunk is larger than required min_chunk
181 skip_ends = ((float)(editions[i_end][1] - editions[i_start][1] + 1) / vs1.length()) >= min_chunk;
183 return new int[]{i_start, i_end, skip_ends ? 1 : 0};
186 /** Returns the distance between all points involved in a mutation; if average is false, then it returns the cummulative. Returns Double.MAX_VALUE if no mutations are found. */
187 public double getPhysicalDistance(boolean skip_ends, final int max_mut, final float min_chunk, boolean average) {
188 return getPhysicalDistance(getStartEndSkip(skip_ends, max_mut, min_chunk), average);
191 private double getPhysicalDistance(final int[] g, final boolean average) {
192 int i_start = g[0];
193 int i_end = g[1];
194 boolean skip_ends = 1 == g[2];
196 double dist = 0;
197 int len = 0;
198 int i = 0;
199 final int len1 = vs1.length();
200 final int len2 = vs2.length();
201 try {
202 for (i=i_start; i<=i_end; i++) {
203 if (MUTATION != editions[i][0]) continue;
204 int k1 = editions[i][1];
205 int k2 = editions[i][2];
206 if (len1 == k1 || len2 == k2) continue; // LAST point will fail in some occasions, needs fixing
207 dist += vs1.distance(k1, vs2, k2);
208 len++;
210 if (0 == len) return Double.MAX_VALUE;
211 if (average) return dist / len; // can len be zero ?
212 return dist;
213 } catch (Exception e) {
214 IJError.print(e);
215 Utils.log2("ERROR in getPhysicalDistance: i,len j,len : " + editions[i][1] + ", " + vs1.length() + " " + editions[i][2] + ", " + vs2.length());
216 return Double.MAX_VALUE;
220 public double getStdDev(final boolean skip_ends, final int max_mut, final float min_chunk) {
221 return getStdDev(getStartEndSkip(skip_ends, max_mut, min_chunk));
224 /** Returns the standard deviation of the distances between all points involved in a mutation. */
225 private double getStdDev(final int[] g) {
226 int i_start = g[0];
227 int i_end = g[1];
228 boolean skip_ends = 1 == g[2];
230 double dist = 0;
231 int i = 0;
232 final int len1 = vs1.length();
233 final int len2 = vs2.length();
234 final ArrayList<Double> mut = new ArrayList<Double>(); // why not ArrayList<double> ? STUPID JAVA
235 try {
236 for (i=i_start; i<=i_end; i++) {
237 if (MUTATION != editions[i][0]) continue;
238 int k1 = editions[i][1];
239 int k2 = editions[i][2];
240 if (len1 == k1 || len2 == k2) continue; // LAST point will fail in some occasions, needs fixing
241 double d = vs1.distance(k1, vs2, k2);
242 dist += d;
243 mut.add(d);
245 if (0 == mut.size()) return Double.MAX_VALUE;
246 Double[] di = new Double[mut.size()];
247 mut.toArray(di);
248 double average = dist / di.length; // can length be zero ?
249 double std = 0;
250 for (int k=0; k<di.length; k++) {
251 std += Math.pow(di[k].doubleValue() - average, 2);
253 return Math.sqrt(std / di.length);
255 } catch (Exception e) {
256 IJError.print(e);
257 Utils.log2("ERROR in getPhysicalDistance: i,len j,len : " + editions[i][1] + ", " + vs1.length() + " " + editions[i][2] + ", " + vs2.length());
258 return Double.MAX_VALUE;
262 /** Returns {average distance, cummulative distance, stdDev, median, prop_mut} which are:
264 * [0] - average distance: the average physical distance between mutation pairs
265 * [1] - cummulative distance: the sum of the distances between mutation pairs
266 * [2] - stdDev: of the physical distances between mutation pairs relative to the average
267 * [3] - median: the average medial physical distance between mutation pairs, more robust than the average to extreme values
268 * [4] - prop_mut: the proportion of mutation pairs relative to the length of the queried sequence vs1.
269 * [5] - Levenshtein's distance
270 * [6] - Similarity (@see getSimilarity)
272 public double[] getStatistics(final boolean skip_ends, final int max_mut, final float min_chunk, final boolean score_mut_only) {
273 return getStatistics(getStartEndSkip(skip_ends, max_mut, min_chunk), score_mut_only);
276 private double[] getStatistics(final int[] g, final boolean score_mut_only) {
277 int i_start = g[0];
278 int i_end = g[1];
279 boolean skip_ends = 1 == g[2];
281 int i = 0;
282 final int len1 = vs1.length();
283 final int len2 = vs2.length();
284 final ArrayList<Double> dist = new ArrayList<Double>(); // why not ArrayList<double> ? STUPID JAVA
285 final double[] pack = new double[9];
287 Arrays.fill(pack, Double.MAX_VALUE);
288 pack[4] = 0;
290 int n_mutation = 0;
292 double c_dist = 0; // cummulative distance of pairs
293 double c_dist_mut = 0; // cummulative distance of mutation pairs
295 try {
296 for (i=i_start; i<=i_end; i++) {
297 if (MUTATION == editions[i][0]) n_mutation++;
298 else if (score_mut_only) continue; // not a mutation, so continue because we want only mutations.
299 //if (score_mut_only && MUTATION != editions[i][0]) continue;
300 final int k1 = editions[i][1];
301 final int k2 = editions[i][2];
302 if (len1 == k1 || len2 == k2) continue; // LAST point will fail in some occasions, needs fixing
303 final double d = vs1.distance(k1, vs2, k2);
304 c_dist += d;
305 if (MUTATION == editions[i][0]) c_dist_mut += d; // will be the same as c_dist when 'score_mut_only' is true
306 dist.add(d);
309 // Need at least one value to make any sense of the data
310 if (0 == dist.size()) return pack;
312 final Double[] di = new Double[dist.size()];
313 dist.toArray(di);
315 final double average = c_dist / di.length;
317 double std = 0;
318 for (int k=0; k<di.length; k++) {
319 std += Math.pow(di[k].doubleValue() - average, 2);
321 std = Math.sqrt(std / di.length);
323 Collections.sort(dist);
324 final double median = dist.get(di.length/2);
326 final double prop_mut = ((double)n_mutation) / (i_end - i_start); // i_end is non-inclusive
328 // the max physical length of the measured part of the sequences
329 final double phys_len = Math.max(editions[i_end][1] - editions[i_start][1] + 1,
330 editions[i_end][2] - editions[i_start][2] + 1)
331 * vs1.getDelta(); // delta is the same for both
333 pack[0] = average;
334 pack[1] = c_dist;
335 pack[2] = std;
336 pack[3] = median;
337 pack[4] = prop_mut;
338 pack[5] = this.distance;
339 pack[6] = getSimilarity(g);
340 // proximity: Unitless value indicating proximity:
341 pack[7] = ((double)c_dist) / phys_len;
342 // proximity_mut: Unitless value indicating proximity between mutation pairs only:
343 pack[8] = score_mut_only ? pack[7] : ((double)c_dist_mut) / phys_len;
345 // When one does the proximity with the length of the query sequence only and not the max of both, then shorter ref sequences will score better.
347 } catch (Exception e) {
348 IJError.print(e);
349 Utils.log2("ERROR in getStatistics: i,len j,len : " + editions[i][1] + ", " + vs1.length() + " " + editions[i][2] + ", " + vs2.length());
352 return pack;
355 final private void init() {
356 final boolean with_source = (vs1 instanceof VectorString3D && vs2 instanceof VectorString3D) ?
357 null != ((VectorString3D)vs1).getSource() && null != ((VectorString3D)vs2).getSource()
358 : false;
359 //Utils.log2("Editions.init() : With source: " + with_source);
360 // equalize point interdistance in both strings of vectors and create the actual vectors
361 vs1.resample(delta, with_source);
362 vs2.resample(delta, with_source);
363 // fetch the optimal matrix
364 final double[][] matrix = findMinimumEditDistance();
366 final int n = vs1.length();
367 final int m = vs2.length();
369 this.distance = matrix[n][m];
371 // let's see the matrix
373 final StringBuffer sb = new StringBuffer();
374 for (int f=0; f<n+1; f++) {
375 for (int g=0; g<m+1; g++) {
376 sb.append(matrix[f][g]).append('\t');
378 sb.append('\n');
380 Utils.saveToFile(new java.io.File("/home/albert/Desktop/matrix.txt"), sb.toString());
384 final int initial_length = (int)Math.sqrt((n*n) + (m*m));
385 int i = 0;
386 int ed_length = initial_length;
387 editions = new int[ed_length][3];
388 int next = 0;
389 i = n;
390 int j = m;
391 int k;
392 final double error = 0.0000001; // for floating-point math
393 while (0 != i && 0 != j) { // the matrix is n+1,m+1 in size
394 // check editions array
395 if (next == ed_length) {
396 // enlarge editions array
397 this.editions = resizeAndFillEditionsCopy(editions, ed_length, ed_length + 20);
398 ed_length += 20;
400 // find next i, j and the type of transform:
401 if (error > Math.abs(matrix[i][j] - matrix[i-1][j] - delta)) {
402 // a deletion:
403 editions[next][0] = DELETION;
404 editions[next][1] = i;
405 editions[next][2] = j;
406 i = i-1;
407 } else if (error > Math.abs(matrix[i][j] - matrix[i][j-1] - delta)) {
408 // an insertion:
409 editions[next][0] = INSERTION;
410 editions[next][1] = i;
411 editions[next][2] = j;
412 j = j-1;
413 } else {
414 // a mutation (a trace between two elements of the string of vectors)
415 editions[next][0] = MUTATION;
416 editions[next][1] = i;
417 editions[next][2] = j;
418 i = i-1;
419 j = j-1;
421 //prepare next
422 next++;
424 // add unnoticed insertions/deletions. Happens when 'i' and 'j' don't reach the zero value at the same time.
425 if (0 != j) {
426 for (k=j; k>-1; k--) {
427 if (next == ed_length) {
428 //enlarge editions array
429 editions = resizeAndFillEditionsCopy(editions, ed_length, ed_length + 20);
430 ed_length += 20;
432 editions[next][0] = INSERTION; // insertion
433 editions[next][1] = 0; // the 'i'
434 editions[next][2] = k; // the 'j'
435 next++;
438 if (0 != i) {
439 for (k=i; k>-1; k--) {
440 if (next == ed_length) {
441 //enlarge editions array
442 editions = resizeAndFillEditionsCopy(editions, ed_length, ed_length + 20);
443 ed_length += 20;
445 editions[next][0] = DELETION; // deletion
446 editions[next][1] = k; // the 'i'
447 editions[next][2] = 0; // the 'j'
448 next++;
451 // reverse and resize editions array, and DO NOT slice out the last element.
452 if (next != ed_length) {
453 // allocate a new editions array ('next' is the length now, since it is the index of the element after the last element)
454 int[][] editions2 = new int[next][3];
455 for (i=0, j=next-1; i<next; i++, j--) {
456 editions2[i][0] = editions[j][0];
457 editions2[i][1] = editions[j][1];
458 editions2[i][2] = editions[j][2];
460 editions = editions2;
461 } else {
462 //simply reorder in place
463 int[] temp;
464 for (i=0; i<next; i++) {
465 temp = editions[i];
466 editions[i] = editions[next -1 -i];
467 editions[next -1 -i] = temp;
472 static private final int[][] resizeAndFillEditionsCopy(final int[][] editions, final int ed_length, final int new_length) {
473 // check
474 if (new_length <= ed_length) return editions;
475 // create a copy
476 final int[][] editions2 = new int[new_length][3];
477 // fill with previous values
478 for (int k=0; k<ed_length; k++) {
479 editions2[k][0] = editions[k][0];
480 editions2[k][1] = editions[k][1];
481 editions2[k][2] = editions[k][2];
483 return editions2;
486 /** Convenient tuple to store the statting index and the matrix for that index.*/
487 private class MinDist {
488 int min_j;
489 double min_dist;
490 double[][] matrix;
491 double[][] matrix_e;
494 private double[][] findMinimumEditDistance() {
495 int j, i;
496 int min_j = 0;
497 final int n = vs1.length();
498 final int m = vs2.length();
499 // allocate the first matrix
500 final double[][] matrix1 = new double[n+1][m+1];
501 // make the first element be i*delta
502 for (i=0; i < n +1; i++) {
503 matrix1[i][0] = i * delta;
505 // fill first column
506 for (j=0; j < m + 1; j++) {
507 matrix1[0][j] = j * delta;
509 // return the matrix made matching point 0 of both curves, if the curve is open.
510 if (!closed) {
511 return findEditMatrix(0, matrix1);
514 // else try every point in the second curve to see which one is the best possible match.
516 // allocate the second matrix
517 final double[][] matrix2 = new double[n+1][m+1];
518 /// fill top row values for the second matrix
519 for (i=0; i < n +1; i++) {
520 matrix2[i][0] = i * delta;
522 // make the first j row be j * delta
523 for (j=0; j < m + 1; j++) {
524 matrix2[0][j] = j * delta;
527 // the current, editable
528 double[][] matrix_e = matrix1;
531 // the one to keep
532 double[][] matrix = null;
534 // The algorithm to find the starting point, based on vector string distance:
536 // find the minimum distance
537 for (j=0; j<m; j++) {
538 // get the distance starting at 0 in p1 and at j in p2:
539 matrix_e = findEditMatrix(j, matrix_e);
541 // last element of the matrix is the string distance between p1 and p2
542 if (matrix_e[n][m] < min_dist) {
543 // record values
544 min_dist = matrix_e[n][m];
545 min_j = j;
546 // record new one
547 matrix = matrix_e;
548 // swap editable matrix
549 if (matrix1 == matrix_e) { // compare the addresses of the arrays.
550 matrix_e = matrix2;
551 } else {
552 matrix_e = matrix1;
555 // else nothing needs to be needed, the current editable matrix remains the same.
559 // A 'divide and conquer' approach: much faster, based on the fact that the distances always make a valey when plotted
560 // Find the value of one every 10% of points. Then find those intervals with the lowest starting and ending values, and then look for 50% intervals inside those, and so on, until locking into the lowest value. It will save about 80% or more of all computations.
561 MinDist min_data = new MinDist();
562 min_data.min_j = -1;
563 min_data.min_dist = Double.MAX_VALUE;
564 min_data.matrix = matrix2;
565 min_data.matrix_e = matrix1;
567 min_data = findMinDist(0, m-1, (int)Math.ceil(m * 0.1), min_data);
569 min_j = min_data.min_j;
570 matrix = min_data.matrix;
571 matrix_e = min_data.matrix_e;
573 // free the current editable matrix. The other one is returned below and will be free elsewhere after usage.
574 if (matrix != matrix_e) {
575 matrix_e = null;
576 } else {
577 System.out.println("\nERROR: matrix is matrix_e unexpectedly");
581 // Reorder the second array, so that min_j is index zero (i.e. simply making both curves start at points that are closest to each other in terms of curve similarity).
582 if (0 != min_j) {
583 vs2.reorder(min_j);
586 // return the matrix
587 return matrix;
590 /** Returns the same instance of MinDist given as a parameter (so it has to be non-null). */
591 private MinDist findMinDist(int first, int last, int interval_length, final MinDist result) {
592 double[][] matrix = result.matrix;
593 double[][] matrix_e = result.matrix_e;
594 double[][] matrix1 = matrix_e;
595 double[][] matrix2 = matrix;
597 // the iterator over p2
598 int j;
600 // size of the interval to explore
601 int k = 0;
602 int length;
603 if (last < first) {
604 length = vs2.length() - first + last;
605 } else {
606 length = last - first + 1;
609 // gather data
610 final int n = vs1.length();
611 final int m = vs2.length();
612 int min_j = result.min_j;
613 double min_dist = result.min_dist;
615 while (k < length) {
616 j = first + k;
617 // correct circular array
618 if (j >= m) {
619 j = j - m;
621 // don't do some twice: TODO this setup does not save the case when the computation was done not in the previous iteration but before.
622 if (j == result.min_j) {
623 matrix_e = result.matrix_e;
624 } else {
625 matrix_e = findEditMatrix(j, matrix_e);
627 if (matrix_e[n][m] < min_dist) {
628 // record values
629 min_j = j;
630 matrix = matrix_e;
631 min_dist = matrix[n][m];
632 // swap editable matrix
633 if (matrix_e == matrix1) { // compare the addresses of the arrays.
634 matrix_e = matrix2;
635 } else {
636 matrix_e = matrix1;
639 // advance iterator
640 if (length -1 != k && k + interval_length >= length) {
641 // do the last one (and then finish)
642 k = length -1;
643 } else {
644 k += interval_length;
648 // pack result:
649 result.min_j = min_j;
650 result.min_dist = min_dist;
651 result.matrix = matrix;
652 result.matrix_e = matrix_e;
654 if (1 == interval_length) {
655 // done!
656 return result;
657 } else {
658 first = min_j - (interval_length -1);
659 last = min_j + (interval_length -1);
660 // correct first:
661 if (first < 0) {
662 first = m + first; // a '+' so it is substracted from vs2.length
664 // correct last:
665 if (last >= m) {
666 last -= m;
668 // recurse, with half the interval length
669 interval_length = (int)Math.ceil(interval_length / 2.0f);
670 return findMinDist(first, last, interval_length, result);
674 /** Generate the matrix between vs1 and vs2. The lower right value of the matrix is the Levenshtein's distance between the two strings of vectors. @param delta is the desired point interdistance. @param first is the first index of vs2 to be matched with index zero of vs1. @param matrix is optional, for recyclying. */
675 private double[][] findEditMatrix(final int first, double[][] matrix_) {
677 final int n = vs1.length();
678 final int m = vs2.length();
680 if (null == matrix_) matrix_ = new double[n+1][m+1];
681 final double[][] matrix = matrix_;
683 int i=0, j=0;
684 for (; i < n +1; i++) {
685 matrix[i][0] = i * delta;
687 for (; j < m +1; j++) {
688 matrix[0][j] = j * delta;
690 // as optimized in the findEditMatrix in CurveMorphing_just_C.c
691 double[] mati;
692 double[] mat1;
693 double fun1, fun2, fun3;
694 double dx, dy;
695 for (i=1; i < n +1; i++) {
696 mati = matrix[i];
697 mat1 = matrix[i-1];
698 for (j=1; j < m +1; j++) {
699 // cost deletion:
700 fun1 = mat1[j] + delta; // matrix[i-1][j] + delta
701 // cost insertion:
702 fun2 = mati[j-1] + delta; // matrix[i][j-1] + delta
703 // cost mutation:
704 if (i == n || j == m) {
705 fun3 = mat1[j-1]; // matrix[i-1][j-1]
706 } else {
707 //dx = v_x1[i] - v_x2[j];
708 //dy = v_y1[i] - v_y2[j];
709 fun3 = mat1[j-1] + vs1.getDiffVectorLength(i, j, vs2); // Math.sqrt(dx*dx + dy*dy); // the vector length is the hypothenusa.
711 // insert the lowest value in the matrix.
712 // since most are mutations, start with fun3:
713 if (fun3 <= fun1 && fun3 <= fun2) {
714 mati[j] = fun3;//matrix[i][jj] = fun3;
715 } else if (fun1 <= fun2 && fun1 <= fun3) {
716 mati[j] = fun1;//matrix[i][jj] = fun1;
717 } else {
718 mati[j] = fun2;//matrix[i][jj] = fun2;
723 return matrix;
726 /** Get the sequence of editions and matches in three lines, like:
727 * vs1: 1 2 3 4 5 6 7 8 9
728 * M M D M M M I I M M M
729 * vs2: 1 2 3 4 5 6 7 8 9
731 * With the given separator (defaults to tab if null)
733 public String prettyPrint(String separator) {
734 if (null == editions) return null;
735 if (null == separator) separator = "\t";
736 final StringBuffer s1 = new StringBuffer(editions.length*2 + 5);
737 final StringBuffer se = new StringBuffer(editions.length*2 + 5);
738 final StringBuffer s2 = new StringBuffer(editions.length*2 + 5);
739 s1.append("vs1:");
740 se.append(" ");
741 s2.append("vs2:");
742 for (int i=0; i<editions.length; i++) {
743 switch (editions[i][0]) {
744 case MUTATION:
745 s1.append(separator).append(editions[i][1] + 1);
746 se.append(separator).append('M');
747 s2.append(separator).append(editions[i][2] + 1);
748 break;
749 case DELETION:
750 s1.append(separator).append(editions[i][1] + 1);
751 se.append(separator).append('D');
752 s2.append(separator).append(' ');
753 break;
754 case INSERTION:
755 s1.append(separator).append(' ');
756 se.append(separator).append('I');
757 s2.append(separator).append(editions[i][2] + 1);
758 break;
761 s1.append('\n');
762 se.append('\n');
763 s2.append('\n');
764 return s1.append(se).append(s2).toString();
767 static public class Chunk {
768 int i_start, i_end;
769 Chunk(int i_start, int i_end) {
770 this.i_start = i_start;
771 this.i_end = i_end;
773 int length() { return i_end - i_start + 1; }
777 private class ChunkComparator implements Comparator {
778 public int compare(Object o1, Object o2) {
779 Chunck c1 = (Chunck)o1;
780 Chunck c2 = (Chunck)o2;
781 double val = c1.length() - c2.length();
782 if (val < 0) return -1; // c1 is shorter
783 if (val > 0) return 1; // c1 is longer
784 return 0; // equally long
789 public Chunk findLargestMutationChunk(final int max_non_mut) {
790 // find the longest chunk of contiguous mutations
791 // with no interruptions (insertions or deletions) longer than max_non_mut
792 final ArrayList<Chunk> chunks = new ArrayList<Chunk>();
794 // search for chunks of consecutive mutations, with any number of gaps of maximum length max_non_mut
795 Chunk cur = new Chunk(0, 0);
796 boolean reached_mut = false;
797 int non_mut = 0;
799 for (int i=0; i<editions.length; i++) {
800 if (MUTATION == editions[i][0]) {
801 cur.i_end = i;
802 if (!reached_mut) reached_mut = true;
803 } else {
804 if (!reached_mut) cur.i_start = i; // move forward until finding the first mutation
805 else non_mut++; // if it has reached some mut, count non-muts
806 if (non_mut > max_non_mut || editions.length -1 == i) {
807 // break current chain, try to add it if long enough
808 cur.i_end = i;
809 if (0 != chunks.size()) {
810 // else, check if its equally long or longer than any existent one
811 final Chunk[] c = new Chunk[chunks.size()];
812 chunks.toArray(c);
813 boolean add = false;
814 final int len = cur.length();
815 for (int k=c.length-1; k>-1; k--) {
816 int clen = c[k].length();
817 if (len > clen) {
818 chunks.remove(c[k]); // remove all found that are shorter
820 if (!add && len >= clen) {
821 add = true;
824 if (add) chunks.add(cur);
825 } else {
826 // if none yet, just add it
827 chunks.add(cur);
829 // create new current
830 cur = new Chunk(i, i);
831 reached_mut =false;
832 non_mut = 0;
837 if (0 == chunks.size()) {
838 Utils.log2("No chunks found.");
839 return null;
842 // select a chunk
843 Chunk chunk = null;
844 if (1 == chunks.size()) chunk = chunks.get(0);
845 else {
846 // All added chunks have the same length (otherwise would have been deleted)
847 // Find the one with the smallest cummulative physical distance
848 double[] dist = new double[chunks.size()];
849 int next = 0;
850 Hashtable<Chunk,Double> ht = new Hashtable<Chunk,Double>();
851 for (Chunk c : chunks) {
852 dist[next] = getPhysicalDistance(new int[]{c.i_start, c.i_end, max_non_mut}, false);
853 ht.put(c, dist[next]);
854 next++;
856 Arrays.sort(dist);
857 for (Iterator it = ht.entrySet().iterator(); it.hasNext(); ) {
858 Map.Entry entry = (Map.Entry)it.next();
859 if ( ((Double)entry.getValue()).doubleValue() == dist[0]) {
860 chunk = (Chunk)entry.getKey();
861 break;
866 return chunk;
869 /** Find the longest chunk of mutations (which can include chunks of up to max_non_mut of non-mutations),
870 * then take the center point and split both vector strings there, perform matching towards the ends,
871 * and assemble a new Editions object.
873 public Editions recreateFromCenter(final int max_non_mut) throws Exception {
875 final Chunk chunk = findLargestMutationChunk(max_non_mut);
877 // from the midpoint, create two vector strings and reverse them, and compute editions for them
878 // (so: get new edition sequence from chunk's midpoint to zero)
880 final int midpoint = (chunk.i_start + chunk.i_end) / 2;
881 if (0 == midpoint) return null;
883 VectorString3D firsthalf1 = (VectorString3D)vs1.subVectorString(editions[midpoint][1], 0); // already reversed, by giving indices in reverse order
884 VectorString3D firsthalf2 = (VectorString3D)vs2.subVectorString(editions[midpoint][2], 0); // already reversed, by giving indices in reverse order
886 final Editions ed = new Editions(firsthalf1, firsthalf2, this.delta, this.closed);
887 // compose new editions array from the new one and the second half of this
888 int[][] mushup = new int[ed.editions.length + this.editions.length - midpoint][3]; // not +1 after midpoint to not include the midpoint, which was included in the new Editions.
889 for (int i=0; i<=midpoint/* same as i<ed.editions.length*/; i++) {
890 mushup[midpoint -i] = ed.editions[i];
892 for (int i=midpoint+1; i<this.editions.length; i++) {
893 mushup[i] = this.editions[i];
895 // Levenshtein's distance should be recomputed ... but I don't know how, considering the above mush up
896 ed.vs1 = this.vs1;
897 ed.vs2 = this.vs2;
898 ed.distance = this.distance;
899 ed.editions = mushup;
901 return ed;