3 TrakEM2 plugin for ImageJ(C).
4 Copyright (C) 2007-2009 Albert Cardona.
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License
8 as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.txt )
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 You may contact Albert Cardona at acardona at ini.phys.ethz.ch
20 Institute of Neuroinformatics, University of Zurich / ETH, Switzerland.
23 package ini
.trakem2
.vector
;
26 import javax
.vecmath
.Point3f
;
27 import java
.util
.Collections
;
28 import ini
.trakem2
.utils
.Utils
;
29 import ini
.trakem2
.utils
.IJError
;
31 /** To extract and represent the sequence of editions that convert any N-dimensional vector string to any other of the same number of dimensions. */
32 public class Editions
{
34 static public final int DELETION
= 1;
35 static public final int INSERTION
= 2;
36 static public final int MUTATION
= 3;
38 protected VectorString vs1
;
39 protected VectorString vs2
;
40 protected double delta
;
41 protected boolean closed
;
43 /** In the form [length][3] */
44 protected int[][] editions
;
45 /** Levenshtein's distance between vs1 and vs2.*/
46 public double distance
;
48 public Editions(final VectorString vs1
, final VectorString vs2
, final double delta
, final boolean closed
) {
56 public double getDistance() { return distance
; }
57 public int length() { return editions
.length
; }
58 public int[][] getEditions() { return editions
; }
59 public VectorString
getVS1() { return vs1
; }
60 public VectorString
getVS2() { return vs2
; }
62 /** A mutation is considered an equal or near equal, and thus does not count. Only deletions and insertions count towards scoring the similarity.
64 * @param skip_ends enables ignoring sequences in the beginning and ending if they are insertions or deletions.
65 * @param max_mut indicates the maximum length of a contiguous sequence of mutations to be ignored when skipping insertions and deletions at beginning and end.
66 * @param min_chunk indicates the minimal proportion of the string that should remain between the found start and end, for vs1. The function will return the regular similarity if the chunk is too small.
69 public double getSimilarity(boolean skip_ends
, final int max_mut
, final float min_chunk
) {
70 return getSimilarity(getStartEndSkip(skip_ends
, max_mut
, min_chunk
));
73 private double getSimilarity(final int[] g
) {
76 boolean skip_ends
= 1 == g
[2];
81 // count non mutations
82 for (int i
=i_start
; i
<=i_end
; i
++) {
83 if (MUTATION
!= editions
[i
][0]) non_mut
++;
86 // compute proper segment lengths, inlined
87 final double sim
= 1.0 - ( (double)non_mut
/ Math
.max( editions
[i_end
][1] - editions
[i_start
][1] + 1, editions
[i_end
][2] - editions
[i_start
][2] + 1) );
89 //if (sim > 0.7) Utils.log2("similarity: non_mut, len1, len2, i_start, i_end : " + non_mut + ", " + (editions[i_end][1] - editions[i_start][1] + 1) + ", " + (editions[i_end][2] - editions[i_start][2] + 1) + ", " + i_start + "," + i_end + " " + Utils.cutNumber(sim * 100, 2) + " %");
93 for (int i
=0; i
<editions
.length
; i
++) {
94 if (MUTATION
!= editions
[i
][0]) non_mut
++;
97 * If the max_len is smaller than the number of non-mutations, then a NEGATIVE similarity value is returned,
98 * but it's ok. All it means is that it's not similar at all.
99 int max_len = Math.max(vs1.length(), vs2.length());
100 Utils.log2("non_mut: " + non_mut + " total: " + editions.length + " max length: " + max_len + (non_mut > max_len ? " WARNING!" : ""));
102 return 1.0 - ( (double)non_mut
/ Math
.max(vs1
.length(), vs2
.length()) );
106 public double getSimilarity() {
107 return getSimilarity(false, 0, 1);
110 public double getSimilarity2() {
111 return getSimilarity2(false, 0, 1);
114 /** Returns the number of mutations / max(len(vs1), len(vs2)) : 1.0 means all are mutations and the sequences have the same lengths.*/
115 public double getSimilarity2(boolean skip_ends
, final int max_mut
, final float min_chunk
) {
117 int[] g
= getStartEndSkip(skip_ends
, max_mut
, min_chunk
);
120 skip_ends
= 1 == g
[2];
125 // count non mutations
126 for (int i
=i_start
; i
<i_end
; i
++) {
127 if (MUTATION
== editions
[i
][0]) mut
++;
130 // compute proper segment lengths, inlined
131 double sim
= (double)mut
/ Math
.max( editions
[i_end
][1] - editions
[i_start
][1] + 1, editions
[i_end
][2] - editions
[i_start
][2] + 1);
133 //if (sim > 0.7) Utils.log2("similarity: mut, len1, len2, i_start, i_end : " + mut + ", " + (editions[i_end][1] - editions[i_start][1] + 1) + ", " + (editions[i_end][2] - editions[i_start][2] + 1) + ", " + i_start + "," + i_end + " " + Utils.cutNumber(sim * 100, 2) + " %");
138 for (int i
=0; i
<editions
.length
; i
++) {
139 if (MUTATION
== editions
[i
][0]) mut
++;
141 return (double)mut
/ Math
.max(vs1
.length(), vs2
.length());
146 /** Returns starting and ending indices, both inclusive. */
147 private final int[] getStartEndSkip(boolean skip_ends
, final int max_mut
, final float min_chunk
) {
149 int i_end
= editions
.length
-1;
153 // break when the found sequence of continuous mutations is larger than max_mut
154 for (int i
=0; i
<editions
.length
; i
++) {
155 if (MUTATION
== editions
[i
][0]) {
161 if (len_mut_seq
> max_mut
) {
162 i_start
= i
- len_mut_seq
+1;
168 for (int i
=i_end
; i
>-1; i
--) {
169 if (MUTATION
== editions
[i
][0]) {
175 if (len_mut_seq
> max_mut
) {
180 // determine if remaining chunk is larger than required min_chunk
181 skip_ends
= ((float)(editions
[i_end
][1] - editions
[i_start
][1] + 1) / vs1
.length()) >= min_chunk
;
183 return new int[]{i_start
, i_end
, skip_ends ?
1 : 0};
186 /** Returns the distance between all points involved in a mutation; if average is false, then it returns the cummulative. Returns Double.MAX_VALUE if no mutations are found. */
187 public double getPhysicalDistance(boolean skip_ends
, final int max_mut
, final float min_chunk
, boolean average
) {
188 return getPhysicalDistance(getStartEndSkip(skip_ends
, max_mut
, min_chunk
), average
);
191 private double getPhysicalDistance(final int[] g
, final boolean average
) {
194 boolean skip_ends
= 1 == g
[2];
199 final int len1
= vs1
.length();
200 final int len2
= vs2
.length();
202 for (i
=i_start
; i
<=i_end
; i
++) {
203 if (MUTATION
!= editions
[i
][0]) continue;
204 int k1
= editions
[i
][1];
205 int k2
= editions
[i
][2];
206 if (len1
== k1
|| len2
== k2
) continue; // LAST point will fail in some occasions, needs fixing
207 dist
+= vs1
.distance(k1
, vs2
, k2
);
210 if (0 == len
) return Double
.MAX_VALUE
;
211 if (average
) return dist
/ len
; // can len be zero ?
213 } catch (Exception e
) {
215 Utils
.log2("ERROR in getPhysicalDistance: i,len j,len : " + editions
[i
][1] + ", " + vs1
.length() + " " + editions
[i
][2] + ", " + vs2
.length());
216 return Double
.MAX_VALUE
;
220 public double getStdDev(final boolean skip_ends
, final int max_mut
, final float min_chunk
) {
221 return getStdDev(getStartEndSkip(skip_ends
, max_mut
, min_chunk
));
224 /** Returns the standard deviation of the distances between all points involved in a mutation. */
225 private double getStdDev(final int[] g
) {
228 boolean skip_ends
= 1 == g
[2];
232 final int len1
= vs1
.length();
233 final int len2
= vs2
.length();
234 final ArrayList
<Double
> mut
= new ArrayList
<Double
>(); // why not ArrayList<double> ? STUPID JAVA
236 for (i
=i_start
; i
<=i_end
; i
++) {
237 if (MUTATION
!= editions
[i
][0]) continue;
238 int k1
= editions
[i
][1];
239 int k2
= editions
[i
][2];
240 if (len1
== k1
|| len2
== k2
) continue; // LAST point will fail in some occasions, needs fixing
241 double d
= vs1
.distance(k1
, vs2
, k2
);
245 if (0 == mut
.size()) return Double
.MAX_VALUE
;
246 Double
[] di
= new Double
[mut
.size()];
248 double average
= dist
/ di
.length
; // can length be zero ?
250 for (int k
=0; k
<di
.length
; k
++) {
251 std
+= Math
.pow(di
[k
].doubleValue() - average
, 2);
253 return Math
.sqrt(std
/ di
.length
);
255 } catch (Exception e
) {
257 Utils
.log2("ERROR in getPhysicalDistance: i,len j,len : " + editions
[i
][1] + ", " + vs1
.length() + " " + editions
[i
][2] + ", " + vs2
.length());
258 return Double
.MAX_VALUE
;
262 /** Returns {average distance, cummulative distance, stdDev, median, prop_mut} which are:
264 * [0] - average distance: the average physical distance between mutation pairs
265 * [1] - cummulative distance: the sum of the distances between mutation pairs
266 * [2] - stdDev: of the physical distances between mutation pairs relative to the average
267 * [3] - median: the average medial physical distance between mutation pairs, more robust than the average to extreme values
268 * [4] - prop_mut: the proportion of mutation pairs relative to the length of the queried sequence vs1.
269 * [5] - Levenshtein's distance
270 * [6] - Similarity (@see getSimilarity)
272 public double[] getStatistics(final boolean skip_ends
, final int max_mut
, final float min_chunk
, final boolean score_mut_only
) {
273 return getStatistics(getStartEndSkip(skip_ends
, max_mut
, min_chunk
), score_mut_only
);
276 private double[] getStatistics(final int[] g
, final boolean score_mut_only
) {
279 boolean skip_ends
= 1 == g
[2];
282 final int len1
= vs1
.length();
283 final int len2
= vs2
.length();
284 final ArrayList
<Double
> dist
= new ArrayList
<Double
>(); // why not ArrayList<double> ? STUPID JAVA
285 final double[] pack
= new double[9];
287 Arrays
.fill(pack
, Double
.MAX_VALUE
);
292 double c_dist
= 0; // cummulative distance of pairs
293 double c_dist_mut
= 0; // cummulative distance of mutation pairs
296 for (i
=i_start
; i
<=i_end
; i
++) {
297 if (MUTATION
== editions
[i
][0]) n_mutation
++;
298 else if (score_mut_only
) continue; // not a mutation, so continue because we want only mutations.
299 //if (score_mut_only && MUTATION != editions[i][0]) continue;
300 final int k1
= editions
[i
][1];
301 final int k2
= editions
[i
][2];
302 if (len1
== k1
|| len2
== k2
) continue; // LAST point will fail in some occasions, needs fixing
303 final double d
= vs1
.distance(k1
, vs2
, k2
);
305 if (MUTATION
== editions
[i
][0]) c_dist_mut
+= d
; // will be the same as c_dist when 'score_mut_only' is true
309 // Need at least one value to make any sense of the data
310 if (0 == dist
.size()) return pack
;
312 final Double
[] di
= new Double
[dist
.size()];
315 final double average
= c_dist
/ di
.length
;
318 for (int k
=0; k
<di
.length
; k
++) {
319 std
+= Math
.pow(di
[k
].doubleValue() - average
, 2);
321 std
= Math
.sqrt(std
/ di
.length
);
323 Collections
.sort(dist
);
324 final double median
= dist
.get(di
.length
/2);
326 final double prop_mut
= ((double)n_mutation
) / (i_end
- i_start
); // i_end is non-inclusive
328 // the max physical length of the measured part of the sequences
329 final double phys_len
= Math
.max(editions
[i_end
][1] - editions
[i_start
][1] + 1,
330 editions
[i_end
][2] - editions
[i_start
][2] + 1)
331 * vs1
.getDelta(); // delta is the same for both
338 pack
[5] = this.distance
;
339 pack
[6] = getSimilarity(g
);
340 // proximity: Unitless value indicating proximity:
341 pack
[7] = ((double)c_dist
) / phys_len
;
342 // proximity_mut: Unitless value indicating proximity between mutation pairs only:
343 pack
[8] = score_mut_only ? pack
[7] : ((double)c_dist_mut
) / phys_len
;
345 // When one does the proximity with the length of the query sequence only and not the max of both, then shorter ref sequences will score better.
347 } catch (Exception e
) {
349 Utils
.log2("ERROR in getStatistics: i,len j,len : " + editions
[i
][1] + ", " + vs1
.length() + " " + editions
[i
][2] + ", " + vs2
.length());
355 final private void init() {
356 final boolean with_source
= (vs1
instanceof VectorString3D
&& vs2
instanceof VectorString3D
) ?
357 null != ((VectorString3D
)vs1
).getSource() && null != ((VectorString3D
)vs2
).getSource()
359 //Utils.log2("Editions.init() : With source: " + with_source);
360 // equalize point interdistance in both strings of vectors and create the actual vectors
361 vs1
.resample(delta
, with_source
);
362 vs2
.resample(delta
, with_source
);
363 // fetch the optimal matrix
364 final double[][] matrix
= findMinimumEditDistance();
366 final int n
= vs1
.length();
367 final int m
= vs2
.length();
369 this.distance
= matrix
[n
][m
];
371 // let's see the matrix
373 final StringBuffer sb = new StringBuffer();
374 for (int f=0; f<n+1; f++) {
375 for (int g=0; g<m+1; g++) {
376 sb.append(matrix[f][g]).append('\t');
380 Utils.saveToFile(new java.io.File("/home/albert/Desktop/matrix.txt"), sb.toString());
384 final int initial_length
= (int)Math
.sqrt((n
*n
) + (m
*m
));
386 int ed_length
= initial_length
;
387 editions
= new int[ed_length
][3];
392 final double error
= 0.0000001; // for floating-point math
393 while (0 != i
&& 0 != j
) { // the matrix is n+1,m+1 in size
394 // check editions array
395 if (next
== ed_length
) {
396 // enlarge editions array
397 this.editions
= resizeAndFillEditionsCopy(editions
, ed_length
, ed_length
+ 20);
400 // find next i, j and the type of transform:
401 if (error
> Math
.abs(matrix
[i
][j
] - matrix
[i
-1][j
] - delta
)) {
403 editions
[next
][0] = DELETION
;
404 editions
[next
][1] = i
;
405 editions
[next
][2] = j
;
407 } else if (error
> Math
.abs(matrix
[i
][j
] - matrix
[i
][j
-1] - delta
)) {
409 editions
[next
][0] = INSERTION
;
410 editions
[next
][1] = i
;
411 editions
[next
][2] = j
;
414 // a mutation (a trace between two elements of the string of vectors)
415 editions
[next
][0] = MUTATION
;
416 editions
[next
][1] = i
;
417 editions
[next
][2] = j
;
424 // add unnoticed insertions/deletions. Happens when 'i' and 'j' don't reach the zero value at the same time.
426 for (k
=j
; k
>-1; k
--) {
427 if (next
== ed_length
) {
428 //enlarge editions array
429 editions
= resizeAndFillEditionsCopy(editions
, ed_length
, ed_length
+ 20);
432 editions
[next
][0] = INSERTION
; // insertion
433 editions
[next
][1] = 0; // the 'i'
434 editions
[next
][2] = k
; // the 'j'
439 for (k
=i
; k
>-1; k
--) {
440 if (next
== ed_length
) {
441 //enlarge editions array
442 editions
= resizeAndFillEditionsCopy(editions
, ed_length
, ed_length
+ 20);
445 editions
[next
][0] = DELETION
; // deletion
446 editions
[next
][1] = k
; // the 'i'
447 editions
[next
][2] = 0; // the 'j'
451 // reverse and resize editions array, and DO NOT slice out the last element.
452 if (next
!= ed_length
) {
453 // allocate a new editions array ('next' is the length now, since it is the index of the element after the last element)
454 int[][] editions2
= new int[next
][3];
455 for (i
=0, j
=next
-1; i
<next
; i
++, j
--) {
456 editions2
[i
][0] = editions
[j
][0];
457 editions2
[i
][1] = editions
[j
][1];
458 editions2
[i
][2] = editions
[j
][2];
460 editions
= editions2
;
462 //simply reorder in place
464 for (i
=0; i
<next
; i
++) {
466 editions
[i
] = editions
[next
-1 -i
];
467 editions
[next
-1 -i
] = temp
;
472 static private final int[][] resizeAndFillEditionsCopy(final int[][] editions
, final int ed_length
, final int new_length
) {
474 if (new_length
<= ed_length
) return editions
;
476 final int[][] editions2
= new int[new_length
][3];
477 // fill with previous values
478 for (int k
=0; k
<ed_length
; k
++) {
479 editions2
[k
][0] = editions
[k
][0];
480 editions2
[k
][1] = editions
[k
][1];
481 editions2
[k
][2] = editions
[k
][2];
486 /** Convenient tuple to store the statting index and the matrix for that index.*/
487 private class MinDist
{
494 private double[][] findMinimumEditDistance() {
497 final int n
= vs1
.length();
498 final int m
= vs2
.length();
499 // allocate the first matrix
500 final double[][] matrix1
= new double[n
+1][m
+1];
501 // make the first element be i*delta
502 for (i
=0; i
< n
+1; i
++) {
503 matrix1
[i
][0] = i
* delta
;
506 for (j
=0; j
< m
+ 1; j
++) {
507 matrix1
[0][j
] = j
* delta
;
509 // return the matrix made matching point 0 of both curves, if the curve is open.
511 return findEditMatrix(0, matrix1
);
514 // else try every point in the second curve to see which one is the best possible match.
516 // allocate the second matrix
517 final double[][] matrix2
= new double[n
+1][m
+1];
518 /// fill top row values for the second matrix
519 for (i
=0; i
< n
+1; i
++) {
520 matrix2
[i
][0] = i
* delta
;
522 // make the first j row be j * delta
523 for (j
=0; j
< m
+ 1; j
++) {
524 matrix2
[0][j
] = j
* delta
;
527 // the current, editable
528 double[][] matrix_e
= matrix1
;
532 double[][] matrix
= null;
534 // The algorithm to find the starting point, based on vector string distance:
536 // find the minimum distance
537 for (j=0; j<m; j++) {
538 // get the distance starting at 0 in p1 and at j in p2:
539 matrix_e = findEditMatrix(j, matrix_e);
541 // last element of the matrix is the string distance between p1 and p2
542 if (matrix_e[n][m] < min_dist) {
544 min_dist = matrix_e[n][m];
548 // swap editable matrix
549 if (matrix1 == matrix_e) { // compare the addresses of the arrays.
555 // else nothing needs to be needed, the current editable matrix remains the same.
559 // A 'divide and conquer' approach: much faster, based on the fact that the distances always make a valey when plotted
560 // Find the value of one every 10% of points. Then find those intervals with the lowest starting and ending values, and then look for 50% intervals inside those, and so on, until locking into the lowest value. It will save about 80% or more of all computations.
561 MinDist min_data
= new MinDist();
563 min_data
.min_dist
= Double
.MAX_VALUE
;
564 min_data
.matrix
= matrix2
;
565 min_data
.matrix_e
= matrix1
;
567 min_data
= findMinDist(0, m
-1, (int)Math
.ceil(m
* 0.1), min_data
);
569 min_j
= min_data
.min_j
;
570 matrix
= min_data
.matrix
;
571 matrix_e
= min_data
.matrix_e
;
573 // free the current editable matrix. The other one is returned below and will be free elsewhere after usage.
574 if (matrix
!= matrix_e
) {
577 System
.out
.println("\nERROR: matrix is matrix_e unexpectedly");
581 // Reorder the second array, so that min_j is index zero (i.e. simply making both curves start at points that are closest to each other in terms of curve similarity).
590 /** Returns the same instance of MinDist given as a parameter (so it has to be non-null). */
591 private MinDist
findMinDist(int first
, int last
, int interval_length
, final MinDist result
) {
592 double[][] matrix
= result
.matrix
;
593 double[][] matrix_e
= result
.matrix_e
;
594 double[][] matrix1
= matrix_e
;
595 double[][] matrix2
= matrix
;
597 // the iterator over p2
600 // size of the interval to explore
604 length
= vs2
.length() - first
+ last
;
606 length
= last
- first
+ 1;
610 final int n
= vs1
.length();
611 final int m
= vs2
.length();
612 int min_j
= result
.min_j
;
613 double min_dist
= result
.min_dist
;
617 // correct circular array
621 // don't do some twice: TODO this setup does not save the case when the computation was done not in the previous iteration but before.
622 if (j
== result
.min_j
) {
623 matrix_e
= result
.matrix_e
;
625 matrix_e
= findEditMatrix(j
, matrix_e
);
627 if (matrix_e
[n
][m
] < min_dist
) {
631 min_dist
= matrix
[n
][m
];
632 // swap editable matrix
633 if (matrix_e
== matrix1
) { // compare the addresses of the arrays.
640 if (length
-1 != k
&& k
+ interval_length
>= length
) {
641 // do the last one (and then finish)
644 k
+= interval_length
;
649 result
.min_j
= min_j
;
650 result
.min_dist
= min_dist
;
651 result
.matrix
= matrix
;
652 result
.matrix_e
= matrix_e
;
654 if (1 == interval_length
) {
658 first
= min_j
- (interval_length
-1);
659 last
= min_j
+ (interval_length
-1);
662 first
= m
+ first
; // a '+' so it is substracted from vs2.length
668 // recurse, with half the interval length
669 interval_length
= (int)Math
.ceil(interval_length
/ 2.0f
);
670 return findMinDist(first
, last
, interval_length
, result
);
674 /** Generate the matrix between vs1 and vs2. The lower right value of the matrix is the Levenshtein's distance between the two strings of vectors. @param delta is the desired point interdistance. @param first is the first index of vs2 to be matched with index zero of vs1. @param matrix is optional, for recyclying. */
675 private double[][] findEditMatrix(final int first
, double[][] matrix_
) {
677 final int n
= vs1
.length();
678 final int m
= vs2
.length();
680 if (null == matrix_
) matrix_
= new double[n
+1][m
+1];
681 final double[][] matrix
= matrix_
;
684 for (; i
< n
+1; i
++) {
685 matrix
[i
][0] = i
* delta
;
687 for (; j
< m
+1; j
++) {
688 matrix
[0][j
] = j
* delta
;
690 // as optimized in the findEditMatrix in CurveMorphing_just_C.c
693 double fun1
, fun2
, fun3
;
695 for (i
=1; i
< n
+1; i
++) {
698 for (j
=1; j
< m
+1; j
++) {
700 fun1
= mat1
[j
] + delta
; // matrix[i-1][j] + delta
702 fun2
= mati
[j
-1] + delta
; // matrix[i][j-1] + delta
704 if (i
== n
|| j
== m
) {
705 fun3
= mat1
[j
-1]; // matrix[i-1][j-1]
707 //dx = v_x1[i] - v_x2[j];
708 //dy = v_y1[i] - v_y2[j];
709 fun3
= mat1
[j
-1] + vs1
.getDiffVectorLength(i
, j
, vs2
); // Math.sqrt(dx*dx + dy*dy); // the vector length is the hypothenusa.
711 // insert the lowest value in the matrix.
712 // since most are mutations, start with fun3:
713 if (fun3
<= fun1
&& fun3
<= fun2
) {
714 mati
[j
] = fun3
;//matrix[i][jj] = fun3;
715 } else if (fun1
<= fun2
&& fun1
<= fun3
) {
716 mati
[j
] = fun1
;//matrix[i][jj] = fun1;
718 mati
[j
] = fun2
;//matrix[i][jj] = fun2;
726 /** Get the sequence of editions and matches in three lines, like:
727 * vs1: 1 2 3 4 5 6 7 8 9
728 * M M D M M M I I M M M
729 * vs2: 1 2 3 4 5 6 7 8 9
731 * With the given separator (defaults to tab if null)
733 public String
prettyPrint(String separator
) {
734 if (null == editions
) return null;
735 if (null == separator
) separator
= "\t";
736 final StringBuffer s1
= new StringBuffer(editions
.length
*2 + 5);
737 final StringBuffer se
= new StringBuffer(editions
.length
*2 + 5);
738 final StringBuffer s2
= new StringBuffer(editions
.length
*2 + 5);
742 for (int i
=0; i
<editions
.length
; i
++) {
743 switch (editions
[i
][0]) {
745 s1
.append(separator
).append(editions
[i
][1] + 1);
746 se
.append(separator
).append('M');
747 s2
.append(separator
).append(editions
[i
][2] + 1);
750 s1
.append(separator
).append(editions
[i
][1] + 1);
751 se
.append(separator
).append('D');
752 s2
.append(separator
).append(' ');
755 s1
.append(separator
).append(' ');
756 se
.append(separator
).append('I');
757 s2
.append(separator
).append(editions
[i
][2] + 1);
764 return s1
.append(se
).append(s2
).toString();
767 static public class Chunk
{
769 Chunk(int i_start
, int i_end
) {
770 this.i_start
= i_start
;
773 int length() { return i_end
- i_start
+ 1; }
777 private class ChunkComparator implements Comparator {
778 public int compare(Object o1, Object o2) {
779 Chunck c1 = (Chunck)o1;
780 Chunck c2 = (Chunck)o2;
781 double val = c1.length() - c2.length();
782 if (val < 0) return -1; // c1 is shorter
783 if (val > 0) return 1; // c1 is longer
784 return 0; // equally long
789 public Chunk
findLargestMutationChunk(final int max_non_mut
) {
790 // find the longest chunk of contiguous mutations
791 // with no interruptions (insertions or deletions) longer than max_non_mut
792 final ArrayList
<Chunk
> chunks
= new ArrayList
<Chunk
>();
794 // search for chunks of consecutive mutations, with any number of gaps of maximum length max_non_mut
795 Chunk cur
= new Chunk(0, 0);
796 boolean reached_mut
= false;
799 for (int i
=0; i
<editions
.length
; i
++) {
800 if (MUTATION
== editions
[i
][0]) {
802 if (!reached_mut
) reached_mut
= true;
804 if (!reached_mut
) cur
.i_start
= i
; // move forward until finding the first mutation
805 else non_mut
++; // if it has reached some mut, count non-muts
806 if (non_mut
> max_non_mut
|| editions
.length
-1 == i
) {
807 // break current chain, try to add it if long enough
809 if (0 != chunks
.size()) {
810 // else, check if its equally long or longer than any existent one
811 final Chunk
[] c
= new Chunk
[chunks
.size()];
814 final int len
= cur
.length();
815 for (int k
=c
.length
-1; k
>-1; k
--) {
816 int clen
= c
[k
].length();
818 chunks
.remove(c
[k
]); // remove all found that are shorter
820 if (!add
&& len
>= clen
) {
824 if (add
) chunks
.add(cur
);
826 // if none yet, just add it
829 // create new current
830 cur
= new Chunk(i
, i
);
837 if (0 == chunks
.size()) {
838 Utils
.log2("No chunks found.");
844 if (1 == chunks
.size()) chunk
= chunks
.get(0);
846 // All added chunks have the same length (otherwise would have been deleted)
847 // Find the one with the smallest cummulative physical distance
848 double[] dist
= new double[chunks
.size()];
850 Hashtable
<Chunk
,Double
> ht
= new Hashtable
<Chunk
,Double
>();
851 for (Chunk c
: chunks
) {
852 dist
[next
] = getPhysicalDistance(new int[]{c
.i_start
, c
.i_end
, max_non_mut
}, false);
853 ht
.put(c
, dist
[next
]);
857 for (Iterator it
= ht
.entrySet().iterator(); it
.hasNext(); ) {
858 Map
.Entry entry
= (Map
.Entry
)it
.next();
859 if ( ((Double
)entry
.getValue()).doubleValue() == dist
[0]) {
860 chunk
= (Chunk
)entry
.getKey();
869 /** Find the longest chunk of mutations (which can include chunks of up to max_non_mut of non-mutations),
870 * then take the center point and split both vector strings there, perform matching towards the ends,
871 * and assemble a new Editions object.
873 public Editions
recreateFromCenter(final int max_non_mut
) throws Exception
{
875 final Chunk chunk
= findLargestMutationChunk(max_non_mut
);
877 // from the midpoint, create two vector strings and reverse them, and compute editions for them
878 // (so: get new edition sequence from chunk's midpoint to zero)
880 final int midpoint
= (chunk
.i_start
+ chunk
.i_end
) / 2;
881 if (0 == midpoint
) return null;
883 VectorString3D firsthalf1
= (VectorString3D
)vs1
.subVectorString(editions
[midpoint
][1], 0); // already reversed, by giving indices in reverse order
884 VectorString3D firsthalf2
= (VectorString3D
)vs2
.subVectorString(editions
[midpoint
][2], 0); // already reversed, by giving indices in reverse order
886 final Editions ed
= new Editions(firsthalf1
, firsthalf2
, this.delta
, this.closed
);
887 // compose new editions array from the new one and the second half of this
888 int[][] mushup
= new int[ed
.editions
.length
+ this.editions
.length
- midpoint
][3]; // not +1 after midpoint to not include the midpoint, which was included in the new Editions.
889 for (int i
=0; i
<=midpoint
/* same as i<ed.editions.length*/; i
++) {
890 mushup
[midpoint
-i
] = ed
.editions
[i
];
892 for (int i
=midpoint
+1; i
<this.editions
.length
; i
++) {
893 mushup
[i
] = this.editions
[i
];
895 // Levenshtein's distance should be recomputed ... but I don't know how, considering the above mush up
898 ed
.distance
= this.distance
;
899 ed
.editions
= mushup
;