1 diff -urN SuperLU_MT_2.0.orig/make.inc SuperLU_MT_2.0/make.inc
2 --- SuperLU_MT_2.0.orig/make.inc 2010-10-24 15:32:48.004562348 +0200
3 +++ SuperLU_MT_2.0/make.inc 2010-10-24 15:41:18.877895682 +0200
6 # The machine (platform) identifier to append to the library names
12 # The name of the libraries to be created/linked to
14 # BLASLIB = -lblas -lxlf -lxlf90
15 # which may be slower than ESSL
17 -BLASDEF = -DUSE_VENDOR_BLAS
21 #BLASLIB = ../lib/libblas$(PLAT).a
29 -CFLAGS = -qarch=pwr5 -qalias=allptrs $(PREDEFS) -DPRNTlevel=0 -O3
31 -FFLAGS = -O3 -qarch=pwr5
33 -LOADOPTS = -bmaxdata:0x80000000
41 # C preprocessor defs for compilation for the Fortran interface
42 # (-DNoChange, -DAdd_, -DAdd__, or -DUpCase)
48 diff -urN SuperLU_MT_2.0.orig/SRC/cmatgen.c SuperLU_MT_2.0/SRC/cmatgen.c
49 --- SuperLU_MT_2.0.orig/SRC/cmatgen.c 2010-10-24 15:32:48.001229014 +0200
50 +++ SuperLU_MT_2.0/SRC/cmatgen.c 2010-10-24 16:48:08.347895679 +0200
55 -double dlaran_(int *iseed)
57 -/* -- LAPACK auxiliary routine (version 2.0) --
58 - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
59 - Courant Institute, Argonne National Lab, and Rice University
65 - DLARAN returns a random real number from a uniform (0,1)
71 - ISEED (input/output) INT array, dimension (4)
72 - On entry, the seed of the random number generator; the array
74 - elements must be between 0 and 4095, and ISEED(4) must be
76 - On exit, the seed is updated.
81 - This routine uses a multiplicative congruential method with modulus
82 - 2**48 and multiplier 33952834046453 (see G.S.Fishman,
83 - 'Multiplicative congruential random number generators with modulus
84 - 2**b: an exhaustive analysis for b = 32 and a partial analysis for
85 - b = 48', Math. Comp. 189, pp 331-344, 1990).
87 - 48-bit integers are stored in 4 integer array elements with 12 bits
88 - per element. Hence the routine is portable across machines with
89 - integers of 32 bits or more.
91 - =====================================================================
94 - /* Local variables */
95 - int it1, it2, it3, it4;
99 - /* multiply the seed by the multiplier modulo 2**48 */
100 - it4 = iseed[4] * 2549;
103 - it3 = it3 + iseed[3] * 2549 + iseed[4] * 2508;
106 - it2 = it2 + iseed[2] * 2549 + iseed[3] * 2508 + iseed[4] * 322;
109 - it1 = it1 + iseed[1] * 2549 + iseed[2] * 2508 + iseed[3] * 322 + iseed[4]
113 - /* return updated seed */
120 - /* convert 48-bit integer to a real number in the interval (0,1) */
122 - return ((double) it1 +
123 - ((double) it2 + ((double) it3 + (double) it4 * 2.44140625e-4) *
124 - 2.44140625e-4) * 2.44140625e-4) * 2.44140625e-4;
128 diff -urN SuperLU_MT_2.0.orig/SRC/cmyblas2.c SuperLU_MT_2.0/SRC/cmyblas2.c
129 --- SuperLU_MT_2.0.orig/SRC/cmyblas2.c 2010-10-24 15:32:48.001229014 +0200
130 +++ SuperLU_MT_2.0/SRC/cmyblas2.c 2010-10-24 16:43:36.411229012 +0200
131 @@ -181,3 +181,127 @@
136 + * Performs dense matrix-vector multiply with 2 vectors:
141 + int lda, /* leading dimension of A */
144 + complex *A, /* in - size m-by-n */
145 + complex *x0, /* in - size n-by-1 */
146 + complex *x1, /* in - size n-by-1 */
147 + complex *y0, /* out - size m-by-1 */
148 + complex *y1 /* out - size m-by-1 */
152 + complex v00, v10, v20, v30, v40, v50, v60, v70,
153 + v01, v11, v21, v31, v41, v51, v61, v71;
154 + complex t0, t1, t2, t3, t4, t5, t6, t7;
156 + complex *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
157 + register int firstcol = 0;
163 + while ( firstcol < n - 7 ) { /* Do 8 columns */
174 + v00 = x0[firstcol]; v01 = x1[firstcol++];
175 + v10 = x0[firstcol]; v11 = x1[firstcol++];
176 + v20 = x0[firstcol]; v21 = x1[firstcol++];
177 + v30 = x0[firstcol]; v31 = x1[firstcol++];
178 + v40 = x0[firstcol]; v41 = x1[firstcol++];
179 + v50 = x0[firstcol]; v51 = x1[firstcol++];
180 + v60 = x0[firstcol]; v61 = x1[firstcol++];
181 + v70 = x0[firstcol]; v71 = x1[firstcol++];
183 + for (k = 0; k < m; k++) {
186 + t0 = Mki0[k]; cc_mult(&temp, &v00, &t0);c_add(&f0,&f0,&temp);
187 + cc_mult(&temp,&v01,&t0);c_add(&f1,&f1,&temp);
188 + t1 = Mki1[k]; cc_mult(&temp,&v10,&t1);c_add(&f0,&f0,&temp);
189 + cc_mult(&temp,&v11,&t1);c_add(&f1,&f1,&temp);
190 + t2 = Mki2[k]; cc_mult(&temp,&v20,&t2);c_add(&f0,&f0,&temp);
191 + cc_mult(&temp,&v21,&t2);c_add(&f1,&f1,&temp);
192 + t3 = Mki3[k]; cc_mult(&temp,&v30,&t3);c_add(&f0,&f0,&temp);
193 + cc_mult(&temp,&v31,&t3);c_add(&f1,&f1,&temp);
194 + t4 = Mki4[k]; cc_mult(&temp,&v40,&t4);c_add(&f0,&f0,&temp);
195 + cc_mult(&temp,&v41,&t4);c_add(&f1,&f1,&temp);
196 + t5 = Mki5[k]; cc_mult(&temp,&v50,&t5);c_add(&f0,&f0,&temp);
197 + cc_mult(&temp,&v51,&t5);c_add(&f1,&f1,&temp);
198 + t6 = Mki6[k]; cc_mult(&temp,&v60,&t6);c_add(&f0,&f0,&temp);
199 + cc_mult(&temp,&v61,&t6);c_add(&f1,&f1,&temp);
200 + t7 = Mki7[k]; cc_mult(&temp,&v70,&t7);c_add(&f0,&f0,&temp);
201 + cc_mult(&temp,&v71,&t7);c_add(&f1,&f1,&temp);
209 + while ( firstcol < n - 3 ) { /* Do 4 columns */
215 + v00 = x0[firstcol]; v01 = x1[firstcol++];
216 + v10 = x0[firstcol]; v11 = x1[firstcol++];
217 + v20 = x0[firstcol]; v21 = x1[firstcol++];
218 + v30 = x0[firstcol]; v31 = x1[firstcol++];
220 + for (k = 0; k < m; k++) {
223 + t0 = Mki0[k]; cc_mult(&temp,&v00,&t0);c_add(&f0,&f0,&temp);
224 + cc_mult(&temp,&v01,&t0);c_add(&f1,&f1,&temp);
225 + t1 = Mki1[k]; cc_mult(&temp,&v10,&t1);c_add(&f0,&f0,&temp);
226 + cc_mult(&temp,&v11,&t1);c_add(&f1,&f1,&temp);
227 + t2 = Mki2[k]; cc_mult(&temp,&v20,&t2);c_add(&f0,&f0,&temp);
228 + cc_mult(&temp,&v21,&t2);c_add(&f1,&f1,&temp);
229 + t3 = Mki3[k]; cc_mult(&temp,&v30,&t3);c_add(&f0,&f0,&temp);
230 + cc_mult(&temp,&v31,&t3);c_add(&f1,&f1,&temp);
239 + while ( firstcol < n ) { /* Do 1 column */
241 + v00 = x0[firstcol]; v01 = x1[firstcol++];
243 + for (k = 0; k < m; k++) {
247 + cc_mult(&temp,&v00,&t0);c_add(&f0,&f0,&temp);
248 + cc_mult(&temp,&v01,&t0);c_add(&f1,&f1,&temp);
259 diff -urN SuperLU_MT_2.0.orig/SRC/smatgen.c SuperLU_MT_2.0/SRC/smatgen.c
260 --- SuperLU_MT_2.0.orig/SRC/smatgen.c 2010-10-24 15:32:47.997895679 +0200
261 +++ SuperLU_MT_2.0/SRC/smatgen.c 2010-10-24 16:48:08.347895679 +0200
266 -double dlaran_(int *iseed)
268 -/* -- LAPACK auxiliary routine (version 2.0) --
269 - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
270 - Courant Institute, Argonne National Lab, and Rice University
276 - DLARAN returns a random real number from a uniform (0,1)
282 - ISEED (input/output) INT array, dimension (4)
283 - On entry, the seed of the random number generator; the array
285 - elements must be between 0 and 4095, and ISEED(4) must be
287 - On exit, the seed is updated.
292 - This routine uses a multiplicative congruential method with modulus
293 - 2**48 and multiplier 33952834046453 (see G.S.Fishman,
294 - 'Multiplicative congruential random number generators with modulus
295 - 2**b: an exhaustive analysis for b = 32 and a partial analysis for
296 - b = 48', Math. Comp. 189, pp 331-344, 1990).
298 - 48-bit integers are stored in 4 integer array elements with 12 bits
299 - per element. Hence the routine is portable across machines with
300 - integers of 32 bits or more.
302 - =====================================================================
305 - /* Local variables */
306 - int it1, it2, it3, it4;
310 - /* multiply the seed by the multiplier modulo 2**48 */
311 - it4 = iseed[4] * 2549;
314 - it3 = it3 + iseed[3] * 2549 + iseed[4] * 2508;
317 - it2 = it2 + iseed[2] * 2549 + iseed[3] * 2508 + iseed[4] * 322;
320 - it1 = it1 + iseed[1] * 2549 + iseed[2] * 2508 + iseed[3] * 322 + iseed[4]
324 - /* return updated seed */
331 - /* convert 48-bit integer to a real number in the interval (0,1) */
333 - return ((double) it1 +
334 - ((double) it2 + ((double) it3 + (double) it4 * 2.44140625e-4) *
335 - 2.44140625e-4) * 2.44140625e-4) * 2.44140625e-4;
339 diff -urN SuperLU_MT_2.0.orig/SRC/xerbla.c SuperLU_MT_2.0/SRC/xerbla.c
340 --- SuperLU_MT_2.0.orig/SRC/xerbla.c 2010-10-24 15:32:48.001229014 +0200
341 +++ SuperLU_MT_2.0/SRC/xerbla.c 2010-10-24 16:48:08.351229012 +0200
344 /* Subroutine */ int xerbla_(char *srname, int *info)
346 /* -- LAPACK auxiliary routine (version 2.0) --
347 diff -urN SuperLU_MT_2.0.orig/SRC/zmatgen.c SuperLU_MT_2.0/SRC/zmatgen.c
348 --- SuperLU_MT_2.0.orig/SRC/zmatgen.c 2010-10-24 15:32:47.997895679 +0200
349 +++ SuperLU_MT_2.0/SRC/zmatgen.c 2010-10-24 16:48:08.351229012 +0200
354 -double dlaran_(int *iseed)
356 -/* -- LAPACK auxiliary routine (version 2.0) --
357 - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
358 - Courant Institute, Argonne National Lab, and Rice University
364 - DLARAN returns a random real number from a uniform (0,1)
370 - ISEED (input/output) INT array, dimension (4)
371 - On entry, the seed of the random number generator; the array
373 - elements must be between 0 and 4095, and ISEED(4) must be
375 - On exit, the seed is updated.
380 - This routine uses a multiplicative congruential method with modulus
381 - 2**48 and multiplier 33952834046453 (see G.S.Fishman,
382 - 'Multiplicative congruential random number generators with modulus
383 - 2**b: an exhaustive analysis for b = 32 and a partial analysis for
384 - b = 48', Math. Comp. 189, pp 331-344, 1990).
386 - 48-bit integers are stored in 4 integer array elements with 12 bits
387 - per element. Hence the routine is portable across machines with
388 - integers of 32 bits or more.
390 - =====================================================================
393 - /* Local variables */
394 - int it1, it2, it3, it4;
398 - /* multiply the seed by the multiplier modulo 2**48 */
399 - it4 = iseed[4] * 2549;
402 - it3 = it3 + iseed[3] * 2549 + iseed[4] * 2508;
405 - it2 = it2 + iseed[2] * 2549 + iseed[3] * 2508 + iseed[4] * 322;
408 - it1 = it1 + iseed[1] * 2549 + iseed[2] * 2508 + iseed[3] * 322 + iseed[4]
412 - /* return updated seed */
419 - /* convert 48-bit integer to a real number in the interval (0,1) */
421 - return ((double) it1 +
422 - ((double) it2 + ((double) it3 + (double) it4 * 2.44140625e-4) *
423 - 2.44140625e-4) * 2.44140625e-4) * 2.44140625e-4;
427 diff -urN SuperLU_MT_2.0.orig/SRC/zmyblas2.c SuperLU_MT_2.0/SRC/zmyblas2.c
428 --- SuperLU_MT_2.0.orig/SRC/zmyblas2.c 2010-10-24 15:32:48.001229014 +0200
429 +++ SuperLU_MT_2.0/SRC/zmyblas2.c 2010-10-24 16:45:39.021229012 +0200
430 @@ -181,3 +181,127 @@
435 + * Performs dense matrix-vector multiply with 2 vectors:
440 + int lda, /* leading dimension of A */
443 + doublecomplex *A, /* in - size m-by-n */
444 + doublecomplex *x0, /* in - size n-by-1 */
445 + doublecomplex *x1, /* in - size n-by-1 */
446 + doublecomplex *y0, /* out - size m-by-1 */
447 + doublecomplex *y1 /* out - size m-by-1 */
451 + doublecomplex v00, v10, v20, v30, v40, v50, v60, v70,
452 + v01, v11, v21, v31, v41, v51, v61, v71;
453 + doublecomplex t0, t1, t2, t3, t4, t5, t6, t7;
454 + doublecomplex f0, f1;
455 + doublecomplex *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
456 + register int firstcol = 0;
457 + doublecomplex *M0, temp;
462 + while ( firstcol < n - 7 ) { /* Do 8 columns */
473 + v00 = x0[firstcol]; v01 = x1[firstcol++];
474 + v10 = x0[firstcol]; v11 = x1[firstcol++];
475 + v20 = x0[firstcol]; v21 = x1[firstcol++];
476 + v30 = x0[firstcol]; v31 = x1[firstcol++];
477 + v40 = x0[firstcol]; v41 = x1[firstcol++];
478 + v50 = x0[firstcol]; v51 = x1[firstcol++];
479 + v60 = x0[firstcol]; v61 = x1[firstcol++];
480 + v70 = x0[firstcol]; v71 = x1[firstcol++];
482 + for (k = 0; k < m; k++) {
485 + t0 = Mki0[k]; zz_mult(&temp,&v00,&t0);z_add(&f0,&f0,&temp);
486 + zz_mult(&temp,&v01,&t0);z_add(&f1,&f1,&temp);
487 + t1 = Mki1[k]; zz_mult(&temp,&v10,&t1);z_add(&f0,&f0,&temp);
488 + zz_mult(&temp,&v11,&t1);z_add(&f1,&f1,&temp);
489 + t2 = Mki2[k]; zz_mult(&temp,&v20,&t2);z_add(&f0,&f0,&temp);
490 + zz_mult(&temp,&v21,&t2);z_add(&f1,&f1,&temp);
491 + t3 = Mki3[k]; zz_mult(&temp,&v30,&t3);z_add(&f0,&f0,&temp);
492 + zz_mult(&temp,&v31,&t3);z_add(&f1,&f1,&temp);
493 + t4 = Mki4[k]; zz_mult(&temp,&v40,&t4);z_add(&f0,&f0,&temp);
494 + zz_mult(&temp,&v41,&t4);z_add(&f1,&f1,&temp);
495 + t5 = Mki5[k]; zz_mult(&temp,&v50,&t5);z_add(&f0,&f0,&temp);
496 + zz_mult(&temp,&v51,&t5);z_add(&f1,&f1,&temp);
497 + t6 = Mki6[k]; zz_mult(&temp,&v60,&t6);z_add(&f0,&f0,&temp);
498 + zz_mult(&temp,&v61,&t6);z_add(&f1,&f1,&temp);
499 + t7 = Mki7[k]; zz_mult(&temp,&v70,&t7);z_add(&f0,&f0,&temp);
500 + zz_mult(&temp,&v71,&t7);z_add(&f1,&f1,&temp);
508 + while ( firstcol < n - 3 ) { /* Do 4 columns */
514 + v00 = x0[firstcol]; v01 = x1[firstcol++];
515 + v10 = x0[firstcol]; v11 = x1[firstcol++];
516 + v20 = x0[firstcol]; v21 = x1[firstcol++];
517 + v30 = x0[firstcol]; v31 = x1[firstcol++];
519 + for (k = 0; k < m; k++) {
522 + t0 = Mki0[k]; zz_mult(&temp,&v00,&t0);z_add(&f0,&f0,&temp);
523 + zz_mult(&temp,&v01,&t0);z_add(&f1,&f1,&temp);
524 + t1 = Mki1[k]; zz_mult(&temp,&v10,&t1);z_add(&f0,&f0,&temp);
525 + zz_mult(&temp,&v11,&t1);z_add(&f1,&f1,&temp);
526 + t2 = Mki2[k]; zz_mult(&temp,&v20,&t2);z_add(&f0,&f0,&temp);
527 + zz_mult(&temp,&v21,&t2);z_add(&f1,&f1,&temp);
528 + t3 = Mki3[k]; zz_mult(&temp,&v30,&t3);z_add(&f0,&f0,&temp);
529 + zz_mult(&temp,&v31,&t3);z_add(&f1,&f1,&temp);
538 + while ( firstcol < n ) { /* Do 1 column */
540 + v00 = x0[firstcol]; v01 = x1[firstcol++];
542 + for (k = 0; k < m; k++) {
546 + zz_mult(&temp,&v00,&t0);z_add(&f0,&f0,&temp);
547 + zz_mult(&temp,&v01,&t0);z_add(&f1,&f1,&temp);
558 diff -urN SuperLU_MT_2.0.orig/TESTING/MATGEN/clatb4.c SuperLU_MT_2.0/TESTING/MATGEN/clatb4.c
559 --- SuperLU_MT_2.0.orig/TESTING/MATGEN/clatb4.c 2010-10-24 15:32:47.994562344 +0200
560 +++ SuperLU_MT_2.0/TESTING/MATGEN/clatb4.c 2010-10-24 15:33:14.847895677 +0200
562 -lf2c -lm (in that order)
568 /* Table of constant values */
569 diff -urN SuperLU_MT_2.0.orig/TESTING/MATGEN/dlatb4.c SuperLU_MT_2.0/TESTING/MATGEN/dlatb4.c
570 --- SuperLU_MT_2.0.orig/TESTING/MATGEN/dlatb4.c 2010-10-24 15:32:47.994562344 +0200
571 +++ SuperLU_MT_2.0/TESTING/MATGEN/dlatb4.c 2010-10-24 15:33:14.851229012 +0200
573 -lf2c -lm (in that order)
579 /* Table of constant values */
580 diff -urN SuperLU_MT_2.0.orig/TESTING/MATGEN/slatb4.c SuperLU_MT_2.0/TESTING/MATGEN/slatb4.c
581 --- SuperLU_MT_2.0.orig/TESTING/MATGEN/slatb4.c 2010-10-24 15:32:47.994562344 +0200
582 +++ SuperLU_MT_2.0/TESTING/MATGEN/slatb4.c 2010-10-24 15:33:14.851229012 +0200
584 -lf2c -lm (in that order)
590 /* Table of constant values */
591 diff -urN SuperLU_MT_2.0.orig/TESTING/MATGEN/zlatb4.c SuperLU_MT_2.0/TESTING/MATGEN/zlatb4.c
592 --- SuperLU_MT_2.0.orig/TESTING/MATGEN/zlatb4.c 2010-10-24 15:32:47.994562344 +0200
593 +++ SuperLU_MT_2.0/TESTING/MATGEN/zlatb4.c 2010-10-24 15:33:14.854562347 +0200
595 -lf2c -lm (in that order)
601 /* Table of constant values */