/ ***** BEGIN LICENSE BLOCK *****
/ Version: MPL 1.1/GPL 2.0/LGPL 2.1
/
/ The contents of this file are subject to the Mozilla Public License Version
/ 1.1 (the "License"); you may not use this file except in compliance with
/ the License. You may obtain a copy of the License at
/ http://www.mozilla.org/MPL/
/
/ Software distributed under the License is distributed on an "AS IS" basis,
/ WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
/ for the specific language governing rights and limitations under the
/ License.
/
/ The Original Code is the Solaris software cryptographic token.
/
/ The Initial Developer of the Original Code is
/ Sun Microsystems, Inc.
/ Portions created by the Initial Developer are Copyright (C) 2005
/ the Initial Developer. All Rights Reserved.
/
/ Contributor(s):
/   Sun Microsystems, Inc.
/
/ Alternatively, the contents of this file may be used under the terms of
/ either the GNU General Public License Version 2 or later (the "GPL"), or
/ the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
/ in which case the provisions of the GPL or the LGPL are applicable instead
/ of those above. If you wish to allow use of your version of this file only
/ under the terms of either the GPL or the LGPL, and not to allow others to
/ use your version of this file under the terms of the MPL, indicate your
/ decision by deleting the provisions above and replace them with the notice
/ and other provisions required by the GPL or the LGPL. If you do not delete
/ the provisions above, a recipient may use your version of this file under
/ the terms of any one of the MPL, the GPL or the LGPL.
/
/ ***** END LICENSE BLOCK ***** */
/ ------------------------------------------------------------------------
/
/  Implementation of s_mpv_mul_set_vec which exploits
/  the 64x64 -> 128-bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64-bit aligned.
/
/ uint64_t
/ s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
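/
/ For reference, a minimal C sketch (not part of the original source) of the
/ computation this routine performs; the name s_mpv_mul_set_vec64_ref is
/ hypothetical and the sketch assumes a compiler providing the GCC/Clang
/ unsigned __int128 extension:
/
/   #include <stdint.h>
/
/   uint64_t
/   s_mpv_mul_set_vec64_ref(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/   {
/       uint64_t cy = 0;                          /* running carry */
/       for (int i = 0; i < len; i++) {
/           /* 64x64 -> 128-bit multiply, plus the carry in */
/           unsigned __int128 p = (unsigned __int128)a[i] * digit + cy;
/           r[i] = (uint64_t)p;                   /* low 64 bits */
/           cy = (uint64_t)(p >> 64);             /* high 64 bits */
/       }
/       return (cy);
/   }
/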
        .text
        .align  16
        .globl  s_mpv_mul_set_vec64
        .type   s_mpv_mul_set_vec64, @function
s_mpv_mul_set_vec64:
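/ System V AMD64 ABI argument registers on entry:
/ %rdi = r, %rsi = a, %rdx = len, %rcx = digit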
        xorq    %rax, %rax              / if (len == 0) return (0)
        testq   %rdx, %rdx
        jz      .L17

        movq    %rdx, %r8               / Use r8 for len; %rdx is used by mul
        xorq    %r9, %r9                / cy = 0

.L15:
        cmpq    $8, %r8                 / len < 8 ?
        jb      .L16
        movq    0(%rsi), %rax           / rax = a[0]
        movq    8(%rsi), %r11           / prefetch a[1]
        mulq    %rcx                    / p = a[0] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    16(%rsi), %r11          / prefetch a[2]
        mulq    %rcx                    / p = a[1] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    24(%rsi), %r11          / prefetch a[3]
        mulq    %rcx                    / p = a[2] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    32(%rsi), %r11          / prefetch a[4]
        mulq    %rcx                    / p = a[3] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    40(%rsi), %r11          / prefetch a[5]
        mulq    %rcx                    / p = a[4] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    48(%rsi), %r11          / prefetch a[6]
        mulq    %rcx                    / p = a[5] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    56(%rsi), %r11          / prefetch a[7]
        mulq    %rcx                    / p = a[6] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        mulq    %rcx                    / p = a[7] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 56(%rdi)          / r[7] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        addq    $64, %rsi
        addq    $64, %rdi
        subq    $8, %r8

        jz      .L17
        jmp     .L15
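
/ Tail: multiply the remaining 1 to 7 limbs one at a time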
.L16:
        movq    0(%rsi), %rax
        mulq    %rcx                    / p = a[0] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    8(%rsi), %rax
        mulq    %rcx                    / p = a[1] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    16(%rsi), %rax
        mulq    %rcx                    / p = a[2] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    24(%rsi), %rax
        mulq    %rcx                    / p = a[3] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    32(%rsi), %rax
        mulq    %rcx                    / p = a[4] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    40(%rsi), %rax
        mulq    %rcx                    / p = a[5] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    48(%rsi), %rax
        mulq    %rcx                    / p = a[6] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17
.L17:
        movq    %r9, %rax
        ret
        .size   s_mpv_mul_set_vec64, [.-s_mpv_mul_set_vec64]

/ ------------------------------------------------------------------------
/
/  Implementation of s_mpv_mul_add_vec which exploits
/  the 64x64 -> 128-bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64-bit aligned.
/
/ uint64_t
/ s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
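/
/ For reference, a minimal C sketch (not part of the original source) of the
/ computation this routine performs; the name s_mpv_mul_add_vec64_ref is
/ hypothetical and the sketch assumes the GCC/Clang unsigned __int128
/ extension. The two add/adc pairs per limb in the assembly below correspond
/ to the "+ r[i]" and "+ cy" terms:
/
/   #include <stdint.h>
/
/   uint64_t
/   s_mpv_mul_add_vec64_ref(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/   {
/       uint64_t cy = 0;                          /* running carry */
/       for (int i = 0; i < len; i++) {
/           unsigned __int128 p =
/               (unsigned __int128)a[i] * digit + r[i] + cy;
/           r[i] = (uint64_t)p;                   /* low 64 bits */
/           cy = (uint64_t)(p >> 64);             /* high 64 bits */
/       }
/       return (cy);
/   }
/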
        .text
        .align  16
        .globl  s_mpv_mul_add_vec64
        .type   s_mpv_mul_add_vec64, @function
s_mpv_mul_add_vec64:
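/ Arguments arrive in the same System V AMD64 registers as above
/ (%rdi = r, %rsi = a, %rdx = len, %rcx = digit); here each limb of r[]
/ is loaded into %r10 before being updated, since r is both read and written.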
        xorq    %rax, %rax              / if (len == 0) return (0)
        testq   %rdx, %rdx
        jz      .L27

        movq    %rdx, %r8               / Use r8 for len; %rdx is used by mul
        xorq    %r9, %r9                / cy = 0

.L25:
        cmpq    $8, %r8                 / len < 8 ?
        jb      .L26
        movq    0(%rsi), %rax           / rax = a[0]
        movq    0(%rdi), %r10           / r10 = r[0]
        movq    8(%rsi), %r11           / prefetch a[1]
        mulq    %rcx                    / p = a[0] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[0]
        movq    8(%rdi), %r10           / prefetch r[1]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    16(%rsi), %r11          / prefetch a[2]
        mulq    %rcx                    / p = a[1] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[1]
        movq    16(%rdi), %r10          / prefetch r[2]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    24(%rsi), %r11          / prefetch a[3]
        mulq    %rcx                    / p = a[2] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[2]
        movq    24(%rdi), %r10          / prefetch r[3]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    32(%rsi), %r11          / prefetch a[4]
        mulq    %rcx                    / p = a[3] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[3]
        movq    32(%rdi), %r10          / prefetch r[4]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    40(%rsi), %r11          / prefetch a[5]
        mulq    %rcx                    / p = a[4] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[4]
        movq    40(%rdi), %r10          / prefetch r[5]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    48(%rsi), %r11          / prefetch a[6]
        mulq    %rcx                    / p = a[5] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[5]
        movq    48(%rdi), %r10          / prefetch r[6]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    56(%rsi), %r11          / prefetch a[7]
        mulq    %rcx                    / p = a[6] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[6]
        movq    56(%rdi), %r10          / prefetch r[7]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        mulq    %rcx                    / p = a[7] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[7]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 56(%rdi)          / r[7] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        addq    $64, %rsi
        addq    $64, %rdi
        subq    $8, %r8

        jz      .L27
        jmp     .L25
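
/ Tail: multiply-and-add the remaining 1 to 7 limbs one at a time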
.L26:
        movq    0(%rsi), %rax
        movq    0(%rdi), %r10
        mulq    %rcx                    / p = a[0] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[0]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    8(%rsi), %rax
        movq    8(%rdi), %r10
        mulq    %rcx                    / p = a[1] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[1]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    16(%rsi), %rax
        movq    16(%rdi), %r10
        mulq    %rcx                    / p = a[2] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[2]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    24(%rsi), %rax
        movq    24(%rdi), %r10
        mulq    %rcx                    / p = a[3] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[3]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    32(%rsi), %rax
        movq    32(%rdi), %r10
        mulq    %rcx                    / p = a[4] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[4]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    40(%rsi), %rax
        movq    40(%rdi), %r10
        mulq    %rcx                    / p = a[5] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[5]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    48(%rsi), %rax
        movq    48(%rdi), %r10
        mulq    %rcx                    / p = a[6] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[6]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27
.L27:
        movq    %r9, %rax
        ret
        .size   s_mpv_mul_add_vec64, [.-s_mpv_mul_add_vec64]