1 /* Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
3 Permission is hereby granted, free of charge, to any person obtaining
4 a copy of this software and associated documentation files (the
5 "Software"), to deal in the Software without restriction, including
6 without limitation the rights to use, copy, modify, merge, publish,
7 distribute, sublicense, and/or sell copies of the Software, and to
8 permit persons to whom the Software is furnished to do so, subject to
9 the following conditions:
11 The above copyright notice and this permission notice shall be
12 included in all copies or substantial portions of the Software.
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
22 // Common registers are assigned as follows:
26 // t0 Const Tbl Ptr TPtr
27 // t1 Round Constant TRound
28 // t4 Block residual LenResid
29 // t5 Residual Data DTmp
31 // {in,out}0 Block 0 Cycle RotateM0
32 // {in,out}1 Block Value 12 M12
33 // {in,out}2 Block Value 8 M8
34 // {in,out}3 Block Value 4 M4
35 // {in,out}4 Block Value 0 M0
36 // {in,out}5 Block 1 Cycle RotateM1
37 // {in,out}6 Block Value 13 M13
38 // {in,out}7 Block Value 9 M9
39 // {in,out}8 Block Value 5 M5
40 // {in,out}9 Block Value 1 M1
41 // {in,out}10 Block 2 Cycle RotateM2
42 // {in,out}11 Block Value 14 M14
43 // {in,out}12 Block Value 10 M10
44 // {in,out}13 Block Value 6 M6
45 // {in,out}14 Block Value 2 M2
46 // {in,out}15 Block 3 Cycle RotateM3
47 // {in,out}16 Block Value 15 M15
48 // {in,out}17 Block Value 11 M11
49 // {in,out}18 Block Value 7 M7
50 // {in,out}19 Block Value 3 M3
51 // {in,out}20 Scratch Z
52 // {in,out}21 Scratch Y
53 // {in,out}22 Scratch X
54 // {in,out}23 Scratch W
55 // {in,out}24 Digest A A
56 // {in,out}25 Digest B B
57 // {in,out}26 Digest C C
58 // {in,out}27 Digest D D
59 // {in,out}28 Active Data Ptr DPtr
61 // out28 Dummy Value -
62 // bt0 Coroutine Link QUICK_RTN
64 /// These predicates are used for computing the padding block(s) and
65 /// are shared between the driver and digest co-routines
67 // pt0 Extra Pad Block pExtra
68 // pt1 Load next word pLoad
69 // pt2 Skip next word pSkip
70 // pt3 Search for Pad pNoPad
71 // pt4 Pad Word 0 pPad0
72 // pt5 Pad Word 1 pPad1
73 // pt6 Pad Word 2 pPad2
74 // pt7 Pad Word 3 pPad3
// Aliases for the four "block cycle" rotating registers. The names with a
// trailing underscore are the out-register view (out0/out5/out10/out15);
// the names without it are the in-register view of the same slot numbers,
// matching the {in,out}N register table above.
111 #define RotateM0_ out0
112 #define RotateM1_ out5
113 #define RotateM2_ out10
114 #define RotateM3_ out15
// NOTE(review): RotateM0 and RotateM1 are presumably defined next to these
// two on lines that are not visible in this excerpt -- confirm.
143 #define RotateM2 in10
144 #define RotateM3 in15
150 /* register stack configuration for md5_block_asm_data_order(): */
156 /* register stack configuration for helpers: */
// The helper routines receive as inputs exactly the registers that the
// driver allocates as outputs (MD5_NOUT).
157 #define _NINPUTS MD5_NOUT
160 #define _NROTATE 24 /* this must be <= _NINPUTS */
// NOTE(review): the body of this conditional is not visible in this
// excerpt; presumably it selects how ADDP forms pointers for the HP-UX
// 32-bit data model -- confirm.
162 #if defined(_HPUX_SOURCE) && !defined(_LP64)
// Flag a big-endian host (HP-UX, or B_ENDIAN defined). The driver tests
// HOST_IS_BIG_ENDIAN to bracket its data accesses with rum/sum psr.be.
168 #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
169 #define HOST_IS_BIG_ENDIAN
172 // Macros for getting the left and right portions of little-endian words
// when the input starts `align` bytes (non-zero) into a 32-bit word:
//   GETLW  dep.z  -- takes the low 8*align bits of src and deposits them at
//                    bit 32 - 8*align of dst; every other bit of dst is 0.
//   GETRW  extr.u -- extracts, zero-extended, the 32 - 8*align bits of src
//                    that start at bit 8*align.
// OR-ing GETRW of one loaded word with GETLW of the following loaded word
// rebuilds one unaligned 32-bit input word.
174 #define GETLW(dst, src, align) dep.z dst = src, 32 - 8 * align, 8 * align
175 #define GETRW(dst, src, align) extr.u dst = src, 8 * align, 32 - 8 * align
179 // Reads an input block, then calls the digest block
180 // subroutine and adds the results to the accumulated
181 // digest. It allocates 32 outs which the subroutine
182 // uses as its inputs and rotating
183 // registers. Initializes the round constant pointer and
184 // takes care of saving/restoring ar.lc
188 // in0 Context Ptr CtxPtr0
189 // in1 Input Data Ptr DPtrIn
190 // in2 Integral Blocks BlockCount
191 // rp Return Address -
195 // v2 Input Align InAlign
196 // t0 Shared w/digest -
197 // t1 Shared w/digest -
198 // t2 Shared w/digest -
199 // t3 Shared w/digest -
200 // t4 Shared w/digest -
201 // t5 Shared w/digest -
202 // t6 PFS Save PFSSave
203 // t7 ar.lc Save LCSave
204 // t8 Saved PR PRSave
205 // t9 2nd CtxPtr CtxPtr1
206 // t10 Table Base CTable
207 // t11 Table[0] CTable0
208 // t13 Accumulator A AccumA
209 // t14 Accumulator B AccumB
210 // t15 Accumulator C AccumC
211 // t16 Accumulator D AccumD
212 // pt0 Shared w/digest -
213 // pt1 Shared w/digest -
214 // pt2 Shared w/digest -
215 // pt3 Shared w/digest -
216 // pt4 Shared w/digest -
217 // pt5 Shared w/digest -
218 // pt6 Shared w/digest -
219 // pt7 Shared w/digest -
220 // pt8 Not Aligned pOff
221 // pt8 Blocks Left pAgain
// Number of blocks still to be processed. It aliases the third input
// argument (in2) and is decremented once per block by the driver loop.
232 #define BlockCount in2
242 /* md5_block_asm_data_order(MD5_CTX *c, const void *data, size_t num)
245 c: a pointer to a structure of this type:
247 typedef struct MD5state_st
251 MD5_LONG data[MD5_LBLOCK];
256 data: a pointer to the input data (may be misaligned)
257 num: the number of 64-byte blocks (16 32-bit words each) to hash (i.e., the length
// md5_block_asm_data_order: driver routine.
// Loads the four digest words from the context into AccumA..AccumD, hands
// each input block to the digest co-routine, adds the returned A/B/C/D
// values into the accumulators and finally stores them back to the context.
// Word-aligned input is handled here; misaligned input branches to
// .md5_unaligned.
262 .type md5_block_asm_data_order, @function
263 .global md5_block_asm_data_order
265 .proc md5_block_asm_data_order
266 md5_block_asm_data_order:
// Allocate the register frame; the previous ar.pfs is kept in PFSSave.
270 .save ar.pfs, PFSSave
271 alloc PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT
// Second context pointer, 8 bytes in: CtxPtr0 is used for A and B,
// CtxPtr1 for C and D (see the loads and stores below).
272 ADDP CtxPtr1 = 8, CtxPtr0
// NOTE(review): ADDP is defined outside this excerpt; adding 0 presumably
// only normalises the incoming pointers -- confirm.
276 ADDP DPtrIn = 0, DPtrIn
277 ADDP CtxPtr0 = 0, CtxPtr0
// Add the distance from .md5_block to the constant table to CTable.
// NOTE(review): the instruction that seeds CTable is not visible here.
283 add CTable = .md5_tbl_data_order#-.md5_block#, CTable
// InAlign = byte offset of the input within its 32-bit word (0..3).
284 and InAlign = 0x3, DPtrIn
// Fetch A and C, post-incrementing both pointers by 4 so that they then
// address B and D.
288 ld4 AccumA = [CtxPtr0], 4
289 ld4 AccumC = [CtxPtr1], 4
296 ld4 AccumB = [CtxPtr0]
297 ld4 AccumD = [CtxPtr1]
// DPtr_ = DPtrIn with its two low bits cleared (word-aligned read pointer).
298 dep DPtr_ = 0, DPtrIn, 0, 2
300 #ifdef HOST_IS_BIG_ENDIAN
301 rum psr.be;; // switch to little-endian
// Preload the first table entry; CTable is post-incremented by 4.
304 ld4 CTable0 = [CTable], 4
// Any non-zero misalignment is handled by the .md5_unaligned path.
305 cmp.ne pOff, p0 = 0, InAlign
306 (pOff) br.cond.spnt.many .md5_unaligned
309 // The FF load/compute loop rotates values three times, so that
310 // loading into M12 here produces the M0 value, M13 -> M1, etc.
314 ld4 M12_ = [DPtr_], 4
319 ld4 M13_ = [DPtr_], 4
324 ld4 M14_ = [DPtr_], 4
329 ld4 M15_ = [DPtr_], 4
// One block fewer to go; run the digest co-routine, which returns through
// the QUICK_RTN link.
330 add BlockCount = -1, BlockCount
331 br.call.sptk.many QUICK_RTN = md5_digest_block0
334 // Now, we add the new digest values and do some clean-up
335 // before checking if there's another full block to process
338 add AccumA = AccumA, A_
339 add AccumB = AccumB, B_
// pAgain is set while blocks remain.
340 cmp.ne pAgain, p0 = 0, BlockCount
343 add AccumC = AccumC, C_
344 add AccumD = AccumD, D_
345 (pAgain) br.cond.dptk.many .md5_block_loop0
349 #ifdef HOST_IS_BIG_ENDIAN
350 sum psr.be;; // switch back to big-endian mode
// Write the accumulators back. CtxPtr0/CtxPtr1 still address B and D, so
// those are stored first and the pointers stepped back by 4 to reach A and C.
353 st4 [CtxPtr0] = AccumB, -4
354 st4 [CtxPtr1] = AccumD, -4
// Restore the predicate registers from PRSave.
// NOTE(review): the matching save is not visible in this excerpt.
355 mov pr = PRSave, 0x1ffff ;;
358 st4 [CtxPtr0] = AccumA
359 st4 [CtxPtr1] = AccumC
// MD5UNALIGNED(offset): block loop for input that starts `offset` bytes
// into a 32-bit word.
//   - At .md5_process##offset, DTmp (the first loaded word) is reduced to
//     its right part with GETRW.
//   - Inside .md5_block_loop##offset each newly loaded word supplies the
//     missing left part (GETLW), which is OR-ed with DTmp to form a block
//     word; the right part of that same word becomes the next DTmp.
//   - The block is digested by md5_digest_block##offset, the returned
//     A/B/C/D are added to the accumulators, and the loop repeats while
//     BlockCount is non-zero before branching to .md5_exit.
// TPtr and TRound are re-seeded from CTable/CTable0 for every block.
367 #define MD5UNALIGNED(offset) \
368 .md5_process##offset: \
371 GETRW(DTmp, DTmp, offset) ; \
373 .md5_block_loop##offset: \
375 ld4 Y_ = [DPtr_], 4 ; \
376 mov TPtr = CTable ; \
377 mov TRound = CTable0 ; \
380 ld4 M13_ = [DPtr_], 4 ; \
385 ld4 M14_ = [DPtr_], 4 ; \
386 GETLW(W_, Y_, offset) ; \
391 or M12_ = W_, DTmp ; \
392 GETRW(DTmp, Y_, offset) ; \
395 ld4 M15_ = [DPtr_], 4 ; \
396 add BlockCount = -1, BlockCount ; \
397 br.call.sptk.many QUICK_RTN = md5_digest_block##offset; \
400 add AccumA = AccumA, A_ ; \
401 add AccumB = AccumB, B_ ; \
402 cmp.ne pAgain, p0 = 0, BlockCount ; \
405 add AccumC = AccumC, C_ ; \
406 add AccumD = AccumD, D_ ; \
407 (pAgain) br.cond.dptk.many .md5_block_loop##offset ; \
412 br.cond.sptk.many .md5_exit ; \
418 // Because variable shifts are expensive, we special case each of
419 // the four alignments. In practice, this won't hurt too much
420 // since only one working set of code will be loaded.
// Fetch the first (partial) word into DTmp, then dispatch on the byte
// misalignment held in InAlign.
423 ld4 DTmp = [DPtr_], 4
424 cmp.eq pOff, p0 = 1, InAlign
425 (pOff) br.cond.dpnt.many .md5_process1
428 cmp.eq pOff, p0 = 2, InAlign
430 (pOff) br.cond.dpnt.many .md5_process2
// NOTE(review): the remaining case (InAlign == 3) presumably falls through
// to code that is not visible in this excerpt -- confirm.
436 .endp md5_block_asm_data_order
439 // MD5 Perform the F function and load
441 // Passed the first 4 words (M0 - M3) and initial (A, B, C, D) values,
442 // computes the FF() round of functions, then branches to the common
443 // digest code to finish up with GG(), HH(), and II().
447 // rp Return Address -
451 // v0 PFS bit bucket PFS
452 // v1 Loop Trip Count LTrip
453 // pt0 Load next word pMore
460 /* For GHI rounds: */
// COMPUTE(a, b, s, M, R): visible part of one round step.
//   - loads the next round constant into TRound (TPtr post-incremented by 4)
//   - dep.z builds Y with the low 32 bits of Z in its upper half
//   - shrp shifts the 128-bit pair Z:Y right by 64 - s, which leaves the
//     32-bit left-rotation of Z by s in the low word of Z
// NOTE(review): the lines of this macro that use a, b, M and R are not
// visible in this excerpt.
465 #define COMPUTE(a, b, s, M, R) \
468 ld4 TRound = [TPtr], 4 ; \
469 dep.z Y = Z, 32, 32 ;; \
470 shrp Z = Z, Y, 64 - s ; \
// LOOP(a, b, s, M, R, label): the same visible step as COMPUTE, closed by a
// br.ctop loop branch back to `label`.
479 #define LOOP(a, b, s, M, R, label) \
481 ld4 TRound = [TPtr], 4 ; \
482 dep.z Y = Z, 32, 32 ;; \
483 shrp Z = Z, Y, 64 - s ; \
488 br.ctop.sptk.many label ; \
491 // G(B, C, D) = (B & D) | (C & ~D)
// Visible first step: Z = M + TRound (message word plus round constant).
// The remaining lines of the macro are not visible in this excerpt.
493 #define G(a, b, c, d, M) \
495 add Z = M, TRound ; \
505 // H(B, C, D) = B ^ C ^ D
// Visible first step: Z = M + TRound, as in G.
507 #define H(a, b, c, d, M) \
509 add Z = M, TRound ; \
519 // I(B, C, D) = C ^ (B | ~D)
521 // However, since we have an andcm operator, we use the fact that
525 // to rewrite the expression as
527 // I(B, C, D) = ~C ^ (~B & D)
// The rewrite follows from X ^ Y == ~X ^ ~Y together with
// ~(B | ~D) == (~B & D).
// Visible first step: Z = M + TRound, as in G and H.
529 #define I(a, b, c, d, M) \
531 add Z = M, TRound ; \
543 COMPUTE(A, B, 5, M0, RotateM0) \
545 COMPUTE(D, A, 9, M1, RotateM1) \
547 COMPUTE(C, D, 14, M2, RotateM2) \
549 LOOP(B, C, 20, M3, RotateM3, label)
// The four steps above use rotate counts 5, 9, 14 and 20; the closing LOOP
// branches back to `label`.
// NOTE(review): presumably the body of the GG round macro; its #define line
// and the interleaved G() invocations are not visible in this excerpt.
553 COMPUTE(A, B, 4, M0, RotateM0) \
555 COMPUTE(D, A, 11, M1, RotateM1) \
557 COMPUTE(C, D, 16, M2, RotateM2) \
559 LOOP(B, C, 23, M3, RotateM3, label)
// The four steps above use rotate counts 4, 11, 16 and 23.
// NOTE(review): presumably the HH round macro body -- header not visible.
563 COMPUTE(A, B, 6, M0, RotateM0) \
565 COMPUTE(D, A, 10, M1, RotateM1) \
567 COMPUTE(C, D, 15, M2, RotateM2) \
569 LOOP(B, C, 21, M3, RotateM3, label)
// The four steps above use rotate counts 6, 10, 15 and 21.
// NOTE(review): presumably the II round macro body -- header not visible.
// FFLOAD(a, b, c, d, M, N, s): one FF step that, while pMore is set, also
// loads the next input word into N from DPtr (post-incremented by 4).
// Visible work: Z = M + TRound, fetch of the next round constant, and the
// dep.z/shrp pair that rotates the low 32 bits of Z left by s.
// FFLOOP(a, b, c, d, M, N, s, dest): the same step followed by the loop
// control: pMore = (LTrip != 0), LTrip is decremented, and br.ctop branches
// to dest.
// NOTE(review): the lines that apply a, b, c and d are not visible in this
// excerpt.
571 #define FFLOAD(a, b, c, d, M, N, s) \
573 (pMore) ld4 N = [DPtr], 4 ; \
574 add Z = M, TRound ; \
583 ld4 TRound = [TPtr], 4 ; \
585 dep.z Y = Z, 32, 32 ; \
589 shrp Z = Z, Y, 64 - s ;; \
593 #define FFLOOP(a, b, c, d, M, N, s, dest) \
595 (pMore) ld4 N = [DPtr], 4 ; \
596 add Z = M, TRound ; \
605 ld4 TRound = [TPtr], 4 ; \
607 dep.z Y = Z, 32, 32 ; \
611 shrp Z = Z, Y, 64 - s ;; \
615 cmp.ne pMore, p0 = 0, LTrip ; \
616 add LTrip = -1, LTrip ; \
617 br.ctop.dptk.many dest ; \
// md5_digest_block0: FF round for word-aligned input. It runs the
// FFLOAD/FFLOOP steps and then falls through into md5_digest_GHI.
620 .type md5_digest_block0, @function
623 .proc md5_digest_block0
// Allocate the helper register frame (see the _NINPUTS/_NROTATE settings).
629 alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
// r0 == r0 always holds, so this sets pMore (input loads enabled).
634 cmp.eq pMore, p0 = r0, r0
// Four FF steps with rotate counts 7, 12, 17 and 22; the FFLOOP step
// branches back to .md5_FF_round0 (label not visible in this excerpt).
640 FFLOAD(A, B, C, D, M12, RotateM0, 7)
641 FFLOAD(D, A, B, C, M13, RotateM1, 12)
642 FFLOAD(C, D, A, B, M14, RotateM2, 17)
643 FFLOOP(B, C, D, A, M15, RotateM3, 22, .md5_FF_round0)
645 // !!! Fall through to md5_digest_GHI
647 .endp md5_digest_block0
// md5_digest_GHI: common tail of the digest co-routine. It is reached by
// fall-through from md5_digest_block0 and by branch from the unaligned
// md5_digest_block variants, and it returns to the driver via QUICK_RTN.
649 .type md5_digest_GHI, @function
// There is no alloc on this entry path, so .regstk declares the register
// frame layout to the assembler.
654 .regstk _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
659 // The following sequence shuffles the block constants round for the
662 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
663 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
712 // The following sequence shuffles the block constants round for the
715 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
716 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
765 // The following sequence shuffles the block constants round for the
768 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
769 // 0 7 14 5 12 3 10 1 8 15 6 13 4 11 2 9
// Return to the driver through the co-routine link register.
821 br.ret.sptk.many QUICK_RTN
// FFLOADU / FFLOOPU: FF steps for misaligned input. They perform the same
// visible work as FFLOAD / FFLOOP (predicated load of the next raw word
// into N, Z = M + TRound, next round constant, rotate of Z by s) and in
// addition realign the pending word P by `offset` bytes:
//   W    = GETLW(P)   left part taken from the raw word in P
//   W   |= DTmp       merged with the right part carried over in DTmp
//   DTmp = GETRW(P)   right part of P, carried to the next step
//   P    = W          P now holds the realigned word
// In FFLOOPU these realignment steps are predicated on pMore, and the macro
// ends with the loop control (pMore = (LTrip != 0), LTrip decremented,
// br.ctop to .md5_FF_round##offset).
// NOTE(review): FFLOADU shows only the GETLW and GETRW steps in this
// excerpt; the merge and move are presumably on lines not visible here.
826 #define FFLOADU(a, b, c, d, M, P, N, s, offset) \
828 (pMore) ld4 N = [DPtr], 4 ; \
829 add Z = M, TRound ; \
838 ld4 TRound = [TPtr], 4 ; \
839 GETLW(W, P, offset) ; \
844 dep.z Y = Z, 32, 32 ;; \
845 shrp Z = Z, Y, 64 - s ; \
849 GETRW(DTmp, P, offset) ; \
853 #define FFLOOPU(a, b, c, d, M, P, N, s, offset) \
855 (pMore) ld4 N = [DPtr], 4 ; \
856 add Z = M, TRound ; \
865 ld4 TRound = [TPtr], 4 ; \
866 (pMore) GETLW(W, P, offset) ; \
870 (pMore) or W = W, DTmp ; \
871 dep.z Y = Z, 32, 32 ;; \
872 shrp Z = Z, Y, 64 - s ; \
876 (pMore) GETRW(DTmp, P, offset) ; \
877 (pMore) mov P = W ; \
880 cmp.ne pMore, p0 = 0, LTrip ; \
881 add LTrip = -1, LTrip ; \
882 br.ctop.sptk.many .md5_FF_round##offset ; \
// MD5FBLOCK(offset): emits the procedure md5_digest_block##offset, the FF
// round for input misaligned by `offset` bytes. The procedure allocates the
// helper register frame, sets pMore (r0 == r0 is always true), runs three
// FFLOADU steps and one FFLOOPU step (rotate counts 7, 12, 17, 22) starting
// at .md5_FF_round##offset, and then branches to md5_digest_GHI.
// The .pred.rel directive tells the assembler that pLoad and pSkip are
// mutually exclusive.
885 #define MD5FBLOCK(offset) \
886 .type md5_digest_block##offset, @function ; \
889 .proc md5_digest_block##offset ; \
893 md5_digest_block##offset: \
895 alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE ; \
900 cmp.eq pMore, p0 = r0, r0 ; \
905 .pred.rel "mutex", pLoad, pSkip ; \
906 .md5_FF_round##offset: \
907 FFLOADU(A, B, C, D, M12, M13, RotateM0, 7, offset) \
908 FFLOADU(D, A, B, C, M13, M14, RotateM1, 12, offset) \
909 FFLOADU(C, D, A, B, M14, M15, RotateM2, 17, offset) \
910 FFLOOPU(B, C, D, A, M15, RotateM0, RotateM3, 22, offset) \
915 br.cond.sptk.many md5_digest_GHI ; \
917 .endp md5_digest_block##offset
// md5_constants: table of the 64 round constants, 4 bytes each (256 bytes
// in total, see the .size directive at the end). Every entry is written
// byte by byte, least significant byte first, so the memory image does not
// depend on the byte order used for multi-byte data directives; entry 0 is
// the value 0xd76aa478. The trailing comment on each line is the entry
// index. The driver reaches the table through .md5_tbl_data_order.
924 .type md5_constants, @object
926 .md5_tbl_data_order: // To ensure little-endian data
927 // order, code as bytes.
928 data1 0x78, 0xa4, 0x6a, 0xd7 // 0
929 data1 0x56, 0xb7, 0xc7, 0xe8 // 1
930 data1 0xdb, 0x70, 0x20, 0x24 // 2
931 data1 0xee, 0xce, 0xbd, 0xc1 // 3
932 data1 0xaf, 0x0f, 0x7c, 0xf5 // 4
933 data1 0x2a, 0xc6, 0x87, 0x47 // 5
934 data1 0x13, 0x46, 0x30, 0xa8 // 6
935 data1 0x01, 0x95, 0x46, 0xfd // 7
936 data1 0xd8, 0x98, 0x80, 0x69 // 8
937 data1 0xaf, 0xf7, 0x44, 0x8b // 9
938 data1 0xb1, 0x5b, 0xff, 0xff // 10
939 data1 0xbe, 0xd7, 0x5c, 0x89 // 11
940 data1 0x22, 0x11, 0x90, 0x6b // 12
941 data1 0x93, 0x71, 0x98, 0xfd // 13
942 data1 0x8e, 0x43, 0x79, 0xa6 // 14
943 data1 0x21, 0x08, 0xb4, 0x49 // 15
944 data1 0x62, 0x25, 0x1e, 0xf6 // 16
945 data1 0x40, 0xb3, 0x40, 0xc0 // 17
946 data1 0x51, 0x5a, 0x5e, 0x26 // 18
947 data1 0xaa, 0xc7, 0xb6, 0xe9 // 19
948 data1 0x5d, 0x10, 0x2f, 0xd6 // 20
949 data1 0x53, 0x14, 0x44, 0x02 // 21
950 data1 0x81, 0xe6, 0xa1, 0xd8 // 22
951 data1 0xc8, 0xfb, 0xd3, 0xe7 // 23
952 data1 0xe6, 0xcd, 0xe1, 0x21 // 24
953 data1 0xd6, 0x07, 0x37, 0xc3 // 25
954 data1 0x87, 0x0d, 0xd5, 0xf4 // 26
955 data1 0xed, 0x14, 0x5a, 0x45 // 27
956 data1 0x05, 0xe9, 0xe3, 0xa9 // 28
957 data1 0xf8, 0xa3, 0xef, 0xfc // 29
958 data1 0xd9, 0x02, 0x6f, 0x67 // 30
959 data1 0x8a, 0x4c, 0x2a, 0x8d // 31
960 data1 0x42, 0x39, 0xfa, 0xff // 32
961 data1 0x81, 0xf6, 0x71, 0x87 // 33
962 data1 0x22, 0x61, 0x9d, 0x6d // 34
963 data1 0x0c, 0x38, 0xe5, 0xfd // 35
964 data1 0x44, 0xea, 0xbe, 0xa4 // 36
965 data1 0xa9, 0xcf, 0xde, 0x4b // 37
966 data1 0x60, 0x4b, 0xbb, 0xf6 // 38
967 data1 0x70, 0xbc, 0xbf, 0xbe // 39
968 data1 0xc6, 0x7e, 0x9b, 0x28 // 40
969 data1 0xfa, 0x27, 0xa1, 0xea // 41
970 data1 0x85, 0x30, 0xef, 0xd4 // 42
971 data1 0x05, 0x1d, 0x88, 0x04 // 43
972 data1 0x39, 0xd0, 0xd4, 0xd9 // 44
973 data1 0xe5, 0x99, 0xdb, 0xe6 // 45
974 data1 0xf8, 0x7c, 0xa2, 0x1f // 46
975 data1 0x65, 0x56, 0xac, 0xc4 // 47
976 data1 0x44, 0x22, 0x29, 0xf4 // 48
977 data1 0x97, 0xff, 0x2a, 0x43 // 49
978 data1 0xa7, 0x23, 0x94, 0xab // 50
979 data1 0x39, 0xa0, 0x93, 0xfc // 51
980 data1 0xc3, 0x59, 0x5b, 0x65 // 52
981 data1 0x92, 0xcc, 0x0c, 0x8f // 53
982 data1 0x7d, 0xf4, 0xef, 0xff // 54
983 data1 0xd1, 0x5d, 0x84, 0x85 // 55
984 data1 0x4f, 0x7e, 0xa8, 0x6f // 56
985 data1 0xe0, 0xe6, 0x2c, 0xfe // 57
986 data1 0x14, 0x43, 0x01, 0xa3 // 58
987 data1 0xa1, 0x11, 0x08, 0x4e // 59
988 data1 0x82, 0x7e, 0x53, 0xf7 // 60
989 data1 0x35, 0xf2, 0x3a, 0xbd // 61
990 data1 0xbb, 0xd2, 0xd7, 0x2a // 62
991 data1 0x91, 0xd3, 0x86, 0xeb // 63
992 .size md5_constants#,64*4