3 * Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
5 * Licensed under the OpenSSL license (the "License"). You may not use
6 * this file except in compliance with the License. You can obtain a copy
7 * in the file LICENSE in the source distribution or at
8 * https://www.openssl.org/source/license.html
11 /* Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
13 Permission is hereby granted, free of charge, to any person obtaining
14 a copy of this software and associated documentation files (the
15 "Software"), to deal in the Software without restriction, including
16 without limitation the rights to use, copy, modify, merge, publish,
17 distribute, sublicense, and/or sell copies of the Software, and to
18 permit persons to whom the Software is furnished to do so, subject to
19 the following conditions:
21 The above copyright notice and this permission notice shall be
22 included in all copies or substantial portions of the Software.
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
28 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
29 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
30 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
32 // Common registers are assigned as follows:
36 // t0 Const Tbl Ptr TPtr
37 // t1 Round Constant TRound
38 // t4 Block residual LenResid
39 // t5 Residual Data DTmp
41 // {in,out}0 Block 0 Cycle RotateM0
42 // {in,out}1 Block Value 12 M12
43 // {in,out}2 Block Value 8 M8
44 // {in,out}3 Block Value 4 M4
45 // {in,out}4 Block Value 0 M0
46 // {in,out}5 Block 1 Cycle RotateM1
47 // {in,out}6 Block Value 13 M13
48 // {in,out}7 Block Value 9 M9
49 // {in,out}8 Block Value 5 M5
50 // {in,out}9 Block Value 1 M1
51 // {in,out}10 Block 2 Cycle RotateM2
52 // {in,out}11 Block Value 14 M14
53 // {in,out}12 Block Value 10 M10
54 // {in,out}13 Block Value 6 M6
55 // {in,out}14 Block Value 2 M2
56 // {in,out}15 Block 3 Cycle RotateM3
57 // {in,out}16 Block Value 15 M15
58 // {in,out}17 Block Value 11 M11
59 // {in,out}18 Block Value 7 M7
60 // {in,out}19 Block Value 3 M3
61 // {in,out}20 Scratch Z
62 // {in,out}21 Scratch Y
63 // {in,out}22 Scratch X
64 // {in,out}23 Scratch W
65 // {in,out}24 Digest A A
66 // {in,out}25 Digest B B
67 // {in,out}26 Digest C C
68 // {in,out}27 Digest D D
69 // {in,out}28 Active Data Ptr DPtr
71 // out28 Dummy Value -
72 // bt0 Coroutine Link QUICK_RTN
74 /// These predicates are used for computing the padding block(s) and
75 /// are shared between the driver and digest co-routines
77 // pt0 Extra Pad Block pExtra
78 // pt1 Load next word pLoad
79 // pt2 Skip next word pSkip
80 // pt3 Search for Pad pNoPad
81 // pt4 Pad Word 0 pPad0
82 // pt5 Pad Word 1 pPad1
83 // pt6 Pad Word 2 pPad2
84 // pt7 Pad Word 3 pPad3
121 #define RotateM0_ out0
122 #define RotateM1_ out5
123 #define RotateM2_ out10
124 #define RotateM3_ out15
153 #define RotateM2 in10
154 #define RotateM3 in15
160 /* register stack configuration for md5_block_asm_data_order(): */
166 /* register stack configuration for helpers: */
167 #define _NINPUTS MD5_NOUT
170 #define _NROTATE 24 /* this must be <= _NINPUTS */
172 #if defined(_HPUX_SOURCE) && !defined(_LP64)
178 #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
179 #define HOST_IS_BIG_ENDIAN
182 // Macros for getting the left and right portions of little-endian words
// "align" is the byte misalignment of the input data; the unaligned code
// paths in this file pass their per-alignment "offset" macro argument.
// GETLW: dep.z places the low (8 * align) bits of src at bit position
//        (32 - 8 * align) of dst and zeroes all other bits of dst.
// GETRW: extr.u takes the (32 - 8 * align)-bit field of src that starts
//        at bit (8 * align) and zero-extends it into dst.
// The unaligned paths "or" a GETLW result with a saved GETRW result
// (kept in DTmp) to build one 32-bit message word from two aligned loads.
184 #define GETLW(dst, src, align) dep.z dst = src, 32 - 8 * align, 8 * align
185 #define GETRW(dst, src, align) extr.u dst = src, 8 * align, 32 - 8 * align
189 // Reads an input block, then calls the digest block
190 // subroutine and adds the results to the accumulated
191 // digest. It allocates 32 outs which the subroutine
192 // uses as its inputs and rotating
193 // registers. Initializes the round constant pointer and
194 // takes care of saving/restoring ar.lc
198 // in0 Context Ptr CtxPtr0
199 // in1 Input Data Ptr DPtrIn
200 // in2 Integral Blocks BlockCount
201 // rp Return Address -
205 // v2 Input Align InAlign
206 // t0 Shared w/digest -
207 // t1 Shared w/digest -
208 // t2 Shared w/digest -
209 // t3 Shared w/digest -
210 // t4 Shared w/digest -
211 // t5 Shared w/digest -
212 // t6 PFS Save PFSSave
213 // t7 ar.lc Save LCSave
214 // t8 Saved PR PRSave
215 // t9 2nd CtxPtr CtxPtr1
216 // t10 Table Base CTable
217 // t11 Table[0] CTable0
218 // t13 Accumulator A AccumA
219 // t14 Accumulator B AccumB
220 // t15 Accumulator C AccumC
221 // t16 Accumulator D AccumD
222 // pt0 Shared w/digest -
223 // pt1 Shared w/digest -
224 // pt2 Shared w/digest -
225 // pt3 Shared w/digest -
226 // pt4 Shared w/digest -
227 // pt5 Shared w/digest -
228 // pt6 Shared w/digest -
229 // pt7 Shared w/digest -
230 // pt8 Not Aligned pOff
231 // pt8 Blocks Left pAgain
242 #define BlockCount in2
252 /* md5_block_asm_data_order(MD5_CTX *c, const void *data, size_t num)
255 c: a pointer to a structure of this type:
257 typedef struct MD5state_st
261 MD5_LONG data[MD5_LBLOCK];
266 data: a pointer to the input data (may be misaligned)
267 num: the number of 16-word (64-byte) blocks to hash (i.e., the length
272 .type md5_block_asm_data_order, @function
273 .global md5_block_asm_data_order
275 .proc md5_block_asm_data_order
276 md5_block_asm_data_order:
280 .save ar.pfs, PFSSave
281 alloc PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT
282 ADDP CtxPtr1 = 8, CtxPtr0
286 ADDP DPtrIn = 0, DPtrIn
287 ADDP CtxPtr0 = 0, CtxPtr0
293 add CTable = .md5_tbl_data_order#-.md5_block#, CTable
294 and InAlign = 0x3, DPtrIn
298 ld4 AccumA = [CtxPtr0], 4
299 ld4 AccumC = [CtxPtr1], 4
306 ld4 AccumB = [CtxPtr0]
307 ld4 AccumD = [CtxPtr1]
308 dep DPtr_ = 0, DPtrIn, 0, 2
310 #ifdef HOST_IS_BIG_ENDIAN
311 rum psr.be;; // switch to little-endian
314 ld4 CTable0 = [CTable], 4
315 cmp.ne pOff, p0 = 0, InAlign
316 (pOff) br.cond.spnt.many .md5_unaligned
319 // The FF load/compute loop rotates values three times, so that
320 // loading into M12 here produces the M0 value, M13 -> M1, etc.
324 ld4 M12_ = [DPtr_], 4
329 ld4 M13_ = [DPtr_], 4
334 ld4 M14_ = [DPtr_], 4
339 ld4 M15_ = [DPtr_], 4
340 add BlockCount = -1, BlockCount
341 br.call.sptk.many QUICK_RTN = md5_digest_block0
344 // Now, we add the new digest values and do some clean-up
345 // before checking if there's another full block to process
348 add AccumA = AccumA, A_
349 add AccumB = AccumB, B_
350 cmp.ne pAgain, p0 = 0, BlockCount
353 add AccumC = AccumC, C_
354 add AccumD = AccumD, D_
355 (pAgain) br.cond.dptk.many .md5_block_loop0
359 #ifdef HOST_IS_BIG_ENDIAN
360 sum psr.be;; // switch back to big-endian mode
363 st4 [CtxPtr0] = AccumB, -4
364 st4 [CtxPtr1] = AccumD, -4
365 mov pr = PRSave, 0x1ffff ;;
368 st4 [CtxPtr0] = AccumA
369 st4 [CtxPtr1] = AccumC
377 #define MD5UNALIGNED(offset) \
378 .md5_process##offset: \
381 GETRW(DTmp, DTmp, offset) ; \
383 .md5_block_loop##offset: \
385 ld4 Y_ = [DPtr_], 4 ; \
386 mov TPtr = CTable ; \
387 mov TRound = CTable0 ; \
390 ld4 M13_ = [DPtr_], 4 ; \
395 ld4 M14_ = [DPtr_], 4 ; \
396 GETLW(W_, Y_, offset) ; \
401 or M12_ = W_, DTmp ; \
402 GETRW(DTmp, Y_, offset) ; \
405 ld4 M15_ = [DPtr_], 4 ; \
406 add BlockCount = -1, BlockCount ; \
407 br.call.sptk.many QUICK_RTN = md5_digest_block##offset; \
410 add AccumA = AccumA, A_ ; \
411 add AccumB = AccumB, B_ ; \
412 cmp.ne pAgain, p0 = 0, BlockCount ; \
415 add AccumC = AccumC, C_ ; \
416 add AccumD = AccumD, D_ ; \
417 (pAgain) br.cond.dptk.many .md5_block_loop##offset ; \
422 br.cond.sptk.many .md5_exit ; \
428 // Because variable shifts are expensive, we special case each of
429 // the four alignments. In practice, this won't hurt too much
430 // since only one working set of code will be loaded.
433 ld4 DTmp = [DPtr_], 4
434 cmp.eq pOff, p0 = 1, InAlign
435 (pOff) br.cond.dpnt.many .md5_process1
438 cmp.eq pOff, p0 = 2, InAlign
440 (pOff) br.cond.dpnt.many .md5_process2
446 .endp md5_block_asm_data_order
449 // MD5 Perform the F function and load
451 // Passed the first 4 words (M0 - M3) and initial (A, B, C, D) values,
452 // computes the FF() round of functions, then branches to the common
453 // digest code to finish up with GG(), HH(), and II().
457 // rp Return Address -
461 // v0 PFS bit bucket PFS
462 // v1 Loop Trip Count LTrip
463 // pt0 Load next word pMore
470 /* For GHI rounds: */
475 #define COMPUTE(a, b, s, M, R) \
478 ld4 TRound = [TPtr], 4 ; \
479 dep.z Y = Z, 32, 32 ;; \
480 shrp Z = Z, Y, 64 - s ; \
489 #define LOOP(a, b, s, M, R, label) \
491 ld4 TRound = [TPtr], 4 ; \
492 dep.z Y = Z, 32, 32 ;; \
493 shrp Z = Z, Y, 64 - s ; \
498 br.ctop.sptk.many label ; \
501 // G(B, C, D) = (B & D) | (C & ~D)
503 #define G(a, b, c, d, M) \
505 add Z = M, TRound ; \
515 // H(B, C, D) = B ^ C ^ D
517 #define H(a, b, c, d, M) \
519 add Z = M, TRound ; \
529 // I(B, C, D) = C ^ (B | ~D)
531 // However, since we have an andcm operator, we use the fact that
535 // to rewrite the expression as
537 // I(B, C, D) = ~C ^ (~B & D)
539 #define I(a, b, c, d, M) \
541 add Z = M, TRound ; \
553 COMPUTE(A, B, 5, M0, RotateM0) \
555 COMPUTE(D, A, 9, M1, RotateM1) \
557 COMPUTE(C, D, 14, M2, RotateM2) \
559 LOOP(B, C, 20, M3, RotateM3, label)
563 COMPUTE(A, B, 4, M0, RotateM0) \
565 COMPUTE(D, A, 11, M1, RotateM1) \
567 COMPUTE(C, D, 16, M2, RotateM2) \
569 LOOP(B, C, 23, M3, RotateM3, label)
573 COMPUTE(A, B, 6, M0, RotateM0) \
575 COMPUTE(D, A, 10, M1, RotateM1) \
577 COMPUTE(C, D, 15, M2, RotateM2) \
579 LOOP(B, C, 21, M3, RotateM3, label)
581 #define FFLOAD(a, b, c, d, M, N, s) \
583 (pMore) ld4 N = [DPtr], 4 ; \
584 add Z = M, TRound ; \
593 ld4 TRound = [TPtr], 4 ; \
595 dep.z Y = Z, 32, 32 ; \
599 shrp Z = Z, Y, 64 - s ;; \
603 #define FFLOOP(a, b, c, d, M, N, s, dest) \
605 (pMore) ld4 N = [DPtr], 4 ; \
606 add Z = M, TRound ; \
615 ld4 TRound = [TPtr], 4 ; \
617 dep.z Y = Z, 32, 32 ; \
621 shrp Z = Z, Y, 64 - s ;; \
625 cmp.ne pMore, p0 = 0, LTrip ; \
626 add LTrip = -1, LTrip ; \
627 br.ctop.dptk.many dest ; \
630 .type md5_digest_block0, @function
633 .proc md5_digest_block0
639 alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
644 cmp.eq pMore, p0 = r0, r0
650 FFLOAD(A, B, C, D, M12, RotateM0, 7)
651 FFLOAD(D, A, B, C, M13, RotateM1, 12)
652 FFLOAD(C, D, A, B, M14, RotateM2, 17)
653 FFLOOP(B, C, D, A, M15, RotateM3, 22, .md5_FF_round0)
655 // !!! Fall through to md5_digest_GHI
657 .endp md5_digest_block0
659 .type md5_digest_GHI, @function
664 .regstk _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
669 // The following sequence shuffles the block constants round for the
672 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
673 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
722 // The following sequence shuffles the block constants round for the
725 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
726 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
775 // The following sequence shuffles the block constants round for the
778 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
779 // 0 7 14 5 12 3 10 1 8 15 6 13 4 11 2 9
831 br.ret.sptk.many QUICK_RTN
836 #define FFLOADU(a, b, c, d, M, P, N, s, offset) \
838 (pMore) ld4 N = [DPtr], 4 ; \
839 add Z = M, TRound ; \
848 ld4 TRound = [TPtr], 4 ; \
849 GETLW(W, P, offset) ; \
854 dep.z Y = Z, 32, 32 ;; \
855 shrp Z = Z, Y, 64 - s ; \
859 GETRW(DTmp, P, offset) ; \
863 #define FFLOOPU(a, b, c, d, M, P, N, s, offset) \
865 (pMore) ld4 N = [DPtr], 4 ; \
866 add Z = M, TRound ; \
875 ld4 TRound = [TPtr], 4 ; \
876 (pMore) GETLW(W, P, offset) ; \
880 (pMore) or W = W, DTmp ; \
881 dep.z Y = Z, 32, 32 ;; \
882 shrp Z = Z, Y, 64 - s ; \
886 (pMore) GETRW(DTmp, P, offset) ; \
887 (pMore) mov P = W ; \
890 cmp.ne pMore, p0 = 0, LTrip ; \
891 add LTrip = -1, LTrip ; \
892 br.ctop.sptk.many .md5_FF_round##offset ; \
895 #define MD5FBLOCK(offset) \
896 .type md5_digest_block##offset, @function ; \
899 .proc md5_digest_block##offset ; \
903 md5_digest_block##offset: \
905 alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE ; \
910 cmp.eq pMore, p0 = r0, r0 ; \
915 .pred.rel "mutex", pLoad, pSkip ; \
916 .md5_FF_round##offset: \
917 FFLOADU(A, B, C, D, M12, M13, RotateM0, 7, offset) \
918 FFLOADU(D, A, B, C, M13, M14, RotateM1, 12, offset) \
919 FFLOADU(C, D, A, B, M14, M15, RotateM2, 17, offset) \
920 FFLOOPU(B, C, D, A, M15, RotateM0, RotateM3, 22, offset) \
925 br.cond.sptk.many md5_digest_GHI ; \
927 .endp md5_digest_block##offset
934 .type md5_constants, @object
936 .md5_tbl_data_order: // To ensure little-endian data
937 // order, code as bytes.
// Table of the 64 MD5 additive round constants, 4 bytes per entry
// (64*4 bytes in total, matching the .size directive after the table).
// Each entry is written as four data1 bytes, least-significant byte
// first; e.g. entry 0 (0x78, 0xa4, 0x6a, 0xd7) is the word 0xd76aa478.
// The values match the T[1..64] table of RFC 1321, in the same order.
// The trailing "// n" on each row is the zero-based entry index.
// The code reads entries with ld4 and a post-increment of 4, starting
// from the address of this label held in CTable.
938 data1 0x78, 0xa4, 0x6a, 0xd7 // 0
939 data1 0x56, 0xb7, 0xc7, 0xe8 // 1
940 data1 0xdb, 0x70, 0x20, 0x24 // 2
941 data1 0xee, 0xce, 0xbd, 0xc1 // 3
942 data1 0xaf, 0x0f, 0x7c, 0xf5 // 4
943 data1 0x2a, 0xc6, 0x87, 0x47 // 5
944 data1 0x13, 0x46, 0x30, 0xa8 // 6
945 data1 0x01, 0x95, 0x46, 0xfd // 7
946 data1 0xd8, 0x98, 0x80, 0x69 // 8
947 data1 0xaf, 0xf7, 0x44, 0x8b // 9
948 data1 0xb1, 0x5b, 0xff, 0xff // 10
949 data1 0xbe, 0xd7, 0x5c, 0x89 // 11
950 data1 0x22, 0x11, 0x90, 0x6b // 12
951 data1 0x93, 0x71, 0x98, 0xfd // 13
952 data1 0x8e, 0x43, 0x79, 0xa6 // 14
953 data1 0x21, 0x08, 0xb4, 0x49 // 15
954 data1 0x62, 0x25, 0x1e, 0xf6 // 16
955 data1 0x40, 0xb3, 0x40, 0xc0 // 17
956 data1 0x51, 0x5a, 0x5e, 0x26 // 18
957 data1 0xaa, 0xc7, 0xb6, 0xe9 // 19
958 data1 0x5d, 0x10, 0x2f, 0xd6 // 20
959 data1 0x53, 0x14, 0x44, 0x02 // 21
960 data1 0x81, 0xe6, 0xa1, 0xd8 // 22
961 data1 0xc8, 0xfb, 0xd3, 0xe7 // 23
962 data1 0xe6, 0xcd, 0xe1, 0x21 // 24
963 data1 0xd6, 0x07, 0x37, 0xc3 // 25
964 data1 0x87, 0x0d, 0xd5, 0xf4 // 26
965 data1 0xed, 0x14, 0x5a, 0x45 // 27
966 data1 0x05, 0xe9, 0xe3, 0xa9 // 28
967 data1 0xf8, 0xa3, 0xef, 0xfc // 29
968 data1 0xd9, 0x02, 0x6f, 0x67 // 30
969 data1 0x8a, 0x4c, 0x2a, 0x8d // 31
970 data1 0x42, 0x39, 0xfa, 0xff // 32
971 data1 0x81, 0xf6, 0x71, 0x87 // 33
972 data1 0x22, 0x61, 0x9d, 0x6d // 34
973 data1 0x0c, 0x38, 0xe5, 0xfd // 35
974 data1 0x44, 0xea, 0xbe, 0xa4 // 36
975 data1 0xa9, 0xcf, 0xde, 0x4b // 37
976 data1 0x60, 0x4b, 0xbb, 0xf6 // 38
977 data1 0x70, 0xbc, 0xbf, 0xbe // 39
978 data1 0xc6, 0x7e, 0x9b, 0x28 // 40
979 data1 0xfa, 0x27, 0xa1, 0xea // 41
980 data1 0x85, 0x30, 0xef, 0xd4 // 42
981 data1 0x05, 0x1d, 0x88, 0x04 // 43
982 data1 0x39, 0xd0, 0xd4, 0xd9 // 44
983 data1 0xe5, 0x99, 0xdb, 0xe6 // 45
984 data1 0xf8, 0x7c, 0xa2, 0x1f // 46
985 data1 0x65, 0x56, 0xac, 0xc4 // 47
986 data1 0x44, 0x22, 0x29, 0xf4 // 48
987 data1 0x97, 0xff, 0x2a, 0x43 // 49
988 data1 0xa7, 0x23, 0x94, 0xab // 50
989 data1 0x39, 0xa0, 0x93, 0xfc // 51
990 data1 0xc3, 0x59, 0x5b, 0x65 // 52
991 data1 0x92, 0xcc, 0x0c, 0x8f // 53
992 data1 0x7d, 0xf4, 0xef, 0xff // 54
993 data1 0xd1, 0x5d, 0x84, 0x85 // 55
994 data1 0x4f, 0x7e, 0xa8, 0x6f // 56
995 data1 0xe0, 0xe6, 0x2c, 0xfe // 57
996 data1 0x14, 0x43, 0x01, 0xa3 // 58
997 data1 0xa1, 0x11, 0x08, 0x4e // 59
998 data1 0x82, 0x7e, 0x53, 0xf7 // 60
999 data1 0x35, 0xf2, 0x3a, 0xbd // 61
1000 data1 0xbb, 0xd2, 0xd7, 0x2a // 62
1001 data1 0x91, 0xd3, 0x86, 0xeb // 63
1002 .size md5_constants#,64*4