]> WPIA git - cassiopeia.git/blobdiff - lib/openssl/crypto/aes/asm/aes-c64xplus.pl
upd: openssl to 1.1.0
[cassiopeia.git] / lib / openssl / crypto / aes / asm / aes-c64xplus.pl
diff --git a/lib/openssl/crypto/aes/asm/aes-c64xplus.pl b/lib/openssl/crypto/aes/asm/aes-c64xplus.pl
new file mode 100644 (file)
index 0000000..19d2cc1
--- /dev/null
@@ -0,0 +1,1382 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# [Endian-neutral] AES for C64x+.
+#
+# Even though SPLOOPs are scheduled for 13 cycles, and thus expected
+# performance is ~8.5 cycles per byte processed with 128-bit key,
+# measured performance turned out to be ~10 cycles per byte. The
+# discrepancy must be caused by limitations of L1D memory banking(*),
+# see TI publication SPRU871 for further details. If it's any
+# consolation, it's still ~20% faster than TI's linear assembly module
+# anyway... Compared to aes_core.c compiled with cl6x 6.0 with
+# -mv6400+ -o2 options this code is 3.75x faster and almost 3x smaller
+# (tables included).
+#
+# (*)  This means that there might be subtle correlation between data
+#      and timing and one can wonder if it can be ... attacked:-(
+#      On the other hand this also means that *if* one chooses to
+#      implement *4* T-tables variant [instead of 1 T-table as in
+#      this implementation, or in addition to], then one ought to
+#      *interleave* them. Even though it complicates addressing,
+#      references to interleaved tables would be guaranteed not to
+#      clash. I reckon that it should be possible to break 8 cycles
+#      per byte "barrier," i.e. improve by ~20%, naturally at the
+#      cost of 8x increased pressure on L1D. 8x because you'd have
+#      to interleave both Te and Td tables...
+
+# Pick the output file: the first argument that looks like "name.ext".
+# Arguments in front of it (e.g. a flavour) are consumed and ignored.
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+# Redirect STDOUT only when a file name was actually found, and fail
+# loudly if it cannot be created instead of silently writing the
+# generated assembly to the original STDOUT.
+if (defined($output) && $output ne "") {
+    open STDOUT,">",$output or die "can't open $output: $!";
+}
+
+# Register allocation for the generated code (A-side/B-side pairs).
+$TEA="A5"; $TEB="B5";                  # look-up table base, one per side
+$KPA="A3"; $KPB="B1";                  # round-key pointers
+@K=qw(A6 B6 A7 B7);                    # current round key words
+@s=qw(A8 B8 A9 B9);                    # state words s0..s3
+# Look-up temporaries; the Td names alias the Te registers.
+@Td0=qw(A16 B16 A17 B17); @Te0=@Td0;
+@Td1=qw(A18 B18 A19 B19); @Te1=@Td1;
+@Td2=qw(A20 B20 A21 B21); @Te2=@Td2;
+@Td3=qw(A22 B22 A23 B23); @Te3=@Td3;
+
+$code=<<___;
+       .text
+
+; Assemblers reporting a version below 7000000 get __TI_EABI__ defined
+; as 0 here, so the .if __TI_EABI__ tests below select the non-EABI code.
+       .if     .ASSEMBLER_VERSION<7000000
+       .asg    0,__TI_EABI__
+       .endif
+; With EABI the underscore-prefixed labels used below are substituted
+; by the unprefixed names.
+; NOTE(review): .nocmp presumably disables compact instruction
+; encodings for this section - confirm against the assembler manual.
+       .if     __TI_EABI__
+       .nocmp
+       .asg    AES_encrypt,_AES_encrypt
+       .asg    AES_decrypt,_AES_decrypt
+       .asg    AES_set_encrypt_key,_AES_set_encrypt_key
+       .asg    AES_set_decrypt_key,_AES_set_decrypt_key
+       .asg    AES_ctr32_encrypt,_AES_ctr32_encrypt
+       .endif
+
+; Symbolic register names used by the routines below.
+       .asg    B3,RA
+       .asg    A4,INP
+       .asg    B4,OUT
+       .asg    A6,KEY
+       .asg    A4,RET
+       .asg    B15,SP
+
+; EXTn is the left-shift operand of "EXTU x,EXTn,24", which isolates
+; byte n of a word (n=0 is the least significant byte). TBLn is the
+; ROTL amount applied to an entry of the single stored table where the
+; code comments refer to table n.
+       .eval   24,EXT0
+       .eval   16,EXT1
+       .eval   8,EXT2
+       .eval   0,EXT3
+       .eval   8,TBL1
+       .eval   16,TBL2
+       .eval   24,TBL3
+
+; Big-endian builds mirror the byte positions and the rotate amounts.
+       .if     .BIG_ENDIAN
+       .eval   24-EXT0,EXT0
+       .eval   24-EXT1,EXT1
+       .eval   24-EXT2,EXT2
+       .eval   24-EXT3,EXT3
+       .eval   32-TBL1,TBL1
+       .eval   32-TBL2,TBL2
+       .eval   32-TBL3,TBL3
+       .endif
+
+       .global _AES_encrypt
+; --------------------------------------------------------------------
+; AES_encrypt - encrypt one 16-byte block.
+;   INP (A4)  input block, fetched with two non-aligned LDNDW
+;   OUT (B4)  output block, written with two non-aligned STNDW
+;   KEY (A6)  key schedule: round keys start at offset 0, the round
+;             count is read from word index 60
+; The public entry sets B2=1 and falls through to __encrypt. B2
+; predicates the block loads and stores, so entering at __encrypt with
+; B2=0 takes the state from A9:A8/B9:B8 and leaves the result there
+; without touching INP or OUT memory.
+; NOTE(review): the B2=0 entry is presumably meant for
+; AES_ctr32_encrypt, which is not part of this excerpt - confirm.
+; The table address is formed PC-relative (ADDKPC plus MVKL/MVKH of the
+; AES_Te offset). All rounds but the last run in a 13-cycle SPLOOP that
+; uses a single 1024-byte table and rotates each entry (ROTL by
+; TBL1..TBL3) to stand in for Te1..Te3; the last round takes single
+; bytes (LDBU) from the table that follows it ("Te4") and repacks them
+; with PACK2/PACKL4 before the final round-key XOR.
+; Registers written: A0, A3, A5-A9, A16-A23, B0, B1, B5-B9, B16-B23,
+; ILC, plus B2 on the public entry; INP and OUT are post-incremented
+; when B2 is set.
+; --------------------------------------------------------------------
+_AES_encrypt:
+       .asmfunc
+       MVK     1,B2                            ; B2=1: load block from INP, store to OUT
+__encrypt:
+       .if     __TI_EABI__
+   [B2]        LDNDW   *INP++,A9:A8                    ; load input
+||     MVKL    \$PCR_OFFSET(AES_Te,__encrypt),$TEA
+||     ADDKPC  __encrypt,B0
+   [B2]        LDNDW   *INP++,B9:B8
+||     MVKH    \$PCR_OFFSET(AES_Te,__encrypt),$TEA
+||     ADD     0,KEY,$KPA
+||     ADD     4,KEY,$KPB
+       .else
+   [B2]        LDNDW   *INP++,A9:A8                    ; load input
+||     MVKL    (AES_Te-__encrypt),$TEA
+||     ADDKPC  __encrypt,B0
+   [B2]        LDNDW   *INP++,B9:B8
+||     MVKH    (AES_Te-__encrypt),$TEA
+||     ADD     0,KEY,$KPA
+||     ADD     4,KEY,$KPB
+       .endif
+       LDW     *$KPA++[2],$Te0[0]              ; zero round key
+||     LDW     *$KPB++[2],$Te0[1]
+||     MVK     60,A0                           ; word index of the round count in KEY
+||     ADD     B0,$TEA,$TEA                    ; AES_Te
+       LDW     *KEY[A0],B0                     ; rounds
+||     MVK     1024,A0                         ; sizeof(AES_Te)
+       LDW     *$KPA++[2],$Te0[2]
+||     LDW     *$KPB++[2],$Te0[3]
+||     MV      $TEA,$TEB
+       NOP                                     ; pad the load delay of the input block
+       .if     .BIG_ENDIAN
+       MV      A9,$s[0]
+||     MV      A8,$s[1]
+||     MV      B9,$s[2]
+||     MV      B8,$s[3]
+       .else
+       MV      A8,$s[0]
+||     MV      A9,$s[1]
+||     MV      B8,$s[2]
+||     MV      B9,$s[3]
+       .endif
+       XOR     $Te0[0],$s[0],$s[0]
+||     XOR     $Te0[1],$s[1],$s[1]
+||     LDW     *$KPA++[2],$K[0]                ; 1st round key
+||     LDW     *$KPB++[2],$K[1]
+       SUB     B0,2,B0                         ; ILC value: rounds-2
+
+       SPLOOPD 13                              ; round loop, 13-cycle iteration interval
+||     MVC     B0,ILC
+||     LDW     *$KPA++[2],$K[2]
+||     LDW     *$KPB++[2],$K[3]
+;;====================================================================
+       EXTU    $s[1],EXT1,24,$Te1[1]
+||     EXTU    $s[0],EXT3,24,$Te3[0]
+       LDW     *${TEB}[$Te1[1]],$Te1[1]        ; Te1[s1>>8],   t0
+||     LDW     *${TEA}[$Te3[0]],$Te3[0]        ; Te3[s0>>24],  t1
+||     XOR     $s[2],$Te0[2],$s[2]             ; modulo-scheduled
+||     XOR     $s[3],$Te0[3],$s[3]             ; modulo-scheduled
+||     EXTU    $s[1],EXT3,24,$Te3[1]
+||     EXTU    $s[0],EXT1,24,$Te1[0]
+       LDW     *${TEB}[$Te3[1]],$Te3[1]        ; Te3[s1>>24],  t2
+||     LDW     *${TEA}[$Te1[0]],$Te1[0]        ; Te1[s0>>8],   t3
+||     EXTU    $s[2],EXT2,24,$Te2[2]
+||     EXTU    $s[3],EXT2,24,$Te2[3]
+       LDW     *${TEA}[$Te2[2]],$Te2[2]        ; Te2[s2>>16],  t0
+||     LDW     *${TEB}[$Te2[3]],$Te2[3]        ; Te2[s3>>16],  t1
+||     EXTU    $s[3],EXT3,24,$Te3[3]
+||     EXTU    $s[2],EXT1,24,$Te1[2]
+       LDW     *${TEB}[$Te3[3]],$Te3[3]        ; Te3[s3>>24],  t0
+||     LDW     *${TEA}[$Te1[2]],$Te1[2]        ; Te1[s2>>8],   t1
+||     EXTU    $s[0],EXT2,24,$Te2[0]
+||     EXTU    $s[1],EXT2,24,$Te2[1]
+       LDW     *${TEA}[$Te2[0]],$Te2[0]        ; Te2[s0>>16],  t2
+||     LDW     *${TEB}[$Te2[1]],$Te2[1]        ; Te2[s1>>16],  t3
+||     EXTU    $s[3],EXT1,24,$Te1[3]
+||     EXTU    $s[2],EXT3,24,$Te3[2]
+       LDW     *${TEB}[$Te1[3]],$Te1[3]        ; Te1[s3>>8],   t2
+||     LDW     *${TEA}[$Te3[2]],$Te3[2]        ; Te3[s2>>24],  t3
+||     ROTL    $Te1[1],TBL1,$Te3[0]            ; t0
+||     ROTL    $Te3[0],TBL3,$Te1[1]            ; t1
+||     EXTU    $s[0],EXT0,24,$Te0[0]
+||     EXTU    $s[1],EXT0,24,$Te0[1]
+       LDW     *${TEA}[$Te0[0]],$Te0[0]        ; Te0[s0],      t0
+||     LDW     *${TEB}[$Te0[1]],$Te0[1]        ; Te0[s1],      t1
+||     ROTL    $Te3[1],TBL3,$Te1[0]            ; t2
+||     ROTL    $Te1[0],TBL1,$Te3[1]            ; t3
+||     EXTU    $s[2],EXT0,24,$Te0[2]
+||     EXTU    $s[3],EXT0,24,$Te0[3]
+       LDW     *${TEA}[$Te0[2]],$Te0[2]        ; Te0[s2],      t2
+||     LDW     *${TEB}[$Te0[3]],$Te0[3]        ; Te0[s3],      t3
+||     ROTL    $Te2[2],TBL2,$Te2[2]            ; t0
+||     ROTL    $Te2[3],TBL2,$Te2[3]            ; t1
+||     XOR     $K[0],$Te3[0],$s[0]
+||     XOR     $K[1],$Te1[1],$s[1]
+       ROTL    $Te3[3],TBL3,$Te1[2]            ; t0
+||     ROTL    $Te1[2],TBL1,$Te3[3]            ; t1
+||     XOR     $K[2],$Te1[0],$s[2]
+||     XOR     $K[3],$Te3[1],$s[3]
+||     LDW     *$KPA++[2],$K[0]                ; next round key
+||     LDW     *$KPB++[2],$K[1]
+       ROTL    $Te2[0],TBL2,$Te2[0]            ; t2
+||     ROTL    $Te2[1],TBL2,$Te2[1]            ; t3
+||     XOR     $s[0],$Te2[2],$s[0]
+||     XOR     $s[1],$Te2[3],$s[1]
+||     LDW     *$KPA++[2],$K[2]
+||     LDW     *$KPB++[2],$K[3]
+       ROTL    $Te1[3],TBL1,$Te3[2]            ; t2
+||     ROTL    $Te3[2],TBL3,$Te1[3]            ; t3
+||     XOR     $s[0],$Te1[2],$s[0]
+||     XOR     $s[1],$Te3[3],$s[1]
+       XOR     $s[2],$Te2[0],$s[2]
+||     XOR     $s[3],$Te2[1],$s[3]
+||     XOR     $s[0],$Te0[0],$s[0]
+||     XOR     $s[1],$Te0[1],$s[1]
+       SPKERNEL
+||     XOR.L   $s[2],$Te3[2],$s[2]
+||     XOR.L   $s[3],$Te1[3],$s[3]
+;;====================================================================
+       ADD.D   ${TEA},A0,${TEA}                ; point to Te4
+||     ADD.D   ${TEB},A0,${TEB}
+||     EXTU    $s[1],EXT1,24,$Te1[1]
+||     EXTU    $s[0],EXT3,24,$Te3[0]
+       LDBU    *${TEB}[$Te1[1]],$Te1[1]        ; Te1[s1>>8],   t0
+||     LDBU    *${TEA}[$Te3[0]],$Te3[0]        ; Te3[s0>>24],  t1
+||     XOR     $s[2],$Te0[2],$s[2]             ; modulo-scheduled
+||     XOR     $s[3],$Te0[3],$s[3]             ; modulo-scheduled
+||     EXTU    $s[0],EXT0,24,$Te0[0]
+||     EXTU    $s[1],EXT0,24,$Te0[1]
+       LDBU    *${TEA}[$Te0[0]],$Te0[0]        ; Te0[s0],      t0
+||     LDBU    *${TEB}[$Te0[1]],$Te0[1]        ; Te0[s1],      t1
+||     EXTU    $s[3],EXT3,24,$Te3[3]
+||     EXTU    $s[2],EXT1,24,$Te1[2]
+       LDBU    *${TEB}[$Te3[3]],$Te3[3]        ; Te3[s3>>24],  t0
+||     LDBU    *${TEA}[$Te1[2]],$Te1[2]        ; Te1[s2>>8],   t1
+||     EXTU    $s[2],EXT2,24,$Te2[2]
+||     EXTU    $s[3],EXT2,24,$Te2[3]
+       LDBU    *${TEA}[$Te2[2]],$Te2[2]        ; Te2[s2>>16],  t0
+||     LDBU    *${TEB}[$Te2[3]],$Te2[3]        ; Te2[s3>>16],  t1
+||     EXTU    $s[1],EXT3,24,$Te3[1]
+||     EXTU    $s[0],EXT1,24,$Te1[0]
+       LDBU    *${TEB}[$Te3[1]],$Te3[1]        ; Te3[s1>>24],  t2
+||     LDBU    *${TEA}[$Te1[0]],$Te1[0]        ; Te1[s0>>8],   t3
+||     EXTU    $s[3],EXT1,24,$Te1[3]
+||     EXTU    $s[2],EXT3,24,$Te3[2]
+       LDBU    *${TEB}[$Te1[3]],$Te1[3]        ; Te1[s3>>8],   t2
+||     LDBU    *${TEA}[$Te3[2]],$Te3[2]        ; Te3[s2>>24],  t3
+||     EXTU    $s[2],EXT0,24,$Te0[2]
+||     EXTU    $s[3],EXT0,24,$Te0[3]
+       LDBU    *${TEA}[$Te0[2]],$Te0[2]        ; Te0[s2],      t2
+||     LDBU    *${TEB}[$Te0[3]],$Te0[3]        ; Te0[s3],      t3
+||     EXTU    $s[0],EXT2,24,$Te2[0]
+||     EXTU    $s[1],EXT2,24,$Te2[1]
+       LDBU    *${TEA}[$Te2[0]],$Te2[0]        ; Te2[s0>>16],  t2
+||     LDBU    *${TEB}[$Te2[1]],$Te2[1]        ; Te2[s1>>16],  t3
+
+       .if     .BIG_ENDIAN
+       PACK2   $Te0[0],$Te1[1],$Te0[0]
+||     PACK2   $Te0[1],$Te1[2],$Te0[1]
+       PACK2   $Te2[2],$Te3[3],$Te2[2]
+||     PACK2   $Te2[3],$Te3[0],$Te2[3]
+       PACKL4  $Te0[0],$Te2[2],$Te0[0]
+||     PACKL4  $Te0[1],$Te2[3],$Te0[1]
+       XOR     $K[0],$Te0[0],$Te0[0]           ; s[0]
+||     XOR     $K[1],$Te0[1],$Te0[1]           ; s[1]
+
+       PACK2   $Te0[2],$Te1[3],$Te0[2]
+||     PACK2   $Te0[3],$Te1[0],$Te0[3]
+       PACK2   $Te2[0],$Te3[1],$Te2[0]
+||     PACK2   $Te2[1],$Te3[2],$Te2[1]
+||     BNOP    RA                              ; return; next 5 packets are its delay slots
+       PACKL4  $Te0[2],$Te2[0],$Te0[2]
+||     PACKL4  $Te0[3],$Te2[1],$Te0[3]
+       XOR     $K[2],$Te0[2],$Te0[2]           ; s[2]
+||     XOR     $K[3],$Te0[3],$Te0[3]           ; s[3]
+
+       MV      $Te0[0],A9
+||     MV      $Te0[1],A8
+       MV      $Te0[2],B9
+||     MV      $Te0[3],B8
+|| [B2]        STNDW   A9:A8,*OUT++
+   [B2]        STNDW   B9:B8,*OUT++
+       .else
+       PACK2   $Te1[1],$Te0[0],$Te1[1]
+||     PACK2   $Te1[2],$Te0[1],$Te1[2]
+       PACK2   $Te3[3],$Te2[2],$Te3[3]
+||     PACK2   $Te3[0],$Te2[3],$Te3[0]
+       PACKL4  $Te3[3],$Te1[1],$Te1[1]
+||     PACKL4  $Te3[0],$Te1[2],$Te1[2]
+       XOR     $K[0],$Te1[1],$Te1[1]           ; s[0]
+||     XOR     $K[1],$Te1[2],$Te1[2]           ; s[1]
+
+       PACK2   $Te1[3],$Te0[2],$Te1[3]
+||     PACK2   $Te1[0],$Te0[3],$Te1[0]
+       PACK2   $Te3[1],$Te2[0],$Te3[1]
+||     PACK2   $Te3[2],$Te2[1],$Te3[2]
+||     BNOP    RA                              ; return; next 5 packets are its delay slots
+       PACKL4  $Te3[1],$Te1[3],$Te1[3]
+||     PACKL4  $Te3[2],$Te1[0],$Te1[0]
+       XOR     $K[2],$Te1[3],$Te1[3]           ; s[2]
+||     XOR     $K[3],$Te1[0],$Te1[0]           ; s[3]
+
+       MV      $Te1[1],A8
+||     MV      $Te1[2],A9
+       MV      $Te1[3],B8
+||     MV      $Te1[0],B9
+|| [B2]        STNDW   A9:A8,*OUT++
+   [B2]        STNDW   B9:B8,*OUT++
+       .endif
+       .endasmfunc
+
+       .global _AES_decrypt
+; --------------------------------------------------------------------
+; AES_decrypt - decrypt one 16-byte block.
+;   INP (A4)  input block, fetched with two non-aligned LDNDW
+;   OUT (B4)  output block, written with two non-aligned STNDW
+;   KEY (A6)  key schedule: round keys start at offset 0, the round
+;             count is read from word index 60
+; The public entry sets B2=1 and falls through to __decrypt. B2
+; predicates the block loads and stores, so entering at __decrypt with
+; B2=0 takes the state from A9:A8/B9:B8 and leaves the result there
+; without touching INP or OUT memory.
+; The table address is formed PC-relative (ADDKPC plus MVKL/MVKH of the
+; AES_Td offset). All rounds but the last run in a 13-cycle SPLOOP that
+; uses a single 1024-byte table and rotates each entry (ROTL by
+; TBL1..TBL3) to stand in for Td1..Td3; the last round takes single
+; bytes (LDBU) from the table that follows it ("Td4") and repacks them
+; with PACK2/PACKL4 before the final round-key XOR.
+; Registers written: A0, A3, A5-A9, A16-A23, B0, B1, B5-B9, B16-B23,
+; ILC, plus B2 on the public entry; INP and OUT are post-incremented
+; when B2 is set.
+; --------------------------------------------------------------------
+_AES_decrypt:
+       .asmfunc
+       MVK     1,B2                            ; B2=1: load block from INP, store to OUT
+__decrypt:
+       .if     __TI_EABI__
+   [B2]        LDNDW   *INP++,A9:A8                    ; load input
+||     MVKL    \$PCR_OFFSET(AES_Td,__decrypt),$TEA
+||     ADDKPC  __decrypt,B0
+   [B2]        LDNDW   *INP++,B9:B8
+||     MVKH    \$PCR_OFFSET(AES_Td,__decrypt),$TEA
+||     ADD     0,KEY,$KPA
+||     ADD     4,KEY,$KPB
+       .else
+   [B2]        LDNDW   *INP++,A9:A8                    ; load input
+||     MVKL    (AES_Td-__decrypt),$TEA
+||     ADDKPC  __decrypt,B0
+   [B2]        LDNDW   *INP++,B9:B8
+||     MVKH    (AES_Td-__decrypt),$TEA
+||     ADD     0,KEY,$KPA
+||     ADD     4,KEY,$KPB
+       .endif
+       LDW     *$KPA++[2],$Td0[0]              ; zero round key
+||     LDW     *$KPB++[2],$Td0[1]
+||     MVK     60,A0                           ; word index of the round count in KEY
+||     ADD     B0,$TEA,$TEA                    ; AES_Td
+       LDW     *KEY[A0],B0                     ; rounds
+||     MVK     1024,A0                         ; sizeof(AES_Td)
+       LDW     *$KPA++[2],$Td0[2]
+||     LDW     *$KPB++[2],$Td0[3]
+||     MV      $TEA,$TEB
+       NOP                                     ; pad the load delay of the input block
+       .if     .BIG_ENDIAN
+       MV      A9,$s[0]
+||     MV      A8,$s[1]
+||     MV      B9,$s[2]
+||     MV      B8,$s[3]
+       .else
+       MV      A8,$s[0]
+||     MV      A9,$s[1]
+||     MV      B8,$s[2]
+||     MV      B9,$s[3]
+       .endif
+       XOR     $Td0[0],$s[0],$s[0]
+||     XOR     $Td0[1],$s[1],$s[1]
+||     LDW     *$KPA++[2],$K[0]                ; 1st round key
+||     LDW     *$KPB++[2],$K[1]
+       SUB     B0,2,B0                         ; ILC value: rounds-2
+
+       SPLOOPD 13                              ; round loop, 13-cycle iteration interval
+||     MVC     B0,ILC
+||     LDW     *$KPA++[2],$K[2]
+||     LDW     *$KPB++[2],$K[3]
+;;====================================================================
+       EXTU    $s[1],EXT3,24,$Td3[1]
+||     EXTU    $s[0],EXT1,24,$Td1[0]
+       LDW     *${TEB}[$Td3[1]],$Td3[1]        ; Td3[s1>>24],  t0
+||     LDW     *${TEA}[$Td1[0]],$Td1[0]        ; Td1[s0>>8],   t1
+||     XOR     $s[2],$Td0[2],$s[2]             ; modulo-scheduled
+||     XOR     $s[3],$Td0[3],$s[3]             ; modulo-scheduled
+||     EXTU    $s[1],EXT1,24,$Td1[1]
+||     EXTU    $s[0],EXT3,24,$Td3[0]
+       LDW     *${TEB}[$Td1[1]],$Td1[1]        ; Td1[s1>>8],   t2
+||     LDW     *${TEA}[$Td3[0]],$Td3[0]        ; Td3[s0>>24],  t3
+||     EXTU    $s[2],EXT2,24,$Td2[2]
+||     EXTU    $s[3],EXT2,24,$Td2[3]
+       LDW     *${TEA}[$Td2[2]],$Td2[2]        ; Td2[s2>>16],  t0
+||     LDW     *${TEB}[$Td2[3]],$Td2[3]        ; Td2[s3>>16],  t1
+||     EXTU    $s[3],EXT1,24,$Td1[3]
+||     EXTU    $s[2],EXT3,24,$Td3[2]
+       LDW     *${TEB}[$Td1[3]],$Td1[3]        ; Td1[s3>>8],   t0
+||     LDW     *${TEA}[$Td3[2]],$Td3[2]        ; Td3[s2>>24],  t1
+||     EXTU    $s[0],EXT2,24,$Td2[0]
+||     EXTU    $s[1],EXT2,24,$Td2[1]
+       LDW     *${TEA}[$Td2[0]],$Td2[0]        ; Td2[s0>>16],  t2
+||     LDW     *${TEB}[$Td2[1]],$Td2[1]        ; Td2[s1>>16],  t3
+||     EXTU    $s[3],EXT3,24,$Td3[3]
+||     EXTU    $s[2],EXT1,24,$Td1[2]
+       LDW     *${TEB}[$Td3[3]],$Td3[3]        ; Td3[s3>>24],  t2
+||     LDW     *${TEA}[$Td1[2]],$Td1[2]        ; Td1[s2>>8],   t3
+||     ROTL    $Td3[1],TBL3,$Td1[0]            ; t0
+||     ROTL    $Td1[0],TBL1,$Td3[1]            ; t1
+||     EXTU    $s[0],EXT0,24,$Td0[0]
+||     EXTU    $s[1],EXT0,24,$Td0[1]
+       LDW     *${TEA}[$Td0[0]],$Td0[0]        ; Td0[s0],      t0
+||     LDW     *${TEB}[$Td0[1]],$Td0[1]        ; Td0[s1],      t1
+||     ROTL    $Td1[1],TBL1,$Td3[0]            ; t2
+||     ROTL    $Td3[0],TBL3,$Td1[1]            ; t3
+||     EXTU    $s[2],EXT0,24,$Td0[2]
+||     EXTU    $s[3],EXT0,24,$Td0[3]
+       LDW     *${TEA}[$Td0[2]],$Td0[2]        ; Td0[s2],      t2
+||     LDW     *${TEB}[$Td0[3]],$Td0[3]        ; Td0[s3],      t3
+||     ROTL    $Td2[2],TBL2,$Td2[2]            ; t0
+||     ROTL    $Td2[3],TBL2,$Td2[3]            ; t1
+||     XOR     $K[0],$Td1[0],$s[0]
+||     XOR     $K[1],$Td3[1],$s[1]
+       ROTL    $Td1[3],TBL1,$Td3[2]            ; t0
+||     ROTL    $Td3[2],TBL3,$Td1[3]            ; t1
+||     XOR     $K[2],$Td3[0],$s[2]
+||     XOR     $K[3],$Td1[1],$s[3]
+||     LDW     *$KPA++[2],$K[0]                ; next round key
+||     LDW     *$KPB++[2],$K[1]
+       ROTL    $Td2[0],TBL2,$Td2[0]            ; t2
+||     ROTL    $Td2[1],TBL2,$Td2[1]            ; t3
+||     XOR     $s[0],$Td2[2],$s[0]
+||     XOR     $s[1],$Td2[3],$s[1]
+||     LDW     *$KPA++[2],$K[2]
+||     LDW     *$KPB++[2],$K[3]
+       ROTL    $Td3[3],TBL3,$Td1[2]            ; t2
+||     ROTL    $Td1[2],TBL1,$Td3[3]            ; t3
+||     XOR     $s[0],$Td3[2],$s[0]
+||     XOR     $s[1],$Td1[3],$s[1]
+       XOR     $s[2],$Td2[0],$s[2]
+||     XOR     $s[3],$Td2[1],$s[3]
+||     XOR     $s[0],$Td0[0],$s[0]
+||     XOR     $s[1],$Td0[1],$s[1]
+       SPKERNEL
+||     XOR.L   $s[2],$Td1[2],$s[2]
+||     XOR.L   $s[3],$Td3[3],$s[3]
+;;====================================================================
+       ADD.D   ${TEA},A0,${TEA}                ; point to Td4
+||     ADD.D   ${TEB},A0,${TEB}
+||     EXTU    $s[1],EXT3,24,$Td3[1]
+||     EXTU    $s[0],EXT1,24,$Td1[0]
+       LDBU    *${TEB}[$Td3[1]],$Td3[1]        ; Td3[s1>>24],  t0
+||     LDBU    *${TEA}[$Td1[0]],$Td1[0]        ; Td1[s0>>8],   t1
+||     XOR     $s[2],$Td0[2],$s[2]             ; modulo-scheduled
+||     XOR     $s[3],$Td0[3],$s[3]             ; modulo-scheduled
+||     EXTU    $s[0],EXT0,24,$Td0[0]
+||     EXTU    $s[1],EXT0,24,$Td0[1]
+       LDBU    *${TEA}[$Td0[0]],$Td0[0]        ; Td0[s0],      t0
+||     LDBU    *${TEB}[$Td0[1]],$Td0[1]        ; Td0[s1],      t1
+||     EXTU    $s[2],EXT2,24,$Td2[2]
+||     EXTU    $s[3],EXT2,24,$Td2[3]
+       LDBU    *${TEA}[$Td2[2]],$Td2[2]        ; Td2[s2>>16],  t0
+||     LDBU    *${TEB}[$Td2[3]],$Td2[3]        ; Td2[s3>>16],  t1
+||     EXTU    $s[3],EXT1,24,$Td1[3]
+||     EXTU    $s[2],EXT3,24,$Td3[2]
+       LDBU    *${TEB}[$Td1[3]],$Td1[3]        ; Td1[s3>>8],   t0
+||     LDBU    *${TEA}[$Td3[2]],$Td3[2]        ; Td3[s2>>24],  t1
+||     EXTU    $s[1],EXT1,24,$Td1[1]
+||     EXTU    $s[0],EXT3,24,$Td3[0]
+       LDBU    *${TEB}[$Td1[1]],$Td1[1]        ; Td1[s1>>8],   t2
+||     LDBU    *${TEA}[$Td3[0]],$Td3[0]        ; Td3[s0>>24],  t3
+||     EXTU    $s[0],EXT2,24,$Td2[0]
+||     EXTU    $s[1],EXT2,24,$Td2[1]
+       LDBU    *${TEA}[$Td2[0]],$Td2[0]        ; Td2[s0>>16],  t2
+||     LDBU    *${TEB}[$Td2[1]],$Td2[1]        ; Td2[s1>>16],  t3
+||     EXTU    $s[3],EXT3,24,$Td3[3]
+||     EXTU    $s[2],EXT1,24,$Td1[2]
+       LDBU    *${TEB}[$Td3[3]],$Td3[3]        ; Td3[s3>>24],  t2
+||     LDBU    *${TEA}[$Td1[2]],$Td1[2]        ; Td1[s2>>8],   t3
+||     EXTU    $s[2],EXT0,24,$Td0[2]
+||     EXTU    $s[3],EXT0,24,$Td0[3]
+       LDBU    *${TEA}[$Td0[2]],$Td0[2]        ; Td0[s2],      t2
+||     LDBU    *${TEB}[$Td0[3]],$Td0[3]        ; Td0[s3],      t3
+
+       .if     .BIG_ENDIAN
+       PACK2   $Td0[0],$Td1[3],$Td0[0]
+||     PACK2   $Td0[1],$Td1[0],$Td0[1]
+       PACK2   $Td2[2],$Td3[1],$Td2[2]
+||     PACK2   $Td2[3],$Td3[2],$Td2[3]
+       PACKL4  $Td0[0],$Td2[2],$Td0[0]
+||     PACKL4  $Td0[1],$Td2[3],$Td0[1]
+       XOR     $K[0],$Td0[0],$Td0[0]           ; s[0]
+||     XOR     $K[1],$Td0[1],$Td0[1]           ; s[1]
+
+       PACK2   $Td0[2],$Td1[1],$Td0[2]
+||     PACK2   $Td0[3],$Td1[2],$Td0[3]
+       PACK2   $Td2[0],$Td3[3],$Td2[0]
+||     PACK2   $Td2[1],$Td3[0],$Td2[1]
+||     BNOP    RA                              ; return; next 5 packets are its delay slots
+       PACKL4  $Td0[2],$Td2[0],$Td0[2]
+||     PACKL4  $Td0[3],$Td2[1],$Td0[3]
+       XOR     $K[2],$Td0[2],$Td0[2]           ; s[2]
+||     XOR     $K[3],$Td0[3],$Td0[3]           ; s[3]
+
+       MV      $Td0[0],A9
+||     MV      $Td0[1],A8
+       MV      $Td0[2],B9
+||     MV      $Td0[3],B8
+|| [B2]        STNDW   A9:A8,*OUT++
+   [B2]        STNDW   B9:B8,*OUT++
+       .else
+       PACK2   $Td1[3],$Td0[0],$Td1[3]
+||     PACK2   $Td1[0],$Td0[1],$Td1[0]
+       PACK2   $Td3[1],$Td2[2],$Td3[1]
+||     PACK2   $Td3[2],$Td2[3],$Td3[2]
+       PACKL4  $Td3[1],$Td1[3],$Td1[3]
+||     PACKL4  $Td3[2],$Td1[0],$Td1[0]
+       XOR     $K[0],$Td1[3],$Td1[3]           ; s[0]
+||     XOR     $K[1],$Td1[0],$Td1[0]           ; s[1]
+
+       PACK2   $Td1[1],$Td0[2],$Td1[1]
+||     PACK2   $Td1[2],$Td0[3],$Td1[2]
+       PACK2   $Td3[3],$Td2[0],$Td3[3]
+||     PACK2   $Td3[0],$Td2[1],$Td3[0]
+||     BNOP    RA                              ; return; next 5 packets are its delay slots
+       PACKL4  $Td3[3],$Td1[1],$Td1[1]
+||     PACKL4  $Td3[0],$Td1[2],$Td1[2]
+       XOR     $K[2],$Td1[1],$Td1[1]           ; s[2]
+||     XOR     $K[3],$Td1[2],$Td1[2]           ; s[3]
+
+       MV      $Td1[3],A8
+||     MV      $Td1[0],A9
+       MV      $Td1[1],B8
+||     MV      $Td1[2],B9
+|| [B2]        STNDW   A9:A8,*OUT++
+   [B2]        STNDW   B9:B8,*OUT++
+       .endif
+       .endasmfunc
+___
+{
+# Lexical register aliases for the key-setup code that follows.
+my @K=(@K,@s);                 # extended key: outer @K followed by @s
+my @Te4=map { "B$_" } 16..19;  # B16..B19
+
+# used in AES_set_decrypt_key; these share the Te0..Te3 registers
+my (@Kx9,@KxB,@KxD,@KxE);
+@Kx9=@Te0;
+@KxB=@Te1;
+@KxD=@Te2;
+@KxE=@Te3;
+
+$code.=<<___;
+       .asg    OUT,BITS
+
+       .global _AES_set_encrypt_key
+; --------------------------------------------------------------------
+; AES_set_encrypt_key - expand a user key into the key schedule.
+;   INP  (A4)  user key, fetched with non-aligned LDNDW
+;   BITS (B4)  key length in bits; shifted right by 5 to give 4, 6 or
+;              8 key words
+;   KEY  (A6)  key schedule to fill
+; Returns in RET (A4): 0 on success, -1 if INP or KEY is zero, -2 if
+; BITS does not select 4, 6 or 8 words. On return B0 holds the round
+; count (10, 12 or 14) on success and 0 on failure; the code at ret?
+; in AES_set_decrypt_key relies on that.
+; The S-box bytes come from AES_Te4, located PC-relative, and the
+; round constants are read from 256 bytes past it.
+; Round keys are stored through two pointers, KEY+0 and KEY+4, each
+; advancing two words per store; the round count is stored last.
+; --------------------------------------------------------------------
+_AES_set_encrypt_key:
+__set_encrypt_key:
+       .asmfunc
+       MV      INP,A0                          ; A0 doubles as the "arguments valid" flag
+||     SHRU    BITS,5,BITS                     ; 128-192-256 -> 4-6-8
+||     MV      KEY,A1
+  [!A0]        B       RA
+||[!A0]        MVK     -1,RET
+||[!A0]        MVK     1,A1                            ; only one B RA
+  [!A1]        B       RA
+||[!A1]        MVK     -1,RET
+||[!A1]        MVK     0,A0
+||     MVK     0,B0
+||     MVK     0,A1
+   [A0]        LDNDW   *INP++,A9:A8
+|| [A0]        CMPEQ   4,BITS,B0
+|| [A0]        CMPLT   3,BITS,A1
+   [B0]        B       key128?
+|| [A1]        LDNDW   *INP++,B9:B8
+|| [A0]        CMPEQ   6,BITS,B0
+|| [A0]        CMPLT   5,BITS,A1
+   [B0]        B       key192?
+|| [A1]        LDNDW   *INP++,B17:B16
+|| [A0]        CMPEQ   8,BITS,B0
+|| [A0]        CMPLT   7,BITS,A1
+   [B0]        B       key256?
+|| [A1]        LDNDW   *INP++,B19:B18
+
+       .if     __TI_EABI__
+   [A0]        ADD     0,KEY,$KPA
+|| [A0]        ADD     4,KEY,$KPB
+|| [A0]        MVKL    \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
+|| [A0]        ADDKPC  __set_encrypt_key,B6
+   [A0]        MVKH    \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
+   [A0]        ADD     B6,$TEA,$TEA                    ; AES_Te4
+       .else
+   [A0]        ADD     0,KEY,$KPA
+|| [A0]        ADD     4,KEY,$KPB
+|| [A0]        MVKL    (AES_Te4-__set_encrypt_key),$TEA
+|| [A0]        ADDKPC  __set_encrypt_key,B6
+   [A0]        MVKH    (AES_Te4-__set_encrypt_key),$TEA
+   [A0]        ADD     B6,$TEA,$TEA                    ; AES_Te4
+       .endif
+       NOP                                     ; delay slot of the key192?/key256? branches
+       NOP                                     ; last delay slot of the key256? branch
+
+       BNOP    RA,5
+||     MVK     -2,RET                          ; unknown bit length
+||     MVK     0,B0                            ; redundant
+;;====================================================================
+;;====================================================================
+key128?:
+       .if     .BIG_ENDIAN
+       MV      A9,$K[0]
+||     MV      A8,$K[1]
+||     MV      B9,$Te4[2]
+||     MV      B8,$K[3]
+       .else
+       MV      A8,$K[0]
+||     MV      A9,$K[1]
+||     MV      B8,$Te4[2]
+||     MV      B9,$K[3]
+       .endif
+
+       MVK     256,A0                          ; byte offset from Te4 to rcon
+||     MVK     9,B0                            ; value loaded into ILC
+
+       SPLOOPD 14
+||     MVC     B0,ILC
+||     MV      $TEA,$TEB
+||     ADD     $TEA,A0,A30                     ; rcon
+;;====================================================================
+       LDW     *A30++[1],A31                   ; rcon[i]
+||     MV      $Te4[2],$K[2]
+||     EXTU    $K[3],EXT1,24,$Te4[0]
+       LDBU    *${TEB}[$Te4[0]],$Te4[0]
+||     MV      $K[3],A0
+||     EXTU    $K[3],EXT2,24,$Te4[1]
+       LDBU    *${TEB}[$Te4[1]],$Te4[1]
+||     EXTU    A0,EXT3,24,A0
+||     EXTU    $K[3],EXT0,24,$Te4[3]
+       .if     .BIG_ENDIAN
+       LDBU    *${TEA}[A0],$Te4[3]
+||     LDBU    *${TEB}[$Te4[3]],A0
+       .else
+       LDBU    *${TEA}[A0],A0
+||     LDBU    *${TEB}[$Te4[3]],$Te4[3]
+       .endif
+
+       STW     $K[0],*$KPA++[2]
+||     STW     $K[1],*$KPB++[2]
+       STW     $K[2],*$KPA++[2]
+||     STW     $K[3],*$KPB++[2]
+
+       XOR     A31,$K[0],$K[0]                 ; ^=rcon[i]
+       .if     .BIG_ENDIAN
+       PACK2   $Te4[0],$Te4[1],$Te4[1]
+       PACK2   $Te4[3],A0,$Te4[3]
+       PACKL4  $Te4[1],$Te4[3],$Te4[3]
+       .else
+       PACK2   $Te4[1],$Te4[0],$Te4[1]
+       PACK2   $Te4[3],A0,$Te4[3]
+       PACKL4  $Te4[3],$Te4[1],$Te4[3]
+       .endif
+       XOR     $Te4[3],$K[0],$Te4[0]           ; K[0]
+       XOR     $Te4[0],$K[1],$K[1]             ; K[1]
+       MV      $Te4[0],$K[0]
+||     XOR     $K[1],$K[2],$Te4[2]             ; K[2]
+       XOR     $Te4[2],$K[3],$K[3]             ; K[3]
+       SPKERNEL
+;;====================================================================
+       BNOP    RA                              ; return; next 5 packets are its delay slots
+       MV      $Te4[2],$K[2]
+||     STW     $K[0],*$KPA++[2]
+||     STW     $K[1],*$KPB++[2]
+       STW     $K[2],*$KPA++[2]
+||     STW     $K[3],*$KPB++[2]
+       MVK     10,B0                           ; rounds
+       STW     B0,*++${KPB}[15]
+       MVK     0,RET
+;;====================================================================
+;;====================================================================
+key192?:
+       .if     .BIG_ENDIAN
+       MV      A9,$K[0]
+||     MV      A8,$K[1]
+||     MV      B9,$K[2]
+||     MV      B8,$K[3]
+       MV      B17,$Te4[2]
+||     MV      B16,$K[5]
+       .else
+       MV      A8,$K[0]
+||     MV      A9,$K[1]
+||     MV      B8,$K[2]
+||     MV      B9,$K[3]
+       MV      B16,$Te4[2]
+||     MV      B17,$K[5]
+       .endif
+
+       MVK     256,A0                          ; byte offset from Te4 to rcon
+||     MVK     6,B0                            ; BDEC loop counter
+       MV      $TEA,$TEB
+||     ADD     $TEA,A0,A30                     ; rcon
+;;====================================================================
+loop192?:
+       LDW     *A30++[1],A31                   ; rcon[i]
+||     MV      $Te4[2],$K[4]
+||     EXTU    $K[5],EXT1,24,$Te4[0]
+       LDBU    *${TEB}[$Te4[0]],$Te4[0]
+||     MV      $K[5],A0
+||     EXTU    $K[5],EXT2,24,$Te4[1]
+       LDBU    *${TEB}[$Te4[1]],$Te4[1]
+||     EXTU    A0,EXT3,24,A0
+||     EXTU    $K[5],EXT0,24,$Te4[3]
+       .if     .BIG_ENDIAN
+       LDBU    *${TEA}[A0],$Te4[3]
+||     LDBU    *${TEB}[$Te4[3]],A0
+       .else
+       LDBU    *${TEA}[A0],A0
+||     LDBU    *${TEB}[$Te4[3]],$Te4[3]
+       .endif
+
+       STW     $K[0],*$KPA++[2]
+||     STW     $K[1],*$KPB++[2]
+       STW     $K[2],*$KPA++[2]
+||     STW     $K[3],*$KPB++[2]
+       STW     $K[4],*$KPA++[2]
+||     STW     $K[5],*$KPB++[2]
+
+       XOR     A31,$K[0],$K[0]                 ; ^=rcon[i]
+       .if     .BIG_ENDIAN
+       PACK2   $Te4[0],$Te4[1],$Te4[1]
+||     PACK2   $Te4[3],A0,$Te4[3]
+       PACKL4  $Te4[1],$Te4[3],$Te4[3]
+       .else
+       PACK2   $Te4[1],$Te4[0],$Te4[1]
+||     PACK2   $Te4[3],A0,$Te4[3]
+       PACKL4  $Te4[3],$Te4[1],$Te4[3]
+       .endif
+       BDEC    loop192?,B0                     ; next 5 packets are its delay slots
+||     XOR     $Te4[3],$K[0],$Te4[0]           ; K[0]
+       XOR     $Te4[0],$K[1],$K[1]             ; K[1]
+       MV      $Te4[0],$K[0]
+||     XOR     $K[1],$K[2],$Te4[2]             ; K[2]
+       XOR     $Te4[2],$K[3],$K[3]             ; K[3]
+       MV      $Te4[2],$K[2]
+||     XOR     $K[3],$K[4],$Te4[2]             ; K[4]
+       XOR     $Te4[2],$K[5],$K[5]             ; K[5]
+;;====================================================================
+       BNOP    RA                              ; return; next 5 packets are its delay slots
+       STW     $K[0],*$KPA++[2]
+||     STW     $K[1],*$KPB++[2]
+       STW     $K[2],*$KPA++[2]
+||     STW     $K[3],*$KPB++[2]
+       MVK     12,B0                           ; rounds
+       STW     B0,*++${KPB}[7]
+       MVK     0,RET
+;;====================================================================
+;;====================================================================
+key256?:
+       .if     .BIG_ENDIAN
+       MV      A9,$K[0]
+||     MV      A8,$K[1]
+||     MV      B9,$K[2]
+||     MV      B8,$K[3]
+       MV      B17,$K[4]
+||     MV      B16,$K[5]
+||     MV      B19,$Te4[2]
+||     MV      B18,$K[7]
+       .else
+       MV      A8,$K[0]
+||     MV      A9,$K[1]
+||     MV      B8,$K[2]
+||     MV      B9,$K[3]
+       MV      B16,$K[4]
+||     MV      B17,$K[5]
+||     MV      B18,$Te4[2]
+||     MV      B19,$K[7]
+       .endif
+
+       MVK     256,A0                          ; byte offset from Te4 to rcon
+||     MVK     6,B0                            ; loop counter, tested with [!B0] below
+       MV      $TEA,$TEB
+||     ADD     $TEA,A0,A30                     ; rcon
+;;====================================================================
+loop256?:
+       LDW     *A30++[1],A31                   ; rcon[i]
+||     MV      $Te4[2],$K[6]
+||     EXTU    $K[7],EXT1,24,$Te4[0]
+       LDBU    *${TEB}[$Te4[0]],$Te4[0]
+||     MV      $K[7],A0
+||     EXTU    $K[7],EXT2,24,$Te4[1]
+       LDBU    *${TEB}[$Te4[1]],$Te4[1]
+||     EXTU    A0,EXT3,24,A0
+||     EXTU    $K[7],EXT0,24,$Te4[3]
+       .if     .BIG_ENDIAN
+       LDBU    *${TEA}[A0],$Te4[3]
+||     LDBU    *${TEB}[$Te4[3]],A0
+       .else
+       LDBU    *${TEA}[A0],A0
+||     LDBU    *${TEB}[$Te4[3]],$Te4[3]
+       .endif
+
+       STW     $K[0],*$KPA++[2]
+||     STW     $K[1],*$KPB++[2]
+       STW     $K[2],*$KPA++[2]
+||     STW     $K[3],*$KPB++[2]
+       STW     $K[4],*$KPA++[2]
+||     STW     $K[5],*$KPB++[2]
+       STW     $K[6],*$KPA++[2]
+||     STW     $K[7],*$KPB++[2]
+||     XOR     A31,$K[0],$K[0]                 ; ^=rcon[i]
+       .if     .BIG_ENDIAN
+       PACK2   $Te4[0],$Te4[1],$Te4[1]
+||     PACK2   $Te4[3],A0,$Te4[3]
+       PACKL4  $Te4[1],$Te4[3],$Te4[3]
+||[!B0]        B       done256?
+       .else
+       PACK2   $Te4[1],$Te4[0],$Te4[1]
+||     PACK2   $Te4[3],A0,$Te4[3]
+       PACKL4  $Te4[3],$Te4[1],$Te4[3]
+||[!B0]        B       done256?
+       .endif
+       XOR     $Te4[3],$K[0],$Te4[0]           ; K[0]
+       XOR     $Te4[0],$K[1],$K[1]             ; K[1]
+       MV      $Te4[0],$K[0]
+||     XOR     $K[1],$K[2],$Te4[2]             ; K[2]
+       XOR     $Te4[2],$K[3],$K[3]             ; K[3]
+
+       MV      $Te4[2],$K[2]
+|| [B0]        EXTU    $K[3],EXT0,24,$Te4[0]
+|| [B0]        SUB     B0,1,B0
+       LDBU    *${TEB}[$Te4[0]],$Te4[0]
+||     MV      $K[3],A0
+||     EXTU    $K[3],EXT1,24,$Te4[1]
+       LDBU    *${TEB}[$Te4[1]],$Te4[1]
+||     EXTU    A0,EXT2,24,A0
+||     EXTU    $K[3],EXT3,24,$Te4[3]
+
+       .if     .BIG_ENDIAN
+       LDBU    *${TEA}[A0],$Te4[3]
+||     LDBU    *${TEB}[$Te4[3]],A0
+       NOP     3
+       PACK2   $Te4[0],$Te4[1],$Te4[1]
+       PACK2   $Te4[3],A0,$Te4[3]
+||     B       loop256?
+       PACKL4  $Te4[1],$Te4[3],$Te4[3]
+       .else
+       LDBU    *${TEA}[A0],A0
+||     LDBU    *${TEB}[$Te4[3]],$Te4[3]
+       NOP     3
+       PACK2   $Te4[1],$Te4[0],$Te4[1]
+       PACK2   $Te4[3],A0,$Te4[3]
+||     B       loop256?
+       PACKL4  $Te4[3],$Te4[1],$Te4[3]
+       .endif
+
+       XOR     $Te4[3],$K[4],$Te4[0]           ; K[4]
+       XOR     $Te4[0],$K[5],$K[5]             ; K[5]
+       MV      $Te4[0],$K[4]
+||     XOR     $K[5],$K[6],$Te4[2]             ; K[6]
+       XOR     $Te4[2],$K[7],$K[7]             ; K[7]
+;;====================================================================
+done256?:
+       BNOP    RA                              ; return; next 5 packets are its delay slots
+       STW     $K[0],*$KPA++[2]
+||     STW     $K[1],*$KPB++[2]
+       STW     $K[2],*$KPA++[2]
+||     STW     $K[3],*$KPB++[2]
+       MVK     14,B0                           ; rounds
+       STW     B0,*--${KPB}[1]
+       MVK     0,RET
+       .endasmfunc
+
+       .global _AES_set_decrypt_key
+_AES_set_decrypt_key:                          ; decrypt key setup: run __set_encrypt_key, reverse the round-key order, then InvMixColumns the inner round keys; RET is set to 0 on the full path
+       .asmfunc
+       B       __set_encrypt_key               ; expand the key first; guarantee local call
+       MV      KEY,B30                         ; save KEY (used below as round-key base); B30 is not modified
+       MV      RA, B31                         ; save caller's return address; B31 is not modified
+       ADDKPC  ret?,RA,2                       ; make __set_encrypt_key return to ret? below
+ret?:                                          ; B0 holds rounds or zero
+  [!B0]        BNOP    B31                             ; return if zero (RET is left as __set_encrypt_key set it - presumably an error code, confirm)
+   [B0]        SHL     B0,4,A0                         ; offset to last round key (rounds*16 bytes)
+   [B0]        SHRU    B0,1,B1                         ; B1 = rounds/2 ...
+   [B0]        SUB     B1,1,B1                         ; ... minus 1, loaded into ILC for the flip loop
+   [B0]        MVK     0x0000001B,B3                   ; AES polynomial
+   [B0]        MVKH    0x07000000,B3                   ; B3 = 0x0700001B, written to GFPGFR further down
+
+       SPLOOPD 9                               ; flip round keys
+||     MVC     B1,ILC                          ; iteration count prepared above
+||     MV      B30,$KPA                        ; KPA starts at the first round key
+||     ADD     B30,A0,$KPB                     ; KPB starts at the last round key (reads A0 before the MVK below rewrites it)
+||     MVK     16,A0                           ; sizeof(round key)
+;;==== flip loop body: swap one 16-byte round key from each end, moving inward ====
+       LDW     *${KPA}[0],A16                  ; A16..A19 = round key at the low end
+||     LDW     *${KPB}[0],B16                  ; B16..B19 = round key at the high end
+       LDW     *${KPA}[1],A17
+||     LDW     *${KPB}[1],B17
+       LDW     *${KPA}[2],A18
+||     LDW     *${KPB}[2],B18
+       LDW     *${KPA}[3],A19
+||     ADD     $KPA,A0,$KPA                    ; low pointer advances by one round key
+||     LDW     *${KPB}[3],B19
+||     SUB     $KPB,A0,$KPB                    ; high pointer retreats by one round key
+       NOP
+       STW     B16,*${KPA}[-4]                 ; pointers already moved: [-4..-1] is the low slot just read
+||     STW     A16,*${KPB}[4]                  ; and [4..7] is the high slot just read
+       STW     B17,*${KPA}[-3]
+||     STW     A17,*${KPB}[5]
+       STW     B18,*${KPA}[-2]
+||     STW     A18,*${KPB}[6]
+       STW     B19,*${KPA}[-1]
+||     STW     A19,*${KPB}[7]
+       SPKERNEL
+;;==== set up InvMixColumns over round keys 1..rounds-1 (first and last are skipped) ====
+       SUB     B0,1,B0                         ; skip last round
+||     ADD     B30,A0,$KPA                     ; skip first round
+||     ADD     B30,A0,$KPB                     ; both pointers = base+16; KPA handles words 0,2 and KPB words 1,3
+||     MVC     GFPGFR,B30                      ; save GFPGFR
+       LDW     *${KPA}[0],$K[0]
+||     LDW     *${KPB}[1],$K[1]
+||     MVC     B3,GFPGFR                       ; select the AES polynomial for GMPY4
+       LDW     *${KPA}[2],$K[2]
+||     LDW     *${KPB}[3],$K[3]
+       MVK     0x00000909,A24
+||     MVK     0x00000B0B,B24
+       MVKH    0x09090000,A24                  ; A24 = 0x09090909
+||     MVKH    0x0B0B0000,B24                  ; B24 = 0x0B0B0B0B
+       MVC     B0,ILC                          ; loop count = rounds-1 (value of B0 before the SUB in this packet)
+||     SUB     B0,1,B0                         ; B0 now gates fetching/pre-multiplying the next round key
+
+       GMPY4   $K[0],A24,$Kx9[0]               ; K * 0x09 (per byte)
+||     GMPY4   $K[1],A24,$Kx9[1]
+||     MVK     0x00000D0D,A25
+||     MVK     0x00000E0E,B25
+       GMPY4   $K[2],A24,$Kx9[2]
+||     GMPY4   $K[3],A24,$Kx9[3]
+||     MVKH    0x0D0D0000,A25                  ; A25 = 0x0D0D0D0D
+||     MVKH    0x0E0E0000,B25                  ; B25 = 0x0E0E0E0E
+
+       GMPY4   $K[0],B24,$KxB[0]               ; K * 0x0B (per byte)
+||     GMPY4   $K[1],B24,$KxB[1]
+       GMPY4   $K[2],B24,$KxB[2]
+||     GMPY4   $K[3],B24,$KxB[3]
+
+       SPLOOP  11                              ; InvMixColumns
+;;==== loop body: result = Kx0E ^ rot16(Kx0D) ^ rotl(Kx0B ^ rot16(Kx09), TBL3), one round key per pass ====
+       GMPY4   $K[0],A25,$KxD[0]               ; K * 0x0D (per byte)
+||     GMPY4   $K[1],A25,$KxD[1]
+||     SWAP2   $Kx9[0],$Kx9[0]                 ; rotate by 16
+||     SWAP2   $Kx9[1],$Kx9[1]
+||     MV      $K[0],$s[0]                     ; this or DINT
+||     MV      $K[1],$s[1]
+|| [B0]        LDW     *${KPA}[4],$K[0]                ; prefetch next round key while B0 is non-zero
+|| [B0]        LDW     *${KPB}[5],$K[1]
+       GMPY4   $K[2],A25,$KxD[2]
+||     GMPY4   $K[3],A25,$KxD[3]
+||     SWAP2   $Kx9[2],$Kx9[2]
+||     SWAP2   $Kx9[3],$Kx9[3]
+||     MV      $K[2],$s[2]
+||     MV      $K[3],$s[3]
+|| [B0]        LDW     *${KPA}[6],$K[2]
+|| [B0]        LDW     *${KPB}[7],$K[3]
+
+       GMPY4   $s[0],B25,$KxE[0]               ; K * 0x0E (per byte), from the saved copy in s
+||     GMPY4   $s[1],B25,$KxE[1]
+||     XOR     $Kx9[0],$KxB[0],$KxB[0]         ; KxB ^= rot16(Kx9)
+||     XOR     $Kx9[1],$KxB[1],$KxB[1]
+       GMPY4   $s[2],B25,$KxE[2]
+||     GMPY4   $s[3],B25,$KxE[3]
+||     XOR     $Kx9[2],$KxB[2],$KxB[2]
+||     XOR     $Kx9[3],$KxB[3],$KxB[3]
+
+       ROTL    $KxB[0],TBL3,$KxB[0]            ; TBL3 is defined outside this section - presumably a byte-rotation amount, confirm
+||     ROTL    $KxB[1],TBL3,$KxB[1]
+||     SWAP2   $KxD[0],$KxD[0]                 ; rotate by 16
+||     SWAP2   $KxD[1],$KxD[1]
+       ROTL    $KxB[2],TBL3,$KxB[2]
+||     ROTL    $KxB[3],TBL3,$KxB[3]
+||     SWAP2   $KxD[2],$KxD[2]
+||     SWAP2   $KxD[3],$KxD[3]
+
+       XOR     $KxE[0],$KxD[0],$KxE[0]         ; KxE ^= rot16(KxD)
+||     XOR     $KxE[1],$KxD[1],$KxE[1]
+|| [B0]        GMPY4   $K[0],A24,$Kx9[0]               ; next round key * 0x09
+|| [B0]        GMPY4   $K[1],A24,$Kx9[1]
+||     ADDAW   $KPA,4,$KPA                     ; advance one round key (4 words)
+       XOR     $KxE[2],$KxD[2],$KxE[2]
+||     XOR     $KxE[3],$KxD[3],$KxE[3]
+|| [B0]        GMPY4   $K[2],A24,$Kx9[2]
+|| [B0]        GMPY4   $K[3],A24,$Kx9[3]
+||     ADDAW   $KPB,4,$KPB                     ; advance one round key (4 words)
+
+       XOR     $KxB[0],$KxE[0],$KxE[0]         ; fold in the rotated KxB term: final words 0,1
+||     XOR     $KxB[1],$KxE[1],$KxE[1]
+|| [B0]        GMPY4   $K[0],B24,$KxB[0]               ; next round key * 0x0B
+|| [B0]        GMPY4   $K[1],B24,$KxB[1]
+       XOR     $KxB[2],$KxE[2],$KxE[2]         ; final words 2,3
+||     XOR     $KxB[3],$KxE[3],$KxE[3]
+|| [B0]        GMPY4   $K[2],B24,$KxB[2]
+|| [B0]        GMPY4   $K[3],B24,$KxB[3]
+||     STW     $KxE[0],*${KPA}[-4]             ; write back in place (pointers were already advanced)
+||     STW     $KxE[1],*${KPB}[-3]
+       STW     $KxE[2],*${KPA}[-2]
+||     STW     $KxE[3],*${KPB}[-1]
+|| [B0]        SUB     B0,1,B0                         ; count down remaining prefetches
+       SPKERNEL
+;;==== epilogue: return through the address saved in B31 ====
+       BNOP    B31,3                           ; return to caller; the two instructions below still execute before the branch lands
+       MVC     B30,GFPGFR                      ; restore GFPGFR(*)
+       MVK     0,RET                           ; return value 0
+       .endasmfunc
+___
+# (*)  Even though ABI doesn't specify GFPGFR as non-volatile, there
+#      are code samples out there that *assume* its default value.
+}
+{
+my ($inp,$out,$blocks,$key,$ivp)=("A4","B4","A6","B6","A8");
+$code.=<<___;
+       .global _AES_ctr32_encrypt
+_AES_ctr32_encrypt:                            ; CTR mode: for each of the blocks, encrypt the counter block via __encrypt, XOR with 16 input bytes, store to output, bump the 32-bit counter word
+       .asmfunc
+       LDNDW   *${ivp}[0],A31:A30      ; load counter value (first 8 bytes)
+||     MV      $blocks,A2              ; reassign $blocks
+||     DMV     RA,$key,B27:B26         ; reassign RA and $key (B27 = RA, B26 = key)
+       LDNDW   *${ivp}[1],B31:B30      ; remaining 8 bytes of the counter block
+||     MVK     0,B2                    ; don't let __encrypt load input
+||     MVK     0,A1                    ; and postpone writing output
+       .if     .BIG_ENDIAN
+       NOP
+       .else
+       NOP     4                       ; wait for the counter load to land in B31
+       SWAP2   B31,B31                 ; keep least significant 32 bits
+       SWAP4   B31,B31                 ; in host byte order
+       .endif
+ctr32_loop?:                                   ; A2 = blocks left; A1 = 0 on the first pass only (no output to store yet)
+   [A2]        BNOP    __encrypt               ; encrypt the counter block while blocks remain
+|| [A1]        XOR     A29,A9,A9               ; input^Ek(counter)
+|| [A1]        XOR     A28,A8,A8
+|| [A2]        LDNDW   *INP++,A29:A28          ; load input
+  [!A2]        BNOP    B27                     ; return
+|| [A1]        XOR     B29,B9,B9
+|| [A1]        XOR     B28,B8,B8
+|| [A2]        LDNDW   *INP++,B29:B28          ; second half of the input block
+       .if     .BIG_ENDIAN
+   [A1]        STNDW   A9:A8,*OUT++            ; save output
+|| [A2]        DMV     A31,A30,A9:A8           ; pass counter value to __encrypt
+   [A1]        STNDW   B9:B8,*OUT++
+|| [A2]        DMV     B31,B30,B9:B8           ; DMV reads B30 before the ADD in this packet updates it
+|| [A2]        ADD     B30,1,B30               ; counter++
+       .else
+   [A1]        STNDW   A9:A8,*OUT++            ; save output
+|| [A2]        DMV     A31,A30,A9:A8           ; pass counter value to __encrypt
+|| [A2]        SWAP2   B31,B0                  ; start converting the host-order counter word back ...
+|| [A2]        ADD     B31,1,B31               ; counter++
+   [A1]        STNDW   B9:B8,*OUT++
+|| [A2]        MV      B30,B8
+|| [A2]        SWAP4   B0,B9                   ; ... to its in-memory byte order for __encrypt
+       .endif
+   [A2]        ADDKPC  ctr32_loop?,RA          ; return to ctr32_loop?
+|| [A2]        MV      B26,KEY                 ; pass $key
+|| [A2]        SUB     A2,1,A2                 ; $blocks--
+||[!A1]        MVK     1,A1                    ; from now on every pass has output to XOR and store
+       NOP
+       NOP
+       .endasmfunc
+___
+}
+# Tables are kept in endian-neutral manner
+$code.=<<___;
+       .if     __TI_EABI__
+       .sect   ".text:aes_asm.const"
+       .else
+       .sect   ".const:aes_asm"
+       .endif
+       .align  128
+AES_Te:                                        ; encryption T-table: 256 entries of 4 bytes; rows below look like {2*S, S, S, 3*S} per S-box value S (GF(2^8) multiples) - confirm against the lookup code
+       .byte   0xc6,0x63,0x63,0xa5,    0xf8,0x7c,0x7c,0x84
+       .byte   0xee,0x77,0x77,0x99,    0xf6,0x7b,0x7b,0x8d
+       .byte   0xff,0xf2,0xf2,0x0d,    0xd6,0x6b,0x6b,0xbd
+       .byte   0xde,0x6f,0x6f,0xb1,    0x91,0xc5,0xc5,0x54
+       .byte   0x60,0x30,0x30,0x50,    0x02,0x01,0x01,0x03
+       .byte   0xce,0x67,0x67,0xa9,    0x56,0x2b,0x2b,0x7d
+       .byte   0xe7,0xfe,0xfe,0x19,    0xb5,0xd7,0xd7,0x62
+       .byte   0x4d,0xab,0xab,0xe6,    0xec,0x76,0x76,0x9a
+       .byte   0x8f,0xca,0xca,0x45,    0x1f,0x82,0x82,0x9d
+       .byte   0x89,0xc9,0xc9,0x40,    0xfa,0x7d,0x7d,0x87
+       .byte   0xef,0xfa,0xfa,0x15,    0xb2,0x59,0x59,0xeb
+       .byte   0x8e,0x47,0x47,0xc9,    0xfb,0xf0,0xf0,0x0b
+       .byte   0x41,0xad,0xad,0xec,    0xb3,0xd4,0xd4,0x67
+       .byte   0x5f,0xa2,0xa2,0xfd,    0x45,0xaf,0xaf,0xea
+       .byte   0x23,0x9c,0x9c,0xbf,    0x53,0xa4,0xa4,0xf7
+       .byte   0xe4,0x72,0x72,0x96,    0x9b,0xc0,0xc0,0x5b
+       .byte   0x75,0xb7,0xb7,0xc2,    0xe1,0xfd,0xfd,0x1c
+       .byte   0x3d,0x93,0x93,0xae,    0x4c,0x26,0x26,0x6a
+       .byte   0x6c,0x36,0x36,0x5a,    0x7e,0x3f,0x3f,0x41
+       .byte   0xf5,0xf7,0xf7,0x02,    0x83,0xcc,0xcc,0x4f
+       .byte   0x68,0x34,0x34,0x5c,    0x51,0xa5,0xa5,0xf4
+       .byte   0xd1,0xe5,0xe5,0x34,    0xf9,0xf1,0xf1,0x08
+       .byte   0xe2,0x71,0x71,0x93,    0xab,0xd8,0xd8,0x73
+       .byte   0x62,0x31,0x31,0x53,    0x2a,0x15,0x15,0x3f
+       .byte   0x08,0x04,0x04,0x0c,    0x95,0xc7,0xc7,0x52
+       .byte   0x46,0x23,0x23,0x65,    0x9d,0xc3,0xc3,0x5e
+       .byte   0x30,0x18,0x18,0x28,    0x37,0x96,0x96,0xa1
+       .byte   0x0a,0x05,0x05,0x0f,    0x2f,0x9a,0x9a,0xb5
+       .byte   0x0e,0x07,0x07,0x09,    0x24,0x12,0x12,0x36
+       .byte   0x1b,0x80,0x80,0x9b,    0xdf,0xe2,0xe2,0x3d
+       .byte   0xcd,0xeb,0xeb,0x26,    0x4e,0x27,0x27,0x69
+       .byte   0x7f,0xb2,0xb2,0xcd,    0xea,0x75,0x75,0x9f
+       .byte   0x12,0x09,0x09,0x1b,    0x1d,0x83,0x83,0x9e
+       .byte   0x58,0x2c,0x2c,0x74,    0x34,0x1a,0x1a,0x2e
+       .byte   0x36,0x1b,0x1b,0x2d,    0xdc,0x6e,0x6e,0xb2
+       .byte   0xb4,0x5a,0x5a,0xee,    0x5b,0xa0,0xa0,0xfb
+       .byte   0xa4,0x52,0x52,0xf6,    0x76,0x3b,0x3b,0x4d
+       .byte   0xb7,0xd6,0xd6,0x61,    0x7d,0xb3,0xb3,0xce
+       .byte   0x52,0x29,0x29,0x7b,    0xdd,0xe3,0xe3,0x3e
+       .byte   0x5e,0x2f,0x2f,0x71,    0x13,0x84,0x84,0x97
+       .byte   0xa6,0x53,0x53,0xf5,    0xb9,0xd1,0xd1,0x68
+       .byte   0x00,0x00,0x00,0x00,    0xc1,0xed,0xed,0x2c
+       .byte   0x40,0x20,0x20,0x60,    0xe3,0xfc,0xfc,0x1f
+       .byte   0x79,0xb1,0xb1,0xc8,    0xb6,0x5b,0x5b,0xed
+       .byte   0xd4,0x6a,0x6a,0xbe,    0x8d,0xcb,0xcb,0x46
+       .byte   0x67,0xbe,0xbe,0xd9,    0x72,0x39,0x39,0x4b
+       .byte   0x94,0x4a,0x4a,0xde,    0x98,0x4c,0x4c,0xd4
+       .byte   0xb0,0x58,0x58,0xe8,    0x85,0xcf,0xcf,0x4a
+       .byte   0xbb,0xd0,0xd0,0x6b,    0xc5,0xef,0xef,0x2a
+       .byte   0x4f,0xaa,0xaa,0xe5,    0xed,0xfb,0xfb,0x16
+       .byte   0x86,0x43,0x43,0xc5,    0x9a,0x4d,0x4d,0xd7
+       .byte   0x66,0x33,0x33,0x55,    0x11,0x85,0x85,0x94
+       .byte   0x8a,0x45,0x45,0xcf,    0xe9,0xf9,0xf9,0x10
+       .byte   0x04,0x02,0x02,0x06,    0xfe,0x7f,0x7f,0x81
+       .byte   0xa0,0x50,0x50,0xf0,    0x78,0x3c,0x3c,0x44
+       .byte   0x25,0x9f,0x9f,0xba,    0x4b,0xa8,0xa8,0xe3
+       .byte   0xa2,0x51,0x51,0xf3,    0x5d,0xa3,0xa3,0xfe
+       .byte   0x80,0x40,0x40,0xc0,    0x05,0x8f,0x8f,0x8a
+       .byte   0x3f,0x92,0x92,0xad,    0x21,0x9d,0x9d,0xbc
+       .byte   0x70,0x38,0x38,0x48,    0xf1,0xf5,0xf5,0x04
+       .byte   0x63,0xbc,0xbc,0xdf,    0x77,0xb6,0xb6,0xc1
+       .byte   0xaf,0xda,0xda,0x75,    0x42,0x21,0x21,0x63
+       .byte   0x20,0x10,0x10,0x30,    0xe5,0xff,0xff,0x1a
+       .byte   0xfd,0xf3,0xf3,0x0e,    0xbf,0xd2,0xd2,0x6d
+       .byte   0x81,0xcd,0xcd,0x4c,    0x18,0x0c,0x0c,0x14
+       .byte   0x26,0x13,0x13,0x35,    0xc3,0xec,0xec,0x2f
+       .byte   0xbe,0x5f,0x5f,0xe1,    0x35,0x97,0x97,0xa2
+       .byte   0x88,0x44,0x44,0xcc,    0x2e,0x17,0x17,0x39
+       .byte   0x93,0xc4,0xc4,0x57,    0x55,0xa7,0xa7,0xf2
+       .byte   0xfc,0x7e,0x7e,0x82,    0x7a,0x3d,0x3d,0x47
+       .byte   0xc8,0x64,0x64,0xac,    0xba,0x5d,0x5d,0xe7
+       .byte   0x32,0x19,0x19,0x2b,    0xe6,0x73,0x73,0x95
+       .byte   0xc0,0x60,0x60,0xa0,    0x19,0x81,0x81,0x98
+       .byte   0x9e,0x4f,0x4f,0xd1,    0xa3,0xdc,0xdc,0x7f
+       .byte   0x44,0x22,0x22,0x66,    0x54,0x2a,0x2a,0x7e
+       .byte   0x3b,0x90,0x90,0xab,    0x0b,0x88,0x88,0x83
+       .byte   0x8c,0x46,0x46,0xca,    0xc7,0xee,0xee,0x29
+       .byte   0x6b,0xb8,0xb8,0xd3,    0x28,0x14,0x14,0x3c
+       .byte   0xa7,0xde,0xde,0x79,    0xbc,0x5e,0x5e,0xe2
+       .byte   0x16,0x0b,0x0b,0x1d,    0xad,0xdb,0xdb,0x76
+       .byte   0xdb,0xe0,0xe0,0x3b,    0x64,0x32,0x32,0x56
+       .byte   0x74,0x3a,0x3a,0x4e,    0x14,0x0a,0x0a,0x1e
+       .byte   0x92,0x49,0x49,0xdb,    0x0c,0x06,0x06,0x0a
+       .byte   0x48,0x24,0x24,0x6c,    0xb8,0x5c,0x5c,0xe4
+       .byte   0x9f,0xc2,0xc2,0x5d,    0xbd,0xd3,0xd3,0x6e
+       .byte   0x43,0xac,0xac,0xef,    0xc4,0x62,0x62,0xa6
+       .byte   0x39,0x91,0x91,0xa8,    0x31,0x95,0x95,0xa4
+       .byte   0xd3,0xe4,0xe4,0x37,    0xf2,0x79,0x79,0x8b
+       .byte   0xd5,0xe7,0xe7,0x32,    0x8b,0xc8,0xc8,0x43
+       .byte   0x6e,0x37,0x37,0x59,    0xda,0x6d,0x6d,0xb7
+       .byte   0x01,0x8d,0x8d,0x8c,    0xb1,0xd5,0xd5,0x64
+       .byte   0x9c,0x4e,0x4e,0xd2,    0x49,0xa9,0xa9,0xe0
+       .byte   0xd8,0x6c,0x6c,0xb4,    0xac,0x56,0x56,0xfa
+       .byte   0xf3,0xf4,0xf4,0x07,    0xcf,0xea,0xea,0x25
+       .byte   0xca,0x65,0x65,0xaf,    0xf4,0x7a,0x7a,0x8e
+       .byte   0x47,0xae,0xae,0xe9,    0x10,0x08,0x08,0x18
+       .byte   0x6f,0xba,0xba,0xd5,    0xf0,0x78,0x78,0x88
+       .byte   0x4a,0x25,0x25,0x6f,    0x5c,0x2e,0x2e,0x72
+       .byte   0x38,0x1c,0x1c,0x24,    0x57,0xa6,0xa6,0xf1
+       .byte   0x73,0xb4,0xb4,0xc7,    0x97,0xc6,0xc6,0x51
+       .byte   0xcb,0xe8,0xe8,0x23,    0xa1,0xdd,0xdd,0x7c
+       .byte   0xe8,0x74,0x74,0x9c,    0x3e,0x1f,0x1f,0x21
+       .byte   0x96,0x4b,0x4b,0xdd,    0x61,0xbd,0xbd,0xdc
+       .byte   0x0d,0x8b,0x8b,0x86,    0x0f,0x8a,0x8a,0x85
+       .byte   0xe0,0x70,0x70,0x90,    0x7c,0x3e,0x3e,0x42
+       .byte   0x71,0xb5,0xb5,0xc4,    0xcc,0x66,0x66,0xaa
+       .byte   0x90,0x48,0x48,0xd8,    0x06,0x03,0x03,0x05
+       .byte   0xf7,0xf6,0xf6,0x01,    0x1c,0x0e,0x0e,0x12
+       .byte   0xc2,0x61,0x61,0xa3,    0x6a,0x35,0x35,0x5f
+       .byte   0xae,0x57,0x57,0xf9,    0x69,0xb9,0xb9,0xd0
+       .byte   0x17,0x86,0x86,0x91,    0x99,0xc1,0xc1,0x58
+       .byte   0x3a,0x1d,0x1d,0x27,    0x27,0x9e,0x9e,0xb9
+       .byte   0xd9,0xe1,0xe1,0x38,    0xeb,0xf8,0xf8,0x13
+       .byte   0x2b,0x98,0x98,0xb3,    0x22,0x11,0x11,0x33
+       .byte   0xd2,0x69,0x69,0xbb,    0xa9,0xd9,0xd9,0x70
+       .byte   0x07,0x8e,0x8e,0x89,    0x33,0x94,0x94,0xa7
+       .byte   0x2d,0x9b,0x9b,0xb6,    0x3c,0x1e,0x1e,0x22
+       .byte   0x15,0x87,0x87,0x92,    0xc9,0xe9,0xe9,0x20
+       .byte   0x87,0xce,0xce,0x49,    0xaa,0x55,0x55,0xff
+       .byte   0x50,0x28,0x28,0x78,    0xa5,0xdf,0xdf,0x7a
+       .byte   0x03,0x8c,0x8c,0x8f,    0x59,0xa1,0xa1,0xf8
+       .byte   0x09,0x89,0x89,0x80,    0x1a,0x0d,0x0d,0x17
+       .byte   0x65,0xbf,0xbf,0xda,    0xd7,0xe6,0xe6,0x31
+       .byte   0x84,0x42,0x42,0xc6,    0xd0,0x68,0x68,0xb8
+       .byte   0x82,0x41,0x41,0xc3,    0x29,0x99,0x99,0xb0
+       .byte   0x5a,0x2d,0x2d,0x77,    0x1e,0x0f,0x0f,0x11
+       .byte   0x7b,0xb0,0xb0,0xcb,    0xa8,0x54,0x54,0xfc
+       .byte   0x6d,0xbb,0xbb,0xd6,    0x2c,0x16,0x16,0x3a
+AES_Te4:                                       ; 256 single-byte entries: the AES forward S-box (0x63,0x7c,0x77,...)
+       .byte   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+       .byte   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+       .byte   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+       .byte   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+       .byte   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+       .byte   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+       .byte   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+       .byte   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+       .byte   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+       .byte   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+       .byte   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+       .byte   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+       .byte   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+       .byte   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+       .byte   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+       .byte   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+       .byte   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+       .byte   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+       .byte   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+       .byte   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+       .byte   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+       .byte   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+       .byte   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+       .byte   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+       .byte   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+       .byte   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+       .byte   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+       .byte   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+       .byte   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+       .byte   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+       .byte   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+       .byte   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+rcon:                                          ; key-schedule round constants: ten 4-byte entries, constant in the first byte, remaining three bytes zero
+       .byte   0x01,0x00,0x00,0x00,    0x02,0x00,0x00,0x00
+       .byte   0x04,0x00,0x00,0x00,    0x08,0x00,0x00,0x00
+       .byte   0x10,0x00,0x00,0x00,    0x20,0x00,0x00,0x00
+       .byte   0x40,0x00,0x00,0x00,    0x80,0x00,0x00,0x00
+       .byte   0x1B,0x00,0x00,0x00,    0x36,0x00,0x00,0x00
+       .align  128
+       .align  128
+AES_Td:                                        ; decryption T-table: 256 entries of 4 bytes; presumably {0E,09,0D,0B} multiples of the inverse S-box value - confirm against the lookup code
+       .byte   0x51,0xf4,0xa7,0x50,    0x7e,0x41,0x65,0x53
+       .byte   0x1a,0x17,0xa4,0xc3,    0x3a,0x27,0x5e,0x96
+       .byte   0x3b,0xab,0x6b,0xcb,    0x1f,0x9d,0x45,0xf1
+       .byte   0xac,0xfa,0x58,0xab,    0x4b,0xe3,0x03,0x93
+       .byte   0x20,0x30,0xfa,0x55,    0xad,0x76,0x6d,0xf6
+       .byte   0x88,0xcc,0x76,0x91,    0xf5,0x02,0x4c,0x25
+       .byte   0x4f,0xe5,0xd7,0xfc,    0xc5,0x2a,0xcb,0xd7
+       .byte   0x26,0x35,0x44,0x80,    0xb5,0x62,0xa3,0x8f
+       .byte   0xde,0xb1,0x5a,0x49,    0x25,0xba,0x1b,0x67
+       .byte   0x45,0xea,0x0e,0x98,    0x5d,0xfe,0xc0,0xe1
+       .byte   0xc3,0x2f,0x75,0x02,    0x81,0x4c,0xf0,0x12
+       .byte   0x8d,0x46,0x97,0xa3,    0x6b,0xd3,0xf9,0xc6
+       .byte   0x03,0x8f,0x5f,0xe7,    0x15,0x92,0x9c,0x95
+       .byte   0xbf,0x6d,0x7a,0xeb,    0x95,0x52,0x59,0xda
+       .byte   0xd4,0xbe,0x83,0x2d,    0x58,0x74,0x21,0xd3
+       .byte   0x49,0xe0,0x69,0x29,    0x8e,0xc9,0xc8,0x44
+       .byte   0x75,0xc2,0x89,0x6a,    0xf4,0x8e,0x79,0x78
+       .byte   0x99,0x58,0x3e,0x6b,    0x27,0xb9,0x71,0xdd
+       .byte   0xbe,0xe1,0x4f,0xb6,    0xf0,0x88,0xad,0x17
+       .byte   0xc9,0x20,0xac,0x66,    0x7d,0xce,0x3a,0xb4
+       .byte   0x63,0xdf,0x4a,0x18,    0xe5,0x1a,0x31,0x82
+       .byte   0x97,0x51,0x33,0x60,    0x62,0x53,0x7f,0x45
+       .byte   0xb1,0x64,0x77,0xe0,    0xbb,0x6b,0xae,0x84
+       .byte   0xfe,0x81,0xa0,0x1c,    0xf9,0x08,0x2b,0x94
+       .byte   0x70,0x48,0x68,0x58,    0x8f,0x45,0xfd,0x19
+       .byte   0x94,0xde,0x6c,0x87,    0x52,0x7b,0xf8,0xb7
+       .byte   0xab,0x73,0xd3,0x23,    0x72,0x4b,0x02,0xe2
+       .byte   0xe3,0x1f,0x8f,0x57,    0x66,0x55,0xab,0x2a
+       .byte   0xb2,0xeb,0x28,0x07,    0x2f,0xb5,0xc2,0x03
+       .byte   0x86,0xc5,0x7b,0x9a,    0xd3,0x37,0x08,0xa5
+       .byte   0x30,0x28,0x87,0xf2,    0x23,0xbf,0xa5,0xb2
+       .byte   0x02,0x03,0x6a,0xba,    0xed,0x16,0x82,0x5c
+       .byte   0x8a,0xcf,0x1c,0x2b,    0xa7,0x79,0xb4,0x92
+       .byte   0xf3,0x07,0xf2,0xf0,    0x4e,0x69,0xe2,0xa1
+       .byte   0x65,0xda,0xf4,0xcd,    0x06,0x05,0xbe,0xd5
+       .byte   0xd1,0x34,0x62,0x1f,    0xc4,0xa6,0xfe,0x8a
+       .byte   0x34,0x2e,0x53,0x9d,    0xa2,0xf3,0x55,0xa0
+       .byte   0x05,0x8a,0xe1,0x32,    0xa4,0xf6,0xeb,0x75
+       .byte   0x0b,0x83,0xec,0x39,    0x40,0x60,0xef,0xaa
+       .byte   0x5e,0x71,0x9f,0x06,    0xbd,0x6e,0x10,0x51
+       .byte   0x3e,0x21,0x8a,0xf9,    0x96,0xdd,0x06,0x3d
+       .byte   0xdd,0x3e,0x05,0xae,    0x4d,0xe6,0xbd,0x46
+       .byte   0x91,0x54,0x8d,0xb5,    0x71,0xc4,0x5d,0x05
+       .byte   0x04,0x06,0xd4,0x6f,    0x60,0x50,0x15,0xff
+       .byte   0x19,0x98,0xfb,0x24,    0xd6,0xbd,0xe9,0x97
+       .byte   0x89,0x40,0x43,0xcc,    0x67,0xd9,0x9e,0x77
+       .byte   0xb0,0xe8,0x42,0xbd,    0x07,0x89,0x8b,0x88
+       .byte   0xe7,0x19,0x5b,0x38,    0x79,0xc8,0xee,0xdb
+       .byte   0xa1,0x7c,0x0a,0x47,    0x7c,0x42,0x0f,0xe9
+       .byte   0xf8,0x84,0x1e,0xc9,    0x00,0x00,0x00,0x00
+       .byte   0x09,0x80,0x86,0x83,    0x32,0x2b,0xed,0x48
+       .byte   0x1e,0x11,0x70,0xac,    0x6c,0x5a,0x72,0x4e
+       .byte   0xfd,0x0e,0xff,0xfb,    0x0f,0x85,0x38,0x56
+       .byte   0x3d,0xae,0xd5,0x1e,    0x36,0x2d,0x39,0x27
+       .byte   0x0a,0x0f,0xd9,0x64,    0x68,0x5c,0xa6,0x21
+       .byte   0x9b,0x5b,0x54,0xd1,    0x24,0x36,0x2e,0x3a
+       .byte   0x0c,0x0a,0x67,0xb1,    0x93,0x57,0xe7,0x0f
+       .byte   0xb4,0xee,0x96,0xd2,    0x1b,0x9b,0x91,0x9e
+       .byte   0x80,0xc0,0xc5,0x4f,    0x61,0xdc,0x20,0xa2
+       .byte   0x5a,0x77,0x4b,0x69,    0x1c,0x12,0x1a,0x16
+       .byte   0xe2,0x93,0xba,0x0a,    0xc0,0xa0,0x2a,0xe5
+       .byte   0x3c,0x22,0xe0,0x43,    0x12,0x1b,0x17,0x1d
+       .byte   0x0e,0x09,0x0d,0x0b,    0xf2,0x8b,0xc7,0xad
+       .byte   0x2d,0xb6,0xa8,0xb9,    0x14,0x1e,0xa9,0xc8
+       .byte   0x57,0xf1,0x19,0x85,    0xaf,0x75,0x07,0x4c
+       .byte   0xee,0x99,0xdd,0xbb,    0xa3,0x7f,0x60,0xfd
+       .byte   0xf7,0x01,0x26,0x9f,    0x5c,0x72,0xf5,0xbc
+       .byte   0x44,0x66,0x3b,0xc5,    0x5b,0xfb,0x7e,0x34
+       .byte   0x8b,0x43,0x29,0x76,    0xcb,0x23,0xc6,0xdc
+       .byte   0xb6,0xed,0xfc,0x68,    0xb8,0xe4,0xf1,0x63
+       .byte   0xd7,0x31,0xdc,0xca,    0x42,0x63,0x85,0x10
+       .byte   0x13,0x97,0x22,0x40,    0x84,0xc6,0x11,0x20
+       .byte   0x85,0x4a,0x24,0x7d,    0xd2,0xbb,0x3d,0xf8
+       .byte   0xae,0xf9,0x32,0x11,    0xc7,0x29,0xa1,0x6d
+       .byte   0x1d,0x9e,0x2f,0x4b,    0xdc,0xb2,0x30,0xf3
+       .byte   0x0d,0x86,0x52,0xec,    0x77,0xc1,0xe3,0xd0
+       .byte   0x2b,0xb3,0x16,0x6c,    0xa9,0x70,0xb9,0x99
+       .byte   0x11,0x94,0x48,0xfa,    0x47,0xe9,0x64,0x22
+       .byte   0xa8,0xfc,0x8c,0xc4,    0xa0,0xf0,0x3f,0x1a
+       .byte   0x56,0x7d,0x2c,0xd8,    0x22,0x33,0x90,0xef
+       .byte   0x87,0x49,0x4e,0xc7,    0xd9,0x38,0xd1,0xc1
+       .byte   0x8c,0xca,0xa2,0xfe,    0x98,0xd4,0x0b,0x36
+       .byte   0xa6,0xf5,0x81,0xcf,    0xa5,0x7a,0xde,0x28
+       .byte   0xda,0xb7,0x8e,0x26,    0x3f,0xad,0xbf,0xa4
+       .byte   0x2c,0x3a,0x9d,0xe4,    0x50,0x78,0x92,0x0d
+       .byte   0x6a,0x5f,0xcc,0x9b,    0x54,0x7e,0x46,0x62
+       .byte   0xf6,0x8d,0x13,0xc2,    0x90,0xd8,0xb8,0xe8
+       .byte   0x2e,0x39,0xf7,0x5e,    0x82,0xc3,0xaf,0xf5
+       .byte   0x9f,0x5d,0x80,0xbe,    0x69,0xd0,0x93,0x7c
+       .byte   0x6f,0xd5,0x2d,0xa9,    0xcf,0x25,0x12,0xb3
+       .byte   0xc8,0xac,0x99,0x3b,    0x10,0x18,0x7d,0xa7
+       .byte   0xe8,0x9c,0x63,0x6e,    0xdb,0x3b,0xbb,0x7b
+       .byte   0xcd,0x26,0x78,0x09,    0x6e,0x59,0x18,0xf4
+       .byte   0xec,0x9a,0xb7,0x01,    0x83,0x4f,0x9a,0xa8
+       .byte   0xe6,0x95,0x6e,0x65,    0xaa,0xff,0xe6,0x7e
+       .byte   0x21,0xbc,0xcf,0x08,    0xef,0x15,0xe8,0xe6
+       .byte   0xba,0xe7,0x9b,0xd9,    0x4a,0x6f,0x36,0xce
+       .byte   0xea,0x9f,0x09,0xd4,    0x29,0xb0,0x7c,0xd6
+       .byte   0x31,0xa4,0xb2,0xaf,    0x2a,0x3f,0x23,0x31
+       .byte   0xc6,0xa5,0x94,0x30,    0x35,0xa2,0x66,0xc0
+       .byte   0x74,0x4e,0xbc,0x37,    0xfc,0x82,0xca,0xa6
+       .byte   0xe0,0x90,0xd0,0xb0,    0x33,0xa7,0xd8,0x15
+       .byte   0xf1,0x04,0x98,0x4a,    0x41,0xec,0xda,0xf7
+       .byte   0x7f,0xcd,0x50,0x0e,    0x17,0x91,0xf6,0x2f
+       .byte   0x76,0x4d,0xd6,0x8d,    0x43,0xef,0xb0,0x4d
+       .byte   0xcc,0xaa,0x4d,0x54,    0xe4,0x96,0x04,0xdf
+       .byte   0x9e,0xd1,0xb5,0xe3,    0x4c,0x6a,0x88,0x1b
+       .byte   0xc1,0x2c,0x1f,0xb8,    0x46,0x65,0x51,0x7f
+       .byte   0x9d,0x5e,0xea,0x04,    0x01,0x8c,0x35,0x5d
+       .byte   0xfa,0x87,0x74,0x73,    0xfb,0x0b,0x41,0x2e
+       .byte   0xb3,0x67,0x1d,0x5a,    0x92,0xdb,0xd2,0x52
+       .byte   0xe9,0x10,0x56,0x33,    0x6d,0xd6,0x47,0x13
+       .byte   0x9a,0xd7,0x61,0x8c,    0x37,0xa1,0x0c,0x7a
+       .byte   0x59,0xf8,0x14,0x8e,    0xeb,0x13,0x3c,0x89
+       .byte   0xce,0xa9,0x27,0xee,    0xb7,0x61,0xc9,0x35
+       .byte   0xe1,0x1c,0xe5,0xed,    0x7a,0x47,0xb1,0x3c
+       .byte   0x9c,0xd2,0xdf,0x59,    0x55,0xf2,0x73,0x3f
+       .byte   0x18,0x14,0xce,0x79,    0x73,0xc7,0x37,0xbf
+       .byte   0x53,0xf7,0xcd,0xea,    0x5f,0xfd,0xaa,0x5b
+       .byte   0xdf,0x3d,0x6f,0x14,    0x78,0x44,0xdb,0x86
+       .byte   0xca,0xaf,0xf3,0x81,    0xb9,0x68,0xc4,0x3e
+       .byte   0x38,0x24,0x34,0x2c,    0xc2,0xa3,0x40,0x5f
+       .byte   0x16,0x1d,0xc3,0x72,    0xbc,0xe2,0x25,0x0c
+       .byte   0x28,0x3c,0x49,0x8b,    0xff,0x0d,0x95,0x41
+       .byte   0x39,0xa8,0x01,0x71,    0x08,0x0c,0xb3,0xde
+       .byte   0xd8,0xb4,0xe4,0x9c,    0x64,0x56,0xc1,0x90
+       .byte   0x7b,0xcb,0x84,0x61,    0xd5,0x32,0xb6,0x70
+       .byte   0x48,0x6c,0x5c,0x74,    0xd0,0xb8,0x57,0x42
+AES_Td4:                                       ; 256 single-byte entries: the AES inverse S-box (0x52,0x09,0x6a,...)
+       .byte   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+       .byte   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+       .byte   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+       .byte   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+       .byte   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+       .byte   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+       .byte   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+       .byte   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+       .byte   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+       .byte   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+       .byte   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+       .byte   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+       .byte   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+       .byte   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+       .byte   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+       .byte   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+       .byte   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+       .byte   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+       .byte   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+       .byte   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+       .byte   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+       .byte   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+       .byte   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+       .byte   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+       .byte   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+       .byte   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+       .byte   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+       .byte   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+       .byte   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+       .byte   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+       .byte   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+       .byte   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+       .cstring "AES for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+       .align  4
+___
+
+print $code;
+close STDOUT;