+sub clmul64x64 {
+my ($r,$a,$b)=@_;
+$code.=<<___;
+ vext.8 $t0#lo, $a, $a, #1 @ A1
+ vmull.p8 $t0, $t0#lo, $b @ F = A1*B
+ vext.8 $r#lo, $b, $b, #1 @ B1
+ vmull.p8 $r, $a, $r#lo @ E = A*B1
+ vext.8 $t1#lo, $a, $a, #2 @ A2
+ vmull.p8 $t1, $t1#lo, $b @ H = A2*B
+ vext.8 $t3#lo, $b, $b, #2 @ B2
+ vmull.p8 $t3, $a, $t3#lo @ G = A*B2
+ vext.8 $t2#lo, $a, $a, #3 @ A3
+ veor $t0, $t0, $r @ L = E + F
+ vmull.p8 $t2, $t2#lo, $b @ J = A3*B
+ vext.8 $r#lo, $b, $b, #3 @ B3
+ veor $t1, $t1, $t3 @ M = G + H
+ vmull.p8 $r, $a, $r#lo @ I = A*B3
+ veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
+ vand $t0#hi, $t0#hi, $k48
+ vext.8 $t3#lo, $b, $b, #4 @ B4
+ veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
+ vand $t1#hi, $t1#hi, $k32
+ vmull.p8 $t3, $a, $t3#lo @ K = A*B4
+ veor $t2, $t2, $r @ N = I + J
+ veor $t0#lo, $t0#lo, $t0#hi
+ veor $t1#lo, $t1#lo, $t1#hi
+ veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
+ vand $t2#hi, $t2#hi, $k16
+ vext.8 $t0, $t0, $t0, #15
+ veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 $t3#hi, #0
+ vext.8 $t1, $t1, $t1, #14
+ veor $t2#lo, $t2#lo, $t2#hi
+ vmull.p8 $r, $a, $b @ D = A*B
+ vext.8 $t3, $t3, $t3, #12
+ vext.8 $t2, $t2, $t2, #13
+ veor $t0, $t0, $t1
+ veor $t2, $t2, $t3
+ veor $r, $r, $t0
+ veor $r, $r, $t2
+___
+}