1658 lines
35 KiB
Diff
1658 lines
35 KiB
Diff
From 7205c715b3e0f6fd0b853e8916d174048f43c03b Mon Sep 17 00:00:00 2001
|
|
From: Danny Tsen <dtsen@us.ibm.com>
|
|
Date: Tue, 14 Dec 2021 20:03:06 +0200
|
|
Subject: AES-GCM: Bulk implementation of AES-GCM acceleration for ppc64le
|
|
|
|
* configure.ac: Added p10 assembly implementation file and associated file.
|
|
* cipher/Makefile.am: Added p10 assembly implementation file and associated
|
|
file.
|
|
* cipher/rijndael.c: Added p10 function.
|
|
* cipher/rijndael-p10le.c: New wrapper file for AES-GCM call.
|
|
* cipher/rijndael-gcm-p10le.s: New implementation of AES-GCM bulk function in
|
|
Power Assembly.
|
|
* src/g10lib.h: Added Power arch 3.1 definition for p10.
|
|
* src/hwf-ppc.c: Added Power arch 3.1 definition for p10.
|
|
* src/hwfeatures.c: Added Power arch 3.1 definition for p10.
|
|
--
|
|
|
|
GnuPG-bug-id: 5700
|
|
Signed-off-by: Danny Tsen <dtsen@us.ibm.com>
|
|
[jk: fixes for C coding style]
|
|
[jk: prefix assembly functions with '_gcry_ppc10']
|
|
[jk: add assert check for gcm_table size]
|
|
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
|
|
|
Index: libgcrypt-1.9.4/cipher/Makefile.am
|
|
===================================================================
|
|
--- libgcrypt-1.9.4.orig/cipher/Makefile.am
|
|
+++ libgcrypt-1.9.4/cipher/Makefile.am
|
|
@@ -105,6 +105,7 @@ EXTRA_libcipher_la_SOURCES = \
|
|
rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S \
|
|
rijndael-armv8-aarch64-ce.S rijndael-aarch64.S \
|
|
rijndael-ppc.c rijndael-ppc9le.c \
|
|
+ rijndael-p10le.c rijndael-gcm-p10le.s \
|
|
rijndael-ppc-common.h rijndael-ppc-functions.h \
|
|
rijndael-s390x.c \
|
|
rmd160.c \
|
|
@@ -242,6 +243,12 @@ rijndael-ppc9le.o: $(srcdir)/rijndael-pp
|
|
rijndael-ppc9le.lo: $(srcdir)/rijndael-ppc9le.c Makefile
|
|
`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
|
|
|
|
+rijndael-p10le.o: $(srcdir)/rijndael-p10le.c Makefile
|
|
+ `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
|
|
+
|
|
+rijndael-p10le.lo: $(srcdir)/rijndael-p10le.c Makefile
|
|
+ `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
|
|
+
|
|
sha256-ppc.o: $(srcdir)/sha256-ppc.c Makefile
|
|
`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
|
|
|
|
Index: libgcrypt-1.9.4/cipher/rijndael-gcm-p10le.s
|
|
===================================================================
|
|
--- /dev/null
|
|
+++ libgcrypt-1.9.4/cipher/rijndael-gcm-p10le.s
|
|
@@ -0,0 +1,1401 @@
|
|
+# Copyright 2021- IBM Inc. All rights reserved
|
|
+#
|
|
+# This file is part of Libgcrypt.
|
|
+#
|
|
+# Libgcrypt is free software; you can redistribute it and/or modify
|
|
+# it under the terms of the GNU Lesser General Public License as
|
|
+# published by the Free Software Foundation; either version 2.1 of
|
|
+# the License, or (at your option) any later version.
|
|
+#
|
|
+# Libgcrypt is distributed in the hope that it will be useful,
|
|
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
+# GNU Lesser General Public License for more details.
|
|
+#
|
|
+# You should have received a copy of the GNU Lesser General Public
|
|
+# License along with this program; if not, see <http://www.gnu.org/licenses/>.
|
|
+#
|
|
+#===================================================================================
|
|
+# Written by Danny Tsen <dtsen@us.ibm.com>
|
|
+#
|
|
+# GHASH is based on the Karatsuba multiplication method.
|
|
+#
|
|
+# Xi xor X1
|
|
+#
|
|
+# X1 * H^4 + X2 * H^3 + X3 * H^2 + X4 * H =
|
|
+# (X1.h * H4.h + X1.l * H4.l + X1 * H4) +
|
|
+# (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
|
|
+# (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
|
|
+# (X4.h * H.h + X4.l * H.l + X4 * H)
|
|
+#
|
|
+# Xi = v0
|
|
+# H Poly = v2
|
|
+# Hash keys = v3 - v14
|
|
+# ( H.l, H, H.h)
|
|
+# ( H^2.l, H^2, H^2.h)
|
|
+# ( H^3.l, H^3, H^3.h)
|
|
+# ( H^4.l, H^4, H^4.h)
|
|
+#
|
|
+# v30 is IV
|
|
+# v31 - counter 1
|
|
+#
|
|
+# AES used,
|
|
+# vs0 - vs14 for round keys
|
|
+# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
|
|
+#
|
|
+# This implementation uses stitched AES-GCM approach to improve overall performance.
|
|
+# AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
|
|
+#
|
|
+# Current performance with 128 bit key using bench-slope on Power10[le] (3.89GHz):
|
|
+#
|
|
+# AES | nanosecs/byte mebibytes/sec cycles/byte
|
|
+# GCM enc | 0.169 ns/B 5643 MiB/s - c/B
|
|
+# GCM dec | 0.171 ns/B 5585 MiB/s - c/B
|
|
+#
|
|
+# ===================================================================================
|
|
+#
|
|
+
|
|
+.machine "any"
|
|
+.abiversion 2
|
|
+.text
|
|
+
|
|
+# 4x loops
|
|
+# v15 - v18 - input states
|
|
+# vs1 - vs9 - round keys
|
|
+#
|
|
+.macro Loop_aes_middle4x
|
|
+ xxlor 19+32, 1, 1
|
|
+ xxlor 20+32, 2, 2
|
|
+ xxlor 21+32, 3, 3
|
|
+ xxlor 22+32, 4, 4
|
|
+
|
|
+ vcipher 15, 15, 19
|
|
+ vcipher 16, 16, 19
|
|
+ vcipher 17, 17, 19
|
|
+ vcipher 18, 18, 19
|
|
+
|
|
+ vcipher 15, 15, 20
|
|
+ vcipher 16, 16, 20
|
|
+ vcipher 17, 17, 20
|
|
+ vcipher 18, 18, 20
|
|
+
|
|
+ vcipher 15, 15, 21
|
|
+ vcipher 16, 16, 21
|
|
+ vcipher 17, 17, 21
|
|
+ vcipher 18, 18, 21
|
|
+
|
|
+ vcipher 15, 15, 22
|
|
+ vcipher 16, 16, 22
|
|
+ vcipher 17, 17, 22
|
|
+ vcipher 18, 18, 22
|
|
+
|
|
+ xxlor 19+32, 5, 5
|
|
+ xxlor 20+32, 6, 6
|
|
+ xxlor 21+32, 7, 7
|
|
+ xxlor 22+32, 8, 8
|
|
+
|
|
+ vcipher 15, 15, 19
|
|
+ vcipher 16, 16, 19
|
|
+ vcipher 17, 17, 19
|
|
+ vcipher 18, 18, 19
|
|
+
|
|
+ vcipher 15, 15, 20
|
|
+ vcipher 16, 16, 20
|
|
+ vcipher 17, 17, 20
|
|
+ vcipher 18, 18, 20
|
|
+
|
|
+ vcipher 15, 15, 21
|
|
+ vcipher 16, 16, 21
|
|
+ vcipher 17, 17, 21
|
|
+ vcipher 18, 18, 21
|
|
+
|
|
+ vcipher 15, 15, 22
|
|
+ vcipher 16, 16, 22
|
|
+ vcipher 17, 17, 22
|
|
+ vcipher 18, 18, 22
|
|
+
|
|
+ xxlor 23+32, 9, 9
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 16, 16, 23
|
|
+ vcipher 17, 17, 23
|
|
+ vcipher 18, 18, 23
|
|
+.endm
|
|
+
|
|
+# 8x loops
|
|
+# v15 - v22 - input states
|
|
+# vs1 - vs9 - round keys
|
|
+#
|
|
+.macro Loop_aes_middle8x
|
|
+ xxlor 23+32, 1, 1
|
|
+ xxlor 24+32, 2, 2
|
|
+ xxlor 25+32, 3, 3
|
|
+ xxlor 26+32, 4, 4
|
|
+
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 16, 16, 23
|
|
+ vcipher 17, 17, 23
|
|
+ vcipher 18, 18, 23
|
|
+ vcipher 19, 19, 23
|
|
+ vcipher 20, 20, 23
|
|
+ vcipher 21, 21, 23
|
|
+ vcipher 22, 22, 23
|
|
+
|
|
+ vcipher 15, 15, 24
|
|
+ vcipher 16, 16, 24
|
|
+ vcipher 17, 17, 24
|
|
+ vcipher 18, 18, 24
|
|
+ vcipher 19, 19, 24
|
|
+ vcipher 20, 20, 24
|
|
+ vcipher 21, 21, 24
|
|
+ vcipher 22, 22, 24
|
|
+
|
|
+ vcipher 15, 15, 25
|
|
+ vcipher 16, 16, 25
|
|
+ vcipher 17, 17, 25
|
|
+ vcipher 18, 18, 25
|
|
+ vcipher 19, 19, 25
|
|
+ vcipher 20, 20, 25
|
|
+ vcipher 21, 21, 25
|
|
+ vcipher 22, 22, 25
|
|
+
|
|
+ vcipher 15, 15, 26
|
|
+ vcipher 16, 16, 26
|
|
+ vcipher 17, 17, 26
|
|
+ vcipher 18, 18, 26
|
|
+ vcipher 19, 19, 26
|
|
+ vcipher 20, 20, 26
|
|
+ vcipher 21, 21, 26
|
|
+ vcipher 22, 22, 26
|
|
+
|
|
+ xxlor 23+32, 5, 5
|
|
+ xxlor 24+32, 6, 6
|
|
+ xxlor 25+32, 7, 7
|
|
+ xxlor 26+32, 8, 8
|
|
+
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 16, 16, 23
|
|
+ vcipher 17, 17, 23
|
|
+ vcipher 18, 18, 23
|
|
+ vcipher 19, 19, 23
|
|
+ vcipher 20, 20, 23
|
|
+ vcipher 21, 21, 23
|
|
+ vcipher 22, 22, 23
|
|
+
|
|
+ vcipher 15, 15, 24
|
|
+ vcipher 16, 16, 24
|
|
+ vcipher 17, 17, 24
|
|
+ vcipher 18, 18, 24
|
|
+ vcipher 19, 19, 24
|
|
+ vcipher 20, 20, 24
|
|
+ vcipher 21, 21, 24
|
|
+ vcipher 22, 22, 24
|
|
+
|
|
+ vcipher 15, 15, 25
|
|
+ vcipher 16, 16, 25
|
|
+ vcipher 17, 17, 25
|
|
+ vcipher 18, 18, 25
|
|
+ vcipher 19, 19, 25
|
|
+ vcipher 20, 20, 25
|
|
+ vcipher 21, 21, 25
|
|
+ vcipher 22, 22, 25
|
|
+
|
|
+ vcipher 15, 15, 26
|
|
+ vcipher 16, 16, 26
|
|
+ vcipher 17, 17, 26
|
|
+ vcipher 18, 18, 26
|
|
+ vcipher 19, 19, 26
|
|
+ vcipher 20, 20, 26
|
|
+ vcipher 21, 21, 26
|
|
+ vcipher 22, 22, 26
|
|
+
|
|
+ xxlor 23+32, 9, 9
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 16, 16, 23
|
|
+ vcipher 17, 17, 23
|
|
+ vcipher 18, 18, 23
|
|
+ vcipher 19, 19, 23
|
|
+ vcipher 20, 20, 23
|
|
+ vcipher 21, 21, 23
|
|
+ vcipher 22, 22, 23
|
|
+.endm
|
|
+
|
|
+#
|
|
+# Compute 4x hash values based on Karatsuba method.
|
|
+#
|
|
+ppc_aes_gcm_ghash:
|
|
+ vxor 15, 15, 0
|
|
+
|
|
+ xxlxor 29, 29, 29
|
|
+
|
|
+ vpmsumd 23, 12, 15 # H4.L * X.L
|
|
+ vpmsumd 24, 9, 16
|
|
+ vpmsumd 25, 6, 17
|
|
+ vpmsumd 26, 3, 18
|
|
+
|
|
+ vxor 23, 23, 24
|
|
+ vxor 23, 23, 25
|
|
+ vxor 23, 23, 26 # L
|
|
+
|
|
+ vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
|
|
+ vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
|
|
+ vpmsumd 26, 7, 17
|
|
+ vpmsumd 27, 4, 18
|
|
+
|
|
+ vxor 24, 24, 25
|
|
+ vxor 24, 24, 26
|
|
+ vxor 24, 24, 27 # M
|
|
+
|
|
+ # sum hash and reduction with H Poly
|
|
+ vpmsumd 28, 23, 2 # reduction
|
|
+
|
|
+ xxlor 29+32, 29, 29
|
|
+ vsldoi 26, 24, 29, 8 # mL
|
|
+ vsldoi 29, 29, 24, 8 # mH
|
|
+ vxor 23, 23, 26 # mL + L
|
|
+
|
|
+ vsldoi 23, 23, 23, 8 # swap
|
|
+ vxor 23, 23, 28
|
|
+
|
|
+ vpmsumd 24, 14, 15 # H4.H * X.H
|
|
+ vpmsumd 25, 11, 16
|
|
+ vpmsumd 26, 8, 17
|
|
+ vpmsumd 27, 5, 18
|
|
+
|
|
+ vxor 24, 24, 25
|
|
+ vxor 24, 24, 26
|
|
+ vxor 24, 24, 27
|
|
+
|
|
+ vxor 24, 24, 29
|
|
+
|
|
+ # sum hash and reduction with H Poly
|
|
+ vsldoi 27, 23, 23, 8 # swap
|
|
+ vpmsumd 23, 23, 2
|
|
+ vxor 27, 27, 24
|
|
+ vxor 23, 23, 27
|
|
+
|
|
+ xxlor 32, 23+32, 23+32 # update hash
|
|
+
|
|
+ blr
|
|
+
|
|
+#
|
|
+# Combine two 4x ghash
|
|
+# v15 - v22 - input blocks
|
|
+#
|
|
+.macro ppc_aes_gcm_ghash2_4x
|
|
+ # first 4x hash
|
|
+ vxor 15, 15, 0 # Xi + X
|
|
+
|
|
+ xxlxor 29, 29, 29
|
|
+
|
|
+ vpmsumd 23, 12, 15 # H4.L * X.L
|
|
+ vpmsumd 24, 9, 16
|
|
+ vpmsumd 25, 6, 17
|
|
+ vpmsumd 26, 3, 18
|
|
+
|
|
+ vxor 23, 23, 24
|
|
+ vxor 23, 23, 25
|
|
+ vxor 23, 23, 26 # L
|
|
+
|
|
+ vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
|
|
+ vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
|
|
+ vpmsumd 26, 7, 17
|
|
+ vpmsumd 27, 4, 18
|
|
+
|
|
+ vxor 24, 24, 25
|
|
+ vxor 24, 24, 26
|
|
+
|
|
+ # sum hash and reduction with H Poly
|
|
+ vpmsumd 28, 23, 2 # reduction
|
|
+
|
|
+ xxlor 29+32, 29, 29
|
|
+
|
|
+ vxor 24, 24, 27 # M
|
|
+ vsldoi 26, 24, 29, 8 # mL
|
|
+ vsldoi 29, 29, 24, 8 # mH
|
|
+ vxor 23, 23, 26 # mL + L
|
|
+
|
|
+ vsldoi 23, 23, 23, 8 # swap
|
|
+ vxor 23, 23, 28
|
|
+
|
|
+ vpmsumd 24, 14, 15 # H4.H * X.H
|
|
+ vpmsumd 25, 11, 16
|
|
+ vpmsumd 26, 8, 17
|
|
+ vpmsumd 27, 5, 18
|
|
+
|
|
+ vxor 24, 24, 25
|
|
+ vxor 24, 24, 26
|
|
+ vxor 24, 24, 27 # H
|
|
+
|
|
+ vxor 24, 24, 29 # H + mH
|
|
+
|
|
+ # sum hash and reduction with H Poly
|
|
+ vsldoi 27, 23, 23, 8 # swap
|
|
+ vpmsumd 23, 23, 2
|
|
+ vxor 27, 27, 24
|
|
+ vxor 27, 23, 27 # 1st Xi
|
|
+
|
|
+ # 2nd 4x hash
|
|
+ vpmsumd 24, 9, 20
|
|
+ vpmsumd 25, 6, 21
|
|
+ vpmsumd 26, 3, 22
|
|
+ vxor 19, 19, 27 # Xi + X
|
|
+ vpmsumd 23, 12, 19 # H4.L * X.L
|
|
+
|
|
+ vxor 23, 23, 24
|
|
+ vxor 23, 23, 25
|
|
+ vxor 23, 23, 26 # L
|
|
+
|
|
+ vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L
|
|
+ vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L
|
|
+ vpmsumd 26, 7, 21
|
|
+ vpmsumd 27, 4, 22
|
|
+
|
|
+ vxor 24, 24, 25
|
|
+ vxor 24, 24, 26
|
|
+
|
|
+ # sum hash and reduction with H Poly
|
|
+ vpmsumd 28, 23, 2 # reduction
|
|
+
|
|
+ xxlor 29+32, 29, 29
|
|
+
|
|
+ vxor 24, 24, 27 # M
|
|
+ vsldoi 26, 24, 29, 8 # mL
|
|
+ vsldoi 29, 29, 24, 8 # mH
|
|
+ vxor 23, 23, 26 # mL + L
|
|
+
|
|
+ vsldoi 23, 23, 23, 8 # swap
|
|
+ vxor 23, 23, 28
|
|
+
|
|
+ vpmsumd 24, 14, 19 # H4.H * X.H
|
|
+ vpmsumd 25, 11, 20
|
|
+ vpmsumd 26, 8, 21
|
|
+ vpmsumd 27, 5, 22
|
|
+
|
|
+ vxor 24, 24, 25
|
|
+ vxor 24, 24, 26
|
|
+ vxor 24, 24, 27 # H
|
|
+
|
|
+ vxor 24, 24, 29 # H + mH
|
|
+
|
|
+ # sum hash and reduction with H Poly
|
|
+ vsldoi 27, 23, 23, 8 # swap
|
|
+ vpmsumd 23, 23, 2
|
|
+ vxor 27, 27, 24
|
|
+ vxor 23, 23, 27
|
|
+
|
|
+ xxlor 32, 23+32, 23+32 # update hash
|
|
+
|
|
+.endm
|
|
+
|
|
+#
|
|
+# Compute update single hash
|
|
+#
|
|
+.macro ppc_update_hash_1x
|
|
+ vxor 28, 28, 0
|
|
+
|
|
+ vxor 19, 19, 19
|
|
+
|
|
+ vpmsumd 22, 3, 28 # L
|
|
+ vpmsumd 23, 4, 28 # M
|
|
+ vpmsumd 24, 5, 28 # H
|
|
+
|
|
+ vpmsumd 27, 22, 2 # reduction
|
|
+
|
|
+ vsldoi 25, 23, 19, 8 # mL
|
|
+ vsldoi 26, 19, 23, 8 # mH
|
|
+ vxor 22, 22, 25 # L + mL
|
|
+ vxor 24, 24, 26 # H + mH
|
|
+
|
|
+ vsldoi 22, 22, 22, 8 # swap
|
|
+ vxor 22, 22, 27
|
|
+
|
|
+ vsldoi 20, 22, 22, 8 # swap
|
|
+ vpmsumd 22, 22, 2 # reduction
|
|
+ vxor 20, 20, 24
|
|
+ vxor 22, 22, 20
|
|
+
|
|
+ vmr 0, 22 # update hash
|
|
+
|
|
+.endm
|
|
+
|
|
+#
|
|
+# libgcrypt:
|
|
+# _gcry_ppc10_aes_gcm_encrypt (const void *inp, void *out, size_t len,
|
|
+# const char *rk, unsigned char iv[16], void *Xip);
|
|
+#
|
|
+# r3 - inp
|
|
+# r4 - out
|
|
+# r5 - len
|
|
+# r6 - AES round keys
|
|
+# r7 - iv
|
|
+# r8 - H Poly, hash keys, Xi
|
|
+#
|
|
+# rounds is at offset 480 in rk
|
|
+# Xi is at 256 in gcm_table (Xip).
|
|
+#
|
|
+.global _gcry_ppc10_aes_gcm_encrypt
|
|
+.align 5
|
|
+_gcry_ppc10_aes_gcm_encrypt:
|
|
+_gcry_ppc_aes_gcm_encrypt:
|
|
+
|
|
+ stdu 1,-512(1)
|
|
+ mflr 0
|
|
+
|
|
+ std 14,112(1)
|
|
+ std 15,120(1)
|
|
+ std 16,128(1)
|
|
+ std 17,136(1)
|
|
+ std 18,144(1)
|
|
+ std 19,152(1)
|
|
+ std 20,160(1)
|
|
+ std 21,168(1)
|
|
+ li 9, 256
|
|
+ stvx 20, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 21, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 22, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 23, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 24, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 25, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 26, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 27, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 28, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 29, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 30, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 31, 9, 1
|
|
+ std 0, 528(1)
|
|
+
|
|
+ # Load Xi
|
|
+ li 10, 256
|
|
+ lxvb16x 32, 10, 8 # load Xi
|
|
+
|
|
+ # load Hash - h^4, h^3, h^2, h
|
|
+ lxvd2x 2+32, 0, 8 # H Poly
|
|
+ li 10, 16
|
|
+ lxvd2x 3+32, 10, 8 # Hl
|
|
+ li 10, 32
|
|
+ lxvd2x 4+32, 10, 8 # H
|
|
+ li 10, 48
|
|
+ lxvd2x 5+32, 10, 8 # Hh
|
|
+
|
|
+ li 10, 64
|
|
+ lxvd2x 6+32, 10, 8 # H^2l
|
|
+ li 10, 80
|
|
+ lxvd2x 7+32, 10, 8 # H^2
|
|
+ li 10, 96
|
|
+ lxvd2x 8+32, 10, 8 # H^2h
|
|
+
|
|
+ li 10, 112
|
|
+ lxvd2x 9+32, 10, 8 # H^3l
|
|
+ li 10, 128
|
|
+ lxvd2x 10+32, 10, 8 # H^3
|
|
+ li 10, 144
|
|
+ lxvd2x 11+32, 10, 8 # H^3h
|
|
+
|
|
+ li 10, 160
|
|
+ lxvd2x 12+32, 10, 8 # H^4l
|
|
+ li 10, 176
|
|
+ lxvd2x 13+32, 10, 8 # H^4
|
|
+ li 10, 192
|
|
+ lxvd2x 14+32, 10, 8 # H^4h
|
|
+
|
|
+ # load the caller-supplied counter block (IV) from r7; no GHASH is computed here
|
|
+ lxvb16x 30+32, 0, 7 # load IV - v30
|
|
+
|
|
+ mr 12, 5 # length
|
|
+ li 11, 0 # block index
|
|
+
|
|
+ # counter 1
|
|
+ vxor 31, 31, 31
|
|
+ vspltisb 22, 1
|
|
+ vsldoi 31, 31, 22,1 # counter 1
|
|
+
|
|
+ # load round key to VSR
|
|
+ lxv 0, 0(6)
|
|
+ lxv 1, 0x10(6)
|
|
+ lxv 2, 0x20(6)
|
|
+ lxv 3, 0x30(6)
|
|
+ lxv 4, 0x40(6)
|
|
+ lxv 5, 0x50(6)
|
|
+ lxv 6, 0x60(6)
|
|
+ lxv 7, 0x70(6)
|
|
+ lxv 8, 0x80(6)
|
|
+ lxv 9, 0x90(6)
|
|
+ lxv 10, 0xa0(6)
|
|
+
|
|
+ # load rounds - 10 (128), 12 (192), 14 (256)
|
|
+ lwz 9,480(6)
|
|
+
|
|
+ #
|
|
+ # vxor state, state, w # addroundkey
|
|
+ xxlor 32+29, 0, 0
|
|
+ vxor 15, 30, 29 # IV + round key - add round key 0
|
|
+
|
|
+ cmpdi 9, 10
|
|
+ beq Loop_aes_gcm_8x
|
|
+
|
|
+ # load 2 more round keys (vs11, vs12)
|
|
+ lxv 11, 0xb0(6)
|
|
+ lxv 12, 0xc0(6)
|
|
+
|
|
+ cmpdi 9, 12
|
|
+ beq Loop_aes_gcm_8x
|
|
+
|
|
+ # load 2 more round keys (vs13, vs14)
|
|
+ lxv 13, 0xd0(6)
|
|
+ lxv 14, 0xe0(6)
|
|
+ cmpdi 9, 14
|
|
+ beq Loop_aes_gcm_8x
|
|
+
|
|
+ b aes_gcm_out
|
|
+
|
|
+.align 5
|
|
+Loop_aes_gcm_8x:
|
|
+ mr 14, 3
|
|
+ mr 9, 4
|
|
+
|
|
+ # n blocks
|
|
+ li 10, 128
|
|
+ divdu 10, 5, 10 # n 128 bytes-blocks
|
|
+ cmpdi 10, 0
|
|
+ beq Loop_last_block
|
|
+
|
|
+ vaddudm 30, 30, 31 # IV + counter
|
|
+ vxor 16, 30, 29
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 17, 30, 29
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 18, 30, 29
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 19, 30, 29
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 20, 30, 29
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 21, 30, 29
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 22, 30, 29
|
|
+
|
|
+ mtctr 10
|
|
+
|
|
+ li 15, 16
|
|
+ li 16, 32
|
|
+ li 17, 48
|
|
+ li 18, 64
|
|
+ li 19, 80
|
|
+ li 20, 96
|
|
+ li 21, 112
|
|
+
|
|
+ lwz 10, 480(6)
|
|
+
|
|
+Loop_8x_block:
|
|
+
|
|
+ lxvb16x 15, 0, 14 # load block
|
|
+ lxvb16x 16, 15, 14 # load block
|
|
+ lxvb16x 17, 16, 14 # load block
|
|
+ lxvb16x 18, 17, 14 # load block
|
|
+ lxvb16x 19, 18, 14 # load block
|
|
+ lxvb16x 20, 19, 14 # load block
|
|
+ lxvb16x 21, 20, 14 # load block
|
|
+ lxvb16x 22, 21, 14 # load block
|
|
+ addi 14, 14, 128
|
|
+
|
|
+ Loop_aes_middle8x
|
|
+
|
|
+ xxlor 23+32, 10, 10
|
|
+
|
|
+ cmpdi 10, 10
|
|
+ beq Do_next_ghash
|
|
+
|
|
+ # 192 bits
|
|
+ xxlor 24+32, 11, 11
|
|
+
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 16, 16, 23
|
|
+ vcipher 17, 17, 23
|
|
+ vcipher 18, 18, 23
|
|
+ vcipher 19, 19, 23
|
|
+ vcipher 20, 20, 23
|
|
+ vcipher 21, 21, 23
|
|
+ vcipher 22, 22, 23
|
|
+
|
|
+ vcipher 15, 15, 24
|
|
+ vcipher 16, 16, 24
|
|
+ vcipher 17, 17, 24
|
|
+ vcipher 18, 18, 24
|
|
+ vcipher 19, 19, 24
|
|
+ vcipher 20, 20, 24
|
|
+ vcipher 21, 21, 24
|
|
+ vcipher 22, 22, 24
|
|
+
|
|
+ xxlor 23+32, 12, 12
|
|
+
|
|
+ cmpdi 10, 12
|
|
+ beq Do_next_ghash
|
|
+
|
|
+ # 256 bits
|
|
+ xxlor 24+32, 13, 13
|
|
+
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 16, 16, 23
|
|
+ vcipher 17, 17, 23
|
|
+ vcipher 18, 18, 23
|
|
+ vcipher 19, 19, 23
|
|
+ vcipher 20, 20, 23
|
|
+ vcipher 21, 21, 23
|
|
+ vcipher 22, 22, 23
|
|
+
|
|
+ vcipher 15, 15, 24
|
|
+ vcipher 16, 16, 24
|
|
+ vcipher 17, 17, 24
|
|
+ vcipher 18, 18, 24
|
|
+ vcipher 19, 19, 24
|
|
+ vcipher 20, 20, 24
|
|
+ vcipher 21, 21, 24
|
|
+ vcipher 22, 22, 24
|
|
+
|
|
+ xxlor 23+32, 14, 14
|
|
+
|
|
+ cmpdi 10, 14
|
|
+ beq Do_next_ghash
|
|
+ b aes_gcm_out
|
|
+
|
|
+Do_next_ghash:
|
|
+
|
|
+ #
|
|
+ # last round
|
|
+ vcipherlast 15, 15, 23
|
|
+ vcipherlast 16, 16, 23
|
|
+
|
|
+ xxlxor 47, 47, 15
|
|
+ stxvb16x 47, 0, 9 # store output
|
|
+ xxlxor 48, 48, 16
|
|
+ stxvb16x 48, 15, 9 # store output
|
|
+
|
|
+ vcipherlast 17, 17, 23
|
|
+ vcipherlast 18, 18, 23
|
|
+
|
|
+ xxlxor 49, 49, 17
|
|
+ stxvb16x 49, 16, 9 # store output
|
|
+ xxlxor 50, 50, 18
|
|
+ stxvb16x 50, 17, 9 # store output
|
|
+
|
|
+ vcipherlast 19, 19, 23
|
|
+ vcipherlast 20, 20, 23
|
|
+
|
|
+ xxlxor 51, 51, 19
|
|
+ stxvb16x 51, 18, 9 # store output
|
|
+ xxlxor 52, 52, 20
|
|
+ stxvb16x 52, 19, 9 # store output
|
|
+
|
|
+ vcipherlast 21, 21, 23
|
|
+ vcipherlast 22, 22, 23
|
|
+
|
|
+ xxlxor 53, 53, 21
|
|
+ stxvb16x 53, 20, 9 # store output
|
|
+ xxlxor 54, 54, 22
|
|
+ stxvb16x 54, 21, 9 # store output
|
|
+
|
|
+ addi 9, 9, 128
|
|
+
|
|
+ # ghash here
|
|
+ ppc_aes_gcm_ghash2_4x
|
|
+
|
|
+ xxlor 27+32, 0, 0
|
|
+ vaddudm 30, 30, 31 # IV + counter
|
|
+ vmr 29, 30
|
|
+ vxor 15, 30, 27 # add round key
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 16, 30, 27
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 17, 30, 27
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 18, 30, 27
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 19, 30, 27
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 20, 30, 27
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 21, 30, 27
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 22, 30, 27
|
|
+
|
|
+ addi 12, 12, -128
|
|
+ addi 11, 11, 128
|
|
+
|
|
+ bdnz Loop_8x_block
|
|
+
|
|
+ vmr 30, 29
|
|
+
|
|
+Loop_last_block:
|
|
+ cmpdi 12, 0
|
|
+ beq aes_gcm_out
|
|
+
|
|
+ # loop last few blocks
|
|
+ li 10, 16
|
|
+ divdu 10, 12, 10
|
|
+
|
|
+ mtctr 10
|
|
+
|
|
+ lwz 10, 480(6)
|
|
+
|
|
+ cmpdi 12, 16
|
|
+ blt Final_block
|
|
+
|
|
+.macro Loop_aes_middle_1x
|
|
+ xxlor 19+32, 1, 1
|
|
+ xxlor 20+32, 2, 2
|
|
+ xxlor 21+32, 3, 3
|
|
+ xxlor 22+32, 4, 4
|
|
+
|
|
+ vcipher 15, 15, 19
|
|
+ vcipher 15, 15, 20
|
|
+ vcipher 15, 15, 21
|
|
+ vcipher 15, 15, 22
|
|
+
|
|
+ xxlor 19+32, 5, 5
|
|
+ xxlor 20+32, 6, 6
|
|
+ xxlor 21+32, 7, 7
|
|
+ xxlor 22+32, 8, 8
|
|
+
|
|
+ vcipher 15, 15, 19
|
|
+ vcipher 15, 15, 20
|
|
+ vcipher 15, 15, 21
|
|
+ vcipher 15, 15, 22
|
|
+
|
|
+ xxlor 19+32, 9, 9
|
|
+ vcipher 15, 15, 19
|
|
+.endm
|
|
+
|
|
+Next_rem_block:
|
|
+ lxvb16x 15, 0, 14 # load block
|
|
+
|
|
+ Loop_aes_middle_1x
|
|
+
|
|
+ xxlor 23+32, 10, 10
|
|
+
|
|
+ cmpdi 10, 10
|
|
+ beq Do_next_1x
|
|
+
|
|
+ # 192 bits
|
|
+ xxlor 24+32, 11, 11
|
|
+
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 15, 15, 24
|
|
+
|
|
+ xxlor 23+32, 12, 12
|
|
+
|
|
+ cmpdi 10, 12
|
|
+ beq Do_next_1x
|
|
+
|
|
+ # 256 bits
|
|
+ xxlor 24+32, 13, 13
|
|
+
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 15, 15, 24
|
|
+
|
|
+ xxlor 23+32, 14, 14
|
|
+
|
|
+ cmpdi 10, 14
|
|
+ beq Do_next_1x
|
|
+
|
|
+Do_next_1x:
|
|
+ vcipherlast 15, 15, 23
|
|
+
|
|
+ xxlxor 47, 47, 15
|
|
+ stxvb16x 47, 0, 9 # store output
|
|
+ addi 14, 14, 16
|
|
+ addi 9, 9, 16
|
|
+
|
|
+ vmr 28, 15
|
|
+ ppc_update_hash_1x
|
|
+
|
|
+ addi 12, 12, -16
|
|
+ addi 11, 11, 16
|
|
+ xxlor 19+32, 0, 0
|
|
+ vaddudm 30, 30, 31 # IV + counter
|
|
+ vxor 15, 30, 19 # add round key
|
|
+
|
|
+ bdnz Next_rem_block
|
|
+
|
|
+ cmpdi 12, 0
|
|
+ beq aes_gcm_out
|
|
+
|
|
+Final_block:
|
|
+ Loop_aes_middle_1x
|
|
+
|
|
+ xxlor 23+32, 10, 10
|
|
+
|
|
+ cmpdi 10, 10
|
|
+ beq Do_final_1x
|
|
+
|
|
+ # 192 bits
|
|
+ xxlor 24+32, 11, 11
|
|
+
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 15, 15, 24
|
|
+
|
|
+ xxlor 23+32, 12, 12
|
|
+
|
|
+ cmpdi 10, 12
|
|
+ beq Do_final_1x
|
|
+
|
|
+ # 256 bits
|
|
+ xxlor 24+32, 13, 13
|
|
+
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 15, 15, 24
|
|
+
|
|
+ xxlor 23+32, 14, 14
|
|
+
|
|
+ cmpdi 10, 14
|
|
+ beq Do_final_1x
|
|
+
|
|
+Do_final_1x:
|
|
+ vcipherlast 15, 15, 23
|
|
+
|
|
+ lxvb16x 15, 0, 14 # load last block
|
|
+ xxlxor 47, 47, 15
|
|
+
|
|
+ # create partial block mask
|
|
+ li 15, 16
|
|
+ sub 15, 15, 12 # index to the mask
|
|
+
|
|
+ vspltisb 16, -1 # first 16 bytes - 0xffff...ff
|
|
+ vspltisb 17, 0 # second 16 bytes - 0x0000...00
|
|
+ li 10, 192
|
|
+ stvx 16, 10, 1
|
|
+ addi 10, 10, 16
|
|
+ stvx 17, 10, 1
|
|
+
|
|
+ addi 10, 1, 192
|
|
+ lxvb16x 16, 15, 10 # load partial block mask
|
|
+ xxland 47, 47, 16
|
|
+
|
|
+ vmr 28, 15
|
|
+ ppc_update_hash_1x
|
|
+
|
|
+ # * should store only the remaining bytes.
|
|
+ bl Write_partial_block
|
|
+
|
|
+ b aes_gcm_out
|
|
+
|
|
+#
|
|
+# Write partial block
|
|
+# r9 - output
|
|
+# r12 - remaining bytes
|
|
+# v15 - partial input data
|
|
+#
|
|
+Write_partial_block:
|
|
+ li 10, 192
|
|
+ stxvb16x 15+32, 10, 1 # last block
|
|
+
|
|
+ #add 10, 9, 11 # Output
|
|
+ addi 10, 9, -1
|
|
+ addi 16, 1, 191
|
|
+
|
|
+ mtctr 12 # remaining bytes
|
|
+ li 15, 0
|
|
+
|
|
+Write_last_byte:
|
|
+ lbzu 14, 1(16)
|
|
+ stbu 14, 1(10)
|
|
+ bdnz Write_last_byte
|
|
+ blr
|
|
+
|
|
+aes_gcm_out:
|
|
+ # out = state
|
|
+ li 10, 256
|
|
+ stxvb16x 32, 10, 8 # write out Xi
|
|
+ add 3, 11, 12 # return count
|
|
+
|
|
+ li 9, 256
|
|
+ lvx 20, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ lvx 21, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ lvx 22, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ lvx 23, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ lvx 24, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ lvx 25, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ lvx 26, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ lvx 27, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ lvx 28, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ lvx 29, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ lvx 30, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ lvx 31, 9, 1
|
|
+
|
|
+ ld 0, 528(1)
|
|
+ ld 14,112(1)
|
|
+ ld 15,120(1)
|
|
+ ld 16,128(1)
|
|
+ ld 17,136(1)
|
|
+ ld 18,144(1)
|
|
+ ld 19,152(1)
|
|
+ ld 20,160(1)
|
|
+ ld 21,168(1)
|
|
+
|
|
+ mtlr 0
|
|
+ addi 1, 1, 512
|
|
+ blr
|
|
+
|
|
+#
|
|
+# 8x Decrypt
|
|
+#
|
|
+.global _gcry_ppc10_aes_gcm_decrypt
|
|
+.align 5
|
|
+_gcry_ppc10_aes_gcm_decrypt:
|
|
+_gcry_ppc_aes_gcm_decrypt:
|
|
+
|
|
+ stdu 1,-512(1)
|
|
+ mflr 0
|
|
+
|
|
+ std 14,112(1)
|
|
+ std 15,120(1)
|
|
+ std 16,128(1)
|
|
+ std 17,136(1)
|
|
+ std 18,144(1)
|
|
+ std 19,152(1)
|
|
+ std 20,160(1)
|
|
+ std 21,168(1)
|
|
+ li 9, 256
|
|
+ stvx 20, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 21, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 22, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 23, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 24, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 25, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 26, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 27, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 28, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 29, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 30, 9, 1
|
|
+ addi 9, 9, 16
|
|
+ stvx 31, 9, 1
|
|
+ std 0, 528(1)
|
|
+
|
|
+ # Load Xi
|
|
+ li 10, 256
|
|
+ lxvb16x 32, 10, 8 # load Xi
|
|
+
|
|
+ # load Hash - h^4, h^3, h^2, h
|
|
+ lxvd2x 2+32, 0, 8 # H Poly
|
|
+ li 10, 16
|
|
+ lxvd2x 3+32, 10, 8 # Hl
|
|
+ li 10, 32
|
|
+ lxvd2x 4+32, 10, 8 # H
|
|
+ li 10, 48
|
|
+ lxvd2x 5+32, 10, 8 # Hh
|
|
+
|
|
+ li 10, 64
|
|
+ lxvd2x 6+32, 10, 8 # H^2l
|
|
+ li 10, 80
|
|
+ lxvd2x 7+32, 10, 8 # H^2
|
|
+ li 10, 96
|
|
+ lxvd2x 8+32, 10, 8 # H^2h
|
|
+
|
|
+ li 10, 112
|
|
+ lxvd2x 9+32, 10, 8 # H^3l
|
|
+ li 10, 128
|
|
+ lxvd2x 10+32, 10, 8 # H^3
|
|
+ li 10, 144
|
|
+ lxvd2x 11+32, 10, 8 # H^3h
|
|
+
|
|
+ li 10, 160
|
|
+ lxvd2x 12+32, 10, 8 # H^4l
|
|
+ li 10, 176
|
|
+ lxvd2x 13+32, 10, 8 # H^4
|
|
+ li 10, 192
|
|
+ lxvd2x 14+32, 10, 8 # H^4h
|
|
+
|
|
+ # load the caller-supplied counter block (IV) from r7; no GHASH is computed here
|
|
+ lxvb16x 30+32, 0, 7 # load IV - v30
|
|
+
|
|
+ mr 12, 5 # length
|
|
+ li 11, 0 # block index
|
|
+
|
|
+ # counter 1
|
|
+ vxor 31, 31, 31
|
|
+ vspltisb 22, 1
|
|
+ vsldoi 31, 31, 22,1 # counter 1
|
|
+
|
|
+ # load round key to VSR
|
|
+ lxv 0, 0(6)
|
|
+ lxv 1, 0x10(6)
|
|
+ lxv 2, 0x20(6)
|
|
+ lxv 3, 0x30(6)
|
|
+ lxv 4, 0x40(6)
|
|
+ lxv 5, 0x50(6)
|
|
+ lxv 6, 0x60(6)
|
|
+ lxv 7, 0x70(6)
|
|
+ lxv 8, 0x80(6)
|
|
+ lxv 9, 0x90(6)
|
|
+ lxv 10, 0xa0(6)
|
|
+
|
|
+ # load rounds - 10 (128), 12 (192), 14 (256)
|
|
+ lwz 9,480(6)
|
|
+
|
|
+ #
|
|
+ # vxor state, state, w # addroundkey
|
|
+ xxlor 32+29, 0, 0
|
|
+ vxor 15, 30, 29 # IV + round key - add round key 0
|
|
+
|
|
+ cmpdi 9, 10
|
|
+ beq Loop_aes_gcm_8x_dec
|
|
+
|
|
+ # load 2 more round keys (vs11, vs12)
|
|
+ lxv 11, 0xb0(6)
|
|
+ lxv 12, 0xc0(6)
|
|
+
|
|
+ cmpdi 9, 12
|
|
+ beq Loop_aes_gcm_8x_dec
|
|
+
|
|
+ # load 2 more round keys (vs13, vs14)
|
|
+ lxv 13, 0xd0(6)
|
|
+ lxv 14, 0xe0(6)
|
|
+ cmpdi 9, 14
|
|
+ beq Loop_aes_gcm_8x_dec
|
|
+
|
|
+ b aes_gcm_out
|
|
+
|
|
+.align 5
|
|
+Loop_aes_gcm_8x_dec:
|
|
+ mr 14, 3
|
|
+ mr 9, 4
|
|
+
|
|
+ # n blocks
|
|
+ li 10, 128
|
|
+ divdu 10, 5, 10 # n 128 bytes-blocks
|
|
+ cmpdi 10, 0
|
|
+ beq Loop_last_block_dec
|
|
+
|
|
+ vaddudm 30, 30, 31 # IV + counter
|
|
+ vxor 16, 30, 29
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 17, 30, 29
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 18, 30, 29
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 19, 30, 29
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 20, 30, 29
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 21, 30, 29
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 22, 30, 29
|
|
+
|
|
+ mtctr 10
|
|
+
|
|
+ li 15, 16
|
|
+ li 16, 32
|
|
+ li 17, 48
|
|
+ li 18, 64
|
|
+ li 19, 80
|
|
+ li 20, 96
|
|
+ li 21, 112
|
|
+
|
|
+ lwz 10, 480(6)
|
|
+
|
|
+Loop_8x_block_dec:
|
|
+
|
|
+ lxvb16x 15, 0, 14 # load block
|
|
+ lxvb16x 16, 15, 14 # load block
|
|
+ lxvb16x 17, 16, 14 # load block
|
|
+ lxvb16x 18, 17, 14 # load block
|
|
+ lxvb16x 19, 18, 14 # load block
|
|
+ lxvb16x 20, 19, 14 # load block
|
|
+ lxvb16x 21, 20, 14 # load block
|
|
+ lxvb16x 22, 21, 14 # load block
|
|
+ addi 14, 14, 128
|
|
+
|
|
+ Loop_aes_middle8x
|
|
+
|
|
+ xxlor 23+32, 10, 10
|
|
+
|
|
+ cmpdi 10, 10
|
|
+ beq Do_last_aes_dec
|
|
+
|
|
+ # 192 bits
|
|
+ xxlor 24+32, 11, 11
|
|
+
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 16, 16, 23
|
|
+ vcipher 17, 17, 23
|
|
+ vcipher 18, 18, 23
|
|
+ vcipher 19, 19, 23
|
|
+ vcipher 20, 20, 23
|
|
+ vcipher 21, 21, 23
|
|
+ vcipher 22, 22, 23
|
|
+
|
|
+ vcipher 15, 15, 24
|
|
+ vcipher 16, 16, 24
|
|
+ vcipher 17, 17, 24
|
|
+ vcipher 18, 18, 24
|
|
+ vcipher 19, 19, 24
|
|
+ vcipher 20, 20, 24
|
|
+ vcipher 21, 21, 24
|
|
+ vcipher 22, 22, 24
|
|
+
|
|
+ xxlor 23+32, 12, 12
|
|
+
|
|
+ cmpdi 10, 12
|
|
+ beq Do_last_aes_dec
|
|
+
|
|
+ # 256 bits
|
|
+ xxlor 24+32, 13, 13
|
|
+
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 16, 16, 23
|
|
+ vcipher 17, 17, 23
|
|
+ vcipher 18, 18, 23
|
|
+ vcipher 19, 19, 23
|
|
+ vcipher 20, 20, 23
|
|
+ vcipher 21, 21, 23
|
|
+ vcipher 22, 22, 23
|
|
+
|
|
+ vcipher 15, 15, 24
|
|
+ vcipher 16, 16, 24
|
|
+ vcipher 17, 17, 24
|
|
+ vcipher 18, 18, 24
|
|
+ vcipher 19, 19, 24
|
|
+ vcipher 20, 20, 24
|
|
+ vcipher 21, 21, 24
|
|
+ vcipher 22, 22, 24
|
|
+
|
|
+ xxlor 23+32, 14, 14
|
|
+
|
|
+ cmpdi 10, 14
|
|
+ beq Do_last_aes_dec
|
|
+ b aes_gcm_out
|
|
+
|
|
+Do_last_aes_dec:
|
|
+
|
|
+ #
|
|
+ # last round
|
|
+ vcipherlast 15, 15, 23
|
|
+ vcipherlast 16, 16, 23
|
|
+
|
|
+ xxlxor 47, 47, 15
|
|
+ stxvb16x 47, 0, 9 # store output
|
|
+ xxlxor 48, 48, 16
|
|
+ stxvb16x 48, 15, 9 # store output
|
|
+
|
|
+ vcipherlast 17, 17, 23
|
|
+ vcipherlast 18, 18, 23
|
|
+
|
|
+ xxlxor 49, 49, 17
|
|
+ stxvb16x 49, 16, 9 # store output
|
|
+ xxlxor 50, 50, 18
|
|
+ stxvb16x 50, 17, 9 # store output
|
|
+
|
|
+ vcipherlast 19, 19, 23
|
|
+ vcipherlast 20, 20, 23
|
|
+
|
|
+ xxlxor 51, 51, 19
|
|
+ stxvb16x 51, 18, 9 # store output
|
|
+ xxlxor 52, 52, 20
|
|
+ stxvb16x 52, 19, 9 # store output
|
|
+
|
|
+ vcipherlast 21, 21, 23
|
|
+ vcipherlast 22, 22, 23
|
|
+
|
|
+ xxlxor 53, 53, 21
|
|
+ stxvb16x 53, 20, 9 # store output
|
|
+ xxlxor 54, 54, 22
|
|
+ stxvb16x 54, 21, 9 # store output
|
|
+
|
|
+ addi 9, 9, 128
|
|
+
|
|
+ xxlor 15+32, 15, 15
|
|
+ xxlor 16+32, 16, 16
|
|
+ xxlor 17+32, 17, 17
|
|
+ xxlor 18+32, 18, 18
|
|
+ xxlor 19+32, 19, 19
|
|
+ xxlor 20+32, 20, 20
|
|
+ xxlor 21+32, 21, 21
|
|
+ xxlor 22+32, 22, 22
|
|
+
|
|
+ # ghash here
|
|
+ ppc_aes_gcm_ghash2_4x
|
|
+
|
|
+ xxlor 27+32, 0, 0
|
|
+ vaddudm 30, 30, 31 # IV + counter
|
|
+ vmr 29, 30
|
|
+ vxor 15, 30, 27 # add round key
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 16, 30, 27
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 17, 30, 27
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 18, 30, 27
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 19, 30, 27
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 20, 30, 27
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 21, 30, 27
|
|
+ vaddudm 30, 30, 31
|
|
+ vxor 22, 30, 27
|
|
+ addi 12, 12, -128
|
|
+ addi 11, 11, 128
|
|
+
|
|
+ bdnz Loop_8x_block_dec
|
|
+
|
|
+ vmr 30, 29
|
|
+
|
|
+Loop_last_block_dec:
|
|
+ cmpdi 12, 0
|
|
+ beq aes_gcm_out
|
|
+
|
|
+ # loop last few blocks
|
|
+ li 10, 16
|
|
+ divdu 10, 12, 10
|
|
+
|
|
+ mtctr 10
|
|
+
|
|
+ lwz 10,480(6)
|
|
+
|
|
+ cmpdi 12, 16
|
|
+ blt Final_block_dec
|
|
+
|
|
+Next_rem_block_dec:
|
|
+ lxvb16x 15, 0, 14 # load block
|
|
+
|
|
+ Loop_aes_middle_1x
|
|
+
|
|
+ xxlor 23+32, 10, 10
|
|
+
|
|
+ cmpdi 10, 10
|
|
+ beq Do_next_1x_dec
|
|
+
|
|
+ # 192 bits
|
|
+ xxlor 24+32, 11, 11
|
|
+
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 15, 15, 24
|
|
+
|
|
+ xxlor 23+32, 12, 12
|
|
+
|
|
+ cmpdi 10, 12
|
|
+ beq Do_next_1x_dec
|
|
+
|
|
+ # 256 bits
|
|
+ xxlor 24+32, 13, 13
|
|
+
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 15, 15, 24
|
|
+
|
|
+ xxlor 23+32, 14, 14
|
|
+
|
|
+ cmpdi 10, 14
|
|
+ beq Do_next_1x_dec
|
|
+
|
|
+Do_next_1x_dec:
|
|
+ vcipherlast 15, 15, 23
|
|
+
|
|
+ xxlxor 47, 47, 15
|
|
+ stxvb16x 47, 0, 9 # store output
|
|
+ addi 14, 14, 16
|
|
+ addi 9, 9, 16
|
|
+
|
|
+ xxlor 28+32, 15, 15
|
|
+ ppc_update_hash_1x
|
|
+
|
|
+ addi 12, 12, -16
|
|
+ addi 11, 11, 16
|
|
+ xxlor 19+32, 0, 0
|
|
+ vaddudm 30, 30, 31 # IV + counter
|
|
+ vxor 15, 30, 19 # add round key
|
|
+
|
|
+ bdnz Next_rem_block_dec
|
|
+
|
|
+ cmpdi 12, 0
|
|
+ beq aes_gcm_out
|
|
+
|
|
+Final_block_dec:
|
|
+ Loop_aes_middle_1x
|
|
+
|
|
+ xxlor 23+32, 10, 10
|
|
+
|
|
+ cmpdi 10, 10
|
|
+ beq Do_final_1x_dec
|
|
+
|
|
+ # 192 bits
|
|
+ xxlor 24+32, 11, 11
|
|
+
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 15, 15, 24
|
|
+
|
|
+ xxlor 23+32, 12, 12
|
|
+
|
|
+ cmpdi 10, 12
|
|
+ beq Do_final_1x_dec
|
|
+
|
|
+ # 256 bits
|
|
+ xxlor 24+32, 13, 13
|
|
+
|
|
+ vcipher 15, 15, 23
|
|
+ vcipher 15, 15, 24
|
|
+
|
|
+ xxlor 23+32, 14, 14
|
|
+
|
|
+ cmpdi 10, 14
|
|
+ beq Do_final_1x_dec
|
|
+
|
|
+Do_final_1x_dec:
|
|
+ vcipherlast 15, 15, 23
|
|
+
|
|
+ lxvb16x 15, 0, 14 # load block
|
|
+ xxlxor 47, 47, 15
|
|
+
|
|
+ # create partial block mask
|
|
+ li 15, 16
|
|
+ sub 15, 15, 12 # index to the mask
|
|
+
|
|
+ vspltisb 16, -1 # first 16 bytes - 0xffff...ff
|
|
+ vspltisb 17, 0 # second 16 bytes - 0x0000...00
|
|
+ li 10, 192
|
|
+ stvx 16, 10, 1
|
|
+ addi 10, 10, 16
|
|
+ stvx 17, 10, 1
|
|
+
|
|
+ addi 10, 1, 192
|
|
+ lxvb16x 16, 15, 10 # load block mask
|
|
+ xxland 47, 47, 16
|
|
+
|
|
+ xxlor 28+32, 15, 15
|
|
+ ppc_update_hash_1x
|
|
+
|
|
+ # * should store only the remaining bytes.
|
|
+ bl Write_partial_block
|
|
+
|
|
+ b aes_gcm_out
|
|
+#
|
|
Index: libgcrypt-1.9.4/cipher/rijndael-p10le.c
|
|
===================================================================
|
|
--- /dev/null
|
|
+++ libgcrypt-1.9.4/cipher/rijndael-p10le.c
|
|
@@ -0,0 +1,119 @@
|
|
+/* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES implementation
|
|
+ * Copyright 2021- IBM Inc. All rights reserved
|
|
+ *
|
|
+ * This file is part of Libgcrypt.
|
|
+ *
|
|
+ * Libgcrypt is free software; you can redistribute it and/or modify
|
|
+ * it under the terms of the GNU Lesser General Public License as
|
|
+ * published by the Free Software Foundation; either version 2.1 of
|
|
+ * the License, or (at your option) any later version.
|
|
+ *
|
|
+ * Libgcrypt is distributed in the hope that it will be useful,
|
|
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
+ * GNU Lesser General Public License for more details.
|
|
+ *
|
|
+ * You should have received a copy of the GNU Lesser General Public
|
|
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
|
|
+ *
|
|
+ * Alternatively, this code may be used in OpenSSL from The OpenSSL Project,
|
|
+ * and Cryptogams by Andy Polyakov, and if made part of a release of either
|
|
+ * or both projects, is thereafter dual-licensed under the license said project
|
|
+ * is released under.
|
|
+ */
|
|
+
|
|
+#include <config.h>
|
|
+
|
|
+#include "rijndael-internal.h"
|
|
+#include "cipher-internal.h"
|
|
+#include "bufhelp.h"
|
|
+
|
|
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
|
|
+
|
|
+
|
|
+extern size_t _gcry_ppc10_aes_gcm_encrypt (const void *inp, void *out,
|
|
+ size_t len,
|
|
+ const unsigned char *key,
|
|
+ unsigned char iv[16], void *Xip);
|
|
+extern size_t _gcry_ppc10_aes_gcm_decrypt (const void *inp, void *out,
|
|
+ size_t len,
|
|
+ const unsigned char *key,
|
|
+ unsigned char iv[16], void *Xip);
|
|
+
|
|
+size_t
|
|
+_gcry_aes_p10le_gcm_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
|
|
+			  const void *inbuf_arg, size_t nblocks, int encrypt)
|
|
+{
|
|
+  RIJNDAEL_context *ctx = (RIJNDAEL_context *) &c->context.c;
|
|
+  unsigned char *rk = (unsigned char *) ctx->u1.keyschedule;
|
|
+  unsigned char *gcm_table = (unsigned char *) c->u_mode.gcm.gcm_table;
|
|
+  unsigned char *iv = c->u_ctr.ctr;
|
|
+  unsigned char *Xi = c->u_mode.gcm.u_tag.tag;
|
|
+  size_t s = 0;      /* size_t, not int: holds the assembly's size_t result */
|
|
+  size_t ndone = 0;  /* sum of the assembly results over all passes */
|
|
+  int ctr_reset = 0;
|
|
+  size_t len = nblocks * GCRY_GCM_BLOCK_LEN;
|
|
+  u64 blocks_unused;
|
|
+  u64 nb = nblocks;
|
|
+  u64 next_ctr = 0;
|
|
+  unsigned char ctr_saved[12];
|
|
+  unsigned char *inp = (unsigned char *) inbuf_arg;
|
|
+  unsigned char *out = (unsigned char *) outbuf_arg;
|
|
+
|
|
+  /*
|
|
+   * Bulk AES-GCM en/decryption (ENCRYPT != 0 encrypts) of NBLOCKS blocks
|
|
+   * through the P10 assembly code, which expects these input parameters:
|
|
+   *   - Number of rounds is at 480 offset from rk (rk->rounds)
|
|
+   *   - Xi at 256 offset from gcm_table
|
|
+   */
|
|
+  gcry_assert (sizeof(c->u_mode.gcm.gcm_table) >= 256 + 16);
|
|
+  buf_cpy (gcm_table+256, Xi, 16);       /* place Xi where the asm reads it */
|
|
+  buf_cpy (ctr_saved, c->u_ctr.ctr, 12); /* keep the upper 12 counter bytes */
|
|
+
|
|
+  while (nb)
|
|
+    { /* Blocks left before the 32-bit counter in iv[12..15] wraps.  */
|
|
+      blocks_unused = (u64) 0xffffffffU + 1 - (u64) buf_get_be32 (iv + 12);
|
|
+      if (nb > blocks_unused)
|
|
+	{ /* Stop at the wrap point; the rest is done in the next pass.  */
|
|
+	  len = blocks_unused * GCRY_GCM_BLOCK_LEN;
|
|
+	  nb -= blocks_unused;
|
|
+	  next_ctr = blocks_unused;
|
|
+	  ctr_reset = 1;
|
|
+	}
|
|
+      else
|
|
+	{ /* Everything that remains fits before the counter wraps.  */
|
|
+	  len = nb * GCRY_GCM_BLOCK_LEN;
|
|
+	  next_ctr = nb;
|
|
+	  nb = 0;
|
|
+	}
|
|
+
|
|
+      if (encrypt)
|
|
+	s = _gcry_ppc10_aes_gcm_encrypt((const void *) inp, (void *) out, len,
|
|
+					(const unsigned char *) rk, iv,
|
|
+					(void *) gcm_table);
|
|
+      else
|
|
+	s = _gcry_ppc10_aes_gcm_decrypt((const void *) inp, (void *) out, len,
|
|
+					(const unsigned char *) rk, iv,
|
|
+					(void *) gcm_table);
|
|
+
|
|
+      cipher_block_add(c->u_ctr.ctr, next_ctr, GCRY_GCM_BLOCK_LEN);
|
|
+      if (ctr_reset)
|
|
+	{ /* Input remains beyond the wrap point: advance both buffers.  */
|
|
+	  ctr_reset = 0;
|
|
+	  inp += len;
|
|
+	  out += len;
|
|
+	}
|
|
+      buf_cpy (c->u_ctr.ctr, ctr_saved, 12); /* drop carry out of low 32 bits */
|
|
+      ndone += s;
|
|
+    }
|
|
+  buf_cpy (Xi, gcm_table+256, 16);       /* copy Xi back from the table */
|
|
+
|
|
+  /*
|
|
+   * Return the number of blocks left unprocessed (NBLOCKS - blocks done).
|
|
+   */
|
|
+  s = ndone / GCRY_GCM_BLOCK_LEN;
|
|
+  s = nblocks - s;
|
|
+  return ( s );
|
|
+}
|
|
+
|
|
+#endif /* USE_PPC_CRYPTO_WITH_PPC9LE */
|
|
Index: libgcrypt-1.9.4/cipher/rijndael.c
|
|
===================================================================
|
|
--- libgcrypt-1.9.4.orig/cipher/rijndael.c
|
|
+++ libgcrypt-1.9.4/cipher/rijndael.c
|
|
@@ -295,6 +295,10 @@ extern void _gcry_aes_ppc9le_xts_crypt (
|
|
void *outbuf_arg,
|
|
const void *inbuf_arg,
|
|
size_t nblocks, int encrypt);
|
|
+
|
|
+extern size_t _gcry_aes_p10le_gcm_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
|
|
+ const void *inbuf_arg,
|
|
+ size_t nblocks, int encrypt);
|
|
#endif /*USE_PPC_CRYPTO_WITH_PPC9LE*/
|
|
|
|
#ifdef USE_S390X_CRYPTO
|
|
@@ -586,6 +590,8 @@ do_setkey (RIJNDAEL_context *ctx, const
|
|
bulk_ops->ocb_crypt = _gcry_aes_ppc9le_ocb_crypt;
|
|
bulk_ops->ocb_auth = _gcry_aes_ppc9le_ocb_auth;
|
|
bulk_ops->xts_crypt = _gcry_aes_ppc9le_xts_crypt;
|
|
+ if (hwfeatures & HWF_PPC_ARCH_3_10) /* for P10 */
|
|
+ bulk_ops->gcm_crypt = _gcry_aes_p10le_gcm_crypt;
|
|
}
|
|
#endif
|
|
#ifdef USE_PPC_CRYPTO
|
|
Index: libgcrypt-1.9.4/configure.ac
|
|
===================================================================
|
|
--- libgcrypt-1.9.4.orig/configure.ac
|
|
+++ libgcrypt-1.9.4/configure.ac
|
|
@@ -2597,6 +2597,12 @@ if test "$found" = "1" ; then
|
|
# Build with the crypto extension implementation
|
|
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo"
|
|
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc9le.lo"
|
|
+ if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" &&
|
|
+ test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then
|
|
+ # Build with AES-GCM bulk implementation for P10
|
|
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-gcm-p10le.lo"
|
|
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-p10le.lo"
|
|
+ fi
|
|
;;
|
|
powerpc64-*-*)
|
|
# Big-Endian.
|
|
Index: libgcrypt-1.9.4/src/g10lib.h
|
|
===================================================================
|
|
--- libgcrypt-1.9.4.orig/src/g10lib.h
|
|
+++ libgcrypt-1.9.4/src/g10lib.h
|
|
@@ -252,6 +252,7 @@ char **_gcry_strtokenize (const char *st
|
|
#define HWF_PPC_VCRYPTO (1 << 0)
|
|
#define HWF_PPC_ARCH_3_00 (1 << 1)
|
|
#define HWF_PPC_ARCH_2_07 (1 << 2)
|
|
+#define HWF_PPC_ARCH_3_10 (1 << 3)
|
|
|
|
#elif defined(HAVE_CPU_ARCH_S390X)
|
|
|
|
Index: libgcrypt-1.9.4/src/hwf-ppc.c
|
|
===================================================================
|
|
--- libgcrypt-1.9.4.orig/src/hwf-ppc.c
|
|
+++ libgcrypt-1.9.4/src/hwf-ppc.c
|
|
@@ -92,6 +92,9 @@ struct feature_map_s
|
|
#ifndef PPC_FEATURE2_ARCH_3_00
|
|
# define PPC_FEATURE2_ARCH_3_00 0x00800000
|
|
#endif
|
|
+#ifndef PPC_FEATURE2_ARCH_3_10
|
|
+# define PPC_FEATURE2_ARCH_3_10 0x00040000
|
|
+#endif
|
|
|
|
static const struct feature_map_s ppc_features[] =
|
|
{
|
|
Index: libgcrypt-1.9.4/src/hwfeatures.c
|
|
===================================================================
|
|
--- libgcrypt-1.9.4.orig/src/hwfeatures.c
|
|
+++ libgcrypt-1.9.4/src/hwfeatures.c
|
|
@@ -72,6 +72,7 @@ static struct
|
|
{ HWF_PPC_VCRYPTO, "ppc-vcrypto" },
|
|
{ HWF_PPC_ARCH_3_00, "ppc-arch_3_00" },
|
|
{ HWF_PPC_ARCH_2_07, "ppc-arch_2_07" },
|
|
+ { HWF_PPC_ARCH_3_10, "ppc-arch_3_10" },
|
|
#elif defined(HAVE_CPU_ARCH_S390X)
|
|
{ HWF_S390X_MSA, "s390x-msa" },
|
|
{ HWF_S390X_MSA_4, "s390x-msa-4" },
|