libgcrypt/libgcrypt-Bulk-implementation-of-AES-GCM-acceleration-ppc64le.patch

From 7205c715b3e0f6fd0b853e8916d174048f43c03b Mon Sep 17 00:00:00 2001
From: Danny Tsen <dtsen@us.ibm.com>
Date: Tue, 14 Dec 2021 20:03:06 +0200
Subject: AES-GCM: Bulk implementation of AES-GCM acceleration for ppc64le

* configure.ac: Added p10 assembly implementation file and assiciated file.
* cipher/Makefile.am: Added p10 assembly implementation file and associated
file.
* cipher/rijndael.c: Added p10 function.
* cipher/rijndael-p10le.c: New wrapper file for AES-GCM call.
* cipher/rijndael-gcm-p10le.s: New implementation of AES-GCM bulk function in
Power Assembly.
* src/g10lib.h: Added Power arch 3.1 definition for p10.
* src/hwf-ppc.c: Added Power arch 3.1 definition for p10.
* src/hwfeatures.c: Added Power arch 3.1 definition for p10.
--

GnuPG-bug-id: 5700
Signed-off-by: Danny Tsen <dtsen@us.ibm.com>
[jk: fixes for C coding style]
[jk: prefix assembly functions with '_gcry_ppc10']
[jk: add assert check for gcm_table size]
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>

Index: libgcrypt-1.9.4/cipher/Makefile.am
===================================================================
--- libgcrypt-1.9.4.orig/cipher/Makefile.am
+++ libgcrypt-1.9.4/cipher/Makefile.am
@@ -105,6 +105,7 @@ EXTRA_libcipher_la_SOURCES = \
 	rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S    \
 	rijndael-armv8-aarch64-ce.S rijndael-aarch64.S     \
 	rijndael-ppc.c rijndael-ppc9le.c                   \
+	rijndael-p10le.c rijndael-gcm-p10le.s             \
 	rijndael-ppc-common.h rijndael-ppc-functions.h     \
 	rijndael-s390x.c                                   \
 	rmd160.c \
@@ -242,6 +243,12 @@ rijndael-ppc9le.o: $(srcdir)/rijndael-pp
 rijndael-ppc9le.lo: $(srcdir)/rijndael-ppc9le.c Makefile
 	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

+rijndael-p10le.o: $(srcdir)/rijndael-p10le.c Makefile
+	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+rijndael-p10le.lo: $(srcdir)/rijndael-p10le.c Makefile
+	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
 sha256-ppc.o: $(srcdir)/sha256-ppc.c Makefile
 	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

Index: libgcrypt-1.9.4/cipher/rijndael-gcm-p10le.s
===================================================================
--- /dev/null
+++ libgcrypt-1.9.4/cipher/rijndael-gcm-p10le.s
@@ -0,0 +1,1401 @@
+# Copyright 2021- IBM Inc. All rights reserved
+#
+# This file is part of Libgcrypt.
+#
+# Libgcrypt is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of
+# the License, or (at your option) any later version.
+#
+# Libgcrypt is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# GHASH is based on the Karatsuba multiplication method.
+#
+#    Xi xor X1
+#
+#    X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H =
+#      (X1.h * H4.h + xX.l * H4.l + X1 * H4) +
+#      (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
+#      (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
+#      (X4.h * H.h + X4.l * H.l + X4 * H)
+#
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+#     ( H.l, H, H.h)
+#     ( H^2.l, H^2, H^2.h)
+#     ( H^3.l, H^3, H^3.h)
+#     ( H^4.l, H^4, H^4.h)
+#
+# v30 is IV
+# v31 - counter 1
+#
+# AES used,
+#     vs0 - vs14 for round keys
+#     v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
+#
+# This implementation uses stitched AES-GCM approach to improve overall performance.
+# AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
+#
+# Current performance with 128 bit key using bench-slope on Power10[le] (3.89GHz):
+#
+# AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
+#        GCM enc |     0.169 ns/B      5643 MiB/s         - c/B
+#        GCM dec |     0.171 ns/B      5585 MiB/s         - c/B
+#
+# ===================================================================================
+#
+
+.machine        "any"
+.abiversion     2
+.text
+
+# 4x loops
+# v15 - v18 - input states
+# vs1 - vs9 - round keys
+#
+.macro Loop_aes_middle4x
+	xxlor	19+32, 1, 1
+	xxlor	20+32, 2, 2
+	xxlor	21+32, 3, 3
+	xxlor	22+32, 4, 4
+
+	vcipher	15, 15, 19
+	vcipher	16, 16, 19
+	vcipher	17, 17, 19
+	vcipher	18, 18, 19
+
+	vcipher	15, 15, 20
+	vcipher	16, 16, 20
+	vcipher	17, 17, 20
+	vcipher	18, 18, 20
+
+	vcipher	15, 15, 21
+	vcipher	16, 16, 21
+	vcipher	17, 17, 21
+	vcipher	18, 18, 21
+
+	vcipher	15, 15, 22
+	vcipher	16, 16, 22
+	vcipher	17, 17, 22
+	vcipher	18, 18, 22
+
+	xxlor	19+32, 5, 5
+	xxlor	20+32, 6, 6
+	xxlor	21+32, 7, 7
+	xxlor	22+32, 8, 8
+
+	vcipher	15, 15, 19
+	vcipher	16, 16, 19
+	vcipher	17, 17, 19
+	vcipher	18, 18, 19
+
+	vcipher	15, 15, 20
+	vcipher	16, 16, 20
+	vcipher	17, 17, 20
+	vcipher	18, 18, 20
+
+	vcipher	15, 15, 21
+	vcipher	16, 16, 21
+	vcipher	17, 17, 21
+	vcipher	18, 18, 21
+
+	vcipher	15, 15, 22
+	vcipher	16, 16, 22
+	vcipher	17, 17, 22
+	vcipher	18, 18, 22
+
+	xxlor	23+32, 9, 9
+	vcipher	15, 15, 23
+	vcipher	16, 16, 23
+	vcipher	17, 17, 23
+	vcipher	18, 18, 23
+.endm
+
+# 8x loops
+# v15 - v22 - input states
+# vs1 - vs9 - round keys
+#
+.macro Loop_aes_middle8x
+	xxlor	23+32, 1, 1
+	xxlor	24+32, 2, 2
+	xxlor	25+32, 3, 3
+	xxlor	26+32, 4, 4
+
+	vcipher	15, 15, 23
+	vcipher	16, 16, 23
+	vcipher	17, 17, 23
+	vcipher	18, 18, 23
+	vcipher	19, 19, 23
+	vcipher	20, 20, 23
+	vcipher	21, 21, 23
+	vcipher	22, 22, 23
+
+	vcipher	15, 15, 24
+	vcipher	16, 16, 24
+	vcipher	17, 17, 24
+	vcipher	18, 18, 24
+	vcipher	19, 19, 24
+	vcipher	20, 20, 24
+	vcipher	21, 21, 24
+	vcipher	22, 22, 24
+
+	vcipher	15, 15, 25
+	vcipher	16, 16, 25
+	vcipher	17, 17, 25
+	vcipher	18, 18, 25
+	vcipher	19, 19, 25
+	vcipher	20, 20, 25
+	vcipher	21, 21, 25
+	vcipher	22, 22, 25
+
+	vcipher	15, 15, 26
+	vcipher	16, 16, 26
+	vcipher	17, 17, 26
+	vcipher	18, 18, 26
+	vcipher	19, 19, 26
+	vcipher	20, 20, 26
+	vcipher	21, 21, 26
+	vcipher	22, 22, 26
+
+	xxlor	23+32, 5, 5
+	xxlor	24+32, 6, 6
+	xxlor	25+32, 7, 7
+	xxlor	26+32, 8, 8
+
+	vcipher	15, 15, 23
+	vcipher	16, 16, 23
+	vcipher	17, 17, 23
+	vcipher	18, 18, 23
+	vcipher	19, 19, 23
+	vcipher	20, 20, 23
+	vcipher	21, 21, 23
+	vcipher	22, 22, 23
+
+	vcipher	15, 15, 24
+	vcipher	16, 16, 24
+	vcipher	17, 17, 24
+	vcipher	18, 18, 24
+	vcipher	19, 19, 24
+	vcipher	20, 20, 24
+	vcipher	21, 21, 24
+	vcipher	22, 22, 24
+
+	vcipher	15, 15, 25
+	vcipher	16, 16, 25
+	vcipher	17, 17, 25
+	vcipher	18, 18, 25
+	vcipher	19, 19, 25
+	vcipher	20, 20, 25
+	vcipher	21, 21, 25
+	vcipher	22, 22, 25
+
+	vcipher	15, 15, 26
+	vcipher	16, 16, 26
+	vcipher	17, 17, 26
+	vcipher	18, 18, 26
+	vcipher	19, 19, 26
+	vcipher	20, 20, 26
+	vcipher	21, 21, 26
+	vcipher	22, 22, 26
+
+	xxlor	23+32, 9, 9
+	vcipher	15, 15, 23
+	vcipher	16, 16, 23
+	vcipher	17, 17, 23
+	vcipher	18, 18, 23
+	vcipher	19, 19, 23
+	vcipher	20, 20, 23
+	vcipher	21, 21, 23
+	vcipher	22, 22, 23
+.endm
+
+#
+# Compute 4x hash values based on Karatsuba method.
+#
+ppc_aes_gcm_ghash:
+	vxor		15, 15, 0
+
+	xxlxor		29, 29, 29
+
+	vpmsumd		23, 12, 15		# H4.L * X.L
+	vpmsumd		24, 9, 16
+	vpmsumd		25, 6, 17
+	vpmsumd		26, 3, 18
+
+	vxor		23, 23, 24
+	vxor		23, 23, 25
+	vxor		23, 23, 26		# L
+
+	vpmsumd		24, 13, 15		# H4.L * X.H + H4.H * X.L
+	vpmsumd		25, 10, 16		# H3.L * X1.H + H3.H * X1.L
+	vpmsumd		26, 7, 17
+	vpmsumd		27, 4, 18
+
+	vxor		24, 24, 25
+	vxor		24, 24, 26
+	vxor		24, 24, 27		# M
+
+	# sum hash and reduction with H Poly
+	vpmsumd		28, 23, 2		# reduction
+
+	xxlor		29+32, 29, 29
+	vsldoi		26, 24, 29, 8		# mL
+	vsldoi		29, 29, 24, 8		# mH
+	vxor		23, 23, 26		# mL + L
+
+	vsldoi		23, 23, 23, 8		# swap
+	vxor		23, 23, 28
+
+	vpmsumd		24, 14, 15		# H4.H * X.H
+	vpmsumd		25, 11, 16
+	vpmsumd		26, 8, 17
+	vpmsumd		27, 5, 18
+
+	vxor		24, 24, 25
+	vxor		24, 24, 26
+	vxor		24, 24, 27
+
+	vxor		24, 24, 29
+
+	# sum hash and reduction with H Poly
+	vsldoi		27, 23, 23, 8		# swap
+	vpmsumd		23, 23, 2
+	vxor		27, 27, 24
+	vxor		23, 23, 27
+
+	xxlor		32, 23+32, 23+32		# update hash
+
+	blr
+
+#
+# Combine two 4x ghash
+# v15 - v22 - input blocks
+#
+.macro ppc_aes_gcm_ghash2_4x
+	# first 4x hash
+	vxor		15, 15, 0		# Xi + X
+
+	xxlxor		29, 29, 29
+
+	vpmsumd		23, 12, 15		# H4.L * X.L
+	vpmsumd		24, 9, 16
+	vpmsumd		25, 6, 17
+	vpmsumd		26, 3, 18
+
+	vxor		23, 23, 24
+	vxor		23, 23, 25
+	vxor		23, 23, 26		# L
+
+	vpmsumd		24, 13, 15		# H4.L * X.H + H4.H * X.L
+	vpmsumd		25, 10, 16		# H3.L * X1.H + H3.H * X1.L
+	vpmsumd		26, 7, 17
+	vpmsumd		27, 4, 18
+
+	vxor		24, 24, 25
+	vxor		24, 24, 26
+
+	# sum hash and reduction with H Poly
+	vpmsumd		28, 23, 2		# reduction
+
+	xxlor		29+32, 29, 29
+
+	vxor		24, 24, 27		# M
+	vsldoi		26, 24, 29, 8		# mL
+	vsldoi		29, 29, 24, 8		# mH
+	vxor		23, 23, 26		# mL + L
+
+	vsldoi		23, 23, 23, 8		# swap
+	vxor		23, 23, 28
+
+	vpmsumd		24, 14, 15		# H4.H * X.H
+	vpmsumd		25, 11, 16
+	vpmsumd		26, 8, 17
+	vpmsumd		27, 5, 18
+
+	vxor		24, 24, 25
+	vxor		24, 24, 26
+	vxor		24, 24, 27		# H
+
+	vxor		24, 24, 29		# H + mH
+
+	# sum hash and reduction with H Poly
+	vsldoi		27, 23, 23, 8		# swap
+	vpmsumd		23, 23, 2
+	vxor		27, 27, 24
+	vxor		27, 23, 27		# 1st Xi
+
+	# 2nd 4x hash
+	vpmsumd		24, 9, 20
+	vpmsumd		25, 6, 21
+	vpmsumd		26, 3, 22
+	vxor		19, 19, 27		# Xi + X
+	vpmsumd		23, 12, 19		# H4.L * X.L
+
+	vxor		23, 23, 24
+	vxor		23, 23, 25
+	vxor		23, 23, 26		# L
+
+	vpmsumd		24, 13, 19		# H4.L * X.H + H4.H * X.L
+	vpmsumd		25, 10, 20		# H3.L * X1.H + H3.H * X1.L
+	vpmsumd		26, 7, 21
+	vpmsumd		27, 4, 22
+
+	vxor		24, 24, 25
+	vxor		24, 24, 26
+
+	# sum hash and reduction with H Poly
+	vpmsumd		28, 23, 2		# reduction
+
+	xxlor		29+32, 29, 29
+
+	vxor		24, 24, 27		# M
+	vsldoi		26, 24, 29, 8		# mL
+	vsldoi		29, 29, 24, 8		# mH
+	vxor		23, 23, 26		# mL + L
+
+	vsldoi		23, 23, 23, 8		# swap
+	vxor		23, 23, 28
+
+	vpmsumd		24, 14, 19		# H4.H * X.H
+	vpmsumd		25, 11, 20
+	vpmsumd		26, 8, 21
+	vpmsumd		27, 5, 22
+
+	vxor		24, 24, 25
+	vxor		24, 24, 26
+	vxor		24, 24, 27		# H
+
+	vxor		24, 24, 29		# H + mH
+
+	# sum hash and reduction with H Poly
+	vsldoi		27, 23, 23, 8		# swap
+	vpmsumd		23, 23, 2
+	vxor		27, 27, 24
+	vxor		23, 23, 27
+
+	xxlor		32, 23+32, 23+32		# update hash
+
+.endm
+
+#
+# Compute update single hash
+#
+.macro ppc_update_hash_1x
+	vxor		28, 28, 0
+
+	vxor		19, 19, 19
+
+	vpmsumd		22, 3, 28		# L
+	vpmsumd		23, 4, 28		# M
+	vpmsumd		24, 5, 28		# H
+
+	vpmsumd		27, 22, 2		# reduction
+
+	vsldoi		25, 23, 19, 8		# mL
+	vsldoi		26, 19, 23, 8		# mH
+	vxor		22, 22, 25		# LL + LL
+	vxor		24, 24, 26		# HH + HH
+
+	vsldoi		22, 22, 22, 8		# swap
+	vxor		22, 22, 27
+
+	vsldoi		20, 22, 22, 8		# swap
+	vpmsumd		22, 22, 2		# reduction
+	vxor		20, 20, 24
+	vxor		22, 22, 20
+
+	vmr		0, 22			# update hash
+
+.endm
+
+#
+# libgcrypt:
+# _gcry_ppc10_aes_gcm_encrypt (const void *inp, void *out, size_t len,
+#               const char *rk, unsigned char iv[16], void *Xip);
+#
+#    r3 - inp
+#    r4 - out
+#    r5 - len
+#    r6 - AES round keys
+#    r7 - iv
+#    r8 - HPoli, hash keys, Xi
+#
+#    rounds is at offset 480 in rk
+#    Xi is at 256 in gcm_table (Xip).
+#
+.global _gcry_ppc10_aes_gcm_encrypt
+.align 5
+_gcry_ppc10_aes_gcm_encrypt:
+_gcry_ppc_aes_gcm_encrypt:
+
+	stdu 1,-512(1)
+	mflr 0
+
+	std	14,112(1)
+	std	15,120(1)
+	std	16,128(1)
+	std	17,136(1)
+	std	18,144(1)
+	std	19,152(1)
+	std	20,160(1)
+	std	21,168(1)
+	li	9, 256
+	stvx	20, 9, 1
+	addi	9, 9, 16
+	stvx	21, 9, 1
+	addi	9, 9, 16
+	stvx	22, 9, 1
+	addi	9, 9, 16
+	stvx	23, 9, 1
+	addi	9, 9, 16
+	stvx	24, 9, 1
+	addi	9, 9, 16
+	stvx	25, 9, 1
+	addi	9, 9, 16
+	stvx	26, 9, 1
+	addi	9, 9, 16
+	stvx	27, 9, 1
+	addi	9, 9, 16
+	stvx	28, 9, 1
+	addi	9, 9, 16
+	stvx	29, 9, 1
+	addi	9, 9, 16
+	stvx	30, 9, 1
+	addi	9, 9, 16
+	stvx	31, 9, 1
+	std	0, 528(1)
+
+	# Load Xi
+	li	10, 256
+	lxvb16x	32, 10, 8	# load Xi
+
+	# load Hash - h^4, h^3, h^2, h
+	lxvd2x	2+32, 0, 8	# H Poli
+	li	10, 16
+	lxvd2x	3+32, 10, 8	# Hl
+	li	10, 32
+	lxvd2x	4+32, 10, 8	# H
+	li	10, 48
+	lxvd2x	5+32, 10, 8	# Hh
+
+	li	10, 64
+	lxvd2x	6+32, 10, 8	# H^2l
+	li	10, 80
+	lxvd2x	7+32, 10, 8	# H^2
+	li	10, 96
+	lxvd2x	8+32, 10, 8	# H^2h
+
+	li	10, 112
+	lxvd2x	9+32, 10, 8	# H^3l
+	li	10, 128
+	lxvd2x	10+32, 10, 8	# H^3
+	li	10, 144
+	lxvd2x	11+32, 10, 8	# H^3h
+
+	li	10, 160
+	lxvd2x	12+32, 10, 8	# H^4l
+	li	10, 176
+	lxvd2x	13+32, 10, 8	# H^4
+	li	10, 192
+	lxvd2x	14+32, 10, 8	# H^4h
+
+	# initialize ICB: GHASH( IV ), IV - r7
+	lxvb16x	30+32, 0, 7	# load IV  - v30
+
+	mr	12, 5		# length
+	li	11, 0		# block index
+
+	# counter 1
+	vxor	31, 31, 31
+	vspltisb 22, 1
+	vsldoi	31, 31, 22,1	# counter 1
+
+	# load round key to VSR
+	lxv	0, 0(6)
+	lxv	1, 0x10(6)
+	lxv	2, 0x20(6)
+	lxv	3, 0x30(6)
+	lxv	4, 0x40(6)
+	lxv	5, 0x50(6)
+	lxv	6, 0x60(6)
+	lxv	7, 0x70(6)
+	lxv	8, 0x80(6)
+	lxv	9, 0x90(6)
+	lxv	10, 0xa0(6)
+
+	# load rounds - 10 (128), 12 (192), 14 (256)
+	lwz	9,480(6)
+
+	#
+	# vxor	state, state, w # addroundkey
+	xxlor	32+29, 0, 0
+	vxor	15, 30, 29	# IV + round key - add round key 0
+
+	cmpdi	9, 10
+	beq	Loop_aes_gcm_8x
+
+	# load 2 more round keys (v11, v12)
+	lxv	11, 0xb0(6)
+	lxv	12, 0xc0(6)
+
+	cmpdi	9, 12
+	beq	Loop_aes_gcm_8x
+
+	# load 2 more round keys (v11, v12, v13, v14)
+	lxv	13, 0xd0(6)
+	lxv	14, 0xe0(6)
+	cmpdi	9, 14
+	beq	Loop_aes_gcm_8x
+
+	b	aes_gcm_out
+
+.align 5
+Loop_aes_gcm_8x:
+	mr	14, 3
+	mr	9, 4
+
+	# n blcoks
+	li	10, 128
+	divdu	10, 5, 10	# n 128 bytes-blocks
+	cmpdi	10, 0
+	beq	Loop_last_block
+
+	vaddudm	30, 30, 31	# IV + counter
+	vxor	16, 30, 29
+	vaddudm	30, 30, 31
+	vxor	17, 30, 29
+	vaddudm	30, 30, 31
+	vxor	18, 30, 29
+	vaddudm	30, 30, 31
+	vxor	19, 30, 29
+	vaddudm	30, 30, 31
+	vxor	20, 30, 29
+	vaddudm	30, 30, 31
+	vxor	21, 30, 29
+	vaddudm	30, 30, 31
+	vxor	22, 30, 29
+
+	mtctr	10
+
+	li	15, 16
+	li	16, 32
+	li	17, 48
+	li	18, 64
+	li	19, 80
+	li	20, 96
+	li	21, 112
+
+	lwz	10, 480(6)
+
+Loop_8x_block:
+
+	lxvb16x		15, 0, 14	# load block
+	lxvb16x		16, 15, 14	# load block
+	lxvb16x		17, 16, 14	# load block
+	lxvb16x		18, 17, 14	# load block
+	lxvb16x		19, 18, 14	# load block
+	lxvb16x		20, 19, 14	# load block
+	lxvb16x		21, 20, 14	# load block
+	lxvb16x		22, 21, 14	# load block
+	addi		14, 14, 128
+
+	Loop_aes_middle8x
+
+	xxlor	23+32, 10, 10
+
+	cmpdi	10, 10
+	beq	Do_next_ghash
+
+	# 192 bits
+	xxlor	24+32, 11, 11
+
+	vcipher	15, 15, 23
+	vcipher	16, 16, 23
+	vcipher	17, 17, 23
+	vcipher	18, 18, 23
+	vcipher	19, 19, 23
+	vcipher	20, 20, 23
+	vcipher	21, 21, 23
+	vcipher	22, 22, 23
+
+	vcipher	15, 15, 24
+	vcipher	16, 16, 24
+	vcipher	17, 17, 24
+	vcipher	18, 18, 24
+	vcipher	19, 19, 24
+	vcipher	20, 20, 24
+	vcipher	21, 21, 24
+	vcipher	22, 22, 24
+
+	xxlor	23+32, 12, 12
+
+	cmpdi	10, 12
+	beq	Do_next_ghash
+
+	# 256 bits
+	xxlor	24+32, 13, 13
+
+	vcipher	15, 15, 23
+	vcipher	16, 16, 23
+	vcipher	17, 17, 23
+	vcipher	18, 18, 23
+	vcipher	19, 19, 23
+	vcipher	20, 20, 23
+	vcipher	21, 21, 23
+	vcipher	22, 22, 23
+
+	vcipher	15, 15, 24
+	vcipher	16, 16, 24
+	vcipher	17, 17, 24
+	vcipher	18, 18, 24
+	vcipher	19, 19, 24
+	vcipher	20, 20, 24
+	vcipher	21, 21, 24
+	vcipher	22, 22, 24
+
+	xxlor	23+32, 14, 14
+
+	cmpdi	10, 14
+	beq	Do_next_ghash
+	b	aes_gcm_out
+
+Do_next_ghash:
+
+	#
+	# last round
+	vcipherlast     15, 15, 23
+	vcipherlast     16, 16, 23
+
+	xxlxor		47, 47, 15
+	stxvb16x        47, 0, 9	# store output
+	xxlxor		48, 48, 16
+	stxvb16x        48, 15, 9	# store output
+
+	vcipherlast     17, 17, 23
+	vcipherlast     18, 18, 23
+
+	xxlxor		49, 49, 17
+	stxvb16x        49, 16, 9	# store output
+	xxlxor		50, 50, 18
+	stxvb16x        50, 17, 9	# store output
+
+	vcipherlast     19, 19, 23
+	vcipherlast     20, 20, 23
+
+	xxlxor		51, 51, 19
+	stxvb16x        51, 18, 9	# store output
+	xxlxor		52, 52, 20
+	stxvb16x        52, 19, 9	# store output
+
+	vcipherlast     21, 21, 23
+	vcipherlast     22, 22, 23
+
+	xxlxor		53, 53, 21
+	stxvb16x        53, 20, 9	# store output
+	xxlxor		54, 54, 22
+	stxvb16x        54, 21, 9	# store output
+
+	addi		9, 9, 128
+
+	# ghash here
+	ppc_aes_gcm_ghash2_4x
+
+	xxlor	27+32, 0, 0
+	vaddudm 30, 30, 31		# IV + counter
+	vmr	29, 30
+	vxor    15, 30, 27		# add round key
+	vaddudm 30, 30, 31
+	vxor    16, 30, 27
+	vaddudm 30, 30, 31
+	vxor    17, 30, 27
+	vaddudm 30, 30, 31
+	vxor    18, 30, 27
+	vaddudm 30, 30, 31
+	vxor    19, 30, 27
+	vaddudm 30, 30, 31
+	vxor    20, 30, 27
+	vaddudm 30, 30, 31
+	vxor    21, 30, 27
+	vaddudm 30, 30, 31
+	vxor    22, 30, 27
+
+	addi    12, 12, -128
+	addi    11, 11, 128
+
+	bdnz	Loop_8x_block
+
+	vmr	30, 29
+
+Loop_last_block:
+	cmpdi   12, 0
+	beq     aes_gcm_out
+
+	# loop last few blocks
+	li      10, 16
+	divdu   10, 12, 10
+
+	mtctr   10
+
+	lwz	10, 480(6)
+
+	cmpdi   12, 16
+	blt     Final_block
+
+.macro Loop_aes_middle_1x
+	xxlor	19+32, 1, 1
+	xxlor	20+32, 2, 2
+	xxlor	21+32, 3, 3
+	xxlor	22+32, 4, 4
+
+	vcipher 15, 15, 19
+	vcipher 15, 15, 20
+	vcipher 15, 15, 21
+	vcipher 15, 15, 22
+
+	xxlor	19+32, 5, 5
+	xxlor	20+32, 6, 6
+	xxlor	21+32, 7, 7
+	xxlor	22+32, 8, 8
+
+	vcipher 15, 15, 19
+	vcipher 15, 15, 20
+	vcipher 15, 15, 21
+	vcipher 15, 15, 22
+
+	xxlor	19+32, 9, 9
+	vcipher 15, 15, 19
+.endm
+
+Next_rem_block:
+	lxvb16x 15, 0, 14		# load block
+
+	Loop_aes_middle_1x
+
+	xxlor	23+32, 10, 10
+
+	cmpdi	10, 10
+	beq	Do_next_1x
+
+	# 192 bits
+	xxlor	24+32, 11, 11
+
+	vcipher	15, 15, 23
+	vcipher	15, 15, 24
+
+	xxlor	23+32, 12, 12
+
+	cmpdi	10, 12
+	beq	Do_next_1x
+
+	# 256 bits
+	xxlor	24+32, 13, 13
+
+	vcipher	15, 15, 23
+	vcipher	15, 15, 24
+
+	xxlor	23+32, 14, 14
+
+	cmpdi	10, 14
+	beq	Do_next_1x
+
+Do_next_1x:
+	vcipherlast     15, 15, 23
+
+	xxlxor		47, 47, 15
+	stxvb16x	47, 0, 9	# store output
+	addi		14, 14, 16
+	addi		9, 9, 16
+
+	vmr		28, 15
+	ppc_update_hash_1x
+
+	addi		12, 12, -16
+	addi		11, 11, 16
+	xxlor		19+32, 0, 0
+	vaddudm		30, 30, 31		# IV + counter
+	vxor		15, 30, 19		# add round key
+
+	bdnz	Next_rem_block
+
+	cmpdi	12, 0
+	beq	aes_gcm_out
+
+Final_block:
+	Loop_aes_middle_1x
+
+	xxlor	23+32, 10, 10
+
+	cmpdi	10, 10
+	beq	Do_final_1x
+
+	# 192 bits
+	xxlor	24+32, 11, 11
+
+	vcipher	15, 15, 23
+	vcipher	15, 15, 24
+
+	xxlor	23+32, 12, 12
+
+	cmpdi	10, 12
+	beq	Do_final_1x
+
+	# 256 bits
+	xxlor	24+32, 13, 13
+
+	vcipher	15, 15, 23
+	vcipher	15, 15, 24
+
+	xxlor	23+32, 14, 14
+
+	cmpdi	10, 14
+	beq	Do_final_1x
+
+Do_final_1x:
+	vcipherlast     15, 15, 23
+
+	lxvb16x	15, 0, 14		# load last block
+	xxlxor	47, 47, 15
+
+	# create partial block mask
+	li	15, 16
+	sub	15, 15, 12		# index to the mask
+
+	vspltisb	16, -1		# first 16 bytes - 0xffff...ff
+	vspltisb	17, 0		# second 16 bytes - 0x0000...00
+	li	10, 192
+	stvx	16, 10, 1
+	addi	10, 10, 16
+	stvx	17, 10, 1
+
+	addi	10, 1, 192
+	lxvb16x	16, 15, 10		# load partial block mask
+	xxland	47, 47, 16
+
+	vmr	28, 15
+	ppc_update_hash_1x
+
+	# * should store only the remaining bytes.
+	bl	Write_partial_block
+
+	b aes_gcm_out
+
+#
+# Write partial block
+# r9 - output
+# r12 - remaining bytes
+# v15 - partial input data
+#
+Write_partial_block:
+	li		10, 192
+	stxvb16x	15+32, 10, 1		# last block
+
+	#add		10, 9, 11		# Output
+	addi		10, 9, -1
+	addi		16, 1, 191
+
+        mtctr		12			# remaining bytes
+	li		15, 0
+
+Write_last_byte:
+        lbzu		14, 1(16)
+	stbu		14, 1(10)
+        bdnz		Write_last_byte
+	blr
+
+aes_gcm_out:
+	# out = state
+	li	10, 256
+	stxvb16x	32, 10, 8		# write out Xi
+	add	3, 11, 12		# return count
+
+	li	9, 256
+	lvx	20, 9, 1
+	addi	9, 9, 16
+	lvx	21, 9, 1
+	addi	9, 9, 16
+	lvx	22, 9, 1
+	addi	9, 9, 16
+	lvx	23, 9, 1
+	addi	9, 9, 16
+	lvx	24, 9, 1
+	addi	9, 9, 16
+	lvx	25, 9, 1
+	addi	9, 9, 16
+	lvx	26, 9, 1
+	addi	9, 9, 16
+	lvx	27, 9, 1
+	addi	9, 9, 16
+	lvx	28, 9, 1
+	addi	9, 9, 16
+	lvx	29, 9, 1
+	addi	9, 9, 16
+	lvx	30, 9, 1
+	addi	9, 9, 16
+	lvx	31, 9, 1
+
+	ld	0, 528(1)
+	ld      14,112(1)
+	ld      15,120(1)
+	ld      16,128(1)
+	ld      17,136(1)
+	ld      18,144(1)
+	ld      19,152(1)
+	ld      20,160(1)
+	ld	21,168(1)
+
+	mtlr	0
+	addi	1, 1, 512
+	blr
+
+#
+# 8x Decrypt
+#
+.global _gcry_ppc10_aes_gcm_decrypt
+.align 5
+_gcry_ppc10_aes_gcm_decrypt:
+_gcry_ppc_aes_gcm_decrypt:
+
+	stdu 1,-512(1)
+	mflr 0
+
+	std	14,112(1)
+	std	15,120(1)
+	std	16,128(1)
+	std	17,136(1)
+	std	18,144(1)
+	std	19,152(1)
+	std	20,160(1)
+	std	21,168(1)
+	li	9, 256
+	stvx	20, 9, 1
+	addi	9, 9, 16
+	stvx	21, 9, 1
+	addi	9, 9, 16
+	stvx	22, 9, 1
+	addi	9, 9, 16
+	stvx	23, 9, 1
+	addi	9, 9, 16
+	stvx	24, 9, 1
+	addi	9, 9, 16
+	stvx	25, 9, 1
+	addi	9, 9, 16
+	stvx	26, 9, 1
+	addi	9, 9, 16
+	stvx	27, 9, 1
+	addi	9, 9, 16
+	stvx	28, 9, 1
+	addi	9, 9, 16
+	stvx	29, 9, 1
+	addi	9, 9, 16
+	stvx	30, 9, 1
+	addi	9, 9, 16
+	stvx	31, 9, 1
+	std	0, 528(1)
+
+	# Load Xi
+	li	10, 256
+	lxvb16x	32, 10, 8	# load Xi
+
+	# load Hash - h^4, h^3, h^2, h
+	lxvd2x	2+32, 0, 8	# H Poli
+	li	10, 16
+	lxvd2x	3+32, 10, 8	# Hl
+	li	10, 32
+	lxvd2x	4+32, 10, 8	# H
+	li	10, 48
+	lxvd2x	5+32, 10, 8	# Hh
+
+	li	10, 64
+	lxvd2x	6+32, 10, 8	# H^2l
+	li	10, 80
+	lxvd2x	7+32, 10, 8	# H^2
+	li	10, 96
+	lxvd2x	8+32, 10, 8	# H^2h
+
+	li	10, 112
+	lxvd2x	9+32, 10, 8	# H^3l
+	li	10, 128
+	lxvd2x	10+32, 10, 8	# H^3
+	li	10, 144
+	lxvd2x	11+32, 10, 8	# H^3h
+
+	li	10, 160
+	lxvd2x	12+32, 10, 8	# H^4l
+	li	10, 176
+	lxvd2x	13+32, 10, 8	# H^4
+	li	10, 192
+	lxvd2x	14+32, 10, 8	# H^4h
+
+	# initialize ICB: GHASH( IV ), IV - r7
+	lxvb16x	30+32, 0, 7	# load IV  - v30
+
+	mr	12, 5		# length
+	li	11, 0		# block index
+
+	# counter 1
+	vxor	31, 31, 31
+	vspltisb 22, 1
+	vsldoi	31, 31, 22,1	# counter 1
+
+	# load round key to VSR
+	lxv	0, 0(6)
+	lxv	1, 0x10(6)
+	lxv	2, 0x20(6)
+	lxv	3, 0x30(6)
+	lxv	4, 0x40(6)
+	lxv	5, 0x50(6)
+	lxv	6, 0x60(6)
+	lxv	7, 0x70(6)
+	lxv	8, 0x80(6)
+	lxv	9, 0x90(6)
+	lxv	10, 0xa0(6)
+
+	# load rounds - 10 (128), 12 (192), 14 (256)
+	lwz	9,480(6)
+
+	#
+	# vxor	state, state, w # addroundkey
+	xxlor	32+29, 0, 0
+	vxor	15, 30, 29	# IV + round key - add round key 0
+
+	cmpdi	9, 10
+	beq	Loop_aes_gcm_8x_dec
+
+	# load 2 more round keys (v11, v12)
+	lxv	11, 0xb0(6)
+	lxv	12, 0xc0(6)
+
+	cmpdi	9, 12
+	beq	Loop_aes_gcm_8x_dec
+
+	# load 2 more round keys (v11, v12, v13, v14)
+	lxv	13, 0xd0(6)
+	lxv	14, 0xe0(6)
+	cmpdi	9, 14
+	beq	Loop_aes_gcm_8x_dec
+
+	b	aes_gcm_out
+
+.align 5
+Loop_aes_gcm_8x_dec:
+	mr	14, 3
+	mr	9, 4
+
+	# n blcoks
+	li	10, 128
+	divdu	10, 5, 10	# n 128 bytes-blocks
+	cmpdi	10, 0
+	beq	Loop_last_block_dec
+
+	vaddudm	30, 30, 31	# IV + counter
+	vxor	16, 30, 29
+	vaddudm	30, 30, 31
+	vxor	17, 30, 29
+	vaddudm	30, 30, 31
+	vxor	18, 30, 29
+	vaddudm	30, 30, 31
+	vxor	19, 30, 29
+	vaddudm	30, 30, 31
+	vxor	20, 30, 29
+	vaddudm	30, 30, 31
+	vxor	21, 30, 29
+	vaddudm	30, 30, 31
+	vxor	22, 30, 29
+
+	mtctr	10
+
+	li	15, 16
+	li	16, 32
+	li	17, 48
+	li	18, 64
+	li	19, 80
+	li	20, 96
+	li	21, 112
+
+	lwz	10, 480(6)
+
+Loop_8x_block_dec:
+
+	lxvb16x		15, 0, 14	# load block
+	lxvb16x		16, 15, 14	# load block
+	lxvb16x		17, 16, 14	# load block
+	lxvb16x		18, 17, 14	# load block
+	lxvb16x		19, 18, 14	# load block
+	lxvb16x		20, 19, 14	# load block
+	lxvb16x		21, 20, 14	# load block
+	lxvb16x		22, 21, 14	# load block
+	addi		14, 14, 128
+
+	Loop_aes_middle8x
+
+	xxlor	23+32, 10, 10
+
+	cmpdi	10, 10
+	beq	Do_last_aes_dec
+
+	# 192 bits
+	xxlor	24+32, 11, 11
+
+	vcipher	15, 15, 23
+	vcipher	16, 16, 23
+	vcipher	17, 17, 23
+	vcipher	18, 18, 23
+	vcipher	19, 19, 23
+	vcipher	20, 20, 23
+	vcipher	21, 21, 23
+	vcipher	22, 22, 23
+
+	vcipher	15, 15, 24
+	vcipher	16, 16, 24
+	vcipher	17, 17, 24
+	vcipher	18, 18, 24
+	vcipher	19, 19, 24
+	vcipher	20, 20, 24
+	vcipher	21, 21, 24
+	vcipher	22, 22, 24
+
+	xxlor	23+32, 12, 12
+
+	cmpdi	10, 12
+	beq	Do_last_aes_dec
+
+	# 256 bits
+	xxlor	24+32, 13, 13
+
+	vcipher	15, 15, 23
+	vcipher	16, 16, 23
+	vcipher	17, 17, 23
+	vcipher	18, 18, 23
+	vcipher	19, 19, 23
+	vcipher	20, 20, 23
+	vcipher	21, 21, 23
+	vcipher	22, 22, 23
+
+	vcipher	15, 15, 24
+	vcipher	16, 16, 24
+	vcipher	17, 17, 24
+	vcipher	18, 18, 24
+	vcipher	19, 19, 24
+	vcipher	20, 20, 24
+	vcipher	21, 21, 24
+	vcipher	22, 22, 24
+
+	xxlor	23+32, 14, 14
+
+	cmpdi	10, 14
+	beq	Do_last_aes_dec
+	b	aes_gcm_out
+
+Do_last_aes_dec:
+
+	#
+	# last round
+	vcipherlast     15, 15, 23
+	vcipherlast     16, 16, 23
+
+	xxlxor		47, 47, 15
+	stxvb16x        47, 0, 9	# store output
+	xxlxor		48, 48, 16
+	stxvb16x        48, 15, 9	# store output
+
+	vcipherlast     17, 17, 23
+	vcipherlast     18, 18, 23
+
+	xxlxor		49, 49, 17
+	stxvb16x        49, 16, 9	# store output
+	xxlxor		50, 50, 18
+	stxvb16x        50, 17, 9	# store output
+
+	vcipherlast     19, 19, 23
+	vcipherlast     20, 20, 23
+
+	xxlxor		51, 51, 19
+	stxvb16x        51, 18, 9	# store output
+	xxlxor		52, 52, 20
+	stxvb16x        52, 19, 9	# store output
+
+	vcipherlast     21, 21, 23
+	vcipherlast     22, 22, 23
+
+	xxlxor		53, 53, 21
+	stxvb16x        53, 20, 9	# store output
+	xxlxor		54, 54, 22
+	stxvb16x        54, 21, 9	# store output
+
+	addi		9, 9, 128
+
+	xxlor		15+32, 15, 15
+	xxlor		16+32, 16, 16
+	xxlor		17+32, 17, 17
+	xxlor		18+32, 18, 18
+	xxlor		19+32, 19, 19
+	xxlor		20+32, 20, 20
+	xxlor		21+32, 21, 21
+	xxlor		22+32, 22, 22
+
+	# ghash here
+	ppc_aes_gcm_ghash2_4x
+
+	xxlor	27+32, 0, 0
+	vaddudm 30, 30, 31		# IV + counter
+	vmr	29, 30
+	vxor    15, 30, 27		# add round key
+	vaddudm 30, 30, 31
+	vxor    16, 30, 27
+	vaddudm 30, 30, 31
+	vxor    17, 30, 27
+	vaddudm 30, 30, 31
+	vxor    18, 30, 27
+	vaddudm 30, 30, 31
+	vxor    19, 30, 27
+	vaddudm 30, 30, 31
+	vxor    20, 30, 27
+	vaddudm 30, 30, 31
+	vxor    21, 30, 27
+	vaddudm 30, 30, 31
+	vxor    22, 30, 27
+	addi    12, 12, -128
+	addi    11, 11, 128
+
+	bdnz	Loop_8x_block_dec
+
+	vmr	30, 29
+
+Loop_last_block_dec:
+	cmpdi   12, 0
+	beq     aes_gcm_out
+
+	# loop last few blocks
+	li      10, 16
+	divdu   10, 12, 10
+
+	mtctr   10
+
+	lwz	10,480(6)
+
+	cmpdi   12, 16
+	blt     Final_block_dec
+
+Next_rem_block_dec:
+	lxvb16x 15, 0, 14		# load block
+
+	Loop_aes_middle_1x
+
+	xxlor	23+32, 10, 10
+
+	cmpdi	10, 10
+	beq	Do_next_1x_dec
+
+	# 192 bits
+	xxlor	24+32, 11, 11
+
+	vcipher	15, 15, 23
+	vcipher	15, 15, 24
+
+	xxlor	23+32, 12, 12
+
+	cmpdi	10, 12
+	beq	Do_next_1x_dec
+
+	# 256 bits
+	xxlor	24+32, 13, 13
+
+	vcipher	15, 15, 23
+	vcipher	15, 15, 24
+
+	xxlor	23+32, 14, 14
+
+	cmpdi	10, 14
+	beq	Do_next_1x_dec
+
+Do_next_1x_dec:
+	vcipherlast     15, 15, 23
+
+	xxlxor  47, 47, 15
+	stxvb16x        47, 0, 9	# store output
+	addi	14, 14, 16
+	addi	9, 9, 16
+
+	xxlor	28+32, 15, 15
+	ppc_update_hash_1x
+
+	addi    12, 12, -16
+	addi    11, 11, 16
+	xxlor	19+32, 0, 0
+	vaddudm 30, 30, 31		# IV + counter
+	vxor	15, 30, 19		# add round key
+
+	bdnz	Next_rem_block_dec
+
+	cmpdi	12, 0
+	beq	aes_gcm_out
+
+Final_block_dec:
+	Loop_aes_middle_1x
+
+	xxlor	23+32, 10, 10
+
+	cmpdi	10, 10
+	beq	Do_final_1x_dec
+
+	# 192 bits
+	xxlor	24+32, 11, 11
+
+	vcipher	15, 15, 23
+	vcipher	15, 15, 24
+
+	xxlor	23+32, 12, 12
+
+	cmpdi	10, 12
+	beq	Do_final_1x_dec
+
+	# 256 bits
+	xxlor	24+32, 13, 13
+
+	vcipher	15, 15, 23
+	vcipher	15, 15, 24
+
+	xxlor	23+32, 14, 14
+
+	cmpdi	10, 14
+	beq	Do_final_1x_dec
+
+Do_final_1x_dec:
+	vcipherlast     15, 15, 23
+
+	lxvb16x	15, 0, 14		# load block
+	xxlxor	47, 47, 15
+
+	# create partial block mask
+	li	15, 16
+	sub	15, 15, 12		# index to the mask
+
+	vspltisb	16, -1		# first 16 bytes - 0xffff...ff
+	vspltisb	17, 0		# second 16 bytes - 0x0000...00
+	li	10, 192
+	stvx	16, 10, 1
+	addi	10, 10, 16
+	stvx	17, 10, 1
+
+	addi	10, 1, 192
+	lxvb16x	16, 15, 10		# load block mask
+	xxland	47, 47, 16
+
+	xxlor	28+32, 15, 15
+	ppc_update_hash_1x
+
+	# * should store only the remaining bytes.
+	bl	Write_partial_block
+
+	b aes_gcm_out
+#
Index: libgcrypt-1.9.4/cipher/rijndael-p10le.c
===================================================================
--- /dev/null
+++ libgcrypt-1.9.4/cipher/rijndael-p10le.c
@@ -0,0 +1,119 @@
+/* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES implementation
+ * Copyright 2021- IBM Inc. All rights reserved
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Alternatively, this code may be used in OpenSSL from The OpenSSL Project,
+ * and Cryptogams by Andy Polyakov, and if made part of a release of either
+ * or both projects, is thereafter dual-licensed under the license said project
+ * is released under.
+ */
+
+#include <config.h>
+
+#include "rijndael-internal.h"
+#include "cipher-internal.h"
+#include "bufhelp.h"
+
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+
+
+extern size_t _gcry_ppc10_aes_gcm_encrypt (const void *inp, void *out,
+                                           size_t len,
+                                           const unsigned char *key,
+                                           unsigned char iv[16], void *Xip);
+extern size_t _gcry_ppc10_aes_gcm_decrypt (const void *inp, void *out,
+                                           size_t len,
+                                           const unsigned char *key,
+                                           unsigned char iv[16], void *Xip);
+
+size_t
+_gcry_aes_p10le_gcm_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
+                          const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  RIJNDAEL_context *ctx = (RIJNDAEL_context *) &c->context.c;
+  unsigned char *rk = (unsigned char *) ctx->u1.keyschedule;
+  unsigned char *gcm_table = (unsigned char *) c->u_mode.gcm.gcm_table;
+  unsigned char *iv = c->u_ctr.ctr;
+  unsigned char *Xi = c->u_mode.gcm.u_tag.tag;
+  int s = 0;
+  int ndone = 0;
+  int ctr_reset = 0;
+  size_t len = nblocks * GCRY_GCM_BLOCK_LEN;
+  u64 blocks_unused;
+  u64 nb = nblocks;
+  u64 next_ctr = 0;
+  unsigned char ctr_saved[12];
+  unsigned char *inp = (unsigned char *) inbuf_arg;
+  unsigned char *out = (unsigned char *) outbuf_arg;
+
+  /*
+   * This is what the aes-gcm asembly code expects some input parameters.
+   *
+   *   - Number of rounds is at 480 offset from rk (rk->rounds)
+   *   - Xi at 256 offset from gcm_table
+   */
+  gcry_assert (sizeof(c->u_mode.gcm.gcm_table) >= 256 + 16);
+  buf_cpy (gcm_table+256, Xi, 16);
+  buf_cpy (ctr_saved, c->u_ctr.ctr, 12);
+
+  while (nb)
+    {
+      blocks_unused = (u64) 0xffffffffU + 1 - (u64) buf_get_be32 (iv + 12);
+      if (nb > blocks_unused)
+        {
+          len = blocks_unused * GCRY_GCM_BLOCK_LEN;
+          nb -= blocks_unused;
+          next_ctr = blocks_unused;
+          ctr_reset = 1;
+        }
+      else
+        {
+          len = nb * GCRY_GCM_BLOCK_LEN;
+          next_ctr = nb;
+          nb = 0;
+        }
+
+      if (encrypt)
+        s = _gcry_ppc10_aes_gcm_encrypt((const void *) inp, (void *) out, len,
+                                        (const unsigned char *) rk, iv,
+                                        (void *) gcm_table);
+      else
+        s = _gcry_ppc10_aes_gcm_decrypt((const void *) inp, (void *) out, len,
+                                        (const unsigned char *) rk, iv,
+                                        (void *) gcm_table);
+
+      cipher_block_add(c->u_ctr.ctr, next_ctr, GCRY_GCM_BLOCK_LEN);
+      if (ctr_reset)
+        {
+          ctr_reset = 0;
+          inp += len;
+          out += len;
+        }
+      buf_cpy (c->u_ctr.ctr, ctr_saved, 12);
+      ndone += s;
+    }
+  buf_cpy (Xi, gcm_table+256, 16);
+
+  /*
+   * Return number of blocks done.
+   */
+  s = ndone / GCRY_GCM_BLOCK_LEN;
+  s = nblocks - s;
+  return ( s );
+}
+
+#endif /* USE_PPC_CRYPTO_WITH_PPC9LE */
Index: libgcrypt-1.9.4/cipher/rijndael.c
===================================================================
--- libgcrypt-1.9.4.orig/cipher/rijndael.c
+++ libgcrypt-1.9.4/cipher/rijndael.c
@@ -295,6 +295,10 @@ extern void _gcry_aes_ppc9le_xts_crypt (
 					void *outbuf_arg,
 					const void *inbuf_arg,
 					size_t nblocks, int encrypt);
+
+extern size_t _gcry_aes_p10le_gcm_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+					 const void *inbuf_arg,
+					 size_t nblocks, int encrypt);
 #endif /*USE_PPC_CRYPTO_WITH_PPC9LE*/

 #ifdef USE_S390X_CRYPTO
@@ -586,6 +590,8 @@ do_setkey (RIJNDAEL_context *ctx, const
       bulk_ops->ocb_crypt = _gcry_aes_ppc9le_ocb_crypt;
       bulk_ops->ocb_auth = _gcry_aes_ppc9le_ocb_auth;
       bulk_ops->xts_crypt = _gcry_aes_ppc9le_xts_crypt;
+      if (hwfeatures & HWF_PPC_ARCH_3_10)  /* for P10 */
+        bulk_ops->gcm_crypt = _gcry_aes_p10le_gcm_crypt;
     }
 #endif
 #ifdef USE_PPC_CRYPTO
Index: libgcrypt-1.9.4/configure.ac
===================================================================
--- libgcrypt-1.9.4.orig/configure.ac
+++ libgcrypt-1.9.4/configure.ac
@@ -2597,6 +2597,12 @@ if test "$found" = "1" ; then
          # Build with the crypto extension implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo"
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc9le.lo"
+         if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" &&
+            test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then
+            # Build with AES-GCM bulk implementation for P10
+            GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-gcm-p10le.lo"
+            GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-p10le.lo"
+         fi
       ;;
       powerpc64-*-*)
          # Big-Endian.
Index: libgcrypt-1.9.4/src/g10lib.h
===================================================================
--- libgcrypt-1.9.4.orig/src/g10lib.h
+++ libgcrypt-1.9.4/src/g10lib.h
@@ -252,6 +252,7 @@ char **_gcry_strtokenize (const char *st
 #define HWF_PPC_VCRYPTO         (1 << 0)
 #define HWF_PPC_ARCH_3_00       (1 << 1)
 #define HWF_PPC_ARCH_2_07       (1 << 2)
+#define HWF_PPC_ARCH_3_10       (1 << 3)

 #elif defined(HAVE_CPU_ARCH_S390X)

Index: libgcrypt-1.9.4/src/hwf-ppc.c
===================================================================
--- libgcrypt-1.9.4.orig/src/hwf-ppc.c
+++ libgcrypt-1.9.4/src/hwf-ppc.c
@@ -92,6 +92,9 @@ struct feature_map_s
 #ifndef PPC_FEATURE2_ARCH_3_00
 # define PPC_FEATURE2_ARCH_3_00     0x00800000
 #endif
+#ifndef PPC_FEATURE2_ARCH_3_10
+# define PPC_FEATURE2_ARCH_3_10     0x00040000
+#endif

 static const struct feature_map_s ppc_features[] =
   {
Index: libgcrypt-1.9.4/src/hwfeatures.c
===================================================================
--- libgcrypt-1.9.4.orig/src/hwfeatures.c
+++ libgcrypt-1.9.4/src/hwfeatures.c
@@ -72,6 +72,7 @@ static struct
     { HWF_PPC_VCRYPTO,         "ppc-vcrypto" },
     { HWF_PPC_ARCH_3_00,       "ppc-arch_3_00" },
     { HWF_PPC_ARCH_2_07,       "ppc-arch_2_07" },
+    { HWF_PPC_ARCH_3_10,       "ppc-arch_3_10" },
 #elif defined(HAVE_CPU_ARCH_S390X)
     { HWF_S390X_MSA,           "s390x-msa" },
     { HWF_S390X_MSA_4,         "s390x-msa-4" },