Increase small and medium cases for __memcpy_generic
This commit is contained in:
parent ec545446cf
commit 91e142ca44
2 changed files with 185 additions and 0 deletions
glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch (new file, 183 lines)
@@ -0,0 +1,183 @@
From b9f145df85145506f8e61bac38b792584a38d88f Mon Sep 17 00:00:00 2001
From: Krzysztof Koch <Krzysztof.Koch@arm.com>
Date: Tue, 5 Nov 2019 17:35:18 +0000
Subject: [PATCH 02/14] aarch64: Increase small and medium cases for
 __memcpy_generic

Increase the upper bound on medium cases from 96 to 128 bytes.
Now, up to 128 bytes are copied unrolled.

Increase the upper bound on small cases from 16 to 32 bytes so that
copies of 17-32 bytes are not impacted by the larger medium case.

Benchmarking:
The attached figures show the relative timing difference with respect
to 'memcpy_generic', which is the existing implementation.
'memcpy_med_128' denotes the version of memcpy_generic with
only the medium case enlarged. The 'memcpy_med_128_small_32' numbers
are for the version of memcpy_generic submitted in this patch, which
has both medium and small cases enlarged. The figures were generated
using the script from:
https://www.sourceware.org/ml/libc-alpha/2019-10/msg00563.html

Depending on the platform, the performance improvement in the
bench-memcpy-random.c benchmark ranges from 6% to 20% between
the original and final version of memcpy.S.

Tested against the GLIBC testsuite and randomized tests.
---
 sysdeps/aarch64/memcpy.S | 82 +++++++++++++++++++++++-----------------
 1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 6e4f4a74bd..10801aa0f4 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -41,17 +41,19 @@
#define C_h x11
#define D_l x12
#define D_h x13
-#define E_l src
-#define E_h count
-#define F_l srcend
-#define F_h dst
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
#define G_l count
#define G_h dst
+#define H_l src
+#define H_h srcend
#define tmp1 x14

-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..96 bytes which are fully unrolled. Large copies
- of more than 96 bytes align the destination and use an unrolled loop
+/* Copies are split into 3 main cases: small copies of up to 32 bytes,
+ medium copies of 33..128 bytes which are fully unrolled. Large copies
+ of more than 128 bytes align the destination and use an unrolled loop
processing 64 bytes per iteration.
In order to share code with memmove, small and medium copies read all
data before writing, allowing any kind of overlap. So small, medium
@@ -73,7 +75,7 @@ ENTRY_ALIGN (MEMMOVE, 6)
DELOUSE (2)

sub tmp1, dstin, src
- cmp count, 96
+ cmp count, 128
ccmp tmp1, count, 2, hi
b.lo L(move_long)

@@ -89,31 +91,39 @@ ENTRY (MEMCPY)
prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
- cmp count, 16
- b.ls L(copy16)
- cmp count, 96
+ cmp count, 32
+ b.ls L(copy32)
+ cmp count, 128
b.hi L(copy_long)

- /* Medium copies: 17..96 bytes. */
- sub tmp1, count, 1
+ /* Medium copies: 33..128 bytes. */
ldp A_l, A_h, [src]
- tbnz tmp1, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz tmp1, 5, 1f
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret

.p2align 4
- /* Small copies: 0..16 bytes. */
-L(copy16):
- cmp count, 8
+ /* Small copies: 0..32 bytes. */
+L(copy32):
+ /* 16-32 bytes. */
+ cmp count, 16
b.lo 1f
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstend, -16]
+ ret
+ .p2align 4
+1:
+ /* 8-15 bytes. */
+ tbz count, 3, 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
@@ -121,6 +131,7 @@ L(copy16):
ret
.p2align 4
1:
+ /* 4-7 bytes. */
tbz count, 2, 1f
ldr A_lw, [src]
ldr A_hw, [srcend, -4]
@@ -142,24 +153,25 @@ L(copy16):
2: ret

.p2align 4
- /* Copy 64..96 bytes. Copy 64 bytes from the start and
- 32 bytes from the end. */
-L(copy96):
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [src, 32]
- ldp D_l, D_h, [src, 48]
- ldp E_l, E_h, [srcend, -32]
- ldp F_l, F_h, [srcend, -16]
+ /* Copy 65..128 bytes. Copy 64 bytes from the start and
+ 64 bytes from the end. */
+L(copy128):
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin, 32]
- stp D_l, D_h, [dstin, 48]
- stp E_l, E_h, [dstend, -32]
- stp F_l, F_h, [dstend, -16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
ret

/* Align DST to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
+ boundaries on both loads and stores. There are at least 128 bytes
to copy, so copy 16 bytes unaligned and then align. The loop
copies 64 bytes per iteration and prefetches one iteration ahead. */

@@ -215,7 +227,7 @@ L(move_long):
add dstend, dstin, count

/* Align dstend to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
+ boundaries on both loads and stores. There are at least 128 bytes
to copy, so copy 16 bytes unaligned and then align. The loop
copies 64 bytes per iteration and prefetches one iteration ahead. */

--
2.39.3

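For readers who do not follow AArch64 assembly, the sketch below restates in plain C the strategy that the commit message and the updated comments describe: small and medium copies load fixed-size chunks from both the start and the end of the buffer before storing anything, so the chunks may overlap in the middle and the same code remains safe for overlapping (memmove-style) buffers. This is an illustrative sketch only, not code from the patch or from glibc; the chunk16, load16, store16 and copy_* names are invented for the example.

#include <stddef.h>
#include <string.h>

/* A 16-byte chunk copied as a unit, standing in for the ldp/stp
   register pairs (A_l/A_h, B_l/B_h, ...) used in memcpy.S.  */
typedef struct { unsigned char b[16]; } chunk16;

static inline chunk16 load16 (const void *p)
{
  chunk16 c;
  memcpy (&c, p, 16);
  return c;
}

static inline void store16 (void *p, chunk16 c)
{
  memcpy (p, &c, 16);
}

/* 16..32 bytes, the new top of the small case (L(copy32)): one chunk
   from the start and one from the end; for count < 32 the two stores
   overlap in the middle but together cover exactly count bytes.  */
static void copy_16_32 (char *dst, const char *src, size_t count)
{
  chunk16 a = load16 (src);
  chunk16 b = load16 (src + count - 16);
  store16 (dst, a);
  store16 (dst + count - 16, b);
}

/* 65..128 bytes, as in the new L(copy128) path: four chunks from the
   start and four from the end, with every load issued before any
   store so that arbitrary src/dst overlap is tolerated.  */
static void copy_65_128 (char *dst, const char *src, size_t count)
{
  chunk16 a = load16 (src);
  chunk16 b = load16 (src + 16);
  chunk16 c = load16 (src + 32);
  chunk16 d = load16 (src + 48);
  chunk16 e = load16 (src + count - 64);
  chunk16 f = load16 (src + count - 48);
  chunk16 g = load16 (src + count - 32);
  chunk16 h = load16 (src + count - 16);
  store16 (dst, a);
  store16 (dst + 16, b);
  store16 (dst + 32, c);
  store16 (dst + 48, d);
  store16 (dst + count - 64, e);
  store16 (dst + count - 48, f);
  store16 (dst + count - 32, g);
  store16 (dst + count - 16, h);
}

Because every load is issued before any store, the same sequences can be shared with memmove, which is what the comment block in memcpy.S points out. On AArch64 a compiler will usually lower each 16-byte struct copy to an ldp/stp pair, roughly matching the hand-written assembly; raising the unrolled medium case to 128 bytes keeps copies in this branch-light path instead of falling through to the aligned 64-bytes-per-iteration loop.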
@@ -1067,6 +1067,7 @@ Patch2005: glibc-elf-Fix-tst-align3.patch

Patch2006: glibc-Sync-to-lnd-35-for-LoongArch.patch
Patch2007: Fix-tst-cancel21.c-to-suit-kernel-struct-sigcontext-.patch
Patch2008: glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch

##############################################################################
# Continued list of core "glibc" package information:
@@ -2906,6 +2907,7 @@ fi
- elf: Properly align PT_LOAD segments
- Sync loongarch64 code to lnd.35. (lixing@loongson.cn)
- Add patch for gb18030-2022 from upstream bug#30243 (fundawang@yeah.net)
- aarch64: Increase small and medium cases for __memcpy_generic (bug#7060) (Kaiqiang Wang)

* Wed Sep 20 2023 Siddhesh Poyarekar <siddhesh@redhat.com> - 2.28-236.7
- CVE-2023-4911 glibc: buffer overflow in ld.so leading to privilege escalation (RHEL-3036)