Increase small and medium cases for __memcpy_generic
This commit is contained in:
parent ec545446cf
commit 91e142ca44
2 changed files with 185 additions and 0 deletions
glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch (new file, 183 lines)
@@ -0,0 +1,183 @@
From b9f145df85145506f8e61bac38b792584a38d88f Mon Sep 17 00:00:00 2001
From: Krzysztof Koch <Krzysztof.Koch@arm.com>
Date: Tue, 5 Nov 2019 17:35:18 +0000
Subject: [PATCH 02/14] aarch64: Increase small and medium cases for
 __memcpy_generic

Increase the upper bound on medium cases from 96 to 128 bytes.
Now, up to 128 bytes are copied unrolled.

Increase the upper bound on small cases from 16 to 32 bytes so that
copies of 17-32 bytes are not impacted by the larger medium case.

Benchmarking:
The attached figures show the relative timing difference with respect
to 'memcpy_generic', which is the existing implementation.
'memcpy_med_128' denotes the version of memcpy_generic with
only the medium case enlarged. The 'memcpy_med_128_small_32' numbers
are for the version of memcpy_generic submitted in this patch, which
has both medium and small cases enlarged. The figures were generated
using the script from:
https://www.sourceware.org/ml/libc-alpha/2019-10/msg00563.html

Depending on the platform, the performance improvement in the
bench-memcpy-random.c benchmark ranges from 6% to 20% between
the original and final version of memcpy.S.

Tested against the GLIBC testsuite and randomized tests.
---
 sysdeps/aarch64/memcpy.S | 82 +++++++++++++++++++++++-----------------
 1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 6e4f4a74bd..10801aa0f4 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -41,17 +41,19 @@
#define C_h x11
#define D_l x12
#define D_h x13
-#define E_l src
-#define E_h count
-#define F_l srcend
-#define F_h dst
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
#define G_l count
#define G_h dst
+#define H_l src
+#define H_h srcend
#define tmp1 x14

-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..96 bytes which are fully unrolled. Large copies
- of more than 96 bytes align the destination and use an unrolled loop
+/* Copies are split into 3 main cases: small copies of up to 32 bytes,
+ medium copies of 33..128 bytes which are fully unrolled. Large copies
+ of more than 128 bytes align the destination and use an unrolled loop
processing 64 bytes per iteration.
In order to share code with memmove, small and medium copies read all
data before writing, allowing any kind of overlap. So small, medium
@@ -73,7 +75,7 @@ ENTRY_ALIGN (MEMMOVE, 6)
DELOUSE (2)

sub tmp1, dstin, src
- cmp count, 96
+ cmp count, 128
ccmp tmp1, count, 2, hi
b.lo L(move_long)

@@ -89,31 +91,39 @@ ENTRY (MEMCPY)
prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
- cmp count, 16
- b.ls L(copy16)
- cmp count, 96
+ cmp count, 32
+ b.ls L(copy32)
+ cmp count, 128
b.hi L(copy_long)

- /* Medium copies: 17..96 bytes. */
- sub tmp1, count, 1
+ /* Medium copies: 33..128 bytes. */
ldp A_l, A_h, [src]
- tbnz tmp1, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz tmp1, 5, 1f
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret

.p2align 4
- /* Small copies: 0..16 bytes. */
-L(copy16):
- cmp count, 8
+ /* Small copies: 0..32 bytes. */
+L(copy32):
+ /* 16-32 bytes. */
+ cmp count, 16
b.lo 1f
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstend, -16]
+ ret
+ .p2align 4
+1:
+ /* 8-15 bytes. */
+ tbz count, 3, 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
@@ -121,6 +131,7 @@ L(copy16):
ret
.p2align 4
1:
+ /* 4-7 bytes. */
tbz count, 2, 1f
ldr A_lw, [src]
ldr A_hw, [srcend, -4]
@@ -142,24 +153,25 @@ L(copy16):
2: ret

.p2align 4
- /* Copy 64..96 bytes. Copy 64 bytes from the start and
- 32 bytes from the end. */
-L(copy96):
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [src, 32]
- ldp D_l, D_h, [src, 48]
- ldp E_l, E_h, [srcend, -32]
- ldp F_l, F_h, [srcend, -16]
+ /* Copy 65..128 bytes. Copy 64 bytes from the start and
+ 64 bytes from the end. */
+L(copy128):
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin, 32]
- stp D_l, D_h, [dstin, 48]
- stp E_l, E_h, [dstend, -32]
- stp F_l, F_h, [dstend, -16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
ret

/* Align DST to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
+ boundaries on both loads and stores. There are at least 128 bytes
to copy, so copy 16 bytes unaligned and then align. The loop
copies 64 bytes per iteration and prefetches one iteration ahead. */

@@ -215,7 +227,7 @@ L(move_long):
add dstend, dstin, count

/* Align dstend to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
+ boundaries on both loads and stores. There are at least 128 bytes
to copy, so copy 16 bytes unaligned and then align. The loop
copies 64 bytes per iteration and prefetches one iteration ahead. */

--
2.39.3

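For readers who do not follow AArch64 assembly, the sketch below restates in plain C the strategy that the commit message and the updated comments describe: small and medium copies load fixed-size chunks from both the start and the end of the buffer before storing anything, so the chunks may overlap in the middle and the same code remains safe for overlapping (memmove-style) buffers. This is an illustrative sketch only, not code from the patch or from glibc; the chunk16, load16, store16 and copy_* names are invented for the example.

#include <stddef.h>
#include <string.h>

/* A 16-byte chunk copied as a unit, standing in for the ldp/stp
   register pairs (A_l/A_h, B_l/B_h, ...) used in memcpy.S.  */
typedef struct { unsigned char b[16]; } chunk16;

static inline chunk16 load16 (const void *p)
{
  chunk16 c;
  memcpy (&c, p, 16);
  return c;
}

static inline void store16 (void *p, chunk16 c)
{
  memcpy (p, &c, 16);
}

/* 16..32 bytes, the new top of the small case (L(copy32)): one chunk
   from the start and one from the end; for count < 32 the two stores
   overlap in the middle but together cover exactly count bytes.  */
static void copy_16_32 (char *dst, const char *src, size_t count)
{
  chunk16 a = load16 (src);
  chunk16 b = load16 (src + count - 16);
  store16 (dst, a);
  store16 (dst + count - 16, b);
}

/* 65..128 bytes, as in the new L(copy128) path: four chunks from the
   start and four from the end, with every load issued before any
   store so that arbitrary src/dst overlap is tolerated.  */
static void copy_65_128 (char *dst, const char *src, size_t count)
{
  chunk16 a = load16 (src);
  chunk16 b = load16 (src + 16);
  chunk16 c = load16 (src + 32);
  chunk16 d = load16 (src + 48);
  chunk16 e = load16 (src + count - 64);
  chunk16 f = load16 (src + count - 48);
  chunk16 g = load16 (src + count - 32);
  chunk16 h = load16 (src + count - 16);
  store16 (dst, a);
  store16 (dst + 16, b);
  store16 (dst + 32, c);
  store16 (dst + 48, d);
  store16 (dst + count - 64, e);
  store16 (dst + count - 48, f);
  store16 (dst + count - 32, g);
  store16 (dst + count - 16, h);
}

Because every load is issued before any store, the same sequences can be shared with memmove, which is what the comment block in memcpy.S points out. On AArch64 a compiler will usually lower each 16-byte struct copy to an ldp/stp pair, roughly matching the hand-written assembly; raising the unrolled medium case to 128 bytes keeps copies in this branch-light path instead of falling through to the aligned 64-bytes-per-iteration loop.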
@@ -1067,6 +1067,7 @@ Patch2005: glibc-elf-Fix-tst-align3.patch

Patch2006: glibc-Sync-to-lnd-35-for-LoongArch.patch
Patch2007: Fix-tst-cancel21.c-to-suit-kernel-struct-sigcontext-.patch
Patch2008: glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch

##############################################################################
# Continued list of core "glibc" package information:
@@ -2906,6 +2907,7 @@ fi
- elf: Properly align PT_LOAD segments
- Sync loongarch64 code to lnd.35. (lixing@loongson.cn)
- Add patch for gb18030-2022 from upstream bug#30243 (fundawang@yeah.net)
- aarch64: Increase small and medium cases for __memcpy_generic (bug#7060) (Kaiqiang Wang)

* Wed Sep 20 2023 Siddhesh Poyarekar <siddhesh@redhat.com> - 2.28-236.7
- CVE-2023-4911 glibc: buffer overflow in ld.so leading to privilege escalation (RHEL-3036)