aarch64: Increase small and medium cases for __memcpy_generic
This commit is contained in:
parent
61bbd0d77b
commit
ca2242d4b2
2 changed files with 188 additions and 1 deletions
183
glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch
Normal file
183
glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch
Normal file
|
@ -0,0 +1,183 @@
|
||||||
|
From b9f145df85145506f8e61bac38b792584a38d88f Mon Sep 17 00:00:00 2001
|
||||||
|
From: Krzysztof Koch <Krzysztof.Koch@arm.com>
|
||||||
|
Date: Tue, 5 Nov 2019 17:35:18 +0000
|
||||||
|
Subject: [PATCH 02/14] aarch64: Increase small and medium cases for
|
||||||
|
__memcpy_generic
|
||||||
|
|
||||||
|
Increase the upper bound on medium cases from 96 to 128 bytes.
|
||||||
|
Now, up to 128 bytes are copied unrolled.
|
||||||
|
|
||||||
|
Increase the upper bound on small cases from 16 to 32 bytes so that
|
||||||
|
copies of 17-32 bytes are not impacted by the larger medium case.
|
||||||
|
|
||||||
|
Benchmarking:
|
||||||
|
The attached figures show relative timing difference with respect
|
||||||
|
to 'memcpy_generic', which is the existing implementation.
|
||||||
|
'memcpy_med_128' denotes the version of memcpy_generic with
|
||||||
|
only the medium case enlarged. The 'memcpy_med_128_small_32' numbers
|
||||||
|
are for the version of memcpy_generic submitted in this patch, which
|
||||||
|
has both medium and small cases enlarged. The figures were generated
|
||||||
|
using the script from:
|
||||||
|
https://www.sourceware.org/ml/libc-alpha/2019-10/msg00563.html
|
||||||
|
|
||||||
|
Depending on the platform, the performance improvement in the
|
||||||
|
bench-memcpy-random.c benchmark ranges from 6% to 20% between
|
||||||
|
the original and final version of memcpy.S
|
||||||
|
|
||||||
|
Tested against GLIBC testsuite and randomized tests.
|
||||||
|
---
|
||||||
|
sysdeps/aarch64/memcpy.S | 82 +++++++++++++++++++++++-----------------
|
||||||
|
1 file changed, 47 insertions(+), 35 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
|
||||||
|
index 6e4f4a74bd..10801aa0f4 100644
|
||||||
|
--- a/sysdeps/aarch64/memcpy.S
|
||||||
|
+++ b/sysdeps/aarch64/memcpy.S
|
||||||
|
@@ -41,17 +41,19 @@
|
||||||
|
#define C_h x11
|
||||||
|
#define D_l x12
|
||||||
|
#define D_h x13
|
||||||
|
-#define E_l src
|
||||||
|
-#define E_h count
|
||||||
|
-#define F_l srcend
|
||||||
|
-#define F_h dst
|
||||||
|
+#define E_l x14
|
||||||
|
+#define E_h x15
|
||||||
|
+#define F_l x16
|
||||||
|
+#define F_h x17
|
||||||
|
#define G_l count
|
||||||
|
#define G_h dst
|
||||||
|
+#define H_l src
|
||||||
|
+#define H_h srcend
|
||||||
|
#define tmp1 x14
|
||||||
|
|
||||||
|
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
|
||||||
|
- medium copies of 17..96 bytes which are fully unrolled. Large copies
|
||||||
|
- of more than 96 bytes align the destination and use an unrolled loop
|
||||||
|
+/* Copies are split into 3 main cases: small copies of up to 32 bytes,
|
||||||
|
+ medium copies of 33..128 bytes which are fully unrolled. Large copies
|
||||||
|
+ of more than 128 bytes align the destination and use an unrolled loop
|
||||||
|
processing 64 bytes per iteration.
|
||||||
|
In order to share code with memmove, small and medium copies read all
|
||||||
|
data before writing, allowing any kind of overlap. So small, medium
|
||||||
|
@@ -73,7 +75,7 @@ ENTRY_ALIGN (MEMMOVE, 6)
|
||||||
|
DELOUSE (2)
|
||||||
|
|
||||||
|
sub tmp1, dstin, src
|
||||||
|
- cmp count, 96
|
||||||
|
+ cmp count, 128
|
||||||
|
ccmp tmp1, count, 2, hi
|
||||||
|
b.lo L(move_long)
|
||||||
|
|
||||||
|
@@ -89,31 +91,39 @@ ENTRY (MEMCPY)
|
||||||
|
prfm PLDL1KEEP, [src]
|
||||||
|
add srcend, src, count
|
||||||
|
add dstend, dstin, count
|
||||||
|
- cmp count, 16
|
||||||
|
- b.ls L(copy16)
|
||||||
|
- cmp count, 96
|
||||||
|
+ cmp count, 32
|
||||||
|
+ b.ls L(copy32)
|
||||||
|
+ cmp count, 128
|
||||||
|
b.hi L(copy_long)
|
||||||
|
|
||||||
|
- /* Medium copies: 17..96 bytes. */
|
||||||
|
- sub tmp1, count, 1
|
||||||
|
+ /* Medium copies: 33..128 bytes. */
|
||||||
|
ldp A_l, A_h, [src]
|
||||||
|
- tbnz tmp1, 6, L(copy96)
|
||||||
|
- ldp D_l, D_h, [srcend, -16]
|
||||||
|
- tbz tmp1, 5, 1f
|
||||||
|
ldp B_l, B_h, [src, 16]
|
||||||
|
ldp C_l, C_h, [srcend, -32]
|
||||||
|
+ ldp D_l, D_h, [srcend, -16]
|
||||||
|
+ cmp count, 64
|
||||||
|
+ b.hi L(copy128)
|
||||||
|
+ stp A_l, A_h, [dstin]
|
||||||
|
stp B_l, B_h, [dstin, 16]
|
||||||
|
stp C_l, C_h, [dstend, -32]
|
||||||
|
-1:
|
||||||
|
- stp A_l, A_h, [dstin]
|
||||||
|
stp D_l, D_h, [dstend, -16]
|
||||||
|
ret
|
||||||
|
|
||||||
|
.p2align 4
|
||||||
|
- /* Small copies: 0..16 bytes. */
|
||||||
|
-L(copy16):
|
||||||
|
- cmp count, 8
|
||||||
|
+ /* Small copies: 0..32 bytes. */
|
||||||
|
+L(copy32):
|
||||||
|
+ /* 16-32 bytes. */
|
||||||
|
+ cmp count, 16
|
||||||
|
b.lo 1f
|
||||||
|
+ ldp A_l, A_h, [src]
|
||||||
|
+ ldp B_l, B_h, [srcend, -16]
|
||||||
|
+ stp A_l, A_h, [dstin]
|
||||||
|
+ stp B_l, B_h, [dstend, -16]
|
||||||
|
+ ret
|
||||||
|
+ .p2align 4
|
||||||
|
+1:
|
||||||
|
+ /* 8-15 bytes. */
|
||||||
|
+ tbz count, 3, 1f
|
||||||
|
ldr A_l, [src]
|
||||||
|
ldr A_h, [srcend, -8]
|
||||||
|
str A_l, [dstin]
|
||||||
|
@@ -121,6 +131,7 @@ L(copy16):
|
||||||
|
ret
|
||||||
|
.p2align 4
|
||||||
|
1:
|
||||||
|
+ /* 4-7 bytes. */
|
||||||
|
tbz count, 2, 1f
|
||||||
|
ldr A_lw, [src]
|
||||||
|
ldr A_hw, [srcend, -4]
|
||||||
|
@@ -142,24 +153,25 @@ L(copy16):
|
||||||
|
2: ret
|
||||||
|
|
||||||
|
.p2align 4
|
||||||
|
- /* Copy 64..96 bytes. Copy 64 bytes from the start and
|
||||||
|
- 32 bytes from the end. */
|
||||||
|
-L(copy96):
|
||||||
|
- ldp B_l, B_h, [src, 16]
|
||||||
|
- ldp C_l, C_h, [src, 32]
|
||||||
|
- ldp D_l, D_h, [src, 48]
|
||||||
|
- ldp E_l, E_h, [srcend, -32]
|
||||||
|
- ldp F_l, F_h, [srcend, -16]
|
||||||
|
+ /* Copy 65..128 bytes. Copy 64 bytes from the start and
|
||||||
|
+ 64 bytes from the end. */
|
||||||
|
+L(copy128):
|
||||||
|
+ ldp E_l, E_h, [src, 32]
|
||||||
|
+ ldp F_l, F_h, [src, 48]
|
||||||
|
+ ldp G_l, G_h, [srcend, -64]
|
||||||
|
+ ldp H_l, H_h, [srcend, -48]
|
||||||
|
stp A_l, A_h, [dstin]
|
||||||
|
stp B_l, B_h, [dstin, 16]
|
||||||
|
- stp C_l, C_h, [dstin, 32]
|
||||||
|
- stp D_l, D_h, [dstin, 48]
|
||||||
|
- stp E_l, E_h, [dstend, -32]
|
||||||
|
- stp F_l, F_h, [dstend, -16]
|
||||||
|
+ stp E_l, E_h, [dstin, 32]
|
||||||
|
+ stp F_l, F_h, [dstin, 48]
|
||||||
|
+ stp G_l, G_h, [dstend, -64]
|
||||||
|
+ stp H_l, H_h, [dstend, -48]
|
||||||
|
+ stp C_l, C_h, [dstend, -32]
|
||||||
|
+ stp D_l, D_h, [dstend, -16]
|
||||||
|
ret
|
||||||
|
|
||||||
|
/* Align DST to 16 byte alignment so that we don't cross cache line
|
||||||
|
- boundaries on both loads and stores. There are at least 96 bytes
|
||||||
|
+ boundaries on both loads and stores. There are at least 128 bytes
|
||||||
|
to copy, so copy 16 bytes unaligned and then align. The loop
|
||||||
|
copies 64 bytes per iteration and prefetches one iteration ahead. */
|
||||||
|
|
||||||
|
@@ -215,7 +227,7 @@ L(move_long):
|
||||||
|
add dstend, dstin, count
|
||||||
|
|
||||||
|
/* Align dstend to 16 byte alignment so that we don't cross cache line
|
||||||
|
- boundaries on both loads and stores. There are at least 96 bytes
|
||||||
|
+ boundaries on both loads and stores. There are at least 128 bytes
|
||||||
|
to copy, so copy 16 bytes unaligned and then align. The loop
|
||||||
|
copies 64 bytes per iteration and prefetches one iteration ahead. */
|
||||||
|
|
||||||
|
--
|
||||||
|
2.39.3
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
%global anolis_release .0.4
|
%global anolis_release .0.5
|
||||||
%define glibcsrcdir glibc-2.28
|
%define glibcsrcdir glibc-2.28
|
||||||
%define glibcversion 2.28
|
%define glibcversion 2.28
|
||||||
%define glibcrelease 225%{anolis_release}%{?dist}
|
%define glibcrelease 225%{anolis_release}%{?dist}
|
||||||
|
@ -1050,6 +1050,7 @@ Patch2005: glibc-elf-Fix-tst-align3.patch
|
||||||
|
|
||||||
Patch2006: glibc-Sync-to-lnd-35-for-LoongArch.patch
|
Patch2006: glibc-Sync-to-lnd-35-for-LoongArch.patch
|
||||||
Patch2007: Fix-tst-cancel21.c-to-suit-kernel-struct-sigcontext-.patch
|
Patch2007: Fix-tst-cancel21.c-to-suit-kernel-struct-sigcontext-.patch
|
||||||
|
Patch2008: glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
# Continued list of core "glibc" package information:
|
# Continued list of core "glibc" package information:
|
||||||
|
@ -2885,6 +2886,9 @@ fi
|
||||||
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
|
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Mon Dec 11 2023 Kaiqiang Wang <wangkaiqiang@inspur.com> - 2.28-225.0.5.6
|
||||||
|
* aarch64: Increase small and medium cases for __memcpy_generic (bug#7060)
|
||||||
|
|
||||||
* Sun Oct 08 2023 Rongwei Wang <rongwei.wang@linux.alibaba.com> - 2.28-225.0.4.6
|
* Sun Oct 08 2023 Rongwei Wang <rongwei.wang@linux.alibaba.com> - 2.28-225.0.4.6
|
||||||
- elf: Properly align PT_LOAD segments
|
- elf: Properly align PT_LOAD segments
|
||||||
- Sync loongarch64 code to lnd.35. (lixing@loongson.cn)
|
- Sync loongarch64 code to lnd.35. (lixing@loongson.cn)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue