From b9f145df85145506f8e61bac38b792584a38d88f Mon Sep 17 00:00:00 2001
From: Krzysztof Koch <Krzysztof.Koch@arm.com>
Date: Tue, 5 Nov 2019 17:35:18 +0000
Subject: [PATCH 02/14] aarch64: Increase small and medium cases for
 __memcpy_generic

Increase the upper bound on medium cases from 96 to 128 bytes.
Now, up to 128 bytes are copied unrolled.

Increase the upper bound on small cases from 16 to 32 bytes so that
copies of 17-32 bytes are not impacted by the larger medium case.

Benchmarking:
The attached figures show relative timing difference with respect
to 'memcpy_generic', which is the existing implementation.
'memcpy_med_128' denotes the version of memcpy_generic with
only the medium case enlarged. The 'memcpy_med_128_small_32' numbers
are for the version of memcpy_generic submitted in this patch, which
has both medium and small cases enlarged. The figures were generated
using the script from:
https://www.sourceware.org/ml/libc-alpha/2019-10/msg00563.html

Depending on the platform, the performance improvement in the
bench-memcpy-random.c benchmark ranges from 6% to 20% between
the original and final version of memcpy.S

Tested against GLIBC testsuite and randomized tests.
---
 sysdeps/aarch64/memcpy.S | 82 +++++++++++++++++++++++-----------------
 1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 6e4f4a74bd..10801aa0f4 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -41,17 +41,19 @@
 #define C_h	x11
 #define D_l	x12
 #define D_h	x13
-#define E_l	src
-#define E_h	count
-#define F_l	srcend
-#define F_h	dst
+#define E_l	x14
+#define E_h	x15
+#define F_l	x16
+#define F_h	x17
 #define G_l	count
 #define G_h	dst
+#define H_l	src
+#define H_h	srcend
 #define tmp1	x14

-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
-   medium copies of 17..96 bytes which are fully unrolled. Large copies
-   of more than 96 bytes align the destination and use an unrolled loop
+/* Copies are split into 3 main cases: small copies of up to 32 bytes,
+   medium copies of 33..128 bytes which are fully unrolled. Large copies
+   of more than 128 bytes align the destination and use an unrolled loop
    processing 64 bytes per iteration.
    In order to share code with memmove, small and medium copies read all
    data before writing, allowing any kind of overlap. So small, medium
@@ -73,7 +75,7 @@ ENTRY_ALIGN (MEMMOVE, 6)
 	DELOUSE (2)

 	sub	tmp1, dstin, src
-	cmp	count, 96
+	cmp	count, 128
 	ccmp	tmp1, count, 2, hi
 	b.lo	L(move_long)

@@ -89,31 +91,39 @@ ENTRY (MEMCPY)
 	prfm	PLDL1KEEP, [src]
 	add	srcend, src, count
 	add	dstend, dstin, count
-	cmp	count, 16
-	b.ls	L(copy16)
-	cmp	count, 96
+	cmp	count, 32
+	b.ls	L(copy32)
+	cmp	count, 128
 	b.hi	L(copy_long)

-	/* Medium copies: 17..96 bytes.  */
-	sub	tmp1, count, 1
+	/* Medium copies: 33..128 bytes.  */
 	ldp	A_l, A_h, [src]
-	tbnz	tmp1, 6, L(copy96)
-	ldp	D_l, D_h, [srcend, -16]
-	tbz	tmp1, 5, 1f
 	ldp	B_l, B_h, [src, 16]
 	ldp	C_l, C_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_l, A_h, [dstin]
 	stp	B_l, B_h, [dstin, 16]
 	stp	C_l, C_h, [dstend, -32]
-1:
-	stp	A_l, A_h, [dstin]
 	stp	D_l, D_h, [dstend, -16]
 	ret

 	.p2align 4
-	/* Small copies: 0..16 bytes.  */
-L(copy16):
-	cmp	count, 8
+	/* Small copies: 0..32 bytes.  */
+L(copy32):
+	/* 16-32 bytes.  */
+	cmp	count, 16
 	b.lo	1f
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstend, -16]
+	ret
+	.p2align 4
+1:
+	/* 8-15 bytes.  */
+	tbz	count, 3, 1f
 	ldr	A_l, [src]
 	ldr	A_h, [srcend, -8]
 	str	A_l, [dstin]
@@ -121,6 +131,7 @@ L(copy16):
 	ret
 	.p2align 4
 1:
+	/* 4-7 bytes.  */
 	tbz	count, 2, 1f
 	ldr	A_lw, [src]
 	ldr	A_hw, [srcend, -4]
@@ -142,24 +153,25 @@ L(copy16):
 2:	ret

 	.p2align 4
-	/* Copy 64..96 bytes. Copy 64 bytes from the start and
-	   32 bytes from the end.  */
-L(copy96):
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [src, 32]
-	ldp	D_l, D_h, [src, 48]
-	ldp	E_l, E_h, [srcend, -32]
-	ldp	F_l, F_h, [srcend, -16]
+	/* Copy 65..128 bytes. Copy 64 bytes from the start and
+	   64 bytes from the end.  */
+L(copy128):
+	ldp	E_l, E_h, [src, 32]
+	ldp	F_l, F_h, [src, 48]
+	ldp	G_l, G_h, [srcend, -64]
+	ldp	H_l, H_h, [srcend, -48]
 	stp	A_l, A_h, [dstin]
 	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstin, 32]
-	stp	D_l, D_h, [dstin, 48]
-	stp	E_l, E_h, [dstend, -32]
-	stp	F_l, F_h, [dstend, -16]
+	stp	E_l, E_h, [dstin, 32]
+	stp	F_l, F_h, [dstin, 48]
+	stp	G_l, G_h, [dstend, -64]
+	stp	H_l, H_h, [dstend, -48]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
 	ret

 	/* Align DST to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores. There are at least 96 bytes
+	   boundaries on both loads and stores. There are at least 128 bytes
 	   to copy, so copy 16 bytes unaligned and then align. The loop
 	   copies 64 bytes per iteration and prefetches one iteration ahead.  */

@@ -215,7 +227,7 @@ L(move_long):
 	add	dstend, dstin, count

 	/* Align dstend to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores. There are at least 96 bytes
+	   boundaries on both loads and stores. There are at least 128 bytes
 	   to copy, so copy 16 bytes unaligned and then align. The loop
 	   copies 64 bytes per iteration and prefetches one iteration ahead.  */

--
2.39.3
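
For readers who prefer C to AArch64 assembly, below is a rough sketch of the
size dispatch after this patch: small copies of up to 32 bytes, fully
unrolled medium copies of 33..128 bytes built from 16-byte chunks taken from
both ends of the buffer, and a large-copy path for anything bigger. It is
illustrative only and not part of the patch: the function and helper names
(copy_sketch, copy16) are made up, the byte loop and the final memcpy stand
in for the 4/2/1-byte tails and the 64-byte prefetching loop, and unlike the
assembly it copies chunk by chunk rather than reading all data before
writing, so it does not model the memmove overlap guarantee.

#include <stddef.h>
#include <string.h>

/* One 16-byte chunk, loaded then stored; a fixed-size memcpy like this
   typically compiles to a pair of 8-byte loads and stores, similar to
   the ldp/stp pairs in memcpy.S.  */
static void copy16 (char *dst, const char *src)
{
  char tmp[16];
  memcpy (tmp, src, 16);
  memcpy (dst, tmp, 16);
}

void *copy_sketch (void *dstin, const void *srcin, size_t count)
{
  char *dst = dstin;
  const char *src = srcin;

  if (count <= 32)
    {
      /* Small: 0..32 bytes.  17..32 bytes use two possibly overlapping
         16-byte chunks, one from each end (the case this patch adds).  */
      if (count >= 16)
        {
          copy16 (dst, src);
          copy16 (dst + count - 16, src + count - 16);
        }
      else if (count >= 8)
        {
          memcpy (dst, src, 8);
          memcpy (dst + count - 8, src + count - 8, 8);
        }
      else if (count > 0)
        {
          /* Byte loop stands in for the 4/2/1-byte tail cases.  */
          for (size_t i = 0; i < count; i++)
            dst[i] = src[i];
        }
    }
  else if (count <= 128)
    {
      /* Medium: 33..128 bytes, fully unrolled.  Copy up to 64 bytes from
         the start and up to 64 bytes from the end; the chunks overlap in
         the middle, so every byte is covered.  */
      copy16 (dst, src);
      copy16 (dst + 16, src + 16);
      copy16 (dst + count - 32, src + count - 32);
      copy16 (dst + count - 16, src + count - 16);
      if (count > 64)
        {
          /* The L(copy128) case: four more chunks.  */
          copy16 (dst + 32, src + 32);
          copy16 (dst + 48, src + 48);
          copy16 (dst + count - 64, src + count - 64);
          copy16 (dst + count - 48, src + count - 48);
        }
    }
  else
    {
      /* Large: more than 128 bytes.  The assembly aligns dst and runs a
         64-bytes-per-iteration loop with prefetching; plain memcpy stands
         in for that loop here.  */
      memcpy (dst, src, count);
    }
  return dstin;
}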