diff --git a/glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch b/glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch
new file mode 100644
index 0000000..b6fbf73
--- /dev/null
+++ b/glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch
@@ -0,0 +1,183 @@
+From b9f145df85145506f8e61bac38b792584a38d88f Mon Sep 17 00:00:00 2001
+From: Krzysztof Koch
+Date: Tue, 5 Nov 2019 17:35:18 +0000
+Subject: [PATCH 02/14] aarch64: Increase small and medium cases for
+ __memcpy_generic
+
+Increase the upper bound on medium cases from 96 to 128 bytes.
+Now, up to 128 bytes are copied unrolled.
+
+Increase the upper bound on small cases from 16 to 32 bytes so that
+copies of 17-32 bytes are not impacted by the larger medium case.
+
+Benchmarking:
+The attached figures show relative timing difference with respect
+to 'memcpy_generic', which is the existing implementation.
+'memcpy_med_128' denotes the version of memcpy_generic with
+only the medium case enlarged. The 'memcpy_med_128_small_32' numbers
+are for the version of memcpy_generic submitted in this patch, which
+has both medium and small cases enlarged. The figures were generated
+using the script from:
+https://www.sourceware.org/ml/libc-alpha/2019-10/msg00563.html
+
+Depending on the platform, the performance improvement in the
+bench-memcpy-random.c benchmark ranges from 6% to 20% between
+the original and final version of memcpy.S
+
+Tested against GLIBC testsuite and randomized tests.
+---
+ sysdeps/aarch64/memcpy.S | 82 +++++++++++++++++++++++-----------------
+ 1 file changed, 47 insertions(+), 35 deletions(-)
+
+diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
+index 6e4f4a74bd..10801aa0f4 100644
+--- a/sysdeps/aarch64/memcpy.S
++++ b/sysdeps/aarch64/memcpy.S
+@@ -41,17 +41,19 @@
+ #define C_h	x11
+ #define D_l	x12
+ #define D_h	x13
+-#define E_l	src
+-#define E_h	count
+-#define F_l	srcend
+-#define F_h	dst
++#define E_l	x14
++#define E_h	x15
++#define F_l	x16
++#define F_h	x17
+ #define G_l	count
+ #define G_h	dst
++#define H_l	src
++#define H_h	srcend
+ #define tmp1	x14
+ 
+-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+-   medium copies of 17..96 bytes which are fully unrolled. Large copies
+-   of more than 96 bytes align the destination and use an unrolled loop
++/* Copies are split into 3 main cases: small copies of up to 32 bytes,
++   medium copies of 33..128 bytes which are fully unrolled. Large copies
++   of more than 128 bytes align the destination and use an unrolled loop
+    processing 64 bytes per iteration.
+    In order to share code with memmove, small and medium copies read all
+    data before writing, allowing any kind of overlap. So small, medium
+@@ -73,7 +75,7 @@ ENTRY_ALIGN (MEMMOVE, 6)
+ 	DELOUSE (2)
+ 
+ 	sub	tmp1, dstin, src
+-	cmp	count, 96
++	cmp	count, 128
+ 	ccmp	tmp1, count, 2, hi
+ 	b.lo	L(move_long)
+ 
+@@ -89,31 +91,39 @@ ENTRY (MEMCPY)
+ 	prfm	PLDL1KEEP, [src]
+ 	add	srcend, src, count
+ 	add	dstend, dstin, count
+-	cmp	count, 16
+-	b.ls	L(copy16)
+-	cmp	count, 96
++	cmp	count, 32
++	b.ls	L(copy32)
++	cmp	count, 128
+ 	b.hi	L(copy_long)
+ 
+-	/* Medium copies: 17..96 bytes. */
+-	sub	tmp1, count, 1
++	/* Medium copies: 33..128 bytes. */
+ 	ldp	A_l, A_h, [src]
+-	tbnz	tmp1, 6, L(copy96)
+-	ldp	D_l, D_h, [srcend, -16]
+-	tbz	tmp1, 5, 1f
+ 	ldp	B_l, B_h, [src, 16]
+ 	ldp	C_l, C_h, [srcend, -32]
++	ldp	D_l, D_h, [srcend, -16]
++	cmp	count, 64
++	b.hi	L(copy128)
++	stp	A_l, A_h, [dstin]
+ 	stp	B_l, B_h, [dstin, 16]
+ 	stp	C_l, C_h, [dstend, -32]
+-1:
+-	stp	A_l, A_h, [dstin]
+ 	stp	D_l, D_h, [dstend, -16]
+ 	ret
+ 
+ 	.p2align 4
+-	/* Small copies: 0..16 bytes. */
+-L(copy16):
+-	cmp	count, 8
++	/* Small copies: 0..32 bytes. */
++L(copy32):
++	/* 16-32 bytes. */
++	cmp	count, 16
+ 	b.lo	1f
++	ldp	A_l, A_h, [src]
++	ldp	B_l, B_h, [srcend, -16]
++	stp	A_l, A_h, [dstin]
++	stp	B_l, B_h, [dstend, -16]
++	ret
++	.p2align 4
++1:
++	/* 8-15 bytes. */
++	tbz	count, 3, 1f
+ 	ldr	A_l, [src]
+ 	ldr	A_h, [srcend, -8]
+ 	str	A_l, [dstin]
+@@ -121,6 +131,7 @@ L(copy16):
+ 	ret
+ 	.p2align 4
+ 1:
++	/* 4-7 bytes. */
+ 	tbz	count, 2, 1f
+ 	ldr	A_lw, [src]
+ 	ldr	A_hw, [srcend, -4]
+@@ -142,24 +153,25 @@ L(copy16):
+ 2:	ret
+ 
+ 	.p2align 4
+-	/* Copy 64..96 bytes. Copy 64 bytes from the start and
+-	   32 bytes from the end. */
+-L(copy96):
+-	ldp	B_l, B_h, [src, 16]
+-	ldp	C_l, C_h, [src, 32]
+-	ldp	D_l, D_h, [src, 48]
+-	ldp	E_l, E_h, [srcend, -32]
+-	ldp	F_l, F_h, [srcend, -16]
++	/* Copy 65..128 bytes. Copy 64 bytes from the start and
++	   64 bytes from the end. */
++L(copy128):
++	ldp	E_l, E_h, [src, 32]
++	ldp	F_l, F_h, [src, 48]
++	ldp	G_l, G_h, [srcend, -64]
++	ldp	H_l, H_h, [srcend, -48]
+ 	stp	A_l, A_h, [dstin]
+ 	stp	B_l, B_h, [dstin, 16]
+-	stp	C_l, C_h, [dstin, 32]
+-	stp	D_l, D_h, [dstin, 48]
+-	stp	E_l, E_h, [dstend, -32]
+-	stp	F_l, F_h, [dstend, -16]
++	stp	E_l, E_h, [dstin, 32]
++	stp	F_l, F_h, [dstin, 48]
++	stp	G_l, G_h, [dstend, -64]
++	stp	H_l, H_h, [dstend, -48]
++	stp	C_l, C_h, [dstend, -32]
++	stp	D_l, D_h, [dstend, -16]
+ 	ret
+ 
+ 	/* Align DST to 16 byte alignment so that we don't cross cache line
+-	   boundaries on both loads and stores. There are at least 96 bytes
++	   boundaries on both loads and stores. There are at least 128 bytes
+ 	   to copy, so copy 16 bytes unaligned and then align. The loop
+ 	   copies 64 bytes per iteration and prefetches one iteration ahead. */
+ 
+@@ -215,7 +227,7 @@ L(move_long):
+ 	add	dstend, dstin, count
+ 
+ 	/* Align dstend to 16 byte alignment so that we don't cross cache line
+-	   boundaries on both loads and stores. There are at least 96 bytes
++	   boundaries on both loads and stores. There are at least 128 bytes
+ 	   to copy, so copy 16 bytes unaligned and then align. The loop
+ 	   copies 64 bytes per iteration and prefetches one iteration ahead. */
+ 
+-- 
+2.39.3
+
diff --git a/glibc.spec b/glibc.spec
index 2471c65..33b9795 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -1067,6 +1067,7 @@ Patch2005: glibc-elf-Fix-tst-align3.patch
 Patch2006: glibc-Sync-to-lnd-35-for-LoongArch.patch
 Patch2007: Fix-tst-cancel21.c-to-suit-kernel-struct-sigcontext-.patch
+Patch2008: glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch
 
 ##############################################################################
 # Continued list of core "glibc" package information:
 
@@ -2906,6 +2907,7 @@ fi
 - elf: Properly align PT_LOAD segments
 - Sync loongarch64 code to lnd.35. (lixing@loongson.cn)
 - Add patch for gb18030-2022 from upstream bug#30243 (fundawang@yeah.net)
+- aarch64: Increase small and medium cases for __memcpy_generic (bug#7060) (Kaiqiang Wang)
 
 * Wed Sep 20 2023 Siddhesh Poyarekar - 2.28-236.7
 - CVE-2023-4911 glibc: buffer overflow in ld.so leading to privilege escalation (RHEL-3036)