update to glibc-2.28-251.2.src.rpm
Signed-off-by: Zhao Hang <wb-zh951434@alibaba-inc.com>
This commit is contained in:
parent
e44d0a27bf
commit
bd4ad22e2b
160 changed files with 43782 additions and 39677 deletions
459
glibc-RHEL-15696-78.patch
Normal file
459
glibc-RHEL-15696-78.patch
Normal file
|
@ -0,0 +1,459 @@
|
|||
From b62ace2740a106222e124cc86956448fa07abf4d Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Sun, 6 Feb 2022 00:54:18 -0600
|
||||
Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug.
|
||||
|
||||
Split vec generation into multiple steps. This allows the
|
||||
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
|
||||
case. This saves an expensive lane-cross instruction and removes
|
||||
the need for 'vzeroupper'.
|
||||
|
||||
For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
|
||||
byte broadcast.
|
||||
|
||||
Results for memset-avx2 small (geomean of N = 20 benchset runs).
|
||||
|
||||
size, New Time, Old Time, New / Old
|
||||
0, 4.100, 3.831, 0.934
|
||||
1, 5.074, 4.399, 0.867
|
||||
2, 4.433, 4.411, 0.995
|
||||
4, 4.487, 4.415, 0.984
|
||||
8, 4.454, 4.396, 0.987
|
||||
16, 4.502, 4.443, 0.987
|
||||
|
||||
All relevant string/wcsmbs tests are passing.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/memset.S | 21 ++-
|
||||
.../multiarch/memset-avx2-unaligned-erms.S | 18 +-
|
||||
.../multiarch/memset-avx512-unaligned-erms.S | 18 +-
|
||||
.../multiarch/memset-evex-unaligned-erms.S | 18 +-
|
||||
.../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++-------
|
||||
5 files changed, 152 insertions(+), 87 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
|
||||
index 8672b030..27debd2b 100644
|
||||
--- a/sysdeps/x86_64/memset.S
|
||||
+++ b/sysdeps/x86_64/memset.S
|
||||
@@ -28,17 +28,22 @@
|
||||
#define VMOVU movups
|
||||
#define VMOVA movaps
|
||||
|
||||
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
movd d, %xmm0; \
|
||||
- movq r, %rax; \
|
||||
- punpcklbw %xmm0, %xmm0; \
|
||||
- punpcklwd %xmm0, %xmm0; \
|
||||
- pshufd $0, %xmm0, %xmm0
|
||||
+ pxor %xmm1, %xmm1; \
|
||||
+ pshufb %xmm1, %xmm0; \
|
||||
+ movq r, %rax
|
||||
|
||||
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
movd d, %xmm0; \
|
||||
- movq r, %rax; \
|
||||
- pshufd $0, %xmm0, %xmm0
|
||||
+ pshufd $0, %xmm0, %xmm0; \
|
||||
+ movq r, %rax
|
||||
+
|
||||
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
||||
+
|
||||
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
||||
|
||||
#define SECTION(p) p
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
index 1af668af..c0bf2875 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
@@ -10,15 +10,18 @@
|
||||
# define VMOVU vmovdqu
|
||||
# define VMOVA vmovdqa
|
||||
|
||||
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
vmovd d, %xmm0; \
|
||||
- movq r, %rax; \
|
||||
- vpbroadcastb %xmm0, %ymm0
|
||||
+ movq r, %rax;
|
||||
|
||||
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
- vmovd d, %xmm0; \
|
||||
- movq r, %rax; \
|
||||
- vpbroadcastd %xmm0, %ymm0
|
||||
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
+ MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
|
||||
+
|
||||
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
|
||||
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
|
||||
+
|
||||
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
|
||||
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
|
||||
|
||||
# ifndef SECTION
|
||||
# define SECTION(p) p##.avx
|
||||
@@ -30,5 +33,6 @@
|
||||
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
|
||||
# endif
|
||||
|
||||
+# define USE_XMM_LESS_VEC
|
||||
# include "memset-vec-unaligned-erms.S"
|
||||
#endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
index f14d6f84..5241216a 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
@@ -15,13 +15,19 @@
|
||||
|
||||
# define VZEROUPPER
|
||||
|
||||
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
- movq r, %rax; \
|
||||
- vpbroadcastb d, %VEC0
|
||||
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
+ vpbroadcastb d, %VEC0; \
|
||||
+ movq r, %rax
|
||||
|
||||
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
- movq r, %rax; \
|
||||
- vpbroadcastd d, %VEC0
|
||||
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
+ vpbroadcastd d, %VEC0; \
|
||||
+ movq r, %rax
|
||||
+
|
||||
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
||||
+
|
||||
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
||||
|
||||
# define SECTION(p) p##.evex512
|
||||
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
index 64b09e77..63700215 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
@@ -15,13 +15,19 @@
|
||||
|
||||
# define VZEROUPPER
|
||||
|
||||
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
- movq r, %rax; \
|
||||
- vpbroadcastb d, %VEC0
|
||||
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
+ vpbroadcastb d, %VEC0; \
|
||||
+ movq r, %rax
|
||||
|
||||
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
- movq r, %rax; \
|
||||
- vpbroadcastd d, %VEC0
|
||||
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
+ vpbroadcastd d, %VEC0; \
|
||||
+ movq r, %rax
|
||||
+
|
||||
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
||||
+
|
||||
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
||||
|
||||
# define SECTION(p) p##.evex
|
||||
# define MEMSET_SYMBOL(p,s) p##_evex_##s
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
index f08b7323..a67f9833 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
@@ -58,8 +58,10 @@
|
||||
#ifndef MOVQ
|
||||
# if VEC_SIZE > 16
|
||||
# define MOVQ vmovq
|
||||
+# define MOVD vmovd
|
||||
# else
|
||||
# define MOVQ movq
|
||||
+# define MOVD movd
|
||||
# endif
|
||||
#endif
|
||||
|
||||
@@ -72,9 +74,17 @@
|
||||
#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||||
# define END_REG rcx
|
||||
# define LOOP_REG rdi
|
||||
+# define LESS_VEC_REG rax
|
||||
#else
|
||||
# define END_REG rdi
|
||||
# define LOOP_REG rdx
|
||||
+# define LESS_VEC_REG rdi
|
||||
+#endif
|
||||
+
|
||||
+#ifdef USE_XMM_LESS_VEC
|
||||
+# define XMM_SMALL 1
|
||||
+#else
|
||||
+# define XMM_SMALL 0
|
||||
#endif
|
||||
|
||||
#define PAGE_SIZE 4096
|
||||
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
||||
|
||||
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
|
||||
shl $2, %RDX_LP
|
||||
- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
- jmp L(entry_from_bzero)
|
||||
+ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
+ WMEMSET_VDUP_TO_VEC0_LOW()
|
||||
+ cmpq $VEC_SIZE, %rdx
|
||||
+ jb L(less_vec_no_vdup)
|
||||
+ WMEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+ jmp L(entry_from_wmemset)
|
||||
END (WMEMSET_SYMBOL (__wmemset, unaligned))
|
||||
#endif
|
||||
|
||||
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
|
||||
#endif
|
||||
|
||||
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
||||
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
mov %edx, %edx
|
||||
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
||||
L(entry_from_bzero):
|
||||
cmpq $VEC_SIZE, %rdx
|
||||
jb L(less_vec)
|
||||
+ MEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+L(entry_from_wmemset):
|
||||
cmpq $(VEC_SIZE * 2), %rdx
|
||||
ja L(more_2x_vec)
|
||||
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
||||
# endif
|
||||
|
||||
ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
|
||||
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
mov %edx, %edx
|
||||
# endif
|
||||
cmp $VEC_SIZE, %RDX_LP
|
||||
jb L(less_vec)
|
||||
+ MEMSET_VDUP_TO_VEC0_HIGH ()
|
||||
cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
ja L(stosb_more_2x_vec)
|
||||
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
|
||||
- */
|
||||
- VMOVU %VEC(0), (%rax)
|
||||
- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
|
||||
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
+ VMOVU %VEC(0), (%rdi)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
#endif
|
||||
|
||||
- .p2align 4,, 10
|
||||
+ .p2align 4,, 4
|
||||
L(last_2x_vec):
|
||||
#ifdef USE_LESS_VEC_MASK_STORE
|
||||
- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
|
||||
- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
|
||||
#else
|
||||
VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
|
||||
VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
|
||||
@@ -212,6 +228,7 @@ L(last_2x_vec):
|
||||
#ifdef USE_LESS_VEC_MASK_STORE
|
||||
.p2align 4,, 10
|
||||
L(less_vec):
|
||||
+L(less_vec_no_vdup):
|
||||
/* Less than 1 VEC. */
|
||||
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
||||
# error Unsupported VEC_SIZE!
|
||||
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
|
||||
/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
|
||||
and (4x, 8x] jump to target. */
|
||||
L(more_2x_vec):
|
||||
-
|
||||
- /* Two different methods of setting up pointers / compare. The
|
||||
- two methods are based on the fact that EVEX/AVX512 mov
|
||||
- instructions take more bytes then AVX2/SSE2 mov instructions. As
|
||||
- well that EVEX/AVX512 machines also have fast LEA_BID. Both
|
||||
- setup and END_REG to avoid complex address mode. For EVEX/AVX512
|
||||
- this saves code size and keeps a few targets in one fetch block.
|
||||
- For AVX2/SSE2 this helps prevent AGU bottlenecks. */
|
||||
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||||
- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
|
||||
- LOOP_4X_OFFSET) with LEA_BID. */
|
||||
-
|
||||
- /* END_REG is rcx for EVEX/AVX512. */
|
||||
- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
|
||||
-#endif
|
||||
-
|
||||
- /* Stores to first 2x VEC before cmp as any path forward will
|
||||
- require it. */
|
||||
- VMOVU %VEC(0), (%rax)
|
||||
- VMOVU %VEC(0), VEC_SIZE(%rax)
|
||||
+ /* Store next 2x vec regardless. */
|
||||
+ VMOVU %VEC(0), (%rdi)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
|
||||
|
||||
|
||||
+ /* Two different methods of setting up pointers / compare. The two
|
||||
+ methods are based on the fact that EVEX/AVX512 mov instructions take
|
||||
+ more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
|
||||
+ machines also have fast LEA_BID. Both setup and END_REG to avoid complex
|
||||
+ address mode. For EVEX/AVX512 this saves code size and keeps a few
|
||||
+ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
|
||||
+ bottlenecks. */
|
||||
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
|
||||
/* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
|
||||
addq %rdx, %END_REG
|
||||
@@ -292,6 +299,15 @@ L(more_2x_vec):
|
||||
cmpq $(VEC_SIZE * 4), %rdx
|
||||
jbe L(last_2x_vec)
|
||||
|
||||
+
|
||||
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||||
+ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
|
||||
+ LEA_BID. */
|
||||
+
|
||||
+ /* END_REG is rcx for EVEX/AVX512. */
|
||||
+ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
|
||||
+#endif
|
||||
+
|
||||
/* Store next 2x vec regardless. */
|
||||
VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
|
||||
VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
|
||||
@@ -355,65 +371,93 @@ L(stosb_local):
|
||||
/* Define L(less_vec) only if not otherwise defined. */
|
||||
.p2align 4
|
||||
L(less_vec):
|
||||
+ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
|
||||
+ xmm). This is only does anything for AVX2. */
|
||||
+ MEMSET_VDUP_TO_VEC0_LOW ()
|
||||
+L(less_vec_no_vdup):
|
||||
#endif
|
||||
L(cross_page):
|
||||
#if VEC_SIZE > 32
|
||||
cmpl $32, %edx
|
||||
- jae L(between_32_63)
|
||||
+ jge L(between_32_63)
|
||||
#endif
|
||||
#if VEC_SIZE > 16
|
||||
cmpl $16, %edx
|
||||
- jae L(between_16_31)
|
||||
+ jge L(between_16_31)
|
||||
+#endif
|
||||
+#ifndef USE_XMM_LESS_VEC
|
||||
+ MOVQ %XMM0, %rcx
|
||||
#endif
|
||||
- MOVQ %XMM0, %rdi
|
||||
cmpl $8, %edx
|
||||
- jae L(between_8_15)
|
||||
+ jge L(between_8_15)
|
||||
cmpl $4, %edx
|
||||
- jae L(between_4_7)
|
||||
+ jge L(between_4_7)
|
||||
cmpl $1, %edx
|
||||
- ja L(between_2_3)
|
||||
- jb L(return)
|
||||
- movb %sil, (%rax)
|
||||
- VZEROUPPER_RETURN
|
||||
+ jg L(between_2_3)
|
||||
+ jl L(between_0_0)
|
||||
+ movb %sil, (%LESS_VEC_REG)
|
||||
+L(between_0_0):
|
||||
+ ret
|
||||
|
||||
- /* Align small targets only if not doing so would cross a fetch
|
||||
- line. */
|
||||
+ /* Align small targets only if not doing so would cross a fetch line.
|
||||
+ */
|
||||
#if VEC_SIZE > 32
|
||||
.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
|
||||
/* From 32 to 63. No branch when size == 32. */
|
||||
L(between_32_63):
|
||||
- VMOVU %YMM0, (%rax)
|
||||
- VMOVU %YMM0, -32(%rax, %rdx)
|
||||
+ VMOVU %YMM0, (%LESS_VEC_REG)
|
||||
+ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
#endif
|
||||
|
||||
#if VEC_SIZE >= 32
|
||||
- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
|
||||
L(between_16_31):
|
||||
/* From 16 to 31. No branch when size == 16. */
|
||||
- VMOVU %XMM0, (%rax)
|
||||
- VMOVU %XMM0, -16(%rax, %rdx)
|
||||
- VZEROUPPER_RETURN
|
||||
+ VMOVU %XMM0, (%LESS_VEC_REG)
|
||||
+ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
|
||||
+ ret
|
||||
#endif
|
||||
|
||||
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
|
||||
+ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
|
||||
+ */
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
|
||||
L(between_8_15):
|
||||
/* From 8 to 15. No branch when size == 8. */
|
||||
- movq %rdi, (%rax)
|
||||
- movq %rdi, -8(%rax, %rdx)
|
||||
- VZEROUPPER_RETURN
|
||||
+#ifdef USE_XMM_LESS_VEC
|
||||
+ MOVQ %XMM0, (%rdi)
|
||||
+ MOVQ %XMM0, -8(%rdi, %rdx)
|
||||
+#else
|
||||
+ movq %rcx, (%LESS_VEC_REG)
|
||||
+ movq %rcx, -8(%LESS_VEC_REG, %rdx)
|
||||
+#endif
|
||||
+ ret
|
||||
|
||||
- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
|
||||
+ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
|
||||
+ */
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
|
||||
L(between_4_7):
|
||||
/* From 4 to 7. No branch when size == 4. */
|
||||
- movl %edi, (%rax)
|
||||
- movl %edi, -4(%rax, %rdx)
|
||||
- VZEROUPPER_RETURN
|
||||
+#ifdef USE_XMM_LESS_VEC
|
||||
+ MOVD %XMM0, (%rdi)
|
||||
+ MOVD %XMM0, -4(%rdi, %rdx)
|
||||
+#else
|
||||
+ movl %ecx, (%LESS_VEC_REG)
|
||||
+ movl %ecx, -4(%LESS_VEC_REG, %rdx)
|
||||
+#endif
|
||||
+ ret
|
||||
|
||||
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
|
||||
+ /* 4 * XMM_SMALL for the third mov for AVX2. */
|
||||
+ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
|
||||
L(between_2_3):
|
||||
/* From 2 to 3. No branch when size == 2. */
|
||||
- movw %di, (%rax)
|
||||
- movb %dil, -1(%rax, %rdx)
|
||||
- VZEROUPPER_RETURN
|
||||
+#ifdef USE_XMM_LESS_VEC
|
||||
+ movb %sil, (%rdi)
|
||||
+ movb %sil, 1(%rdi)
|
||||
+ movb %sil, -1(%rdi, %rdx)
|
||||
+#else
|
||||
+ movw %cx, (%LESS_VEC_REG)
|
||||
+ movb %sil, -1(%LESS_VEC_REG, %rdx)
|
||||
+#endif
|
||||
+ ret
|
||||
END (MEMSET_SYMBOL (__memset, unaligned_erms))
|
||||
--
|
||||
GitLab
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue