diff --git a/Fix-tst-cancel21.c-to-suit-kernel-struct-sigcontext-.patch b/Fix-tst-cancel21.c-to-suit-kernel-struct-sigcontext-.patch deleted file mode 100644 index 62b5dab..0000000 --- a/Fix-tst-cancel21.c-to-suit-kernel-struct-sigcontext-.patch +++ /dev/null @@ -1,34 +0,0 @@ -From c5de7c407853b807e8d0c764e6325bb1311f39cd Mon Sep 17 00:00:00 2001 -From: Xing Li -Date: Tue, 4 Jul 2023 15:10:03 +0800 -Subject: [PATCH 2/2] Fix tst-cancel21.c to suit kernel struct sigcontext - change. * nptl/tst-cancel21.c - ---- - nptl/tst-cancel21.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/nptl/tst-cancel21.c b/nptl/tst-cancel21.c -index b10fdbc1..a3653f21 100644 ---- a/nptl/tst-cancel21.c -+++ b/nptl/tst-cancel21.c -@@ -217,14 +217,14 @@ static int - do_test (void) - { - stack_t ss; -- ss.ss_sp = malloc (2 * SIGSTKSZ); -+ ss.ss_sp = malloc (4 * SIGSTKSZ); - if (ss.ss_sp == NULL) - { - puts ("failed to allocate alternate stack"); - return 1; - } - ss.ss_flags = 0; -- ss.ss_size = 2 * SIGSTKSZ; -+ ss.ss_size = 4 * SIGSTKSZ; - if (sigaltstack (&ss, NULL) < 0) - { - printf ("sigaltstack failed %m\n"); --- -2.27.0 - diff --git a/dist b/dist index 37a6f9c..1fe92cf 100644 --- a/dist +++ b/dist @@ -1 +1 @@ -an8_9 +an8_10 diff --git a/glibc-2.28-Add-macro-defination-of-lasx-lsx-and-fcc-.patch b/glibc-2.28-Add-macro-defination-of-lasx-lsx-and-fcc-.patch deleted file mode 100644 index 86f142d..0000000 --- a/glibc-2.28-Add-macro-defination-of-lasx-lsx-and-fcc-.patch +++ /dev/null @@ -1,3946 +0,0 @@ -From d97d963796b092b9c0bd4712f992a08dd20bf5ed Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Tue, 11 Jul 2023 15:40:15 +0800 -Subject: [PATCH 11/14] glibc-2.28: Add macro defination of lasx lsx and fcc - registers. - -Change-Id: Ic723521775a0133e25bf1d568c588f930ec5ff49 -Signed-off-by: ticat_fp ---- - sysdeps/loongarch/dl-trampoline.h | 64 +-- - .../loongarch/lp64/multiarch/memchr-lasx.S | 74 +-- - sysdeps/loongarch/lp64/multiarch/memchr-lsx.S | 48 +- - .../loongarch/lp64/multiarch/memcmp-lasx.S | 138 +++--- - sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S | 194 ++++---- - .../loongarch/lp64/multiarch/memmove-lasx.S | 160 +++---- - .../loongarch/lp64/multiarch/memmove-lsx.S | 424 +++++++++--------- - .../loongarch/lp64/multiarch/memrchr-lasx.S | 74 +-- - .../loongarch/lp64/multiarch/memrchr-lsx.S | 48 +- - .../loongarch/lp64/multiarch/memset-lasx.S | 64 +-- - sysdeps/loongarch/lp64/multiarch/memset-lsx.S | 62 +-- - .../loongarch/lp64/multiarch/rawmemchr-lasx.S | 30 +- - .../loongarch/lp64/multiarch/rawmemchr-lsx.S | 30 +- - sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S | 114 ++--- - .../loongarch/lp64/multiarch/strchr-lasx.S | 52 +-- - sysdeps/loongarch/lp64/multiarch/strchr-lsx.S | 30 +- - sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 114 ++--- - sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 112 ++--- - .../loongarch/lp64/multiarch/strlen-lasx.S | 24 +- - sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 30 +- - .../loongarch/lp64/multiarch/strncmp-lsx.S | 144 +++--- - .../loongarch/lp64/multiarch/strnlen-lasx.S | 46 +- - .../loongarch/lp64/multiarch/strnlen-lsx.S | 30 +- - .../loongarch/lp64/multiarch/strrchr-lasx.S | 88 ++-- - .../loongarch/lp64/multiarch/strrchr-lsx.S | 56 +-- - sysdeps/loongarch/lp64/s_cosf.S | 4 +- - sysdeps/loongarch/lp64/s_sinf.S | 4 +- - sysdeps/loongarch/sys/regdef.h | 74 +++ - 28 files changed, 1203 insertions(+), 1129 deletions(-) - -diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h -index fb15983f..96f41f1d 100644 ---- 
a/sysdeps/loongarch/dl-trampoline.h -+++ b/sysdeps/loongarch/dl-trampoline.h -@@ -61,23 +61,23 @@ ENTRY (_dl_runtime_resolve, 3) - FREG_S fa6, sp, 10*SZREG + 6*SZFREG - FREG_S fa7, sp, 10*SZREG + 7*SZFREG - #ifdef USE_LASX -- xvst $xr0, sp, 10*SZREG + 0*256 -- xvst $xr1, sp, 10*SZREG + 1*256 -- xvst $xr2, sp, 10*SZREG + 2*256 -- xvst $xr3, sp, 10*SZREG + 3*256 -- xvst $xr4, sp, 10*SZREG + 4*256 -- xvst $xr5, sp, 10*SZREG + 5*256 -- xvst $xr6, sp, 10*SZREG + 6*256 -- xvst $xr7, sp, 10*SZREG + 7*256 -+ xvst xr0, sp, 10*SZREG + 0*256 -+ xvst xr1, sp, 10*SZREG + 1*256 -+ xvst xr2, sp, 10*SZREG + 2*256 -+ xvst xr3, sp, 10*SZREG + 3*256 -+ xvst xr4, sp, 10*SZREG + 4*256 -+ xvst xr5, sp, 10*SZREG + 5*256 -+ xvst xr6, sp, 10*SZREG + 6*256 -+ xvst xr7, sp, 10*SZREG + 7*256 - #elif defined USE_LSX -- vst $vr0, sp, 10*SZREG + 0*128 -- vst $vr1, sp, 10*SZREG + 1*128 -- vst $vr2, sp, 10*SZREG + 2*128 -- vst $vr3, sp, 10*SZREG + 3*128 -- vst $vr4, sp, 10*SZREG + 4*128 -- vst $vr5, sp, 10*SZREG + 5*128 -- vst $vr6, sp, 10*SZREG + 6*128 -- vst $vr7, sp, 10*SZREG + 7*128 -+ vst vr0, sp, 10*SZREG + 0*128 -+ vst vr1, sp, 10*SZREG + 1*128 -+ vst vr2, sp, 10*SZREG + 2*128 -+ vst vr3, sp, 10*SZREG + 3*128 -+ vst vr4, sp, 10*SZREG + 4*128 -+ vst vr5, sp, 10*SZREG + 5*128 -+ vst vr6, sp, 10*SZREG + 6*128 -+ vst vr7, sp, 10*SZREG + 7*128 - #endif - #endif - -@@ -119,23 +119,23 @@ ENTRY (_dl_runtime_resolve, 3) - FREG_L fa6, sp, 10*SZREG + 6*SZFREG - FREG_L fa7, sp, 10*SZREG + 7*SZFREG - #ifdef USE_LASX -- xvld $xr0, sp, 10*SZREG + 0*256 -- xvld $xr1, sp, 10*SZREG + 1*256 -- xvld $xr2, sp, 10*SZREG + 2*256 -- xvld $xr3, sp, 10*SZREG + 3*256 -- xvld $xr4, sp, 10*SZREG + 4*256 -- xvld $xr5, sp, 10*SZREG + 5*256 -- xvld $xr6, sp, 10*SZREG + 6*256 -- xvld $xr7, sp, 10*SZREG + 7*256 -+ xvld xr0, sp, 10*SZREG + 0*256 -+ xvld xr1, sp, 10*SZREG + 1*256 -+ xvld xr2, sp, 10*SZREG + 2*256 -+ xvld xr3, sp, 10*SZREG + 3*256 -+ xvld xr4, sp, 10*SZREG + 4*256 -+ xvld xr5, sp, 10*SZREG + 5*256 -+ xvld xr6, sp, 10*SZREG + 6*256 -+ xvld xr7, sp, 10*SZREG + 7*256 - #elif defined USE_LSX -- vld $vr0, sp, 10*SZREG + 0*128 -- vld $vr1, sp, 10*SZREG + 1*128 -- vld $vr2, sp, 10*SZREG + 2*128 -- vld $vr3, sp, 10*SZREG + 3*128 -- vld $vr4, sp, 10*SZREG + 4*128 -- vld $vr5, sp, 10*SZREG + 5*128 -- vld $vr6, sp, 10*SZREG + 6*128 -- vld $vr7, sp, 10*SZREG + 7*128 -+ vld vr0, sp, 10*SZREG + 0*128 -+ vld vr1, sp, 10*SZREG + 1*128 -+ vld vr2, sp, 10*SZREG + 2*128 -+ vld vr3, sp, 10*SZREG + 3*128 -+ vld vr4, sp, 10*SZREG + 4*128 -+ vld vr5, sp, 10*SZREG + 5*128 -+ vld vr6, sp, 10*SZREG + 6*128 -+ vld vr7, sp, 10*SZREG + 7*128 - #endif - #endif - -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -index 387a35fe..425fcede 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -@@ -17,28 +17,28 @@ LEAF(MEMCHR, 6) - andi t0, a0, 0x3f - bstrins.d a0, zero, 5, 0 - -- xvld $xr0, a0, 0 -- xvld $xr1, a0, 32 -+ xvld xr0, a0, 0 -+ xvld xr1, a0, 32 - li.d t1, -1 - li.d t2, 64 - -- xvreplgr2vr.b $xr2, a1 -+ xvreplgr2vr.b xr2, a1 - sll.d t3, t1, t0 - sub.d t2, t2, t0 -- xvseq.b $xr0, $xr0, $xr2 -+ xvseq.b xr0, xr0, xr2 - -- xvseq.b $xr1, $xr1, $xr2 -- xvmsknz.b $xr0, $xr0 -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr3, $xr0, 4 -+ xvseq.b xr1, xr1, xr2 -+ xvmsknz.b xr0, xr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr3, xr0, 4 - - -- xvpickve.w $xr4, $xr1, 4 -- vilvl.h $vr0, $vr3, $vr0 -- vilvl.h $vr1, $vr4, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -+ xvpickve.w 
xr4, xr1, 4 -+ vilvl.h vr0, vr3, vr0 -+ vilvl.h vr1, vr4, vr1 -+ vilvl.w vr0, vr1, vr0 - -- movfr2gr.d t0, $f0 -+ movfr2gr.d t0, fa0 - and t0, t0, t3 - bgeu t2, a2, L(end) - bnez t0, L(found) -@@ -46,28 +46,28 @@ LEAF(MEMCHR, 6) - addi.d a4, a3, -1 - bstrins.d a4, zero, 5, 0 - L(loop): -- xvld $xr0, a0, 64 -- xvld $xr1, a0, 96 -+ xvld xr0, a0, 64 -+ xvld xr1, a0, 96 - - addi.d a0, a0, 64 -- xvseq.b $xr0, $xr0, $xr2 -- xvseq.b $xr1, $xr1, $xr2 -+ xvseq.b xr0, xr0, xr2 -+ xvseq.b xr1, xr1, xr2 - beq a0, a4, L(out) - - -- xvmax.bu $xr3, $xr0, $xr1 -- xvseteqz.v $fcc0, $xr3 -- bcnez $fcc0, L(loop) -- xvmsknz.b $xr0, $xr0 -+ xvmax.bu xr3, xr0, xr1 -+ xvseteqz.v fcc0, xr3 -+ bcnez fcc0, L(loop) -+ xvmsknz.b xr0, xr0 - -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr3, $xr0, 4 -- xvpickve.w $xr4, $xr1, 4 -- vilvl.h $vr0, $vr3, $vr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr3, xr0, 4 -+ xvpickve.w xr4, xr1, 4 -+ vilvl.h vr0, vr3, vr0 - -- vilvl.h $vr1, $vr4, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -- movfr2gr.d t0, $f0 -+ vilvl.h vr1, vr4, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - L(found): - ctz.d t1, t0 - -@@ -79,15 +79,15 @@ L(ret0): - - - L(out): -- xvmsknz.b $xr0, $xr0 -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr3, $xr0, 4 -- xvpickve.w $xr4, $xr1, 4 -- -- vilvl.h $vr0, $vr3, $vr0 -- vilvl.h $vr1, $vr4, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -- movfr2gr.d t0, $f0 -+ xvmsknz.b xr0, xr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr3, xr0, 4 -+ xvpickve.w xr4, xr1, 4 -+ -+ vilvl.h vr0, vr3, vr0 -+ vilvl.h vr1, vr4, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - - L(end): - sub.d t2, zero, a3 -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -index c6952657..08a630d3 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -@@ -17,23 +17,23 @@ LEAF(MEMCHR, 6) - andi t0, a0, 0x1f - bstrins.d a0, zero, 4, 0 - -- vld $vr0, a0, 0 -- vld $vr1, a0, 16 -+ vld vr0, a0, 0 -+ vld vr1, a0, 16 - li.d t1, -1 - li.d t2, 32 - -- vreplgr2vr.b $vr2, a1 -+ vreplgr2vr.b vr2, a1 - sll.d t3, t1, t0 - sub.d t2, t2, t0 -- vseq.b $vr0, $vr0, $vr2 -+ vseq.b vr0, vr0, vr2 - -- vseq.b $vr1, $vr1, $vr2 -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -+ vseq.b vr1, vr1, vr2 -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 - - -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - and t0, t0, t3 - bgeu t2, a2, L(end) - bnez t0, L(found) -@@ -41,23 +41,23 @@ LEAF(MEMCHR, 6) - addi.d a4, a3, -1 - bstrins.d a4, zero, 4, 0 - L(loop): -- vld $vr0, a0, 32 -- vld $vr1, a0, 48 -+ vld vr0, a0, 32 -+ vld vr1, a0, 48 - - addi.d a0, a0, 32 -- vseq.b $vr0, $vr0, $vr2 -- vseq.b $vr1, $vr1, $vr2 -+ vseq.b vr0, vr0, vr2 -+ vseq.b vr1, vr1, vr2 - beq a0, a4, L(out) - -- vmax.bu $vr3, $vr0, $vr1 -- vseteqz.v $fcc0, $vr3 -- bcnez $fcc0, L(loop) -- vmsknz.b $vr0, $vr0 -+ vmax.bu vr3, vr0, vr1 -+ vseteqz.v fcc0, vr3 -+ bcnez fcc0, L(loop) -+ vmsknz.b vr0, vr0 - - -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - L(found): - ctz.w t0, t0 - -@@ -68,10 +68,10 @@ L(ret0): - jr ra - - L(out): -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - - L(end): - sub.d t2, zero, a3 -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -index 
9151d38d..2c192954 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -@@ -20,39 +20,39 @@ LEAF(MEMCMP, 6) - li.d t1, 160 - bgeu a2, t1, L(make_aligned) # a2 >= 160 - L(loop32): -- xvld $xr0, a0, 0 -- xvld $xr1, a1, 0 -+ xvld xr0, a0, 0 -+ xvld xr1, a1, 0 - - addi.d a0, a0, 32 - addi.d a1, a1, 32 - addi.d a2, a2, -32 -- xvseq.b $xr2, $xr0, $xr1 -+ xvseq.b xr2, xr0, xr1 - -- xvsetanyeqz.b $fcc0, $xr2 -- bcnez $fcc0, L(end) -+ xvsetanyeqz.b fcc0, xr2 -+ bcnez fcc0, L(end) - L(last_bytes): - bltu t2, a2, L(loop32) -- xvld $xr0, a3, -32 -+ xvld xr0, a3, -32 - - -- xvld $xr1, a4, -32 -- xvseq.b $xr2, $xr0, $xr1 -+ xvld xr1, a4, -32 -+ xvseq.b xr2, xr0, xr1 - L(end): -- xvmsknz.b $xr2, $xr2 -- xvpermi.q $xr4, $xr0, 1 -+ xvmsknz.b xr2, xr2 -+ xvpermi.q xr4, xr0, 1 - -- xvpickve.w $xr3, $xr2, 4 -- xvpermi.q $xr5, $xr1, 1 -- vilvl.h $vr2, $vr3, $vr2 -- movfr2gr.s t0, $f2 -+ xvpickve.w xr3, xr2, 4 -+ xvpermi.q xr5, xr1, 1 -+ vilvl.h vr2, vr3, vr2 -+ movfr2gr.s t0, fa2 - - cto.w t0, t0 -- vreplgr2vr.b $vr2, t0 -- vshuf.b $vr0, $vr4, $vr0, $vr2 -- vshuf.b $vr1, $vr5, $vr1, $vr2 -+ vreplgr2vr.b vr2, t0 -+ vshuf.b vr0, vr4, vr0, vr2 -+ vshuf.b vr1, vr5, vr1, vr2 - -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - sub.d a0, t0, t1 - jr ra - -@@ -60,59 +60,59 @@ L(end): - L(less32): - srli.d t0, a2, 4 - beqz t0, L(less16) -- vld $vr0, a0, 0 -- vld $vr1, a1, 0 -+ vld vr0, a0, 0 -+ vld vr1, a1, 0 - -- vld $vr2, a3, -16 -- vld $vr3, a4, -16 -+ vld vr2, a3, -16 -+ vld vr3, a4, -16 - L(short_ret): -- vseq.b $vr4, $vr0, $vr1 -- vseq.b $vr5, $vr2, $vr3 -+ vseq.b vr4, vr0, vr1 -+ vseq.b vr5, vr2, vr3 - -- vmsknz.b $vr4, $vr4 -- vmsknz.b $vr5, $vr5 -- vilvl.h $vr4, $vr5, $vr4 -- movfr2gr.s t0, $f4 -+ vmsknz.b vr4, vr4 -+ vmsknz.b vr5, vr5 -+ vilvl.h vr4, vr5, vr4 -+ movfr2gr.s t0, fa4 - - cto.w t0, t0 -- vreplgr2vr.b $vr4, t0 -- vshuf.b $vr0, $vr2, $vr0, $vr4 -- vshuf.b $vr1, $vr3, $vr1, $vr4 -+ vreplgr2vr.b vr4, t0 -+ vshuf.b vr0, vr2, vr0, vr4 -+ vshuf.b vr1, vr3, vr1, vr4 - - -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - sub.d a0, t0, t1 - jr ra - - L(less16): - srli.d t0, a2, 3 - beqz t0, L(less8) -- vldrepl.d $vr0, a0, 0 -- vldrepl.d $vr1, a1, 0 -+ vldrepl.d vr0, a0, 0 -+ vldrepl.d vr1, a1, 0 - -- vldrepl.d $vr2, a3, -8 -- vldrepl.d $vr3, a4, -8 -+ vldrepl.d vr2, a3, -8 -+ vldrepl.d vr3, a4, -8 - b L(short_ret) - L(less8): - srli.d t0, a2, 2 - - beqz t0, L(less4) -- vldrepl.w $vr0, a0, 0 -- vldrepl.w $vr1, a1, 0 -- vldrepl.w $vr2, a3, -4 -+ vldrepl.w vr0, a0, 0 -+ vldrepl.w vr1, a1, 0 -+ vldrepl.w vr2, a3, -4 - - -- vldrepl.w $vr3, a4, -4 -+ vldrepl.w vr3, a4, -4 - b L(short_ret) - L(less4): - srli.d t0, a2, 1 - beqz t0, L(less2) - -- vldrepl.h $vr0, a0, 0 -- vldrepl.h $vr1, a1, 0 -- vldrepl.h $vr2, a3, -2 -- vldrepl.h $vr3, a4, -2 -+ vldrepl.h vr0, a0, 0 -+ vldrepl.h vr1, a1, 0 -+ vldrepl.h vr2, a3, -2 -+ vldrepl.h vr3, a4, -2 - - b L(short_ret) - L(less2): -@@ -132,12 +132,12 @@ L(ret0): - nop - /* make src1 aligned, and adjust scr2 and length. 
*/ - L(make_aligned): -- xvld $xr0, a0, 0 -+ xvld xr0, a0, 0 - -- xvld $xr1, a1, 0 -- xvseq.b $xr2, $xr0, $xr1 -- xvsetanyeqz.b $fcc0, $xr2 -- bcnez $fcc0, L(end) -+ xvld xr1, a1, 0 -+ xvseq.b xr2, xr0, xr1 -+ xvsetanyeqz.b fcc0, xr2 -+ bcnez fcc0, L(end) - - andi t0, a0, 0x1f - sub.d t0, t2, t0 -@@ -151,17 +151,17 @@ L(make_aligned): - - - L(loop_align): -- xvld $xr0, a0, 0 -- xvld $xr1, a1, 0 -- xvld $xr2, a0, 32 -- xvld $xr3, a1, 32 -+ xvld xr0, a0, 0 -+ xvld xr1, a1, 0 -+ xvld xr2, a0, 32 -+ xvld xr3, a1, 32 - -- xvseq.b $xr0, $xr0, $xr1 -- xvseq.b $xr1, $xr2, $xr3 -- xvmin.bu $xr2, $xr1, $xr0 -- xvsetanyeqz.b $fcc0, $xr2 -+ xvseq.b xr0, xr0, xr1 -+ xvseq.b xr1, xr2, xr3 -+ xvmin.bu xr2, xr1, xr0 -+ xvsetanyeqz.b fcc0, xr2 - -- bcnez $fcc0, L(pair_end) -+ bcnez fcc0, L(pair_end) - addi.d a0, a0, 64 - addi.d a1, a1, 64 - bne a0, a5, L(loop_align) -@@ -173,15 +173,15 @@ L(loop_align): - - - L(pair_end): -- xvmsknz.b $xr0, $xr0 -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr2, $xr0, 4 -- xvpickve.w $xr3, $xr1, 4 -- -- vilvl.h $vr0, $vr2, $vr0 -- vilvl.h $vr1, $vr3, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -- movfr2gr.d t0, $f0 -+ xvmsknz.b xr0, xr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr2, xr0, 4 -+ xvpickve.w xr3, xr1, 4 -+ -+ vilvl.h vr0, vr2, vr0 -+ vilvl.h vr1, vr3, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - - cto.d t0, t0 - ldx.bu t1, a0, t0 -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -index 8535aa22..b407275f 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -@@ -21,28 +21,28 @@ ENTRY_NO_ALIGN(MEMCMP) - pcaddi t0, -7 - - andi a3, a0, 0xf -- vld $vr5, t0, 0 -+ vld vr5, t0, 0 - andi a4, a1, 0xf - bne a3, a4, L(unaligned) - - bstrins.d a0, zero, 3, 0 - xor a1, a1, a4 -- vld $vr0, a0, 0 -- vld $vr1, a1, 0 -+ vld vr0, a0, 0 -+ vld vr1, a1, 0 - - - li.d t0, 16 -- vreplgr2vr.b $vr3, a3 -+ vreplgr2vr.b vr3, a3 - sub.d t1, t0, a3 -- vadd.b $vr3, $vr3, $vr5 -+ vadd.b vr3, vr3, vr5 - -- vshuf.b $vr0, $vr3, $vr0, $vr3 -- vshuf.b $vr1, $vr3, $vr1, $vr3 -- vseq.b $vr4, $vr0, $vr1 -+ vshuf.b vr0, vr3, vr0, vr3 -+ vshuf.b vr1, vr3, vr1, vr3 -+ vseq.b vr4, vr0, vr1 - bgeu t1, a2, L(al_end) - -- vsetanyeqz.b $fcc0, $vr4 -- bcnez $fcc0, L(al_found) -+ vsetanyeqz.b fcc0, vr4 -+ bcnez fcc0, L(al_found) - sub.d a2, a2, t1 - andi t1, a2, 31 - -@@ -53,70 +53,70 @@ ENTRY_NO_ALIGN(MEMCMP) - - - L(al_loop): -- vld $vr0, a0, 16 -- vld $vr1, a1, 16 -- vld $vr2, a0, 32 -- vld $vr3, a1, 32 -+ vld vr0, a0, 16 -+ vld vr1, a1, 16 -+ vld vr2, a0, 32 -+ vld vr3, a1, 32 - - addi.d a0, a0, 32 - addi.d a1, a1, 32 -- vseq.b $vr4, $vr0, $vr1 -- vseq.b $vr6, $vr2, $vr3 -+ vseq.b vr4, vr0, vr1 -+ vseq.b vr6, vr2, vr3 - -- vand.v $vr6, $vr4, $vr6 -- vsetanyeqz.b $fcc0, $vr6 -- bcnez $fcc0, L(al_pair_end) -+ vand.v vr6, vr4, vr6 -+ vsetanyeqz.b fcc0, vr6 -+ bcnez fcc0, L(al_pair_end) - bne a0, a4, L(al_loop) - - L(al_less_32bytes): - bgeu t0, a2, L(al_less_16bytes) -- vld $vr0, a0, 16 -- vld $vr1, a1, 16 -- vld $vr2, a0, 32 -+ vld vr0, a0, 16 -+ vld vr1, a1, 16 -+ vld vr2, a0, 32 - - -- vld $vr3, a1, 32 -+ vld vr3, a1, 32 - addi.d a2, a2, -16 -- vreplgr2vr.b $vr6, a2 -- vslt.b $vr5, $vr5, $vr6 -+ vreplgr2vr.b vr6, a2 -+ vslt.b vr5, vr5, vr6 - -- vseq.b $vr4, $vr0, $vr1 -- vseq.b $vr6, $vr2, $vr3 -- vorn.v $vr6, $vr6, $vr5 -+ vseq.b vr4, vr0, vr1 -+ vseq.b vr6, vr2, vr3 -+ vorn.v vr6, vr6, vr5 - L(al_pair_end): -- vsetanyeqz.b $fcc0, $vr4 -+ vsetanyeqz.b fcc0, vr4 - -- bcnez $fcc0, L(al_found) -- vnori.b 
$vr4, $vr6, 0 -- vfrstpi.b $vr4, $vr4, 0 -- vshuf.b $vr0, $vr2, $vr2, $vr4 -+ bcnez fcc0, L(al_found) -+ vnori.b vr4, vr6, 0 -+ vfrstpi.b vr4, vr4, 0 -+ vshuf.b vr0, vr2, vr2, vr4 - -- vshuf.b $vr1, $vr3, $vr3, $vr4 -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vshuf.b vr1, vr3, vr3, vr4 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - sub.d a0, t0, t1 - - - jr ra - L(al_less_16bytes): - beqz a2, L(out) -- vld $vr0, a0, 16 -- vld $vr1, a1, 16 -+ vld vr0, a0, 16 -+ vld vr1, a1, 16 - -- vseq.b $vr4, $vr0, $vr1 -+ vseq.b vr4, vr0, vr1 - L(al_end): -- vreplgr2vr.b $vr6, a2 -- vslt.b $vr5, $vr5, $vr6 -- vorn.v $vr4, $vr4, $vr5 -+ vreplgr2vr.b vr6, a2 -+ vslt.b vr5, vr5, vr6 -+ vorn.v vr4, vr4, vr5 - - L(al_found): -- vnori.b $vr4, $vr4, 0 -- vfrstpi.b $vr4, $vr4, 0 -- vshuf.b $vr0, $vr0, $vr0, $vr4 -- vshuf.b $vr1, $vr1, $vr1, $vr4 -+ vnori.b vr4, vr4, 0 -+ vfrstpi.b vr4, vr4, 0 -+ vshuf.b vr0, vr0, vr0, vr4 -+ vshuf.b vr1, vr1, vr1, vr4 - -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - sub.d a0, t0, t1 - jr ra - -@@ -133,28 +133,28 @@ L(unaligned): - bstrins.d a0, zero, 3, 0 - - xor a1, a1, a4 -- vld $vr4, a0, 0 -- vld $vr1, a1, 0 -+ vld vr4, a0, 0 -+ vld vr1, a1, 0 - li.d t0, 16 - -- vreplgr2vr.b $vr2, a4 -+ vreplgr2vr.b vr2, a4 - sub.d a6, a4, a3 # a6 hold the diff - sub.d t1, t0, a4 - sub.d t2, t0, a6 - - -- vadd.b $vr2, $vr2, $vr5 # [4, 5, 6, ...] -- vreplgr2vr.b $vr6, t2 -- vadd.b $vr6, $vr6, $vr5 # [14, 15, 16, ... ] -- vshuf.b $vr0, $vr4, $vr4, $vr6 # make data be in the same position -+ vadd.b vr2, vr2, vr5 # [4, 5, 6, ...] -+ vreplgr2vr.b vr6, t2 -+ vadd.b vr6, vr6, vr5 # [14, 15, 16, ... ] -+ vshuf.b vr0, vr4, vr4, vr6 # make data be in the same position - -- vshuf.b $vr1, $vr2, $vr1, $vr2 -- vshuf.b $vr0, $vr2, $vr0, $vr2 -- vseq.b $vr7, $vr0, $vr1 -+ vshuf.b vr1, vr2, vr1, vr2 -+ vshuf.b vr0, vr2, vr0, vr2 -+ vseq.b vr7, vr0, vr1 - bgeu t1, a2, L(un_end) - -- vsetanyeqz.b $fcc0, $vr7 -- bcnez $fcc0, L(un_found) -+ vsetanyeqz.b fcc0, vr7 -+ bcnez fcc0, L(un_found) - sub.d a2, a2, t1 - andi t1, a2, 31 - -@@ -165,63 +165,63 @@ L(unaligned): - - - L(un_loop): -- vld $vr2, a0, 16 -- vld $vr1, a1, 16 -- vld $vr3, a1, 32 -+ vld vr2, a0, 16 -+ vld vr1, a1, 16 -+ vld vr3, a1, 32 - addi.d a1, a1, 32 - - addi.d a0, a0, 32 -- vshuf.b $vr0, $vr2, $vr4, $vr6 -- vld $vr4, a0, 0 -- vseq.b $vr7, $vr0, $vr1 -+ vshuf.b vr0, vr2, vr4, vr6 -+ vld vr4, a0, 0 -+ vseq.b vr7, vr0, vr1 - -- vshuf.b $vr2, $vr4, $vr2, $vr6 -- vseq.b $vr8, $vr2, $vr3 -- vand.v $vr8, $vr7, $vr8 -- vsetanyeqz.b $fcc0, $vr8 -+ vshuf.b vr2, vr4, vr2, vr6 -+ vseq.b vr8, vr2, vr3 -+ vand.v vr8, vr7, vr8 -+ vsetanyeqz.b fcc0, vr8 - -- bcnez $fcc0, L(un_pair_end) -+ bcnez fcc0, L(un_pair_end) - bne a1, a4, L(un_loop) - L(un_less_32bytes): - bltu a2, t0, L(un_less_16bytes) -- vld $vr2, a0, 16 -+ vld vr2, a0, 16 - - -- vld $vr1, a1, 16 -+ vld vr1, a1, 16 - addi.d a0, a0, 16 - addi.d a1, a1, 16 - addi.d a2, a2, -16 - -- vshuf.b $vr0, $vr2, $vr4, $vr6 -- vor.v $vr4, $vr2, $vr2 -- vseq.b $vr7, $vr0, $vr1 -- vsetanyeqz.b $fcc0, $vr7 -+ vshuf.b vr0, vr2, vr4, vr6 -+ vor.v vr4, vr2, vr2 -+ vseq.b vr7, vr0, vr1 -+ vsetanyeqz.b fcc0, vr7 - -- bcnez $fcc0, L(un_found) -+ bcnez fcc0, L(un_found) - L(un_less_16bytes): - beqz a2, L(out) -- vld $vr1, a1, 16 -+ vld vr1, a1, 16 - bgeu a6, a2, 1f - -- vld $vr2, a0, 16 -+ vld vr2, a0, 16 - 1: -- vshuf.b $vr0, $vr2, $vr4, $vr6 -- vseq.b $vr7, $vr0, $vr1 -+ vshuf.b vr0, vr2, vr4, vr6 -+ vseq.b vr7, vr0, vr1 - 
L(un_end): -- vreplgr2vr.b $vr3, a2 -+ vreplgr2vr.b vr3, a2 - - -- vslt.b $vr3, $vr5, $vr3 -- vorn.v $vr7, $vr7, $vr3 -+ vslt.b vr3, vr5, vr3 -+ vorn.v vr7, vr7, vr3 - L(un_found): -- vnori.b $vr7, $vr7, 0 -- vfrstpi.b $vr7, $vr7, 0 -+ vnori.b vr7, vr7, 0 -+ vfrstpi.b vr7, vr7, 0 - -- vshuf.b $vr0, $vr0, $vr0, $vr7 -- vshuf.b $vr1, $vr1, $vr1, $vr7 -+ vshuf.b vr0, vr0, vr0, vr7 -+ vshuf.b vr1, vr1, vr1, vr7 - L(calc_result): -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - - sub.d t2, t0, t1 - sub.d t3, t1, t0 -@@ -231,14 +231,14 @@ L(calc_result): - or a0, t0, t1 - jr ra - L(un_pair_end): -- vsetanyeqz.b $fcc0, $vr7 -- bcnez $fcc0, L(un_found) -+ vsetanyeqz.b fcc0, vr7 -+ bcnez fcc0, L(un_found) - - -- vnori.b $vr7, $vr8, 0 -- vfrstpi.b $vr7, $vr7, 0 -- vshuf.b $vr0, $vr2, $vr2, $vr7 -- vshuf.b $vr1, $vr3, $vr3, $vr7 -+ vnori.b vr7, vr8, 0 -+ vfrstpi.b vr7, vr7, 0 -+ vshuf.b vr0, vr2, vr2, vr7 -+ vshuf.b vr1, vr3, vr3, vr7 - - b L(calc_result) - L(out): -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -index e8b2c441..c317592f 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -@@ -26,22 +26,22 @@ LEAF(MEMCPY_NAME, 6) - - li.d t1, 64 - bltu t1, a2, L(copy_long) # a2 > 64 -- xvld $xr0, a1, 0 -- xvld $xr1, a4, -32 -+ xvld xr0, a1, 0 -+ xvld xr1, a4, -32 - -- xvst $xr0, a0, 0 -- xvst $xr1, a3, -32 -+ xvst xr0, a0, 0 -+ xvst xr1, a3, -32 - jr ra - L(less_32bytes): - srli.d t0, a2, 4 - - beqz t0, L(less_16bytes) -- vld $vr0, a1, 0 -- vld $vr1, a4, -16 -- vst $vr0, a0, 0 -+ vld vr0, a1, 0 -+ vld vr1, a4, -16 -+ vst vr0, a0, 0 - - -- vst $vr1, a3, -16 -+ vst vr1, a3, -16 - jr ra - L(less_16bytes): - srli.d t0, a2, 3 -@@ -91,11 +91,11 @@ LEAF(MEMMOVE_NAME, 6) - - li.d t1, 64 - bltu t1, a2, L(move_long) # a2 > 64 -- xvld $xr0, a1, 0 -- xvld $xr1, a4, -32 -+ xvld xr0, a1, 0 -+ xvld xr1, a4, -32 - -- xvst $xr0, a0, 0 -- xvst $xr1, a3, -32 -+ xvst xr0, a0, 0 -+ xvst xr1, a3, -32 - jr ra - L(move_long): - sub.d t2, a0, a1 -@@ -107,8 +107,8 @@ L(copy_long): - sub.d t2, t0, t2 - - -- xvld $xr8, a1, 0 -- xvld $xr9, a4, -32 -+ xvld xr8, a1, 0 -+ xvld xr9, a4, -32 - sub.d t3, a2, t2 - add.d a5, a0, t2 - -@@ -119,69 +119,69 @@ L(copy_long): - - addi.d a6, a6, -1 - L(loop_256): -- xvld $xr0, a1, 0 -- xvld $xr1, a1, 32 -- xvld $xr2, a1, 64 -+ xvld xr0, a1, 0 -+ xvld xr1, a1, 32 -+ xvld xr2, a1, 64 - -- xvld $xr3, a1, 96 -- xvld $xr4, a1, 128 -- xvld $xr5, a1, 160 -- xvld $xr6, a1, 192 -+ xvld xr3, a1, 96 -+ xvld xr4, a1, 128 -+ xvld xr5, a1, 160 -+ xvld xr6, a1, 192 - - -- xvld $xr7, a1, 224 -+ xvld xr7, a1, 224 - addi.d a1, a1, 256 -- xvst $xr0, a5, 0 -- xvst $xr1, a5, 32 -+ xvst xr0, a5, 0 -+ xvst xr1, a5, 32 - -- xvst $xr2, a5, 64 -- xvst $xr3, a5, 96 -- xvst $xr4, a5, 128 -- xvst $xr5, a5, 160 -+ xvst xr2, a5, 64 -+ xvst xr3, a5, 96 -+ xvst xr4, a5, 128 -+ xvst xr5, a5, 160 - -- xvst $xr6, a5, 192 -- xvst $xr7, a5, 224 -+ xvst xr6, a5, 192 -+ xvst xr7, a5, 224 - addi.d a5, a5, 256 - bne a1, a6, L(loop_256) - - L(lt256): - srli.d t2, a2, 7 - beqz t2, L(lt128) -- xvld $xr0, a1, 0 -- xvld $xr1, a1, 32 -+ xvld xr0, a1, 0 -+ xvld xr1, a1, 32 - - -- xvld $xr2, a1, 64 -- xvld $xr3, a1, 96 -+ xvld xr2, a1, 64 -+ xvld xr3, a1, 96 - addi.d a1, a1, 128 - addi.d a2, a2, -128 - -- xvst $xr0, a5, 0 -- xvst $xr1, a5, 32 -- xvst $xr2, a5, 64 -- xvst $xr3, a5, 96 -+ xvst xr0, a5, 0 -+ xvst xr1, a5, 32 -+ xvst xr2, a5, 64 -+ xvst 
xr3, a5, 96 - - addi.d a5, a5, 128 - L(lt128): - bltu a2, t1, L(lt64) -- xvld $xr0, a1, 0 -- xvld $xr1, a1, 32 -+ xvld xr0, a1, 0 -+ xvld xr1, a1, 32 - - addi.d a1, a1, 64 - addi.d a2, a2, -64 -- xvst $xr0, a5, 0 -- xvst $xr1, a5, 32 -+ xvst xr0, a5, 0 -+ xvst xr1, a5, 32 - - - addi.d a5, a5, 64 - L(lt64): - bltu a2, t0, L(lt32) -- xvld $xr0, a1, 0 -- xvst $xr0, a5, 0 -+ xvld xr0, a1, 0 -+ xvst xr0, a5, 0 - - L(lt32): -- xvst $xr8, a0, 0 -- xvst $xr9, a3, -32 -+ xvst xr8, a0, 0 -+ xvst xr9, a3, -32 - jr ra - nop - -@@ -189,9 +189,9 @@ L(copy_back): - addi.d a3, a3, -1 - addi.d a2, a2, -2 - andi t2, a3, 0x1f -- xvld $xr8, a1, 0 -+ xvld xr8, a1, 0 - -- xvld $xr9, a4, -32 -+ xvld xr9, a4, -32 - sub.d t3, a2, t2 - sub.d a5, a3, t2 - sub.d a4, a4, t2 -@@ -203,69 +203,69 @@ L(copy_back): - addi.d a6, a6, 2 - - L(back_loop_256): -- xvld $xr0, a4, -33 -- xvld $xr1, a4, -65 -- xvld $xr2, a4, -97 -- xvld $xr3, a4, -129 -+ xvld xr0, a4, -33 -+ xvld xr1, a4, -65 -+ xvld xr2, a4, -97 -+ xvld xr3, a4, -129 - -- xvld $xr4, a4, -161 -- xvld $xr5, a4, -193 -- xvld $xr6, a4, -225 -- xvld $xr7, a4, -257 -+ xvld xr4, a4, -161 -+ xvld xr5, a4, -193 -+ xvld xr6, a4, -225 -+ xvld xr7, a4, -257 - - addi.d a4, a4, -256 -- xvst $xr0, a5, -32 -- xvst $xr1, a5, -64 -- xvst $xr2, a5, -96 -+ xvst xr0, a5, -32 -+ xvst xr1, a5, -64 -+ xvst xr2, a5, -96 - - -- xvst $xr3, a5, -128 -- xvst $xr4, a5, -160 -- xvst $xr5, a5, -192 -- xvst $xr6, a5, -224 -+ xvst xr3, a5, -128 -+ xvst xr4, a5, -160 -+ xvst xr5, a5, -192 -+ xvst xr6, a5, -224 - -- xvst $xr7, a5, -256 -+ xvst xr7, a5, -256 - addi.d a5, a5, -256 - bne a4, a6, L(back_loop_256) - L(back_lt256): - srli.d t2, a2, 7 - - beqz t2, L(back_lt128) -- xvld $xr0, a4, -33 -- xvld $xr1, a4, -65 -- xvld $xr2, a4, -97 -+ xvld xr0, a4, -33 -+ xvld xr1, a4, -65 -+ xvld xr2, a4, -97 - -- xvld $xr3, a4, -129 -+ xvld xr3, a4, -129 - addi.d a2, a2, -128 - addi.d a4, a4, -128 -- xvst $xr0, a5, -32 -+ xvst xr0, a5, -32 - - -- xvst $xr1, a5, -64 -- xvst $xr2, a5, -96 -- xvst $xr3, a5, -128 -+ xvst xr1, a5, -64 -+ xvst xr2, a5, -96 -+ xvst xr3, a5, -128 - addi.d a5, a5, -128 - - L(back_lt128): - blt a2, t1, L(back_lt64) -- xvld $xr0, a4, -33 -- xvld $xr1, a4, -65 -+ xvld xr0, a4, -33 -+ xvld xr1, a4, -65 - addi.d a2, a2, -64 - - addi.d a4, a4, -64 -- xvst $xr0, a5, -32 -- xvst $xr1, a5, -64 -+ xvst xr0, a5, -32 -+ xvst xr1, a5, -64 - addi.d a5, a5, -64 - - L(back_lt64): - bltu a2, t0, L(back_lt32) -- xvld $xr0, a4, -33 -- xvst $xr0, a5, -32 -+ xvld xr0, a4, -33 -+ xvst xr0, a5, -32 - L(back_lt32): -- xvst $xr8, a0, 0 -+ xvst xr8, a0, 0 - - -- xvst $xr9, a3, -31 -+ xvst xr9, a3, -31 - jr ra - END(MEMMOVE_NAME) - -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -index 90f89c7a..77f1b4ab 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -@@ -23,54 +23,54 @@ LEAF(MEMCPY_NAME, 6) - bltu t8, a2, L(copy_long) # a2 > 64 - bltu t7, a2, L(more_32bytes) # a2 > 32 - -- vld $vr0, a1, 0 -- vld $vr1, a4, -16 -- vst $vr0, a0, 0 -- vst $vr1, a3, -16 -+ vld vr0, a1, 0 -+ vld vr1, a4, -16 -+ vst vr0, a0, 0 -+ vst vr1, a3, -16 - - jr ra - L(more_32bytes): -- vld $vr0, a1, 0 -- vld $vr1, a1, 16 -- vld $vr2, a4, -32 -+ vld vr0, a1, 0 -+ vld vr1, a1, 16 -+ vld vr2, a4, -32 - - -- vld $vr3, a4, -16 -- vst $vr0, a0, 0 -- vst $vr1, a0, 16 -- vst $vr2, a3, -32 -+ vld vr3, a4, -16 -+ vst vr0, a0, 0 -+ vst vr1, a0, 16 -+ vst vr2, a3, -32 - -- vst $vr3, a3, -16 -+ vst vr3, a3, -16 - jr ra - 
L(less_16bytes): - srli.d t0, a2, 3 - beqz t0, L(less_8bytes) - -- vldrepl.d $vr0, a1, 0 -- vldrepl.d $vr1, a4, -8 -- vstelm.d $vr0, a0, 0, 0 -- vstelm.d $vr1, a3, -8, 0 -+ vldrepl.d vr0, a1, 0 -+ vldrepl.d vr1, a4, -8 -+ vstelm.d vr0, a0, 0, 0 -+ vstelm.d vr1, a3, -8, 0 - - jr ra - L(less_8bytes): - srli.d t0, a2, 2 - beqz t0, L(less_4bytes) -- vldrepl.w $vr0, a1, 0 -+ vldrepl.w vr0, a1, 0 - - -- vldrepl.w $vr1, a4, -4 -- vstelm.w $vr0, a0, 0, 0 -- vstelm.w $vr1, a3, -4, 0 -+ vldrepl.w vr1, a4, -4 -+ vstelm.w vr0, a0, 0, 0 -+ vstelm.w vr1, a3, -4, 0 - jr ra - - L(less_4bytes): - srli.d t0, a2, 1 - beqz t0, L(less_2bytes) -- vldrepl.h $vr0, a1, 0 -- vldrepl.h $vr1, a4, -2 -+ vldrepl.h vr0, a1, 0 -+ vldrepl.h vr1, a4, -2 - -- vstelm.h $vr0, a0, 0, 0 -- vstelm.h $vr1, a3, -2, 0 -+ vstelm.h vr0, a0, 0, 0 -+ vstelm.h vr1, a3, -2, 0 - jr ra - L(less_2bytes): - beqz a2, L(less_1bytes) -@@ -93,10 +93,10 @@ LEAF(MEMMOVE_NAME, 6) - bltu t8, a2, L(move_long) # a2 > 64 - bltu t7, a2, L(more_32bytes) # a2 > 32 - -- vld $vr0, a1, 0 -- vld $vr1, a4, -16 -- vst $vr0, a0, 0 -- vst $vr1, a3, -16 -+ vld vr0, a1, 0 -+ vld vr1, a4, -16 -+ vst vr0, a0, 0 -+ vst vr1, a3, -16 - - jr ra - nop -@@ -106,7 +106,7 @@ L(move_long): - - - L(copy_long): -- vld $vr2, a1, 0 -+ vld vr2, a1, 0 - andi t0, a0, 0xf - sub.d t0, t6, t0 - add.d a1, a1, t0 -@@ -114,10 +114,10 @@ L(copy_long): - sub.d a2, a2, t0 - andi t1, a1, 0xf - bnez t1, L(unaligned) -- vld $vr0, a1, 0 -+ vld vr0, a1, 0 - - addi.d a2, a2, -16 -- vst $vr2, a0, 0 -+ vst vr2, a0, 0 - andi t2, a2, 0x7f - add.d a5, a0, t0 - -@@ -128,69 +128,69 @@ L(copy_long): - - - L(al_loop): -- vld $vr1, a1, 16 -- vld $vr2, a1, 32 -- vld $vr3, a1, 48 -- vld $vr4, a1, 64 -+ vld vr1, a1, 16 -+ vld vr2, a1, 32 -+ vld vr3, a1, 48 -+ vld vr4, a1, 64 - -- vld $vr5, a1, 80 -- vld $vr6, a1, 96 -- vld $vr7, a1, 112 -- vst $vr0, a5, 0 -+ vld vr5, a1, 80 -+ vld vr6, a1, 96 -+ vld vr7, a1, 112 -+ vst vr0, a5, 0 - -- vld $vr0, a1, 128 -+ vld vr0, a1, 128 - addi.d a1, a1, 128 -- vst $vr1, a5, 16 -- vst $vr2, a5, 32 -+ vst vr1, a5, 16 -+ vst vr2, a5, 32 - -- vst $vr3, a5, 48 -- vst $vr4, a5, 64 -- vst $vr5, a5, 80 -- vst $vr6, a5, 96 -+ vst vr3, a5, 48 -+ vst vr4, a5, 64 -+ vst vr5, a5, 80 -+ vst vr6, a5, 96 - - -- vst $vr7, a5, 112 -+ vst vr7, a5, 112 - addi.d a5, a5, 128 - bne a1, a6, L(al_loop) - L(al_less_128): - blt a2, t8, L(al_less_64) - -- vld $vr1, a1, 16 -- vld $vr2, a1, 32 -- vld $vr3, a1, 48 -+ vld vr1, a1, 16 -+ vld vr2, a1, 32 -+ vld vr3, a1, 48 - addi.d a2, a2, -64 - -- vst $vr0, a5, 0 -- vld $vr0, a1, 64 -+ vst vr0, a5, 0 -+ vld vr0, a1, 64 - addi.d a1, a1, 64 -- vst $vr1, a5, 16 -+ vst vr1, a5, 16 - -- vst $vr2, a5, 32 -- vst $vr3, a5, 48 -+ vst vr2, a5, 32 -+ vst vr3, a5, 48 - addi.d a5, a5, 64 - L(al_less_64): - blt a2, t7, L(al_less_32) - - -- vld $vr1, a1, 16 -+ vld vr1, a1, 16 - addi.d a2, a2, -32 -- vst $vr0, a5, 0 -- vld $vr0, a1, 32 -+ vst vr0, a5, 0 -+ vld vr0, a1, 32 - - addi.d a1, a1, 32 -- vst $vr1, a5, 16 -+ vst vr1, a5, 16 - addi.d a5, a5, 32 - L(al_less_32): - blt a2, t6, L(al_less_16) - -- vst $vr0, a5, 0 -- vld $vr0, a1, 16 -+ vst vr0, a5, 0 -+ vld vr0, a1, 16 - addi.d a5, a5, 16 - L(al_less_16): -- vld $vr1, a4, -16 -+ vld vr1, a4, -16 - -- vst $vr0, a5, 0 -- vst $vr1, a3, -16 -+ vst vr0, a5, 0 -+ vst vr1, a3, -16 - jr ra - nop - -@@ -201,17 +201,17 @@ L(magic_num): - L(unaligned): - pcaddi t2, -4 - bstrins.d a1, zero, 3, 0 -- vld $vr8, t2, 0 -- vld $vr0, a1, 0 -+ vld vr8, t2, 0 -+ vld vr0, a1, 0 - -- vld $vr1, a1, 16 -+ vld vr1, a1, 16 - addi.d a2, a2, -16 -- 
vst $vr2, a0, 0 -+ vst vr2, a0, 0 - add.d a5, a0, t0 - -- vreplgr2vr.b $vr9, t1 -+ vreplgr2vr.b vr9, t1 - andi t2, a2, 0x7f -- vadd.b $vr9, $vr9, $vr8 -+ vadd.b vr9, vr9, vr8 - addi.d a1, a1, 32 - - -@@ -221,97 +221,97 @@ L(unaligned): - add.d a6, a1, t3 - - L(un_loop): -- vld $vr2, a1, 0 -- vld $vr3, a1, 16 -- vld $vr4, a1, 32 -- vld $vr5, a1, 48 -+ vld vr2, a1, 0 -+ vld vr3, a1, 16 -+ vld vr4, a1, 32 -+ vld vr5, a1, 48 - -- vld $vr6, a1, 64 -- vld $vr7, a1, 80 -- vshuf.b $vr8, $vr1, $vr0, $vr9 -- vld $vr0, a1, 96 -+ vld vr6, a1, 64 -+ vld vr7, a1, 80 -+ vshuf.b vr8, vr1, vr0, vr9 -+ vld vr0, a1, 96 - -- vst $vr8, a5, 0 -- vshuf.b $vr8, $vr2, $vr1, $vr9 -- vld $vr1, a1, 112 -- vst $vr8, a5, 16 -+ vst vr8, a5, 0 -+ vshuf.b vr8, vr2, vr1, vr9 -+ vld vr1, a1, 112 -+ vst vr8, a5, 16 - - - addi.d a1, a1, 128 -- vshuf.b $vr2, $vr3, $vr2, $vr9 -- vshuf.b $vr3, $vr4, $vr3, $vr9 -- vst $vr2, a5, 32 -+ vshuf.b vr2, vr3, vr2, vr9 -+ vshuf.b vr3, vr4, vr3, vr9 -+ vst vr2, a5, 32 - -- vshuf.b $vr4, $vr5, $vr4, $vr9 -- vst $vr3, a5, 48 -- vshuf.b $vr5, $vr6, $vr5, $vr9 -- vst $vr4, a5, 64 -+ vshuf.b vr4, vr5, vr4, vr9 -+ vst vr3, a5, 48 -+ vshuf.b vr5, vr6, vr5, vr9 -+ vst vr4, a5, 64 - -- vshuf.b $vr6, $vr7, $vr6, $vr9 -- vst $vr5, a5, 80 -- vshuf.b $vr7, $vr0, $vr7, $vr9 -- vst $vr6, a5, 96 -+ vshuf.b vr6, vr7, vr6, vr9 -+ vst vr5, a5, 80 -+ vshuf.b vr7, vr0, vr7, vr9 -+ vst vr6, a5, 96 - -- vst $vr7, a5, 112 -+ vst vr7, a5, 112 - addi.d a5, a5, 128 - bne a1, a6, L(un_loop) - L(un_less_128): - blt a2, t8, L(un_less_64) - - -- vld $vr2, a1, 0 -- vld $vr3, a1, 16 -- vshuf.b $vr4, $vr1, $vr0, $vr9 -- vld $vr0, a1, 32 -+ vld vr2, a1, 0 -+ vld vr3, a1, 16 -+ vshuf.b vr4, vr1, vr0, vr9 -+ vld vr0, a1, 32 - -- vst $vr4, a5, 0 -+ vst vr4, a5, 0 - addi.d a2, a2, -64 -- vshuf.b $vr4, $vr2, $vr1, $vr9 -- vld $vr1, a1, 48 -+ vshuf.b vr4, vr2, vr1, vr9 -+ vld vr1, a1, 48 - - addi.d a1, a1, 64 -- vst $vr4, a5, 16 -- vshuf.b $vr2, $vr3, $vr2, $vr9 -- vshuf.b $vr3, $vr0, $vr3, $vr9 -+ vst vr4, a5, 16 -+ vshuf.b vr2, vr3, vr2, vr9 -+ vshuf.b vr3, vr0, vr3, vr9 - -- vst $vr2, a5, 32 -- vst $vr3, a5, 48 -+ vst vr2, a5, 32 -+ vst vr3, a5, 48 - addi.d a5, a5, 64 - L(un_less_64): - blt a2, t7, L(un_less_32) - - -- vshuf.b $vr3, $vr1, $vr0, $vr9 -- vld $vr0, a1, 0 -- vst $vr3, a5, 0 -+ vshuf.b vr3, vr1, vr0, vr9 -+ vld vr0, a1, 0 -+ vst vr3, a5, 0 - addi.d a2, a2, -32 - -- vshuf.b $vr3, $vr0, $vr1, $vr9 -- vld $vr1, a1, 16 -+ vshuf.b vr3, vr0, vr1, vr9 -+ vld vr1, a1, 16 - addi.d a1, a1, 32 -- vst $vr3, a5, 16 -+ vst vr3, a5, 16 - - addi.d a5, a5, 32 - L(un_less_32): - blt a2, t6, L(un_less_16) -- vshuf.b $vr2, $vr1, $vr0, $vr9 -- vor.v $vr0, $vr1, $vr1 -+ vshuf.b vr2, vr1, vr0, vr9 -+ vor.v vr0, vr1, vr1 - -- vld $vr1, a1, 0 -- vst $vr2, a5, 0 -+ vld vr1, a1, 0 -+ vst vr2, a5, 0 - addi.d a5, a5, 16 - L(un_less_16): -- vld $vr2, a4, -16 -+ vld vr2, a4, -16 - - -- vshuf.b $vr0, $vr1, $vr0, $vr9 -- vst $vr0, a5, 0 -- vst $vr2, a3, -16 -+ vshuf.b vr0, vr1, vr0, vr9 -+ vst vr0, a5, 0 -+ vst vr2, a3, -16 - jr ra - - L(copy_back): - addi.d t0, a3, -1 -- vld $vr2, a4, -16 -+ vld vr2, a4, -16 - andi t0, t0, 0xf - addi.d t0, t0, 1 # in case a3 is already aligned, load 16bytes and store 16bytes - -@@ -320,9 +320,9 @@ L(copy_back): - andi t1, a4, 0xf - bnez t1, L(back_unaligned) - -- vld $vr0, a4, -16 -+ vld vr0, a4, -16 - addi.d a2, a2, -16 -- vst $vr2, a3, -16 -+ vst vr2, a3, -16 - andi t2, a2, 0x7f - - -@@ -333,70 +333,70 @@ L(copy_back): - - sub.d a6, a4, t3 - L(back_al_loop): -- vld $vr1, a4, -32 -- vld $vr2, a4, -48 -- vld 
$vr3, a4, -64 -+ vld vr1, a4, -32 -+ vld vr2, a4, -48 -+ vld vr3, a4, -64 - -- vld $vr4, a4, -80 -- vld $vr5, a4, -96 -- vld $vr6, a4, -112 -- vld $vr7, a4, -128 -+ vld vr4, a4, -80 -+ vld vr5, a4, -96 -+ vld vr6, a4, -112 -+ vld vr7, a4, -128 - -- vst $vr0, a3, -16 -- vld $vr0, a4, -144 -+ vst vr0, a3, -16 -+ vld vr0, a4, -144 - addi.d a4, a4, -128 -- vst $vr1, a3, -32 -+ vst vr1, a3, -32 - - -- vst $vr2, a3, -48 -- vst $vr3, a3, -64 -- vst $vr4, a3, -80 -- vst $vr5, a3, -96 -+ vst vr2, a3, -48 -+ vst vr3, a3, -64 -+ vst vr4, a3, -80 -+ vst vr5, a3, -96 - -- vst $vr6, a3, -112 -- vst $vr7, a3, -128 -+ vst vr6, a3, -112 -+ vst vr7, a3, -128 - addi.d a3, a3, -128 - bne a4, a6, L(back_al_loop) - - L(back_al_less_128): - blt a2, t8, L(back_al_less_64) -- vld $vr1, a4, -32 -- vld $vr2, a4, -48 -- vld $vr3, a4, -64 -+ vld vr1, a4, -32 -+ vld vr2, a4, -48 -+ vld vr3, a4, -64 - - addi.d a2, a2, -64 -- vst $vr0, a3, -16 -- vld $vr0, a4, -80 -+ vst vr0, a3, -16 -+ vld vr0, a4, -80 - addi.d a4, a4, -64 - - -- vst $vr1, a3, -32 -- vst $vr2, a3, -48 -- vst $vr3, a3, -64 -+ vst vr1, a3, -32 -+ vst vr2, a3, -48 -+ vst vr3, a3, -64 - addi.d a3, a3, -64 - - L(back_al_less_64): - blt a2, t7, L(back_al_less_32) -- vld $vr1, a4, -32 -+ vld vr1, a4, -32 - addi.d a2, a2, -32 -- vst $vr0, a3, -16 -+ vst vr0, a3, -16 - -- vld $vr0, a4, -48 -- vst $vr1, a3, -32 -+ vld vr0, a4, -48 -+ vst vr1, a3, -32 - addi.d a3, a3, -32 - addi.d a4, a4, -32 - - L(back_al_less_32): - blt a2, t6, L(back_al_less_16) -- vst $vr0, a3, -16 -- vld $vr0, a4, -32 -+ vst vr0, a3, -16 -+ vld vr0, a4, -32 - addi.d a3, a3, -16 - - - L(back_al_less_16): -- vld $vr1, a1, 0 -- vst $vr0, a3, -16 -- vst $vr1, a0, 0 -+ vld vr1, a1, 0 -+ vst vr0, a3, -16 -+ vst vr1, a0, 0 - jr ra - - L(magic_num_2): -@@ -405,18 +405,18 @@ L(magic_num_2): - L(back_unaligned): - pcaddi t2, -4 - bstrins.d a4, zero, 3, 0 -- vld $vr8, t2, 0 -- vld $vr0, a4, 0 -+ vld vr8, t2, 0 -+ vld vr0, a4, 0 - -- vld $vr1, a4, -16 -+ vld vr1, a4, -16 - addi.d a2, a2, -16 -- vst $vr2, a3, -16 -+ vst vr2, a3, -16 - sub.d a3, a3, t0 - - -- vreplgr2vr.b $vr9, t1 -+ vreplgr2vr.b vr9, t1 - andi t2, a2, 0x7f -- vadd.b $vr9, $vr9, $vr8 -+ vadd.b vr9, vr9, vr8 - addi.d a4, a4, -16 - - beq t2, a2, L(back_un_less_128) -@@ -425,92 +425,92 @@ L(back_unaligned): - sub.d a6, a4, t3 - - L(back_un_loop): -- vld $vr2, a4, -16 -- vld $vr3, a4, -32 -- vld $vr4, a4, -48 -+ vld vr2, a4, -16 -+ vld vr3, a4, -32 -+ vld vr4, a4, -48 - -- vld $vr5, a4, -64 -- vld $vr6, a4, -80 -- vld $vr7, a4, -96 -- vshuf.b $vr8, $vr0, $vr1, $vr9 -+ vld vr5, a4, -64 -+ vld vr6, a4, -80 -+ vld vr7, a4, -96 -+ vshuf.b vr8, vr0, vr1, vr9 - - -- vld $vr0, a4, -112 -- vst $vr8, a3, -16 -- vshuf.b $vr8, $vr1, $vr2, $vr9 -- vld $vr1, a4, -128 -+ vld vr0, a4, -112 -+ vst vr8, a3, -16 -+ vshuf.b vr8, vr1, vr2, vr9 -+ vld vr1, a4, -128 - -- vst $vr8, a3, -32 -+ vst vr8, a3, -32 - addi.d a4, a4, -128 -- vshuf.b $vr2, $vr2, $vr3, $vr9 -- vshuf.b $vr3, $vr3, $vr4, $vr9 -+ vshuf.b vr2, vr2, vr3, vr9 -+ vshuf.b vr3, vr3, vr4, vr9 - -- vst $vr2, a3, -48 -- vshuf.b $vr4, $vr4, $vr5, $vr9 -- vst $vr3, a3, -64 -- vshuf.b $vr5, $vr5, $vr6, $vr9 -+ vst vr2, a3, -48 -+ vshuf.b vr4, vr4, vr5, vr9 -+ vst vr3, a3, -64 -+ vshuf.b vr5, vr5, vr6, vr9 - -- vst $vr4, a3, -80 -- vshuf.b $vr6, $vr6, $vr7, $vr9 -- vst $vr5, a3, -96 -- vshuf.b $vr7, $vr7, $vr0, $vr9 -+ vst vr4, a3, -80 -+ vshuf.b vr6, vr6, vr7, vr9 -+ vst vr5, a3, -96 -+ vshuf.b vr7, vr7, vr0, vr9 - - -- vst $vr6, a3, -112 -- vst $vr7, a3, -128 -+ vst vr6, a3, -112 -+ vst vr7, a3, -128 - 
addi.d a3, a3, -128 - bne a4, a6, L(back_un_loop) - - L(back_un_less_128): - blt a2, t8, L(back_un_less_64) -- vld $vr2, a4, -16 -- vld $vr3, a4, -32 -- vshuf.b $vr4, $vr0, $vr1, $vr9 -+ vld vr2, a4, -16 -+ vld vr3, a4, -32 -+ vshuf.b vr4, vr0, vr1, vr9 - -- vld $vr0, a4, -48 -- vst $vr4, a3, -16 -+ vld vr0, a4, -48 -+ vst vr4, a3, -16 - addi.d a2, a2, -64 -- vshuf.b $vr4, $vr1, $vr2, $vr9 -+ vshuf.b vr4, vr1, vr2, vr9 - -- vld $vr1, a4, -64 -+ vld vr1, a4, -64 - addi.d a4, a4, -64 -- vst $vr4, a3, -32 -- vshuf.b $vr2, $vr2, $vr3, $vr9 -+ vst vr4, a3, -32 -+ vshuf.b vr2, vr2, vr3, vr9 - - -- vshuf.b $vr3, $vr3, $vr0, $vr9 -- vst $vr2, a3, -48 -- vst $vr3, a3, -64 -+ vshuf.b vr3, vr3, vr0, vr9 -+ vst vr2, a3, -48 -+ vst vr3, a3, -64 - addi.d a3, a3, -64 - - L(back_un_less_64): - blt a2, t7, L(back_un_less_32) -- vshuf.b $vr3, $vr0, $vr1, $vr9 -- vld $vr0, a4, -16 -- vst $vr3, a3, -16 -+ vshuf.b vr3, vr0, vr1, vr9 -+ vld vr0, a4, -16 -+ vst vr3, a3, -16 - - addi.d a2, a2, -32 -- vshuf.b $vr3, $vr1, $vr0, $vr9 -- vld $vr1, a4, -32 -+ vshuf.b vr3, vr1, vr0, vr9 -+ vld vr1, a4, -32 - addi.d a4, a4, -32 - -- vst $vr3, a3, -32 -+ vst vr3, a3, -32 - addi.d a3, a3, -32 - L(back_un_less_32): - blt a2, t6, L(back_un_less_16) -- vshuf.b $vr2, $vr0, $vr1, $vr9 -+ vshuf.b vr2, vr0, vr1, vr9 - - -- vor.v $vr0, $vr1, $vr1 -- vld $vr1, a4, -16 -- vst $vr2, a3, -16 -+ vor.v vr0, vr1, vr1 -+ vld vr1, a4, -16 -+ vst vr2, a3, -16 - addi.d a3, a3, -16 - - L(back_un_less_16): -- vld $vr2, a1, 0 -- vshuf.b $vr0, $vr0, $vr1, $vr9 -- vst $vr0, a3, -16 -- vst $vr2, a0, 0 -+ vld vr2, a1, 0 -+ vshuf.b vr0, vr0, vr1, vr9 -+ vst vr0, a3, -16 -+ vst vr2, a0, 0 - - jr ra - END(MEMMOVE_NAME) -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -index 9ecd0257..41554552 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -@@ -21,56 +21,56 @@ LEAF(MEMRCHR, 6) - - bstrins.d a3, zero, 5, 0 - addi.d t1, t1, 1 # len for unaligned address -- xvld $xr0, a3, 0 -- xvld $xr1, a3, 32 -+ xvld xr0, a3, 0 -+ xvld xr1, a3, 32 - - sub.d t2, zero, t1 - li.d t3, -1 -- xvreplgr2vr.b $xr2, a1 -+ xvreplgr2vr.b xr2, a1 - andi t4, a0, 0x3f - - srl.d t2, t3, t2 -- xvseq.b $xr0, $xr0, $xr2 -- xvseq.b $xr1, $xr1, $xr2 -- xvmsknz.b $xr0, $xr0 -+ xvseq.b xr0, xr0, xr2 -+ xvseq.b xr1, xr1, xr2 -+ xvmsknz.b xr0, xr0 - - -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr3, $xr0, 4 -- xvpickve.w $xr4, $xr1, 4 -- vilvl.h $vr0, $vr3, $vr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr3, xr0, 4 -+ xvpickve.w xr4, xr1, 4 -+ vilvl.h vr0, vr3, vr0 - -- vilvl.h $vr1, $vr4, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -- movfr2gr.d t0, $f0 -+ vilvl.h vr1, vr4, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - and t0, t0, t2 - - bltu a2, t1, L(end) - bnez t0, L(found) - bstrins.d a0, zero, 5, 0 - L(loop): -- xvld $xr0, a3, -64 -+ xvld xr0, a3, -64 - -- xvld $xr1, a3, -32 -+ xvld xr1, a3, -32 - addi.d a3, a3, -64 -- xvseq.b $xr0, $xr0, $xr2 -- xvseq.b $xr1, $xr1, $xr2 -+ xvseq.b xr0, xr0, xr2 -+ xvseq.b xr1, xr1, xr2 - - - beq a0, a3, L(out) -- xvmax.bu $xr3, $xr0, $xr1 -- xvseteqz.v $fcc0, $xr3 -- bcnez $fcc0, L(loop) -+ xvmax.bu xr3, xr0, xr1 -+ xvseteqz.v fcc0, xr3 -+ bcnez fcc0, L(loop) - -- xvmsknz.b $xr0, $xr0 -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr3, $xr0, 4 -- xvpickve.w $xr4, $xr1, 4 -+ xvmsknz.b xr0, xr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr3, xr0, 4 -+ xvpickve.w xr4, xr1, 4 - -- vilvl.h $vr0, $vr3, $vr0 -- vilvl.h $vr1, $vr4, $vr1 -- vilvl.w $vr0, $vr1, 
$vr0 -- movfr2gr.d t0, $f0 -+ vilvl.h vr0, vr3, vr0 -+ vilvl.h vr1, vr4, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - - L(found): - addi.d a0, a3, 63 -@@ -80,15 +80,15 @@ L(found): - - - L(out): -- xvmsknz.b $xr0, $xr0 -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr3, $xr0, 4 -- xvpickve.w $xr4, $xr1, 4 -- -- vilvl.h $vr0, $vr3, $vr0 -- vilvl.h $vr1, $vr4, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -- movfr2gr.d t0, $f0 -+ xvmsknz.b xr0, xr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr3, xr0, 4 -+ xvpickve.w xr4, xr1, 4 -+ -+ vilvl.h vr0, vr3, vr0 -+ vilvl.h vr1, vr4, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - - L(end): - sll.d t2, t3, t4 -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -index 4bdc18d8..4a302cac 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -@@ -19,46 +19,46 @@ LEAF(MEMRCHR, 6) - - bstrins.d a3, zero, 4, 0 - addi.d t1, t1, 1 # len for unaligned address -- vld $vr0, a3, 0 -- vld $vr1, a3, 16 -+ vld vr0, a3, 0 -+ vld vr1, a3, 16 - - sub.d t2, zero, t1 - li.d t3, -1 -- vreplgr2vr.b $vr2, a1 -+ vreplgr2vr.b vr2, a1 - andi t4, a0, 0x1f - - srl.d t2, t3, t2 -- vseq.b $vr0, $vr0, $vr2 -- vseq.b $vr1, $vr1, $vr2 -- vmsknz.b $vr0, $vr0 -+ vseq.b vr0, vr0, vr2 -+ vseq.b vr1, vr1, vr2 -+ vmsknz.b vr0, vr0 - - -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - and t0, t0, t2 - - bltu a2, t1, L(end) - bnez t0, L(found) - bstrins.d a0, zero, 4, 0 - L(loop): -- vld $vr0, a3, -32 -+ vld vr0, a3, -32 - -- vld $vr1, a3, -16 -+ vld vr1, a3, -16 - addi.d a3, a3, -32 -- vseq.b $vr0, $vr0, $vr2 -- vseq.b $vr1, $vr1, $vr2 -+ vseq.b vr0, vr0, vr2 -+ vseq.b vr1, vr1, vr2 - - beq a0, a3, L(out) -- vmax.bu $vr3, $vr0, $vr1 -- vseteqz.v $fcc0, $vr3 -- bcnez $fcc0, L(loop) -+ vmax.bu vr3, vr0, vr1 -+ vseteqz.v fcc0, vr3 -+ bcnez fcc0, L(loop) - - -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - - L(found): - addi.d a0, a3, 31 -@@ -67,10 +67,10 @@ L(found): - jr ra - - L(out): -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - - L(end): - sll.d t2, t3, t4 -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -index b53c0b7b..5e4908dc 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -@@ -14,7 +14,7 @@ - LEAF(MEMSET, 6) - li.d t1, 32 - move a3, a0 -- xvreplgr2vr.b $xr0, a1 -+ xvreplgr2vr.b xr0, a1 - add.d a4, a0, a2 - - bgeu t1, a2, L(less_32bytes) # len <= 32 -@@ -24,46 +24,46 @@ LEAF(MEMSET, 6) - - L(less_128bytes): - bgeu t2, a2, L(less_64bytes) # len <= 64 -- xvst $xr0, a3, 0 -- xvst $xr0, a3, 32 -- xvst $xr0, a4, -32 -+ xvst xr0, a3, 0 -+ xvst xr0, a3, 32 -+ xvst xr0, a4, -32 - -- xvst $xr0, a4, -64 -+ xvst xr0, a4, -64 - jr ra - L(less_64bytes): -- xvst $xr0, a3, 0 -- xvst $xr0, a4, -32 -+ xvst xr0, a3, 0 -+ xvst xr0, a4, -32 - - - jr ra - L(less_32bytes): - srli.d t0, a2, 4 - beqz t0, L(less_16bytes) -- vst $vr0, a3, 0 -+ vst vr0, a3, 0 - -- vst $vr0, a4, -16 -+ vst vr0, a4, -16 - jr ra - L(less_16bytes): - srli.d t0, a2, 3 - beqz t0, L(less_8bytes) - -- vstelm.d $vr0, a3, 0, 0 -- vstelm.d $vr0, 
a4, -8, 0 -+ vstelm.d vr0, a3, 0, 0 -+ vstelm.d vr0, a4, -8, 0 - jr ra - L(less_8bytes): - srli.d t0, a2, 2 - - beqz t0, L(less_4bytes) -- vstelm.w $vr0, a3, 0, 0 -- vstelm.w $vr0, a4, -4, 0 -+ vstelm.w vr0, a3, 0, 0 -+ vstelm.w vr0, a4, -4, 0 - jr ra - - - L(less_4bytes): - srli.d t0, a2, 1 - beqz t0, L(less_2bytes) -- vstelm.h $vr0, a3, 0, 0 -- vstelm.h $vr0, a4, -2, 0 -+ vstelm.h vr0, a3, 0, 0 -+ vstelm.h vr0, a4, -2, 0 - - jr ra - L(less_2bytes): -@@ -73,7 +73,7 @@ L(less_1bytes): - jr ra - - L(long_bytes): -- xvst $xr0, a3, 0 -+ xvst xr0, a3, 0 - bstrins.d a3, zero, 4, 0 - addi.d a3, a3, 32 - sub.d a2, a4, a3 -@@ -85,15 +85,15 @@ L(long_bytes): - - - L(loop_256): -- xvst $xr0, a3, 0 -- xvst $xr0, a3, 32 -- xvst $xr0, a3, 64 -- xvst $xr0, a3, 96 -+ xvst xr0, a3, 0 -+ xvst xr0, a3, 32 -+ xvst xr0, a3, 64 -+ xvst xr0, a3, 96 - -- xvst $xr0, a3, 128 -- xvst $xr0, a3, 160 -- xvst $xr0, a3, 192 -- xvst $xr0, a3, 224 -+ xvst xr0, a3, 128 -+ xvst xr0, a3, 160 -+ xvst xr0, a3, 192 -+ xvst xr0, a3, 224 - - addi.d a3, a3, 256 - bne a3, t0, L(loop_256) -@@ -101,26 +101,26 @@ L(long_end): - bltu a2, t3, L(end_less_128) - addi.d a2, a2, -128 - -- xvst $xr0, a3, 0 -- xvst $xr0, a3, 32 -- xvst $xr0, a3, 64 -- xvst $xr0, a3, 96 -+ xvst xr0, a3, 0 -+ xvst xr0, a3, 32 -+ xvst xr0, a3, 64 -+ xvst xr0, a3, 96 - - - addi.d a3, a3, 128 - L(end_less_128): - bltu a2, t2, L(end_less_64) - addi.d a2, a2, -64 -- xvst $xr0, a3, 0 -+ xvst xr0, a3, 0 - -- xvst $xr0, a3, 32 -+ xvst xr0, a3, 32 - addi.d a3, a3, 64 - L(end_less_64): - bltu a2, t1, L(end_less_32) -- xvst $xr0, a3, 0 -+ xvst xr0, a3, 0 - - L(end_less_32): -- xvst $xr0, a4, -32 -+ xvst xr0, a4, -32 - jr ra - END(MEMSET) - -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -index 7ab85283..67b279c8 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -@@ -14,7 +14,7 @@ - LEAF(MEMSET, 6) - li.d t1, 16 - move a3, a0 -- vreplgr2vr.b $vr0, a1 -+ vreplgr2vr.b vr0, a1 - add.d a4, a0, a2 - - bgeu t1, a2, L(less_16bytes) # len <= 16 -@@ -24,48 +24,48 @@ LEAF(MEMSET, 6) - - L(less_64bytes): - bgeu t2, a2, L(less_32bytes) # len <= 32 -- vst $vr0, a3, 0 -- vst $vr0, a3, 16 -- vst $vr0, a4, -32 -+ vst vr0, a3, 0 -+ vst vr0, a3, 16 -+ vst vr0, a4, -32 - -- vst $vr0, a4, -16 -+ vst vr0, a4, -16 - jr ra - L(less_32bytes): -- vst $vr0, a3, 0 -- vst $vr0, a4, -16 -+ vst vr0, a3, 0 -+ vst vr0, a4, -16 - - - jr ra - L(less_16bytes): - srli.d t0, a2, 3 - beqz t0, L(less_8bytes) -- vstelm.d $vr0, a3, 0, 0 -+ vstelm.d vr0, a3, 0, 0 - -- vstelm.d $vr0, a4, -8, 0 -+ vstelm.d vr0, a4, -8, 0 - jr ra - L(less_8bytes): - srli.d t0, a2, 2 - beqz t0, L(less_4bytes) - -- vstelm.w $vr0, a3, 0, 0 -- vstelm.w $vr0, a4, -4, 0 -+ vstelm.w vr0, a3, 0, 0 -+ vstelm.w vr0, a4, -4, 0 - jr ra - L(less_4bytes): - srli.d t0, a2, 1 - - beqz t0, L(less_2bytes) -- vstelm.h $vr0, a3, 0, 0 -- vstelm.h $vr0, a4, -2, 0 -+ vstelm.h vr0, a3, 0, 0 -+ vstelm.h vr0, a4, -2, 0 - jr ra - - - L(less_2bytes): - beqz a2, L(less_1bytes) -- vstelm.b $vr0, a3, 0, 0 -+ vstelm.b vr0, a3, 0, 0 - L(less_1bytes): - jr ra - L(long_bytes): -- vst $vr0, a3, 0 -+ vst vr0, a3, 0 - - bstrins.d a3, zero, 3, 0 - addi.d a3, a3, 16 -@@ -77,43 +77,43 @@ L(long_bytes): - sub.d t0, a4, t0 - - L(loop_128): -- vst $vr0, a3, 0 -+ vst vr0, a3, 0 - -- vst $vr0, a3, 16 -- vst $vr0, a3, 32 -- vst $vr0, a3, 48 -- vst $vr0, a3, 64 -+ vst vr0, a3, 16 -+ vst vr0, a3, 32 -+ vst vr0, a3, 48 -+ vst vr0, a3, 64 - - -- vst $vr0, a3, 80 
-- vst $vr0, a3, 96 -- vst $vr0, a3, 112 -+ vst vr0, a3, 80 -+ vst vr0, a3, 96 -+ vst vr0, a3, 112 - addi.d a3, a3, 128 - - bne a3, t0, L(loop_128) - L(long_end): - bltu a2, t3, L(end_less_64) - addi.d a2, a2, -64 -- vst $vr0, a3, 0 -+ vst vr0, a3, 0 - -- vst $vr0, a3, 16 -- vst $vr0, a3, 32 -- vst $vr0, a3, 48 -+ vst vr0, a3, 16 -+ vst vr0, a3, 32 -+ vst vr0, a3, 48 - addi.d a3, a3, 64 - - L(end_less_64): - bltu a2, t2, L(end_less_32) - addi.d a2, a2, -32 -- vst $vr0, a3, 0 -- vst $vr0, a3, 16 -+ vst vr0, a3, 0 -+ vst vr0, a3, 16 - - addi.d a3, a3, 32 - L(end_less_32): - bltu a2, t1, L(end_less_16) -- vst $vr0, a3, 0 -+ vst vr0, a3, 0 - - L(end_less_16): -- vst $vr0, a4, -16 -+ vst vr0, a4, -16 - jr ra - END(MEMSET) - -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -index 1e94aa50..856f99ce 100644 ---- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -@@ -8,15 +8,15 @@ - LEAF(RAWMEMCHR, 6) - move a2, a0 - bstrins.d a0, zero, 4, 0 -- xvld $xr0, a0, 0 -- xvreplgr2vr.b $xr1, a1 -+ xvld xr0, a0, 0 -+ xvreplgr2vr.b xr1, a1 - -- xvseq.b $xr0, $xr0, $xr1 -- xvmsknz.b $xr0, $xr0 -- xvpickve.w $xr2, $xr0, 4 -- vilvl.h $vr0, $vr2, $vr0 -+ xvseq.b xr0, xr0, xr1 -+ xvmsknz.b xr0, xr0 -+ xvpickve.w xr2, xr0, 4 -+ vilvl.h vr0, vr2, vr0 - -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - sra.w t0, t0, a2 - beqz t0, L(loop) - ctz.w t0, t0 -@@ -27,17 +27,17 @@ LEAF(RAWMEMCHR, 6) - nop - - L(loop): -- xvld $xr0, a0, 32 -+ xvld xr0, a0, 32 - addi.d a0, a0, 32 -- xvseq.b $xr0, $xr0, $xr1 -- xvseteqz.v $fcc0, $xr0 -+ xvseq.b xr0, xr0, xr1 -+ xvseteqz.v fcc0, xr0 - -- bcnez $fcc0, L(loop) -- xvmsknz.b $xr0, $xr0 -- xvpickve.w $xr1, $xr0, 4 -- vilvl.h $vr0, $vr1, $vr0 -+ bcnez fcc0, L(loop) -+ xvmsknz.b xr0, xr0 -+ xvpickve.w xr1, xr0, 4 -+ vilvl.h vr0, vr1, vr0 - -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - ctz.w t0, t0 - add.d a0, a0, t0 - jr ra -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -index 40bf0cda..7e864e96 100644 ---- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -@@ -14,17 +14,17 @@ - LEAF(RAWMEMCHR, 6) - move a2, a0 - bstrins.d a0, zero, 4, 0 -- vld $vr0, a0, 0 -- vld $vr1, a0, 16 -+ vld vr0, a0, 0 -+ vld vr1, a0, 16 - -- vreplgr2vr.b $vr2, a1 -- vseq.b $vr0, $vr0, $vr2 -- vseq.b $vr1, $vr1, $vr2 -- vmsknz.b $vr0, $vr0 -+ vreplgr2vr.b vr2, a1 -+ vseq.b vr0, vr0, vr2 -+ vseq.b vr1, vr1, vr2 -+ vmsknz.b vr0, vr0 - -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - sra.w t0, t0, a2 - - beqz t0, L(loop) -@@ -34,15 +34,15 @@ LEAF(RAWMEMCHR, 6) - - - L(loop): -- vld $vr0, a0, 32 -+ vld vr0, a0, 32 - addi.d a0, a0, 16 -- vseq.b $vr0, $vr0, $vr2 -- vseteqz.v $fcc0, $vr0 -+ vseq.b vr0, vr0, vr2 -+ vseteqz.v fcc0, vr0 - -- bcnez $fcc0, L(loop) -+ bcnez fcc0, L(loop) - addi.d a0, a0, 16 -- vfrstpi.b $vr0, $vr0, 0 -- vpickve2gr.bu t0, $vr0, 0 -+ vfrstpi.b vr0, vr0, 0 -+ vpickve2gr.bu t0, vr0, 0 - - add.d a0, a0, t0 - jr ra -diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -index 0836f590..53832de7 100644 ---- a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -@@ -18,67 +18,67 @@ L(magic_num): - ENTRY_NO_ALIGN(STPCPY) - pcaddi t0, -4 - andi a4, a1, 0xf -- vld 
$vr1, t0, 0 -+ vld vr1, t0, 0 - beqz a4, L(load_start) - - xor t0, a1, a4 -- vld $vr0, t0, 0 -- vreplgr2vr.b $vr2, a4 -- vadd.b $vr2, $vr2, $vr1 -+ vld vr0, t0, 0 -+ vreplgr2vr.b vr2, a4 -+ vadd.b vr2, vr2, vr1 - -- vshuf.b $vr0, $vr2, $vr0, $vr2 -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(end) -+ vshuf.b vr0, vr2, vr0, vr2 -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(end) - L(load_start): -- vld $vr0, a1, 0 -+ vld vr0, a1, 0 - - - li.d t1, 16 - andi a3, a0, 0xf -- vsetanyeqz.b $fcc0, $vr0 -+ vsetanyeqz.b fcc0, vr0 - sub.d t0, t1, a3 - -- bcnez $fcc0, L(end) -+ bcnez fcc0, L(end) - add.d a1, a1, t0 -- vst $vr0, a0, 0 -+ vst vr0, a0, 0 - add.d a0, a0, t0 - - bne a3, a4, L(unaligned) -- vld $vr0, a1, 0 -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(end) -+ vld vr0, a1, 0 -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(end) - - L(loop): -- vst $vr0, a0, 0 -- vld $vr0, a1, 16 -+ vst vr0, a0, 0 -+ vld vr0, a1, 16 - addi.d a0, a0, 16 - addi.d a1, a1, 16 - - -- vsetanyeqz.b $fcc0, $vr0 -- bceqz $fcc0, L(loop) -- vmsknz.b $vr1, $vr0 -- movfr2gr.s t0, $f1 -+ vsetanyeqz.b fcc0, vr0 -+ bceqz fcc0, L(loop) -+ vmsknz.b vr1, vr0 -+ movfr2gr.s t0, fa1 - - cto.w t0, t0 - add.d a1, a1, t0 -- vld $vr0, a1, -15 -+ vld vr0, a1, -15 - add.d a0, a0, t0 - -- vst $vr0, a0, -15 -+ vst vr0, a0, -15 - jr ra - L(end): -- vseqi.b $vr1, $vr0, 0 -- vfrstpi.b $vr1, $vr1, 0 -+ vseqi.b vr1, vr0, 0 -+ vfrstpi.b vr1, vr1, 0 - -- vpickve2gr.bu t0, $vr1, 0 -+ vpickve2gr.bu t0, vr1, 0 - addi.d t0, t0, 1 - L(end_16): - andi t1, t0, 16 - beqz t1, L(end_8) - - -- vst $vr0, a0, 0 -+ vst vr0, a0, 0 - addi.d a0, a0, 15 - jr ra - L(end_8): -@@ -89,26 +89,26 @@ L(end_8): - andi t5, t0, 1 - beqz t2, L(end_4) - -- vstelm.d $vr0, a0, 0, 0 -+ vstelm.d vr0, a0, 0, 0 - addi.d a0, a0, 8 -- vbsrl.v $vr0, $vr0, 8 -+ vbsrl.v vr0, vr0, 8 - L(end_4): - beqz t3, L(end_2) - -- vstelm.w $vr0, a0, 0, 0 -+ vstelm.w vr0, a0, 0, 0 - addi.d a0, a0, 4 -- vbsrl.v $vr0, $vr0, 4 -+ vbsrl.v vr0, vr0, 4 - L(end_2): - beqz t4, L(end_1) - - -- vstelm.h $vr0, a0, 0, 0 -+ vstelm.h vr0, a0, 0, 0 - addi.d a0, a0, 2 -- vbsrl.v $vr0, $vr0, 2 -+ vbsrl.v vr0, vr0, 2 - L(end_1): - beqz t5, L(out) - -- vstelm.b $vr0, a0, 0, 0 -+ vstelm.b vr0, a0, 0, 0 - addi.d a0, a0, 1 - L(out): - addi.d a0, a0, -1 -@@ -120,49 +120,49 @@ L(unaligned): - andi a3, a1, 0xf - bstrins.d a1, zero, 3, 0 - -- vld $vr2, a1, 0 -- vreplgr2vr.b $vr3, a3 -- vslt.b $vr4, $vr1, $vr3 -- vor.v $vr0, $vr2, $vr4 -+ vld vr2, a1, 0 -+ vreplgr2vr.b vr3, a3 -+ vslt.b vr4, vr1, vr3 -+ vor.v vr0, vr2, vr4 - - -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(un_first_end) -- vld $vr0, a1, 16 -- vadd.b $vr3, $vr3, $vr1 -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(un_first_end) -+ vld vr0, a1, 16 -+ vadd.b vr3, vr3, vr1 - - addi.d a1, a1, 16 -- vshuf.b $vr4, $vr0, $vr2, $vr3 -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(un_end) -+ vshuf.b vr4, vr0, vr2, vr3 -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(un_end) - - L(un_loop): -- vor.v $vr2, $vr0, $vr0 -- vld $vr0, a1, 16 -- vst $vr4, a0, 0 -+ vor.v vr2, vr0, vr0 -+ vld vr0, a1, 16 -+ vst vr4, a0, 0 - addi.d a1, a1, 16 - - addi.d a0, a0, 16 -- vshuf.b $vr4, $vr0, $vr2, $vr3 -- vsetanyeqz.b $fcc0, $vr0 -- bceqz $fcc0, L(un_loop) -+ vshuf.b vr4, vr0, vr2, vr3 -+ vsetanyeqz.b fcc0, vr0 -+ bceqz fcc0, L(un_loop) - - - L(un_end): -- vsetanyeqz.b $fcc0, $vr4 -- bcnez $fcc0, 1f -- vst $vr4, a0, 0 -+ vsetanyeqz.b fcc0, vr4 -+ bcnez fcc0, 1f -+ vst vr4, a0, 0 - 1: -- vmsknz.b $vr1, $vr0 -+ vmsknz.b vr1, vr0 - -- movfr2gr.s t0, $f1 -+ movfr2gr.s t0, fa1 - cto.w t0, t0 - add.d a1, a1, t0 -- 
vld $vr0, a1, -15 -+ vld vr0, a1, -15 - - add.d a0, a0, t0 - sub.d a0, a0, a3 -- vst $vr0, a0, 1 -+ vst vr0, a0, 1 - addi.d a0, a0, 16 - - jr ra -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -index 3f6ad915..fab6edc7 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -@@ -16,18 +16,18 @@ - LEAF(STRCHR, 6) - andi t1, a0, 0x1f - bstrins.d a0, zero, 4, 0 -- xvld $xr0, a0, 0 -+ xvld xr0, a0, 0 - li.d t2, -1 - -- xvreplgr2vr.b $xr1, a1 -+ xvreplgr2vr.b xr1, a1 - sll.d t1, t2, t1 -- xvxor.v $xr2, $xr0, $xr1 -- xvmin.bu $xr0, $xr0, $xr2 -+ xvxor.v xr2, xr0, xr1 -+ xvmin.bu xr0, xr0, xr2 - -- xvmsknz.b $xr0, $xr0 -- xvpickve.w $xr3, $xr0, 4 -- vilvl.h $vr0, $vr3, $vr0 -- movfr2gr.s t0, $f0 -+ xvmsknz.b xr0, xr0 -+ xvpickve.w xr3, xr0, 4 -+ vilvl.h vr0, vr3, vr0 -+ movfr2gr.s t0, fa0 - - orn t0, t0, t1 - bne t0, t2, L(end) -@@ -36,37 +36,37 @@ LEAF(STRCHR, 6) - - - L(loop): -- xvld $xr0, a0, 0 -- xvxor.v $xr2, $xr0, $xr1 -- xvmin.bu $xr0, $xr0, $xr2 -- xvsetanyeqz.b $fcc0, $xr0 -+ xvld xr0, a0, 0 -+ xvxor.v xr2, xr0, xr1 -+ xvmin.bu xr0, xr0, xr2 -+ xvsetanyeqz.b fcc0, xr0 - -- bcnez $fcc0, L(loop_end) -- xvld $xr0, a0, 32 -+ bcnez fcc0, L(loop_end) -+ xvld xr0, a0, 32 - addi.d a0, a0, 64 -- xvxor.v $xr2, $xr0, $xr1 -+ xvxor.v xr2, xr0, xr1 - -- xvmin.bu $xr0, $xr0, $xr2 -- xvsetanyeqz.b $fcc0, $xr0 -- bceqz $fcc0, L(loop) -+ xvmin.bu xr0, xr0, xr2 -+ xvsetanyeqz.b fcc0, xr0 -+ bceqz fcc0, L(loop) - addi.d a0, a0, -32 - - L(loop_end): -- xvmsknz.b $xr0, $xr0 -- xvpickve.w $xr1, $xr0, 4 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ xvmsknz.b xr0, xr0 -+ xvpickve.w xr1, xr0, 4 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - - - L(end): - cto.w t0, t0 - add.d a0, a0, t0 - #ifndef AS_STRCHRNUL -- vreplgr2vr.b $vr0, t0 -- xvpermi.q $xr3, $xr2, 1 -+ vreplgr2vr.b vr0, t0 -+ xvpermi.q xr3, xr2, 1 - -- vshuf.b $vr0, $vr3, $vr2, $vr0 -- vpickve2gr.bu t0, $vr0, 0 -+ vshuf.b vr0, vr3, vr2, vr0 -+ vpickve2gr.bu t0, vr0, 0 - masknez a0, a0, t0 - #endif - jr ra -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -index 4ad9a4ad..ebeb332e 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -@@ -16,16 +16,16 @@ - LEAF(STRCHR, 6) - andi t1, a0, 0xf - bstrins.d a0, zero, 3, 0 -- vld $vr0, a0, 0 -+ vld vr0, a0, 0 - li.d t2, -1 - -- vreplgr2vr.b $vr1, a1 -+ vreplgr2vr.b vr1, a1 - sll.d t3, t2, t1 -- vxor.v $vr2, $vr0, $vr1 -- vmin.bu $vr0, $vr0, $vr2 -+ vxor.v vr2, vr0, vr1 -+ vmin.bu vr0, vr0, vr2 - -- vmsknz.b $vr0, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr0, vr0 -+ movfr2gr.s t0, fa0 - ext.w.h t0, t0 - orn t0, t0, t3 - -@@ -34,23 +34,23 @@ L(found): - cto.w t0, t0 - add.d a0, a0, t0 - #ifndef AS_STRCHRNUL -- vreplve.b $vr2, $vr2, t0 -- vpickve2gr.bu t1, $vr2, 0 -+ vreplve.b vr2, vr2, t0 -+ vpickve2gr.bu t1, vr2, 0 - masknez a0, a0, t1 - #endif - jr ra - - - L(loop): -- vld $vr0, a0, 16 -+ vld vr0, a0, 16 - addi.d a0, a0, 16 -- vxor.v $vr2, $vr0, $vr1 -- vmin.bu $vr0, $vr0, $vr2 -+ vxor.v vr2, vr0, vr1 -+ vmin.bu vr0, vr0, vr2 - -- vsetanyeqz.b $fcc0, $vr0 -- bceqz $fcc0, L(loop) -- vmsknz.b $vr0, $vr0 -- movfr2gr.s t0, $f0 -+ vsetanyeqz.b fcc0, vr0 -+ bceqz fcc0, L(loop) -+ vmsknz.b vr0, vr0 -+ movfr2gr.s t0, fa0 - - b L(found) - END(STRCHR) -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -index 
c86e3ecd..c6e1110c 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -@@ -20,45 +20,45 @@ L(magic_num): - ENTRY_NO_ALIGN(STRCMP) - pcaddi t0, -4 - andi a2, a0, 0xf -- vld $vr2, t0, 0 -+ vld vr2, t0, 0 - andi a3, a1, 0xf - - bne a2, a3, L(unaligned) - bstrins.d a0, zero, 3, 0 - bstrins.d a1, zero, 3, 0 -- vld $vr0, a0, 0 -+ vld vr0, a0, 0 - -- vld $vr1, a1, 0 -- vreplgr2vr.b $vr3, a2 -- vslt.b $vr2, $vr2, $vr3 -- vseq.b $vr3, $vr0, $vr1 -+ vld vr1, a1, 0 -+ vreplgr2vr.b vr3, a2 -+ vslt.b vr2, vr2, vr3 -+ vseq.b vr3, vr0, vr1 - - -- vmin.bu $vr3, $vr0, $vr3 -- vor.v $vr3, $vr3, $vr2 -- vsetanyeqz.b $fcc0, $vr3 -- bcnez $fcc0, L(al_out) -+ vmin.bu vr3, vr0, vr3 -+ vor.v vr3, vr3, vr2 -+ vsetanyeqz.b fcc0, vr3 -+ bcnez fcc0, L(al_out) - - L(al_loop): -- vld $vr0, a0, 16 -- vld $vr1, a1, 16 -+ vld vr0, a0, 16 -+ vld vr1, a1, 16 - addi.d a0, a0, 16 - addi.d a1, a1, 16 - -- vseq.b $vr3, $vr0, $vr1 -- vmin.bu $vr3, $vr0, $vr3 -- vsetanyeqz.b $fcc0, $vr3 -- bceqz $fcc0, L(al_loop) -+ vseq.b vr3, vr0, vr1 -+ vmin.bu vr3, vr0, vr3 -+ vsetanyeqz.b fcc0, vr3 -+ bceqz fcc0, L(al_loop) - - L(al_out): -- vseqi.b $vr3, $vr3, 0 -- vfrstpi.b $vr3, $vr3, 0 -- vshuf.b $vr0, $vr0, $vr0, $vr3 -- vshuf.b $vr1, $vr1, $vr1, $vr3 -+ vseqi.b vr3, vr3, 0 -+ vfrstpi.b vr3, vr3, 0 -+ vshuf.b vr0, vr0, vr0, vr3 -+ vshuf.b vr1, vr1, vr1, vr3 - - -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - sub.d a0, t0, t1 - jr ra - -@@ -79,52 +79,52 @@ L(unaligned): - bstrins.d a1, zero, 3, 0 - - -- vld $vr0, a0, 0 -- vld $vr3, a1, 0 -- vreplgr2vr.b $vr4, a2 -- vreplgr2vr.b $vr5, a3 -+ vld vr0, a0, 0 -+ vld vr3, a1, 0 -+ vreplgr2vr.b vr4, a2 -+ vreplgr2vr.b vr5, a3 - -- vslt.b $vr7, $vr2, $vr4 -- vsub.b $vr4, $vr4, $vr5 -- vaddi.bu $vr6, $vr2, 16 -- vsub.b $vr6, $vr6, $vr4 -+ vslt.b vr7, vr2, vr4 -+ vsub.b vr4, vr4, vr5 -+ vaddi.bu vr6, vr2, 16 -+ vsub.b vr6, vr6, vr4 - -- vshuf.b $vr1, $vr3, $vr3, $vr6 -- vseq.b $vr4, $vr0, $vr1 -- vmin.bu $vr4, $vr0, $vr4 -- vor.v $vr4, $vr4, $vr7 -+ vshuf.b vr1, vr3, vr3, vr6 -+ vseq.b vr4, vr0, vr1 -+ vmin.bu vr4, vr0, vr4 -+ vor.v vr4, vr4, vr7 - -- vsetanyeqz.b $fcc0, $vr4 -- bcnez $fcc0, L(un_end) -- vslt.b $vr5, $vr2, $vr5 -- vor.v $vr3, $vr3, $vr5 -+ vsetanyeqz.b fcc0, vr4 -+ bcnez fcc0, L(un_end) -+ vslt.b vr5, vr2, vr5 -+ vor.v vr3, vr3, vr5 - - - L(un_loop): -- vld $vr0, a0, 16 -- vsetanyeqz.b $fcc0, $vr3 -- bcnez $fcc0, L(remaining_end) -- vor.v $vr1, $vr3, $vr3 -+ vld vr0, a0, 16 -+ vsetanyeqz.b fcc0, vr3 -+ bcnez fcc0, L(remaining_end) -+ vor.v vr1, vr3, vr3 - -- vld $vr3, a1, 16 -+ vld vr3, a1, 16 - addi.d a0, a0, 16 - addi.d a1, a1, 16 -- vshuf.b $vr1, $vr3, $vr1, $vr6 -+ vshuf.b vr1, vr3, vr1, vr6 - -- vseq.b $vr4, $vr0, $vr1 -- vmin.bu $vr4, $vr0, $vr4 -- vsetanyeqz.b $fcc0, $vr4 -- bceqz $fcc0, L(un_loop) -+ vseq.b vr4, vr0, vr1 -+ vmin.bu vr4, vr0, vr4 -+ vsetanyeqz.b fcc0, vr4 -+ bceqz fcc0, L(un_loop) - - L(un_end): -- vseqi.b $vr4, $vr4, 0 -- vfrstpi.b $vr4, $vr4, 0 -- vshuf.b $vr0, $vr0, $vr0, $vr4 -- vshuf.b $vr1, $vr1, $vr1, $vr4 -+ vseqi.b vr4, vr4, 0 -+ vfrstpi.b vr4, vr4, 0 -+ vshuf.b vr0, vr0, vr0, vr4 -+ vshuf.b vr1, vr1, vr1, vr4 - - -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - sub.d t3, t0, t1 - sub.d t4, t1, t0 - -@@ -134,9 +134,9 @@ L(un_end): - jr ra - - L(remaining_end): -- vshuf.b $vr1, $vr3, $vr3, $vr6 -- vseq.b $vr4, $vr0, $vr1 -- vmin.bu $vr4, $vr4, $vr0 -+ 
vshuf.b vr1, vr3, vr3, vr6 -+ vseq.b vr4, vr0, vr1 -+ vmin.bu vr4, vr4, vr0 - b L(un_end) - END(STRCMP) - -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -index dbc061ad..52d77fa3 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -@@ -21,61 +21,61 @@ L(magic_num): - ENTRY_NO_ALIGN(STRCPY) - pcaddi t0, -4 - andi a4, a1, 0xf -- vld $vr1, t0, 0 -+ vld vr1, t0, 0 - move a2, a0 - - beqz a4, L(load_start) - xor t0, a1, a4 -- vld $vr0, t0, 0 -- vreplgr2vr.b $vr2, a4 -+ vld vr0, t0, 0 -+ vreplgr2vr.b vr2, a4 - -- vadd.b $vr2, $vr2, $vr1 -- vshuf.b $vr0, $vr2, $vr0, $vr2 -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(end) -+ vadd.b vr2, vr2, vr1 -+ vshuf.b vr0, vr2, vr0, vr2 -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(end) - - - L(load_start): -- vld $vr0, a1, 0 -+ vld vr0, a1, 0 - li.d t1, 16 - andi a3, a2, 0xf -- vsetanyeqz.b $fcc0, $vr0 -+ vsetanyeqz.b fcc0, vr0 - - sub.d t0, t1, a3 -- bcnez $fcc0, L(end) -+ bcnez fcc0, L(end) - add.d a1, a1, t0 -- vst $vr0, a2, 0 -+ vst vr0, a2, 0 - - andi a3, a1, 0xf - add.d a2, a2, t0 - bnez a3, L(unaligned) -- vld $vr0, a1, 0 -+ vld vr0, a1, 0 - -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(end) -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(end) - L(loop): -- vst $vr0, a2, 0 -- vld $vr0, a1, 16 -+ vst vr0, a2, 0 -+ vld vr0, a1, 16 - - - addi.d a2, a2, 16 - addi.d a1, a1, 16 -- vsetanyeqz.b $fcc0, $vr0 -- bceqz $fcc0, L(loop) -+ vsetanyeqz.b fcc0, vr0 -+ bceqz fcc0, L(loop) - -- vmsknz.b $vr1, $vr0 -- movfr2gr.s t0, $f1 -+ vmsknz.b vr1, vr0 -+ movfr2gr.s t0, fa1 - cto.w t0, t0 - add.d a1, a1, t0 - -- vld $vr0, a1, -15 -+ vld vr0, a1, -15 - add.d a2, a2, t0 -- vst $vr0, a2, -15 -+ vst vr0, a2, -15 - jr ra - - L(end): -- vmsknz.b $vr1, $vr0 -- movfr2gr.s t0, $f1 -+ vmsknz.b vr1, vr0 -+ movfr2gr.s t0, fa1 - cto.w t0, t0 - addi.d t0, t0, 1 - -@@ -83,7 +83,7 @@ L(end): - L(end_16): - andi t1, t0, 16 - beqz t1, L(end_8) -- vst $vr0, a2, 0 -+ vst vr0, a2, 0 - jr ra - - L(end_8): -@@ -93,74 +93,74 @@ L(end_8): - andi t5, t0, 1 - - beqz t2, L(end_4) -- vstelm.d $vr0, a2, 0, 0 -+ vstelm.d vr0, a2, 0, 0 - addi.d a2, a2, 8 -- vbsrl.v $vr0, $vr0, 8 -+ vbsrl.v vr0, vr0, 8 - - L(end_4): - beqz t3, L(end_2) -- vstelm.w $vr0, a2, 0, 0 -+ vstelm.w vr0, a2, 0, 0 - addi.d a2, a2, 4 -- vbsrl.v $vr0, $vr0, 4 -+ vbsrl.v vr0, vr0, 4 - - - L(end_2): - beqz t4, L(end_1) -- vstelm.h $vr0, a2, 0, 0 -+ vstelm.h vr0, a2, 0, 0 - addi.d a2, a2, 2 -- vbsrl.v $vr0, $vr0, 2 -+ vbsrl.v vr0, vr0, 2 - - L(end_1): - beqz t5, L(out) -- vstelm.b $vr0, a2, 0, 0 -+ vstelm.b vr0, a2, 0, 0 - L(out): - jr ra - L(unaligned): - bstrins.d a1, zero, 3, 0 - -- vld $vr2, a1, 0 -- vreplgr2vr.b $vr3, a3 -- vslt.b $vr4, $vr1, $vr3 -- vor.v $vr0, $vr2, $vr4 -+ vld vr2, a1, 0 -+ vreplgr2vr.b vr3, a3 -+ vslt.b vr4, vr1, vr3 -+ vor.v vr0, vr2, vr4 - -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(un_first_end) -- vld $vr0, a1, 16 -- vadd.b $vr3, $vr3, $vr1 -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(un_first_end) -+ vld vr0, a1, 16 -+ vadd.b vr3, vr3, vr1 - - - addi.d a1, a1, 16 -- vshuf.b $vr4, $vr0, $vr2, $vr3 -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(un_end) -+ vshuf.b vr4, vr0, vr2, vr3 -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(un_end) - - L(un_loop): -- vor.v $vr2, $vr0, $vr0 -- vld $vr0, a1, 16 -- vst $vr4, a2, 0 -+ vor.v vr2, vr0, vr0 -+ vld vr0, a1, 16 -+ vst vr4, a2, 0 - addi.d a1, a1, 16 - - addi.d a2, a2, 16 -- vshuf.b $vr4, $vr0, $vr2, $vr3 -- vsetanyeqz.b $fcc0, $vr0 -- bceqz $fcc0, 
L(un_loop) -+ vshuf.b vr4, vr0, vr2, vr3 -+ vsetanyeqz.b fcc0, vr0 -+ bceqz fcc0, L(un_loop) - - L(un_end): -- vsetanyeqz.b $fcc0, $vr4 -- bcnez $fcc0, 1f -- vst $vr4, a2, 0 -+ vsetanyeqz.b fcc0, vr4 -+ bcnez fcc0, 1f -+ vst vr4, a2, 0 - 1: -- vmsknz.b $vr1, $vr0 -+ vmsknz.b vr1, vr0 - - -- movfr2gr.s t0, $f1 -+ movfr2gr.s t0, fa1 - cto.w t0, t0 - add.d a1, a1, t0 -- vld $vr0, a1, -15 -+ vld vr0, a1, -15 - - add.d a2, a2, t0 - sub.d a2, a2, a3 -- vst $vr0, a2, 1 -+ vst vr0, a2, 1 - jr ra - - L(un_first_end): -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -index fd6c002d..fc25dd50 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -@@ -17,12 +17,12 @@ LEAF(STRLEN, 6) - move a1, a0 - bstrins.d a0, zero, 4, 0 - li.d t1, -1 -- xvld $xr0, a0, 0 -+ xvld xr0, a0, 0 - -- xvmsknz.b $xr0, $xr0 -- xvpickve.w $xr1, $xr0, 4 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 # sign extend -+ xvmsknz.b xr0, xr0 -+ xvpickve.w xr1, xr0, 4 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 # sign extend - - sra.w t0, t0, a1 - beq t0, t1, L(loop) -@@ -30,18 +30,18 @@ LEAF(STRLEN, 6) - jr ra - - L(loop): -- xvld $xr0, a0, 32 -+ xvld xr0, a0, 32 - addi.d a0, a0, 32 -- xvsetanyeqz.b $fcc0, $xr0 -- bceqz $fcc0, L(loop) -+ xvsetanyeqz.b fcc0, xr0 -+ bceqz fcc0, L(loop) - - -- xvmsknz.b $xr0, $xr0 -+ xvmsknz.b xr0, xr0 - sub.d a0, a0, a1 -- xvpickve.w $xr1, $xr0, 4 -- vilvl.h $vr0, $vr1, $vr0 -+ xvpickve.w xr1, xr0, 4 -+ vilvl.h vr0, vr1, vr0 - -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - cto.w t0, t0 - add.d a0, a0, t0 - jr ra -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -index 6f311506..45c3db93 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -@@ -16,15 +16,15 @@ - LEAF(STRLEN, 6) - move a1, a0 - bstrins.d a0, zero, 4, 0 -- vld $vr0, a0, 0 -- vld $vr1, a0, 16 -+ vld vr0, a0, 0 -+ vld vr1, a0, 16 - - li.d t1, -1 -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 - -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - sra.w t0, t0, a1 - beq t0, t1, L(loop) - cto.w a0, t0 -@@ -36,19 +36,19 @@ LEAF(STRLEN, 6) - - - L(loop): -- vld $vr0, a0, 32 -- vld $vr1, a0, 48 -+ vld vr0, a0, 32 -+ vld vr1, a0, 48 - addi.d a0, a0, 32 -- vmin.bu $vr2, $vr0, $vr1 -+ vmin.bu vr2, vr0, vr1 - -- vsetanyeqz.b $fcc0, $vr2 -- bceqz $fcc0, L(loop) -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -+ vsetanyeqz.b fcc0, vr2 -+ bceqz fcc0, L(loop) -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 - -- vilvl.h $vr0, $vr1, $vr0 -+ vilvl.h vr0, vr1, vr0 - sub.d a0, a0, a1 -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - cto.w t0, t0 - - add.d a0, a0, t0 -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -index 2c6f9614..21f3e689 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -@@ -22,7 +22,7 @@ ENTRY_NO_ALIGN(STRNCMP) - beqz a2, L(ret0) - pcaddi t0, -5 - andi a3, a0, 0xf -- vld $vr2, t0, 0 -+ vld vr2, t0, 0 - - andi a4, a1, 0xf - li.d t2, 16 -@@ -30,57 +30,57 @@ ENTRY_NO_ALIGN(STRNCMP) - xor t0, a0, a3 - - xor t1, a1, a4 -- vld $vr0, t0, 0 -- vld $vr1, t1, 0 -- vreplgr2vr.b $vr3, a3 -+ vld vr0, t0, 0 -+ vld vr1, t1, 0 -+ vreplgr2vr.b vr3, a3 - - - sub.d t2, t2, a3 -- vadd.b $vr3, $vr3, $vr2 -- vshuf.b $vr0, 
$vr3, $vr0, $vr3 -- vshuf.b $vr1, $vr3, $vr1, $vr3 -+ vadd.b vr3, vr3, vr2 -+ vshuf.b vr0, vr3, vr0, vr3 -+ vshuf.b vr1, vr3, vr1, vr3 - -- vseq.b $vr3, $vr0, $vr1 -- vmin.bu $vr3, $vr0, $vr3 -+ vseq.b vr3, vr0, vr1 -+ vmin.bu vr3, vr0, vr3 - bgeu t2, a2, L(al_early_end) -- vsetanyeqz.b $fcc0, $vr3 -+ vsetanyeqz.b fcc0, vr3 - -- bcnez $fcc0, L(al_end) -+ bcnez fcc0, L(al_end) - add.d a3, a0, a2 - addi.d a4, a3, -1 - bstrins.d a4, zero, 3, 0 - - sub.d a2, a3, a4 - L(al_loop): -- vld $vr0, t0, 16 -- vld $vr1, t1, 16 -+ vld vr0, t0, 16 -+ vld vr1, t1, 16 - addi.d t0, t0, 16 - - - addi.d t1, t1, 16 -- vseq.b $vr3, $vr0, $vr1 -- vmin.bu $vr3, $vr0, $vr3 -+ vseq.b vr3, vr0, vr1 -+ vmin.bu vr3, vr0, vr3 - beq t0, a4, L(al_early_end) - -- vsetanyeqz.b $fcc0, $vr3 -- bceqz $fcc0, L(al_loop) -+ vsetanyeqz.b fcc0, vr3 -+ bceqz fcc0, L(al_loop) - L(al_end): -- vseqi.b $vr3, $vr3, 0 -- vfrstpi.b $vr3, $vr3, 0 -+ vseqi.b vr3, vr3, 0 -+ vfrstpi.b vr3, vr3, 0 - -- vshuf.b $vr0, $vr0, $vr0, $vr3 -- vshuf.b $vr1, $vr1, $vr1, $vr3 -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vshuf.b vr0, vr0, vr0, vr3 -+ vshuf.b vr1, vr1, vr1, vr3 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - - sub.d a0, t0, t1 - jr ra - L(al_early_end): -- vreplgr2vr.b $vr4, a2 -- vslt.b $vr4, $vr2, $vr4 -+ vreplgr2vr.b vr4, a2 -+ vslt.b vr4, vr2, vr4 - - -- vorn.v $vr3, $vr3, $vr4 -+ vorn.v vr3, vr3, vr4 - b L(al_end) - L(unaligned): - slt a5, a3, a4 -@@ -94,64 +94,64 @@ L(unaligned): - andi a4, a1, 0xf - xor t0, a0, a3 - xor t1, a1, a4 -- vld $vr0, t0, 0 -+ vld vr0, t0, 0 - -- vld $vr3, t1, 0 -+ vld vr3, t1, 0 - sub.d t2, t2, a3 -- vreplgr2vr.b $vr4, a3 -- vreplgr2vr.b $vr5, a4 -+ vreplgr2vr.b vr4, a3 -+ vreplgr2vr.b vr5, a4 - - -- vaddi.bu $vr6, $vr2, 16 -- vsub.b $vr7, $vr4, $vr5 -- vsub.b $vr6, $vr6, $vr7 -- vadd.b $vr4, $vr2, $vr4 -+ vaddi.bu vr6, vr2, 16 -+ vsub.b vr7, vr4, vr5 -+ vsub.b vr6, vr6, vr7 -+ vadd.b vr4, vr2, vr4 - -- vshuf.b $vr1, $vr3, $vr3, $vr6 -- vshuf.b $vr0, $vr7, $vr0, $vr4 -- vshuf.b $vr1, $vr7, $vr1, $vr4 -- vseq.b $vr4, $vr0, $vr1 -+ vshuf.b vr1, vr3, vr3, vr6 -+ vshuf.b vr0, vr7, vr0, vr4 -+ vshuf.b vr1, vr7, vr1, vr4 -+ vseq.b vr4, vr0, vr1 - -- vmin.bu $vr4, $vr0, $vr4 -+ vmin.bu vr4, vr0, vr4 - bgeu t2, a2, L(un_early_end) -- vsetanyeqz.b $fcc0, $vr4 -- bcnez $fcc0, L(un_end) -+ vsetanyeqz.b fcc0, vr4 -+ bcnez fcc0, L(un_end) - - add.d a6, a0, a2 -- vslt.b $vr5, $vr2, $vr5 -+ vslt.b vr5, vr2, vr5 - addi.d a7, a6, -1 -- vor.v $vr3, $vr3, $vr5 -+ vor.v vr3, vr3, vr5 - - - bstrins.d a7, zero, 3, 0 - sub.d a2, a6, a7 - L(un_loop): -- vld $vr0, t0, 16 -+ vld vr0, t0, 16 - addi.d t0, t0, 16 - -- vsetanyeqz.b $fcc0, $vr3 -- bcnez $fcc0, L(has_zero) -+ vsetanyeqz.b fcc0, vr3 -+ bcnez fcc0, L(has_zero) - beq t0, a7, L(end_with_len) -- vor.v $vr1, $vr3, $vr3 -+ vor.v vr1, vr3, vr3 - -- vld $vr3, t1, 16 -+ vld vr3, t1, 16 - addi.d t1, t1, 16 -- vshuf.b $vr1, $vr3, $vr1, $vr6 -- vseq.b $vr4, $vr0, $vr1 -+ vshuf.b vr1, vr3, vr1, vr6 -+ vseq.b vr4, vr0, vr1 - -- vmin.bu $vr4, $vr0, $vr4 -- vsetanyeqz.b $fcc0, $vr4 -- bceqz $fcc0, L(un_loop) -+ vmin.bu vr4, vr0, vr4 -+ vsetanyeqz.b fcc0, vr4 -+ bceqz fcc0, L(un_loop) - L(un_end): -- vseqi.b $vr4, $vr4, 0 -+ vseqi.b vr4, vr4, 0 - - -- vfrstpi.b $vr4, $vr4, 0 -- vshuf.b $vr0, $vr0, $vr0, $vr4 -- vshuf.b $vr1, $vr1, $vr1, $vr4 -- vpickve2gr.bu t0, $vr0, 0 -+ vfrstpi.b vr4, vr4, 0 -+ vshuf.b vr0, vr0, vr0, vr4 -+ vshuf.b vr1, vr1, vr1, vr4 -+ vpickve2gr.bu t0, vr0, 0 - -- vpickve2gr.bu t1, $vr1, 0 -+ vpickve2gr.bu t1, vr1, 0 - sub.d t2, t0, t1 - 
sub.d t3, t1, t0 - masknez t0, t2, a5 -@@ -160,30 +160,30 @@ L(un_end): - or a0, t0, t1 - jr ra - L(has_zero): -- vshuf.b $vr1, $vr3, $vr3, $vr6 -+ vshuf.b vr1, vr3, vr3, vr6 - -- vseq.b $vr4, $vr0, $vr1 -- vmin.bu $vr4, $vr0, $vr4 -+ vseq.b vr4, vr0, vr1 -+ vmin.bu vr4, vr0, vr4 - bne t0, a7, L(un_end) - L(un_early_end): -- vreplgr2vr.b $vr5, a2 -+ vreplgr2vr.b vr5, a2 - -- vslt.b $vr5, $vr2, $vr5 -- vorn.v $vr4, $vr4, $vr5 -+ vslt.b vr5, vr2, vr5 -+ vorn.v vr4, vr4, vr5 - b L(un_end) - L(end_with_len): - sub.d a6, a3, a4 - - bgeu a6, a2, 1f -- vld $vr4, t1, 16 -+ vld vr4, t1, 16 - 1: -- vshuf.b $vr1, $vr4, $vr3, $vr6 -- vseq.b $vr4, $vr0, $vr1 -+ vshuf.b vr1, vr4, vr3, vr6 -+ vseq.b vr4, vr0, vr1 - -- vmin.bu $vr4, $vr0, $vr4 -- vreplgr2vr.b $vr5, a2 -- vslt.b $vr5, $vr2, $vr5 -- vorn.v $vr4, $vr4, $vr5 -+ vmin.bu vr4, vr0, vr4 -+ vreplgr2vr.b vr5, a2 -+ vslt.b vr5, vr2, vr5 -+ vorn.v vr4, vr4, vr5 - - b L(un_end) - L(ret0): -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -index 910b52fe..6410a907 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -@@ -19,23 +19,23 @@ LEAF(STRNLEN, 6) - li.d t3, 65 - sub.d a2, a0, t1 - -- xvld $xr0, a2, 0 -- xvld $xr1, a2, 32 -+ xvld xr0, a2, 0 -+ xvld xr1, a2, 32 - sub.d t1, t3, t1 - move a3, a0 - - sltu t1, a1, t1 -- xvmsknz.b $xr0, $xr0 -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr2, $xr0, 4 -+ xvmsknz.b xr0, xr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr2, xr0, 4 - -- xvpickve.w $xr3, $xr1, 4 -- vilvl.h $vr0, $vr2, $vr0 -- vilvl.h $vr1, $vr3, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -+ xvpickve.w xr3, xr1, 4 -+ vilvl.h vr0, vr2, vr0 -+ vilvl.h vr1, vr3, vr1 -+ vilvl.w vr0, vr1, vr0 - - -- movfr2gr.d t0, $f0 -+ movfr2gr.d t0, fa0 - sra.d t0, t0, a0 - orn t1, t1, t0 - bnez t1, L(end) -@@ -46,26 +46,26 @@ LEAF(STRNLEN, 6) - bstrins.d a4, zero, 5, 0 - - L(loop): -- xvld $xr0, a0, 64 -- xvld $xr1, a0, 96 -+ xvld xr0, a0, 64 -+ xvld xr1, a0, 96 - addi.d a0, a0, 64 - beq a0, a4, L(out) - -- xvmin.bu $xr2, $xr0, $xr1 -- xvsetanyeqz.b $fcc0, $xr2 -- bceqz $fcc0, L(loop) -+ xvmin.bu xr2, xr0, xr1 -+ xvsetanyeqz.b fcc0, xr2 -+ bceqz fcc0, L(loop) - L(out): -- xvmsknz.b $xr0, $xr0 -+ xvmsknz.b xr0, xr0 - - -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr2, $xr0, 4 -- xvpickve.w $xr3, $xr1, 4 -- vilvl.h $vr0, $vr2, $vr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr2, xr0, 4 -+ xvpickve.w xr3, xr1, 4 -+ vilvl.h vr0, vr2, vr0 - -- vilvl.h $vr1, $vr3, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -- movfr2gr.d t0, $f0 -+ vilvl.h vr1, vr3, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - L(end): - sub.d a0, a0, a3 - -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -index db0e90ff..9250a0cd 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -@@ -19,17 +19,17 @@ LEAF(STRNLEN, 6) - li.d t3, 33 - sub.d a2, a0, t1 - -- vld $vr0, a2, 0 -- vld $vr1, a2, 16 -+ vld vr0, a2, 0 -+ vld vr1, a2, 16 - sub.d t1, t3, t1 - move a3, a0 - - sltu t1, a1, t1 -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 - -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - sra.w t0, t0, a0 - orn t1, t1, t0 - bnez t1, L(end) -@@ -41,20 +41,20 @@ LEAF(STRNLEN, 6) - bstrins.d a4, zero, 4, 0 - - L(loop): -- vld $vr0, a0, 32 -- vld $vr1, a0, 48 -+ vld vr0, a0, 32 -+ vld vr1, a0, 48 - addi.d a0, a0, 32 - beq 
a0, a4, L(out) - -- vmin.bu $vr2, $vr0, $vr1 -- vsetanyeqz.b $fcc0, $vr2 -- bceqz $fcc0, L(loop) -+ vmin.bu vr2, vr0, vr1 -+ vsetanyeqz.b fcc0, vr2 -+ bceqz fcc0, L(loop) - L(out): -- vmsknz.b $vr0, $vr0 -+ vmsknz.b vr0, vr0 - -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - L(end): - sub.d a0, a0, a3 - -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -index 325458ff..990be973 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -@@ -14,45 +14,45 @@ - LEAF(STRRCHR, 6) - andi t1, a0, 0x3f - bstrins.d a0, zero, 5, 0 -- xvld $xr0, a0, 0 -- xvld $xr1, a0, 32 -+ xvld xr0, a0, 0 -+ xvld xr1, a0, 32 - - li.d t2, -1 -- xvreplgr2vr.b $xr4, a1 -+ xvreplgr2vr.b xr4, a1 - move a2, zero - sll.d t3, t2, t1 - - addi.d a0, a0, 63 -- xvseq.b $xr2, $xr0, $xr4 -- xvseq.b $xr3, $xr1, $xr4 -- xvmsknz.b $xr0, $xr0 -+ xvseq.b xr2, xr0, xr4 -+ xvseq.b xr3, xr1, xr4 -+ xvmsknz.b xr0, xr0 - -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr5, $xr0, 4 -- xvpickve.w $xr6, $xr1, 4 -- vilvl.h $vr0, $vr5, $vr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr5, xr0, 4 -+ xvpickve.w xr6, xr1, 4 -+ vilvl.h vr0, vr5, vr0 - - -- vilvl.h $vr1, $vr6, $vr1 -- xvmsknz.b $xr2, $xr2 -- xvmsknz.b $xr3, $xr3 -- xvpickve.w $xr5, $xr2, 4 -+ vilvl.h vr1, vr6, vr1 -+ xvmsknz.b xr2, xr2 -+ xvmsknz.b xr3, xr3 -+ xvpickve.w xr5, xr2, 4 - -- xvpickve.w $xr6, $xr3, 4 -- vilvl.h $vr2, $vr5, $vr2 -- vilvl.h $vr3, $vr6, $vr3 -- vilvl.w $vr0, $vr1, $vr0 -+ xvpickve.w xr6, xr3, 4 -+ vilvl.h vr2, vr5, vr2 -+ vilvl.h vr3, vr6, vr3 -+ vilvl.w vr0, vr1, vr0 - -- vilvl.w $vr1, $vr3, $vr2 -- movfr2gr.d t0, $f0 -- movfr2gr.d t1, $f1 -+ vilvl.w vr1, vr3, vr2 -+ movfr2gr.d t0, fa0 -+ movfr2gr.d t1, fa1 - orn t0, t0, t3 - - and t1, t1, t3 - bne t0, t2, L(end) - L(loop): -- xvld $xr0, a0, 1 -- xvld $xr1, a0, 33 -+ xvld xr0, a0, 1 -+ xvld xr1, a0, 33 - - - clz.d t0, t1 -@@ -62,33 +62,33 @@ L(loop): - - masknez t1, a2, t1 - or a2, t0, t1 -- xvseq.b $xr2, $xr0, $xr4 -- xvseq.b $xr3, $xr1, $xr4 -+ xvseq.b xr2, xr0, xr4 -+ xvseq.b xr3, xr1, xr4 - -- xvmsknz.b $xr2, $xr2 -- xvmsknz.b $xr3, $xr3 -- xvpickve.w $xr5, $xr2, 4 -- xvpickve.w $xr6, $xr3, 4 -+ xvmsknz.b xr2, xr2 -+ xvmsknz.b xr3, xr3 -+ xvpickve.w xr5, xr2, 4 -+ xvpickve.w xr6, xr3, 4 - -- vilvl.h $vr2, $vr5, $vr2 -- vilvl.h $vr3, $vr6, $vr3 -- xvmin.bu $xr5, $xr0, $xr1 -- vilvl.w $vr2, $vr3, $vr2 -+ vilvl.h vr2, vr5, vr2 -+ vilvl.h vr3, vr6, vr3 -+ xvmin.bu xr5, xr0, xr1 -+ vilvl.w vr2, vr3, vr2 - - -- xvsetanyeqz.b $fcc0, $xr5 -- movfr2gr.d t1, $f2 -- bceqz $fcc0, L(loop) -- xvmsknz.b $xr0, $xr0 -+ xvsetanyeqz.b fcc0, xr5 -+ movfr2gr.d t1, fa2 -+ bceqz fcc0, L(loop) -+ xvmsknz.b xr0, xr0 - -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr5, $xr0, 4 -- xvpickve.w $xr6, $xr1, 4 -- vilvl.h $vr0, $vr5, $vr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr5, xr0, 4 -+ xvpickve.w xr6, xr1, 4 -+ vilvl.h vr0, vr5, vr0 - -- vilvl.h $vr1, $vr6, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -- movfr2gr.d t0, $f0 -+ vilvl.h vr1, vr6, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - L(end): - slli.d t3, t2, 1 # shift one more for the last '\0' - -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S -index e082eaab..6aede6ae 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S -@@ -14,35 +14,35 @@ - LEAF(STRRCHR, 6) - andi t1, 
a0, 0x1f - bstrins.d a0, zero, 4, 0 -- vld $vr0, a0, 0 -- vld $vr1, a0, 16 -+ vld vr0, a0, 0 -+ vld vr1, a0, 16 - -- vreplgr2vr.b $vr4, a1 -+ vreplgr2vr.b vr4, a1 - li.d t2, -1 - move a2, zero - addi.d a0, a0, 31 - -- vseq.b $vr2, $vr0, $vr4 -- vseq.b $vr3, $vr1, $vr4 -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -+ vseq.b vr2, vr0, vr4 -+ vseq.b vr3, vr1, vr4 -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 - -- vmsknz.b $vr2, $vr2 -- vmsknz.b $vr3, $vr3 -- vilvl.h $vr0, $vr1, $vr0 -- vilvl.h $vr1, $vr3, $vr2 -+ vmsknz.b vr2, vr2 -+ vmsknz.b vr3, vr3 -+ vilvl.h vr0, vr1, vr0 -+ vilvl.h vr1, vr3, vr2 - - -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - sll.d t3, t2, t1 -- movfr2gr.s t1, $f1 -+ movfr2gr.s t1, fa1 - orn t0, t0, t3 - - and t1, t1, t3 - bne t0, t2, L(end) - L(loop): -- vld $vr0, a0, 1 -- vld $vr1, a0, 17 -+ vld vr0, a0, 1 -+ vld vr1, a0, 17 - - clz.w t0, t1 - sub.d t0, a0, t0 -@@ -51,23 +51,23 @@ L(loop): - - masknez t1, a2, t1 - or a2, t0, t1 -- vseq.b $vr2, $vr0, $vr4 -- vseq.b $vr3, $vr1, $vr4 -+ vseq.b vr2, vr0, vr4 -+ vseq.b vr3, vr1, vr4 - - -- vmsknz.b $vr2, $vr2 -- vmsknz.b $vr3, $vr3 -- vmin.bu $vr5, $vr0, $vr1 -- vilvl.h $vr2, $vr3, $vr2 -+ vmsknz.b vr2, vr2 -+ vmsknz.b vr3, vr3 -+ vmin.bu vr5, vr0, vr1 -+ vilvl.h vr2, vr3, vr2 - -- vsetanyeqz.b $fcc0, $vr5 -- movfr2gr.s t1, $f2 -- bceqz $fcc0, L(loop) -- vmsknz.b $vr0, $vr0 -+ vsetanyeqz.b fcc0, vr5 -+ movfr2gr.s t1, fa2 -+ bceqz fcc0, L(loop) -+ vmsknz.b vr0, vr0 - -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - L(end): - slli.d t3, t2, 1 # shift one more for the last '\0' - -diff --git a/sysdeps/loongarch/lp64/s_cosf.S b/sysdeps/loongarch/lp64/s_cosf.S -index 9fcbe6ca..cb3a4faa 100644 ---- a/sysdeps/loongarch/lp64/s_cosf.S -+++ b/sysdeps/loongarch/lp64/s_cosf.S -@@ -213,9 +213,9 @@ L_even_integer: - fadd.d fa0, fa0, fa1 - fadd.d fa2, fa2, fa3 - fadd.d fa0, fa0, fa2 -- fcmp.sle.d $fcc0, fa0, fa5 -+ fcmp.sle.d fcc0, fa0, fa5 - addi.d t0, t0, 3 -- bcnez $fcc0, L_leq_one -+ bcnez fcc0, L_leq_one - /*L_gt_one:*/ - fld.d fa2, t1, 16 /* 2.0 */ - addi.d t0, t0, 1 -diff --git a/sysdeps/loongarch/lp64/s_sinf.S b/sysdeps/loongarch/lp64/s_sinf.S -index 45d1c4b5..1e77282d 100644 ---- a/sysdeps/loongarch/lp64/s_sinf.S -+++ b/sysdeps/loongarch/lp64/s_sinf.S -@@ -215,9 +215,9 @@ L_even_integer: - fadd.d fa0, fa0, fa1 - fadd.d fa2, fa2, fa3 - fadd.d fa0, fa0, fa2 -- fcmp.sle.d $fcc0, fa0, fa5 -+ fcmp.sle.d fcc0, fa0, fa5 - addi.d t0, t0, 1 -- bcnez $fcc0, L_leq_one -+ bcnez fcc0, L_leq_one - /*L_gt_one:*/ - fld.d fa2, t1, 16 /* 2.0 */ - addi.d t0, t0, 1 -diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h -index 36f00939..b5ee57cf 100644 ---- a/sysdeps/loongarch/sys/regdef.h -+++ b/sysdeps/loongarch/sys/regdef.h -@@ -71,6 +71,14 @@ - # define fs5 $f29 - # define fs6 $f30 - # define fs7 $f31 -+# define fcc0 $fcc0 -+# define fcc1 $fcc1 -+# define fcc2 $fcc2 -+# define fcc3 $fcc3 -+# define fcc4 $fcc4 -+# define fcc5 $fcc5 -+# define fcc6 $fcc6 -+# define fcc7 $fcc7 - - #elif _LOONGARCH_SIM == _ABILP32 - # error ABILP32 not support yet -@@ -78,4 +86,70 @@ - # error noABI - #endif - -+#define vr0 $vr0 -+#define vr1 $vr1 -+#define vr2 $vr2 -+#define vr3 $vr3 -+#define vr4 $vr4 -+#define vr5 $vr5 -+#define vr6 $vr6 -+#define vr7 $vr7 -+#define vr8 $vr8 -+#define vr9 $vr9 -+#define vr10 $vr10 -+#define vr11 $vr11 -+#define vr12 $vr12 -+#define vr13 $vr13 -+#define vr14 $vr14 -+#define vr15 $vr15 -+#define vr16 $vr16 -+#define 
vr17 $vr17 -+#define vr18 $vr18 -+#define vr19 $vr19 -+#define vr20 $vr20 -+#define vr21 $vr21 -+#define vr22 $vr22 -+#define vr23 $vr23 -+#define vr24 $vr24 -+#define vr25 $vr25 -+#define vr26 $vr26 -+#define vr27 $vr27 -+#define vr28 $vr28 -+#define vr29 $vr29 -+#define vr30 $vr30 -+#define vr31 $vr31 -+ -+#define xr0 $xr0 -+#define xr1 $xr1 -+#define xr2 $xr2 -+#define xr3 $xr3 -+#define xr4 $xr4 -+#define xr5 $xr5 -+#define xr6 $xr6 -+#define xr7 $xr7 -+#define xr8 $xr8 -+#define xr9 $xr9 -+#define xr10 $xr10 -+#define xr11 $xr11 -+#define xr12 $xr12 -+#define xr13 $xr13 -+#define xr14 $xr14 -+#define xr15 $xr15 -+#define xr16 $xr16 -+#define xr17 $xr17 -+#define xr18 $xr18 -+#define xr19 $xr19 -+#define xr20 $xr20 -+#define xr21 $xr21 -+#define xr22 $xr22 -+#define xr23 $xr23 -+#define xr24 $xr24 -+#define xr25 $xr25 -+#define xr26 $xr26 -+#define xr27 $xr27 -+#define xr28 $xr28 -+#define xr29 $xr29 -+#define xr30 $xr30 -+#define xr31 $xr31 -+ - #endif /* _SYS_REGDEF_H */ --- -2.33.0 - diff --git a/glibc-2.28-Add-new-struct-user_fp_state-in-user.h.patch b/glibc-2.28-Add-new-struct-user_fp_state-in-user.h.patch deleted file mode 100644 index b7ae1ad..0000000 --- a/glibc-2.28-Add-new-struct-user_fp_state-in-user.h.patch +++ /dev/null @@ -1,29 +0,0 @@ -From dc2d26d52c129c47fa1f16bd0157cd20c6d9a958 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Wed, 21 Jun 2023 11:55:02 +0800 -Subject: [PATCH 08/14] glibc-2.28: Add new struct user_fp_state in user.h - -Change-Id: Idc233cc11c8f76b624dc2891b432f4d02a53cebc -Signed-off-by: ticat_fp ---- - sysdeps/unix/sysv/linux/loongarch/sys/user.h | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/sysdeps/unix/sysv/linux/loongarch/sys/user.h b/sysdeps/unix/sysv/linux/loongarch/sys/user.h -index f9108350..21e340f6 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/sys/user.h -+++ b/sysdeps/unix/sysv/linux/loongarch/sys/user.h -@@ -28,4 +28,10 @@ struct user_regs_struct - uint64_t reserved[11]; - }; - -+struct user_fp_struct { -+ uint64_t fpr[32]; -+ uint64_t fcc; -+ uint32_t fcsr; -+}; -+ - #endif /* _SYS_USER_H */ --- -2.33.0 - diff --git a/glibc-2.28-Fix-ifunc-str-mem-functions-xfail-problem.patch b/glibc-2.28-Fix-ifunc-str-mem-functions-xfail-problem.patch deleted file mode 100644 index ff87ba3..0000000 --- a/glibc-2.28-Fix-ifunc-str-mem-functions-xfail-problem.patch +++ /dev/null @@ -1,162 +0,0 @@ -From 647a0a28e5c9aed2f1fa59bbb7595133e7a4e62f Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Mon, 24 Apr 2023 18:09:55 +0800 -Subject: [PATCH 03/14] glibc-2.28: Fix ifunc str/mem functions xfail problems. 
- -Change-Id: Ibff4229fcfef23c0b19fb94b21a4d17b49eceec6 -Signed-off-by: ticat_fp ---- - .../lp64/multiarch/ifunc-impl-list.c | 76 +++++++++---------- - 1 file changed, 38 insertions(+), 38 deletions(-) - -diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c -index c2b6bbf7..fdeae797 100644 ---- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c -+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c -@@ -36,105 +36,105 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, - size_t i = 0; - - IFUNC_IMPL (i, name, memcpy, -- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_lasx) -- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx) -+ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LSX, __memcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_UAL, __memcpy_unaligned) - IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_aligned) -- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_unaligned) - ) - - IFUNC_IMPL (i, name, memmove, -- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_lasx) -- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_lsx) -+ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_LASX, __memmove_lasx) -+ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_LSX, __memmove_lsx) -+ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_UAL, __memmove_unaligned) - IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_aligned) -- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_unaligned) - ) - - IFUNC_IMPL (i, name, memset, -- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_lasx) -- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_lsx) -+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LASX, __memset_lasx) -+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LSX, __memset_lsx) -+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_UAL, __memset_unaligned) - IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned) -- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_unaligned) - ) - - IFUNC_IMPL (i, name, memchr, -- IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_lasx) -- IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_lsx) -+ IFUNC_IMPL_ADD (array, i, memchr, SUPPORT_LASX, __memchr_lasx) -+ IFUNC_IMPL_ADD (array, i, memchr, SUPPORT_LSX, __memchr_lsx) - IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_aligned) - ) - - IFUNC_IMPL (i, name, memrchr, -- IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_lasx) -- IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_lsx) -+ IFUNC_IMPL_ADD (array, i, memrchr, SUPPORT_LASX, __memrchr_lasx) -+ IFUNC_IMPL_ADD (array, i, memrchr, SUPPORT_LSX, __memrchr_lsx) - IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_generic) - ) - - IFUNC_IMPL (i, name, memcmp, -- IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_lasx) -- IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_lsx) -+ IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LASX, __memcmp_lasx) -+ IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LSX, __memcmp_lsx) - IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_aligned) - ) - - IFUNC_IMPL (i, name, rawmemchr, -- IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_lasx) -- IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_lsx) -+ IFUNC_IMPL_ADD (array, i, rawmemchr, SUPPORT_LASX, __rawmemchr_lasx) -+ IFUNC_IMPL_ADD (array, i, rawmemchr, SUPPORT_LSX, __rawmemchr_lsx) - IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_aligned) - ) - - IFUNC_IMPL (i, name, strchr, -- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_lasx) -- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_lsx) -+ IFUNC_IMPL_ADD (array, i, strchr, 
SUPPORT_LASX, __strchr_lasx) -+ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LSX, __strchr_lsx) -+ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_UAL, __strchr_unaligned) - IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_aligned) -- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_unaligned) - ) - - IFUNC_IMPL (i, name, strrchr, -- IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_lasx) -- IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_lsx) -+ IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LASX, __strrchr_lasx) -+ IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LSX, __strrchr_lsx) - IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_aligned) - ) - - IFUNC_IMPL (i, name, strlen, -- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_lasx) -- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_lsx) -+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, __strlen_lasx) -+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx) -+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_UAL, __strlen_unaligned) - IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned) -- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_unaligned) - ) - - IFUNC_IMPL (i, name, strnlen, -- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_lasx) -- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_lsx) -+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LASX, __strnlen_lasx) -+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LSX, __strnlen_lsx) -+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_UAL, __strnlen_unaligned) - IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_aligned) -- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_unaligned) - ) - - IFUNC_IMPL (i, name, strchrnul, -- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_lasx) -- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_lsx) -+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_LASX, __strchrnul_lasx) -+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_LSX, __strchrnul_lsx) -+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_UAL, __strchrnul_unaligned) - IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_aligned) -- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_unaligned) - ) - - IFUNC_IMPL (i, name, strncmp, -- IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_lsx) -+ IFUNC_IMPL_ADD (array, i, strncmp, SUPPORT_LSX, __strncmp_lsx) -+ IFUNC_IMPL_ADD (array, i, strncmp, SUPPORT_UAL, __strncmp_unaligned) - IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_aligned) -- IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_unaligned) - ) - - IFUNC_IMPL (i, name, strcpy, -- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_LSX, __strcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_UAL, __strcpy_unaligned) - IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_aligned) -- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_unaligned) - ) - - IFUNC_IMPL (i, name, stpcpy, -- IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, stpcpy, SUPPORT_LSX, __stpcpy_lsx) - IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_aligned) - ) - - IFUNC_IMPL (i, name, strcmp, -- IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_lsx) -+ IFUNC_IMPL_ADD (array, i, strcmp, SUPPORT_LSX, __strcmp_lsx) -+ IFUNC_IMPL_ADD (array, i, strcmp, SUPPORT_UAL, __strcmp_unaligned) - IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_aligned) -- IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_unaligned) - ) - - return i; --- -2.33.0 - diff --git a/glibc-2.28-Redefine-macro-LEAF-ENTRY.patch b/glibc-2.28-Redefine-macro-LEAF-ENTRY.patch deleted file mode 100644 index 42b4200..0000000 --- a/glibc-2.28-Redefine-macro-LEAF-ENTRY.patch 
+++ /dev/null @@ -1,57 +0,0 @@ -From 00537d6945e71af8c9b0b1e7c2695f6a9a1ef1f5 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Sun, 25 Jun 2023 16:23:25 +0800 -Subject: [PATCH 09/14] glibc-2.28: Redefine macro LEAF/ENTRY. - - The following usage of macro LEAF/ENTRY are all feasible: - 1. LEAF(fcn) -- the align value of fcn is .align 3 (default value) - 2. LEAF(fcn, 6) -- the align value of fcn is .align 6 - -Change-Id: Ie3df4df8dba5259b665bd0e4702aaab0a09a5f65 -Signed-off-by: ticat_fp ---- - sysdeps/loongarch/sys/asm.h | 15 ++++++++++----- - 1 file changed, 10 insertions(+), 5 deletions(-) - -diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h -index 357a5ba3..734e45ae 100644 ---- a/sysdeps/loongarch/sys/asm.h -+++ b/sysdeps/loongarch/sys/asm.h -@@ -26,16 +26,21 @@ - #endif - - --/* Declare leaf routine. */ --#define LEAF(symbol, aln) \ -+/* Declare leaf routine. -+ The usage of macro LEAF/ENTRY is as follows: -+ 1. LEAF(fcn) -- the align value of fcn is .align 3 (default value) -+ 2. LEAF(fcn, 6) -- the align value of fcn is .align 6 -+*/ -+#define LEAF_IMPL(symbol, aln, ...) \ - .text; \ - .globl symbol; \ - .align aln; \ - .type symbol, @function; \ - symbol: \ -- cfi_startproc; \ -+ cfi_startproc; - --# define ENTRY(symbol, aln) LEAF(symbol, aln) -+#define LEAF(...) LEAF_IMPL(__VA_ARGS__, 3) -+#define ENTRY(...) LEAF(__VA_ARGS__) - - #define LEAF_NO_ALIGN(symbol) \ - .text; \ -@@ -44,7 +49,7 @@ symbol: \ - symbol: \ - cfi_startproc; - --# define ENTRY_NO_ALIGN(symbol) LEAF_NO_ALIGN(symbol) -+#define ENTRY_NO_ALIGN(symbol) LEAF_NO_ALIGN(symbol) - - /* Mark end of function. */ - #undef END --- -2.33.0 - diff --git a/glibc-2.28-Refactor-code-and-fix-bug-in-_dl_runtime_.patch b/glibc-2.28-Refactor-code-and-fix-bug-in-_dl_runtime_.patch deleted file mode 100644 index 075149b..0000000 --- a/glibc-2.28-Refactor-code-and-fix-bug-in-_dl_runtime_.patch +++ /dev/null @@ -1,306 +0,0 @@ -From 27a004c9777340afd86fc0d129f6ffad508bf090 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Tue, 11 Jul 2023 16:09:55 +0800 -Subject: [PATCH 12/14] glibc-2.28: Refactor code and fix bug in - _dl_runtime_resolve. - -Change-Id: I4907e6643ef25b87d7862e957ce9bf6d201da816 -Signed-off-by: ticat_fp ---- - sysdeps/loongarch/dl-machine.h | 8 +- - sysdeps/loongarch/dl-trampoline.S | 7 ++ - sysdeps/loongarch/dl-trampoline.h | 159 +++++++++++++----------------- - sysdeps/loongarch/sys/asm.h | 9 ++ - 4 files changed, 90 insertions(+), 93 deletions(-) - -diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h -index 6e9c6258..ff520a07 100644 ---- a/sysdeps/loongarch/dl-machine.h -+++ b/sysdeps/loongarch/dl-machine.h -@@ -381,9 +381,13 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], - /* If using PLTs, fill in the first two entries of .got.plt. */ - if (l->l_info[DT_JMPREL]) - { -- extern void _dl_runtime_resolve (void) __attribute__ ((visibility ("hidden"))); -+ -+#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float - extern void _dl_runtime_resolve_lasx (void) __attribute__ ((visibility ("hidden"))); - extern void _dl_runtime_resolve_lsx (void) __attribute__ ((visibility ("hidden"))); -+#endif -+ extern void _dl_runtime_resolve (void) __attribute__ ((visibility ("hidden"))); -+ - ElfW(Addr) *gotplt = (ElfW(Addr) *) D_PTR (l, l_info[DT_PLTGOT]); - /* If a library is prelinked but we have to relocate anyway, - we have to be able to undo the prelinking of .got.plt. 
-@@ -391,11 +395,13 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], - if (gotplt[1]) - l->l_mach.plt = gotplt[1] + l->l_addr; - -+#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float - if (SUPPORT_LASX) - gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx; - else if (SUPPORT_LSX) - gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx; - else -+#endif - gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve; - - gotplt[1] = (ElfW(Addr)) l; -diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S -index 5f627a63..78d741f3 100644 ---- a/sysdeps/loongarch/dl-trampoline.S -+++ b/sysdeps/loongarch/dl-trampoline.S -@@ -16,16 +16,23 @@ - License along with the GNU C Library. If not, see - . */ - -+#include -+#include -+ -+#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float - #define USE_LASX - #define _dl_runtime_resolve _dl_runtime_resolve_lasx - #include "dl-trampoline.h" -+#undef FRAME_SIZE - #undef USE_LASX - #undef _dl_runtime_resolve - - #define USE_LSX - #define _dl_runtime_resolve _dl_runtime_resolve_lsx - #include "dl-trampoline.h" -+#undef FRAME_SIZE - #undef USE_LSX - #undef _dl_runtime_resolve -+#endif - - #include "dl-trampoline.h" -diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h -index 96f41f1d..9a6d9b6c 100644 ---- a/sysdeps/loongarch/dl-trampoline.h -+++ b/sysdeps/loongarch/dl-trampoline.h -@@ -17,31 +17,24 @@ - License along with the GNU C Library. If not, see - . */ - --#include --#include -- - /* Assembler veneer called from the PLT header code for lazy loading. - The PLT header passes its own args in t0-t2. */ -- --#ifdef __loongarch_soft_float --# define FRAME_SIZE (-((-10 * SZREG) & ALMASK)) -+#ifdef USE_LASX -+# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG - 8 * SZXREG) & ALMASK)) -+#elif defined USE_LSX -+# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG - 8 * SZVREG) & ALMASK)) -+#elif !defined __loongarch_soft_float -+# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG) & ALMASK)) - #else --# define FRAME_SIZE (-((-10 * SZREG - 8 * 256) & ALMASK)) -+# define FRAME_SIZE (-((-9 * SZREG) & ALMASK)) - #endif - - ENTRY (_dl_runtime_resolve, 3) -- # Save arguments to stack. -- --#ifdef __loongarch64 -- li.d t3, -FRAME_SIZE -- add.d sp, sp, t3 --#elif defined __loongarch32 -- li.w t3, -FRAME_SIZE -- add.w sp, sp, t3 --#endif - -+ /* Save arguments to stack. 
*/ -+ ADDI sp, sp, -FRAME_SIZE - -- REG_S ra, sp, 9*SZREG -+ REG_S ra, sp, 0*SZREG - REG_S a0, sp, 1*SZREG - REG_S a1, sp, 2*SZREG - REG_S a2, sp, 3*SZREG -@@ -51,55 +44,45 @@ ENTRY (_dl_runtime_resolve, 3) - REG_S a6, sp, 7*SZREG - REG_S a7, sp, 8*SZREG - --#ifndef __loongarch_soft_float -- FREG_S fa0, sp, 10*SZREG + 0*SZFREG -- FREG_S fa1, sp, 10*SZREG + 1*SZFREG -- FREG_S fa2, sp, 10*SZREG + 2*SZFREG -- FREG_S fa3, sp, 10*SZREG + 3*SZFREG -- FREG_S fa4, sp, 10*SZREG + 4*SZFREG -- FREG_S fa5, sp, 10*SZREG + 5*SZFREG -- FREG_S fa6, sp, 10*SZREG + 6*SZFREG -- FREG_S fa7, sp, 10*SZREG + 7*SZFREG - #ifdef USE_LASX -- xvst xr0, sp, 10*SZREG + 0*256 -- xvst xr1, sp, 10*SZREG + 1*256 -- xvst xr2, sp, 10*SZREG + 2*256 -- xvst xr3, sp, 10*SZREG + 3*256 -- xvst xr4, sp, 10*SZREG + 4*256 -- xvst xr5, sp, 10*SZREG + 5*256 -- xvst xr6, sp, 10*SZREG + 6*256 -- xvst xr7, sp, 10*SZREG + 7*256 -+ xvst xr0, sp, 9*SZREG + 8*SZFREG + 0*SZXREG -+ xvst xr1, sp, 9*SZREG + 8*SZFREG + 1*SZXREG -+ xvst xr2, sp, 9*SZREG + 8*SZFREG + 2*SZXREG -+ xvst xr3, sp, 9*SZREG + 8*SZFREG + 3*SZXREG -+ xvst xr4, sp, 9*SZREG + 8*SZFREG + 4*SZXREG -+ xvst xr5, sp, 9*SZREG + 8*SZFREG + 5*SZXREG -+ xvst xr6, sp, 9*SZREG + 8*SZFREG + 6*SZXREG -+ xvst xr7, sp, 9*SZREG + 8*SZFREG + 7*SZXREG - #elif defined USE_LSX -- vst vr0, sp, 10*SZREG + 0*128 -- vst vr1, sp, 10*SZREG + 1*128 -- vst vr2, sp, 10*SZREG + 2*128 -- vst vr3, sp, 10*SZREG + 3*128 -- vst vr4, sp, 10*SZREG + 4*128 -- vst vr5, sp, 10*SZREG + 5*128 -- vst vr6, sp, 10*SZREG + 6*128 -- vst vr7, sp, 10*SZREG + 7*128 --#endif -+ vst vr0, sp, 9*SZREG + 8*SZFREG + 0*SZVREG -+ vst vr1, sp, 9*SZREG + 8*SZFREG + 1*SZVREG -+ vst vr2, sp, 9*SZREG + 8*SZFREG + 2*SZVREG -+ vst vr3, sp, 9*SZREG + 8*SZFREG + 3*SZVREG -+ vst vr4, sp, 9*SZREG + 8*SZFREG + 4*SZVREG -+ vst vr5, sp, 9*SZREG + 8*SZFREG + 5*SZVREG -+ vst vr6, sp, 9*SZREG + 8*SZFREG + 6*SZVREG -+ vst vr7, sp, 9*SZREG + 8*SZFREG + 7*SZVREG -+#elif !defined __loongarch_soft_float -+ FREG_S fa0, sp, 9*SZREG + 0*SZFREG -+ FREG_S fa1, sp, 9*SZREG + 1*SZFREG -+ FREG_S fa2, sp, 9*SZREG + 2*SZFREG -+ FREG_S fa3, sp, 9*SZREG + 3*SZFREG -+ FREG_S fa4, sp, 9*SZREG + 4*SZFREG -+ FREG_S fa5, sp, 9*SZREG + 5*SZFREG -+ FREG_S fa6, sp, 9*SZREG + 6*SZFREG -+ FREG_S fa7, sp, 9*SZREG + 7*SZFREG - #endif - -- # Update .got.plt and obtain runtime address of callee. --#ifdef __loongarch64 -- slli.d a1, t1, 1 -+ /* Update .got.plt and obtain runtime address of callee */ -+ SLLI a1, t1, 1 - or a0, t0, zero -- add.d a1, a1, t1 -+ ADD a1, a1, t1 - la a2, _dl_fixup - jirl ra, a2, 0 - or t1, v0, zero --#elif defined __loongarch32 -- slli.w a1, t1, 1 -- or a0, t0, zero -- add.w a1, a1, t1 -- la a2, _dl_fixup -- jirl ra, a2, 0 -- or t1, v0, zero --#endif - -- # Restore arguments from stack. -- REG_L ra, sp, 9*SZREG -+ /* Restore arguments from stack. 
*/ -+ REG_L ra, sp, 0*SZREG - REG_L a0, sp, 1*SZREG - REG_L a1, sp, 2*SZREG - REG_L a2, sp, 3*SZREG -@@ -109,45 +92,37 @@ ENTRY (_dl_runtime_resolve, 3) - REG_L a6, sp, 7*SZREG - REG_L a7, sp, 8*SZREG - --#ifndef __loongarch_soft_float -- FREG_L fa0, sp, 10*SZREG + 0*SZFREG -- FREG_L fa1, sp, 10*SZREG + 1*SZFREG -- FREG_L fa2, sp, 10*SZREG + 2*SZFREG -- FREG_L fa3, sp, 10*SZREG + 3*SZFREG -- FREG_L fa4, sp, 10*SZREG + 4*SZFREG -- FREG_L fa5, sp, 10*SZREG + 5*SZFREG -- FREG_L fa6, sp, 10*SZREG + 6*SZFREG -- FREG_L fa7, sp, 10*SZREG + 7*SZFREG - #ifdef USE_LASX -- xvld xr0, sp, 10*SZREG + 0*256 -- xvld xr1, sp, 10*SZREG + 1*256 -- xvld xr2, sp, 10*SZREG + 2*256 -- xvld xr3, sp, 10*SZREG + 3*256 -- xvld xr4, sp, 10*SZREG + 4*256 -- xvld xr5, sp, 10*SZREG + 5*256 -- xvld xr6, sp, 10*SZREG + 6*256 -- xvld xr7, sp, 10*SZREG + 7*256 -+ xvld xr0, sp, 9*SZREG + 8*SZFREG + 0*SZXREG -+ xvld xr1, sp, 9*SZREG + 8*SZFREG + 1*SZXREG -+ xvld xr2, sp, 9*SZREG + 8*SZFREG + 2*SZXREG -+ xvld xr3, sp, 9*SZREG + 8*SZFREG + 3*SZXREG -+ xvld xr4, sp, 9*SZREG + 8*SZFREG + 4*SZXREG -+ xvld xr5, sp, 9*SZREG + 8*SZFREG + 5*SZXREG -+ xvld xr6, sp, 9*SZREG + 8*SZFREG + 6*SZXREG -+ xvld xr7, sp, 9*SZREG + 8*SZFREG + 7*SZXREG - #elif defined USE_LSX -- vld vr0, sp, 10*SZREG + 0*128 -- vld vr1, sp, 10*SZREG + 1*128 -- vld vr2, sp, 10*SZREG + 2*128 -- vld vr3, sp, 10*SZREG + 3*128 -- vld vr4, sp, 10*SZREG + 4*128 -- vld vr5, sp, 10*SZREG + 5*128 -- vld vr6, sp, 10*SZREG + 6*128 -- vld vr7, sp, 10*SZREG + 7*128 --#endif --#endif -- --#ifdef __loongarch64 -- li.d t3, FRAME_SIZE -- add.d sp, sp, t3 --#elif defined __loongarch32 -- li.w t3, FRAME_SIZE -- addi.w sp, sp, FRAME_SIZE -+ vld vr0, sp, 9*SZREG + 8*SZFREG + 0*SZVREG -+ vld vr1, sp, 9*SZREG + 8*SZFREG + 1*SZVREG -+ vld vr2, sp, 9*SZREG + 8*SZFREG + 2*SZVREG -+ vld vr3, sp, 9*SZREG + 8*SZFREG + 3*SZVREG -+ vld vr4, sp, 9*SZREG + 8*SZFREG + 4*SZVREG -+ vld vr5, sp, 9*SZREG + 8*SZFREG + 5*SZVREG -+ vld vr6, sp, 9*SZREG + 8*SZFREG + 6*SZVREG -+ vld vr7, sp, 9*SZREG + 8*SZFREG + 7*SZVREG -+#elif !defined __loongarch_soft_float -+ FREG_L fa0, sp, 9*SZREG + 0*SZFREG -+ FREG_L fa1, sp, 9*SZREG + 1*SZFREG -+ FREG_L fa2, sp, 9*SZREG + 2*SZFREG -+ FREG_L fa3, sp, 9*SZREG + 3*SZFREG -+ FREG_L fa4, sp, 9*SZREG + 4*SZFREG -+ FREG_L fa5, sp, 9*SZREG + 5*SZFREG -+ FREG_L fa6, sp, 9*SZREG + 6*SZFREG -+ FREG_L fa7, sp, 9*SZREG + 7*SZFREG - #endif - -+ ADDI sp, sp, FRAME_SIZE - -- # Invoke the callee. -+ /* Invoke the callee. 
*/ - jirl zero, t1, 0 - END (_dl_runtime_resolve) -diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h -index 734e45ae..e80c6245 100644 ---- a/sysdeps/loongarch/sys/asm.h -+++ b/sysdeps/loongarch/sys/asm.h -@@ -9,8 +9,17 @@ - # define PTRLOG 3 - # define SZREG 8 - # define SZFREG 8 -+# define SZVREG 16 -+# define SZXREG 32 - # define REG_L ld.d - # define REG_S st.d -+# define SRLI srli.d -+# define SLLI slli.d -+# define ADDI addi.d -+# define ADD add.d -+# define SUB sub.d -+# define BSTRINS bstrins.d -+# define LI li.d - # define FREG_L fld.d - # define FREG_S fst.d - #elif defined __loongarch32 --- -2.33.0 - diff --git a/glibc-2.28-Refactor-code-of-raw-mem-functions.patch b/glibc-2.28-Refactor-code-of-raw-mem-functions.patch deleted file mode 100644 index 0db95f8..0000000 --- a/glibc-2.28-Refactor-code-of-raw-mem-functions.patch +++ /dev/null @@ -1,3031 +0,0 @@ -From 4879bd4e0aff7d884d9b026b6081a0e8cffc491c Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Wed, 21 Jun 2023 09:30:54 +0800 -Subject: [PATCH 06/14] glibc-2.28: Refactor code of {raw,}mem* functions. - -Change-Id: Icafaf6bc8216f48be64cf25a40b9fe28ce127914 -Signed-off-by: ticat_fp ---- - sysdeps/loongarch/lp64/memchr.S | 92 -- - sysdeps/loongarch/lp64/memcmp.S | 280 ------ - sysdeps/loongarch/lp64/memcpy.S | 804 ------------------ - sysdeps/loongarch/lp64/memmove.S | 2 - - sysdeps/loongarch/lp64/memset.S | 166 ---- - .../loongarch/lp64/multiarch/memchr-aligned.S | 91 +- - .../loongarch/lp64/multiarch/memcmp-aligned.S | 282 +++++- - .../loongarch/lp64/multiarch/memcpy-aligned.S | 799 ++++++++++++++++- - .../loongarch/lp64/multiarch/memset-aligned.S | 166 +++- - .../lp64/multiarch/rawmemchr-aligned.S | 110 ++- - sysdeps/loongarch/lp64/rawmemchr.S | 113 --- - 11 files changed, 1438 insertions(+), 1467 deletions(-) - delete mode 100644 sysdeps/loongarch/lp64/memchr.S - delete mode 100644 sysdeps/loongarch/lp64/memcmp.S - delete mode 100644 sysdeps/loongarch/lp64/memcpy.S - delete mode 100644 sysdeps/loongarch/lp64/memmove.S - delete mode 100644 sysdeps/loongarch/lp64/memset.S - delete mode 100644 sysdeps/loongarch/lp64/rawmemchr.S - -diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S -deleted file mode 100644 -index 23f1fd13..00000000 ---- a/sysdeps/loongarch/lp64/memchr.S -+++ /dev/null -@@ -1,92 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef MEMCHR_NAME --#define MEMCHR_NAME memchr --#endif -- --LEAF(MEMCHR_NAME, 6) -- beqz a2, L(out) -- andi t1, a0, 0x7 -- lu12i.w a3, 0x01010 -- sub.d a5, a0, t1 -- -- bstrins.d a1, a1, 15, 8 -- ld.d t0, a5, 0 -- slli.d t2, t1, 3 -- ori a3, a3, 0x101 -- -- bstrins.d a1, a1, 31, 16 -- li.w t7, -1 -- li.w t8, 9 -- bstrins.d a3, a3, 63, 32 -- -- srl.d t3, t7, t2 -- bstrins.d a1, a1, 63, 32 -- sub.d t4, t8, t1 -- orn t3, a1, t3 -- -- srl.d t0, t0, t2 -- slli.d a4, a3, 7 # 0x8080808080808080 -- sltu t4, a2, t4 -- xor t2, t0, t3 -- -- sub.d a6, t2, a3 -- andn a7, a4, t2 -- and t2, a6, a7 -- or t3, t2, t4 -- -- bnez t3, L(count_pos) -- addi.d a2, a2, -8 -- addi.d a0, a5, 8 -- add.d a2, a2, t1 -- --L(loop): -- ld.d t0, a0, 0 -- sltui t4, a2, 9 -- xor t2, t0, a1 -- sub.d a6, t2, a3 -- -- andn a7, a4, t2 -- and t2, a6, a7 -- or t3, t2, t4 -- bnez t3, L(count_pos) -- -- ld.d t1, a0, 8 -- addi.d a0, a0, 16 -- sltui t4, a2, 17 -- xor t2, t1, a1 -- -- sub.d a6, t2, a3 -- andn a7, a4, t2 -- and t2, a6, a7 -- addi.d a2, a2, -16 -- -- or t3, t2, t4 -- beqz t3, L(loop) -- addi.d a0, a0, -8 -- addi.d a2, a2, 
8 -- --L(count_pos): -- ctz.d t0, t2 -- srli.d t0, t0, 3 -- sltu t1, t0, a2 -- add.d a0, a0, t0 -- -- maskeqz a0, a0, t1 -- jr ra -- --L(out): -- move a0, zero -- jr ra --END(MEMCHR_NAME) -- --#ifdef _LIBC --libc_hidden_builtin_def (MEMCHR_NAME) --#endif -diff --git a/sysdeps/loongarch/lp64/memcmp.S b/sysdeps/loongarch/lp64/memcmp.S -deleted file mode 100644 -index 457a4dc7..00000000 ---- a/sysdeps/loongarch/lp64/memcmp.S -+++ /dev/null -@@ -1,280 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef MEMCMP_NAME --#define MEMCMP_NAME memcmp --#endif -- --LEAF(MEMCMP_NAME, 6) -- beqz a2, L(ret) -- andi a4, a1, 0x7 -- andi a3, a0, 0x7 -- sltu a5, a4, a3 -- -- xor t0, a0, a1 -- li.w t8, 8 -- maskeqz t0, t0, a5 -- li.w t7, -1 -- -- xor a0, a0, t0 // a0 hold smaller one -- xor a1, a1, t0 // a1 hold larger one -- andi a3, a0, 0x7 // a3 hold small offset -- andi a4, a1, 0x7 // a4 hold larger offset -- -- xor a0, a0, a3 -- xor a1, a1, a4 -- ld.d t2, a0, 0 // t2 = "fedcbaXX" -- ld.d t1, a1, 0 // t1 = "54321YYY" -- -- slli.d t3, a3, 3 -- slli.d t4, a4, 3 -- sub.d a6, t3, t4 // a6 = 0xfffffffffffffff8 -- srl.d t1, t1, t4 // t1 = "00054321" -- -- srl.d t0, t2, t3 // t0 = "00fedcba" -- srl.d t5, t7, t4 // t5 = 0x000000FFFFFFFFFF -- sub.d t6, t0, t1 // t6 hold diff -- and t6, t6, t5 // t6 = "000xxxxx" -- -- sub.d t5, t8, a4 // t5 hold margin 8 - 3 = 5 -- bnez t6, L(first_out) -- bgeu t5, a2, L(ret) -- sub.d a2, a2, t5 -- -- bnez a6, L(unaligned) -- blt a2, t8, L(al_less_8bytes) -- andi t1, a2, 31 -- beq t1, a2, L(al_less_32bytes) -- -- sub.d t2, a2, t1 -- add.d a4, a0, t2 -- move a2, t1 -- --L(al_loop): -- ld.d t0, a0, 8 -- -- ld.d t1, a1, 8 -- ld.d t2, a0, 16 -- ld.d t3, a1, 16 -- ld.d t4, a0, 24 -- -- ld.d t5, a1, 24 -- ld.d t6, a0, 32 -- ld.d t7, a1, 32 -- addi.d a0, a0, 32 -- -- addi.d a1, a1, 32 -- bne t0, t1, L(out1) -- bne t2, t3, L(out2) -- bne t4, t5, L(out3) -- -- bne t6, t7, L(out4) -- bne a0, a4, L(al_loop) -- --L(al_less_32bytes): -- srai.d a4, a2, 4 -- beqz a4, L(al_less_16bytes) -- -- ld.d t0, a0, 8 -- ld.d t1, a1, 8 -- ld.d t2, a0, 16 -- ld.d t3, a1, 16 -- -- addi.d a0, a0, 16 -- addi.d a1, a1, 16 -- addi.d a2, a2, -16 -- bne t0, t1, L(out1) -- -- bne t2, t3, L(out2) -- --L(al_less_16bytes): -- srai.d a4, a2, 3 -- beqz a4, L(al_less_8bytes) -- ld.d t0, a0, 8 -- -- ld.d t1, a1, 8 -- addi.d a0, a0, 8 -- addi.d a1, a1, 8 -- addi.d a2, a2, -8 -- -- bne t0, t1, L(out1) -- --L(al_less_8bytes): -- beqz a2, L(ret) -- ld.d t0, a0, 8 -- ld.d t1, a1, 8 -- -- li.d t7, -1 -- slli.d t2, a2, 3 -- sll.d t2, t7, t2 -- sub.d t3, t0, t1 -- -- andn t6, t3, t2 -- bnez t6, L(count_diff) -- --L(ret): -- move a0, zero -- jr ra -- --L(out4): -- move t0, t6 -- move t1, t7 -- sub.d t6, t6, t7 -- b L(count_diff) -- --L(out3): -- move t0, t4 -- move t1, t5 -- sub.d t6, t4, t5 -- b L(count_diff) -- --L(out2): -- move t0, t2 -- move t1, t3 --L(out1): -- sub.d t6, t0, t1 -- b L(count_diff) -- --L(first_out): -- slli.d t4, a2, 3 -- slt t3, a2, t5 -- sll.d t4, t7, t4 -- maskeqz t4, t4, t3 -- -- andn t6, t6, t4 -- --L(count_diff): -- ctz.d t2, t6 -- bstrins.d t2, zero, 2, 0 -- srl.d t0, t0, t2 -- -- srl.d t1, t1, t2 -- andi t0, t0, 0xff -- andi t1, t1, 0xff -- sub.d t2, t0, t1 -- -- sub.d t3, t1, t0 -- masknez t2, t2, a5 -- maskeqz t3, t3, a5 -- or a0, t2, t3 -- -- jr ra -- --L(unaligned): -- sub.d a7, zero, a6 -- srl.d t0, t2, a6 -- blt a2, t8, L(un_less_8bytes) -- -- andi t1, a2, 31 -- beq t1, a2, L(un_less_32bytes) -- sub.d t2, a2, t1 -- add.d a4, a0, t2 -- -- move 
a2, t1 -- --L(un_loop): -- ld.d t2, a0, 8 -- ld.d t1, a1, 8 -- ld.d t4, a0, 16 -- -- ld.d t3, a1, 16 -- ld.d t6, a0, 24 -- ld.d t5, a1, 24 -- ld.d t8, a0, 32 -- -- ld.d t7, a1, 32 -- addi.d a0, a0, 32 -- addi.d a1, a1, 32 -- sll.d a3, t2, a7 -- -- or t0, a3, t0 -- bne t0, t1, L(out1) -- srl.d t0, t2, a6 -- sll.d a3, t4, a7 -- -- or t2, a3, t0 -- bne t2, t3, L(out2) -- srl.d t0, t4, a6 -- sll.d a3, t6, a7 -- -- or t4, a3, t0 -- bne t4, t5, L(out3) -- srl.d t0, t6, a6 -- sll.d a3, t8, a7 -- -- or t6, t0, a3 -- bne t6, t7, L(out4) -- srl.d t0, t8, a6 -- bne a0, a4, L(un_loop) -- --L(un_less_32bytes): -- srai.d a4, a2, 4 -- beqz a4, L(un_less_16bytes) -- ld.d t2, a0, 8 -- ld.d t1, a1, 8 -- -- ld.d t4, a0, 16 -- ld.d t3, a1, 16 -- addi.d a0, a0, 16 -- addi.d a1, a1, 16 -- -- addi.d a2, a2, -16 -- sll.d a3, t2, a7 -- or t0, a3, t0 -- bne t0, t1, L(out1) -- -- srl.d t0, t2, a6 -- sll.d a3, t4, a7 -- or t2, a3, t0 -- bne t2, t3, L(out2) -- -- srl.d t0, t4, a6 -- --L(un_less_16bytes): -- srai.d a4, a2, 3 -- beqz a4, L(un_less_8bytes) -- ld.d t2, a0, 8 -- -- ld.d t1, a1, 8 -- addi.d a0, a0, 8 -- addi.d a1, a1, 8 -- addi.d a2, a2, -8 -- -- sll.d a3, t2, a7 -- or t0, a3, t0 -- bne t0, t1, L(out1) -- srl.d t0, t2, a6 -- --L(un_less_8bytes): -- beqz a2, L(ret) -- andi a7, a7, 63 -- slli.d a4, a2, 3 -- bgeu a7, a4, L(last_cmp) -- -- ld.d t2, a0, 8 -- sll.d a3, t2, a7 -- or t0, a3, t0 -- --L(last_cmp): -- ld.d t1, a1, 8 -- -- li.d t7, -1 -- sll.d t2, t7, a4 -- sub.d t3, t0, t1 -- andn t6, t3, t2 -- -- bnez t6, L(count_diff) -- move a0, zero -- jr ra -- --END(MEMCMP_NAME) -- --#ifdef _LIBC --libc_hidden_builtin_def (MEMCMP_NAME) --#endif -diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S -deleted file mode 100644 -index 4791e1a4..00000000 ---- a/sysdeps/loongarch/lp64/memcpy.S -+++ /dev/null -@@ -1,804 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef MEMCPY_NAME --#define MEMCPY_NAME memcpy --#endif -- --#ifndef MEMMOVE_NAME --#define MEMMOVE_NAME memmove --#endif -- --#define LD_64(reg, n) \ -- ld.d t0, reg, n; \ -- ld.d t1, reg, n+8; \ -- ld.d t2, reg, n+16; \ -- ld.d t3, reg, n+24; \ -- ld.d t4, reg, n+32; \ -- ld.d t5, reg, n+40; \ -- ld.d t6, reg, n+48; \ -- ld.d t7, reg, n+56; -- --#define ST_64(reg, n) \ -- st.d t0, reg, n; \ -- st.d t1, reg, n+8; \ -- st.d t2, reg, n+16; \ -- st.d t3, reg, n+24; \ -- st.d t4, reg, n+32; \ -- st.d t5, reg, n+40; \ -- st.d t6, reg, n+48; \ -- st.d t7, reg, n+56; -- --LEAF(MEMMOVE_NAME, 6) -- sub.d t0, a0, a1 -- bltu t0, a2, L(copy_back) -- --END(MEMMOVE_NAME) -- --#ifdef _LIBC --libc_hidden_builtin_def (MEMMOVE_NAME) --#endif -- --LEAF_NO_ALIGN(MEMCPY_NAME) -- -- srai.d a3, a2, 4 -- beqz a3, L(short_data) # less than 16 bytes -- -- move a4, a0 -- andi a5, a0, 0x7 -- andi a6, a1, 0x7 -- li.d t8, 8 -- beqz a5, L(check_align) -- -- # make dest aligned 8 bytes -- sub.d t2, t8, a5 -- sub.d a2, a2, t2 -- -- pcaddi t1, 20 -- slli.d t3, t2, 3 -- add.d a1, a1, t2 -- sub.d t1, t1, t3 -- add.d a4, a4, t2 -- jr t1 -- --L(al7): -- ld.b t0, a1, -7 -- st.b t0, a4, -7 --L(al6): -- ld.b t0, a1, -6 -- st.b t0, a4, -6 --L(al5): -- ld.b t0, a1, -5 -- st.b t0, a4, -5 --L(al4): -- ld.b t0, a1, -4 -- st.b t0, a4, -4 --L(al3): -- ld.b t0, a1, -3 -- st.b t0, a4, -3 --L(al2): -- ld.b t0, a1, -2 -- st.b t0, a4, -2 --L(al1): -- ld.b t0, a1, -1 -- st.b t0, a4, -1 -- --L(check_align): -- bne a5, a6, L(unalign) -- -- srai.d a3, a2, 4 -- beqz a3, L(al_less_16bytes) -- -- andi a3, a2, 0x3f -- beq a3, a2, 
L(al_less_64bytes) -- -- sub.d t0, a2, a3 -- move a2, a3 -- add.d a5, a1, t0 -- --L(loop_64bytes): -- LD_64(a1, 0) -- addi.d a1, a1, 64 -- ST_64(a4, 0) -- -- addi.d a4, a4, 64 -- bne a1, a5, L(loop_64bytes) -- --L(al_less_64bytes): -- srai.d a3, a2, 5 -- beqz a3, L(al_less_32bytes) -- -- ld.d t0, a1, 0 -- ld.d t1, a1, 8 -- ld.d t2, a1, 16 -- ld.d t3, a1, 24 -- -- addi.d a1, a1, 32 -- addi.d a2, a2, -32 -- -- st.d t0, a4, 0 -- st.d t1, a4, 8 -- st.d t2, a4, 16 -- st.d t3, a4, 24 -- -- addi.d a4, a4, 32 -- --L(al_less_32bytes): -- srai.d a3, a2, 4 -- beqz a3, L(al_less_16bytes) -- -- ld.d t0, a1, 0 -- ld.d t1, a1, 8 -- addi.d a1, a1, 16 -- addi.d a2, a2, -16 -- -- st.d t0, a4, 0 -- st.d t1, a4, 8 -- addi.d a4, a4, 16 -- --L(al_less_16bytes): -- srai.d a3, a2, 3 -- beqz a3, L(al_less_8bytes) -- -- ld.d t0, a1, 0 -- addi.d a1, a1, 8 -- addi.d a2, a2, -8 -- -- st.d t0, a4, 0 -- addi.d a4, a4, 8 -- --L(al_less_8bytes): -- srai.d a3, a2, 2 -- beqz a3, L(al_less_4bytes) -- -- ld.w t0, a1, 0 -- addi.d a1, a1, 4 -- addi.d a2, a2, -4 -- -- st.w t0, a4, 0 -- addi.d a4, a4, 4 -- --L(al_less_4bytes): -- srai.d a3, a2, 1 -- beqz a3, L(al_less_2bytes) -- -- ld.h t0, a1, 0 -- addi.d a1, a1, 2 -- addi.d a2, a2, -2 -- -- st.h t0, a4, 0 -- addi.d a4, a4, 2 -- --L(al_less_2bytes): -- beqz a2, L(al_less_1byte) -- -- ld.b t0, a1, 0 -- st.b t0, a4, 0 -- --L(al_less_1byte): -- jr ra -- --L(unalign): -- andi a5, a1, 0x7 -- bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned -- -- sub.d t8, t8, a5 # use t8 to save count of bytes for aligning -- slli.d a5, a5, 3 -- -- ld.d t0, a1, 0 -- addi.d a1, a1, 8 -- -- slli.d a6, t8, 3 -- srl.d a7, t0, a5 -- -- srai.d a3, a2, 4 -- beqz a3, L(un_less_16bytes) -- -- andi a3, a2, 0x3f -- beq a3, a2, L(un_less_64bytes) -- -- sub.d t0, a2, a3 -- move a2, a3 -- add.d a3, a1, t0 -- --# a5 shift right num --# a6 shift left num --# a7 remaining part --L(un_long_bytes): -- ld.d t0, a1, 0 -- ld.d t1, a1, 8 -- ld.d t2, a1, 16 -- ld.d t3, a1, 24 -- -- srl.d t4, t0, a5 -- sll.d t0, t0, a6 -- -- srl.d t5, t1, a5 -- sll.d t1, t1, a6 -- -- srl.d t6, t2, a5 -- sll.d t2, t2, a6 -- -- srl.d t7, t3, a5 -- sll.d t3, t3, a6 -- -- or t0, a7, t0 -- or t1, t4, t1 -- or t2, t5, t2 -- or t3, t6, t3 -- -- ld.d t4, a1, 32 -- ld.d t5, a1, 40 -- ld.d t6, a1, 48 -- ld.d a7, a1, 56 -- -- st.d t0, a4, 0 -- st.d t1, a4, 8 -- st.d t2, a4, 16 -- st.d t3, a4, 24 -- -- addi.d a1, a1, 64 -- -- srl.d t0, t4, a5 -- sll.d t4, t4, a6 -- -- srl.d t1, t5, a5 -- sll.d t5, t5, a6 -- -- srl.d t2, t6, a5 -- sll.d t6, t6, a6 -- -- sll.d t3, a7, a6 -- srl.d a7, a7, a5 -- -- or t4, t7, t4 -- or t5, t0, t5 -- or t6, t1, t6 -- or t3, t2, t3 -- -- st.d t4, a4, 32 -- st.d t5, a4, 40 -- st.d t6, a4, 48 -- st.d t3, a4, 56 -- -- addi.d a4, a4, 64 -- bne a3, a1, L(un_long_bytes) -- --L(un_less_64bytes): -- srai.d a3, a2, 5 -- beqz a3, L(un_less_32bytes) -- -- ld.d t0, a1, 0 -- ld.d t1, a1, 8 -- ld.d t2, a1, 16 -- ld.d t3, a1, 24 -- -- addi.d a1, a1, 32 -- addi.d a2, a2, -32 -- -- srl.d t4, t0, a5 -- sll.d t0, t0, a6 -- -- srl.d t5, t1, a5 -- sll.d t1, t1, a6 -- -- srl.d t6, t2, a5 -- sll.d t2, t2, a6 -- -- or t0, a7, t0 -- -- srl.d a7, t3, a5 -- sll.d t3, t3, a6 -- -- or t1, t4, t1 -- or t2, t5, t2 -- or t3, t6, t3 -- -- st.d t0, a4, 0 -- st.d t1, a4, 8 -- st.d t2, a4, 16 -- st.d t3, a4, 24 -- -- addi.d a4, a4, 32 -- --L(un_less_32bytes): -- srai.d a3, a2, 4 -- beqz a3, L(un_less_16bytes) -- -- ld.d t0, a1, 0 -- ld.d t1, a1, 8 -- -- addi.d a1, a1, 16 -- addi.d a2, a2, -16 -- -- srl.d t2, t0, a5 -- sll.d t3, t0, a6 -- -- sll.d t4, t1, a6 
-- or t3, a7, t3 -- or t4, t2, t4 -- srl.d a7, t1, a5 -- -- st.d t3, a4, 0 -- st.d t4, a4, 8 -- -- addi.d a4, a4, 16 -- --L(un_less_16bytes): -- srai.d a3, a2, 3 -- beqz a3, L(un_less_8bytes) -- -- ld.d t0, a1, 0 -- -- addi.d a1, a1, 8 -- addi.d a2, a2, -8 -- -- sll.d t1, t0, a6 -- or t2, a7, t1 -- srl.d a7, t0, a5 -- -- st.d t2, a4, 0 -- addi.d a4, a4, 8 -- --L(un_less_8bytes): -- beqz a2, L(un_less_1byte) -- bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 -- -- # combine data in memory and a7(remaining part) -- ld.d t0, a1, 0 -- sll.d t0, t0, a6 -- or a7, a7, t0 -- --1: -- srai.d a3, a2, 2 -- beqz a3, L(un_less_4bytes) -- -- addi.d a2, a2, -4 -- st.w a7, a4, 0 -- addi.d a4, a4, 4 -- srai.d a7, a7, 32 -- --L(un_less_4bytes): -- srai.d a3, a2, 1 -- beqz a3, L(un_less_2bytes) -- -- addi.d a2, a2, -2 -- st.h a7, a4, 0 -- addi.d a4, a4, 2 -- srai.d a7, a7, 16 -- --L(un_less_2bytes): -- beqz a2, L(un_less_1byte) -- st.b a7, a4, 0 -- --L(un_less_1byte): -- jr ra -- --# Bytes copying for data less than 16 bytes --L(short_data): -- pcaddi t1, 36 -- slli.d t2, a2, 3 -- add.d a4, a0, a2 -- sub.d t1, t1, t2 -- add.d a1, a1, a2 -- jr t1 -- --L(short_15_bytes): -- ld.b t0, a1, -15 -- st.b t0, a4, -15 --L(short_14_bytes): -- ld.b t0, a1, -14 -- st.b t0, a4, -14 --L(short_13_bytes): -- ld.b t0, a1, -13 -- st.b t0, a4, -13 --L(short_12_bytes): -- ld.b t0, a1, -12 -- st.b t0, a4, -12 --L(short_11_bytes): -- ld.b t0, a1, -11 -- st.b t0, a4, -11 --L(short_10_bytes): -- ld.b t0, a1, -10 -- st.b t0, a4, -10 --L(short_9_bytes): -- ld.b t0, a1, -9 -- st.b t0, a4, -9 --L(short_8_bytes): -- ld.b t0, a1, -8 -- st.b t0, a4, -8 --L(short_7_bytes): -- ld.b t0, a1, -7 -- st.b t0, a4, -7 --L(short_6_bytes): -- ld.b t0, a1, -6 -- st.b t0, a4, -6 --L(short_5_bytes): -- ld.b t0, a1, -5 -- st.b t0, a4, -5 --L(short_4_bytes): -- ld.b t0, a1, -4 -- st.b t0, a4, -4 --L(short_3_bytes): -- ld.b t0, a1, -3 -- st.b t0, a4, -3 --L(short_2_bytes): -- ld.b t0, a1, -2 -- st.b t0, a4, -2 --L(short_1_bytes): -- ld.b t0, a1, -1 -- st.b t0, a4, -1 -- jr ra -- --L(copy_back): -- srai.d a3, a2, 4 -- beqz a3, L(back_short_data) # less than 16 bytes -- -- add.d a4, a0, a2 # store the tail of dest -- add.d a1, a1, a2 # store the tail of src -- -- andi a5, a4, 0x7 -- andi a6, a1, 0x7 -- beqz a5, L(back_check_align) -- -- # make dest aligned 8 bytes -- sub.d a2, a2, a5 -- sub.d a1, a1, a5 -- sub.d a4, a4, a5 -- -- pcaddi t1, 18 -- slli.d t3, a5, 3 -- sub.d t1, t1, t3 -- jr t1 -- -- ld.b t0, a1, 6 -- st.b t0, a4, 6 -- ld.b t0, a1, 5 -- st.b t0, a4, 5 -- ld.b t0, a1, 4 -- st.b t0, a4, 4 -- ld.b t0, a1, 3 -- st.b t0, a4, 3 -- ld.b t0, a1, 2 -- st.b t0, a4, 2 -- ld.b t0, a1, 1 -- st.b t0, a4, 1 -- ld.b t0, a1, 0 -- st.b t0, a4, 0 -- --L(back_check_align): -- bne a5, a6, L(back_unalign) -- -- srai.d a3, a2, 4 -- beqz a3, L(back_less_16bytes) -- -- andi a3, a2, 0x3f -- beq a3, a2, L(back_less_64bytes) -- -- sub.d t0, a2, a3 -- move a2, a3 -- sub.d a5, a1, t0 -- --L(back_loop_64bytes): -- LD_64(a1, -64) -- addi.d a1, a1, -64 -- ST_64(a4, -64) -- -- addi.d a4, a4, -64 -- bne a1, a5, L(back_loop_64bytes) -- --L(back_less_64bytes): -- srai.d a3, a2, 5 -- beqz a3, L(back_less_32bytes) -- -- ld.d t0, a1, -32 -- ld.d t1, a1, -24 -- ld.d t2, a1, -16 -- ld.d t3, a1, -8 -- -- addi.d a1, a1, -32 -- addi.d a2, a2, -32 -- -- st.d t0, a4, -32 -- st.d t1, a4, -24 -- st.d t2, a4, -16 -- st.d t3, a4, -8 -- -- addi.d a4, a4, -32 -- --L(back_less_32bytes): -- srai.d a3, a2, 4 -- beqz a3, L(back_less_16bytes) -- -- ld.d t0, a1, -16 -- ld.d 
t1, a1, -8 -- -- addi.d a2, a2, -16 -- addi.d a1, a1, -16 -- -- st.d t0, a4, -16 -- st.d t1, a4, -8 -- addi.d a4, a4, -16 -- --L(back_less_16bytes): -- srai.d a3, a2, 3 -- beqz a3, L(back_less_8bytes) -- -- ld.d t0, a1, -8 -- addi.d a2, a2, -8 -- addi.d a1, a1, -8 -- -- st.d t0, a4, -8 -- addi.d a4, a4, -8 -- --L(back_less_8bytes): -- srai.d a3, a2, 2 -- beqz a3, L(back_less_4bytes) -- -- ld.w t0, a1, -4 -- addi.d a2, a2, -4 -- addi.d a1, a1, -4 -- -- st.w t0, a4, -4 -- addi.d a4, a4, -4 -- --L(back_less_4bytes): -- srai.d a3, a2, 1 -- beqz a3, L(back_less_2bytes) -- -- ld.h t0, a1, -2 -- addi.d a2, a2, -2 -- addi.d a1, a1, -2 -- -- st.h t0, a4, -2 -- addi.d a4, a4, -2 -- --L(back_less_2bytes): -- beqz a2, L(back_less_1byte) -- -- ld.b t0, a1, -1 -- st.b t0, a4, -1 -- --L(back_less_1byte): -- jr ra -- --L(back_unalign): -- andi t8, a1, 0x7 -- bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned -- -- sub.d a6, zero, t8 -- -- ld.d t0, a1, 0 -- slli.d a6, a6, 3 -- slli.d a5, t8, 3 -- sll.d a7, t0, a6 -- -- srai.d a3, a2, 4 -- beqz a3, L(back_un_less_16bytes) -- -- andi a3, a2, 0x3f -- beq a3, a2, L(back_un_less_64bytes) -- -- sub.d t0, a2, a3 -- move a2, a3 -- sub.d a3, a1, t0 -- --L(back_un_long_bytes): -- ld.d t0, a1, -8 -- ld.d t1, a1, -16 -- ld.d t2, a1, -24 -- ld.d t3, a1, -32 -- -- sll.d t4, t0, a6 -- srl.d t0, t0, a5 -- -- sll.d t5, t1, a6 -- srl.d t1, t1, a5 -- -- sll.d t6, t2, a6 -- srl.d t2, t2, a5 -- -- sll.d t7, t3, a6 -- srl.d t3, t3, a5 -- -- or t0, t0, a7 -- or t1, t1, t4 -- or t2, t2, t5 -- or t3, t3, t6 -- -- ld.d t4, a1, -40 -- ld.d t5, a1, -48 -- ld.d t6, a1, -56 -- ld.d a7, a1, -64 -- st.d t0, a4, -8 -- st.d t1, a4, -16 -- st.d t2, a4, -24 -- st.d t3, a4, -32 -- -- addi.d a1, a1, -64 -- -- sll.d t0, t4, a6 -- srl.d t4, t4, a5 -- -- sll.d t1, t5, a6 -- srl.d t5, t5, a5 -- -- sll.d t2, t6, a6 -- srl.d t6, t6, a5 -- -- srl.d t3, a7, a5 -- sll.d a7, a7, a6 -- -- or t4, t7, t4 -- or t5, t0, t5 -- or t6, t1, t6 -- or t3, t2, t3 -- -- st.d t4, a4, -40 -- st.d t5, a4, -48 -- st.d t6, a4, -56 -- st.d t3, a4, -64 -- -- addi.d a4, a4, -64 -- bne a3, a1, L(back_un_long_bytes) -- --L(back_un_less_64bytes): -- srai.d a3, a2, 5 -- beqz a3, L(back_un_less_32bytes) -- -- ld.d t0, a1, -8 -- ld.d t1, a1, -16 -- ld.d t2, a1, -24 -- ld.d t3, a1, -32 -- -- addi.d a1, a1, -32 -- addi.d a2, a2, -32 -- -- sll.d t4, t0, a6 -- srl.d t0, t0, a5 -- -- sll.d t5, t1, a6 -- srl.d t1, t1, a5 -- -- sll.d t6, t2, a6 -- srl.d t2, t2, a5 -- -- or t0, a7, t0 -- -- sll.d a7, t3, a6 -- srl.d t3, t3, a5 -- -- or t1, t4, t1 -- or t2, t5, t2 -- or t3, t6, t3 -- -- st.d t0, a4, -8 -- st.d t1, a4, -16 -- st.d t2, a4, -24 -- st.d t3, a4, -32 -- -- addi.d a4, a4, -32 -- --L(back_un_less_32bytes): -- srai.d a3, a2, 4 -- beqz a3, L(back_un_less_16bytes) -- -- ld.d t0, a1, -8 -- ld.d t1, a1, -16 -- -- addi.d a1, a1, -16 -- addi.d a2, a2, -16 -- -- sll.d t2, t0, a6 -- srl.d t3, t0, a5 -- -- srl.d t4, t1, a5 -- or t3, a7, t3 -- or t4, t2, t4 -- sll.d a7, t1, a6 -- -- st.d t3, a4, -8 -- st.d t4, a4, -16 -- -- addi.d a4, a4, -16 -- --L(back_un_less_16bytes): -- srai.d a3, a2, 3 -- beqz a3, L(back_un_less_8bytes) -- -- ld.d t0, a1, -8 -- -- addi.d a1, a1, -8 -- addi.d a2, a2, -8 -- -- srl.d t1, t0, a5 -- or t2, a7, t1 -- sll.d a7, t0, a6 -- -- st.d t2, a4, -8 -- addi.d a4, a4, -8 -- --L(back_un_less_8bytes): -- beqz a2, L(back_end) -- bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 -- -- # combine data in memory and a7(remaining part) -- ld.d t0, a1, -8 -- srl.d t0, t0, a5 -- or a7, a7, t0 -- --1: 
-- srai.d a3, a2, 2 -- beqz a3, L(back_un_less_4bytes) -- -- srai.d t0, a7, 32 -- addi.d a2, a2, -4 -- st.w t0, a4, -4 -- addi.d a4, a4, -4 -- slli.d a7, a7, 32 -- --L(back_un_less_4bytes): -- srai.d a3, a2, 1 -- beqz a3, L(back_un_less_2bytes) -- srai.d t0, a7, 48 -- addi.d a2, a2, -2 -- st.h t0, a4, -2 -- addi.d a4, a4, -2 -- slli.d a7, a7, 16 --L(back_un_less_2bytes): -- beqz a2, L(back_un_less_1byte) -- srai.d t0, a7, 56 -- st.b t0, a4, -1 --L(back_un_less_1byte): -- jr ra -- --L(back_short_data): -- pcaddi t1, 34 -- slli.d t2, a2, 3 -- sub.d t1, t1, t2 -- jr t1 -- -- ld.b t0, a1, 14 -- st.b t0, a0, 14 -- ld.b t0, a1, 13 -- st.b t0, a0, 13 -- ld.b t0, a1, 12 -- st.b t0, a0, 12 -- ld.b t0, a1, 11 -- st.b t0, a0, 11 -- ld.b t0, a1, 10 -- st.b t0, a0, 10 -- ld.b t0, a1, 9 -- st.b t0, a0, 9 -- ld.b t0, a1, 8 -- st.b t0, a0, 8 -- ld.b t0, a1, 7 -- st.b t0, a0, 7 -- ld.b t0, a1, 6 -- st.b t0, a0, 6 -- ld.b t0, a1, 5 -- st.b t0, a0, 5 -- ld.b t0, a1, 4 -- st.b t0, a0, 4 -- ld.b t0, a1, 3 -- st.b t0, a0, 3 -- ld.b t0, a1, 2 -- st.b t0, a0, 2 -- ld.b t0, a1, 1 -- st.b t0, a0, 1 -- ld.b t0, a1, 0 -- st.b t0, a0, 0 --L(back_end): -- jr ra -- --END(MEMCPY_NAME) -- --#ifdef _LIBC --libc_hidden_builtin_def (MEMCPY_NAME) --#endif -diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S -deleted file mode 100644 -index 6d1922c4..00000000 ---- a/sysdeps/loongarch/lp64/memmove.S -+++ /dev/null -@@ -1,2 +0,0 @@ --/* DONT DELETE THIS FILE, OTHERWIES MEMCPY.C WILL BE COMPILED. */ --/* There are too many common code in memcpy and memmove. See memcpy.S */ -diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S -deleted file mode 100644 -index eabd7d23..00000000 ---- a/sysdeps/loongarch/lp64/memset.S -+++ /dev/null -@@ -1,166 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef MEMSET_NAME --#define MEMSET_NAME memset --#endif -- --#define ST_64(n) \ -- st.d a1, a0, n; \ -- st.d a1, a0, n+8; \ -- st.d a1, a0, n+16; \ -- st.d a1, a0, n+24; \ -- st.d a1, a0, n+32; \ -- st.d a1, a0, n+40; \ -- st.d a1, a0, n+48; \ -- st.d a1, a0, n+56; -- --LEAF(MEMSET_NAME, 6) -- move t0, a0 -- andi a3, a0, 0x7 -- li.w t6, 16 -- beqz a3, L(align) -- blt a2, t6, L(short_data) -- --L(make_align): -- li.w t8, 8 -- sub.d t2, t8, a3 -- pcaddi t1, 11 -- slli.d t3, t2, 2 -- sub.d t1, t1, t3 -- jirl zero, t1, 0 -- --L(al7): -- st.b a1, t0, 6 --L(al6): -- st.b a1, t0, 5 --L(al5): -- st.b a1, t0, 4 --L(al4): -- st.b a1, t0, 3 --L(al3): -- st.b a1, t0, 2 --L(al2): -- st.b a1, t0, 1 --L(al1): -- st.b a1, t0, 0 --L(al0): -- add.d t0, t0, t2 -- sub.d a2, a2, t2 -- --L(align): -- bstrins.d a1, a1, 15, 8 -- bstrins.d a1, a1, 31, 16 -- bstrins.d a1, a1, 63, 32 -- -- blt a2, t6, L(less_16bytes) -- -- andi a4, a2, 0x3f -- beq a4, a2, L(less_64bytes) -- -- sub.d t1, a2, a4 -- move a2, a4 -- add.d a5, t0, t1 -- --L(loop_64bytes): -- addi.d t0, t0, 64 -- st.d a1, t0, -64 -- st.d a1, t0, -56 -- st.d a1, t0, -48 -- st.d a1, t0, -40 -- st.d a1, t0, -32 -- st.d a1, t0, -24 -- st.d a1, t0, -16 -- st.d a1, t0, -8 -- bne t0, a5, L(loop_64bytes) -- --L(less_64bytes): -- srai.d a4, a2, 5 -- beqz a4, L(less_32bytes) -- addi.d a2, a2, -32 -- st.d a1, t0, 0 -- st.d a1, t0, 8 -- st.d a1, t0, 16 -- st.d a1, t0, 24 -- addi.d t0, t0, 32 --L(less_32bytes): -- blt a2, t6, L(less_16bytes) -- addi.d a2, a2, -16 -- st.d a1, t0, 0 -- st.d a1, t0, 8 -- addi.d t0, t0, 16 --L(less_16bytes): -- srai.d a4, a2, 3 -- beqz a4, L(less_8bytes) -- addi.d a2, a2, -8 -- st.d a1, 
t0, 0 -- addi.d t0, t0, 8 --L(less_8bytes): -- beqz a2, L(less_1byte) -- srai.d a4, a2, 2 -- beqz a4, L(less_4bytes) -- addi.d a2, a2, -4 -- st.w a1, t0, 0 -- addi.d t0, t0, 4 --L(less_4bytes): -- srai.d a3, a2, 1 -- beqz a3, L(less_2bytes) -- addi.d a2, a2, -2 -- st.h a1, t0, 0 -- addi.d t0, t0, 2 --L(less_2bytes): -- beqz a2, L(less_1byte) -- st.b a1, t0, 0 --L(less_1byte): -- jr ra -- --L(short_data): -- pcaddi t1, 19 -- slli.d t3, a2, 2 -- sub.d t1, t1, t3 -- jirl zero, t1, 0 --L(short_15): -- st.b a1, a0, 14 -- --L(short_14): -- st.b a1, a0, 13 --L(short_13): -- st.b a1, a0, 12 --L(short_12): -- st.b a1, a0, 11 --L(short_11): -- st.b a1, a0, 10 --L(short_10): -- st.b a1, a0, 9 --L(short_9): -- st.b a1, a0, 8 --L(short_8): -- st.b a1, a0, 7 --L(short_7): -- st.b a1, a0, 6 --L(short_6): -- st.b a1, a0, 5 --L(short_5): -- st.b a1, a0, 4 --L(short_4): -- st.b a1, a0, 3 --L(short_3): -- st.b a1, a0, 2 --L(short_2): -- st.b a1, a0, 1 --L(short_1): -- st.b a1, a0, 0 --L(short_0): -- jr ra -- --END(MEMSET_NAME) -- --#ifdef _LIBC --libc_hidden_builtin_def (MEMSET_NAME) --#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S -index 4677c912..7dfa3ade 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S -@@ -1,7 +1,96 @@ - -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ - #if IS_IN (libc) - #define MEMCHR_NAME __memchr_aligned -+#else -+#define MEMCHR_NAME memchr - #endif - --#include "../memchr.S" -+LEAF(MEMCHR_NAME, 6) -+ beqz a2, L(out) -+ andi t1, a0, 0x7 -+ lu12i.w a3, 0x01010 -+ sub.d a5, a0, t1 -+ -+ bstrins.d a1, a1, 15, 8 -+ ld.d t0, a5, 0 -+ slli.d t2, t1, 3 -+ ori a3, a3, 0x101 -+ -+ bstrins.d a1, a1, 31, 16 -+ li.w t7, -1 -+ li.w t8, 9 -+ bstrins.d a3, a3, 63, 32 -+ -+ srl.d t3, t7, t2 -+ bstrins.d a1, a1, 63, 32 -+ sub.d t4, t8, t1 -+ orn t3, a1, t3 -+ -+ srl.d t0, t0, t2 -+ slli.d a4, a3, 7 # 0x8080808080808080 -+ sltu t4, a2, t4 -+ xor t2, t0, t3 -+ -+ sub.d a6, t2, a3 -+ andn a7, a4, t2 -+ and t2, a6, a7 -+ or t3, t2, t4 -+ -+ bnez t3, L(count_pos) -+ addi.d a2, a2, -8 -+ addi.d a0, a5, 8 -+ add.d a2, a2, t1 -+ -+L(loop): -+ ld.d t0, a0, 0 -+ sltui t4, a2, 9 -+ xor t2, t0, a1 -+ sub.d a6, t2, a3 -+ -+ andn a7, a4, t2 -+ and t2, a6, a7 -+ or t3, t2, t4 -+ bnez t3, L(count_pos) -+ -+ ld.d t1, a0, 8 -+ addi.d a0, a0, 16 -+ sltui t4, a2, 17 -+ xor t2, t1, a1 -+ -+ sub.d a6, t2, a3 -+ andn a7, a4, t2 -+ and t2, a6, a7 -+ addi.d a2, a2, -16 -+ -+ or t3, t2, t4 -+ beqz t3, L(loop) -+ addi.d a0, a0, -8 -+ addi.d a2, a2, 8 -+ -+L(count_pos): -+ ctz.d t0, t2 -+ srli.d t0, t0, 3 -+ sltu t1, t0, a2 -+ add.d a0, a0, t0 -+ -+ maskeqz a0, a0, t1 -+ jr ra -+ -+L(out): -+ move a0, zero -+ jr ra -+END(MEMCHR_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCHR_NAME) -+#endif - -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S -index 512eabca..9505dfce 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S -@@ -1,11 +1,289 @@ - --#if IS_IN (libc) - -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) - #define MEMCMP_NAME __memcmp_aligned -+#else -+#define MEMCMP_NAME memcmp -+#endif -+ -+LEAF(MEMCMP_NAME, 6) -+ beqz a2, L(ret) -+ andi a4, a1, 0x7 -+ andi a3, a0, 0x7 -+ sltu a5, a4, a3 -+ -+ xor t0, a0, a1 -+ li.w t8, 8 -+ maskeqz 
t0, t0, a5 -+ li.w t7, -1 -+ -+ xor a0, a0, t0 // a0 hold smaller one -+ xor a1, a1, t0 // a1 hold larger one -+ andi a3, a0, 0x7 // a3 hold small offset -+ andi a4, a1, 0x7 // a4 hold larger offset -+ -+ xor a0, a0, a3 -+ xor a1, a1, a4 -+ ld.d t2, a0, 0 // t2 = "fedcbaXX" -+ ld.d t1, a1, 0 // t1 = "54321YYY" -+ -+ slli.d t3, a3, 3 -+ slli.d t4, a4, 3 -+ sub.d a6, t3, t4 // a6 = 0xfffffffffffffff8 -+ srl.d t1, t1, t4 // t1 = "00054321" -+ -+ srl.d t0, t2, t3 // t0 = "00fedcba" -+ srl.d t5, t7, t4 // t5 = 0x000000FFFFFFFFFF -+ sub.d t6, t0, t1 // t6 hold diff -+ and t6, t6, t5 // t6 = "000xxxxx" -+ -+ sub.d t5, t8, a4 // t5 hold margin 8 - 3 = 5 -+ bnez t6, L(first_out) -+ bgeu t5, a2, L(ret) -+ sub.d a2, a2, t5 -+ -+ bnez a6, L(unaligned) -+ blt a2, t8, L(al_less_8bytes) -+ andi t1, a2, 31 -+ beq t1, a2, L(al_less_32bytes) -+ -+ sub.d t2, a2, t1 -+ add.d a4, a0, t2 -+ move a2, t1 -+ -+L(al_loop): -+ ld.d t0, a0, 8 -+ -+ ld.d t1, a1, 8 -+ ld.d t2, a0, 16 -+ ld.d t3, a1, 16 -+ ld.d t4, a0, 24 -+ -+ ld.d t5, a1, 24 -+ ld.d t6, a0, 32 -+ ld.d t7, a1, 32 -+ addi.d a0, a0, 32 -+ -+ addi.d a1, a1, 32 -+ bne t0, t1, L(out1) -+ bne t2, t3, L(out2) -+ bne t4, t5, L(out3) -+ -+ bne t6, t7, L(out4) -+ bne a0, a4, L(al_loop) -+ -+L(al_less_32bytes): -+ srai.d a4, a2, 4 -+ beqz a4, L(al_less_16bytes) -+ -+ ld.d t0, a0, 8 -+ ld.d t1, a1, 8 -+ ld.d t2, a0, 16 -+ ld.d t3, a1, 16 -+ -+ addi.d a0, a0, 16 -+ addi.d a1, a1, 16 -+ addi.d a2, a2, -16 -+ bne t0, t1, L(out1) -+ -+ bne t2, t3, L(out2) -+ -+L(al_less_16bytes): -+ srai.d a4, a2, 3 -+ beqz a4, L(al_less_8bytes) -+ ld.d t0, a0, 8 -+ -+ ld.d t1, a1, 8 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ bne t0, t1, L(out1) -+ -+L(al_less_8bytes): -+ beqz a2, L(ret) -+ ld.d t0, a0, 8 -+ ld.d t1, a1, 8 -+ -+ li.d t7, -1 -+ slli.d t2, a2, 3 -+ sll.d t2, t7, t2 -+ sub.d t3, t0, t1 -+ -+ andn t6, t3, t2 -+ bnez t6, L(count_diff) -+ -+L(ret): -+ move a0, zero -+ jr ra -+ -+L(out4): -+ move t0, t6 -+ move t1, t7 -+ sub.d t6, t6, t7 -+ b L(count_diff) -+ -+L(out3): -+ move t0, t4 -+ move t1, t5 -+ sub.d t6, t4, t5 -+ b L(count_diff) -+ -+L(out2): -+ move t0, t2 -+ move t1, t3 -+L(out1): -+ sub.d t6, t0, t1 -+ b L(count_diff) -+ -+L(first_out): -+ slli.d t4, a2, 3 -+ slt t3, a2, t5 -+ sll.d t4, t7, t4 -+ maskeqz t4, t4, t3 -+ -+ andn t6, t6, t4 -+ -+L(count_diff): -+ ctz.d t2, t6 -+ bstrins.d t2, zero, 2, 0 -+ srl.d t0, t0, t2 -+ -+ srl.d t1, t1, t2 -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ sub.d t2, t0, t1 -+ -+ sub.d t3, t1, t0 -+ masknez t2, t2, a5 -+ maskeqz t3, t3, a5 -+ or a0, t2, t3 -+ -+ jr ra -+ -+L(unaligned): -+ sub.d a7, zero, a6 -+ srl.d t0, t2, a6 -+ blt a2, t8, L(un_less_8bytes) -+ -+ andi t1, a2, 31 -+ beq t1, a2, L(un_less_32bytes) -+ sub.d t2, a2, t1 -+ add.d a4, a0, t2 -+ -+ move a2, t1 -+ -+L(un_loop): -+ ld.d t2, a0, 8 -+ ld.d t1, a1, 8 -+ ld.d t4, a0, 16 -+ -+ ld.d t3, a1, 16 -+ ld.d t6, a0, 24 -+ ld.d t5, a1, 24 -+ ld.d t8, a0, 32 -+ -+ ld.d t7, a1, 32 -+ addi.d a0, a0, 32 -+ addi.d a1, a1, 32 -+ sll.d a3, t2, a7 -+ -+ or t0, a3, t0 -+ bne t0, t1, L(out1) -+ srl.d t0, t2, a6 -+ sll.d a3, t4, a7 -+ -+ or t2, a3, t0 -+ bne t2, t3, L(out2) -+ srl.d t0, t4, a6 -+ sll.d a3, t6, a7 -+ -+ or t4, a3, t0 -+ bne t4, t5, L(out3) -+ srl.d t0, t6, a6 -+ sll.d a3, t8, a7 -+ -+ or t6, t0, a3 -+ bne t6, t7, L(out4) -+ srl.d t0, t8, a6 -+ bne a0, a4, L(un_loop) -+ -+L(un_less_32bytes): -+ srai.d a4, a2, 4 -+ beqz a4, L(un_less_16bytes) -+ ld.d t2, a0, 8 -+ ld.d t1, a1, 8 -+ -+ ld.d t4, a0, 16 -+ ld.d t3, a1, 16 -+ addi.d a0, a0, 16 -+ 
addi.d a1, a1, 16 -+ -+ addi.d a2, a2, -16 -+ sll.d a3, t2, a7 -+ or t0, a3, t0 -+ bne t0, t1, L(out1) -+ -+ srl.d t0, t2, a6 -+ sll.d a3, t4, a7 -+ or t2, a3, t0 -+ bne t2, t3, L(out2) -+ -+ srl.d t0, t4, a6 -+ -+L(un_less_16bytes): -+ srai.d a4, a2, 3 -+ beqz a4, L(un_less_8bytes) -+ ld.d t2, a0, 8 -+ -+ ld.d t1, a1, 8 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ sll.d a3, t2, a7 -+ or t0, a3, t0 -+ bne t0, t1, L(out1) -+ srl.d t0, t2, a6 -+ -+L(un_less_8bytes): -+ beqz a2, L(ret) -+ andi a7, a7, 63 -+ slli.d a4, a2, 3 -+ bgeu a7, a4, L(last_cmp) -+ -+ ld.d t2, a0, 8 -+ sll.d a3, t2, a7 -+ or t0, a3, t0 -+ -+L(last_cmp): -+ ld.d t1, a1, 8 -+ -+ li.d t7, -1 -+ sll.d t2, t7, a4 -+ sub.d t3, t0, t1 -+ andn t6, t3, t2 -+ -+ bnez t6, L(count_diff) -+ move a0, zero -+ jr ra -+ -+END(MEMCMP_NAME) - -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCMP_NAME) - #endif - --#include "../memcmp.S" - # undef bcmp - weak_alias (MEMCMP_NAME, bcmp) - -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S -index 5ff8b4e6..3fc86a7f 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S -@@ -1,11 +1,804 @@ -- -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define MEMCPY_NAME __memcpy_aligned - #define MEMMOVE_NAME __memmove_aligned -+#else -+#define MEMCPY_NAME memcpy -+#define MEMMOVE_NAME memmove -+#endif -+ -+#define LD_64(reg, n) \ -+ ld.d t0, reg, n; \ -+ ld.d t1, reg, n+8; \ -+ ld.d t2, reg, n+16; \ -+ ld.d t3, reg, n+24; \ -+ ld.d t4, reg, n+32; \ -+ ld.d t5, reg, n+40; \ -+ ld.d t6, reg, n+48; \ -+ ld.d t7, reg, n+56; -+ -+#define ST_64(reg, n) \ -+ st.d t0, reg, n; \ -+ st.d t1, reg, n+8; \ -+ st.d t2, reg, n+16; \ -+ st.d t3, reg, n+24; \ -+ st.d t4, reg, n+32; \ -+ st.d t5, reg, n+40; \ -+ st.d t6, reg, n+48; \ -+ st.d t7, reg, n+56; - -+LEAF(MEMMOVE_NAME, 6) -+ sub.d t0, a0, a1 -+ bltu t0, a2, L(copy_back) -+ -+END(MEMMOVE_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMMOVE_NAME) - #endif - --#include "../memcpy.S" -+LEAF_NO_ALIGN(MEMCPY_NAME) -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(short_data) # less than 16 bytes -+ -+ move a4, a0 -+ andi a5, a0, 0x7 -+ andi a6, a1, 0x7 -+ li.d t8, 8 -+ beqz a5, L(check_align) -+ -+ # make dest aligned 8 bytes -+ sub.d t2, t8, a5 -+ sub.d a2, a2, t2 -+ -+ pcaddi t1, 20 -+ slli.d t3, t2, 3 -+ add.d a1, a1, t2 -+ sub.d t1, t1, t3 -+ add.d a4, a4, t2 -+ jr t1 -+ -+L(al7): -+ ld.b t0, a1, -7 -+ st.b t0, a4, -7 -+L(al6): -+ ld.b t0, a1, -6 -+ st.b t0, a4, -6 -+L(al5): -+ ld.b t0, a1, -5 -+ st.b t0, a4, -5 -+L(al4): -+ ld.b t0, a1, -4 -+ st.b t0, a4, -4 -+L(al3): -+ ld.b t0, a1, -3 -+ st.b t0, a4, -3 -+L(al2): -+ ld.b t0, a1, -2 -+ st.b t0, a4, -2 -+L(al1): -+ ld.b t0, a1, -1 -+ st.b t0, a4, -1 -+ -+L(check_align): -+ bne a5, a6, L(unalign) -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(al_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(al_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ add.d a5, a1, t0 -+ -+L(loop_64bytes): -+ LD_64(a1, 0) -+ addi.d a1, a1, 64 -+ ST_64(a4, 0) -+ -+ addi.d a4, a4, 64 -+ bne a1, a5, L(loop_64bytes) -+ -+L(al_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(al_less_32bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ -+ addi.d a1, a1, 32 -+ addi.d a2, a2, -32 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ st.d t2, a4, 16 -+ st.d t3, a4, 24 -+ -+ addi.d a4, a4, 32 -+ -+L(al_less_32bytes): 
-+ srai.d a3, a2, 4 -+ beqz a3, L(al_less_16bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ addi.d a1, a1, 16 -+ addi.d a2, a2, -16 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ addi.d a4, a4, 16 -+ -+L(al_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(al_less_8bytes) -+ -+ ld.d t0, a1, 0 -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ st.d t0, a4, 0 -+ addi.d a4, a4, 8 -+ -+L(al_less_8bytes): -+ srai.d a3, a2, 2 -+ beqz a3, L(al_less_4bytes) -+ -+ ld.w t0, a1, 0 -+ addi.d a1, a1, 4 -+ addi.d a2, a2, -4 -+ -+ st.w t0, a4, 0 -+ addi.d a4, a4, 4 -+ -+L(al_less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(al_less_2bytes) -+ -+ ld.h t0, a1, 0 -+ addi.d a1, a1, 2 -+ addi.d a2, a2, -2 -+ -+ st.h t0, a4, 0 -+ addi.d a4, a4, 2 -+ -+L(al_less_2bytes): -+ beqz a2, L(al_less_1byte) -+ -+ ld.b t0, a1, 0 -+ st.b t0, a4, 0 -+ -+L(al_less_1byte): -+ jr ra -+ -+L(unalign): -+ andi a5, a1, 0x7 -+ bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned -+ -+ sub.d t8, t8, a5 # use t8 to save count of bytes for aligning -+ slli.d a5, a5, 3 -+ -+ ld.d t0, a1, 0 -+ addi.d a1, a1, 8 -+ -+ slli.d a6, t8, 3 -+ srl.d a7, t0, a5 -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(un_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(un_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ add.d a3, a1, t0 -+ -+# a5 shift right num -+# a6 shift left num -+# a7 remaining part -+L(un_long_bytes): -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ -+ srl.d t4, t0, a5 -+ sll.d t0, t0, a6 -+ -+ srl.d t5, t1, a5 -+ sll.d t1, t1, a6 -+ -+ srl.d t6, t2, a5 -+ sll.d t2, t2, a6 -+ -+ srl.d t7, t3, a5 -+ sll.d t3, t3, a6 -+ -+ or t0, a7, t0 -+ or t1, t4, t1 -+ or t2, t5, t2 -+ or t3, t6, t3 -+ -+ ld.d t4, a1, 32 -+ ld.d t5, a1, 40 -+ ld.d t6, a1, 48 -+ ld.d a7, a1, 56 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ st.d t2, a4, 16 -+ st.d t3, a4, 24 -+ -+ addi.d a1, a1, 64 -+ -+ srl.d t0, t4, a5 -+ sll.d t4, t4, a6 -+ -+ srl.d t1, t5, a5 -+ sll.d t5, t5, a6 -+ -+ srl.d t2, t6, a5 -+ sll.d t6, t6, a6 -+ -+ sll.d t3, a7, a6 -+ srl.d a7, a7, a5 -+ -+ or t4, t7, t4 -+ or t5, t0, t5 -+ or t6, t1, t6 -+ or t3, t2, t3 -+ -+ st.d t4, a4, 32 -+ st.d t5, a4, 40 -+ st.d t6, a4, 48 -+ st.d t3, a4, 56 -+ -+ addi.d a4, a4, 64 -+ bne a3, a1, L(un_long_bytes) -+ -+L(un_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(un_less_32bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ -+ addi.d a1, a1, 32 -+ addi.d a2, a2, -32 -+ -+ srl.d t4, t0, a5 -+ sll.d t0, t0, a6 -+ -+ srl.d t5, t1, a5 -+ sll.d t1, t1, a6 -+ -+ srl.d t6, t2, a5 -+ sll.d t2, t2, a6 -+ -+ or t0, a7, t0 -+ -+ srl.d a7, t3, a5 -+ sll.d t3, t3, a6 -+ -+ or t1, t4, t1 -+ or t2, t5, t2 -+ or t3, t6, t3 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ st.d t2, a4, 16 -+ st.d t3, a4, 24 -+ -+ addi.d a4, a4, 32 -+ -+L(un_less_32bytes): -+ srai.d a3, a2, 4 -+ beqz a3, L(un_less_16bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ -+ addi.d a1, a1, 16 -+ addi.d a2, a2, -16 -+ -+ srl.d t2, t0, a5 -+ sll.d t3, t0, a6 -+ -+ sll.d t4, t1, a6 -+ or t3, a7, t3 -+ or t4, t2, t4 -+ srl.d a7, t1, a5 -+ -+ st.d t3, a4, 0 -+ st.d t4, a4, 8 -+ -+ addi.d a4, a4, 16 -+ -+L(un_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(un_less_8bytes) -+ -+ ld.d t0, a1, 0 -+ -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ sll.d t1, t0, a6 -+ or t2, a7, t1 -+ srl.d a7, t0, a5 -+ -+ st.d t2, a4, 0 -+ addi.d a4, a4, 8 -+ -+L(un_less_8bytes): -+ beqz a2, L(un_less_1byte) -+ bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 -+ -+ # combine data in memory and 
a7(remaining part) -+ ld.d t0, a1, 0 -+ sll.d t0, t0, a6 -+ or a7, a7, t0 -+ -+1: -+ srai.d a3, a2, 2 -+ beqz a3, L(un_less_4bytes) -+ -+ addi.d a2, a2, -4 -+ st.w a7, a4, 0 -+ addi.d a4, a4, 4 -+ srai.d a7, a7, 32 -+ -+L(un_less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(un_less_2bytes) -+ -+ addi.d a2, a2, -2 -+ st.h a7, a4, 0 -+ addi.d a4, a4, 2 -+ srai.d a7, a7, 16 - -+L(un_less_2bytes): -+ beqz a2, L(un_less_1byte) -+ st.b a7, a4, 0 -+ -+L(un_less_1byte): -+ jr ra -+ -+# Bytes copying for data less than 16 bytes -+L(short_data): -+ pcaddi t1, 36 -+ slli.d t2, a2, 3 -+ add.d a4, a0, a2 -+ sub.d t1, t1, t2 -+ add.d a1, a1, a2 -+ jr t1 -+ -+L(short_15_bytes): -+ ld.b t0, a1, -15 -+ st.b t0, a4, -15 -+L(short_14_bytes): -+ ld.b t0, a1, -14 -+ st.b t0, a4, -14 -+L(short_13_bytes): -+ ld.b t0, a1, -13 -+ st.b t0, a4, -13 -+L(short_12_bytes): -+ ld.b t0, a1, -12 -+ st.b t0, a4, -12 -+L(short_11_bytes): -+ ld.b t0, a1, -11 -+ st.b t0, a4, -11 -+L(short_10_bytes): -+ ld.b t0, a1, -10 -+ st.b t0, a4, -10 -+L(short_9_bytes): -+ ld.b t0, a1, -9 -+ st.b t0, a4, -9 -+L(short_8_bytes): -+ ld.b t0, a1, -8 -+ st.b t0, a4, -8 -+L(short_7_bytes): -+ ld.b t0, a1, -7 -+ st.b t0, a4, -7 -+L(short_6_bytes): -+ ld.b t0, a1, -6 -+ st.b t0, a4, -6 -+L(short_5_bytes): -+ ld.b t0, a1, -5 -+ st.b t0, a4, -5 -+L(short_4_bytes): -+ ld.b t0, a1, -4 -+ st.b t0, a4, -4 -+L(short_3_bytes): -+ ld.b t0, a1, -3 -+ st.b t0, a4, -3 -+L(short_2_bytes): -+ ld.b t0, a1, -2 -+ st.b t0, a4, -2 -+L(short_1_bytes): -+ ld.b t0, a1, -1 -+ st.b t0, a4, -1 -+ jr ra -+ -+L(copy_back): -+ srai.d a3, a2, 4 -+ beqz a3, L(back_short_data) # less than 16 bytes -+ -+ add.d a4, a0, a2 # store the tail of dest -+ add.d a1, a1, a2 # store the tail of src -+ -+ andi a5, a4, 0x7 -+ andi a6, a1, 0x7 -+ beqz a5, L(back_check_align) -+ -+ # make dest aligned 8 bytes -+ sub.d a2, a2, a5 -+ sub.d a1, a1, a5 -+ sub.d a4, a4, a5 -+ -+ pcaddi t1, 18 -+ slli.d t3, a5, 3 -+ sub.d t1, t1, t3 -+ jr t1 -+ -+ ld.b t0, a1, 6 -+ st.b t0, a4, 6 -+ ld.b t0, a1, 5 -+ st.b t0, a4, 5 -+ ld.b t0, a1, 4 -+ st.b t0, a4, 4 -+ ld.b t0, a1, 3 -+ st.b t0, a4, 3 -+ ld.b t0, a1, 2 -+ st.b t0, a4, 2 -+ ld.b t0, a1, 1 -+ st.b t0, a4, 1 -+ ld.b t0, a1, 0 -+ st.b t0, a4, 0 -+ -+L(back_check_align): -+ bne a5, a6, L(back_unalign) -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(back_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(back_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ sub.d a5, a1, t0 -+ -+L(back_loop_64bytes): -+ LD_64(a1, -64) -+ addi.d a1, a1, -64 -+ ST_64(a4, -64) -+ -+ addi.d a4, a4, -64 -+ bne a1, a5, L(back_loop_64bytes) -+ -+L(back_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(back_less_32bytes) -+ -+ ld.d t0, a1, -32 -+ ld.d t1, a1, -24 -+ ld.d t2, a1, -16 -+ ld.d t3, a1, -8 -+ -+ addi.d a1, a1, -32 -+ addi.d a2, a2, -32 -+ -+ st.d t0, a4, -32 -+ st.d t1, a4, -24 -+ st.d t2, a4, -16 -+ st.d t3, a4, -8 -+ -+ addi.d a4, a4, -32 -+ -+L(back_less_32bytes): -+ srai.d a3, a2, 4 -+ beqz a3, L(back_less_16bytes) -+ -+ ld.d t0, a1, -16 -+ ld.d t1, a1, -8 -+ -+ addi.d a2, a2, -16 -+ addi.d a1, a1, -16 -+ -+ st.d t0, a4, -16 -+ st.d t1, a4, -8 -+ addi.d a4, a4, -16 -+ -+L(back_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(back_less_8bytes) -+ -+ ld.d t0, a1, -8 -+ addi.d a2, a2, -8 -+ addi.d a1, a1, -8 -+ -+ st.d t0, a4, -8 -+ addi.d a4, a4, -8 -+ -+L(back_less_8bytes): -+ srai.d a3, a2, 2 -+ beqz a3, L(back_less_4bytes) -+ -+ ld.w t0, a1, -4 -+ addi.d a2, a2, -4 -+ addi.d a1, a1, -4 -+ -+ st.w t0, a4, -4 -+ addi.d a4, a4, -4 -+ -+L(back_less_4bytes): -+ srai.d 
a3, a2, 1 -+ beqz a3, L(back_less_2bytes) -+ -+ ld.h t0, a1, -2 -+ addi.d a2, a2, -2 -+ addi.d a1, a1, -2 -+ -+ st.h t0, a4, -2 -+ addi.d a4, a4, -2 -+ -+L(back_less_2bytes): -+ beqz a2, L(back_less_1byte) -+ -+ ld.b t0, a1, -1 -+ st.b t0, a4, -1 -+ -+L(back_less_1byte): -+ jr ra -+ -+L(back_unalign): -+ andi t8, a1, 0x7 -+ bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned -+ -+ sub.d a6, zero, t8 -+ -+ ld.d t0, a1, 0 -+ slli.d a6, a6, 3 -+ slli.d a5, t8, 3 -+ sll.d a7, t0, a6 -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(back_un_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(back_un_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ sub.d a3, a1, t0 -+ -+L(back_un_long_bytes): -+ ld.d t0, a1, -8 -+ ld.d t1, a1, -16 -+ ld.d t2, a1, -24 -+ ld.d t3, a1, -32 -+ -+ sll.d t4, t0, a6 -+ srl.d t0, t0, a5 -+ -+ sll.d t5, t1, a6 -+ srl.d t1, t1, a5 -+ -+ sll.d t6, t2, a6 -+ srl.d t2, t2, a5 -+ -+ sll.d t7, t3, a6 -+ srl.d t3, t3, a5 -+ -+ or t0, t0, a7 -+ or t1, t1, t4 -+ or t2, t2, t5 -+ or t3, t3, t6 -+ -+ ld.d t4, a1, -40 -+ ld.d t5, a1, -48 -+ ld.d t6, a1, -56 -+ ld.d a7, a1, -64 -+ st.d t0, a4, -8 -+ st.d t1, a4, -16 -+ st.d t2, a4, -24 -+ st.d t3, a4, -32 -+ -+ addi.d a1, a1, -64 -+ -+ sll.d t0, t4, a6 -+ srl.d t4, t4, a5 -+ -+ sll.d t1, t5, a6 -+ srl.d t5, t5, a5 -+ -+ sll.d t2, t6, a6 -+ srl.d t6, t6, a5 -+ -+ srl.d t3, a7, a5 -+ sll.d a7, a7, a6 -+ -+ or t4, t7, t4 -+ or t5, t0, t5 -+ or t6, t1, t6 -+ or t3, t2, t3 -+ -+ st.d t4, a4, -40 -+ st.d t5, a4, -48 -+ st.d t6, a4, -56 -+ st.d t3, a4, -64 -+ -+ addi.d a4, a4, -64 -+ bne a3, a1, L(back_un_long_bytes) -+ -+L(back_un_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(back_un_less_32bytes) -+ -+ ld.d t0, a1, -8 -+ ld.d t1, a1, -16 -+ ld.d t2, a1, -24 -+ ld.d t3, a1, -32 -+ -+ addi.d a1, a1, -32 -+ addi.d a2, a2, -32 -+ -+ sll.d t4, t0, a6 -+ srl.d t0, t0, a5 -+ -+ sll.d t5, t1, a6 -+ srl.d t1, t1, a5 -+ -+ sll.d t6, t2, a6 -+ srl.d t2, t2, a5 -+ -+ or t0, a7, t0 -+ -+ sll.d a7, t3, a6 -+ srl.d t3, t3, a5 -+ -+ or t1, t4, t1 -+ or t2, t5, t2 -+ or t3, t6, t3 -+ -+ st.d t0, a4, -8 -+ st.d t1, a4, -16 -+ st.d t2, a4, -24 -+ st.d t3, a4, -32 -+ -+ addi.d a4, a4, -32 -+ -+L(back_un_less_32bytes): -+ srai.d a3, a2, 4 -+ beqz a3, L(back_un_less_16bytes) -+ -+ ld.d t0, a1, -8 -+ ld.d t1, a1, -16 -+ -+ addi.d a1, a1, -16 -+ addi.d a2, a2, -16 -+ -+ sll.d t2, t0, a6 -+ srl.d t3, t0, a5 -+ -+ srl.d t4, t1, a5 -+ or t3, a7, t3 -+ or t4, t2, t4 -+ sll.d a7, t1, a6 -+ -+ st.d t3, a4, -8 -+ st.d t4, a4, -16 -+ -+ addi.d a4, a4, -16 -+ -+L(back_un_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(back_un_less_8bytes) -+ -+ ld.d t0, a1, -8 -+ -+ addi.d a1, a1, -8 -+ addi.d a2, a2, -8 -+ -+ srl.d t1, t0, a5 -+ or t2, a7, t1 -+ sll.d a7, t0, a6 -+ -+ st.d t2, a4, -8 -+ addi.d a4, a4, -8 -+ -+L(back_un_less_8bytes): -+ beqz a2, L(back_end) -+ bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 -+ -+ # combine data in memory and a7(remaining part) -+ ld.d t0, a1, -8 -+ srl.d t0, t0, a5 -+ or a7, a7, t0 -+ -+1: -+ srai.d a3, a2, 2 -+ beqz a3, L(back_un_less_4bytes) -+ -+ srai.d t0, a7, 32 -+ addi.d a2, a2, -4 -+ st.w t0, a4, -4 -+ addi.d a4, a4, -4 -+ slli.d a7, a7, 32 -+ -+L(back_un_less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(back_un_less_2bytes) -+ srai.d t0, a7, 48 -+ addi.d a2, a2, -2 -+ st.h t0, a4, -2 -+ addi.d a4, a4, -2 -+ slli.d a7, a7, 16 -+L(back_un_less_2bytes): -+ beqz a2, L(back_un_less_1byte) -+ srai.d t0, a7, 56 -+ st.b t0, a4, -1 -+L(back_un_less_1byte): -+ jr ra -+ -+L(back_short_data): -+ pcaddi t1, 34 -+ 
slli.d t2, a2, 3 -+ sub.d t1, t1, t2 -+ jr t1 -+ -+ ld.b t0, a1, 14 -+ st.b t0, a0, 14 -+ ld.b t0, a1, 13 -+ st.b t0, a0, 13 -+ ld.b t0, a1, 12 -+ st.b t0, a0, 12 -+ ld.b t0, a1, 11 -+ st.b t0, a0, 11 -+ ld.b t0, a1, 10 -+ st.b t0, a0, 10 -+ ld.b t0, a1, 9 -+ st.b t0, a0, 9 -+ ld.b t0, a1, 8 -+ st.b t0, a0, 8 -+ ld.b t0, a1, 7 -+ st.b t0, a0, 7 -+ ld.b t0, a1, 6 -+ st.b t0, a0, 6 -+ ld.b t0, a1, 5 -+ st.b t0, a0, 5 -+ ld.b t0, a1, 4 -+ st.b t0, a0, 4 -+ ld.b t0, a1, 3 -+ st.b t0, a0, 3 -+ ld.b t0, a1, 2 -+ st.b t0, a0, 2 -+ ld.b t0, a1, 1 -+ st.b t0, a0, 1 -+ ld.b t0, a1, 0 -+ st.b t0, a0, 0 -+L(back_end): -+ jr ra -+ -+END(MEMCPY_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCPY_NAME) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S -index da2f5ada..412ee849 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S -@@ -1,9 +1,169 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define MEMSET_NAME __memset_aligned -- -+#else -+#define MEMSET_NAME memset - #endif - --#include "../memset.S" -+#define ST_64(n) \ -+ st.d a1, a0, n; \ -+ st.d a1, a0, n+8; \ -+ st.d a1, a0, n+16; \ -+ st.d a1, a0, n+24; \ -+ st.d a1, a0, n+32; \ -+ st.d a1, a0, n+40; \ -+ st.d a1, a0, n+48; \ -+ st.d a1, a0, n+56; -+ -+LEAF(MEMSET_NAME, 6) -+ move t0, a0 -+ andi a3, a0, 0x7 -+ li.w t6, 16 -+ beqz a3, L(align) -+ blt a2, t6, L(short_data) -+ -+L(make_align): -+ li.w t8, 8 -+ sub.d t2, t8, a3 -+ pcaddi t1, 11 -+ slli.d t3, t2, 2 -+ sub.d t1, t1, t3 -+ jirl zero, t1, 0 -+ -+L(al7): -+ st.b a1, t0, 6 -+L(al6): -+ st.b a1, t0, 5 -+L(al5): -+ st.b a1, t0, 4 -+L(al4): -+ st.b a1, t0, 3 -+L(al3): -+ st.b a1, t0, 2 -+L(al2): -+ st.b a1, t0, 1 -+L(al1): -+ st.b a1, t0, 0 -+L(al0): -+ add.d t0, t0, t2 -+ sub.d a2, a2, t2 -+ -+L(align): -+ bstrins.d a1, a1, 15, 8 -+ bstrins.d a1, a1, 31, 16 -+ bstrins.d a1, a1, 63, 32 -+ -+ blt a2, t6, L(less_16bytes) -+ -+ andi a4, a2, 0x3f -+ beq a4, a2, L(less_64bytes) -+ -+ sub.d t1, a2, a4 -+ move a2, a4 -+ add.d a5, t0, t1 -+ -+L(loop_64bytes): -+ addi.d t0, t0, 64 -+ st.d a1, t0, -64 -+ st.d a1, t0, -56 -+ st.d a1, t0, -48 -+ st.d a1, t0, -40 -+ st.d a1, t0, -32 -+ st.d a1, t0, -24 -+ st.d a1, t0, -16 -+ st.d a1, t0, -8 -+ bne t0, a5, L(loop_64bytes) -+ -+L(less_64bytes): -+ srai.d a4, a2, 5 -+ beqz a4, L(less_32bytes) -+ addi.d a2, a2, -32 -+ st.d a1, t0, 0 -+ st.d a1, t0, 8 -+ st.d a1, t0, 16 -+ st.d a1, t0, 24 -+ addi.d t0, t0, 32 -+L(less_32bytes): -+ blt a2, t6, L(less_16bytes) -+ addi.d a2, a2, -16 -+ st.d a1, t0, 0 -+ st.d a1, t0, 8 -+ addi.d t0, t0, 16 -+L(less_16bytes): -+ srai.d a4, a2, 3 -+ beqz a4, L(less_8bytes) -+ addi.d a2, a2, -8 -+ st.d a1, t0, 0 -+ addi.d t0, t0, 8 -+L(less_8bytes): -+ beqz a2, L(less_1byte) -+ srai.d a4, a2, 2 -+ beqz a4, L(less_4bytes) -+ addi.d a2, a2, -4 -+ st.w a1, t0, 0 -+ addi.d t0, t0, 4 -+L(less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(less_2bytes) -+ addi.d a2, a2, -2 -+ st.h a1, t0, 0 -+ addi.d t0, t0, 2 -+L(less_2bytes): -+ beqz a2, L(less_1byte) -+ st.b a1, t0, 0 -+L(less_1byte): -+ jr ra -+ -+L(short_data): -+ pcaddi t1, 19 -+ slli.d t3, a2, 2 -+ sub.d t1, t1, t3 -+ jirl zero, t1, 0 -+L(short_15): -+ st.b a1, a0, 14 -+ -+L(short_14): -+ st.b a1, a0, 13 -+L(short_13): -+ st.b a1, a0, 12 -+L(short_12): -+ st.b a1, a0, 11 -+L(short_11): -+ st.b a1, a0, 10 -+L(short_10): -+ st.b a1, a0, 9 -+L(short_9): -+ st.b a1, a0, 8 
-+L(short_8): -+ st.b a1, a0, 7 -+L(short_7): -+ st.b a1, a0, 6 -+L(short_6): -+ st.b a1, a0, 5 -+L(short_5): -+ st.b a1, a0, 4 -+L(short_4): -+ st.b a1, a0, 3 -+L(short_3): -+ st.b a1, a0, 2 -+L(short_2): -+ st.b a1, a0, 1 -+L(short_1): -+ st.b a1, a0, 0 -+L(short_0): -+ jr ra -+ -+END(MEMSET_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMSET_NAME) -+#endif - -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S -index 0b46b4ca..a13e293f 100644 ---- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S -@@ -1,7 +1,115 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) - #define RAWMEMCHR_NAME __rawmemchr_aligned -+#else -+#define RAWMEMCHR_NAME __rawmemchr - #endif - --#include "../rawmemchr.S" -+LEAF(RAWMEMCHR_NAME, 6) -+ andi t1, a0, 0x7 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ bstrins.d a1, a1, 15, 8 -+ -+ ld.d t0, a0, 0 -+ slli.d t1, t1, 3 -+ ori a2, a2, 0x101 -+ bstrins.d a1, a1, 31, 16 -+ -+ li.w t8, -1 -+ bstrins.d a1, a1, 63, 32 -+ bstrins.d a2, a2, 63, 32 -+ sll.d t2, t8, t1 -+ -+ sll.d t3, a1, t1 -+ orn t0, t0, t2 -+ slli.d a3, a2, 7 -+ beqz a1, L(find_zero) -+ -+ xor t0, t0, t3 -+ sub.d t1, t0, a2 -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ -+ bnez t3, L(count_pos) -+ addi.d a0, a0, 8 -+ -+L(loop): -+ ld.d t0, a0, 0 -+ xor t0, t0, a1 -+ -+ sub.d t1, t0, a2 -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ bnez t3, L(count_pos) -+ -+ ld.d t0, a0, 8 -+ addi.d a0, a0, 16 -+ xor t0, t0, a1 -+ sub.d t1, t0, a2 -+ -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ beqz t3, L(loop) -+ addi.d a0, a0, -8 -+L(count_pos): -+ ctz.d t0, t3 -+ srli.d t0, t0, 3 -+ add.d a0, a0, t0 -+ jr ra -+ -+L(loop_7bit): -+ ld.d t0, a0, 0 -+L(find_zero): -+ sub.d t1, t0, a2 -+ and t2, t1, a3 -+ bnez t2, L(more_check) -+ -+ ld.d t0, a0, 8 -+ addi.d a0, a0, 16 -+ sub.d t1, t0, a2 -+ and t2, t1, a3 -+ -+ beqz t2, L(loop_7bit) -+ addi.d a0, a0, -8 -+ -+L(more_check): -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ bnez t3, L(count_pos) -+ addi.d a0, a0, 8 -+ -+L(loop_8bit): -+ ld.d t0, a0, 0 -+ -+ sub.d t1, t0, a2 -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ bnez t3, L(count_pos) -+ -+ ld.d t0, a0, 8 -+ addi.d a0, a0, 16 -+ sub.d t1, t0, a2 -+ -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ beqz t3, L(loop_8bit) -+ -+ addi.d a0, a0, -8 -+ b L(count_pos) -+ -+END(RAWMEMCHR_NAME) -+ -+#ifdef _LIBC -+weak_alias (__rawmemchr, rawmemchr) -+libc_hidden_builtin_def (__rawmemchr) -+#endif - -diff --git a/sysdeps/loongarch/lp64/rawmemchr.S b/sysdeps/loongarch/lp64/rawmemchr.S -deleted file mode 100644 -index ef1db7ed..00000000 ---- a/sysdeps/loongarch/lp64/rawmemchr.S -+++ /dev/null -@@ -1,113 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef RAWMEMCHR_NAME --# define RAWMEMCHR_NAME __rawmemchr --#endif -- -- --LEAF(RAWMEMCHR_NAME, 6) -- andi t1, a0, 0x7 -- bstrins.d a0, zero, 2, 0 -- lu12i.w a2, 0x01010 -- bstrins.d a1, a1, 15, 8 -- -- ld.d t0, a0, 0 -- slli.d t1, t1, 3 -- ori a2, a2, 0x101 -- bstrins.d a1, a1, 31, 16 -- -- li.w t8, -1 -- bstrins.d a1, a1, 63, 32 -- bstrins.d a2, a2, 63, 32 -- sll.d t2, t8, t1 -- -- sll.d t3, a1, t1 -- orn t0, t0, t2 -- slli.d a3, a2, 7 -- beqz a1, L(find_zero) -- -- xor t0, t0, t3 -- sub.d t1, t0, a2 -- andn t2, a3, t0 -- and t3, t1, t2 -- -- bnez t3, L(count_pos) -- addi.d a0, a0, 8 -- --L(loop): -- ld.d t0, a0, 0 -- xor t0, t0, a1 -- -- sub.d t1, t0, a2 -- andn 
t2, a3, t0 -- and t3, t1, t2 -- bnez t3, L(count_pos) -- -- ld.d t0, a0, 8 -- addi.d a0, a0, 16 -- xor t0, t0, a1 -- sub.d t1, t0, a2 -- -- andn t2, a3, t0 -- and t3, t1, t2 -- beqz t3, L(loop) -- addi.d a0, a0, -8 --L(count_pos): -- ctz.d t0, t3 -- srli.d t0, t0, 3 -- add.d a0, a0, t0 -- jr ra -- --L(loop_7bit): -- ld.d t0, a0, 0 --L(find_zero): -- sub.d t1, t0, a2 -- and t2, t1, a3 -- bnez t2, L(more_check) -- -- ld.d t0, a0, 8 -- addi.d a0, a0, 16 -- sub.d t1, t0, a2 -- and t2, t1, a3 -- -- beqz t2, L(loop_7bit) -- addi.d a0, a0, -8 -- --L(more_check): -- andn t2, a3, t0 -- and t3, t1, t2 -- bnez t3, L(count_pos) -- addi.d a0, a0, 8 -- --L(loop_8bit): -- ld.d t0, a0, 0 -- -- sub.d t1, t0, a2 -- andn t2, a3, t0 -- and t3, t1, t2 -- bnez t3, L(count_pos) -- -- ld.d t0, a0, 8 -- addi.d a0, a0, 16 -- sub.d t1, t0, a2 -- -- andn t2, a3, t0 -- and t3, t1, t2 -- beqz t3, L(loop_8bit) -- -- addi.d a0, a0, -8 -- b L(count_pos) -- --END(RAWMEMCHR_NAME) -- --#ifdef _LIBC --weak_alias (__rawmemchr, rawmemchr) --libc_hidden_builtin_def (__rawmemchr) --#endif --- -2.33.0 - diff --git a/glibc-2.28-Refactor-code-of-st-r-p-functions.patch b/glibc-2.28-Refactor-code-of-st-r-p-functions.patch deleted file mode 100644 index 7c453e7..0000000 --- a/glibc-2.28-Refactor-code-of-st-r-p-functions.patch +++ /dev/null @@ -1,2770 +0,0 @@ -From b720fd44df475685ea164491d76c42e127aab3ea Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Wed, 21 Jun 2023 10:49:39 +0800 -Subject: [PATCH 07/14] glibc-2.28: Refactor code of st{r,p}* functions. - -Change-Id: Ife977373e9ba071b284ee19ca4ba121bc27d5834 -Signed-off-by: ticat_fp ---- - .../loongarch/lp64/multiarch/stpcpy-aligned.S | 179 +++++++++++- - .../loongarch/lp64/multiarch/strchr-aligned.S | 91 ++++++- - .../lp64/multiarch/strchrnul-aligned.S | 94 ++++++- - .../loongarch/lp64/multiarch/strcmp-aligned.S | 225 ++++++++++++++- - .../loongarch/lp64/multiarch/strcpy-aligned.S | 173 +++++++++++- - .../loongarch/lp64/multiarch/strlen-aligned.S | 85 +++++- - .../lp64/multiarch/strncmp-aligned.S | 256 +++++++++++++++++- - .../lp64/multiarch/strnlen-aligned.S | 82 +++++- - .../lp64/multiarch/strrchr-aligned.S | 105 ++++++- - sysdeps/loongarch/lp64/stpcpy.S | 179 ------------ - sysdeps/loongarch/lp64/strchr.S | 89 ------ - sysdeps/loongarch/lp64/strchrnul.S | 94 ------- - sysdeps/loongarch/lp64/strcmp.S | 227 ---------------- - sysdeps/loongarch/lp64/strcpy.S | 173 ------------ - sysdeps/loongarch/lp64/strlen.S | 85 ------ - sysdeps/loongarch/lp64/strncmp.S | 256 ------------------ - sysdeps/loongarch/lp64/strnlen.S | 82 ------ - sysdeps/loongarch/lp64/strrchr.S | 105 ------- - 18 files changed, 1264 insertions(+), 1316 deletions(-) - delete mode 100644 sysdeps/loongarch/lp64/stpcpy.S - delete mode 100644 sysdeps/loongarch/lp64/strchr.S - delete mode 100644 sysdeps/loongarch/lp64/strchrnul.S - delete mode 100644 sysdeps/loongarch/lp64/strcmp.S - delete mode 100644 sysdeps/loongarch/lp64/strcpy.S - delete mode 100644 sysdeps/loongarch/lp64/strlen.S - delete mode 100644 sysdeps/loongarch/lp64/strncmp.S - delete mode 100644 sysdeps/loongarch/lp64/strnlen.S - delete mode 100644 sysdeps/loongarch/lp64/strrchr.S - -diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S -index 3d134e3f..7109b0f0 100644 ---- a/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S -@@ -1,8 +1,181 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN 
(libc) -- - #define STPCPY_NAME __stpcpy_aligned -- -+#else -+#define STPCPY_NAME __stpcpy - #endif - --#include "../stpcpy.S" -+LEAF(STPCPY_NAME, 6) -+ andi a3, a0, 0x7 -+ beqz a3, L(dest_align) -+ sub.d a5, a1, a3 -+ addi.d a5, a5, 8 -+ -+L(make_dest_align): -+ ld.b t0, a1, 0 -+ addi.d a1, a1, 1 -+ st.b t0, a0, 0 -+ addi.d a0, a0, 1 -+ -+ beqz t0, L(al_out) -+ bne a1, a5, L(make_dest_align) -+ -+L(dest_align): -+ andi a4, a1, 7 -+ bstrins.d a1, zero, 2, 0 -+ -+ lu12i.w t5, 0x1010 -+ ld.d t0, a1, 0 -+ ori t5, t5, 0x101 -+ bstrins.d t5, t5, 63, 32 -+ -+ slli.d t6, t5, 0x7 -+ bnez a4, L(unalign) -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ bnez t3, L(al_end) -+ -+L(al_loop): -+ st.d t0, a0, 0 -+ ld.d t0, a1, 8 -+ -+ addi.d a1, a1, 8 -+ addi.d a0, a0, 8 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ beqz t3, L(al_loop) -+ -+L(al_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest -+ -+ andi a3, t1, 8 -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+ -+L(al_end_8): -+ beqz a3, L(al_end_4) -+ st.d t0, a0, 0 -+ addi.d a0, a0, 7 -+ jr ra -+L(al_end_4): -+ beqz a4, L(al_end_2) -+ st.w t0, a0, 0 -+ addi.d a0, a0, 4 -+ srli.d t0, t0, 32 -+L(al_end_2): -+ beqz a5, L(al_end_1) -+ st.h t0, a0, 0 -+ addi.d a0, a0, 2 -+ srli.d t0, t0, 16 -+L(al_end_1): -+ beqz a6, L(al_out) -+ st.b t0, a0, 0 -+ addi.d a0, a0, 1 -+L(al_out): -+ addi.d a0, a0, -1 -+ jr ra -+ -+L(unalign): -+ slli.d a5, a4, 3 -+ li.d t1, -1 -+ sub.d a6, zero, a5 -+ -+ srl.d a7, t0, a5 -+ sll.d t7, t1, a6 -+ -+ or t0, a7, t7 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ and t3, t1, t2 -+ -+ bnez t3, L(un_end) -+ -+ ld.d t4, a1, 8 -+ addi.d a1, a1, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ bnez t3, L(un_end_with_remaining) -+ -+L(un_loop): -+ srl.d a7, t4, a5 -+ -+ ld.d t4, a1, 8 -+ addi.d a1, a1, 8 -+ -+ st.d t0, a0, 0 -+ addi.d a0, a0, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ beqz t3, L(un_loop) -+ -+L(un_end_with_remaining): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ sub.d t1, t1, a4 -+ -+ blt t1, zero, L(un_end_less_8) -+ st.d t0, a0, 0 -+ addi.d a0, a0, 8 -+ beqz t1, L(un_out) -+ srl.d t0, t4, a5 # get the remaining part -+ b L(un_end_less_8) -+ -+L(un_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ -+L(un_end_less_8): -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+L(un_end_4): -+ beqz a4, L(un_end_2) -+ st.w t0, a0, 0 -+ addi.d a0, a0, 4 -+ srli.d t0, t0, 32 -+L(un_end_2): -+ beqz a5, L(un_end_1) -+ st.h t0, a0, 0 -+ addi.d a0, a0, 2 -+ srli.d t0, t0, 16 -+L(un_end_1): -+ beqz a6, L(un_out) -+ st.b t0, a0, 0 -+ addi.d a0, a0, 1 -+L(un_out): -+ addi.d a0, a0, -1 -+ jr ra -+ -+END(STPCPY_NAME) -+ -+#ifdef _LIBC -+weak_alias (STPCPY_NAME, stpcpy) -+libc_hidden_builtin_def (STPCPY_NAME) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S -index 92365658..d9bd4587 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S -@@ -1,10 +1,95 @@ - --#if IS_IN (libc) - --#define STRCHR_NAME __strchr_aligned -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - -+#if IS_IN (libc) -+#define STRCHR_NAME __strchr_aligned -+#else -+#define STRCHR_NAME strchr - #endif - --#include "../strchr.S" -+/* char * strchr (const char 
*s1, int c); */ -+ -+LEAF(STRCHR_NAME, 6) -+ slli.d t1, a0, 3 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ ld.d t2, a0, 0 -+ -+ ori a2, a2, 0x101 -+ andi a1, a1, 0xff -+ bstrins.d a2, a2, 63, 32 -+ li.w t0, -1 -+ -+ mul.d a1, a1, a2 # "cccccccc" -+ sll.d t0, t0, t1 -+ slli.d a3, a2, 7 # 0x8080808080808080 -+ orn t2, t2, t0 -+ -+ sll.d t3, a1, t1 -+ xor t4, t2, t3 -+ sub.d a7, t2, a2 -+ andn a6, a3, t2 -+ -+ -+ sub.d a5, t4, a2 -+ andn a4, a3, t4 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ -+ or t0, a6, a5 -+ bnez t0, L(_mc8_a) -+ addi.d a0, a0, 8 -+L(_aloop): -+ ld.d t4, a0, 0 -+ -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ -+ andn a4, a3, t2 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ -+ -+ bnez a7, L(_mc8_a) -+ ld.d t4, a0, 8 -+ addi.d a0, a0, 16 -+ xor t2, t4, a1 -+ -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ andn a4, a3, t2 -+ -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ beqz a7, L(_aloop) -+ -+ addi.d a0, a0, -8 -+ -+L(_mc8_a): -+ ctz.d t0, a5 -+ ctz.d t2, a6 -+ srli.w t0, t0, 3 -+ -+ -+ srli.w t2, t2, 3 -+ sltu t1, t2, t0 -+ add.d a0, a0, t0 -+ masknez a0, a0, t1 -+ -+ jr ra -+END(STRCHR_NAME) - - weak_alias (STRCHR_NAME, index) -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S -index 4fa63ecc..f18b01a3 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S -@@ -1,8 +1,96 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define STRCHRNUL_NAME __strchrnul_aligned -- -+#else -+#define STRCHRNUL_NAME __strchrnul - #endif - --#include "../strchrnul.S" -+/* char * strchrnul (const char *s1, int c); */ -+ -+LEAF(STRCHRNUL_NAME, 6) -+ slli.d t1, a0, 3 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ ld.d t2, a0, 0 -+ -+ ori a2, a2, 0x101 -+ andi a1, a1, 0xff -+ bstrins.d a2, a2, 63, 32 -+ li.w t0, -1 -+ -+ mul.d a1, a1, a2 # "cccccccc" -+ sll.d t0, t0, t1 -+ slli.d a3, a2, 7 # 0x8080808080808080 -+ orn t2, t2, t0 -+ -+ sll.d t3, a1, t1 -+ xor t4, t2, t3 -+ sub.d a7, t2, a2 -+ andn a6, a3, t2 -+ -+ -+ sub.d a5, t4, a2 -+ andn a4, a3, t4 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ -+ or t0, a6, a5 -+ bnez t0, L(_mc8_a) -+ addi.d a0, a0, 8 -+L(_aloop): -+ ld.d t4, a0, 0 -+ -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ -+ andn a4, a3, t2 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ -+ -+ bnez a7, L(_mc8_a) -+ ld.d t4, a0, 8 -+ addi.d a0, a0, 16 -+ xor t2, t4, a1 -+ -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ andn a4, a3, t2 -+ -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ beqz a7, L(_aloop) -+ -+ addi.d a0, a0, -8 -+L(_mc8_a): -+ ctz.d t0, a5 -+ ctz.d t2, a6 -+ srli.w t0, t0, 3 -+ -+ srli.w t2, t2, 3 -+ slt t1, t0, t2 -+ masknez t3, t2, t1 -+ maskeqz t4, t0, t1 -+ -+ or t0, t3, t4 -+ add.d a0, a0, t0 -+ jr ra -+END(STRCHRNUL_NAME) -+ -+#ifdef _LIBC -+weak_alias(STRCHRNUL_NAME, strchrnul) -+libc_hidden_builtin_def (STRCHRNUL_NAME) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S -index f84f52b8..a9b74b0c 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S -@@ -1,8 +1,229 @@ -+/* 2022\06\15 loongarch64 author: chenxiaolong. 
*/ - --#if IS_IN (libc) -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - -+#if IS_IN (libc) - #define STRCMP_NAME __strcmp_aligned -+#else -+#define STRCMP_NAME strcmp -+#endif -+ -+/* int strcmp (const char *s1, const char *s2); */ -+ -+/* Parameters and Results */ -+#define src1 a0 -+#define src2 a1 -+#define result v0 -+LEAF(STRCMP_NAME, 6) -+ xor a4, src1, src2 -+ lu12i.w t5, 0x01010 -+ lu12i.w t6, 0x7f7f7 -+ andi a2, src1, 0x7 -+ -+ ori t5, t5, 0x101 -+ andi a4, a4, 0x7 -+ ori t6, t6, 0xf7f -+ bstrins.d t5, t5, 63, 32 -+ bstrins.d t6, t6, 63, 32 -+ -+ bnez a4, 3f // unaligned -+ beqz a2, 1f // loop aligned -+ -+// mutual aligned -+ bstrins.d src1, zero, 2, 0 -+ bstrins.d src2, zero, 2, 0 -+ slli.d a4, a2, 0x3 -+ ld.d t0, src1, 0 -+ -+ sub.d a4, zero, a4 -+ ld.d t1, src2, 0 -+ addi.d src1, src1, 8 -+ addi.d src2, src2, 8 -+ -+ nor a5, zero, zero -+ srl.d a5, a5, a4 -+ or t0, t0, a5 -+ -+ or t1, t1, a5 -+ b 2f //start realigned -+ -+// loop aligned -+1: -+ ld.d t0, src1, 0 -+ addi.d src1, src1, 8 -+ ld.d t1, src2, 0 -+ addi.d src2, src2, 8 -+ -+// start realigned: -+2: -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ beqz t2, 1b -+ -+ ctz.d t7, t2 -+ bstrins.d t7, zero, 2, 0 -+ srl.d t0, t0, t7 -+ srl.d t1, t1, t7 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ sub.d v0, t0, t1 -+ jr ra -+ -+// unaligned -+3: -+ andi a3, src2, 0x7 -+ slt a5, a2, a3 -+ masknez t8, a2, a5 -+ xor a6, src1, src2 -+ maskeqz a6, a6, t8 -+ xor src1, src1, a6 -+ xor src2, src2, a6 -+ -+ andi a2, src1, 0x7 -+ beqz a2, 4f // src1 is aligned -+ -+//strcmp_unaligned: -+ andi a3, src2, 0x7 -+ bstrins.d src1, zero, 2, 0 -+ bstrins.d src2, zero, 2, 0 -+ nor t3, zero, zero -+ -+ ld.d t0, src1, 0 -+ ld.d t1, src2, 0 -+ sub.d a2, a3, a2 -+ addi.d t2, zero, 8 -+ -+ sub.d a5, t2, a2 -+ sub.d a6, t2, a3 -+ slli.d a5, a5, 0x3 -+ slli.d a6, a6, 0x3 -+ -+ srl.d t4, t3, a6 -+ srl.d a4, t3, a5 -+ rotr.d a7, t0, a5 -+ -+ addi.d src2, src2, 8 -+ addi.d src1, src1, 8 -+ or t1, t1, t4 -+ or t0, a7, t4 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ bnez t2, 7f -+ -+ and a7, a7, a4 -+ slli.d a6, a2, 0x3 -+ nor a4, zero, a4 -+ b 5f -+ -+// src1 is aligned -+4: -+ andi a3, src2, 0x7 -+ ld.d t0, src1, 0 -+ -+ bstrins.d src2, zero, 2, 0 -+ nor t2, zero, zero -+ ld.d t1, src2, 0 -+ -+ addi.d t3, zero, 0x8 -+ sub.d a5, t3, a3 -+ slli.d a5, a5, 0x3 -+ srl.d a4, t2, a5 -+ rotr.d t4, t0, a5 -+ -+ addi.d src2, src2, 8 -+ addi.d src1, src1, 8 -+ or t1, t1, a4 -+ or t0, t4, a4 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ -+ bnez t2, 7f -+ -+ and a7, t4, a4 -+ slli.d a6, a3, 0x3 -+ nor a4, zero, a4 -+ -+// unaligned loop -+// a7: remaining number -+// a6: shift left number -+// a5: shift right number -+// a4: mask for checking remaining number -+5: -+ or t0, a7, a4 -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ bnez t2, 6f -+ -+ ld.d t0, src1, 0 -+ addi.d src1, src1, 8 -+ ld.d t1, src2, 0 -+ addi.d src2, src2, 8 -+ -+ srl.d t7, t0, a5 -+ sll.d t0, t0, a6 -+ or t0, a7, t0 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ bnez t2, 7f -+ -+ or a7, t7, zero -+ b 5b -+ -+6: -+ ld.bu t1, src2, 0 -+ andi t0, a7, 0xff -+ xor t2, t0, t1 -+ srli.d a7, a7, 0x8 -+ masknez t2, t0, t2 -+ addi.d src2, src2, 1 -+ beqz t2, 8f -+ b 6b -+ -+7: -+ ctz.d t7, t2 -+ bstrins.d t7, zero, 2, 0 -+ srl.d t0, t0, t7 -+ srl.d t1, 
t1, t7 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ -+8: -+ sub.d a4, t0, t1 -+ sub.d a5, t1, t0 -+ maskeqz a6, a5, t8 -+ masknez result, a4, t8 -+ or result, result, a6 -+ jr ra -+ -+END(STRCMP_NAME) - -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCMP_NAME) - #endif - --#include "../strcmp.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S -index 4860398b..80954912 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S -@@ -1,8 +1,175 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define STRCPY __strcpy_aligned -- -+#else -+#define STRCPY strcpy - #endif - --#include "../strcpy.S" -+LEAF(STRCPY, 6) -+ andi a3, a0, 0x7 -+ move a2, a0 -+ beqz a3, L(dest_align) -+ sub.d a5, a1, a3 -+ addi.d a5, a5, 8 -+ -+L(make_dest_align): -+ ld.b t0, a1, 0 -+ addi.d a1, a1, 1 -+ st.b t0, a2, 0 -+ beqz t0, L(al_out) -+ -+ addi.d a2, a2, 1 -+ bne a1, a5, L(make_dest_align) -+ -+L(dest_align): -+ andi a4, a1, 7 -+ bstrins.d a1, zero, 2, 0 -+ -+ lu12i.w t5, 0x1010 -+ ld.d t0, a1, 0 -+ ori t5, t5, 0x101 -+ bstrins.d t5, t5, 63, 32 -+ -+ slli.d t6, t5, 0x7 -+ bnez a4, L(unalign) -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ bnez t3, L(al_end) -+ -+L(al_loop): -+ st.d t0, a2, 0 -+ ld.d t0, a1, 8 -+ -+ addi.d a1, a1, 8 -+ addi.d a2, a2, 8 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ beqz t3, L(al_loop) -+ -+L(al_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest -+ -+ andi a3, t1, 8 -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+ -+L(al_end_8): -+ beqz a3, L(al_end_4) -+ st.d t0, a2, 0 -+ jr ra -+L(al_end_4): -+ beqz a4, L(al_end_2) -+ st.w t0, a2, 0 -+ addi.d a2, a2, 4 -+ srli.d t0, t0, 32 -+L(al_end_2): -+ beqz a5, L(al_end_1) -+ st.h t0, a2, 0 -+ addi.d a2, a2, 2 -+ srli.d t0, t0, 16 -+L(al_end_1): -+ beqz a6, L(al_out) -+ st.b t0, a2, 0 -+L(al_out): -+ jr ra -+ -+L(unalign): -+ slli.d a5, a4, 3 -+ li.d t1, -1 -+ sub.d a6, zero, a5 -+ -+ srl.d a7, t0, a5 -+ sll.d t7, t1, a6 -+ -+ or t0, a7, t7 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ and t3, t1, t2 -+ -+ bnez t3, L(un_end) -+ -+ ld.d t4, a1, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ bnez t3, L(un_end_with_remaining) -+ -+L(un_loop): -+ srl.d a7, t4, a5 -+ -+ ld.d t4, a1, 16 -+ addi.d a1, a1, 8 -+ -+ st.d t0, a2, 0 -+ addi.d a2, a2, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ beqz t3, L(un_loop) -+ -+L(un_end_with_remaining): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ sub.d t1, t1, a4 -+ -+ blt t1, zero, L(un_end_less_8) -+ st.d t0, a2, 0 -+ addi.d a2, a2, 8 -+ beqz t1, L(un_out) -+ srl.d t0, t4, a5 # get the remaining part -+ b L(un_end_less_8) -+ -+L(un_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ -+L(un_end_less_8): -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+L(un_end_4): -+ beqz a4, L(un_end_2) -+ st.w t0, a2, 0 -+ addi.d a2, a2, 4 -+ srli.d t0, t0, 32 -+L(un_end_2): -+ beqz a5, L(un_end_1) -+ st.h t0, a2, 0 -+ addi.d a2, a2, 2 -+ srli.d t0, t0, 16 -+L(un_end_1): -+ beqz a6, L(un_out) -+ st.b t0, a2, 0 -+L(un_out): -+ jr ra -+ -+END(STRCPY) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCPY) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S 
b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S -index d31875fd..fcbc4f6a 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S -@@ -1,8 +1,87 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define STRLEN __strlen_aligned -- -+#else -+#define STRLEN strlen - #endif - --#include "../strlen.S" -+LEAF(STRLEN, 6) -+ move a1, a0 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ li.w t0, -1 -+ -+ ld.d t2, a0, 0 -+ andi t1, a1, 0x7 -+ ori a2, a2, 0x101 -+ slli.d t1, t1, 3 -+ -+ bstrins.d a2, a2, 63, 32 -+ sll.d t1, t0, t1 -+ slli.d t3, a2, 7 -+ nor a3, zero, t3 -+ -+ orn t2, t2, t1 -+ sub.d t0, t2, a2 -+ nor t1, t2, a3 -+ and t0, t0, t1 -+ -+ -+ bnez t0, L(count_pos) -+ addi.d a0, a0, 8 -+L(loop_16_7bit): -+ ld.d t2, a0, 0 -+ sub.d t1, t2, a2 -+ -+ and t0, t1, t3 -+ bnez t0, L(more_check) -+ ld.d t2, a0, 8 -+ addi.d a0, a0, 16 -+ -+ sub.d t1, t2, a2 -+ and t0, t1, t3 -+ beqz t0, L(loop_16_7bit) -+ addi.d a0, a0, -8 -+L(more_check): -+ nor t0, t2, a3 -+ -+ and t0, t1, t0 -+ bnez t0, L(count_pos) -+ addi.d a0, a0, 8 -+L(loop_16_8bit): -+ ld.d t2, a0, 0 -+ -+ sub.d t1, t2, a2 -+ nor t0, t2, a3 -+ and t0, t0, t1 -+ bnez t0, L(count_pos) -+ -+ ld.d t2, a0, 8 -+ addi.d a0, a0, 16 -+ sub.d t1, t2, a2 -+ nor t0, t2, a3 -+ -+ and t0, t0, t1 -+ beqz t0, L(loop_16_8bit) -+ addi.d a0, a0, -8 -+L(count_pos): -+ ctz.d t1, t0 -+ sub.d a0, a0, a1 -+ -+ srli.d t1, t1, 3 -+ add.d a0, a0, t1 -+ jr ra -+ -+END(STRLEN) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRLEN) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S -index f371b19e..2cd56c44 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S -@@ -1,8 +1,258 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define STRNCMP __strncmp_aligned -- -+#else -+#define STRNCMP strncmp - #endif - --#include "../strncmp.S" -+/* int strncmp (const char *s1, const char *s2); */ -+ -+LEAF(STRNCMP, 6) -+ beqz a2, L(ret0) -+ xor a4, a0, a1 -+ lu12i.w t5, 0x01010 -+ lu12i.w t6, 0x7f7f7 -+ -+ andi a3, a0, 0x7 -+ ori t5, t5, 0x101 -+ andi a4, a4, 0x7 -+ ori t6, t6, 0xf7f -+ -+ bstrins.d t5, t5, 63, 32 -+ bstrins.d t6, t6, 63, 32 -+ -+ bnez a4, L(unalign) -+ bnez a3, L(mutual_align) -+ -+L(a_loop): -+ ld.d t0, a0, 0 -+ ld.d t1, a1, 0 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ -+ -+ sltui t7, a2, 9 -+ -+L(start_realign): -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ xor t4, t0, t1 -+ -+ and t2, t2, t3 -+ addi.d a2, a2, -8 -+ -+ or t2, t2, t4 -+ or t3, t2, t7 -+ beqz t3, L(a_loop) -+ -+L(end): -+ bge zero, t7, L(out) -+ andi t4, a2, 7 -+ li.d t3, -1 -+ addi.d t4, t4, -1 -+ slli.d t4, t4, 3 -+ sll.d t3, t3, t4 -+ or t2, t2, t3 -+ -+ -+L(out): -+ ctz.d t3, t2 -+ bstrins.d t3, zero, 2, 0 -+ srl.d t0, t0, t3 -+ srl.d t1, t1, t3 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ sub.d a0, t0, t1 -+ jr ra -+ -+L(mutual_align): -+ bstrins.d a0, zero, 2, 0 -+ bstrins.d a1, zero, 2, 0 -+ slli.d a5, a3, 0x3 -+ li.d t2, -1 -+ -+ ld.d t0, a0, 0 -+ ld.d t1, a1, 0 -+ -+ li.d t3, 9 -+ sll.d t2, t2, a5 -+ -+ sub.d t3, t3, a3 -+ addi.d a0, a0, 8 -+ -+ sltu t7, a2, t3 -+ addi.d a1, a1, 8 -+ -+ add.d a2, a2, a3 -+ orn t0, t0, t2 -+ orn t1, t1, t2 -+ b L(start_realign) -+ -+L(ret0): -+ move a0, zero -+ jr ra -+ -+L(unalign): -+ li.d t8, 8 -+ blt a2, t8, 
L(short_cmp) -+ -+ # swap a0 and a1 in case a3 > a4 -+ andi a4, a1, 0x7 -+ sltu t8, a4, a3 -+ xor a6, a0, a1 -+ maskeqz a6, a6, t8 -+ xor a0, a0, a6 -+ xor a1, a1, a6 -+ -+ andi a3, a0, 0x7 -+ andi a4, a1, 0x7 -+ -+ bstrins.d a0, zero, 2, 0 -+ bstrins.d a1, zero, 2, 0 -+ -+ li.d t2, -1 -+ li.d t3, 9 -+ -+ ld.d t0, a0, 0 -+ ld.d t1, a1, 0 -+ -+ sub.d t3, t3, a4 -+ sub.d a3, a4, a3 -+ -+ slli.d t4, a4, 3 -+ slli.d a6, a3, 3 -+ -+ sub.d a5, zero, a6 -+ sltu t7, a2, t3 -+ -+ rotr.d a7, t0, a5 -+ sll.d t4, t2, t4 # mask for first num -+ -+ add.d a2, a2, a4 -+ sll.d a4, t2, a6 # mask for a7 -+ -+ orn t0, a7, t4 -+ orn t1, t1, t4 -+ -+ sub.d t2, t0, t5 -+ nor t4, t0, t6 -+ and t2, t2, t4 -+ -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ -+ or t3, t2, t7 -+ bnez t3, L(un_end) -+ -+ andn a7, a7, a4 -+ addi.d a3, a3, 1 -+ -+L(un_loop): -+ addi.d a2, a2, -8 -+ # in case remaining part has '\0', no more load instructions should be executed on a0 address -+ or t0, a7, a4 -+ sltu t7, a2, a3 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ -+ or t3, t2, t7 -+ bnez t3, L(check_remaining) -+ -+ ld.d t7, a0, 8 -+ ld.d t1, a1, 8 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ -+ sll.d t4, t7, a6 -+ sub.d t2, t1, t5 -+ nor t3, t1, t6 -+ -+ or t0, t4, a7 -+ srl.d a7, t7, a5 -+ -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ -+ sltui t7, a2, 9 -+ or t2, t2, t3 -+ -+ or t3, t2, t7 -+ beqz t3, L(un_loop) -+ b L(un_end) -+ -+L(check_remaining): -+ ld.d t1, a1, 8 -+ xor t3, t1, a7 -+ or t2, t2, t3 -+ -+L(un_end): -+ bge zero, t7, L(un_out) -+ andi t4, a2, 7 -+ li.d t3, -1 -+ -+ addi.d t4, t4, -1 -+ slli.d t4, t4, 3 -+ sll.d t3, t3, t4 -+ or t2, t2, t3 -+ -+L(un_out): -+ ctz.d t3, t2 -+ bstrins.d t3, zero, 2, 0 -+ srl.d t0, t0, t3 -+ srl.d t1, t1, t3 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ -+ sub.d a4, t0, t1 -+ sub.d a5, t1, t0 -+ -+ maskeqz a6, a5, t8 -+ masknez a0, a4, t8 -+ -+ or a0, a0, a6 -+ jr ra -+ -+L(short_cmp): -+ ld.bu t0, a0, 0 -+ ld.bu t1, a1, 0 -+ addi.d a2, a2, -1 -+ -+ xor t2, t0, t1 -+ masknez t2, t0, t2 -+ maskeqz t2, a2, t2 -+ -+ beqz t2, L(short_out) -+ -+ ld.bu t0, a0, 1 -+ ld.bu t1, a1, 1 -+ -+ addi.d a2, a2, -1 -+ addi.d a0, a0, 2 -+ -+ addi.d a1, a1, 2 -+ xor t2, t0, t1 -+ masknez t2, t0, t2 -+ maskeqz t2, a2, t2 -+ -+ bnez t2, L(short_cmp) -+ -+L(short_out): -+ sub.d a0, t0, t1 -+ jr ra -+ -+END(STRNCMP) -+#ifdef _LIBC -+libc_hidden_builtin_def (STRNCMP) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S -index 503442b3..78c8fd5d 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S -@@ -1,8 +1,84 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define STRNLEN __strnlen_aligned -- -+#else -+#define STRNLEN __strnlen - #endif - --#include "../strnlen.S" -+#. before every load, a1(t5) must > 0; -+#. first load with t1 != 0, need to adjust t5; -+#. 
return the less one of both strlen(s) and a1; -+ -+LEAF(STRNLEN, 6) -+ beqz a1, L(out) -+ lu12i.w a2, 0x01010 -+ andi t1, a0, 0x7 -+ move t4, a0 -+ -+ bstrins.d a0, zero, 2, 0 -+ ori a2, a2, 0x101 -+ li.w t0, -1 -+ ld.d t2, a0, 0 -+ -+ slli.d t3, t1, 3 -+ bstrins.d a2, a2, 63, 32 -+ li.w t5, 8 -+ slli.d a3, a2, 7 -+ -+ sub.w t1, t5, t1 -+ sll.d t0, t0, t3 -+ nor a3, zero, a3 -+ orn t2, t2, t0 -+ -+ -+ sub.d t0, t2, a2 -+ nor t3, t2, a3 -+ and t0, t0, t3 -+ bnez t0, L(count_pos) -+ -+ sub.d t5, a1, t1 -+ bgeu t1, a1, L(out) -+L(loop_8bytes): -+ ld.d t2, a0, 8 -+ addi.d a0, a0, 8 -+ -+ sub.d t0, t2, a2 -+ nor t1, t2, a3 -+ sltui t6, t5, 9 -+ and t0, t0, t1 -+ -+ addi.d t5, t5, -8 -+ or t7, t0, t6 -+ beqz t7, L(loop_8bytes) -+L(count_pos): -+ ctz.d t1, t0 -+ -+ -+ sub.d a0, a0, t4 -+ srli.d t1, t1, 3 -+ add.d a0, t1, a0 -+ sltu t0, a0, a1 -+ -+ masknez t1, a1, t0 -+ maskeqz a0, a0, t0 -+ or a0, a0, t1 -+ jr ra -+ -+L(out): -+ move a0, a1 -+ jr ra -+ -+END(STRNLEN) -+ -+#ifdef _LIBC -+weak_alias (STRNLEN, strnlen) -+libc_hidden_builtin_def (STRNLEN) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S -index a58ddde8..6931045b 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S -@@ -1,11 +1,110 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define STRRCHR_NAME __strrchr_aligned -- -+#else -+#define STRRCHR_NAME strrchr - #endif - --#include "../strrchr.S" -+LEAF(STRRCHR_NAME, 6) -+ slli.d t1, a0, 3 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ ld.d t2, a0, 0 // t2 = "5ZZ21abc" -+ -+ ori a2, a2, 0x101 -+ andi a1, a1, 0xff // a1 = "0000000Z" -+ li.d a5, -1 -+ bstrins.d a2, a2, 63, 32 // a2 = 0x0101010101010101 -+ -+ sll.d t1, a5, t1 // t1 = 0xffffffffff000000 -+ mul.d a1, a1, a2 // a1 = "ZZZZZZZZ" -+ orn t2, t2, t1 // t2 = "5ZZ21YYY" -+ slli.d a3, a2, 7 // a3 = 0x8080808080808080 -+ -+ sub.d a4, t2, a2 -+ andn t0, a3, t2 -+ move t3, zero -+ and t0, a4, t0 -+ -+ -+ xor a4, t2, a1 -+ move t5, zero -+ orn a4, a4, t1 -+ bnez t0, L(found_end) -+ -+ sub.d t1, a4, a2 -+ andn t0, a3, a4 -+ and t1, t1, t0 -+ -+L(loop_8bytes): -+ masknez t4, t3, t1 -+ -+ maskeqz t3, t2, t1 -+ ld.d t2, a0, 8 -+ masknez t0, t5, t1 -+ maskeqz t5, a0, t1 -+ -+ or t3, t3, t4 -+ or t5, t0, t5 -+ sub.d t0, t2, a2 -+ andn t1, a3, t2 -+ -+ -+ xor a4, t2, a1 -+ and t0, t0, t1 //t0 hold diff pattern for '\0' -+ sub.d t1, a4, a2 -+ andn t4, a3, a4 -+ -+ and t1, t1, t4 //t1 hold diff pattern for 'a1' -+ addi.d a0, a0, 8 -+ beqz t0, L(loop_8bytes) //ok, neither \0 nor found -+L(found_end): -+ ctz.d t1, t0 -+ -+ xor t3, t3, a1 -+ orn t1, zero, t1 -+ revb.d t3, t3 -+ srl.d t1, a5, t1 // mask for '\0' -+ -+ sub.d t4, t3, a2 -+ orn a4, a4, t1 -+ andn t3, a3, t3 -+ revb.d t2, a4 -+ -+ sub.d t0, t2, a2 -+ andn t1, a3, t2 -+ and t3, t3, t4 -+ and t1, t0, t1 -+ -+ li.d t7, 7 -+ masknez t4, t3, t1 -+ maskeqz t3, t1, t1 -+ masknez t5, t5, t1 -+ -+ or t3, t3, t4 -+ maskeqz t6, a0, t1 -+ ctz.d t0, t3 -+ or t5, t6, t5 -+ -+ srli.d t0, t0, 3 -+ sub.d t0, t7, t0 -+ add.d a0, t5, t0 -+ maskeqz a0, a0, t3 -+ -+ jr ra -+END(STRRCHR_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def(STRRCHR_NAME) -+#endif - - #undef rindex - weak_alias(STRRCHR_NAME, rindex) -diff --git a/sysdeps/loongarch/lp64/stpcpy.S b/sysdeps/loongarch/lp64/stpcpy.S -deleted file mode 100644 -index b6a367dc..00000000 ---- a/sysdeps/loongarch/lp64/stpcpy.S -+++ 
/dev/null -@@ -1,179 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STPCPY_NAME --#define STPCPY_NAME __stpcpy --#endif -- --LEAF(STPCPY_NAME, 6) -- andi a3, a0, 0x7 -- beqz a3, L(dest_align) -- sub.d a5, a1, a3 -- addi.d a5, a5, 8 -- --L(make_dest_align): -- ld.b t0, a1, 0 -- addi.d a1, a1, 1 -- st.b t0, a0, 0 -- addi.d a0, a0, 1 -- -- beqz t0, L(al_out) -- bne a1, a5, L(make_dest_align) -- --L(dest_align): -- andi a4, a1, 7 -- bstrins.d a1, zero, 2, 0 -- -- lu12i.w t5, 0x1010 -- ld.d t0, a1, 0 -- ori t5, t5, 0x101 -- bstrins.d t5, t5, 63, 32 -- -- slli.d t6, t5, 0x7 -- bnez a4, L(unalign) -- sub.d t1, t0, t5 -- andn t2, t6, t0 -- -- and t3, t1, t2 -- bnez t3, L(al_end) -- --L(al_loop): -- st.d t0, a0, 0 -- ld.d t0, a1, 8 -- -- addi.d a1, a1, 8 -- addi.d a0, a0, 8 -- sub.d t1, t0, t5 -- andn t2, t6, t0 -- -- and t3, t1, t2 -- beqz t3, L(al_loop) -- --L(al_end): -- ctz.d t1, t3 -- srli.d t1, t1, 3 -- addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest -- -- andi a3, t1, 8 -- andi a4, t1, 4 -- andi a5, t1, 2 -- andi a6, t1, 1 -- --L(al_end_8): -- beqz a3, L(al_end_4) -- st.d t0, a0, 0 -- addi.d a0, a0, 7 -- jr ra --L(al_end_4): -- beqz a4, L(al_end_2) -- st.w t0, a0, 0 -- addi.d a0, a0, 4 -- srli.d t0, t0, 32 --L(al_end_2): -- beqz a5, L(al_end_1) -- st.h t0, a0, 0 -- addi.d a0, a0, 2 -- srli.d t0, t0, 16 --L(al_end_1): -- beqz a6, L(al_out) -- st.b t0, a0, 0 -- addi.d a0, a0, 1 --L(al_out): -- addi.d a0, a0, -1 -- jr ra -- --L(unalign): -- slli.d a5, a4, 3 -- li.d t1, -1 -- sub.d a6, zero, a5 -- -- srl.d a7, t0, a5 -- sll.d t7, t1, a6 -- -- or t0, a7, t7 -- sub.d t1, t0, t5 -- andn t2, t6, t0 -- and t3, t1, t2 -- -- bnez t3, L(un_end) -- -- ld.d t4, a1, 8 -- addi.d a1, a1, 8 -- -- sub.d t1, t4, t5 -- andn t2, t6, t4 -- sll.d t0, t4, a6 -- and t3, t1, t2 -- -- or t0, t0, a7 -- bnez t3, L(un_end_with_remaining) -- --L(un_loop): -- srl.d a7, t4, a5 -- -- ld.d t4, a1, 8 -- addi.d a1, a1, 8 -- -- st.d t0, a0, 0 -- addi.d a0, a0, 8 -- -- sub.d t1, t4, t5 -- andn t2, t6, t4 -- sll.d t0, t4, a6 -- and t3, t1, t2 -- -- or t0, t0, a7 -- beqz t3, L(un_loop) -- --L(un_end_with_remaining): -- ctz.d t1, t3 -- srli.d t1, t1, 3 -- addi.d t1, t1, 1 -- sub.d t1, t1, a4 -- -- blt t1, zero, L(un_end_less_8) -- st.d t0, a0, 0 -- addi.d a0, a0, 8 -- beqz t1, L(un_out) -- srl.d t0, t4, a5 # get the remaining part -- b L(un_end_less_8) -- --L(un_end): -- ctz.d t1, t3 -- srli.d t1, t1, 3 -- addi.d t1, t1, 1 -- --L(un_end_less_8): -- andi a4, t1, 4 -- andi a5, t1, 2 -- andi a6, t1, 1 --L(un_end_4): -- beqz a4, L(un_end_2) -- st.w t0, a0, 0 -- addi.d a0, a0, 4 -- srli.d t0, t0, 32 --L(un_end_2): -- beqz a5, L(un_end_1) -- st.h t0, a0, 0 -- addi.d a0, a0, 2 -- srli.d t0, t0, 16 --L(un_end_1): -- beqz a6, L(un_out) -- st.b t0, a0, 0 -- addi.d a0, a0, 1 --L(un_out): -- addi.d a0, a0, -1 -- jr ra -- --END(STPCPY_NAME) -- --#ifdef _LIBC --weak_alias (STPCPY_NAME, stpcpy) --libc_hidden_builtin_def (STPCPY_NAME) --#endif -diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S -deleted file mode 100644 -index fde53a30..00000000 ---- a/sysdeps/loongarch/lp64/strchr.S -+++ /dev/null -@@ -1,89 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRCHR_NAME --#define STRCHR_NAME strchr --#endif -- --/* char * strchr (const char *s1, int c); */ -- --LEAF(STRCHR_NAME, 6) -- slli.d t1, a0, 3 -- bstrins.d a0, zero, 2, 0 -- lu12i.w a2, 0x01010 -- ld.d t2, a0, 0 -- -- ori a2, a2, 
0x101 -- andi a1, a1, 0xff -- bstrins.d a2, a2, 63, 32 -- li.w t0, -1 -- -- mul.d a1, a1, a2 # "cccccccc" -- sll.d t0, t0, t1 -- slli.d a3, a2, 7 # 0x8080808080808080 -- orn t2, t2, t0 -- -- sll.d t3, a1, t1 -- xor t4, t2, t3 -- sub.d a7, t2, a2 -- andn a6, a3, t2 -- -- -- sub.d a5, t4, a2 -- andn a4, a3, t4 -- and a6, a7, a6 -- and a5, a5, a4 -- -- or t0, a6, a5 -- bnez t0, L(_mc8_a) -- addi.d a0, a0, 8 --L(_aloop): -- ld.d t4, a0, 0 -- -- xor t2, t4, a1 -- sub.d a7, t4, a2 -- andn a6, a3, t4 -- sub.d a5, t2, a2 -- -- andn a4, a3, t2 -- and a6, a7, a6 -- and a5, a5, a4 -- or a7, a6, a5 -- -- -- bnez a7, L(_mc8_a) -- ld.d t4, a0, 8 -- addi.d a0, a0, 16 -- xor t2, t4, a1 -- -- sub.d a7, t4, a2 -- andn a6, a3, t4 -- sub.d a5, t2, a2 -- andn a4, a3, t2 -- -- and a6, a7, a6 -- and a5, a5, a4 -- or a7, a6, a5 -- beqz a7, L(_aloop) -- -- addi.d a0, a0, -8 -- --L(_mc8_a): -- ctz.d t0, a5 -- ctz.d t2, a6 -- srli.w t0, t0, 3 -- -- -- srli.w t2, t2, 3 -- sltu t1, t2, t0 -- add.d a0, a0, t0 -- masknez a0, a0, t1 -- -- jr ra --END(STRCHR_NAME) -diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S -deleted file mode 100644 -index a5ee09a3..00000000 ---- a/sysdeps/loongarch/lp64/strchrnul.S -+++ /dev/null -@@ -1,94 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRCHRNUL_NAME --#define STRCHRNUL_NAME __strchrnul --#endif -- --/* char * strchrnul (const char *s1, int c); */ -- --LEAF(STRCHRNUL_NAME, 6) -- slli.d t1, a0, 3 -- bstrins.d a0, zero, 2, 0 -- lu12i.w a2, 0x01010 -- ld.d t2, a0, 0 -- -- ori a2, a2, 0x101 -- andi a1, a1, 0xff -- bstrins.d a2, a2, 63, 32 -- li.w t0, -1 -- -- mul.d a1, a1, a2 # "cccccccc" -- sll.d t0, t0, t1 -- slli.d a3, a2, 7 # 0x8080808080808080 -- orn t2, t2, t0 -- -- sll.d t3, a1, t1 -- xor t4, t2, t3 -- sub.d a7, t2, a2 -- andn a6, a3, t2 -- -- -- sub.d a5, t4, a2 -- andn a4, a3, t4 -- and a6, a7, a6 -- and a5, a5, a4 -- -- or t0, a6, a5 -- bnez t0, L(_mc8_a) -- addi.d a0, a0, 8 --L(_aloop): -- ld.d t4, a0, 0 -- -- xor t2, t4, a1 -- sub.d a7, t4, a2 -- andn a6, a3, t4 -- sub.d a5, t2, a2 -- -- andn a4, a3, t2 -- and a6, a7, a6 -- and a5, a5, a4 -- or a7, a6, a5 -- -- -- bnez a7, L(_mc8_a) -- ld.d t4, a0, 8 -- addi.d a0, a0, 16 -- xor t2, t4, a1 -- -- sub.d a7, t4, a2 -- andn a6, a3, t4 -- sub.d a5, t2, a2 -- andn a4, a3, t2 -- -- and a6, a7, a6 -- and a5, a5, a4 -- or a7, a6, a5 -- beqz a7, L(_aloop) -- -- addi.d a0, a0, -8 --L(_mc8_a): -- ctz.d t0, a5 -- ctz.d t2, a6 -- srli.w t0, t0, 3 -- -- srli.w t2, t2, 3 -- slt t1, t0, t2 -- masknez t3, t2, t1 -- maskeqz t4, t0, t1 -- -- or t0, t3, t4 -- add.d a0, a0, t0 -- jr ra --END(STRCHRNUL_NAME) -- --#ifdef _LIBC --weak_alias(STRCHRNUL_NAME, strchrnul) --libc_hidden_builtin_def (STRCHRNUL_NAME) --#endif -diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S -deleted file mode 100644 -index 3a863992..00000000 ---- a/sysdeps/loongarch/lp64/strcmp.S -+++ /dev/null -@@ -1,227 +0,0 @@ --/* 2022\06\15 loongarch64 author: chenxiaolong. 
*/ -- --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRCMP_NAME --#define STRCMP_NAME strcmp --#endif -- --/* int strcmp (const char *s1, const char *s2); */ -- --/* Parameters and Results */ --#define src1 a0 --#define src2 a1 --#define result v0 --LEAF(STRCMP_NAME, 6) -- xor a4, src1, src2 -- lu12i.w t5, 0x01010 -- lu12i.w t6, 0x7f7f7 -- andi a2, src1, 0x7 -- -- ori t5, t5, 0x101 -- andi a4, a4, 0x7 -- ori t6, t6, 0xf7f -- bstrins.d t5, t5, 63, 32 -- bstrins.d t6, t6, 63, 32 -- -- bnez a4, 3f // unaligned -- beqz a2, 1f // loop aligned -- --// mutual aligned -- bstrins.d src1, zero, 2, 0 -- bstrins.d src2, zero, 2, 0 -- slli.d a4, a2, 0x3 -- ld.d t0, src1, 0 -- -- sub.d a4, zero, a4 -- ld.d t1, src2, 0 -- addi.d src1, src1, 8 -- addi.d src2, src2, 8 -- -- nor a5, zero, zero -- srl.d a5, a5, a4 -- or t0, t0, a5 -- -- or t1, t1, a5 -- b 2f //start realigned -- --// loop aligned --1: -- ld.d t0, src1, 0 -- addi.d src1, src1, 8 -- ld.d t1, src2, 0 -- addi.d src2, src2, 8 -- --// start realigned: --2: -- sub.d t2, t0, t5 -- nor t3, t0, t6 -- and t2, t2, t3 -- -- xor t3, t0, t1 -- or t2, t2, t3 -- beqz t2, 1b -- -- ctz.d t7, t2 -- bstrins.d t7, zero, 2, 0 -- srl.d t0, t0, t7 -- srl.d t1, t1, t7 -- -- andi t0, t0, 0xff -- andi t1, t1, 0xff -- sub.d v0, t0, t1 -- jr ra -- --// unaligned --3: -- andi a3, src2, 0x7 -- slt a5, a2, a3 -- masknez t8, a2, a5 -- xor a6, src1, src2 -- maskeqz a6, a6, t8 -- xor src1, src1, a6 -- xor src2, src2, a6 -- -- andi a2, src1, 0x7 -- beqz a2, 4f // src1 is aligned -- --//strcmp_unaligned: -- andi a3, src2, 0x7 -- bstrins.d src1, zero, 2, 0 -- bstrins.d src2, zero, 2, 0 -- nor t3, zero, zero -- -- ld.d t0, src1, 0 -- ld.d t1, src2, 0 -- sub.d a2, a3, a2 -- addi.d t2, zero, 8 -- -- sub.d a5, t2, a2 -- sub.d a6, t2, a3 -- slli.d a5, a5, 0x3 -- slli.d a6, a6, 0x3 -- -- srl.d t4, t3, a6 -- srl.d a4, t3, a5 -- rotr.d a7, t0, a5 -- -- addi.d src2, src2, 8 -- addi.d src1, src1, 8 -- or t1, t1, t4 -- or t0, a7, t4 -- -- sub.d t2, t0, t5 -- nor t3, t0, t6 -- and t2, t2, t3 -- xor t3, t0, t1 -- or t2, t2, t3 -- bnez t2, 7f -- -- and a7, a7, a4 -- slli.d a6, a2, 0x3 -- nor a4, zero, a4 -- b 5f -- --// src1 is aligned --4: -- andi a3, src2, 0x7 -- ld.d t0, src1, 0 -- -- bstrins.d src2, zero, 2, 0 -- nor t2, zero, zero -- ld.d t1, src2, 0 -- -- addi.d t3, zero, 0x8 -- sub.d a5, t3, a3 -- slli.d a5, a5, 0x3 -- srl.d a4, t2, a5 -- rotr.d t4, t0, a5 -- -- addi.d src2, src2, 8 -- addi.d src1, src1, 8 -- or t1, t1, a4 -- or t0, t4, a4 -- -- sub.d t2, t0, t5 -- nor t3, t0, t6 -- and t2, t2, t3 -- xor t3, t0, t1 -- or t2, t2, t3 -- -- bnez t2, 7f -- -- and a7, t4, a4 -- slli.d a6, a3, 0x3 -- nor a4, zero, a4 -- --// unaligned loop --// a7: remaining number --// a6: shift left number --// a5: shift right number --// a4: mask for checking remaining number --5: -- or t0, a7, a4 -- sub.d t2, t0, t5 -- nor t3, t0, t6 -- and t2, t2, t3 -- bnez t2, 6f -- -- ld.d t0, src1, 0 -- addi.d src1, src1, 8 -- ld.d t1, src2, 0 -- addi.d src2, src2, 8 -- -- srl.d t7, t0, a5 -- sll.d t0, t0, a6 -- or t0, a7, t0 -- -- sub.d t2, t0, t5 -- nor t3, t0, t6 -- and t2, t2, t3 -- xor t3, t0, t1 -- or t2, t2, t3 -- bnez t2, 7f -- -- or a7, t7, zero -- b 5b -- --6: -- ld.bu t1, src2, 0 -- andi t0, a7, 0xff -- xor t2, t0, t1 -- srli.d a7, a7, 0x8 -- masknez t2, t0, t2 -- addi.d src2, src2, 1 -- beqz t2, 8f -- b 6b -- --7: -- ctz.d t7, t2 -- bstrins.d t7, zero, 2, 0 -- srl.d t0, t0, t7 -- srl.d t1, t1, t7 -- -- andi t0, t0, 0xff -- andi t1, t1, 0xff -- --8: -- 
sub.d a4, t0, t1 -- sub.d a5, t1, t0 -- maskeqz a6, a5, t8 -- masknez result, a4, t8 -- or result, result, a6 -- jr ra -- --END(STRCMP_NAME) -- --#ifdef _LIBC --libc_hidden_builtin_def (STRCMP_NAME) --#endif -- -diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S -deleted file mode 100644 -index 08505192..00000000 ---- a/sysdeps/loongarch/lp64/strcpy.S -+++ /dev/null -@@ -1,173 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRCPY --#define STRCPY strcpy --#endif -- --LEAF(STRCPY, 6) -- andi a3, a0, 0x7 -- move a2, a0 -- beqz a3, L(dest_align) -- sub.d a5, a1, a3 -- addi.d a5, a5, 8 -- --L(make_dest_align): -- ld.b t0, a1, 0 -- addi.d a1, a1, 1 -- st.b t0, a2, 0 -- beqz t0, L(al_out) -- -- addi.d a2, a2, 1 -- bne a1, a5, L(make_dest_align) -- --L(dest_align): -- andi a4, a1, 7 -- bstrins.d a1, zero, 2, 0 -- -- lu12i.w t5, 0x1010 -- ld.d t0, a1, 0 -- ori t5, t5, 0x101 -- bstrins.d t5, t5, 63, 32 -- -- slli.d t6, t5, 0x7 -- bnez a4, L(unalign) -- sub.d t1, t0, t5 -- andn t2, t6, t0 -- -- and t3, t1, t2 -- bnez t3, L(al_end) -- --L(al_loop): -- st.d t0, a2, 0 -- ld.d t0, a1, 8 -- -- addi.d a1, a1, 8 -- addi.d a2, a2, 8 -- sub.d t1, t0, t5 -- andn t2, t6, t0 -- -- and t3, t1, t2 -- beqz t3, L(al_loop) -- --L(al_end): -- ctz.d t1, t3 -- srli.d t1, t1, 3 -- addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest -- -- andi a3, t1, 8 -- andi a4, t1, 4 -- andi a5, t1, 2 -- andi a6, t1, 1 -- --L(al_end_8): -- beqz a3, L(al_end_4) -- st.d t0, a2, 0 -- jr ra --L(al_end_4): -- beqz a4, L(al_end_2) -- st.w t0, a2, 0 -- addi.d a2, a2, 4 -- srli.d t0, t0, 32 --L(al_end_2): -- beqz a5, L(al_end_1) -- st.h t0, a2, 0 -- addi.d a2, a2, 2 -- srli.d t0, t0, 16 --L(al_end_1): -- beqz a6, L(al_out) -- st.b t0, a2, 0 --L(al_out): -- jr ra -- --L(unalign): -- slli.d a5, a4, 3 -- li.d t1, -1 -- sub.d a6, zero, a5 -- -- srl.d a7, t0, a5 -- sll.d t7, t1, a6 -- -- or t0, a7, t7 -- sub.d t1, t0, t5 -- andn t2, t6, t0 -- and t3, t1, t2 -- -- bnez t3, L(un_end) -- -- ld.d t4, a1, 8 -- -- sub.d t1, t4, t5 -- andn t2, t6, t4 -- sll.d t0, t4, a6 -- and t3, t1, t2 -- -- or t0, t0, a7 -- bnez t3, L(un_end_with_remaining) -- --L(un_loop): -- srl.d a7, t4, a5 -- -- ld.d t4, a1, 16 -- addi.d a1, a1, 8 -- -- st.d t0, a2, 0 -- addi.d a2, a2, 8 -- -- sub.d t1, t4, t5 -- andn t2, t6, t4 -- sll.d t0, t4, a6 -- and t3, t1, t2 -- -- or t0, t0, a7 -- beqz t3, L(un_loop) -- --L(un_end_with_remaining): -- ctz.d t1, t3 -- srli.d t1, t1, 3 -- addi.d t1, t1, 1 -- sub.d t1, t1, a4 -- -- blt t1, zero, L(un_end_less_8) -- st.d t0, a2, 0 -- addi.d a2, a2, 8 -- beqz t1, L(un_out) -- srl.d t0, t4, a5 # get the remaining part -- b L(un_end_less_8) -- --L(un_end): -- ctz.d t1, t3 -- srli.d t1, t1, 3 -- addi.d t1, t1, 1 -- --L(un_end_less_8): -- andi a4, t1, 4 -- andi a5, t1, 2 -- andi a6, t1, 1 --L(un_end_4): -- beqz a4, L(un_end_2) -- st.w t0, a2, 0 -- addi.d a2, a2, 4 -- srli.d t0, t0, 32 --L(un_end_2): -- beqz a5, L(un_end_1) -- st.h t0, a2, 0 -- addi.d a2, a2, 2 -- srli.d t0, t0, 16 --L(un_end_1): -- beqz a6, L(un_out) -- st.b t0, a2, 0 --L(un_out): -- jr ra -- --END(STRCPY) -- --#ifdef _LIBC --libc_hidden_builtin_def (STRCPY) --#endif -diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S -deleted file mode 100644 -index 71431ce2..00000000 ---- a/sysdeps/loongarch/lp64/strlen.S -+++ /dev/null -@@ -1,85 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRLEN --#define 
STRLEN strlen --#endif -- --LEAF(STRLEN, 6) -- move a1, a0 -- bstrins.d a0, zero, 2, 0 -- lu12i.w a2, 0x01010 -- li.w t0, -1 -- -- ld.d t2, a0, 0 -- andi t1, a1, 0x7 -- ori a2, a2, 0x101 -- slli.d t1, t1, 3 -- -- bstrins.d a2, a2, 63, 32 -- sll.d t1, t0, t1 -- slli.d t3, a2, 7 -- nor a3, zero, t3 -- -- orn t2, t2, t1 -- sub.d t0, t2, a2 -- nor t1, t2, a3 -- and t0, t0, t1 -- -- -- bnez t0, L(count_pos) -- addi.d a0, a0, 8 --L(loop_16_7bit): -- ld.d t2, a0, 0 -- sub.d t1, t2, a2 -- -- and t0, t1, t3 -- bnez t0, L(more_check) -- ld.d t2, a0, 8 -- addi.d a0, a0, 16 -- -- sub.d t1, t2, a2 -- and t0, t1, t3 -- beqz t0, L(loop_16_7bit) -- addi.d a0, a0, -8 --L(more_check): -- nor t0, t2, a3 -- -- and t0, t1, t0 -- bnez t0, L(count_pos) -- addi.d a0, a0, 8 --L(loop_16_8bit): -- ld.d t2, a0, 0 -- -- sub.d t1, t2, a2 -- nor t0, t2, a3 -- and t0, t0, t1 -- bnez t0, L(count_pos) -- -- ld.d t2, a0, 8 -- addi.d a0, a0, 16 -- sub.d t1, t2, a2 -- nor t0, t2, a3 -- -- and t0, t0, t1 -- beqz t0, L(loop_16_8bit) -- addi.d a0, a0, -8 --L(count_pos): -- ctz.d t1, t0 -- sub.d a0, a0, a1 -- -- srli.d t1, t1, 3 -- add.d a0, a0, t1 -- jr ra -- --END(STRLEN) -- --#ifdef _LIBC --libc_hidden_builtin_def (STRLEN) --#endif -diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S -deleted file mode 100644 -index 55450e55..00000000 ---- a/sysdeps/loongarch/lp64/strncmp.S -+++ /dev/null -@@ -1,256 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRNCMP --#define STRNCMP strncmp --#endif -- --/* int strncmp (const char *s1, const char *s2); */ -- --LEAF(STRNCMP, 6) -- beqz a2, L(ret0) -- xor a4, a0, a1 -- lu12i.w t5, 0x01010 -- lu12i.w t6, 0x7f7f7 -- -- andi a3, a0, 0x7 -- ori t5, t5, 0x101 -- andi a4, a4, 0x7 -- ori t6, t6, 0xf7f -- -- bstrins.d t5, t5, 63, 32 -- bstrins.d t6, t6, 63, 32 -- -- bnez a4, L(unalign) -- bnez a3, L(mutual_align) -- --L(a_loop): -- ld.d t0, a0, 0 -- ld.d t1, a1, 0 -- addi.d a0, a0, 8 -- addi.d a1, a1, 8 -- -- -- sltui t7, a2, 9 -- --L(start_realign): -- sub.d t2, t0, t5 -- nor t3, t0, t6 -- xor t4, t0, t1 -- -- and t2, t2, t3 -- addi.d a2, a2, -8 -- -- or t2, t2, t4 -- or t3, t2, t7 -- beqz t3, L(a_loop) -- --L(end): -- bge zero, t7, L(out) -- andi t4, a2, 7 -- li.d t3, -1 -- addi.d t4, t4, -1 -- slli.d t4, t4, 3 -- sll.d t3, t3, t4 -- or t2, t2, t3 -- -- --L(out): -- ctz.d t3, t2 -- bstrins.d t3, zero, 2, 0 -- srl.d t0, t0, t3 -- srl.d t1, t1, t3 -- -- andi t0, t0, 0xff -- andi t1, t1, 0xff -- sub.d a0, t0, t1 -- jr ra -- --L(mutual_align): -- bstrins.d a0, zero, 2, 0 -- bstrins.d a1, zero, 2, 0 -- slli.d a5, a3, 0x3 -- li.d t2, -1 -- -- ld.d t0, a0, 0 -- ld.d t1, a1, 0 -- -- li.d t3, 9 -- sll.d t2, t2, a5 -- -- sub.d t3, t3, a3 -- addi.d a0, a0, 8 -- -- sltu t7, a2, t3 -- addi.d a1, a1, 8 -- -- add.d a2, a2, a3 -- orn t0, t0, t2 -- orn t1, t1, t2 -- b L(start_realign) -- --L(ret0): -- move a0, zero -- jr ra -- --L(unalign): -- li.d t8, 8 -- blt a2, t8, L(short_cmp) -- -- # swap a0 and a1 in case a3 > a4 -- andi a4, a1, 0x7 -- sltu t8, a4, a3 -- xor a6, a0, a1 -- maskeqz a6, a6, t8 -- xor a0, a0, a6 -- xor a1, a1, a6 -- -- andi a3, a0, 0x7 -- andi a4, a1, 0x7 -- -- bstrins.d a0, zero, 2, 0 -- bstrins.d a1, zero, 2, 0 -- -- li.d t2, -1 -- li.d t3, 9 -- -- ld.d t0, a0, 0 -- ld.d t1, a1, 0 -- -- sub.d t3, t3, a4 -- sub.d a3, a4, a3 -- -- slli.d t4, a4, 3 -- slli.d a6, a3, 3 -- -- sub.d a5, zero, a6 -- sltu t7, a2, t3 -- -- rotr.d a7, t0, a5 -- sll.d t4, t2, t4 # mask for first num -- -- add.d a2, a2, a4 -- sll.d 
a4, t2, a6 # mask for a7 -- -- orn t0, a7, t4 -- orn t1, t1, t4 -- -- sub.d t2, t0, t5 -- nor t4, t0, t6 -- and t2, t2, t4 -- -- xor t3, t0, t1 -- or t2, t2, t3 -- -- or t3, t2, t7 -- bnez t3, L(un_end) -- -- andn a7, a7, a4 -- addi.d a3, a3, 1 -- --L(un_loop): -- addi.d a2, a2, -8 -- # in case remaining part has '\0', no more load instructions should be executed on a0 address -- or t0, a7, a4 -- sltu t7, a2, a3 -- -- sub.d t2, t0, t5 -- nor t3, t0, t6 -- and t2, t2, t3 -- -- or t3, t2, t7 -- bnez t3, L(check_remaining) -- -- ld.d t7, a0, 8 -- ld.d t1, a1, 8 -- addi.d a0, a0, 8 -- addi.d a1, a1, 8 -- -- sll.d t4, t7, a6 -- sub.d t2, t1, t5 -- nor t3, t1, t6 -- -- or t0, t4, a7 -- srl.d a7, t7, a5 -- -- and t2, t2, t3 -- xor t3, t0, t1 -- -- sltui t7, a2, 9 -- or t2, t2, t3 -- -- or t3, t2, t7 -- beqz t3, L(un_loop) -- b L(un_end) -- --L(check_remaining): -- ld.d t1, a1, 8 -- xor t3, t1, a7 -- or t2, t2, t3 -- --L(un_end): -- bge zero, t7, L(un_out) -- andi t4, a2, 7 -- li.d t3, -1 -- -- addi.d t4, t4, -1 -- slli.d t4, t4, 3 -- sll.d t3, t3, t4 -- or t2, t2, t3 -- --L(un_out): -- ctz.d t3, t2 -- bstrins.d t3, zero, 2, 0 -- srl.d t0, t0, t3 -- srl.d t1, t1, t3 -- -- andi t0, t0, 0xff -- andi t1, t1, 0xff -- -- sub.d a4, t0, t1 -- sub.d a5, t1, t0 -- -- maskeqz a6, a5, t8 -- masknez a0, a4, t8 -- -- or a0, a0, a6 -- jr ra -- --L(short_cmp): -- ld.bu t0, a0, 0 -- ld.bu t1, a1, 0 -- addi.d a2, a2, -1 -- -- xor t2, t0, t1 -- masknez t2, t0, t2 -- maskeqz t2, a2, t2 -- -- beqz t2, L(short_out) -- -- ld.bu t0, a0, 1 -- ld.bu t1, a1, 1 -- -- addi.d a2, a2, -1 -- addi.d a0, a0, 2 -- -- addi.d a1, a1, 2 -- xor t2, t0, t1 -- masknez t2, t0, t2 -- maskeqz t2, a2, t2 -- -- bnez t2, L(short_cmp) -- --L(short_out): -- sub.d a0, t0, t1 -- jr ra -- --END(STRNCMP) --#ifdef _LIBC --libc_hidden_builtin_def (STRNCMP) --#endif -diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S -deleted file mode 100644 -index 5b5ab585..00000000 ---- a/sysdeps/loongarch/lp64/strnlen.S -+++ /dev/null -@@ -1,82 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRNLEN --#define STRNLEN __strnlen --#endif -- --#. before every load, a1(t5) must > 0; --#. first load with t1 != 0, need to adjust t5; --#. 
return the less one of both strlen(s) and a1; -- --LEAF(STRNLEN, 6) -- beqz a1, L(out) -- lu12i.w a2, 0x01010 -- andi t1, a0, 0x7 -- move t4, a0 -- -- bstrins.d a0, zero, 2, 0 -- ori a2, a2, 0x101 -- li.w t0, -1 -- ld.d t2, a0, 0 -- -- slli.d t3, t1, 3 -- bstrins.d a2, a2, 63, 32 -- li.w t5, 8 -- slli.d a3, a2, 7 -- -- sub.w t1, t5, t1 -- sll.d t0, t0, t3 -- nor a3, zero, a3 -- orn t2, t2, t0 -- -- -- sub.d t0, t2, a2 -- nor t3, t2, a3 -- and t0, t0, t3 -- bnez t0, L(count_pos) -- -- sub.d t5, a1, t1 -- bgeu t1, a1, L(out) --L(loop_8bytes): -- ld.d t2, a0, 8 -- addi.d a0, a0, 8 -- -- sub.d t0, t2, a2 -- nor t1, t2, a3 -- sltui t6, t5, 9 -- and t0, t0, t1 -- -- addi.d t5, t5, -8 -- or t7, t0, t6 -- beqz t7, L(loop_8bytes) --L(count_pos): -- ctz.d t1, t0 -- -- -- sub.d a0, a0, t4 -- srli.d t1, t1, 3 -- add.d a0, t1, a0 -- sltu t0, a0, a1 -- -- masknez t1, a1, t0 -- maskeqz a0, a0, t0 -- or a0, a0, t1 -- jr ra -- --L(out): -- move a0, a1 -- jr ra -- --END(STRNLEN) -- --#ifdef _LIBC --weak_alias (STRNLEN, strnlen) --libc_hidden_builtin_def (STRNLEN) --#endif -diff --git a/sysdeps/loongarch/lp64/strrchr.S b/sysdeps/loongarch/lp64/strrchr.S -deleted file mode 100644 -index df7fcb6b..00000000 ---- a/sysdeps/loongarch/lp64/strrchr.S -+++ /dev/null -@@ -1,105 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRRCHR_NAME --#define STRRCHR_NAME strrchr --#endif -- --LEAF(STRRCHR_NAME, 6) -- slli.d t1, a0, 3 -- bstrins.d a0, zero, 2, 0 -- lu12i.w a2, 0x01010 -- ld.d t2, a0, 0 // t2 = "5ZZ21abc" -- -- ori a2, a2, 0x101 -- andi a1, a1, 0xff // a1 = "0000000Z" -- li.d a5, -1 -- bstrins.d a2, a2, 63, 32 // a2 = 0x0101010101010101 -- -- sll.d t1, a5, t1 // t1 = 0xffffffffff000000 -- mul.d a1, a1, a2 // a1 = "ZZZZZZZZ" -- orn t2, t2, t1 // t2 = "5ZZ21YYY" -- slli.d a3, a2, 7 // a3 = 0x8080808080808080 -- -- sub.d a4, t2, a2 -- andn t0, a3, t2 -- move t3, zero -- and t0, a4, t0 -- -- -- xor a4, t2, a1 -- move t5, zero -- orn a4, a4, t1 -- bnez t0, L(found_end) -- -- sub.d t1, a4, a2 -- andn t0, a3, a4 -- and t1, t1, t0 -- --L(loop_8bytes): -- masknez t4, t3, t1 -- -- maskeqz t3, t2, t1 -- ld.d t2, a0, 8 -- masknez t0, t5, t1 -- maskeqz t5, a0, t1 -- -- or t3, t3, t4 -- or t5, t0, t5 -- sub.d t0, t2, a2 -- andn t1, a3, t2 -- -- -- xor a4, t2, a1 -- and t0, t0, t1 //t0 hold diff pattern for '\0' -- sub.d t1, a4, a2 -- andn t4, a3, a4 -- -- and t1, t1, t4 //t1 hold diff pattern for 'a1' -- addi.d a0, a0, 8 -- beqz t0, L(loop_8bytes) //ok, neither \0 nor found --L(found_end): -- ctz.d t1, t0 -- -- xor t3, t3, a1 -- orn t1, zero, t1 -- revb.d t3, t3 -- srl.d t1, a5, t1 // mask for '\0' -- -- sub.d t4, t3, a2 -- orn a4, a4, t1 -- andn t3, a3, t3 -- revb.d t2, a4 -- -- sub.d t0, t2, a2 -- andn t1, a3, t2 -- and t3, t3, t4 -- and t1, t0, t1 -- -- li.d t7, 7 -- masknez t4, t3, t1 -- maskeqz t3, t1, t1 -- masknez t5, t5, t1 -- -- or t3, t3, t4 -- maskeqz t6, a0, t1 -- ctz.d t0, t3 -- or t5, t6, t5 -- -- srli.d t0, t0, 3 -- sub.d t0, t7, t0 -- add.d a0, t5, t0 -- maskeqz a0, a0, t3 -- -- jr ra --END(STRRCHR_NAME) -- --#ifdef _LIBC --libc_hidden_builtin_def(STRRCHR_NAME) --#endif --- -2.33.0 - diff --git a/glibc-2.28-Remove-unseless-ANDROID_CHANGES-and-relat.patch b/glibc-2.28-Remove-unseless-ANDROID_CHANGES-and-relat.patch deleted file mode 100644 index ad4b53c..0000000 --- a/glibc-2.28-Remove-unseless-ANDROID_CHANGES-and-relat.patch +++ /dev/null @@ -1,292 +0,0 @@ -From e2dd1f13592fa3b99b70eb54cc61e9f98cdcb123 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: 
Mon, 17 Apr 2023 17:20:04 +0800 -Subject: [PATCH 01/14] glibc-2.28: Remove unseless ANDROID_CHANGES and related - code. - -Change-Id: Ib08e92d435126c7b56096ff6f24f1c6b5ea57f46 -Signed-off-by: ticat_fp ---- - sysdeps/loongarch/lp64/memchr.S | 6 ------ - sysdeps/loongarch/lp64/memcpy.S | 13 ------------- - sysdeps/loongarch/lp64/memset.S | 6 ------ - sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S | 6 ------ - .../loongarch/lp64/multiarch/memmove-unaligned.S | 6 ------ - sysdeps/loongarch/lp64/multiarch/memset-unaligned.S | 7 ------- - sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S | 2 -- - .../loongarch/lp64/multiarch/strchrnul-unaligned.S | 2 -- - sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S | 2 -- - sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S | 2 -- - .../loongarch/lp64/multiarch/strncmp-unaligned.S | 2 -- - .../loongarch/lp64/multiarch/strnlen-unaligned.S | 2 -- - 12 files changed, 56 deletions(-) - -diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S -index ec34b1af..75c4e15c 100644 ---- a/sysdeps/loongarch/lp64/memchr.S -+++ b/sysdeps/loongarch/lp64/memchr.S -@@ -11,11 +11,7 @@ - #define MEMCHR_NAME memchr - #endif - --#ifdef ANDROID_CHANGES --LEAF(MEMCHR_NAME, 0) --#else - LEAF(MEMCHR_NAME) --#endif - .align 6 - beqz a2, L(out) - andi t1, a0, 0x7 -@@ -92,8 +88,6 @@ L(out): - jr ra - END(MEMCHR_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (MEMCHR_NAME) - #endif --#endif -diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S -index 1076e678..b6ca60a1 100644 ---- a/sysdeps/loongarch/lp64/memcpy.S -+++ b/sysdeps/loongarch/lp64/memcpy.S -@@ -35,29 +35,18 @@ - st.d t6, reg, n+48; \ - st.d t7, reg, n+56; - --#ifdef ANDROID_CHANGES --LEAF(MEMMOVE_NAME, 0) --#else - LEAF(MEMMOVE_NAME) --#endif -- - .align 6 - sub.d t0, a0, a1 - bltu t0, a2, L(copy_back) - - END(MEMMOVE_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (MEMMOVE_NAME) - #endif --#endif - --#ifdef ANDROID_CHANGES --LEAF(MEMCPY_NAME, 0) --#else - LEAF(MEMCPY_NAME) --#endif - - srai.d a3, a2, 4 - beqz a3, L(short_data) # less than 16 bytes -@@ -811,8 +800,6 @@ L(back_end): - - END(MEMCPY_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (MEMCPY_NAME) - #endif --#endif -diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S -index 9fe42b24..41629e7e 100644 ---- a/sysdeps/loongarch/lp64/memset.S -+++ b/sysdeps/loongarch/lp64/memset.S -@@ -21,11 +21,7 @@ - st.d a1, a0, n+48; \ - st.d a1, a0, n+56; - --#ifdef ANDROID_CHANGES --LEAF(MEMSET_NAME, 0) --#else - LEAF(MEMSET_NAME) --#endif - .align 6 - move t0, a0 - andi a3, a0, 0x7 -@@ -166,8 +162,6 @@ L(short_0): - - END(MEMSET_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (MEMSET_NAME) - #endif --#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -index 5e38df0d..64b60244 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -@@ -31,11 +31,7 @@ - st.d t6, reg, n+48; \ - st.d t7, reg, n+56; - --#ifdef ANDROID_CHANGES --LEAF(MEMCPY_NAME, 0) --#else - LEAF(MEMCPY_NAME) --#endif - - //1st var: dst ptr: void *a1 $r4 a0 - //2nd var: src ptr: void *a2 $r5 a1 -@@ -250,10 +246,8 @@ end_0_8_unalign: - - END(MEMCPY_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (MEMCPY_NAME) - #endif --#endif - - #endif -diff --git 
a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -index 27ed0c9c..42920a1a 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -@@ -100,11 +100,7 @@ - LD_64(a4, -1024); \ - ST_64(a3, -1024); - --#ifdef ANDROID_CHANGES --LEAF(MEMMOVE_NAME, 0) --#else - LEAF(MEMMOVE_NAME) --#endif - - //1st var: dest ptr: void *str1 $r4 a0 - //2nd var: src ptr: void *str2 $r5 a1 -@@ -469,10 +465,8 @@ end_unalign_proc_back: - - END(MEMMOVE_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (MEMMOVE_NAME) - #endif --#endif - - #endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -index 16ff2ef7..54e51546 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -@@ -33,12 +33,7 @@ - //2nd var: int val $5 a1 - //3rd var: size_t num $6 a2 - --#ifdef ANDROID_CHANGES --LEAF(MEMSET_NAME, 0) --#else - LEAF(MEMSET_NAME) --#endif -- - .align 6 - bstrins.d a1, a1, 15, 8 - add.d t7, a0, a2 -@@ -168,10 +163,8 @@ end_0_8_unalign: - - END(MEMSET_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (MEMSET_NAME) - #endif --#endif - - #endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -index 1d5e56c5..de6c7f4f 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -@@ -123,10 +123,8 @@ L(_mc8_a): - jr ra - END(STRCHR_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (STRCHR_NAME) - #endif --#endif - - #endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -index 6338d005..abc246ca 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -@@ -136,11 +136,9 @@ L(_mc8_a): - jr ra - END(STRCHRNUL_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - weak_alias(STRCHRNUL_NAME, strchrnul) - libc_hidden_builtin_def (STRCHRNUL_NAME) - #endif --#endif - - #endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -index 449733cb..c77dc1a9 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -@@ -190,10 +190,8 @@ strcpy_page_cross: - beqz has_nul, strcpy_page_cross_ok - b strcpy_end - END(STRCPY) --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (STRCPY) - #endif --#endif - - #endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -index e9b7cf67..2fe0fb34 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -@@ -107,10 +107,8 @@ strlen_loop_noascii: - jr ra - END(STRLEN) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (STRLEN) - #endif --#endif - - #endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -index 558df29b..6ec107ca 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -@@ -248,10 +248,8 @@ strncmp_ret0: - then exchange(src1,src2). 
*/ - - END(STRNCMP) --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (STRNCMP) - #endif --#endif - - #endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -index 60eccf00..4a195b7c 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -@@ -136,10 +136,8 @@ L(_hit_limit): - move len, limit - jr ra - END(STRNLEN) --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (STRNLEN) - #endif --#endif - - #endif --- -2.33.0 - diff --git a/glibc-2.28-Remove-useless-IS_LA-264-364-464-and-IS_L.patch b/glibc-2.28-Remove-useless-IS_LA-264-364-464-and-IS_L.patch deleted file mode 100644 index 4880d26..0000000 --- a/glibc-2.28-Remove-useless-IS_LA-264-364-464-and-IS_L.patch +++ /dev/null @@ -1,40 +0,0 @@ -From f4041e5da609a9f5da966fa000c00b150788a948 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Sun, 23 Jul 2023 14:32:08 +0800 -Subject: [PATCH 13/14] glibc-2.28: Remove useless IS_LA{264,364,464} and - IS_LA{264, 364, 464}. - -Change-Id: Id9a573510e2a493151191372d651f381ec2aefe7 -Signed-off-by: ticat_fp ---- - sysdeps/unix/sysv/linux/loongarch/cpu-features.h | 7 ------- - 1 file changed, 7 deletions(-) - -diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -index b46a8489..2703d4f7 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -@@ -22,10 +22,6 @@ - #include - #include - --#define LA264 0x14a000 --#define LA364 0x14b000 --#define LA464 0x14c011 -- - struct cpu_features - { - uint64_t cpucfg_prid; -@@ -42,9 +38,6 @@ extern const struct cpu_features *_dl_larch_get_cpu_features (void) - :"=r"(ret) \ - :"r"(index)); - --#define IS_LA264(prid) (prid == LA264) --#define IS_LA364(prid) (prid == LA364) --#define IS_LA464(prid) (prid == LA464) - #define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL) - #define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) - #define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) --- -2.33.0 - diff --git a/glibc-2.28-Use-RTLD_SUPPORT_-LSX-LASX-to-choose-_dl_.patch b/glibc-2.28-Use-RTLD_SUPPORT_-LSX-LASX-to-choose-_dl_.patch deleted file mode 100644 index 720cd20..0000000 --- a/glibc-2.28-Use-RTLD_SUPPORT_-LSX-LASX-to-choose-_dl_.patch +++ /dev/null @@ -1,123 +0,0 @@ -From c94d9376e241dc52eb9f2a2107313b7836e0e9ad Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Wed, 6 Sep 2023 16:41:09 +0800 -Subject: [PATCH 14/14] glibc-2.28: Use RTLD_SUPPORT_{LSX, LASX} to choose - _dl_runtime_resolve. - -Key Points: -1. On lasx & lsx platforms, use _dl_runtime_resolve_{lsx, lasx} to save vector registers. -2. Via "tunables", users can choose str/mem functions with - `export GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX`. - Note: glibc.cpu.hwcaps doesn't affect _dl_runtime_resolve_{lsx, lasx} selection. - -Usage Notes: -1. Only valid inputs: LASX, LSX, UAL. Case-sensitive, comma-separated, no spaces. -2. Example: `export GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,UAL` turns on LASX & UAL. - Unmentioned features turn off. With default ifunc: lasx > lsx > unaligned > - aligned > generic, effect is: lasx > unaligned > aligned > generic; lsx off. -3. Incorrect GLIBC_TUNABLES settings will show error messages. -4. Valid input examples: - - GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX: lasx > aligned > generic. - - GLIBC_TUNABLES=glibc.cpu.hwcaps=LSX,UAL: lsx > unaligned > aligned > generic. 
- - GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,UAL,LASX,UAL,LSX,LASX,UAL: Repetitions - allowed but not recommended. Results in: lasx > lsx > unaligned > aligned > - generic. - -Change-Id: I555ce2039bc36bf071fc9265d7b0bb7b93b96ae7 -Signed-off-by: ticat_fp ---- - sysdeps/loongarch/cpu-tunables.c | 2 +- - sysdeps/loongarch/dl-machine.h | 11 ++++++----- - sysdeps/unix/sysv/linux/loongarch/cpu-features.c | 2 ++ - sysdeps/unix/sysv/linux/loongarch/cpu-features.h | 10 +++++++--- - 4 files changed, 16 insertions(+), 9 deletions(-) - -diff --git a/sysdeps/loongarch/cpu-tunables.c b/sysdeps/loongarch/cpu-tunables.c -index 840c1b8c..e0799ca9 100644 ---- a/sysdeps/loongarch/cpu-tunables.c -+++ b/sysdeps/loongarch/cpu-tunables.c -@@ -88,7 +88,7 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) - } - while (*c != '\0'); - -- GLRO (dl_hwcap) &= hwcap; -+ GLRO (dl_larch_cpu_features).hwcap &= hwcap; - } - - #endif -diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h -index ff520a07..b5f43c84 100644 ---- a/sysdeps/loongarch/dl-machine.h -+++ b/sysdeps/loongarch/dl-machine.h -@@ -75,13 +75,14 @@ dl_platform_init (void) - GLRO(dl_platform) = NULL; - - #ifdef SHARED -+ /* init_cpu_features has been called early from __libc_start_main in -+ static executable. */ -+ init_cpu_features (&GLRO(dl_larch_cpu_features)); - - #if HAVE_TUNABLES - TUNABLE_GET (hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps)); - #endif -- /* init_cpu_features has been called early from __libc_start_main in -- static executable. */ -- init_cpu_features (&GLRO(dl_larch_cpu_features)); -+ - #endif - } - -@@ -396,9 +397,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], - l->l_mach.plt = gotplt[1] + l->l_addr; - - #if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float -- if (SUPPORT_LASX) -+ if (RTLD_SUPPORT_LASX) - gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx; -- else if (SUPPORT_LSX) -+ else if (RTLD_SUPPORT_LSX) - gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx; - else - #endif -diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c -index 80870f3c..cf015011 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c -+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c -@@ -29,4 +29,6 @@ init_cpu_features (struct cpu_features *cpu_features) - - __cpucfg(cpucfg_word, 2); - cpu_features->cpucfg_word_idx2 = cpucfg_word; -+ -+ GLRO (dl_larch_cpu_features).hwcap = GLRO (dl_hwcap); - } -diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -index 2703d4f7..17c9f5a7 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -@@ -26,6 +26,7 @@ struct cpu_features - { - uint64_t cpucfg_prid; - uint64_t cpucfg_word_idx2; -+ uint64_t hwcap; - }; - - /* Get a pointer to the CPU features structure. 
*/ -@@ -38,9 +39,12 @@ extern const struct cpu_features *_dl_larch_get_cpu_features (void) - :"=r"(ret) \ - :"r"(index)); - --#define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL) --#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) --#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) -+#define SUPPORT_UAL (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_UAL) -+#define SUPPORT_LSX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LSX) -+#define SUPPORT_LASX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LASX) -+ -+#define RTLD_SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) -+#define RTLD_SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) - - #endif /* _CPU_FEATURES_LOONGARCH64_H */ - --- -2.33.0 - diff --git a/glibc-2.28-config-Added-HAVE_LOONGARCH_VEC_ASM.patch b/glibc-2.28-config-Added-HAVE_LOONGARCH_VEC_ASM.patch deleted file mode 100644 index bfbe0e2..0000000 --- a/glibc-2.28-config-Added-HAVE_LOONGARCH_VEC_ASM.patch +++ /dev/null @@ -1,91 +0,0 @@ -From 58b1f882644f839259505dde3205e226a1c649f1 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Tue, 11 Jul 2023 15:42:26 +0800 -Subject: [PATCH 10/14] glibc-2.28: config: Added HAVE_LOONGARCH_VEC_ASM. - -Change-Id: Iea464ea0c975a351682a60f66251167f6c79385b -Signed-off-by: ticat_fp ---- - config.h.in | 5 +++++ - sysdeps/loongarch/configure | 28 ++++++++++++++++++++++++++++ - sysdeps/loongarch/configure.ac | 15 +++++++++++++++ - 3 files changed, 48 insertions(+) - -diff --git a/config.h.in b/config.h.in -index 94d5ea36..fa53cc2d 100644 ---- a/config.h.in -+++ b/config.h.in -@@ -123,6 +123,11 @@ - /* RISC-V floating-point ABI for ld.so. */ - #undef RISCV_ABI_FLEN - -+/* Assembler support LoongArch LASX/LSX vector instructions. -+ This macro becomes obsolete when glibc increased the minimum -+ required version of GNU 'binutils' to 2.41 or later. */ -+#define HAVE_LOONGARCH_VEC_ASM 0 -+ - /* Linux specific: minimum supported kernel version. */ - #undef __LINUX_KERNEL_VERSION - -diff --git a/sysdeps/loongarch/configure b/sysdeps/loongarch/configure -index 1e5abf81..0f0dae3a 100755 ---- a/sysdeps/loongarch/configure -+++ b/sysdeps/loongarch/configure -@@ -2,3 +2,31 @@ - # Local configure fragment for sysdeps/loongarch/elf. - - #AC_DEFINE(PI_STATIC_AND_HIDDEN) -+ -+# Check if asm support vector instructions. -+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vector support in assembler" >&5 -+$as_echo_n "checking for vector support in assembler... " >&6; } -+if ${libc_cv_loongarch_vec_asm+:} false; then : -+ $as_echo_n "(cached) " >&6 -+else -+ cat > conftest.s <<\EOF -+ vld $vr0, $sp, 0 -+EOF -+if { ac_try='${CC-cc} -c $CFLAGS conftest.s -o conftest 1>&5' -+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 -+ (eval $ac_try) 2>&5 -+ ac_status=$? -+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 -+ test $ac_status = 0; }; }; then -+ libc_cv_loongarch_vec_asm=yes -+else -+ libc_cv_loongarch_vec_asm=no -+fi -+rm -f conftest* -+fi -+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_loongarch_vec_asm" >&5 -+$as_echo "$libc_cv_loongarch_vec_asm" >&6; } -+if test $libc_cv_loongarch_vec_asm = yes; then -+ $as_echo "#define HAVE_LOONGARCH_VEC_ASM 1" >>confdefs.h -+ -+fi -diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac -index 67b46ce0..aac0efa9 100644 ---- a/sysdeps/loongarch/configure.ac -+++ b/sysdeps/loongarch/configure.ac -@@ -4,3 +4,18 @@ GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. 
- dnl It is always possible to access static and hidden symbols in an - dnl position independent way. - #AC_DEFINE(PI_STATIC_AND_HIDDEN) -+ -+# Check if asm support vector instructions. -+AC_CACHE_CHECK(for vector support in assembler, libc_cv_loongarch_vec_asm, [dnl -+cat > conftest.s <<\EOF -+ vld $vr0, $sp, 0 -+EOF -+if AC_TRY_COMMAND(${CC-cc} -c $CFLAGS conftest.s -o conftest 1>&AS_MESSAGE_LOG_FD); then -+ libc_cv_loongarch_vec_asm=yes -+else -+ libc_cv_loongarch_vec_asm=no -+fi -+rm -f conftest*]) -+if test $libc_cv_loongarch_vec_asm = yes; then -+ AC_DEFINE(HAVE_LOONGARCH_VEC_ASM) -+fi --- -2.33.0 - diff --git a/glibc-2.28-remove-ABILPX32-related-code.patch b/glibc-2.28-remove-ABILPX32-related-code.patch deleted file mode 100644 index d5ece82..0000000 --- a/glibc-2.28-remove-ABILPX32-related-code.patch +++ /dev/null @@ -1,75 +0,0 @@ -From 0153532f680527c4378a10673518cabda2e02584 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Fri, 26 May 2023 14:58:39 +0800 -Subject: [PATCH 05/14] glibc-2.28: remove ABILPX32 related code. - -Change-Id: I73eb5bc4d4ca12e4d45ed6b533fa38d60a3a633f -Signed-off-by: ticat_fp ---- - elf/elf.h | 3 +-- - sysdeps/loongarch/dl-machine.h | 2 -- - sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h | 2 +- - sysdeps/loongarch/sys/regdef.h | 4 +--- - 4 files changed, 3 insertions(+), 8 deletions(-) - -diff --git a/elf/elf.h b/elf/elf.h -index 65d1fb46..4bfbad61 100644 ---- a/elf/elf.h -+++ b/elf/elf.h -@@ -3933,10 +3933,9 @@ enum - #define R_NDS32_TLS_TPOFF 102 - #define R_NDS32_TLS_DESC 119 - --/* LoongISA ELF Flags */ -+/* LoongArch ELF Flags */ - #define EF_LARCH_ABI 0x0003 - #define EF_LARCH_ABI_LP64 0x0003 --#define EF_LARCH_ABI_LPX32 0x0002 - #define EF_LARCH_ABI_LP32 0x0001 - - /* Loongarch specific dynamic relocations. */ -diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h -index 2d527241..6e9c6258 100644 ---- a/sysdeps/loongarch/dl-machine.h -+++ b/sysdeps/loongarch/dl-machine.h -@@ -96,8 +96,6 @@ elf_machine_matches_host (const ElfW(Ehdr) *ehdr) - - #ifdef _ABILP64 - if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LP64) --#elif defined _ABILPX32 -- if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LPX32) - #elif defined _ABILP32 - if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LP32) - #else -diff --git a/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h b/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h -index 5a761355..aa63bce1 100644 ---- a/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h -+++ b/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h -@@ -32,7 +32,7 @@ - # define __SIZEOF_PTHREAD_BARRIER_T 32 - # define __SIZEOF_PTHREAD_BARRIERATTR_T 4 - #else --# error "rv32i-based systems are not supported" -+# error "32-bit based systems are not supported" - #endif - - #define __PTHREAD_COMPAT_PADDING_MID -diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h -index 769784b8..36f00939 100644 ---- a/sysdeps/loongarch/sys/regdef.h -+++ b/sysdeps/loongarch/sys/regdef.h -@@ -72,10 +72,8 @@ - # define fs6 $f30 - # define fs7 $f31 - --#elif _LOONGARCH_SIM == _ABILPX32 --# error ABILPX32 - #elif _LOONGARCH_SIM == _ABILP32 --# error ABILP32 -+# error ABILP32 not support yet - #else - # error noABI - #endif --- -2.33.0 - diff --git a/glibc-2.28-use-new-macro-LEAF-and-ENTRY-and-modify-r.patch b/glibc-2.28-use-new-macro-LEAF-and-ENTRY-and-modify-r.patch deleted file mode 100644 index fce80c4..0000000 --- a/glibc-2.28-use-new-macro-LEAF-and-ENTRY-and-modify-r.patch +++ /dev/null @@ -1,1033 +0,0 @@ -From 
18331a16d37b191b84296d8a5e96cd069fe45664 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Mon, 17 Apr 2023 17:04:57 +0800 -Subject: [PATCH 02/14] glibc-2.28: use new macro LEAF and ENTRY and modify - related code. - -Change-Id: Iac8a3cc0f57ba39cf364580966c8bfca1b54a7a5 -Signed-off-by: ticat_fp ---- - sysdeps/loongarch/__longjmp.S | 2 +- - sysdeps/loongarch/dl-trampoline.h | 2 +- - sysdeps/loongarch/lp64/memchr.S | 3 +-- - sysdeps/loongarch/lp64/memcmp.S | 3 +-- - sysdeps/loongarch/lp64/memcpy.S | 5 ++--- - sysdeps/loongarch/lp64/memset.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memchr-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memchr-lsx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S | 7 +++---- - sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S | 2 +- - sysdeps/loongarch/lp64/multiarch/memmove-lasx.S | 6 ++---- - sysdeps/loongarch/lp64/multiarch/memmove-lsx.S | 5 ++--- - sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S | 2 +- - sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memset-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memset-lsx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memset-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S | 6 +++--- - sysdeps/loongarch/lp64/multiarch/strchr-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strchr-lsx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 6 +++--- - sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 7 ++++--- - sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strlen-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S | 7 ++++--- - sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S | 3 +-- - sysdeps/loongarch/lp64/rawmemchr.S | 3 +-- - sysdeps/loongarch/lp64/s_cosf.S | 4 +--- - sysdeps/loongarch/lp64/s_sinf.S | 4 +--- - sysdeps/loongarch/lp64/stpcpy.S | 3 +-- - sysdeps/loongarch/lp64/strchr.S | 3 +-- - sysdeps/loongarch/lp64/strchrnul.S | 3 +-- - sysdeps/loongarch/lp64/strcmp.S | 3 +-- - sysdeps/loongarch/lp64/strcpy.S | 3 +-- - sysdeps/loongarch/lp64/strlen.S | 3 +-- - sysdeps/loongarch/lp64/strncmp.S | 3 +-- - sysdeps/loongarch/lp64/strnlen.S | 3 +-- - sysdeps/loongarch/lp64/strrchr.S | 3 +-- - sysdeps/loongarch/setjmp.S | 6 +++--- - sysdeps/loongarch/start.S | 2 +- - sysdeps/loongarch/sys/asm.h | 6 +++--- - sysdeps/unix/sysv/linux/loongarch/clone.S | 4 ++-- - sysdeps/unix/sysv/linux/loongarch/getcontext.S | 2 +- - sysdeps/unix/sysv/linux/loongarch/setcontext.S | 4 ++-- - sysdeps/unix/sysv/linux/loongarch/swapcontext.S | 2 +- - sysdeps/unix/sysv/linux/loongarch/sysdep.S | 4 ++-- - sysdeps/unix/sysv/linux/loongarch/sysdep.h | 4 ++-- - 
sysdeps/unix/sysv/linux/loongarch/vfork.S | 2 +- - 62 files changed, 85 insertions(+), 130 deletions(-) - -diff --git a/sysdeps/loongarch/__longjmp.S b/sysdeps/loongarch/__longjmp.S -index 68f67639..bd06b919 100644 ---- a/sysdeps/loongarch/__longjmp.S -+++ b/sysdeps/loongarch/__longjmp.S -@@ -19,7 +19,7 @@ - #include - #include - --ENTRY (__longjmp) -+ENTRY (__longjmp, 3) - REG_L ra, a0, 0*SZREG - REG_L sp, a0, 1*SZREG - REG_L x, a0, 2*SZREG -diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h -index 95639111..fb15983f 100644 ---- a/sysdeps/loongarch/dl-trampoline.h -+++ b/sysdeps/loongarch/dl-trampoline.h -@@ -29,7 +29,7 @@ - # define FRAME_SIZE (-((-10 * SZREG - 8 * 256) & ALMASK)) - #endif - --ENTRY (_dl_runtime_resolve) -+ENTRY (_dl_runtime_resolve, 3) - # Save arguments to stack. - - #ifdef __loongarch64 -diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S -index 75c4e15c..23f1fd13 100644 ---- a/sysdeps/loongarch/lp64/memchr.S -+++ b/sysdeps/loongarch/lp64/memchr.S -@@ -11,8 +11,7 @@ - #define MEMCHR_NAME memchr - #endif - --LEAF(MEMCHR_NAME) -- .align 6 -+LEAF(MEMCHR_NAME, 6) - beqz a2, L(out) - andi t1, a0, 0x7 - lu12i.w a3, 0x01010 -diff --git a/sysdeps/loongarch/lp64/memcmp.S b/sysdeps/loongarch/lp64/memcmp.S -index 9e57a924..457a4dc7 100644 ---- a/sysdeps/loongarch/lp64/memcmp.S -+++ b/sysdeps/loongarch/lp64/memcmp.S -@@ -11,8 +11,7 @@ - #define MEMCMP_NAME memcmp - #endif - --LEAF(MEMCMP_NAME) -- .align 6 -+LEAF(MEMCMP_NAME, 6) - beqz a2, L(ret) - andi a4, a1, 0x7 - andi a3, a0, 0x7 -diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S -index b6ca60a1..4791e1a4 100644 ---- a/sysdeps/loongarch/lp64/memcpy.S -+++ b/sysdeps/loongarch/lp64/memcpy.S -@@ -35,8 +35,7 @@ - st.d t6, reg, n+48; \ - st.d t7, reg, n+56; - --LEAF(MEMMOVE_NAME) -- .align 6 -+LEAF(MEMMOVE_NAME, 6) - sub.d t0, a0, a1 - bltu t0, a2, L(copy_back) - -@@ -46,7 +45,7 @@ END(MEMMOVE_NAME) - libc_hidden_builtin_def (MEMMOVE_NAME) - #endif - --LEAF(MEMCPY_NAME) -+LEAF_NO_ALIGN(MEMCPY_NAME) - - srai.d a3, a2, 4 - beqz a3, L(short_data) # less than 16 bytes -diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S -index 41629e7e..eabd7d23 100644 ---- a/sysdeps/loongarch/lp64/memset.S -+++ b/sysdeps/loongarch/lp64/memset.S -@@ -21,8 +21,7 @@ - st.d a1, a0, n+48; \ - st.d a1, a0, n+56; - --LEAF(MEMSET_NAME) -- .align 6 -+LEAF(MEMSET_NAME, 6) - move t0, a0 - andi a3, a0, 0x7 - li.w t6, 16 -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -index e63e34ae..387a35fe 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -@@ -11,8 +11,7 @@ - - #define MEMCHR __memchr_lasx - --LEAF(MEMCHR) -- .align 6 -+LEAF(MEMCHR, 6) - beqz a2, L(ret0) - add.d a3, a0, a2 - andi t0, a0, 0x3f -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -index 441db534..c6952657 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -@@ -11,8 +11,7 @@ - - #define MEMCHR __memchr_lsx - --LEAF(MEMCHR) -- .align 6 -+LEAF(MEMCHR, 6) - beqz a2, L(ret0) - add.d a3, a0, a2 - andi t0, a0, 0x1f -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -index 30e2dbe6..9151d38d 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -+++ 
b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -@@ -11,8 +11,7 @@ - - #define MEMCMP __memcmp_lasx - --LEAF(MEMCMP) -- .align 6 -+LEAF(MEMCMP, 6) - li.d t2, 32 - add.d a3, a0, a2 - add.d a4, a1, a2 -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -index 7fd349b6..8535aa22 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -@@ -10,11 +10,10 @@ - #if IS_IN (libc) - - #define MEMCMP __memcmp_lsx -- - L(magic_num): -- .align 6 -- .dword 0x0706050403020100 -- .dword 0x0f0e0d0c0b0a0908 -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 - nop - nop - ENTRY_NO_ALIGN(MEMCMP) -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -index 64b60244..96df7c40 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -@@ -31,7 +31,7 @@ - st.d t6, reg, n+48; \ - st.d t7, reg, n+56; - --LEAF(MEMCPY_NAME) -+LEAF(MEMCPY_NAME, 3) - - //1st var: dst ptr: void *a1 $r4 a0 - //2nd var: src ptr: void *a2 $r5 a1 -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -index 9537a35a..e8b2c441 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -@@ -17,8 +17,7 @@ - #define MEMMOVE_NAME __memmove_lasx - #endif - --LEAF(MEMCPY_NAME) -- .align 6 -+LEAF(MEMCPY_NAME, 6) - - li.d t0, 32 - add.d a3, a0, a2 -@@ -83,8 +82,7 @@ L(less_1bytes): - jr ra - END(MEMCPY_NAME) - --LEAF(MEMMOVE_NAME) -- .align 6 -+LEAF(MEMMOVE_NAME, 6) - - li.d t0, 32 - add.d a3, a0, a2 -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -index 26babad4..90f89c7a 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -@@ -12,8 +12,7 @@ - #define MEMCPY_NAME __memcpy_lsx - #define MEMMOVE_NAME __memmove_lsx - --LEAF(MEMCPY_NAME) -- .align 6 -+LEAF(MEMCPY_NAME, 6) - li.d t6, 16 - add.d a3, a0, a2 - add.d a4, a1, a2 -@@ -83,7 +82,7 @@ L(less_1bytes): - nop - END(MEMCPY_NAME) - --LEAF(MEMMOVE_NAME) -+LEAF(MEMMOVE_NAME, 6) - li.d t6, 16 - add.d a3, a0, a2 - add.d a4, a1, a2 -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -index 42920a1a..712b1c62 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -@@ -100,7 +100,7 @@ - LD_64(a4, -1024); \ - ST_64(a3, -1024); - --LEAF(MEMMOVE_NAME) -+LEAF(MEMMOVE_NAME, 3) - - //1st var: dest ptr: void *str1 $r4 a0 - //2nd var: src ptr: void *str2 $r5 a1 -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -index 57e1035f..9ecd0257 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -@@ -13,8 +13,7 @@ - #define MEMRCHR __memrchr_lasx - #endif - --LEAF(MEMRCHR) -- .align 6 -+LEAF(MEMRCHR, 6) - beqz a2, L(ret0) - addi.d a2, a2, -1 - add.d a3, a0, a2 -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -index eac2059a..4bdc18d8 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -@@ -11,8 +11,7 @@ - - #define MEMRCHR __memrchr_lsx - 
--LEAF(MEMRCHR) -- .align 6 -+LEAF(MEMRCHR, 6) - beqz a2, L(ret0) - addi.d a2, a2, -1 - add.d a3, a0, a2 -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -index 1bd2dda9..b53c0b7b 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -@@ -11,8 +11,7 @@ - - #define MEMSET __memset_lasx - --LEAF(MEMSET) -- .align 6 -+LEAF(MEMSET, 6) - li.d t1, 32 - move a3, a0 - xvreplgr2vr.b $xr0, a1 -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -index a3bbadb7..7ab85283 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -@@ -11,8 +11,7 @@ - - #define MEMSET __memset_lsx - --LEAF(MEMSET) -- .align 6 -+LEAF(MEMSET, 6) - li.d t1, 16 - move a3, a0 - vreplgr2vr.b $vr0, a1 -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -index 54e51546..92b0fab5 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -@@ -33,8 +33,7 @@ - //2nd var: int val $5 a1 - //3rd var: size_t num $6 a2 - --LEAF(MEMSET_NAME) -- .align 6 -+LEAF(MEMSET_NAME, 6) - bstrins.d a1, a1, 15, 8 - add.d t7, a0, a2 - bstrins.d a1, a1, 31, 16 -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -index bff92969..1e94aa50 100644 ---- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -@@ -5,8 +5,7 @@ - - # define RAWMEMCHR __rawmemchr_lasx - --LEAF(RAWMEMCHR) -- .align 6 -+LEAF(RAWMEMCHR, 6) - move a2, a0 - bstrins.d a0, zero, 4, 0 - xvld $xr0, a0, 0 -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -index 11a19c1d..40bf0cda 100644 ---- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -@@ -11,8 +11,7 @@ - - # define RAWMEMCHR __rawmemchr_lsx - --LEAF(RAWMEMCHR) -- .align 6 -+LEAF(RAWMEMCHR, 6) - move a2, a0 - bstrins.d a0, zero, 4, 0 - vld $vr0, a0, 0 -diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -index bf0eed43..0836f590 100644 ---- a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -@@ -12,9 +12,9 @@ - #define STPCPY __stpcpy_lsx - - L(magic_num): -- .align 6 -- .dword 0x0706050403020100 -- .dword 0x0f0e0d0c0b0a0908 -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 - ENTRY_NO_ALIGN(STPCPY) - pcaddi t0, -4 - andi a4, a1, 0xf -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -index ea7eb9d2..3f6ad915 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -@@ -13,8 +13,7 @@ - #define STRCHR __strchr_lasx - #endif - --LEAF(STRCHR) -- .align 6 -+LEAF(STRCHR, 6) - andi t1, a0, 0x1f - bstrins.d a0, zero, 4, 0 - xvld $xr0, a0, 0 -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -index 64ead00b..4ad9a4ad 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -@@ -13,8 +13,7 @@ - #define STRCHR __strchr_lsx - #endif - --LEAF(STRCHR) -- .align 6 -+LEAF(STRCHR, 6) - andi t1, a0, 0xf - 
bstrins.d a0, zero, 3, 0 - vld $vr0, a0, 0 -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -index de6c7f4f..365818f9 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -@@ -38,8 +38,7 @@ - #define STRCHR_NAME __strchr_unaligned - - /* char * strchr (const char *s1, int c); */ --LEAF(STRCHR_NAME) -- .align 6 -+LEAF(STRCHR_NAME, 6) - - li.w t4, 0x7 - lu12i.w a2, 0x01010 -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -index abc246ca..7b496076 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -@@ -46,8 +46,7 @@ - - /* char * strchrnul (const char *s1, int c); */ - --LEAF(STRCHRNUL_NAME) -- .align 6 -+LEAF(STRCHRNUL_NAME, 6) - li.w t4, 0x7 - lu12i.w a2, 0x01010 - bstrins.d a1, a1, 15, 8 -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -index 226b1d63..c86e3ecd 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -@@ -13,9 +13,9 @@ - - /* int strcmp (const char *s1, const char *s2); */ - L(magic_num): -- .align 6 -- .dword 0x0706050403020100 -- .dword 0x0f0e0d0c0b0a0908 -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 - - ENTRY_NO_ALIGN(STRCMP) - pcaddi t0, -4 -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S -index e29d872f..1e2e44ec 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S -@@ -73,8 +73,7 @@ - - /* int strcmp (const char *s1, const char *s2); */ - --LEAF(STRCMP_NAME) -- .align 4 -+LEAF(STRCMP_NAME, 4) - - xor tmp1, src1, src2 - lu12i.w zeroones, 0x01010 -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -index 76db561a..dbc061ad 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -@@ -14,9 +14,10 @@ - /* int strcpy (const char *s1, const char *s2); */ - - L(magic_num): -- .align 6 -- .dword 0x0706050403020100 -- .dword 0x0f0e0d0c0b0a0908 -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+ - ENTRY_NO_ALIGN(STRCPY) - pcaddi t0, -4 - andi a4, a1, 0xf -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -index c77dc1a9..150dc802 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -@@ -61,8 +61,7 @@ - - /* int strcpy (const char *s1, const char *s2); */ - --LEAF(STRCPY) -- .align 4 -+LEAF(STRCPY, 4) - move dest_backup, dest - lu12i.w zeroones, 0x01010 - lu12i.w sevenf, 0x7f7f7 -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -index cb276aa0..fd6c002d 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -@@ -13,8 +13,7 @@ - - /* size_t strlen(const char *s1); */ - --LEAF(STRLEN) -- .align 6 -+LEAF(STRLEN, 6) - move a1, a0 - bstrins.d a0, zero, 4, 0 - li.d t1, -1 -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -index 6edcac8c..6f311506 100644 ---- 
a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -@@ -13,8 +13,7 @@ - - /* size_t strlen(const char *s1); */ - --LEAF(STRLEN) -- .align 6 -+LEAF(STRLEN, 6) - move a1, a0 - bstrins.d a0, zero, 4, 0 - vld $vr0, a0, 0 -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -index 2fe0fb34..837255e3 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -@@ -31,8 +31,7 @@ - - /* size_t strlen (const char *s1); */ - --LEAF(STRLEN) -- .align 5 -+LEAF(STRLEN, 5) - nor t4, zero, zero - lu12i.w a2, 0x01010 - andi t5, a0, 0x7 -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -index 3399bf77..2c6f9614 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -@@ -14,9 +14,10 @@ - /* int strncmp (const char *s1, const char *s2); */ - - L(magic_num): -- .align 6 -- .dword 0x0706050403020100 -- .dword 0x0f0e0d0c0b0a0908 -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+ - ENTRY_NO_ALIGN(STRNCMP) - beqz a2, L(ret0) - pcaddi t0, -5 -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -index 6ec107ca..88397528 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -@@ -60,8 +60,7 @@ - - /* int strncmp (const char *s1, const char *s2); */ - --LEAF(STRNCMP) -- .align 4 -+LEAF(STRNCMP, 4) - beqz limit, strncmp_ret0 - - xor tmp1, src1, src2 -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -index 8c30f10c..910b52fe 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -@@ -13,8 +13,7 @@ - - /* size_t strnlen (const char *s1, size_t maxlen); */ - --LEAF(STRNLEN) -- .align 6 -+LEAF(STRNLEN, 6) - beqz a1, L(ret0) - andi t1, a0, 0x3f - li.d t3, 65 -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -index 388c239a..db0e90ff 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -@@ -13,8 +13,7 @@ - - /* size_t strnlen (const char *s1, size_t maxlen); */ - --LEAF(STRNLEN) -- .align 6 -+LEAF(STRNLEN, 6) - beqz a1, L(ret0) - andi t1, a0, 0x1f - li.d t3, 33 -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -index 4a195b7c..78e7444d 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -@@ -63,9 +63,8 @@ - - /* size_t strnlen (const char *s1,size_t maxlen); */ - --LEAF(STRNLEN) -+LEAF(STRNLEN, 4) - -- .align 4 - beqz limit, L(_hit_limit) - lu12i.w zeroones, 0x01010 - lu12i.w sevenf, 0x7f7f7 -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -index 6f7a5618..325458ff 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -@@ -11,8 +11,7 @@ - - #define STRRCHR __strrchr_lasx - --LEAF(STRRCHR) -- .align 6 -+LEAF(STRRCHR, 6) - andi t1, a0, 0x3f - bstrins.d a0, zero, 5, 0 - xvld $xr0, a0, 0 -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S 
b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S -index e9228a2e..e082eaab 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S -@@ -11,8 +11,7 @@ - - #define STRRCHR __strrchr_lsx - --LEAF(STRRCHR) -- .align 6 -+LEAF(STRRCHR, 6) - andi t1, a0, 0x1f - bstrins.d a0, zero, 4, 0 - vld $vr0, a0, 0 -diff --git a/sysdeps/loongarch/lp64/rawmemchr.S b/sysdeps/loongarch/lp64/rawmemchr.S -index 94b70f2d..ef1db7ed 100644 ---- a/sysdeps/loongarch/lp64/rawmemchr.S -+++ b/sysdeps/loongarch/lp64/rawmemchr.S -@@ -12,8 +12,7 @@ - #endif - - --LEAF(RAWMEMCHR_NAME) -- .align 6 -+LEAF(RAWMEMCHR_NAME, 6) - andi t1, a0, 0x7 - bstrins.d a0, zero, 2, 0 - lu12i.w a2, 0x01010 -diff --git a/sysdeps/loongarch/lp64/s_cosf.S b/sysdeps/loongarch/lp64/s_cosf.S -index 5bfabefb..9fcbe6ca 100644 ---- a/sysdeps/loongarch/lp64/s_cosf.S -+++ b/sysdeps/loongarch/lp64/s_cosf.S -@@ -74,9 +74,7 @@ - movgr2fr.d tmp, rs;\ - ffint.d.l rd, tmp - --LEAF(COSF) -- .align 2 -- .align 3 -+LEAF(COSF, 3) - /* fa0 is SP x; fa1 is DP x */ - movfr2gr.s t0, fa0 /* Bits of x */ - fcvt.d.s fa1, fa0 /* DP x */ -diff --git a/sysdeps/loongarch/lp64/s_sinf.S b/sysdeps/loongarch/lp64/s_sinf.S -index 91c9db9e..45d1c4b5 100644 ---- a/sysdeps/loongarch/lp64/s_sinf.S -+++ b/sysdeps/loongarch/lp64/s_sinf.S -@@ -74,9 +74,7 @@ - movgr2fr.d tmp, rs;\ - ffint.d.l rd, tmp - --LEAF(SINF) -- .align 2 -- .align 3 -+LEAF(SINF, 3) - /* fa0 is SP x; fa1 is DP x */ - movfr2gr.s t2, fa0 /* Bits of x */ - fcvt.d.s fa1, fa0 /* DP x */ -diff --git a/sysdeps/loongarch/lp64/stpcpy.S b/sysdeps/loongarch/lp64/stpcpy.S -index 9d4b0c8d..b6a367dc 100644 ---- a/sysdeps/loongarch/lp64/stpcpy.S -+++ b/sysdeps/loongarch/lp64/stpcpy.S -@@ -11,8 +11,7 @@ - #define STPCPY_NAME __stpcpy - #endif - --LEAF(STPCPY_NAME) -- .align 6 -+LEAF(STPCPY_NAME, 6) - andi a3, a0, 0x7 - beqz a3, L(dest_align) - sub.d a5, a1, a3 -diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S -index 63454c17..fde53a30 100644 ---- a/sysdeps/loongarch/lp64/strchr.S -+++ b/sysdeps/loongarch/lp64/strchr.S -@@ -13,8 +13,7 @@ - - /* char * strchr (const char *s1, int c); */ - --LEAF(STRCHR_NAME) -- .align 6 -+LEAF(STRCHR_NAME, 6) - slli.d t1, a0, 3 - bstrins.d a0, zero, 2, 0 - lu12i.w a2, 0x01010 -diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S -index c4532e11..a5ee09a3 100644 ---- a/sysdeps/loongarch/lp64/strchrnul.S -+++ b/sysdeps/loongarch/lp64/strchrnul.S -@@ -13,8 +13,7 @@ - - /* char * strchrnul (const char *s1, int c); */ - --LEAF(STRCHRNUL_NAME) -- .align 6 -+LEAF(STRCHRNUL_NAME, 6) - slli.d t1, a0, 3 - bstrins.d a0, zero, 2, 0 - lu12i.w a2, 0x01010 -diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S -index 22c261a3..3a863992 100644 ---- a/sysdeps/loongarch/lp64/strcmp.S -+++ b/sysdeps/loongarch/lp64/strcmp.S -@@ -19,8 +19,7 @@ - #define src1 a0 - #define src2 a1 - #define result v0 --LEAF(STRCMP_NAME) -- .align 6 -+LEAF(STRCMP_NAME, 6) - xor a4, src1, src2 - lu12i.w t5, 0x01010 - lu12i.w t6, 0x7f7f7 -diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S -index c6fe74cb..08505192 100644 ---- a/sysdeps/loongarch/lp64/strcpy.S -+++ b/sysdeps/loongarch/lp64/strcpy.S -@@ -11,8 +11,7 @@ - #define STRCPY strcpy - #endif - --LEAF(STRCPY) -- .align 6 -+LEAF(STRCPY, 6) - andi a3, a0, 0x7 - move a2, a0 - beqz a3, L(dest_align) -diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S -index dd5a8da3..71431ce2 100644 ---- 
a/sysdeps/loongarch/lp64/strlen.S -+++ b/sysdeps/loongarch/lp64/strlen.S -@@ -11,8 +11,7 @@ - #define STRLEN strlen - #endif - --LEAF(STRLEN) -- .align 6 -+LEAF(STRLEN, 6) - move a1, a0 - bstrins.d a0, zero, 2, 0 - lu12i.w a2, 0x01010 -diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S -index dcb15350..55450e55 100644 ---- a/sysdeps/loongarch/lp64/strncmp.S -+++ b/sysdeps/loongarch/lp64/strncmp.S -@@ -13,8 +13,7 @@ - - /* int strncmp (const char *s1, const char *s2); */ - --LEAF(STRNCMP) -- .align 6 -+LEAF(STRNCMP, 6) - beqz a2, L(ret0) - xor a4, a0, a1 - lu12i.w t5, 0x01010 -diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S -index 0517e206..5b5ab585 100644 ---- a/sysdeps/loongarch/lp64/strnlen.S -+++ b/sysdeps/loongarch/lp64/strnlen.S -@@ -15,8 +15,7 @@ - #. first load with t1 != 0, need to adjust t5; - #. return the less one of both strlen(s) and a1; - --LEAF(STRNLEN) -- .align 6 -+LEAF(STRNLEN, 6) - beqz a1, L(out) - lu12i.w a2, 0x01010 - andi t1, a0, 0x7 -diff --git a/sysdeps/loongarch/lp64/strrchr.S b/sysdeps/loongarch/lp64/strrchr.S -index 3bf92ecd..df7fcb6b 100644 ---- a/sysdeps/loongarch/lp64/strrchr.S -+++ b/sysdeps/loongarch/lp64/strrchr.S -@@ -11,8 +11,7 @@ - #define STRRCHR_NAME strrchr - #endif - --LEAF(STRRCHR_NAME) -- .align 6 -+LEAF(STRRCHR_NAME, 6) - slli.d t1, a0, 3 - bstrins.d a0, zero, 2, 0 - lu12i.w a2, 0x01010 -diff --git a/sysdeps/loongarch/setjmp.S b/sysdeps/loongarch/setjmp.S -index da09a93c..c4e6d01c 100644 ---- a/sysdeps/loongarch/setjmp.S -+++ b/sysdeps/loongarch/setjmp.S -@@ -19,14 +19,14 @@ - #include - #include - --ENTRY (_setjmp) -+ENTRY (_setjmp, 3) - li.w a1,0 - b __sigsetjmp - END (_setjmp) --ENTRY (setjmp) -+ENTRY (setjmp, 3) - li.w a1,1 - END (setjmp) --ENTRY (__sigsetjmp) -+ENTRY (__sigsetjmp, 3) - REG_S ra, a0, 0*SZREG - REG_S sp, a0, 1*SZREG - REG_S x, a0, 2*SZREG -diff --git a/sysdeps/loongarch/start.S b/sysdeps/loongarch/start.S -index cf0a14b5..b83221e4 100644 ---- a/sysdeps/loongarch/start.S -+++ b/sysdeps/loongarch/start.S -@@ -17,7 +17,7 @@ __libc_start_main (int (*main) (int, char **, char **), - void *stack_end); - */ - --ENTRY (ENTRY_POINT) -+ENTRY (ENTRY_POINT, 3) - /* Terminate call stack by noting ra is undefined. Use a dummy - .cfi_label to force starting the FDE. */ - .cfi_label .Ldummy -diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h -index f64bfb2b..357a5ba3 100644 ---- a/sysdeps/loongarch/sys/asm.h -+++ b/sysdeps/loongarch/sys/asm.h -@@ -27,15 +27,15 @@ - - - /* Declare leaf routine. */ --#define LEAF(symbol) \ -+#define LEAF(symbol, aln) \ - .text; \ - .globl symbol; \ -- .align 3; \ -+ .align aln; \ - .type symbol, @function; \ - symbol: \ - cfi_startproc; \ - --# define ENTRY(symbol) LEAF(symbol) -+# define ENTRY(symbol, aln) LEAF(symbol, aln) - - #define LEAF_NO_ALIGN(symbol) \ - .text; \ -diff --git a/sysdeps/unix/sysv/linux/loongarch/clone.S b/sysdeps/unix/sysv/linux/loongarch/clone.S -index f0fc566e..1180a11d 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/clone.S -+++ b/sysdeps/unix/sysv/linux/loongarch/clone.S -@@ -29,7 +29,7 @@ - /* int clone(int (*fn)(void *arg), void *child_stack, int flags, void *arg, - void *parent_tidptr, void *tls, void *child_tidptr) */ - --ENTRY (__clone) -+ENTRY (__clone, 3) - - /* Align stack to 16 or 8 bytes per the ABI. */ - #if _LOONGARCH_SIM == _ABILP64 -@@ -74,7 +74,7 @@ L (error): - its own function so that we can terminate the stack trace with our - debug info. 
*/ - --ENTRY (__thread_start) -+ENTRY (__thread_start, 3) - L (thread_start): - /* Terminate call stack by noting ra is undefined. Use a dummy - .cfi_label to force starting the FDE. */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/getcontext.S b/sysdeps/unix/sysv/linux/loongarch/getcontext.S -index 9c28d958..6391850e 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/getcontext.S -+++ b/sysdeps/unix/sysv/linux/loongarch/getcontext.S -@@ -21,7 +21,7 @@ - /* int getcontext (ucontext_t *ucp) */ - - .text --LEAF (__getcontext) -+LEAF (__getcontext, 3) - SAVE_INT_REG (ra, 1, a0) - SAVE_INT_REG (sp, 3, a0) - SAVE_INT_REG (zero, 4, a0) /* return 0 by overwriting a0. */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/setcontext.S b/sysdeps/unix/sysv/linux/loongarch/setcontext.S -index c96ec43c..3a043a63 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/setcontext.S -+++ b/sysdeps/unix/sysv/linux/loongarch/setcontext.S -@@ -28,7 +28,7 @@ - other than the PRESERVED state. */ - - .text --LEAF (__setcontext) -+LEAF (__setcontext, 3) - - addi.d sp, sp, -16 - st.d a0, sp, 0 /* Save ucp to stack. */ -@@ -94,7 +94,7 @@ LEAF (__setcontext) - PSEUDO_END (__setcontext) - weak_alias (__setcontext, setcontext) - --LEAF (__start_context) -+LEAF (__start_context, 3) - - /* Terminate call stack by noting ra == 0. Happily, s0 == 0 here. */ - cfi_register (1, 23) -diff --git a/sysdeps/unix/sysv/linux/loongarch/swapcontext.S b/sysdeps/unix/sysv/linux/loongarch/swapcontext.S -index d839dd87..c9024d5f 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/swapcontext.S -+++ b/sysdeps/unix/sysv/linux/loongarch/swapcontext.S -@@ -20,7 +20,7 @@ - - /* int swapcontext (ucontext_t *oucp, const ucontext_t *ucp) */ - --LEAF (__swapcontext) -+LEAF (__swapcontext, 3) - ori a2, sp, 0 /* Save sp to a2. */ - addi.d sp, sp, -16 - st.d a1, sp, 0 -diff --git a/sysdeps/unix/sysv/linux/loongarch/sysdep.S b/sysdeps/unix/sysv/linux/loongarch/sysdep.S -index a8094283..19c03fb4 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/sysdep.S -+++ b/sysdeps/unix/sysv/linux/loongarch/sysdep.S -@@ -22,13 +22,13 @@ - # define errno __libc_errno - #endif - --ENTRY (__syscall_error) -+ENTRY (__syscall_error, 3) - /* Fall through to __syscall_set_errno. */ - END (__syscall_error) - - /* Non-standard calling convention: argument in a0, return address in t0, - and clobber only t1. */ --ENTRY (__syscall_set_errno) -+ENTRY (__syscall_set_errno, 3) - /* We got here because a0 < 0, but only codes in the range [-4095, -1] - represent errors. Otherwise, just return the result normally. */ - -diff --git a/sysdeps/unix/sysv/linux/loongarch/sysdep.h b/sysdeps/unix/sysv/linux/loongarch/sysdep.h -index f50946d4..7b45f609 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/sysdep.h -+++ b/sysdeps/unix/sysv/linux/loongarch/sysdep.h -@@ -14,7 +14,7 @@ - errors by setting a0 to a value between -1 and -4095. */ - # undef PSEUDO - # define PSEUDO(name, syscall_name, args) \ -- ENTRY (name); \ -+ ENTRY (name, 3); \ - li.d a7, SYS_ify (syscall_name); \ - syscall 0; \ - li.d a7, -4096; \ -@@ -58,7 +58,7 @@ - /* Performs a system call, not setting errno. 
*/ - # undef PSEUDO_NEORRNO - # define PSEUDO_NOERRNO(name, syscall_name, args) \ -- ENTRY (name); \ -+ ENTRY (name, 3); \ - li.d a7, SYS_ify (syscall_name); \ - syscall 0; - -diff --git a/sysdeps/unix/sysv/linux/loongarch/vfork.S b/sysdeps/unix/sysv/linux/loongarch/vfork.S -index 83cf141f..5db6720a 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/vfork.S -+++ b/sysdeps/unix/sysv/linux/loongarch/vfork.S -@@ -25,7 +25,7 @@ - replaced by a call to `execve'. Return -1 for errors, 0 to the new process, - and the process ID of the new process to the old process. */ - --ENTRY (__vfork) -+ENTRY (__vfork, 3) - - - li.d a0, 0x4111 /* CLONE_VM | CLONE_VFORK | SIGCHLD */ --- -2.33.0 - diff --git a/glibc-Add-Hygon-Support.patch b/glibc-Add-Hygon-Support.patch deleted file mode 100644 index c108f84..0000000 --- a/glibc-Add-Hygon-Support.patch +++ /dev/null @@ -1,28 +0,0 @@ -From ed64d30125f855e25ac6f12d8863857dfd3e2cbe Mon Sep 17 00:00:00 2001 -From: lijing22222 -Date: Fri, 1 Mar 2024 16:00:15 +0800 -Subject: [PATCH] Add Hygon Support - ---- - sysdeps/x86/cpu-features.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c -index 91042505..0ce37a9a 100644 ---- a/sysdeps/x86/cpu-features.c -+++ b/sysdeps/x86/cpu-features.c -@@ -527,8 +527,9 @@ init_cpu_features (struct cpu_features *cpu_features) - cpu_features->preferred[index_arch_Prefer_No_AVX512] - |= bit_arch_Prefer_No_AVX512; - } -- /* This spells out "AuthenticAMD". */ -- else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) -+ /* This spells out "AuthenticAMD" or "HygonGenuine". */ -+ else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)||(ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)) -+ - { - unsigned int extended_model; - --- -2.17.1 - diff --git a/glibc-Add-a-testcase-to-check-alignment-of-PT_LOAD-segment-2.patch b/glibc-Add-a-testcase-to-check-alignment-of-PT_LOAD-segment-2.patch deleted file mode 100644 index 73f1a06..0000000 --- a/glibc-Add-a-testcase-to-check-alignment-of-PT_LOAD-segment-2.patch +++ /dev/null @@ -1,147 +0,0 @@ -From 58f93dff514cc0bdf3c72eff590dcf5fe5bf9e00 Mon Sep 17 00:00:00 2001 -From: "H.J. 
Lu" -Date: Wed, 19 Jul 2023 23:09:09 +0800 -Subject: [PATCH 3/6] Add a testcase to check alignment of PT_LOAD segment [BZ - #28676] - -Backport from master commit: fc2334a - -Signed-off-by: Rongwei Wang ---- - elf/Makefile | 13 ++++++++++++- - elf/tst-align3.c | 38 ++++++++++++++++++++++++++++++++++++++ - elf/tst-alignmod3.c | 32 ++++++++++++++++++++++++++++++++ - 3 files changed, 82 insertions(+), 1 deletion(-) - create mode 100644 elf/tst-align3.c - create mode 100644 elf/tst-alignmod3.c - -diff --git a/elf/Makefile b/elf/Makefile -index 634c3113..442817ca 100644 ---- a/elf/Makefile -+++ b/elf/Makefile -@@ -331,6 +331,7 @@ tests += \ - tst-addr1 \ - tst-align \ - tst-align2 \ -+ tst-align3 \ - tst-audit-tlsdesc \ - tst-audit-tlsdesc-dlopen \ - tst-audit1 \ -@@ -466,7 +467,9 @@ endif - test-srcs = \ - tst-pathopt - # tests-srcs -- -+ifeq (yes,$(have-fpie)) -+tests-pie += tst-align3 -+endif - selinux-enabled := $(shell cat /selinux/enforce 2> /dev/null) - - ifneq ($(selinux-enabled),1) -@@ -647,6 +650,7 @@ modules-names = \ - tst-absolute-zero-lib \ - tst-alignmod \ - tst-alignmod2 \ -+ tst-alignmod3 \ - tst-array2dep \ - tst-array5dep \ - tst-audit-tlsdesc-mod1 \ -@@ -1669,6 +1673,13 @@ CFLAGS-tst-alignmod2.c += $(stack-align-test-flags) - $(objpfx)tst-align: $(libdl) - $(objpfx)tst-align.out: $(objpfx)tst-alignmod.so - $(objpfx)tst-align2: $(objpfx)tst-alignmod2.so -+$(objpfx)tst-align3: $(objpfx)tst-alignmod3.so -+ifeq (yes,$(have-fpie)) -+CFLAGS-tst-align3.c += $(PIE-ccflag) -+endif -+LDFLAGS-tst-align3 += -Wl,-z,max-page-size=0x200000 -+LDFLAGS-tst-alignmod3.so += -Wl,-z,max-page-size=0x200000 -+$(objpfx)tst-alignmod3.so: $(libsupport) - - $(objpfx)unload3: $(libdl) - $(objpfx)unload3.out: $(objpfx)unload3mod1.so $(objpfx)unload3mod2.so \ -diff --git a/elf/tst-align3.c b/elf/tst-align3.c -new file mode 100644 -index 00000000..ac86d623 ---- /dev/null -+++ b/elf/tst-align3.c -@@ -0,0 +1,38 @@ -+/* Check alignment of PT_LOAD segment in a shared library. -+ Copyright (C) 2021 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+/* This should cover all possible page sizes we currently support. */ -+#define ALIGN 0x200000 -+ -+int bar __attribute__ ((aligned (ALIGN))) = 1; -+ -+extern int do_load_test (void); -+ -+static int -+do_test (void) -+{ -+ printf ("bar: %p\n", &bar); -+ TEST_VERIFY (is_aligned (&bar, ALIGN) == 0); -+ -+ return do_load_test (); -+} -+ -+#include -diff --git a/elf/tst-alignmod3.c b/elf/tst-alignmod3.c -new file mode 100644 -index 00000000..0d33f237 ---- /dev/null -+++ b/elf/tst-alignmod3.c -@@ -0,0 +1,32 @@ -+/* Check alignment of PT_LOAD segment in a shared library. -+ Copyright (C) 2021 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+/* This should cover all possible page sizes we currently support. */ -+#define ALIGN 0x200000 -+ -+int foo __attribute__ ((aligned (ALIGN))) = 1; -+ -+void -+do_load_test (void) -+{ -+ printf ("foo: %p\n", &foo); -+ TEST_VERIFY (is_aligned (&foo, ALIGN) == 0); -+} --- -2.27.0 - diff --git a/glibc-Properly-check-stack-alignment-BZ-27901.patch b/glibc-Properly-check-stack-alignment-BZ-27901.patch deleted file mode 100644 index 20dff99..0000000 --- a/glibc-Properly-check-stack-alignment-BZ-27901.patch +++ /dev/null @@ -1,325 +0,0 @@ -From 6152628751bf13f74c9336263a9c22f29ccd8ffb Mon Sep 17 00:00:00 2001 -From: "H.J. Lu" -Date: Wed, 19 Jul 2023 23:01:53 +0800 -Subject: [PATCH 1/6] Properly check stack alignment [BZ #27901] - -1. Replace - -if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) - -which may be optimized out by compiler, with - -int -__attribute__ ((weak, noclone, noinline)) -is_aligned (void *p, int align) -{ - return (((uintptr_t) p) & (align - 1)) != 0; -} - -2. Add TEST_STACK_ALIGN_INIT to TEST_STACK_ALIGN. -3. Add a common TEST_STACK_ALIGN_INIT to check 16-byte stack alignment -for both i386 and x86-64. -4. Update powerpc to use TEST_STACK_ALIGN_INIT. - -Reviewed-by: Carlos O'Donell -Signed-off-by: Rongwei Wang ---- - sysdeps/generic/tst-stack-align.h | 40 ++++++++++++++++--------- - sysdeps/i386/i686/tst-stack-align.h | 44 --------------------------- - sysdeps/i386/tst-stack-align.h | 41 ------------------------- - sysdeps/powerpc/tst-stack-align.h | 27 +++++------------ - sysdeps/x86/tst-stack-align.h | 28 ++++++++++++++++++ - sysdeps/x86_64/tst-stack-align.h | 46 ----------------------------- - 6 files changed, 61 insertions(+), 165 deletions(-) - delete mode 100644 sysdeps/i386/i686/tst-stack-align.h - delete mode 100644 sysdeps/i386/tst-stack-align.h - create mode 100644 sysdeps/x86/tst-stack-align.h - delete mode 100644 sysdeps/x86_64/tst-stack-align.h - -diff --git a/sysdeps/generic/tst-stack-align.h b/sysdeps/generic/tst-stack-align.h -index e5cb3310..e6050901 100644 ---- a/sysdeps/generic/tst-stack-align.h -+++ b/sysdeps/generic/tst-stack-align.h -@@ -1,4 +1,5 @@ --/* Copyright (C) 2003-2018 Free Software Foundation, Inc. -+/* Check stack alignment. Generic version. -+ Copyright (C) 2003-2021 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or -@@ -18,17 +19,28 @@ - #include - #include - -+int -+__attribute__ ((weak, noclone, noinline)) -+is_aligned (void *p, int align) -+{ -+ return (((uintptr_t) p) & (align - 1)) != 0; -+} -+ -+#ifndef TEST_STACK_ALIGN_INIT -+# define TEST_STACK_ALIGN_INIT() 0 -+#endif -+ - #define TEST_STACK_ALIGN() \ -- ({ \ -- double _d = 12.0; \ -- long double _ld = 15.0; \ -- int _ret = 0; \ -- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ -- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \ -- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \ -- _ret = 1; \ -- _ret; \ -- }) -+ ({ \ -+ double _d = 12.0; \ -+ long double _ld = 15.0; \ -+ int _ret = TEST_STACK_ALIGN_INIT (); \ -+ \ -+ printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ -+ _ret += is_aligned (&_d, __alignof (double)); \ -+ \ -+ printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, \ -+ __alignof (long double)); \ -+ _ret += is_aligned (&_ld, __alignof (long double)); \ -+ _ret; \ -+ }) -diff --git a/sysdeps/i386/i686/tst-stack-align.h b/sysdeps/i386/i686/tst-stack-align.h -deleted file mode 100644 -index 975f26ef..00000000 ---- a/sysdeps/i386/i686/tst-stack-align.h -+++ /dev/null -@@ -1,44 +0,0 @@ --/* Copyright (C) 2003-2018 Free Software Foundation, Inc. -- This file is part of the GNU C Library. -- -- The GNU C Library is free software; you can redistribute it and/or -- modify it under the terms of the GNU Lesser General Public -- License as published by the Free Software Foundation; either -- version 2.1 of the License, or (at your option) any later version. -- -- The GNU C Library is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- Lesser General Public License for more details. -- -- You should have received a copy of the GNU Lesser General Public -- License along with the GNU C Library; if not, see -- . */ -- --#include --#include --#ifndef __SSE__ --#include_next --#else --#include -- --#define TEST_STACK_ALIGN() \ -- ({ \ -- __m128 _m; \ -- double _d = 12.0; \ -- long double _ld = 15.0; \ -- int _ret = 0; \ -- printf ("__m128: %p %zu\n", &_m, __alignof (__m128)); \ -- if ((((uintptr_t) &_m) & (__alignof (__m128) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ -- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \ -- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \ -- _ret = 1; \ -- _ret; \ -- }) --#endif -diff --git a/sysdeps/i386/tst-stack-align.h b/sysdeps/i386/tst-stack-align.h -deleted file mode 100644 -index 394ff773..00000000 ---- a/sysdeps/i386/tst-stack-align.h -+++ /dev/null -@@ -1,41 +0,0 @@ --/* Copyright (C) 2004-2018 Free Software Foundation, Inc. -- This file is part of the GNU C Library. -- -- The GNU C Library is free software; you can redistribute it and/or -- modify it under the terms of the GNU Lesser General Public -- License as published by the Free Software Foundation; either -- version 2.1 of the License, or (at your option) any later version. 
-- -- The GNU C Library is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- Lesser General Public License for more details. -- -- You should have received a copy of the GNU Lesser General Public -- License along with the GNU C Library; if not, see -- . */ -- --#include --#include -- --typedef struct { int i[4]; } int_al16 __attribute__((aligned (16))); -- --#define TEST_STACK_ALIGN() \ -- ({ \ -- int_al16 _m; \ -- double _d = 12.0; \ -- long double _ld = 15.0; \ -- int _ret = 0; \ -- printf ("int_al16: %p %zu\n", &_m, __alignof (int_al16)); \ -- if ((((uintptr_t) &_m) & (__alignof (int_al16) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ -- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \ -- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \ -- _ret = 1; \ -- _ret; \ -- }) -diff --git a/sysdeps/powerpc/tst-stack-align.h b/sysdeps/powerpc/tst-stack-align.h -index 7fd7013b..d7400b28 100644 ---- a/sysdeps/powerpc/tst-stack-align.h -+++ b/sysdeps/powerpc/tst-stack-align.h -@@ -1,4 +1,5 @@ --/* Copyright (C) 2005-2018 Free Software Foundation, Inc. -+/* Check stack alignment. PowerPC version. -+ Copyright (C) 2005-2021 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or -@@ -15,10 +16,7 @@ - License along with the GNU C Library; if not, see - . */ - --#include --#include -- --#define TEST_STACK_ALIGN() \ -+#define TEST_STACK_ALIGN_INIT() \ - ({ \ - /* Altivec __vector int etc. needs 16byte aligned stack. \ - Instead of using altivec.h here, use aligned attribute instead. */ \ -@@ -27,20 +25,9 @@ - int _i __attribute__((aligned (16))); \ - int _j[3]; \ - } _s = { ._i = 18, ._j[0] = 19, ._j[1] = 20, ._j[2] = 21 }; \ -- double _d = 12.0; \ -- long double _ld = 15.0; \ -- int _ret = 0; \ - printf ("__vector int: { %d, %d, %d, %d } %p %zu\n", _s._i, _s._j[0], \ - _s._j[1], _s._j[2], &_s, __alignof (_s)); \ -- if ((((uintptr_t) &_s) & (__alignof (_s) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ -- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \ -- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \ -- _ret = 1; \ -- _ret; \ -- }) -+ is_aligned (&_s, __alignof (_s)); \ -+ }) -+ -+#include_next -diff --git a/sysdeps/x86/tst-stack-align.h b/sysdeps/x86/tst-stack-align.h -new file mode 100644 -index 00000000..02ecc72d ---- /dev/null -+++ b/sysdeps/x86/tst-stack-align.h -@@ -0,0 +1,28 @@ -+/* Check stack alignment. X86 version. -+ Copyright (C) 2021 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+typedef struct { int i[16]; } int_al16 __attribute__((aligned (16))); -+ -+#define TEST_STACK_ALIGN_INIT() \ -+ ({ \ -+ int_al16 _m; \ -+ printf ("int_al16: %p %zu\n", &_m, __alignof (int_al16)); \ -+ is_aligned (&_m, __alignof (int_al16)); \ -+ }) -+ -+#include_next -diff --git a/sysdeps/x86_64/tst-stack-align.h b/sysdeps/x86_64/tst-stack-align.h -deleted file mode 100644 -index b2ef77f6..00000000 ---- a/sysdeps/x86_64/tst-stack-align.h -+++ /dev/null -@@ -1,46 +0,0 @@ --/* Copyright (C) 2003-2018 Free Software Foundation, Inc. -- This file is part of the GNU C Library. -- -- The GNU C Library is free software; you can redistribute it and/or -- modify it under the terms of the GNU Lesser General Public -- License as published by the Free Software Foundation; either -- version 2.1 of the License, or (at your option) any later version. -- -- The GNU C Library is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- Lesser General Public License for more details. -- -- You should have received a copy of the GNU Lesser General Public -- License along with the GNU C Library; if not, see -- . */ -- --#include --#include -- --#define TEST_STACK_ALIGN() \ -- ({ \ -- /* AMD64 ABI mandates 16byte aligned stack. \ -- Unfortunately, current GCC doesn't support __int128 or __float128 \ -- types, so use aligned attribute instead. */ \ -- struct _S \ -- { \ -- int _i __attribute__((aligned (16))); \ -- int _pad[3]; \ -- } _s = { ._i = 18 }; \ -- double _d = 12.0; \ -- long double _ld = 15.0; \ -- int _ret = 0; \ -- printf ("__int128: %d %p %zu\n", _s._i, &_s, __alignof (_s)); \ -- if ((((uintptr_t) &_s) & (__alignof (_s) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ -- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \ -- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \ -- _ret = 1; \ -- _ret; \ -- }) --- -2.27.0 - diff --git a/glibc-RHEL-21519.patch b/glibc-RHEL-10481.patch similarity index 99% rename from glibc-RHEL-21519.patch rename to glibc-RHEL-10481.patch index 7e22998..825f13c 100644 --- a/glibc-RHEL-21519.patch +++ b/glibc-RHEL-10481.patch @@ -16,7 +16,7 @@ Date: Thu Nov 16 19:55:35 2023 +0100 Reviewed-by: Adhemerval Zanella diff --git a/elf/dl-close.c b/elf/dl-close.c -index 22225efb3226c3e1..16a39f5bf17b440f 100644 +index 66524b6708c59f29..8107c2d5f6ad2bc6 100644 --- a/elf/dl-close.c +++ b/elf/dl-close.c @@ -182,6 +182,16 @@ _dl_close_worker (struct link_map *map, bool force) diff --git a/glibc-RHEL-22441.patch b/glibc-RHEL-1192.patch similarity index 100% rename from glibc-RHEL-22441.patch rename to glibc-RHEL-1192.patch diff --git a/glibc-RHEL-13720-1.patch b/glibc-RHEL-13720-1.patch new file mode 100644 index 0000000..5eab70c --- /dev/null +++ b/glibc-RHEL-13720-1.patch @@ -0,0 +1,72 @@ +commit 2aa0974d2573441bffd596b07bff8698b1f2f18c +Author: Florian Weimer +Date: Fri Oct 20 14:29:50 2023 +0200 + + elf: ldconfig should skip temporary files created by package managers + + This avoids crashes due to partially written files, after a package + update is interrupted. 
+ + Reviewed-by: Adhemerval Zanella + +Conflicts: + elf/ldconfig.c + (missing alloca removal downstream) + +diff --git a/elf/ldconfig.c b/elf/ldconfig.c +index 8c66d7e5426d8cc4..51de08f91fbaf093 100644 +--- a/elf/ldconfig.c ++++ b/elf/ldconfig.c +@@ -771,6 +771,31 @@ struct dlib_entry + struct dlib_entry *next; + }; + ++/* Skip some temporary DSO files. These files may be partially written ++ and lead to ldconfig crashes when examined. */ ++static bool ++skip_dso_based_on_name (const char *name, size_t len) ++{ ++ /* Skip temporary files created by the prelink program. Files with ++ names like these are never really DSOs we want to look at. */ ++ if (len >= sizeof (".#prelink#") - 1) ++ { ++ if (strcmp (name + len - sizeof (".#prelink#") + 1, ++ ".#prelink#") == 0) ++ return true; ++ if (len >= sizeof (".#prelink#.XXXXXX") - 1 ++ && memcmp (name + len - sizeof (".#prelink#.XXXXXX") ++ + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0) ++ return true; ++ } ++ /* Skip temporary files created by RPM. */ ++ if (memchr (name, len, ';') != NULL) ++ return true; ++ /* Skip temporary files created by dpkg. */ ++ if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0) ++ return true; ++ return false; ++} + + static void + search_dir (const struct dir_entry *entry) +@@ -849,18 +874,8 @@ search_dir (const struct dir_entry *entry) + continue; + + size_t len = strlen (direntry->d_name); +- /* Skip temporary files created by the prelink program. Files with +- names like these are never really DSOs we want to look at. */ +- if (len >= sizeof (".#prelink#") - 1) +- { +- if (strcmp (direntry->d_name + len - sizeof (".#prelink#") + 1, +- ".#prelink#") == 0) +- continue; +- if (len >= sizeof (".#prelink#.XXXXXX") - 1 +- && memcmp (direntry->d_name + len - sizeof (".#prelink#.XXXXXX") +- + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0) +- continue; +- } ++ if (skip_dso_based_on_name (direntry->d_name, len)) ++ continue; + len += strlen (entry->path) + 2; + if (len > file_name_len) + { diff --git a/glibc-RHEL-13720-2.patch b/glibc-RHEL-13720-2.patch new file mode 100644 index 0000000..69d5a90 --- /dev/null +++ b/glibc-RHEL-13720-2.patch @@ -0,0 +1,61 @@ +commit cfb5a97a93ea656e3b2263e42142a4032986d9ba +Author: Florian Weimer +Date: Mon Oct 23 12:53:16 2023 +0200 + + ldconfig: Fixes for skipping temporary files. + + Arguments to a memchr call were swapped, causing incorrect skipping + of files. + + Files related to dpkg have different names: they actually end in + .dpkg-new and .dpkg-tmp, not .tmp as I mistakenly assumed. + + Fixes commit 2aa0974d2573441bffd59 ("elf: ldconfig should skip + temporary files created by package managers"). + +diff --git a/elf/ldconfig.c b/elf/ldconfig.c +index 51de08f91fbaf093..fb19dd68d41c07a4 100644 +--- a/elf/ldconfig.c ++++ b/elf/ldconfig.c +@@ -771,6 +771,17 @@ struct dlib_entry + struct dlib_entry *next; + }; + ++/* Return true if the N bytes at NAME end with with the characters in ++ the string SUFFIX. (NAME[N + 1] does not have to be a null byte.) ++ Expected to be called with a string literal for SUFFIX. */ ++static inline bool ++endswithn (const char *name, size_t n, const char *suffix) ++{ ++ return (n >= strlen (suffix) ++ && memcmp (name + n - strlen (suffix), suffix, ++ strlen (suffix)) == 0); ++} ++ + /* Skip some temporary DSO files. These files may be partially written + and lead to ldconfig crashes when examined. 
*/ + static bool +@@ -780,8 +791,7 @@ skip_dso_based_on_name (const char *name, size_t len) + names like these are never really DSOs we want to look at. */ + if (len >= sizeof (".#prelink#") - 1) + { +- if (strcmp (name + len - sizeof (".#prelink#") + 1, +- ".#prelink#") == 0) ++ if (endswithn (name, len, ".#prelink#")) + return true; + if (len >= sizeof (".#prelink#.XXXXXX") - 1 + && memcmp (name + len - sizeof (".#prelink#.XXXXXX") +@@ -789,10 +799,11 @@ skip_dso_based_on_name (const char *name, size_t len) + return true; + } + /* Skip temporary files created by RPM. */ +- if (memchr (name, len, ';') != NULL) ++ if (memchr (name, ';', len) != NULL) + return true; + /* Skip temporary files created by dpkg. */ +- if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0) ++ if (endswithn (name, len, ".dpkg-new") ++ || endswithn (name, len, ".dpkg-tmp")) + return true; + return false; + } diff --git a/glibc-RHEL-15696-1.patch b/glibc-RHEL-15696-1.patch new file mode 100644 index 0000000..804de54 --- /dev/null +++ b/glibc-RHEL-15696-1.patch @@ -0,0 +1,259 @@ +From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:23:59 -0800 +Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes memchr/wmemchr for x32. Tested on x86-64 and x32. On +x86-64, libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/memchr.S: Use RDX_LP for length. Clear the + upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and + tst-size_t-wmemchr. + * sysdeps/x86_64/x32/test-size_t.h: New file. + * sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise. + * sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise. +--- + sysdeps/x86_64/memchr.S | 10 ++-- + sysdeps/x86_64/multiarch/memchr-avx2.S | 8 ++- + sysdeps/x86_64/x32/Makefile | 8 +++ + sysdeps/x86_64/x32/test-size_t.h | 35 ++++++++++++ + sysdeps/x86_64/x32/tst-size_t-memchr.c | 72 +++++++++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++ + 6 files changed, 148 insertions(+), 5 deletions(-) + create mode 100644 sysdeps/x86_64/x32/test-size_t.h + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c + +Conflicts: + ChangeLog + (removed) + NEWS + (removed) + +diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S +index feef5d4f..cb320257 100644 +--- a/sysdeps/x86_64/memchr.S ++++ b/sysdeps/x86_64/memchr.S +@@ -34,12 +34,16 @@ ENTRY(MEMCHR) + mov %edi, %ecx + + #ifdef USE_AS_WMEMCHR +- test %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz L(return_null) +- shl $2, %rdx ++ shl $2, %RDX_LP + #else ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. 
*/ ++ movl %edx, %edx ++# endif + punpcklbw %xmm1, %xmm1 +- test %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz L(return_null) + punpcklbw %xmm1, %xmm1 + #endif +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S +index 5f5e7725..c81da19b 100644 +--- a/sysdeps/x86_64/multiarch/memchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S +@@ -40,16 +40,20 @@ + ENTRY (MEMCHR) + # ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ +- testq %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz L(null) + # endif + movl %edi, %ecx + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + # ifdef USE_AS_WMEMCHR +- shl $2, %rdx ++ shl $2, %RDX_LP + vpbroadcastd %xmm0, %ymm0 + # else ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif + vpbroadcastb %xmm0, %ymm0 + # endif + /* Check if we may cross page boundary with one vector load. */ +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index f2ebc24f..7d528889 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -4,3 +4,11 @@ ifeq ($(subdir),math) + # 64-bit llround. Add -fno-builtin-lround to silence the compiler. + CFLAGS-s_llround.c += -fno-builtin-lround + endif ++ ++ifeq ($(subdir),string) ++tests += tst-size_t-memchr ++endif ++ ++ifeq ($(subdir),wcsmbs) ++tests += tst-size_t-wmemchr ++endif +diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h +new file mode 100644 +index 00000000..78a94086 +--- /dev/null ++++ b/sysdeps/x86_64/x32/test-size_t.h +@@ -0,0 +1,35 @@ ++/* Test string/memory functions with size_t in the lower 32 bits of ++ 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define TEST_MAIN ++#include ++ ++/* On x32, parameter_t may be passed in a 64-bit register with the LEN ++ field in the lower 32 bits. When the LEN field of 64-bit register ++ is passed to string/memory function as the size_t parameter, only ++ the lower 32 bits can be used. */ ++typedef struct ++{ ++ union ++ { ++ size_t len; ++ void (*fn) (void); ++ }; ++ void *p; ++} parameter_t; +diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c +new file mode 100644 +index 00000000..29a3daf1 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c +@@ -0,0 +1,72 @@ ++/* Test memchr with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef WIDE ++# define TEST_NAME "memchr" ++#else ++# define TEST_NAME "wmemchr" ++#endif /* WIDE */ ++#include "test-size_t.h" ++ ++#ifndef WIDE ++# define MEMCHR memchr ++# define CHAR char ++# define UCHAR unsigned char ++#else ++# include ++# define MEMCHR wmemchr ++# define CHAR wchar_t ++# define UCHAR wchar_t ++#endif /* WIDE */ ++ ++IMPL (MEMCHR, 1) ++ ++typedef CHAR * (*proto_t) (const CHAR*, int, size_t); ++ ++static CHAR * ++__attribute__ ((noinline, noclone)) ++do_memchr (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, (uintptr_t) b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 }; ++ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ c.fn = impl->fn; ++ CHAR *res = do_memchr (src, c); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %p != NULL", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c +new file mode 100644 +index 00000000..877801d6 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c +@@ -0,0 +1,20 @@ ++/* Test wmemchr with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-memchr.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-10.patch b/glibc-RHEL-15696-10.patch new file mode 100644 index 0000000..10bd49d --- /dev/null +++ b/glibc-RHEL-15696-10.patch @@ -0,0 +1,41 @@ +From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sun, 9 Jan 2022 16:02:21 -0600 +Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755] +Content-type: text/plain; charset=UTF-8 + +Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to +__wcscmp_avx2. For x86_64 this covers the entire address range so any +length larger could not possibly be used to bound `s1` or `s2`. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. 
+ +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 156c1949..8fb8eedc 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -83,6 +83,16 @@ ENTRY (STRCMP) + je L(char0) + jb L(zero) + # ifdef USE_AS_WCSCMP ++# ifndef __ILP32__ ++ movq %rdx, %rcx ++ /* Check if length could overflow when multiplied by ++ sizeof(wchar_t). Checking top 8 bits will cover all potential ++ overflow cases as well as redirect cases where its impossible to ++ length to bound a valid memory region. In these cases just use ++ 'wcscmp'. */ ++ shrq $56, %rcx ++ jnz __wcscmp_avx2 ++# endif + /* Convert units: from wide to byte char. */ + shl $2, %RDX_LP + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-100.patch b/glibc-RHEL-15696-100.patch new file mode 100644 index 0000000..0e779e4 --- /dev/null +++ b/glibc-RHEL-15696-100.patch @@ -0,0 +1,257 @@ +From 244b415d386487521882debb845a040a4758cb18 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 25 Mar 2022 17:13:33 -0500 +Subject: [PATCH] x86: Small improvements for wcslen +Content-type: text/plain; charset=UTF-8 + +Just a few QOL changes. + 1. Prefer `add` > `lea` as it has high execution units it can run + on. + 2. Don't break macro-fusion between `test` and `jcc` + 3. Reduce code size by removing gratuitous padding bytes (-90 + bytes). + +geometric_mean(N=20) of all benchmarks New / Original: 0.959 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++--------------------- + 1 file changed, 41 insertions(+), 45 deletions(-) + +diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S +index 9f5f7232..254bb030 100644 +--- a/sysdeps/x86_64/wcslen.S ++++ b/sysdeps/x86_64/wcslen.S +@@ -41,82 +41,82 @@ ENTRY (__wcslen) + pxor %xmm0, %xmm0 + + lea 32(%rdi), %rax +- lea 16(%rdi), %rcx ++ addq $16, %rdi + and $-16, %rax + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 
16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax +@@ -133,104 +133,100 @@ L(aligned_64_loop): + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx ++ addq $64, %rax + test %edx, %edx +- lea 64(%rax), %rax + jz L(aligned_64_loop) + + pcmpeqd -64(%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $48, %rdi + test %edx, %edx +- lea 48(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd -32(%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx +- jnz L(exit) +- +- jmp L(aligned_64_loop) ++ jz L(aligned_64_loop) + + .p2align 4 + L(exit): +- sub %rcx, %rax ++ sub %rdi, %rax + shr $2, %rax + test %dl, %dl + jz L(exit_high) + +- mov %dl, %cl +- and $15, %cl ++ andl $15, %edx + jz L(exit_1) + ret + +- .p2align 4 ++ /* No align here. Naturally aligned % 16 == 1. */ + L(exit_high): +- mov %dh, %ch +- and $15, %ch ++ andl $(15 << 8), %edx + jz L(exit_3) + add $2, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_1): + add $1, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_3): + add $3, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail0): +- xor %rax, %rax ++ xorl %eax, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail1): +- mov $1, %rax ++ movl $1, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail2): +- mov $2, %rax ++ movl $2, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail3): +- mov $3, %rax ++ movl $3, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail4): +- mov $4, %rax ++ movl $4, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail5): +- mov $5, %rax ++ movl $5, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail6): +- mov $6, %rax ++ movl $6, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail7): +- mov $7, %rax ++ movl $7, %eax + ret + + END (__wcslen) +-- +GitLab + diff --git a/glibc-RHEL-15696-101.patch b/glibc-RHEL-15696-101.patch new file mode 100644 index 0000000..131ea5b --- /dev/null +++ b/glibc-RHEL-15696-101.patch @@ -0,0 +1,964 @@ +From 7cbc03d03091d5664060924789afe46d30a5477e Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 15 Apr 2022 12:28:00 -0500 +Subject: [PATCH] x86: Remove memcmp-sse4.S +Content-type: text/plain; charset=UTF-8 + +Code didn't actually use any sse4 instructions since `ptest` was +removed in: + +commit 2f9062d7171850451e6044ef78d91ff8c017b9c0 +Author: Noah Goldstein +Date: Wed Nov 10 16:18:56 2021 -0600 + + x86: Shrink memcmp-sse4.S code size + +The new memcmp-sse2 implementation is also faster. + +geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905 + +Note there are two regressions preferring SSE2 for Size = 1 and Size = +65. + +Size = 1: +size, align0, align1, ret, New Time/Old Time + 1, 1, 1, 0, 1.2 + 1, 1, 1, 1, 1.197 + 1, 1, 1, -1, 1.2 + +This is intentional. Size == 1 is significantly less hot based on +profiles of GCC11 and Python3 than sizes [4, 8] (which is made +hotter). 
+ +Python3 Size = 1 -> 13.64% +Python3 Size = [4, 8] -> 60.92% + +GCC11 Size = 1 -> 1.29% +GCC11 Size = [4, 8] -> 33.86% + +size, align0, align1, ret, New Time/Old Time + 4, 4, 4, 0, 0.622 + 4, 4, 4, 1, 0.797 + 4, 4, 4, -1, 0.805 + 5, 5, 5, 0, 0.623 + 5, 5, 5, 1, 0.777 + 5, 5, 5, -1, 0.802 + 6, 6, 6, 0, 0.625 + 6, 6, 6, 1, 0.813 + 6, 6, 6, -1, 0.788 + 7, 7, 7, 0, 0.625 + 7, 7, 7, 1, 0.799 + 7, 7, 7, -1, 0.795 + 8, 8, 8, 0, 0.625 + 8, 8, 8, 1, 0.848 + 8, 8, 8, -1, 0.914 + 9, 9, 9, 0, 0.625 + +Size = 65: +size, align0, align1, ret, New Time/Old Time + 65, 0, 0, 0, 1.103 + 65, 0, 0, 1, 1.216 + 65, 0, 0, -1, 1.227 + 65, 65, 0, 0, 1.091 + 65, 0, 65, 1, 1.19 + 65, 65, 65, -1, 1.215 + +This is because A) the checks in range [65, 96] are now unrolled 2x +and B) because smaller values <= 16 are now given a hotter path. By +contrast the SSE4 version has a branch for Size = 80. The unrolled +version has get better performance for returns which need both +comparisons. + +size, align0, align1, ret, New Time/Old Time + 128, 4, 8, 0, 0.858 + 128, 4, 8, 1, 0.879 + 128, 4, 8, -1, 0.888 + +As well, out of microbenchmark environments that are not full +predictable the branch will have a real-cost. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/Makefile | 2 - + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 - + sysdeps/x86_64/multiarch/memcmp-sse4.S | 804 --------------------- + 4 files changed, 814 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index bca82e38..b503e4b8 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -11,7 +11,6 @@ sysdep_routines += \ + memcmp-avx2-movbe-rtm \ + memcmp-evex-movbe \ + memcmp-sse2 \ +- memcmp-sse4 \ + memcmp-ssse3 \ + memcpy-ssse3 \ + memcpy-ssse3-back \ +@@ -174,7 +173,6 @@ sysdep_routines += \ + wmemcmp-avx2-movbe-rtm \ + wmemcmp-c \ + wmemcmp-evex-movbe \ +- wmemcmp-sse4 \ + wmemcmp-ssse3 \ + # sysdep_routines + endif +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 14314367..450a2917 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_evex_movbe) +- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), +- __memcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), + __memcmp_ssse3) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) +@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_evex_movbe) +- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), +- __wmemcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), + __wmemcmp_ssse3) + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 690dffe8..0bc47a7f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -21,7 +21,6 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE 
(sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; +@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (avx2_movbe); + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) +- return OPTIMIZE (sse4_1); +- + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) + return OPTIMIZE (ssse3); + +diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S +deleted file mode 100644 +index 50060006..00000000 +--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S ++++ /dev/null +@@ -1,804 +0,0 @@ +-/* memcmp with SSE4.1, wmemcmp with SSE4.1 +- Copyright (C) 2010-2018 Free Software Foundation, Inc. +- Contributed by Intel Corporation. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#if IS_IN (libc) +- +-# include +- +-# ifndef MEMCMP +-# define MEMCMP __memcmp_sse4_1 +-# endif +- +-#ifdef USE_AS_WMEMCMP +-# define CMPEQ pcmpeqd +-# define CHAR_SIZE 4 +-#else +-# define CMPEQ pcmpeqb +-# define CHAR_SIZE 1 +-#endif +- +- +-/* Warning! +- wmemcmp has to use SIGNED comparison for elements. +- memcmp has to use UNSIGNED comparison for elemnts. +-*/ +- +- .section .text.sse4.1,"ax",@progbits +-ENTRY (MEMCMP) +-# ifdef USE_AS_WMEMCMP +- shl $2, %RDX_LP +-# elif defined __ILP32__ +- /* Clear the upper 32 bits. */ +- mov %edx, %edx +-# endif +- cmp $79, %RDX_LP +- ja L(79bytesormore) +- +- cmp $CHAR_SIZE, %RDX_LP +- jbe L(firstbyte) +- +- /* N in (CHAR_SIZE, 79) bytes. 
*/ +- cmpl $32, %edx +- ja L(more_32_bytes) +- +- cmpl $16, %edx +- jae L(16_to_32_bytes) +- +-# ifndef USE_AS_WMEMCMP +- cmpl $8, %edx +- jae L(8_to_16_bytes) +- +- cmpl $4, %edx +- jb L(2_to_3_bytes) +- +- movl (%rdi), %eax +- movl (%rsi), %ecx +- +- bswap %eax +- bswap %ecx +- +- shlq $32, %rax +- shlq $32, %rcx +- +- movl -4(%rdi, %rdx), %edi +- movl -4(%rsi, %rdx), %esi +- +- bswap %edi +- bswap %esi +- +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- cmovne %edx, %eax +- sbbl %ecx, %ecx +- orl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(2_to_3_bytes): +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- subl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(8_to_16_bytes): +- movq (%rdi), %rax +- movq (%rsi), %rcx +- +- bswap %rax +- bswap %rcx +- +- subq %rcx, %rax +- jne L(8_to_16_bytes_done) +- +- movq -8(%rdi, %rdx), %rax +- movq -8(%rsi, %rdx), %rcx +- +- bswap %rax +- bswap %rcx +- +- subq %rcx, %rax +- +-L(8_to_16_bytes_done): +- cmovne %edx, %eax +- sbbl %ecx, %ecx +- orl %ecx, %eax +- ret +-# else +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(8_to_16_bytes_done) +- movl 4(%rdi), %ecx +- cmpl 4(%rsi), %ecx +- jne L(8_to_16_bytes_done) +- movl -4(%rdi, %rdx), %ecx +- cmpl -4(%rsi, %rdx), %ecx +- jne L(8_to_16_bytes_done) +- ret +-# endif +- +- .p2align 4,, 3 +-L(ret_zero): +- xorl %eax, %eax +-L(zero): +- ret +- +- .p2align 4,, 8 +-L(firstbyte): +- jb L(ret_zero) +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- je L(zero) +-L(8_to_16_bytes_done): +- setg %al +- leal -1(%rax, %rax), %eax +-# else +- movzbl (%rdi), %eax +- movzbl (%rsi), %ecx +- sub %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_begin_48): +- addq $16, %rdi +- addq $16, %rsi +-L(vec_return_begin_32): +- bsfl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl 32(%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl 32(%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl 32(%rsi, %rax), %ecx +- movzbl 32(%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_begin_16): +- addq $16, %rdi +- addq $16, %rsi +-L(vec_return_begin): +- bsfl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_end_16): +- subl $16, %edx +-L(vec_return_end): +- bsfl %eax, %eax +- addl %edx, %eax +-# ifdef USE_AS_WMEMCMP +- movl -16(%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl -16(%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl -16(%rsi, %rax), %ecx +- movzbl -16(%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4,, 8 +-L(more_32_bytes): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm0 +- movdqu 16(%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- cmpl $64, %edx +- jbe L(32_to_64_bytes) +- movdqu 32(%rdi), %xmm0 +- movdqu 32(%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- .p2align 4,, 6 +-L(32_to_64_bytes): +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), 
%xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(16_to_32_bytes): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- +- .p2align 4 +-L(79bytesormore): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- +- mov %rsi, %rcx +- and $-16, %rsi +- add $16, %rsi +- sub %rsi, %rcx +- +- sub %rcx, %rdi +- add %rcx, %rdx +- test $0xf, %rdi +- jz L(2aligned) +- +- cmp $128, %rdx +- ja L(128bytesormore) +- +- .p2align 4,, 6 +-L(less128bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- cmp $96, %rdx +- jb L(32_to_64_bytes) +- +- addq $64, %rdi +- addq $64, %rsi +- subq $64, %rdx +- +- .p2align 4,, 6 +-L(last_64_bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(128bytesormore): +- cmp $256, %rdx +- ja L(unaligned_loop) +-L(less256bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $64, %rdi +- addq $64, %rsi +- +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $-128, %rdx +- subq $-64, %rsi +- subq $-64, %rdi +- +- cmp $64, %rdx +- ja L(less128bytes) +- +- cmp $32, %rdx +- ja L(last_64_bytes) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, 
%rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(unaligned_loop): +-# ifdef DATA_CACHE_SIZE_HALF +- mov $DATA_CACHE_SIZE_HALF, %R8_LP +-# else +- mov __x86_data_cache_size_half(%rip), %R8_LP +-# endif +- movq %r8, %r9 +- addq %r8, %r8 +- addq %r9, %r8 +- cmpq %r8, %rdx +- ja L(L2_L3_cache_unaligned) +- sub $64, %rdx +- .p2align 4 +-L(64bytesormore_loop): +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- ja L(64bytesormore_loop) +- +- .p2align 4,, 6 +-L(loop_tail): +- addq %rdx, %rdi +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- addq %rdx, %rsi +- movdqu (%rsi), %xmm4 +- movdqu 16(%rsi), %xmm5 +- movdqu 32(%rsi), %xmm6 +- movdqu 48(%rsi), %xmm7 +- +- CMPEQ %xmm4, %xmm0 +- CMPEQ %xmm5, %xmm1 +- CMPEQ %xmm6, %xmm2 +- CMPEQ %xmm7, %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- ret +- +-L(L2_L3_cache_unaligned): +- subq $64, %rdx +- .p2align 4 +-L(L2_L3_unaligned_128bytes_loop): +- prefetchnta 0x1c0(%rdi) +- prefetchnta 0x1c0(%rsi) +- +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- ja L(L2_L3_unaligned_128bytes_loop) +- jmp L(loop_tail) +- +- +- /* This case is for machines which are sensitive for unaligned +- * instructions. 
*/ +- .p2align 4 +-L(2aligned): +- cmp $128, %rdx +- ja L(128bytesormorein2aligned) +-L(less128bytesin2aligned): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- cmp $96, %rdx +- jb L(32_to_64_bytes) +- +- addq $64, %rdi +- addq $64, %rsi +- subq $64, %rdx +- +- .p2align 4,, 6 +-L(aligned_last_64_bytes): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(128bytesormorein2aligned): +- cmp $256, %rdx +- ja L(aligned_loop) +-L(less256bytesin2alinged): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $64, %rdi +- addq $64, %rsi +- +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $-128, %rdx +- subq $-64, %rsi +- subq $-64, %rdi +- +- cmp $64, %rdx +- ja L(less128bytesin2aligned) +- +- cmp $32, %rdx +- ja L(aligned_last_64_bytes) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(aligned_loop): +-# ifdef DATA_CACHE_SIZE_HALF +- mov $DATA_CACHE_SIZE_HALF, %R8_LP +-# else +- mov __x86_data_cache_size_half(%rip), %R8_LP +-# endif +- movq %r8, %r9 +- addq %r8, %r8 +- addq %r9, %r8 +- cmpq %r8, %rdx +- ja L(L2_L3_cache_aligned) +- +- sub $64, %rdx +- .p2align 4 +-L(64bytesormore_loopin2aligned): +- movdqa (%rdi), %xmm0 +- movdqa 16(%rdi), %xmm1 +- movdqa 32(%rdi), %xmm2 +- movdqa 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- add $64, %rsi +- 
add $64, %rdi +- sub $64, %rdx +- ja L(64bytesormore_loopin2aligned) +- jmp L(loop_tail) +- +-L(L2_L3_cache_aligned): +- subq $64, %rdx +- .p2align 4 +-L(L2_L3_aligned_128bytes_loop): +- prefetchnta 0x1c0(%rdi) +- prefetchnta 0x1c0(%rsi) +- movdqa (%rdi), %xmm0 +- movdqa 16(%rdi), %xmm1 +- movdqa 32(%rdi), %xmm2 +- movdqa 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- addq $64, %rsi +- addq $64, %rdi +- subq $64, %rdx +- ja L(L2_L3_aligned_128bytes_loop) +- jmp L(loop_tail) +- +- .p2align 4 +-L(64bytesormore_loop_end): +- pmovmskb %xmm0, %ecx +- incw %cx +- jnz L(loop_end_ret) +- +- pmovmskb %xmm1, %ecx +- notw %cx +- sall $16, %ecx +- jnz L(loop_end_ret) +- +- pmovmskb %xmm2, %ecx +- notw %cx +- shlq $32, %rcx +- jnz L(loop_end_ret) +- +- addq $48, %rdi +- addq $48, %rsi +- movq %rax, %rcx +- +- .p2align 4,, 6 +-L(loop_end_ret): +- bsfq %rcx, %rcx +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rcx), %eax +- xorl %edx, %edx +- cmpl (%rsi, %rcx), %eax +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rdi, %rcx), %eax +- movzbl (%rsi, %rcx), %ecx +- subl %ecx, %eax +-# endif +- ret +-END (MEMCMP) +-#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-102.patch b/glibc-RHEL-15696-102.patch new file mode 100644 index 0000000..8cb20ad --- /dev/null +++ b/glibc-RHEL-15696-102.patch @@ -0,0 +1,263 @@ +From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 15 Apr 2022 12:28:01 -0500 +Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S +Content-type: text/plain; charset=UTF-8 + +Old code was both inefficient and wasted code size. New code (-62 +bytes) and comparable or better performance in the page cross case. 
+ +geometric_mean(N=20) of page cross cases New / Original: 0.960 + +size, align0, align1, ret, New Time/Old Time + 1, 4095, 0, 0, 1.001 + 1, 4095, 0, 1, 0.999 + 1, 4095, 0, -1, 1.0 + 2, 4094, 0, 0, 1.0 + 2, 4094, 0, 1, 1.0 + 2, 4094, 0, -1, 1.0 + 3, 4093, 0, 0, 1.0 + 3, 4093, 0, 1, 1.0 + 3, 4093, 0, -1, 1.0 + 4, 4092, 0, 0, 0.987 + 4, 4092, 0, 1, 1.0 + 4, 4092, 0, -1, 1.0 + 5, 4091, 0, 0, 0.984 + 5, 4091, 0, 1, 1.002 + 5, 4091, 0, -1, 1.005 + 6, 4090, 0, 0, 0.993 + 6, 4090, 0, 1, 1.001 + 6, 4090, 0, -1, 1.003 + 7, 4089, 0, 0, 0.991 + 7, 4089, 0, 1, 1.0 + 7, 4089, 0, -1, 1.001 + 8, 4088, 0, 0, 0.875 + 8, 4088, 0, 1, 0.881 + 8, 4088, 0, -1, 0.888 + 9, 4087, 0, 0, 0.872 + 9, 4087, 0, 1, 0.879 + 9, 4087, 0, -1, 0.883 + 10, 4086, 0, 0, 0.878 + 10, 4086, 0, 1, 0.886 + 10, 4086, 0, -1, 0.873 + 11, 4085, 0, 0, 0.878 + 11, 4085, 0, 1, 0.881 + 11, 4085, 0, -1, 0.879 + 12, 4084, 0, 0, 0.873 + 12, 4084, 0, 1, 0.889 + 12, 4084, 0, -1, 0.875 + 13, 4083, 0, 0, 0.873 + 13, 4083, 0, 1, 0.863 + 13, 4083, 0, -1, 0.863 + 14, 4082, 0, 0, 0.838 + 14, 4082, 0, 1, 0.869 + 14, 4082, 0, -1, 0.877 + 15, 4081, 0, 0, 0.841 + 15, 4081, 0, 1, 0.869 + 15, 4081, 0, -1, 0.876 + 16, 4080, 0, 0, 0.988 + 16, 4080, 0, 1, 0.99 + 16, 4080, 0, -1, 0.989 + 17, 4079, 0, 0, 0.978 + 17, 4079, 0, 1, 0.981 + 17, 4079, 0, -1, 0.98 + 18, 4078, 0, 0, 0.981 + 18, 4078, 0, 1, 0.98 + 18, 4078, 0, -1, 0.985 + 19, 4077, 0, 0, 0.977 + 19, 4077, 0, 1, 0.979 + 19, 4077, 0, -1, 0.986 + 20, 4076, 0, 0, 0.977 + 20, 4076, 0, 1, 0.986 + 20, 4076, 0, -1, 0.984 + 21, 4075, 0, 0, 0.977 + 21, 4075, 0, 1, 0.983 + 21, 4075, 0, -1, 0.988 + 22, 4074, 0, 0, 0.983 + 22, 4074, 0, 1, 0.994 + 22, 4074, 0, -1, 0.993 + 23, 4073, 0, 0, 0.98 + 23, 4073, 0, 1, 0.992 + 23, 4073, 0, -1, 0.995 + 24, 4072, 0, 0, 0.989 + 24, 4072, 0, 1, 0.989 + 24, 4072, 0, -1, 0.991 + 25, 4071, 0, 0, 0.99 + 25, 4071, 0, 1, 0.999 + 25, 4071, 0, -1, 0.996 + 26, 4070, 0, 0, 0.993 + 26, 4070, 0, 1, 0.995 + 26, 4070, 0, -1, 0.998 + 27, 4069, 0, 0, 0.993 + 27, 4069, 0, 1, 0.999 + 27, 4069, 0, -1, 1.0 + 28, 4068, 0, 0, 0.997 + 28, 4068, 0, 1, 1.0 + 28, 4068, 0, -1, 0.999 + 29, 4067, 0, 0, 0.996 + 29, 4067, 0, 1, 0.999 + 29, 4067, 0, -1, 0.999 + 30, 4066, 0, 0, 0.991 + 30, 4066, 0, 1, 1.001 + 30, 4066, 0, -1, 0.999 + 31, 4065, 0, 0, 0.988 + 31, 4065, 0, 1, 0.998 + 31, 4065, 0, -1, 0.998 +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++-------- + 1 file changed, 61 insertions(+), 37 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index 16fc673e..99258cf5 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -429,22 +429,21 @@ L(page_cross_less_vec): + # ifndef USE_AS_WMEMCMP + cmpl $8, %edx + jae L(between_8_15) ++ /* Fall through for [4, 7]. */ + cmpl $4, %edx +- jae L(between_4_7) ++ jb L(between_2_3) + +- /* Load as big endian to avoid branches. */ +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- /* Subtraction is okay because the upper 8 bits are zero. */ +- subl %ecx, %eax ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ /* Fast path for return zero. */ ++ jnz L(ret_nonzero) + /* No ymm register was touched. 
*/ + ret + +@@ -457,9 +456,33 @@ L(one_or_less): + /* No ymm register was touched. */ + ret + ++ .p2align 4,, 5 ++L(ret_nonzero): ++ sbbl %eax, %eax ++ orl $1, %eax ++ /* No ymm register was touched. */ ++ ret ++ ++ .p2align 4,, 2 ++L(zero): ++ xorl %eax, %eax ++ /* No ymm register was touched. */ ++ ret ++ + .p2align 4 + L(between_8_15): +-# endif ++ movbe (%rdi), %rax ++ movbe (%rsi), %rcx ++ subq %rcx, %rax ++ jnz L(ret_nonzero) ++ movbe -8(%rdi, %rdx), %rax ++ movbe -8(%rsi, %rdx), %rcx ++ subq %rcx, %rax ++ /* Fast path for return zero. */ ++ jnz L(ret_nonzero) ++ /* No ymm register was touched. */ ++ ret ++# else + /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 +@@ -475,16 +498,13 @@ L(between_8_15): + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax ++ /* Fast path for return zero. */ + jnz L(return_vec_0) + /* No ymm register was touched. */ + ret ++# endif + +- .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret +- +- .p2align 4 ++ .p2align 4,, 10 + L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + vmovdqu (%rsi), %xmm2 +@@ -501,11 +521,17 @@ L(between_16_31): + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax ++ /* Fast path for return zero. */ + jnz L(return_vec_0) + /* No ymm register was touched. */ + ret + + # ifdef USE_AS_WMEMCMP ++ .p2align 4,, 2 ++L(zero): ++ xorl %eax, %eax ++ ret ++ + .p2align 4 + L(one_or_less): + jb L(zero) +@@ -520,22 +546,20 @@ L(one_or_less): + # else + + .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- jz L(zero_4_7) +- sbbl %eax, %eax +- orl $1, %eax +-L(zero_4_7): ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ bswap %eax ++ bswap %ecx ++ shrl %eax ++ shrl %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ /* Subtraction is okay because the upper bit is zero. */ ++ subl %ecx, %eax + /* No ymm register was touched. */ + ret + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-103.patch b/glibc-RHEL-15696-103.patch new file mode 100644 index 0000000..c080e54 --- /dev/null +++ b/glibc-RHEL-15696-103.patch @@ -0,0 +1,876 @@ +From 5307aa9c1800f36a64c183c091c9af392c1fa75c Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:28 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2 +Content-type: text/plain; charset=UTF-8 + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. + +Geometric Mean of all benchmarks New / Old: 0.741 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +- + sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +- + sysdeps/x86_64/strrchr.S | 510 +++++++++++++++--------- + sysdeps/x86_64/wcsrchr.S | 266 +----------- + 4 files changed, 338 insertions(+), 443 deletions(-) + +Conflicts: + sysdeps/x86_64/wcsrchr.S + (copyright header) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S +index 0ec76fe9..6bb1284b 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S +@@ -17,7 +17,7 @@ + . */ + + #if IS_IN (libc) +-# define strrchr __strrchr_sse2 ++# define STRRCHR __strrchr_sse2 + + # undef weak_alias + # define weak_alias(strrchr, rindex) +diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +index d015e953..f26d53b5 100644 +--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S ++++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +@@ -17,7 +17,6 @@ + . */ + + #if IS_IN (libc) +-# define wcsrchr __wcsrchr_sse2 ++# define STRRCHR __wcsrchr_sse2 + #endif +- + #include "../wcsrchr.S" +diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S +index aca98e7e..a58cc220 100644 +--- a/sysdeps/x86_64/strrchr.S ++++ b/sysdeps/x86_64/strrchr.S +@@ -19,210 +19,360 @@ + + #include + ++#ifndef STRRCHR ++# define STRRCHR strrchr ++#endif ++ ++#ifdef USE_AS_WCSRCHR ++# define PCMPEQ pcmpeqd ++# define CHAR_SIZE 4 ++# define PMINU pminud ++#else ++# define PCMPEQ pcmpeqb ++# define CHAR_SIZE 1 ++# define PMINU pminub ++#endif ++ ++#define PAGE_SIZE 4096 ++#define VEC_SIZE 16 ++ + .text +-ENTRY (strrchr) +- movd %esi, %xmm1 ++ENTRY(STRRCHR) ++ movd %esi, %xmm0 + movq %rdi, %rax +- andl $4095, %eax +- punpcklbw %xmm1, %xmm1 +- cmpq $4032, %rax +- punpcklwd %xmm1, %xmm1 +- pshufd $0, %xmm1, %xmm1 ++ andl $(PAGE_SIZE - 1), %eax ++#ifndef USE_AS_WCSRCHR ++ punpcklbw %xmm0, %xmm0 ++ punpcklwd %xmm0, %xmm0 ++#endif ++ pshufd $0, %xmm0, %xmm0 ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page) +- movdqu (%rdi), %xmm0 ++ ++L(cross_page_continue): ++ movups (%rdi), %xmm1 + pxor %xmm2, %xmm2 +- movdqa %xmm0, %xmm3 +- pcmpeqb %xmm1, %xmm0 +- pcmpeqb %xmm2, %xmm3 +- pmovmskb %xmm0, %ecx +- pmovmskb %xmm3, %edx +- testq %rdx, %rdx +- je L(next_48_bytes) +- leaq -1(%rdx), %rax +- xorq %rdx, %rax +- andq %rcx, %rax +- je L(exit) +- bsrq %rax, %rax ++ PCMPEQ %xmm1, %xmm2 ++ pmovmskb %xmm2, %ecx ++ testl %ecx, %ecx ++ jz L(aligned_more) ++ ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ bsrl %eax, %eax + addq %rdi, %rax ++ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If ++ search CHAR is zero we are correct. Either way `andq ++ -CHAR_SIZE, %rax` gets the correct result. */ ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++L(ret0): + ret + ++ /* Returns for first vec x1/x2 have hard coded backward search ++ path for earlier matches. 
*/ + .p2align 4 +-L(next_48_bytes): +- movdqu 16(%rdi), %xmm4 +- movdqa %xmm4, %xmm5 +- movdqu 32(%rdi), %xmm3 +- pcmpeqb %xmm1, %xmm4 +- pcmpeqb %xmm2, %xmm5 +- movdqu 48(%rdi), %xmm0 +- pmovmskb %xmm5, %edx +- movdqa %xmm3, %xmm5 +- pcmpeqb %xmm1, %xmm3 +- pcmpeqb %xmm2, %xmm5 +- pcmpeqb %xmm0, %xmm2 +- salq $16, %rdx +- pmovmskb %xmm3, %r8d +- pmovmskb %xmm5, %eax +- pmovmskb %xmm2, %esi +- salq $32, %r8 +- salq $32, %rax +- pcmpeqb %xmm1, %xmm0 +- orq %rdx, %rax +- movq %rsi, %rdx +- pmovmskb %xmm4, %esi +- salq $48, %rdx +- salq $16, %rsi +- orq %r8, %rsi +- orq %rcx, %rsi +- pmovmskb %xmm0, %ecx +- salq $48, %rcx +- orq %rcx, %rsi +- orq %rdx, %rax +- je L(loop_header2) +- leaq -1(%rax), %rcx +- xorq %rax, %rcx +- andq %rcx, %rsi +- je L(exit) +- bsrq %rsi, %rsi +- leaq (%rdi,%rsi), %rax ++L(first_vec_x0_test): ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ testl %eax, %eax ++ jz L(ret0) ++ bsrl %eax, %eax ++ addq %r8, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + + .p2align 4 +-L(loop_header2): +- testq %rsi, %rsi +- movq %rdi, %rcx +- je L(no_c_found) +-L(loop_header): +- addq $64, %rdi +- pxor %xmm7, %xmm7 +- andq $-64, %rdi +- jmp L(loop_entry) ++L(first_vec_x1): ++ PCMPEQ %xmm0, %xmm2 ++ pmovmskb %xmm2, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_vec_x0_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret + + .p2align 4 +-L(loop64): +- testq %rdx, %rdx +- cmovne %rdx, %rsi +- cmovne %rdi, %rcx +- addq $64, %rdi +-L(loop_entry): +- movdqa 32(%rdi), %xmm3 +- pxor %xmm6, %xmm6 +- movdqa 48(%rdi), %xmm2 +- movdqa %xmm3, %xmm0 +- movdqa 16(%rdi), %xmm4 +- pminub %xmm2, %xmm0 +- movdqa (%rdi), %xmm5 +- pminub %xmm4, %xmm0 +- pminub %xmm5, %xmm0 +- pcmpeqb %xmm7, %xmm0 +- pmovmskb %xmm0, %eax +- movdqa %xmm5, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %r9d +- movdqa %xmm4, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %edx +- movdqa %xmm3, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- salq $16, %rdx +- pmovmskb %xmm0, %r10d +- movdqa %xmm2, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- salq $32, %r10 +- orq %r10, %rdx +- pmovmskb %xmm0, %r8d +- orq %r9, %rdx +- salq $48, %r8 +- orq %r8, %rdx ++L(first_vec_x1_test): ++ PCMPEQ %xmm0, %xmm2 ++ pmovmskb %xmm2, %eax + testl %eax, %eax +- je L(loop64) +- pcmpeqb %xmm6, %xmm4 +- pcmpeqb %xmm6, %xmm3 +- pcmpeqb %xmm6, %xmm5 +- pmovmskb %xmm4, %eax +- pmovmskb %xmm3, %r10d +- pcmpeqb %xmm6, %xmm2 +- pmovmskb %xmm5, %r9d +- salq $32, %r10 +- salq $16, %rax +- pmovmskb %xmm2, %r8d +- orq %r10, %rax +- orq %r9, %rax +- salq $48, %r8 +- orq %r8, %rax +- leaq -1(%rax), %r8 +- xorq %rax, %r8 +- andq %r8, %rdx +- cmovne %rdi, %rcx +- cmovne %rdx, %rsi +- bsrq %rsi, %rsi +- leaq (%rcx,%rsi), %rax ++ jz L(first_vec_x0_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ PCMPEQ %xmm0, %xmm3 ++ pmovmskb %xmm3, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_vec_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++ /* Save original pointer if match was in VEC 0. 
*/ ++ movq %rdi, %r8 ++ andq $-VEC_SIZE, %rdi ++ ++ movaps VEC_SIZE(%rdi), %xmm2 ++ pxor %xmm3, %xmm3 ++ PCMPEQ %xmm2, %xmm3 ++ pmovmskb %xmm3, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x1) ++ ++ movaps (VEC_SIZE * 2)(%rdi), %xmm3 ++ pxor %xmm4, %xmm4 ++ PCMPEQ %xmm3, %xmm4 ++ pmovmskb %xmm4, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) ++ ++ addq $VEC_SIZE, %rdi ++ /* Save pointer again before realigning. */ ++ movq %rdi, %rsi ++ andq $-(VEC_SIZE * 2), %rdi ++ .p2align 4 ++L(first_loop): ++ /* Do 2x VEC at a time. */ ++ movaps (VEC_SIZE * 2)(%rdi), %xmm4 ++ movaps (VEC_SIZE * 3)(%rdi), %xmm5 ++ /* Since SSE2 no pminud so wcsrchr needs seperate logic for ++ detecting zero. Note if this is found to be a bottleneck it ++ may be worth adding an SSE4.1 wcsrchr implementation. */ ++#ifdef USE_AS_WCSRCHR ++ movaps %xmm5, %xmm6 ++ pxor %xmm8, %xmm8 ++ ++ PCMPEQ %xmm8, %xmm5 ++ PCMPEQ %xmm4, %xmm8 ++ por %xmm5, %xmm8 ++#else ++ movaps %xmm5, %xmm6 ++ PMINU %xmm4, %xmm5 ++#endif ++ ++ movaps %xmm4, %xmm9 ++ PCMPEQ %xmm0, %xmm4 ++ PCMPEQ %xmm0, %xmm6 ++ movaps %xmm6, %xmm7 ++ por %xmm4, %xmm6 ++#ifndef USE_AS_WCSRCHR ++ pxor %xmm8, %xmm8 ++ PCMPEQ %xmm5, %xmm8 ++#endif ++ pmovmskb %xmm8, %ecx ++ pmovmskb %xmm6, %eax ++ ++ addq $(VEC_SIZE * 2), %rdi ++ /* Use `addl` 1) so we can undo it with `subl` and 2) it can ++ macro-fuse with `jz`. */ ++ addl %ecx, %eax ++ jz L(first_loop) ++ ++ /* Check if there is zero match. */ ++ testl %ecx, %ecx ++ jz L(second_loop_match) ++ ++ /* Check if there was a match in last iteration. */ ++ subl %ecx, %eax ++ jnz L(new_match) ++ ++L(first_loop_old_match): ++ PCMPEQ %xmm0, %xmm2 ++ PCMPEQ %xmm0, %xmm3 ++ pmovmskb %xmm2, %ecx ++ pmovmskb %xmm3, %eax ++ addl %eax, %ecx ++ jz L(first_vec_x0_test) ++ /* NB: We could move this shift to before the branch and save a ++ bit of code size / performance on the fall through. The ++ branch leads to the null case which generally seems hotter ++ than char in first 3x VEC. */ ++ sall $16, %eax ++ orl %ecx, %eax ++ ++ bsrl %eax, %eax ++ addq %rsi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(new_match): ++ pxor %xmm6, %xmm6 ++ PCMPEQ %xmm9, %xmm6 ++ pmovmskb %xmm6, %eax ++ sall $16, %ecx ++ orl %eax, %ecx ++ ++ /* We can't reuse either of the old comparisons as since we mask ++ of zeros after first zero (instead of using the full ++ comparison) we can't gurantee no interference between match ++ after end of string and valid match. */ ++ pmovmskb %xmm4, %eax ++ pmovmskb %xmm7, %edx ++ sall $16, %edx ++ orl %edx, %eax ++ ++ leal -1(%ecx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_loop_old_match) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + ++ /* Save minimum state for getting most recent match. We can ++ throw out all previous work. */ + .p2align 4 +-L(no_c_found): +- movl $1, %esi +- xorl %ecx, %ecx +- jmp L(loop_header) ++L(second_loop_match): ++ movq %rdi, %rsi ++ movaps %xmm4, %xmm2 ++ movaps %xmm7, %xmm3 + + .p2align 4 +-L(exit): +- xorl %eax, %eax ++L(second_loop): ++ movaps (VEC_SIZE * 2)(%rdi), %xmm4 ++ movaps (VEC_SIZE * 3)(%rdi), %xmm5 ++ /* Since SSE2 no pminud so wcsrchr needs seperate logic for ++ detecting zero. Note if this is found to be a bottleneck it ++ may be worth adding an SSE4.1 wcsrchr implementation. 
*/ ++#ifdef USE_AS_WCSRCHR ++ movaps %xmm5, %xmm6 ++ pxor %xmm8, %xmm8 ++ ++ PCMPEQ %xmm8, %xmm5 ++ PCMPEQ %xmm4, %xmm8 ++ por %xmm5, %xmm8 ++#else ++ movaps %xmm5, %xmm6 ++ PMINU %xmm4, %xmm5 ++#endif ++ ++ movaps %xmm4, %xmm9 ++ PCMPEQ %xmm0, %xmm4 ++ PCMPEQ %xmm0, %xmm6 ++ movaps %xmm6, %xmm7 ++ por %xmm4, %xmm6 ++#ifndef USE_AS_WCSRCHR ++ pxor %xmm8, %xmm8 ++ PCMPEQ %xmm5, %xmm8 ++#endif ++ ++ pmovmskb %xmm8, %ecx ++ pmovmskb %xmm6, %eax ++ ++ addq $(VEC_SIZE * 2), %rdi ++ /* Either null term or new occurence of CHAR. */ ++ addl %ecx, %eax ++ jz L(second_loop) ++ ++ /* No null term so much be new occurence of CHAR. */ ++ testl %ecx, %ecx ++ jz L(second_loop_match) ++ ++ ++ subl %ecx, %eax ++ jnz L(second_loop_new_match) ++ ++L(second_loop_old_match): ++ pmovmskb %xmm2, %ecx ++ pmovmskb %xmm3, %eax ++ sall $16, %eax ++ orl %ecx, %eax ++ bsrl %eax, %eax ++ addq %rsi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + + .p2align 4 ++L(second_loop_new_match): ++ pxor %xmm6, %xmm6 ++ PCMPEQ %xmm9, %xmm6 ++ pmovmskb %xmm6, %eax ++ sall $16, %ecx ++ orl %eax, %ecx ++ ++ /* We can't reuse either of the old comparisons as since we mask ++ of zeros after first zero (instead of using the full ++ comparison) we can't gurantee no interference between match ++ after end of string and valid match. */ ++ pmovmskb %xmm4, %eax ++ pmovmskb %xmm7, %edx ++ sall $16, %edx ++ orl %edx, %eax ++ ++ leal -1(%ecx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(second_loop_old_match) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4,, 4 + L(cross_page): +- movq %rdi, %rax +- pxor %xmm0, %xmm0 +- andq $-64, %rax +- movdqu (%rax), %xmm5 +- movdqa %xmm5, %xmm6 +- movdqu 16(%rax), %xmm4 +- pcmpeqb %xmm1, %xmm5 +- pcmpeqb %xmm0, %xmm6 +- movdqu 32(%rax), %xmm3 +- pmovmskb %xmm6, %esi +- movdqa %xmm4, %xmm6 +- movdqu 48(%rax), %xmm2 +- pcmpeqb %xmm1, %xmm4 +- pcmpeqb %xmm0, %xmm6 +- pmovmskb %xmm6, %edx +- movdqa %xmm3, %xmm6 +- pcmpeqb %xmm1, %xmm3 +- pcmpeqb %xmm0, %xmm6 +- pcmpeqb %xmm2, %xmm0 +- salq $16, %rdx +- pmovmskb %xmm3, %r9d +- pmovmskb %xmm6, %r8d +- pmovmskb %xmm0, %ecx +- salq $32, %r9 +- salq $32, %r8 +- pcmpeqb %xmm1, %xmm2 +- orq %r8, %rdx +- salq $48, %rcx +- pmovmskb %xmm5, %r8d +- orq %rsi, %rdx +- pmovmskb %xmm4, %esi +- orq %rcx, %rdx +- pmovmskb %xmm2, %ecx +- salq $16, %rsi +- salq $48, %rcx +- orq %r9, %rsi +- orq %r8, %rsi +- orq %rcx, %rsi ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rsi ++ movaps (%rsi), %xmm1 ++ pxor %xmm2, %xmm2 ++ PCMPEQ %xmm1, %xmm2 ++ pmovmskb %xmm2, %edx + movl %edi, %ecx +- subl %eax, %ecx +- shrq %cl, %rdx +- shrq %cl, %rsi +- testq %rdx, %rdx +- je L(loop_header2) +- leaq -1(%rdx), %rax +- xorq %rdx, %rax +- andq %rax, %rsi +- je L(exit) +- bsrq %rsi, %rax ++ andl $(VEC_SIZE - 1), %ecx ++ sarl %cl, %edx ++ jz L(cross_page_continue) ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ sarl %cl, %eax ++ leal -1(%rdx), %ecx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(ret1) ++ bsrl %eax, %eax + addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++L(ret1): + ret +-END (strrchr) ++END(STRRCHR) + +-weak_alias (strrchr, rindex) +-libc_hidden_builtin_def (strrchr) ++#ifndef USE_AS_WCSRCHR ++ weak_alias (STRRCHR, rindex) ++ libc_hidden_builtin_def (STRRCHR) ++#endif +diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S +index 2f388537..ae3cfa7d 100644 +--- a/sysdeps/x86_64/wcsrchr.S ++++ b/sysdeps/x86_64/wcsrchr.S +@@ -17,266 +17,12 @@ + License 
along with the GNU C Library; if not, see + . */ + +-#include + +- .text +-ENTRY (wcsrchr) ++#define USE_AS_WCSRCHR 1 ++#define NO_PMINU 1 + +- movd %rsi, %xmm1 +- mov %rdi, %rcx +- punpckldq %xmm1, %xmm1 +- pxor %xmm2, %xmm2 +- punpckldq %xmm1, %xmm1 +- and $63, %rcx +- cmp $48, %rcx +- ja L(crosscache) ++#ifndef STRRCHR ++# define STRRCHR wcsrchr ++#endif + +- movdqu (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm2 +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm0, %rax +- add $16, %rdi +- +- test %rax, %rax +- jnz L(unaligned_match1) +- +- test %rcx, %rcx +- jnz L(return_null) +- +- and $-16, %rdi +- xor %r8, %r8 +- jmp L(loop) +- +- .p2align 4 +-L(unaligned_match1): +- test %rcx, %rcx +- jnz L(prolog_find_zero_1) +- +- mov %rax, %r8 +- mov %rdi, %rsi +- and $-16, %rdi +- jmp L(loop) +- +- .p2align 4 +-L(crosscache): +- and $15, %rcx +- and $-16, %rdi +- pxor %xmm3, %xmm3 +- movdqa (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm3 +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm3, %rdx +- pmovmskb %xmm0, %rax +- shr %cl, %rdx +- shr %cl, %rax +- add $16, %rdi +- +- test %rax, %rax +- jnz L(unaligned_match) +- +- test %rdx, %rdx +- jnz L(return_null) +- +- xor %r8, %r8 +- jmp L(loop) +- +- .p2align 4 +-L(unaligned_match): +- test %rdx, %rdx +- jnz L(prolog_find_zero) +- +- mov %rax, %r8 +- lea (%rdi, %rcx), %rsi +- +-/* Loop start on aligned string. */ +- .p2align 4 +-L(loop): +- movdqa (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm0, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm3 +- pcmpeqd %xmm3, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm3 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm3, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm4 +- pcmpeqd %xmm4, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm4 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm4, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm5 +- pcmpeqd %xmm5, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm5 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm5, %rax +- or %rax, %rcx +- jz L(loop) +- +- .p2align 4 +-L(matches): +- test %rax, %rax +- jnz L(match) +-L(return_value): +- test %r8, %r8 +- jz L(return_null) +- mov %r8, %rax +- mov %rsi, %rdi +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(match): +- pmovmskb %xmm2, %rcx +- test %rcx, %rcx +- jnz L(find_zero) +- mov %rax, %r8 +- mov %rdi, %rsi +- jmp L(loop) +- +- .p2align 4 +-L(find_zero): +- test $15, %cl +- jnz L(find_zero_in_first_wchar) +- test %cl, %cl +- jnz L(find_zero_in_second_wchar) +- test $15, %ch +- jnz L(find_zero_in_third_wchar) +- +- and $1 << 13 - 1, %rax +- jz L(return_value) +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_first_wchar): +- test $1, %rax +- jz L(return_value) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_second_wchar): +- and $1 << 5 - 1, %rax +- jz L(return_value) +- +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_third_wchar): +- and $1 << 9 - 1, %rax +- jz L(return_value) +- +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero): +- add %rcx, 
%rdi +- mov %rdx, %rcx +-L(prolog_find_zero_1): +- test $15, %cl +- jnz L(prolog_find_zero_in_first_wchar) +- test %cl, %cl +- jnz L(prolog_find_zero_in_second_wchar) +- test $15, %ch +- jnz L(prolog_find_zero_in_third_wchar) +- +- and $1 << 13 - 1, %rax +- jz L(return_null) +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_first_wchar): +- test $1, %rax +- jz L(return_null) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_second_wchar): +- and $1 << 5 - 1, %rax +- jz L(return_null) +- +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_third_wchar): +- and $1 << 9 - 1, %rax +- jz L(return_null) +- +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_second_wchar): +- lea -12(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_third_wchar): +- lea -8(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_fourth_wchar): +- lea -4(%rdi), %rax +- ret +- +- .p2align 4 +-L(return_null): +- xor %rax, %rax +- ret +- +-END (wcsrchr) ++#include "../strrchr.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-104.patch b/glibc-RHEL-15696-104.patch new file mode 100644 index 0000000..1cb312a --- /dev/null +++ b/glibc-RHEL-15696-104.patch @@ -0,0 +1,501 @@ +From df7e295d18ffa34f629578c0017a9881af7620f6 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:29 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2 +Content-type: text/plain; charset=UTF-8 + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. + +Geometric Mean of all benchmarks New / Old: 0.832 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++--------- + 1 file changed, 269 insertions(+), 157 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S +index c949410b..3d26fad4 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S +@@ -27,9 +27,13 @@ + # ifdef USE_AS_WCSRCHR + # define VPBROADCAST vpbroadcastd + # define VPCMPEQ vpcmpeqd ++# define VPMIN vpminud ++# define CHAR_SIZE 4 + # else + # define VPBROADCAST vpbroadcastb + # define VPCMPEQ vpcmpeqb ++# define VPMIN vpminub ++# define CHAR_SIZE 1 + # endif + + # ifndef VZEROUPPER +@@ -41,196 +45,304 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + +- .section SECTION(.text),"ax",@progbits +-ENTRY (STRRCHR) +- movd %esi, %xmm4 +- movl %edi, %ecx ++ .section SECTION(.text), "ax", @progbits ++ENTRY(STRRCHR) ++ movd %esi, %xmm7 ++ movl %edi, %eax + /* Broadcast CHAR to YMM4. */ +- VPBROADCAST %xmm4, %ymm4 ++ VPBROADCAST %xmm7, %ymm7 + vpxor %xmm0, %xmm0, %xmm0 + +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ /* Shift here instead of `andl` to save code size (saves a fetch ++ block). 
*/ ++ sall $20, %eax ++ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax ++ ja L(cross_page) + ++L(page_cross_continue): + vmovdqu (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- addq $VEC_SIZE, %rdi ++ /* Check end of string match. */ ++ VPCMPEQ %ymm1, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ testl %ecx, %ecx ++ jz L(aligned_more) ++ ++ /* Only check match with search CHAR if needed. */ ++ VPCMPEQ %ymm1, %ymm7, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Check if match before first zero. */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If ++ search CHAR is zero we are correct. Either way `andq ++ -CHAR_SIZE, %rax` gets the correct result. */ ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret0): ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++ ++ /* Returns for first vec x1/x2 have hard coded backward search ++ path for earlier matches. */ ++ .p2align 4,, 10 ++L(first_vec_x1): ++ VPCMPEQ %ymm2, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jnz L(first_vec_x1_return) ++ ++ .p2align 4,, 4 ++L(first_vec_x0_test): ++ VPCMPEQ %ymm1, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ testl %eax, %eax ++ jz L(ret1) ++ bsrl %eax, %eax ++ addq %r8, %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret1): ++ VZEROUPPER_RETURN + ++ .p2align 4,, 10 ++L(first_vec_x0_x1_test): ++ VPCMPEQ %ymm2, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ /* Check ymm2 for search CHAR match. If no match then check ymm1 ++ before returning. */ + testl %eax, %eax +- jnz L(first_vec) ++ jz L(first_vec_x0_test) ++ .p2align 4,, 4 ++L(first_vec_x1_return): ++ bsrl %eax, %eax ++ leaq 1(%rdi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN + +- testl %ecx, %ecx +- jnz L(return_null) + +- andq $-VEC_SIZE, %rdi +- xorl %edx, %edx +- jmp L(aligned_loop) ++ .p2align 4,, 10 ++L(first_vec_x2): ++ VPCMPEQ %ymm3, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ blsmskl %ecx, %ecx ++ /* If no in-range search CHAR match in ymm3 then need to check ++ ymm1/ymm2 for an earlier match (we delay checking search ++ CHAR matches until needed). */ ++ andl %ecx, %eax ++ jz L(first_vec_x0_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN ++ + + .p2align 4 +-L(first_vec): +- /* Check if there is a nul CHAR. */ ++L(aligned_more): ++ /* Save original pointer if match was in VEC 0. */ ++ movq %rdi, %r8 ++ ++ /* Align src. */ ++ orq $(VEC_SIZE - 1), %rdi ++ vmovdqu 1(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx + testl %ecx, %ecx +- jnz L(char_and_nul_in_first_vec) ++ jnz L(first_vec_x1) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- movq %rdi, %rsi +- andq $-VEC_SIZE, %rdi +- jmp L(aligned_loop) ++ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3 ++ VPCMPEQ %ymm3, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) + ++ /* Save pointer again before realigning. 
*/ ++ movq %rdi, %rsi ++ addq $(VEC_SIZE + 1), %rdi ++ andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %edx +- vpmovmskb %ymm3, %eax +- shrl %cl, %edx +- shrl %cl, %eax +- addq $VEC_SIZE, %rdi +- +- /* Check if there is a CHAR. */ ++L(first_aligned_loop): ++ /* Do 2x VEC at a time. Any more and the cost of finding the ++ match outweights loop benefit. */ ++ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 ++ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 ++ ++ VPCMPEQ %ymm4, %ymm7, %ymm6 ++ VPMIN %ymm4, %ymm5, %ymm8 ++ VPCMPEQ %ymm5, %ymm7, %ymm10 ++ vpor %ymm6, %ymm10, %ymm5 ++ VPCMPEQ %ymm8, %ymm0, %ymm8 ++ vpor %ymm5, %ymm8, %ymm9 ++ ++ vpmovmskb %ymm9, %eax ++ addq $(VEC_SIZE * 2), %rdi ++ /* No zero or search CHAR. */ + testl %eax, %eax +- jnz L(found_char) +- +- testl %edx, %edx +- jnz L(return_null) ++ jz L(first_aligned_loop) + +- jmp L(aligned_loop) +- +- .p2align 4 +-L(found_char): +- testl %edx, %edx +- jnz L(char_and_nul) ++ /* If no zero CHAR then go to second loop (this allows us to ++ throw away all prior work). */ ++ vpmovmskb %ymm8, %ecx ++ testl %ecx, %ecx ++ jz L(second_aligned_loop_prep) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- leaq (%rdi, %rcx), %rsi ++ /* Search char could be zero so we need to get the true match. ++ */ ++ vpmovmskb %ymm5, %eax ++ testl %eax, %eax ++ jnz L(first_aligned_loop_return) + +- .p2align 4 +-L(aligned_loop): +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- add $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx ++ .p2align 4,, 4 ++L(first_vec_x1_or_x2): ++ VPCMPEQ %ymm3, %ymm7, %ymm3 ++ VPCMPEQ %ymm2, %ymm7, %ymm2 + vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jz L(aligned_loop) +- +- .p2align 4 +-L(char_nor_null): +- /* Find a CHAR or a nul CHAR in a loop. */ +- testl %eax, %eax +- jnz L(match) +-L(return_value): +- testl %edx, %edx +- jz L(return_null) +- movl %edx, %eax +- movq %rsi, %rdi ++ vpmovmskb %ymm2, %edx ++ /* Use add for macro-fusion. */ ++ addq %rax, %rdx ++ jz L(first_vec_x0_test) ++ /* NB: We could move this shift to before the branch and save a ++ bit of code size / performance on the fall through. The ++ branch leads to the null case which generally seems hotter ++ than char in first 3x VEC. 
*/ ++ salq $32, %rax ++ addq %rdx, %rax ++ bsrq %rax, %rax ++ leaq 1(%rsi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN + ++ .p2align 4,, 8 ++L(first_aligned_loop_return): ++ VPCMPEQ %ymm4, %ymm0, %ymm4 ++ vpmovmskb %ymm4, %edx ++ salq $32, %rcx ++ orq %rdx, %rcx ++ ++ vpmovmskb %ymm10, %eax ++ vpmovmskb %ymm6, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ blsmskq %rcx, %rcx ++ andq %rcx, %rax ++ jz L(first_vec_x1_or_x2) ++ ++ bsrq %rax, %rax ++ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax + # ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %eax ++ andq $-CHAR_SIZE, %rax + # endif +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ VZEROUPPER_RETURN + ++ /* Search char cannot be zero. */ + .p2align 4 +-L(match): +- /* Find a CHAR. Check if there is a nul CHAR. */ +- vpmovmskb %ymm2, %ecx +- testl %ecx, %ecx +- jnz L(find_nul) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx ++L(second_aligned_loop_set_furthest_match): ++ /* Save VEC and pointer from most recent match. */ ++L(second_aligned_loop_prep): + movq %rdi, %rsi +- jmp L(aligned_loop) ++ vmovdqu %ymm6, %ymm2 ++ vmovdqu %ymm10, %ymm3 + + .p2align 4 +-L(find_nul): +-# ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %ecx +- andl $0x11111111, %eax +-# endif +- /* Mask out any matching bits after the nul CHAR. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax ++L(second_aligned_loop): ++ /* Search 2x at at time. */ ++ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 ++ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 ++ ++ VPCMPEQ %ymm4, %ymm7, %ymm6 ++ VPMIN %ymm4, %ymm5, %ymm1 ++ VPCMPEQ %ymm5, %ymm7, %ymm10 ++ vpor %ymm6, %ymm10, %ymm5 ++ VPCMPEQ %ymm1, %ymm0, %ymm1 ++ vpor %ymm5, %ymm1, %ymm9 ++ ++ vpmovmskb %ymm9, %eax ++ addq $(VEC_SIZE * 2), %rdi + testl %eax, %eax +- /* If there is no CHAR here, return the remembered one. */ +- jz L(return_value) +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER_RETURN +- +- .p2align 4 +-L(char_and_nul): +- /* Find both a CHAR and a nul CHAR. */ +- addq %rcx, %rdi +- movl %edx, %ecx +-L(char_and_nul_in_first_vec): +-# ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %ecx +- andl $0x11111111, %eax +-# endif +- /* Mask out any matching bits after the nul CHAR. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax ++ jz L(second_aligned_loop) ++ vpmovmskb %ymm1, %ecx ++ testl %ecx, %ecx ++ jz L(second_aligned_loop_set_furthest_match) ++ vpmovmskb %ymm5, %eax + testl %eax, %eax +- /* Return null pointer if the nul CHAR comes first. */ +- jz L(return_null) +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ jnz L(return_new_match) ++ ++ /* This is the hot patch. We know CHAR is inbounds and that ++ ymm3/ymm2 have latest match. */ ++ .p2align 4,, 4 ++L(return_old_match): ++ vpmovmskb %ymm3, %eax ++ vpmovmskb %ymm2, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ bsrq %rax, %rax ++ /* Search char cannot be zero so safe to just use lea for ++ wcsrchr. */ ++ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax + VZEROUPPER_RETURN + +- .p2align 4 +-L(return_null): +- xorl %eax, %eax ++ /* Last iteration also potentially has a match. 
*/ ++ .p2align 4,, 8 ++L(return_new_match): ++ VPCMPEQ %ymm4, %ymm0, %ymm4 ++ vpmovmskb %ymm4, %edx ++ salq $32, %rcx ++ orq %rdx, %rcx ++ ++ vpmovmskb %ymm10, %eax ++ vpmovmskb %ymm6, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ blsmskq %rcx, %rcx ++ andq %rcx, %rax ++ jz L(return_old_match) ++ bsrq %rax, %rax ++ /* Search char cannot be zero so safe to just use lea for ++ wcsrchr. */ ++ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax + VZEROUPPER_RETURN + +-END (STRRCHR) ++ .p2align 4,, 4 ++L(cross_page): ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rsi ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ /* Shift out zero CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %edi, %ecx, %ecx ++ testl %ecx, %ecx ++ jz L(page_cross_continue) ++ VPCMPEQ %ymm1, %ymm7, %ymm1 ++ vpmovmskb %ymm1, %eax ++ ++ /* Shift out search CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %edi, %eax, %eax ++ blsmskl %ecx, %ecx ++ /* Check if any search CHAR match in range. */ ++ andl %ecx, %eax ++ jz L(ret2) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret2): ++ VZEROUPPER_RETURN ++END(STRRCHR) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-105.patch b/glibc-RHEL-15696-105.patch new file mode 100644 index 0000000..e0a157f --- /dev/null +++ b/glibc-RHEL-15696-105.patch @@ -0,0 +1,558 @@ +From c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:30 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex +Content-type: text/plain; charset=UTF-8 + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. + +Geometric Mean of all benchmarks New / Old: 0.755 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++--------- + 1 file changed, 290 insertions(+), 181 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S +index f920b5a5..f5b6d755 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strrchr-evex.S +@@ -24,242 +24,351 @@ + # define STRRCHR __strrchr_evex + # endif + +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSRCHR ++# define SHIFT_REG esi ++ ++# define kunpck kunpckbw ++# define kmov_2x kmovd ++# define maskz_2x ecx ++# define maskm_2x eax ++# define CHAR_SIZE 4 ++# define VPMIN vpminud ++# define VPTESTN vptestnmd + # define VPBROADCAST vpbroadcastd +-# define VPCMP vpcmpd +-# define SHIFT_REG r8d ++# define VPCMP vpcmpd + # else ++# define SHIFT_REG edi ++ ++# define kunpck kunpckdq ++# define kmov_2x kmovq ++# define maskz_2x rcx ++# define maskm_2x rax ++ ++# define CHAR_SIZE 1 ++# define VPMIN vpminub ++# define VPTESTN vptestnmb + # define VPBROADCAST vpbroadcastb +-# define VPCMP vpcmpb +-# define SHIFT_REG ecx ++# define VPCMP vpcmpb + # endif + + # define XMMZERO xmm16 + # define YMMZERO ymm16 + # define YMMMATCH ymm17 +-# define YMM1 ymm18 ++# define YMMSAVE ymm18 ++ ++# define YMM1 ymm19 ++# define YMM2 ymm20 ++# define YMM3 ymm21 ++# define YMM4 ymm22 ++# define YMM5 ymm23 ++# define YMM6 ymm24 ++# define YMM7 ymm25 ++# define YMM8 ymm26 + +-# define VEC_SIZE 32 + +- .section .text.evex,"ax",@progbits +-ENTRY (STRRCHR) +- movl %edi, %ecx ++# define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++ .section .text.evex, "ax", @progbits ++ENTRY(STRRCHR) ++ movl %edi, %eax + /* Broadcast CHAR to YMMMATCH. */ + VPBROADCAST %esi, %YMMMATCH + +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(cross_page_boundary) + ++L(page_cross_continue): + VMOVU (%rdi), %YMM1 +- +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ /* k0 has a 1 for each zero CHAR in YMM1. */ ++ VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- +- addq $VEC_SIZE, %rdi +- +- testl %eax, %eax +- jnz L(first_vec) +- + testl %ecx, %ecx +- jnz L(return_null) +- +- andq $-VEC_SIZE, %rdi +- xorl %edx, %edx +- jmp L(aligned_loop) +- +- .p2align 4 +-L(first_vec): +- /* Check if there is a null byte. */ +- testl %ecx, %ecx +- jnz L(char_and_nul_in_first_vec) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- movq %rdi, %rsi +- andq $-VEC_SIZE, %rdi +- jmp L(aligned_loop) +- +- .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ jz L(aligned_more) ++ /* fallthrough: zero CHAR in first VEC. */ + ++ /* K1 has a 1 for each search CHAR match in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k1, %eax ++ /* Build mask up until first zero CHAR (used to mask of ++ potential search CHAR matches past the end of the string). ++ */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ /* Get last match (the `andl` removed any out of bounds ++ matches). */ ++ bsrl %eax, %eax + # ifdef USE_AS_WCSRCHR +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 +- bytes. 
*/ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rdi, %rax + # endif ++L(ret0): ++ ret + +- VMOVA (%rdi), %YMM1 +- +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ ++ /* Returns for first vec x1/x2/x3 have hard coded backward ++ search path for earlier matches. */ ++ .p2align 4,, 6 ++L(first_vec_x1): ++ VPCMP $0, %YMMMATCH, %YMM2, %k1 ++ kmovd %k1, %eax ++ blsmskl %ecx, %ecx ++ /* eax non-zero if search CHAR in range. */ ++ andl %ecx, %eax ++ jnz L(first_vec_x1_return) ++ ++ /* fallthrough: no match in YMM2 then need to check for earlier ++ matches (in YMM1). */ ++ .p2align 4,, 4 ++L(first_vec_x0_test): + VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %edx + kmovd %k1, %eax +- +- shrxl %SHIFT_REG, %edx, %edx +- shrxl %SHIFT_REG, %eax, %eax +- addq $VEC_SIZE, %rdi +- +- /* Check if there is a CHAR. */ + testl %eax, %eax +- jnz L(found_char) +- +- testl %edx, %edx +- jnz L(return_null) +- +- jmp L(aligned_loop) +- +- .p2align 4 +-L(found_char): +- testl %edx, %edx +- jnz L(char_and_nul) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- leaq (%rdi, %rcx), %rsi ++ jz L(ret1) ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ leaq (%rsi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rsi, %rax ++# endif ++L(ret1): ++ ret + +- .p2align 4 +-L(aligned_loop): +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi ++ .p2align 4,, 10 ++L(first_vec_x1_or_x2): ++ VPCMP $0, %YMM3, %YMMMATCH, %k3 ++ VPCMP $0, %YMM2, %YMMMATCH, %k2 ++ /* K2 and K3 have 1 for any search CHAR match. Test if any ++ matches between either of them. Otherwise check YMM1. */ ++ kortestd %k2, %k3 ++ jz L(first_vec_x0_test) ++ ++ /* Guranteed that YMM2 and YMM3 are within range so merge the ++ two bitmasks then get last result. */ ++ kunpck %k2, %k3, %k3 ++ kmovq %k3, %rax ++ bsrq %rax, %rax ++ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %ecx ++ .p2align 4,, 6 ++L(first_vec_x3): ++ VPCMP $0, %YMMMATCH, %YMM4, %k1 + kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ blsmskl %ecx, %ecx ++ /* If no search CHAR match in range check YMM1/YMM2/YMM3. */ ++ andl %ecx, %eax ++ jz L(first_vec_x1_or_x2) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- VMOVA (%rdi), %YMM1 +- add $VEC_SIZE, %rdi ++ .p2align 4,, 6 ++L(first_vec_x0_x1_test): ++ VPCMP $0, %YMMMATCH, %YMM2, %k1 ++ kmovd %k1, %eax ++ /* Check YMM2 for last match first. If no match try YMM1. */ ++ testl %eax, %eax ++ jz L(first_vec_x0_test) ++ .p2align 4,, 4 ++L(first_vec_x1_return): ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %ecx ++ .p2align 4,, 10 ++L(first_vec_x2): ++ VPCMP $0, %YMMMATCH, %YMM3, %k1 + kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ blsmskl %ecx, %ecx ++ /* Check YMM3 for last match first. If no match try YMM2/YMM1. 
++ */ ++ andl %ecx, %eax ++ jz L(first_vec_x0_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ .p2align 4 ++L(aligned_more): ++ /* Need to keep original pointer incase YMM1 has last match. */ ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rdi ++ VMOVU VEC_SIZE(%rdi), %YMM2 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ testl %ecx, %ecx ++ jnz L(first_vec_x1) + +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi ++ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3 ++ VPTESTN %YMM3, %YMM3, %k0 ++ kmovd %k0, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4 ++ VPTESTN %YMM4, %YMM4, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- orl %eax, %ecx +- jz L(aligned_loop) ++ movq %rdi, %r8 ++ testl %ecx, %ecx ++ jnz L(first_vec_x3) + ++ andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +-L(char_nor_null): +- /* Find a CHAR or a null byte in a loop. */ ++L(first_aligned_loop): ++ /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee ++ they don't store a match. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5 ++ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6 ++ ++ VPCMP $0, %YMM5, %YMMMATCH, %k2 ++ vpxord %YMM6, %YMMMATCH, %YMM7 ++ ++ VPMIN %YMM5, %YMM6, %YMM8 ++ VPMIN %YMM8, %YMM7, %YMM7 ++ ++ VPTESTN %YMM7, %YMM7, %k1 ++ subq $(VEC_SIZE * -2), %rdi ++ kortestd %k1, %k2 ++ jz L(first_aligned_loop) ++ ++ VPCMP $0, %YMM6, %YMMMATCH, %k3 ++ VPTESTN %YMM8, %YMM8, %k1 ++ ktestd %k1, %k1 ++ jz L(second_aligned_loop_prep) ++ ++ kortestd %k2, %k3 ++ jnz L(return_first_aligned_loop) ++ ++ .p2align 4,, 6 ++L(first_vec_x1_or_x2_or_x3): ++ VPCMP $0, %YMM4, %YMMMATCH, %k4 ++ kmovd %k4, %eax + testl %eax, %eax +- jnz L(match) +-L(return_value): +- testl %edx, %edx +- jz L(return_null) +- movl %edx, %eax +- movq %rsi, %rdi ++ jz L(first_vec_x1_or_x2) + bsrl %eax, %eax +-# ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq -VEC_SIZE(%rdi, %rax), %rax +-# endif ++ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax + ret + +- .p2align 4 +-L(match): +- /* Find a CHAR. Check if there is a null byte. */ +- kmovd %k0, %ecx +- testl %ecx, %ecx +- jnz L(find_nul) ++ .p2align 4,, 8 ++L(return_first_aligned_loop): ++ VPTESTN %YMM5, %YMM5, %k0 ++ kunpck %k0, %k1, %k0 ++ kmov_2x %k0, %maskz_2x ++ ++ blsmsk %maskz_2x, %maskz_2x ++ kunpck %k2, %k3, %k3 ++ kmov_2x %k3, %maskm_2x ++ and %maskz_2x, %maskm_2x ++ jz L(first_vec_x1_or_x2_or_x3) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx ++ bsr %maskm_2x, %maskm_2x ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++ .p2align 4 ++ /* We can throw away the work done for the first 4x checks here ++ as we have a later match. This is the 'fast' path persay. ++ */ ++L(second_aligned_loop_prep): ++L(second_aligned_loop_set_furthest_match): + movq %rdi, %rsi +- jmp L(aligned_loop) ++ kunpck %k2, %k3, %k4 + + .p2align 4 +-L(find_nul): +- /* Mask out any matching bits after the null byte. 
*/ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax +- testl %eax, %eax +- /* If there is no CHAR here, return the remembered one. */ +- jz L(return_value) +- bsrl %eax, %eax ++L(second_aligned_loop): ++ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1 ++ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2 ++ ++ VPCMP $0, %YMM1, %YMMMATCH, %k2 ++ vpxord %YMM2, %YMMMATCH, %YMM3 ++ ++ VPMIN %YMM1, %YMM2, %YMM4 ++ VPMIN %YMM3, %YMM4, %YMM3 ++ ++ VPTESTN %YMM3, %YMM3, %k1 ++ subq $(VEC_SIZE * -2), %rdi ++ kortestd %k1, %k2 ++ jz L(second_aligned_loop) ++ ++ VPCMP $0, %YMM2, %YMMMATCH, %k3 ++ VPTESTN %YMM4, %YMM4, %k1 ++ ktestd %k1, %k1 ++ jz L(second_aligned_loop_set_furthest_match) ++ ++ kortestd %k2, %k3 ++ /* branch here because there is a significant advantage interms ++ of output dependency chance in using edx. */ ++ jnz L(return_new_match) ++L(return_old_match): ++ kmovq %k4, %rax ++ bsrq %rax, %rax ++ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax ++ ret ++ ++L(return_new_match): ++ VPTESTN %YMM1, %YMM1, %k0 ++ kunpck %k0, %k1, %k0 ++ kmov_2x %k0, %maskz_2x ++ ++ blsmsk %maskz_2x, %maskz_2x ++ kunpck %k2, %k3, %k3 ++ kmov_2x %k3, %maskm_2x ++ and %maskz_2x, %maskm_2x ++ jz L(return_old_match) ++ ++ bsr %maskm_2x, %maskm_2x ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++L(cross_page_boundary): ++ /* eax contains all the page offset bits of src (rdi). `xor rdi, ++ rax` sets pointer will all page offset bits cleared so ++ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC ++ before page cross (guranteed to be safe to read). Doing this ++ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves ++ a bit of code size. */ ++ xorq %rdi, %rax ++ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1 ++ VPTESTN %YMM1, %YMM1, %k0 ++ kmovd %k0, %ecx ++ ++ /* Shift out zero CHAR matches that are before the begining of ++ src (rdi). */ + # ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ movl %edi, %esi ++ andl $(VEC_SIZE - 1), %esi ++ shrl $2, %esi + # endif +- ret ++ shrxl %SHIFT_REG, %ecx, %ecx + +- .p2align 4 +-L(char_and_nul): +- /* Find both a CHAR and a null byte. */ +- addq %rcx, %rdi +- movl %edx, %ecx +-L(char_and_nul_in_first_vec): +- /* Mask out any matching bits after the null byte. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax +- testl %eax, %eax +- /* Return null pointer if the null byte comes first. */ +- jz L(return_null) ++ testl %ecx, %ecx ++ jz L(page_cross_continue) ++ ++ /* Found zero CHAR so need to test for search CHAR. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k1, %eax ++ /* Shift out search CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %SHIFT_REG, %eax, %eax ++ ++ /* Check if any search CHAR match in range. */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret3) + bsrl %eax, %eax + # ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
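For readers following the assembly, both the removed AVX2 body and the new EVEX body above vectorize the same scalar contract: scan forward once, remember the most recent occurrence of the search character, and report it when the terminating null is reached. A minimal reference sketch of that contract (not part of the patch, narrow-character case only):

/* Scalar model of strrchr: remember the furthest match seen so far,
   stop at the null terminator.  The vectorized code does the same
   thing 32 bytes at a time and only pins down the exact position once
   a zero byte shows up in a block.  */
#include <stddef.h>

static char *
strrchr_ref (const char *s, int c)
{
  const char *last = NULL;

  for (;; s++)
    {
      if (*s == (char) c)
        last = s;              /* furthest match so far */
      if (*s == '\0')
        return (char *) last;  /* NULL if the character never occurred */
    }
}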
*/ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++ leaq (%rdi, %rax, CHAR_SIZE), %rax + # else +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ addq %rdi, %rax + # endif ++L(ret3): + ret + +- .p2align 4 +-L(return_null): +- xorl %eax, %eax +- ret +- +-END (STRRCHR) ++END(STRRCHR) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-106.patch b/glibc-RHEL-15696-106.patch new file mode 100644 index 0000000..f3bdb17 --- /dev/null +++ b/glibc-RHEL-15696-106.patch @@ -0,0 +1,73 @@ +From 911c63a51c690dd1a97dfc587097277029baf00f Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 27 Apr 2022 15:13:02 -0500 +Subject: [PATCH] sysdeps: Add 'get_fast_jitter' interace in fast-jitter.h +Content-type: text/plain; charset=UTF-8 + +'get_fast_jitter' is meant to be used purely for performance +purposes. In all cases it's used it should be acceptable to get no +randomness (see default case). An example use case is in setting +jitter for retries between threads at a lock. There is a +performance benefit to having jitter, but only if the jitter can +be generated very quickly and ultimately there is no serious issue +if no jitter is generated. + +The implementation generally uses 'HP_TIMING_NOW' iff it is +inlined (avoid any potential syscall paths). +Reviewed-by: H.J. Lu +--- + sysdeps/generic/fast-jitter.h | 42 +++++++++++++++++++++++++++++++++++ + 1 file changed, 42 insertions(+) + create mode 100644 sysdeps/generic/fast-jitter.h + +diff --git a/sysdeps/generic/fast-jitter.h b/sysdeps/generic/fast-jitter.h +new file mode 100644 +index 00000000..4dd53e34 +--- /dev/null ++++ b/sysdeps/generic/fast-jitter.h +@@ -0,0 +1,42 @@ ++/* Fallback for fast jitter just return 0. ++ Copyright (C) 2019-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef _FAST_JITTER_H ++# define _FAST_JITTER_H ++ ++# include ++# include ++ ++/* Baseline just return 0. We could create jitter using a clock or ++ 'random_bits' but that may imply a syscall and the goal of ++ 'get_fast_jitter' is minimal overhead "randomness" when such ++ randomness helps performance. Adding high overhead the function ++ defeats the purpose. 
*/ ++static inline uint32_t ++get_fast_jitter (void) ++{ ++# if HP_TIMING_INLINE ++ hp_timing_t jitter; ++ HP_TIMING_NOW (jitter); ++ return (uint32_t) jitter; ++# else ++ return 0; ++# endif ++} ++ ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-107.patch b/glibc-RHEL-15696-107.patch new file mode 100644 index 0000000..738cc23 --- /dev/null +++ b/glibc-RHEL-15696-107.patch @@ -0,0 +1,226 @@ +From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001 +From: Wangyang Guo +Date: Fri, 6 May 2022 01:50:10 +0000 +Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop +Content-type: text/plain; charset=UTF-8 + +When mutiple threads waiting for lock at the same time, once lock owner +releases the lock, waiters will see lock available and all try to lock, +which may cause an expensive CAS storm. + +Binary exponential backoff with random jitter is introduced. As try-lock +attempt increases, there is more likely that a larger number threads +compete for adaptive mutex lock, so increase wait time in exponential. +A random jitter is also added to avoid synchronous try-lock from other +threads. + +v2: Remove read-check before try-lock for performance. + +v3: +1. Restore read-check since it works well in some platform. +2. Make backoff arch dependent, and enable it for x86_64. +3. Limit max backoff to reduce latency in large critical section. + +v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h + +v5: Commit log updated for regression in large critical section. + +Result of pthread-mutex-locks bench + +Test Platform: Xeon 8280L (2 socket, 112 CPUs in total) +First Row: thread number +First Col: critical section length +Values: backoff vs upstream, time based, low is better + +non-critical-length: 1 + 1 2 4 8 16 32 64 112 140 +0 0.99 0.58 0.52 0.49 0.43 0.44 0.46 0.52 0.54 +1 0.98 0.43 0.56 0.50 0.44 0.45 0.50 0.56 0.57 +2 0.99 0.41 0.57 0.51 0.45 0.47 0.48 0.60 0.61 +4 0.99 0.45 0.59 0.53 0.48 0.49 0.52 0.64 0.65 +8 1.00 0.66 0.71 0.63 0.56 0.59 0.66 0.72 0.71 +16 0.97 0.78 0.91 0.73 0.67 0.70 0.79 0.80 0.80 +32 0.95 1.17 0.98 0.87 0.82 0.86 0.89 0.90 0.90 +64 0.96 0.95 1.01 1.01 0.98 1.00 1.03 0.99 0.99 +128 0.99 1.01 1.01 1.17 1.08 1.12 1.02 0.97 1.02 + +non-critical-length: 32 + 1 2 4 8 16 32 64 112 140 +0 1.03 0.97 0.75 0.65 0.58 0.58 0.56 0.70 0.70 +1 0.94 0.95 0.76 0.65 0.58 0.58 0.61 0.71 0.72 +2 0.97 0.96 0.77 0.66 0.58 0.59 0.62 0.74 0.74 +4 0.99 0.96 0.78 0.66 0.60 0.61 0.66 0.76 0.77 +8 0.99 0.99 0.84 0.70 0.64 0.66 0.71 0.80 0.80 +16 0.98 0.97 0.95 0.76 0.70 0.73 0.81 0.85 0.84 +32 1.04 1.12 1.04 0.89 0.82 0.86 0.93 0.91 0.91 +64 0.99 1.15 1.07 1.00 0.99 1.01 1.05 0.99 0.99 +128 1.00 1.21 1.20 1.22 1.25 1.31 1.12 1.10 0.99 + +non-critical-length: 128 + 1 2 4 8 16 32 64 112 140 +0 1.02 1.00 0.99 0.67 0.61 0.61 0.61 0.74 0.73 +1 0.95 0.99 1.00 0.68 0.61 0.60 0.60 0.74 0.74 +2 1.00 1.04 1.00 0.68 0.59 0.61 0.65 0.76 0.76 +4 1.00 0.96 0.98 0.70 0.63 0.63 0.67 0.78 0.77 +8 1.01 1.02 0.89 0.73 0.65 0.67 0.71 0.81 0.80 +16 0.99 0.96 0.96 0.79 0.71 0.73 0.80 0.84 0.84 +32 0.99 0.95 1.05 0.89 0.84 0.85 0.94 0.92 0.91 +64 1.00 0.99 1.16 1.04 1.00 1.02 1.06 0.99 0.99 +128 1.00 1.06 0.98 1.14 1.39 1.26 1.08 1.02 0.98 + +There is regression in large critical section. But adaptive mutex is +aimed for "quick" locks. Small critical section is more common when +users choose to use adaptive pthread_mutex. + +Signed-off-by: Wangyang Guo +Reviewed-by: H.J. 
Lu + +Conflicts: + pthreadP.h + (had been moved) + nptl/pthread_mutex_lock.c + (max_adaptive_count renamed) + +--- + nptl/pthreadP.h | 1 + + nptl/pthread_mutex_lock.c | 16 +++++++-- + sysdeps/nptl/pthread_mutex_backoff.h | 35 ++++++++++++++++++ + sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++ + 4 files changed, 89 insertions(+), 2 deletions(-) + create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h + create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h + +diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h +index 7ddc166c..1550e3b6 100644 +--- a/nptl/pthreadP.h ++++ b/nptl/pthreadP.h +@@ -33,6 +33,7 @@ + #include + #include + #include ++#include + + + /* Atomic operations on TLS memory. */ +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index d96a9933..c7770fc9 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex) + int cnt = 0; + int max_cnt = MIN (MAX_ADAPTIVE_COUNT, + mutex->__data.__spins * 2 + 10); ++ int spin_count, exp_backoff = 1; ++ unsigned int jitter = get_jitter (); + do + { +- if (cnt++ >= max_cnt) ++ /* In each loop, spin count is exponential backoff plus ++ random jitter, random range is [0, exp_backoff-1]. */ ++ spin_count = exp_backoff + (jitter & (exp_backoff - 1)); ++ cnt += spin_count; ++ if (cnt >= max_cnt) + { ++ /* If cnt exceeds max spin count, just go to wait ++ queue. */ + LLL_MUTEX_LOCK (mutex); + break; + } +- atomic_spin_nop (); ++ do ++ atomic_spin_nop (); ++ while (--spin_count > 0); ++ /* Prepare for next loop. */ ++ exp_backoff = get_next_backoff (exp_backoff); + } + while (LLL_MUTEX_READ_LOCK (mutex) != 0 + || LLL_MUTEX_TRYLOCK (mutex) != 0); +diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h +new file mode 100644 +index 00000000..5b26c22a +--- /dev/null ++++ b/sysdeps/nptl/pthread_mutex_backoff.h +@@ -0,0 +1,35 @@ ++/* Pthread mutex backoff configuration. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++#ifndef _PTHREAD_MUTEX_BACKOFF_H ++#define _PTHREAD_MUTEX_BACKOFF_H 1 ++ ++static inline unsigned int ++get_jitter (void) ++{ ++ /* Arch dependent random jitter, return 0 disables random. */ ++ return 0; ++} ++ ++static inline int ++get_next_backoff (int backoff) ++{ ++ /* Next backoff, return 1 disables mutex backoff. */ ++ return 1; ++} ++ ++#endif +diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h +new file mode 100644 +index 00000000..ec74c3d9 +--- /dev/null ++++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h +@@ -0,0 +1,39 @@ ++/* Pthread mutex backoff configuration. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
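The loop change above is easier to see outside the mutex plumbing. A minimal, self-contained sketch of the same idea follows; the lock type, constants and busy-wait body are illustrative stand-ins (the jitter value would come from something like get_fast_jitter (), and the real mutex code additionally gives up after a maximum spin count and falls back to the kernel wait queue):

/* Toy spinlock showing binary exponential backoff plus jitter.  */
#include <stdatomic.h>
#include <stdint.h>

#define MAX_BACKOFF 16   /* cap keeps latency bounded in long critical sections */

static void
spin_lock_backoff (atomic_flag *lock, uint32_t jitter)
{
  int backoff = 1;

  while (atomic_flag_test_and_set_explicit (lock, memory_order_acquire))
    {
      /* Spin count = exponential backoff plus jitter in [0, backoff-1].  */
      int spin = backoff + (int) (jitter & (uint32_t) (backoff - 1));
      for (volatile int i = 0; i < spin; i++)
        ;   /* busy wait; the real code issues a cpu pause hint per iteration */
      if (backoff < MAX_BACKOFF)
        backoff <<= 1;   /* binary exponential backoff */
    }
}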
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++#ifndef _PTHREAD_MUTEX_BACKOFF_H ++#define _PTHREAD_MUTEX_BACKOFF_H 1 ++ ++#include ++ ++static inline unsigned int ++get_jitter (void) ++{ ++ return get_fast_jitter (); ++} ++ ++#define MAX_BACKOFF 16 ++ ++static inline int ++get_next_backoff (int backoff) ++{ ++ /* Binary expontial backoff. Limiting max backoff ++ can reduce latency in large critical section. */ ++ return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff; ++} ++ ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-108.patch b/glibc-RHEL-15696-108.patch new file mode 100644 index 0000000..17bf7d8 --- /dev/null +++ b/glibc-RHEL-15696-108.patch @@ -0,0 +1,55 @@ +From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 15 Feb 2022 08:18:15 -0600 +Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ + #28896] +Content-type: text/plain; charset=UTF-8 + +In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would +call strcmp-avx2 and wcscmp-avx2 respectively. This would have +not checks around vzeroupper and would trigger spurious +aborts. This commit fixes that. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on +AVX2 machines with and without RTM. + +Co-authored-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------ + 1 file changed, 2 insertions(+), 6 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/strcmp-avx2.S + (split into two patches due to upstream bug differences) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 28cc98b6..e267c6cb 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -345,10 +345,10 @@ L(one_or_less): + movq %LOCALE_REG, %rdx + # endif + jb L(ret_zero) +-# ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large + unsigned). */ +- jnbe __wcscmp_avx2 ++ jnbe OVERFLOW_STRCMP ++# ifdef USE_AS_WCSCMP + movl (%rdi), %edx + xorl %eax, %eax + cmpl (%rsi), %edx +@@ -357,10 +357,6 @@ L(one_or_less): + negl %eax + orl $1, %eax + # else +- /* 'nbe' covers the case where length is negative (large +- unsigned). 
*/ +- +- jnbe __strcmp_avx2 + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + TOLOWER_gpr (%rax, %eax) +-- +GitLab + diff --git a/glibc-RHEL-15696-109.patch b/glibc-RHEL-15696-109.patch new file mode 100644 index 0000000..8aaa314 --- /dev/null +++ b/glibc-RHEL-15696-109.patch @@ -0,0 +1,60 @@ +From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001 +From: Stefan Liebler +Date: Mon, 28 Jun 2021 13:01:07 +0200 +Subject: s390x: Update math: redirect roundeven function + +After recent commit +447954a206837b5f153869cfeeeab44631c3fac9 +"math: redirect roundeven function", building on +s390x fails with: +Error: symbol `__roundevenl' is already defined + +Similar to aarch64/riscv fix, this patch redirects target +specific functions for s390x: +commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6 +"Update math: redirect roundeven function" + +diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c +index 40b07e054b..0773adfed0 100644 +--- a/sysdeps/s390/fpu/s_roundeven.c ++++ b/sysdeps/s390/fpu/s_roundeven.c +@@ -18,6 +18,7 @@ + . */ + + #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT ++# define NO_MATH_REDIRECT + # include + # include + +@@ -31,7 +32,6 @@ __roundeven (double x) + __asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x)); + return y; + } +-hidden_def (__roundeven) + libm_alias_double (__roundeven, roundeven) + + #else +diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c +index d2fbf3d2b6..289785bc4a 100644 +--- a/sysdeps/s390/fpu/s_roundevenf.c ++++ b/sysdeps/s390/fpu/s_roundevenf.c +@@ -18,6 +18,7 @@ + . */ + + #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT ++# define NO_MATH_REDIRECT + # include + # include + +diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c +index 29ab7a8616..94b6459ab4 100644 +--- a/sysdeps/s390/fpu/s_roundevenl.c ++++ b/sysdeps/s390/fpu/s_roundevenl.c +@@ -18,6 +18,7 @@ + . */ + + #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT ++# define NO_MATH_REDIRECT + # include + # include + # include diff --git a/glibc-RHEL-15696-11.patch b/glibc-RHEL-15696-11.patch new file mode 100644 index 0000000..54d7eff --- /dev/null +++ b/glibc-RHEL-15696-11.patch @@ -0,0 +1,74 @@ +From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 26 Feb 2021 05:36:59 -0800 +Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP +Content-type: text/plain; charset=UTF-8 + +1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered +by VZEROUPPER inside a transactionally executing RTM region. +2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2 +loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs, +1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp. Add +Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions. 
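The interaction of the two preference bits set above is easier to read as plain control flow. The following is only a rough model of how an ifunc selector consumes them; plain booleans stand in for the CPU_FEATURE_USABLE_P / CPU_FEATURES_ARCH_P checks, and the legacy SSE4.2/SSSE3/SSE2 variants are collapsed into one fallback:

/* Sketch of the strcmp dispatch policy, not the actual selector.  */
typedef enum { STRCMP_LEGACY, STRCMP_AVX2, STRCMP_EVEX } strcmp_impl;

static strcmp_impl
pick_strcmp (int avx2_usable, int avx512vl_bw_usable,
             int prefer_avx2_strcmp, int prefer_no_vzeroupper)
{
  if (avx2_usable)
    {
      /* EVEX needs no VZEROUPPER, but is skipped when Prefer_AVX2_STRCMP
         says the AVX2 compare sequence is faster.  */
      if (avx512vl_bw_usable && !prefer_avx2_strcmp)
        return STRCMP_EVEX;
      /* The AVX2 variant uses VZEROUPPER, so it is avoided when
         Prefer_No_VZEROUPPER is set (e.g. under RTM).  */
      if (!prefer_no_vzeroupper)
        return STRCMP_AVX2;
    }
  return STRCMP_LEGACY;
}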
+--- + sysdeps/x86/cpu-features.c | 20 +++++++++++++++++-- + sysdeps/x86/cpu-tunables.c | 2 ++ + ...cpu-features-preferred_feature_index_1.def | 1 + + 3 files changed, 21 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 91042505..3610ee5c 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features) + cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER] + |= bit_arch_Prefer_No_VZEROUPPER; + else +- cpu_features->preferred[index_arch_Prefer_No_AVX512] +- |= bit_arch_Prefer_No_AVX512; ++ { ++ cpu_features->preferred[index_arch_Prefer_No_AVX512] ++ |= bit_arch_Prefer_No_AVX512; ++ ++ /* Avoid RTM abort triggered by VZEROUPPER inside a ++ transactionally executing RTM region. */ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER] ++ |= bit_arch_Prefer_No_VZEROUPPER; ++ ++ /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp ++ requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp ++ requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB, ++ AVX2 strcmp is faster than EVEX strcmp. */ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) ++ cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP] ++ |= bit_arch_Prefer_AVX2_STRCMP; ++ } + } + /* This spells out "AuthenticAMD". */ + else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index 3173b2b9..73adbaba 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, + Fast_Copy_Backward, + disable, 18); ++ CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH ++ (n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18); + } + break; + case 19: +diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +index 17a5cc42..4ca70b40 100644 +--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def ++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +@@ -32,3 +32,4 @@ BIT (Prefer_ERMS) + BIT (Prefer_FSRM) + BIT (Prefer_No_AVX512) + BIT (MathVec_Prefer_No_AVX512) ++BIT (Prefer_AVX2_STRCMP) +-- +GitLab + diff --git a/glibc-RHEL-15696-110.patch b/glibc-RHEL-15696-110.patch new file mode 100644 index 0000000..c499761 --- /dev/null +++ b/glibc-RHEL-15696-110.patch @@ -0,0 +1,26 @@ +From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Wed, 23 Jun 2021 13:29:41 -0700 +Subject: Update math: redirect roundeven function + +Redirect target specific roundeven functions for aarch64, ldbl-128ibm +and riscv. + +Conflicts: + sysdeps/aarch64/* + (not needed) + sysdeps/riscv/* + (not supported) + +diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c +index 6701970f4a..90eecf496b 100644 +--- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c ++++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . 
*/ + ++#define NO_MATH_REDIRECT + #include + #include + diff --git a/glibc-RHEL-15696-12.patch b/glibc-RHEL-15696-12.patch new file mode 100644 index 0000000..85b568e --- /dev/null +++ b/glibc-RHEL-15696-12.patch @@ -0,0 +1,3410 @@ +From 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 06:24:52 -0800 +Subject: [PATCH] x86-64: Add ifunc-avx2.h functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-avx2.h, strchr.c, strcmp.c, strncmp.c and wcsnlen.c to +select the function optimized with 256-bit EVEX instructions using +YMM16-YMM31 registers to avoid RTM abort with usable AVX512VL, AVX512BW +and BMI2 since VZEROUPPER isn't needed at function exit. + +For strcmp/strncmp, prefer AVX2 strcmp/strncmp if Prefer_AVX2_STRCMP +is set. +--- + sysdeps/x86_64/multiarch/Makefile | 21 +- + sysdeps/x86_64/multiarch/ifunc-avx2.h | 14 +- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 81 ++ + sysdeps/x86_64/multiarch/memchr-evex.S | 381 +++++++ + sysdeps/x86_64/multiarch/memrchr-evex.S | 337 +++++++ + sysdeps/x86_64/multiarch/rawmemchr-evex.S | 4 + + sysdeps/x86_64/multiarch/strchr-evex.S | 335 +++++++ + sysdeps/x86_64/multiarch/strchr.c | 14 +- + sysdeps/x86_64/multiarch/strchrnul-evex.S | 3 + + sysdeps/x86_64/multiarch/strcmp-evex.S | 1043 ++++++++++++++++++++ + sysdeps/x86_64/multiarch/strcmp.c | 15 +- + sysdeps/x86_64/multiarch/strlen-evex.S | 436 ++++++++ + sysdeps/x86_64/multiarch/strncmp-evex.S | 3 + + sysdeps/x86_64/multiarch/strncmp.c | 15 +- + sysdeps/x86_64/multiarch/strnlen-evex.S | 4 + + sysdeps/x86_64/multiarch/strrchr-evex.S | 265 +++++ + sysdeps/x86_64/multiarch/wcschr-evex.S | 3 + + sysdeps/x86_64/multiarch/wcscmp-evex.S | 4 + + sysdeps/x86_64/multiarch/wcslen-evex.S | 4 + + sysdeps/x86_64/multiarch/wcsncmp-evex.S | 5 + + sysdeps/x86_64/multiarch/wcsnlen-evex.S | 5 + + sysdeps/x86_64/multiarch/wcsnlen.c | 14 +- + sysdeps/x86_64/multiarch/wcsrchr-evex.S | 3 + + sysdeps/x86_64/multiarch/wmemchr-evex.S | 4 + + 24 files changed, 2996 insertions(+), 17 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/memrchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strcmp-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strlen-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strncmp-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcscmp-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex.S + +Conflicts: + sysdeps/x86_64/multiarch/wcsnlen.c + (account for missing upstream macros) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 9477538a..5ce85882 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -39,7 +39,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memmove-avx512-unaligned-erms \ + memset-sse2-unaligned-erms \ + 
memset-avx2-unaligned-erms \ +- memset-avx512-unaligned-erms ++ memset-avx512-unaligned-erms \ ++ memchr-evex \ ++ memrchr-evex \ ++ rawmemchr-evex \ ++ strchr-evex \ ++ strchrnul-evex \ ++ strcmp-evex \ ++ strlen-evex \ ++ strncmp-evex \ ++ strnlen-evex \ ++ strrchr-evex + CFLAGS-varshift.c += -msse4 + CFLAGS-strcspn-c.c += -msse4 + CFLAGS-strpbrk-c.c += -msse4 +@@ -56,7 +66,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcschr-sse2 wcschr-avx2 \ + wcsrchr-sse2 wcsrchr-avx2 \ + wcsnlen-sse4_1 wcsnlen-c \ +- wcslen-sse2 wcslen-avx2 wcsnlen-avx2 ++ wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \ ++ wcschr-evex \ ++ wcscmp-evex \ ++ wcslen-evex \ ++ wcsncmp-evex \ ++ wcsnlen-evex \ ++ wcsrchr-evex \ ++ wmemchr-evex + endif + + ifeq ($(subdir),debug) +diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h +index 5c88640a..7081b0c9 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h ++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h +@@ -21,16 +21,24 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + return OPTIMIZE (sse2); + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index fe13505c..bd7d9f19 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -43,6 +43,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memchr, + CPU_FEATURE_USABLE (AVX2), + __memchr_avx2) ++ IFUNC_IMPL_ADD (array, i, memchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __memchr_evex) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/memcmp.c. */ +@@ -121,6 +126,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memrchr, + CPU_FEATURE_USABLE (AVX2), + __memrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, memrchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memrchr_evex) ++ + IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2)) + + #ifdef SHARED +@@ -179,6 +189,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, rawmemchr, + CPU_FEATURE_USABLE (AVX2), + __rawmemchr_avx2) ++ IFUNC_IMPL_ADD (array, i, rawmemchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __rawmemchr_evex) + IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/strlen.c. 
*/ +@@ -186,6 +201,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strlen, + CPU_FEATURE_USABLE (AVX2), + __strlen_avx2) ++ IFUNC_IMPL_ADD (array, i, strlen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strlen_evex) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) + + /* Support sysdeps/x86_64/multiarch/strnlen.c. */ +@@ -193,6 +212,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strnlen, + CPU_FEATURE_USABLE (AVX2), + __strnlen_avx2) ++ IFUNC_IMPL_ADD (array, i, strnlen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strnlen_evex) + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) + + /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ +@@ -255,6 +278,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strchr, + CPU_FEATURE_USABLE (AVX2), + __strchr_avx2) ++ IFUNC_IMPL_ADD (array, i, strchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __strchr_evex) + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf) + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2)) + +@@ -263,6 +291,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strchrnul, + CPU_FEATURE_USABLE (AVX2), + __strchrnul_avx2) ++ IFUNC_IMPL_ADD (array, i, strchrnul, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __strchrnul_evex) + IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2)) + + /* Support sysdeps/x86_64/multiarch/strrchr.c. */ +@@ -270,6 +303,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strrchr, + CPU_FEATURE_USABLE (AVX2), + __strrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, strrchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strrchr_evex) + IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcmp.c. */ +@@ -277,6 +314,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strcmp, + CPU_FEATURE_USABLE (AVX2), + __strcmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strcmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __strcmp_evex) + IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2), + __strcmp_sse42) + IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3), +@@ -370,6 +412,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcschr, + CPU_FEATURE_USABLE (AVX2), + __wcschr_avx2) ++ IFUNC_IMPL_ADD (array, i, wcschr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcschr_evex) + IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsrchr.c. */ +@@ -377,6 +424,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsrchr, + CPU_FEATURE_USABLE (AVX2), + __wcsrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsrchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcsrchr_evex) + IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcscmp.c. 
*/ +@@ -384,6 +436,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcscmp, + CPU_FEATURE_USABLE (AVX2), + __wcscmp_avx2) ++ IFUNC_IMPL_ADD (array, i, wcscmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcscmp_evex) + IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsncmp.c. */ +@@ -391,6 +448,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsncmp, + CPU_FEATURE_USABLE (AVX2), + __wcsncmp_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsncmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcsncmp_evex) + IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcscpy.c. */ +@@ -404,6 +466,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcslen, + CPU_FEATURE_USABLE (AVX2), + __wcslen_avx2) ++ IFUNC_IMPL_ADD (array, i, wcslen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcslen_evex) + IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ +@@ -411,6 +478,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsnlen, + CPU_FEATURE_USABLE (AVX2), + __wcsnlen_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsnlen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcsnlen_evex) + IFUNC_IMPL_ADD (array, i, wcsnlen, + CPU_FEATURE_USABLE (SSE4_1), + __wcsnlen_sse4_1) +@@ -421,6 +493,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wmemchr, + CPU_FEATURE_USABLE (AVX2), + __wmemchr_avx2) ++ IFUNC_IMPL_ADD (array, i, wmemchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wmemchr_evex) + IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ +@@ -568,6 +645,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strncmp, + CPU_FEATURE_USABLE (AVX2), + __strncmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strncmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncmp_evex) + IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2), + __strncmp_sse42) + IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3), +diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S +new file mode 100644 +index 00000000..6dd5d67b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memchr-evex.S +@@ -0,0 +1,381 @@ ++/* memchr/wmemchr optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef MEMCHR ++# define MEMCHR __memchr_evex ++# endif ++ ++# ifdef USE_AS_WMEMCHR ++# define VPBROADCAST vpbroadcastd ++# define VPCMP vpcmpd ++# define SHIFT_REG r8d ++# else ++# define VPBROADCAST vpbroadcastb ++# define VPCMP vpcmpb ++# define SHIFT_REG ecx ++# endif ++ ++# define XMMMATCH xmm16 ++# define YMMMATCH ymm16 ++# define YMM1 ymm17 ++# define YMM2 ymm18 ++# define YMM3 ymm19 ++# define YMM4 ymm20 ++# define YMM5 ymm21 ++# define YMM6 ymm22 ++ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (MEMCHR) ++# ifndef USE_AS_RAWMEMCHR ++ /* Check for zero length. */ ++ test %RDX_LP, %RDX_LP ++ jz L(zero) ++# endif ++ movl %edi, %ecx ++# ifdef USE_AS_WMEMCHR ++ shl $2, %RDX_LP ++# else ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif ++# endif ++ /* Broadcast CHAR to YMMMATCH. */ ++ VPBROADCAST %esi, %YMMMATCH ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. */ ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ ++# ifndef USE_AS_RAWMEMCHR ++ jnz L(first_vec_x0_check) ++ /* Adjust length and check the end of data. */ ++ subq $VEC_SIZE, %rdx ++ jbe L(zero) ++# else ++ jnz L(first_vec_x0) ++# endif ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. */ ++ addq %rcx, %rdx ++ ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ jmp L(more_4x_vec) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. */ ++ movl %ecx, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++# endif ++ andq $-VEC_SIZE, %rdi ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ /* Remove the leading bytes. */ ++ sarxl %SHIFT_REG, %eax, %eax ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++# ifndef USE_AS_RAWMEMCHR ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++# endif ++ addq %rdi, %rax ++ addq %rcx, %rax ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++# ifndef USE_AS_RAWMEMCHR ++ /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" ++ instead of "(rdx + rcx) - VEC_SIZE" to void possible addition ++ overflow. */ ++ negq %rcx ++ addq $VEC_SIZE, %rcx ++ ++ /* Check the end of data. */ ++ subq %rcx, %rdx ++ jbe L(zero) ++# endif ++ ++ addq $VEC_SIZE, %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++L(more_4x_vec): ++ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. 
*/ ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++ /* Align data to 4 * VEC_SIZE. */ ++ movq %rdi, %rcx ++ andl $(4 * VEC_SIZE - 1), %ecx ++ andq $-(4 * VEC_SIZE), %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. */ ++ addq %rcx, %rdx ++# endif ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 ++ kord %k1, %k2, %k5 ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 ++ ++ kord %k3, %k4, %k6 ++ kortestd %k5, %k6 ++ jnz L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifdef USE_AS_RAWMEMCHR ++ jmp L(loop_4x_vec) ++# else ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_4x_vec) ++ ++L(last_4x_vec_or_less): ++ /* Less than 4 * VEC and aligned to VEC_SIZE. */ ++ addl $(VEC_SIZE * 2), %edx ++ jle L(last_2x_vec) ++ ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x2_check) ++ subl $VEC_SIZE, %edx ++ jle L(zero) ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x3_check) ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ addl $(VEC_SIZE * 2), %edx ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x0_check) ++ subl $VEC_SIZE, %edx ++ jle L(zero) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(first_vec_x0_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(first_vec_x1_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(first_vec_x2_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(first_vec_x3_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. 
*/ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4 ++L(first_vec_x0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (%rdi, %rax, 4), %rax ++# else ++ addq %rdi, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax ++# else ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ kmovd %k3, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ kmovd %k4, %eax ++ testl %eax, %eax ++L(first_vec_x3): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax ++# else ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++# endif ++ ret ++ ++END (MEMCHR) ++#endif +diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S +new file mode 100644 +index 00000000..16bf8e02 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memrchr-evex.S +@@ -0,0 +1,337 @@ ++/* memrchr optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# define VMOVA vmovdqa64 ++ ++# define YMMMATCH ymm16 ++ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (__memrchr_evex) ++ /* Broadcast CHAR to YMMMATCH. */ ++ vpbroadcastb %esi, %YMMMATCH ++ ++ sub $VEC_SIZE, %RDX_LP ++ jbe L(last_vec_or_less) ++ ++ add %RDX_LP, %RDI_LP ++ ++ /* Check the last VEC_SIZE bytes. */ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x0) ++ ++ subq $(VEC_SIZE * 4), %rdi ++ movl %edi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ jz L(aligned_more) ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rdx ++ andq $-VEC_SIZE, %rdi ++ subq %rcx, %rdx ++ ++ .p2align 4 ++L(aligned_more): ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++ ++ /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. 
*/ ++ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 ++ kmovd %k3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k4 ++ kmovd %k4, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x0) ++ ++ /* Align data to 4 * VEC_SIZE for loop with fewer branches. ++ There are some overlaps with above if data isn't aligned ++ to 4 * VEC_SIZE. */ ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ jz L(loop_4x_vec) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ addq $(VEC_SIZE * 4), %rdx ++ andq $-(VEC_SIZE * 4), %rdi ++ subq %rcx, %rdx ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ subq $(VEC_SIZE * 4), %rdi ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k1 ++ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 ++ kord %k1, %k2, %k5 ++ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 ++ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 ++ ++ kord %k3, %k4, %k6 ++ kortestd %k5, %k6 ++ jz L(loop_4x_vec) ++ ++ /* There is a match. */ ++ kmovd %k4, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ kmovd %k3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ kmovd %k1, %eax ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_4x_vec_or_less): ++ addl $(VEC_SIZE * 4), %edx ++ cmpl $(VEC_SIZE * 2), %edx ++ jbe L(last_2x_vec) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 ++ kmovd %k3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1_check) ++ cmpl $(VEC_SIZE * 3), %edx ++ jbe L(zero) ++ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k4 ++ kmovd %k4, %eax ++ testl %eax, %eax ++ jz L(zero) ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 4), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3_check) ++ cmpl $VEC_SIZE, %edx ++ jbe L(zero) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jz L(zero) ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 2), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $(VEC_SIZE * 2), %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x0): ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x1): ++ bsrl %eax, %eax ++ addl $VEC_SIZE, %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x2): ++ bsrl %eax, %eax ++ addl $(VEC_SIZE * 2), %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x3): ++ bsrl %eax, %eax ++ addl $(VEC_SIZE * 3), %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x1_check): ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 3), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $VEC_SIZE, %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x3_check): ++ bsrl %eax, %eax ++ subq $VEC_SIZE, %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $(VEC_SIZE * 3), %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(last_vec_or_less_aligned): ++ movl %edx, %ecx ++ ++ vpcmpb $0, 
(%rdi), %YMMMATCH, %k1 ++ ++ movl $1, %edx ++ /* Support rdx << 32. */ ++ salq %cl, %rdx ++ subq $1, %rdx ++ ++ kmovd %k1, %eax ++ ++ /* Remove the trailing bytes. */ ++ andl %edx, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_or_less): ++ addl $VEC_SIZE, %edx ++ ++ /* Check for zero length. */ ++ testl %edx, %edx ++ jz L(zero) ++ ++ movl %edi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ jz L(last_vec_or_less_aligned) ++ ++ movl %ecx, %esi ++ movl %ecx, %r8d ++ addl %edx, %esi ++ andq $-VEC_SIZE, %rdi ++ ++ subl $VEC_SIZE, %esi ++ ja L(last_vec_2x_aligned) ++ ++ /* Check the last VEC. */ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ ++ /* Remove the leading and trailing bytes. */ ++ sarl %cl, %eax ++ movl %edx, %ecx ++ ++ movl $1, %edx ++ sall %cl, %edx ++ subl $1, %edx ++ ++ andl %edx, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ addq %r8, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_2x_aligned): ++ movl %esi, %ecx ++ ++ /* Check the last VEC. */ ++ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 ++ ++ movl $1, %edx ++ sall %cl, %edx ++ subl $1, %edx ++ ++ kmovd %k1, %eax ++ ++ /* Remove the trailing bytes. */ ++ andl %edx, %eax ++ ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ /* Check the second last VEC. */ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k1 ++ ++ movl %r8d, %ecx ++ ++ kmovd %k1, %eax ++ ++ /* Remove the leading bytes. Must use unsigned right shift for ++ bsrl below. */ ++ shrl %cl, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ addq %r8, %rax ++ ret ++END (__memrchr_evex) ++#endif +diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S +new file mode 100644 +index 00000000..ec942b77 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S +@@ -0,0 +1,4 @@ ++#define MEMCHR __rawmemchr_evex ++#define USE_AS_RAWMEMCHR 1 ++ ++#include "memchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S +new file mode 100644 +index 00000000..ddc86a70 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strchr-evex.S +@@ -0,0 +1,335 @@ ++/* strchr/strchrnul optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
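The strchr/strchrnul kernel introduced next leans on one non-obvious idiom: XOR the loaded data with the broadcast character, then take the unsigned minimum with the original data (vpxorq followed by VPMINU). A lane becomes zero exactly when it held either the searched character or the null terminator, so a single compare against zero covers both cases. A byte-wise sketch of the same idea, assuming nothing beyond standard C:

#include <stdint.h>

/* Byte-wise version of the vpxorq + VPMINU + compare-with-zero idiom
   used by strchr-evex: the result is zero iff the byte equals the
   searched character or is the terminating NUL.  */
static inline uint8_t
char_or_nul_lane (uint8_t byte, uint8_t target)
{
  uint8_t x = (uint8_t) (byte ^ target);  /* 0 when byte == target        */
  return x < byte ? x : byte;             /* unsigned min: 0 when byte == 0 */
}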
*/ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRCHR ++# define STRCHR __strchr_evex ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++# ifdef USE_AS_WCSCHR ++# define VPBROADCAST vpbroadcastd ++# define VPCMP vpcmpd ++# define VPMINU vpminud ++# define CHAR_REG esi ++# define SHIFT_REG r8d ++# else ++# define VPBROADCAST vpbroadcastb ++# define VPCMP vpcmpb ++# define VPMINU vpminub ++# define CHAR_REG sil ++# define SHIFT_REG ecx ++# endif ++ ++# define XMMZERO xmm16 ++ ++# define YMMZERO ymm16 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++# define YMM2 ymm19 ++# define YMM3 ymm20 ++# define YMM4 ymm21 ++# define YMM5 ymm22 ++# define YMM6 ymm23 ++# define YMM7 ymm24 ++# define YMM8 ymm25 ++ ++# define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRCHR) ++ movl %edi, %ecx ++# ifndef USE_AS_STRCHRNUL ++ xorl %edx, %edx ++# endif ++ ++ /* Broadcast CHAR to YMM0. */ ++ VPBROADCAST %esi, %YMM0 ++ ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ ++ /* Check if we cross page boundary with one vector load. */ ++ andl $(PAGE_SIZE - 1), %ecx ++ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx ++ ja L(cross_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. Search for both CHAR and the ++ null bytes. */ ++ VMOVU (%rdi), %YMM1 ++ ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ ktestd %k0, %k0 ++ jz L(more_vecs) ++ kmovd %k0, %eax ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (%rdi, %rax, 4), %rax ++# else ++ addq %rdi, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(more_vecs): ++ /* Align data for aligned loads in the loop. */ ++ andq $-VEC_SIZE, %rdi ++L(aligned_more): ++ ++ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ VMOVA VEC_SIZE(%rdi), %YMM1 ++ addq $VEC_SIZE, %rdi ++ ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VMOVA VEC_SIZE(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ ktestd %k0, %k0 ++ jz L(prep_loop_4x) ++ ++ kmovd %k0, %eax ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax ++# else ++ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x0): ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (%rdi, %rax, 4), %rax ++# else ++ addq %rdi, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ leaq VEC_SIZE(%rdi, %rax), %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax ++# else ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++L(prep_loop_4x): ++ /* Align data to 4 * VEC_SIZE. */ ++ andq $-(VEC_SIZE * 4), %rdi ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 ++ VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 ++ VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 ++ VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 ++ ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM5 ++ vpxorq %YMM2, %YMM0, %YMM6 ++ vpxorq %YMM3, %YMM0, %YMM7 ++ vpxorq %YMM4, %YMM0, %YMM8 ++ ++ VPMINU %YMM5, %YMM1, %YMM5 ++ VPMINU %YMM6, %YMM2, %YMM6 ++ VPMINU %YMM7, %YMM3, %YMM7 ++ VPMINU %YMM8, %YMM4, %YMM8 ++ ++ VPMINU %YMM5, %YMM6, %YMM1 ++ VPMINU %YMM7, %YMM8, %YMM2 ++ ++ VPMINU %YMM1, %YMM2, %YMM1 ++ ++ /* Each bit in K0 represents a CHAR or a null byte. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++ ktestd %k0, %k0 ++ jz L(loop_4x_vec) ++ ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM5, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ ++ VPCMP $0, %YMMZERO, %YMM6, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ ++ VPCMP $0, %YMMZERO, %YMM7, %k2 ++ /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ ++ VPCMP $0, %YMMZERO, %YMM8, %k3 ++ ++# ifdef USE_AS_WCSCHR ++ /* NB: Each bit in K2/K3 represents 4-byte element. */ ++ kshiftlw $8, %k3, %k1 ++# else ++ kshiftlq $32, %k3, %k1 ++# endif ++ ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ korq %k1, %k2, %k1 ++ kmovq %k1, %rax ++ ++ tzcntq %rax, %rax ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax ++# else ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ /* Cold case for crossing page with first load. */ ++ .p2align 4 ++L(cross_page_boundary): ++ andq $-VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ ++ VMOVA (%rdi), %YMM1 ++ ++ /* Leaves only CHARS matching esi as 0. 
*/ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ ++# ifdef USE_AS_WCSCHR ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. */ ++ movl %ecx, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++# endif ++ ++ /* Remove the leading bits. */ ++ sarxl %SHIFT_REG, %eax, %eax ++ testl %eax, %eax ++ ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++ addq %rcx, %rdi ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (%rdi, %rax, 4), %rax ++# else ++ addq %rdi, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++END (STRCHR) ++# endif +diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c +index 32954713..be05e197 100644 +--- a/sysdeps/x86_64/multiarch/strchr.c ++++ b/sysdeps/x86_64/multiarch/strchr.c +@@ -29,16 +29,24 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF)) + return OPTIMIZE (sse2_no_bsf); +diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex.S b/sysdeps/x86_64/multiarch/strchrnul-evex.S +new file mode 100644 +index 00000000..064fe7ca +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strchrnul-evex.S +@@ -0,0 +1,3 @@ ++#define STRCHR __strchrnul_evex ++#define USE_AS_STRCHRNUL 1 ++#include "strchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +new file mode 100644 +index 00000000..459eeed0 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -0,0 +1,1043 @@ ++/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
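The strchr.c hunk above illustrates the dispatch policy this patch applies to every routine it touches: use the new EVEX variant only when AVX512VL, AVX512BW and BMI2 are all usable, otherwise keep the existing AVX2/SSE2 fallbacks. Outside glibc the same tiering can be probed roughly as below; this is a hedged sketch using the GCC/Clang __builtin_cpu_supports builtin, and it deliberately omits the glibc-internal tunables (AVX_Fast_Unaligned_Load, Prefer_No_VZEROUPPER, Prefer_AVX2_STRCMP) that have no user-space equivalent:

#include <stdio.h>

/* Rough user-space analogue of the IFUNC selectors added in this
   patch.  glibc itself uses CPU_FEATURE_USABLE_P / CPU_FEATURES_ARCH_P
   on its internal cpu_features structure instead.  */
static const char *
pick_variant (void)
{
  if (__builtin_cpu_supports ("avx2"))
    {
      if (__builtin_cpu_supports ("avx512vl")
          && __builtin_cpu_supports ("avx512bw")
          && __builtin_cpu_supports ("bmi2"))
        return "evex";
      return "avx2";
    }
  return "sse2";
}

int
main (void)
{
  __builtin_cpu_init ();
  printf ("selected variant: %s\n", pick_variant ());
  return 0;
}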
*/ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRCMP ++# define STRCMP __strcmp_evex ++# endif ++ ++# define PAGE_SIZE 4096 ++ ++/* VEC_SIZE = Number of bytes in a ymm register */ ++# define VEC_SIZE 32 ++ ++/* Shift for dividing by (VEC_SIZE * 4). */ ++# define DIVIDE_BY_VEC_4_SHIFT 7 ++# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) ++# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++# ifdef USE_AS_WCSCMP ++/* Compare packed dwords. */ ++# define VPCMP vpcmpd ++# define SHIFT_REG32 r8d ++# define SHIFT_REG64 r8 ++/* 1 dword char == 4 bytes. */ ++# define SIZE_OF_CHAR 4 ++# else ++/* Compare packed bytes. */ ++# define VPCMP vpcmpb ++# define SHIFT_REG32 ecx ++# define SHIFT_REG64 rcx ++/* 1 byte char == 1 byte. */ ++# define SIZE_OF_CHAR 1 ++# endif ++ ++# define XMMZERO xmm16 ++# define XMM0 xmm17 ++# define XMM1 xmm18 ++ ++# define YMMZERO ymm16 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++# define YMM2 ymm19 ++# define YMM3 ymm20 ++# define YMM4 ymm21 ++# define YMM5 ymm22 ++# define YMM6 ymm23 ++# define YMM7 ymm24 ++ ++/* Warning! ++ wcscmp/wcsncmp have to use SIGNED comparison for elements. ++ strcmp/strncmp have to use UNSIGNED comparison for elements. ++*/ ++ ++/* The main idea of the string comparison (byte or dword) using 256-bit ++ EVEX instructions consists of comparing (VPCMP) two ymm vectors. The ++ latter can be on either packed bytes or dwords depending on ++ USE_AS_WCSCMP. In order to check the null char, algorithm keeps the ++ matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2 ++ KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes) ++ are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd ++ instructions. Main loop (away from from page boundary) compares 4 ++ vectors are a time, effectively comparing 4 x VEC_SIZE bytes (128 ++ bytes) on each loop. ++ ++ The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic ++ is the same as strcmp, except that an a maximum offset is tracked. If ++ the maximum offset is reached before a difference is found, zero is ++ returned. */ ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRCMP) ++# ifdef USE_AS_STRNCMP ++ /* Check for simple cases (0 or 1) in offset. */ ++ cmp $1, %RDX_LP ++ je L(char0) ++ jb L(zero) ++# ifdef USE_AS_WCSCMP ++ /* Convert units: from wide to byte char. */ ++ shl $2, %RDX_LP ++# endif ++ /* Register %r11 tracks the maximum offset. */ ++ mov %RDX_LP, %R11_LP ++# endif ++ movl %edi, %eax ++ xorl %edx, %edx ++ /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax ++ jg L(cross_page) ++ /* Start comparing 4 vectors. */ ++ VMOVU (%rdi), %YMM0 ++ VMOVU (%rsi), %YMM1 ++ ++ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ ++ VPCMP $4, %YMM0, %YMM1, %k0 ++ ++ /* Check for NULL in YMM0. */ ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ /* Check for NULL in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k2 ++ /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ ++ kord %k1, %k2, %k1 ++ ++ /* Each bit in K1 represents: ++ 1. A mismatch in YMM0 and YMM1. Or ++ 2. A NULL in YMM0 or YMM1. ++ */ ++ kord %k0, %k1, %k1 ++ ++ ktestd %k1, %k1 ++ je L(next_3_vectors) ++ kmovd %k1, %ecx ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx) is after the maximum ++ offset (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ je L(return) ++L(wcscmp_return): ++ setl %al ++ negl %eax ++ orl $1, %eax ++L(return): ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(return_vec_size): ++ kmovd %k1, %ecx ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after ++ the maximum offset (%r11). */ ++ addq $VEC_SIZE, %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl VEC_SIZE(%rdi, %rdx), %ecx ++ cmpl VEC_SIZE(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl VEC_SIZE(%rdi, %rdx), %eax ++ movzbl VEC_SIZE(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(return_2_vec_size): ++ kmovd %k1, %ecx ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is ++ after the maximum offset (%r11). */ ++ addq $(VEC_SIZE * 2), %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx ++ cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(return_3_vec_size): ++ kmovd %k1, %ecx ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is ++ after the maximum offset (%r11). */ ++ addq $(VEC_SIZE * 3), %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx ++ cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(next_3_vectors): ++ VMOVU VEC_SIZE(%rdi), %YMM0 ++ VMOVU VEC_SIZE(%rsi), %YMM1 ++ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ ++ VPCMP $4, %YMM0, %YMM1, %k0 ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ VPCMP $0, %YMMZERO, %YMM1, %k2 ++ /* Each bit in K1 represents a NULL in YMM0 or YMM1. 
*/ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ ktestd %k1, %k1 ++ jne L(return_vec_size) ++ ++ VMOVU (VEC_SIZE * 2)(%rdi), %YMM2 ++ VMOVU (VEC_SIZE * 3)(%rdi), %YMM3 ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM4 ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM5 ++ ++ /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */ ++ VPCMP $4, %YMM2, %YMM4, %k0 ++ VPCMP $0, %YMMZERO, %YMM2, %k1 ++ VPCMP $0, %YMMZERO, %YMM4, %k2 ++ /* Each bit in K1 represents a NULL in YMM2 or YMM4. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ ktestd %k1, %k1 ++ jne L(return_2_vec_size) ++ ++ /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */ ++ VPCMP $4, %YMM3, %YMM5, %k0 ++ VPCMP $0, %YMMZERO, %YMM3, %k1 ++ VPCMP $0, %YMMZERO, %YMM5, %k2 ++ /* Each bit in K1 represents a NULL in YMM3 or YMM5. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ ktestd %k1, %k1 ++ jne L(return_3_vec_size) ++L(main_loop_header): ++ leaq (VEC_SIZE * 4)(%rdi), %rdx ++ movl $PAGE_SIZE, %ecx ++ /* Align load via RAX. */ ++ andq $-(VEC_SIZE * 4), %rdx ++ subq %rdi, %rdx ++ leaq (%rdi, %rdx), %rax ++# ifdef USE_AS_STRNCMP ++ /* Starting from this point, the maximum offset, or simply the ++ 'offset', DECREASES by the same amount when base pointers are ++ moved forward. Return 0 when: ++ 1) On match: offset <= the matched vector index. ++ 2) On mistmach, offset is before the mistmatched index. ++ */ ++ subq %rdx, %r11 ++ jbe L(zero) ++# endif ++ addq %rsi, %rdx ++ movq %rdx, %rsi ++ andl $(PAGE_SIZE - 1), %esi ++ /* Number of bytes before page crossing. */ ++ subq %rsi, %rcx ++ /* Number of VEC_SIZE * 4 blocks before page crossing. */ ++ shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx ++ /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ ++ movl %ecx, %esi ++ jmp L(loop_start) ++ ++ .p2align 4 ++L(loop): ++# ifdef USE_AS_STRNCMP ++ /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease ++ the maximum offset (%r11) by the same amount. */ ++ subq $(VEC_SIZE * 4), %r11 ++ jbe L(zero) ++# endif ++ addq $(VEC_SIZE * 4), %rax ++ addq $(VEC_SIZE * 4), %rdx ++L(loop_start): ++ testl %esi, %esi ++ leal -1(%esi), %esi ++ je L(loop_cross_page) ++L(back_to_loop): ++ /* Main loop, comparing 4 vectors are a time. */ ++ VMOVA (%rax), %YMM0 ++ VMOVA VEC_SIZE(%rax), %YMM2 ++ VMOVA (VEC_SIZE * 2)(%rax), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rax), %YMM6 ++ VMOVU (%rdx), %YMM1 ++ VMOVU VEC_SIZE(%rdx), %YMM3 ++ VMOVU (VEC_SIZE * 2)(%rdx), %YMM5 ++ VMOVU (VEC_SIZE * 3)(%rdx), %YMM7 ++ ++ VPCMP $4, %YMM0, %YMM1, %k0 ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ VPCMP $0, %YMMZERO, %YMM1, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K4 represents a NULL or a mismatch in YMM0 and ++ YMM1. */ ++ kord %k0, %k1, %k4 ++ ++ VPCMP $4, %YMM2, %YMM3, %k0 ++ VPCMP $0, %YMMZERO, %YMM2, %k1 ++ VPCMP $0, %YMMZERO, %YMM3, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K5 represents a NULL or a mismatch in YMM2 and ++ YMM3. */ ++ kord %k0, %k1, %k5 ++ ++ VPCMP $4, %YMM4, %YMM5, %k0 ++ VPCMP $0, %YMMZERO, %YMM4, %k1 ++ VPCMP $0, %YMMZERO, %YMM5, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K6 represents a NULL or a mismatch in YMM4 and ++ YMM5. */ ++ kord %k0, %k1, %k6 ++ ++ VPCMP $4, %YMM6, %YMM7, %k0 ++ VPCMP $0, %YMMZERO, %YMM6, %k1 ++ VPCMP $0, %YMMZERO, %YMM7, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K7 represents a NULL or a mismatch in YMM6 and ++ YMM7. 
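Stripped of the mask bookkeeping, the main loop above enforces the plain strncmp contract that the file's header comment describes: compare as unsigned bytes, stop at the first difference or at a terminating null in either string, and treat exhaustion of the caller's length limit (held in %r11 here) as equality. A scalar sketch of that contract, not part of the patch:

#include <stddef.h>

/* Scalar reference for the behaviour the vector loop implements.
   Bytes are compared as unsigned values, and reaching the length limit
   before any difference yields 0, which is the role %r11 plays in the
   assembly above.  */
static int
strncmp_ref (const char *a, const char *b, size_t n)
{
  for (; n != 0; --n, ++a, ++b)
    {
      unsigned char ca = (unsigned char) *a;
      unsigned char cb = (unsigned char) *b;
      if (ca != cb)
        return (int) ca - (int) cb;
      if (ca == '\0')
        return 0;
    }
  return 0;
}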
*/ ++ kord %k0, %k1, %k7 ++ ++ kord %k4, %k5, %k0 ++ kord %k6, %k7, %k1 ++ ++ /* Test each mask (32 bits) individually because for VEC_SIZE ++ == 32 is not possible to OR the four masks and keep all bits ++ in a 64-bit integer register, differing from SSE2 strcmp ++ where ORing is possible. */ ++ kortestd %k0, %k1 ++ je L(loop) ++ ktestd %k4, %k4 ++ je L(test_vec) ++ kmovd %k4, %edi ++ tzcntl %edi, %ecx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %ecx ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(test_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first vector matched. Return 0 if the maximum offset ++ (%r11) <= VEC_SIZE. */ ++ cmpq $VEC_SIZE, %r11 ++ jbe L(zero) ++# endif ++ ktestd %k5, %k5 ++ je L(test_2_vec) ++ kmovd %k5, %ecx ++ tzcntl %ecx, %edi ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edi ++# endif ++# ifdef USE_AS_STRNCMP ++ addq $VEC_SIZE, %rdi ++ cmpq %rdi, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rdi), %ecx ++ cmpl (%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rdi), %eax ++ movzbl (%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl VEC_SIZE(%rsi, %rdi), %ecx ++ cmpl VEC_SIZE(%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl VEC_SIZE(%rax, %rdi), %eax ++ movzbl VEC_SIZE(%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(test_2_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first 2 vectors matched. Return 0 if the maximum offset ++ (%r11) <= 2 * VEC_SIZE. */ ++ cmpq $(VEC_SIZE * 2), %r11 ++ jbe L(zero) ++# endif ++ ktestd %k6, %k6 ++ je L(test_3_vec) ++ kmovd %k6, %ecx ++ tzcntl %ecx, %edi ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edi ++# endif ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 2), %rdi ++ cmpq %rdi, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rdi), %ecx ++ cmpl (%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rdi), %eax ++ movzbl (%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx ++ cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax ++ movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(test_3_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first 3 vectors matched. Return 0 if the maximum offset ++ (%r11) <= 3 * VEC_SIZE. */ ++ cmpq $(VEC_SIZE * 3), %r11 ++ jbe L(zero) ++# endif ++ kmovd %k7, %esi ++ tzcntl %esi, %ecx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ sall $2, %ecx ++# endif ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 3), %rcx ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %esi ++ cmpl (%rdx, %rcx), %esi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rsi, %rcx), %esi ++ cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(loop_cross_page): ++ xorl %r10d, %r10d ++ movq %rdx, %rcx ++ /* Align load via RDX. We load the extra ECX bytes which should ++ be ignored. */ ++ andl $((VEC_SIZE * 4) - 1), %ecx ++ /* R10 is -RCX. */ ++ subq %rcx, %r10 ++ ++ /* This works only if VEC_SIZE * 2 == 64. */ ++# if (VEC_SIZE * 2) != 64 ++# error (VEC_SIZE * 2) != 64 ++# endif ++ ++ /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ ++ cmpl $(VEC_SIZE * 2), %ecx ++ jge L(loop_cross_page_2_vec) ++ ++ VMOVU (%rax, %r10), %YMM2 ++ VMOVU VEC_SIZE(%rax, %r10), %YMM3 ++ VMOVU (%rdx, %r10), %YMM4 ++ VMOVU VEC_SIZE(%rdx, %r10), %YMM5 ++ ++ VPCMP $4, %YMM4, %YMM2, %k0 ++ VPCMP $0, %YMMZERO, %YMM2, %k1 ++ VPCMP $0, %YMMZERO, %YMM4, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch in YMM2 and ++ YMM4. */ ++ kord %k0, %k1, %k1 ++ ++ VPCMP $4, %YMM5, %YMM3, %k3 ++ VPCMP $0, %YMMZERO, %YMM3, %k4 ++ VPCMP $0, %YMMZERO, %YMM5, %k5 ++ kord %k4, %k5, %k4 ++ /* Each bit in K3 represents a NULL or a mismatch in YMM3 and ++ YMM5. */ ++ kord %k3, %k4, %k3 ++ ++# ifdef USE_AS_WCSCMP ++ /* NB: Each bit in K1/K3 represents 4-byte element. */ ++ kshiftlw $8, %k3, %k2 ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. */ ++ movl %ecx, %SHIFT_REG32 ++ sarl $2, %SHIFT_REG32 ++# else ++ kshiftlq $32, %k3, %k2 ++# endif ++ ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ korq %k1, %k2, %k1 ++ kmovq %k1, %rdi ++ ++ /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ ++ shrxq %SHIFT_REG64, %rdi, %rdi ++ testq %rdi, %rdi ++ je L(loop_cross_page_2_vec) ++ tzcntq %rdi, %rcx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %ecx ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(loop_cross_page_2_vec): ++ /* The first VEC_SIZE * 2 bytes match or are ignored. */ ++ VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 ++ VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 ++ VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2 ++ VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3 ++ ++ VPCMP $4, %YMM0, %YMM2, %k0 ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ VPCMP $0, %YMMZERO, %YMM2, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch in YMM0 and ++ YMM2. 
*/ ++ kord %k0, %k1, %k1 ++ ++ VPCMP $4, %YMM1, %YMM3, %k3 ++ VPCMP $0, %YMMZERO, %YMM1, %k4 ++ VPCMP $0, %YMMZERO, %YMM3, %k5 ++ kord %k4, %k5, %k4 ++ /* Each bit in K3 represents a NULL or a mismatch in YMM1 and ++ YMM3. */ ++ kord %k3, %k4, %k3 ++ ++# ifdef USE_AS_WCSCMP ++ /* NB: Each bit in K1/K3 represents 4-byte element. */ ++ kshiftlw $8, %k3, %k2 ++# else ++ kshiftlq $32, %k3, %k2 ++# endif ++ ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ korq %k1, %k2, %k1 ++ kmovq %k1, %rdi ++ ++ xorl %r8d, %r8d ++ /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ ++ subl $(VEC_SIZE * 2), %ecx ++ jle 1f ++ /* R8 has number of bytes skipped. */ ++ movl %ecx, %r8d ++# ifdef USE_AS_WCSCMP ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. */ ++ sarl $2, %ecx ++# endif ++ /* Skip ECX bytes. */ ++ shrq %cl, %rdi ++1: ++ /* Before jumping back to the loop, set ESI to the number of ++ VEC_SIZE * 4 blocks before page crossing. */ ++ movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi ++ ++ testq %rdi, %rdi ++# ifdef USE_AS_STRNCMP ++ /* At this point, if %rdi value is 0, it already tested ++ VEC_SIZE*4+%r10 byte starting from %rax. This label ++ checks whether strncmp maximum offset reached or not. */ ++ je L(string_nbyte_offset_check) ++# else ++ je L(back_to_loop) ++# endif ++ tzcntq %rdi, %rcx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %ecx ++# endif ++ addq %r10, %rcx ++ /* Adjust for number of bytes skipped. */ ++ addq %r8, %rcx ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 2), %rcx ++ subq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rsi, %rcx), %edi ++ cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++# ifdef USE_AS_STRNCMP ++L(string_nbyte_offset_check): ++ leaq (VEC_SIZE * 4)(%r10), %r10 ++ cmpq %r10, %r11 ++ jbe L(zero) ++ jmp L(back_to_loop) ++# endif ++ ++ .p2align 4 ++L(cross_page_loop): ++ /* Check one byte/dword at a time. */ ++# ifdef USE_AS_WCSCMP ++ cmpl %ecx, %eax ++# else ++ subl %ecx, %eax ++# endif ++ jne L(different) ++ addl $SIZE_OF_CHAR, %edx ++ cmpl $(VEC_SIZE * 4), %edx ++ je L(main_loop_header) ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rdx), %eax ++ movl (%rsi, %rdx), %ecx ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %ecx ++# endif ++ /* Check null char. */ ++ testl %eax, %eax ++ jne L(cross_page_loop) ++ /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED ++ comparisons. */ ++ subl %ecx, %eax ++# ifndef USE_AS_WCSCMP ++L(different): ++# endif ++ ret ++ ++# ifdef USE_AS_WCSCMP ++ .p2align 4 ++L(different): ++ /* Use movl to avoid modifying EFLAGS. 
*/ ++ movl $0, %eax ++ setl %al ++ negl %eax ++ orl $1, %eax ++ ret ++# endif ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(char0): ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax ++ subl %ecx, %eax ++# endif ++ ret ++# endif ++ ++ .p2align 4 ++L(last_vector): ++ addq %rdx, %rdi ++ addq %rdx, %rsi ++# ifdef USE_AS_STRNCMP ++ subq %rdx, %r11 ++# endif ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++ ret ++ ++ /* Comparing on page boundary region requires special treatment: ++ It must done one vector at the time, starting with the wider ++ ymm vector if possible, if not, with xmm. If fetching 16 bytes ++ (xmm) still passes the boundary, byte comparison must be done. ++ */ ++ .p2align 4 ++L(cross_page): ++ /* Try one ymm vector at a time. */ ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(cross_page_1_vector) ++L(loop_1_vector): ++ VMOVU (%rdi, %rdx), %YMM0 ++ VMOVU (%rsi, %rdx), %YMM1 ++ ++ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ ++ VPCMP $4, %YMM0, %YMM1, %k0 ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ VPCMP $0, %YMMZERO, %YMM1, %k2 ++ /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ kmovd %k1, %ecx ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $VEC_SIZE, %edx ++ ++ addl $VEC_SIZE, %eax ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jle L(loop_1_vector) ++L(cross_page_1_vector): ++ /* Less than 32 bytes to check, try one xmm vector. */ ++ cmpl $(PAGE_SIZE - 16), %eax ++ jg L(cross_page_1_xmm) ++ VMOVU (%rdi, %rdx), %XMM0 ++ VMOVU (%rsi, %rdx), %XMM1 ++ ++ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ ++ VPCMP $4, %XMM0, %XMM1, %k0 ++ VPCMP $0, %XMMZERO, %XMM0, %k1 ++ VPCMP $0, %XMMZERO, %XMM1, %k2 ++ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ ++ korw %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ korw %k0, %k1, %k1 ++ kmovw %k1, %ecx ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $16, %edx ++# ifndef USE_AS_WCSCMP ++ addl $16, %eax ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_1_xmm): ++# ifndef USE_AS_WCSCMP ++ /* Less than 16 bytes to check, try 8 byte vector. NB: No need ++ for wcscmp nor wcsncmp since wide char is 4 bytes. */ ++ cmpl $(PAGE_SIZE - 8), %eax ++ jg L(cross_page_8bytes) ++ vmovq (%rdi, %rdx), %XMM0 ++ vmovq (%rsi, %rdx), %XMM1 ++ ++ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ ++ VPCMP $4, %XMM0, %XMM1, %k0 ++ VPCMP $0, %XMMZERO, %XMM0, %k1 ++ VPCMP $0, %XMMZERO, %XMM1, %k2 ++ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. 
*/ ++ kord %k0, %k1, %k1 ++ kmovd %k1, %ecx ++ ++# ifdef USE_AS_WCSCMP ++ /* Only last 2 bits are valid. */ ++ andl $0x3, %ecx ++# else ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %ecx ++# endif ++ ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $8, %edx ++ addl $8, %eax ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_8bytes): ++ /* Less than 8 bytes to check, try 4 byte vector. */ ++ cmpl $(PAGE_SIZE - 4), %eax ++ jg L(cross_page_4bytes) ++ vmovd (%rdi, %rdx), %XMM0 ++ vmovd (%rsi, %rdx), %XMM1 ++ ++ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ ++ VPCMP $4, %XMM0, %XMM1, %k0 ++ VPCMP $0, %XMMZERO, %XMM0, %k1 ++ VPCMP $0, %XMMZERO, %XMM1, %k2 ++ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ kmovd %k1, %ecx ++ ++# ifdef USE_AS_WCSCMP ++ /* Only the last bit is valid. */ ++ andl $0x1, %ecx ++# else ++ /* Only last 4 bits are valid. */ ++ andl $0xf, %ecx ++# endif ++ ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $4, %edx ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_4bytes): ++# endif ++ /* Less than 4 bytes to check, try one byte/dword at a time. */ ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rdx), %eax ++ movl (%rsi, %rdx), %ecx ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %ecx ++# endif ++ testl %eax, %eax ++ jne L(cross_page_loop) ++ subl %ecx, %eax ++ ret ++END (STRCMP) ++#endif +diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c +index 3f433fbc..c5f38510 100644 +--- a/sysdeps/x86_64/multiarch/strcmp.c ++++ b/sysdeps/x86_64/multiarch/strcmp.c +@@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) + return OPTIMIZE (sse2_unaligned); +diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S +new file mode 100644 +index 00000000..cd022509 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strlen-evex.S +@@ -0,0 +1,436 @@ ++/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. 
++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRLEN ++# define STRLEN __strlen_evex ++# endif ++ ++# define VMOVA vmovdqa64 ++ ++# ifdef USE_AS_WCSLEN ++# define VPCMP vpcmpd ++# define VPMINU vpminud ++# define SHIFT_REG r9d ++# else ++# define VPCMP vpcmpb ++# define VPMINU vpminub ++# define SHIFT_REG ecx ++# endif ++ ++# define XMMZERO xmm16 ++# define YMMZERO ymm16 ++# define YMM1 ymm17 ++# define YMM2 ymm18 ++# define YMM3 ymm19 ++# define YMM4 ymm20 ++# define YMM5 ymm21 ++# define YMM6 ymm22 ++ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRLEN) ++# ifdef USE_AS_STRNLEN ++ /* Check for zero length. */ ++ test %RSI_LP, %RSI_LP ++ jz L(zero) ++# ifdef USE_AS_WCSLEN ++ shl $2, %RSI_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %esi, %esi ++# endif ++ mov %RSI_LP, %R8_LP ++# endif ++ movl %edi, %ecx ++ movq %rdi, %rdx ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. Each bit in K0 represents a ++ null byte. */ ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ ++# ifdef USE_AS_STRNLEN ++ jnz L(first_vec_x0_check) ++ /* Adjust length and check the end of data. */ ++ subq $VEC_SIZE, %rsi ++ jbe L(max) ++# else ++ jnz L(first_vec_x0) ++# endif ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. */ ++ addq %rcx, %rsi ++ ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ jmp L(more_4x_vec) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide shift count by 4 since each bit in K0 represent 4 ++ bytes. */ ++ movl %ecx, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++# endif ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ ++ /* Remove the leading bytes. */ ++ sarxl %SHIFT_REG, %eax, %eax ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++# ifdef USE_AS_STRNLEN ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++# endif ++ addq %rdi, %rax ++ addq %rcx, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++# ifdef USE_AS_STRNLEN ++ /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" ++ with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" ++ to void possible addition overflow. */ ++ negq %rcx ++ addq $VEC_SIZE, %rcx ++ ++ /* Check the end of data. 
*/ ++ subq %rcx, %rsi ++ jbe L(max) ++# endif ++ ++ addq $VEC_SIZE, %rdi ++ ++# ifdef USE_AS_STRNLEN ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++L(more_4x_vec): ++ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifdef USE_AS_STRNLEN ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++ /* Align data to 4 * VEC_SIZE. */ ++ movq %rdi, %rcx ++ andl $(4 * VEC_SIZE - 1), %ecx ++ andq $-(4 * VEC_SIZE), %rdi ++ ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. */ ++ addq %rcx, %rsi ++# endif ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ VMOVA (%rdi), %YMM1 ++ VMOVA VEC_SIZE(%rdi), %YMM2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM3 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM4 ++ ++ VPMINU %YMM1, %YMM2, %YMM5 ++ VPMINU %YMM3, %YMM4, %YMM6 ++ ++ VPMINU %YMM5, %YMM6, %YMM5 ++ VPCMP $0, %YMM5, %YMMZERO, %k0 ++ ktestd %k0, %k0 ++ jnz L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifndef USE_AS_STRNLEN ++ jmp L(loop_4x_vec) ++# else ++ subq $(VEC_SIZE * 4), %rsi ++ ja L(loop_4x_vec) ++ ++L(last_4x_vec_or_less): ++ /* Less than 4 * VEC and aligned to VEC_SIZE. */ ++ addl $(VEC_SIZE * 2), %esi ++ jle L(last_2x_vec) ++ ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2_check) ++ subl $VEC_SIZE, %esi ++ jle L(max) ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3_check) ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ addl $(VEC_SIZE * 2), %esi ++ ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0_check) ++ subl $VEC_SIZE, %esi ++ jle L(max) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x0_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x3_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(max): ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4 ++L(first_vec_x0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ VPCMP $0, %YMM1, %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ VPCMP $0, %YMM2, %YMMZERO, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ VPCMP $0, %YMM3, %YMMZERO, %k2 ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ VPCMP $0, %YMM4, %YMMZERO, %k3 ++ kmovd %k3, %eax ++L(first_vec_x3): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ sall $2, %eax ++# endif ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++END (STRLEN) ++#endif +diff --git a/sysdeps/x86_64/multiarch/strncmp-evex.S b/sysdeps/x86_64/multiarch/strncmp-evex.S +new file mode 100644 +index 00000000..a1d53e8c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncmp-evex.S +@@ -0,0 +1,3 @@ ++#define STRCMP __strncmp_evex ++#define USE_AS_STRNCMP 1 ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c +index 686d654f..4c15542f 100644 +--- a/sysdeps/x86_64/multiarch/strncmp.c ++++ b/sysdeps/x86_64/multiarch/strncmp.c +@@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2) + && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) +diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S +new file mode 100644 +index 00000000..722022f3 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strnlen-evex.S +@@ -0,0 +1,4 @@ ++#define STRLEN __strnlen_evex ++#define USE_AS_STRNLEN 1 ++ ++#include "strlen-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S +new file mode 100644 +index 00000000..f920b5a5 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strrchr-evex.S +@@ -0,0 +1,265 @@ ++/* strrchr/wcsrchr optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
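The strrchr kernel that follows keeps two pieces of state while scanning forward: the most recent block that contained a match ("Remember the match and keep searching") and the position of the first null byte, after which any later match bits are masked off. A scalar sketch of that bookkeeping, illustrative only:

#include <stddef.h>

/* Scalar reference for strrchr: remember the latest match, stop at the
   terminator.  The EVEX code keeps the same state per VEC_SIZE block
   (%rsi/%edx) rather than per byte, and masks out matches that fall
   after the NUL inside the final block.  */
static char *
strrchr_ref (const char *s, int c)
{
  const char *last = NULL;
  for (;; ++s)
    {
      if (*s == (char) c)
        last = s;
      if (*s == '\0')
        return (char *) last;
    }
}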
*/ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRRCHR ++# define STRRCHR __strrchr_evex ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++# ifdef USE_AS_WCSRCHR ++# define VPBROADCAST vpbroadcastd ++# define VPCMP vpcmpd ++# define SHIFT_REG r8d ++# else ++# define VPBROADCAST vpbroadcastb ++# define VPCMP vpcmpb ++# define SHIFT_REG ecx ++# endif ++ ++# define XMMZERO xmm16 ++# define YMMZERO ymm16 ++# define YMMMATCH ymm17 ++# define YMM1 ymm18 ++ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRRCHR) ++ movl %edi, %ecx ++ /* Broadcast CHAR to YMMMATCH. */ ++ VPBROADCAST %esi, %YMMMATCH ++ ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ VMOVU (%rdi), %YMM1 ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ ++ addq $VEC_SIZE, %rdi ++ ++ testl %eax, %eax ++ jnz L(first_vec) ++ ++ testl %ecx, %ecx ++ jnz L(return_null) ++ ++ andq $-VEC_SIZE, %rdi ++ xorl %edx, %edx ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(first_vec): ++ /* Check if there is a null byte. */ ++ testl %ecx, %ecx ++ jnz L(char_and_nul_in_first_vec) ++ ++ /* Remember the match and keep searching. */ ++ movl %eax, %edx ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rdi ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifdef USE_AS_WCSRCHR ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. */ ++ movl %ecx, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++# endif ++ ++ VMOVA (%rdi), %YMM1 ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %edx ++ kmovd %k1, %eax ++ ++ shrxl %SHIFT_REG, %edx, %edx ++ shrxl %SHIFT_REG, %eax, %eax ++ addq $VEC_SIZE, %rdi ++ ++ /* Check if there is a CHAR. */ ++ testl %eax, %eax ++ jnz L(found_char) ++ ++ testl %edx, %edx ++ jnz L(return_null) ++ ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(found_char): ++ testl %edx, %edx ++ jnz L(char_and_nul) ++ ++ /* Remember the match and keep searching. */ ++ movl %eax, %edx ++ leaq (%rdi, %rcx), %rsi ++ ++ .p2align 4 ++L(aligned_loop): ++ VMOVA (%rdi), %YMM1 ++ addq $VEC_SIZE, %rdi ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ VMOVA (%rdi), %YMM1 ++ add $VEC_SIZE, %rdi ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ VMOVA (%rdi), %YMM1 ++ addq $VEC_SIZE, %rdi ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ VMOVA (%rdi), %YMM1 ++ addq $VEC_SIZE, %rdi ++ ++ /* Each bit in K0 represents a null byte in YMM1. 
*/ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ orl %eax, %ecx ++ jz L(aligned_loop) ++ ++ .p2align 4 ++L(char_nor_null): ++ /* Find a CHAR or a null byte in a loop. */ ++ testl %eax, %eax ++ jnz L(match) ++L(return_value): ++ testl %edx, %edx ++ jz L(return_null) ++ movl %edx, %eax ++ movq %rsi, %rdi ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(match): ++ /* Find a CHAR. Check if there is a null byte. */ ++ kmovd %k0, %ecx ++ testl %ecx, %ecx ++ jnz L(find_nul) ++ ++ /* Remember the match and keep searching. */ ++ movl %eax, %edx ++ movq %rdi, %rsi ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(find_nul): ++ /* Mask out any matching bits after the null byte. */ ++ movl %ecx, %r8d ++ subl $1, %r8d ++ xorl %ecx, %r8d ++ andl %r8d, %eax ++ testl %eax, %eax ++ /* If there is no CHAR here, return the remembered one. */ ++ jz L(return_value) ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(char_and_nul): ++ /* Find both a CHAR and a null byte. */ ++ addq %rcx, %rdi ++ movl %edx, %ecx ++L(char_and_nul_in_first_vec): ++ /* Mask out any matching bits after the null byte. */ ++ movl %ecx, %r8d ++ subl $1, %r8d ++ xorl %ecx, %r8d ++ andl %r8d, %eax ++ testl %eax, %eax ++ /* Return null pointer if the null byte comes first. */ ++ jz L(return_null) ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(return_null): ++ xorl %eax, %eax ++ ret ++ ++END (STRRCHR) ++#endif +diff --git a/sysdeps/x86_64/multiarch/wcschr-evex.S b/sysdeps/x86_64/multiarch/wcschr-evex.S +new file mode 100644 +index 00000000..7cb8f1e4 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcschr-evex.S +@@ -0,0 +1,3 @@ ++#define STRCHR __wcschr_evex ++#define USE_AS_WCSCHR 1 ++#include "strchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcscmp-evex.S b/sysdeps/x86_64/multiarch/wcscmp-evex.S +new file mode 100644 +index 00000000..42e73e51 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcscmp-evex.S +@@ -0,0 +1,4 @@ ++#define STRCMP __wcscmp_evex ++#define USE_AS_WCSCMP 1 ++ ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcslen-evex.S b/sysdeps/x86_64/multiarch/wcslen-evex.S +new file mode 100644 +index 00000000..bdafa83b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcslen-evex.S +@@ -0,0 +1,4 @@ ++#define STRLEN __wcslen_evex ++#define USE_AS_WCSLEN 1 ++ ++#include "strlen-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcsncmp-evex.S b/sysdeps/x86_64/multiarch/wcsncmp-evex.S +new file mode 100644 +index 00000000..8a8e3107 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsncmp-evex.S +@@ -0,0 +1,5 @@ ++#define STRCMP __wcsncmp_evex ++#define USE_AS_STRNCMP 1 ++#define USE_AS_WCSCMP 1 ++ ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S +new file mode 100644 +index 00000000..24773bb4 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S +@@ -0,0 +1,5 @@ ++#define STRLEN __wcsnlen_evex ++#define USE_AS_WCSLEN 1 ++#define USE_AS_STRNLEN 1 ++ ++#include "strlen-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c +index b3144c93..84254b83 100644 +--- a/sysdeps/x86_64/multiarch/wcsnlen.c ++++ b/sysdeps/x86_64/multiarch/wcsnlen.c +@@ -29,16 +29,24 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4_1); +diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S +new file mode 100644 +index 00000000..c64602f7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S +@@ -0,0 +1,3 @@ ++#define STRRCHR __wcsrchr_evex ++#define USE_AS_WCSRCHR 1 ++#include "strrchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S +new file mode 100644 +index 00000000..06cd0f9f +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S +@@ 
-0,0 +1,4 @@ ++#define MEMCHR __wmemchr_evex ++#define USE_AS_WMEMCHR 1 ++ ++#include "memchr-evex.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-13.patch b/glibc-RHEL-15696-13.patch new file mode 100644 index 0000000..a88a3bc --- /dev/null +++ b/glibc-RHEL-15696-13.patch @@ -0,0 +1,1488 @@ +From 525bc2a32c9710df40371f951217c6ae7a923aee Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 06:36:50 -0800 +Subject: [PATCH] x86-64: Add strcpy family functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-strcpy.h to select the function optimized with 256-bit EVEX +instructions using YMM16-YMM31 registers to avoid RTM abort with usable +AVX512VL and AVX512BW since VZEROUPPER isn't needed at function exit. +--- + sysdeps/x86_64/multiarch/Makefile | 6 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 24 + + sysdeps/x86_64/multiarch/ifunc-strcpy.h | 13 +- + sysdeps/x86_64/multiarch/stpcpy-evex.S | 3 + + sysdeps/x86_64/multiarch/stpncpy-evex.S | 4 + + sysdeps/x86_64/multiarch/strcat-evex.S | 283 ++++++ + sysdeps/x86_64/multiarch/strcpy-evex.S | 1003 ++++++++++++++++++++ + sysdeps/x86_64/multiarch/strncat-evex.S | 3 + + sysdeps/x86_64/multiarch/strncpy-evex.S | 3 + + 9 files changed, 1339 insertions(+), 3 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/stpcpy-evex.S + create mode 100644 sysdeps/x86_64/multiarch/stpncpy-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strcat-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strcpy-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strncat-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strncpy-evex.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 5ce85882..46783cd1 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -43,11 +43,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memchr-evex \ + memrchr-evex \ + rawmemchr-evex \ ++ stpcpy-evex \ ++ stpncpy-evex \ ++ strcat-evex \ + strchr-evex \ + strchrnul-evex \ + strcmp-evex \ ++ strcpy-evex \ + strlen-evex \ ++ strncat-evex \ + strncmp-evex \ ++ strncpy-evex \ + strnlen-evex \ + strrchr-evex + CFLAGS-varshift.c += -msse4 +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index bd7d9f19..082e4da3 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -224,6 +224,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __stpncpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), + __stpncpy_avx2) ++ IFUNC_IMPL_ADD (array, i, stpncpy, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __stpncpy_evex) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, + __stpncpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2)) +@@ -234,6 +238,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __stpcpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), + __stpcpy_avx2) ++ IFUNC_IMPL_ADD (array, i, stpcpy, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __stpcpy_evex) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2)) + +@@ -268,6 +276,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strcat, + IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2), + __strcat_avx2) ++ 
IFUNC_IMPL_ADD (array, i, strcat, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcat_evex) + IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3), + __strcat_ssse3) + IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) +@@ -330,6 +342,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strcpy, + IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2), + __strcpy_avx2) ++ IFUNC_IMPL_ADD (array, i, strcpy, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcpy_evex) + IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), + __strcpy_ssse3) + IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) +@@ -373,6 +389,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strncat, + IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2), + __strncat_avx2) ++ IFUNC_IMPL_ADD (array, i, strncat, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncat_evex) + IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3), + __strncat_ssse3) + IFUNC_IMPL_ADD (array, i, strncat, 1, +@@ -383,6 +403,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strncpy, + IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2), + __strncpy_avx2) ++ IFUNC_IMPL_ADD (array, i, strncpy, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncpy_evex) + IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), + __strncpy_ssse3) + IFUNC_IMPL_ADD (array, i, strncpy, 1, +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h +index 100dca5c..deae6348 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h +@@ -25,16 +25,23 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) + return OPTIMIZE (sse2_unaligned); +diff --git a/sysdeps/x86_64/multiarch/stpcpy-evex.S b/sysdeps/x86_64/multiarch/stpcpy-evex.S +new file mode 100644 +index 00000000..7c6f26cd +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/stpcpy-evex.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STPCPY ++#define STRCPY __stpcpy_evex ++#include "strcpy-evex.S" +diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S +new file mode 100644 +index 00000000..1570014d +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S +@@ -0,0 +1,4 @@ ++#define USE_AS_STPCPY ++#define USE_AS_STRNCPY ++#define STRCPY __stpncpy_evex ++#include "strcpy-evex.S" 
+diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S +new file mode 100644 +index 00000000..97c3d85b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcat-evex.S +@@ -0,0 +1,283 @@ ++/* strcat with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRCAT ++# define STRCAT __strcat_evex ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++/* zero register */ ++# define XMMZERO xmm16 ++# define YMMZERO ymm16 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++ ++# define USE_AS_STRCAT ++ ++/* Number of bytes in a vector register */ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRCAT) ++ mov %rdi, %r9 ++# ifdef USE_AS_STRNCAT ++ mov %rdx, %r8 ++# endif ++ ++ xor %eax, %eax ++ mov %edi, %ecx ++ and $((VEC_SIZE * 4) - 1), %ecx ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ cmp $(VEC_SIZE * 3), %ecx ++ ja L(fourth_vector_boundary) ++ vpcmpb $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_first_vector) ++ mov %rdi, %rax ++ and $-VEC_SIZE, %rax ++ jmp L(align_vec_size_start) ++L(fourth_vector_boundary): ++ mov %rdi, %rax ++ and $-VEC_SIZE, %rax ++ vpcmpb $0, (%rax), %YMMZERO, %k0 ++ mov $-1, %r10d ++ sub %rax, %rcx ++ shl %cl, %r10d ++ kmovd %k0, %edx ++ and %r10d, %edx ++ jnz L(exit) ++ ++L(align_vec_size_start): ++ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 ++ add $(VEC_SIZE * 4), %rax ++ kmovd %k4, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 ++ kmovd %k4, %edx ++ add $(VEC_SIZE * 4), %rax ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz 
L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 ++ add $(VEC_SIZE * 4), %rax ++ kmovd %k4, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 ++ add $(VEC_SIZE * 5), %rax ++ kmovd %k4, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 ++ add $VEC_SIZE, %rax ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 ++ add $VEC_SIZE, %rax ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1 ++ add $VEC_SIZE, %rax ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ add $VEC_SIZE, %rax ++ ++ .p2align 4 ++L(align_four_vec_loop): ++ VMOVA (%rax), %YMM0 ++ VMOVA (VEC_SIZE * 2)(%rax), %YMM1 ++ vpminub VEC_SIZE(%rax), %YMM0, %YMM0 ++ vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1 ++ vpminub %YMM0, %YMM1, %YMM0 ++ /* If K0 != 0, there is a null byte. */ ++ vpcmpb $0, %YMM0, %YMMZERO, %k0 ++ add $(VEC_SIZE * 4), %rax ++ ktestd %k0, %k0 ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0 ++ sub $(VEC_SIZE * 5), %rax ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 4), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit): ++ sub %rdi, %rax ++L(exit_null_on_first_vector): ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_second_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $VEC_SIZE, %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_third_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 2), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_fourth_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 3), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_fifth_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 4), %rax ++ ++ .p2align 4 ++L(StartStrcpyPart): ++ lea (%r9, %rax), %rdi ++ mov %rsi, %rcx ++ mov %r9, %rax /* save result */ ++ ++# ifdef USE_AS_STRNCAT ++ test %r8, %r8 ++ jz L(ExitZero) ++# define USE_AS_STRNCPY ++# endif ++ ++# include "strcpy-evex.S" ++#endif +diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S 
b/sysdeps/x86_64/multiarch/strcpy-evex.S +new file mode 100644 +index 00000000..a343a1a6 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcpy-evex.S +@@ -0,0 +1,1003 @@ ++/* strcpy with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++# ifndef USE_AS_STRCAT ++# include ++ ++# ifndef STRCPY ++# define STRCPY __strcpy_evex ++# endif ++ ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++/* Number of bytes in a vector register */ ++# ifndef VEC_SIZE ++# define VEC_SIZE 32 ++# endif ++ ++# define XMM2 xmm18 ++# define XMM3 xmm19 ++ ++# define YMM2 ymm18 ++# define YMM3 ymm19 ++# define YMM4 ymm20 ++# define YMM5 ymm21 ++# define YMM6 ymm22 ++# define YMM7 ymm23 ++ ++# ifndef USE_AS_STRCAT ++ ++/* zero register */ ++# define XMMZERO xmm16 ++# define YMMZERO ymm16 ++# define YMM1 ymm17 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRCPY) ++# ifdef USE_AS_STRNCPY ++ mov %RDX_LP, %R8_LP ++ test %R8_LP, %R8_LP ++ jz L(ExitZero) ++# endif ++ mov %rsi, %rcx ++# ifndef USE_AS_STPCPY ++ mov %rdi, %rax /* save result */ ++# endif ++ ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++# endif ++ ++ and $((VEC_SIZE * 4) - 1), %ecx ++ cmp $(VEC_SIZE * 2), %ecx ++ jbe L(SourceStringAlignmentLessTwoVecSize) ++ ++ and $-VEC_SIZE, %rsi ++ and $(VEC_SIZE - 1), %ecx ++ ++ vpcmpb $0, (%rsi), %YMMZERO, %k0 ++ kmovd %k0, %edx ++ shr %cl, %rdx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ mov $VEC_SIZE, %r10 ++ sub %rcx, %r10 ++ cmp %r10, %r8 ++# else ++ mov $(VEC_SIZE + 1), %r10 ++ sub %rcx, %r10 ++ cmp %r10, %r8 ++# endif ++ jbe L(CopyVecSizeTailCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyVecSizeTail) ++ ++ vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ ++# ifdef USE_AS_STRNCPY ++ add $VEC_SIZE, %r10 ++ cmp %r10, %r8 ++ jbe L(CopyTwoVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyTwoVecSize) ++ ++ VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ ++ VMOVU %YMM2, (%rdi) ++ ++/* If source address alignment != destination address alignment */ ++ .p2align 4 ++L(UnalignVecSizeBoth): ++ sub %rcx, %rdi ++# ifdef USE_AS_STRNCPY ++ add %rcx, %r8 ++ sbb %rcx, %rcx ++ or %rcx, %r8 ++# endif ++ mov $VEC_SIZE, %rcx ++ VMOVA (%rsi, %rcx), %YMM2 ++ VMOVU %YMM2, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 ++ vpcmpb $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 3), %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM2, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 ++ vpcmpb $0, %YMM3, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef 
USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec3) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM3, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 ++ vpcmpb $0, %YMM4, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec4) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM4, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 ++ vpcmpb $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM2, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 ++ vpcmpb $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 ++ VMOVU %YMM2, (%rdi, %rcx) ++ vpcmpb $0, %YMM3, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec3) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM3, (%rdi, %rcx) ++ mov %rsi, %rdx ++ lea VEC_SIZE(%rsi, %rcx), %rsi ++ and $-(VEC_SIZE * 4), %rsi ++ sub %rsi, %rdx ++ sub %rdx, %rdi ++# ifdef USE_AS_STRNCPY ++ lea (VEC_SIZE * 8)(%r8, %rdx), %r8 ++# endif ++L(UnalignedFourVecSizeLoop): ++ VMOVA (%rsi), %YMM4 ++ VMOVA VEC_SIZE(%rsi), %YMM5 ++ VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 ++ VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 ++ vpminub %YMM5, %YMM4, %YMM2 ++ vpminub %YMM7, %YMM6, %YMM3 ++ vpminub %YMM2, %YMM3, %YMM2 ++ /* If K7 != 0, there is a null byte. */ ++ vpcmpb $0, %YMM2, %YMMZERO, %k7 ++ kmovd %k7, %edx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 4), %r8 ++ jbe L(UnalignedLeaveCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(UnalignedFourVecSizeLeave) ++ ++L(UnalignedFourVecSizeLoop_start): ++ add $(VEC_SIZE * 4), %rdi ++ add $(VEC_SIZE * 4), %rsi ++ VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) ++ VMOVA (%rsi), %YMM4 ++ VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) ++ VMOVA VEC_SIZE(%rsi), %YMM5 ++ vpminub %YMM5, %YMM4, %YMM2 ++ VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) ++ VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 ++ VMOVU %YMM7, -VEC_SIZE(%rdi) ++ VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 ++ vpminub %YMM7, %YMM6, %YMM3 ++ vpminub %YMM2, %YMM3, %YMM2 ++ /* If K7 != 0, there is a null byte. 
*/ ++ vpcmpb $0, %YMM2, %YMMZERO, %k7 ++ kmovd %k7, %edx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 4), %r8 ++ jbe L(UnalignedLeaveCase2OrCase3) ++# endif ++ test %edx, %edx ++ jz L(UnalignedFourVecSizeLoop_start) ++ ++L(UnalignedFourVecSizeLeave): ++ vpcmpb $0, %YMM4, %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(CopyVecSizeUnaligned_0) ++ ++ vpcmpb $0, %YMM5, %YMMZERO, %k2 ++ kmovd %k2, %ecx ++ test %ecx, %ecx ++ jnz L(CopyVecSizeUnaligned_16) ++ ++ vpcmpb $0, %YMM6, %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(CopyVecSizeUnaligned_32) ++ ++ vpcmpb $0, %YMM7, %YMMZERO, %k4 ++ kmovd %k4, %ecx ++ bsf %ecx, %edx ++ VMOVU %YMM4, (%rdi) ++ VMOVU %YMM5, VEC_SIZE(%rdi) ++ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 3)(%rdi, %rdx), %rax ++# endif ++ VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) ++ add $(VEC_SIZE - 1), %r8 ++ sub %rdx, %r8 ++ lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $(VEC_SIZE * 3), %rsi ++ add $(VEC_SIZE * 3), %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++/* If source address alignment == destination address alignment */ ++ ++L(SourceStringAlignmentLessTwoVecSize): ++ VMOVU (%rsi), %YMM3 ++ VMOVU VEC_SIZE(%rsi), %YMM2 ++ vpcmpb $0, %YMM3, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ cmp $VEC_SIZE, %r8 ++# else ++ cmp $(VEC_SIZE + 1), %r8 ++# endif ++ jbe L(CopyVecSizeTail1Case2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyVecSizeTail1) ++ ++ VMOVU %YMM3, (%rdi) ++ vpcmpb $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ cmp $(VEC_SIZE * 2), %r8 ++# else ++ cmp $((VEC_SIZE * 2) + 1), %r8 ++# endif ++ jbe L(CopyTwoVecSize1Case2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyTwoVecSize1) ++ ++ and $-VEC_SIZE, %rsi ++ and $(VEC_SIZE - 1), %ecx ++ jmp L(UnalignVecSizeBoth) ++ ++/*------End of main part with loops---------------------*/ ++ ++/* Case1 */ ++ ++# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) ++ .p2align 4 ++L(CopyVecSize): ++ add %rcx, %rdi ++# endif ++L(CopyVecSizeTail): ++ add %rcx, %rsi ++L(CopyVecSizeTail1): ++ bsf %edx, %edx ++L(CopyVecSizeExit): ++ cmp $32, %edx ++ jae L(Exit32_63) ++ cmp $16, %edx ++ jae L(Exit16_31) ++ cmp $8, %edx ++ jae L(Exit8_15) ++ cmp $4, %edx ++ jae L(Exit4_7) ++ cmp $3, %edx ++ je L(Exit3) ++ cmp $1, %edx ++ ja L(Exit2) ++ je L(Exit1) ++ movb $0, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea (%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $1, %r8 ++ lea 1(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(CopyTwoVecSize1): ++ add $VEC_SIZE, %rsi ++ add $VEC_SIZE, %rdi ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $VEC_SIZE, %r8 ++# endif ++ jmp L(CopyVecSizeTail1) ++ ++ .p2align 4 ++L(CopyTwoVecSize): ++ bsf %edx, %edx ++ add %rcx, %rsi ++ add $VEC_SIZE, %edx ++ sub %ecx, %edx ++ jmp L(CopyVecSizeExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_0): ++ bsf %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++ VMOVU %YMM4, (%rdi) ++ add $((VEC_SIZE * 4) - 1), %r8 ++ sub %rdx, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ jmp L(CopyVecSizeExit) ++# endif ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_16): ++ bsf %ecx, %edx ++ VMOVU %YMM4, 
(%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea VEC_SIZE(%rdi, %rdx), %rax ++# endif ++ VMOVU %YMM5, VEC_SIZE(%rdi) ++ add $((VEC_SIZE * 3) - 1), %r8 ++ sub %rdx, %r8 ++ lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $VEC_SIZE, %rsi ++ add $VEC_SIZE, %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_32): ++ bsf %edx, %edx ++ VMOVU %YMM4, (%rdi) ++ VMOVU %YMM5, VEC_SIZE(%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 2)(%rdi, %rdx), %rax ++# endif ++ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) ++ add $((VEC_SIZE * 2) - 1), %r8 ++ sub %rdx, %r8 ++ lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $(VEC_SIZE * 2), %rsi ++ add $(VEC_SIZE * 2), %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++# ifdef USE_AS_STRNCPY ++# ifndef USE_AS_STRCAT ++ .p2align 4 ++L(CopyVecSizeUnalignedVec6): ++ VMOVU %YMM6, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec5): ++ VMOVU %YMM5, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec4): ++ VMOVU %YMM4, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec3): ++ VMOVU %YMM3, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++# endif ++ ++/* Case2 */ ++ ++ .p2align 4 ++L(CopyVecSizeCase2): ++ add $VEC_SIZE, %r8 ++ add %rcx, %rdi ++ add %rcx, %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSizeCase2): ++ add %rcx, %rsi ++ bsf %edx, %edx ++ add $VEC_SIZE, %edx ++ sub %ecx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++L(CopyVecSizeTailCase2): ++ add %rcx, %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++L(CopyVecSizeTail1Case2): ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++/* Case2 or Case3, Case3 */ ++ ++ .p2align 4 ++L(CopyVecSizeCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyVecSizeCase2) ++L(CopyVecSizeCase3): ++ add $VEC_SIZE, %r8 ++ add %rcx, %rdi ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSizeCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyTwoVecSizeCase2) ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyVecSizeTailCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyVecSizeTailCase2) ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSize1Case2OrCase3): ++ add $VEC_SIZE, %rdi ++ add $VEC_SIZE, %rsi ++ sub $VEC_SIZE, %r8 ++L(CopyVecSizeTail1Case2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyVecSizeTail1Case2) ++ jmp L(StrncpyExit) ++# endif ++ ++/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ ++ ++ .p2align 4 ++L(Exit1): ++ movzwl (%rsi), %edx ++ mov %dx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 1(%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $2, %r8 ++ lea 2(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit2): ++ movzwl (%rsi), %ecx ++ mov %cx, (%rdi) ++ movb $0, 2(%rdi) ++# ifdef USE_AS_STPCPY ++ lea 2(%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $3, %r8 ++ lea 3(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit3): ++ mov (%rsi), %edx ++ mov %edx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 3(%rdi), %rax ++# endif ++# if defined 
USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $4, %r8 ++ lea 4(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit4_7): ++ mov (%rsi), %ecx ++ mov %ecx, (%rdi) ++ mov -3(%rsi, %rdx), %ecx ++ mov %ecx, -3(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit8_15): ++ mov (%rsi), %rcx ++ mov -7(%rsi, %rdx), %r9 ++ mov %rcx, (%rdi) ++ mov %r9, -7(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit16_31): ++ VMOVU (%rsi), %XMM2 ++ VMOVU -15(%rsi, %rdx), %XMM3 ++ VMOVU %XMM2, (%rdi) ++ VMOVU %XMM3, -15(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit32_63): ++ VMOVU (%rsi), %YMM2 ++ VMOVU -31(%rsi, %rdx), %YMM3 ++ VMOVU %YMM2, (%rdi) ++ VMOVU %YMM3, -31(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++# ifdef USE_AS_STRNCPY ++ ++ .p2align 4 ++L(StrncpyExit1): ++ movzbl (%rsi), %edx ++ mov %dl, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 1(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 1(%rdi) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit2): ++ movzwl (%rsi), %edx ++ mov %dx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 2(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 2(%rdi) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit3_4): ++ movzwl (%rsi), %ecx ++ movzwl -2(%rsi, %r8), %edx ++ mov %cx, (%rdi) ++ mov %dx, -2(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit5_8): ++ mov (%rsi), %ecx ++ mov -4(%rsi, %r8), %edx ++ mov %ecx, (%rdi) ++ mov %edx, -4(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit9_16): ++ mov (%rsi), %rcx ++ mov -8(%rsi, %r8), %rdx ++ mov %rcx, (%rdi) ++ mov %rdx, -8(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit17_32): ++ VMOVU (%rsi), %XMM2 ++ VMOVU -16(%rsi, %r8), %XMM3 ++ VMOVU %XMM2, (%rdi) ++ VMOVU %XMM3, -16(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit33_64): ++ /* 0/32, 31/16 */ ++ VMOVU (%rsi), %YMM2 ++ VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 ++ VMOVU %YMM2, (%rdi) ++ VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit65): ++ /* 0/32, 32/32, 64/1 */ ++ VMOVU (%rsi), %YMM2 ++ VMOVU 32(%rsi), %YMM3 ++ mov 64(%rsi), %cl ++ VMOVU %YMM2, (%rdi) ++ VMOVU %YMM3, 32(%rdi) ++ mov %cl, 64(%rdi) ++# ifdef 
USE_AS_STPCPY ++ lea 65(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 65(%rdi) ++# endif ++ ret ++ ++# ifndef USE_AS_STRCAT ++ ++ .p2align 4 ++L(Fill1): ++ mov %dl, (%rdi) ++ ret ++ ++ .p2align 4 ++L(Fill2): ++ mov %dx, (%rdi) ++ ret ++ ++ .p2align 4 ++L(Fill3_4): ++ mov %dx, (%rdi) ++ mov %dx, -2(%rdi, %r8) ++ ret ++ ++ .p2align 4 ++L(Fill5_8): ++ mov %edx, (%rdi) ++ mov %edx, -4(%rdi, %r8) ++ ret ++ ++ .p2align 4 ++L(Fill9_16): ++ mov %rdx, (%rdi) ++ mov %rdx, -8(%rdi, %r8) ++ ret ++ ++ .p2align 4 ++L(Fill17_32): ++ VMOVU %XMMZERO, (%rdi) ++ VMOVU %XMMZERO, -16(%rdi, %r8) ++ ret ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec2): ++ VMOVU %YMM2, (%rdi, %rcx) ++ ++ .p2align 4 ++L(CopyVecSizeVecExit): ++ bsf %edx, %edx ++ add $(VEC_SIZE - 1), %r8 ++ add %rcx, %rdi ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++ sub %rdx, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ ++ .p2align 4 ++L(StrncpyFillTailWithZero): ++ xor %edx, %edx ++ sub $VEC_SIZE, %r8 ++ jbe L(StrncpyFillExit) ++ ++ VMOVU %YMMZERO, (%rdi) ++ add $VEC_SIZE, %rdi ++ ++ mov %rdi, %rsi ++ and $(VEC_SIZE - 1), %esi ++ sub %rsi, %rdi ++ add %rsi, %r8 ++ sub $(VEC_SIZE * 4), %r8 ++ jb L(StrncpyFillLessFourVecSize) ++ ++L(StrncpyFillLoopVmovdqa): ++ VMOVA %YMMZERO, (%rdi) ++ VMOVA %YMMZERO, VEC_SIZE(%rdi) ++ VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi) ++ VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) ++ add $(VEC_SIZE * 4), %rdi ++ sub $(VEC_SIZE * 4), %r8 ++ jae L(StrncpyFillLoopVmovdqa) ++ ++L(StrncpyFillLessFourVecSize): ++ add $(VEC_SIZE * 2), %r8 ++ jl L(StrncpyFillLessTwoVecSize) ++ VMOVA %YMMZERO, (%rdi) ++ VMOVA %YMMZERO, VEC_SIZE(%rdi) ++ add $(VEC_SIZE * 2), %rdi ++ sub $VEC_SIZE, %r8 ++ jl L(StrncpyFillExit) ++ VMOVA %YMMZERO, (%rdi) ++ add $VEC_SIZE, %rdi ++ jmp L(Fill) ++ ++ .p2align 4 ++L(StrncpyFillLessTwoVecSize): ++ add $VEC_SIZE, %r8 ++ jl L(StrncpyFillExit) ++ VMOVA %YMMZERO, (%rdi) ++ add $VEC_SIZE, %rdi ++ jmp L(Fill) ++ ++ .p2align 4 ++L(StrncpyFillExit): ++ add $VEC_SIZE, %r8 ++L(Fill): ++ cmp $17, %r8d ++ jae L(Fill17_32) ++ cmp $9, %r8d ++ jae L(Fill9_16) ++ cmp $5, %r8d ++ jae L(Fill5_8) ++ cmp $3, %r8d ++ jae L(Fill3_4) ++ cmp $1, %r8d ++ ja L(Fill2) ++ je L(Fill1) ++ ret ++ ++/* end of ifndef USE_AS_STRCAT */ ++# endif ++ ++ .p2align 4 ++L(UnalignedLeaveCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(UnalignedFourVecSizeLeaveCase2) ++L(UnalignedFourVecSizeLeaveCase3): ++ lea (VEC_SIZE * 4)(%r8), %rcx ++ and $-VEC_SIZE, %rcx ++ add $(VEC_SIZE * 3), %r8 ++ jl L(CopyVecSizeCase3) ++ VMOVU %YMM4, (%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ VMOVU %YMM5, VEC_SIZE(%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 4)(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (VEC_SIZE * 4)(%rdi) ++# endif ++ ret ++ ++ .p2align 4 ++L(UnalignedFourVecSizeLeaveCase2): ++ xor %ecx, %ecx ++ vpcmpb $0, %YMM4, %YMMZERO, %k1 ++ kmovd %k1, %edx ++ add $(VEC_SIZE * 3), %r8 ++ jle L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec4) ++# else ++ jnz L(CopyVecSize) ++# endif ++ vpcmpb $0, %YMM5, %YMMZERO, %k2 ++ kmovd %k2, %edx ++ VMOVU %YMM4, (%rdi) ++ add $VEC_SIZE, %rcx ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec5) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vpcmpb $0, %YMM6, %YMMZERO, %k3 ++ kmovd %k3, %edx ++ 
VMOVU %YMM5, VEC_SIZE(%rdi) ++ add $VEC_SIZE, %rcx ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec6) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vpcmpb $0, %YMM7, %YMMZERO, %k4 ++ kmovd %k4, %edx ++ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) ++ lea VEC_SIZE(%rdi, %rcx), %rdi ++ lea VEC_SIZE(%rsi, %rcx), %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++L(StrncpyExit): ++ cmp $65, %r8d ++ je L(StrncpyExit65) ++ cmp $33, %r8d ++ jae L(StrncpyExit33_64) ++ cmp $17, %r8d ++ jae L(StrncpyExit17_32) ++ cmp $9, %r8d ++ jae L(StrncpyExit9_16) ++ cmp $5, %r8d ++ jae L(StrncpyExit5_8) ++ cmp $3, %r8d ++ jae L(StrncpyExit3_4) ++ cmp $1, %r8d ++ ja L(StrncpyExit2) ++ je L(StrncpyExit1) ++# ifdef USE_AS_STPCPY ++ mov %rdi, %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi) ++# endif ++ ret ++ ++ .p2align 4 ++L(ExitZero): ++# ifndef USE_AS_STRCAT ++ mov %rdi, %rax ++# endif ++ ret ++ ++# endif ++ ++# ifndef USE_AS_STRCAT ++END (STRCPY) ++# else ++END (STRCAT) ++# endif ++#endif +diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S +new file mode 100644 +index 00000000..8884f023 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncat-evex.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STRNCAT ++#define STRCAT __strncat_evex ++#include "strcat-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S +new file mode 100644 +index 00000000..40e391f0 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncpy-evex.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STRNCPY ++#define STRCPY __strncpy_evex ++#include "strcpy-evex.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-14.patch b/glibc-RHEL-15696-14.patch new file mode 100644 index 0000000..84a4593 --- /dev/null +++ b/glibc-RHEL-15696-14.patch @@ -0,0 +1,242 @@ +From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 06:46:08 -0800 +Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memmove.h to select the function optimized with 256-bit EVEX +instructions using YMM16-YMM31 registers to avoid RTM abort with usable +AVX512VL since VZEROUPPER isn't needed at function exit. 
+--- + sysdeps/x86_64/multiarch/Makefile | 1 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 36 +++++++++++++++++++ + sysdeps/x86_64/multiarch/ifunc-memmove.h | 21 +++++++++-- + .../multiarch/memmove-evex-unaligned-erms.S | 33 +++++++++++++++++ + .../multiarch/memmove-vec-unaligned-erms.S | 24 ++++++++----- + 5 files changed, 104 insertions(+), 11 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 46783cd1..4563fc56 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memset-avx2-unaligned-erms \ + memset-avx512-unaligned-erms \ + memchr-evex \ ++ memmove-evex-unaligned-erms \ + memrchr-evex \ + rawmemchr-evex \ + stpcpy-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 082e4da3..6bd3abfc 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (AVX), + __memmove_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memmove_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memmove_chk_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, __memmove_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memmove_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (SSSE3), + __memmove_chk_ssse3_back) +@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memmove, + CPU_FEATURE_USABLE (AVX), + __memmove_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memmove, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memmove_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, memmove, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memmove_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, + CPU_FEATURE_USABLE (AVX512F), + __memmove_avx512_no_vzeroupper) +@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (AVX), + __memcpy_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memcpy_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memcpy_chk_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, __memcpy_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memcpy_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (SSSE3), + __memcpy_chk_ssse3_back) +@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memcpy, + CPU_FEATURE_USABLE (AVX), + __memcpy_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memcpy, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memcpy_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, memcpy, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memcpy_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), + __memcpy_ssse3_back) + IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), +@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (AVX), + __mempcpy_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __mempcpy_chk_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, 
__mempcpy_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __mempcpy_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (SSSE3), + __mempcpy_chk_ssse3_back) +@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, mempcpy, + CPU_FEATURE_USABLE (AVX), + __mempcpy_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, mempcpy, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __mempcpy_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, mempcpy, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __mempcpy_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), + __mempcpy_ssse3_back) + IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h +index 5e5f0299..6f8bce5f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h +@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) + attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) +@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void) + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE (avx_unaligned_erms); ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (evex_unaligned_erms); ++ ++ return OPTIMIZE (evex_unaligned); ++ } ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx_unaligned_erms); + +- return OPTIMIZE (avx_unaligned); ++ return OPTIMIZE (avx_unaligned); ++ } + } + + if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3) +diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S +new file mode 100644 +index 00000000..0cbce8f9 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S +@@ -0,0 +1,33 @@ ++#if IS_IN (libc) ++# define VEC_SIZE 32 ++# define XMM0 xmm16 ++# define XMM1 xmm17 ++# define YMM0 ymm16 ++# define YMM1 ymm17 ++# define VEC0 ymm16 ++# define VEC1 ymm17 ++# define VEC2 ymm18 ++# define VEC3 ymm19 ++# define VEC4 ymm20 ++# define VEC5 ymm21 ++# define VEC6 ymm22 ++# define VEC7 ymm23 ++# define VEC8 ymm24 ++# define VEC9 ymm25 ++# define VEC10 ymm26 ++# define VEC11 ymm27 ++# define VEC12 ymm28 ++# define VEC13 ymm29 ++# define VEC14 ymm30 ++# define VEC15 ymm31 ++# define VEC(i) VEC##i ++# define VMOVNT vmovntdq ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++# define VZEROUPPER ++ ++# define SECTION(p) p##.evex ++# define MEMMOVE_SYMBOL(p,s) p##_evex_##s ++ ++# include "memmove-vec-unaligned-erms.S" ++#endif +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 274aa1c7..08e21692 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -48,6 +48,14 
@@ + # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) + #endif + ++#ifndef XMM0 ++# define XMM0 xmm0 ++#endif ++ ++#ifndef YMM0 ++# define YMM0 ymm0 ++#endif ++ + #ifndef VZEROUPPER + # if VEC_SIZE > 16 + # define VZEROUPPER vzeroupper +@@ -277,20 +285,20 @@ L(less_vec): + #if VEC_SIZE > 32 + L(between_32_63): + /* From 32 to 63. No branch when size == 32. */ +- vmovdqu (%rsi), %ymm0 +- vmovdqu -32(%rsi,%rdx), %ymm1 +- vmovdqu %ymm0, (%rdi) +- vmovdqu %ymm1, -32(%rdi,%rdx) ++ VMOVU (%rsi), %YMM0 ++ VMOVU -32(%rsi,%rdx), %YMM1 ++ VMOVU %YMM0, (%rdi) ++ VMOVU %YMM1, -32(%rdi,%rdx) + VZEROUPPER + ret + #endif + #if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ + L(between_16_31): +- vmovdqu (%rsi), %xmm0 +- vmovdqu -16(%rsi,%rdx), %xmm1 +- vmovdqu %xmm0, (%rdi) +- vmovdqu %xmm1, -16(%rdi,%rdx) ++ VMOVU (%rsi), %XMM0 ++ VMOVU -16(%rsi,%rdx), %XMM1 ++ VMOVU %XMM0, (%rdi) ++ VMOVU %XMM1, -16(%rdi,%rdx) + ret + #endif + L(between_8_15): +-- +GitLab + diff --git a/glibc-RHEL-15696-15.patch b/glibc-RHEL-15696-15.patch new file mode 100644 index 0000000..72cd8cf --- /dev/null +++ b/glibc-RHEL-15696-15.patch @@ -0,0 +1,254 @@ +From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 07:15:03 -0800 +Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized +with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM +abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at +function exit. +--- + sysdeps/x86_64/multiarch/Makefile | 1 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 22 +++++++++++++++++ + sysdeps/x86_64/multiarch/ifunc-memset.h | 24 +++++++++++++++---- + sysdeps/x86_64/multiarch/ifunc-wmemset.h | 13 ++++++---- + .../multiarch/memset-evex-unaligned-erms.S | 24 +++++++++++++++++++ + .../multiarch/memset-vec-unaligned-erms.S | 20 +++++++++++----- + 6 files changed, 90 insertions(+), 14 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 4563fc56..1cc0a10e 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memchr-evex \ + memmove-evex-unaligned-erms \ + memrchr-evex \ ++ memset-evex-unaligned-erms \ + rawmemchr-evex \ + stpcpy-evex \ + stpncpy-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 6bd3abfc..7cf83485 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX2), + __memset_chk_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memset_chk, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memset_chk_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, __memset_chk, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memset_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX512F), + __memset_chk_avx512_unaligned_erms) +@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memset, + 
CPU_FEATURE_USABLE (AVX2), + __memset_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memset, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memset_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, memset, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memset_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + CPU_FEATURE_USABLE (AVX512F), + __memset_avx512_unaligned_erms) +@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wmemset, + CPU_FEATURE_USABLE (AVX2), + __wmemset_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, wmemset, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __wmemset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + CPU_FEATURE_USABLE (AVX512F), + __wmemset_avx512_unaligned)) +@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + CPU_FEATURE_USABLE (AVX2), + __wmemset_chk_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, __wmemset_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __wmemset_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + CPU_FEATURE_USABLE (AVX512F), + __wmemset_chk_avx512_unaligned)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 708bd72e..6f31f4dc 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h +@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms) + attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) +@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void) + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE (avx2_unaligned_erms); +- else +- return OPTIMIZE (avx2_unaligned); ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (evex_unaligned_erms); ++ ++ return OPTIMIZE (evex_unaligned); ++ } ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx2_unaligned_erms); ++ ++ return OPTIMIZE (avx2_unaligned); ++ } + } + + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +index eb242210..9290c4bf 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +@@ -20,6 +20,7 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; + + static inline void * +@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- 
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) +- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx512_unaligned); +- else ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) ++ return OPTIMIZE (evex_unaligned); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2_unaligned); + } + +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +new file mode 100644 +index 00000000..ae0a4d6e +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -0,0 +1,24 @@ ++#if IS_IN (libc) ++# define VEC_SIZE 32 ++# define XMM0 xmm16 ++# define YMM0 ymm16 ++# define VEC0 ymm16 ++# define VEC(i) VEC##i ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++# define VZEROUPPER ++ ++# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++ movq r, %rax; \ ++ vpbroadcastb d, %VEC0 ++ ++# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++ movq r, %rax; \ ++ vpbroadcastd d, %VEC0 ++ ++# define SECTION(p) p##.evex ++# define MEMSET_SYMBOL(p,s) p##_evex_##s ++# define WMEMSET_SYMBOL(p,s) p##_evex_##s ++ ++# include "memset-vec-unaligned-erms.S" ++#endif +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 9a0fd818..71e91a8f 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -34,6 +34,14 @@ + # define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s) + #endif + ++#ifndef XMM0 ++# define XMM0 xmm0 ++#endif ++ ++#ifndef YMM0 ++# define YMM0 ymm0 ++#endif ++ + #ifndef VZEROUPPER + # if VEC_SIZE > 16 + # define VZEROUPPER vzeroupper +@@ -67,7 +75,7 @@ + ENTRY (__bzero) + mov %RDI_LP, %RAX_LP /* Set return value. */ + mov %RSI_LP, %RDX_LP /* Set n. */ +- pxor %xmm0, %xmm0 ++ pxor %XMM0, %XMM0 + jmp L(entry_from_bzero) + END (__bzero) + weak_alias (__bzero, bzero) +@@ -223,7 +231,7 @@ L(less_vec): + cmpb $16, %dl + jae L(between_16_31) + # endif +- MOVQ %xmm0, %rcx ++ MOVQ %XMM0, %rcx + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl +@@ -238,16 +246,16 @@ L(less_vec): + # if VEC_SIZE > 32 + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- vmovdqu %ymm0, -32(%rdi,%rdx) +- vmovdqu %ymm0, (%rdi) ++ VMOVU %YMM0, -32(%rdi,%rdx) ++ VMOVU %YMM0, (%rdi) + VZEROUPPER + ret + # endif + # if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ + L(between_16_31): +- vmovdqu %xmm0, -16(%rdi,%rdx) +- vmovdqu %xmm0, (%rdi) ++ VMOVU %XMM0, -16(%rdi,%rdx) ++ VMOVU %XMM0, (%rdi) + VZEROUPPER + ret + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-16.patch b/glibc-RHEL-15696-16.patch new file mode 100644 index 0000000..b3f443d --- /dev/null +++ b/glibc-RHEL-15696-16.patch @@ -0,0 +1,561 @@ +From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Fri, 5 Mar 2021 07:20:28 -0800 +Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX +instructions using YMM16-YMM31 registers to avoid RTM abort with usable +AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function +exit. +--- + sysdeps/x86_64/multiarch/Makefile | 4 +- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 + + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 13 +- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 440 ++++++++++++++++++ + sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S | 4 + + 5 files changed, 467 insertions(+), 4 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S + create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 1cc0a10e..9d79b138 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memset-avx2-unaligned-erms \ + memset-avx512-unaligned-erms \ + memchr-evex \ ++ memcmp-evex-movbe \ + memmove-evex-unaligned-erms \ + memrchr-evex \ + memset-evex-unaligned-erms \ +@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcsncmp-evex \ + wcsnlen-evex \ + wcsrchr-evex \ +- wmemchr-evex ++ wmemchr-evex \ ++ wmemcmp-evex-movbe + endif + + ifeq ($(subdir),debug) +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 7cf83485..c8da910e 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_avx2_movbe) ++ IFUNC_IMPL_ADD (array, i, memcmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (MOVBE)), ++ __memcmp_evex_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), + __memcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), +@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_avx2_movbe) ++ IFUNC_IMPL_ADD (array, i, wmemcmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (MOVBE)), ++ __wmemcmp_evex_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), + __wmemcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 6c1f3153..3ca1f0a6 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && 
CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURE_USABLE_P (cpu_features, MOVBE) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2_movbe); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ return OPTIMIZE (evex_movbe); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2_movbe); ++ } + + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4_1); +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +new file mode 100644 +index 00000000..9c093972 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -0,0 +1,440 @@ ++/* memcmp/wmemcmp optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++/* memcmp/wmemcmp is implemented as: ++ 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap ++ to avoid branches. ++ 2. Use overlapping compare to avoid branch. ++ 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 ++ bytes for wmemcmp. ++ 4. If size is 8 * VEC_SIZE or less, unroll the loop. ++ 5. Compare 4 * VEC_SIZE at a time with the aligned first memory ++ area. ++ 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. ++ 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. ++ 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ ++ ++# include ++ ++# ifndef MEMCMP ++# define MEMCMP __memcmp_evex_movbe ++# endif ++ ++# define VMOVU vmovdqu64 ++ ++# ifdef USE_AS_WMEMCMP ++# define VPCMPEQ vpcmpeqd ++# else ++# define VPCMPEQ vpcmpeqb ++# endif ++ ++# define XMM1 xmm17 ++# define XMM2 xmm18 ++# define YMM1 ymm17 ++# define YMM2 ymm18 ++# define YMM3 ymm19 ++# define YMM4 ymm20 ++# define YMM5 ymm21 ++# define YMM6 ymm22 ++ ++# define VEC_SIZE 32 ++# ifdef USE_AS_WMEMCMP ++# define VEC_MASK 0xff ++# define XMM_MASK 0xf ++# else ++# define VEC_MASK 0xffffffff ++# define XMM_MASK 0xffff ++# endif ++ ++/* Warning! ++ wmemcmp has to use SIGNED comparison for elements. ++ memcmp has to use UNSIGNED comparison for elemnts. ++*/ ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (MEMCMP) ++# ifdef USE_AS_WMEMCMP ++ shl $2, %RDX_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec) ++ ++ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k1 ++ kmovd %k1, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(last_vec) ++ ++ /* More than 2 * VEC. 
*/ ++ cmpq $(VEC_SIZE * 8), %rdx ++ ja L(more_8x_vec) ++ cmpq $(VEC_SIZE * 4), %rdx ++ jb L(last_4x_vec) ++ ++ /* From 4 * VEC to 8 * VEC, inclusively. */ ++ VMOVU (%rsi), %YMM1 ++ VPCMPEQ (%rdi), %YMM1, %k1 ++ ++ VMOVU VEC_SIZE(%rsi), %YMM2 ++ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 ++ ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 ++ ++ kandd %k1, %k2, %k5 ++ kandd %k3, %k4, %k6 ++ kandd %k5, %k6, %k6 ++ ++ kmovd %k6, %eax ++ cmpl $VEC_MASK, %eax ++ jne L(4x_vec_end) ++ ++ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi ++ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi ++ VMOVU (%rsi), %YMM1 ++ VPCMPEQ (%rdi), %YMM1, %k1 ++ ++ VMOVU VEC_SIZE(%rsi), %YMM2 ++ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 ++ kandd %k1, %k2, %k5 ++ ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 ++ kandd %k3, %k5, %k5 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 ++ kandd %k4, %k5, %k5 ++ ++ kmovd %k5, %eax ++ cmpl $VEC_MASK, %eax ++ jne L(4x_vec_end) ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++L(last_vec): ++ /* Use overlapping loads to avoid branches. */ ++ leaq -VEC_SIZE(%rdi, %rdx), %rdi ++ leaq -VEC_SIZE(%rsi, %rdx), %rsi ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ret ++ ++ .p2align 4 ++L(first_vec): ++ /* A byte or int32 is different within 16 or 32 bytes. */ ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (%rdi, %rcx, 4), %edx ++ cmpl (%rsi, %rcx, 4), %edx ++L(wmemcmp_return): ++ setl %al ++ negl %eax ++ orl $1, %eax ++# else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ ret ++ ++# ifdef USE_AS_WMEMCMP ++ .p2align 4 ++L(4): ++ xorl %eax, %eax ++ movl (%rdi), %edx ++ cmpl (%rsi), %edx ++ jne L(wmemcmp_return) ++ ret ++# else ++ .p2align 4 ++L(between_4_7): ++ /* Load as big endian with overlapping movbe to avoid branches. */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ je L(exit) ++ sbbl %eax, %eax ++ orl $1, %eax ++ ret ++ ++ .p2align 4 ++L(exit): ++ ret ++ ++ .p2align 4 ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movb -1(%rdi, %rdx), %al ++ movb -1(%rsi, %rdx), %cl ++ /* Subtraction is okay because the upper 8 bits are zero. */ ++ subl %ecx, %eax ++ ret ++ ++ .p2align 4 ++L(1): ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax ++ ret ++# endif ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(less_vec): ++# ifdef USE_AS_WMEMCMP ++ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ ++ cmpb $4, %dl ++ je L(4) ++ jb L(zero) ++# else ++ cmpb $1, %dl ++ je L(1) ++ jb L(zero) ++ cmpb $4, %dl ++ jb L(between_2_3) ++ cmpb $8, %dl ++ jb L(between_4_7) ++# endif ++ cmpb $16, %dl ++ jae L(between_16_31) ++ /* It is between 8 and 15 bytes. 
*/ ++ vmovq (%rdi), %XMM1 ++ vmovq (%rsi), %XMM2 ++ VPCMPEQ %XMM1, %XMM2, %k2 ++ kmovw %k2, %eax ++ subl $XMM_MASK, %eax ++ jnz L(first_vec) ++ /* Use overlapping loads to avoid branches. */ ++ leaq -8(%rdi, %rdx), %rdi ++ leaq -8(%rsi, %rdx), %rsi ++ vmovq (%rdi), %XMM1 ++ vmovq (%rsi), %XMM2 ++ VPCMPEQ %XMM1, %XMM2, %k2 ++ kmovw %k2, %eax ++ subl $XMM_MASK, %eax ++ jnz L(first_vec) ++ ret ++ ++ .p2align 4 ++L(between_16_31): ++ /* From 16 to 31 bytes. No branch when size == 16. */ ++ VMOVU (%rsi), %XMM2 ++ VPCMPEQ (%rdi), %XMM2, %k2 ++ kmovw %k2, %eax ++ subl $XMM_MASK, %eax ++ jnz L(first_vec) ++ ++ /* Use overlapping loads to avoid branches. */ ++ leaq -16(%rdi, %rdx), %rdi ++ leaq -16(%rsi, %rdx), %rsi ++ VMOVU (%rsi), %XMM2 ++ VPCMPEQ (%rdi), %XMM2, %k2 ++ kmovw %k2, %eax ++ subl $XMM_MASK, %eax ++ jnz L(first_vec) ++ ret ++ ++ .p2align 4 ++L(more_8x_vec): ++ /* More than 8 * VEC. Check the first VEC. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ /* Align the first memory area for aligned loads in the loop. ++ Compute how much the first memory area is misaligned. */ ++ movq %rdi, %rcx ++ andl $(VEC_SIZE - 1), %ecx ++ /* Get the negative of offset for alignment. */ ++ subq $VEC_SIZE, %rcx ++ /* Adjust the second memory area. */ ++ subq %rcx, %rsi ++ /* Adjust the first memory area which should be aligned now. */ ++ subq %rcx, %rdi ++ /* Adjust length. */ ++ addq %rcx, %rdx ++ ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ VMOVU (%rsi), %YMM1 ++ VPCMPEQ (%rdi), %YMM1, %k1 ++ ++ VMOVU VEC_SIZE(%rsi), %YMM2 ++ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 ++ kandd %k2, %k1, %k5 ++ ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 ++ kandd %k3, %k5, %k5 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 ++ kandd %k4, %k5, %k5 ++ ++ kmovd %k5, %eax ++ cmpl $VEC_MASK, %eax ++ jne L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ addq $(VEC_SIZE * 4), %rsi ++ ++ subq $(VEC_SIZE * 4), %rdx ++ cmpq $(VEC_SIZE * 4), %rdx ++ jae L(loop_4x_vec) ++ ++ /* Less than 4 * VEC. */ ++ cmpq $VEC_SIZE, %rdx ++ jbe L(last_vec) ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(last_2x_vec) ++ ++L(last_4x_vec): ++ /* From 2 * VEC to 4 * VEC. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rsi ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ /* Use overlapping loads to avoid branches. 
*/ ++ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi ++ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rsi ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ kmovd %k1, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec_x1) ++ kmovd %k3, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec_x2) ++ kmovd %k4, %eax ++ subl $VEC_MASK, %eax ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl VEC_SIZE(%rdi, %rcx, 4), %edx ++ cmpl VEC_SIZE(%rsi, %rcx, 4), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ ret ++END (MEMCMP) ++#endif +diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S +new file mode 100644 +index 00000000..4726d74a +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S +@@ -0,0 +1,4 @@ ++#define MEMCMP __wmemcmp_evex_movbe ++#define USE_AS_WMEMCMP 1 ++ ++#include "memcmp-evex-movbe.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-17.patch b/glibc-RHEL-15696-17.patch new file mode 100644 index 0000000..3176514 --- /dev/null +++ b/glibc-RHEL-15696-17.patch @@ -0,0 +1,2568 @@ +From 7ebba91361badf7531d4e75050627a88d424872f Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 07:26:42 -0800 +Subject: [PATCH] x86-64: Add AVX optimized string/memory functions for RTM +Content-type: text/plain; charset=UTF-8 + +Since VZEROUPPER triggers RTM abort while VZEROALL won't, select AVX +optimized string/memory functions with + + xtest + jz 1f + vzeroall + ret +1: + vzeroupper + ret + +at function exit on processors with usable RTM, but without 256-bit EVEX +instructions to avoid VZEROUPPER inside a transactionally executing RTM +region. 
+--- + sysdeps/x86_64/multiarch/Makefile | 27 +++ + sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 170 ++++++++++++++++++ + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 + + sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 ++ + sysdeps/x86_64/multiarch/ifunc-memset.h | 12 ++ + sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 + + sysdeps/x86_64/multiarch/ifunc-wmemset.h | 5 + + sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/memchr-avx2.S | 45 +++-- + .../x86_64/multiarch/memcmp-avx2-movbe-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 28 ++- + .../memmove-avx-unaligned-erms-rtm.S | 17 ++ + .../multiarch/memmove-vec-unaligned-erms.S | 33 ++-- + sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/memrchr-avx2.S | 53 +++--- + .../memset-avx2-unaligned-erms-rtm.S | 10 ++ + .../multiarch/memset-avx2-unaligned-erms.S | 12 +- + .../multiarch/memset-vec-unaligned-erms.S | 41 ++--- + sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/strcat-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strcat-avx2.S | 6 +- + sysdeps/x86_64/multiarch/strchr-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strchr-avx2.S | 22 +-- + sysdeps/x86_64/multiarch/strchr.c | 4 + + sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 55 +++--- + sysdeps/x86_64/multiarch/strcmp.c | 4 + + sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strcpy-avx2.S | 85 ++++----- + sysdeps/x86_64/multiarch/strlen-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strlen-avx2.S | 43 ++--- + sysdeps/x86_64/multiarch/strncat-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/strncmp.c | 4 + + sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strrchr-avx2.S | 19 +- + sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S | 5 + + sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S | 5 + + sysdeps/x86_64/multiarch/wcsnlen.c | 4 + + sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S | 4 + + .../x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S | 4 + + sysdeps/x86_64/sysdep.h | 22 +++ + 52 files changed, 668 insertions(+), 244 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strcat-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S + create mode 100644 
sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strlen-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strncat-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S + +Conflicts: + sysdeps/x86_64/multiarch/strchr-avx2.S + (same fix, different location) + + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 9d79b138..491c7698 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -40,6 +40,25 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memset-sse2-unaligned-erms \ + memset-avx2-unaligned-erms \ + memset-avx512-unaligned-erms \ ++ memchr-avx2-rtm \ ++ memcmp-avx2-movbe-rtm \ ++ memmove-avx-unaligned-erms-rtm \ ++ memrchr-avx2-rtm \ ++ memset-avx2-unaligned-erms-rtm \ ++ rawmemchr-avx2-rtm \ ++ strchr-avx2-rtm \ ++ strcmp-avx2-rtm \ ++ strchrnul-avx2-rtm \ ++ stpcpy-avx2-rtm \ ++ stpncpy-avx2-rtm \ ++ strcat-avx2-rtm \ ++ strcpy-avx2-rtm \ ++ strlen-avx2-rtm \ ++ strncat-avx2-rtm \ ++ strncmp-avx2-rtm \ ++ strncpy-avx2-rtm \ ++ strnlen-avx2-rtm \ ++ strrchr-avx2-rtm \ + memchr-evex \ + memcmp-evex-movbe \ + memmove-evex-unaligned-erms \ +@@ -76,6 +95,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcsrchr-sse2 wcsrchr-avx2 \ + wcsnlen-sse4_1 wcsnlen-c \ + wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \ ++ wcschr-avx2-rtm \ ++ wcscmp-avx2-rtm \ ++ wcslen-avx2-rtm \ ++ wcsncmp-avx2-rtm \ ++ wcsnlen-avx2-rtm \ ++ wcsrchr-avx2-rtm \ ++ wmemchr-avx2-rtm \ ++ wmemcmp-avx2-movbe-rtm \ + wcschr-evex \ + wcscmp-evex \ + wcslen-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h +index 7081b0c9..e0f30e61 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h ++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h +@@ -21,6 +21,7 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -36,6 +37,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index c8da910e..c1efeec0 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -43,6 +43,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memchr, + CPU_FEATURE_USABLE 
(AVX2), + __memchr_avx2) ++ IFUNC_IMPL_ADD (array, i, memchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, memchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -56,6 +60,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_avx2_movbe) ++ IFUNC_IMPL_ADD (array, i, memcmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (MOVBE) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -85,6 +94,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (AVX), + __memmove_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memmove_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memmove_chk_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, __memmove_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memmove_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (AVX512VL), + __memmove_chk_evex_unaligned) +@@ -113,6 +130,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memmove, + CPU_FEATURE_USABLE (AVX), + __memmove_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memmove, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memmove_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, memmove, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memmove_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memmove, + CPU_FEATURE_USABLE (AVX512VL), + __memmove_evex_unaligned) +@@ -143,6 +168,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memrchr, + CPU_FEATURE_USABLE (AVX2), + __memrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, memrchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, memrchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -165,6 +194,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX2), + __memset_chk_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memset_chk, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memset_chk_avx2_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, __memset_chk, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memset_chk_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -198,6 +235,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memset, + CPU_FEATURE_USABLE (AVX2), + __memset_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memset, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memset_avx2_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, memset, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memset_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -222,6 +267,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, 
rawmemchr, + CPU_FEATURE_USABLE (AVX2), + __rawmemchr_avx2) ++ IFUNC_IMPL_ADD (array, i, rawmemchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __rawmemchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -234,6 +283,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strlen, + CPU_FEATURE_USABLE (AVX2), + __strlen_avx2) ++ IFUNC_IMPL_ADD (array, i, strlen, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -245,6 +298,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strnlen, + CPU_FEATURE_USABLE (AVX2), + __strnlen_avx2) ++ IFUNC_IMPL_ADD (array, i, strnlen, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strnlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -257,6 +314,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __stpncpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), + __stpncpy_avx2) ++ IFUNC_IMPL_ADD (array, i, stpncpy, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __stpncpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, stpncpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -271,6 +332,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __stpcpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), + __stpcpy_avx2) ++ IFUNC_IMPL_ADD (array, i, stpcpy, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __stpcpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, stpcpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -309,6 +374,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strcat, + IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2), + __strcat_avx2) ++ IFUNC_IMPL_ADD (array, i, strcat, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcat_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcat, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -323,6 +392,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strchr, + CPU_FEATURE_USABLE (AVX2), + __strchr_avx2) ++ IFUNC_IMPL_ADD (array, i, strchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -336,6 +409,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strchrnul, + CPU_FEATURE_USABLE (AVX2), + __strchrnul_avx2) ++ IFUNC_IMPL_ADD (array, i, strchrnul, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strchrnul_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchrnul, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -348,6 +425,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strrchr, + CPU_FEATURE_USABLE (AVX2), + __strrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, strrchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strrchr, + 
(CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -359,6 +440,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strcmp, + CPU_FEATURE_USABLE (AVX2), + __strcmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strcmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -375,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strcpy, + IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2), + __strcpy_avx2) ++ IFUNC_IMPL_ADD (array, i, strcpy, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -422,6 +511,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strncat, + IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2), + __strncat_avx2) ++ IFUNC_IMPL_ADD (array, i, strncat, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncat_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncat, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -436,6 +529,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strncpy, + IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2), + __strncpy_avx2) ++ IFUNC_IMPL_ADD (array, i, strncpy, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -469,6 +566,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcschr, + CPU_FEATURE_USABLE (AVX2), + __wcschr_avx2) ++ IFUNC_IMPL_ADD (array, i, wcschr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcschr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcschr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -481,6 +582,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsrchr, + CPU_FEATURE_USABLE (AVX2), + __wcsrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsrchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcsrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsrchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -493,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcscmp, + CPU_FEATURE_USABLE (AVX2), + __wcscmp_avx2) ++ IFUNC_IMPL_ADD (array, i, wcscmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcscmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcscmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -505,6 +614,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsncmp, + CPU_FEATURE_USABLE (AVX2), + __wcsncmp_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsncmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcsncmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsncmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -523,6 +636,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcslen, + 
CPU_FEATURE_USABLE (AVX2), + __wcslen_avx2) ++ IFUNC_IMPL_ADD (array, i, wcslen, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcslen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcslen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -535,6 +652,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsnlen, + CPU_FEATURE_USABLE (AVX2), + __wcsnlen_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsnlen, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcsnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsnlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -550,6 +671,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wmemchr, + CPU_FEATURE_USABLE (AVX2), + __wmemchr_avx2) ++ IFUNC_IMPL_ADD (array, i, wmemchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wmemchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -563,6 +688,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_avx2_movbe) ++ IFUNC_IMPL_ADD (array, i, wmemcmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (MOVBE) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wmemcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -581,6 +711,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wmemset, + CPU_FEATURE_USABLE (AVX2), + __wmemset_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, wmemset, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wmemset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, wmemset, + CPU_FEATURE_USABLE (AVX512VL), + __wmemset_evex_unaligned) +@@ -606,6 +740,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (AVX), + __memcpy_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memcpy_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcpy_chk_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, __memcpy_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcpy_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (AVX512VL), + __memcpy_chk_evex_unaligned) +@@ -634,6 +776,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memcpy, + CPU_FEATURE_USABLE (AVX), + __memcpy_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memcpy, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcpy_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, memcpy, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcpy_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memcpy, + CPU_FEATURE_USABLE (AVX512VL), + __memcpy_evex_unaligned) +@@ -676,6 +826,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (AVX), + __mempcpy_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __mempcpy_chk_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ 
__mempcpy_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_chk_evex_unaligned) +@@ -713,6 +871,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, mempcpy, + CPU_FEATURE_USABLE (AVX), + __mempcpy_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, mempcpy, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __mempcpy_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, mempcpy, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __mempcpy_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, mempcpy, + CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_evex_unaligned) +@@ -734,6 +900,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strncmp, + CPU_FEATURE_USABLE (AVX2), + __strncmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strncmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 3ca1f0a6..8043c635 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -23,6 +23,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; + + static inline void * +@@ -38,6 +39,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex_movbe); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_movbe_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2_movbe); + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h +index 6f8bce5f..fa09b9fb 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h +@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) + attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) +@@ -71,6 +75,14 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (evex_unaligned); + } + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx_unaligned_erms_rtm); ++ ++ return OPTIMIZE (avx_unaligned_rtm); ++ } ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 6f31f4dc..6f3375cc 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h 
+@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms) + attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) +@@ -69,6 +73,14 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (evex_unaligned); + } + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx2_unaligned_erms_rtm); ++ ++ return OPTIMIZE (avx2_unaligned_rtm); ++ } ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h +index deae6348..a924762e 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h +@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -39,6 +40,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +index 9290c4bf..bdc94c6c 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +@@ -20,6 +20,8 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; + +@@ -39,6 +41,9 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) + return OPTIMIZE (evex_unaligned); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_unaligned_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2_unaligned); + } +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +new file mode 100644 +index 00000000..87b076c7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef MEMCHR ++# define MEMCHR __memchr_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "memchr-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S +index c81da19b..cf893e77 100644 +--- a/sysdeps/x86_64/multiarch/memchr-avx2.S ++++ 
b/sysdeps/x86_64/multiarch/memchr-avx2.S +@@ -34,9 +34,13 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (MEMCHR) + # ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ +@@ -107,8 +111,8 @@ L(cros_page_boundary): + # endif + addq %rdi, %rax + addq %rcx, %rax +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(aligned_more): +@@ -224,8 +228,7 @@ L(last_4x_vec_or_less): + + jnz L(first_vec_x3_check) + xorl %eax, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_2x_vec): +@@ -243,8 +246,7 @@ L(last_2x_vec): + testl %eax, %eax + jnz L(first_vec_x1_check) + xorl %eax, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x0_check): +@@ -253,8 +255,7 @@ L(first_vec_x0_check): + cmpq %rax, %rdx + jbe L(zero) + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1_check): +@@ -264,8 +265,7 @@ L(first_vec_x1_check): + jbe L(zero) + addq $VEC_SIZE, %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2_check): +@@ -275,8 +275,7 @@ L(first_vec_x2_check): + jbe L(zero) + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x3_check): +@@ -286,12 +285,14 @@ L(first_vec_x3_check): + jbe L(zero) + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(zero): +- VZEROUPPER ++ xorl %eax, %eax ++ jmp L(return_vzeroupper) ++ ++ .p2align 4 + L(null): + xorl %eax, %eax + ret +@@ -301,24 +302,21 @@ L(null): + L(first_vec_x0): + tzcntl %eax, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1): + tzcntl %eax, %eax + addq $VEC_SIZE, %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2): + tzcntl %eax, %eax + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(4x_vec_end): +@@ -337,8 +335,7 @@ L(first_vec_x3): + tzcntl %eax, %eax + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + END (MEMCHR) + #endif +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S +new file mode 100644 +index 00000000..cf4eff5d +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef MEMCMP ++# define MEMCMP __memcmp_avx2_movbe_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "memcmp-avx2-movbe.S" +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index e3a35b89..9d5c9c72 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -47,6 +47,10 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + # define VEC_MASK ((1 << VEC_SIZE) - 1) + +@@ -55,7 +59,7 @@ + memcmp has to use UNSIGNED comparison for elemnts. 
+ */ + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (MEMCMP) + # ifdef USE_AS_WMEMCMP + shl $2, %RDX_LP +@@ -123,8 +127,8 @@ ENTRY (MEMCMP) + vptest %ymm0, %ymm5 + jnc L(4x_vec_end) + xorl %eax, %eax +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(last_2x_vec): +@@ -144,8 +148,7 @@ L(last_vec): + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec): +@@ -164,8 +167,7 @@ L(wmemcmp_return): + movzbl (%rsi, %rcx), %edx + sub %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifdef USE_AS_WMEMCMP + .p2align 4 +@@ -367,8 +369,7 @@ L(last_4x_vec): + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(4x_vec_end): +@@ -394,8 +395,7 @@ L(4x_vec_end): + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx + sub %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1): +@@ -410,8 +410,7 @@ L(first_vec_x1): + movzbl VEC_SIZE(%rsi, %rcx), %edx + sub %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2): +@@ -426,7 +425,6 @@ L(first_vec_x2): + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx + sub %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + END (MEMCMP) + #endif +diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S +new file mode 100644 +index 00000000..1ec1962e +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S +@@ -0,0 +1,17 @@ ++#if IS_IN (libc) ++# define VEC_SIZE 32 ++# define VEC(i) ymm##i ++# define VMOVNT vmovntdq ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa ++ ++# define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++# define VZEROUPPER_RETURN jmp L(return) ++ ++# define SECTION(p) p##.avx.rtm ++# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm ++ ++# include "memmove-vec-unaligned-erms.S" ++#endif +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 08e21692..71f5954d 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -140,11 +140,12 @@ L(last_2x_vec): + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) +- VZEROUPPER + #if !defined USE_MULTIARCH || !IS_IN (libc) + L(nop): +-#endif + ret ++#else ++ VZEROUPPER_RETURN ++#endif + #if defined USE_MULTIARCH && IS_IN (libc) + END (MEMMOVE_SYMBOL (__memmove, unaligned)) + +@@ -237,8 +238,11 @@ L(last_2x_vec): + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + L(return): +- VZEROUPPER ++#if VEC_SIZE > 16 ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++#else + ret ++#endif + + L(movsb): + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx +@@ -289,8 +293,7 @@ L(between_32_63): + VMOVU -32(%rsi,%rdx), %YMM1 + VMOVU %YMM0, (%rdi) + VMOVU %YMM1, -32(%rdi,%rdx) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + #endif + #if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ +@@ -299,7 +302,7 @@ L(between_16_31): + VMOVU -16(%rsi,%rdx), %XMM1 + VMOVU %XMM0, (%rdi) + VMOVU %XMM1, -16(%rdi,%rdx) +- ret ++ VZEROUPPER_RETURN + #endif + L(between_8_15): + /* From 8 to 15. No branch when size == 8. 
*/ +@@ -352,8 +355,7 @@ L(more_2x_vec): + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + L(last_4x_vec): + /* Copy from 2 * VEC to 4 * VEC. */ + VMOVU (%rsi), %VEC(0) +@@ -364,8 +366,7 @@ L(last_4x_vec): + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + L(more_8x_vec): + cmpq %rsi, %rdi +@@ -421,8 +422,7 @@ L(loop_4x_vec_forward): + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + L(more_8x_vec_backward): + /* Load the first 4 * VEC and last VEC to support overlapping +@@ -473,8 +473,7 @@ L(loop_4x_vec_backward): + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + L(large_forward): +@@ -509,8 +508,7 @@ L(loop_large_forward): + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + L(large_backward): + /* Don't use non-temporal store if there is overlap between +@@ -544,8 +542,7 @@ L(loop_large_backward): + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + #endif + END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + +diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +new file mode 100644 +index 00000000..cea2d2a7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef MEMRCHR ++# define MEMRCHR __memrchr_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "memrchr-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S +index ce488dd9..20efe7ac 100644 +--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S +@@ -20,14 +20,22 @@ + + # include + ++# ifndef MEMRCHR ++# define MEMRCHR __memrchr_avx2 ++# endif ++ + # ifndef VZEROUPPER + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits +-ENTRY (__memrchr_avx2) ++ .section SECTION(.text),"ax",@progbits ++ENTRY (MEMRCHR) + /* Broadcast CHAR to YMM0. 
*/ + vmovd %esi, %xmm0 + vpbroadcastb %xmm0, %ymm0 +@@ -134,8 +142,8 @@ L(loop_4x_vec): + vpmovmskb %ymm1, %eax + bsrl %eax, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(last_4x_vec_or_less): +@@ -169,8 +177,7 @@ L(last_4x_vec_or_less): + addq %rax, %rdx + jl L(zero) + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_2x_vec): +@@ -191,31 +198,27 @@ L(last_2x_vec): + jl L(zero) + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x0): + bsrl %eax, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x1): + bsrl %eax, %eax + addl $VEC_SIZE, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x2): + bsrl %eax, %eax + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x3): +@@ -232,8 +235,7 @@ L(last_vec_x1_check): + jl L(zero) + addl $VEC_SIZE, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x3_check): +@@ -243,12 +245,14 @@ L(last_vec_x3_check): + jl L(zero) + addl $(VEC_SIZE * 3), %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(zero): +- VZEROUPPER ++ xorl %eax, %eax ++ VZEROUPPER_RETURN ++ ++ .p2align 4 + L(null): + xorl %eax, %eax + ret +@@ -273,8 +277,7 @@ L(last_vec_or_less_aligned): + + bsrl %eax, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_or_less): +@@ -315,8 +318,7 @@ L(last_vec_or_less): + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_2x_aligned): +@@ -353,7 +355,6 @@ L(last_vec_2x_aligned): + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax +- VZEROUPPER +- ret +-END (__memrchr_avx2) ++ VZEROUPPER_RETURN ++END (MEMRCHR) + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +new file mode 100644 +index 00000000..8ac3e479 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +@@ -0,0 +1,10 @@ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return) ++ ++#define SECTION(p) p##.avx.rtm ++#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm ++#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm ++ ++#include "memset-avx2-unaligned-erms.S" +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index 7ab3d898..ae0860f3 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -14,9 +14,15 @@ + movq r, %rax; \ + vpbroadcastd %xmm0, %ymm0 + +-# define SECTION(p) p##.avx +-# define MEMSET_SYMBOL(p,s) p##_avx2_##s +-# define WMEMSET_SYMBOL(p,s) p##_avx2_##s ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++# ifndef MEMSET_SYMBOL ++# define MEMSET_SYMBOL(p,s) p##_avx2_##s ++# endif ++# ifndef WMEMSET_SYMBOL ++# define WMEMSET_SYMBOL(p,s) p##_avx2_##s ++# endif + + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 71e91a8f..bae5cba4 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ 
b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -45,17 +45,14 @@ + #ifndef VZEROUPPER + # if VEC_SIZE > 16 + # define VZEROUPPER vzeroupper ++# define VZEROUPPER_SHORT_RETURN vzeroupper; ret + # else + # define VZEROUPPER + # endif + #endif + + #ifndef VZEROUPPER_SHORT_RETURN +-# if VEC_SIZE > 16 +-# define VZEROUPPER_SHORT_RETURN vzeroupper +-# else +-# define VZEROUPPER_SHORT_RETURN rep +-# endif ++# define VZEROUPPER_SHORT_RETURN rep; ret + #endif + + #ifndef MOVQ +@@ -117,8 +114,7 @@ L(entry_from_bzero): + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + #if defined USE_MULTIARCH && IS_IN (libc) + END (MEMSET_SYMBOL (__memset, unaligned)) + +@@ -141,14 +137,12 @@ ENTRY (__memset_erms) + ENTRY (MEMSET_SYMBOL (__memset, erms)) + # endif + L(stosb): +- /* Issue vzeroupper before rep stosb. */ +- VZEROUPPER + mov %RDX_LP, %RCX_LP + movzbl %sil, %eax + mov %RDI_LP, %RDX_LP + rep stosb + mov %RDX_LP, %RAX_LP +- ret ++ VZEROUPPER_RETURN + # if VEC_SIZE == 16 + END (__memset_erms) + # else +@@ -175,8 +169,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + L(stosb_more_2x_vec): + cmp __x86_rep_stosb_threshold(%rip), %RDX_LP +@@ -190,8 +183,11 @@ L(more_2x_vec): + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) + L(return): +- VZEROUPPER ++#if VEC_SIZE > 16 ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++#else + ret ++#endif + + L(loop_start): + leaq (VEC_SIZE * 4)(%rdi), %rcx +@@ -217,7 +213,6 @@ L(loop): + cmpq %rcx, %rdx + jne L(loop) + VZEROUPPER_SHORT_RETURN +- ret + L(less_vec): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +@@ -241,40 +236,34 @@ L(less_vec): + jb 1f + movb %cl, (%rdi) + 1: +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # if VEC_SIZE > 32 + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): + VMOVU %YMM0, -32(%rdi,%rdx) + VMOVU %YMM0, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # endif + # if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ + L(between_16_31): + VMOVU %XMM0, -16(%rdi,%rdx) + VMOVU %XMM0, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # endif + /* From 8 to 15. No branch when size == 8. */ + L(between_8_15): + movq %rcx, -8(%rdi,%rdx) + movq %rcx, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ + movl %ecx, -4(%rdi,%rdx) + movl %ecx, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + L(between_2_3): + /* From 2 to 3. No branch when size == 2. 
*/ + movw %cx, -2(%rdi,%rdx) + movw %cx, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S +new file mode 100644 +index 00000000..acc5f6e2 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define MEMCHR __rawmemchr_avx2_rtm ++#define USE_AS_RAWMEMCHR 1 ++ ++#include "memchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S +new file mode 100644 +index 00000000..2b9c07a5 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STPCPY ++#define STRCPY __stpcpy_avx2_rtm ++#include "strcpy-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S +new file mode 100644 +index 00000000..60a2ccfe +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define USE_AS_STPCPY ++#define USE_AS_STRNCPY ++#define STRCPY __stpncpy_avx2_rtm ++#include "strcpy-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S +new file mode 100644 +index 00000000..637fb557 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRCAT ++# define STRCAT __strcat_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcat-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S +index b0623564..aa48c058 100644 +--- a/sysdeps/x86_64/multiarch/strcat-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcat-avx2.S +@@ -30,7 +30,11 @@ + /* Number of bytes in a vector register */ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRCAT) + mov %rdi, %r9 + # ifdef USE_AS_STRNCAT +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S +new file mode 100644 +index 00000000..81f20d1d +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRCHR ++# define STRCHR __strchr_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strchr-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index 47bc3c99..da7d2620 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -38,9 +38,13 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRCHR) + movl %edi, %ecx + /* Broadcast CHAR to YMM0. 
*/ +@@ -93,8 +97,8 @@ L(cros_page_boundary): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(aligned_more): +@@ -190,8 +194,7 @@ L(first_vec_x0): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1): +@@ -205,8 +208,7 @@ L(first_vec_x1): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2): +@@ -220,8 +222,7 @@ L(first_vec_x2): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(4x_vec_end): +@@ -247,8 +248,7 @@ L(first_vec_x3): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + END (STRCHR) + #endif +diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c +index be05e197..7e582f02 100644 +--- a/sysdeps/x86_64/multiarch/strchr.c ++++ b/sysdeps/x86_64/multiarch/strchr.c +@@ -29,6 +29,7 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S +new file mode 100644 +index 00000000..cdcf818b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define STRCHR __strchrnul_avx2_rtm ++#define USE_AS_STRCHRNUL 1 ++#include "strchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S +new file mode 100644 +index 00000000..aecd30d9 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRCMP ++# define STRCMP __strcmp_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcmp-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 8fb8eedc..5d1c9d90 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -55,6 +55,10 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. + strcmp/strncmp have to use UNSIGNED comparison for elements. +@@ -75,7 +79,7 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRCMP) + # ifdef USE_AS_STRNCMP + /* Check for simple cases (0 or 1) in offset. 
*/ +@@ -137,8 +141,8 @@ L(return): + movzbl (%rsi, %rdx), %edx + subl %edx, %eax + # endif +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(return_vec_size): +@@ -171,8 +175,7 @@ L(return_vec_size): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(return_2_vec_size): +@@ -205,8 +208,7 @@ L(return_2_vec_size): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(return_3_vec_size): +@@ -239,8 +241,7 @@ L(return_3_vec_size): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(next_3_vectors): +@@ -366,8 +367,7 @@ L(back_to_loop): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(test_vec): +@@ -410,8 +410,7 @@ L(test_vec): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(test_2_vec): +@@ -454,8 +453,7 @@ L(test_2_vec): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(test_3_vec): +@@ -496,8 +494,7 @@ L(test_3_vec): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(loop_cross_page): +@@ -566,8 +563,7 @@ L(loop_cross_page): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(loop_cross_page_2_vec): +@@ -641,8 +637,7 @@ L(loop_cross_page_2_vec): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNCMP + L(string_nbyte_offset_check): +@@ -684,8 +679,7 @@ L(cross_page_loop): + # ifndef USE_AS_WCSCMP + L(different): + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifdef USE_AS_WCSCMP + .p2align 4 +@@ -695,16 +689,14 @@ L(different): + setl %al + negl %eax + orl $1, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # endif + + # ifdef USE_AS_STRNCMP + .p2align 4 + L(zero): + xorl %eax, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(char0): +@@ -718,8 +710,7 @@ L(char0): + movzbl (%rdi), %eax + subl %ecx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # endif + + .p2align 4 +@@ -744,8 +735,7 @@ L(last_vector): + movzbl (%rsi, %rdx), %edx + subl %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + /* Comparing on page boundary region requires special treatment: + It must done one vector at the time, starting with the wider +@@ -866,7 +856,6 @@ L(cross_page_4bytes): + testl %eax, %eax + jne L(cross_page_loop) + subl %ecx, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + END (STRCMP) + #endif +diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c +index c5f38510..11bbea2b 100644 +--- a/sysdeps/x86_64/multiarch/strcmp.c ++++ b/sysdeps/x86_64/multiarch/strcmp.c +@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if 
(!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S +new file mode 100644 +index 00000000..c2c581ec +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRCPY ++# define STRCPY __strcpy_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcpy-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S +index 81677f90..613c59aa 100644 +--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S +@@ -37,6 +37,10 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + /* zero register */ + #define xmmZ xmm0 + #define ymmZ ymm0 +@@ -46,7 +50,7 @@ + + # ifndef USE_AS_STRCAT + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRCPY) + # ifdef USE_AS_STRNCPY + mov %rdx, %r8 +@@ -369,8 +373,8 @@ L(CopyVecSizeExit): + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(CopyTwoVecSize1): +@@ -553,8 +557,7 @@ L(Exit1): + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit2): +@@ -569,8 +572,7 @@ L(Exit2): + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit3): +@@ -584,8 +586,7 @@ L(Exit3): + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit4_7): +@@ -602,8 +603,7 @@ L(Exit4_7): + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit8_15): +@@ -620,8 +620,7 @@ L(Exit8_15): + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit16_31): +@@ -638,8 +637,7 @@ L(Exit16_31): + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit32_63): +@@ -656,8 +654,7 @@ L(Exit32_63): + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNCPY + +@@ -671,8 +668,7 @@ L(StrncpyExit1): + # ifdef USE_AS_STRCAT + movb $0, 1(%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit2): +@@ -684,8 +680,7 @@ L(StrncpyExit2): + # ifdef USE_AS_STRCAT + movb $0, 2(%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit3_4): +@@ -699,8 +694,7 @@ L(StrncpyExit3_4): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit5_8): +@@ -714,8 +708,7 @@ L(StrncpyExit5_8): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit9_16): +@@ -729,8 +722,7 @@ L(StrncpyExit9_16): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit17_32): +@@ -744,8 +736,7 @@ L(StrncpyExit17_32): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, 
%r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit33_64): +@@ -760,8 +751,7 @@ L(StrncpyExit33_64): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit65): +@@ -778,50 +768,43 @@ L(StrncpyExit65): + # ifdef USE_AS_STRCAT + movb $0, 65(%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifndef USE_AS_STRCAT + + .p2align 4 + L(Fill1): + mov %dl, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill2): + mov %dx, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill3_4): + mov %dx, (%rdi) + mov %dx, -2(%rdi, %r8) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill5_8): + mov %edx, (%rdi) + mov %edx, -4(%rdi, %r8) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill9_16): + mov %rdx, (%rdi) + mov %rdx, -8(%rdi, %r8) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill17_32): + vmovdqu %xmmZ, (%rdi) + vmovdqu %xmmZ, -16(%rdi, %r8) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(CopyVecSizeUnalignedVec2): +@@ -898,8 +881,7 @@ L(Fill): + cmp $1, %r8d + ja L(Fill2) + je L(Fill1) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + /* end of ifndef USE_AS_STRCAT */ + # endif +@@ -929,8 +911,7 @@ L(UnalignedFourVecSizeLeaveCase3): + # ifdef USE_AS_STRCAT + movb $0, (VEC_SIZE * 4)(%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(UnalignedFourVecSizeLeaveCase2): +@@ -1001,16 +982,14 @@ L(StrncpyExit): + # ifdef USE_AS_STRCAT + movb $0, (%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(ExitZero): + # ifndef USE_AS_STRCAT + mov %rdi, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # endif + +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S +new file mode 100644 +index 00000000..75b4b761 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRLEN ++# define STRLEN __strlen_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strlen-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S +index 645e0446..82826e10 100644 +--- a/sysdeps/x86_64/multiarch/strlen-avx2.S ++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S +@@ -36,9 +36,13 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN + /* Check for zero length. 
*/ +@@ -111,8 +115,8 @@ L(cros_page_boundary): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(aligned_more): +@@ -231,8 +235,7 @@ L(last_4x_vec_or_less): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_2x_vec): +@@ -253,8 +256,7 @@ L(last_2x_vec): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x0_check): +@@ -267,8 +269,7 @@ L(first_vec_x0_check): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1_check): +@@ -282,8 +283,7 @@ L(first_vec_x1_check): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2_check): +@@ -297,8 +297,7 @@ L(first_vec_x2_check): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x3_check): +@@ -312,8 +311,7 @@ L(first_vec_x3_check): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(max): +@@ -321,8 +319,7 @@ L(max): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(zero): +@@ -338,8 +335,7 @@ L(first_vec_x0): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1): +@@ -350,8 +346,7 @@ L(first_vec_x1): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2): +@@ -362,8 +357,7 @@ L(first_vec_x2): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(4x_vec_end): +@@ -389,8 +383,7 @@ L(first_vec_x3): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + END (STRLEN) + #endif +diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S +new file mode 100644 +index 00000000..0dcea18d +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STRNCAT ++#define STRCAT __strncat_avx2_rtm ++#include "strcat-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +new file mode 100644 +index 00000000..37d1224b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define STRCMP __strncmp_avx2_rtm ++#define USE_AS_STRNCMP 1 ++#include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c +index 4c15542f..44c85116 100644 +--- a/sysdeps/x86_64/multiarch/strncmp.c ++++ b/sysdeps/x86_64/multiarch/strncmp.c +@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE 
(avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S +new file mode 100644 +index 00000000..79e70832 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STRNCPY ++#define STRCPY __strncpy_avx2_rtm ++#include "strcpy-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S +new file mode 100644 +index 00000000..04f1626a +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define STRLEN __strnlen_avx2_rtm ++#define USE_AS_STRNLEN 1 ++ ++#include "strlen-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S +new file mode 100644 +index 00000000..5def14ec +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRRCHR ++# define STRRCHR __strrchr_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strrchr-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S +index 4381e6ab..9f22a15e 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S +@@ -36,9 +36,13 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRRCHR) + movd %esi, %xmm4 + movl %edi, %ecx +@@ -166,8 +170,8 @@ L(return_value): + # endif + bsrl %eax, %eax + leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(match): +@@ -198,8 +202,7 @@ L(find_nul): + jz L(return_value) + bsrl %eax, %eax + leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(char_and_nul): +@@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec): + jz L(return_null) + bsrl %eax, %eax + leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(return_null): + xorl %eax, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + END (STRRCHR) + #endif +diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S +new file mode 100644 +index 00000000..d49dbbf0 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define STRCHR __wcschr_avx2_rtm ++#define USE_AS_WCSCHR 1 ++#include "strchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S +new file mode 100644 +index 00000000..d6ca2b80 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define STRCMP __wcscmp_avx2_rtm ++#define USE_AS_WCSCMP 1 ++ ++#include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S +new file mode 100644 +index 00000000..35658d73 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define STRLEN __wcslen_avx2_rtm ++#define USE_AS_WCSLEN 1 ++ ++#include "strlen-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S 
+new file mode 100644 +index 00000000..4e88c70c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S +@@ -0,0 +1,5 @@ ++#define STRCMP __wcsncmp_avx2_rtm ++#define USE_AS_STRNCMP 1 ++#define USE_AS_WCSCMP 1 ++ ++#include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S +new file mode 100644 +index 00000000..7437ebee +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S +@@ -0,0 +1,5 @@ ++#define STRLEN __wcsnlen_avx2_rtm ++#define USE_AS_WCSLEN 1 ++#define USE_AS_STRNLEN 1 ++ ++#include "strlen-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c +index 84254b83..20b731ae 100644 +--- a/sysdeps/x86_64/multiarch/wcsnlen.c ++++ b/sysdeps/x86_64/multiarch/wcsnlen.c +@@ -29,6 +29,7 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S +new file mode 100644 +index 00000000..9bf76083 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define STRRCHR __wcsrchr_avx2_rtm ++#define USE_AS_WCSRCHR 1 ++#include "strrchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S +new file mode 100644 +index 00000000..58ed21db +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define MEMCHR __wmemchr_avx2_rtm ++#define USE_AS_WMEMCHR 1 ++ ++#include "memchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S +new file mode 100644 +index 00000000..31104d12 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S +@@ -0,0 +1,4 @@ ++#define MEMCMP __wmemcmp_avx2_movbe_rtm ++#define USE_AS_WMEMCMP 1 ++ ++#include "memcmp-avx2-movbe-rtm.S" +diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h +index 1738d7f9..223f1a59 100644 +--- a/sysdeps/x86_64/sysdep.h ++++ b/sysdeps/x86_64/sysdep.h +@@ -95,6 +95,28 @@ lose: \ + #define R14_LP r14 + #define R15_LP r15 + ++/* Zero upper vector registers and return with xtest. NB: Use VZEROALL ++ to avoid RTM abort triggered by VZEROUPPER inside transactionally. */ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \ ++ xtest; \ ++ jz 1f; \ ++ vzeroall; \ ++ ret; \ ++1: \ ++ vzeroupper; \ ++ ret ++ ++/* Zero upper vector registers and return. */ ++#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN ++# define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ VZEROUPPER; \ ++ ret ++#endif ++ ++#ifndef VZEROUPPER_RETURN ++# define VZEROUPPER_RETURN VZEROUPPER; ret ++#endif ++ + #else /* __ASSEMBLER__ */ + + /* Long and pointer size in bytes. 
*/ +-- +GitLab + diff --git a/glibc-RHEL-15696-18.patch b/glibc-RHEL-15696-18.patch new file mode 100644 index 0000000..2cf0e45 --- /dev/null +++ b/glibc-RHEL-15696-18.patch @@ -0,0 +1,735 @@ +From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Tue, 23 Feb 2021 06:33:10 -0800 +Subject: [PATCH] x86: Add string/memory function tests in RTM region +Content-type: text/plain; charset=UTF-8 + +At function exit, AVX optimized string/memory functions have VZEROUPPER +which triggers RTM abort. When such functions are called inside a +transactionally executing RTM region, RTM abort causes severe performance +degradation. Add tests to verify that string/memory functions won't +cause RTM abort in RTM region. +--- + sysdeps/x86/Makefile | 23 +++++++++++ + sysdeps/x86/tst-memchr-rtm.c | 54 ++++++++++++++++++++++++++ + sysdeps/x86/tst-memcmp-rtm.c | 52 +++++++++++++++++++++++++ + sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++ + sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++ + sysdeps/x86/tst-memset-rtm.c | 45 ++++++++++++++++++++++ + sysdeps/x86/tst-strchr-rtm.c | 54 ++++++++++++++++++++++++++ + sysdeps/x86/tst-strcpy-rtm.c | 53 ++++++++++++++++++++++++++ + sysdeps/x86/tst-string-rtm.h | 72 +++++++++++++++++++++++++++++++++++ + sysdeps/x86/tst-strlen-rtm.c | 53 ++++++++++++++++++++++++++ + sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++ + sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++ + 12 files changed, 618 insertions(+) + create mode 100644 sysdeps/x86/tst-memchr-rtm.c + create mode 100644 sysdeps/x86/tst-memcmp-rtm.c + create mode 100644 sysdeps/x86/tst-memmove-rtm.c + create mode 100644 sysdeps/x86/tst-memrchr-rtm.c + create mode 100644 sysdeps/x86/tst-memset-rtm.c + create mode 100644 sysdeps/x86/tst-strchr-rtm.c + create mode 100644 sysdeps/x86/tst-strcpy-rtm.c + create mode 100644 sysdeps/x86/tst-string-rtm.h + create mode 100644 sysdeps/x86/tst-strlen-rtm.c + create mode 100644 sysdeps/x86/tst-strncmp-rtm.c + create mode 100644 sysdeps/x86/tst-strrchr-rtm.c + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 59e928e9..5be71ada 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -17,6 +17,29 @@ endif + + ifeq ($(subdir),string) + sysdep_routines += cacheinfo ++ ++tests += \ ++ tst-memchr-rtm \ ++ tst-memcmp-rtm \ ++ tst-memmove-rtm \ ++ tst-memrchr-rtm \ ++ tst-memset-rtm \ ++ tst-strchr-rtm \ ++ tst-strcpy-rtm \ ++ tst-strlen-rtm \ ++ tst-strncmp-rtm \ ++ tst-strrchr-rtm ++ ++CFLAGS-tst-memchr-rtm.c += -mrtm ++CFLAGS-tst-memcmp-rtm.c += -mrtm ++CFLAGS-tst-memmove-rtm.c += -mrtm ++CFLAGS-tst-memrchr-rtm.c += -mrtm ++CFLAGS-tst-memset-rtm.c += -mrtm ++CFLAGS-tst-strchr-rtm.c += -mrtm ++CFLAGS-tst-strcpy-rtm.c += -mrtm ++CFLAGS-tst-strlen-rtm.c += -mrtm ++CFLAGS-tst-strncmp-rtm.c += -mrtm ++CFLAGS-tst-strrchr-rtm.c += -mrtm + endif + + ifneq ($(enable-cet),no) +diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c +new file mode 100644 +index 00000000..e4749401 +--- /dev/null ++++ b/sysdeps/x86/tst-memchr-rtm.c +@@ -0,0 +1,54 @@ ++/* Test case for memchr inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ string1[100] = 'c'; ++ string1[STRING_SIZE - 100] = 'c'; ++ char *p = memchr (string1, 'c', STRING_SIZE); ++ if (p == &string1[100]) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ char *p = memchr (string1, 'c', STRING_SIZE); ++ if (p == &string1[100]) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memchr", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c +new file mode 100644 +index 00000000..e4c8a623 +--- /dev/null ++++ b/sysdeps/x86/tst-memcmp-rtm.c +@@ -0,0 +1,52 @@ ++/* Test case for memcmp inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++char string2[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ memset (string2, 'a', STRING_SIZE); ++ if (memcmp (string1, string2, STRING_SIZE) == 0) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ if (memcmp (string1, string2, STRING_SIZE) == 0) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memcmp", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c +new file mode 100644 +index 00000000..4bf97ef1 +--- /dev/null ++++ b/sysdeps/x86/tst-memmove-rtm.c +@@ -0,0 +1,53 @@ ++/* Test case for memmove inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++char string2[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ if (memmove (string2, string1, STRING_SIZE) == string2 ++ && memcmp (string2, string1, STRING_SIZE) == 0) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ if (memmove (string2, string1, STRING_SIZE) == string2 ++ && memcmp (string2, string1, STRING_SIZE) == 0) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memmove", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c +new file mode 100644 +index 00000000..a57a5a8e +--- /dev/null ++++ b/sysdeps/x86/tst-memrchr-rtm.c +@@ -0,0 +1,54 @@ ++/* Test case for memrchr inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ string1[100] = 'c'; ++ string1[STRING_SIZE - 100] = 'c'; ++ char *p = memrchr (string1, 'c', STRING_SIZE); ++ if (p == &string1[STRING_SIZE - 100]) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ char *p = memrchr (string1, 'c', STRING_SIZE); ++ if (p == &string1[STRING_SIZE - 100]) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memrchr", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c +new file mode 100644 +index 00000000..bf343a4d +--- /dev/null ++++ b/sysdeps/x86/tst-memset-rtm.c +@@ -0,0 +1,45 @@ ++/* Test case for memset inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ return EXIT_SUCCESS; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ return 0; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memset", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c +new file mode 100644 +index 00000000..a82e29c0 +--- /dev/null ++++ b/sysdeps/x86/tst-strchr-rtm.c +@@ -0,0 +1,54 @@ ++/* Test case for strchr inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ string1[100] = 'c'; ++ string1[STRING_SIZE - 100] = 'c'; ++ char *p = strchr (string1, 'c'); ++ if (p == &string1[100]) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ char *p = strchr (string1, 'c'); ++ if (p == &string1[100]) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strchr", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c +new file mode 100644 +index 00000000..2b2a583f +--- /dev/null ++++ b/sysdeps/x86/tst-strcpy-rtm.c +@@ -0,0 +1,53 @@ ++/* Test case for strcpy inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++char string2[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ if (strcpy (string2, string1) == string2 ++ && strcmp (string2, string1) == 0) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ if (strcpy (string2, string1) == string2 ++ && strcmp (string2, string1) == 0) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strcpy", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h +new file mode 100644 +index 00000000..d2470afa +--- /dev/null ++++ b/sysdeps/x86/tst-string-rtm.h +@@ -0,0 +1,72 @@ ++/* Test string function in a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++static int ++do_test_1 (const char *name, unsigned int loop, int (*prepare) (void), ++ int (*function) (void)) ++{ ++ if (!CPU_FEATURE_USABLE (RTM)) ++ return EXIT_UNSUPPORTED; ++ ++ int status = prepare (); ++ if (status != EXIT_SUCCESS) ++ return status; ++ ++ unsigned int i; ++ unsigned int naborts = 0; ++ unsigned int failed = 0; ++ for (i = 0; i < loop; i++) ++ { ++ failed |= function (); ++ if (_xbegin() == _XBEGIN_STARTED) ++ { ++ failed |= function (); ++ _xend(); ++ } ++ else ++ { ++ failed |= function (); ++ ++naborts; ++ } ++ } ++ ++ if (failed) ++ FAIL_EXIT1 ("%s() failed", name); ++ ++ if (naborts) ++ { ++ /* NB: Low single digit (<= 5%) noise-level aborts are normal for ++ TSX. */ ++ double rate = 100 * ((double) naborts) / ((double) loop); ++ if (rate > 5) ++ FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)", ++ rate, naborts, loop); ++ } ++ ++ return EXIT_SUCCESS; ++} ++ ++static int do_test (void); ++ ++#include +diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c +new file mode 100644 +index 00000000..0dcf14db +--- /dev/null ++++ b/sysdeps/x86/tst-strlen-rtm.c +@@ -0,0 +1,53 @@ ++/* Test case for strlen inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ string1[STRING_SIZE - 100] = '\0'; ++ size_t len = strlen (string1); ++ if (len == STRING_SIZE - 100) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ size_t len = strlen (string1); ++ if (len == STRING_SIZE - 100) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strlen", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +new file mode 100644 +index 00000000..236ad951 +--- /dev/null ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -0,0 +1,52 @@ ++/* Test case for strncmp inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++char string2[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ memset (string2, 'a', STRING_SIZE - 1); ++ if (strncmp (string1, string2, STRING_SIZE) == 0) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ if (strncmp (string1, string2, STRING_SIZE) == 0) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strncmp", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c +new file mode 100644 +index 00000000..e32bfaf5 +--- /dev/null ++++ b/sysdeps/x86/tst-strrchr-rtm.c +@@ -0,0 +1,53 @@ ++/* Test case for strrchr inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ string1[STRING_SIZE - 100] = 'c'; ++ char *p = strrchr (string1, 'c'); ++ if (p == &string1[STRING_SIZE - 100]) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ char *p = strrchr (string1, 'c'); ++ if (p == &string1[STRING_SIZE - 100]) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strrchr", LOOP, prepare, function); ++} +-- +GitLab + diff --git a/glibc-RHEL-15696-19.patch b/glibc-RHEL-15696-19.patch new file mode 100644 index 0000000..0500875 --- /dev/null +++ b/glibc-RHEL-15696-19.patch @@ -0,0 +1,148 @@ +From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sun, 7 Mar 2021 09:44:18 -0800 +Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized +with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort +with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at +function exit. 
+--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 14 +++++++++----- + sysdeps/x86_64/multiarch/ifunc-memset.h | 13 ++++++++----- + sysdeps/x86_64/multiarch/ifunc-wmemset.h | 12 ++++++------ + .../multiarch/memset-avx512-unaligned-erms.S | 16 ++++++++-------- + 4 files changed, 31 insertions(+), 24 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index c1efeec0..d969a156 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW)), + __memset_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), + __memset_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), + __memset_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX512F), +@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW)), + __memset_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), + __memset_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), + __memset_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + CPU_FEATURE_USABLE (AVX512F), +@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512VL), + __wmemset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __wmemset_avx512_unaligned)) + + #ifdef SHARED +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 6f3375cc..19795938 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h +@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + { +- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) +- return OPTIMIZE (avx512_no_vzeroupper); ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx512_unaligned_erms); + +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE (avx512_unaligned_erms); ++ return OPTIMIZE (avx512_unaligned); ++ } + +- return OPTIMIZE (avx512_unaligned); ++ return OPTIMIZE (avx512_no_vzeroupper); + } + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +index bdc94c6c..98c5d406 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) +- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) +- && !CPU_FEATURES_ARCH_P 
(cpu_features, Prefer_No_VZEROUPPER)) +- return OPTIMIZE (avx512_unaligned); +- + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) +- return OPTIMIZE (evex_unaligned); ++ { ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) ++ return OPTIMIZE (avx512_unaligned); ++ ++ return OPTIMIZE (evex_unaligned); ++ } + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_unaligned_rtm); +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 0783979c..22e7b187 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -1,22 +1,22 @@ + #if IS_IN (libc) + # define VEC_SIZE 64 +-# define VEC(i) zmm##i ++# define XMM0 xmm16 ++# define YMM0 ymm16 ++# define VEC0 zmm16 ++# define VEC(i) VEC##i + # define VMOVU vmovdqu64 + # define VMOVA vmovdqa64 ++# define VZEROUPPER + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- vmovd d, %xmm0; \ + movq r, %rax; \ +- vpbroadcastb %xmm0, %xmm0; \ +- vpbroadcastq %xmm0, %zmm0 ++ vpbroadcastb d, %VEC0 + + # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- vmovd d, %xmm0; \ + movq r, %rax; \ +- vpbroadcastd %xmm0, %xmm0; \ +- vpbroadcastq %xmm0, %zmm0 ++ vpbroadcastd d, %VEC0 + +-# define SECTION(p) p##.avx512 ++# define SECTION(p) p##.evex512 + # define MEMSET_SYMBOL(p,s) p##_avx512_##s + # define WMEMSET_SYMBOL(p,s) p##_avx512_##s + +-- +GitLab + diff --git a/glibc-RHEL-15696-2.patch b/glibc-RHEL-15696-2.patch new file mode 100644 index 0000000..54f3ac3 --- /dev/null +++ b/glibc-RHEL-15696-2.patch @@ -0,0 +1,230 @@ +From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:25:56 -0800 +Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes memcmp/wmemcmp for x32. Tested on x86-64 and x32. On +x86-64, libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for + length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise. + * sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and + tst-size_t-wmemcmp. + * sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file. + * sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise. 
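To make the failure mode concrete before the hunks: on x32 a size_t argument travels in the low 32 bits of a 64-bit register, and the upper 32 bits may be non-zero, so assembly that consumes the full register can see an enormous bogus length. The self-contained demonstration below uses made-up values to show what the added "movl %edx, %edx" (and the %RDX_LP usage) accomplishes, namely truncation to the low 32 bits; it is an illustration, not part of the patch.

    /* Illustration of the x32 length bug: the upper half of the length
       register is not guaranteed to be zero.  The value below is made
       up for demonstration.  */
    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
      uint64_t rdx = 0xdeadbeef00000010ULL;  /* caller passed length 16 */
      uint64_t before_fix = rdx;             /* full register: huge length */
      uint64_t after_fix = (uint32_t) rdx;   /* effect of "movl %edx, %edx" */

      printf ("length seen before the fix: %llu\n",
              (unsigned long long) before_fix);
      printf ("length seen after the fix:  %llu\n",
              (unsigned long long) after_fix);
      return 0;
    }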
+--- + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 7 +- + sysdeps/x86_64/multiarch/memcmp-sse4.S | 9 ++- + sysdeps/x86_64/multiarch/memcmp-ssse3.S | 7 +- + sysdeps/x86_64/x32/Makefile | 4 +- + sysdeps/x86_64/x32/tst-size_t-memcmp.c | 76 ++++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wmemcmp.c | 20 ++++++ + 6 files changed, 114 insertions(+), 9 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index 30f764c3..e3a35b89 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -58,9 +58,12 @@ + .section .text.avx,"ax",@progbits + ENTRY (MEMCMP) + # ifdef USE_AS_WMEMCMP +- shl $2, %rdx ++ shl $2, %RDX_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx + # endif +- cmpq $VEC_SIZE, %rdx ++ cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ +diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S +index 8e164f2c..302900f5 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S ++++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S +@@ -42,13 +42,16 @@ + .section .text.sse4.1,"ax",@progbits + ENTRY (MEMCMP) + # ifdef USE_AS_WMEMCMP +- shl $2, %rdx ++ shl $2, %RDX_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx + # endif + pxor %xmm0, %xmm0 +- cmp $79, %rdx ++ cmp $79, %RDX_LP + ja L(79bytesormore) + # ifndef USE_AS_WMEMCMP +- cmp $1, %rdx ++ cmp $1, %RDX_LP + je L(firstbyte) + # endif + add %rdx, %rsi +diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S +index 6f76c641..69d030fc 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S ++++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S +@@ -33,9 +33,12 @@ + atom_text_section + ENTRY (MEMCMP) + # ifdef USE_AS_WMEMCMP +- shl $2, %rdx +- test %rdx, %rdx ++ shl $2, %RDX_LP ++ test %RDX_LP, %RDX_LP + jz L(equal) ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx + # endif + mov %rdx, %rcx + mov %rdi, %rdx +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 7d528889..ddec7f04 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround + endif + + ifeq ($(subdir),string) +-tests += tst-size_t-memchr ++tests += tst-size_t-memchr tst-size_t-memcmp + endif + + ifeq ($(subdir),wcsmbs) +-tests += tst-size_t-wmemchr ++tests += tst-size_t-wmemchr tst-size_t-wmemcmp + endif +diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c +new file mode 100644 +index 00000000..9bd6fdb4 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c +@@ -0,0 +1,76 @@ ++/* Test memcmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define TEST_MAIN ++#ifdef WIDE ++# define TEST_NAME "wmemcmp" ++#else ++# define TEST_NAME "memcmp" ++#endif ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++# include ++ ++# define MEMCMP wmemcmp ++# define CHAR wchar_t ++#else ++# define MEMCMP memcmp ++# define CHAR char ++#endif ++ ++IMPL (MEMCMP, 1) ++ ++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); ++ ++static int ++__attribute__ ((noinline, noclone)) ++do_memcmp (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ memcpy (buf1, buf2, page_size); ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ int res = do_memcmp (dest, src); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c +new file mode 100644 +index 00000000..e8b5ffd0 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c +@@ -0,0 +1,20 @@ ++/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-memcmp.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-20.patch b/glibc-RHEL-15696-20.patch new file mode 100644 index 0000000..c63b3fb --- /dev/null +++ b/glibc-RHEL-15696-20.patch @@ -0,0 +1,164 @@ +From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sun, 7 Mar 2021 09:45:23 -0800 +Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memmove.h to select the function optimized with AVX512 +instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable +AVX512VL since VZEROUPPER isn't needed at function exit. 
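The RTM motivation shared across this series is that a VZEROUPPER executed inside a hardware transaction can abort it, whereas variants confined to ZMM16-ZMM31 never need VZEROUPPER at function exit. The standalone sketch below illustrates that interaction; it assumes an RTM-capable CPU and compilation with -mrtm, uses nothing from the patch itself, and whether the transaction commits depends on which memmove variant the IFUNC resolves at run time.

    /* Hedged demonstration of calling memmove inside an RTM transaction.
       Requires RTM hardware and -mrtm; transactions may abort for many
       reasons, so treat the output as indicative only.  */
    #include <immintrin.h>
    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      char dst[256], src[256] = "payload";
      unsigned int status = _xbegin ();
      if (status == _XBEGIN_STARTED)
        {
          /* If memmove dispatches to a variant ending in VZEROUPPER, the
             transaction is likely to abort and execution resumes at
             _xbegin with a non-started status.  */
          memmove (dst, src, sizeof src);
          _xend ();
          puts ("transaction committed");
        }
      else
        printf ("transaction aborted (status 0x%x)\n", status);
      return 0;
    }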
+--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 24 +++++++++--------- + sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 +++++---- + .../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++-- + 3 files changed, 42 insertions(+), 19 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index d969a156..fec384f6 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __memmove_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __memmove_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memmove_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memmove_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (AVX), +@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __memmove_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, memmove, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memmove_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memmove_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), + __memmove_ssse3_back) +@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __memcpy_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memcpy_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memcpy_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (AVX), +@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __memcpy_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, memcpy, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memcpy_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memcpy_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, 1, +@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __mempcpy_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (AVX), +@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __mempcpy_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, mempcpy, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + 
__mempcpy_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, + CPU_FEATURE_USABLE (AVX), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h +index fa09b9fb..014e95c7 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h +@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + { +- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) +- return OPTIMIZE (avx512_no_vzeroupper); ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx512_unaligned_erms); + +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE (avx512_unaligned_erms); ++ return OPTIMIZE (avx512_unaligned); ++ } + +- return OPTIMIZE (avx512_unaligned); ++ return OPTIMIZE (avx512_no_vzeroupper); + } + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S +index aac1515c..848848ab 100644 +--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S +@@ -1,11 +1,32 @@ + #if IS_IN (libc) + # define VEC_SIZE 64 +-# define VEC(i) zmm##i ++# define XMM0 xmm16 ++# define XMM1 xmm17 ++# define YMM0 ymm16 ++# define YMM1 ymm17 ++# define VEC0 zmm16 ++# define VEC1 zmm17 ++# define VEC2 zmm18 ++# define VEC3 zmm19 ++# define VEC4 zmm20 ++# define VEC5 zmm21 ++# define VEC6 zmm22 ++# define VEC7 zmm23 ++# define VEC8 zmm24 ++# define VEC9 zmm25 ++# define VEC10 zmm26 ++# define VEC11 zmm27 ++# define VEC12 zmm28 ++# define VEC13 zmm29 ++# define VEC14 zmm30 ++# define VEC15 zmm31 ++# define VEC(i) VEC##i + # define VMOVNT vmovntdq + # define VMOVU vmovdqu64 + # define VMOVA vmovdqa64 ++# define VZEROUPPER + +-# define SECTION(p) p##.avx512 ++# define SECTION(p) p##.evex512 + # define MEMMOVE_SYMBOL(p,s) p##_avx512_##s + + # include "memmove-vec-unaligned-erms.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-21.patch b/glibc-RHEL-15696-21.patch new file mode 100644 index 0000000..319c08d --- /dev/null +++ b/glibc-RHEL-15696-21.patch @@ -0,0 +1,71 @@ +From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001 +From: Sunil K Pandey +Date: Thu, 1 Apr 2021 15:47:04 -0700 +Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S +Content-type: text/plain; charset=UTF-8 + +Fix some indentations of ifdef in file strlen-evex.S which are off by 1 +and confusing to read. +--- + sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S +index cd022509..05838190 100644 +--- a/sysdeps/x86_64/multiarch/strlen-evex.S ++++ b/sysdeps/x86_64/multiarch/strlen-evex.S +@@ -276,10 +276,10 @@ L(last_2x_vec): + .p2align 4 + L(first_vec_x0_check): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +-# endif ++# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) +@@ -293,10 +293,10 @@ L(first_vec_x0_check): + .p2align 4 + L(first_vec_x1_check): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ + sall $2, %eax +-# endif ++# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) +@@ -311,10 +311,10 @@ L(first_vec_x1_check): + .p2align 4 + L(first_vec_x2_check): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +-# endif ++# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) +@@ -329,10 +329,10 @@ L(first_vec_x2_check): + .p2align 4 + L(first_vec_x3_check): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +-# endif ++# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) +-- +GitLab + diff --git a/glibc-RHEL-15696-22.patch b/glibc-RHEL-15696-22.patch new file mode 100644 index 0000000..c20557b --- /dev/null +++ b/glibc-RHEL-15696-22.patch @@ -0,0 +1,51 @@ +From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 19 Apr 2021 07:07:21 -0700 +Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex +Content-type: text/plain; charset=UTF-8 + +Since __strlen_evex and __strnlen_evex added by + +commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77 +Author: H.J. Lu +Date: Fri Mar 5 06:24:52 2021 -0800 + + x86-64: Add ifunc-avx2.h functions with 256-bit EVEX + +use sarx: + +c4 e2 6a f7 c0 sarx %edx,%eax,%eax + +require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c. +ifunc-avx2.h already requires BMI2 for EVEX implementation. +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index fec384f6..cbfc1a5d 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __strlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strlen, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __strlen_evex) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) + +@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __strnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strnlen, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __strnlen_evex) + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) + +-- +GitLab + diff --git a/glibc-RHEL-15696-23.patch b/glibc-RHEL-15696-23.patch new file mode 100644 index 0000000..ffde3d7 --- /dev/null +++ b/glibc-RHEL-15696-23.patch @@ -0,0 +1,584 @@ +From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 3 May 2021 03:01:58 -0400 +Subject: [PATCH] x86: Optimize memchr-avx2.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes memchr-avx2.S. The optimizations include +replacing some branches with cmovcc, avoiding some branches entirely +in the less_4x_vec case, making the page cross logic less strict, +asaving a few instructions the in loop return loop. test-memchr, +test-rawmemchr, and test-wmemchr are all passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++----------- + 1 file changed, 247 insertions(+), 178 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S +index cf893e77..b377f22e 100644 +--- a/sysdeps/x86_64/multiarch/memchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S +@@ -26,8 +26,22 @@ + + # ifdef USE_AS_WMEMCHR + # define VPCMPEQ vpcmpeqd ++# define VPBROADCAST vpbroadcastd ++# define CHAR_SIZE 4 + # else + # define VPCMPEQ vpcmpeqb ++# define VPBROADCAST vpbroadcastb ++# define CHAR_SIZE 1 ++# endif ++ ++# ifdef USE_AS_RAWMEMCHR ++# define ERAW_PTR_REG ecx ++# define RRAW_PTR_REG rcx ++# define ALGN_PTR_REG rdi ++# else ++# define ERAW_PTR_REG edi ++# define RRAW_PTR_REG rdi ++# define ALGN_PTR_REG rcx + # endif + + # ifndef VZEROUPPER +@@ -39,6 +53,7 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits + ENTRY (MEMCHR) +@@ -47,295 +62,349 @@ ENTRY (MEMCHR) + test %RDX_LP, %RDX_LP + jz L(null) + # endif +- movl %edi, %ecx +- /* Broadcast CHAR to YMM0. */ +- vmovd %esi, %xmm0 + # ifdef USE_AS_WMEMCHR + shl $2, %RDX_LP +- vpbroadcastd %xmm0, %ymm0 + # else + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx + # endif +- vpbroadcastb %xmm0, %ymm0 + # endif ++ /* Broadcast CHAR to YMMMATCH. */ ++ vmovd %esi, %xmm0 ++ VPBROADCAST %xmm0, %ymm0 + /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ movl %edi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. */ +- VPCMPEQ (%rdi), %ymm0, %ymm1 ++ VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +- testl %eax, %eax +- + # ifndef USE_AS_RAWMEMCHR +- jnz L(first_vec_x0_check) +- /* Adjust length and check the end of data. */ +- subq $VEC_SIZE, %rdx +- jbe L(zero) +-# else +- jnz L(first_vec_x0) ++ /* If length < CHAR_PER_VEC handle special. */ ++ cmpq $VEC_SIZE, %rdx ++ jbe L(first_vec_x0) + # endif +- +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + + # ifndef USE_AS_RAWMEMCHR +- /* Adjust length. */ +- addq %rcx, %rdx ++ .p2align 5 ++L(first_vec_x0): ++ /* Check if first match was before length. */ ++ tzcntl %eax, %eax ++ xorl %ecx, %ecx ++ cmpl %eax, %edx ++ leaq (%rdi, %rax), %rax ++ cmovle %rcx, %rax ++ VZEROUPPER_RETURN + +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) ++L(null): ++ xorl %eax, %eax ++ ret + # endif +- jmp L(more_4x_vec) +- + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- VPCMPEQ (%rdi), %ymm0, %ymm1 ++L(cross_page_boundary): ++ /* Save pointer before aligning as its original value is necessary ++ for computer return address if byte is found or adjusting length ++ if it is not and this is memchr. */ ++ movq %rdi, %rcx ++ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and ++ rdi for rawmemchr. */ ++ orq $(VEC_SIZE - 1), %ALGN_PTR_REG ++ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Calculate length until end of page (length checked for a ++ match). */ ++ leaq 1(%ALGN_PTR_REG), %rsi ++ subq %RRAW_PTR_REG, %rsi ++# endif + /* Remove the leading bytes. 
*/ +- sarl %cl, %eax +- testl %eax, %eax +- jz L(aligned_more) +- tzcntl %eax, %eax ++ sarxl %ERAW_PTR_REG, %eax, %eax + # ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) ++ cmpq %rsi, %rdx ++ jbe L(first_vec_x0) + # endif +- addq %rdi, %rax +- addq %rcx, %rax ++ testl %eax, %eax ++ jz L(cross_page_continue) ++ tzcntl %eax, %eax ++ addq %RRAW_PTR_REG, %rax + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 +-L(aligned_more): +-# ifndef USE_AS_RAWMEMCHR +- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" +- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition +- overflow. */ +- negq %rcx +- addq $VEC_SIZE, %rcx ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ incq %rdi ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +- /* Check the end of data. */ +- subq %rcx, %rdx +- jbe L(zero) +-# endif ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE + 1), %rdi ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 2 + 1), %rdi ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +- addq $VEC_SIZE, %rdi + +-# ifndef USE_AS_RAWMEMCHR +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif ++ .p2align 4 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 3 + 1), %rdi ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +-L(more_4x_vec): ++ .p2align 4 ++L(aligned_more): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) + +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++# ifndef USE_AS_RAWMEMCHR ++L(cross_page_continue): ++ /* Align data to VEC_SIZE - 1. */ ++ xorl %ecx, %ecx ++ subl %edi, %ecx ++ orq $(VEC_SIZE - 1), %rdi ++ /* esi is for adjusting length to see if near the end. */ ++ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi ++# else ++ orq $(VEC_SIZE - 1), %rdi ++L(cross_page_continue): ++# endif ++ /* Load first VEC regardless. */ ++ VPCMPEQ 1(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. If near end handle specially. */ ++ subq %rsi, %rdx ++ jbe L(last_4x_vec_or_less) ++# endif + testl %eax, %eax + jnz L(first_vec_x1) + +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- addq $(VEC_SIZE * 4), %rdi ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) + + # ifndef USE_AS_RAWMEMCHR ++ /* Check if at last VEC_SIZE * 4 length. */ + subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif +- +- /* Align data to 4 * VEC_SIZE. */ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx +- andq $-(4 * VEC_SIZE), %rdi +- +-# ifndef USE_AS_RAWMEMCHR +- /* Adjust length. */ ++ jbe L(last_4x_vec_or_less_cmpeq) ++ /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust ++ length. */ ++ incq %rdi ++ movl %edi, %ecx ++ orq $(VEC_SIZE * 4 - 1), %rdi ++ andl $(VEC_SIZE * 4 - 1), %ecx + addq %rcx, %rdx ++# else ++ /* Align data to VEC_SIZE * 4 - 1 for loop. */ ++ incq %rdi ++ orq $(VEC_SIZE * 4 - 1), %rdi + # endif + ++ /* Compare 4 * VEC at a time forward. 
*/ + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 +- ++ VPCMPEQ 1(%rdi), %ymm0, %ymm1 ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2 ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3 ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4 + vpor %ymm1, %ymm2, %ymm5 + vpor %ymm3, %ymm4, %ymm6 + vpor %ymm5, %ymm6, %ymm5 + +- vpmovmskb %ymm5, %eax +- testl %eax, %eax +- jnz L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- ++ vpmovmskb %ymm5, %ecx + # ifdef USE_AS_RAWMEMCHR +- jmp L(loop_4x_vec) ++ subq $-(VEC_SIZE * 4), %rdi ++ testl %ecx, %ecx ++ jz L(loop_4x_vec) + # else +- subq $(VEC_SIZE * 4), %rdx +- ja L(loop_4x_vec) ++ testl %ecx, %ecx ++ jnz L(loop_4x_vec_end) + +-L(last_4x_vec_or_less): +- /* Less than 4 * VEC and aligned to VEC_SIZE. */ +- addl $(VEC_SIZE * 2), %edx +- jle L(last_2x_vec) ++ subq $-(VEC_SIZE * 4), %rdi + +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_4x_vec) + +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ /* Fall through into less than 4 remaining vectors of length case. ++ */ ++ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax ++ .p2align 4 ++L(last_4x_vec_or_less): ++ /* Check if first VEC contained match. */ + testl %eax, %eax +- jnz L(first_vec_x1) ++ jnz L(first_vec_x1_check) + +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax ++ /* If remaining length > VEC_SIZE * 2. */ ++ addl $(VEC_SIZE * 2), %edx ++ jg L(last_4x_vec) + +- jnz L(first_vec_x2_check) +- subl $VEC_SIZE, %edx +- jle L(zero) ++L(last_2x_vec): ++ /* If remaining length < VEC_SIZE. */ ++ addl $VEC_SIZE, %edx ++ jle L(zero_end) + +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ /* Check VEC2 and compare any match with remaining length. */ ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +- testl %eax, %eax +- +- jnz L(first_vec_x3_check) +- xorl %eax, %eax ++ tzcntl %eax, %eax ++ cmpl %eax, %edx ++ jbe L(set_zero_end) ++ addq $(VEC_SIZE + 1), %rdi ++ addq %rdi, %rax ++L(zero_end): + VZEROUPPER_RETURN + + .p2align 4 +-L(last_2x_vec): +- addl $(VEC_SIZE * 2), %edx +- VPCMPEQ (%rdi), %ymm0, %ymm1 ++L(loop_4x_vec_end): ++# endif ++ /* rawmemchr will fall through into this if match was found in ++ loop. */ ++ + vpmovmskb %ymm1, %eax + testl %eax, %eax ++ jnz L(last_vec_x1_return) + +- jnz L(first_vec_x0_check) +- subl $VEC_SIZE, %edx +- jle L(zero) +- +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm2, %eax + testl %eax, %eax +- jnz L(first_vec_x1_check) +- xorl %eax, %eax +- VZEROUPPER_RETURN ++ jnz L(last_vec_x2_return) + +- .p2align 4 +-L(first_vec_x0_check): +- tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) ++ vpmovmskb %ymm3, %eax ++ /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */ ++ salq $32, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++# ifdef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 2 - 1), %rdi ++# else ++ subq $-(VEC_SIZE * 2 + 1), %rdi ++# endif + addq %rdi, %rax + VZEROUPPER_RETURN ++# ifndef USE_AS_RAWMEMCHR + + .p2align 4 + L(first_vec_x1_check): + tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $VEC_SIZE, %rax ++ /* Adjust length. */ ++ subl $-(VEC_SIZE * 4), %edx ++ /* Check if match within remaining length. 
*/ ++ cmpl %eax, %edx ++ jbe L(set_zero_end) ++ incq %rdi + addq %rdi, %rax + VZEROUPPER_RETURN ++ .p2align 4 ++L(set_zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN ++# endif + + .p2align 4 +-L(first_vec_x2_check): ++L(last_vec_x1_return): + tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $(VEC_SIZE * 2), %rax ++# ifdef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 4 - 1), %rdi ++# else ++ incq %rdi ++# endif + addq %rdi, %rax + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x3_check): ++L(last_vec_x2_return): + tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $(VEC_SIZE * 3), %rax ++# ifdef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 3 - 1), %rdi ++# else ++ subq $-(VEC_SIZE + 1), %rdi ++# endif + addq %rdi, %rax + VZEROUPPER_RETURN + ++# ifndef USE_AS_RAWMEMCHR + .p2align 4 +-L(zero): +- xorl %eax, %eax +- jmp L(return_vzeroupper) ++L(last_4x_vec_or_less_cmpeq): ++ VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ subq $-(VEC_SIZE * 4), %rdi ++ /* Check first VEC regardless. */ ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) + ++ /* If remaining length <= CHAR_PER_VEC * 2. */ ++ addl $(VEC_SIZE * 2), %edx ++ jle L(last_2x_vec) + .p2align 4 +-L(null): +- xorl %eax, %eax +- ret +-# endif ++L(last_4x_vec): ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2_return) + +- .p2align 4 +-L(first_vec_x0): +- tzcntl %eax, %eax +- addq %rdi, %rax +- VZEROUPPER_RETURN ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + +- .p2align 4 +-L(first_vec_x1): +- tzcntl %eax, %eax +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +- VZEROUPPER_RETURN ++ /* Create mask for possible matches within remaining length. */ ++ movq $-1, %rcx ++ bzhiq %rdx, %rcx, %rcx + +- .p2align 4 +-L(first_vec_x2): ++ /* Test matches in data against length match. */ ++ andl %ecx, %eax ++ jnz L(last_vec_x3) ++ ++ /* if remaining length <= VEC_SIZE * 3 (Note this is after ++ remaining length was found to be > VEC_SIZE * 2. */ ++ subl $VEC_SIZE, %edx ++ jbe L(zero_end2) ++ ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Shift remaining length mask for last VEC. */ ++ shrq $32, %rcx ++ andl %ecx, %eax ++ jz L(zero_end2) + tzcntl %eax, %eax +- addq $(VEC_SIZE * 2), %rax ++ addq $(VEC_SIZE * 3 + 1), %rdi + addq %rdi, %rax ++L(zero_end2): + VZEROUPPER_RETURN + + .p2align 4 +-L(4x_vec_end): +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- vpmovmskb %ymm2, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- vpmovmskb %ymm3, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- vpmovmskb %ymm4, %eax +- testl %eax, %eax +-L(first_vec_x3): ++L(last_vec_x3): + tzcntl %eax, %eax +- addq $(VEC_SIZE * 3), %rax ++ subq $-(VEC_SIZE * 2 + 1), %rdi + addq %rdi, %rax + VZEROUPPER_RETURN ++# endif + + END (MEMCHR) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-24.patch b/glibc-RHEL-15696-24.patch new file mode 100644 index 0000000..c4f24ff --- /dev/null +++ b/glibc-RHEL-15696-24.patch @@ -0,0 +1,388 @@ +From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 9 Jun 2021 16:25:32 -0400 +Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ + #27974] +Content-type: text/plain; charset=UTF-8 + +This commit fixes the bug mentioned in the previous commit. 
+ +The previous implementations of wmemchr in these files relied +on n * sizeof(wchar_t) which was not guranteed by the standard. + +The new overflow tests added in the previous commit now +pass (As well as all the other tests). + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++------- + sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------ + 2 files changed, 98 insertions(+), 37 deletions(-) + +diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S +index cb320257..24f9a0c5 100644 +--- a/sysdeps/x86_64/memchr.S ++++ b/sysdeps/x86_64/memchr.S +@@ -21,9 +21,11 @@ + #ifdef USE_AS_WMEMCHR + # define MEMCHR wmemchr + # define PCMPEQ pcmpeqd ++# define CHAR_PER_VEC 4 + #else + # define MEMCHR memchr + # define PCMPEQ pcmpeqb ++# define CHAR_PER_VEC 16 + #endif + + /* fast SSE2 version with using pmaxub and 64 byte loop */ +@@ -33,15 +35,14 @@ ENTRY(MEMCHR) + movd %esi, %xmm1 + mov %edi, %ecx + ++#ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++#endif + #ifdef USE_AS_WMEMCHR + test %RDX_LP, %RDX_LP + jz L(return_null) +- shl $2, %RDX_LP + #else +-# ifdef __ILP32__ +- /* Clear the upper 32 bits. */ +- movl %edx, %edx +-# endif + punpcklbw %xmm1, %xmm1 + test %RDX_LP, %RDX_LP + jz L(return_null) +@@ -60,13 +61,16 @@ ENTRY(MEMCHR) + test %eax, %eax + + jnz L(matches_1) +- sub $16, %rdx ++ sub $CHAR_PER_VEC, %rdx + jbe L(return_null) + add $16, %rdi + and $15, %ecx + and $-16, %rdi ++#ifdef USE_AS_WMEMCHR ++ shr $2, %ecx ++#endif + add %rcx, %rdx +- sub $64, %rdx ++ sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + jmp L(loop_prolog) + +@@ -77,16 +81,21 @@ L(crosscache): + movdqa (%rdi), %xmm0 + + PCMPEQ %xmm1, %xmm0 +-/* Check if there is a match. */ ++ /* Check if there is a match. */ + pmovmskb %xmm0, %eax +-/* Remove the leading bytes. */ ++ /* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +-/* Check which byte is a match. */ ++ /* Check which byte is a match. */ + bsf %eax, %eax +- ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub %rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + add %rdi, %rax + add %rcx, %rax +@@ -94,15 +103,18 @@ L(crosscache): + + .p2align 4 + L(unaligned_no_match): +- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using ++ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void + possible addition overflow. 
*/ + neg %rcx + add $16, %rcx ++#ifdef USE_AS_WMEMCHR ++ shr $2, %ecx ++#endif + sub %rcx, %rdx + jbe L(return_null) + add $16, %rdi +- sub $64, %rdx ++ sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + + .p2align 4 +@@ -135,7 +147,7 @@ L(loop_prolog): + test $0x3f, %rdi + jz L(align64_loop) + +- sub $64, %rdx ++ sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + + movdqa (%rdi), %xmm0 +@@ -167,11 +179,14 @@ L(loop_prolog): + mov %rdi, %rcx + and $-64, %rdi + and $63, %ecx ++#ifdef USE_AS_WMEMCHR ++ shr $2, %ecx ++#endif + add %rcx, %rdx + + .p2align 4 + L(align64_loop): +- sub $64, %rdx ++ sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 +@@ -218,7 +233,7 @@ L(align64_loop): + + .p2align 4 + L(exit_loop): +- add $32, %edx ++ add $(CHAR_PER_VEC * 2), %edx + jle L(exit_loop_32) + + movdqa (%rdi), %xmm0 +@@ -238,7 +253,7 @@ L(exit_loop): + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32_1) +- sub $16, %edx ++ sub $CHAR_PER_VEC, %edx + jle L(return_null) + + PCMPEQ 48(%rdi), %xmm1 +@@ -250,13 +265,13 @@ L(exit_loop): + + .p2align 4 + L(exit_loop_32): +- add $32, %edx ++ add $(CHAR_PER_VEC * 2), %edx + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) +- sub $16, %edx ++ sub $CHAR_PER_VEC, %edx + jbe L(return_null) + + PCMPEQ 16(%rdi), %xmm1 +@@ -293,7 +308,13 @@ L(matches32): + .p2align 4 + L(matches_1): + bsf %eax, %eax ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub %rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + add %rdi, %rax + ret +@@ -301,7 +322,13 @@ L(matches_1): + .p2align 4 + L(matches16_1): + bsf %eax, %eax ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub %rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + lea 16(%rdi, %rax), %rax + ret +@@ -309,7 +336,13 @@ L(matches16_1): + .p2align 4 + L(matches32_1): + bsf %eax, %eax ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub %rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + lea 32(%rdi, %rax), %rax + ret +@@ -317,7 +350,13 @@ L(matches32_1): + .p2align 4 + L(matches48_1): + bsf %eax, %eax ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub %rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + lea 48(%rdi, %rax), %rax + ret +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S +index b377f22e..16027abb 100644 +--- a/sysdeps/x86_64/multiarch/memchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S +@@ -54,21 +54,19 @@ + + # define VEC_SIZE 32 + # define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text),"ax",@progbits + ENTRY (MEMCHR) + # ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ +- test %RDX_LP, %RDX_LP +- jz L(null) +-# endif +-# ifdef USE_AS_WMEMCHR +- shl $2, %RDX_LP +-# else + # ifdef __ILP32__ +- /* Clear the upper 32 bits. */ +- movl %edx, %edx ++ /* Clear upper bits. */ ++ and %RDX_LP, %RDX_LP ++# else ++ test %RDX_LP, %RDX_LP + # endif ++ jz L(null) + # endif + /* Broadcast CHAR to YMMMATCH. */ + vmovd %esi, %xmm0 +@@ -84,7 +82,7 @@ ENTRY (MEMCHR) + vpmovmskb %ymm1, %eax + # ifndef USE_AS_RAWMEMCHR + /* If length < CHAR_PER_VEC handle special. */ +- cmpq $VEC_SIZE, %rdx ++ cmpq $CHAR_PER_VEC, %rdx + jbe L(first_vec_x0) + # endif + testl %eax, %eax +@@ -98,6 +96,10 @@ ENTRY (MEMCHR) + L(first_vec_x0): + /* Check if first match was before length. 
*/ + tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %edx ++# endif + xorl %ecx, %ecx + cmpl %eax, %edx + leaq (%rdi, %rax), %rax +@@ -110,12 +112,12 @@ L(null): + # endif + .p2align 4 + L(cross_page_boundary): +- /* Save pointer before aligning as its original value is necessary +- for computer return address if byte is found or adjusting length +- if it is not and this is memchr. */ ++ /* Save pointer before aligning as its original value is ++ necessary for computer return address if byte is found or ++ adjusting length if it is not and this is memchr. */ + movq %rdi, %rcx +- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and +- rdi for rawmemchr. */ ++ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr ++ and rdi for rawmemchr. */ + orq $(VEC_SIZE - 1), %ALGN_PTR_REG + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +@@ -124,6 +126,10 @@ L(cross_page_boundary): + match). */ + leaq 1(%ALGN_PTR_REG), %rsi + subq %RRAW_PTR_REG, %rsi ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide bytes by 4 to get wchar_t count. */ ++ shrl $2, %esi ++# endif + # endif + /* Remove the leading bytes. */ + sarxl %ERAW_PTR_REG, %eax, %eax +@@ -181,6 +187,10 @@ L(cross_page_continue): + orq $(VEC_SIZE - 1), %rdi + /* esi is for adjusting length to see if near the end. */ + leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %esi ++# endif + # else + orq $(VEC_SIZE - 1), %rdi + L(cross_page_continue): +@@ -213,7 +223,7 @@ L(cross_page_continue): + + # ifndef USE_AS_RAWMEMCHR + /* Check if at last VEC_SIZE * 4 length. */ +- subq $(VEC_SIZE * 4), %rdx ++ subq $(CHAR_PER_VEC * 4), %rdx + jbe L(last_4x_vec_or_less_cmpeq) + /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust + length. */ +@@ -221,6 +231,10 @@ L(cross_page_continue): + movl %edi, %ecx + orq $(VEC_SIZE * 4 - 1), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx ++# endif + addq %rcx, %rdx + # else + /* Align data to VEC_SIZE * 4 - 1 for loop. */ +@@ -250,15 +264,19 @@ L(loop_4x_vec): + + subq $-(VEC_SIZE * 4), %rdi + +- subq $(VEC_SIZE * 4), %rdx ++ subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop_4x_vec) + +- /* Fall through into less than 4 remaining vectors of length case. +- */ ++ /* Fall through into less than 4 remaining vectors of length ++ case. */ + VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + .p2align 4 + L(last_4x_vec_or_less): ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %edx ++# endif + /* Check if first VEC contained match. */ + testl %eax, %eax + jnz L(first_vec_x1_check) +@@ -355,6 +373,10 @@ L(last_vec_x2_return): + L(last_4x_vec_or_less_cmpeq): + VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %edx ++# endif + subq $-(VEC_SIZE * 4), %rdi + /* Check first VEC regardless. 
*/ + testl %eax, %eax +-- +GitLab + diff --git a/glibc-RHEL-15696-25.patch b/glibc-RHEL-15696-25.patch new file mode 100644 index 0000000..e0ed8ea --- /dev/null +++ b/glibc-RHEL-15696-25.patch @@ -0,0 +1,767 @@ +From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 19 Apr 2021 19:36:07 -0400 +Subject: [PATCH] x86: Optimize strlen-avx2.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes strlen-avx2.S. The optimizations are +mostly small things but they add up to roughly 10-30% performance +improvement for strlen. The results for strnlen are bit more +ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen +are all passing. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- + sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- + 2 files changed, 334 insertions(+), 214 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index cbfc1a5d..f1a6460a 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/strlen.c. */ + IFUNC_IMPL (i, name, strlen, + IFUNC_IMPL_ADD (array, i, strlen, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __strlen_avx2) + IFUNC_IMPL_ADD (array, i, strlen, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __strlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strlen, +@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/strnlen.c. */ + IFUNC_IMPL (i, name, strnlen, + IFUNC_IMPL_ADD (array, i, strnlen, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __strnlen_avx2) + IFUNC_IMPL_ADD (array, i, strnlen, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __strnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strnlen, +@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/wcslen.c. */ + IFUNC_IMPL (i, name, wcslen, + IFUNC_IMPL_ADD (array, i, wcslen, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __wcslen_avx2) + IFUNC_IMPL_ADD (array, i, wcslen, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __wcslen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcslen, +@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. 
*/ + IFUNC_IMPL (i, name, wcsnlen, + IFUNC_IMPL_ADD (array, i, wcsnlen, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __wcsnlen_avx2) + IFUNC_IMPL_ADD (array, i, wcsnlen, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __wcsnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsnlen, +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S +index 82826e10..be8a5db5 100644 +--- a/sysdeps/x86_64/multiarch/strlen-avx2.S ++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S +@@ -27,9 +27,11 @@ + # ifdef USE_AS_WCSLEN + # define VPCMPEQ vpcmpeqd + # define VPMINU vpminud ++# define CHAR_SIZE 4 + # else + # define VPCMPEQ vpcmpeqb + # define VPMINU vpminub ++# define CHAR_SIZE 1 + # endif + + # ifndef VZEROUPPER +@@ -41,349 +43,459 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN +- /* Check for zero length. */ ++ /* Check zero length. */ + test %RSI_LP, %RSI_LP + jz L(zero) ++ /* Store max len in R8_LP before adjusting if using WCSLEN. */ ++ mov %RSI_LP, %R8_LP + # ifdef USE_AS_WCSLEN + shl $2, %RSI_LP + # elif defined __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi + # endif +- mov %RSI_LP, %R8_LP + # endif +- movl %edi, %ecx ++ movl %edi, %eax + movq %rdi, %rdx + vpxor %xmm0, %xmm0, %xmm0 +- ++ /* Clear high bits from edi. Only keeping bits relevant to page ++ cross check. */ ++ andl $(PAGE_SIZE - 1), %eax + /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. */ +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + # ifdef USE_AS_STRNLEN +- jnz L(first_vec_x0_check) +- /* Adjust length and check the end of data. */ +- subq $VEC_SIZE, %rsi +- jbe L(max) +-# else +- jnz L(first_vec_x0) ++ /* If length < VEC_SIZE handle special. */ ++ cmpq $VEC_SIZE, %rsi ++ jbe L(first_vec_x0) + # endif +- +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ /* If empty continue to aligned_more. Otherwise return bit ++ position of first match. */ ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNLEN +- /* Adjust length. */ +- addq %rcx, %rsi ++L(zero): ++ xorl %eax, %eax ++ ret + +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) ++ .p2align 4 ++L(first_vec_x0): ++ /* Set bit for max len so that tzcnt will return min of max len ++ and position of first match. */ ++ btsq %rsi, %rax ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + # endif +- jmp L(more_4x_vec) + + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- /* Remove the leading bytes. */ +- sarl %cl, %eax +- testl %eax, %eax +- jz L(aligned_more) ++L(first_vec_x1): + tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- /* Check the end of data. 
*/ +- cmpq %rax, %rsi +- jbe L(max) ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ subl $(VEC_SIZE * 4 + 1), %ecx ++ addl %ecx, %eax ++# else ++ subl %edx, %edi ++ incl %edi ++ addl %edi, %eax + # endif +- addq %rdi, %rax +- addq %rcx, %rax +- subq %rdx, %rax + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ shrl $2, %eax + # endif +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ VZEROUPPER_RETURN + + .p2align 4 +-L(aligned_more): ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" +- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" +- to void possible addition overflow. */ +- negq %rcx +- addq $VEC_SIZE, %rcx +- +- /* Check the end of data. */ +- subq %rcx, %rsi +- jbe L(max) ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ subl $(VEC_SIZE * 3 + 1), %ecx ++ addl %ecx, %eax ++# else ++ subl %edx, %edi ++ addl $(VEC_SIZE + 1), %edi ++ addl %edi, %eax + # endif ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + +- addq $VEC_SIZE, %rdi ++ .p2align 4 ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ ++# ifdef USE_AS_STRNLEN ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ subl $(VEC_SIZE * 2 + 1), %ecx ++ addl %ecx, %eax ++# else ++ subl %edx, %edi ++ addl $(VEC_SIZE * 2 + 1), %edi ++ addl %edi, %eax ++# endif ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + ++ .p2align 4 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ subl $(VEC_SIZE + 1), %ecx ++ addl %ecx, %eax ++# else ++ subl %edx, %edi ++ addl $(VEC_SIZE * 3 + 1), %edi ++ addl %edi, %eax + # endif ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + +-L(more_4x_vec): ++ .p2align 5 ++L(aligned_more): ++ /* Align data to VEC_SIZE - 1. This is the same number of ++ instructions as using andq with -VEC_SIZE but saves 4 bytes of ++ code on the x4 check. */ ++ orq $(VEC_SIZE - 1), %rdi ++L(cross_page_continue): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++# ifdef USE_AS_STRNLEN ++ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because ++ it simplies the logic in last_4x_vec_or_less. */ ++ leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx ++ subq %rdx, %rcx ++# endif ++ /* Load first VEC regardless. */ ++ VPCMPEQ 1(%rdi), %ymm0, %ymm1 ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. If near end handle specially. 
*/ ++ subq %rcx, %rsi ++ jb L(last_4x_vec_or_less) ++# endif ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- addq $(VEC_SIZE * 4), %rdi +- +-# ifdef USE_AS_STRNLEN +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) +-# endif +- +- /* Align data to 4 * VEC_SIZE. */ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx +- andq $-(4 * VEC_SIZE), %rdi ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) + ++ /* Align data to VEC_SIZE * 4 - 1. */ + # ifdef USE_AS_STRNLEN +- /* Adjust length. */ ++ /* Before adjusting length check if at last VEC_SIZE * 4. */ ++ cmpq $(VEC_SIZE * 4 - 1), %rsi ++ jbe L(last_4x_vec_or_less_load) ++ incq %rdi ++ movl %edi, %ecx ++ orq $(VEC_SIZE * 4 - 1), %rdi ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ /* Readjust length. */ + addq %rcx, %rsi ++# else ++ incq %rdi ++ orq $(VEC_SIZE * 4 - 1), %rdi + # endif +- ++ /* Compare 4 * VEC at a time forward. */ + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- vmovdqa (%rdi), %ymm1 +- vmovdqa VEC_SIZE(%rdi), %ymm2 +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 +- VPMINU %ymm1, %ymm2, %ymm5 +- VPMINU %ymm3, %ymm4, %ymm6 +- VPMINU %ymm5, %ymm6, %ymm5 +- +- VPCMPEQ %ymm5, %ymm0, %ymm5 +- vpmovmskb %ymm5, %eax +- testl %eax, %eax +- jnz L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- +-# ifndef USE_AS_STRNLEN +- jmp L(loop_4x_vec) +-# else ++# ifdef USE_AS_STRNLEN ++ /* Break if at end of length. */ + subq $(VEC_SIZE * 4), %rsi +- ja L(loop_4x_vec) +- +-L(last_4x_vec_or_less): +- /* Less than 4 * VEC and aligned to VEC_SIZE. */ +- addl $(VEC_SIZE * 2), %esi +- jle L(last_2x_vec) ++ jb L(last_4x_vec_or_less_cmpeq) ++# endif ++ /* Save some code size by microfusing VPMINU with the load. Since ++ the matches in ymm2/ymm4 can only be returned if there where no ++ matches in ymm1/ymm3 respectively there is no issue with overlap. ++ */ ++ vmovdqa 1(%rdi), %ymm1 ++ VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 ++ VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 ++ ++ VPMINU %ymm2, %ymm4, %ymm5 ++ VPCMPEQ %ymm5, %ymm0, %ymm5 ++ vpmovmskb %ymm5, %ecx + +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) ++ subq $-(VEC_SIZE * 4), %rdi ++ testl %ecx, %ecx ++ jz L(loop_4x_vec) + +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) + +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ %ymm1, %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ subq %rdx, %rdi + testl %eax, %eax ++ jnz L(last_vec_return_x0) + +- jnz L(first_vec_x2_check) +- subl $VEC_SIZE, %esi +- jle L(max) +- +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ %ymm2, %ymm0, %ymm2 ++ vpmovmskb %ymm2, %eax + testl %eax, %eax +- +- jnz L(first_vec_x3_check) +- movq %r8, %rax +-# ifdef USE_AS_WCSLEN ++ jnz L(last_vec_return_x1) ++ ++ /* Combine last 2 VEC. */ ++ VPCMPEQ %ymm3, %ymm0, %ymm3 ++ vpmovmskb %ymm3, %eax ++ /* rcx has combined result from all 4 VEC. 
It will only be used if ++ the first 3 other VEC all did not contain a match. */ ++ salq $32, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++ subq $(VEC_SIZE * 2 - 1), %rdi ++ addq %rdi, %rax ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + ++ ++# ifdef USE_AS_STRNLEN + .p2align 4 +-L(last_2x_vec): +- addl $(VEC_SIZE * 2), %esi +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax ++L(last_4x_vec_or_less_load): ++ /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ ++ subq $-(VEC_SIZE * 4), %rdi ++L(last_4x_vec_or_less_cmpeq): ++ VPCMPEQ 1(%rdi), %ymm0, %ymm1 ++L(last_4x_vec_or_less): + +- jnz L(first_vec_x0_check) +- subl $VEC_SIZE, %esi +- jle L(max) ++ vpmovmskb %ymm1, %eax ++ /* If remaining length > VEC_SIZE * 2. This works if esi is off by ++ VEC_SIZE * 4. */ ++ testl $(VEC_SIZE * 2), %esi ++ jnz L(last_4x_vec) + +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ /* length may have been negative or positive by an offset of ++ VEC_SIZE * 4 depending on where this was called from. This fixes ++ that. */ ++ andl $(VEC_SIZE * 4 - 1), %esi + testl %eax, %eax +- jnz L(first_vec_x1_check) +- movq %r8, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif +- VZEROUPPER_RETURN ++ jnz L(last_vec_x1_check) + +- .p2align 4 +-L(first_vec_x0_check): ++ subl $VEC_SIZE, %esi ++ jb L(max) ++ ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + tzcntl %eax, %eax + /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) ++ cmpl %eax, %esi ++ jb L(max) ++ subq %rdx, %rdi ++ addl $(VEC_SIZE + 1), %eax + addq %rdi, %rax +- subq %rdx, %rax + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif + VZEROUPPER_RETURN ++# endif + + .p2align 4 +-L(first_vec_x1_check): ++L(last_vec_return_x0): + tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $VEC_SIZE, %rax ++ subq $(VEC_SIZE * 4 - 1), %rdi + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x2_check): ++L(last_vec_return_x1): + tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $(VEC_SIZE * 2), %rax ++ subq $(VEC_SIZE * 3 - 1), %rdi + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + ++# ifdef USE_AS_STRNLEN + .p2align 4 +-L(first_vec_x3_check): ++L(last_vec_x1_check): ++ + tzcntl %eax, %eax + /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $(VEC_SIZE * 3), %rax ++ cmpl %eax, %esi ++ jb L(max) ++ subq %rdx, %rdi ++ incl %eax + addq %rdi, %rax +- subq %rdx, %rax + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif + VZEROUPPER_RETURN + +- .p2align 4 + L(max): + movq %r8, %rax ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(last_4x_vec): ++ /* Test first 2x VEC normally. */ ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ /* Normalize length. */ ++ andl $(VEC_SIZE * 4 - 1), %esi ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ subl $(VEC_SIZE * 3), %esi ++ jb L(max) ++ ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ tzcntl %eax, %eax ++ /* Check the end of data. 
*/ ++ cmpl %eax, %esi ++ jb L(max) ++ subq %rdx, %rdi ++ addl $(VEC_SIZE * 3 + 1), %eax ++ addq %rdi, %rax + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif + VZEROUPPER_RETURN + +- .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret +-# endif + + .p2align 4 +-L(first_vec_x0): ++L(last_vec_x1): ++ /* essentially duplicates of first_vec_x1 but use 64 bit ++ instructions. */ + tzcntl %eax, %eax ++ subq %rdx, %rdi ++ incl %eax + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x1): ++L(last_vec_x2): ++ /* essentially duplicates of first_vec_x1 but use 64 bit ++ instructions. */ + tzcntl %eax, %eax +- addq $VEC_SIZE, %rax ++ subq %rdx, %rdi ++ addl $(VEC_SIZE + 1), %eax + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x2): ++L(last_vec_x3): + tzcntl %eax, %eax +- addq $(VEC_SIZE * 2), %rax ++ subl $(VEC_SIZE * 2), %esi ++ /* Check the end of data. */ ++ cmpl %eax, %esi ++ jb L(max_end) ++ subq %rdx, %rdi ++ addl $(VEC_SIZE * 2 + 1), %eax + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif ++ VZEROUPPER_RETURN ++L(max_end): ++ movq %r8, %rax + VZEROUPPER_RETURN ++# endif + ++ /* Cold case for crossing page with first load. */ + .p2align 4 +-L(4x_vec_end): +- VPCMPEQ %ymm1, %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- VPCMPEQ %ymm2, %ymm0, %ymm2 +- vpmovmskb %ymm2, %eax ++L(cross_page_boundary): ++ /* Align data to VEC_SIZE - 1. */ ++ orq $(VEC_SIZE - 1), %rdi ++ VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT ++ so no need to manually mod rdx. */ ++ sarxl %edx, %eax, %eax ++# ifdef USE_AS_STRNLEN + testl %eax, %eax +- jnz L(first_vec_x1) +- VPCMPEQ %ymm3, %ymm0, %ymm3 +- vpmovmskb %ymm3, %eax ++ jnz L(cross_page_less_vec) ++ leaq 1(%rdi), %rcx ++ subq %rdx, %rcx ++ /* Check length. */ ++ cmpq %rsi, %rcx ++ jb L(cross_page_continue) ++ movq %r8, %rax ++# else + testl %eax, %eax +- jnz L(first_vec_x2) +- VPCMPEQ %ymm4, %ymm0, %ymm4 +- vpmovmskb %ymm4, %eax +-L(first_vec_x3): ++ jz L(cross_page_continue) + tzcntl %eax, %eax +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif + # endif ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++ ++# ifdef USE_AS_STRNLEN ++ .p2align 4 ++L(cross_page_less_vec): ++ tzcntl %eax, %eax ++ cmpq %rax, %rsi ++ cmovb %esi, %eax ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif + VZEROUPPER_RETURN ++# endif + + END (STRLEN) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-26.patch b/glibc-RHEL-15696-26.patch new file mode 100644 index 0000000..d46fe6e --- /dev/null +++ b/glibc-RHEL-15696-26.patch @@ -0,0 +1,701 @@ +From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 3 May 2021 03:03:19 -0400 +Subject: [PATCH] x86: Optimize memchr-evex.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes memchr-evex.S. 
The optimizations include +replacing some branches with cmovcc, avoiding some branches entirely +in the less_4x_vec case, making the page cross logic less strict, +saving some ALU in the alignment process, and most importantly +increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and +test-wmemchr are all passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++---------- + 1 file changed, 322 insertions(+), 225 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S +index 6dd5d67b..81d5cd64 100644 +--- a/sysdeps/x86_64/multiarch/memchr-evex.S ++++ b/sysdeps/x86_64/multiarch/memchr-evex.S +@@ -26,14 +26,28 @@ + + # ifdef USE_AS_WMEMCHR + # define VPBROADCAST vpbroadcastd +-# define VPCMP vpcmpd +-# define SHIFT_REG r8d ++# define VPMINU vpminud ++# define VPCMP vpcmpd ++# define VPCMPEQ vpcmpeqd ++# define CHAR_SIZE 4 + # else + # define VPBROADCAST vpbroadcastb +-# define VPCMP vpcmpb +-# define SHIFT_REG ecx ++# define VPMINU vpminub ++# define VPCMP vpcmpb ++# define VPCMPEQ vpcmpeqb ++# define CHAR_SIZE 1 + # endif + ++# ifdef USE_AS_RAWMEMCHR ++# define RAW_PTR_REG rcx ++# define ALGN_PTR_REG rdi ++# else ++# define RAW_PTR_REG rdi ++# define ALGN_PTR_REG rcx ++# endif ++ ++# define XMMZERO xmm23 ++# define YMMZERO ymm23 + # define XMMMATCH xmm16 + # define YMMMATCH ymm16 + # define YMM1 ymm17 +@@ -44,6 +58,8 @@ + # define YMM6 ymm22 + + # define VEC_SIZE 32 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) ++# define PAGE_SIZE 4096 + + .section .text.evex,"ax",@progbits + ENTRY (MEMCHR) +@@ -51,11 +67,7 @@ ENTRY (MEMCHR) + /* Check for zero length. */ + test %RDX_LP, %RDX_LP + jz L(zero) +-# endif +- movl %edi, %ecx +-# ifdef USE_AS_WMEMCHR +- shl $2, %RDX_LP +-# else ++ + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -64,318 +76,403 @@ ENTRY (MEMCHR) + /* Broadcast CHAR to YMMMATCH. */ + VPBROADCAST %esi, %YMMMATCH + /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ movl %edi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. */ +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- ++ VPCMP $0, (%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax + # ifndef USE_AS_RAWMEMCHR +- jnz L(first_vec_x0_check) +- /* Adjust length and check the end of data. */ +- subq $VEC_SIZE, %rdx +- jbe L(zero) ++ /* If length < CHAR_PER_VEC handle special. */ ++ cmpq $CHAR_PER_VEC, %rdx ++ jbe L(first_vec_x0) ++# endif ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax + # else +- jnz L(first_vec_x0) ++ addq %rdi, %rax + # endif +- +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ ret + + # ifndef USE_AS_RAWMEMCHR +- /* Adjust length. */ +- addq %rcx, %rdx +- +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif +- jmp L(more_4x_vec) ++L(zero): ++ xorl %eax, %eax ++ ret + ++ .p2align 5 ++L(first_vec_x0): ++ /* Check if first match was before length. 
*/ ++ tzcntl %eax, %eax ++ xorl %ecx, %ecx ++ cmpl %eax, %edx ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++ cmovle %rcx, %rax ++ ret ++# else ++ /* NB: first_vec_x0 is 17 bytes which will leave ++ cross_page_boundary (which is relatively cold) close enough ++ to ideal alignment. So only realign L(cross_page_boundary) if ++ rawmemchr. */ + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx ++# endif ++L(cross_page_boundary): ++ /* Save pointer before aligning as its original value is ++ necessary for computer return address if byte is found or ++ adjusting length if it is not and this is memchr. */ ++ movq %rdi, %rcx ++ /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi ++ for rawmemchr. */ ++ andq $-VEC_SIZE, %ALGN_PTR_REG ++ VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0 ++ kmovd %k0, %r8d + # ifdef USE_AS_WMEMCHR +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ /* NB: Divide shift count by 4 since each bit in K0 represent 4 + bytes. */ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++ sarl $2, %eax ++# endif ++# ifndef USE_AS_RAWMEMCHR ++ movl $(PAGE_SIZE / CHAR_SIZE), %esi ++ subl %eax, %esi + # endif +- andq $-VEC_SIZE, %rdi +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- /* Remove the leading bytes. */ +- sarxl %SHIFT_REG, %eax, %eax +- testl %eax, %eax +- jz L(aligned_more) +- tzcntl %eax, %eax + # ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax ++ andl $(CHAR_PER_VEC - 1), %eax + # endif ++ /* Remove the leading bytes. */ ++ sarxl %eax, %r8d, %eax + # ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) ++ cmpq %rsi, %rdx ++ jbe L(first_vec_x0) ++# endif ++ testl %eax, %eax ++ jz L(cross_page_continue) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax ++# else ++ addq %RAW_PTR_REG, %rax + # endif +- addq %rdi, %rax +- addq %rcx, %rax + ret + + .p2align 4 +-L(aligned_more): +-# ifndef USE_AS_RAWMEMCHR +- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" +- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition +- overflow. */ +- negq %rcx +- addq $VEC_SIZE, %rcx ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- /* Check the end of data. */ +- subq %rcx, %rdx +- jbe L(zero) +-# endif ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- addq $VEC_SIZE, %rdi ++ .p2align 4 ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +-# ifndef USE_AS_RAWMEMCHR +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif ++ .p2align 4 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +-L(more_4x_vec): ++ .p2align 5 ++L(aligned_more): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Align data to VEC_SIZE. */ ++L(cross_page_continue): ++ xorl %ecx, %ecx ++ subl %edi, %ecx ++ andq $-VEC_SIZE, %rdi ++ /* esi is for adjusting length to see if near the end. 
*/ ++ leal (VEC_SIZE * 5)(%rdi, %rcx), %esi ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %esi ++# endif ++# else ++ andq $-VEC_SIZE, %rdi ++L(cross_page_continue): ++# endif ++ /* Load first VEC regardless. */ ++ VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. If near end handle specially. */ ++ subq %rsi, %rdx ++ jbe L(last_4x_vec_or_less) ++# endif + testl %eax, %eax + jnz L(first_vec_x1) + +- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- addq $(VEC_SIZE * 4), %rdi ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) ++ + + # ifndef USE_AS_RAWMEMCHR +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif ++ /* Check if at last CHAR_PER_VEC * 4 length. */ ++ subq $(CHAR_PER_VEC * 4), %rdx ++ jbe L(last_4x_vec_or_less_cmpeq) ++ addq $VEC_SIZE, %rdi + +- /* Align data to 4 * VEC_SIZE. */ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx ++ /* Align data to VEC_SIZE * 4 for the loop and readjust length. ++ */ ++# ifdef USE_AS_WMEMCHR ++ movl %edi, %ecx + andq $-(4 * VEC_SIZE), %rdi +- +-# ifndef USE_AS_RAWMEMCHR +- /* Adjust length. */ ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx + addq %rcx, %rdx ++# else ++ addq %rdi, %rdx ++ andq $-(4 * VEC_SIZE), %rdi ++ subq %rdi, %rdx ++# endif ++# else ++ addq $VEC_SIZE, %rdi ++ andq $-(4 * VEC_SIZE), %rdi + # endif + ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ ++ /* Compare 4 * VEC at a time forward. */ + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 +- kord %k1, %k2, %k5 +- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 +- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 +- +- kord %k3, %k4, %k6 +- kortestd %k5, %k6 +- jnz L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- ++ /* It would be possible to save some instructions using 4x VPCMP ++ but bottleneck on port 5 makes it not woth it. */ ++ VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 ++ /* xor will set bytes match esi to zero. */ ++ vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2 ++ vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 ++ VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 ++ /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ ++ VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z} ++ VPCMP $0, %YMM3, %YMMZERO, %k2 + # ifdef USE_AS_RAWMEMCHR +- jmp L(loop_4x_vec) ++ subq $-(VEC_SIZE * 4), %rdi ++ kortestd %k2, %k3 ++ jz L(loop_4x_vec) + # else +- subq $(VEC_SIZE * 4), %rdx ++ kortestd %k2, %k3 ++ jnz L(loop_4x_vec_end) ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ ++ subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop_4x_vec) + ++ /* Fall through into less than 4 remaining vectors of length case. ++ */ ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ addq $(VEC_SIZE * 3), %rdi ++ .p2align 4 + L(last_4x_vec_or_less): +- /* Less than 4 * VEC and aligned to VEC_SIZE. */ +- addl $(VEC_SIZE * 2), %edx +- jle L(last_2x_vec) +- +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++ /* Check if first VEC contained match. 
*/ + testl %eax, %eax +- jnz L(first_vec_x0) ++ jnz L(first_vec_x1_check) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) ++ /* If remaining length > CHAR_PER_VEC * 2. */ ++ addl $(CHAR_PER_VEC * 2), %edx ++ jg L(last_4x_vec) + +- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax ++L(last_2x_vec): ++ /* If remaining length < CHAR_PER_VEC. */ ++ addl $CHAR_PER_VEC, %edx ++ jle L(zero_end) + +- jnz L(first_vec_x2_check) +- subl $VEC_SIZE, %edx +- jle L(zero) ++ /* Check VEC2 and compare any match with remaining length. */ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ tzcntl %eax, %eax ++ cmpl %eax, %edx ++ jbe L(set_zero_end) ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++L(zero_end): ++ ret + +- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax + +- jnz L(first_vec_x3_check) ++ .p2align 4 ++L(first_vec_x1_check): ++ tzcntl %eax, %eax ++ /* Adjust length. */ ++ subl $-(CHAR_PER_VEC * 4), %edx ++ /* Check if match within remaining length. */ ++ cmpl %eax, %edx ++ jbe L(set_zero_end) ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++L(set_zero_end): + xorl %eax, %eax + ret + + .p2align 4 +-L(last_2x_vec): +- addl $(VEC_SIZE * 2), %edx +- VPCMP $0, (%rdi), %YMMMATCH, %k1 ++L(loop_4x_vec_end): ++# endif ++ /* rawmemchr will fall through into this if match was found in ++ loop. */ ++ ++ /* k1 has not of matches with VEC1. */ + kmovd %k1, %eax +- testl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ subl $((1 << CHAR_PER_VEC) - 1), %eax ++# else ++ incl %eax ++# endif ++ jnz L(last_vec_x1_return) + +- jnz L(first_vec_x0_check) +- subl $VEC_SIZE, %edx +- jle L(zero) ++ VPCMP $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2_return) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++ kmovd %k2, %eax + testl %eax, %eax +- jnz L(first_vec_x1_check) +- xorl %eax, %eax +- ret ++ jnz L(last_vec_x3_return) + +- .p2align 4 +-L(first_vec_x0_check): ++ kmovd %k3, %eax + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax ++# ifdef USE_AS_RAWMEMCHR ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++# else ++ leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax + # endif +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq %rdi, %rax + ret + + .p2align 4 +-L(first_vec_x1_check): ++L(last_vec_x1_return): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $VEC_SIZE, %rax ++# ifdef USE_AS_RAWMEMCHR ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else + addq %rdi, %rax +- ret +- +- .p2align 4 +-L(first_vec_x2_check): +- tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax ++# endif ++# else ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + # endif +- /* Check the end of data. 
*/ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax + ret + + .p2align 4 +-L(first_vec_x3_check): ++L(last_vec_x2_return): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax ++# ifdef USE_AS_RAWMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ++# else ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax + # endif +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax + ret + + .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret +-# endif +- +- .p2align 4 +-L(first_vec_x0): ++L(last_vec_x3_return): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (%rdi, %rax, 4), %rax ++# ifdef USE_AS_RAWMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + # else +- addq %rdi, %rax ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax + # endif + ret + ++ ++# ifndef USE_AS_RAWMEMCHR ++L(last_4x_vec_or_less_cmpeq): ++ VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ subq $-(VEC_SIZE * 4), %rdi ++ /* Check first VEC regardless. */ ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) ++ ++ /* If remaining length <= CHAR_PER_VEC * 2. */ ++ addl $(CHAR_PER_VEC * 2), %edx ++ jle L(last_2x_vec) ++ + .p2align 4 +-L(first_vec_x1): ++L(last_4x_vec): ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ /* Create mask for possible matches within remaining length. */ ++# ifdef USE_AS_WMEMCHR ++ movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx ++ bzhil %edx, %ecx, %ecx ++# else ++ movq $-1, %rcx ++ bzhiq %rdx, %rcx, %rcx ++# endif ++ /* Test matches in data against length match. */ ++ andl %ecx, %eax ++ jnz L(last_vec_x3) ++ ++ /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after ++ remaining length was found to be > CHAR_PER_VEC * 2. */ ++ subl $CHAR_PER_VEC, %edx ++ jbe L(zero_end2) ++ ++ ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ /* Shift remaining length mask for last VEC. */ ++# ifdef USE_AS_WMEMCHR ++ shrl $CHAR_PER_VEC, %ecx ++# else ++ shrq $CHAR_PER_VEC, %rcx ++# endif ++ andl %ecx, %eax ++ jz L(zero_end2) + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +-# endif ++ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax ++L(zero_end2): + ret + +- .p2align 4 +-L(first_vec_x2): ++L(last_vec_x2): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax +-# else +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax +-# endif ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +-L(4x_vec_end): +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- kmovd %k2, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- kmovd %k3, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- kmovd %k4, %eax +- testl %eax, %eax +-L(first_vec_x3): ++L(last_vec_x3): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax +-# else +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax +-# endif ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret ++# endif + + END (MEMCHR) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-27.patch b/glibc-RHEL-15696-27.patch new file mode 100644 index 0000000..9dcf16d --- /dev/null +++ b/glibc-RHEL-15696-27.patch @@ -0,0 +1,30 @@ +From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001 +From: Alice Xu +Date: Fri, 7 May 2021 19:03:21 -0700 +Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S +Content-type: text/plain; charset=UTF-8 + +An unknown vector operation occurred in commit 2a76821c308. Fixed it +by using "ymm{k1}{z}" but not "ymm {k1} {z}". + +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memchr-evex.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S +index 81d5cd64..f3fdad4f 100644 +--- a/sysdeps/x86_64/multiarch/memchr-evex.S ++++ b/sysdeps/x86_64/multiarch/memchr-evex.S +@@ -271,7 +271,7 @@ L(loop_4x_vec): + vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 + VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 + /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ +- VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z} ++ VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} + VPCMP $0, %YMM3, %YMMZERO, %k2 + # ifdef USE_AS_RAWMEMCHR + subq $-(VEC_SIZE * 4), %rdi +-- +GitLab + diff --git a/glibc-RHEL-15696-28.patch b/glibc-RHEL-15696-28.patch new file mode 100644 index 0000000..3063d4d --- /dev/null +++ b/glibc-RHEL-15696-28.patch @@ -0,0 +1,566 @@ +From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Tue, 22 Jun 2021 20:42:10 -0700 +Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S +Content-type: text/plain; charset=UTF-8 + +Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1 +version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S +and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants. +This also removes the unused symbols, __GI___strlen_sse2 and +__GI___wcsnlen_sse4_1. 
+--- + sysdeps/x86_64/multiarch/strlen-sse2.S | 2 +- + sysdeps/x86_64/multiarch/strlen-vec.S | 257 ++++++++++++++++++++++ + sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 2 +- + sysdeps/x86_64/strlen.S | 243 +------------------- + 4 files changed, 262 insertions(+), 242 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S + +Conflicts: + sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S + (Copyright dates, URL) + +diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S +index 7bc57b8d..449c8a7f 100644 +--- a/sysdeps/x86_64/multiarch/strlen-sse2.S ++++ b/sysdeps/x86_64/multiarch/strlen-sse2.S +@@ -20,4 +20,4 @@ + # define strlen __strlen_sse2 + #endif + +-#include "../strlen.S" ++#include "strlen-vec.S" +diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S +new file mode 100644 +index 00000000..8f660bb9 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strlen-vec.S +@@ -0,0 +1,257 @@ ++/* SSE2 version of strlen and SSE4.1 version of wcslen. ++ Copyright (C) 2012-2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#ifdef AS_WCSLEN ++# define PMINU pminud ++# define PCMPEQ pcmpeqd ++# define SHIFT_RETURN shrq $2, %rax ++#else ++# define PMINU pminub ++# define PCMPEQ pcmpeqb ++# define SHIFT_RETURN ++#endif ++ ++/* Long lived register in strlen(s), strnlen(s, n) are: ++ ++ %xmm3 - zero ++ %rdi - s ++ %r10 (s+n) & (~(64-1)) ++ %r11 s+n ++*/ ++ ++ ++.text ++ENTRY(strlen) ++ ++/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ ++#define FIND_ZERO \ ++ PCMPEQ (%rax), %xmm0; \ ++ PCMPEQ 16(%rax), %xmm1; \ ++ PCMPEQ 32(%rax), %xmm2; \ ++ PCMPEQ 48(%rax), %xmm3; \ ++ pmovmskb %xmm0, %esi; \ ++ pmovmskb %xmm1, %edx; \ ++ pmovmskb %xmm2, %r8d; \ ++ pmovmskb %xmm3, %ecx; \ ++ salq $16, %rdx; \ ++ salq $16, %rcx; \ ++ orq %rsi, %rdx; \ ++ orq %r8, %rcx; \ ++ salq $32, %rcx; \ ++ orq %rcx, %rdx; ++ ++#ifdef AS_STRNLEN ++/* Do not read anything when n==0. */ ++ test %RSI_LP, %RSI_LP ++ jne L(n_nonzero) ++ xor %rax, %rax ++ ret ++L(n_nonzero): ++# ifdef AS_WCSLEN ++ shl $2, %RSI_LP ++# endif ++ ++/* Initialize long lived registers. */ ++ ++ add %RDI_LP, %RSI_LP ++ mov %RSI_LP, %R10_LP ++ and $-64, %R10_LP ++ mov %RSI_LP, %R11_LP ++#endif ++ ++ pxor %xmm0, %xmm0 ++ pxor %xmm1, %xmm1 ++ pxor %xmm2, %xmm2 ++ pxor %xmm3, %xmm3 ++ movq %rdi, %rax ++ movq %rdi, %rcx ++ andq $4095, %rcx ++/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ ++ cmpq $4047, %rcx ++/* We cannot unify this branching as it would be ~6 cycles slower. */ ++ ja L(cross_page) ++ ++#ifdef AS_STRNLEN ++/* Test if end is among first 64 bytes. 
*/ ++# define STRNLEN_PROLOG \ ++ mov %r11, %rsi; \ ++ subq %rax, %rsi; \ ++ andq $-64, %rax; \ ++ testq $-64, %rsi; \ ++ je L(strnlen_ret) ++#else ++# define STRNLEN_PROLOG andq $-64, %rax; ++#endif ++ ++/* Ignore bits in mask that come before start of string. */ ++#define PROLOG(lab) \ ++ movq %rdi, %rcx; \ ++ xorq %rax, %rcx; \ ++ STRNLEN_PROLOG; \ ++ sarq %cl, %rdx; \ ++ test %rdx, %rdx; \ ++ je L(lab); \ ++ bsfq %rdx, %rax; \ ++ SHIFT_RETURN; \ ++ ret ++ ++#ifdef AS_STRNLEN ++ andq $-16, %rax ++ FIND_ZERO ++#else ++ /* Test first 16 bytes unaligned. */ ++ movdqu (%rax), %xmm4 ++ PCMPEQ %xmm0, %xmm4 ++ pmovmskb %xmm4, %edx ++ test %edx, %edx ++ je L(next48_bytes) ++ bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ ++ SHIFT_RETURN ++ ret ++ ++L(next48_bytes): ++/* Same as FIND_ZERO except we do not check first 16 bytes. */ ++ andq $-16, %rax ++ PCMPEQ 16(%rax), %xmm1 ++ PCMPEQ 32(%rax), %xmm2 ++ PCMPEQ 48(%rax), %xmm3 ++ pmovmskb %xmm1, %edx ++ pmovmskb %xmm2, %r8d ++ pmovmskb %xmm3, %ecx ++ salq $16, %rdx ++ salq $16, %rcx ++ orq %r8, %rcx ++ salq $32, %rcx ++ orq %rcx, %rdx ++#endif ++ ++ /* When no zero byte is found xmm1-3 are zero so we do not have to ++ zero them. */ ++ PROLOG(loop) ++ ++ .p2align 4 ++L(cross_page): ++ andq $-64, %rax ++ FIND_ZERO ++ PROLOG(loop_init) ++ ++#ifdef AS_STRNLEN ++/* We must do this check to correctly handle strnlen (s, -1). */ ++L(strnlen_ret): ++ bts %rsi, %rdx ++ sarq %cl, %rdx ++ test %rdx, %rdx ++ je L(loop_init) ++ bsfq %rdx, %rax ++ SHIFT_RETURN ++ ret ++#endif ++ .p2align 4 ++L(loop_init): ++ pxor %xmm1, %xmm1 ++ pxor %xmm2, %xmm2 ++ pxor %xmm3, %xmm3 ++#ifdef AS_STRNLEN ++ .p2align 4 ++L(loop): ++ ++ addq $64, %rax ++ cmpq %rax, %r10 ++ je L(exit_end) ++ ++ movdqa (%rax), %xmm0 ++ PMINU 16(%rax), %xmm0 ++ PMINU 32(%rax), %xmm0 ++ PMINU 48(%rax), %xmm0 ++ PCMPEQ %xmm3, %xmm0 ++ pmovmskb %xmm0, %edx ++ testl %edx, %edx ++ jne L(exit) ++ jmp L(loop) ++ ++ .p2align 4 ++L(exit_end): ++ cmp %rax, %r11 ++ je L(first) /* Do not read when end is at page boundary. */ ++ pxor %xmm0, %xmm0 ++ FIND_ZERO ++ ++L(first): ++ bts %r11, %rdx ++ bsfq %rdx, %rdx ++ addq %rdx, %rax ++ subq %rdi, %rax ++ SHIFT_RETURN ++ ret ++ ++ .p2align 4 ++L(exit): ++ pxor %xmm0, %xmm0 ++ FIND_ZERO ++ ++ bsfq %rdx, %rdx ++ addq %rdx, %rax ++ subq %rdi, %rax ++ SHIFT_RETURN ++ ret ++ ++#else ++ ++ /* Main loop. Unrolled twice to improve L2 cache performance on core2. 
*/ ++ .p2align 4 ++L(loop): ++ ++ movdqa 64(%rax), %xmm0 ++ PMINU 80(%rax), %xmm0 ++ PMINU 96(%rax), %xmm0 ++ PMINU 112(%rax), %xmm0 ++ PCMPEQ %xmm3, %xmm0 ++ pmovmskb %xmm0, %edx ++ testl %edx, %edx ++ jne L(exit64) ++ ++ subq $-128, %rax ++ ++ movdqa (%rax), %xmm0 ++ PMINU 16(%rax), %xmm0 ++ PMINU 32(%rax), %xmm0 ++ PMINU 48(%rax), %xmm0 ++ PCMPEQ %xmm3, %xmm0 ++ pmovmskb %xmm0, %edx ++ testl %edx, %edx ++ jne L(exit0) ++ jmp L(loop) ++ ++ .p2align 4 ++L(exit64): ++ addq $64, %rax ++L(exit0): ++ pxor %xmm0, %xmm0 ++ FIND_ZERO ++ ++ bsfq %rdx, %rdx ++ addq %rdx, %rax ++ subq %rdi, %rax ++ SHIFT_RETURN ++ ret ++ ++#endif ++ ++END(strlen) +diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S +index a8cab0cb..5fa51fe0 100644 +--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S ++++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S +@@ -2,4 +2,4 @@ + #define AS_STRNLEN + #define strlen __wcsnlen_sse4_1 + +-#include "../strlen.S" ++#include "strlen-vec.S" +diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S +index f845f3d4..ad047d84 100644 +--- a/sysdeps/x86_64/strlen.S ++++ b/sysdeps/x86_64/strlen.S +@@ -1,5 +1,5 @@ +-/* SSE2 version of strlen/wcslen. +- Copyright (C) 2012-2018 Free Software Foundation, Inc. ++/* SSE2 version of strlen. ++ Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or +@@ -16,243 +16,6 @@ + License along with the GNU C Library; if not, see + . */ + +-#include ++#include "multiarch/strlen-vec.S" + +-#ifdef AS_WCSLEN +-# define PMINU pminud +-# define PCMPEQ pcmpeqd +-# define SHIFT_RETURN shrq $2, %rax +-#else +-# define PMINU pminub +-# define PCMPEQ pcmpeqb +-# define SHIFT_RETURN +-#endif +- +-/* Long lived register in strlen(s), strnlen(s, n) are: +- +- %xmm3 - zero +- %rdi - s +- %r10 (s+n) & (~(64-1)) +- %r11 s+n +-*/ +- +- +-.text +-ENTRY(strlen) +- +-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ +-#define FIND_ZERO \ +- PCMPEQ (%rax), %xmm0; \ +- PCMPEQ 16(%rax), %xmm1; \ +- PCMPEQ 32(%rax), %xmm2; \ +- PCMPEQ 48(%rax), %xmm3; \ +- pmovmskb %xmm0, %esi; \ +- pmovmskb %xmm1, %edx; \ +- pmovmskb %xmm2, %r8d; \ +- pmovmskb %xmm3, %ecx; \ +- salq $16, %rdx; \ +- salq $16, %rcx; \ +- orq %rsi, %rdx; \ +- orq %r8, %rcx; \ +- salq $32, %rcx; \ +- orq %rcx, %rdx; +- +-#ifdef AS_STRNLEN +-/* Do not read anything when n==0. */ +- test %RSI_LP, %RSI_LP +- jne L(n_nonzero) +- xor %rax, %rax +- ret +-L(n_nonzero): +-# ifdef AS_WCSLEN +- shl $2, %RSI_LP +-# endif +- +-/* Initialize long lived registers. */ +- +- add %RDI_LP, %RSI_LP +- mov %RSI_LP, %R10_LP +- and $-64, %R10_LP +- mov %RSI_LP, %R11_LP +-#endif +- +- pxor %xmm0, %xmm0 +- pxor %xmm1, %xmm1 +- pxor %xmm2, %xmm2 +- pxor %xmm3, %xmm3 +- movq %rdi, %rax +- movq %rdi, %rcx +- andq $4095, %rcx +-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ +- cmpq $4047, %rcx +-/* We cannot unify this branching as it would be ~6 cycles slower. */ +- ja L(cross_page) +- +-#ifdef AS_STRNLEN +-/* Test if end is among first 64 bytes. */ +-# define STRNLEN_PROLOG \ +- mov %r11, %rsi; \ +- subq %rax, %rsi; \ +- andq $-64, %rax; \ +- testq $-64, %rsi; \ +- je L(strnlen_ret) +-#else +-# define STRNLEN_PROLOG andq $-64, %rax; +-#endif +- +-/* Ignore bits in mask that come before start of string. 
*/ +-#define PROLOG(lab) \ +- movq %rdi, %rcx; \ +- xorq %rax, %rcx; \ +- STRNLEN_PROLOG; \ +- sarq %cl, %rdx; \ +- test %rdx, %rdx; \ +- je L(lab); \ +- bsfq %rdx, %rax; \ +- SHIFT_RETURN; \ +- ret +- +-#ifdef AS_STRNLEN +- andq $-16, %rax +- FIND_ZERO +-#else +- /* Test first 16 bytes unaligned. */ +- movdqu (%rax), %xmm4 +- PCMPEQ %xmm0, %xmm4 +- pmovmskb %xmm4, %edx +- test %edx, %edx +- je L(next48_bytes) +- bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ +- SHIFT_RETURN +- ret +- +-L(next48_bytes): +-/* Same as FIND_ZERO except we do not check first 16 bytes. */ +- andq $-16, %rax +- PCMPEQ 16(%rax), %xmm1 +- PCMPEQ 32(%rax), %xmm2 +- PCMPEQ 48(%rax), %xmm3 +- pmovmskb %xmm1, %edx +- pmovmskb %xmm2, %r8d +- pmovmskb %xmm3, %ecx +- salq $16, %rdx +- salq $16, %rcx +- orq %r8, %rcx +- salq $32, %rcx +- orq %rcx, %rdx +-#endif +- +- /* When no zero byte is found xmm1-3 are zero so we do not have to +- zero them. */ +- PROLOG(loop) +- +- .p2align 4 +-L(cross_page): +- andq $-64, %rax +- FIND_ZERO +- PROLOG(loop_init) +- +-#ifdef AS_STRNLEN +-/* We must do this check to correctly handle strnlen (s, -1). */ +-L(strnlen_ret): +- bts %rsi, %rdx +- sarq %cl, %rdx +- test %rdx, %rdx +- je L(loop_init) +- bsfq %rdx, %rax +- SHIFT_RETURN +- ret +-#endif +- .p2align 4 +-L(loop_init): +- pxor %xmm1, %xmm1 +- pxor %xmm2, %xmm2 +- pxor %xmm3, %xmm3 +-#ifdef AS_STRNLEN +- .p2align 4 +-L(loop): +- +- addq $64, %rax +- cmpq %rax, %r10 +- je L(exit_end) +- +- movdqa (%rax), %xmm0 +- PMINU 16(%rax), %xmm0 +- PMINU 32(%rax), %xmm0 +- PMINU 48(%rax), %xmm0 +- PCMPEQ %xmm3, %xmm0 +- pmovmskb %xmm0, %edx +- testl %edx, %edx +- jne L(exit) +- jmp L(loop) +- +- .p2align 4 +-L(exit_end): +- cmp %rax, %r11 +- je L(first) /* Do not read when end is at page boundary. */ +- pxor %xmm0, %xmm0 +- FIND_ZERO +- +-L(first): +- bts %r11, %rdx +- bsfq %rdx, %rdx +- addq %rdx, %rax +- subq %rdi, %rax +- SHIFT_RETURN +- ret +- +- .p2align 4 +-L(exit): +- pxor %xmm0, %xmm0 +- FIND_ZERO +- +- bsfq %rdx, %rdx +- addq %rdx, %rax +- subq %rdi, %rax +- SHIFT_RETURN +- ret +- +-#else +- +- /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ +- .p2align 4 +-L(loop): +- +- movdqa 64(%rax), %xmm0 +- PMINU 80(%rax), %xmm0 +- PMINU 96(%rax), %xmm0 +- PMINU 112(%rax), %xmm0 +- PCMPEQ %xmm3, %xmm0 +- pmovmskb %xmm0, %edx +- testl %edx, %edx +- jne L(exit64) +- +- subq $-128, %rax +- +- movdqa (%rax), %xmm0 +- PMINU 16(%rax), %xmm0 +- PMINU 32(%rax), %xmm0 +- PMINU 48(%rax), %xmm0 +- PCMPEQ %xmm3, %xmm0 +- pmovmskb %xmm0, %edx +- testl %edx, %edx +- jne L(exit0) +- jmp L(loop) +- +- .p2align 4 +-L(exit64): +- addq $64, %rax +-L(exit0): +- pxor %xmm0, %xmm0 +- FIND_ZERO +- +- bsfq %rdx, %rdx +- addq %rdx, %rax +- subq %rdi, %rax +- SHIFT_RETURN +- ret +- +-#endif +- +-END(strlen) + libc_hidden_builtin_def (strlen) +-- +GitLab + diff --git a/glibc-RHEL-15696-29.patch b/glibc-RHEL-15696-29.patch new file mode 100644 index 0000000..112821a --- /dev/null +++ b/glibc-RHEL-15696-29.patch @@ -0,0 +1,181 @@ +From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Jun 2021 01:19:34 -0400 +Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1 +Content-type: text/plain; charset=UTF-8 + +No bug. This comment adds the ifunc / build infrastructure +necessary for wcslen to prefer the sse4.1 implementation +in strlen-vec.S. test-wcslen.c is passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/Makefile | 4 +- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 3 ++ + sysdeps/x86_64/multiarch/ifunc-wcslen.h | 52 ++++++++++++++++++++++ + sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 4 ++ + sysdeps/x86_64/multiarch/wcslen.c | 2 +- + sysdeps/x86_64/multiarch/wcsnlen.c | 34 +------------- + 6 files changed, 63 insertions(+), 36 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h + create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 491c7698..65fde4eb 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcscpy-ssse3 wcscpy-c \ + wcschr-sse2 wcschr-avx2 \ + wcsrchr-sse2 wcsrchr-avx2 \ +- wcsnlen-sse4_1 wcsnlen-c \ +- wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \ ++ wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \ ++ wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \ + wcschr-avx2-rtm \ + wcscmp-avx2-rtm \ + wcslen-avx2-rtm \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index f1a6460a..580913ca 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcslen_evex) ++ IFUNC_IMPL_ADD (array, i, wcsnlen, ++ CPU_FEATURE_USABLE (SSE4_1), ++ __wcsnlen_sse4_1) + IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ +diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h +new file mode 100644 +index 00000000..39e33473 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h +@@ -0,0 +1,52 @@ ++/* Common definition for ifunc selections for wcslen and wcsnlen ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2017-2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; ++ ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++ const struct cpu_features* cpu_features = __get_cpu_features (); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) ++ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ return OPTIMIZE (evex); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) ++ return OPTIMIZE (sse4_1); ++ ++ return OPTIMIZE (sse2); ++} +diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S +new file mode 100644 +index 00000000..7e62621a +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S +@@ -0,0 +1,4 @@ ++#define AS_WCSLEN ++#define strlen __wcslen_sse4_1 ++ ++#include "strlen-vec.S" +diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c +index 6d06e47c..3b04b75b 100644 +--- a/sysdeps/x86_64/multiarch/wcslen.c ++++ b/sysdeps/x86_64/multiarch/wcslen.c +@@ -24,7 +24,7 @@ + # undef __wcslen + + # define SYMBOL_NAME wcslen +-# include "ifunc-avx2.h" ++# include "ifunc-wcslen.h" + + libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ()); + weak_alias (__wcslen, wcslen); +diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c +index 20b731ae..06736410 100644 +--- a/sysdeps/x86_64/multiarch/wcsnlen.c ++++ b/sysdeps/x86_64/multiarch/wcsnlen.c +@@ -24,39 +24,7 @@ + # undef __wcsnlen + + # define SYMBOL_NAME wcsnlen +-# include +- +-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; +- +-static inline void * +-IFUNC_SELECTOR (void) +-{ +- const struct cpu_features* cpu_features = __get_cpu_features (); +- +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) +- && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- { +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) +- return OPTIMIZE (evex); +- +- if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) +- return OPTIMIZE (avx2_rtm); +- +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) +- return OPTIMIZE (avx2); +- } +- +- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) +- return OPTIMIZE (sse4_1); +- +- return OPTIMIZE (sse2); +-} ++# include "ifunc-wcslen.h" + + libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ()); + weak_alias (__wcsnlen, wcsnlen); +-- +GitLab + diff --git a/glibc-RHEL-15696-3.patch b/glibc-RHEL-15696-3.patch new file mode 100644 index 0000000..8f5093c --- /dev/null +++ b/glibc-RHEL-15696-3.patch @@ -0,0 +1,396 @@ +From 
231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:27:25 -0800 +Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ# + 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes memcpy for x32. Tested on x86-64 and x32. On x86-64, +libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for + length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise. + * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: + Likewise. + * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: + Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy. + tst-size_t-wmemchr. + * sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file. +--- + sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 17 ++++-- + sysdeps/x86_64/multiarch/memcpy-ssse3.S | 17 ++++-- + .../multiarch/memmove-avx512-no-vzeroupper.S | 16 +++-- + .../multiarch/memmove-vec-unaligned-erms.S | 54 +++++++++-------- + sysdeps/x86_64/x32/Makefile | 2 +- + sysdeps/x86_64/x32/tst-size_t-memcpy.c | 58 +++++++++++++++++++ + 6 files changed, 122 insertions(+), 42 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +index 3cd11233..568eebd3 100644 +--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S ++++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +@@ -45,28 +45,33 @@ + .section .text.ssse3,"ax",@progbits + #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE + ENTRY (MEMPCPY_CHK) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMPCPY_CHK) + + ENTRY (MEMPCPY) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start) + END (MEMPCPY) + #endif + + #if !defined USE_AS_BCOPY + ENTRY (MEMCPY_CHK) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMCPY_CHK) + #endif + + ENTRY (MEMCPY) +- mov %rdi, %rax ++ mov %RDI_LP, %RAX_LP + #ifdef USE_AS_MEMPCPY +- add %rdx, %rax ++ add %RDX_LP, %RAX_LP ++#endif ++ ++#ifdef __ILP32__ ++ /* Clear the upper 32 bits. 
*/ ++ mov %edx, %edx + #endif + + #ifdef USE_AS_MEMMOVE +diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S +index 0240bfa3..0bd5ee99 100644 +--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S ++++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S +@@ -45,28 +45,33 @@ + .section .text.ssse3,"ax",@progbits + #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE + ENTRY (MEMPCPY_CHK) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMPCPY_CHK) + + ENTRY (MEMPCPY) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start) + END (MEMPCPY) + #endif + + #if !defined USE_AS_BCOPY + ENTRY (MEMCPY_CHK) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMCPY_CHK) + #endif + + ENTRY (MEMCPY) +- mov %rdi, %rax ++ mov %RDI_LP, %RAX_LP + #ifdef USE_AS_MEMPCPY +- add %rdx, %rax ++ add %RDX_LP, %RAX_LP ++#endif ++ ++#ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx + #endif + + #ifdef USE_AS_MEMMOVE +diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S +index effc3ac2..6ca2bbc9 100644 +--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S ++++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S +@@ -24,27 +24,31 @@ + + .section .text.avx512,"ax",@progbits + ENTRY (__mempcpy_chk_avx512_no_vzeroupper) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__mempcpy_chk_avx512_no_vzeroupper) + + ENTRY (__mempcpy_avx512_no_vzeroupper) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start) + END (__mempcpy_avx512_no_vzeroupper) + + ENTRY (__memmove_chk_avx512_no_vzeroupper) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__memmove_chk_avx512_no_vzeroupper) + + ENTRY (__memmove_avx512_no_vzeroupper) +- mov %rdi, %rax ++ mov %RDI_LP, %RAX_LP + # ifdef USE_AS_MEMPCPY +- add %rdx, %rax ++ add %RDX_LP, %RAX_LP + # endif + L(start): ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx ++# endif + lea (%rsi, %rdx), %rcx + lea (%rdi, %rdx), %r9 + cmp $512, %rdx +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index c952576c..274aa1c7 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -95,20 +95,20 @@ + .section SECTION(.text),"ax",@progbits + #if defined SHARED && IS_IN (libc) + ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) + #endif + + ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start) + END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) + + #if defined SHARED && IS_IN (libc) + ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) + #endif +@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) + ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) + movq %rdi, %rax + L(start): +- cmpq $VEC_SIZE, %rdx ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. 
*/ ++ movl %edx, %edx ++# endif ++ cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) +- cmpq $(VEC_SIZE * 2), %rdx ++ cmp $(VEC_SIZE * 2), %RDX_LP + ja L(more_2x_vec) + #if !defined USE_MULTIARCH || !IS_IN (libc) + L(last_2x_vec): +@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned)) + + # if VEC_SIZE == 16 + ENTRY (__mempcpy_chk_erms) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__mempcpy_chk_erms) + + /* Only used to measure performance of REP MOVSB. */ + ENTRY (__mempcpy_erms) +- movq %rdi, %rax ++ mov %RDI_LP, %RAX_LP + /* Skip zero length. */ +- testq %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz 2f +- addq %rdx, %rax ++ add %RDX_LP, %RAX_LP + jmp L(start_movsb) + END (__mempcpy_erms) + + ENTRY (__memmove_chk_erms) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__memmove_chk_erms) + + ENTRY (__memmove_erms) + movq %rdi, %rax + /* Skip zero length. */ +- testq %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz 2f + L(start_movsb): +- movq %rdx, %rcx +- cmpq %rsi, %rdi ++ mov %RDX_LP, %RCX_LP ++ cmp %RSI_LP, %RDI_LP + jb 1f + /* Source == destination is less common. */ + je 2f +- leaq (%rsi,%rcx), %rdx +- cmpq %rdx, %rdi ++ lea (%rsi,%rcx), %RDX_LP ++ cmp %RDX_LP, %RDI_LP + jb L(movsb_backward) + 1: + rep movsb +@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms) + + # ifdef SHARED + ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) + # endif + + ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start_erms) + END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + + # ifdef SHARED + ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + # endif +@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + movq %rdi, %rax + L(start_erms): +- cmpq $VEC_SIZE, %rdx ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif ++ cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) +- cmpq $(VEC_SIZE * 2), %rdx ++ cmp $(VEC_SIZE * 2), %RDX_LP + ja L(movsb_more_2x_vec) + L(last_2x_vec): + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +@@ -236,7 +244,7 @@ L(movsb): + /* Avoid slow backward REP MOVSB. */ + jb L(more_8x_vec_backward) + 1: +- movq %rdx, %rcx ++ mov %RDX_LP, %RCX_LP + rep movsb + L(nop): + ret +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index ddec7f04..2fe1e5ac 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround + endif + + ifeq ($(subdir),string) +-tests += tst-size_t-memchr tst-size_t-memcmp ++tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy + endif + + ifeq ($(subdir),wcsmbs) +diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c +new file mode 100644 +index 00000000..66b71e17 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c +@@ -0,0 +1,58 @@ ++/* Test memcpy with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define TEST_NAME "memcpy" ++#include "test-size_t.h" ++ ++IMPL (memcpy, 1) ++ ++typedef void *(*proto_t) (void *, const void *, size_t); ++ ++static void * ++__attribute__ ((noinline, noclone)) ++do_memcpy (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ do_memcpy (dest, src); ++ int res = memcmp (dest.p, src.p, dest.len); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +-- +GitLab + diff --git a/glibc-RHEL-15696-30.patch b/glibc-RHEL-15696-30.patch new file mode 100644 index 0000000..0b16f0f --- /dev/null +++ b/glibc-RHEL-15696-30.patch @@ -0,0 +1,497 @@ +From a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Jun 2021 01:56:29 -0400 +Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ + #27974] +Content-type: text/plain; charset=UTF-8 + +This commit fixes the bug mentioned in the previous commit. + +The previous implementations of wmemchr in these files relied +on maxlen * sizeof(wchar_t) which was not guranteed by the standard. + +The new overflow tests added in the previous commit now +pass (As well as all the other tests). + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++------- + sysdeps/x86_64/multiarch/strlen-vec.S | 15 ++- + 2 files changed, 107 insertions(+), 38 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S +index be8a5db5..37688966 100644 +--- a/sysdeps/x86_64/multiarch/strlen-avx2.S ++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S +@@ -44,21 +44,21 @@ + + # define VEC_SIZE 32 + # define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text),"ax",@progbits + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN + /* Check zero length. */ ++# ifdef __ILP32__ ++ /* Clear upper bits. */ ++ and %RSI_LP, %RSI_LP ++# else + test %RSI_LP, %RSI_LP ++# endif + jz L(zero) + /* Store max len in R8_LP before adjusting if using WCSLEN. */ + mov %RSI_LP, %R8_LP +-# ifdef USE_AS_WCSLEN +- shl $2, %RSI_LP +-# elif defined __ILP32__ +- /* Clear the upper 32 bits. */ +- movl %esi, %esi +-# endif + # endif + movl %edi, %eax + movq %rdi, %rdx +@@ -72,10 +72,10 @@ ENTRY (STRLEN) + + /* Check the first VEC_SIZE bytes. */ + VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + # ifdef USE_AS_STRNLEN + /* If length < VEC_SIZE handle special. 
*/ +- cmpq $VEC_SIZE, %rsi ++ cmpq $CHAR_PER_VEC, %rsi + jbe L(first_vec_x0) + # endif + /* If empty continue to aligned_more. Otherwise return bit +@@ -84,6 +84,7 @@ ENTRY (STRLEN) + jz L(aligned_more) + tzcntl %eax, %eax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -97,9 +98,14 @@ L(zero): + L(first_vec_x0): + /* Set bit for max len so that tzcnt will return min of max len + and position of first match. */ ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %esi ++# endif + btsq %rsi, %rax + tzcntl %eax, %eax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -113,14 +119,19 @@ L(first_vec_x1): + # ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ ++# ifdef USE_AS_WCSLEN ++ leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax ++# else + subl $(VEC_SIZE * 4 + 1), %ecx + addl %ecx, %eax ++# endif + # else + subl %edx, %edi + incl %edi + addl %edi, %eax + # endif + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -133,14 +144,19 @@ L(first_vec_x2): + # ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ ++# ifdef USE_AS_WCSLEN ++ leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax ++# else + subl $(VEC_SIZE * 3 + 1), %ecx + addl %ecx, %eax ++# endif + # else + subl %edx, %edi + addl $(VEC_SIZE + 1), %edi + addl %edi, %eax + # endif + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -153,14 +169,19 @@ L(first_vec_x3): + # ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ ++# ifdef USE_AS_WCSLEN ++ leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax ++# else + subl $(VEC_SIZE * 2 + 1), %ecx + addl %ecx, %eax ++# endif + # else + subl %edx, %edi + addl $(VEC_SIZE * 2 + 1), %edi + addl %edi, %eax + # endif + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -173,14 +194,19 @@ L(first_vec_x4): + # ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ ++# ifdef USE_AS_WCSLEN ++ leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax ++# else + subl $(VEC_SIZE + 1), %ecx + addl %ecx, %eax ++# endif + # else + subl %edx, %edi + addl $(VEC_SIZE * 3 + 1), %edi + addl %edi, %eax + # endif + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -195,10 +221,14 @@ L(cross_page_continue): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + # ifdef USE_AS_STRNLEN +- /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because +- it simplies the logic in last_4x_vec_or_less. */ ++ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE ++ because it simplies the logic in last_4x_vec_or_less. */ + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx + subq %rdx, %rcx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx ++# endif + # endif + /* Load first VEC regardless. 
*/ + VPCMPEQ 1(%rdi), %ymm0, %ymm1 +@@ -207,34 +237,38 @@ L(cross_page_continue): + subq %rcx, %rsi + jb L(last_4x_vec_or_less) + # endif +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x4) + + /* Align data to VEC_SIZE * 4 - 1. */ + # ifdef USE_AS_STRNLEN + /* Before adjusting length check if at last VEC_SIZE * 4. */ +- cmpq $(VEC_SIZE * 4 - 1), %rsi ++ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi + jbe L(last_4x_vec_or_less_load) + incq %rdi + movl %edi, %ecx + orq $(VEC_SIZE * 4 - 1), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx ++# endif + /* Readjust length. */ + addq %rcx, %rsi + # else +@@ -246,13 +280,13 @@ L(cross_page_continue): + L(loop_4x_vec): + # ifdef USE_AS_STRNLEN + /* Break if at end of length. */ +- subq $(VEC_SIZE * 4), %rsi ++ subq $(CHAR_PER_VEC * 4), %rsi + jb L(last_4x_vec_or_less_cmpeq) + # endif +- /* Save some code size by microfusing VPMINU with the load. Since +- the matches in ymm2/ymm4 can only be returned if there where no +- matches in ymm1/ymm3 respectively there is no issue with overlap. +- */ ++ /* Save some code size by microfusing VPMINU with the load. ++ Since the matches in ymm2/ymm4 can only be returned if there ++ where no matches in ymm1/ymm3 respectively there is no issue ++ with overlap. */ + vmovdqa 1(%rdi), %ymm1 + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 +@@ -260,7 +294,7 @@ L(loop_4x_vec): + + VPMINU %ymm2, %ymm4, %ymm5 + VPCMPEQ %ymm5, %ymm0, %ymm5 +- vpmovmskb %ymm5, %ecx ++ vpmovmskb %ymm5, %ecx + + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx +@@ -268,27 +302,28 @@ L(loop_4x_vec): + + + VPCMPEQ %ymm1, %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + subq %rdx, %rdi + testl %eax, %eax + jnz L(last_vec_return_x0) + + VPCMPEQ %ymm2, %ymm0, %ymm2 +- vpmovmskb %ymm2, %eax ++ vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_return_x1) + + /* Combine last 2 VEC. */ + VPCMPEQ %ymm3, %ymm0, %ymm3 +- vpmovmskb %ymm3, %eax +- /* rcx has combined result from all 4 VEC. It will only be used if +- the first 3 other VEC all did not contain a match. */ ++ vpmovmskb %ymm3, %eax ++ /* rcx has combined result from all 4 VEC. It will only be used ++ if the first 3 other VEC all did not contain a match. */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax + subq $(VEC_SIZE * 2 - 1), %rdi + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -297,15 +332,19 @@ L(loop_4x_vec): + # ifdef USE_AS_STRNLEN + .p2align 4 + L(last_4x_vec_or_less_load): +- /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ ++ /* Depending on entry adjust rdi / prepare first VEC in ymm1. ++ */ + subq $-(VEC_SIZE * 4), %rdi + L(last_4x_vec_or_less_cmpeq): + VPCMPEQ 1(%rdi), %ymm0, %ymm1 + L(last_4x_vec_or_less): +- +- vpmovmskb %ymm1, %eax +- /* If remaining length > VEC_SIZE * 2. This works if esi is off by +- VEC_SIZE * 4. 
*/ ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %esi ++# endif ++ vpmovmskb %ymm1, %eax ++ /* If remaining length > VEC_SIZE * 2. This works if esi is off ++ by VEC_SIZE * 4. */ + testl $(VEC_SIZE * 2), %esi + jnz L(last_4x_vec) + +@@ -320,7 +359,7 @@ L(last_4x_vec_or_less): + jb L(max) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi +@@ -329,6 +368,7 @@ L(last_4x_vec_or_less): + addl $(VEC_SIZE + 1), %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -340,6 +380,7 @@ L(last_vec_return_x0): + subq $(VEC_SIZE * 4 - 1), %rdi + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -350,6 +391,7 @@ L(last_vec_return_x1): + subq $(VEC_SIZE * 3 - 1), %rdi + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -366,6 +408,7 @@ L(last_vec_x1_check): + incl %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -381,14 +424,14 @@ L(last_4x_vec): + jnz L(last_vec_x1) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + /* Normalize length. */ + andl $(VEC_SIZE * 4 - 1), %esi + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + +@@ -396,7 +439,7 @@ L(last_4x_vec): + jb L(max) + + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi +@@ -405,6 +448,7 @@ L(last_4x_vec): + addl $(VEC_SIZE * 3 + 1), %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -419,6 +463,7 @@ L(last_vec_x1): + incl %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -432,6 +477,7 @@ L(last_vec_x2): + addl $(VEC_SIZE + 1), %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -447,6 +493,7 @@ L(last_vec_x3): + addl $(VEC_SIZE * 2 + 1), %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -455,13 +502,13 @@ L(max_end): + VZEROUPPER_RETURN + # endif + +- /* Cold case for crossing page with first load. */ ++ /* Cold case for crossing page with first load. */ + .p2align 4 + L(cross_page_boundary): + /* Align data to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod rdx. */ + sarxl %edx, %eax, %eax +@@ -470,6 +517,10 @@ L(cross_page_boundary): + jnz L(cross_page_less_vec) + leaq 1(%rdi), %rcx + subq %rdx, %rcx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ ++ shrl $2, %ecx ++# endif + /* Check length. 
*/ + cmpq %rsi, %rcx + jb L(cross_page_continue) +@@ -479,6 +530,7 @@ L(cross_page_boundary): + jz L(cross_page_continue) + tzcntl %eax, %eax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide length by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + # endif +@@ -489,6 +541,10 @@ L(return_vzeroupper): + .p2align 4 + L(cross_page_less_vec): + tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %esi ++# endif + cmpq %rax, %rsi + cmovb %esi, %eax + # ifdef USE_AS_WCSLEN +diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S +index 8f660bb9..439e486a 100644 +--- a/sysdeps/x86_64/multiarch/strlen-vec.S ++++ b/sysdeps/x86_64/multiarch/strlen-vec.S +@@ -65,12 +65,25 @@ ENTRY(strlen) + ret + L(n_nonzero): + # ifdef AS_WCSLEN +- shl $2, %RSI_LP ++/* Check for overflow from maxlen * sizeof(wchar_t). If it would ++ overflow the only way this program doesn't have undefined behavior ++ is if there is a null terminator in valid memory so wcslen will ++ suffice. */ ++ mov %RSI_LP, %R10_LP ++ sar $62, %R10_LP ++ test %R10_LP, %R10_LP ++ jnz __wcslen_sse4_1 ++ sal $2, %RSI_LP + # endif + ++ + /* Initialize long lived registers. */ + + add %RDI_LP, %RSI_LP ++# ifdef AS_WCSLEN ++/* Check for overflow again from s + maxlen * sizeof(wchar_t). */ ++ jbe __wcslen_sse4_1 ++# endif + mov %RSI_LP, %R10_LP + and $-64, %R10_LP + mov %RSI_LP, %R11_LP +-- +GitLab + diff --git a/glibc-RHEL-15696-31.patch b/glibc-RHEL-15696-31.patch new file mode 100644 index 0000000..4ef6911 --- /dev/null +++ b/glibc-RHEL-15696-31.patch @@ -0,0 +1,745 @@ +From 4ba65586847751372520a36757c17f114588794e Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 19 Apr 2021 19:36:06 -0400 +Subject: [PATCH] x86: Optimize strlen-evex.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes strlen-evex.S. The +optimizations are mostly small things but they add up to roughly +10-30% performance improvement for strlen. The results for strnlen are +bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and +test-wcsnlen are all passing. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++----------- + 1 file changed, 317 insertions(+), 264 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S +index 05838190..4bf6874b 100644 +--- a/sysdeps/x86_64/multiarch/strlen-evex.S ++++ b/sysdeps/x86_64/multiarch/strlen-evex.S +@@ -29,11 +29,13 @@ + # ifdef USE_AS_WCSLEN + # define VPCMP vpcmpd + # define VPMINU vpminud +-# define SHIFT_REG r9d ++# define SHIFT_REG ecx ++# define CHAR_SIZE 4 + # else + # define VPCMP vpcmpb + # define VPMINU vpminub +-# define SHIFT_REG ecx ++# define SHIFT_REG edx ++# define CHAR_SIZE 1 + # endif + + # define XMMZERO xmm16 +@@ -46,132 +48,165 @@ + # define YMM6 ymm22 + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section .text.evex,"ax",@progbits + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN +- /* Check for zero length. */ ++ /* Check zero length. */ + test %RSI_LP, %RSI_LP + jz L(zero) +-# ifdef USE_AS_WCSLEN +- shl $2, %RSI_LP +-# elif defined __ILP32__ ++# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi + # endif + mov %RSI_LP, %R8_LP + # endif +- movl %edi, %ecx +- movq %rdi, %rdx ++ movl %edi, %eax + vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- ++ /* Clear high bits from edi. Only keeping bits relevant to page ++ cross check. 
*/ ++ andl $(PAGE_SIZE - 1), %eax + /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a + null byte. */ + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax +- testl %eax, %eax +- + # ifdef USE_AS_STRNLEN +- jnz L(first_vec_x0_check) +- /* Adjust length and check the end of data. */ +- subq $VEC_SIZE, %rsi +- jbe L(max) +-# else +- jnz L(first_vec_x0) ++ /* If length < CHAR_PER_VEC handle special. */ ++ cmpq $CHAR_PER_VEC, %rsi ++ jbe L(first_vec_x0) + # endif +- +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++ ret + # ifdef USE_AS_STRNLEN +- /* Adjust length. */ +- addq %rcx, %rsi ++L(zero): ++ xorl %eax, %eax ++ ret + +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) ++ .p2align 4 ++L(first_vec_x0): ++ /* Set bit for max len so that tzcnt will return min of max len ++ and position of first match. */ ++ btsq %rsi, %rax ++ tzcntl %eax, %eax ++ ret + # endif +- jmp L(more_4x_vec) + + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- +-# ifdef USE_AS_WCSLEN +- /* NB: Divide shift count by 4 since each bit in K0 represent 4 +- bytes. */ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ ++# ifdef USE_AS_STRNLEN ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax ++# else ++ subl %edx, %edi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %edi ++# endif ++ leal CHAR_PER_VEC(%rdi, %rax), %eax + # endif +- VPCMP $0, (%rdi), %YMMZERO, %k0 +- kmovd %k0, %eax ++ ret + +- /* Remove the leading bytes. */ +- sarxl %SHIFT_REG, %eax, %eax +- testl %eax, %eax +- jz L(aligned_more) ++ .p2align 4 ++L(first_vec_x2): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +-# endif +- addq %rdi, %rax +- addq %rcx, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax ++# else ++ subl %edx, %edi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %edi ++# endif ++ leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax + # endif + ret + + .p2align 4 +-L(aligned_more): ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" +- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" +- to void possible addition overflow. */ +- negq %rcx +- addq $VEC_SIZE, %rcx +- +- /* Check the end of data. */ +- subq %rcx, %rsi +- jbe L(max) ++ /* Use ecx which was computed earlier to compute correct value. 
++ */ ++ leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax ++# else ++ subl %edx, %edi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %edi ++# endif ++ leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax + # endif ++ ret + +- addq $VEC_SIZE, %rdi +- ++ .p2align 4 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax ++# else ++ subl %edx, %edi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %edi ++# endif ++ leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax + # endif ++ ret + +-L(more_4x_vec): ++ .p2align 5 ++L(aligned_more): ++ movq %rdi, %rdx ++ /* Align data to VEC_SIZE. */ ++ andq $-(VEC_SIZE), %rdi ++L(cross_page_continue): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +- VPCMP $0, (%rdi), %YMMZERO, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- ++# ifdef USE_AS_STRNLEN ++ /* + CHAR_SIZE because it simplies the logic in ++ last_4x_vec_or_less. */ ++ leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx ++ subq %rdx, %rcx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx ++# endif ++# endif ++ /* Load first VEC regardless. */ + VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. If near end handle specially. */ ++ subq %rcx, %rsi ++ jb L(last_4x_vec_or_less) ++# endif + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax +- testl %eax, %eax ++ test %eax, %eax + jnz L(first_vec_x2) + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 +@@ -179,258 +214,276 @@ L(more_4x_vec): + testl %eax, %eax + jnz L(first_vec_x3) + +- addq $(VEC_SIZE * 4), %rdi +- +-# ifdef USE_AS_STRNLEN +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) +-# endif +- +- /* Align data to 4 * VEC_SIZE. */ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx +- andq $-(4 * VEC_SIZE), %rdi ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) + ++ addq $VEC_SIZE, %rdi + # ifdef USE_AS_STRNLEN +- /* Adjust length. */ ++ /* Check if at last VEC_SIZE * 4 length. */ ++ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi ++ jbe L(last_4x_vec_or_less_load) ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx ++# endif ++ /* Readjust length. */ + addq %rcx, %rsi + # endif ++ /* Align data to VEC_SIZE * 4. */ ++ andq $-(VEC_SIZE * 4), %rdi + ++ /* Compare 4 * VEC at a time forward. */ + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- VMOVA (%rdi), %YMM1 +- VMOVA VEC_SIZE(%rdi), %YMM2 +- VMOVA (VEC_SIZE * 2)(%rdi), %YMM3 +- VMOVA (VEC_SIZE * 3)(%rdi), %YMM4 +- +- VPMINU %YMM1, %YMM2, %YMM5 +- VPMINU %YMM3, %YMM4, %YMM6 ++ /* Load first VEC regardless. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 ++# ifdef USE_AS_STRNLEN ++ /* Break if at end of length. */ ++ subq $(CHAR_PER_VEC * 4), %rsi ++ jb L(last_4x_vec_or_less_cmpeq) ++# endif ++ /* Save some code size by microfusing VPMINU with the load. 
Since ++ the matches in ymm2/ymm4 can only be returned if there where no ++ matches in ymm1/ymm3 respectively there is no issue with overlap. ++ */ ++ VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2 ++ VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 ++ VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4 ++ ++ VPCMP $0, %YMM2, %YMMZERO, %k0 ++ VPCMP $0, %YMM4, %YMMZERO, %k1 ++ subq $-(VEC_SIZE * 4), %rdi ++ kortestd %k0, %k1 ++ jz L(loop_4x_vec) ++ ++ /* Check if end was in first half. */ ++ kmovd %k0, %eax ++ subq %rdx, %rdi ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rdi ++# endif ++ testl %eax, %eax ++ jz L(second_vec_return) + +- VPMINU %YMM5, %YMM6, %YMM5 +- VPCMP $0, %YMM5, %YMMZERO, %k0 +- ktestd %k0, %k0 +- jnz L(4x_vec_end) ++ VPCMP $0, %YMM1, %YMMZERO, %k2 ++ kmovd %k2, %edx ++ /* Combine VEC1 matches (edx) with VEC2 matches (eax). */ ++# ifdef USE_AS_WCSLEN ++ sall $CHAR_PER_VEC, %eax ++ orl %edx, %eax ++ tzcntl %eax, %eax ++# else ++ salq $CHAR_PER_VEC, %rax ++ orq %rdx, %rax ++ tzcntq %rax, %rax ++# endif ++ addq %rdi, %rax ++ ret + +- addq $(VEC_SIZE * 4), %rdi + +-# ifndef USE_AS_STRNLEN +- jmp L(loop_4x_vec) +-# else +- subq $(VEC_SIZE * 4), %rsi +- ja L(loop_4x_vec) ++# ifdef USE_AS_STRNLEN + ++L(last_4x_vec_or_less_load): ++ /* Depending on entry adjust rdi / prepare first VEC in YMM1. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 ++L(last_4x_vec_or_less_cmpeq): ++ VPCMP $0, %YMM1, %YMMZERO, %k0 ++ addq $(VEC_SIZE * 3), %rdi + L(last_4x_vec_or_less): +- /* Less than 4 * VEC and aligned to VEC_SIZE. */ +- addl $(VEC_SIZE * 2), %esi +- jle L(last_2x_vec) +- +- VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax ++ /* If remaining length > VEC_SIZE * 2. This works if esi is off by ++ VEC_SIZE * 4. */ ++ testl $(CHAR_PER_VEC * 2), %esi ++ jnz L(last_4x_vec) ++ ++ /* length may have been negative or positive by an offset of ++ CHAR_PER_VEC * 4 depending on where this was called from. This ++ fixes that. */ ++ andl $(CHAR_PER_VEC * 4 - 1), %esi + testl %eax, %eax +- jnz L(first_vec_x0) ++ jnz L(last_vec_x1_check) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) ++ /* Check the end of data. */ ++ subl $CHAR_PER_VEC, %esi ++ jb L(max) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x2_check) +- subl $VEC_SIZE, %esi +- jle L(max) ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpl %eax, %esi ++ jb L(max) + +- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x3_check) ++ subq %rdx, %rdi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarq $2, %rdi ++# endif ++ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax ++ ret ++L(max): + movq %r8, %rax ++ ret ++# endif ++ ++ /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less) ++ in the 4x VEC loop can use 2 byte encoding. */ ++ .p2align 4 ++L(second_vec_return): ++ VPCMP $0, %YMM3, %YMMZERO, %k0 ++ /* Combine YMM3 matches (k0) with YMM4 matches (k1). */ ++# ifdef USE_AS_WCSLEN ++ kunpckbw %k0, %k1, %k0 ++ kmovd %k0, %eax ++ tzcntl %eax, %eax ++# else ++ kunpckdq %k0, %k1, %k0 ++ kmovq %k0, %rax ++ tzcntq %rax, %rax ++# endif ++ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax ++ ret ++ ++ ++# ifdef USE_AS_STRNLEN ++L(last_vec_x1_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpl %eax, %esi ++ jb L(max) ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. 
*/ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax + ret + + .p2align 4 +-L(last_2x_vec): +- addl $(VEC_SIZE * 2), %esi ++L(last_4x_vec): ++ /* Test first 2x VEC normally. */ ++ testl %eax, %eax ++ jnz L(last_vec_x1) + +- VPCMP $0, (%rdi), %YMMZERO, %k0 ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax +- jnz L(first_vec_x0_check) +- subl $VEC_SIZE, %esi +- jle L(max) ++ jnz L(last_vec_x2) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++ /* Normalize length. */ ++ andl $(CHAR_PER_VEC * 4 - 1), %esi ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax +- jnz L(first_vec_x1_check) +- movq %r8, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif +- ret ++ jnz L(last_vec_x3) + +- .p2align 4 +-L(first_vec_x0_check): ++ /* Check the end of data. */ ++ subl $(CHAR_PER_VEC * 3), %esi ++ jb L(max) ++ ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif + /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq %rdi, %rax +- subq %rdx, %rax ++ cmpl %eax, %esi ++ jb L(max_end) ++ ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax + ret + + .p2align 4 +-L(first_vec_x1_check): ++L(last_vec_x1): + tzcntl %eax, %eax ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax + ret + + .p2align 4 +-L(first_vec_x2_check): ++L(last_vec_x2): + tzcntl %eax, %eax ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax + ret + + .p2align 4 +-L(first_vec_x3_check): ++L(last_vec_x3): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif ++ subl $(CHAR_PER_VEC * 2), %esi + /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax +- subq %rdx, %rax ++ cmpl %eax, %esi ++ jb L(max_end) ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax + ret +- +- .p2align 4 +-L(max): ++L(max_end): + movq %r8, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif +- ret +- +- .p2align 4 +-L(zero): +- xorl %eax, %eax + ret + # endif + ++ /* Cold case for crossing page with first load. */ + .p2align 4 +-L(first_vec_x0): +- tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %eax +-# endif +- addq %rdi, %rax +- subq %rdx, %rax ++L(cross_page_boundary): ++ movq %rdi, %rdx ++ /* Align data to VEC_SIZE. */ ++ andq $-VEC_SIZE, %rdi ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ /* Remove the leading bytes. */ + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide shift count by 4 since each bit in K0 represent 4 ++ bytes. */ ++ movl %edx, %ecx ++ shrl $2, %ecx ++ andl $(CHAR_PER_VEC - 1), %ecx + # endif +- ret +- +- .p2align 4 +-L(first_vec_x1): ++ /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */ ++ sarxl %SHIFT_REG, %eax, %eax ++ testl %eax, %eax ++# ifndef USE_AS_STRNLEN ++ jz L(cross_page_continue) + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif + ret +- +- .p2align 4 +-L(first_vec_x2): +- tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif ++# else ++ jnz L(cross_page_less_vec) ++# ifndef USE_AS_WCSLEN ++ movl %edx, %ecx ++ andl $(CHAR_PER_VEC - 1), %ecx ++# endif ++ movl $CHAR_PER_VEC, %eax ++ subl %ecx, %eax ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ ja L(cross_page_continue) ++ movl %esi, %eax + ret +- +- .p2align 4 +-L(4x_vec_end): +- VPCMP $0, %YMM1, %YMMZERO, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- VPCMP $0, %YMM2, %YMMZERO, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- VPCMP $0, %YMM3, %YMMZERO, %k2 +- kmovd %k2, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- VPCMP $0, %YMM4, %YMMZERO, %k3 +- kmovd %k3, %eax +-L(first_vec_x3): ++L(cross_page_less_vec): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif ++ /* Select min of length and position of first null. */ ++ cmpq %rax, %rsi ++ cmovb %esi, %eax + ret ++# endif + + END (STRLEN) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-32.patch b/glibc-RHEL-15696-32.patch new file mode 100644 index 0000000..8f1a94a --- /dev/null +++ b/glibc-RHEL-15696-32.patch @@ -0,0 +1,158 @@ +From ea8e465a6b8d0f26c72bcbe453a854de3abf68ec Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Wed, 30 Jun 2021 10:47:06 -0700 +Subject: [PATCH] x86: Check RTM_ALWAYS_ABORT for RTM [BZ #28033] +Content-type: text/plain; charset=UTF-8 + +From + +https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html + +* Intel TSX will be disabled by default. +* The processor will force abort all Restricted Transactional Memory (RTM) + transactions by default. +* A new CPUID bit CPUID.07H.0H.EDX[11](RTM_ALWAYS_ABORT) will be enumerated, + which is set to indicate to updated software that the loaded microcode is + forcing RTM abort. +* On processors that enumerate support for RTM, the CPUID enumeration bits + for Intel TSX (CPUID.07H.0H.EBX[11] and CPUID.07H.0H.EBX[4]) continue to + be set by default after microcode update. +* Workloads that were benefited from Intel TSX might experience a change + in performance. 
+* System software may use a new bit in Model-Specific Register (MSR) 0x10F + TSX_FORCE_ABORT[TSX_CPUID_CLEAR] functionality to clear the Hardware Lock + Elision (HLE) and RTM bits to indicate to software that Intel TSX is + disabled. + +1. Add RTM_ALWAYS_ABORT to CPUID features. +2. Set RTM usable only if RTM_ALWAYS_ABORT isn't set. This skips the +string/tst-memchr-rtm etc. testcases on the affected processors, which +always fail after a microcde update. +3. Check RTM feature, instead of usability, against /proc/cpuinfo. + +This fixes BZ #28033. +--- + manual/platform.texi | 3 +++ + sysdeps/x86/cpu-features.c | 5 ++++- + sysdeps/x86/sys/platform/x86.h | 6 +++--- + sysdeps/x86/tst-cpu-features-supports.c | 2 +- + sysdeps/x86/tst-get-cpu-features.c | 2 ++ + 5 files changed, 13 insertions(+), 5 deletions(-) + +Conflicts: + sysdeps/x86/bits/platform/x86.h + (doesn't exist) + sysdeps/x86/bits/platform/x86.h + (account for lack of upstream renames) + +diff --git a/manual/platform.texi b/manual/platform.texi +index 8fec2933..b7e8aef7 100644 +--- a/manual/platform.texi ++++ b/manual/platform.texi +@@ -510,6 +510,9 @@ capability. + @item + @code{RTM} -- RTM instruction extensions. + ++@item ++@code{RTM_ALWAYS_ABORT} -- Transactions always abort, making RTM unusable. ++ + @item + @code{SDBG} -- IA32_DEBUG_INTERFACE MSR for silicon debug. + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 3610ee5c..4889f062 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -74,7 +74,6 @@ update_usable (struct cpu_features *cpu_features) + CPU_FEATURE_SET_USABLE (cpu_features, HLE); + CPU_FEATURE_SET_USABLE (cpu_features, BMI2); + CPU_FEATURE_SET_USABLE (cpu_features, ERMS); +- CPU_FEATURE_SET_USABLE (cpu_features, RTM); + CPU_FEATURE_SET_USABLE (cpu_features, RDSEED); + CPU_FEATURE_SET_USABLE (cpu_features, ADX); + CPU_FEATURE_SET_USABLE (cpu_features, CLFLUSHOPT); +@@ -90,6 +89,7 @@ update_usable (struct cpu_features *cpu_features) + CPU_FEATURE_SET_USABLE (cpu_features, MOVDIRI); + CPU_FEATURE_SET_USABLE (cpu_features, MOVDIR64B); + CPU_FEATURE_SET_USABLE (cpu_features, FSRM); ++ CPU_FEATURE_SET_USABLE (cpu_features, RTM_ALWAYS_ABORT); + CPU_FEATURE_SET_USABLE (cpu_features, SERIALIZE); + CPU_FEATURE_SET_USABLE (cpu_features, TSXLDTRK); + CPU_FEATURE_SET_USABLE (cpu_features, LAHF64_SAHF64); +@@ -779,6 +779,9 @@ no_cpuid: + GLRO(dl_platform) = "i586"; + #endif + ++ if (!CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT)) ++ CPU_FEATURE_SET_USABLE (cpu_features, RTM); ++ + #if CET_ENABLED + # if HAVE_TUNABLES + TUNABLE_GET (x86_ibt, tunable_val_t *, +diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h +index e5cc7c68..7a434926 100644 +--- a/sysdeps/x86/sys/platform/x86.h ++++ b/sysdeps/x86/sys/platform/x86.h +@@ -247,7 +247,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int) + #define bit_cpu_AVX512_VP2INTERSECT (1u << 8) + #define bit_cpu_INDEX_7_EDX_9 (1u << 9) + #define bit_cpu_MD_CLEAR (1u << 10) +-#define bit_cpu_INDEX_7_EDX_11 (1u << 11) ++#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11) + #define bit_cpu_INDEX_7_EDX_12 (1u << 12) + #define bit_cpu_INDEX_7_EDX_13 (1u << 13) + #define bit_cpu_SERIALIZE (1u << 14) +@@ -471,7 +471,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int) + #define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7 + #define index_cpu_INDEX_7_EDX_9 COMMON_CPUID_INDEX_7 + #define index_cpu_MD_CLEAR COMMON_CPUID_INDEX_7 +-#define index_cpu_INDEX_7_EDX_11 
COMMON_CPUID_INDEX_7 ++#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7 + #define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7 + #define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7 + #define index_cpu_SERIALIZE COMMON_CPUID_INDEX_7 +@@ -695,7 +695,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int) + #define reg_AVX512_VP2INTERSECT edx + #define reg_INDEX_7_EDX_9 edx + #define reg_MD_CLEAR edx +-#define reg_INDEX_7_EDX_11 edx ++#define reg_RTM_ALWAYS_ABORT edx + #define reg_INDEX_7_EDX_12 edx + #define reg_INDEX_7_EDX_13 edx + #define reg_SERIALIZE edx +diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c +index 287cf01f..8100a319 100644 +--- a/sysdeps/x86/tst-cpu-features-supports.c ++++ b/sysdeps/x86/tst-cpu-features-supports.c +@@ -152,7 +152,7 @@ do_test (int argc, char **argv) + fails += CHECK_SUPPORTS (rdpid, RDPID); + fails += CHECK_SUPPORTS (rdrnd, RDRAND); + fails += CHECK_SUPPORTS (rdseed, RDSEED); +- fails += CHECK_SUPPORTS (rtm, RTM); ++ fails += CHECK_CPU_SUPPORTS (rtm, RTM); + fails += CHECK_SUPPORTS (serialize, SERIALIZE); + fails += CHECK_SUPPORTS (sha, SHA); + fails += CHECK_CPU_SUPPORTS (shstk, SHSTK); +diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c +index 2763deb6..0717e5d8 100644 +--- a/sysdeps/x86/tst-get-cpu-features.c ++++ b/sysdeps/x86/tst-get-cpu-features.c +@@ -183,6 +183,7 @@ do_test (void) + CHECK_CPU_FEATURE (UINTR); + CHECK_CPU_FEATURE (AVX512_VP2INTERSECT); + CHECK_CPU_FEATURE (MD_CLEAR); ++ CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT); + CHECK_CPU_FEATURE (SERIALIZE); + CHECK_CPU_FEATURE (HYBRID); + CHECK_CPU_FEATURE (TSXLDTRK); +@@ -344,6 +345,7 @@ do_test (void) + CHECK_CPU_FEATURE_USABLE (FSRM); + CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT); + CHECK_CPU_FEATURE_USABLE (MD_CLEAR); ++ CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT); + CHECK_CPU_FEATURE_USABLE (SERIALIZE); + CHECK_CPU_FEATURE_USABLE (HYBRID); + CHECK_CPU_FEATURE_USABLE (TSXLDTRK); +-- +GitLab + diff --git a/glibc-RHEL-15696-33.patch b/glibc-RHEL-15696-33.patch new file mode 100644 index 0000000..1196471 --- /dev/null +++ b/glibc-RHEL-15696-33.patch @@ -0,0 +1,51 @@ +From 0679442defedf7e52a94264975880ab8674736b2 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 8 Jul 2021 16:13:19 -0400 +Subject: [PATCH] x86: Remove wcsnlen-sse4_1 from wcslen ifunc-impl-list [BZ + #28064] +Content-type: text/plain; charset=UTF-8 + +The following commit + +commit 6f573a27b6c8b4236445810a44660612323f5a73 +Author: Noah Goldstein +Date: Wed Jun 23 01:19:34 2021 -0400 + + x86-64: Add wcslen optimize for sse4.1 + +Added wcsnlen-sse4.1 to the wcslen ifunc implementation list and did +not add wcslen-sse4.1 to wcslen ifunc implementation list. This commit +fixes that by removing wcsnlen-sse4.1 from the wcslen ifunc +implementation list and adding wcslen-sse4.1 to the ifunc +implementation list. + +Testing: +test-wcslen.c, test-rsi-wcslen.c, and test-rsi-strlen.c are passing as +well as all other tests in wcsmbs and string. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 580913ca..695cdba6 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -657,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcslen_evex) +- IFUNC_IMPL_ADD (array, i, wcsnlen, ++ IFUNC_IMPL_ADD (array, i, wcslen, + CPU_FEATURE_USABLE (SSE4_1), +- __wcsnlen_sse4_1) ++ __wcslen_sse4_1) + IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ +-- +GitLab + diff --git a/glibc-RHEL-15696-34.patch b/glibc-RHEL-15696-34.patch new file mode 100644 index 0000000..f7c9a56 --- /dev/null +++ b/glibc-RHEL-15696-34.patch @@ -0,0 +1,135 @@ +From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 15 Feb 2022 08:18:15 -0600 +Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ + #28896] +Content-type: text/plain; charset=UTF-8 + +In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would +call strcmp-avx2 and wcscmp-avx2 respectively. This would have +not checks around vzeroupper and would trigger spurious +aborts. This commit fixes that. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on +AVX2 machines with and without RTM. + +Co-authored-by: H.J. Lu +--- + sysdeps/x86/Makefile | 2 +- + sysdeps/x86/tst-strncmp-rtm.c | 17 ++++++++++++++++- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +- + sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S | 1 + + sysdeps/x86_64/multiarch/strncmp-avx2.S | 1 + + sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S | 2 +- + sysdeps/x86_64/multiarch/wcsncmp-avx2.S | 2 +- + 7 files changed, 22 insertions(+), 5 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/strcmp-avx2.S + (split into two patches due to upstream bug differences) + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 5be71ada..2d814915 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -38,7 +38,7 @@ CFLAGS-tst-memset-rtm.c += -mrtm + CFLAGS-tst-strchr-rtm.c += -mrtm + CFLAGS-tst-strcpy-rtm.c += -mrtm + CFLAGS-tst-strlen-rtm.c += -mrtm +-CFLAGS-tst-strncmp-rtm.c += -mrtm ++CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error + CFLAGS-tst-strrchr-rtm.c += -mrtm + endif + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index 236ad951..4d0004b5 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -16,6 +16,7 @@ + License along with the GNU C Library; if not, see + . 
*/ + ++#include + #include + + #define LOOP 3000 +@@ -45,8 +46,22 @@ function (void) + return 1; + } + ++__attribute__ ((noinline, noclone)) ++static int ++function_overflow (void) ++{ ++ if (strncmp (string1, string2, SIZE_MAX) == 0) ++ return 0; ++ else ++ return 1; ++} ++ + static int + do_test (void) + { +- return do_test_1 ("strncmp", LOOP, prepare, function); ++ int status = do_test_1 ("strncmp", LOOP, prepare, function); ++ if (status != EXIT_SUCCESS) ++ return status; ++ status = do_test_1 ("strncmp", LOOP, prepare, function_overflow); ++ return status; + } +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 5d1c9d90..433ae047 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -95,7 +95,7 @@ ENTRY (STRCMP) + length to bound a valid memory region. In these cases just use + 'wcscmp'. */ + shrq $56, %rcx +- jnz __wcscmp_avx2 ++ jnz OVERFLOW_STRCMP + # endif + /* Convert units: from wide to byte char. */ + shl $2, %RDX_LP +diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +index 37d1224b..68bad365 100644 +--- a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S ++++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +@@ -1,3 +1,4 @@ + #define STRCMP __strncmp_avx2_rtm + #define USE_AS_STRNCMP 1 ++#define OVERFLOW_STRCMP __strcmp_avx2_rtm + #include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S +index 1678bcc2..f138e9f1 100644 +--- a/sysdeps/x86_64/multiarch/strncmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S +@@ -1,3 +1,4 @@ + #define STRCMP __strncmp_avx2 + #define USE_AS_STRNCMP 1 ++#define OVERFLOW_STRCMP __strcmp_avx2 + #include "strcmp-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S +index 4e88c70c..f467582c 100644 +--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S ++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S +@@ -1,5 +1,5 @@ + #define STRCMP __wcsncmp_avx2_rtm + #define USE_AS_STRNCMP 1 + #define USE_AS_WCSCMP 1 +- ++#define OVERFLOW_STRCMP __wcscmp_avx2_rtm + #include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S +index 4fa1de4d..e9ede522 100644 +--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S +@@ -1,5 +1,5 @@ + #define STRCMP __wcsncmp_avx2 + #define USE_AS_STRNCMP 1 + #define USE_AS_WCSCMP 1 +- ++#define OVERFLOW_STRCMP __wcscmp_avx2 + #include "strcmp-avx2.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-35.patch b/glibc-RHEL-15696-35.patch new file mode 100644 index 0000000..5e4fbdd --- /dev/null +++ b/glibc-RHEL-15696-35.patch @@ -0,0 +1,51 @@ +From 55c7bcc71b84123d5d4bd2814366a6b05fcf8ebd Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sat, 9 May 2020 12:04:23 -0700 +Subject: [PATCH] x86-64: Use RDX_LP on __x86_shared_non_temporal_threshold [BZ + #25966] +Content-type: text/plain; charset=UTF-8 + +Since __x86_shared_non_temporal_threshold is defined as + +long int __x86_shared_non_temporal_threshold; + +and long int is 4 bytes for x32, use RDX_LP to compare against +__x86_shared_non_temporal_threshold in assembly code. 
+--- + sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 71f5954d..673b73aa 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -245,7 +245,7 @@ L(return): + #endif + + L(movsb): +- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + jae L(more_8x_vec) + cmpq %rsi, %rdi + jb 1f +@@ -397,7 +397,7 @@ L(more_8x_vec): + addq %r8, %rdx + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ +- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + ja L(large_forward) + #endif + L(loop_4x_vec_forward): +@@ -448,7 +448,7 @@ L(more_8x_vec_backward): + subq %r8, %rdx + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ +- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + ja L(large_backward) + #endif + L(loop_4x_vec_backward): +-- +GitLab + diff --git a/glibc-RHEL-15696-36.patch b/glibc-RHEL-15696-36.patch new file mode 100644 index 0000000..e00b96e --- /dev/null +++ b/glibc-RHEL-15696-36.patch @@ -0,0 +1,44 @@ +From a35a59036ebae3efcdf5e8167610e0656fca9770 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 11 Jun 2020 12:41:18 -0700 +Subject: [PATCH] x86_64: Use %xmmN with vpxor to clear a vector register +Content-type: text/plain; charset=UTF-8 + +Since "vpxor %xmmN, %xmmN, %xmmN" clears the whole vector register, use +%xmmN, instead of %ymmN, with vpxor to clear a vector register. +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 4 ++-- + sysdeps/x86_64/multiarch/strrchr-avx2.S | 2 +- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 433ae047..70d8499b 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -105,8 +105,8 @@ ENTRY (STRCMP) + # endif + movl %edi, %eax + xorl %edx, %edx +- /* Make %ymm7 all zeros in this function. */ +- vpxor %ymm7, %ymm7, %ymm7 ++ /* Make %xmm7 (%ymm7) all zeros in this function. */ ++ vpxor %xmm7, %xmm7, %xmm7 + orl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S +index 9f22a15e..c949410b 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S +@@ -48,7 +48,7 @@ ENTRY (STRRCHR) + movl %edi, %ecx + /* Broadcast CHAR to YMM4. */ + VPBROADCAST %xmm4, %ymm4 +- vpxor %ymm0, %ymm0, %ymm0 ++ vpxor %xmm0, %xmm0, %xmm0 + + /* Check if we may cross page boundary with one vector load. */ + andl $(2 * VEC_SIZE - 1), %ecx +-- +GitLab + diff --git a/glibc-RHEL-15696-37.patch b/glibc-RHEL-15696-37.patch new file mode 100644 index 0000000..10b0cc4 --- /dev/null +++ b/glibc-RHEL-15696-37.patch @@ -0,0 +1,359 @@ +From 1f745ecc2109890886b161d4791e1406fdfc29b8 Mon Sep 17 00:00:00 2001 +From: noah +Date: Wed, 3 Feb 2021 00:38:59 -0500 +Subject: [PATCH] x86-64: Refactor and improve performance of strchr-avx2.S +Content-type: text/plain; charset=UTF-8 + +No bug. 
Just seemed the performance could be improved a bit. Observed +and expected behavior are unchanged. Optimized body of main +loop. Updated page cross logic and optimized accordingly. Made a few +minor instruction selection modifications. No regressions in test +suite. Both test-strchrnul and test-strchr passed. +--- + sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++------------- + sysdeps/x86_64/multiarch/strchr.c | 4 +- + 2 files changed, 114 insertions(+), 115 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/strchr.c + (account for missing upstream macros) + +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index da7d2620..919d256c 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -27,10 +27,12 @@ + # ifdef USE_AS_WCSCHR + # define VPBROADCAST vpbroadcastd + # define VPCMPEQ vpcmpeqd ++# define VPMINU vpminud + # define CHAR_REG esi + # else + # define VPBROADCAST vpbroadcastb + # define VPCMPEQ vpcmpeqb ++# define VPMINU vpminub + # define CHAR_REG sil + # endif + +@@ -43,71 +45,54 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits + ENTRY (STRCHR) + movl %edi, %ecx +- /* Broadcast CHAR to YMM0. */ ++# ifndef USE_AS_STRCHRNUL ++ xorl %edx, %edx ++# endif ++ ++ /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + vpxor %xmm9, %xmm9, %xmm9 + VPBROADCAST %xmm0, %ymm0 +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) + +- /* Check the first VEC_SIZE bytes. Search for both CHAR and the +- null byte. */ +- vmovdqu (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) ++ /* Check if we cross page boundary with one vector load. */ ++ andl $(PAGE_SIZE - 1), %ecx ++ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx ++ ja L(cross_page_boundary) + +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- +- jmp L(more_4x_vec) +- +- .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ /* Check the first VEC_SIZE bytes. Search for both CHAR and the ++ null byte. */ + vmovdqu (%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax +- /* Remove the leading bytes. */ +- sarl %cl, %eax + testl %eax, %eax +- jz L(aligned_more) +- /* Found CHAR or the null byte. */ ++ jz L(more_vecs) + tzcntl %eax, %eax +- addq %rcx, %rax +-# ifdef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ + addq %rdi, %rax +-# else +- xorl %edx, %edx +- leaq (%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 ++L(more_vecs): ++ /* Align data for aligned loads in the loop. */ ++ andq $-VEC_SIZE, %rdi + L(aligned_more): +- addq $VEC_SIZE, %rdi + +-L(more_4x_vec): +- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time +- since data is only aligned to VEC_SIZE. */ +- vmovdqa (%rdi), %ymm8 ++ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. 
*/ ++ vmovdqa VEC_SIZE(%rdi), %ymm8 ++ addq $VEC_SIZE, %rdi + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 +@@ -137,61 +122,24 @@ L(more_4x_vec): + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax +- jnz L(first_vec_x3) +- +- addq $(VEC_SIZE * 4), %rdi +- +- /* Align data to 4 * VEC_SIZE. */ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx +- andq $-(4 * VEC_SIZE), %rdi +- +- .p2align 4 +-L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- vmovdqa (%rdi), %ymm5 +- vmovdqa VEC_SIZE(%rdi), %ymm6 +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 +- +- VPCMPEQ %ymm5, %ymm0, %ymm1 +- VPCMPEQ %ymm6, %ymm0, %ymm2 +- VPCMPEQ %ymm7, %ymm0, %ymm3 +- VPCMPEQ %ymm8, %ymm0, %ymm4 +- +- VPCMPEQ %ymm5, %ymm9, %ymm5 +- VPCMPEQ %ymm6, %ymm9, %ymm6 +- VPCMPEQ %ymm7, %ymm9, %ymm7 +- VPCMPEQ %ymm8, %ymm9, %ymm8 +- +- vpor %ymm1, %ymm5, %ymm1 +- vpor %ymm2, %ymm6, %ymm2 +- vpor %ymm3, %ymm7, %ymm3 +- vpor %ymm4, %ymm8, %ymm4 +- +- vpor %ymm1, %ymm2, %ymm5 +- vpor %ymm3, %ymm4, %ymm6 +- +- vpor %ymm5, %ymm6, %ymm5 +- +- vpmovmskb %ymm5, %eax +- testl %eax, %eax +- jnz L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi ++ jz L(prep_loop_4x) + +- jmp L(loop_4x_vec) ++ tzcntl %eax, %eax ++ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret + + .p2align 4 + L(first_vec_x0): +- /* Found CHAR or the null byte. */ + tzcntl %eax, %eax +-# ifdef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ + addq %rdi, %rax +-# else +- xorl %edx, %edx +- leaq (%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + VZEROUPPER_RETURN +@@ -199,13 +147,9 @@ L(first_vec_x0): + .p2align 4 + L(first_vec_x1): + tzcntl %eax, %eax +-# ifdef USE_AS_STRCHRNUL +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +-# else +- xorl %edx, %edx + leaq VEC_SIZE(%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + VZEROUPPER_RETURN +@@ -213,42 +157,97 @@ L(first_vec_x1): + .p2align 4 + L(first_vec_x2): + tzcntl %eax, %eax +-# ifdef USE_AS_STRCHRNUL +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax +-# else +- xorl %edx, %edx ++ /* Found CHAR or the null byte. */ + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + VZEROUPPER_RETURN + ++L(prep_loop_4x): ++ /* Align data to 4 * VEC_SIZE. */ ++ andq $-(VEC_SIZE * 4), %rdi ++ + .p2align 4 +-L(4x_vec_end): ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5 ++ vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7 ++ vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8 ++ ++ /* Leaves only CHARS matching esi as 0. 
*/ ++ vpxor %ymm5, %ymm0, %ymm1 ++ vpxor %ymm6, %ymm0, %ymm2 ++ vpxor %ymm7, %ymm0, %ymm3 ++ vpxor %ymm8, %ymm0, %ymm4 ++ ++ VPMINU %ymm1, %ymm5, %ymm1 ++ VPMINU %ymm2, %ymm6, %ymm2 ++ VPMINU %ymm3, %ymm7, %ymm3 ++ VPMINU %ymm4, %ymm8, %ymm4 ++ ++ VPMINU %ymm1, %ymm2, %ymm5 ++ VPMINU %ymm3, %ymm4, %ymm6 ++ ++ VPMINU %ymm5, %ymm6, %ymm5 ++ ++ VPCMPEQ %ymm5, %ymm9, %ymm5 ++ vpmovmskb %ymm5, %eax ++ ++ addq $(VEC_SIZE * 4), %rdi ++ testl %eax, %eax ++ jz L(loop_4x_vec) ++ ++ VPCMPEQ %ymm1, %ymm9, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x0) ++ ++ VPCMPEQ %ymm2, %ymm9, %ymm2 + vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(first_vec_x1) +- vpmovmskb %ymm3, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) ++ ++ VPCMPEQ %ymm3, %ymm9, %ymm3 ++ VPCMPEQ %ymm4, %ymm9, %ymm4 ++ vpmovmskb %ymm3, %ecx + vpmovmskb %ymm4, %eax ++ salq $32, %rax ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ /* Cold case for crossing page with first load. */ ++ .p2align 4 ++L(cross_page_boundary): ++ andq $-VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ ++ vmovdqa (%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Remove the leading bits. */ ++ sarxl %ecx, %eax, %eax + testl %eax, %eax +-L(first_vec_x3): ++ jz L(aligned_more) + tzcntl %eax, %eax +-# ifdef USE_AS_STRCHRNUL +- addq $(VEC_SIZE * 3), %rax ++ addq %rcx, %rdi + addq %rdi, %rax +-# else +- xorl %edx, %edx +- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + VZEROUPPER_RETURN + + END (STRCHR) +-#endif ++# endif +diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c +index 7e582f02..5225bd4f 100644 +--- a/sysdeps/x86_64/multiarch/strchr.c ++++ b/sysdeps/x86_64/multiarch/strchr.c +@@ -38,11 +38,11 @@ IFUNC_SELECTOR (void) + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) +-- +GitLab + diff --git a/glibc-RHEL-15696-38.patch b/glibc-RHEL-15696-38.patch new file mode 100644 index 0000000..f97ab23 --- /dev/null +++ b/glibc-RHEL-15696-38.patch @@ -0,0 +1,67 @@ +From 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sat, 25 Jan 2020 14:19:40 -0800 +Subject: [PATCH] x86-64: Avoid rep movsb with short distance [BZ #27130] +Content-type: text/plain; charset=UTF-8 + +When copying with "rep movsb", if the distance between source and +destination is N*4GB + [1..63] with N >= 0, performance may be very +slow. 
This patch updates memmove-vec-unaligned-erms.S for AVX and +AVX512 versions with the distance in RCX: + + cmpl $63, %ecx + // Don't use "rep movsb" if ECX <= 63 + jbe L(Don't use rep movsb") + Use "rep movsb" + +Benchtests data with bench-memcpy, bench-memcpy-large, bench-memcpy-random +and bench-memcpy-walk on Skylake, Ice Lake and Tiger Lake show that its +performance impact is within noise range as "rep movsb" is only used for +data size >= 4KB. +--- + .../multiarch/memmove-vec-unaligned-erms.S | 21 +++++++++++++++++++ + 1 file changed, 21 insertions(+) + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 673b73aa..c475fed4 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -64,6 +64,13 @@ + # endif + #endif + ++/* Avoid short distance rep movsb only with non-SSE vector. */ ++#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB ++# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) ++#else ++# define AVOID_SHORT_DISTANCE_REP_MOVSB 0 ++#endif ++ + #ifndef PREFETCH + # define PREFETCH(addr) prefetcht0 addr + #endif +@@ -255,7 +262,21 @@ L(movsb): + cmpq %r9, %rdi + /* Avoid slow backward REP MOVSB. */ + jb L(more_8x_vec_backward) ++# if AVOID_SHORT_DISTANCE_REP_MOVSB ++ movq %rdi, %rcx ++ subq %rsi, %rcx ++ jmp 2f ++# endif + 1: ++# if AVOID_SHORT_DISTANCE_REP_MOVSB ++ movq %rsi, %rcx ++ subq %rdi, %rcx ++2: ++/* Avoid "rep movsb" if RCX, the distance between source and destination, ++ is N*4GB + [1..63] with N >= 0. */ ++ cmpl $63, %ecx ++ jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */ ++# endif + mov %RDX_LP, %RCX_LP + rep movsb + L(nop): +-- +GitLab + diff --git a/glibc-RHEL-15696-39.patch b/glibc-RHEL-15696-39.patch new file mode 100644 index 0000000..8343ba9 --- /dev/null +++ b/glibc-RHEL-15696-39.patch @@ -0,0 +1,449 @@ +From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001 +From: noah +Date: Sat, 3 Apr 2021 04:12:15 -0400 +Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No Bug. This commit updates the large memcpy case (no overlap). The +update is to perform memcpy on either 2 or 4 contiguous pages at +once. This 1) helps to alleviate the affects of false memory aliasing +when destination and source have a close 4k alignment and 2) In most +cases and for most DRAM units is a modestly more efficient access +pattern. These changes are a clear performance improvement for +VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, +test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all +pass. + +Signed-off-by: Noah Goldstein +--- + .../multiarch/memmove-vec-unaligned-erms.S | 338 ++++++++++++++---- + 1 file changed, 265 insertions(+), 73 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S + (different number of sections) + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index c475fed4..3e2dd6bc 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -32,7 +32,16 @@ + overlapping addresses. + 6. If size >= __x86_shared_non_temporal_threshold and there is no + overlap between destination and source, use non-temporal store +- instead of aligned store. */ ++ instead of aligned store copying from either 2 or 4 pages at ++ once. ++ 8. 
For point 7) if size < 16 * __x86_shared_non_temporal_threshold ++ and source and destination do not page alias, copy from 2 pages ++ at once using non-temporal stores. Page aliasing in this case is ++ considered true if destination's page alignment - sources' page ++ alignment is less than 8 * VEC_SIZE. ++ 9. If size >= 16 * __x86_shared_non_temporal_threshold or source ++ and destination do page alias copy from 4 pages at once using ++ non-temporal stores. */ + + #include + +@@ -64,6 +73,34 @@ + # endif + #endif + ++#ifndef PAGE_SIZE ++# define PAGE_SIZE 4096 ++#endif ++ ++#if PAGE_SIZE != 4096 ++# error Unsupported PAGE_SIZE ++#endif ++ ++#ifndef LOG_PAGE_SIZE ++# define LOG_PAGE_SIZE 12 ++#endif ++ ++#if PAGE_SIZE != (1 << LOG_PAGE_SIZE) ++# error Invalid LOG_PAGE_SIZE ++#endif ++ ++/* Byte per page for large_memcpy inner loop. */ ++#if VEC_SIZE == 64 ++# define LARGE_LOAD_SIZE (VEC_SIZE * 2) ++#else ++# define LARGE_LOAD_SIZE (VEC_SIZE * 4) ++#endif ++ ++/* Amount to shift rdx by to compare for memcpy_large_4x. */ ++#ifndef LOG_4X_MEMCPY_THRESH ++# define LOG_4X_MEMCPY_THRESH 4 ++#endif ++ + /* Avoid short distance rep movsb only with non-SSE vector. */ + #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB + # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) +@@ -103,6 +140,28 @@ + # error Unsupported PREFETCH_SIZE! + #endif + ++#if LARGE_LOAD_SIZE == (VEC_SIZE * 2) ++# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ ++ VMOVU (offset)base, vec0; \ ++ VMOVU ((offset) + VEC_SIZE)base, vec1; ++# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ ++ VMOVNT vec0, (offset)base; \ ++ VMOVNT vec1, ((offset) + VEC_SIZE)base; ++#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) ++# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ ++ VMOVU (offset)base, vec0; \ ++ VMOVU ((offset) + VEC_SIZE)base, vec1; \ ++ VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ ++ VMOVU ((offset) + VEC_SIZE * 3)base, vec3; ++# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ ++ VMOVNT vec0, (offset)base; \ ++ VMOVNT vec1, ((offset) + VEC_SIZE)base; \ ++ VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ ++ VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; ++#else ++# error Invalid LARGE_LOAD_SIZE ++#endif ++ + #ifndef SECTION + # error SECTION is not defined! + #endif +@@ -390,6 +449,15 @@ L(last_4x_vec): + VZEROUPPER_RETURN + + L(more_8x_vec): ++ /* Check if non-temporal move candidate. */ ++#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) ++ /* Check non-temporal store threshold. */ ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP ++ ja L(large_memcpy_2x) ++#endif ++ /* Entry if rdx is greater than non-temporal threshold but there ++ is overlap. */ ++L(more_8x_vec_check): + cmpq %rsi, %rdi + ja L(more_8x_vec_backward) + /* Source == destination is less common. */ +@@ -416,24 +484,21 @@ L(more_8x_vec): + subq %r8, %rdi + /* Adjust length. */ + addq %r8, %rdx +-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +- /* Check non-temporal store threshold. */ +- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP +- ja L(large_forward) +-#endif ++ ++ .p2align 4 + L(loop_4x_vec_forward): + /* Copy 4 * VEC a time forward. 
*/ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) +- addq $(VEC_SIZE * 4), %rsi +- subq $(VEC_SIZE * 4), %rdx ++ subq $-(VEC_SIZE * 4), %rsi ++ addq $-(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) +- addq $(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rdi + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_forward) + /* Store the last 4 * VEC. */ +@@ -467,24 +532,21 @@ L(more_8x_vec_backward): + subq %r8, %r9 + /* Adjust length. */ + subq %r8, %rdx +-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +- /* Check non-temporal store threshold. */ +- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP +- ja L(large_backward) +-#endif ++ ++ .p2align 4 + L(loop_4x_vec_backward): + /* Copy 4 * VEC a time backward. */ + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) +- subq $(VEC_SIZE * 4), %rcx +- subq $(VEC_SIZE * 4), %rdx ++ addq $-(VEC_SIZE * 4), %rcx ++ addq $-(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%r9) + VMOVA %VEC(1), -VEC_SIZE(%r9) + VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) +- subq $(VEC_SIZE * 4), %r9 ++ addq $-(VEC_SIZE * 4), %r9 + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_backward) + /* Store the first 4 * VEC. */ +@@ -497,72 +559,202 @@ L(loop_4x_vec_backward): + VZEROUPPER_RETURN + + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +-L(large_forward): ++ .p2align 4 ++L(large_memcpy_2x): ++ /* Compute absolute value of difference between source and ++ destination. */ ++ movq %rdi, %r9 ++ subq %rsi, %r9 ++ movq %r9, %r8 ++ leaq -1(%r9), %rcx ++ sarq $63, %r8 ++ xorq %r8, %r9 ++ subq %r8, %r9 + /* Don't use non-temporal store if there is overlap between +- destination and source since destination may be in cache +- when source is loaded. */ +- leaq (%rdi, %rdx), %r10 +- cmpq %r10, %rsi +- jb L(loop_4x_vec_forward) +-L(loop_large_forward): ++ destination and source since destination may be in cache when ++ source is loaded. */ ++ cmpq %r9, %rdx ++ ja L(more_8x_vec_check) ++ ++ /* Cache align destination. First store the first 64 bytes then ++ adjust alignments. */ ++ VMOVU (%rsi), %VEC(8) ++#if VEC_SIZE < 64 ++ VMOVU VEC_SIZE(%rsi), %VEC(9) ++#if VEC_SIZE < 32 ++ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) ++ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) ++#endif ++#endif ++ VMOVU %VEC(8), (%rdi) ++#if VEC_SIZE < 64 ++ VMOVU %VEC(9), VEC_SIZE(%rdi) ++#if VEC_SIZE < 32 ++ VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) ++ VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) ++#endif ++#endif ++ /* Adjust source, destination, and size. */ ++ movq %rdi, %r8 ++ andq $63, %r8 ++ /* Get the negative of offset for alignment. */ ++ subq $64, %r8 ++ /* Adjust source. */ ++ subq %r8, %rsi ++ /* Adjust destination which should be aligned now. */ ++ subq %r8, %rdi ++ /* Adjust length. */ ++ addq %r8, %rdx ++ ++ /* Test if source and destination addresses will alias. If they do ++ the larger pipeline in large_memcpy_4x alleviated the ++ performance drop. */ ++ testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx ++ jz L(large_memcpy_4x) ++ ++ movq %rdx, %r10 ++ shrq $LOG_4X_MEMCPY_THRESH, %r10 ++ cmp __x86_shared_non_temporal_threshold(%rip), %r10 ++ jae L(large_memcpy_4x) ++ ++ /* edx will store remainder size for copying tail. */ ++ andl $(PAGE_SIZE * 2 - 1), %edx ++ /* r10 stores outer loop counter. 
*/ ++ shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 ++ /* Copy 4x VEC at a time from 2 pages. */ ++ .p2align 4 ++L(loop_large_memcpy_2x_outer): ++ /* ecx stores inner loop counter. */ ++ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx ++L(loop_large_memcpy_2x_inner): ++ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) ++ /* Load vectors from rsi. */ ++ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) ++ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) ++ subq $-LARGE_LOAD_SIZE, %rsi ++ /* Non-temporal store vectors to rdi. */ ++ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) ++ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) ++ subq $-LARGE_LOAD_SIZE, %rdi ++ decl %ecx ++ jnz L(loop_large_memcpy_2x_inner) ++ addq $PAGE_SIZE, %rdi ++ addq $PAGE_SIZE, %rsi ++ decq %r10 ++ jne L(loop_large_memcpy_2x_outer) ++ sfence ++ ++ /* Check if only last 4 loads are needed. */ ++ cmpl $(VEC_SIZE * 4), %edx ++ jbe L(large_memcpy_2x_end) ++ ++ /* Handle the last 2 * PAGE_SIZE bytes. */ ++L(loop_large_memcpy_2x_tail): + /* Copy 4 * VEC a time forward with non-temporal stores. */ +- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) +- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) ++ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) +- addq $PREFETCHED_LOAD_SIZE, %rsi +- subq $PREFETCHED_LOAD_SIZE, %rdx +- VMOVNT %VEC(0), (%rdi) +- VMOVNT %VEC(1), VEC_SIZE(%rdi) +- VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) +- VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) +- addq $PREFETCHED_LOAD_SIZE, %rdi +- cmpq $PREFETCHED_LOAD_SIZE, %rdx +- ja L(loop_large_forward) +- sfence ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $-(VEC_SIZE * 4), %edx ++ VMOVA %VEC(0), (%rdi) ++ VMOVA %VEC(1), VEC_SIZE(%rdi) ++ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) ++ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) ++ subq $-(VEC_SIZE * 4), %rdi ++ cmpl $(VEC_SIZE * 4), %edx ++ ja L(loop_large_memcpy_2x_tail) ++ ++L(large_memcpy_2x_end): + /* Store the last 4 * VEC. */ +- VMOVU %VEC(5), (%rcx) +- VMOVU %VEC(6), -VEC_SIZE(%rcx) +- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) +- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) +- /* Store the first VEC. */ +- VMOVU %VEC(4), (%r11) ++ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) ++ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) ++ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) ++ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) ++ ++ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) ++ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) ++ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) ++ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) + VZEROUPPER_RETURN + +-L(large_backward): +- /* Don't use non-temporal store if there is overlap between +- destination and source since destination may be in cache +- when source is loaded. */ +- leaq (%rcx, %rdx), %r10 +- cmpq %r10, %r9 +- jb L(loop_4x_vec_backward) +-L(loop_large_backward): +- /* Copy 4 * VEC a time backward with non-temporal stores. 
*/ +- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) +- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) +- VMOVU (%rcx), %VEC(0) +- VMOVU -VEC_SIZE(%rcx), %VEC(1) +- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) +- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) +- subq $PREFETCHED_LOAD_SIZE, %rcx +- subq $PREFETCHED_LOAD_SIZE, %rdx +- VMOVNT %VEC(0), (%r9) +- VMOVNT %VEC(1), -VEC_SIZE(%r9) +- VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) +- VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) +- subq $PREFETCHED_LOAD_SIZE, %r9 +- cmpq $PREFETCHED_LOAD_SIZE, %rdx +- ja L(loop_large_backward) ++ .p2align 4 ++L(large_memcpy_4x): ++ movq %rdx, %r10 ++ /* edx will store remainder size for copying tail. */ ++ andl $(PAGE_SIZE * 4 - 1), %edx ++ /* r10 stores outer loop counter. */ ++ shrq $(LOG_PAGE_SIZE + 2), %r10 ++ /* Copy 4x VEC at a time from 4 pages. */ ++ .p2align 4 ++L(loop_large_memcpy_4x_outer): ++ /* ecx stores inner loop counter. */ ++ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx ++L(loop_large_memcpy_4x_inner): ++ /* Only one prefetch set per page as doing 4 pages give more time ++ for prefetcher to keep up. */ ++ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) ++ /* Load vectors from rsi. */ ++ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) ++ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) ++ LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) ++ LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) ++ subq $-LARGE_LOAD_SIZE, %rsi ++ /* Non-temporal store vectors to rdi. */ ++ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) ++ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) ++ STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) ++ STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) ++ subq $-LARGE_LOAD_SIZE, %rdi ++ decl %ecx ++ jnz L(loop_large_memcpy_4x_inner) ++ addq $(PAGE_SIZE * 3), %rdi ++ addq $(PAGE_SIZE * 3), %rsi ++ decq %r10 ++ jne L(loop_large_memcpy_4x_outer) + sfence +- /* Store the first 4 * VEC. */ +- VMOVU %VEC(4), (%rdi) +- VMOVU %VEC(5), VEC_SIZE(%rdi) +- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) +- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) +- /* Store the last VEC. */ +- VMOVU %VEC(8), (%r11) ++ /* Check if only last 4 loads are needed. */ ++ cmpl $(VEC_SIZE * 4), %edx ++ jbe L(large_memcpy_4x_end) ++ ++ /* Handle the last 4 * PAGE_SIZE bytes. */ ++L(loop_large_memcpy_4x_tail): ++ /* Copy 4 * VEC a time forward with non-temporal stores. */ ++ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) ++ VMOVU (%rsi), %VEC(0) ++ VMOVU VEC_SIZE(%rsi), %VEC(1) ++ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) ++ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $-(VEC_SIZE * 4), %edx ++ VMOVA %VEC(0), (%rdi) ++ VMOVA %VEC(1), VEC_SIZE(%rdi) ++ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) ++ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) ++ subq $-(VEC_SIZE * 4), %rdi ++ cmpl $(VEC_SIZE * 4), %edx ++ ja L(loop_large_memcpy_4x_tail) ++ ++L(large_memcpy_4x_end): ++ /* Store the last 4 * VEC. 
*/ ++ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) ++ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) ++ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) ++ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) ++ ++ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) ++ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) ++ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) ++ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) + VZEROUPPER_RETURN + #endif + END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-4.patch b/glibc-RHEL-15696-4.patch new file mode 100644 index 0000000..531c171 --- /dev/null +++ b/glibc-RHEL-15696-4.patch @@ -0,0 +1,151 @@ +From ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:29:58 -0800 +Subject: [PATCH] x86-64 memrchr: Properly handle the length parameter [BZ# + 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes memrchr for x32. Tested on x86-64 and x32. On x86-64, +libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/memrchr.S: Use RDX_LP for length. + * sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr. + * sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file. +--- + sysdeps/x86_64/memrchr.S | 4 +- + sysdeps/x86_64/multiarch/memrchr-avx2.S | 4 +- + sysdeps/x86_64/x32/Makefile | 3 +- + sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++ + 4 files changed, 63 insertions(+), 5 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S +index b8e3fa1d..dc82f8f7 100644 +--- a/sysdeps/x86_64/memrchr.S ++++ b/sysdeps/x86_64/memrchr.S +@@ -24,13 +24,13 @@ + ENTRY (__memrchr) + movd %esi, %xmm1 + +- sub $16, %rdx ++ sub $16, %RDX_LP + jbe L(length_less16) + + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + +- add %rdx, %rdi ++ add %RDX_LP, %RDI_LP + pshufd $0, %xmm1, %xmm1 + + movdqu (%rdi), %xmm0 +diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S +index b41a58bc..ce488dd9 100644 +--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S +@@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2) + vmovd %esi, %xmm0 + vpbroadcastb %xmm0, %ymm0 + +- subq $VEC_SIZE, %rdx ++ sub $VEC_SIZE, %RDX_LP + jbe L(last_vec_or_less) + +- addq %rdx, %rdi ++ add %RDX_LP, %RDI_LP + + /* Check the last VEC_SIZE bytes. 
*/ + vpcmpeqb (%rdi), %ymm0, %ymm1 +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 2fe1e5ac..e99dbd7c 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround + endif + + ifeq ($(subdir),string) +-tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy ++tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ ++ tst-size_t-memrchr + endif + + ifeq ($(subdir),wcsmbs) +diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c +new file mode 100644 +index 00000000..c83699c0 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c +@@ -0,0 +1,57 @@ ++/* Test memrchr with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define TEST_NAME "memrchr" ++#include "test-size_t.h" ++ ++IMPL (memchr, 1) ++ ++typedef void * (*proto_t) (const void *, int, size_t); ++ ++static void * ++__attribute__ ((noinline, noclone)) ++do_memrchr (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, (uintptr_t) b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t src = { { page_size }, buf2 }; ++ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ c.fn = impl->fn; ++ void * res = do_memrchr (src, c); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %p != NULL", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +-- +GitLab + diff --git a/glibc-RHEL-15696-40.patch b/glibc-RHEL-15696-40.patch new file mode 100644 index 0000000..7b7c07b --- /dev/null +++ b/glibc-RHEL-15696-40.patch @@ -0,0 +1,92 @@ +From 83c5b368226c34a2f0a5287df40fc290b2b34359 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 19 Apr 2021 10:45:07 -0700 +Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S +Content-type: text/plain; charset=UTF-8 + +Since strchr-avx2.S updated by + +commit 1f745ecc2109890886b161d4791e1406fdfc29b8 +Author: noah +Date: Wed Feb 3 00:38:59 2021 -0500 + + x86-64: Refactor and improve performance of strchr-avx2.S + +uses sarx: + +c4 e2 72 f7 c0 sarx %ecx,%eax,%eax + +for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and +ifunc-avx2.h. 
+--- + sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- + 2 files changed, 11 insertions(+), 5 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h +index e0f30e61..ef72b73f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h ++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h +@@ -30,11 +30,11 @@ IFUNC_SELECTOR (void) + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 695cdba6..85b8863a 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/strchr.c. */ + IFUNC_IMPL (i, name, strchr, + IFUNC_IMPL_ADD (array, i, strchr, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __strchr_avx2) + IFUNC_IMPL_ADD (array, i, strchr, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __strchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchr, +@@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/strchrnul.c. */ + IFUNC_IMPL (i, name, strchrnul, + IFUNC_IMPL_ADD (array, i, strchrnul, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __strchrnul_avx2) + IFUNC_IMPL_ADD (array, i, strchrnul, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __strchrnul_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchrnul, +@@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/wcschr.c. */ + IFUNC_IMPL (i, name, wcschr, + IFUNC_IMPL_ADD (array, i, wcschr, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __wcschr_avx2) + IFUNC_IMPL_ADD (array, i, wcschr, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __wcschr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcschr, +-- +GitLab + diff --git a/glibc-RHEL-15696-41.patch b/glibc-RHEL-15696-41.patch new file mode 100644 index 0000000..aa8fc69 --- /dev/null +++ b/glibc-RHEL-15696-41.patch @@ -0,0 +1,265 @@ +From f53790272ce7bdc5ecd14b45f65d0464d2a61a3a Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 19 Apr 2021 17:48:10 -0400 +Subject: [PATCH] x86: Optimize less_vec evex and avx512 + memset-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit adds optimized cased for less_vec memset case that +uses the avx512vl/avx512bw mask store avoiding the excessive +branches. test-memset and test-wmemset are passing. 
+ +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 40 ++++++++++----- + sysdeps/x86_64/multiarch/ifunc-memset.h | 6 ++- + .../multiarch/memset-avx512-unaligned-erms.S | 2 +- + .../multiarch/memset-evex-unaligned-erms.S | 2 +- + .../multiarch/memset-vec-unaligned-erms.S | 51 +++++++++++++++---- + 5 files changed, 74 insertions(+), 27 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 85b8863a..d59d65f8 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -204,19 +204,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __memset_chk_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX512F), +@@ -247,19 +251,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __memset_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + CPU_FEATURE_USABLE (AVX512F), +@@ -739,10 +747,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (RTM)), + __wmemset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, wmemset, +- CPU_FEATURE_USABLE (AVX512VL), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __wmemset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, +- CPU_FEATURE_USABLE (AVX512VL), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __wmemset_avx512_unaligned)) + + #ifdef SHARED +@@ -946,10 +958,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX2), + __wmemset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, +- CPU_FEATURE_USABLE (AVX512VL), ++ 
(CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __wmemset_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __wmemset_chk_avx512_unaligned)) + #endif + +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 19795938..100e3707 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h +@@ -54,7 +54,8 @@ IFUNC_SELECTOR (void) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (avx512_unaligned_erms); +@@ -68,7 +69,8 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (evex_unaligned_erms); +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 22e7b187..8ad842fc 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -19,6 +19,6 @@ + # define SECTION(p) p##.evex512 + # define MEMSET_SYMBOL(p,s) p##_avx512_##s + # define WMEMSET_SYMBOL(p,s) p##_avx512_##s +- ++# define USE_LESS_VEC_MASK_STORE 1 + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index ae0a4d6e..640f0929 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -19,6 +19,6 @@ + # define SECTION(p) p##.evex + # define MEMSET_SYMBOL(p,s) p##_evex_##s + # define WMEMSET_SYMBOL(p,s) p##_evex_##s +- ++# define USE_LESS_VEC_MASK_STORE 1 + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index bae5cba4..f877ac9d 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -63,6 +63,8 @@ + # endif + #endif + ++#define PAGE_SIZE 4096 ++ + #ifndef SECTION + # error SECTION is not defined! + #endif +@@ -213,11 +215,38 @@ L(loop): + cmpq %rcx, %rdx + jne L(loop) + VZEROUPPER_SHORT_RETURN ++ ++ .p2align 4 + L(less_vec): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! + # endif ++# ifdef USE_LESS_VEC_MASK_STORE ++ /* Clear high bits from edi. Only keeping bits relevant to page ++ cross check. Note that we are using rax which is set in ++ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. ++ */ ++ andl $(PAGE_SIZE - 1), %edi ++ /* Check if VEC_SIZE store cross page. Mask stores suffer serious ++ performance degradation when it has to fault supress. 
*/ ++ cmpl $(PAGE_SIZE - VEC_SIZE), %edi ++ ja L(cross_page) ++# if VEC_SIZE > 32 ++ movq $-1, %rcx ++ bzhiq %rdx, %rcx, %rcx ++ kmovq %rcx, %k1 ++# else ++ movl $-1, %ecx ++ bzhil %edx, %ecx, %ecx ++ kmovd %ecx, %k1 ++# endif ++ vmovdqu8 %VEC(0), (%rax) {%k1} ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(cross_page): ++# endif + # if VEC_SIZE > 32 + cmpb $32, %dl + jae L(between_32_63) +@@ -234,36 +263,36 @@ L(less_vec): + cmpb $1, %dl + ja L(between_2_3) + jb 1f +- movb %cl, (%rdi) ++ movb %cl, (%rax) + 1: + VZEROUPPER_RETURN + # if VEC_SIZE > 32 + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- VMOVU %YMM0, -32(%rdi,%rdx) +- VMOVU %YMM0, (%rdi) ++ VMOVU %YMM0, -32(%rax,%rdx) ++ VMOVU %YMM0, (%rax) + VZEROUPPER_RETURN + # endif + # if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ + L(between_16_31): +- VMOVU %XMM0, -16(%rdi,%rdx) +- VMOVU %XMM0, (%rdi) ++ VMOVU %XMM0, -16(%rax,%rdx) ++ VMOVU %XMM0, (%rax) + VZEROUPPER_RETURN + # endif + /* From 8 to 15. No branch when size == 8. */ + L(between_8_15): +- movq %rcx, -8(%rdi,%rdx) +- movq %rcx, (%rdi) ++ movq %rcx, -8(%rax,%rdx) ++ movq %rcx, (%rax) + VZEROUPPER_RETURN + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %ecx, -4(%rdi,%rdx) +- movl %ecx, (%rdi) ++ movl %ecx, -4(%rax,%rdx) ++ movl %ecx, (%rax) + VZEROUPPER_RETURN + L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ +- movw %cx, -2(%rdi,%rdx) +- movw %cx, (%rdi) ++ movw %cx, -2(%rax,%rdx) ++ movw %cx, (%rax) + VZEROUPPER_RETURN + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-42.patch b/glibc-RHEL-15696-42.patch new file mode 100644 index 0000000..e2ca245 --- /dev/null +++ b/glibc-RHEL-15696-42.patch @@ -0,0 +1,396 @@ +From ccabe7971f508709d034b63b8672f6f751a3d356 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 23 Apr 2021 15:56:24 -0400 +Subject: [PATCH] x86: Optimize strchr-avx2.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes strchr-avx2.S. The optimizations are all +small things such as save an ALU in the alignment process, saving a +few instructions in the loop return, saving some bytes in the main +loop, and increasing the ILP in the return cases. test-strchr, +test-strchrnul, test-wcschr, and test-wcschrnul are all passing. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strchr-avx2.S | 290 +++++++++++++++---------- + 1 file changed, 170 insertions(+), 120 deletions(-) + +Conflics: + sysdeps/x86_64/multiarch/strchr-avx2.S + (rearranged to account for branch changes) + +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index 919d256c..5884726b 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -49,133 +49,144 @@ + + .section SECTION(.text),"ax",@progbits + ENTRY (STRCHR) +- movl %edi, %ecx +-# ifndef USE_AS_STRCHRNUL +- xorl %edx, %edx +-# endif +- + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 ++ movl %edi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ VPBROADCAST %xmm0, %ymm0 + vpxor %xmm9, %xmm9, %xmm9 +- VPBROADCAST %xmm0, %ymm0 + + /* Check if we cross page boundary with one vector load. */ +- andl $(PAGE_SIZE - 1), %ecx +- cmpl $(PAGE_SIZE - VEC_SIZE), %ecx +- ja L(cross_page_boundary) ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null byte. 
*/ + vmovdqu (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax +- jz L(more_vecs) ++ jz L(aligned_more) + tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) ++# endif + addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ /* .p2align 5 helps keep performance more consistent if ENTRY() ++ alignment % 32 was either 16 or 0. As well this makes the ++ alignment % 32 of the loop_4x_vec fixed which makes tuning it ++ easier. */ ++ .p2align 5 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 3 + 1), %rdi + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) + # endif +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN +- +- .p2align 4 +-L(more_vecs): +- /* Align data for aligned loads in the loop. */ +- andq $-VEC_SIZE, %rdi +-L(aligned_more): +- +- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time +- since data is only aligned to VEC_SIZE. */ +- vmovdqa VEC_SIZE(%rdi), %ymm8 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- +- vmovdqa VEC_SIZE(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jz L(prep_loop_4x) ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +- tzcntl %eax, %eax +- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++L(zero): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + # endif +- VZEROUPPER +- ret ++ + + .p2align 4 +-L(first_vec_x0): ++L(first_vec_x1): + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +- addq %rdi, %rax ++ incq %rdi + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) + # endif ++ addq %rdi, %rax + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x1): ++L(first_vec_x2): + tzcntl %eax, %eax +- leaq VEC_SIZE(%rdi, %rax), %rax ++ addq $(VEC_SIZE + 1), %rdi + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) + # endif ++ addq %rdi, %rax + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x2): ++L(first_vec_x3): + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++ addq $(VEC_SIZE * 2 + 1), %rdi + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) + # endif ++ addq %rdi, %rax + VZEROUPPER_RETURN + +-L(prep_loop_4x): +- /* Align data to 4 * VEC_SIZE. */ +- andq $-(VEC_SIZE * 4), %rdi ++ .p2align 4 ++L(aligned_more): ++ /* Align data to VEC_SIZE - 1. 
This is the same number of ++ instructions as using andq -VEC_SIZE but saves 4 bytes of code ++ on x4 check. */ ++ orq $(VEC_SIZE - 1), %rdi ++L(cross_page_continue): ++ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ vmovdqa 1(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) + ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) ++ /* Align data to VEC_SIZE * 4 - 1. */ ++ addq $(VEC_SIZE * 4 + 1), %rdi ++ andq $-(VEC_SIZE * 4), %rdi + .p2align 4 + L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ +- vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5 +- vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6 +- vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7 +- vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8 ++ vmovdqa (%rdi), %ymm5 ++ vmovdqa (VEC_SIZE)(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 ++ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 + + /* Leaves only CHARS matching esi as 0. */ + vpxor %ymm5, %ymm0, %ymm1 +@@ -191,63 +202,102 @@ L(loop_4x_vec): + VPMINU %ymm1, %ymm2, %ymm5 + VPMINU %ymm3, %ymm4, %ymm6 + +- VPMINU %ymm5, %ymm6, %ymm5 ++ VPMINU %ymm5, %ymm6, %ymm6 + +- VPCMPEQ %ymm5, %ymm9, %ymm5 +- vpmovmskb %ymm5, %eax ++ VPCMPEQ %ymm6, %ymm9, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ subq $-(VEC_SIZE * 4), %rdi ++ testl %ecx, %ecx ++ jz L(loop_4x_vec) + +- addq $(VEC_SIZE * 4), %rdi +- testl %eax, %eax +- jz L(loop_4x_vec) + +- VPCMPEQ %ymm1, %ymm9, %ymm1 ++ VPCMPEQ %ymm1, %ymm9, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax +- jnz L(first_vec_x0) ++ jnz L(last_vec_x0) ++ + +- VPCMPEQ %ymm2, %ymm9, %ymm2 ++ VPCMPEQ %ymm5, %ymm9, %ymm2 + vpmovmskb %ymm2, %eax + testl %eax, %eax +- jnz L(first_vec_x1) ++ jnz L(last_vec_x1) ++ ++ VPCMPEQ %ymm3, %ymm9, %ymm3 ++ vpmovmskb %ymm3, %eax ++ /* rcx has combined result from all 4 VEC. It will only be used ++ if the first 3 other VEC all did not contain a match. */ ++ salq $32, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++ subq $(VEC_SIZE * 2), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero_end) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ ++ .p2align 4 ++L(last_vec_x0): ++ tzcntl %eax, %eax ++ addq $-(VEC_SIZE * 4), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. 
*/ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero_end) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +- VPCMPEQ %ymm3, %ymm9, %ymm3 +- VPCMPEQ %ymm4, %ymm9, %ymm4 +- vpmovmskb %ymm3, %ecx +- vpmovmskb %ymm4, %eax +- salq $32, %rax +- orq %rcx, %rax +- tzcntq %rax, %rax +- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++L(zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + # endif +- VZEROUPPER +- ret ++ ++ .p2align 4 ++L(last_vec_x1): ++ tzcntl %eax, %eax ++ subq $(VEC_SIZE * 3), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero_end) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ + + /* Cold case for crossing page with first load. */ + .p2align 4 + L(cross_page_boundary): +- andq $-VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- +- vmovdqa (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 ++ movq %rdi, %rdx ++ /* Align rdi to VEC_SIZE - 1. */ ++ orq $(VEC_SIZE - 1), %rdi ++ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax +- /* Remove the leading bits. */ +- sarxl %ecx, %eax, %eax ++ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT ++ so no need to manually mod edx. */ ++ sarxl %edx, %eax, %eax + testl %eax, %eax +- jz L(aligned_more) ++ jz L(cross_page_continue) + tzcntl %eax, %eax +- addq %rcx, %rdi +- addq %rdi, %rax + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ xorl %ecx, %ecx ++ /* Found CHAR or the null byte. */ ++ cmp (%rdx, %rax), %CHAR_REG ++ leaq (%rdx, %rax), %rax ++ cmovne %rcx, %rax ++# else ++ addq %rdx, %rax + # endif +- VZEROUPPER_RETURN ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + END (STRCHR) + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-43.patch b/glibc-RHEL-15696-43.patch new file mode 100644 index 0000000..9f76b11 --- /dev/null +++ b/glibc-RHEL-15696-43.patch @@ -0,0 +1,532 @@ +From 7f3e7c262cab4e2401e4331a6ef29c428de02044 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 23 Apr 2021 15:56:25 -0400 +Subject: [PATCH] x86: Optimize strchr-evex.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes strchr-evex.S. The optimizations are +mostly small things such as save an ALU in the alignment process, +saving a few instructions in the loop return. The one significant +change is saving 2 instructions in the 4x loop. test-strchr, +test-strchrnul, test-wcschr, and test-wcschrnul are all passing. 
+ +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++----------- + 1 file changed, 218 insertions(+), 174 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S +index ddc86a70..7f9d4ee4 100644 +--- a/sysdeps/x86_64/multiarch/strchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strchr-evex.S +@@ -32,13 +32,15 @@ + # define VPCMP vpcmpd + # define VPMINU vpminud + # define CHAR_REG esi +-# define SHIFT_REG r8d ++# define SHIFT_REG ecx ++# define CHAR_SIZE 4 + # else + # define VPBROADCAST vpbroadcastb + # define VPCMP vpcmpb + # define VPMINU vpminub + # define CHAR_REG sil +-# define SHIFT_REG ecx ++# define SHIFT_REG edx ++# define CHAR_SIZE 1 + # endif + + # define XMMZERO xmm16 +@@ -56,23 +58,20 @@ + + # define VEC_SIZE 32 + # define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section .text.evex,"ax",@progbits + ENTRY (STRCHR) +- movl %edi, %ecx +-# ifndef USE_AS_STRCHRNUL +- xorl %edx, %edx +-# endif +- + /* Broadcast CHAR to YMM0. */ +- VPBROADCAST %esi, %YMM0 +- ++ VPBROADCAST %esi, %YMM0 ++ movl %edi, %eax ++ andl $(PAGE_SIZE - 1), %eax + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + +- /* Check if we cross page boundary with one vector load. */ +- andl $(PAGE_SIZE - 1), %ecx +- cmpl $(PAGE_SIZE - VEC_SIZE), %ecx +- ja L(cross_page_boundary) ++ /* Check if we cross page boundary with one vector load. ++ Otherwise it is safe to use an unaligned load. */ ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null bytes. */ +@@ -83,251 +82,296 @@ ENTRY (STRCHR) + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 +- ktestd %k0, %k0 +- jz L(more_vecs) + kmovd %k0, %eax ++ testl %eax, %eax ++ jz L(aligned_more) + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ + # ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (%rdi, %rax, 4), %rax ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. ++ */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax + # else + addq %rdi, %rax + # endif + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (%rax), %CHAR_REG ++ jne L(zero) + # endif + ret + +- .p2align 4 +-L(more_vecs): +- /* Align data for aligned loads in the loop. */ +- andq $-VEC_SIZE, %rdi +-L(aligned_more): +- +- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time +- since data is only aligned to VEC_SIZE. */ +- VMOVA VEC_SIZE(%rdi), %YMM1 +- addq $VEC_SIZE, %rdi +- +- /* Leaves only CHARS matching esi as 0. */ +- vpxorq %YMM1, %YMM0, %YMM2 +- VPMINU %YMM2, %YMM1, %YMM2 +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- +- VMOVA VEC_SIZE(%rdi), %YMM1 +- /* Leaves only CHARS matching esi as 0. */ +- vpxorq %YMM1, %YMM0, %YMM2 +- VPMINU %YMM2, %YMM1, %YMM2 +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- +- VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 +- /* Leaves only CHARS matching esi as 0. */ +- vpxorq %YMM1, %YMM0, %YMM2 +- VPMINU %YMM2, %YMM1, %YMM2 +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. 
*/ +- VPCMP $0, %YMMZERO, %YMM2, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- +- VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 +- /* Leaves only CHARS matching esi as 0. */ +- vpxorq %YMM1, %YMM0, %YMM2 +- VPMINU %YMM2, %YMM1, %YMM2 +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 +- ktestd %k0, %k0 +- jz L(prep_loop_4x) +- +- kmovd %k0, %eax ++ /* .p2align 5 helps keep performance more consistent if ENTRY() ++ alignment % 32 was either 16 or 0. As well this makes the ++ alignment % 32 of the loop_4x_vec fixed which makes tuning it ++ easier. */ ++ .p2align 5 ++L(first_vec_x3): + tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax +-# else +- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax ++ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero) + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax +-# endif ++L(zero): ++ xorl %eax, %eax + ret ++# endif + + .p2align 4 +-L(first_vec_x0): ++L(first_vec_x4): ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if first match was CHAR (k0) or null (k1). */ ++ kmovd %k0, %eax + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (%rdi, %rax, 4), %rax ++ kmovd %k1, %ecx ++ /* bzhil will not be 0 if first match was null. */ ++ bzhil %eax, %ecx, %ecx ++ jne L(zero) + # else +- addq %rdi, %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Combine CHAR and null matches. */ ++ kord %k0, %k1, %k0 ++ kmovd %k0, %eax ++ tzcntl %eax, %eax + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 + L(first_vec_x1): + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq VEC_SIZE(%rdi, %rax), %rax +-# endif + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero) ++ + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 + L(first_vec_x2): ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if first match was CHAR (k0) or null (k1). */ ++ kmovd %k0, %eax + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax ++ kmovd %k1, %ecx ++ /* bzhil will not be 0 if first match was null. */ ++ bzhil %eax, %ecx, %ecx ++ jne L(zero) + # else +- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Combine CHAR and null matches. */ ++ kord %k0, %k1, %k0 ++ kmovd %k0, %eax ++ tzcntl %eax, %eax + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. 
*/ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +-L(prep_loop_4x): +- /* Align data to 4 * VEC_SIZE. */ ++ .p2align 4 ++L(aligned_more): ++ /* Align data to VEC_SIZE. */ ++ andq $-VEC_SIZE, %rdi ++L(cross_page_continue): ++ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since ++ data is only aligned to VEC_SIZE. Use two alternating methods ++ for checking VEC to balance latency and port contention. */ ++ ++ /* This method has higher latency but has better port ++ distribution. */ ++ VMOVA (VEC_SIZE)(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ /* This method has higher latency but has better port ++ distribution. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 ++ /* Each bit in K0 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMM1, %YMM0, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMM1, %YMMZERO, %k1 ++ kortestd %k0, %k1 ++ jnz L(first_vec_x2) ++ ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 ++ /* Each bit in K0 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMM1, %YMM0, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMM1, %YMMZERO, %k1 ++ kortestd %k0, %k1 ++ jnz L(first_vec_x4) ++ ++ /* Align data to VEC_SIZE * 4 for the loop. */ ++ addq $VEC_SIZE, %rdi + andq $-(VEC_SIZE * 4), %rdi + + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ ++ /* Check 4x VEC at a time. No penalty to imm32 offset with evex ++ encoding. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 + VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 + VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 + VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 + +- /* Leaves only CHARS matching esi as 0. */ ++ /* For YMM1 and YMM3 use xor to set the CHARs matching esi to ++ zero. */ + vpxorq %YMM1, %YMM0, %YMM5 +- vpxorq %YMM2, %YMM0, %YMM6 ++ /* For YMM2 and YMM4 cmp not equals to CHAR and store result in ++ k register. Its possible to save either 1 or 2 instructions ++ using cmp no equals method for either YMM1 or YMM1 and YMM3 ++ respectively but bottleneck on p5 makes it not worth it. */ ++ VPCMP $4, %YMM0, %YMM2, %k2 + vpxorq %YMM3, %YMM0, %YMM7 +- vpxorq %YMM4, %YMM0, %YMM8 +- +- VPMINU %YMM5, %YMM1, %YMM5 +- VPMINU %YMM6, %YMM2, %YMM6 +- VPMINU %YMM7, %YMM3, %YMM7 +- VPMINU %YMM8, %YMM4, %YMM8 +- +- VPMINU %YMM5, %YMM6, %YMM1 +- VPMINU %YMM7, %YMM8, %YMM2 +- +- VPMINU %YMM1, %YMM2, %YMM1 +- +- /* Each bit in K0 represents a CHAR or a null byte. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- +- addq $(VEC_SIZE * 4), %rdi +- +- ktestd %k0, %k0 ++ VPCMP $4, %YMM0, %YMM4, %k4 ++ ++ /* Use min to select all zeros from either xor or end of string). ++ */ ++ VPMINU %YMM1, %YMM5, %YMM1 ++ VPMINU %YMM3, %YMM7, %YMM3 ++ ++ /* Use min + zeromask to select for zeros. Since k2 and k4 will ++ have 0 as positions that matched with CHAR which will set ++ zero in the corresponding destination bytes in YMM2 / YMM4. 
++ */ ++ VPMINU %YMM1, %YMM2, %YMM2{%k2}{z} ++ VPMINU %YMM3, %YMM4, %YMM4 ++ VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} ++ ++ VPCMP $0, %YMMZERO, %YMM4, %k1 ++ kmovd %k1, %ecx ++ subq $-(VEC_SIZE * 4), %rdi ++ testl %ecx, %ecx + jz L(loop_4x_vec) + +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM5, %k0 ++ VPCMP $0, %YMMZERO, %YMM1, %k0 + kmovd %k0, %eax + testl %eax, %eax +- jnz L(first_vec_x0) ++ jnz L(last_vec_x1) + +- /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ +- VPCMP $0, %YMMZERO, %YMM6, %k1 +- kmovd %k1, %eax ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax + testl %eax, %eax +- jnz L(first_vec_x1) +- +- /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ +- VPCMP $0, %YMMZERO, %YMM7, %k2 +- /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ +- VPCMP $0, %YMMZERO, %YMM8, %k3 ++ jnz L(last_vec_x2) + ++ VPCMP $0, %YMMZERO, %YMM3, %k0 ++ kmovd %k0, %eax ++ /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ + # ifdef USE_AS_WCSCHR +- /* NB: Each bit in K2/K3 represents 4-byte element. */ +- kshiftlw $8, %k3, %k1 ++ sall $8, %ecx ++ orl %ecx, %eax ++ tzcntl %eax, %eax + # else +- kshiftlq $32, %k3, %k1 ++ salq $32, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax + # endif ++# ifndef USE_AS_STRCHRNUL ++ /* Check if match was CHAR or null. */ ++ cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero_end) ++# endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korq %k1, %k2, %k1 +- kmovq %k1, %rax ++# ifndef USE_AS_STRCHRNUL ++L(zero_end): ++ xorl %eax, %eax ++ ret ++# endif + +- tzcntq %rax, %rax +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax +-# else +- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++ .p2align 4 ++L(last_vec_x1): ++ tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Check if match was null. */ ++ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero_end) + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x2): ++ tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Check if match was null. */ ++ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero_end) + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + + /* Cold case for crossing page with first load. */ + .p2align 4 + L(cross_page_boundary): ++ movq %rdi, %rdx ++ /* Align rdi. */ + andq $-VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- + VMOVA (%rdi), %YMM1 +- + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax +- testl %eax, %eax +- ++ /* Remove the leading bits. */ + # ifdef USE_AS_WCSCHR ++ movl %edx, %SHIFT_REG + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++ andl $(CHAR_PER_VEC - 1), %SHIFT_REG + # endif +- +- /* Remove the leading bits. */ + sarxl %SHIFT_REG, %eax, %eax ++ /* If eax is zero continue. 
*/ + testl %eax, %eax +- +- jz L(aligned_more) ++ jz L(cross_page_continue) + tzcntl %eax, %eax +- addq %rcx, %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if match was CHAR or null. */ ++ cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero_end) ++# endif + # ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (%rdi, %rax, 4), %rax ++ /* NB: Multiply wchar_t count by 4 to get the number of ++ bytes. */ ++ leaq (%rdx, %rax, CHAR_SIZE), %rax + # else +- addq %rdi, %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ addq %rdx, %rax + # endif + ret + +-- +GitLab + diff --git a/glibc-RHEL-15696-44.patch b/glibc-RHEL-15696-44.patch new file mode 100644 index 0000000..52fec88 --- /dev/null +++ b/glibc-RHEL-15696-44.patch @@ -0,0 +1,536 @@ +From 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 4 May 2021 19:02:40 -0400 +Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM +Content-type: text/plain; charset=UTF-8 + +No bug. + +This commit adds a new implementation for EVEX memchr that is not safe +for RTM because it uses vzeroupper. The benefit is that by using +ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is +faster than the RTM safe version which cannot use vpcmpeq because +there is no EVEX encoding for the instruction. All parts of the +implementation aside from the 4x loop are the same for the two +versions and the optimization is only relevant for large sizes. + +Tigerlake: +size , algn , Pos , Cur T , New T , Win , Dif +512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16 +512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21 +2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2 +2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06 +2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4 +2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <-- + +Icelake: +size , algn , Pos , Cur T , New T , Win , Dif +512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3 +512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36 +2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1 +2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15 +2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54 +2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <-- + +test-memchr, test-wmemchr, and test-rawmemchr are all passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/Makefile | 7 +- + sysdeps/x86_64/multiarch/ifunc-evex.h | 55 ++++++ + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 ++ + sysdeps/x86_64/multiarch/memchr-evex-rtm.S | 8 + + sysdeps/x86_64/multiarch/memchr-evex.S | 161 ++++++++++++++---- + sysdeps/x86_64/multiarch/memchr.c | 2 +- + sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 3 + + sysdeps/x86_64/multiarch/rawmemchr.c | 2 +- + sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S | 3 + + sysdeps/x86_64/multiarch/wmemchr.c | 2 +- + 10 files changed, 217 insertions(+), 41 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h + create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 65fde4eb..26be4095 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + strncmp-evex \ + strncpy-evex \ + strnlen-evex \ +- strrchr-evex ++ strrchr-evex \ ++ memchr-evex-rtm \ ++ rawmemchr-evex-rtm + CFLAGS-varshift.c += -msse4 + CFLAGS-strcspn-c.c += -msse4 + CFLAGS-strpbrk-c.c += -msse4 +@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcsnlen-evex \ + wcsrchr-evex \ + wmemchr-evex \ +- wmemcmp-evex-movbe ++ wmemcmp-evex-movbe \ ++ wmemchr-evex-rtm + endif + + ifeq ($(subdir),debug) +diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h +new file mode 100644 +index 00000000..fc391edb +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/ifunc-evex.h +@@ -0,0 +1,55 @@ ++/* Common definition for ifunc selection optimized with EVEX. ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2017-2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden; ++ ++ ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++ const struct cpu_features* cpu_features = __get_cpu_features (); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) ++ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (evex_rtm); ++ ++ return OPTIMIZE (evex); ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } ++ ++ return OPTIMIZE (sse2); ++} +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index d59d65f8..ac097e8d 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __memchr_evex) ++ IFUNC_IMPL_ADD (array, i, memchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __memchr_evex_rtm) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/memcmp.c. */ +@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __rawmemchr_evex) ++ IFUNC_IMPL_ADD (array, i, rawmemchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __rawmemchr_evex_rtm) + IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/strlen.c. */ +@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wmemchr_evex) ++ IFUNC_IMPL_ADD (array, i, wmemchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wmemchr_evex_rtm) + IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ +diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S +new file mode 100644 +index 00000000..19871882 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S +@@ -0,0 +1,8 @@ ++#ifndef MEMCHR ++# define MEMCHR __memchr_evex_rtm ++#endif ++ ++#define USE_IN_RTM 1 ++#define SECTION(p) p##.evex.rtm ++ ++#include "memchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S +index f3fdad4f..4d0ed6d1 100644 +--- a/sysdeps/x86_64/multiarch/memchr-evex.S ++++ b/sysdeps/x86_64/multiarch/memchr-evex.S +@@ -38,10 +38,32 @@ + # define CHAR_SIZE 1 + # endif + ++ /* In the 4x loop the RTM and non-RTM versions have data pointer ++ off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater. ++ This is represented by BASE_OFFSET. 
As well because the RTM ++ version uses vpcmp which stores a bit per element compared where ++ the non-RTM version uses vpcmpeq which stores a bit per byte ++ compared RET_SCALE of CHAR_SIZE is only relevant for the RTM ++ version. */ ++# ifdef USE_IN_RTM ++# define VZEROUPPER ++# define BASE_OFFSET (VEC_SIZE * 4) ++# define RET_SCALE CHAR_SIZE ++# else ++# define VZEROUPPER vzeroupper ++# define BASE_OFFSET 0 ++# define RET_SCALE 1 ++# endif ++ ++ /* In the return from 4x loop memchr and rawmemchr versions have ++ data pointers off by VEC_SIZE * 4 with memchr version being ++ VEC_SIZE * 4 greater. */ + # ifdef USE_AS_RAWMEMCHR ++# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4)) + # define RAW_PTR_REG rcx + # define ALGN_PTR_REG rdi + # else ++# define RET_OFFSET BASE_OFFSET + # define RAW_PTR_REG rdi + # define ALGN_PTR_REG rcx + # endif +@@ -57,11 +79,15 @@ + # define YMM5 ymm21 + # define YMM6 ymm22 + ++# ifndef SECTION ++# define SECTION(p) p##.evex ++# endif ++ + # define VEC_SIZE 32 + # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + # define PAGE_SIZE 4096 + +- .section .text.evex,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (MEMCHR) + # ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ +@@ -237,14 +263,15 @@ L(cross_page_continue): + /* Check if at last CHAR_PER_VEC * 4 length. */ + subq $(CHAR_PER_VEC * 4), %rdx + jbe L(last_4x_vec_or_less_cmpeq) +- addq $VEC_SIZE, %rdi ++ /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */ ++ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi + + /* Align data to VEC_SIZE * 4 for the loop and readjust length. + */ + # ifdef USE_AS_WMEMCHR + movl %edi, %ecx + andq $-(4 * VEC_SIZE), %rdi +- andl $(VEC_SIZE * 4 - 1), %ecx ++ subl %edi, %ecx + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx + addq %rcx, %rdx +@@ -254,15 +281,28 @@ L(cross_page_continue): + subq %rdi, %rdx + # endif + # else +- addq $VEC_SIZE, %rdi ++ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi + andq $-(4 * VEC_SIZE), %rdi + # endif +- ++# ifdef USE_IN_RTM + vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++# else ++ /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not ++ encodable with EVEX registers (ymm16-ymm31). */ ++ vmovdqa64 %YMMMATCH, %ymm0 ++# endif + + /* Compare 4 * VEC at a time forward. */ + .p2align 4 + L(loop_4x_vec): ++ /* Two versions of the loop. One that does not require ++ vzeroupper by not using ymm0-ymm15 and another does that require ++ vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15 ++ is used at all is because there is no EVEX encoding vpcmpeq and ++ with vpcmpeq this loop can be performed more efficiently. The ++ non-vzeroupper version is safe for RTM while the vzeroupper ++ version should be prefered if RTM are not supported. */ ++# ifdef USE_IN_RTM + /* It would be possible to save some instructions using 4x VPCMP + but bottleneck on port 5 makes it not woth it. */ + VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 +@@ -273,12 +313,55 @@ L(loop_4x_vec): + /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ + VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} + VPCMP $0, %YMM3, %YMMZERO, %k2 ++# else ++ /* Since vptern can only take 3x vectors fastest to do 1 vec ++ seperately with EVEX vpcmp. */ ++# ifdef USE_AS_WMEMCHR ++ /* vptern can only accept masks for epi32/epi64 so can only save ++ instruction using not equals mask on vptern with wmemchr. 
*/ ++ VPCMP $4, (%rdi), %YMMMATCH, %k1 ++# else ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++# endif ++ /* Compare 3x with vpcmpeq and or them all together with vptern. ++ */ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 ++# ifdef USE_AS_WMEMCHR ++ /* This takes the not of or between ymm2, ymm3, ymm4 as well as ++ combines result from VEC0 with zero mask. */ ++ vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z} ++ vpmovmskb %ymm4, %ecx ++# else ++ /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. */ ++ vpternlogd $254, %ymm2, %ymm3, %ymm4 ++ vpmovmskb %ymm4, %ecx ++ kmovd %k1, %eax ++# endif ++# endif ++ + # ifdef USE_AS_RAWMEMCHR + subq $-(VEC_SIZE * 4), %rdi ++# endif ++# ifdef USE_IN_RTM + kortestd %k2, %k3 ++# else ++# ifdef USE_AS_WMEMCHR ++ /* ecx contains not of matches. All 1s means no matches. incl will ++ overflow and set zeroflag if that is the case. */ ++ incl %ecx ++# else ++ /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding ++ to ecx is not an issue because if eax is non-zero it will be ++ used for returning the match. If it is zero the add does ++ nothing. */ ++ addq %rax, %rcx ++# endif ++# endif ++# ifdef USE_AS_RAWMEMCHR + jz L(loop_4x_vec) + # else +- kortestd %k2, %k3 + jnz L(loop_4x_vec_end) + + subq $-(VEC_SIZE * 4), %rdi +@@ -288,10 +371,11 @@ L(loop_4x_vec): + + /* Fall through into less than 4 remaining vectors of length case. + */ +- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 ++ VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0 ++ addq $(BASE_OFFSET - VEC_SIZE), %rdi + kmovd %k0, %eax +- addq $(VEC_SIZE * 3), %rdi +- .p2align 4 ++ VZEROUPPER ++ + L(last_4x_vec_or_less): + /* Check if first VEC contained match. */ + testl %eax, %eax +@@ -338,73 +422,78 @@ L(loop_4x_vec_end): + /* rawmemchr will fall through into this if match was found in + loop. */ + ++# if defined USE_IN_RTM || defined USE_AS_WMEMCHR + /* k1 has not of matches with VEC1. */ + kmovd %k1, %eax +-# ifdef USE_AS_WMEMCHR ++# ifdef USE_AS_WMEMCHR + subl $((1 << CHAR_PER_VEC) - 1), %eax +-# else ++# else + incl %eax ++# endif ++# else ++ /* eax already has matches for VEC1. */ ++ testl %eax, %eax + # endif + jnz L(last_vec_x1_return) + ++# ifdef USE_IN_RTM + VPCMP $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %eax ++# else ++ vpmovmskb %ymm2, %eax ++# endif + testl %eax, %eax + jnz L(last_vec_x2_return) + ++# ifdef USE_IN_RTM + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x3_return) + + kmovd %k3, %eax + tzcntl %eax, %eax +-# ifdef USE_AS_RAWMEMCHR +- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax + # else +- leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax ++ vpmovmskb %ymm3, %eax ++ /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). */ ++ salq $VEC_SIZE, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax ++ VZEROUPPER + # endif + ret + + .p2align 4 + L(last_vec_x1_return): + tzcntl %eax, %eax +-# ifdef USE_AS_RAWMEMCHR +-# ifdef USE_AS_WMEMCHR ++# if defined USE_AS_WMEMCHR || RET_OFFSET != 0 + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq (%rdi, %rax, CHAR_SIZE), %rax +-# else +- addq %rdi, %rax +-# endif ++ leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax + # else +- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. 
*/ +- leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax ++ addq %rdi, %rax + # endif ++ VZEROUPPER + ret + + .p2align 4 + L(last_vec_x2_return): + tzcntl %eax, %eax +-# ifdef USE_AS_RAWMEMCHR +- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax +-# else +- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax +-# endif ++ /* NB: Multiply bytes by RET_SCALE to get the wchar_t count ++ if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and ++ USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */ ++ leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax ++ VZEROUPPER + ret + ++# ifdef USE_IN_RTM + .p2align 4 + L(last_vec_x3_return): + tzcntl %eax, %eax +-# ifdef USE_AS_RAWMEMCHR +- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax +-# else + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax +-# endif ++ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax + ret +- ++# endif + + # ifndef USE_AS_RAWMEMCHR + L(last_4x_vec_or_less_cmpeq): +diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c +index 016f5784..f28aea77 100644 +--- a/sysdeps/x86_64/multiarch/memchr.c ++++ b/sysdeps/x86_64/multiarch/memchr.c +@@ -24,7 +24,7 @@ + # undef memchr + + # define SYMBOL_NAME memchr +-# include "ifunc-avx2.h" ++# include "ifunc-evex.h" + + libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ()); + strong_alias (memchr, __memchr) +diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S +new file mode 100644 +index 00000000..deda1ca3 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S +@@ -0,0 +1,3 @@ ++#define MEMCHR __rawmemchr_evex_rtm ++#define USE_AS_RAWMEMCHR 1 ++#include "memchr-evex-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c +index 8a0bc313..1f764f35 100644 +--- a/sysdeps/x86_64/multiarch/rawmemchr.c ++++ b/sysdeps/x86_64/multiarch/rawmemchr.c +@@ -26,7 +26,7 @@ + # undef __rawmemchr + + # define SYMBOL_NAME rawmemchr +-# include "ifunc-avx2.h" ++# include "ifunc-evex.h" + + libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr, + IFUNC_SELECTOR ()); +diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S +new file mode 100644 +index 00000000..a346cd35 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S +@@ -0,0 +1,3 @@ ++#define MEMCHR __wmemchr_evex_rtm ++#define USE_AS_WMEMCHR 1 ++#include "memchr-evex-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c +index 6d833702..f9c91915 100644 +--- a/sysdeps/x86_64/multiarch/wmemchr.c ++++ b/sysdeps/x86_64/multiarch/wmemchr.c +@@ -26,7 +26,7 @@ + # undef __wmemchr + + # define SYMBOL_NAME wmemchr +-# include "ifunc-avx2.h" ++# include "ifunc-evex.h" + + libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ()); + weak_alias (__wmemchr, wmemchr) +-- +GitLab + diff --git a/glibc-RHEL-15696-45.patch b/glibc-RHEL-15696-45.patch new file mode 100644 index 0000000..380217e --- /dev/null +++ b/glibc-RHEL-15696-45.patch @@ -0,0 +1,873 @@ +From 16d12015c57701b08d7bbed6ec536641bcafb428 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 17 May 2021 13:56:52 -0400 +Subject: [PATCH] x86: Optimize memcmp-avx2-movbe.S 
+Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes memcmp-avx2.S. The optimizations include +adding a new vec compare path for small sizes, reorganizing the entry +control flow, and removing some unnecissary ALU instructions from the +main loop. test-memcmp and test-wmemcmp are both passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 + + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 1 + + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 676 +++++++++++-------- + 3 files changed, 402 insertions(+), 281 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index ac097e8d..8be0d78a 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, memcmp, + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (RTM)), + __memcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_evex_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), +@@ -732,16 +735,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, wmemcmp, + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (RTM)), + __wmemcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_evex_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 8043c635..690dffe8 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -33,6 +33,7 @@ IFUNC_SELECTOR (void) + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURE_USABLE_P (cpu_features, MOVBE) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index 9d5c9c72..16fc673e 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -19,17 +19,23 @@ + #if IS_IN (libc) + + /* memcmp/wmemcmp is implemented as: +- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap +- to avoid branches. +- 2. Use overlapping compare to avoid branch. +- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 +- bytes for wmemcmp. +- 4. If size is 8 * VEC_SIZE or less, unroll the loop. +- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory ++ 1. 
Use ymm vector compares when possible. The only case where ++ vector compares is not possible for when size < VEC_SIZE ++ and loading from either s1 or s2 would cause a page cross. ++ 2. For size from 2 to 7 bytes on page cross, load as big endian ++ with movbe and bswap to avoid branches. ++ 3. Use xmm vector compare when size >= 4 bytes for memcmp or ++ size >= 8 bytes for wmemcmp. ++ 4. Optimistically compare up to first 4 * VEC_SIZE one at a ++ to check for early mismatches. Only do this if its guranteed the ++ work is not wasted. ++ 5. If size is 8 * VEC_SIZE or less, unroll the loop. ++ 6. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. +- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. +- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. +- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ ++ 7. Use 2 vector compares when size is 2 * VEC_SIZE or less. ++ 8. Use 4 vector compares when size is 4 * VEC_SIZE or less. ++ 9. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ ++ + + # include + +@@ -38,8 +44,10 @@ + # endif + + # ifdef USE_AS_WMEMCMP ++# define CHAR_SIZE 4 + # define VPCMPEQ vpcmpeqd + # else ++# define CHAR_SIZE 1 + # define VPCMPEQ vpcmpeqb + # endif + +@@ -52,7 +60,7 @@ + # endif + + # define VEC_SIZE 32 +-# define VEC_MASK ((1 << VEC_SIZE) - 1) ++# define PAGE_SIZE 4096 + + /* Warning! + wmemcmp has to use SIGNED comparison for elements. +@@ -71,136 +79,359 @@ ENTRY (MEMCMP) + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* NB: eax must be destination register if going to ++ L(return_vec_[0,2]). For L(return_vec_3 destination register ++ must be ecx. */ ++ incl %eax ++ jnz L(return_vec_0) + + cmpq $(VEC_SIZE * 2), %rdx +- jbe L(last_vec) +- +- VPCMPEQ %ymm0, %ymm0, %ymm0 +- /* More than 2 * VEC. */ +- cmpq $(VEC_SIZE * 8), %rdx +- ja L(more_8x_vec) +- cmpq $(VEC_SIZE * 4), %rdx +- jb L(last_4x_vec) +- +- /* From 4 * VEC to 8 * VEC, inclusively. */ +- vmovdqu (%rsi), %ymm1 +- VPCMPEQ (%rdi), %ymm1, %ymm1 ++ jbe L(last_1x_vec) + ++ /* Check second VEC no matter what. */ + vmovdqu VEC_SIZE(%rsi), %ymm2 +- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ /* If all 4 VEC where equal eax will be all 1s so incl will ++ overflow and set zero flag. */ ++ incl %eax ++ jnz L(return_vec_1) + +- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ /* Less than 4 * VEC. */ ++ cmpq $(VEC_SIZE * 4), %rdx ++ jbe L(last_2x_vec) + ++ /* Check third and fourth VEC no matter what. */ ++ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ vpmovmskb %ymm3, %eax ++ incl %eax ++ jnz L(return_vec_2) + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ vpmovmskb %ymm4, %ecx ++ incl %ecx ++ jnz L(return_vec_3) + +- vpand %ymm1, %ymm2, %ymm5 +- vpand %ymm3, %ymm4, %ymm6 +- vpand %ymm5, %ymm6, %ymm5 ++ /* Go to 4x VEC loop. */ ++ cmpq $(VEC_SIZE * 8), %rdx ++ ja L(more_8x_vec) + +- vptest %ymm0, %ymm5 +- jnc L(4x_vec_end) ++ /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any ++ branches. */ + ++ /* Load first two VEC from s2 before adjusting addresses. 
*/ ++ vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1 ++ vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2 + leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi +- vmovdqu (%rsi), %ymm1 +- VPCMPEQ (%rdi), %ymm1, %ymm1 + +- vmovdqu VEC_SIZE(%rsi), %ymm2 +- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 +- vpand %ymm2, %ymm1, %ymm5 ++ /* Wait to load from s1 until addressed adjust due to ++ unlamination of microfusion with complex address mode. */ ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 +- vpand %ymm3, %ymm5, %ymm5 +- ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 +- vpand %ymm4, %ymm5, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + +- vptest %ymm0, %ymm5 +- jnc L(4x_vec_end) +- xorl %eax, %eax ++ /* Reduce VEC0 - VEC4. */ ++ vpand %ymm1, %ymm2, %ymm5 ++ vpand %ymm3, %ymm4, %ymm6 ++ vpand %ymm5, %ymm6, %ymm7 ++ vpmovmskb %ymm7, %ecx ++ incl %ecx ++ jnz L(return_vec_0_1_2_3) ++ /* NB: eax must be zero to reach here. */ ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(return_vec_0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi, %rax), %ecx ++ /* NB: no partial register stall here because xorl zero idiom ++ above. */ ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 +-L(last_2x_vec): +- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++L(return_vec_1): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl VEC_SIZE(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl VEC_SIZE(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl VEC_SIZE(%rsi, %rax), %ecx ++ movzbl VEC_SIZE(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(return_vec_2): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (VEC_SIZE * 2)(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN ++ ++ /* NB: p2align 5 here to ensure 4x loop is 32 byte aligned. */ ++ .p2align 5 ++L(8x_return_vec_0_1_2_3): ++ /* Returning from L(more_8x_vec) requires restoring rsi. */ ++ addq %rdi, %rsi ++L(return_vec_0_1_2_3): ++ vpmovmskb %ymm1, %eax ++ incl %eax ++ jnz L(return_vec_0) + +-L(last_vec): +- /* Use overlapping loads to avoid branches. 
*/ +- leaq -VEC_SIZE(%rdi, %rdx), %rdi +- leaq -VEC_SIZE(%rsi, %rdx), %rsi +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++ incl %eax ++ jnz L(return_vec_1) ++ ++ vpmovmskb %ymm3, %eax ++ incl %eax ++ jnz L(return_vec_2) ++L(return_vec_3): ++ tzcntl %ecx, %ecx ++# ifdef USE_AS_WMEMCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(more_8x_vec): ++ /* Set end of s1 in rdx. */ ++ leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx ++ /* rsi stores s2 - s1. This allows loop to only update one ++ pointer. */ ++ subq %rdi, %rsi ++ /* Align s1 pointer. */ ++ andq $-VEC_SIZE, %rdi ++ /* Adjust because first 4x vec where check already. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ .p2align 4 ++L(loop_4x_vec): ++ /* rsi has s2 - s1 so get correct address by adding s1 (in rdi). ++ */ ++ vmovdqu (%rsi, %rdi), %ymm1 ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ ++ vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2 ++ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ ++ vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ ++ vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ ++ vpand %ymm1, %ymm2, %ymm5 ++ vpand %ymm3, %ymm4, %ymm6 ++ vpand %ymm5, %ymm6, %ymm7 ++ vpmovmskb %ymm7, %ecx ++ incl %ecx ++ jnz L(8x_return_vec_0_1_2_3) ++ subq $-(VEC_SIZE * 4), %rdi ++ /* Check if s1 pointer at end. */ ++ cmpq %rdx, %rdi ++ jb L(loop_4x_vec) ++ ++ subq %rdx, %rdi ++ /* rdi has 4 * VEC_SIZE - remaining length. */ ++ cmpl $(VEC_SIZE * 3), %edi ++ jae L(8x_last_1x_vec) ++ /* Load regardless of branch. */ ++ vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3 ++ cmpl $(VEC_SIZE * 2), %edi ++ jae L(8x_last_2x_vec) ++ ++ /* Check last 4 VEC. */ ++ vmovdqu (%rsi, %rdx), %ymm1 ++ VPCMPEQ (%rdx), %ymm1, %ymm1 ++ ++ vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2 ++ VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2 ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 ++ ++ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 ++ ++ vpand %ymm1, %ymm2, %ymm5 ++ vpand %ymm3, %ymm4, %ymm6 ++ vpand %ymm5, %ymm6, %ymm7 ++ vpmovmskb %ymm7, %ecx ++ /* Restore s1 pointer to rdi. */ ++ movq %rdx, %rdi ++ incl %ecx ++ jnz L(8x_return_vec_0_1_2_3) ++ /* NB: eax must be zero to reach here. */ ++ VZEROUPPER_RETURN ++ ++ /* Only entry is from L(more_8x_vec). */ ++ .p2align 4 ++L(8x_last_2x_vec): ++ /* Check second to last VEC. rdx store end pointer of s1 and ++ ymm3 has already been loaded with second to last VEC from s2. ++ */ ++ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 ++ vpmovmskb %ymm3, %eax ++ incl %eax ++ jnz L(8x_return_vec_2) ++ /* Check last VEC. */ ++ .p2align 4 ++L(8x_last_1x_vec): ++ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 ++ vpmovmskb %ymm4, %eax ++ incl %eax ++ jnz L(8x_return_vec_3) + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec): +- /* A byte or int32 is different within 16 or 32 bytes. */ +- tzcntl %eax, %ecx ++L(last_2x_vec): ++ /* Check second to last VEC. */ ++ vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1 ++ VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1 ++ vpmovmskb %ymm1, %eax ++ incl %eax ++ jnz L(return_vec_1_end) ++ /* Check last VEC. 
*/ ++L(last_1x_vec): ++ vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1 ++ VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1 ++ vpmovmskb %ymm1, %eax ++ incl %eax ++ jnz L(return_vec_0_end) ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(8x_return_vec_2): ++ subq $VEC_SIZE, %rdx ++L(8x_return_vec_3): ++ tzcntl %eax, %eax ++ addq %rdx, %rax + # ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (%rdi, %rcx), %edx +- cmpl (%rsi, %rcx), %edx +-L(wmemcmp_return): +- setl %al +- negl %eax +- orl $1, %eax ++ movl (VEC_SIZE * 3)(%rax), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else +- movzbl (%rdi, %rcx), %eax +- movzbl (%rsi, %rcx), %edx +- sub %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 3)(%rax), %eax ++ subl %ecx, %eax + # endif + VZEROUPPER_RETURN + +-# ifdef USE_AS_WMEMCMP + .p2align 4 +-L(4): +- xorl %eax, %eax +- movl (%rdi), %edx +- cmpl (%rsi), %edx +- jne L(wmemcmp_return) +- ret ++L(return_vec_1_end): ++ tzcntl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else ++ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN ++ + .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- je L(exit) +- sbbl %eax, %eax +- orl $1, %eax +- ret ++L(return_vec_0_end): ++ tzcntl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -VEC_SIZE(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl -VEC_SIZE(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl -VEC_SIZE(%rsi, %rax), %ecx ++ movzbl -VEC_SIZE(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN + + .p2align 4 +-L(exit): +- ret ++L(less_vec): ++ /* Check if one or less CHAR. This is necessary for size = 0 but ++ is also faster for size = CHAR_SIZE. */ ++ cmpl $CHAR_SIZE, %edx ++ jbe L(one_or_less) ++ ++ /* Check if loading one VEC from either s1 or s2 could cause a ++ page cross. This can have false positives but is by far the ++ fastest method. */ ++ movl %edi, %eax ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(page_cross_less_vec) ++ ++ /* No page cross possible. */ ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ incl %eax ++ /* Result will be zero if s1 and s2 match. Otherwise first set ++ bit will be first mismatch. */ ++ bzhil %edx, %eax, %edx ++ jnz L(return_vec_0) ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + + .p2align 4 +-L(between_2_3): ++L(page_cross_less_vec): ++ /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 ++ bytes. */ ++ cmpl $16, %edx ++ jae L(between_16_31) ++# ifndef USE_AS_WMEMCMP ++ cmpl $8, %edx ++ jae L(between_8_15) ++ cmpl $4, %edx ++ jae L(between_4_7) ++ + /* Load as big endian to avoid branches. 
*/ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx +@@ -208,223 +439,106 @@ L(between_2_3): + shll $8, %ecx + bswap %eax + bswap %ecx +- movb -1(%rdi, %rdx), %al +- movb -1(%rsi, %rdx), %cl ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx + /* Subtraction is okay because the upper 8 bits are zero. */ + subl %ecx, %eax ++ /* No ymm register was touched. */ + ret + + .p2align 4 +-L(1): +- movzbl (%rdi), %eax ++L(one_or_less): ++ jb L(zero) + movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + subl %ecx, %eax +- ret +-# endif +- +- .p2align 4 +-L(zero): +- xorl %eax, %eax ++ /* No ymm register was touched. */ + ret + + .p2align 4 +-L(less_vec): +-# ifdef USE_AS_WMEMCMP +- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ +- cmpb $4, %dl +- je L(4) +- jb L(zero) +-# else +- cmpb $1, %dl +- je L(1) +- jb L(zero) +- cmpb $4, %dl +- jb L(between_2_3) +- cmpb $8, %dl +- jb L(between_4_7) ++L(between_8_15): + # endif +- cmpb $16, %dl +- jae L(between_16_31) +- /* It is between 8 and 15 bytes. */ ++ /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 +- VPCMPEQ %xmm1, %xmm2, %xmm2 ++ VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax +- subl $0xffff, %eax +- jnz L(first_vec) ++ subl $0xffff, %eax ++ jnz L(return_vec_0) + /* Use overlapping loads to avoid branches. */ + leaq -8(%rdi, %rdx), %rdi + leaq -8(%rsi, %rdx), %rsi + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 +- VPCMPEQ %xmm1, %xmm2, %xmm2 ++ VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax +- subl $0xffff, %eax +- jnz L(first_vec) ++ subl $0xffff, %eax ++ jnz L(return_vec_0) ++ /* No ymm register was touched. */ ++ ret ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax + ret + + .p2align 4 + L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + vmovdqu (%rsi), %xmm2 +- VPCMPEQ (%rdi), %xmm2, %xmm2 ++ VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax +- subl $0xffff, %eax +- jnz L(first_vec) ++ subl $0xffff, %eax ++ jnz L(return_vec_0) + + /* Use overlapping loads to avoid branches. */ ++ ++ vmovdqu -16(%rsi, %rdx), %xmm2 + leaq -16(%rdi, %rdx), %rdi + leaq -16(%rsi, %rdx), %rsi +- vmovdqu (%rsi), %xmm2 +- VPCMPEQ (%rdi), %xmm2, %xmm2 ++ VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax +- subl $0xffff, %eax +- jnz L(first_vec) ++ subl $0xffff, %eax ++ jnz L(return_vec_0) ++ /* No ymm register was touched. */ + ret + +- .p2align 4 +-L(more_8x_vec): +- /* More than 8 * VEC. Check the first VEC. */ +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- /* Align the first memory area for aligned loads in the loop. +- Compute how much the first memory area is misaligned. */ +- movq %rdi, %rcx +- andl $(VEC_SIZE - 1), %ecx +- /* Get the negative of offset for alignment. */ +- subq $VEC_SIZE, %rcx +- /* Adjust the second memory area. */ +- subq %rcx, %rsi +- /* Adjust the first memory area which should be aligned now. */ +- subq %rcx, %rdi +- /* Adjust length. */ +- addq %rcx, %rdx +- +-L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. 
*/ +- vmovdqu (%rsi), %ymm1 +- VPCMPEQ (%rdi), %ymm1, %ymm1 +- +- vmovdqu VEC_SIZE(%rsi), %ymm2 +- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 +- vpand %ymm2, %ymm1, %ymm5 +- +- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 +- vpand %ymm3, %ymm5, %ymm5 +- +- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 +- vpand %ymm4, %ymm5, %ymm5 +- +- vptest %ymm0, %ymm5 +- jnc L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- addq $(VEC_SIZE * 4), %rsi +- +- subq $(VEC_SIZE * 4), %rdx +- cmpq $(VEC_SIZE * 4), %rdx +- jae L(loop_4x_vec) +- +- /* Less than 4 * VEC. */ +- cmpq $VEC_SIZE, %rdx +- jbe L(last_vec) +- cmpq $(VEC_SIZE * 2), %rdx +- jbe L(last_2x_vec) +- +-L(last_4x_vec): +- /* From 2 * VEC to 4 * VEC. */ +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- addq $VEC_SIZE, %rdi +- addq $VEC_SIZE, %rsi +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- /* Use overlapping loads to avoid branches. */ +- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi +- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- addq $VEC_SIZE, %rdi +- addq $VEC_SIZE, %rsi +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- VZEROUPPER_RETURN +- +- .p2align 4 +-L(4x_vec_end): +- vpmovmskb %ymm1, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec_x1) +- vpmovmskb %ymm3, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec_x2) +- vpmovmskb %ymm4, %eax +- subl $VEC_MASK, %eax +- tzcntl %eax, %ecx + # ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rcx), %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif +- VZEROUPPER_RETURN +- + .p2align 4 +-L(first_vec_x1): +- tzcntl %eax, %ecx +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rcx), %edx +- cmpl VEC_SIZE(%rsi, %rcx), %edx +- jmp L(wmemcmp_return) ++L(one_or_less): ++ jb L(zero) ++ movl (%rdi), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi), %ecx ++ je L(zero) ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++ /* No ymm register was touched. */ ++ ret + # else +- movzbl VEC_SIZE(%rdi, %rcx), %eax +- movzbl VEC_SIZE(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif +- VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x2): +- tzcntl %eax, %ecx +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rcx), %edx +- cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx +- sub %edx, %eax ++L(between_4_7): ++ /* Load as big endian with overlapping movbe to avoid branches. ++ */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ jz L(zero_4_7) ++ sbbl %eax, %eax ++ orl $1, %eax ++L(zero_4_7): ++ /* No ymm register was touched. 
*/ ++ ret + # endif +- VZEROUPPER_RETURN ++ + END (MEMCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-46.patch b/glibc-RHEL-15696-46.patch new file mode 100644 index 0000000..881fe81 --- /dev/null +++ b/glibc-RHEL-15696-46.patch @@ -0,0 +1,851 @@ +From 4ad473e97acdc5f6d811755b67c09f2128a644ce Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 17 May 2021 13:57:24 -0400 +Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes memcmp-evex.S. The optimizations include +adding a new vec compare path for small sizes, reorganizing the entry +control flow, removing some unnecissary ALU instructions from the main +loop, and most importantly replacing the heavy use of vpcmp + kand +logic with vpxor + vptern. test-memcmp and test-wmemcmp are both +passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++-------- + 1 file changed, 408 insertions(+), 302 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 9c093972..654dc7ac 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -19,17 +19,22 @@ + #if IS_IN (libc) + + /* memcmp/wmemcmp is implemented as: +- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap +- to avoid branches. +- 2. Use overlapping compare to avoid branch. +- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 +- bytes for wmemcmp. +- 4. If size is 8 * VEC_SIZE or less, unroll the loop. +- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory ++ 1. Use ymm vector compares when possible. The only case where ++ vector compares is not possible for when size < CHAR_PER_VEC ++ and loading from either s1 or s2 would cause a page cross. ++ 2. For size from 2 to 7 bytes on page cross, load as big endian ++ with movbe and bswap to avoid branches. ++ 3. Use xmm vector compare when size >= 4 bytes for memcmp or ++ size >= 8 bytes for wmemcmp. ++ 4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a ++ to check for early mismatches. Only do this if its guranteed the ++ work is not wasted. ++ 5. If size is 8 * VEC_SIZE or less, unroll the loop. ++ 6. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. +- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. +- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. +- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ ++ 7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less. ++ 8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less. ++ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. 
*/ + + # include + +@@ -40,11 +45,21 @@ + # define VMOVU vmovdqu64 + + # ifdef USE_AS_WMEMCMP +-# define VPCMPEQ vpcmpeqd ++# define CHAR_SIZE 4 ++# define VPCMP vpcmpd + # else +-# define VPCMPEQ vpcmpeqb ++# define CHAR_SIZE 1 ++# define VPCMP vpcmpub + # endif + ++# define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) ++ ++# define XMM0 xmm16 ++# define XMM1 xmm17 ++# define XMM2 xmm18 ++# define YMM0 ymm16 + # define XMM1 xmm17 + # define XMM2 xmm18 + # define YMM1 ymm17 +@@ -54,15 +69,6 @@ + # define YMM5 ymm21 + # define YMM6 ymm22 + +-# define VEC_SIZE 32 +-# ifdef USE_AS_WMEMCMP +-# define VEC_MASK 0xff +-# define XMM_MASK 0xf +-# else +-# define VEC_MASK 0xffffffff +-# define XMM_MASK 0xffff +-# endif +- + /* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +@@ -70,145 +76,370 @@ + + .section .text.evex,"ax",@progbits + ENTRY (MEMCMP) +-# ifdef USE_AS_WMEMCMP +- shl $2, %RDX_LP +-# elif defined __ILP32__ ++# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx + # endif +- cmp $VEC_SIZE, %RDX_LP ++ cmp $CHAR_PER_VEC, %RDX_LP + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k1 ++ VMOVU (%rsi), %YMM1 ++ /* Use compare not equals to directly check for mismatch. */ ++ VPCMP $4, (%rdi), %YMM1, %k1 + kmovd %k1, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- cmpq $(VEC_SIZE * 2), %rdx +- jbe L(last_vec) +- +- /* More than 2 * VEC. */ +- cmpq $(VEC_SIZE * 8), %rdx +- ja L(more_8x_vec) +- cmpq $(VEC_SIZE * 4), %rdx +- jb L(last_4x_vec) ++ /* NB: eax must be destination register if going to ++ L(return_vec_[0,2]). For L(return_vec_3 destination register ++ must be ecx. */ ++ testl %eax, %eax ++ jnz L(return_vec_0) + +- /* From 4 * VEC to 8 * VEC, inclusively. */ +- VMOVU (%rsi), %YMM1 +- VPCMPEQ (%rdi), %YMM1, %k1 ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(last_1x_vec) + ++ /* Check second VEC no matter what. */ + VMOVU VEC_SIZE(%rsi), %YMM2 +- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 ++ VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_1) ++ ++ /* Less than 4 * VEC. */ ++ cmpq $(CHAR_PER_VEC * 4), %rdx ++ jbe L(last_2x_vec) + ++ /* Check third and fourth VEC no matter what. */ + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 ++ VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 ++ VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1 ++ kmovd %k1, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_3) + +- kandd %k1, %k2, %k5 +- kandd %k3, %k4, %k6 +- kandd %k5, %k6, %k6 ++ /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so ++ compare with zero to get a mask is needed. */ ++ vpxorq %XMM0, %XMM0, %XMM0 + +- kmovd %k6, %eax +- cmpl $VEC_MASK, %eax +- jne L(4x_vec_end) ++ /* Go to 4x VEC loop. */ ++ cmpq $(CHAR_PER_VEC * 8), %rdx ++ ja L(more_8x_vec) + +- leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi +- leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi +- VMOVU (%rsi), %YMM1 +- VPCMPEQ (%rdi), %YMM1, %k1 ++ /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any ++ branches. */ + +- VMOVU VEC_SIZE(%rsi), %YMM2 +- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 +- kandd %k1, %k2, %k5 ++ /* Load first two VEC from s2 before adjusting addresses. 
*/ ++ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1 ++ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2 ++ leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi ++ leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi ++ ++ /* Wait to load from s1 until addressed adjust due to ++ unlamination of microfusion with complex address mode. */ ++ ++ /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it ++ will have some 1s. */ ++ vpxorq (%rdi), %YMM1, %YMM1 ++ vpxorq (VEC_SIZE)(%rdi), %YMM2, %YMM2 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 +- kandd %k3, %k5, %k5 ++ vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 ++ /* Or together YMM1, YMM2, and YMM3 into YMM3. */ ++ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 +- kandd %k4, %k5, %k5 ++ /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while ++ oring with YMM3. Result is stored in YMM4. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 ++ /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */ ++ VPCMP $4, %YMM4, %YMM0, %k1 ++ kmovd %k1, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_0_1_2_3) ++ /* NB: eax must be zero to reach here. */ ++ ret + +- kmovd %k5, %eax +- cmpl $VEC_MASK, %eax +- jne L(4x_vec_end) +- xorl %eax, %eax ++ /* NB: aligning 32 here allows for the rest of the jump targets ++ to be tuned for 32 byte alignment. Most important this ensures ++ the L(more_8x_vec) loop is 32 byte aligned. */ ++ .p2align 5 ++L(less_vec): ++ /* Check if one or less CHAR. This is necessary for size = 0 but ++ is also faster for size = CHAR_SIZE. */ ++ cmpl $1, %edx ++ jbe L(one_or_less) ++ ++ /* Check if loading one VEC from either s1 or s2 could cause a ++ page cross. This can have false positives but is by far the ++ fastest method. */ ++ movl %edi, %eax ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(page_cross_less_vec) ++ ++ /* No page cross possible. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMP $4, (%rdi), %YMM2, %k1 ++ kmovd %k1, %eax ++ /* Create mask in ecx for potentially in bound matches. */ ++ bzhil %edx, %eax, %eax ++ jnz L(return_vec_0) + ret + + .p2align 4 +-L(last_2x_vec): +- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++L(return_vec_0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi, %rax, CHAR_SIZE), %ecx ++ /* NB: no partial register stall here because xorl zero idiom ++ above. */ ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret + +-L(last_vec): +- /* Use overlapping loads to avoid branches. */ +- leaq -VEC_SIZE(%rdi, %rdx), %rdi +- leaq -VEC_SIZE(%rsi, %rdx), %rsi +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++ /* NB: No p2align necessary. Alignment % 16 is naturally 1 ++ which is good enough for a target not in a loop. 
*/ ++L(return_vec_1): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl VEC_SIZE(%rsi, %rax), %ecx ++ movzbl VEC_SIZE(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif + ret + +- .p2align 4 +-L(first_vec): +- /* A byte or int32 is different within 16 or 32 bytes. */ +- tzcntl %eax, %ecx ++ /* NB: No p2align necessary. Alignment % 16 is naturally 2 ++ which is good enough for a target not in a loop. */ ++L(return_vec_2): ++ tzcntl %eax, %eax + # ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (%rdi, %rcx, 4), %edx +- cmpl (%rsi, %rcx, 4), %edx +-L(wmemcmp_return): +- setl %al +- negl %eax +- orl $1, %eax ++ movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else +- movzbl (%rdi, %rcx), %eax +- movzbl (%rsi, %rcx), %edx +- sub %edx, %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax ++ subl %ecx, %eax + # endif + ret + ++ .p2align 4 ++L(8x_return_vec_0_1_2_3): ++ /* Returning from L(more_8x_vec) requires restoring rsi. */ ++ addq %rdi, %rsi ++L(return_vec_0_1_2_3): ++ VPCMP $4, %YMM1, %YMM0, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) ++ ++ VPCMP $4, %YMM2, %YMM0, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_1) ++ ++ VPCMP $4, %YMM3, %YMM0, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_2) ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WMEMCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++ ret ++ + .p2align 4 +-L(4): +- xorl %eax, %eax +- movl (%rdi), %edx +- cmpl (%rsi), %edx +- jne L(wmemcmp_return) ++L(more_8x_vec): ++ /* Set end of s1 in rdx. */ ++ leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx ++ /* rsi stores s2 - s1. This allows loop to only update one ++ pointer. */ ++ subq %rdi, %rsi ++ /* Align s1 pointer. */ ++ andq $-VEC_SIZE, %rdi ++ /* Adjust because first 4x vec where check already. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ .p2align 4 ++L(loop_4x_vec): ++ VMOVU (%rsi, %rdi), %YMM1 ++ vpxorq (%rdi), %YMM1, %YMM1 ++ ++ VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 ++ vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2 ++ ++ VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 ++ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 ++ VPCMP $4, %YMM4, %YMM0, %k1 ++ kmovd %k1, %ecx ++ testl %ecx, %ecx ++ jnz L(8x_return_vec_0_1_2_3) ++ subq $-(VEC_SIZE * 4), %rdi ++ cmpq %rdx, %rdi ++ jb L(loop_4x_vec) ++ ++ subq %rdx, %rdi ++ /* rdi has 4 * VEC_SIZE - remaining length. */ ++ cmpl $(VEC_SIZE * 3), %edi ++ jae L(8x_last_1x_vec) ++ /* Load regardless of branch. 
*/ ++ VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3 ++ cmpl $(VEC_SIZE * 2), %edi ++ jae L(8x_last_2x_vec) ++ ++ VMOVU (%rsi, %rdx), %YMM1 ++ vpxorq (%rdx), %YMM1, %YMM1 ++ ++ VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 ++ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 ++ ++ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 ++ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4 ++ VPCMP $4, %YMM4, %YMM0, %k1 ++ kmovd %k1, %ecx ++ /* Restore s1 pointer to rdi. */ ++ movq %rdx, %rdi ++ testl %ecx, %ecx ++ jnz L(8x_return_vec_0_1_2_3) ++ /* NB: eax must be zero to reach here. */ ++ ret ++ ++ /* Only entry is from L(more_8x_vec). */ ++ .p2align 4 ++L(8x_last_2x_vec): ++ VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(8x_return_vec_2) ++ /* Naturally aligned to 16 bytes. */ ++L(8x_last_1x_vec): ++ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1 ++ VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(8x_return_vec_3) ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ /* Check second to last VEC. */ ++ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1 ++ VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_1_end) ++ ++ /* Check last VEC. */ ++ .p2align 4 ++L(last_1x_vec): ++ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1 ++ VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0_end) + ret ++ ++ .p2align 4 ++L(8x_return_vec_2): ++ subq $VEC_SIZE, %rdx ++L(8x_return_vec_3): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ leaq (%rdx, %rax, CHAR_SIZE), %rax ++ movl (VEC_SIZE * 3)(%rax), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else ++ addq %rdx, %rax ++ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 3)(%rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ + .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- je L(exit) +- sbbl %eax, %eax +- orl $1, %eax ++L(return_vec_0_end): ++ tzcntl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl -VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl -VEC_SIZE(%rsi, %rax), %ecx ++ movzbl -VEC_SIZE(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif + ret + + .p2align 4 +-L(exit): ++L(return_vec_1_end): ++ tzcntl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif + ret + ++ + .p2align 4 ++L(page_cross_less_vec): ++ /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 ++ bytes. */ ++ cmpl $(16 / CHAR_SIZE), %edx ++ jae L(between_16_31) ++# ifndef USE_AS_WMEMCMP ++ cmpl $8, %edx ++ jae L(between_8_15) ++ cmpl $4, %edx ++ jae L(between_4_7) + L(between_2_3): + /* Load as big endian to avoid branches. 
*/ + movzwl (%rdi), %eax +@@ -217,224 +448,99 @@ L(between_2_3): + shll $8, %ecx + bswap %eax + bswap %ecx +- movb -1(%rdi, %rdx), %al +- movb -1(%rsi, %rdx), %cl ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx + /* Subtraction is okay because the upper 8 bits are zero. */ + subl %ecx, %eax + ret +- + .p2align 4 +-L(1): +- movzbl (%rdi), %eax ++L(one_or_less): ++ jb L(zero) + movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + subl %ecx, %eax + ret +-# endif +- +- .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret + + .p2align 4 +-L(less_vec): +-# ifdef USE_AS_WMEMCMP +- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ +- cmpb $4, %dl +- je L(4) +- jb L(zero) +-# else +- cmpb $1, %dl +- je L(1) +- jb L(zero) +- cmpb $4, %dl +- jb L(between_2_3) +- cmpb $8, %dl +- jb L(between_4_7) ++L(between_8_15): + # endif +- cmpb $16, %dl +- jae L(between_16_31) +- /* It is between 8 and 15 bytes. */ ++ /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ + vmovq (%rdi), %XMM1 + vmovq (%rsi), %XMM2 +- VPCMPEQ %XMM1, %XMM2, %k2 +- kmovw %k2, %eax +- subl $XMM_MASK, %eax +- jnz L(first_vec) ++ VPCMP $4, %XMM1, %XMM2, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) + /* Use overlapping loads to avoid branches. */ +- leaq -8(%rdi, %rdx), %rdi +- leaq -8(%rsi, %rdx), %rsi ++ leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi ++ leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi + vmovq (%rdi), %XMM1 + vmovq (%rsi), %XMM2 +- VPCMPEQ %XMM1, %XMM2, %k2 +- kmovw %k2, %eax +- subl $XMM_MASK, %eax +- jnz L(first_vec) ++ VPCMP $4, %XMM1, %XMM2, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) + ret + + .p2align 4 +-L(between_16_31): +- /* From 16 to 31 bytes. No branch when size == 16. */ +- VMOVU (%rsi), %XMM2 +- VPCMPEQ (%rdi), %XMM2, %k2 +- kmovw %k2, %eax +- subl $XMM_MASK, %eax +- jnz L(first_vec) +- +- /* Use overlapping loads to avoid branches. */ +- leaq -16(%rdi, %rdx), %rdi +- leaq -16(%rsi, %rdx), %rsi +- VMOVU (%rsi), %XMM2 +- VPCMPEQ (%rdi), %XMM2, %k2 +- kmovw %k2, %eax +- subl $XMM_MASK, %eax +- jnz L(first_vec) ++L(zero): ++ xorl %eax, %eax + ret + + .p2align 4 +-L(more_8x_vec): +- /* More than 8 * VEC. Check the first VEC. */ +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- /* Align the first memory area for aligned loads in the loop. +- Compute how much the first memory area is misaligned. */ +- movq %rdi, %rcx +- andl $(VEC_SIZE - 1), %ecx +- /* Get the negative of offset for alignment. */ +- subq $VEC_SIZE, %rcx +- /* Adjust the second memory area. */ +- subq %rcx, %rsi +- /* Adjust the first memory area which should be aligned now. */ +- subq %rcx, %rdi +- /* Adjust length. */ +- addq %rcx, %rdx +- +-L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- VMOVU (%rsi), %YMM1 +- VPCMPEQ (%rdi), %YMM1, %k1 +- +- VMOVU VEC_SIZE(%rsi), %YMM2 +- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 +- kandd %k2, %k1, %k5 +- +- VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 +- kandd %k3, %k5, %k5 +- +- VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 +- kandd %k4, %k5, %k5 +- +- kmovd %k5, %eax +- cmpl $VEC_MASK, %eax +- jne L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- addq $(VEC_SIZE * 4), %rsi +- +- subq $(VEC_SIZE * 4), %rdx +- cmpq $(VEC_SIZE * 4), %rdx +- jae L(loop_4x_vec) +- +- /* Less than 4 * VEC. 
*/ +- cmpq $VEC_SIZE, %rdx +- jbe L(last_vec) +- cmpq $(VEC_SIZE * 2), %rdx +- jbe L(last_2x_vec) +- +-L(last_4x_vec): +- /* From 2 * VEC to 4 * VEC. */ +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- addq $VEC_SIZE, %rdi +- addq $VEC_SIZE, %rsi +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++L(between_16_31): ++ /* From 16 to 31 bytes. No branch when size == 16. */ ++ VMOVU (%rsi), %XMM2 ++ VPCMP $4, (%rdi), %XMM2, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) + + /* Use overlapping loads to avoid branches. */ +- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi +- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) + +- addq $VEC_SIZE, %rdi +- addq $VEC_SIZE, %rsi +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- ret +- +- .p2align 4 +-L(4x_vec_end): ++ VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2 ++ leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi ++ leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi ++ VPCMP $4, (%rdi), %XMM2, %k1 + kmovd %k1, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec_x1) +- kmovd %k3, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec_x2) +- kmovd %k4, %eax +- subl $VEC_MASK, %eax +- tzcntl %eax, %ecx +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif ++ testl %eax, %eax ++ jnz L(return_vec_0) + ret + +- .p2align 4 +-L(first_vec_x1): +- tzcntl %eax, %ecx + # ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rcx, 4), %edx +- cmpl VEC_SIZE(%rsi, %rcx, 4), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl VEC_SIZE(%rdi, %rcx), %eax +- movzbl VEC_SIZE(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif ++ .p2align 4 ++L(one_or_less): ++ jb L(zero) ++ movl (%rdi), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi), %ecx ++ je L(zero) ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + ret ++# else + + .p2align 4 +-L(first_vec_x2): +- tzcntl %eax, %ecx +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx +- cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif ++L(between_4_7): ++ /* Load as big endian with overlapping movbe to avoid branches. ++ */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ jz L(zero_4_7) ++ sbbl %eax, %eax ++ orl $1, %eax ++L(zero_4_7): + ret ++# endif ++ + END (MEMCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-47.patch b/glibc-RHEL-15696-47.patch new file mode 100644 index 0000000..70c3171 --- /dev/null +++ b/glibc-RHEL-15696-47.patch @@ -0,0 +1,104 @@ +From 6abf27980a947f9b6e514d6b33b83059d39566ae Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 20 May 2021 13:13:51 -0400 +Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No bug. 
This commit makes a few small improvements to +memset-vec-unaligned-erms.S. The changes are 1) only aligning to 64 +instead of 128. Either alignment will perform equally well in a loop +and 128 just increases the odds of having to do an extra iteration +which can be significant overhead for small values. 2) Align some +targets and the loop. 3) Remove an ALU from the alignment process. 4) +Reorder the last 4x VEC so that they are stored after the loop. 5) +Move the condition for leq 8x VEC to before the alignment +process. test-memset and test-wmemset are both passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + .../multiarch/memset-vec-unaligned-erms.S | 50 +++++++++++-------- + 1 file changed, 28 insertions(+), 22 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index f877ac9d..909c33f6 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + VMOVU %VEC(0), (%rdi) + VZEROUPPER_RETURN + ++ .p2align 4 + L(stosb_more_2x_vec): + cmp __x86_rep_stosb_threshold(%rip), %RDX_LP + ja L(stosb) ++#else ++ .p2align 4 + #endif + L(more_2x_vec): +- cmpq $(VEC_SIZE * 4), %rdx +- ja L(loop_start) ++ /* Stores to first 2x VEC before cmp as any path forward will ++ require it. */ + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(0), VEC_SIZE(%rdi) +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) ++ cmpq $(VEC_SIZE * 4), %rdx ++ ja L(loop_start) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) ++ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + L(return): + #if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +@@ -192,28 +197,29 @@ L(return): + #endif + + L(loop_start): +- leaq (VEC_SIZE * 4)(%rdi), %rcx +- VMOVU %VEC(0), (%rdi) +- andq $-(VEC_SIZE * 4), %rcx +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +- VMOVU %VEC(0), VEC_SIZE(%rdi) +- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) +- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi) +- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx) +- addq %rdi, %rdx +- andq $-(VEC_SIZE * 4), %rdx +- cmpq %rdx, %rcx +- je L(return) ++ cmpq $(VEC_SIZE * 8), %rdx ++ jbe L(loop_end) ++ andq $-(VEC_SIZE * 2), %rdi ++ subq $-(VEC_SIZE * 4), %rdi ++ leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx ++ .p2align 4 + L(loop): +- VMOVA %VEC(0), (%rcx) +- VMOVA %VEC(0), VEC_SIZE(%rcx) +- VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx) +- VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx) +- addq $(VEC_SIZE * 4), %rcx +- cmpq %rcx, %rdx +- jne L(loop) ++ VMOVA %VEC(0), (%rdi) ++ VMOVA %VEC(0), VEC_SIZE(%rdi) ++ VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi) ++ VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi) ++ subq $-(VEC_SIZE * 4), %rdi ++ cmpq %rcx, %rdi ++ jb L(loop) ++L(loop_end): ++ /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN. ++ rdx as length is also unchanged. 
*/ ++ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx) ++ VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx) ++ VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx) ++ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) + VZEROUPPER_SHORT_RETURN + + .p2align 4 +-- +GitLab + diff --git a/glibc-RHEL-15696-48.patch b/glibc-RHEL-15696-48.patch new file mode 100644 index 0000000..645536e --- /dev/null +++ b/glibc-RHEL-15696-48.patch @@ -0,0 +1,84 @@ +From 1b992204f68af851e905c16016756fd4421e1934 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sun, 23 May 2021 19:43:24 -0400 +Subject: [PATCH] x86: Improve memmove-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +This patch changes the condition for copy 4x VEC so that if length is +exactly equal to 4 * VEC_SIZE it will use the 4x VEC case instead of +8x VEC case. + +Results For Skylake memcpy-avx2-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 0 , 9.137 , 6.873 , New , 75.22 +128 , 7 , 0 , 12.933 , 7.732 , New , 59.79 +128 , 0 , 7 , 11.852 , 6.76 , New , 57.04 +128 , 7 , 7 , 12.587 , 6.808 , New , 54.09 + +Results For Icelake memcpy-evex-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 0 , 9.963 , 5.416 , New , 54.36 +128 , 7 , 0 , 16.467 , 8.061 , New , 48.95 +128 , 0 , 7 , 14.388 , 7.644 , New , 53.13 +128 , 7 , 7 , 14.546 , 7.642 , New , 52.54 + +Results For Tigerlake memcpy-evex-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 0 , 8.979 , 4.95 , New , 55.13 +128 , 7 , 0 , 14.245 , 7.122 , New , 50.0 +128 , 0 , 7 , 12.668 , 6.675 , New , 52.69 +128 , 7 , 7 , 13.042 , 6.802 , New , 52.15 + +Results For Skylake memmove-avx2-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 32 , 6.181 , 5.691 , New , 92.07 +128 , 32 , 0 , 6.165 , 5.752 , New , 93.3 +128 , 0 , 7 , 13.923 , 9.37 , New , 67.3 +128 , 7 , 0 , 12.049 , 10.182 , New , 84.5 + +Results For Icelake memmove-evex-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 32 , 5.479 , 4.889 , New , 89.23 +128 , 32 , 0 , 5.127 , 4.911 , New , 95.79 +128 , 0 , 7 , 18.885 , 13.547 , New , 71.73 +128 , 7 , 0 , 15.565 , 14.436 , New , 92.75 + +Results For Tigerlake memmove-evex-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 32 , 5.275 , 4.815 , New , 91.28 +128 , 32 , 0 , 5.376 , 4.565 , New , 84.91 +128 , 0 , 7 , 19.426 , 14.273 , New , 73.47 +128 , 7 , 0 , 15.924 , 14.951 , New , 93.89 + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 3e2dd6bc..572cef04 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -417,8 +417,8 @@ L(more_2x_vec): + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx +- jb L(last_4x_vec) +- /* Copy from 4 * VEC to 8 * VEC, inclusively. */ ++ jbe L(last_4x_vec) ++ /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) +@@ -437,7 +437,7 @@ L(more_2x_vec): + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) + VZEROUPPER_RETURN + L(last_4x_vec): +- /* Copy from 2 * VEC to 4 * VEC. */ ++ /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. 
*/ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) +-- +GitLab + diff --git a/glibc-RHEL-15696-49.patch b/glibc-RHEL-15696-49.patch new file mode 100644 index 0000000..b59f582 --- /dev/null +++ b/glibc-RHEL-15696-49.patch @@ -0,0 +1,55 @@ +From 08cbcd4dbc686bb38ec3093aff2f919fbff5ec17 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Jun 2021 19:19:34 -0400 +Subject: [PATCH] x86: Remove unnecessary overflow check from wcsnlen-sse4_1.S +Content-type: text/plain; charset=UTF-8 + +No bug. The way wcsnlen will check if near the end of maxlen +is the following macro: + + mov %r11, %rsi; \ + subq %rax, %rsi; \ + andq $-64, %rax; \ + testq $-64, %rsi; \ + je L(strnlen_ret) + +Which words independently of s + maxlen overflowing. So the +second overflow check is unnecissary for correctness and +just extra overhead in the common no overflow case. + +test-strlen.c, test-wcslen.c, test-strnlen.c and test-wcsnlen.c are +all passing + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strlen-vec.S | 7 ------- + 1 file changed, 7 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S +index 439e486a..b7657282 100644 +--- a/sysdeps/x86_64/multiarch/strlen-vec.S ++++ b/sysdeps/x86_64/multiarch/strlen-vec.S +@@ -71,19 +71,12 @@ L(n_nonzero): + suffice. */ + mov %RSI_LP, %R10_LP + sar $62, %R10_LP +- test %R10_LP, %R10_LP + jnz __wcslen_sse4_1 + sal $2, %RSI_LP + # endif + +- + /* Initialize long lived registers. */ +- + add %RDI_LP, %RSI_LP +-# ifdef AS_WCSLEN +-/* Check for overflow again from s + maxlen * sizeof(wchar_t). */ +- jbe __wcslen_sse4_1 +-# endif + mov %RSI_LP, %R10_LP + and $-64, %R10_LP + mov %RSI_LP, %R11_LP +-- +GitLab + diff --git a/glibc-RHEL-15696-5.patch b/glibc-RHEL-15696-5.patch new file mode 100644 index 0000000..75d3978 --- /dev/null +++ b/glibc-RHEL-15696-5.patch @@ -0,0 +1,290 @@ +From 82d0b4a4d76db554eb6757acb790fcea30b19965 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:32:24 -0800 +Subject: [PATCH] x86-64 memset/wmemset: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes memset/wmemset for x32. Tested on x86-64 and x32. On +x86-64, libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use + RDX_LP for length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset. + * sysdeps/x86_64/x32/tst-size_t-memset.c: New file. + * sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise. 
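The commit message above explains the actual defect: on x32 the caller may leave non-zero garbage in the upper 32 bits of the 64-bit register that carries a size_t, so the assembly routines must either restrict themselves to the 32-bit subregister or zero-extend the length first. The following is a minimal, hypothetical C sketch of that idiom (the names usable_length and dirty are illustrative and not taken from the patch); it assumes only standard C on an x86-64 target.

/* Hypothetical illustration: truncating to 32 bits in C compiles to a
   single zero-extending 32-bit move, the same form as the "mov %edx, %edx"
   the patch inserts ahead of the length checks.  */
#include <stdint.h>
#include <stdio.h>

static uint64_t
usable_length (uint64_t reg)
{
  /* Keep only the lower 32 bits; a 32-bit register write zero-extends
     on x86-64, so no extra masking instruction is needed.  */
  return (uint64_t) (uint32_t) reg;
}

int
main (void)
{
  uint64_t dirty = 0xdeadbeef00000100ULL;   /* upper 32 bits are garbage */
  printf ("raw=%#llx  length=%llu\n",
          (unsigned long long) dirty,
          (unsigned long long) usable_length (dirty));
  return 0;
}

Compiled at -O2 this typically reduces to one 32-bit mov plus the return, which is why the fix costs essentially nothing on the hot path.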
+--- + .../multiarch/memset-avx512-no-vzeroupper.S | 6 +- + .../multiarch/memset-vec-unaligned-erms.S | 34 +++++---- + sysdeps/x86_64/x32/Makefile | 4 +- + sysdeps/x86_64/x32/tst-size_t-memset.c | 73 +++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wmemset.c | 20 +++++ + 5 files changed, 121 insertions(+), 16 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S +index 689cc119..99e25519 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S +@@ -29,12 +29,16 @@ + .section .text.avx512,"ax",@progbits + #if defined PIC + ENTRY (MEMSET_CHK) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMSET_CHK) + #endif + + ENTRY (MEMSET) ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx ++# endif + vpxor %xmm0, %xmm0, %xmm0 + vmovd %esi, %xmm1 + lea (%rdi, %rdx), %rsi +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 270a1d49..9a0fd818 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -65,8 +65,8 @@ + .section SECTION(.text),"ax",@progbits + #if VEC_SIZE == 16 && IS_IN (libc) + ENTRY (__bzero) +- movq %rdi, %rax /* Set return value. */ +- movq %rsi, %rdx /* Set n. */ ++ mov %RDI_LP, %RAX_LP /* Set return value. */ ++ mov %RSI_LP, %RDX_LP /* Set n. */ + pxor %xmm0, %xmm0 + jmp L(entry_from_bzero) + END (__bzero) +@@ -76,13 +76,13 @@ weak_alias (__bzero, bzero) + #if IS_IN (libc) + # if defined SHARED + ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + # endif + + ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) +- shlq $2, %rdx ++ shl $2, %RDX_LP + WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + jmp L(entry_from_bzero) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) +@@ -90,13 +90,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned)) + + #if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + #endif + + ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx ++# endif + L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) +@@ -112,14 +116,14 @@ END (MEMSET_SYMBOL (__memset, unaligned)) + + # if VEC_SIZE == 16 + ENTRY (__memset_chk_erms) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__memset_chk_erms) + + /* Only used to measure performance of REP STOSB. */ + ENTRY (__memset_erms) + /* Skip zero length. */ +- testq %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jnz L(stosb) + movq %rdi, %rax + ret +@@ -131,11 +135,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms)) + L(stosb): + /* Issue vzeroupper before rep stosb. 
*/ + VZEROUPPER +- movq %rdx, %rcx ++ mov %RDX_LP, %RCX_LP + movzbl %sil, %eax +- movq %rdi, %rdx ++ mov %RDI_LP, %RDX_LP + rep stosb +- movq %rdx, %rax ++ mov %RDX_LP, %RAX_LP + ret + # if VEC_SIZE == 16 + END (__memset_erms) +@@ -145,16 +149,20 @@ END (MEMSET_SYMBOL (__memset, erms)) + + # if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + + ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +- cmpq $VEC_SIZE, %rdx ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx ++# endif ++ cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) +- cmpq $(VEC_SIZE * 2), %rdx ++ cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index e99dbd7c..98bd9ae9 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -7,9 +7,9 @@ endif + + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ +- tst-size_t-memrchr ++ tst-size_t-memrchr tst-size_t-memset + endif + + ifeq ($(subdir),wcsmbs) +-tests += tst-size_t-wmemchr tst-size_t-wmemcmp ++tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset + endif +diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c +new file mode 100644 +index 00000000..2c367af6 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memset.c +@@ -0,0 +1,73 @@ ++/* Test memset with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifdef WIDE ++# define TEST_NAME "wmemset" ++#else ++# define TEST_NAME "memset" ++#endif /* WIDE */ ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++# define MEMSET wmemset ++# define CHAR wchar_t ++#else ++# define MEMSET memset ++# define CHAR char ++#endif /* WIDE */ ++ ++IMPL (MEMSET, 1) ++ ++typedef CHAR *(*proto_t) (CHAR *, int, size_t); ++ ++static void * ++__attribute__ ((noinline, noclone)) ++do_memset (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, (uintptr_t) b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ CHAR ch = 0x23; ++ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 }; ++ parameter_t c = { { 0 }, (void *) (uintptr_t) ch }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ c.fn = impl->fn; ++ CHAR *p = (CHAR *) do_memset (src, c); ++ size_t i; ++ for (i = 0; i < src.len; i++) ++ if (p[i] != ch) ++ { ++ error (0, 0, "Wrong result in function %s", impl->name); ++ ret = 1; ++ } ++ } ++ ++ return ret ? 
EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c +new file mode 100644 +index 00000000..955eb488 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c +@@ -0,0 +1,20 @@ ++/* Test wmemset with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-memset.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-50.patch b/glibc-RHEL-15696-50.patch new file mode 100644 index 0000000..e896698 --- /dev/null +++ b/glibc-RHEL-15696-50.patch @@ -0,0 +1,43 @@ +From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001 +Author: Shen-Ta Hsieh 2021-05-23 21:43:10 +Committer: H.J. Lu 2021-06-27 10:56:57 +Parent: 2c16cb88a6e5ace0fb7cedca86860ea7bde522a7 (Linux: Move timer helper routines from librt to libc) +Child: 1683249d17e14827b6579529742eb895027dfa84 (x86_64: roundeven with sse4.1 support) +Branches: master, remotes/origin/master and many more (41) +Follows: glibc-2.33.9000 +Precedes: glibc-2.34 + + math: redirect roundeven function + + This patch redirect roundeven function for futhermore changes. + + Signed-off-by: Shen-Ta Hsieh + Reviewed-by: H.J. Lu + +Conflicts: + * + (rewritten for older branch) + +diff --git a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c +index 7bbbb2dc..8728d0f2 100644 +--- a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c ++++ b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . */ + ++#define NO_MATH_REDIRECT + #include + #include + #include +@@ -67,5 +68,6 @@ __roundeven (double x) + INSERT_WORDS64 (x, ix); + return x; + } +-hidden_def (__roundeven) ++#ifndef __roundeven + libm_alias_double (__roundeven, roundeven) ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-51.patch b/glibc-RHEL-15696-51.patch new file mode 100644 index 0000000..105843d --- /dev/null +++ b/glibc-RHEL-15696-51.patch @@ -0,0 +1,118 @@ +From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001 +From: Shen-Ta Hsieh +Date: Mon, 24 May 2021 09:43:10 +0800 +Subject: [PATCH] math: redirect roundeven function +Content-type: text/plain; charset=UTF-8 + +This patch redirect roundeven function for futhermore changes. + +Signed-off-by: Shen-Ta Hsieh +Reviewed-by: H.J. 
Lu +--- + include/math.h | 3 ++- + sysdeps/ieee754/dbl-64/s_roundeven.c | 4 +++- + sysdeps/ieee754/float128/s_roundevenf128.c | 1 + + sysdeps/ieee754/flt-32/s_roundevenf.c | 3 +++ + sysdeps/ieee754/ldbl-128/s_roundevenl.c | 1 + + sysdeps/ieee754/ldbl-96/s_roundevenl.c | 1 + + 6 files changed, 11 insertions(+), 2 deletions(-) + +Conflicts: + include/math.h + (missing MATH_REDIRECT macros) + +diff --git a/include/math.h b/include/math.h +index e21d34b8..1f9f9a54 100644 +--- a/include/math.h ++++ b/include/math.h +@@ -38,7 +38,6 @@ libm_hidden_proto (__issignaling) + libm_hidden_proto (__issignalingf) + libm_hidden_proto (__exp) + libm_hidden_proto (__expf) +-libm_hidden_proto (__roundeven) + + # ifndef __NO_LONG_DOUBLE_MATH + libm_hidden_proto (__fpclassifyl) +@@ -56,6 +55,8 @@ libm_hidden_proto (__expm1f128) + + # if !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0) + # ifndef NO_MATH_REDIRECT ++float (roundevenf) (float) asm ("__roundevenf"); ++double (roundeven) (double) asm ("__roundeven"); + /* Declare sqrt for use within GLIBC. Compilers typically inline sqrt as a + single instruction. Use an asm to avoid use of PLTs if it doesn't. */ + float (sqrtf) (float) asm ("__ieee754_sqrtf"); +diff --git a/sysdeps/ieee754/dbl-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/s_roundeven.c +index 1438e81d..61962184 100644 +--- a/sysdeps/ieee754/dbl-64/s_roundeven.c ++++ b/sysdeps/ieee754/dbl-64/s_roundeven.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . */ + ++#define NO_MATH_REDIRECT + #include + #include + #include +@@ -101,5 +102,6 @@ __roundeven (double x) + INSERT_WORDS (x, hx, lx); + return x; + } +-hidden_def (__roundeven) ++#ifndef __roundeven + libm_alias_double (__roundeven, roundeven) ++#endif +diff --git a/sysdeps/ieee754/float128/s_roundevenf128.c b/sysdeps/ieee754/float128/s_roundevenf128.c +index 5a9b3f39..e0faf727 100644 +--- a/sysdeps/ieee754/float128/s_roundevenf128.c ++++ b/sysdeps/ieee754/float128/s_roundevenf128.c +@@ -1,2 +1,3 @@ ++#define NO_MATH_REDIRECT + #include + #include "../ldbl-128/s_roundevenl.c" +diff --git a/sysdeps/ieee754/flt-32/s_roundevenf.c b/sysdeps/ieee754/flt-32/s_roundevenf.c +index 90f991d5..a661875e 100644 +--- a/sysdeps/ieee754/flt-32/s_roundevenf.c ++++ b/sysdeps/ieee754/flt-32/s_roundevenf.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . */ + ++#define NO_MATH_REDIRECT + #include + #include + #include +@@ -67,4 +68,6 @@ __roundevenf (float x) + SET_FLOAT_WORD (x, ix); + return x; + } ++#ifndef __roundevenf + libm_alias_float (__roundeven, roundeven) ++#endif +diff --git a/sysdeps/ieee754/ldbl-128/s_roundevenl.c b/sysdeps/ieee754/ldbl-128/s_roundevenl.c +index 5fc59af4..b9375b6c 100644 +--- a/sysdeps/ieee754/ldbl-128/s_roundevenl.c ++++ b/sysdeps/ieee754/ldbl-128/s_roundevenl.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . */ + ++#define NO_MATH_REDIRECT + #include + #include + #include +diff --git a/sysdeps/ieee754/ldbl-96/s_roundevenl.c b/sysdeps/ieee754/ldbl-96/s_roundevenl.c +index be2e4fa4..65031ab7 100644 +--- a/sysdeps/ieee754/ldbl-96/s_roundevenl.c ++++ b/sysdeps/ieee754/ldbl-96/s_roundevenl.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . 
*/ + ++#define NO_MATH_REDIRECT + #include + #include + #include +-- +GitLab + diff --git a/glibc-RHEL-15696-52.patch b/glibc-RHEL-15696-52.patch new file mode 100644 index 0000000..4602f51 --- /dev/null +++ b/glibc-RHEL-15696-52.patch @@ -0,0 +1,242 @@ +From 1683249d17e14827b6579529742eb895027dfa84 Mon Sep 17 00:00:00 2001 +From: Shen-Ta Hsieh +Date: Mon, 24 May 2021 09:43:11 +0800 +Subject: [PATCH] x86_64: roundeven with sse4.1 support +Content-type: text/plain; charset=UTF-8 + +This patch adds support for the sse4.1 hardware floating point +roundeven. + +Here is some benchmark results on my systems: + +=AMD Ryzen 9 3900X 12-Core Processor= + +* benchmark result before this commit +| | roundeven | roundevenf | +|------------|--------------|--------------| +| duration | 3.75587e+09 | 3.75114e+09 | +| iterations | 3.93053e+08 | 4.35402e+08 | +| max | 52.592 | 58.71 | +| min | 7.98 | 7.22 | +| mean | 9.55563 | 8.61535 | + +* benchmark result after this commit +| | roundeven | roundevenf | +|------------|---------------|--------------| +| duration | 3.73815e+09 | 3.73738e+09 | +| iterations | 5.82692e+08 | 5.91498e+08 | +| max | 56.468 | 51.642 | +| min | 6.27 | 6.156 | +| mean | 6.41532 | 6.3185 | + +=Intel(R) Pentium(R) CPU D1508 @ 2.20GHz= + +* benchmark result before this commit +| | roundeven | roundevenf | +|------------|--------------|--------------| +| duration | 2.18208e+09 | 2.18258e+09 | +| iterations | 2.39932e+08 | 2.46924e+08 | +| max | 96.378 | 98.035 | +| min | 6.776 | 5.94 | +| mean | 9.09456 | 8.83907 | + +* benchmark result after this commit +| | roundeven | roundevenf | +|------------|--------------|--------------| +| duration | 2.17415e+09 | 2.17005e+09 | +| iterations | 3.56193e+08 | 4.09824e+08 | +| max | 51.693 | 97.192 | +| min | 5.926 | 5.093 | +| mean | 6.10385 | 5.29507 | + +Signed-off-by: Shen-Ta Hsieh +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/fpu/multiarch/Makefile | 5 +-- + sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c | 2 ++ + .../x86_64/fpu/multiarch/s_roundeven-sse4_1.S | 24 ++++++++++++++ + sysdeps/x86_64/fpu/multiarch/s_roundeven.c | 31 +++++++++++++++++++ + sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c | 3 ++ + .../fpu/multiarch/s_roundevenf-sse4_1.S | 24 ++++++++++++++ + sysdeps/x86_64/fpu/multiarch/s_roundevenf.c | 31 +++++++++++++++++++ + 7 files changed, 118 insertions(+), 2 deletions(-) + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf.c + +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index 9f387248..6ddd1c01 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -1,11 +1,12 @@ + ifeq ($(subdir),math) + libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \ + s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \ +- s_trunc-c s_truncf-c ++ s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c + + libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \ + s_floorf-sse4_1 s_nearbyint-sse4_1 \ +- s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \ ++ s_nearbyintf-sse4_1 s_roundeven-sse4_1 \ ++ s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \ + s_trunc-sse4_1 s_truncf-sse4_1 + + libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \ +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c +new file mode 100644 +index 00000000..c7be43cb +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c +@@ -0,0 +1,2 @@ ++#define __roundeven __roundeven_c ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S +new file mode 100644 +index 00000000..6ae8f6b1 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S +@@ -0,0 +1,24 @@ ++/* Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++ .section .text.sse4.1,"ax",@progbits ++ENTRY(__roundeven_sse41) ++ roundsd $8, %xmm0, %xmm0 ++ ret ++END(__roundeven_sse41) +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c +new file mode 100644 +index 00000000..d92eda65 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c +@@ -0,0 +1,31 @@ ++/* Multiple versions of __roundeven. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define roundeven __redirect_roundeven ++#define __roundeven __redirect___roundeven ++#include ++#undef roundeven ++#undef __roundeven ++ ++#define SYMBOL_NAME roundeven ++#include "ifunc-sse4_1.h" ++ ++libc_ifunc_redirected (__redirect_roundeven, __roundeven, IFUNC_SELECTOR ()); ++libm_alias_double (__roundeven, roundeven) +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c +new file mode 100644 +index 00000000..72a6e7d1 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c +@@ -0,0 +1,3 @@ ++#undef __roundevenf ++#define __roundevenf __roundevenf_c ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S +new file mode 100644 +index 00000000..a76e1080 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S +@@ -0,0 +1,24 @@ ++/* Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++ .section .text.sse4.1,"ax",@progbits ++ENTRY(__roundevenf_sse41) ++ roundss $8, %xmm0, %xmm0 ++ ret ++END(__roundevenf_sse41) +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c +new file mode 100644 +index 00000000..2ee196e6 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c +@@ -0,0 +1,31 @@ ++/* Multiple versions of __roundevenf. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++#define roundevenf __redirect_roundevenf ++#define __roundevenf __redirect___roundevenf ++#include ++#undef roundevenf ++#undef __roundevenf ++ ++#define SYMBOL_NAME roundevenf ++#include "ifunc-sse4_1.h" ++ ++libc_ifunc_redirected (__redirect_roundevenf, __roundevenf, IFUNC_SELECTOR ()); ++libm_alias_float (__roundeven, roundeven) +-- +GitLab + diff --git a/glibc-RHEL-15696-53.patch b/glibc-RHEL-15696-53.patch new file mode 100644 index 0000000..7221d38 --- /dev/null +++ b/glibc-RHEL-15696-53.patch @@ -0,0 +1,41 @@ +From 7e08db3359c86c94918feb33a1182cd0ff3bb10b Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sun, 9 Jan 2022 16:02:28 -0600 +Subject: [PATCH] x86: Fix __wcsncmp_evex in strcmp-evex.S [BZ# 28755] +Content-type: text/plain; charset=UTF-8 + +Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to +__wcscmp_evex. For x86_64 this covers the entire address range so any +length larger could not possibly be used to bound `s1` or `s2`. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 459eeed0..d5aa6daa 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -97,6 +97,16 @@ ENTRY (STRCMP) + je L(char0) + jb L(zero) + # ifdef USE_AS_WCSCMP ++# ifndef __ILP32__ ++ movq %rdx, %rcx ++ /* Check if length could overflow when multiplied by ++ sizeof(wchar_t). Checking top 8 bits will cover all potential ++ overflow cases as well as redirect cases where its impossible to ++ length to bound a valid memory region. In these cases just use ++ 'wcscmp'. */ ++ shrq $56, %rcx ++ jnz __wcscmp_evex ++# endif + /* Convert units: from wide to byte char. */ + shl $2, %RDX_LP + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-54.patch b/glibc-RHEL-15696-54.patch new file mode 100644 index 0000000..b2aaaa1 --- /dev/null +++ b/glibc-RHEL-15696-54.patch @@ -0,0 +1,268 @@ +From 78c9ec9000f873abe7a15a91b87080a2e4308260 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 20 Aug 2021 06:42:24 -0700 +Subject: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ + #28252] +Content-type: text/plain; charset=UTF-8 + +Optimize loads of all bits set into ZMM register in AVX512 SVML codes +by replacing + + vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX + +and + + vmovups .L_2il0floatpacket.13(%rip), %zmmX + +with + vpternlogd $0xff, %zmmX, %zmmX, %zmmX + +This fixes BZ #28252. 
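As a side illustration of the idiom this commit introduces (not taken from the patch; the sketch below assumes a compiler and CPU with AVX512F, built with -mavx512f), the same "all bits set" vector can be produced from intrinsics with a ternary-logic immediate of 0xff, avoiding the .rodata constant and its PC-relative load.

/* Hypothetical sketch of the vpternlogd $0xff idiom: a truth table of 0xff
   evaluates to 1 for every input combination, so the instruction yields
   all-ones regardless of what the source register held.  Here the source
   is explicitly zeroed to keep the C well defined.  */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  __m512i x = _mm512_setzero_si512 ();
  __m512i ones = _mm512_ternarylogic_epi32 (x, x, x, 0xff);  /* vpternlogd */

  uint32_t lanes[16];
  _mm512_storeu_si512 (lanes, ones);
  printf ("lane 0 = %#x\n", lanes[0]);   /* prints 0xffffffff */
  return 0;
}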
+--- + .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------ + .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++---------- + .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------ + 10 files changed, 11 insertions(+), 64 deletions(-) + +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S +index 24e3b363..07dfed85 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S +@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos + vmovaps %zmm0, %zmm8 + + /* Check for large arguments path */ +- vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 ++ vpternlogd $0xff, %zmm2, %zmm2, %zmm2 + + /* + ARGUMENT RANGE REDUCTION: +@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_cos_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.16: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.16,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S +index ae8af8d8..ddb60e5b 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S +@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log + + /* preserve mantissa, set input exponent to 2^(-10) */ + vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 +- vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 ++ vpternlogd $0xff, %zmm1, %zmm1, %zmm1 + vpsrlq $32, %zmm4, %zmm6 + + /* reciprocal approximation good to at least 11 bits */ +@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_log_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.12: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.12,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S +index 2d4b14fd..529c454a 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S +@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin + andq $-64, %rsp + subq $1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax +- vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 ++ vpternlogd $0xff, %zmm1, %zmm1, %zmm14 + vmovups __dAbsMask(%rax), %zmm7 + vmovups __dInvPI(%rax), %zmm2 + vmovups __dRShifter(%rax), %zmm1 +@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_sin_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.14: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.14,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S +index 2df626c0..e501a53a 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S +@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos + + /* SinPoly = 
SinR*SinPoly */ + vfmadd213pd %zmm5, %zmm5, %zmm4 +- vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 ++ vpternlogd $0xff, %zmm3, %zmm3, %zmm3 + + /* Update Cos result's sign */ + vxorpd %zmm2, %zmm1, %zmm1 +@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl) + ENTRY (_ZGVeN8vvv_sincos_skx) + WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx + END (_ZGVeN8vvv_sincos_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.15: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.15,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S +index 6ea1137b..377af394 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S +@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf + X = X - Y*PI1 - Y*PI2 - Y*PI3 + */ + vmovaps %zmm0, %zmm6 +- vmovups .L_2il0floatpacket.13(%rip), %zmm12 ++ vpternlogd $0xff, %zmm12, %zmm12, %zmm12 + vmovups __sRShifter(%rax), %zmm3 + vmovups __sPI1_FMA(%rax), %zmm5 + vmovups __sA9_FMA(%rax), %zmm9 +@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf + jmp .LBL_2_7 + #endif + END (_ZGVeN16v_cosf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S +index 89ba0df2..46f33d46 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S +@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf + vmovaps %zmm0, %zmm7 + + /* compare against threshold */ +- vmovups .L_2il0floatpacket.13(%rip), %zmm3 ++ vpternlogd $0xff, %zmm3, %zmm3, %zmm3 + vmovups __sInvLn2(%rax), %zmm4 + vmovups __sShifter(%rax), %zmm1 + vmovups __sLn2hi(%rax), %zmm6 +@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf + + #endif + END (_ZGVeN16v_expf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S +index 4cf0a96f..9e254956 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S +@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf + andq $-64, %rsp + subq $1280, %rsp + movq __svml_slog_data@GOTPCREL(%rip), %rax +- vmovups .L_2il0floatpacket.7(%rip), %zmm6 ++ vpternlogd $0xff, %zmm6, %zmm6, %zmm6 + vmovups _iBrkValue(%rax), %zmm4 + vmovups _sPoly_7(%rax), %zmm8 + +@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf + + #endif + END (_ZGVeN16v_logf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.7: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.7,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S +index bdcd50af..e8331ba1 100644 +--- 
a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S +@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf + vpsrlq $32, %zmm3, %zmm2 + vpmovqd %zmm2, %ymm11 + vcvtps2pd %ymm14, %zmm13 +- vmovups .L_2il0floatpacket.23(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + vmovaps %zmm14, %zmm26 + vpandd _ABSMASK(%rax), %zmm1, %zmm8 + vpcmpd $1, _INF(%rax), %zmm8, %k2 +@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf + vpmovqd %zmm11, %ymm5 + vpxord %zmm10, %zmm10, %zmm10 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} +- vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 ++ vpternlogd $0xff, %zmm4, %zmm4, %zmm4 + vpxord %zmm11, %zmm11, %zmm11 + vcvtdq2pd %ymm7, %zmm7 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} +@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf + jmp .LBL_2_7 + #endif + END (_ZGVeN16vv_powf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.23: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.23,@object +-.L_2il0floatpacket.24: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.24,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S +index 5fa4bc41..1f46f334 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S +@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf + + /* Result sign calculations */ + vpternlogd $150, %zmm0, %zmm14, %zmm1 +- vmovups .L_2il0floatpacket.13(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + + /* Add correction term 0.5 for cos() part */ + vaddps %zmm8, %zmm5, %zmm15 +@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl) + ENTRY (_ZGVeN16vvv_sincosf_skx) + WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx + END (_ZGVeN16vvv_sincosf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S +index 141f747e..1fc9308a 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S +@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + + /* Check for large and special values */ +- vmovups .L_2il0floatpacket.11(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + vmovups __sAbsMask(%rax), %zmm5 + vmovups __sInvPI(%rax), %zmm1 + vmovups __sRShifter(%rax), %zmm2 +@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf + jmp .LBL_2_7 + #endif + END (_ZGVeN16v_sinf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.11: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.11,@object +-- +GitLab + diff --git a/glibc-RHEL-15696-55.patch b/glibc-RHEL-15696-55.patch new file mode 100644 index 0000000..d44eef1 --- /dev/null +++ b/glibc-RHEL-15696-55.patch @@ 
-0,0 +1,48 @@ +From fc5bd179ef3a953dff8d1655bd530d0e230ffe71 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 21 Sep 2021 18:31:49 -0500 +Subject: [PATCH] x86: Modify ENTRY in sysdep.h so that p2align can be + specified +Content-type: text/plain; charset=UTF-8 + +No bug. + +This change adds a new macro ENTRY_P2ALIGN which takes a second +argument, log2 of the desired function alignment. + +The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this +doesn't affect any existing functionality. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86/sysdep.h | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index 01bac0f6..a70bb3a2 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -78,15 +78,18 @@ enum cf_protection_level + #define ASM_SIZE_DIRECTIVE(name) .size name,.-name; + + /* Define an entry point visible from C. */ +-#define ENTRY(name) \ ++#define ENTRY_P2ALIGN(name, alignment) \ + .globl C_SYMBOL_NAME(name); \ + .type C_SYMBOL_NAME(name),@function; \ +- .align ALIGNARG(4); \ ++ .align ALIGNARG(alignment); \ + C_LABEL(name) \ + cfi_startproc; \ + _CET_ENDBR; \ + CALL_MCOUNT + ++/* Common entry 16 byte aligns. */ ++#define ENTRY(name) ENTRY_P2ALIGN (name, 4) ++ + #undef END + #define END(name) \ + cfi_endproc; \ +-- +GitLab + diff --git a/glibc-RHEL-15696-56.patch b/glibc-RHEL-15696-56.patch new file mode 100644 index 0000000..45b9975 --- /dev/null +++ b/glibc-RHEL-15696-56.patch @@ -0,0 +1,658 @@ +From 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 21 Sep 2021 18:45:03 -0500 +Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S for frontend behavior and + size +Content-type: text/plain; charset=UTF-8 + +No bug. + +The frontend optimizations are to: +1. Reorganize logically connected basic blocks so they are either in + the same cache line or adjacent cache lines. +2. Avoid cases when basic blocks unnecissarily cross cache lines. +3. Try and 32 byte align any basic blocks possible without sacrificing + code size. Smaller / Less hot basic blocks are used for this. + +Overall code size shrunk by 168 bytes. This should make up for any +extra costs due to aligning to 64 bytes. + +In general performance before deviated a great deal dependending on +whether entry alignment % 64 was 0, 16, 32, or 48. These changes +essentially make it so that the current implementation is at least +equal to the best alignment of the original for any arguments. + +The only additional optimization is in the page cross case. Branch on +equals case was removed from the size == [4, 7] case. As well the [4, +7] and [2, 3] case where swapped as [4, 7] is likely a more hot +argument size. + +test-memcmp and test-wmemcmp are both passing. +--- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 434 +++++++++++-------- + 1 file changed, 242 insertions(+), 192 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 654dc7ac..2761b54f 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -34,7 +34,24 @@ + area. + 7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less. + 8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less. +- 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. */ ++ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. 
++ ++When possible the implementation tries to optimize for frontend in the ++following ways: ++Throughput: ++ 1. All code sections that fit are able to run optimally out of the ++ LSD. ++ 2. All code sections that fit are able to run optimally out of the ++ DSB ++ 3. Basic blocks are contained in minimum number of fetch blocks ++ necessary. ++ ++Latency: ++ 1. Logically connected basic blocks are put in the same ++ cache-line. ++ 2. Logically connected basic blocks that do not fit in the same ++ cache-line are put in adjacent lines. This can get beneficial ++ L2 spatial prefetching and L1 next-line prefetching. */ + + # include + +@@ -47,9 +64,11 @@ + # ifdef USE_AS_WMEMCMP + # define CHAR_SIZE 4 + # define VPCMP vpcmpd ++# define VPTEST vptestmd + # else + # define CHAR_SIZE 1 + # define VPCMP vpcmpub ++# define VPTEST vptestmb + # endif + + # define VEC_SIZE 32 +@@ -75,7 +94,9 @@ + */ + + .section .text.evex,"ax",@progbits +-ENTRY (MEMCMP) ++/* Cache align memcmp entry. This allows for much more thorough ++ frontend optimization. */ ++ENTRY_P2ALIGN (MEMCMP, 6) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -89,7 +110,7 @@ ENTRY (MEMCMP) + VPCMP $4, (%rdi), %YMM1, %k1 + kmovd %k1, %eax + /* NB: eax must be destination register if going to +- L(return_vec_[0,2]). For L(return_vec_3 destination register ++ L(return_vec_[0,2]). For L(return_vec_3) destination register + must be ecx. */ + testl %eax, %eax + jnz L(return_vec_0) +@@ -121,10 +142,6 @@ ENTRY (MEMCMP) + testl %ecx, %ecx + jnz L(return_vec_3) + +- /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so +- compare with zero to get a mask is needed. */ +- vpxorq %XMM0, %XMM0, %XMM0 +- + /* Go to 4x VEC loop. */ + cmpq $(CHAR_PER_VEC * 8), %rdx + ja L(more_8x_vec) +@@ -148,47 +165,61 @@ ENTRY (MEMCMP) + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 +- /* Or together YMM1, YMM2, and YMM3 into YMM3. */ +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while +- oring with YMM3. Result is stored in YMM4. */ +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 +- /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */ +- VPCMP $4, %YMM4, %YMM0, %k1 ++ oring with YMM1. Result is stored in YMM4. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ ++ /* Or together YMM2, YMM3, and YMM4 into YMM4. */ ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ ++ /* Test YMM4 against itself. Store any CHAR mismatches in k1. ++ */ ++ VPTEST %YMM4, %YMM4, %k1 ++ /* k1 must go to ecx for L(return_vec_0_1_2_3). */ + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(return_vec_0_1_2_3) + /* NB: eax must be zero to reach here. */ + ret + +- /* NB: aligning 32 here allows for the rest of the jump targets +- to be tuned for 32 byte alignment. Most important this ensures +- the L(more_8x_vec) loop is 32 byte aligned. */ +- .p2align 5 +-L(less_vec): +- /* Check if one or less CHAR. This is necessary for size = 0 but +- is also faster for size = CHAR_SIZE. */ +- cmpl $1, %edx +- jbe L(one_or_less) ++ .p2align 4 ++L(8x_end_return_vec_0_1_2_3): ++ movq %rdx, %rdi ++L(8x_return_vec_0_1_2_3): ++ addq %rdi, %rsi ++L(return_vec_0_1_2_3): ++ VPTEST %YMM1, %YMM1, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) + +- /* Check if loading one VEC from either s1 or s2 could cause a +- page cross. This can have false positives but is by far the +- fastest method. 
*/ +- movl %edi, %eax +- orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(page_cross_less_vec) ++ VPTEST %YMM2, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_1) + +- /* No page cross possible. */ +- VMOVU (%rsi), %YMM2 +- VPCMP $4, (%rdi), %YMM2, %k1 +- kmovd %k1, %eax +- /* Create mask in ecx for potentially in bound matches. */ +- bzhil %edx, %eax, %eax +- jnz L(return_vec_0) ++ VPTEST %YMM3, %YMM3, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_2) ++L(return_vec_3): ++ /* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one ++ fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache ++ line. */ ++ bsfl %ecx, %ecx ++# ifdef USE_AS_WMEMCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif + ret + + .p2align 4 +@@ -209,10 +240,11 @@ L(return_vec_0): + # endif + ret + +- /* NB: No p2align necessary. Alignment % 16 is naturally 1 +- which is good enough for a target not in a loop. */ ++ .p2align 4 + L(return_vec_1): +- tzcntl %eax, %eax ++ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one ++ fetch block. */ ++ bsfl %eax, %eax + # ifdef USE_AS_WMEMCMP + movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +@@ -226,10 +258,11 @@ L(return_vec_1): + # endif + ret + +- /* NB: No p2align necessary. Alignment % 16 is naturally 2 +- which is good enough for a target not in a loop. */ ++ .p2align 4,, 10 + L(return_vec_2): +- tzcntl %eax, %eax ++ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one ++ fetch block. */ ++ bsfl %eax, %eax + # ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +@@ -243,40 +276,6 @@ L(return_vec_2): + # endif + ret + +- .p2align 4 +-L(8x_return_vec_0_1_2_3): +- /* Returning from L(more_8x_vec) requires restoring rsi. */ +- addq %rdi, %rsi +-L(return_vec_0_1_2_3): +- VPCMP $4, %YMM1, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_0) +- +- VPCMP $4, %YMM2, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_1) +- +- VPCMP $4, %YMM3, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_2) +-L(return_vec_3): +- tzcntl %ecx, %ecx +-# ifdef USE_AS_WMEMCMP +- movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax +- xorl %edx, %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx +- subl %ecx, %eax +-# endif +- ret +- + .p2align 4 + L(more_8x_vec): + /* Set end of s1 in rdx. */ +@@ -288,21 +287,19 @@ L(more_8x_vec): + andq $-VEC_SIZE, %rdi + /* Adjust because first 4x vec where check already. 
*/ + subq $-(VEC_SIZE * 4), %rdi ++ + .p2align 4 + L(loop_4x_vec): + VMOVU (%rsi, %rdi), %YMM1 + vpxorq (%rdi), %YMM1, %YMM1 +- + VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 + vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2 +- + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 +- + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 +- VPCMP $4, %YMM4, %YMM0, %k1 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(8x_return_vec_0_1_2_3) +@@ -319,28 +316,25 @@ L(loop_4x_vec): + cmpl $(VEC_SIZE * 2), %edi + jae L(8x_last_2x_vec) + ++ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 ++ + VMOVU (%rsi, %rdx), %YMM1 + vpxorq (%rdx), %YMM1, %YMM1 + + VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 + vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 +- +- vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 +- + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4 +- VPCMP $4, %YMM4, %YMM0, %k1 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +- /* Restore s1 pointer to rdi. */ +- movq %rdx, %rdi + testl %ecx, %ecx +- jnz L(8x_return_vec_0_1_2_3) ++ jnz L(8x_end_return_vec_0_1_2_3) + /* NB: eax must be zero to reach here. */ + ret + + /* Only entry is from L(more_8x_vec). */ +- .p2align 4 ++ .p2align 4,, 10 + L(8x_last_2x_vec): + VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 + kmovd %k1, %eax +@@ -355,7 +349,31 @@ L(8x_last_1x_vec): + jnz L(8x_return_vec_3) + ret + +- .p2align 4 ++ /* Not ideally aligned (at offset +9 bytes in fetch block) but ++ not aligning keeps it in the same cache line as ++ L(8x_last_1x/2x_vec) so likely worth it. As well, saves code ++ size. */ ++ .p2align 4,, 4 ++L(8x_return_vec_2): ++ subq $VEC_SIZE, %rdx ++L(8x_return_vec_3): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ leaq (%rdx, %rax, CHAR_SIZE), %rax ++ movl (VEC_SIZE * 3)(%rax), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ addq %rdx, %rax ++ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 3)(%rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4,, 10 + L(last_2x_vec): + /* Check second to last VEC. */ + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1 +@@ -374,26 +392,49 @@ L(last_1x_vec): + jnz L(return_vec_0_end) + ret + +- .p2align 4 +-L(8x_return_vec_2): +- subq $VEC_SIZE, %rdx +-L(8x_return_vec_3): +- tzcntl %eax, %eax ++ .p2align 4,, 10 ++L(return_vec_1_end): ++ /* Use bsf to save code size. This is necessary to have ++ L(one_or_less) fit in aligning bytes between. */ ++ bsfl %eax, %eax ++ addl %edx, %eax + # ifdef USE_AS_WMEMCMP +- leaq (%rdx, %rax, CHAR_SIZE), %rax +- movl (VEC_SIZE * 3)(%rax), %ecx ++ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax + # else +- addq %rdx, %rax +- movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx +- movzbl (VEC_SIZE * 3)(%rax), %eax ++ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax + subl %ecx, %eax + # endif + ret + ++ /* NB: L(one_or_less) fits in alignment padding between ++ L(return_vec_1_end) and L(return_vec_0_end). 
*/ ++# ifdef USE_AS_WMEMCMP ++L(one_or_less): ++ jb L(zero) ++ movl (%rdi), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi), %ecx ++ je L(zero) ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++ ret ++# else ++L(one_or_less): ++ jb L(zero) ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax ++ subl %ecx, %eax ++ ret ++# endif ++L(zero): ++ xorl %eax, %eax ++ ret ++ + .p2align 4 + L(return_vec_0_end): + tzcntl %eax, %eax +@@ -412,23 +453,56 @@ L(return_vec_0_end): + ret + + .p2align 4 +-L(return_vec_1_end): ++L(less_vec): ++ /* Check if one or less CHAR. This is necessary for size == 0 ++ but is also faster for size == CHAR_SIZE. */ ++ cmpl $1, %edx ++ jbe L(one_or_less) ++ ++ /* Check if loading one VEC from either s1 or s2 could cause a ++ page cross. This can have false positives but is by far the ++ fastest method. */ ++ movl %edi, %eax ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(page_cross_less_vec) ++ ++ /* No page cross possible. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMP $4, (%rdi), %YMM2, %k1 ++ kmovd %k1, %eax ++ /* Check if any matches where in bounds. Intentionally not ++ storing result in eax to limit dependency chain if it goes to ++ L(return_vec_0_lv). */ ++ bzhil %edx, %eax, %edx ++ jnz L(return_vec_0_lv) ++ xorl %eax, %eax ++ ret ++ ++ /* Essentially duplicate of L(return_vec_0). Ends up not costing ++ any code as shrinks L(less_vec) by allowing 2-byte encoding of ++ the jump and ends up fitting in aligning bytes. As well fits on ++ same cache line as L(less_vec) so also saves a line from having ++ to be fetched on cold calls to memcmp. */ ++ .p2align 4,, 4 ++L(return_vec_0_lv): + tzcntl %eax, %eax +- addl %edx, %eax + # ifdef USE_AS_WMEMCMP +- movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx ++ movl (%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +- cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx ++ cmpl (%rsi, %rax, CHAR_SIZE), %ecx ++ /* NB: no partial register stall here because xorl zero idiom ++ above. */ + setg %dl + leal -1(%rdx, %rdx), %eax + # else +- movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx +- movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax + subl %ecx, %eax + # endif + ret + +- + .p2align 4 + L(page_cross_less_vec): + /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 +@@ -439,108 +513,84 @@ L(page_cross_less_vec): + cmpl $8, %edx + jae L(between_8_15) + cmpl $4, %edx +- jae L(between_4_7) +-L(between_2_3): +- /* Load as big endian to avoid branches. */ +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- /* Subtraction is okay because the upper 8 bits are zero. */ +- subl %ecx, %eax +- ret +- .p2align 4 +-L(one_or_less): +- jb L(zero) +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax ++ jb L(between_2_3) ++ ++ /* Load as big endian with overlapping movbe to avoid branches. ++ */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ /* edx is guranteed to be positive int32 in range [4, 7]. */ ++ cmovne %edx, %eax ++ /* ecx is -1 if rcx > rax. Otherwise 0. */ ++ sbbl %ecx, %ecx ++ /* If rcx > rax, then ecx is 0 and eax is positive. If rcx == ++ rax then eax and ecx are zero. If rax < rax then ecx is -1 so ++ eax doesn't matter. 
*/ ++ orl %ecx, %eax + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(between_8_15): + # endif + /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ +- vmovq (%rdi), %XMM1 +- vmovq (%rsi), %XMM2 +- VPCMP $4, %XMM1, %XMM2, %k1 ++ vmovq (%rdi), %xmm1 ++ vmovq (%rsi), %xmm2 ++ VPCMP $4, %xmm1, %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) ++ jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. */ +- leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi +- leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi +- vmovq (%rdi), %XMM1 +- vmovq (%rsi), %XMM2 +- VPCMP $4, %XMM1, %XMM2, %k1 ++ vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1 ++ vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ VPCMP $4, %xmm1, %xmm2, %k1 ++ addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ret +- +- .p2align 4 +-L(zero): +- xorl %eax, %eax ++ jnz L(return_vec_0_end) + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ +- VMOVU (%rsi), %XMM2 +- VPCMP $4, (%rdi), %XMM2, %k1 ++ ++ /* Use movups to save code size. */ ++ movups (%rsi), %xmm2 ++ VPCMP $4, (%rdi), %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ++ jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. */ +- +- VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2 +- leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi +- leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi +- VPCMP $4, (%rdi), %XMM2, %k1 ++ movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 ++ addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ret +- +-# ifdef USE_AS_WMEMCMP +- .p2align 4 +-L(one_or_less): +- jb L(zero) +- movl (%rdi), %ecx +- xorl %edx, %edx +- cmpl (%rsi), %ecx +- je L(zero) +- setg %dl +- leal -1(%rdx, %rdx), %eax ++ jnz L(return_vec_0_end) + ret +-# else + +- .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- jz L(zero_4_7) +- sbbl %eax, %eax +- orl $1, %eax +-L(zero_4_7): ++# ifndef USE_AS_WMEMCMP ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ /* Subtraction is okay because the upper 8 bits are zero. */ ++ subl %ecx, %eax + ret + # endif +- + END (MEMCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-57.patch b/glibc-RHEL-15696-57.patch new file mode 100644 index 0000000..51d5dd0 --- /dev/null +++ b/glibc-RHEL-15696-57.patch @@ -0,0 +1,510 @@ +From e59ced238482fd71f3e493717f14f6507346741e Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 20 Sep 2021 16:20:15 -0500 +Subject: [PATCH] x86: Optimize memset-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No bug. + +Optimization are + +1. change control flow for L(more_2x_vec) to fall through to loop and + jump for L(less_4x_vec) and L(less_8x_vec). This uses less code + size and saves jumps for length > 4x VEC_SIZE. + +2. For EVEX/AVX512 move L(less_vec) closer to entry. + +3. Avoid complex address mode for length > 2x VEC_SIZE + +4. Slightly better aligning code for the loop from the perspective of + code size and uops. 
+ +5. Align targets so they make full use of their fetch block and if + possible cache line. + +6. Try and reduce total number of icache lines that will need to be + pulled in for a given length. + +7. Include "local" version of stosb target. For AVX2/EVEX/AVX512 + jumping to the stosb target in the sse2 code section will almost + certainly be to a new page. The new version does increase code size + marginally by duplicating the target but should get better iTLB + behavior as a result. + +test-memset, test-wmemset, and test-bzero are all passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/memset.S | 10 +- + .../multiarch/memset-avx2-unaligned-erms.S | 10 +- + .../multiarch/memset-avx512-unaligned-erms.S | 11 +- + .../multiarch/memset-evex-unaligned-erms.S | 11 +- + .../multiarch/memset-vec-unaligned-erms.S | 285 ++++++++++++------ + 5 files changed, 232 insertions(+), 95 deletions(-) + +Conflicts: + sysdeps/x86_64/memset.S + (GNU URL) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index b3426795..8672b030 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -18,13 +18,15 @@ + . */ + + #include ++#define USE_WITH_SSE2 1 + + #define VEC_SIZE 16 ++#define MOV_SIZE 3 ++#define RET_SIZE 1 ++ + #define VEC(i) xmm##i +-/* Don't use movups and movaps since it will get larger nop paddings for +- alignment. */ +-#define VMOVU movdqu +-#define VMOVA movdqa ++#define VMOVU movups ++#define VMOVA movaps + + #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index ae0860f3..1af668af 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -1,8 +1,14 @@ + #if IS_IN (libc) ++# define USE_WITH_AVX2 1 ++ + # define VEC_SIZE 32 ++# define MOV_SIZE 4 ++# define RET_SIZE 4 ++ + # define VEC(i) ymm##i +-# define VMOVU vmovdqu +-# define VMOVA vmovdqa ++ ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 8ad842fc..f14d6f84 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -1,11 +1,18 @@ + #if IS_IN (libc) ++# define USE_WITH_AVX512 1 ++ + # define VEC_SIZE 64 ++# define MOV_SIZE 6 ++# define RET_SIZE 1 ++ + # define XMM0 xmm16 + # define YMM0 ymm16 + # define VEC0 zmm16 + # define VEC(i) VEC##i +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ + # define VZEROUPPER + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 640f0929..64b09e77 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -1,11 +1,18 @@ + #if IS_IN (libc) ++# define USE_WITH_EVEX 1 ++ + # define VEC_SIZE 32 ++# define MOV_SIZE 6 ++# define RET_SIZE 1 ++ + # define XMM0 xmm16 + # define YMM0 ymm16 + # define VEC0 ymm16 + # define VEC(i) VEC##i +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ + # define VZEROUPPER + + # define 
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 909c33f6..f08b7323 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -63,8 +63,27 @@ + # endif + #endif + ++#if VEC_SIZE == 64 ++# define LOOP_4X_OFFSET (VEC_SIZE * 4) ++#else ++# define LOOP_4X_OFFSET (0) ++#endif ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++# define END_REG rcx ++# define LOOP_REG rdi ++#else ++# define END_REG rdi ++# define LOOP_REG rdx ++#endif ++ + #define PAGE_SIZE 4096 + ++/* Macro to calculate size of small memset block for aligning ++ purposes. */ ++#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz) (2 * (mov_sz) + (ret_sz) + 1) ++ ++ + #ifndef SECTION + # error SECTION is not defined! + #endif +@@ -74,6 +93,7 @@ + ENTRY (__bzero) + mov %RDI_LP, %RAX_LP /* Set return value. */ + mov %RSI_LP, %RDX_LP /* Set n. */ ++ xorl %esi, %esi + pxor %XMM0, %XMM0 + jmp L(entry_from_bzero) + END (__bzero) +@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + +-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) ++ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + jb L(less_vec) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +- VMOVU %VEC(0), (%rdi) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. ++ */ ++ VMOVU %VEC(0), (%rax) ++ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) + VZEROUPPER_RETURN +- +- .p2align 4 +-L(stosb_more_2x_vec): +- cmp __x86_rep_stosb_threshold(%rip), %RDX_LP +- ja L(stosb) +-#else +- .p2align 4 + #endif +-L(more_2x_vec): +- /* Stores to first 2x VEC before cmp as any path forward will +- require it. */ +- VMOVU %VEC(0), (%rdi) +- VMOVU %VEC(0), VEC_SIZE(%rdi) +- cmpq $(VEC_SIZE * 4), %rdx +- ja L(loop_start) +- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +-L(return): +-#if VEC_SIZE > 16 +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ ++ .p2align 4,, 10 ++L(last_2x_vec): ++#ifdef USE_LESS_VEC_MASK_STORE ++ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) ++ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) + #else +- ret ++ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) + #endif ++ VZEROUPPER_RETURN + +-L(loop_start): +- VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) +- VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi) +- cmpq $(VEC_SIZE * 8), %rdx +- jbe L(loop_end) +- andq $-(VEC_SIZE * 2), %rdi +- subq $-(VEC_SIZE * 4), %rdi +- leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx +- .p2align 4 +-L(loop): +- VMOVA %VEC(0), (%rdi) +- VMOVA %VEC(0), VEC_SIZE(%rdi) +- VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi) +- VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi) +- subq $-(VEC_SIZE * 4), %rdi +- cmpq %rcx, %rdi +- jb L(loop) +-L(loop_end): +- /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN. +- rdx as length is also unchanged. 
*/ +- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx) +- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx) +- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx) +- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) +- VZEROUPPER_SHORT_RETURN +- +- .p2align 4 ++ /* If have AVX512 mask instructions put L(less_vec) close to ++ entry as it doesn't take much space and is likely a hot target. ++ */ ++#ifdef USE_LESS_VEC_MASK_STORE ++ .p2align 4,, 10 + L(less_vec): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! + # endif +-# ifdef USE_LESS_VEC_MASK_STORE + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. Note that we are using rax which is set in +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. +- */ ++ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ + andl $(PAGE_SIZE - 1), %edi +- /* Check if VEC_SIZE store cross page. Mask stores suffer serious +- performance degradation when it has to fault supress. */ ++ /* Check if VEC_SIZE store cross page. Mask stores suffer ++ serious performance degradation when it has to fault supress. ++ */ + cmpl $(PAGE_SIZE - VEC_SIZE), %edi ++ /* This is generally considered a cold target. */ + ja L(cross_page) + # if VEC_SIZE > 32 + movq $-1, %rcx +@@ -247,58 +235,185 @@ L(less_vec): + bzhil %edx, %ecx, %ecx + kmovd %ecx, %k1 + # endif +- vmovdqu8 %VEC(0), (%rax) {%k1} ++ vmovdqu8 %VEC(0), (%rax){%k1} + VZEROUPPER_RETURN + ++# if defined USE_MULTIARCH && IS_IN (libc) ++ /* Include L(stosb_local) here if including L(less_vec) between ++ L(stosb_more_2x_vec) and ENTRY. This is to cache align the ++ L(stosb_more_2x_vec) target. */ ++ .p2align 4,, 10 ++L(stosb_local): ++ movzbl %sil, %eax ++ mov %RDX_LP, %RCX_LP ++ mov %RDI_LP, %RDX_LP ++ rep stosb ++ mov %RDX_LP, %RAX_LP ++ VZEROUPPER_RETURN ++# endif ++#endif ++ ++#if defined USE_MULTIARCH && IS_IN (libc) + .p2align 4 +-L(cross_page): ++L(stosb_more_2x_vec): ++ cmp __x86_rep_stosb_threshold(%rip), %RDX_LP ++ ja L(stosb_local) ++#endif ++ /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] ++ and (4x, 8x] jump to target. */ ++L(more_2x_vec): ++ ++ /* Two different methods of setting up pointers / compare. The ++ two methods are based on the fact that EVEX/AVX512 mov ++ instructions take more bytes then AVX2/SSE2 mov instructions. As ++ well that EVEX/AVX512 machines also have fast LEA_BID. Both ++ setup and END_REG to avoid complex address mode. For EVEX/AVX512 ++ this saves code size and keeps a few targets in one fetch block. ++ For AVX2/SSE2 this helps prevent AGU bottlenecks. */ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + ++ LOOP_4X_OFFSET) with LEA_BID. */ ++ ++ /* END_REG is rcx for EVEX/AVX512. */ ++ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG ++#endif ++ ++ /* Stores to first 2x VEC before cmp as any path forward will ++ require it. */ ++ VMOVU %VEC(0), (%rax) ++ VMOVU %VEC(0), VEC_SIZE(%rax) ++ ++ ++#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) ++ /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ ++ addq %rdx, %END_REG ++#endif ++ ++ cmpq $(VEC_SIZE * 4), %rdx ++ jbe L(last_2x_vec) ++ ++ /* Store next 2x vec regardless. */ ++ VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) ++ VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) ++ ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add ++ extra offset to addresses in loop. 
Used for AVX512 to save space ++ as no way to get (VEC_SIZE * 4) in imm8. */ ++# if LOOP_4X_OFFSET == 0 ++ subq $-(VEC_SIZE * 4), %LOOP_REG + # endif +-# if VEC_SIZE > 32 +- cmpb $32, %dl +- jae L(between_32_63) ++ /* Avoid imm32 compare here to save code size. */ ++ cmpq %rdi, %rcx ++#else ++ addq $-(VEC_SIZE * 4), %END_REG ++ cmpq $(VEC_SIZE * 8), %rdx ++#endif ++ jbe L(last_4x_vec) ++#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) ++ /* Set LOOP_REG (rdx). */ ++ leaq (VEC_SIZE * 4)(%rax), %LOOP_REG ++#endif ++ /* Align dst for loop. */ ++ andq $(VEC_SIZE * -2), %LOOP_REG ++ .p2align 4 ++L(loop): ++ VMOVA %VEC(0), LOOP_4X_OFFSET(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG) ++ subq $-(VEC_SIZE * 4), %LOOP_REG ++ cmpq %END_REG, %LOOP_REG ++ jb L(loop) ++ .p2align 4,, MOV_SIZE ++L(last_4x_vec): ++ VMOVU %VEC(0), LOOP_4X_OFFSET(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG) ++L(return): ++#if VEC_SIZE > 16 ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++#else ++ ret ++#endif ++ ++ .p2align 4,, 10 ++#ifndef USE_LESS_VEC_MASK_STORE ++# if defined USE_MULTIARCH && IS_IN (libc) ++ /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in ++ range for 2-byte jump encoding. */ ++L(stosb_local): ++ movzbl %sil, %eax ++ mov %RDX_LP, %RCX_LP ++ mov %RDI_LP, %RDX_LP ++ rep stosb ++ mov %RDX_LP, %RAX_LP ++ VZEROUPPER_RETURN + # endif +-# if VEC_SIZE > 16 +- cmpb $16, %dl ++ /* Define L(less_vec) only if not otherwise defined. */ ++ .p2align 4 ++L(less_vec): ++#endif ++L(cross_page): ++#if VEC_SIZE > 32 ++ cmpl $32, %edx ++ jae L(between_32_63) ++#endif ++#if VEC_SIZE > 16 ++ cmpl $16, %edx + jae L(between_16_31) +-# endif +- MOVQ %XMM0, %rcx +- cmpb $8, %dl ++#endif ++ MOVQ %XMM0, %rdi ++ cmpl $8, %edx + jae L(between_8_15) +- cmpb $4, %dl ++ cmpl $4, %edx + jae L(between_4_7) +- cmpb $1, %dl ++ cmpl $1, %edx + ja L(between_2_3) +- jb 1f +- movb %cl, (%rax) +-1: ++ jb L(return) ++ movb %sil, (%rax) + VZEROUPPER_RETURN +-# if VEC_SIZE > 32 ++ ++ /* Align small targets only if not doing so would cross a fetch ++ line. */ ++#if VEC_SIZE > 32 ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- VMOVU %YMM0, -32(%rax,%rdx) + VMOVU %YMM0, (%rax) ++ VMOVU %YMM0, -32(%rax, %rdx) + VZEROUPPER_RETURN +-# endif +-# if VEC_SIZE > 16 +- /* From 16 to 31. No branch when size == 16. */ ++#endif ++ ++#if VEC_SIZE >= 32 ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + L(between_16_31): +- VMOVU %XMM0, -16(%rax,%rdx) ++ /* From 16 to 31. No branch when size == 16. */ + VMOVU %XMM0, (%rax) ++ VMOVU %XMM0, -16(%rax, %rdx) + VZEROUPPER_RETURN +-# endif +- /* From 8 to 15. No branch when size == 8. */ ++#endif ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) + L(between_8_15): +- movq %rcx, -8(%rax,%rdx) +- movq %rcx, (%rax) ++ /* From 8 to 15. No branch when size == 8. */ ++ movq %rdi, (%rax) ++ movq %rdi, -8(%rax, %rdx) + VZEROUPPER_RETURN ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %ecx, -4(%rax,%rdx) +- movl %ecx, (%rax) ++ movl %edi, (%rax) ++ movl %edi, -4(%rax, %rdx) + VZEROUPPER_RETURN ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) + L(between_2_3): + /* From 2 to 3. 
No branch when size == 2. */ +- movw %cx, -2(%rax,%rdx) +- movw %cx, (%rax) ++ movw %di, (%rax) ++ movb %dil, -1(%rax, %rdx) + VZEROUPPER_RETURN + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-58.patch b/glibc-RHEL-15696-58.patch new file mode 100644 index 0000000..cec0788 --- /dev/null +++ b/glibc-RHEL-15696-58.patch @@ -0,0 +1,45 @@ +From bad852b61b79503fcb3c5fc379c70f768df3e1fb Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sat, 23 Oct 2021 01:26:47 -0400 +Subject: [PATCH] x86: Replace sse2 instructions with avx in + memcmp-evex-movbe.S +Content-type: text/plain; charset=UTF-8 + +This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'. + +it could potentially be dangerous to use SSE2 if this function is ever +called without using 'vzeroupper' beforehand. While compilers appear +to use 'vzeroupper' before function calls if AVX2 has been used, using +SSE2 here is more brittle. Since it is not absolutely necessary it +should be avoided. + +It costs 2-extra bytes but the extra bytes should only eat into +alignment padding. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 2761b54f..640f6757 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -561,13 +561,13 @@ L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + + /* Use movups to save code size. */ +- movups (%rsi), %xmm2 ++ vmovdqu (%rsi), %xmm2 + VPCMP $4, (%rdi), %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. */ +- movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2 + VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 + addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx + kmovd %k1, %eax +-- +GitLab + diff --git a/glibc-RHEL-15696-59.patch b/glibc-RHEL-15696-59.patch new file mode 100644 index 0000000..efc618c --- /dev/null +++ b/glibc-RHEL-15696-59.patch @@ -0,0 +1,695 @@ +From c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 29 Oct 2021 12:40:20 -0700 +Subject: [PATCH] x86-64: Improve EVEX strcmp with masked load +Content-type: text/plain; charset=UTF-8 + +In strcmp-evex.S, to compare 2 32-byte strings, replace + + VMOVU (%rdi, %rdx), %YMM0 + VMOVU (%rsi, %rdx), %YMM1 + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ + VPCMP $4, %YMM0, %YMM1, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM1, %k2 + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jne L(last_vector) + +with + + VMOVU (%rdi, %rdx), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi, %rdx). */ + VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} + kmovd %k1, %ecx + incl %ecx + jne L(last_vector) + +It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake +and Ice Lake. 
+ +Co-Authored-By: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 461 +++++++++++++------------ + 1 file changed, 243 insertions(+), 218 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index d5aa6daa..82f12ac8 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -41,6 +41,8 @@ + # ifdef USE_AS_WCSCMP + /* Compare packed dwords. */ + # define VPCMP vpcmpd ++# define VPMINU vpminud ++# define VPTESTM vptestmd + # define SHIFT_REG32 r8d + # define SHIFT_REG64 r8 + /* 1 dword char == 4 bytes. */ +@@ -48,6 +50,8 @@ + # else + /* Compare packed bytes. */ + # define VPCMP vpcmpb ++# define VPMINU vpminub ++# define VPTESTM vptestmb + # define SHIFT_REG32 ecx + # define SHIFT_REG64 rcx + /* 1 byte char == 1 byte. */ +@@ -67,6 +71,9 @@ + # define YMM5 ymm22 + # define YMM6 ymm23 + # define YMM7 ymm24 ++# define YMM8 ymm25 ++# define YMM9 ymm26 ++# define YMM10 ymm27 + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -76,7 +83,7 @@ + /* The main idea of the string comparison (byte or dword) using 256-bit + EVEX instructions consists of comparing (VPCMP) two ymm vectors. The + latter can be on either packed bytes or dwords depending on +- USE_AS_WCSCMP. In order to check the null char, algorithm keeps the ++ USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the + matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2 + KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes) + are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd +@@ -123,27 +130,21 @@ ENTRY (STRCMP) + jg L(cross_page) + /* Start comparing 4 vectors. */ + VMOVU (%rdi), %YMM0 +- VMOVU (%rsi), %YMM1 + +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 + +- /* Check for NULL in YMM0. */ +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- /* Check for NULL in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (%rsi). */ ++ VPCMP $0, (%rsi), %YMM0, %k1{%k2} + +- /* Each bit in K1 represents: +- 1. A mismatch in YMM0 and YMM1. Or +- 2. A NULL in YMM0 or YMM1. +- */ +- kord %k0, %k1, %k1 +- +- ktestd %k1, %k1 +- je L(next_3_vectors) + kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ je L(next_3_vectors) + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -172,9 +173,7 @@ L(return): + # endif + ret + +- .p2align 4 + L(return_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -210,9 +209,7 @@ L(return_vec_size): + # endif + ret + +- .p2align 4 + L(return_2_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -248,9 +245,7 @@ L(return_2_vec_size): + # endif + ret + +- .p2align 4 + L(return_3_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +@@ -289,43 +284,45 @@ L(return_3_vec_size): + .p2align 4 + L(next_3_vectors): + VMOVU VEC_SIZE(%rdi), %YMM0 +- VMOVU VEC_SIZE(%rsi), %YMM1 +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at VEC_SIZE(%rsi). */ ++ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_vec_size) + +- VMOVU (VEC_SIZE * 2)(%rdi), %YMM2 +- VMOVU (VEC_SIZE * 3)(%rdi), %YMM3 +- VMOVU (VEC_SIZE * 2)(%rsi), %YMM4 +- VMOVU (VEC_SIZE * 3)(%rsi), %YMM5 +- +- /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */ +- VPCMP $4, %YMM2, %YMM4, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM4, %k2 +- /* Each bit in K1 represents a NULL in YMM2 or YMM4. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ ++ VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_2_vec_size) + +- /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */ +- VPCMP $4, %YMM3, %YMM5, %k0 +- VPCMP $0, %YMMZERO, %YMM3, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k2 +- /* Each bit in K1 represents a NULL in YMM3 or YMM5. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ ++ VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_3_vec_size) + L(main_loop_header): + leaq (VEC_SIZE * 4)(%rdi), %rdx +@@ -375,56 +372,51 @@ L(back_to_loop): + VMOVA VEC_SIZE(%rax), %YMM2 + VMOVA (VEC_SIZE * 2)(%rax), %YMM4 + VMOVA (VEC_SIZE * 3)(%rax), %YMM6 +- VMOVU (%rdx), %YMM1 +- VMOVU VEC_SIZE(%rdx), %YMM3 +- VMOVU (VEC_SIZE * 2)(%rdx), %YMM5 +- VMOVU (VEC_SIZE * 3)(%rdx), %YMM7 +- +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K4 represents a NULL or a mismatch in YMM0 and +- YMM1. */ +- kord %k0, %k1, %k4 +- +- VPCMP $4, %YMM2, %YMM3, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM3, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K5 represents a NULL or a mismatch in YMM2 and +- YMM3. */ +- kord %k0, %k1, %k5 +- +- VPCMP $4, %YMM4, %YMM5, %k0 +- VPCMP $0, %YMMZERO, %YMM4, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K6 represents a NULL or a mismatch in YMM4 and +- YMM5. 
*/ +- kord %k0, %k1, %k6 +- +- VPCMP $4, %YMM6, %YMM7, %k0 +- VPCMP $0, %YMMZERO, %YMM6, %k1 +- VPCMP $0, %YMMZERO, %YMM7, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K7 represents a NULL or a mismatch in YMM6 and +- YMM7. */ +- kord %k0, %k1, %k7 +- +- kord %k4, %k5, %k0 +- kord %k6, %k7, %k1 +- +- /* Test each mask (32 bits) individually because for VEC_SIZE +- == 32 is not possible to OR the four masks and keep all bits +- in a 64-bit integer register, differing from SSE2 strcmp +- where ORing is possible. */ +- kortestd %k0, %k1 +- je L(loop) +- ktestd %k4, %k4 ++ ++ VPMINU %YMM0, %YMM2, %YMM8 ++ VPMINU %YMM4, %YMM6, %YMM9 ++ ++ /* A zero CHAR in YMM8 means that there is a null CHAR. */ ++ VPMINU %YMM8, %YMM9, %YMM8 ++ ++ /* Each bit set in K1 represents a non-null CHAR in YMM8. */ ++ VPTESTM %YMM8, %YMM8, %k1 ++ ++ /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. */ ++ vpxorq (%rdx), %YMM0, %YMM1 ++ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5 ++ vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7 ++ ++ vporq %YMM1, %YMM3, %YMM9 ++ vporq %YMM5, %YMM7, %YMM10 ++ ++ /* A non-zero CHAR in YMM9 represents a mismatch. */ ++ vporq %YMM9, %YMM10, %YMM9 ++ ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR. */ ++ VPCMP $0, %YMMZERO, %YMM9, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ je L(loop) ++ ++ /* Each bit set in K1 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM0 and (%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_vec) +- kmovd %k4, %edi +- tzcntl %edi, %ecx ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +@@ -466,9 +458,18 @@ L(test_vec): + cmpq $VEC_SIZE, %r11 + jbe L(zero) + # endif +- ktestd %k5, %k5 ++ /* Each bit set in K1 represents a non-null CHAR in YMM2. */ ++ VPTESTM %YMM2, %YMM2, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM2 and VEC_SIZE(%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_2_vec) +- kmovd %k5, %ecx + tzcntl %ecx, %edi + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -512,9 +513,18 @@ L(test_2_vec): + cmpq $(VEC_SIZE * 2), %r11 + jbe L(zero) + # endif +- ktestd %k6, %k6 ++ /* Each bit set in K1 represents a non-null CHAR in YMM4. */ ++ VPTESTM %YMM4, %YMM4, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM4 and (VEC_SIZE * 2)(%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_3_vec) +- kmovd %k6, %ecx + tzcntl %ecx, %edi + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -558,8 +568,18 @@ L(test_3_vec): + cmpq $(VEC_SIZE * 3), %r11 + jbe L(zero) + # endif +- kmovd %k7, %esi +- tzcntl %esi, %ecx ++ /* Each bit set in K1 represents a non-null CHAR in YMM6. */ ++ VPTESTM %YMM6, %YMM6, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM6 and (VEC_SIZE * 3)(%rdx). 
*/ ++ VPCMP $0, %YMMZERO, %YMM7, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +@@ -615,39 +635,51 @@ L(loop_cross_page): + + VMOVU (%rax, %r10), %YMM2 + VMOVU VEC_SIZE(%rax, %r10), %YMM3 +- VMOVU (%rdx, %r10), %YMM4 +- VMOVU VEC_SIZE(%rdx, %r10), %YMM5 +- +- VPCMP $4, %YMM4, %YMM2, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM4, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch in YMM2 and +- YMM4. */ +- kord %k0, %k1, %k1 +- +- VPCMP $4, %YMM5, %YMM3, %k3 +- VPCMP $0, %YMMZERO, %YMM3, %k4 +- VPCMP $0, %YMMZERO, %YMM5, %k5 +- kord %k4, %k5, %k4 +- /* Each bit in K3 represents a NULL or a mismatch in YMM3 and +- YMM5. */ +- kord %k3, %k4, %k3 ++ ++ /* Each bit set in K2 represents a non-null CHAR in YMM2. */ ++ VPTESTM %YMM2, %YMM2, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM2 and 32 bytes at (%rdx, %r10). */ ++ VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2} ++ kmovd %k1, %r9d ++ /* Don't use subl since it is the lower 16/32 bits of RDI ++ below. */ ++ notl %r9d ++# ifdef USE_AS_WCSCMP ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %r9d ++# endif ++ ++ /* Each bit set in K4 represents a non-null CHAR in YMM3. */ ++ VPTESTM %YMM3, %YMM3, %k4 ++ /* Each bit cleared in K3 represents a mismatch or a null CHAR ++ in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ ++ VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} ++ kmovd %k3, %edi ++# ifdef USE_AS_WCSCMP ++ /* Don't use subl since it is the upper 8 bits of EDI below. */ ++ notl %edi ++ andl $0xff, %edi ++# else ++ incl %edi ++# endif + + # ifdef USE_AS_WCSCMP +- /* NB: Each bit in K1/K3 represents 4-byte element. */ +- kshiftlw $8, %k3, %k2 ++ /* NB: Each bit in EDI/R9D represents 4-byte element. */ ++ sall $8, %edi + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG32 + sarl $2, %SHIFT_REG32 ++ ++ /* Each bit in EDI represents a null CHAR or a mismatch. */ ++ orl %r9d, %edi + # else +- kshiftlq $32, %k3, %k2 +-# endif ++ salq $32, %rdi + +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korq %k1, %k2, %k1 +- kmovq %k1, %rdi ++ /* Each bit in RDI represents a null CHAR or a mismatch. */ ++ orq %r9, %rdi ++# endif + + /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ + shrxq %SHIFT_REG64, %rdi, %rdi +@@ -692,35 +724,45 @@ L(loop_cross_page_2_vec): + /* The first VEC_SIZE * 2 bytes match or are ignored. */ + VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 + VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 +- VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2 +- VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3 +- +- VPCMP $4, %YMM0, %YMM2, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM2, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch in YMM0 and +- YMM2. */ +- kord %k0, %k1, %k1 +- +- VPCMP $4, %YMM1, %YMM3, %k3 +- VPCMP $0, %YMMZERO, %YMM1, %k4 +- VPCMP $0, %YMMZERO, %YMM3, %k5 +- kord %k4, %k5, %k4 +- /* Each bit in K3 represents a NULL or a mismatch in YMM1 and +- YMM3. */ +- kord %k3, %k4, %k3 + ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */ ++ VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2} ++ kmovd %k1, %r9d ++ /* Don't use subl since it is the lower 16/32 bits of RDI ++ below. 
*/ ++ notl %r9d + # ifdef USE_AS_WCSCMP +- /* NB: Each bit in K1/K3 represents 4-byte element. */ +- kshiftlw $8, %k3, %k2 ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %r9d ++# endif ++ ++ VPTESTM %YMM1, %YMM1, %k4 ++ /* Each bit cleared in K3 represents a mismatch or a null CHAR ++ in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ ++ VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} ++ kmovd %k3, %edi ++# ifdef USE_AS_WCSCMP ++ /* Don't use subl since it is the upper 8 bits of EDI below. */ ++ notl %edi ++ andl $0xff, %edi + # else +- kshiftlq $32, %k3, %k2 ++ incl %edi + # endif + +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korq %k1, %k2, %k1 +- kmovq %k1, %rdi ++# ifdef USE_AS_WCSCMP ++ /* NB: Each bit in EDI/R9D represents 4-byte element. */ ++ sall $8, %edi ++ ++ /* Each bit in EDI represents a null CHAR or a mismatch. */ ++ orl %r9d, %edi ++# else ++ salq $32, %rdi ++ ++ /* Each bit in RDI represents a null CHAR or a mismatch. */ ++ orq %r9, %rdi ++# endif + + xorl %r8d, %r8d + /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +@@ -729,12 +771,15 @@ L(loop_cross_page_2_vec): + /* R8 has number of bytes skipped. */ + movl %ecx, %r8d + # ifdef USE_AS_WCSCMP +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ /* NB: Divide shift count by 4 since each bit in RDI represent 4 + bytes. */ + sarl $2, %ecx +-# endif ++ /* Skip ECX bytes. */ ++ shrl %cl, %edi ++# else + /* Skip ECX bytes. */ + shrq %cl, %rdi ++# endif + 1: + /* Before jumping back to the loop, set ESI to the number of + VEC_SIZE * 4 blocks before page crossing. */ +@@ -818,7 +863,7 @@ L(cross_page_loop): + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx + # endif +- /* Check null char. */ ++ /* Check null CHAR. */ + testl %eax, %eax + jne L(cross_page_loop) + /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +@@ -901,18 +946,17 @@ L(cross_page): + jg L(cross_page_1_vector) + L(loop_1_vector): + VMOVU (%rdi, %rdx), %YMM0 +- VMOVU (%rsi, %rdx), %YMM1 +- +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 ++ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (%rsi, %rdx). */ ++ VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} + kmovd %k1, %ecx +- testl %ecx, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(last_vector) + + addl $VEC_SIZE, %edx +@@ -931,18 +975,17 @@ L(cross_page_1_vector): + cmpl $(PAGE_SIZE - 16), %eax + jg L(cross_page_1_xmm) + VMOVU (%rdi, %rdx), %XMM0 +- VMOVU (%rsi, %rdx), %XMM1 +- +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- korw %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korw %k0, %k1, %k1 +- kmovw %k1, %ecx +- testl %ecx, %ecx ++ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and 16 bytes at (%rsi, %rdx). 
*/ ++ VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xf, %ecx ++# else ++ subl $0xffff, %ecx ++# endif + jne L(last_vector) + + addl $16, %edx +@@ -965,25 +1008,16 @@ L(cross_page_1_xmm): + vmovq (%rdi, %rdx), %XMM0 + vmovq (%rsi, %rdx), %XMM1 + +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- kmovd %k1, %ecx +- ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and XMM1. */ ++ VPCMP $0, %XMM1, %XMM0, %k1{%k2} ++ kmovb %k1, %ecx + # ifdef USE_AS_WCSCMP +- /* Only last 2 bits are valid. */ +- andl $0x3, %ecx ++ subl $0x3, %ecx + # else +- /* Only last 8 bits are valid. */ +- andl $0xff, %ecx ++ subl $0xff, %ecx + # endif +- +- testl %ecx, %ecx + jne L(last_vector) + + addl $8, %edx +@@ -1002,25 +1036,16 @@ L(cross_page_8bytes): + vmovd (%rdi, %rdx), %XMM0 + vmovd (%rsi, %rdx), %XMM1 + +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and XMM1. */ ++ VPCMP $0, %XMM1, %XMM0, %k1{%k2} + kmovd %k1, %ecx +- + # ifdef USE_AS_WCSCMP +- /* Only the last bit is valid. */ +- andl $0x1, %ecx ++ subl $0x1, %ecx + # else +- /* Only last 4 bits are valid. */ +- andl $0xf, %ecx ++ subl $0xf, %ecx + # endif +- +- testl %ecx, %ecx + jne L(last_vector) + + addl $4, %edx +-- +GitLab + diff --git a/glibc-RHEL-15696-6.patch b/glibc-RHEL-15696-6.patch new file mode 100644 index 0000000..f6725a6 --- /dev/null +++ b/glibc-RHEL-15696-6.patch @@ -0,0 +1,300 @@ +From ee915088a0231cd421054dbd8abab7aadf331153 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:33:52 -0800 +Subject: [PATCH] x86-64 strncmp family: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes the strncmp family for x32. Tested on x86-64 and x32. +On x86-64, libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strcmp-avx2.S: Use RDX_LP for length. + * sysdeps/x86_64/multiarch/strcmp-sse42.S: Likewise. + * sysdeps/x86_64/strcmp.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp, + tst-size_t-strncmp and tst-size_t-wcsncmp. + * sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file. + * sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise. + * sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise. 
+--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 6 +- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 6 +- + sysdeps/x86_64/strcmp.S | 6 +- + sysdeps/x86_64/x32/Makefile | 6 +- + sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-strncmp.c | 78 +++++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wcsncmp.c | 20 ++++++ + 7 files changed, 170 insertions(+), 11 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 327e3d87..156c1949 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -79,15 +79,15 @@ + ENTRY (STRCMP) + # ifdef USE_AS_STRNCMP + /* Check for simple cases (0 or 1) in offset. */ +- cmp $1, %rdx ++ cmp $1, %RDX_LP + je L(char0) + jb L(zero) + # ifdef USE_AS_WCSCMP + /* Convert units: from wide to byte char. */ +- shl $2, %rdx ++ shl $2, %RDX_LP + # endif + /* Register %r11 tracks the maximum offset. */ +- movq %rdx, %r11 ++ mov %RDX_LP, %R11_LP + # endif + movl %edi, %eax + xorl %edx, %edx +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index d3c07bd2..a1ebea46 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -156,11 +156,11 @@ STRCMP_SSE42: + #endif + + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L +- test %rdx, %rdx ++ test %RDX_LP, %RDX_LP + je LABEL(strcmp_exitz) +- cmp $1, %rdx ++ cmp $1, %RDX_LP + je LABEL(Byte0) +- mov %rdx, %r11 ++ mov %RDX_LP, %R11_LP + #endif + mov %esi, %ecx + mov %edi, %eax +diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S +index e16945b9..f47c8ad4 100644 +--- a/sysdeps/x86_64/strcmp.S ++++ b/sysdeps/x86_64/strcmp.S +@@ -135,11 +135,11 @@ ENTRY (STRCMP) + * This implementation uses SSE to compare up to 16 bytes at a time. + */ + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L +- test %rdx, %rdx ++ test %RDX_LP, %RDX_LP + je LABEL(strcmp_exitz) +- cmp $1, %rdx ++ cmp $1, %RDX_LP + je LABEL(Byte0) +- mov %rdx, %r11 ++ mov %RDX_LP, %R11_LP + #endif + mov %esi, %ecx + mov %edi, %eax +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 98bd9ae9..db302839 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -7,9 +7,11 @@ endif + + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ +- tst-size_t-memrchr tst-size_t-memset ++ tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ ++ tst-size_t-strncmp + endif + + ifeq ($(subdir),wcsmbs) +-tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset ++tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \ ++ tst-size_t-wcsncmp + endif +diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c +new file mode 100644 +index 00000000..86233593 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c +@@ -0,0 +1,59 @@ ++/* Test strncaecmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define TEST_NAME "strncasecmp" ++#include "test-size_t.h" ++ ++IMPL (strncasecmp, 1) ++ ++typedef int (*proto_t) (const char *, const char *, size_t); ++ ++static int ++__attribute__ ((noinline, noclone)) ++do_strncasecmp (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ strncpy ((char *) buf1, (const char *) buf2, page_size); ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ int res = do_strncasecmp (dest, src); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c +new file mode 100644 +index 00000000..54e6bd83 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c +@@ -0,0 +1,78 @@ ++/* Test strncmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifdef WIDE ++# define TEST_NAME "wcsncmp" ++#else ++# define TEST_NAME "strncmp" ++#endif ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++ ++# define STRNCMP wcsncmp ++# define STRNCPY wcsncpy ++# define CHAR wchar_t ++#else ++# define STRNCMP strncmp ++# define STRNCPY strncpy ++# define CHAR char ++#endif ++ ++IMPL (STRNCMP, 1) ++ ++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); ++ ++ ++static int ++__attribute__ ((noinline, noclone)) ++do_strncmp (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ size_t size = page_size / sizeof (CHAR); ++ parameter_t dest = { { size }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size); ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ int res = do_strncmp (dest, src); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c +new file mode 100644 +index 00000000..4829647c +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c +@@ -0,0 +1,20 @@ ++/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-strncmp.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-60.patch b/glibc-RHEL-15696-60.patch new file mode 100644 index 0000000..a3739eb --- /dev/null +++ b/glibc-RHEL-15696-60.patch @@ -0,0 +1,54 @@ +From 6720d36b6623c5e48c070d86acf61198b33e144e Mon Sep 17 00:00:00 2001 +From: Fangrui Song +Date: Tue, 2 Nov 2021 20:59:52 -0700 +Subject: [PATCH] x86-64: Replace movzx with movzbl +Content-type: text/plain; charset=UTF-8 + +Clang cannot assemble movzx in the AT&T dialect mode. + +../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction + movzx (%rsi), %ecx + ^~~~ + +Change movzx to movzbl, which follows the AT&T dialect and is used +elsewhere in the file. + +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 4 ++-- + sysdeps/x86_64/strcmp.S | 4 ++-- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index a1ebea46..d8fdeb3a 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz): + .p2align 4 + // XXX Same as code above + LABEL(Byte0): +- movzx (%rsi), %ecx +- movzx (%rdi), %eax ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx +diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S +index f47c8ad4..aa6df898 100644 +--- a/sysdeps/x86_64/strcmp.S ++++ b/sysdeps/x86_64/strcmp.S +@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz): + + .p2align 4 + LABEL(Byte0): +- movzx (%rsi), %ecx +- movzx (%rdi), %eax ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx +-- +GitLab + diff --git a/glibc-RHEL-15696-61.patch b/glibc-RHEL-15696-61.patch new file mode 100644 index 0000000..d6dbe81 --- /dev/null +++ b/glibc-RHEL-15696-61.patch @@ -0,0 +1,56 @@ +From cf2c57526ba4b57e6863ad4db8a868e2678adce8 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 30 Apr 2021 05:58:59 -0700 +Subject: [PATCH] x86: Set rep_movsb_threshold to 2112 on processors with FSRM +Content-type: text/plain; charset=UTF-8 + +The glibc memcpy benchmark on Intel Core i7-1065G7 (Ice Lake) showed +that REP MOVSB became faster after 2112 bytes: + + Vector Move REP MOVSB +length=2112, align1=0, align2=0: 24.20 24.40 +length=2112, align1=1, align2=0: 26.07 23.13 +length=2112, align1=0, align2=1: 27.18 28.13 +length=2112, align1=1, align2=1: 26.23 25.16 +length=2176, align1=0, align2=0: 23.18 22.52 +length=2176, align1=2, align2=0: 25.45 22.52 +length=2176, align1=0, align2=2: 27.14 27.82 +length=2176, align1=2, align2=2: 22.73 25.56 +length=2240, align1=0, align2=0: 24.62 24.25 +length=2240, align1=3, align2=0: 29.77 27.15 +length=2240, align1=0, align2=3: 35.55 29.93 +length=2240, align1=3, align2=3: 34.49 25.15 +length=2304, align1=0, align2=0: 34.75 26.64 +length=2304, align1=4, align2=0: 32.09 22.63 +length=2304, align1=0, align2=4: 28.43 31.24 + +Use REP MOVSB for data size > 2112 bytes in memcpy on processors with +fast short REP MOVSB (FSRM). + + * sysdeps/x86/dl-cacheinfo.h (dl_init_cacheinfo): Set + rep_movsb_threshold to 2112 on processors with fast short REP + MOVSB (FSRM). +--- + sysdeps/x86/cacheinfo.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index f72f634a..cc3941d3 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -430,6 +430,12 @@ init_cacheinfo (void) + rep_movsb_threshold = 2048 * (16 / 16); + minimum_rep_movsb_threshold = 16 * 8; + } ++ ++ /* NB: The default REP MOVSB threshold is 2112 on processors with fast ++ short REP MOVSB (FSRM). 
*/ ++ if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) ++ rep_movsb_threshold = 2112; ++ + if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold) + __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; + else +-- +GitLab + diff --git a/glibc-RHEL-15696-62.patch b/glibc-RHEL-15696-62.patch new file mode 100644 index 0000000..a7a9286 --- /dev/null +++ b/glibc-RHEL-15696-62.patch @@ -0,0 +1,136 @@ +From 475b63702ef38b69558fc3d31a0b66776a70f1d3 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 1 Nov 2021 00:49:52 -0500 +Subject: [PATCH] x86: Double size of ERMS rep_movsb_threshold in + dl-cacheinfo.h +Content-type: text/plain; charset=UTF-8 + +No bug. + +This patch doubles the rep_movsb_threshold when using ERMS. Based on +benchmarks the vector copy loop, especially now that it handles 4k +aliasing, is better for these medium ranged. + +On Skylake with ERMS: + +Size, Align1, Align2, dst>src,(rep movsb) / (vec copy) +4096, 0, 0, 0, 0.975 +4096, 0, 0, 1, 0.953 +4096, 12, 0, 0, 0.969 +4096, 12, 0, 1, 0.872 +4096, 44, 0, 0, 0.979 +4096, 44, 0, 1, 0.83 +4096, 0, 12, 0, 1.006 +4096, 0, 12, 1, 0.989 +4096, 0, 44, 0, 0.739 +4096, 0, 44, 1, 0.942 +4096, 12, 12, 0, 1.009 +4096, 12, 12, 1, 0.973 +4096, 44, 44, 0, 0.791 +4096, 44, 44, 1, 0.961 +4096, 2048, 0, 0, 0.978 +4096, 2048, 0, 1, 0.951 +4096, 2060, 0, 0, 0.986 +4096, 2060, 0, 1, 0.963 +4096, 2048, 12, 0, 0.971 +4096, 2048, 12, 1, 0.941 +4096, 2060, 12, 0, 0.977 +4096, 2060, 12, 1, 0.949 +8192, 0, 0, 0, 0.85 +8192, 0, 0, 1, 0.845 +8192, 13, 0, 0, 0.937 +8192, 13, 0, 1, 0.939 +8192, 45, 0, 0, 0.932 +8192, 45, 0, 1, 0.927 +8192, 0, 13, 0, 0.621 +8192, 0, 13, 1, 0.62 +8192, 0, 45, 0, 0.53 +8192, 0, 45, 1, 0.516 +8192, 13, 13, 0, 0.664 +8192, 13, 13, 1, 0.659 +8192, 45, 45, 0, 0.593 +8192, 45, 45, 1, 0.575 +8192, 2048, 0, 0, 0.854 +8192, 2048, 0, 1, 0.834 +8192, 2061, 0, 0, 0.863 +8192, 2061, 0, 1, 0.857 +8192, 2048, 13, 0, 0.63 +8192, 2048, 13, 1, 0.629 +8192, 2061, 13, 0, 0.627 +8192, 2061, 13, 1, 0.62 + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86/cacheinfo.h | 8 +++++--- + sysdeps/x86/dl-tunables.list | 26 +++++++++++++++----------- + 2 files changed, 20 insertions(+), 14 deletions(-) + +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index cc3941d3..ac025e08 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -411,18 +411,20 @@ init_cacheinfo (void) + + /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */ + unsigned int minimum_rep_movsb_threshold; +- /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */ ++ /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for ++ VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB ++ threshold is 2048 * (VEC_SIZE / 16). 
*/ + unsigned int rep_movsb_threshold; + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) + && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512)) + { +- rep_movsb_threshold = 2048 * (64 / 16); ++ rep_movsb_threshold = 4096 * (64 / 16); + minimum_rep_movsb_threshold = 64 * 8; + } + else if (CPU_FEATURE_PREFERRED_P (cpu_features, + AVX_Fast_Unaligned_Load)) + { +- rep_movsb_threshold = 2048 * (32 / 16); ++ rep_movsb_threshold = 4096 * (32 / 16); + minimum_rep_movsb_threshold = 32 * 8; + } + else +diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list +index 89bf2966..56c6834a 100644 +--- a/sysdeps/x86/dl-tunables.list ++++ b/sysdeps/x86/dl-tunables.list +@@ -32,17 +32,21 @@ glibc { + } + x86_rep_movsb_threshold { + type: SIZE_T +- # Since there is overhead to set up REP MOVSB operation, REP MOVSB +- # isn't faster on short data. The memcpy micro benchmark in glibc +- # shows that 2KB is the approximate value above which REP MOVSB +- # becomes faster than SSE2 optimization on processors with Enhanced +- # REP MOVSB. Since larger register size can move more data with a +- # single load and store, the threshold is higher with larger register +- # size. Note: Since the REP MOVSB threshold must be greater than 8 +- # times of vector size and the default value is 2048 * (vector size +- # / 16), the default value and the minimum value must be updated at +- # run-time. NB: Don't set the default value since we can't tell if +- # the tunable value is set by user or not [BZ #27069]. ++ # Since there is overhead to set up REP MOVSB operation, REP ++ # MOVSB isn't faster on short data. The memcpy micro benchmark ++ # in glibc shows that 2KB is the approximate value above which ++ # REP MOVSB becomes faster than SSE2 optimization on processors ++ # with Enhanced REP MOVSB. Since larger register size can move ++ # more data with a single load and store, the threshold is ++ # higher with larger register size. Micro benchmarks show AVX ++ # REP MOVSB becomes faster apprximately at 8KB. The AVX512 ++ # threshold is extrapolated to 16KB. For machines with FSRM the ++ # threshold is universally set at 2112 bytes. Note: Since the ++ # REP MOVSB threshold must be greater than 8 times of vector ++ # size and the default value is 4096 * (vector size / 16), the ++ # default value and the minimum value must be updated at ++ # run-time. NB: Don't set the default value since we can't tell ++ # if the tunable value is set by user or not [BZ #27069]. + minval: 1 + } + x86_rep_stosb_threshold { +-- +GitLab + diff --git a/glibc-RHEL-15696-63.patch b/glibc-RHEL-15696-63.patch new file mode 100644 index 0000000..c14e8b3 --- /dev/null +++ b/glibc-RHEL-15696-63.patch @@ -0,0 +1,2428 @@ +From 2f9062d7171850451e6044ef78d91ff8c017b9c0 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 10 Nov 2021 16:18:56 -0600 +Subject: [PATCH] x86: Shrink memcmp-sse4.S code size +Content-type: text/plain; charset=UTF-8 + +No bug. + +This implementation refactors memcmp-sse4.S primarily with minimizing +code size in mind. It does this by removing the lookup table logic and +removing the unrolled check from (256, 512] bytes. + +memcmp-sse4 code size reduction : -3487 bytes +wmemcmp-sse4 code size reduction: -1472 bytes + +The current memcmp-sse4.S implementation has a large code size +cost. This has serious adverse affects on the ICache / ITLB. 
While +in micro-benchmarks the implementations appears fast, traces of +real-world code have shown that the speed in micro benchmarks does not +translate when the ICache/ITLB are not primed, and that the cost +of the code size has measurable negative affects on overall +application performance. + +See https://research.google/pubs/pub48320/ for more details. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memcmp-sse4.S | 2267 +++++++----------------- + 1 file changed, 646 insertions(+), 1621 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S +index 302900f5..50060006 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S ++++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S +@@ -25,14 +25,14 @@ + # define MEMCMP __memcmp_sse4_1 + # endif + +-# define JMPTBL(I, B) (I - B) ++#ifdef USE_AS_WMEMCMP ++# define CMPEQ pcmpeqd ++# define CHAR_SIZE 4 ++#else ++# define CMPEQ pcmpeqb ++# define CHAR_SIZE 1 ++#endif + +-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +- lea TABLE(%rip), %r11; \ +- movslq (%r11, INDEX, SCALE), %rcx; \ +- add %r11, %rcx; \ +- _CET_NOTRACK jmp *%rcx; \ +- ud2 + + /* Warning! + wmemcmp has to use SIGNED comparison for elements. +@@ -47,33 +47,253 @@ ENTRY (MEMCMP) + /* Clear the upper 32 bits. */ + mov %edx, %edx + # endif +- pxor %xmm0, %xmm0 + cmp $79, %RDX_LP + ja L(79bytesormore) ++ ++ cmp $CHAR_SIZE, %RDX_LP ++ jbe L(firstbyte) ++ ++ /* N in (CHAR_SIZE, 79) bytes. */ ++ cmpl $32, %edx ++ ja L(more_32_bytes) ++ ++ cmpl $16, %edx ++ jae L(16_to_32_bytes) ++ + # ifndef USE_AS_WMEMCMP +- cmp $1, %RDX_LP +- je L(firstbyte) ++ cmpl $8, %edx ++ jae L(8_to_16_bytes) ++ ++ cmpl $4, %edx ++ jb L(2_to_3_bytes) ++ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ ++ bswap %eax ++ bswap %ecx ++ ++ shlq $32, %rax ++ shlq $32, %rcx ++ ++ movl -4(%rdi, %rdx), %edi ++ movl -4(%rsi, %rdx), %esi ++ ++ bswap %edi ++ bswap %esi ++ ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ cmovne %edx, %eax ++ sbbl %ecx, %ecx ++ orl %ecx, %eax ++ ret ++ ++ .p2align 4,, 8 ++L(2_to_3_bytes): ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ subl %ecx, %eax ++ ret ++ ++ .p2align 4,, 8 ++L(8_to_16_bytes): ++ movq (%rdi), %rax ++ movq (%rsi), %rcx ++ ++ bswap %rax ++ bswap %rcx ++ ++ subq %rcx, %rax ++ jne L(8_to_16_bytes_done) ++ ++ movq -8(%rdi, %rdx), %rax ++ movq -8(%rsi, %rdx), %rcx ++ ++ bswap %rax ++ bswap %rcx ++ ++ subq %rcx, %rax ++ ++L(8_to_16_bytes_done): ++ cmovne %edx, %eax ++ sbbl %ecx, %ecx ++ orl %ecx, %eax ++ ret ++# else ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ jne L(8_to_16_bytes_done) ++ movl 4(%rdi), %ecx ++ cmpl 4(%rsi), %ecx ++ jne L(8_to_16_bytes_done) ++ movl -4(%rdi, %rdx), %ecx ++ cmpl -4(%rsi, %rdx), %ecx ++ jne L(8_to_16_bytes_done) ++ ret + # endif +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +-# ifndef USE_AS_WMEMCMP +- .p2align 4 ++ .p2align 4,, 3 ++L(ret_zero): ++ xorl %eax, %eax ++L(zero): ++ ret ++ ++ .p2align 4,, 8 + L(firstbyte): ++ jb L(ret_zero) ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ je L(zero) ++L(8_to_16_bytes_done): ++ setg %al ++ leal -1(%rax, %rax), %eax ++# else + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + sub %ecx, %eax ++# endif + ret ++ ++ .p2align 4 ++L(vec_return_begin_48): ++ addq 
$16, %rdi ++ addq $16, %rsi ++L(vec_return_begin_32): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl 32(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl 32(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl 32(%rsi, %rax), %ecx ++ movzbl 32(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(vec_return_begin_16): ++ addq $16, %rdi ++ addq $16, %rsi ++L(vec_return_begin): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(vec_return_end_16): ++ subl $16, %edx ++L(vec_return_end): ++ bsfl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -16(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl -16(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl -16(%rsi, %rax), %ecx ++ movzbl -16(%rdi, %rax), %eax ++ subl %ecx, %eax + # endif ++ ret ++ ++ .p2align 4,, 8 ++L(more_32_bytes): ++ movdqu (%rdi), %xmm0 ++ movdqu (%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ cmpl $64, %edx ++ jbe L(32_to_64_bytes) ++ movdqu 32(%rdi), %xmm0 ++ movdqu 32(%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ .p2align 4,, 6 ++L(32_to_64_bytes): ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret ++ ++ .p2align 4 ++L(16_to_32_bytes): ++ movdqu (%rdi), %xmm0 ++ movdqu (%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret ++ + + .p2align 4 + L(79bytesormore): ++ movdqu (%rdi), %xmm0 + movdqu (%rsi), %xmm1 +- movdqu (%rdi), %xmm2 +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ + mov %rsi, %rcx + and $-16, %rsi + add $16, %rsi +@@ -86,1694 +306,499 @@ L(79bytesormore): + + cmp $128, %rdx + ja L(128bytesormore) +-L(less128bytes): +- sub $64, %rdx +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) + +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- cmp $32, %rdx +- jb L(less32bytesin64) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin64): +- add $64, %rdi +- add $64, %rsi +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ .p2align 4,, 6 ++L(less128bytes): ++ movdqu 
(%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ cmp $96, %rdx ++ jb L(32_to_64_bytes) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ subq $64, %rdx ++ ++ .p2align 4,, 6 ++L(last_64_bytes): ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + ++ .p2align 4 + L(128bytesormore): +- cmp $512, %rdx +- ja L(512bytesormore) + cmp $256, %rdx +- ja L(less512bytes) ++ ja L(unaligned_loop) + L(less256bytes): +- sub $128, %rdx +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqu 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqu 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- add $128, %rsi +- add $128, %rdi +- +- cmp $64, %rdx +- jae L(less128bytes) +- +- cmp $32, %rdx +- jb L(less32bytesin128) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin128): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +- +-L(less512bytes): +- sub $256, %rdx +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqu 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqu 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- movdqu 128(%rdi), %xmm2 +- pxor 128(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(144bytesin256) +- +- movdqu 144(%rdi), %xmm2 +- pxor 
144(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(160bytesin256) +- +- movdqu 160(%rdi), %xmm2 +- pxor 160(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(176bytesin256) +- +- movdqu 176(%rdi), %xmm2 +- pxor 176(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(192bytesin256) +- +- movdqu 192(%rdi), %xmm2 +- pxor 192(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(208bytesin256) +- +- movdqu 208(%rdi), %xmm2 +- pxor 208(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(224bytesin256) +- +- movdqu 224(%rdi), %xmm2 +- pxor 224(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(240bytesin256) +- +- movdqu 240(%rdi), %xmm2 +- pxor 240(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(256bytesin256) +- +- add $256, %rsi +- add $256, %rdi +- +- cmp $128, %rdx +- jae L(less256bytes) ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $-128, %rdx ++ subq $-64, %rsi ++ subq $-64, %rdi + + cmp $64, %rdx +- jae L(less128bytes) ++ ja L(less128bytes) + + cmp $32, %rdx +- jb L(less32bytesin256) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin256): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ ja L(last_64_bytes) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 +-L(512bytesormore): ++L(unaligned_loop): + # ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP + # else + mov __x86_data_cache_size_half(%rip), %R8_LP + # endif +- mov %r8, %r9 +- shr $1, %r8 +- add %r9, %r8 +- cmp %r8, %rdx +- ja L(L2_L3_cache_unaglined) ++ movq %r8, %r9 ++ addq %r8, %r8 ++ addq %r9, %r8 ++ cmpq %r8, %rdx ++ ja L(L2_L3_cache_unaligned) + sub $64, %rdx + .p2align 4 + L(64bytesormore_loop): +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 ++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 + +- movdqu 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqu 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- movdqu 48(%rdi), %xmm5 +- pxor 48(%rsi), 
%xmm5 +- por %xmm5, %xmm1 ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(64bytesormore_loop) ++ ja L(64bytesormore_loop) + +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ .p2align 4,, 6 ++L(loop_tail): ++ addq %rdx, %rdi ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 ++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 ++ ++ addq %rdx, %rsi ++ movdqu (%rsi), %xmm4 ++ movdqu 16(%rsi), %xmm5 ++ movdqu 32(%rsi), %xmm6 ++ movdqu 48(%rsi), %xmm7 ++ ++ CMPEQ %xmm4, %xmm0 ++ CMPEQ %xmm5, %xmm1 ++ CMPEQ %xmm6, %xmm2 ++ CMPEQ %xmm7, %xmm3 ++ ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 ++ ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) ++ ret + +-L(L2_L3_cache_unaglined): +- sub $64, %rdx ++L(L2_L3_cache_unaligned): ++ subq $64, %rdx + .p2align 4 + L(L2_L3_unaligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 + +- movdqu 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 ++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 ++ ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqu 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- movdqu 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(L2_L3_unaligned_128bytes_loop) ++ ja L(L2_L3_unaligned_128bytes_loop) ++ jmp L(loop_tail) + +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +-/* +- * This case is for machines which are sensitive for unaligned instructions. +- */ ++ /* This case is for machines which are sensitive for unaligned ++ * instructions. 
*/ + .p2align 4 + L(2aligned): + cmp $128, %rdx + ja L(128bytesormorein2aligned) + L(less128bytesin2aligned): +- sub $64, %rdx +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- cmp $32, %rdx +- jb L(less32bytesin64in2alinged) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin64in2alinged): +- add $64, %rdi +- add $64, %rsi +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ cmp $96, %rdx ++ jb L(32_to_64_bytes) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ subq $64, %rdx ++ ++ .p2align 4,, 6 ++L(aligned_last_64_bytes): ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 + L(128bytesormorein2aligned): +- cmp $512, %rdx +- ja L(512bytesormorein2aligned) + cmp $256, %rdx +- ja L(256bytesormorein2aligned) ++ ja L(aligned_loop) + L(less256bytesin2alinged): +- sub $128, %rdx +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqa 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqa 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- add $128, %rsi +- add $128, %rdi ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 
++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $-128, %rdx ++ subq $-64, %rsi ++ subq $-64, %rdi + + cmp $64, %rdx +- jae L(less128bytesin2aligned) ++ ja L(less128bytesin2aligned) + + cmp $32, %rdx +- jb L(less32bytesin128in2aligned) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin128in2aligned): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +- +- .p2align 4 +-L(256bytesormorein2aligned): +- +- sub $256, %rdx +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqa 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqa 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- movdqa 128(%rdi), %xmm2 +- pxor 128(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(144bytesin256) +- +- movdqa 144(%rdi), %xmm2 +- pxor 144(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(160bytesin256) +- +- movdqa 160(%rdi), %xmm2 +- pxor 160(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(176bytesin256) +- +- movdqa 176(%rdi), %xmm2 +- pxor 176(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(192bytesin256) +- +- movdqa 192(%rdi), %xmm2 +- pxor 192(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(208bytesin256) +- +- movdqa 208(%rdi), %xmm2 +- pxor 208(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(224bytesin256) +- +- movdqa 224(%rdi), %xmm2 +- pxor 224(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(240bytesin256) +- +- movdqa 240(%rdi), %xmm2 +- pxor 240(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(256bytesin256) +- +- add $256, %rsi +- add $256, %rdi +- +- cmp $128, %rdx +- jae L(less256bytesin2alinged) +- +- cmp $64, %rdx +- jae L(less128bytesin2aligned) +- +- cmp $32, %rdx +- jb L(less32bytesin256in2alinged) +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin256in2alinged): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ ja L(aligned_last_64_bytes) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ 
jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 +-L(512bytesormorein2aligned): ++L(aligned_loop): + # ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP + # else + mov __x86_data_cache_size_half(%rip), %R8_LP + # endif +- mov %r8, %r9 +- shr $1, %r8 +- add %r9, %r8 +- cmp %r8, %rdx +- ja L(L2_L3_cache_aglined) ++ movq %r8, %r9 ++ addq %r8, %r8 ++ addq %r9, %r8 ++ cmpq %r8, %rdx ++ ja L(L2_L3_cache_aligned) + + sub $64, %rdx + .p2align 4 + L(64bytesormore_loopin2aligned): +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 +- +- movdqa 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqa (%rdi), %xmm0 ++ movdqa 16(%rdi), %xmm1 ++ movdqa 32(%rdi), %xmm2 ++ movdqa 48(%rdi), %xmm3 + +- movdqa 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqa 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(64bytesormore_loopin2aligned) +- +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +-L(L2_L3_cache_aglined): +- sub $64, %rdx ++ ja L(64bytesormore_loopin2aligned) ++ jmp L(loop_tail) + ++L(L2_L3_cache_aligned): ++ subq $64, %rdx + .p2align 4 + L(L2_L3_aligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 +- +- movdqa 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqa (%rdi), %xmm0 ++ movdqa 16(%rdi), %xmm1 ++ movdqa 32(%rdi), %xmm2 ++ movdqa 48(%rdi), %xmm3 + +- movdqa 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqa 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- jae L(L2_L3_aligned_128bytes_loop) +- +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + ++ addq $64, %rsi ++ addq $64, %rdi ++ subq $64, %rdx ++ ja L(L2_L3_aligned_128bytes_loop) ++ jmp L(loop_tail) + + .p2align 4 + L(64bytesormore_loop_end): +- add $16, %rdi +- add $16, %rsi +- ptest %xmm2, %xmm0 +- jnc L(16bytes) +- +- add $16, %rdi +- add $16, %rsi +- ptest %xmm3, %xmm0 +- jnc L(16bytes) +- +- add $16, %rdi +- add $16, %rsi +- ptest %xmm4, %xmm0 +- jnc L(16bytes) +- +- add $16, %rdi +- add $16, %rsi +- jmp L(16bytes) +- +-L(256bytesin256): +- add $256, %rdi +- add $256, %rsi +- jmp L(16bytes) +-L(240bytesin256): +- add $240, %rdi +- add $240, %rsi +- jmp L(16bytes) +-L(224bytesin256): +- add $224, %rdi +- add $224, %rsi +- jmp L(16bytes) +-L(208bytesin256): +- add $208, %rdi +- add $208, %rsi +- jmp L(16bytes) +-L(192bytesin256): +- add $192, %rdi +- add $192, %rsi +- jmp L(16bytes) +-L(176bytesin256): +- add $176, %rdi +- add $176, %rsi +- jmp L(16bytes) +-L(160bytesin256): +- add $160, %rdi 
+- add $160, %rsi +- jmp L(16bytes) +-L(144bytesin256): +- add $144, %rdi +- add $144, %rsi +- jmp L(16bytes) +-L(128bytesin256): +- add $128, %rdi +- add $128, %rsi +- jmp L(16bytes) +-L(112bytesin256): +- add $112, %rdi +- add $112, %rsi +- jmp L(16bytes) +-L(96bytesin256): +- add $96, %rdi +- add $96, %rsi +- jmp L(16bytes) +-L(80bytesin256): +- add $80, %rdi +- add $80, %rsi +- jmp L(16bytes) +-L(64bytesin256): +- add $64, %rdi +- add $64, %rsi +- jmp L(16bytes) +-L(48bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(32bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(16bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(16bytes): +- mov -16(%rdi), %rax +- mov -16(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(8bytes): +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(12bytes): +- mov -12(%rdi), %rax +- mov -12(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(4bytes): +- mov -4(%rsi), %ecx +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +-L(0bytes): +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal case for wmemcmp */ +- .p2align 4 +-L(65bytes): +- movdqu -65(%rdi), %xmm1 +- movdqu -65(%rsi), %xmm2 +- mov $-65, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(49bytes): +- movdqu -49(%rdi), %xmm1 +- movdqu -49(%rsi), %xmm2 +- mov $-49, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(33bytes): +- movdqu -33(%rdi), %xmm1 +- movdqu -33(%rsi), %xmm2 +- mov $-33, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(17bytes): +- mov -17(%rdi), %rax +- mov -17(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(9bytes): +- mov -9(%rdi), %rax +- mov -9(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %edx +- sub %edx, %eax +- ret +- +- .p2align 4 +-L(13bytes): +- mov -13(%rdi), %rax +- mov -13(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(5bytes): +- mov -5(%rdi), %eax +- mov -5(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %edx +- sub %edx, %eax +- ret +- +- .p2align 4 +-L(66bytes): +- movdqu -66(%rdi), %xmm1 +- movdqu -66(%rsi), %xmm2 +- mov $-66, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(50bytes): +- movdqu -50(%rdi), %xmm1 +- movdqu -50(%rsi), %xmm2 +- mov $-50, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(34bytes): +- movdqu -34(%rdi), %xmm1 +- movdqu -34(%rsi), %xmm2 +- mov $-34, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(18bytes): +- mov -18(%rdi), %rax +- mov -18(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(10bytes): +- mov -10(%rdi), %rax +- mov -10(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzwl -2(%rdi), %eax +- movzwl -2(%rsi), %ecx +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(14bytes): +- mov -14(%rdi), %rax +- mov -14(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(6bytes): +- mov -6(%rdi), %eax +- mov -6(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) 
+-L(2bytes): +- movzwl -2(%rsi), %ecx +- movzwl -2(%rdi), %eax +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(67bytes): +- movdqu -67(%rdi), %xmm2 +- movdqu -67(%rsi), %xmm1 +- mov $-67, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(51bytes): +- movdqu -51(%rdi), %xmm2 +- movdqu -51(%rsi), %xmm1 +- mov $-51, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(35bytes): +- movdqu -35(%rsi), %xmm1 +- movdqu -35(%rdi), %xmm2 +- mov $-35, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(19bytes): +- mov -19(%rdi), %rax +- mov -19(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(11bytes): +- mov -11(%rdi), %rax +- mov -11(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(15bytes): +- mov -15(%rdi), %rax +- mov -15(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(7bytes): +- mov -7(%rdi), %eax +- mov -7(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- mov -4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(3bytes): +- movzwl -3(%rdi), %eax +- movzwl -3(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin2bytes) +-L(1bytes): +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %ecx +- sub %ecx, %eax +- ret +-# endif +- +- .p2align 4 +-L(68bytes): +- movdqu -68(%rdi), %xmm2 +- movdqu -68(%rsi), %xmm1 +- mov $-68, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(52bytes): +- movdqu -52(%rdi), %xmm2 +- movdqu -52(%rsi), %xmm1 +- mov $-52, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(36bytes): +- movdqu -36(%rdi), %xmm2 +- movdqu -36(%rsi), %xmm1 +- mov $-36, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(20bytes): +- movdqu -20(%rdi), %xmm2 +- movdqu -20(%rsi), %xmm1 +- mov $-20, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -4(%rsi), %ecx +- +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(69bytes): +- movdqu -69(%rsi), %xmm1 +- movdqu -69(%rdi), %xmm2 +- mov $-69, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(53bytes): +- movdqu -53(%rsi), %xmm1 +- movdqu -53(%rdi), %xmm2 +- mov $-53, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(37bytes): +- movdqu -37(%rsi), %xmm1 +- movdqu -37(%rdi), %xmm2 +- mov $-37, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(21bytes): +- movdqu -21(%rsi), %xmm1 +- movdqu -21(%rdi), %xmm2 +- mov $-21, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(70bytes): +- movdqu -70(%rsi), %xmm1 +- movdqu -70(%rdi), %xmm2 +- mov $-70, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(54bytes): +- movdqu -54(%rsi), %xmm1 +- movdqu -54(%rdi), %xmm2 +- mov $-54, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(38bytes): +- movdqu -38(%rsi), %xmm1 +- 
movdqu -38(%rdi), %xmm2 +- mov $-38, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(22bytes): +- movdqu -22(%rsi), %xmm1 +- movdqu -22(%rdi), %xmm2 +- mov $-22, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(71bytes): +- movdqu -71(%rsi), %xmm1 +- movdqu -71(%rdi), %xmm2 +- mov $-71, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(55bytes): +- movdqu -55(%rdi), %xmm2 +- movdqu -55(%rsi), %xmm1 +- mov $-55, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(39bytes): +- movdqu -39(%rdi), %xmm2 +- movdqu -39(%rsi), %xmm1 +- mov $-39, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(23bytes): +- movdqu -23(%rdi), %xmm2 +- movdqu -23(%rsi), %xmm1 +- mov $-23, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +-# endif +- +- .p2align 4 +-L(72bytes): +- movdqu -72(%rsi), %xmm1 +- movdqu -72(%rdi), %xmm2 +- mov $-72, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(56bytes): +- movdqu -56(%rdi), %xmm2 +- movdqu -56(%rsi), %xmm1 +- mov $-56, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(40bytes): +- movdqu -40(%rdi), %xmm2 +- movdqu -40(%rsi), %xmm1 +- mov $-40, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(24bytes): +- movdqu -24(%rdi), %xmm2 +- movdqu -24(%rsi), %xmm1 +- mov $-24, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -8(%rsi), %rcx +- mov -8(%rdi), %rax +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(73bytes): +- movdqu -73(%rsi), %xmm1 +- movdqu -73(%rdi), %xmm2 +- mov $-73, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(57bytes): +- movdqu -57(%rdi), %xmm2 +- movdqu -57(%rsi), %xmm1 +- mov $-57, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(41bytes): +- movdqu -41(%rdi), %xmm2 +- movdqu -41(%rsi), %xmm1 +- mov $-41, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(25bytes): +- movdqu -25(%rdi), %xmm2 +- movdqu -25(%rsi), %xmm1 +- mov $-25, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -9(%rdi), %rax +- mov -9(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(74bytes): +- movdqu -74(%rsi), %xmm1 +- movdqu -74(%rdi), %xmm2 +- mov $-74, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(58bytes): +- movdqu -58(%rdi), %xmm2 +- movdqu -58(%rsi), %xmm1 +- mov $-58, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(42bytes): +- movdqu -42(%rdi), %xmm2 +- movdqu -42(%rsi), %xmm1 +- mov $-42, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(26bytes): +- movdqu -26(%rdi), %xmm2 +- movdqu -26(%rsi), %xmm1 +- mov $-26, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -10(%rdi), %rax +- mov -10(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzwl -2(%rdi), %eax +- movzwl -2(%rsi), %ecx +- jmp L(diffin2bytes) +- +- .p2align 4 +-L(75bytes): +- movdqu -75(%rsi), %xmm1 +- movdqu -75(%rdi), %xmm2 +- mov $-75, %dl +- 
pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(59bytes): +- movdqu -59(%rdi), %xmm2 +- movdqu -59(%rsi), %xmm1 +- mov $-59, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(43bytes): +- movdqu -43(%rdi), %xmm2 +- movdqu -43(%rsi), %xmm1 +- mov $-43, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(27bytes): +- movdqu -27(%rdi), %xmm2 +- movdqu -27(%rsi), %xmm1 +- mov $-27, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -11(%rdi), %rax +- mov -11(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +-# endif +- .p2align 4 +-L(76bytes): +- movdqu -76(%rsi), %xmm1 +- movdqu -76(%rdi), %xmm2 +- mov $-76, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(60bytes): +- movdqu -60(%rdi), %xmm2 +- movdqu -60(%rsi), %xmm1 +- mov $-60, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(44bytes): +- movdqu -44(%rdi), %xmm2 +- movdqu -44(%rsi), %xmm1 +- mov $-44, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(28bytes): +- movdqu -28(%rdi), %xmm2 +- movdqu -28(%rsi), %xmm1 +- mov $-28, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -12(%rdi), %rax +- mov -12(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -4(%rsi), %ecx +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(77bytes): +- movdqu -77(%rsi), %xmm1 +- movdqu -77(%rdi), %xmm2 +- mov $-77, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(61bytes): +- movdqu -61(%rdi), %xmm2 +- movdqu -61(%rsi), %xmm1 +- mov $-61, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(45bytes): +- movdqu -45(%rdi), %xmm2 +- movdqu -45(%rsi), %xmm1 +- mov $-45, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(29bytes): +- movdqu -29(%rdi), %xmm2 +- movdqu -29(%rsi), %xmm1 +- mov $-29, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -13(%rdi), %rax +- mov -13(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(78bytes): +- movdqu -78(%rsi), %xmm1 +- movdqu -78(%rdi), %xmm2 +- mov $-78, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(62bytes): +- movdqu -62(%rdi), %xmm2 +- movdqu -62(%rsi), %xmm1 +- mov $-62, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(46bytes): +- movdqu -46(%rdi), %xmm2 +- movdqu -46(%rsi), %xmm1 +- mov $-46, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(30bytes): +- movdqu -30(%rdi), %xmm2 +- movdqu -30(%rsi), %xmm1 +- mov $-30, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -14(%rdi), %rax +- mov -14(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(79bytes): +- movdqu -79(%rsi), %xmm1 +- movdqu -79(%rdi), %xmm2 +- mov $-79, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(63bytes): +- movdqu -63(%rdi), %xmm2 +- movdqu -63(%rsi), %xmm1 +- mov $-63, %dl +- pxor 
%xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(47bytes): +- movdqu -47(%rdi), %xmm2 +- movdqu -47(%rsi), %xmm1 +- mov $-47, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(31bytes): +- movdqu -31(%rdi), %xmm2 +- movdqu -31(%rsi), %xmm1 +- mov $-31, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -15(%rdi), %rax +- mov -15(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +-# endif +- .p2align 4 +-L(64bytes): +- movdqu -64(%rdi), %xmm2 +- movdqu -64(%rsi), %xmm1 +- mov $-64, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(48bytes): +- movdqu -48(%rdi), %xmm2 +- movdqu -48(%rsi), %xmm1 +- mov $-48, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(32bytes): +- movdqu -32(%rdi), %xmm2 +- movdqu -32(%rsi), %xmm1 +- mov $-32, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -16(%rdi), %rax +- mov -16(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +-/* +- * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. +- */ +- .p2align 3 +-L(less16bytes): +- movsbq %dl, %rdx +- mov (%rsi, %rdx), %rcx +- mov (%rdi, %rdx), %rax +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov 8(%rsi, %rdx), %rcx +- mov 8(%rdi, %rdx), %rax +-L(diffin8bytes): +- cmp %eax, %ecx +- jne L(diffin4bytes) +- shr $32, %rcx +- shr $32, %rax +- ++ pmovmskb %xmm0, %ecx ++ incw %cx ++ jnz L(loop_end_ret) ++ ++ pmovmskb %xmm1, %ecx ++ notw %cx ++ sall $16, %ecx ++ jnz L(loop_end_ret) ++ ++ pmovmskb %xmm2, %ecx ++ notw %cx ++ shlq $32, %rcx ++ jnz L(loop_end_ret) ++ ++ addq $48, %rdi ++ addq $48, %rsi ++ movq %rax, %rcx ++ ++ .p2align 4,, 6 ++L(loop_end_ret): ++ bsfq %rcx, %rcx + # ifdef USE_AS_WMEMCMP +-/* for wmemcmp */ +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +-# endif +- +-L(diffin4bytes): +-# ifndef USE_AS_WMEMCMP +- cmp %cx, %ax +- jne L(diffin2bytes) +- shr $16, %ecx +- shr $16, %eax +-L(diffin2bytes): +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(end): +- and $0xff, %eax +- and $0xff, %ecx +- sub %ecx, %eax +- ret ++ movl (%rdi, %rcx), %eax ++ xorl %edx, %edx ++ cmpl (%rsi, %rcx), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else +- +-/* for wmemcmp */ +- mov $1, %eax +- jl L(nequal_bigger) +- neg %eax +- ret +- +- .p2align 4 +-L(nequal_bigger): +- ret +- +-L(unreal_case): +- xor %eax, %eax +- ret ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +- ++ ret + END (MEMCMP) +- +- .section .rodata.sse4.1,"a",@progbits +- .p2align 3 +-# ifndef USE_AS_WMEMCMP +-L(table_64bytes): +- .int JMPTBL (L(0bytes), L(table_64bytes)) +- .int JMPTBL (L(1bytes), L(table_64bytes)) +- .int JMPTBL (L(2bytes), L(table_64bytes)) +- .int JMPTBL (L(3bytes), L(table_64bytes)) +- .int JMPTBL (L(4bytes), L(table_64bytes)) +- .int JMPTBL (L(5bytes), L(table_64bytes)) +- .int JMPTBL (L(6bytes), L(table_64bytes)) +- .int JMPTBL (L(7bytes), L(table_64bytes)) +- .int JMPTBL (L(8bytes), L(table_64bytes)) +- .int JMPTBL (L(9bytes), L(table_64bytes)) +- .int JMPTBL (L(10bytes), L(table_64bytes)) +- .int JMPTBL (L(11bytes), L(table_64bytes)) +- .int JMPTBL (L(12bytes), L(table_64bytes)) +- .int JMPTBL (L(13bytes), L(table_64bytes)) +- .int 
JMPTBL (L(14bytes), L(table_64bytes)) +- .int JMPTBL (L(15bytes), L(table_64bytes)) +- .int JMPTBL (L(16bytes), L(table_64bytes)) +- .int JMPTBL (L(17bytes), L(table_64bytes)) +- .int JMPTBL (L(18bytes), L(table_64bytes)) +- .int JMPTBL (L(19bytes), L(table_64bytes)) +- .int JMPTBL (L(20bytes), L(table_64bytes)) +- .int JMPTBL (L(21bytes), L(table_64bytes)) +- .int JMPTBL (L(22bytes), L(table_64bytes)) +- .int JMPTBL (L(23bytes), L(table_64bytes)) +- .int JMPTBL (L(24bytes), L(table_64bytes)) +- .int JMPTBL (L(25bytes), L(table_64bytes)) +- .int JMPTBL (L(26bytes), L(table_64bytes)) +- .int JMPTBL (L(27bytes), L(table_64bytes)) +- .int JMPTBL (L(28bytes), L(table_64bytes)) +- .int JMPTBL (L(29bytes), L(table_64bytes)) +- .int JMPTBL (L(30bytes), L(table_64bytes)) +- .int JMPTBL (L(31bytes), L(table_64bytes)) +- .int JMPTBL (L(32bytes), L(table_64bytes)) +- .int JMPTBL (L(33bytes), L(table_64bytes)) +- .int JMPTBL (L(34bytes), L(table_64bytes)) +- .int JMPTBL (L(35bytes), L(table_64bytes)) +- .int JMPTBL (L(36bytes), L(table_64bytes)) +- .int JMPTBL (L(37bytes), L(table_64bytes)) +- .int JMPTBL (L(38bytes), L(table_64bytes)) +- .int JMPTBL (L(39bytes), L(table_64bytes)) +- .int JMPTBL (L(40bytes), L(table_64bytes)) +- .int JMPTBL (L(41bytes), L(table_64bytes)) +- .int JMPTBL (L(42bytes), L(table_64bytes)) +- .int JMPTBL (L(43bytes), L(table_64bytes)) +- .int JMPTBL (L(44bytes), L(table_64bytes)) +- .int JMPTBL (L(45bytes), L(table_64bytes)) +- .int JMPTBL (L(46bytes), L(table_64bytes)) +- .int JMPTBL (L(47bytes), L(table_64bytes)) +- .int JMPTBL (L(48bytes), L(table_64bytes)) +- .int JMPTBL (L(49bytes), L(table_64bytes)) +- .int JMPTBL (L(50bytes), L(table_64bytes)) +- .int JMPTBL (L(51bytes), L(table_64bytes)) +- .int JMPTBL (L(52bytes), L(table_64bytes)) +- .int JMPTBL (L(53bytes), L(table_64bytes)) +- .int JMPTBL (L(54bytes), L(table_64bytes)) +- .int JMPTBL (L(55bytes), L(table_64bytes)) +- .int JMPTBL (L(56bytes), L(table_64bytes)) +- .int JMPTBL (L(57bytes), L(table_64bytes)) +- .int JMPTBL (L(58bytes), L(table_64bytes)) +- .int JMPTBL (L(59bytes), L(table_64bytes)) +- .int JMPTBL (L(60bytes), L(table_64bytes)) +- .int JMPTBL (L(61bytes), L(table_64bytes)) +- .int JMPTBL (L(62bytes), L(table_64bytes)) +- .int JMPTBL (L(63bytes), L(table_64bytes)) +- .int JMPTBL (L(64bytes), L(table_64bytes)) +- .int JMPTBL (L(65bytes), L(table_64bytes)) +- .int JMPTBL (L(66bytes), L(table_64bytes)) +- .int JMPTBL (L(67bytes), L(table_64bytes)) +- .int JMPTBL (L(68bytes), L(table_64bytes)) +- .int JMPTBL (L(69bytes), L(table_64bytes)) +- .int JMPTBL (L(70bytes), L(table_64bytes)) +- .int JMPTBL (L(71bytes), L(table_64bytes)) +- .int JMPTBL (L(72bytes), L(table_64bytes)) +- .int JMPTBL (L(73bytes), L(table_64bytes)) +- .int JMPTBL (L(74bytes), L(table_64bytes)) +- .int JMPTBL (L(75bytes), L(table_64bytes)) +- .int JMPTBL (L(76bytes), L(table_64bytes)) +- .int JMPTBL (L(77bytes), L(table_64bytes)) +- .int JMPTBL (L(78bytes), L(table_64bytes)) +- .int JMPTBL (L(79bytes), L(table_64bytes)) +-# else +-L(table_64bytes): +- .int JMPTBL (L(0bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(4bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(8bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int 
JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(12bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(16bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(20bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(24bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(28bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(32bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(36bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(40bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(44bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(48bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(52bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(56bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(60bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(64bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(68bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(72bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(76bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +-# endif + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-64.patch b/glibc-RHEL-15696-64.patch new file mode 100644 index 
0000000..ba7f14a --- /dev/null +++ b/glibc-RHEL-15696-64.patch @@ -0,0 +1,39 @@ +From 0b82747dc48d5bf0871bdc6da8cb6eec1256355f Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 11 Nov 2021 06:31:51 -0800 +Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_lock_full [BZ + #28537] +Content-type: text/plain; charset=UTF-8 + +Replace boolean CAS with value CAS to avoid the extra load. + +Reviewed-by: Szabolcs Nagy +--- + nptl/pthread_mutex_lock.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index 29cc143e..60ada70d 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -292,12 +292,12 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex) + meantime. */ + if ((oldval & FUTEX_WAITERS) == 0) + { +- if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock, +- oldval | FUTEX_WAITERS, +- oldval) +- != 0) ++ int val; ++ if ((val = atomic_compare_and_exchange_val_acq ++ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, ++ oldval)) != oldval) + { +- oldval = mutex->__data.__lock; ++ oldval = val; + continue; + } + oldval |= FUTEX_WAITERS; +-- +GitLab + diff --git a/glibc-RHEL-15696-65.patch b/glibc-RHEL-15696-65.patch new file mode 100644 index 0000000..296d4a9 --- /dev/null +++ b/glibc-RHEL-15696-65.patch @@ -0,0 +1,39 @@ +From 49302b8fdf9103b6fc0a398678668a22fa19574c Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 11 Nov 2021 06:54:01 -0800 +Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_clocklock_common + [BZ #28537] +Content-type: text/plain; charset=UTF-8 + +Replace boolean CAS with value CAS to avoid the extra load. + +Reviewed-by: Szabolcs Nagy +--- + nptl/pthread_mutex_timedlock.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c +index 888c12fe..c4627ef6 100644 +--- a/nptl/pthread_mutex_timedlock.c ++++ b/nptl/pthread_mutex_timedlock.c +@@ -269,12 +269,12 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex, + meantime. */ + if ((oldval & FUTEX_WAITERS) == 0) + { +- if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock, +- oldval | FUTEX_WAITERS, +- oldval) +- != 0) ++ int val; ++ if ((val = atomic_compare_and_exchange_val_acq ++ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, ++ oldval)) != oldval) + { +- oldval = mutex->__data.__lock; ++ oldval = val; + continue; + } + oldval |= FUTEX_WAITERS; +-- +GitLab + diff --git a/glibc-RHEL-15696-66.patch b/glibc-RHEL-15696-66.patch new file mode 100644 index 0000000..4579636 --- /dev/null +++ b/glibc-RHEL-15696-66.patch @@ -0,0 +1,51 @@ +From d672a98a1af106bd68deb15576710cd61363f7a6 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Tue, 2 Nov 2021 18:33:07 -0700 +Subject: [PATCH] Add LLL_MUTEX_READ_LOCK [BZ #28537] +Content-type: text/plain; charset=UTF-8 + +CAS instruction is expensive. From the x86 CPU's point of view, getting +a cache line for writing is more expensive than reading. See Appendix +A.2 Spinlock in: + +https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf + +The full compare and swap will grab the cache line exclusive and cause +excessive cache line bouncing. + +Add LLL_MUTEX_READ_LOCK to do an atomic load and skip CAS in spinlock +loop if compare may fail to reduce cache line bouncing on contended locks. 
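The three mutex patches in this group (glibc-RHEL-15696-64/65/66) share one goal: keep the expensive CAS off the contended path. Below is a minimal C11 sketch of that pattern; it is not glibc's internal LLL_MUTEX_* machinery, and the toy_lock type with 0 = free / 1 = held is an assumption made purely for illustration. A value-returning compare-exchange hands back the value it observed on failure, so the lock word does not have to be reloaded, and the retry loop spins on a plain read so the cache line stays shared instead of bouncing between cores.

#include <stdatomic.h>

struct toy_lock { atomic_int word; };      /* hypothetical lock word: 0 = free, 1 = held */

static void
toy_lock_acquire (struct toy_lock *m)
{
  int expected = 0;
  /* On failure, C11 compare_exchange writes the value it saw back into
     'expected' -- the analogue of the value CAS above, avoiding an extra
     load of the lock word.  */
  while (!atomic_compare_exchange_strong_explicit (&m->word, &expected, 1,
                                                   memory_order_acquire,
                                                   memory_order_relaxed))
    {
      /* Spin on a read-only load first (the LLL_MUTEX_READ_LOCK idea):
         repeated CAS attempts would keep pulling the line exclusive.  */
      while (atomic_load_explicit (&m->word, memory_order_relaxed) != 0)
        ;
      expected = 0;
    }
}

The acquire/relaxed orderings are the conventional choice for a lock acquire; the orderings glibc's lowlevellock actually uses vary per architecture.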
+ +Reviewed-by: Szabolcs Nagy +--- + nptl/pthread_mutex_lock.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index 60ada70d..eb4d8baa 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -56,6 +56,11 @@ + #define FORCE_ELISION(m, s) + #endif + ++#ifndef LLL_MUTEX_READ_LOCK ++# define LLL_MUTEX_READ_LOCK(mutex) \ ++ atomic_load_relaxed (&(mutex)->__data.__lock) ++#endif ++ + static int __pthread_mutex_lock_full (pthread_mutex_t *mutex) + __attribute_noinline__; + +@@ -136,6 +141,8 @@ __pthread_mutex_lock (pthread_mutex_t *mutex) + break; + } + atomic_spin_nop (); ++ if (LLL_MUTEX_READ_LOCK (mutex) != 0) ++ continue; + } + while (LLL_MUTEX_TRYLOCK (mutex) != 0); + +-- +GitLab + diff --git a/glibc-RHEL-15696-67.patch b/glibc-RHEL-15696-67.patch new file mode 100644 index 0000000..73c8306 --- /dev/null +++ b/glibc-RHEL-15696-67.patch @@ -0,0 +1,71 @@ +From 120ac6d238825452e8024e2f627da33b2508dfd3 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 12 Nov 2021 11:47:42 -0800 +Subject: [PATCH] Move assignment out of the CAS condition +Content-type: text/plain; charset=UTF-8 + +Update + +commit 49302b8fdf9103b6fc0a398678668a22fa19574c +Author: H.J. Lu +Date: Thu Nov 11 06:54:01 2021 -0800 + + Avoid extra load with CAS in __pthread_mutex_clocklock_common [BZ #28537] + + Replace boolean CAS with value CAS to avoid the extra load. + +and + +commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f +Author: H.J. Lu +Date: Thu Nov 11 06:31:51 2021 -0800 + + Avoid extra load with CAS in __pthread_mutex_lock_full [BZ #28537] + + Replace boolean CAS with value CAS to avoid the extra load. + +by moving assignment out of the CAS condition. +--- + nptl/pthread_mutex_lock.c | 7 +++---- + nptl/pthread_mutex_timedlock.c | 7 +++---- + 2 files changed, 6 insertions(+), 8 deletions(-) + +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index eb4d8baa..a633d95e 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -299,10 +299,9 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex) + meantime. */ + if ((oldval & FUTEX_WAITERS) == 0) + { +- int val; +- if ((val = atomic_compare_and_exchange_val_acq +- (&mutex->__data.__lock, oldval | FUTEX_WAITERS, +- oldval)) != oldval) ++ int val = atomic_compare_and_exchange_val_acq ++ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval); ++ if (val != oldval) + { + oldval = val; + continue; +diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c +index c4627ef6..a76c30b7 100644 +--- a/nptl/pthread_mutex_timedlock.c ++++ b/nptl/pthread_mutex_timedlock.c +@@ -269,10 +269,9 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex, + meantime. 
*/ + if ((oldval & FUTEX_WAITERS) == 0) + { +- int val; +- if ((val = atomic_compare_and_exchange_val_acq +- (&mutex->__data.__lock, oldval | FUTEX_WAITERS, +- oldval)) != oldval) ++ int val = atomic_compare_and_exchange_val_acq ++ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval); ++ if (val != oldval) + { + oldval = val; + continue; +-- +GitLab + diff --git a/glibc-RHEL-15696-68.patch b/glibc-RHEL-15696-68.patch new file mode 100644 index 0000000..df35b31 --- /dev/null +++ b/glibc-RHEL-15696-68.patch @@ -0,0 +1,60 @@ +From 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 3 Dec 2021 15:29:25 -0800 +Subject: [PATCH] x86-64: Use notl in EVEX strcmp [BZ #28646] +Content-type: text/plain; charset=UTF-8 + +Must use notl %edi here as lower bits are for CHAR comparisons +potentially out of range thus can be 0 without indicating mismatch. +This fixes BZ #28646. + +Co-Authored-By: H.J. Lu +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 14 ++++++++------ + 1 file changed, 8 insertions(+), 6 deletions(-) + +Conflicts: + string/test-strcmp.c + (new check omitted) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 82f12ac8..6f5c4bf9 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -656,12 +656,13 @@ L(loop_cross_page): + in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ + VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} + kmovd %k3, %edi ++ /* Must use notl %edi here as lower bits are for CHAR ++ comparisons potentially out of range thus can be 0 without ++ indicating mismatch. */ ++ notl %edi + # ifdef USE_AS_WCSCMP + /* Don't use subl since it is the upper 8 bits of EDI below. */ +- notl %edi + andl $0xff, %edi +-# else +- incl %edi + # endif + + # ifdef USE_AS_WCSCMP +@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec): + in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ + VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} + kmovd %k3, %edi ++ /* Must use notl %edi here as lower bits are for CHAR ++ comparisons potentially out of range thus can be 0 without ++ indicating mismatch. */ ++ notl %edi + # ifdef USE_AS_WCSCMP + /* Don't use subl since it is the upper 8 bits of EDI below. */ +- notl %edi + andl $0xff, %edi +-# else +- incl %edi + # endif + + # ifdef USE_AS_WCSCMP +-- +GitLab + diff --git a/glibc-RHEL-15696-69.patch b/glibc-RHEL-15696-69.patch new file mode 100644 index 0000000..9f859f2 --- /dev/null +++ b/glibc-RHEL-15696-69.patch @@ -0,0 +1,35 @@ +From ceeffe968c01b1202e482f4855cb6baf5c6cb713 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 6 Dec 2021 07:14:12 -0800 +Subject: [PATCH] x86: Don't set Prefer_No_AVX512 for processors with AVX512 + and AVX-VNNI +Content-type: text/plain; charset=UTF-8 + +Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since +they won't lower CPU frequency when ZMM load and store instructions are +used. 
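Going back to the strcmp-evex fix above (the notl %edi change), the mask reasoning is easier to see in scalar form. The sketch below is illustrative only; cmp_mask and in_range are made-up names, not glibc identifiers. cmp_mask has one bit per lane that compared equal. When every lane is known to be valid, "all equal" can be tested cheaply as cmp_mask + 1 == 0 (the incl idiom the patch removes), but when some lanes lie past the characters actually being compared their bits can legitimately be 0, so the mask must be inverted first and then restricted to the in-range lanes.

#include <stdint.h>

/* Return non-zero iff some in-range lane mismatched.  */
static inline uint32_t
mismatch_lanes (uint32_t cmp_mask, uint32_t in_range)
{
  /* notl + mask: out-of-range lanes are discarded, so a stray 0 bit
     beyond the compared characters can no longer be mistaken for a
     mismatch.  */
  return ~cmp_mask & in_range;
}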
+--- + sysdeps/x86/cpu-features.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 956bfb4f..5ff2baa0 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -525,8 +525,11 @@ init_cpu_features (struct cpu_features *cpu_features) + |= bit_arch_Prefer_No_VZEROUPPER; + else + { +- cpu_features->preferred[index_arch_Prefer_No_AVX512] +- |= bit_arch_Prefer_No_AVX512; ++ /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency ++ when ZMM load and store instructions are used. */ ++ if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI)) ++ cpu_features->preferred[index_arch_Prefer_No_AVX512] ++ |= bit_arch_Prefer_No_AVX512; + + /* Avoid RTM abort triggered by VZEROUPPER inside a + transactionally executing RTM region. */ +-- +GitLab + diff --git a/glibc-RHEL-15696-7.patch b/glibc-RHEL-15696-7.patch new file mode 100644 index 0000000..8ef468c --- /dev/null +++ b/glibc-RHEL-15696-7.patch @@ -0,0 +1,153 @@ +From c7c54f65b080affb87a1513dee449c8ad6143c8b Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:35:18 -0800 +Subject: [PATCH] x86-64 strncpy: Properly handle the length parameter [BZ# + 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes strncpy for x32. Tested on x86-64 and x32. On x86-64, +libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strcpy-avx2.S: Use RDX_LP for length. + * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise. + * sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy. + * sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file. 
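The x32 note above is the whole point of the %RDX_LP / %R8_LP changes that follow: the length arrives in a 64-bit register whose upper half is not guaranteed to be zero. A one-line C analogue, where length_from_register and raw_reg are illustrative stand-ins rather than a real glibc interface:

#include <stddef.h>
#include <stdint.h>

/* On x32, size_t is 32 bits wide, so only the low half of the incoming
   register is the length; the upper 32 bits may hold junk and must be
   ignored, which is what using the 32-bit register alias (e.g.
   'mov %edx, %r8d') achieves in the assembly.  */
static size_t
length_from_register (uint64_t raw_reg)
{
  return (uint32_t) raw_reg;
}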
+--- + .../x86_64/multiarch/strcpy-sse2-unaligned.S | 4 +- + sysdeps/x86_64/multiarch/strcpy-ssse3.S | 6 +- + sysdeps/x86_64/x32/Makefile | 2 +- + sysdeps/x86_64/x32/tst-size_t-strncpy.c | 58 +++++++++++++++++++ + 4 files changed, 64 insertions(+), 6 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c + +Conflicts: + ChangeLog + (removed) + sysdeps/x86_64/multiarch/strcpy-avx2.S + (skipped, only needed for x32 arch) + +diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S +index 72bf7e85..50aca22d 100644 +--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S ++++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S +@@ -40,8 +40,8 @@ + .text + ENTRY (STRCPY) + # ifdef USE_AS_STRNCPY +- mov %rdx, %r8 +- test %r8, %r8 ++ mov %RDX_LP, %R8_LP ++ test %R8_LP, %R8_LP + jz L(ExitZero) + # endif + mov %rsi, %rcx +diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S +index 9858d0c4..0a62814a 100644 +--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S ++++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S +@@ -31,13 +31,13 @@ ENTRY (STRCPY) + + mov %rsi, %rcx + # ifdef USE_AS_STRNCPY +- mov %rdx, %r8 ++ mov %RDX_LP, %R8_LP + # endif + mov %rdi, %rdx + # ifdef USE_AS_STRNCPY +- test %r8, %r8 ++ test %R8_LP, %R8_LP + jz L(Exit0) +- cmp $8, %r8 ++ cmp $8, %R8_LP + jbe L(StrncpyExit8Bytes) + # endif + cmpb $0, (%rcx) +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index db302839..2a9e20a9 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -8,7 +8,7 @@ endif + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ + tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ +- tst-size_t-strncmp ++ tst-size_t-strncmp tst-size_t-strncpy + endif + + ifeq ($(subdir),wcsmbs) +diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c +new file mode 100644 +index 00000000..4dec71e6 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c +@@ -0,0 +1,58 @@ ++/* Test strncpy with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#define TEST_NAME "strncpy" ++#include "test-size_t.h" ++ ++IMPL (strncpy, 1) ++ ++typedef char *(*proto_t) (char *, const char*, size_t); ++ ++static void * ++__attribute__ ((noinline, noclone)) ++do_strncpy (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ do_strncpy (dest, src); ++ int res = strncmp (dest.p, src.p, dest.len); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +-- +GitLab + diff --git a/glibc-RHEL-15696-70.patch b/glibc-RHEL-15696-70.patch new file mode 100644 index 0000000..8935ac5 --- /dev/null +++ b/glibc-RHEL-15696-70.patch @@ -0,0 +1,389 @@ +From abddd61de090ae84e380aff68a98bd94ef704667 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 24 Dec 2021 18:54:41 -0600 +Subject: [PATCH] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S +Content-type: text/plain; charset=UTF-8 + +No bug. +Optimizations are twofold. + +1) Replace page cross and 0/1 checks with masked load instructions in + L(less_vec). In applications this reduces branch-misses in the + hot [0, 32] case. +2) Change controlflow so that L(less_vec) case gets the fall through. + +Change 2) helps copies in the [0, 32] size range but comes at the cost +of copies in the [33, 64] size range. From profiles of GCC and +Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this +appears to the the right tradeoff. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 249 +++++-------------- + 1 file changed, 56 insertions(+), 193 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 640f6757..d2899e7c 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -62,15 +62,18 @@ Latency: + # define VMOVU vmovdqu64 + + # ifdef USE_AS_WMEMCMP ++# define VMOVU_MASK vmovdqu32 + # define CHAR_SIZE 4 + # define VPCMP vpcmpd + # define VPTEST vptestmd + # else ++# define VMOVU_MASK vmovdqu8 + # define CHAR_SIZE 1 + # define VPCMP vpcmpub + # define VPTEST vptestmb + # endif + ++ + # define VEC_SIZE 32 + # define PAGE_SIZE 4096 + # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) +@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6) + movl %edx, %edx + # endif + cmp $CHAR_PER_VEC, %RDX_LP +- jb L(less_vec) ++ /* Fall through for [0, VEC_SIZE] as its the hottest. */ ++ ja L(more_1x_vec) ++ ++ /* Create mask for CHAR's we want to compare. This allows us to ++ avoid having to include page cross logic. */ ++ movl $-1, %ecx ++ bzhil %edx, %ecx, %ecx ++ kmovd %ecx, %k2 ++ ++ /* Safe to load full ymm with mask. */ ++ VMOVU_MASK (%rsi), %YMM2{%k2} ++ VPCMP $4,(%rdi), %YMM2, %k1{%k2} ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) ++ ret + ++ .p2align 4 ++L(return_vec_0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi, %rax, CHAR_SIZE), %ecx ++ /* NB: no partial register stall here because xorl zero idiom ++ above. 
*/ ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ ++ .p2align 4 ++L(more_1x_vec): + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %YMM1 + /* Use compare not equals to directly check for mismatch. */ +- VPCMP $4, (%rdi), %YMM1, %k1 ++ VPCMP $4,(%rdi), %YMM1, %k1 + kmovd %k1, %eax + /* NB: eax must be destination register if going to + L(return_vec_[0,2]). For L(return_vec_3) destination register +@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6) + + /* Check third and fourth VEC no matter what. */ + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 +- VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1 ++ VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 +- VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1 ++ VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(return_vec_3) +@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6) + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while + oring with YMM1. Result is stored in YMM4. */ +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 + + /* Or together YMM2, YMM3, and YMM4 into YMM4. */ + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 +@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6) + /* NB: eax must be zero to reach here. */ + ret + +- .p2align 4 ++ ++ .p2align 4,, 8 + L(8x_end_return_vec_0_1_2_3): + movq %rdx, %rdi + L(8x_return_vec_0_1_2_3): +@@ -222,23 +262,6 @@ L(return_vec_3): + # endif + ret + +- .p2align 4 +-L(return_vec_0): +- tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax, CHAR_SIZE), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax, CHAR_SIZE), %ecx +- /* NB: no partial register stall here because xorl zero idiom +- above. */ +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret + + .p2align 4 + L(return_vec_1): +@@ -297,7 +320,7 @@ L(loop_4x_vec): + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +@@ -324,7 +347,7 @@ L(loop_4x_vec): + VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 + vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +@@ -336,14 +359,14 @@ L(loop_4x_vec): + /* Only entry is from L(more_8x_vec). */ + .p2align 4,, 10 + L(8x_last_2x_vec): +- VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 ++ VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(8x_return_vec_2) + /* Naturally aligned to 16 bytes. */ + L(8x_last_1x_vec): + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1 +- VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1 ++ VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(8x_return_vec_3) +@@ -392,7 +415,9 @@ L(last_1x_vec): + jnz L(return_vec_0_end) + ret + +- .p2align 4,, 10 ++ ++ /* Don't align. Takes 2-fetch blocks either way and aligning ++ will cause code to spill into another cacheline. 
*/ + L(return_vec_1_end): + /* Use bsf to save code size. This is necessary to have + L(one_or_less) fit in aligning bytes between. */ +@@ -411,31 +436,8 @@ L(return_vec_1_end): + # endif + ret + +- /* NB: L(one_or_less) fits in alignment padding between +- L(return_vec_1_end) and L(return_vec_0_end). */ +-# ifdef USE_AS_WMEMCMP +-L(one_or_less): +- jb L(zero) +- movl (%rdi), %ecx +- xorl %edx, %edx +- cmpl (%rsi), %ecx +- je L(zero) +- setg %dl +- leal -1(%rdx, %rdx), %eax +- ret +-# else +-L(one_or_less): +- jb L(zero) +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +- ret +-# endif +-L(zero): +- xorl %eax, %eax +- ret +- +- .p2align 4 ++ /* Don't align. Takes 2-fetch blocks either way and aligning ++ will cause code to spill into another cacheline. */ + L(return_vec_0_end): + tzcntl %eax, %eax + addl %edx, %eax +@@ -451,146 +453,7 @@ L(return_vec_0_end): + subl %ecx, %eax + # endif + ret ++ /* 1-byte until next cache line. */ + +- .p2align 4 +-L(less_vec): +- /* Check if one or less CHAR. This is necessary for size == 0 +- but is also faster for size == CHAR_SIZE. */ +- cmpl $1, %edx +- jbe L(one_or_less) +- +- /* Check if loading one VEC from either s1 or s2 could cause a +- page cross. This can have false positives but is by far the +- fastest method. */ +- movl %edi, %eax +- orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(page_cross_less_vec) +- +- /* No page cross possible. */ +- VMOVU (%rsi), %YMM2 +- VPCMP $4, (%rdi), %YMM2, %k1 +- kmovd %k1, %eax +- /* Check if any matches where in bounds. Intentionally not +- storing result in eax to limit dependency chain if it goes to +- L(return_vec_0_lv). */ +- bzhil %edx, %eax, %edx +- jnz L(return_vec_0_lv) +- xorl %eax, %eax +- ret +- +- /* Essentially duplicate of L(return_vec_0). Ends up not costing +- any code as shrinks L(less_vec) by allowing 2-byte encoding of +- the jump and ends up fitting in aligning bytes. As well fits on +- same cache line as L(less_vec) so also saves a line from having +- to be fetched on cold calls to memcmp. */ +- .p2align 4,, 4 +-L(return_vec_0_lv): +- tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax, CHAR_SIZE), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax, CHAR_SIZE), %ecx +- /* NB: no partial register stall here because xorl zero idiom +- above. */ +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(page_cross_less_vec): +- /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 +- bytes. */ +- cmpl $(16 / CHAR_SIZE), %edx +- jae L(between_16_31) +-# ifndef USE_AS_WMEMCMP +- cmpl $8, %edx +- jae L(between_8_15) +- cmpl $4, %edx +- jb L(between_2_3) +- +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- /* edx is guranteed to be positive int32 in range [4, 7]. */ +- cmovne %edx, %eax +- /* ecx is -1 if rcx > rax. Otherwise 0. */ +- sbbl %ecx, %ecx +- /* If rcx > rax, then ecx is 0 and eax is positive. If rcx == +- rax then eax and ecx are zero. If rax < rax then ecx is -1 so +- eax doesn't matter. */ +- orl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(between_8_15): +-# endif +- /* If USE_AS_WMEMCMP fall through into 8-15 byte case. 
*/ +- vmovq (%rdi), %xmm1 +- vmovq (%rsi), %xmm2 +- VPCMP $4, %xmm1, %xmm2, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(return_vec_0_lv) +- /* Use overlapping loads to avoid branches. */ +- vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1 +- vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2 +- VPCMP $4, %xmm1, %xmm2, %k1 +- addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(return_vec_0_end) +- ret +- +- .p2align 4,, 8 +-L(between_16_31): +- /* From 16 to 31 bytes. No branch when size == 16. */ +- +- /* Use movups to save code size. */ +- vmovdqu (%rsi), %xmm2 +- VPCMP $4, (%rdi), %xmm2, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(return_vec_0_lv) +- /* Use overlapping loads to avoid branches. */ +- vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2 +- VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 +- addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(return_vec_0_end) +- ret +- +-# ifndef USE_AS_WMEMCMP +-L(between_2_3): +- /* Load as big endian to avoid branches. */ +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- /* Subtraction is okay because the upper 8 bits are zero. */ +- subl %ecx, %eax +- ret +-# endif + END (MEMCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-71.patch b/glibc-RHEL-15696-71.patch new file mode 100644 index 0000000..2d018d0 --- /dev/null +++ b/glibc-RHEL-15696-71.patch @@ -0,0 +1,43 @@ +From 6b8dbbd03ac88f169b65b5c7d7278576a11d2e44 Mon Sep 17 00:00:00 2001 +From: Jangwoong Kim <6812skiii@gmail.com> +Date: Tue, 14 Dec 2021 21:30:51 +0900 +Subject: [PATCH] nptl: Effectively skip CAS in spinlock loop +Content-type: text/plain; charset=UTF-8 + +The commit: +"Add LLL_MUTEX_READ_LOCK [BZ #28537]" +SHA1: d672a98a1af106bd68deb15576710cd61363f7a6 + +introduced LLL_MUTEX_READ_LOCK, to skip CAS in spinlock loop +if atomic load fails. But, "continue" inside of do-while loop +does not skip the evaluation of escape expression, thus CAS +is not skipped. + +Replace do-while with while and skip LLL_MUTEX_TRYLOCK if +LLL_MUTEX_READ_LOCK fails. + +Reviewed-by: H.J. Lu +--- + nptl/pthread_mutex_lock.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index a633d95e..d96a9933 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -141,10 +141,9 @@ __pthread_mutex_lock (pthread_mutex_t *mutex) + break; + } + atomic_spin_nop (); +- if (LLL_MUTEX_READ_LOCK (mutex) != 0) +- continue; + } +- while (LLL_MUTEX_TRYLOCK (mutex) != 0); ++ while (LLL_MUTEX_READ_LOCK (mutex) != 0 ++ || LLL_MUTEX_TRYLOCK (mutex) != 0); + + mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8; + } +-- +GitLab + diff --git a/glibc-RHEL-15696-72.patch b/glibc-RHEL-15696-72.patch new file mode 100644 index 0000000..34f2a61 --- /dev/null +++ b/glibc-RHEL-15696-72.patch @@ -0,0 +1,146 @@ +From 7835d611af0854e69a0c71e3806f8fe379282d6f Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 18 Feb 2022 14:19:15 -0600 +Subject: [PATCH] x86: Test wcscmp RTM in the wcsncmp overflow case [BZ #28896] +Content-type: text/plain; charset=UTF-8 + +In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would +call strcmp-avx2 and wcscmp-avx2 respectively. This would have +not checks around vzeroupper and would trigger spurious +aborts. This commit fixes that. 
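The "overflow fallback" this test exercises is the dispatch taken for lengths too large to ever bound a real buffer: the bounded routine then simply behaves like the unbounded one, which is why that fallback must also be RTM-safe. A rough C model of the idea follows; toy_strncmp is illustrative, and the 2^63 threshold is taken from the comments in the strcmp-avx2 assembly later in this series, not from a public interface.

#include <stdint.h>
#include <string.h>

static int
toy_strncmp (const char *a, const char *b, size_t n)
{
  /* A length of 2^63 or more cannot bound any valid buffer, so treat the
     call as an ordinary strcmp -- the path the RTM test forces by passing
     SIZE_MAX.  */
  if (n > (SIZE_MAX >> 1))
    return strcmp (a, b);
  return strncmp (a, b, n);
}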
+ +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on +AVX2 machines with and without RTM. +Reviewed-by: H.J. Lu +--- + sysdeps/x86/Makefile | 5 ++++- + sysdeps/x86/tst-strncmp-rtm.c | 32 +++++++++++++++++++++++--------- + sysdeps/x86/tst-wcsncmp-rtm.c | 21 +++++++++++++++++++++ + 3 files changed, 48 insertions(+), 10 deletions(-) + create mode 100644 sysdeps/x86/tst-wcsncmp-rtm.c + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 2d814915..c2111f49 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -28,7 +28,9 @@ tests += \ + tst-strcpy-rtm \ + tst-strlen-rtm \ + tst-strncmp-rtm \ +- tst-strrchr-rtm ++ tst-strrchr-rtm \ ++ tst-wcsncmp-rtm \ ++# tests + + CFLAGS-tst-memchr-rtm.c += -mrtm + CFLAGS-tst-memcmp-rtm.c += -mrtm +@@ -40,6 +42,7 @@ CFLAGS-tst-strcpy-rtm.c += -mrtm + CFLAGS-tst-strlen-rtm.c += -mrtm + CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error + CFLAGS-tst-strrchr-rtm.c += -mrtm ++CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error + endif + + ifneq ($(enable-cet),no) +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index 4d0004b5..4e9f094f 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -19,18 +19,32 @@ + #include + #include + ++#ifdef WIDE ++# define CHAR wchar_t ++# define MEMSET wmemset ++# define STRNCMP wcsncmp ++# define TEST_NAME wcsncmp ++#else /* !WIDE */ ++# define CHAR char ++# define MEMSET memset ++# define STRNCMP strncmp ++# define TEST_NAME strncmp ++#endif /* !WIDE */ ++ ++ ++ + #define LOOP 3000 + #define STRING_SIZE 1024 +-char string1[STRING_SIZE]; +-char string2[STRING_SIZE]; ++CHAR string1[STRING_SIZE]; ++CHAR string2[STRING_SIZE]; + + __attribute__ ((noinline, noclone)) + static int + prepare (void) + { +- memset (string1, 'a', STRING_SIZE - 1); +- memset (string2, 'a', STRING_SIZE - 1); +- if (strncmp (string1, string2, STRING_SIZE) == 0) ++ MEMSET (string1, 'a', STRING_SIZE - 1); ++ MEMSET (string2, 'a', STRING_SIZE - 1); ++ if (STRNCMP (string1, string2, STRING_SIZE) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +@@ -40,7 +54,7 @@ __attribute__ ((noinline, noclone)) + static int + function (void) + { +- if (strncmp (string1, string2, STRING_SIZE) == 0) ++ if (STRNCMP (string1, string2, STRING_SIZE) == 0) + return 0; + else + return 1; +@@ -50,7 +64,7 @@ __attribute__ ((noinline, noclone)) + static int + function_overflow (void) + { +- if (strncmp (string1, string2, SIZE_MAX) == 0) ++ if (STRNCMP (string1, string2, SIZE_MAX) == 0) + return 0; + else + return 1; +@@ -59,9 +73,9 @@ function_overflow (void) + static int + do_test (void) + { +- int status = do_test_1 ("strncmp", LOOP, prepare, function); ++ int status = do_test_1 (TEST_NAME, LOOP, prepare, function); + if (status != EXIT_SUCCESS) + return status; +- status = do_test_1 ("strncmp", LOOP, prepare, function_overflow); ++ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); + return status; + } +diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c +new file mode 100644 +index 00000000..bad3b863 +--- /dev/null ++++ b/sysdeps/x86/tst-wcsncmp-rtm.c +@@ -0,0 +1,21 @@ ++/* Test case for wcsncmp inside a transactionally executing RTM region. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include ++#include "tst-strncmp-rtm.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-73.patch b/glibc-RHEL-15696-73.patch new file mode 100644 index 0000000..e8cc3a2 --- /dev/null +++ b/glibc-RHEL-15696-73.patch @@ -0,0 +1,37 @@ +From b98d0bbf747f39770e0caba7e984ce9f8f900330 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 18 Feb 2022 17:00:25 -0600 +Subject: [PATCH] x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c +Content-type: text/plain; charset=UTF-8 + +Previously TEST_NAME was passing a function pointer. This didn't fail +because of the -Wno-error flag (to allow for overflow sizes passed +to strncmp/wcsncmp) + +Reviewed-by: H.J. Lu +--- + sysdeps/x86/tst-strncmp-rtm.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index 4e9f094f..aef9866c 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -23,12 +23,12 @@ + # define CHAR wchar_t + # define MEMSET wmemset + # define STRNCMP wcsncmp +-# define TEST_NAME wcsncmp ++# define TEST_NAME "wcsncmp" + #else /* !WIDE */ + # define CHAR char + # define MEMSET memset + # define STRNCMP strncmp +-# define TEST_NAME strncmp ++# define TEST_NAME "strncmp" + #endif /* !WIDE */ + + +-- +GitLab + diff --git a/glibc-RHEL-15696-74.patch b/glibc-RHEL-15696-74.patch new file mode 100644 index 0000000..e5e6842 --- /dev/null +++ b/glibc-RHEL-15696-74.patch @@ -0,0 +1,1798 @@ +From b77b06e0e296f1a2276c27a67e1d44f2cfa38d45 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 10 Jan 2022 15:35:38 -0600 +Subject: [PATCH] x86: Optimize strcmp-avx2.S +Content-type: text/plain; charset=UTF-8 + +Optimization are primarily to the loop logic and how the page cross +logic interacts with the loop. + +The page cross logic is at times more expensive for short strings near +the end of a page but not crossing the page. This is done to retest +the page cross conditions with a non-faulty check and to improve the +logic for entering the loop afterwards. This is only particular cases, +however, and is general made up for by more than 10x improvements on +the transition from the page cross -> loop case. + +The non-page cross cases are improved most for smaller sizes [0, 128] +and go about even for (128, 4096]. The loop page cross logic is +improved so some more significant speedup is seen there as well. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. 
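Much of the rewrite below revolves around whether the 4-vector loop can run without faulting, and its entry check can be restated in scalar C. This is a sketch only: may_cross_page and the PAGE_SIZE/VEC_SIZE enums are illustrative, while the real code folds the same test into 'orl; sall $20; cmpl' on the low address bits. If either string sits within 4 * VEC_SIZE bytes of the end of its page, an unaligned 4-vector read could touch the next page, so the slow page-cross path is taken; ORing the two page offsets is a cheap over-approximation that can give false positives but never misses a real cross.

#include <stdbool.h>
#include <stdint.h>

enum { PAGE_SIZE = 4096, VEC_SIZE = 32 };

static bool
may_cross_page (const char *s1, const char *s2)
{
  /* Combined page offset of both pointers; always >= the larger offset.  */
  uintptr_t off = ((uintptr_t) s1 | (uintptr_t) s2) & (PAGE_SIZE - 1);
  return off > PAGE_SIZE - (VEC_SIZE * 4);
}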
+ +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 1592 ++++++++++++++---------- + 1 file changed, 940 insertions(+), 652 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/strcmp-avx2.S + (account for sw28896 patches) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 70d8499b..554ffe4c 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -26,35 +26,57 @@ + + # define PAGE_SIZE 4096 + +-/* VEC_SIZE = Number of bytes in a ymm register */ ++ /* VEC_SIZE = Number of bytes in a ymm register. */ + # define VEC_SIZE 32 + +-/* Shift for dividing by (VEC_SIZE * 4). */ +-# define DIVIDE_BY_VEC_4_SHIFT 7 +-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# endif ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa + + # ifdef USE_AS_WCSCMP +-/* Compare packed dwords. */ ++ /* Compare packed dwords. */ + # define VPCMPEQ vpcmpeqd +-/* Compare packed dwords and store minimum. */ ++ /* Compare packed dwords and store minimum. */ + # define VPMINU vpminud +-/* 1 dword char == 4 bytes. */ ++ /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else +-/* Compare packed bytes. */ ++ /* Compare packed bytes. */ + # define VPCMPEQ vpcmpeqb +-/* Compare packed bytes and store minimum. */ ++ /* Compare packed bytes and store minimum. */ + # define VPMINU vpminub +-/* 1 byte char == 1 byte. */ ++ /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif + ++# ifdef USE_AS_STRNCMP ++# define LOOP_REG r9d ++# define LOOP_REG64 r9 ++ ++# define OFFSET_REG8 r9b ++# define OFFSET_REG r9d ++# define OFFSET_REG64 r9 ++# else ++# define LOOP_REG edx ++# define LOOP_REG64 rdx ++ ++# define OFFSET_REG8 dl ++# define OFFSET_REG edx ++# define OFFSET_REG64 rdx ++# endif ++ + # ifndef VZEROUPPER + # define VZEROUPPER vzeroupper + # endif + ++# if defined USE_AS_STRNCMP ++# define VEC_OFFSET 0 ++# else ++# define VEC_OFFSET (-VEC_SIZE) ++# endif ++ ++# define xmmZERO xmm15 ++# define ymmZERO ymm15 ++ + # ifndef SECTION + # define SECTION(p) p##.avx + # endif +@@ -79,783 +101,1049 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section SECTION(.text),"ax",@progbits +-ENTRY (STRCMP) ++ .section SECTION(.text), "ax", @progbits ++ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP +- /* Check for simple cases (0 or 1) in offset. */ ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %rdx ++# endif + cmp $1, %RDX_LP +- je L(char0) +- jb L(zero) ++ /* Signed comparison intentional. We use this branch to also ++ test cases where length >= 2^63. These very large sizes can be ++ handled with strcmp as there is no way for that length to ++ actually bound the buffer. */ ++ jle L(one_or_less) + # ifdef USE_AS_WCSCMP +-# ifndef __ILP32__ + movq %rdx, %rcx +- /* Check if length could overflow when multiplied by +- sizeof(wchar_t). Checking top 8 bits will cover all potential +- overflow cases as well as redirect cases where its impossible to +- length to bound a valid memory region. In these cases just use +- 'wcscmp'. */ ++ ++ /* Multiplying length by sizeof(wchar_t) can result in overflow. ++ Check if that is possible. All cases where overflow are possible ++ are cases where length is large enough that it can never be a ++ bound on valid memory so just use wcscmp. */ + shrq $56, %rcx +- jnz OVERFLOW_STRCMP +-# endif +- /* Convert units: from wide to byte char. 
*/ +- shl $2, %RDX_LP ++ jnz __wcscmp_avx2 ++ ++ leaq (, %rdx, 4), %rdx + # endif +- /* Register %r11 tracks the maximum offset. */ +- mov %RDX_LP, %R11_LP + # endif ++ vpxor %xmmZERO, %xmmZERO, %xmmZERO + movl %edi, %eax +- xorl %edx, %edx +- /* Make %xmm7 (%ymm7) all zeros in this function. */ +- vpxor %xmm7, %xmm7, %xmm7 + orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +- jg L(cross_page) +- /* Start comparing 4 vectors. */ +- vmovdqu (%rdi), %ymm1 +- VPCMPEQ (%rsi), %ymm1, %ymm0 +- VPMINU %ymm1, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- je L(next_3_vectors) +- tzcntl %ecx, %edx ++ sall $20, %eax ++ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ ++ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax ++ ja L(page_cross) ++ ++L(no_page_cross): ++ /* Safe to compare 4x vectors. */ ++ VMOVU (%rdi), %ymm0 ++ /* 1s where s1 and s2 equal. */ ++ VPCMPEQ (%rsi), %ymm0, %ymm1 ++ /* 1s at null CHAR. */ ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ /* 1s where s1 and s2 equal AND not null CHAR. */ ++ vpandn %ymm1, %ymm2, %ymm1 ++ ++ /* All 1s -> keep going, any 0s -> return. */ ++ vpmovmskb %ymm1, %ecx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx) is after the maximum +- offset (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $VEC_SIZE, %rdx ++ jbe L(vec_0_test_len) + # endif ++ ++ /* All 1s represents all equals. incl will overflow to zero in ++ all equals case. Otherwise 1s will carry until position of first ++ mismatch. */ ++ incl %ecx ++ jz L(more_3x_vec) ++ ++ .p2align 4,, 4 ++L(return_vec_0): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- je L(return) +-L(wcscmp_return): ++ cmpl (%rsi, %rcx), %edx ++ je L(ret0) + setl %al + negl %eax + orl $1, %eax +-L(return): + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret0): + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + +- .p2align 4 +-L(return_vec_size): +- tzcntl %ecx, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after +- the maximum offset (%r11). */ +- addq $VEC_SIZE, %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ .p2align 4,, 8 ++L(vec_0_test_len): ++ notl %ecx ++ bzhil %edx, %ecx, %eax ++ jnz L(return_vec_0) ++ /* Align if will cross fetch block. */ ++ .p2align 4,, 2 ++L(ret_zero): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif +-# else ++ VZEROUPPER_RETURN ++ ++ .p2align 4,, 5 ++L(one_or_less): ++ jb L(ret_zero) + # ifdef USE_AS_WCSCMP ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __wcscmp_avx2 ++ movl (%rdi), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rdx), %ecx +- cmpl VEC_SIZE(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (%rsi), %edx ++ je L(ret1) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl VEC_SIZE(%rdi, %rdx), %eax +- movzbl VEC_SIZE(%rsi, %rdx), %edx +- subl %edx, %eax ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). 
*/ ++ ++ jnbe __strcmp_avx2 ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax + # endif ++L(ret1): ++ ret + # endif +- VZEROUPPER_RETURN + +- .p2align 4 +-L(return_2_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 ++L(return_vec_1): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 2), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ /* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of ++ overflow. */ ++ addq $-VEC_SIZE, %rdx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_SIZE(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ je L(ret2) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret2): + VZEROUPPER_RETURN + +- .p2align 4 +-L(return_3_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 3), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++L(return_vec_3): ++ salq $32, %rcx ++# endif ++ ++L(return_vec_2): ++# ifndef USE_AS_STRNCMP ++ tzcntl %ecx, %ecx ++# else ++ tzcntq %rcx, %rcx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ je L(ret3) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++L(ret3): ++ VZEROUPPER_RETURN ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx ++ je L(ret4) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret4): + VZEROUPPER_RETURN ++# endif ++ ++ .p2align 4,, 10 ++L(more_3x_vec): ++ /* Safe to compare 4x vectors. 
*/ ++ VMOVU VEC_SIZE(%rdi), %ymm0 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_1) ++ ++# ifdef USE_AS_STRNCMP ++ subq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_2) ++ ++ VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_3) + +- .p2align 4 +-L(next_3_vectors): +- vmovdqu VEC_SIZE(%rdi), %ymm6 +- VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 +- VPMINU %ymm6, %ymm3, %ymm3 +- VPCMPEQ %ymm7, %ymm3, %ymm3 +- vpmovmskb %ymm3, %ecx +- testl %ecx, %ecx +- jne L(return_vec_size) +- vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 +- vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 +- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 +- VPMINU %ymm5, %ymm2, %ymm2 +- VPCMPEQ %ymm4, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm2, %ymm2 +- vpmovmskb %ymm2, %ecx +- testl %ecx, %ecx +- jne L(return_2_vec_size) +- VPMINU %ymm4, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- jne L(return_3_vec_size) +-L(main_loop_header): +- leaq (VEC_SIZE * 4)(%rdi), %rdx +- movl $PAGE_SIZE, %ecx +- /* Align load via RAX. */ +- andq $-(VEC_SIZE * 4), %rdx +- subq %rdi, %rdx +- leaq (%rdi, %rdx), %rax + # ifdef USE_AS_STRNCMP +- /* Starting from this point, the maximum offset, or simply the +- 'offset', DECREASES by the same amount when base pointers are +- moved forward. Return 0 when: +- 1) On match: offset <= the matched vector index. +- 2) On mistmach, offset is before the mistmatched index. ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ /* any non-zero positive value that doesn't inference with 0x1. + */ +- subq %rdx, %r11 +- jbe L(zero) +-# endif +- addq %rsi, %rdx +- movq %rdx, %rsi +- andl $(PAGE_SIZE - 1), %esi +- /* Number of bytes before page crossing. */ +- subq %rsi, %rcx +- /* Number of VEC_SIZE * 4 blocks before page crossing. */ +- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx +- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ +- movl %ecx, %esi +- jmp L(loop_start) ++ movl $2, %r8d + ++# else ++ xorl %r8d, %r8d ++# endif ++ ++ /* The prepare labels are various entry points from the page ++ cross logic. */ ++L(prepare_loop): ++ ++# ifdef USE_AS_STRNCMP ++ /* Store N + (VEC_SIZE * 4) and place check at the begining of ++ the loop. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx ++# endif ++L(prepare_loop_no_len): ++ ++ /* Align s1 and adjust s2 accordingly. */ ++ subq %rdi, %rsi ++ andq $-(VEC_SIZE * 4), %rdi ++ addq %rdi, %rsi ++ ++# ifdef USE_AS_STRNCMP ++ subq %rdi, %rdx ++# endif ++ ++L(prepare_loop_aligned): ++ /* eax stores distance from rsi to next page cross. These cases ++ need to be handled specially as the 4x loop could potentially ++ read memory past the length of s1 or s2 and across a page ++ boundary. */ ++ movl $-(VEC_SIZE * 4), %eax ++ subl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ ++ /* Loop 4x comparisons at a time. */ + .p2align 4 + L(loop): ++ ++ /* End condition for strncmp. */ + # ifdef USE_AS_STRNCMP +- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease +- the maximum offset (%r11) by the same amount. 
*/ +- subq $(VEC_SIZE * 4), %r11 +- jbe L(zero) +-# endif +- addq $(VEC_SIZE * 4), %rax +- addq $(VEC_SIZE * 4), %rdx +-L(loop_start): +- testl %esi, %esi +- leal -1(%esi), %esi +- je L(loop_cross_page) +-L(back_to_loop): +- /* Main loop, comparing 4 vectors are a time. */ +- vmovdqa (%rax), %ymm0 +- vmovdqa VEC_SIZE(%rax), %ymm3 +- VPCMPEQ (%rdx), %ymm0, %ymm4 +- VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 +- VPMINU %ymm0, %ymm4, %ymm4 +- VPMINU %ymm3, %ymm1, %ymm1 +- vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 +- VPMINU %ymm1, %ymm4, %ymm0 +- vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 +- VPMINU %ymm2, %ymm5, %ymm5 +- VPMINU %ymm3, %ymm6, %ymm6 +- VPMINU %ymm5, %ymm0, %ymm0 +- VPMINU %ymm6, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- +- /* Test each mask (32 bits) individually because for VEC_SIZE +- == 32 is not possible to OR the four masks and keep all bits +- in a 64-bit integer register, differing from SSE2 strcmp +- where ORing is possible. */ +- vpmovmskb %ymm0, %ecx ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ ++ /* Check if rsi loads will cross a page boundary. */ ++ addl $-(VEC_SIZE * 4), %eax ++ jnb L(page_cross_during_loop) ++ ++ /* Loop entry after handling page cross during loop. */ ++L(loop_skip_page_cross_check): ++ VMOVA (VEC_SIZE * 0)(%rdi), %ymm0 ++ VMOVA (VEC_SIZE * 1)(%rdi), %ymm2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 ++ ++ /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ ++ VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1 ++ ++ VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ ++ ++ /* If any mismatches or null CHAR then 0 CHAR, otherwise non- ++ zero. */ ++ vpand %ymm0, %ymm1, %ymm1 ++ ++ ++ vpand %ymm2, %ymm3, %ymm3 ++ vpand %ymm4, %ymm5, %ymm5 ++ vpand %ymm6, %ymm7, %ymm7 ++ ++ VPMINU %ymm1, %ymm3, %ymm3 ++ VPMINU %ymm5, %ymm7, %ymm7 ++ ++ /* Reduce all 0 CHARs for the 4x VEC into ymm7. */ ++ VPMINU %ymm3, %ymm7, %ymm7 ++ ++ /* If any 0 CHAR then done. */ ++ VPCMPEQ %ymm7, %ymmZERO, %ymm7 ++ vpmovmskb %ymm7, %LOOP_REG ++ testl %LOOP_REG, %LOOP_REG ++ jz L(loop) ++ ++ /* Find which VEC has the mismatch of end of string. */ ++ VPCMPEQ %ymm1, %ymmZERO, %ymm1 ++ vpmovmskb %ymm1, %ecx + testl %ecx, %ecx +- je L(loop) +- VPCMPEQ %ymm7, %ymm4, %ymm0 +- vpmovmskb %ymm0, %edi +- testl %edi, %edi +- je L(test_vec) +- tzcntl %edi, %ecx ++ jnz L(return_vec_0_end) ++ ++ ++ VPCMPEQ %ymm3, %ymmZERO, %ymm3 ++ vpmovmskb %ymm3, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_1_end) ++ ++L(return_vec_2_3_end): + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ subq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++ VPCMPEQ %ymm5, %ymmZERO, %ymm5 ++ vpmovmskb %ymm5, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_2_end) ++ ++ /* LOOP_REG contains matches for null/mismatch from the loop. If ++ VEC 0,1,and 2 all have no null and no mismatches then mismatch ++ must entirely be from VEC 3 which is fully represented by ++ LOOP_REG. 
*/ ++ tzcntl %LOOP_REG, %LOOP_REG ++ ++# ifdef USE_AS_STRNCMP ++ subl $-(VEC_SIZE), %LOOP_REG ++ cmpq %LOOP_REG64, %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ je L(ret5) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax ++ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret5): + VZEROUPPER_RETURN + +- .p2align 4 +-L(test_vec): + # ifdef USE_AS_STRNCMP +- /* The first vector matched. Return 0 if the maximum offset +- (%r11) <= VEC_SIZE. */ +- cmpq $VEC_SIZE, %r11 +- jbe L(zero) ++ .p2align 4,, 2 ++L(ret_zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + # endif +- VPCMPEQ %ymm7, %ymm1, %ymm1 +- vpmovmskb %ymm1, %ecx +- testl %ecx, %ecx +- je L(test_2_vec) +- tzcntl %ecx, %edi ++ ++ ++ /* The L(return_vec_N_end) differ from L(return_vec_N) in that ++ they use the value of `r8` to negate the return value. This is ++ because the page cross logic can swap `rdi` and `rsi`. */ ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- addq $VEC_SIZE, %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++L(return_vec_1_end): ++ salq $32, %rcx ++# endif ++L(return_vec_0_end): ++# ifndef USE_AS_STRNCMP ++ tzcntl %ecx, %ecx ++# else ++ tzcntq %rcx, %rcx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ cmpl (%rsi, %rcx), %edx ++ je L(ret6) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax ++# endif ++L(ret6): ++ VZEROUPPER_RETURN ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_1_end): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_SIZE(%rdi, %rcx), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rsi, %rdi), %ecx +- cmpl VEC_SIZE(%rdx, %rdi), %ecx +- jne L(wcscmp_return) ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ je L(ret7) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +- movzbl VEC_SIZE(%rax, %rdi), %eax +- movzbl VEC_SIZE(%rdx, %rdi), %edx +- subl %edx, %eax ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +-# endif ++L(ret7): + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(test_2_vec): ++ .p2align 4,, 10 ++L(return_vec_2_end): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +- /* The first 2 vectors matched. Return 0 if the maximum offset +- (%r11) <= 2 * VEC_SIZE. 
*/ +- cmpq $(VEC_SIZE * 2), %r11 +- jbe L(zero) ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_page_cross) + # endif +- VPCMPEQ %ymm7, %ymm5, %ymm5 +- vpmovmskb %ymm5, %ecx +- testl %ecx, %ecx +- je L(test_3_vec) +- tzcntl %ecx, %edi +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ je L(ret11) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx +- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret11): + VZEROUPPER_RETURN + +- .p2align 4 +-L(test_3_vec): ++ ++ /* Page cross in rsi in next 4x VEC. */ ++ ++ /* TODO: Improve logic here. */ ++ .p2align 4,, 10 ++L(page_cross_during_loop): ++ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ ++ ++ /* Optimistically rsi and rdi and both aligned inwhich case we ++ don't need any logic here. */ ++ cmpl $-(VEC_SIZE * 4), %eax ++ /* Don't adjust eax before jumping back to loop and we will ++ never hit page cross case again. */ ++ je L(loop_skip_page_cross_check) ++ ++ /* Check if we can safely load a VEC. */ ++ cmpl $-(VEC_SIZE * 3), %eax ++ jle L(less_1x_vec_till_page_cross) ++ ++ VMOVA (%rdi), %ymm0 ++ VPCMPEQ (%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_0_end) ++ ++ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ ++ cmpl $-(VEC_SIZE * 2), %eax ++ jg L(more_2x_vec_till_page_cross) ++ ++ .p2align 4,, 4 ++L(less_1x_vec_till_page_cross): ++ subl $-(VEC_SIZE * 4), %eax ++ /* Guranteed safe to read from rdi - VEC_SIZE here. The only ++ concerning case is first iteration if incoming s1 was near start ++ of a page and s2 near end. If s1 was near the start of the page ++ we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe ++ to read back -VEC_SIZE. If rdi is truly at the start of a page ++ here, it means the previous page (rdi - VEC_SIZE) has already ++ been loaded earlier so must be valid. */ ++ VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 ++ VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ ++ /* Mask of potentially valid bits. The lower bits can be out of ++ range comparisons (but safe regarding page crosses). */ ++ movl $-1, %r10d ++ shlxl %esi, %r10d, %r10d ++ notl %ecx ++ + # ifdef USE_AS_STRNCMP +- /* The first 3 vectors matched. Return 0 if the maximum offset +- (%r11) <= 3 * VEC_SIZE. 
*/ +- cmpq $(VEC_SIZE * 3), %r11 +- jbe L(zero) +-# endif +- VPCMPEQ %ymm7, %ymm6, %ymm6 +- vpmovmskb %ymm6, %esi +- tzcntl %esi, %ecx ++ cmpq %rax, %rdx ++ jbe L(return_page_cross_end_check) ++# endif ++ movl %eax, %OFFSET_REG ++ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ ++ andl %r10d, %ecx ++ jz L(loop_skip_page_cross_check) ++ ++ .p2align 4,, 3 ++L(return_page_cross_end): ++ tzcntl %ecx, %ecx ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 3), %rcx +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %esi +- cmpl (%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx ++L(return_page_cross_cmp_mem): + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ addl %OFFSET_REG, %ecx ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi +- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret8) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret8): + VZEROUPPER_RETURN + +- .p2align 4 +-L(loop_cross_page): +- xorl %r10d, %r10d +- movq %rdx, %rcx +- /* Align load via RDX. We load the extra ECX bytes which should +- be ignored. */ +- andl $((VEC_SIZE * 4) - 1), %ecx +- /* R10 is -RCX. */ +- subq %rcx, %r10 +- +- /* This works only if VEC_SIZE * 2 == 64. */ +-# if (VEC_SIZE * 2) != 64 +-# error (VEC_SIZE * 2) != 64 +-# endif +- +- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ +- cmpl $(VEC_SIZE * 2), %ecx +- jge L(loop_cross_page_2_vec) +- +- vmovdqu (%rax, %r10), %ymm2 +- vmovdqu VEC_SIZE(%rax, %r10), %ymm3 +- VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 +- VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 +- VPMINU %ymm2, %ymm0, %ymm0 +- VPMINU %ymm3, %ymm1, %ymm1 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm1, %ymm1 +- +- vpmovmskb %ymm0, %edi +- vpmovmskb %ymm1, %esi +- +- salq $32, %rsi +- xorq %rsi, %rdi +- +- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ +- shrq %cl, %rdi +- +- testq %rdi, %rdi +- je L(loop_cross_page_2_vec) +- tzcntq %rdi, %rcx + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ .p2align 4,, 10 ++L(return_page_cross_end_check): ++ tzcntl %ecx, %ecx ++ leal -VEC_SIZE(%rax, %rcx), %ecx ++ cmpl %ecx, %edx ++ ja L(return_page_cross_cmp_mem) + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# endif + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(loop_cross_page_2_vec): +- /* The first VEC_SIZE * 2 bytes match or are ignored. 
*/ +- vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 +- vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 +- VPMINU %ymm2, %ymm5, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 +- VPCMPEQ %ymm7, %ymm5, %ymm5 +- VPMINU %ymm3, %ymm6, %ymm6 +- VPCMPEQ %ymm7, %ymm6, %ymm6 +- +- vpmovmskb %ymm5, %edi +- vpmovmskb %ymm6, %esi +- +- salq $32, %rsi +- xorq %rsi, %rdi + +- xorl %r8d, %r8d +- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +- subl $(VEC_SIZE * 2), %ecx +- jle 1f +- /* Skip ECX bytes. */ +- shrq %cl, %rdi +- /* R8 has number of bytes skipped. */ +- movl %ecx, %r8d +-1: +- /* Before jumping back to the loop, set ESI to the number of +- VEC_SIZE * 4 blocks before page crossing. */ +- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi +- +- testq %rdi, %rdi ++ .p2align 4,, 10 ++L(more_2x_vec_till_page_cross): ++ /* If more 2x vec till cross we will complete a full loop ++ iteration here. */ ++ ++ VMOVU VEC_SIZE(%rdi), %ymm0 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_1_end) ++ + # ifdef USE_AS_STRNCMP +- /* At this point, if %rdi value is 0, it already tested +- VEC_SIZE*4+%r10 byte starting from %rax. This label +- checks whether strncmp maximum offset reached or not. */ +- je L(string_nbyte_offset_check) +-# else +- je L(back_to_loop) ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero_in_loop_page_cross) + # endif +- tzcntq %rdi, %rcx +- addq %r10, %rcx +- /* Adjust for number of bytes skipped. */ +- addq %r8, %rcx ++ ++ subl $-(VEC_SIZE * 4), %eax ++ ++ /* Safe to include comparisons from lower bytes. */ ++ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 ++ VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_page_cross_0) ++ ++ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 ++ VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_page_cross_1) ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rcx +- subq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ /* Must check length here as length might proclude reading next ++ page. */ ++ cmpq %rax, %rdx ++ jbe L(ret_zero_in_loop_page_cross) ++# endif ++ ++ /* Finish the loop. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ vpand %ymm4, %ymm5, %ymm5 ++ vpand %ymm6, %ymm7, %ymm7 ++ VPMINU %ymm5, %ymm7, %ymm7 ++ VPCMPEQ %ymm7, %ymmZERO, %ymm7 ++ vpmovmskb %ymm7, %LOOP_REG ++ testl %LOOP_REG, %LOOP_REG ++ jnz L(return_vec_2_3_end) ++ ++ /* Best for code size to include ucond-jmp here. Would be faster ++ if this case is hot to duplicate the L(return_vec_2_3_end) code ++ as fall-through and have jump back to loop on mismatch ++ comparison. 
*/ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax ++# ifdef USE_AS_STRNCMP ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_skip_page_cross_check) ++L(ret_zero_in_loop_page_cross): + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ VZEROUPPER_RETURN + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi +- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ jmp L(loop_skip_page_cross_check) + # endif +- VZEROUPPER_RETURN + ++ ++ .p2align 4,, 10 ++L(return_vec_page_cross_0): ++ addl $-VEC_SIZE, %eax ++L(return_vec_page_cross_1): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +-L(string_nbyte_offset_check): +- leaq (VEC_SIZE * 4)(%r10), %r10 +- cmpq %r10, %r11 +- jbe L(zero) +- jmp L(back_to_loop) ++ leal -VEC_SIZE(%rax, %rcx), %ecx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_in_loop_page_cross) ++# else ++ addl %eax, %ecx + # endif + +- .p2align 4 +-L(cross_page_loop): +- /* Check one byte/dword at a time. */ + # ifdef USE_AS_WCSCMP +- cmpl %ecx, %eax ++ movl VEC_OFFSET(%rdi, %rcx), %edx ++ xorl %eax, %eax ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret9) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx + subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +- jne L(different) +- addl $SIZE_OF_CHAR, %edx +- cmpl $(VEC_SIZE * 4), %edx +- je L(main_loop_header) +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++L(ret9): ++ VZEROUPPER_RETURN ++ ++ ++ .p2align 4,, 10 ++L(page_cross): ++# ifndef USE_AS_STRNCMP ++ /* If both are VEC aligned we don't need any special logic here. ++ Only valid for strcmp where stop condition is guranteed to be ++ reachable by just reading memory. */ ++ testl $((VEC_SIZE - 1) << 20), %eax ++ jz L(no_page_cross) + # endif ++ ++ movl %edi, %eax ++ movl %esi, %ecx ++ andl $(PAGE_SIZE - 1), %eax ++ andl $(PAGE_SIZE - 1), %ecx ++ ++ xorl %OFFSET_REG, %OFFSET_REG ++ ++ /* Check which is closer to page cross, s1 or s2. */ ++ cmpl %eax, %ecx ++ jg L(page_cross_s2) ++ ++ /* The previous page cross check has false positives. Check for ++ true positive as page cross logic is very expensive. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ jbe L(no_page_cross) ++ ++ /* Set r8 to not interfere with normal return value (rdi and rsi ++ did not swap). */ + # ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx ++ xorl %r8d, %r8d + # endif +- /* Check null char. */ +- testl %eax, %eax +- jne L(cross_page_loop) +- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +- comparisons. */ +- subl %ecx, %eax +-# ifndef USE_AS_WCSCMP +-L(different): ++ ++ /* Check if less than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jg L(less_1x_vec_till_page) ++ ++ /* If more than 1x VEC till page cross, loop throuh safely ++ loadable memory until within 1x VEC of page cross. 
*/ ++ ++ .p2align 4,, 10 ++L(page_cross_loop): ++ ++ VMOVU (%rdi, %OFFSET_REG64), %ymm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ ++ jnz L(check_ret_vec_page_cross) ++ addl $VEC_SIZE, %OFFSET_REG ++# ifdef USE_AS_STRNCMP ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross) + # endif +- VZEROUPPER_RETURN ++ addl $VEC_SIZE, %eax ++ jl L(page_cross_loop) ++ ++ subl %eax, %OFFSET_REG ++ /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed ++ to not cross page so is safe to load. Since we have already ++ loaded at least 1 VEC from rsi it is also guranteed to be safe. ++ */ ++ ++ VMOVU (%rdi, %OFFSET_REG64), %ymm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ ++# ifdef USE_AS_STRNCMP ++ leal VEC_SIZE(%OFFSET_REG64), %eax ++ cmpq %rax, %rdx ++ jbe L(check_ret_vec_page_cross2) ++ addq %rdi, %rdx ++# endif ++ incl %ecx ++ jz L(prepare_loop_no_len) + ++ .p2align 4,, 4 ++L(ret_vec_page_cross): ++# ifndef USE_AS_STRNCMP ++L(check_ret_vec_page_cross): ++# endif ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++L(ret_vec_page_cross_cont): + # ifdef USE_AS_WCSCMP +- .p2align 4 +-L(different): +- /* Use movl to avoid modifying EFLAGS. */ +- movl $0, %eax ++ movl (%rdi, %rcx), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx), %edx ++ je L(ret12) + setl %al + negl %eax +- orl $1, %eax +- VZEROUPPER_RETURN ++ xorl %r8d, %eax ++# else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret12): ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNCMP +- .p2align 4 +-L(zero): ++ .p2align 4,, 10 ++L(check_ret_vec_page_cross2): ++ incl %ecx ++L(check_ret_vec_page_cross): ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++ cmpq %rcx, %rdx ++ ja L(ret_vec_page_cross_cont) ++ .p2align 4,, 2 ++L(ret_zero_page_cross): + xorl %eax, %eax + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(char0): +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +-# endif +- VZEROUPPER_RETURN ++ .p2align 4,, 4 ++L(page_cross_s2): ++ /* Ensure this is a true page cross. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx ++ jbe L(no_page_cross) ++ ++ ++ movl %ecx, %eax ++ movq %rdi, %rcx ++ movq %rsi, %rdi ++ movq %rcx, %rsi ++ ++ /* set r8 to negate return value as rdi and rsi swapped. */ ++# ifdef USE_AS_WCSCMP ++ movl $-4, %r8d ++# else ++ movl $-1, %r8d + # endif ++ xorl %OFFSET_REG, %OFFSET_REG + +- .p2align 4 +-L(last_vector): +- addq %rdx, %rdi +- addq %rdx, %rsi ++ /* Check if more than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jle L(page_cross_loop) ++ ++ .p2align 4,, 6 ++L(less_1x_vec_till_page): ++ /* Find largest load size we can use. */ ++ cmpl $16, %eax ++ ja L(less_16_till_page) ++ ++ VMOVU (%rdi), %xmm0 ++ VPCMPEQ (%rsi), %xmm0, %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incw %cx ++ jnz L(check_ret_vec_page_cross) ++ movl $16, %OFFSET_REG + # ifdef USE_AS_STRNCMP +- subq %rdx, %r11 ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subl %eax, %OFFSET_REG ++# else ++ /* Explicit check for 16 byte alignment. 
*/ ++ subl %eax, %OFFSET_REG ++ jz L(prepare_loop) + # endif +- tzcntl %ecx, %edx ++ ++ VMOVU (%rdi, %OFFSET_REG64), %xmm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incw %cx ++ jnz L(check_ret_vec_page_cross) ++ + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ addl $16, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(VEC_SIZE * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi + # endif +-# ifdef USE_AS_WCSCMP ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case0): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ ret + # endif +- VZEROUPPER_RETURN + +- /* Comparing on page boundary region requires special treatment: +- It must done one vector at the time, starting with the wider +- ymm vector if possible, if not, with xmm. If fetching 16 bytes +- (xmm) still passes the boundary, byte comparison must be done. +- */ +- .p2align 4 +-L(cross_page): +- /* Try one ymm vector at a time. */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(cross_page_1_vector) +-L(loop_1_vector): +- vmovdqu (%rdi, %rdx), %ymm1 +- VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 +- VPMINU %ymm1, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- jne L(last_vector) + +- addl $VEC_SIZE, %edx ++ .p2align 4,, 10 ++L(less_16_till_page): ++ /* Find largest load size we can use. */ ++ cmpl $24, %eax ++ ja L(less_8_till_page) + +- addl $VEC_SIZE, %eax +-# ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) +-# endif +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jle L(loop_1_vector) +-L(cross_page_1_vector): +- /* Less than 32 bytes to check, try one xmm vector. */ +- cmpl $(PAGE_SIZE - 16), %eax +- jg L(cross_page_1_xmm) +- vmovdqu (%rdi, %rdx), %xmm1 +- VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ vmovq (%rdi), %xmm0 ++ vmovq (%rsi), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incb %cl ++ jnz L(check_ret_vec_page_cross) + +- addl $16, %edx +-# ifndef USE_AS_WCSCMP +- addl $16, %eax ++ ++# ifdef USE_AS_STRNCMP ++ cmpq $8, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) + # endif ++ movl $24, %OFFSET_REG ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ ++ ++ ++ vmovq (%rdi, %OFFSET_REG64), %xmm0 ++ vmovq (%rsi, %OFFSET_REG64), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incb %cl ++ jnz L(check_ret_vec_page_cross) ++ + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) +-# endif +- +-L(cross_page_1_xmm): +-# ifndef USE_AS_WCSCMP +- /* Less than 16 bytes to check, try 8 byte vector. NB: No need +- for wcscmp nor wcsncmp since wide char is 4 bytes. 
*/ +- cmpl $(PAGE_SIZE - 8), %eax +- jg L(cross_page_8bytes) +- vmovq (%rdi, %rdx), %xmm1 +- vmovq (%rsi, %rdx), %xmm0 +- VPCMPEQ %xmm0, %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- /* Only last 8 bits are valid. */ +- andl $0xff, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ addl $8, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(VEC_SIZE * 4), %rdx + +- addl $8, %edx +- addl $8, %eax ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++ ++ .p2align 4,, 10 ++L(less_8_till_page): ++# ifdef USE_AS_WCSCMP ++ /* If using wchar then this is the only check before we reach ++ the page boundary. */ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ cmpl %ecx, %eax ++ jnz L(ret_less_8_wcs) + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ addq %rdi, %rdx ++ /* We already checked for len <= 1 so cannot hit that case here. ++ */ + # endif ++ testl %eax, %eax ++ jnz L(prepare_loop_no_len) ++ ret + +-L(cross_page_8bytes): +- /* Less than 8 bytes to check, try 4 byte vector. */ +- cmpl $(PAGE_SIZE - 4), %eax +- jg L(cross_page_4bytes) +- vmovd (%rdi, %rdx), %xmm1 +- vmovd (%rsi, %rdx), %xmm0 +- VPCMPEQ %xmm0, %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- /* Only last 4 bits are valid. */ +- andl $0xf, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ .p2align 4,, 8 ++L(ret_less_8_wcs): ++ setl %OFFSET_REG8 ++ negl %OFFSET_REG ++ movl %OFFSET_REG, %eax ++ xorl %r8d, %eax ++ ret ++ ++# else ++ ++ /* Find largest load size we can use. */ ++ cmpl $28, %eax ++ ja L(less_4_till_page) ++ ++ vmovd (%rdi), %xmm0 ++ vmovd (%rsi), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) + +- addl $4, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $4, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) + # endif ++ movl $28, %OFFSET_REG ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG + +-L(cross_page_4bytes): +-# endif +- /* Less than 4 bytes to check, try one byte/dword at a time. 
*/ +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) +-# endif +-# ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx +-# endif +- testl %eax, %eax +- jne L(cross_page_loop) ++ ++ ++ vmovd (%rdi, %OFFSET_REG64), %xmm0 ++ vmovd (%rsi, %OFFSET_REG64), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) ++ ++# ifdef USE_AS_STRNCMP ++ addl $4, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) ++ subq $-(VEC_SIZE * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case1): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4,, 10 ++L(less_4_till_page): ++ subq %rdi, %rsi ++ /* Extremely slow byte comparison loop. */ ++L(less_4_loop): ++ movzbl (%rdi), %eax ++ movzbl (%rsi, %rdi), %ecx + subl %ecx, %eax +- VZEROUPPER_RETURN +-END (STRCMP) ++ jnz L(ret_less_4_loop) ++ testl %ecx, %ecx ++ jz L(ret_zero_4_loop) ++# ifdef USE_AS_STRNCMP ++ decq %rdx ++ jz L(ret_zero_4_loop) ++# endif ++ incq %rdi ++ /* end condition is reach page boundary (rdi is aligned). */ ++ testl $31, %edi ++ jnz L(less_4_loop) ++ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi ++ addq $-(VEC_SIZE * 4), %rdi ++# ifdef USE_AS_STRNCMP ++ subq $-(VEC_SIZE * 4), %rdx ++# endif ++ jmp L(prepare_loop_aligned) ++ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++L(ret_less_4_loop): ++ xorl %r8d, %eax ++ subl %r8d, %eax ++ ret ++# endif ++END(STRCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-75.patch b/glibc-RHEL-15696-75.patch new file mode 100644 index 0000000..4bd0cd4 --- /dev/null +++ b/glibc-RHEL-15696-75.patch @@ -0,0 +1,1992 @@ +From 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 10 Jan 2022 15:35:39 -0600 +Subject: [PATCH] x86: Optimize strcmp-evex.S +Content-type: text/plain; charset=UTF-8 + +Optimization are primarily to the loop logic and how the page cross +logic interacts with the loop. + +The page cross logic is at times more expensive for short strings near +the end of a page but not crossing the page. This is done to retest +the page cross conditions with a non-faulty check and to improve the +logic for entering the loop afterwards. This is only particular cases, +however, and is general made up for by more than 10x improvements on +the transition from the page cross -> loop case. + +The non-page cross cases as well are nearly universally improved. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 1712 +++++++++++++----------- + 1 file changed, 919 insertions(+), 793 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 6f5c4bf9..99d8409a 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -26,54 +26,69 @@ + + # define PAGE_SIZE 4096 + +-/* VEC_SIZE = Number of bytes in a ymm register */ ++ /* VEC_SIZE = Number of bytes in a ymm register. 
*/ + # define VEC_SIZE 32 ++# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR) + +-/* Shift for dividing by (VEC_SIZE * 4). */ +-# define DIVIDE_BY_VEC_4_SHIFT 7 +-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# endif +- +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSCMP +-/* Compare packed dwords. */ +-# define VPCMP vpcmpd ++# define TESTEQ subl $0xff, ++ /* Compare packed dwords. */ ++# define VPCMP vpcmpd + # define VPMINU vpminud + # define VPTESTM vptestmd +-# define SHIFT_REG32 r8d +-# define SHIFT_REG64 r8 +-/* 1 dword char == 4 bytes. */ ++ /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else +-/* Compare packed bytes. */ +-# define VPCMP vpcmpb ++# define TESTEQ incl ++ /* Compare packed bytes. */ ++# define VPCMP vpcmpb + # define VPMINU vpminub + # define VPTESTM vptestmb +-# define SHIFT_REG32 ecx +-# define SHIFT_REG64 rcx +-/* 1 byte char == 1 byte. */ ++ /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif + ++# ifdef USE_AS_STRNCMP ++# define LOOP_REG r9d ++# define LOOP_REG64 r9 ++ ++# define OFFSET_REG8 r9b ++# define OFFSET_REG r9d ++# define OFFSET_REG64 r9 ++# else ++# define LOOP_REG edx ++# define LOOP_REG64 rdx ++ ++# define OFFSET_REG8 dl ++# define OFFSET_REG edx ++# define OFFSET_REG64 rdx ++# endif ++ ++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP ++# define VEC_OFFSET 0 ++# else ++# define VEC_OFFSET (-VEC_SIZE) ++# endif ++ + # define XMMZERO xmm16 +-# define XMM0 xmm17 +-# define XMM1 xmm18 ++# define XMM0 xmm17 ++# define XMM1 xmm18 + + # define YMMZERO ymm16 +-# define YMM0 ymm17 +-# define YMM1 ymm18 +-# define YMM2 ymm19 +-# define YMM3 ymm20 +-# define YMM4 ymm21 +-# define YMM5 ymm22 +-# define YMM6 ymm23 +-# define YMM7 ymm24 +-# define YMM8 ymm25 +-# define YMM9 ymm26 +-# define YMM10 ymm27 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++# define YMM2 ymm19 ++# define YMM3 ymm20 ++# define YMM4 ymm21 ++# define YMM5 ymm22 ++# define YMM6 ymm23 ++# define YMM7 ymm24 ++# define YMM8 ymm25 ++# define YMM9 ymm26 ++# define YMM10 ymm27 + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -96,985 +111,1096 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section .text.evex,"ax",@progbits +-ENTRY (STRCMP) ++ .section .text.evex, "ax", @progbits ++ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP +- /* Check for simple cases (0 or 1) in offset. */ +- cmp $1, %RDX_LP +- je L(char0) +- jb L(zero) +-# ifdef USE_AS_WCSCMP +-# ifndef __ILP32__ +- movq %rdx, %rcx +- /* Check if length could overflow when multiplied by +- sizeof(wchar_t). Checking top 8 bits will cover all potential +- overflow cases as well as redirect cases where its impossible to +- length to bound a valid memory region. In these cases just use +- 'wcscmp'. */ +- shrq $56, %rcx +- jnz __wcscmp_evex +-# endif +- /* Convert units: from wide to byte char. */ +- shl $2, %RDX_LP ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %rdx + # endif +- /* Register %r11 tracks the maximum offset. */ +- mov %RDX_LP, %R11_LP ++ cmp $1, %RDX_LP ++ /* Signed comparison intentional. We use this branch to also ++ test cases where length >= 2^63. These very large sizes can be ++ handled with strcmp as there is no way for that length to ++ actually bound the buffer. 
*/ ++ jle L(one_or_less) + # endif + movl %edi, %eax +- xorl %edx, %edx +- /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO + orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +- jg L(cross_page) +- /* Start comparing 4 vectors. */ ++ /* Shift out the bits irrelivant to page boundary ([63:12]). */ ++ sall $20, %eax ++ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ ++ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax ++ ja L(page_cross) ++ ++L(no_page_cross): ++ /* Safe to compare 4x vectors. */ + VMOVU (%rdi), %YMM0 +- +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi). */ + VPCMP $0, (%rsi), %YMM0, %k1{%k2} +- + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(next_3_vectors) +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx) is after the maximum +- offset (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $CHAR_PER_VEC, %rdx ++ jbe L(vec_0_test_len) + # endif ++ ++ /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for ++ wcscmp/wcsncmp. */ ++ ++ /* All 1s represents all equals. TESTEQ will overflow to zero in ++ all equals case. Otherwise 1s will carry until position of first ++ mismatch. */ ++ TESTEQ %ecx ++ jz L(more_3x_vec) ++ ++ .p2align 4,, 4 ++L(return_vec_0): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- je L(return) +-L(wcscmp_return): ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret0) + setl %al + negl %eax + orl $1, %eax +-L(return): + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret0): + ret + +-L(return_vec_size): +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after +- the maximum offset (%r11). */ +- addq $VEC_SIZE, %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ .p2align 4,, 4 ++L(vec_0_test_len): ++ notl %ecx ++ bzhil %edx, %ecx, %eax ++ jnz L(return_vec_0) ++ /* Align if will cross fetch block. */ ++ .p2align 4,, 2 ++L(ret_zero): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif +-# else ++ ret ++ ++ .p2align 4,, 5 ++L(one_or_less): ++ jb L(ret_zero) + # ifdef USE_AS_WCSCMP ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __wcscmp_evex ++ movl (%rdi), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rdx), %ecx +- cmpl VEC_SIZE(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (%rsi), %edx ++ je L(ret1) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl VEC_SIZE(%rdi, %rdx), %eax +- movzbl VEC_SIZE(%rsi, %rdx), %edx +- subl %edx, %eax ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). 
*/ ++ jnbe __strcmp_evex ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret1): + ret ++# endif + +-L(return_2_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 ++L(return_vec_1): ++ tzcntl %ecx, %ecx ++# ifdef USE_AS_STRNCMP ++ /* rdx must be > CHAR_PER_VEC so its safe to subtract without ++ worrying about underflow. */ ++ addq $-CHAR_PER_VEC, %rdx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx ++ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret2) ++ setl %al ++ negl %eax ++ orl $1, %eax ++# else ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret2): ++ ret ++ ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 2), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++L(return_vec_3): ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %ecx + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ salq $CHAR_PER_VEC, %rcx + # endif ++# endif ++L(return_vec_2): ++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) ++ tzcntl %ecx, %ecx + # else +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ tzcntq %rcx, %rcx + # endif +- ret + +-L(return_3_vec_size): +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is +- after the maximum offset (%r11). 
*/ +- addq $(VEC_SIZE * 3), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret3) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++L(ret3): ++ ret ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret4) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret4): + ret ++# endif + +- .p2align 4 +-L(next_3_vectors): +- VMOVU VEC_SIZE(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ /* 32 byte align here ensures the main loop is ideally aligned ++ for DSB. */ ++ .p2align 5 ++L(more_3x_vec): ++ /* Safe to compare 4x vectors. */ ++ VMOVU (VEC_SIZE)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at VEC_SIZE(%rsi). */ +- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_1) ++ ++# ifdef USE_AS_STRNCMP ++ subq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero) + # endif +- jne L(return_vec_size) + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ + VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- jne L(return_2_vec_size) ++ TESTEQ %ecx ++ jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ + VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_3) ++ ++# ifdef USE_AS_STRNCMP ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d ++ + # else +- incl %ecx ++ xorl %r8d, %r8d + # endif +- jne L(return_3_vec_size) +-L(main_loop_header): +- leaq (VEC_SIZE * 4)(%rdi), %rdx +- movl $PAGE_SIZE, %ecx +- /* Align load via RAX. */ +- andq $-(VEC_SIZE * 4), %rdx +- subq %rdi, %rdx +- leaq (%rdi, %rdx), %rax ++ ++ /* The prepare labels are various entry points from the page ++ cross logic. 
*/ ++L(prepare_loop): ++ + # ifdef USE_AS_STRNCMP +- /* Starting from this point, the maximum offset, or simply the +- 'offset', DECREASES by the same amount when base pointers are +- moved forward. Return 0 when: +- 1) On match: offset <= the matched vector index. +- 2) On mistmach, offset is before the mistmatched index. +- */ +- subq %rdx, %r11 +- jbe L(zero) ++# ifdef USE_AS_WCSCMP ++L(prepare_loop_no_len): ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ shrl $2, %ecx ++ leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx ++# else ++ /* Store N + (VEC_SIZE * 4) and place check at the begining of ++ the loop. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx ++L(prepare_loop_no_len): ++# endif ++# else ++L(prepare_loop_no_len): + # endif +- addq %rsi, %rdx +- movq %rdx, %rsi +- andl $(PAGE_SIZE - 1), %esi +- /* Number of bytes before page crossing. */ +- subq %rsi, %rcx +- /* Number of VEC_SIZE * 4 blocks before page crossing. */ +- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx +- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ +- movl %ecx, %esi +- jmp L(loop_start) + ++ /* Align s1 and adjust s2 accordingly. */ ++ subq %rdi, %rsi ++ andq $-(VEC_SIZE * 4), %rdi ++L(prepare_loop_readj): ++ addq %rdi, %rsi ++# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP) ++ subq %rdi, %rdx ++# endif ++ ++L(prepare_loop_aligned): ++ /* eax stores distance from rsi to next page cross. These cases ++ need to be handled specially as the 4x loop could potentially ++ read memory past the length of s1 or s2 and across a page ++ boundary. */ ++ movl $-(VEC_SIZE * 4), %eax ++ subl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ ++ vpxorq %YMMZERO, %YMMZERO, %YMMZERO ++ ++ /* Loop 4x comparisons at a time. */ + .p2align 4 + L(loop): ++ ++ /* End condition for strncmp. */ + # ifdef USE_AS_STRNCMP +- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease +- the maximum offset (%r11) by the same amount. */ +- subq $(VEC_SIZE * 4), %r11 +- jbe L(zero) ++ subq $(CHAR_PER_VEC * 4), %rdx ++ jbe L(ret_zero) + # endif +- addq $(VEC_SIZE * 4), %rax +- addq $(VEC_SIZE * 4), %rdx +-L(loop_start): +- testl %esi, %esi +- leal -1(%esi), %esi +- je L(loop_cross_page) +-L(back_to_loop): +- /* Main loop, comparing 4 vectors are a time. */ +- VMOVA (%rax), %YMM0 +- VMOVA VEC_SIZE(%rax), %YMM2 +- VMOVA (VEC_SIZE * 2)(%rax), %YMM4 +- VMOVA (VEC_SIZE * 3)(%rax), %YMM6 ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ ++ /* Check if rsi loads will cross a page boundary. */ ++ addl $-(VEC_SIZE * 4), %eax ++ jnb L(page_cross_during_loop) ++ ++ /* Loop entry after handling page cross during loop. */ ++L(loop_skip_page_cross_check): ++ VMOVA (VEC_SIZE * 0)(%rdi), %YMM0 ++ VMOVA (VEC_SIZE * 1)(%rdi), %YMM2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + + VPMINU %YMM0, %YMM2, %YMM8 + VPMINU %YMM4, %YMM6, %YMM9 + +- /* A zero CHAR in YMM8 means that there is a null CHAR. */ +- VPMINU %YMM8, %YMM9, %YMM8 ++ /* A zero CHAR in YMM9 means that there is a null CHAR. */ ++ VPMINU %YMM8, %YMM9, %YMM9 + + /* Each bit set in K1 represents a non-null CHAR in YMM8. */ +- VPTESTM %YMM8, %YMM8, %k1 ++ VPTESTM %YMM9, %YMM9, %k1 + +- /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. 
*/ +- vpxorq (%rdx), %YMM0, %YMM1 +- vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3 +- vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5 +- vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7 ++ vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 ++ vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 ++ /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while ++ oring with YMM1. Result is stored in YMM6. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 + +- vporq %YMM1, %YMM3, %YMM9 +- vporq %YMM5, %YMM7, %YMM10 ++ /* Or together YMM3, YMM5, and YMM6. */ ++ vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 + +- /* A non-zero CHAR in YMM9 represents a mismatch. */ +- vporq %YMM9, %YMM10, %YMM9 + +- /* Each bit cleared in K0 represents a mismatch or a null CHAR. */ +- VPCMP $0, %YMMZERO, %YMM9, %k0{%k1} +- kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(loop) ++ /* A non-zero CHAR in YMM6 represents a mismatch. */ ++ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ kmovd %k0, %LOOP_REG + +- /* Each bit set in K1 represents a non-null CHAR in YMM0. */ ++ TESTEQ %LOOP_REG ++ jz L(loop) ++ ++ ++ /* Find which VEC has the mismatch of end of string. */ + VPTESTM %YMM0, %YMM0, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM0 and (%rdx). */ + VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(test_vec) +- tzcntl %ecx, %ecx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +-# ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# endif +- ret ++ TESTEQ %ecx ++ jnz L(return_vec_0_end) + +- .p2align 4 +-L(test_vec): +-# ifdef USE_AS_STRNCMP +- /* The first vector matched. Return 0 if the maximum offset +- (%r11) <= VEC_SIZE. */ +- cmpq $VEC_SIZE, %r11 +- jbe L(zero) +-# endif +- /* Each bit set in K1 represents a non-null CHAR in YMM2. */ + VPTESTM %YMM2, %YMM2, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM2 and VEC_SIZE(%rdx). */ + VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(test_2_vec) +- tzcntl %ecx, %edi +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %edi +-# endif +-# ifdef USE_AS_STRNCMP +- addq $VEC_SIZE, %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl VEC_SIZE(%rsi, %rdi), %ecx +- cmpl VEC_SIZE(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl VEC_SIZE(%rax, %rdi), %eax +- movzbl VEC_SIZE(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif +-# endif +- ret ++ TESTEQ %ecx ++ jnz L(return_vec_1_end) + +- .p2align 4 +-L(test_2_vec): ++ ++ /* Handle VEC 2 and 3 without branches. */ ++L(return_vec_2_3_end): + # ifdef USE_AS_STRNCMP +- /* The first 2 vectors matched. Return 0 if the maximum offset +- (%r11) <= 2 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 2), %r11 +- jbe L(zero) ++ subq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero_end) + # endif +- /* Each bit set in K1 represents a non-null CHAR in YMM4. */ ++ + VPTESTM %YMM4, %YMM4, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM4 and (VEC_SIZE * 2)(%rdx). */ + VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ TESTEQ %ecx ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %LOOP_REG ++ orl %ecx, %LOOP_REG + # else +- incl %ecx ++ salq $CHAR_PER_VEC, %LOOP_REG64 ++ orq %rcx, %LOOP_REG64 ++# endif ++L(return_vec_3_end): ++ /* LOOP_REG contains matches for null/mismatch from the loop. If ++ VEC 0,1,and 2 all have no null and no mismatches then mismatch ++ must entirely be from VEC 3 which is fully represented by ++ LOOP_REG. */ ++# if CHAR_PER_VEC <= 16 ++ tzcntl %LOOP_REG, %LOOP_REG ++# else ++ tzcntq %LOOP_REG64, %LOOP_REG64 ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %LOOP_REG64, %rdx ++ jbe L(ret_zero_end) + # endif +- je L(test_3_vec) +- tzcntl %ecx, %edi ++ + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edi ++ movl (VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx ++ xorl %eax, %eax ++ cmpl (VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx ++ je L(ret5) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret5): ++ ret ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ .p2align 4,, 2 ++L(ret_zero_end): + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) ++ ret ++# endif ++ ++ ++ /* The L(return_vec_N_end) differ from L(return_vec_N) in that ++ they use the value of `r8` to negate the return value. This is ++ because the page cross logic can swap `rdi` and `rsi`. 
*/ ++ .p2align 4,, 10 ++# ifdef USE_AS_STRNCMP ++L(return_vec_1_end): ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %ecx + # else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax ++ salq $CHAR_PER_VEC, %rcx + # endif ++# endif ++L(return_vec_0_end): ++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) ++ tzcntl %ecx, %ecx + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx +- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ tzcntq %rcx, %rcx + # endif +- ret + +- .p2align 4 +-L(test_3_vec): + # ifdef USE_AS_STRNCMP +- /* The first 3 vectors matched. Return 0 if the maximum offset +- (%r11) <= 3 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 3), %r11 +- jbe L(zero) ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_end) + # endif +- /* Each bit set in K1 represents a non-null CHAR in YMM6. */ +- VPTESTM %YMM6, %YMM6, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM6 and (VEC_SIZE * 3)(%rdx). */ +- VPCMP $0, %YMMZERO, %YMM7, %k0{%k1} +- kmovd %k0, %ecx ++ + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret6) ++ setl %al ++ negl %eax ++ /* This is the non-zero case for `eax` so just xorl with `r8d` ++ flip is `rdi` and `rsi` where swapped. */ ++ xorl %r8d, %eax + # else +- incl %ecx ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ /* Flip `eax` if `rdi` and `rsi` where swapped in page cross ++ logic. Subtract `r8d` after xor for zero case. */ ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret6): ++ ret ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_1_end): + tzcntl %ecx, %ecx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 3), %rcx +- cmpq %rcx, %r11 +- jbe L(zero) + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %esi +- cmpl (%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi +- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi +- jne L(wcscmp_return) ++ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret7) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx +- subl %edx, %eax ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +-# endif ++L(ret7): + ret +- +- .p2align 4 +-L(loop_cross_page): +- xorl %r10d, %r10d +- movq %rdx, %rcx +- /* Align load via RDX. We load the extra ECX bytes which should +- be ignored. */ +- andl $((VEC_SIZE * 4) - 1), %ecx +- /* R10 is -RCX. */ +- subq %rcx, %r10 +- +- /* This works only if VEC_SIZE * 2 == 64. */ +-# if (VEC_SIZE * 2) != 64 +-# error (VEC_SIZE * 2) != 64 + # endif + +- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ +- cmpl $(VEC_SIZE * 2), %ecx +- jge L(loop_cross_page_2_vec) + +- VMOVU (%rax, %r10), %YMM2 +- VMOVU VEC_SIZE(%rax, %r10), %YMM3 ++ /* Page cross in rsi in next 4x VEC. 
*/ + +- /* Each bit set in K2 represents a non-null CHAR in YMM2. */ +- VPTESTM %YMM2, %YMM2, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM2 and 32 bytes at (%rdx, %r10). */ +- VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2} +- kmovd %k1, %r9d +- /* Don't use subl since it is the lower 16/32 bits of RDI +- below. */ +- notl %r9d +-# ifdef USE_AS_WCSCMP +- /* Only last 8 bits are valid. */ +- andl $0xff, %r9d +-# endif ++ /* TODO: Improve logic here. */ ++ .p2align 4,, 10 ++L(page_cross_during_loop): ++ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ + +- /* Each bit set in K4 represents a non-null CHAR in YMM3. */ +- VPTESTM %YMM3, %YMM3, %k4 +- /* Each bit cleared in K3 represents a mismatch or a null CHAR +- in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ +- VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} +- kmovd %k3, %edi +- /* Must use notl %edi here as lower bits are for CHAR +- comparisons potentially out of range thus can be 0 without +- indicating mismatch. */ +- notl %edi +-# ifdef USE_AS_WCSCMP +- /* Don't use subl since it is the upper 8 bits of EDI below. */ +- andl $0xff, %edi +-# endif ++ /* Optimistically rsi and rdi and both aligned in which case we ++ don't need any logic here. */ ++ cmpl $-(VEC_SIZE * 4), %eax ++ /* Don't adjust eax before jumping back to loop and we will ++ never hit page cross case again. */ ++ je L(loop_skip_page_cross_check) + +-# ifdef USE_AS_WCSCMP +- /* NB: Each bit in EDI/R9D represents 4-byte element. */ +- sall $8, %edi +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 +- bytes. */ +- movl %ecx, %SHIFT_REG32 +- sarl $2, %SHIFT_REG32 +- +- /* Each bit in EDI represents a null CHAR or a mismatch. */ +- orl %r9d, %edi +-# else +- salq $32, %rdi ++ /* Check if we can safely load a VEC. */ ++ cmpl $-(VEC_SIZE * 3), %eax ++ jle L(less_1x_vec_till_page_cross) + +- /* Each bit in RDI represents a null CHAR or a mismatch. */ +- orq %r9, %rdi +-# endif ++ VMOVA (%rdi), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_0_end) ++ ++ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ ++ cmpl $-(VEC_SIZE * 2), %eax ++ jg L(more_2x_vec_till_page_cross) ++ ++ .p2align 4,, 4 ++L(less_1x_vec_till_page_cross): ++ subl $-(VEC_SIZE * 4), %eax ++ /* Guranteed safe to read from rdi - VEC_SIZE here. The only ++ concerning case is first iteration if incoming s1 was near start ++ of a page and s2 near end. If s1 was near the start of the page ++ we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe ++ to read back -VEC_SIZE. If rdi is truly at the start of a page ++ here, it means the previous page (rdi - VEC_SIZE) has already ++ been loaded earlier so must be valid. */ ++ VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2} ++ ++ /* Mask of potentially valid bits. The lower bits can be out of ++ range comparisons (but safe regarding page crosses). */ + +- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ +- shrxq %SHIFT_REG64, %rdi, %rdi +- testq %rdi, %rdi +- je L(loop_cross_page_2_vec) +- tzcntq %rdi, %rcx + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %ecx ++ movl $-1, %r10d ++ movl %esi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ shrl $2, %ecx ++ shlxl %ecx, %r10d, %ecx ++ movzbl %cl, %r10d ++# else ++ movl $-1, %ecx ++ shlxl %esi, %ecx, %r10d + # endif ++ ++ kmovd %k1, %ecx ++ notl %ecx ++ ++ + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) ++ movl %eax, %r11d ++ shrl $2, %r11d ++ cmpq %r11, %rdx + # else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax ++ cmpq %rax, %rdx + # endif ++ jbe L(return_page_cross_end_check) ++# endif ++ movl %eax, %OFFSET_REG ++ ++ /* Readjust eax before potentially returning to the loop. */ ++ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ ++ andl %r10d, %ecx ++ jz L(loop_skip_page_cross_check) ++ ++ .p2align 4,, 3 ++L(return_page_cross_end): ++ tzcntl %ecx, %ecx ++ ++# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP) ++ leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx ++L(return_page_cross_cmp_mem): + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ addl %OFFSET_REG, %ecx ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret8) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret8): + ret + +- .p2align 4 +-L(loop_cross_page_2_vec): +- /* The first VEC_SIZE * 2 bytes match or are ignored. */ +- VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 +- VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_page_cross_end_check): ++ tzcntl %ecx, %ecx ++ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx ++# ifdef USE_AS_WCSCMP ++ sall $2, %edx ++# endif ++ cmpl %ecx, %edx ++ ja L(return_page_cross_cmp_mem) ++ xorl %eax, %eax ++ ret ++# endif ++ + ++ .p2align 4,, 10 ++L(more_2x_vec_till_page_cross): ++ /* If more 2x vec till cross we will complete a full loop ++ iteration here. */ ++ ++ VMOVA VEC_SIZE(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */ +- VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2} +- kmovd %k1, %r9d +- /* Don't use subl since it is the lower 16/32 bits of RDI +- below. */ +- notl %r9d +-# ifdef USE_AS_WCSCMP +- /* Only last 8 bits are valid. */ +- andl $0xff, %r9d +-# endif ++ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_1_end) + +- VPTESTM %YMM1, %YMM1, %k4 +- /* Each bit cleared in K3 represents a mismatch or a null CHAR +- in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ +- VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} +- kmovd %k3, %edi +- /* Must use notl %edi here as lower bits are for CHAR +- comparisons potentially out of range thus can be 0 without +- indicating mismatch. */ +- notl %edi +-# ifdef USE_AS_WCSCMP +- /* Don't use subl since it is the upper 8 bits of EDI below. 
*/ +- andl $0xff, %edi ++# ifdef USE_AS_STRNCMP ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero_in_loop_page_cross) + # endif + +-# ifdef USE_AS_WCSCMP +- /* NB: Each bit in EDI/R9D represents 4-byte element. */ +- sall $8, %edi ++ subl $-(VEC_SIZE * 4), %eax + +- /* Each bit in EDI represents a null CHAR or a mismatch. */ +- orl %r9d, %edi +-# else +- salq $32, %rdi ++ /* Safe to include comparisons from lower bytes. */ ++ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_page_cross_0) ++ ++ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_page_cross_1) + +- /* Each bit in RDI represents a null CHAR or a mismatch. */ +- orq %r9, %rdi ++# ifdef USE_AS_STRNCMP ++ /* Must check length here as length might proclude reading next ++ page. */ ++# ifdef USE_AS_WCSCMP ++ movl %eax, %r11d ++ shrl $2, %r11d ++ cmpq %r11, %rdx ++# else ++ cmpq %rax, %rdx ++# endif ++ jbe L(ret_zero_in_loop_page_cross) + # endif + +- xorl %r8d, %r8d +- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +- subl $(VEC_SIZE * 2), %ecx +- jle 1f +- /* R8 has number of bytes skipped. */ +- movl %ecx, %r8d +-# ifdef USE_AS_WCSCMP +- /* NB: Divide shift count by 4 since each bit in RDI represent 4 +- bytes. */ +- sarl $2, %ecx +- /* Skip ECX bytes. */ +- shrl %cl, %edi ++ /* Finish the loop. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 ++ VPMINU %YMM4, %YMM6, %YMM9 ++ VPTESTM %YMM9, %YMM9, %k1 ++ ++ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 ++ /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 ++ ++ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ kmovd %k0, %LOOP_REG ++ TESTEQ %LOOP_REG ++ jnz L(return_vec_2_3_end) ++ ++ /* Best for code size to include ucond-jmp here. Would be faster ++ if this case is hot to duplicate the L(return_vec_2_3_end) code ++ as fall-through and have jump back to loop on mismatch ++ comparison. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax ++# ifdef USE_AS_STRNCMP ++ subq $(CHAR_PER_VEC * 4), %rdx ++ ja L(loop_skip_page_cross_check) ++L(ret_zero_in_loop_page_cross): ++ xorl %eax, %eax ++ ret + # else +- /* Skip ECX bytes. */ +- shrq %cl, %rdi ++ jmp L(loop_skip_page_cross_check) + # endif +-1: +- /* Before jumping back to the loop, set ESI to the number of +- VEC_SIZE * 4 blocks before page crossing. */ +- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi + +- testq %rdi, %rdi +-# ifdef USE_AS_STRNCMP +- /* At this point, if %rdi value is 0, it already tested +- VEC_SIZE*4+%r10 byte starting from %rax. This label +- checks whether strncmp maximum offset reached or not. */ +- je L(string_nbyte_offset_check) ++ ++ .p2align 4,, 10 ++L(return_vec_page_cross_0): ++ addl $-VEC_SIZE, %eax ++L(return_vec_page_cross_1): ++ tzcntl %ecx, %ecx ++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP ++ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx ++# ifdef USE_AS_STRNCMP ++# ifdef USE_AS_WCSCMP ++ /* Must divide ecx instead of multiply rdx due to overflow. 
*/ ++ movl %ecx, %eax ++ shrl $2, %eax ++ cmpq %rax, %rdx ++# else ++ cmpq %rcx, %rdx ++# endif ++ jbe L(ret_zero_in_loop_page_cross) ++# endif + # else +- je L(back_to_loop) ++ addl %eax, %ecx + # endif +- tzcntq %rdi, %rcx ++ + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +- addq %r10, %rcx +- /* Adjust for number of bytes skipped. */ +- addq %r8, %rcx +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rcx +- subq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret9) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi +- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret9): + ret + +-# ifdef USE_AS_STRNCMP +-L(string_nbyte_offset_check): +- leaq (VEC_SIZE * 4)(%r10), %r10 +- cmpq %r10, %r11 +- jbe L(zero) +- jmp L(back_to_loop) ++ ++ .p2align 4,, 10 ++L(page_cross): ++# ifndef USE_AS_STRNCMP ++ /* If both are VEC aligned we don't need any special logic here. ++ Only valid for strcmp where stop condition is guranteed to be ++ reachable by just reading memory. */ ++ testl $((VEC_SIZE - 1) << 20), %eax ++ jz L(no_page_cross) + # endif + +- .p2align 4 +-L(cross_page_loop): +- /* Check one byte/dword at a time. */ ++ movl %edi, %eax ++ movl %esi, %ecx ++ andl $(PAGE_SIZE - 1), %eax ++ andl $(PAGE_SIZE - 1), %ecx ++ ++ xorl %OFFSET_REG, %OFFSET_REG ++ ++ /* Check which is closer to page cross, s1 or s2. */ ++ cmpl %eax, %ecx ++ jg L(page_cross_s2) ++ ++ /* The previous page cross check has false positives. Check for ++ true positive as page cross logic is very expensive. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ jbe L(no_page_cross) ++ ++ ++ /* Set r8 to not interfere with normal return value (rdi and rsi ++ did not swap). */ + # ifdef USE_AS_WCSCMP +- cmpl %ecx, %eax ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d + # else +- subl %ecx, %eax ++ xorl %r8d, %r8d + # endif +- jne L(different) +- addl $SIZE_OF_CHAR, %edx +- cmpl $(VEC_SIZE * 4), %edx +- je L(main_loop_header) ++ ++ /* Check if less than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jg L(less_1x_vec_till_page) ++ ++ ++ /* If more than 1x VEC till page cross, loop throuh safely ++ loadable memory until within 1x VEC of page cross. 
*/ ++ .p2align 4,, 8 ++L(page_cross_loop): ++ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(check_ret_vec_page_cross) ++ addl $CHAR_PER_VEC, %OFFSET_REG + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross) + # endif ++ addl $VEC_SIZE, %eax ++ jl L(page_cross_loop) ++ + # ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx ++ shrl $2, %eax + # endif +- /* Check null CHAR. */ +- testl %eax, %eax +- jne L(cross_page_loop) +- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +- comparisons. */ +- subl %ecx, %eax +-# ifndef USE_AS_WCSCMP +-L(different): ++ ++ ++ subl %eax, %OFFSET_REG ++ /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed ++ to not cross page so is safe to load. Since we have already ++ loaded at least 1 VEC from rsi it is also guranteed to be safe. ++ */ ++ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ ++ kmovd %k1, %ecx ++# ifdef USE_AS_STRNCMP ++ leal CHAR_PER_VEC(%OFFSET_REG64), %eax ++ cmpq %rax, %rdx ++ jbe L(check_ret_vec_page_cross2) ++# ifdef USE_AS_WCSCMP ++ addq $-(CHAR_PER_VEC * 2), %rdx ++# else ++ addq %rdi, %rdx ++# endif + # endif +- ret ++ TESTEQ %ecx ++ jz L(prepare_loop_no_len) + ++ .p2align 4,, 4 ++L(ret_vec_page_cross): ++# ifndef USE_AS_STRNCMP ++L(check_ret_vec_page_cross): ++# endif ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++L(ret_vec_page_cross_cont): + # ifdef USE_AS_WCSCMP +- .p2align 4 +-L(different): +- /* Use movl to avoid modifying EFLAGS. */ +- movl $0, %eax ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret12) + setl %al + negl %eax +- orl $1, %eax +- ret ++ xorl %r8d, %eax ++# else ++ movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax ++ movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret12): ++ ret ++ + + # ifdef USE_AS_STRNCMP +- .p2align 4 +-L(zero): ++ .p2align 4,, 10 ++L(check_ret_vec_page_cross2): ++ TESTEQ %ecx ++L(check_ret_vec_page_cross): ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++ cmpq %rcx, %rdx ++ ja L(ret_vec_page_cross_cont) ++ .p2align 4,, 2 ++L(ret_zero_page_cross): + xorl %eax, %eax + ret ++# endif + +- .p2align 4 +-L(char0): +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +-# endif +- ret ++ .p2align 4,, 4 ++L(page_cross_s2): ++ /* Ensure this is a true page cross. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx ++ jbe L(no_page_cross) ++ ++ ++ movl %ecx, %eax ++ movq %rdi, %rcx ++ movq %rsi, %rdi ++ movq %rcx, %rsi ++ ++ /* set r8 to negate return value as rdi and rsi swapped. */ ++# ifdef USE_AS_WCSCMP ++ movl $-4, %r8d ++# else ++ movl $-1, %r8d + # endif ++ xorl %OFFSET_REG, %OFFSET_REG + +- .p2align 4 +-L(last_vector): +- addq %rdx, %rdi +- addq %rdx, %rsi +-# ifdef USE_AS_STRNCMP +- subq %rdx, %r11 ++ /* Check if more than 1x VEC till page cross. 
*/ ++ subl $(VEC_SIZE * 3), %eax ++ jle L(page_cross_loop) ++ ++ .p2align 4,, 6 ++L(less_1x_vec_till_page): ++# ifdef USE_AS_WCSCMP ++ shrl $2, %eax + # endif +- tzcntl %ecx, %edx ++ /* Find largest load size we can use. */ ++ cmpl $(16 / SIZE_OF_CHAR), %eax ++ ja L(less_16_till_page) ++ ++ /* Use 16 byte comparison. */ ++ vmovdqu (%rdi), %xmm0 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, (%rsi), %xmm0, %k1{%k2} ++ kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx ++ subl $0xf, %ecx ++# else ++ incw %cx + # endif ++ jnz L(check_ret_vec_page_cross) ++ movl $(16 / SIZE_OF_CHAR), %OFFSET_REG + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subl %eax, %OFFSET_REG ++# else ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ jz L(prepare_loop) + # endif ++ vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2} ++ kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ subl $0xf, %ecx + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ incw %cx + # endif ++ jnz L(check_ret_vec_page_cross) ++# ifdef USE_AS_STRNCMP ++ addl $(16 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case0): ++ xorl %eax, %eax + ret ++# endif + +- /* Comparing on page boundary region requires special treatment: +- It must done one vector at the time, starting with the wider +- ymm vector if possible, if not, with xmm. If fetching 16 bytes +- (xmm) still passes the boundary, byte comparison must be done. +- */ +- .p2align 4 +-L(cross_page): +- /* Try one ymm vector at a time. */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(cross_page_1_vector) +-L(loop_1_vector): +- VMOVU (%rdi, %rdx), %YMM0 + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (%rsi, %rdx). */ +- VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} ++ .p2align 4,, 10 ++L(less_16_till_page): ++ cmpl $(24 / SIZE_OF_CHAR), %eax ++ ja L(less_8_till_page) ++ ++ /* Use 8 byte comparison. */ ++ vmovq (%rdi), %xmm0 ++ vmovq (%rsi), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ subl $0x3, %ecx + # else +- incl %ecx ++ incb %cl + # endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) + +- addl $VEC_SIZE, %edx + +- addl $VEC_SIZE, %eax + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $(8 / SIZE_OF_CHAR), %rdx ++ jbe L(ret_zero_page_cross_slow_case0) + # endif +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jle L(loop_1_vector) +-L(cross_page_1_vector): +- /* Less than 32 bytes to check, try one xmm vector. 
*/ +- cmpl $(PAGE_SIZE - 16), %eax +- jg L(cross_page_1_xmm) +- VMOVU (%rdi, %rdx), %XMM0 ++ movl $(24 / SIZE_OF_CHAR), %OFFSET_REG ++ subl %eax, %OFFSET_REG + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and 16 bytes at (%rsi, %rdx). */ +- VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2} ++ vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- subl $0xf, %ecx ++ subl $0x3, %ecx + # else +- subl $0xffff, %ecx ++ incb %cl + # endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) ++ + +- addl $16, %edx +-# ifndef USE_AS_WCSCMP +- addl $16, %eax +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ addl $(8 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi + # endif ++ jmp L(prepare_loop_aligned) + +-L(cross_page_1_xmm): +-# ifndef USE_AS_WCSCMP +- /* Less than 16 bytes to check, try 8 byte vector. NB: No need +- for wcscmp nor wcsncmp since wide char is 4 bytes. */ +- cmpl $(PAGE_SIZE - 8), %eax +- jg L(cross_page_8bytes) +- vmovq (%rdi, %rdx), %XMM0 +- vmovq (%rsi, %rdx), %XMM1 + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and XMM1. */ +- VPCMP $0, %XMM1, %XMM0, %k1{%k2} +- kmovb %k1, %ecx ++ ++ ++ .p2align 4,, 10 ++L(less_8_till_page): + # ifdef USE_AS_WCSCMP +- subl $0x3, %ecx ++ /* If using wchar then this is the only check before we reach ++ the page boundary. */ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ cmpl %ecx, %eax ++ jnz L(ret_less_8_wcs) ++# ifdef USE_AS_STRNCMP ++ addq $-(CHAR_PER_VEC * 2), %rdx ++ /* We already checked for len <= 1 so cannot hit that case here. ++ */ ++# endif ++ testl %eax, %eax ++ jnz L(prepare_loop) ++ ret ++ ++ .p2align 4,, 8 ++L(ret_less_8_wcs): ++ setl %OFFSET_REG8 ++ negl %OFFSET_REG ++ movl %OFFSET_REG, %eax ++ xorl %r8d, %eax ++ ret ++ + # else +- subl $0xff, %ecx +-# endif +- jne L(last_vector) ++ cmpl $28, %eax ++ ja L(less_4_till_page) ++ ++ vmovd (%rdi), %xmm0 ++ vmovd (%rsi), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ kmovd %k1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) + +- addl $8, %edx +- addl $8, %eax + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $4, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) + # endif ++ movl $(28 / SIZE_OF_CHAR), %OFFSET_REG ++ subl %eax, %OFFSET_REG + +-L(cross_page_8bytes): +- /* Less than 8 bytes to check, try 4 byte vector. */ +- cmpl $(PAGE_SIZE - 4), %eax +- jg L(cross_page_4bytes) +- vmovd (%rdi, %rdx), %XMM0 +- vmovd (%rsi, %rdx), %XMM1 +- +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and XMM1. 
*/ +- VPCMP $0, %XMM1, %XMM0, %k1{%k2} ++ vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0x1, %ecx +-# else + subl $0xf, %ecx +-# endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) ++# ifdef USE_AS_STRNCMP ++ addl $(4 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ + +- addl $4, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case1): ++ xorl %eax, %eax ++ ret + # endif + +-L(cross_page_4bytes): +-# endif +- /* Less than 4 bytes to check, try one byte/dword at a time. */ +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) +-# endif +-# ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx +-# endif +- testl %eax, %eax +- jne L(cross_page_loop) ++ .p2align 4,, 10 ++L(less_4_till_page): ++ subq %rdi, %rsi ++ /* Extremely slow byte comparison loop. */ ++L(less_4_loop): ++ movzbl (%rdi), %eax ++ movzbl (%rsi, %rdi), %ecx + subl %ecx, %eax ++ jnz L(ret_less_4_loop) ++ testl %ecx, %ecx ++ jz L(ret_zero_4_loop) ++# ifdef USE_AS_STRNCMP ++ decq %rdx ++ jz L(ret_zero_4_loop) ++# endif ++ incq %rdi ++ /* end condition is reach page boundary (rdi is aligned). */ ++ testl $31, %edi ++ jnz L(less_4_loop) ++ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi ++ addq $-(VEC_SIZE * 4), %rdi ++# ifdef USE_AS_STRNCMP ++ subq $-(CHAR_PER_VEC * 4), %rdx ++# endif ++ jmp L(prepare_loop_aligned) ++ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++L(ret_less_4_loop): ++ xorl %r8d, %eax ++ subl %r8d, %eax + ret +-END (STRCMP) ++# endif ++END(STRCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-76.patch b/glibc-RHEL-15696-76.patch new file mode 100644 index 0000000..84d9a6f --- /dev/null +++ b/glibc-RHEL-15696-76.patch @@ -0,0 +1,33 @@ +From c15efd011cea3d8f0494269eb539583215a1feed Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 4 Feb 2022 11:09:10 -0800 +Subject: [PATCH] x86-64: Fix strcmp-avx2.S +Content-type: text/plain; charset=UTF-8 + +Change "movl %edx, %rdx" to "movl %edx, %edx" in: + +commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45 +Author: Noah Goldstein +Date: Mon Jan 10 15:35:38 2022 -0600 + + x86: Optimize strcmp-avx2.S +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 554ffe4c..04675aa4 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -106,7 +106,7 @@ ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +- movl %edx, %rdx ++ movl %edx, %edx + # endif + cmp $1, %RDX_LP + /* Signed comparison intentional. 
We use this branch to also +-- +GitLab + diff --git a/glibc-RHEL-15696-77.patch b/glibc-RHEL-15696-77.patch new file mode 100644 index 0000000..1a1cdae --- /dev/null +++ b/glibc-RHEL-15696-77.patch @@ -0,0 +1,33 @@ +From 0e0199a9e02ebe42e2b36958964d63f03573c382 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 4 Feb 2022 11:11:08 -0800 +Subject: [PATCH] x86-64: Fix strcmp-evex.S +Content-type: text/plain; charset=UTF-8 + +Change "movl %edx, %rdx" to "movl %edx, %edx" in: + +commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9 +Author: Noah Goldstein +Date: Mon Jan 10 15:35:39 2022 -0600 + + x86: Optimize strcmp-evex.S +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 99d8409a..ed56af8e 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -116,7 +116,7 @@ ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +- movl %edx, %rdx ++ movl %edx, %edx + # endif + cmp $1, %RDX_LP + /* Signed comparison intentional. We use this branch to also +-- +GitLab + diff --git a/glibc-RHEL-15696-78.patch b/glibc-RHEL-15696-78.patch new file mode 100644 index 0000000..885b715 --- /dev/null +++ b/glibc-RHEL-15696-78.patch @@ -0,0 +1,459 @@ +From b62ace2740a106222e124cc86956448fa07abf4d Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sun, 6 Feb 2022 00:54:18 -0600 +Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No bug. + +Split vec generation into multiple steps. This allows the +broadcast in AVX2 to use 'xmm' registers for the L(less_vec) +case. This saves an expensive lane-cross instruction and removes +the need for 'vzeroupper'. + +For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for +byte broadcast. + +Results for memset-avx2 small (geomean of N = 20 benchset runs). + +size, New Time, Old Time, New / Old + 0, 4.100, 3.831, 0.934 + 1, 5.074, 4.399, 0.867 + 2, 4.433, 4.411, 0.995 + 4, 4.487, 4.415, 0.984 + 8, 4.454, 4.396, 0.987 + 16, 4.502, 4.443, 0.987 + +All relevant string/wcsmbs tests are passing. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/memset.S | 21 ++- + .../multiarch/memset-avx2-unaligned-erms.S | 18 +- + .../multiarch/memset-avx512-unaligned-erms.S | 18 +- + .../multiarch/memset-evex-unaligned-erms.S | 18 +- + .../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++------- + 5 files changed, 152 insertions(+), 87 deletions(-) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 8672b030..27debd2b 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -28,17 +28,22 @@ + #define VMOVU movups + #define VMOVA movaps + +-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- movq r, %rax; \ +- punpcklbw %xmm0, %xmm0; \ +- punpcklwd %xmm0, %xmm0; \ +- pshufd $0, %xmm0, %xmm0 ++ pxor %xmm1, %xmm1; \ ++ pshufb %xmm1, %xmm0; \ ++ movq r, %rax + +-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- movq r, %rax; \ +- pshufd $0, %xmm0, %xmm0 ++ pshufd $0, %xmm0, %xmm0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + #define SECTION(p) p + +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index 1af668af..c0bf2875 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -10,15 +10,18 @@ + # define VMOVU vmovdqu + # define VMOVA vmovdqa + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ +- movq r, %rax; \ +- vpbroadcastb %xmm0, %ymm0 ++ movq r, %rax; + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- vmovd d, %xmm0; \ +- movq r, %rax; \ +- vpbroadcastd %xmm0, %ymm0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ MEMSET_SET_VEC0_AND_SET_RETURN(d, r) ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0 ++# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0 ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0 ++# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0 + + # ifndef SECTION + # define SECTION(p) p##.avx +@@ -30,5 +33,6 @@ + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif + ++# define USE_XMM_LESS_VEC + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index f14d6f84..5241216a 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -15,13 +15,19 @@ + + # define VZEROUPPER + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastb d, %VEC0 ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastb d, %VEC0; \ ++ movq r, %rax + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastd d, %VEC0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastd d, %VEC0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + # define SECTION(p) p##.evex512 + # define MEMSET_SYMBOL(p,s) p##_avx512_##s +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S 
b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 64b09e77..63700215 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -15,13 +15,19 @@ + + # define VZEROUPPER + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastb d, %VEC0 ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastb d, %VEC0; \ ++ movq r, %rax + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastd d, %VEC0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastd d, %VEC0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + # define SECTION(p) p##.evex + # define MEMSET_SYMBOL(p,s) p##_evex_##s +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index f08b7323..a67f9833 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -58,8 +58,10 @@ + #ifndef MOVQ + # if VEC_SIZE > 16 + # define MOVQ vmovq ++# define MOVD vmovd + # else + # define MOVQ movq ++# define MOVD movd + # endif + #endif + +@@ -72,9 +74,17 @@ + #if defined USE_WITH_EVEX || defined USE_WITH_AVX512 + # define END_REG rcx + # define LOOP_REG rdi ++# define LESS_VEC_REG rax + #else + # define END_REG rdi + # define LOOP_REG rdx ++# define LESS_VEC_REG rdi ++#endif ++ ++#ifdef USE_XMM_LESS_VEC ++# define XMM_SMALL 1 ++#else ++# define XMM_SMALL 0 + #endif + + #define PAGE_SIZE 4096 +@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + + ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + shl $2, %RDX_LP +- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +- jmp L(entry_from_bzero) ++ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) ++ WMEMSET_VDUP_TO_VEC0_LOW() ++ cmpq $VEC_SIZE, %rdx ++ jb L(less_vec_no_vdup) ++ WMEMSET_VDUP_TO_VEC0_HIGH() ++ jmp L(entry_from_wmemset) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + +@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + #endif + + ENTRY (MEMSET_SYMBOL (__memset, unaligned)) +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) ++ MEMSET_VDUP_TO_VEC0_HIGH() ++L(entry_from_wmemset): + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + + ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx + # endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) ++ MEMSET_VDUP_TO_VEC0_HIGH () + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. +- */ +- VMOVU %VEC(0), (%rax) +- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
*/ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + VZEROUPPER_RETURN + #endif + +- .p2align 4,, 10 ++ .p2align 4,, 4 + L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE +- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) +- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) ++ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + #else + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) +@@ -212,6 +228,7 @@ L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE + .p2align 4,, 10 + L(less_vec): ++L(less_vec_no_vdup): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! +@@ -262,28 +279,18 @@ L(stosb_more_2x_vec): + /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] + and (4x, 8x] jump to target. */ + L(more_2x_vec): +- +- /* Two different methods of setting up pointers / compare. The +- two methods are based on the fact that EVEX/AVX512 mov +- instructions take more bytes then AVX2/SSE2 mov instructions. As +- well that EVEX/AVX512 machines also have fast LEA_BID. Both +- setup and END_REG to avoid complex address mode. For EVEX/AVX512 +- this saves code size and keeps a few targets in one fetch block. +- For AVX2/SSE2 this helps prevent AGU bottlenecks. */ +-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 +- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + +- LOOP_4X_OFFSET) with LEA_BID. */ +- +- /* END_REG is rcx for EVEX/AVX512. */ +- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG +-#endif +- +- /* Stores to first 2x VEC before cmp as any path forward will +- require it. */ +- VMOVU %VEC(0), (%rax) +- VMOVU %VEC(0), VEC_SIZE(%rax) ++ /* Store next 2x vec regardless. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi) + + ++ /* Two different methods of setting up pointers / compare. The two ++ methods are based on the fact that EVEX/AVX512 mov instructions take ++ more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512 ++ machines also have fast LEA_BID. Both setup and END_REG to avoid complex ++ address mode. For EVEX/AVX512 this saves code size and keeps a few ++ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU ++ bottlenecks. */ + #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) + /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ + addq %rdx, %END_REG +@@ -292,6 +299,15 @@ L(more_2x_vec): + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_2x_vec) + ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with ++ LEA_BID. */ ++ ++ /* END_REG is rcx for EVEX/AVX512. */ ++ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG ++#endif ++ + /* Store next 2x vec regardless. */ + VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) +@@ -355,65 +371,93 @@ L(stosb_local): + /* Define L(less_vec) only if not otherwise defined. */ + .p2align 4 + L(less_vec): ++ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to ++ xmm). This is only does anything for AVX2. 
*/ ++ MEMSET_VDUP_TO_VEC0_LOW () ++L(less_vec_no_vdup): + #endif + L(cross_page): + #if VEC_SIZE > 32 + cmpl $32, %edx +- jae L(between_32_63) ++ jge L(between_32_63) + #endif + #if VEC_SIZE > 16 + cmpl $16, %edx +- jae L(between_16_31) ++ jge L(between_16_31) ++#endif ++#ifndef USE_XMM_LESS_VEC ++ MOVQ %XMM0, %rcx + #endif +- MOVQ %XMM0, %rdi + cmpl $8, %edx +- jae L(between_8_15) ++ jge L(between_8_15) + cmpl $4, %edx +- jae L(between_4_7) ++ jge L(between_4_7) + cmpl $1, %edx +- ja L(between_2_3) +- jb L(return) +- movb %sil, (%rax) +- VZEROUPPER_RETURN ++ jg L(between_2_3) ++ jl L(between_0_0) ++ movb %sil, (%LESS_VEC_REG) ++L(between_0_0): ++ ret + +- /* Align small targets only if not doing so would cross a fetch +- line. */ ++ /* Align small targets only if not doing so would cross a fetch line. ++ */ + #if VEC_SIZE > 32 + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- VMOVU %YMM0, (%rax) +- VMOVU %YMM0, -32(%rax, %rdx) ++ VMOVU %YMM0, (%LESS_VEC_REG) ++ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx) + VZEROUPPER_RETURN + #endif + + #if VEC_SIZE >= 32 +- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1) + L(between_16_31): + /* From 16 to 31. No branch when size == 16. */ +- VMOVU %XMM0, (%rax) +- VMOVU %XMM0, -16(%rax, %rdx) +- VZEROUPPER_RETURN ++ VMOVU %XMM0, (%LESS_VEC_REG) ++ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx) ++ ret + #endif + +- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) ++ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. ++ */ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1) + L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ +- movq %rdi, (%rax) +- movq %rdi, -8(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ MOVQ %XMM0, (%rdi) ++ MOVQ %XMM0, -8(%rdi, %rdx) ++#else ++ movq %rcx, (%LESS_VEC_REG) ++ movq %rcx, -8(%LESS_VEC_REG, %rdx) ++#endif ++ ret + +- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) ++ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. ++ */ ++ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1) + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %edi, (%rax) +- movl %edi, -4(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ MOVD %XMM0, (%rdi) ++ MOVD %XMM0, -4(%rdi, %rdx) ++#else ++ movl %ecx, (%LESS_VEC_REG) ++ movl %ecx, -4(%LESS_VEC_REG, %rdx) ++#endif ++ ret + +- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) ++ /* 4 * XMM_SMALL for the third mov for AVX2. */ ++ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1) + L(between_2_3): + /* From 2 to 3. No branch when size == 2. 
*/ +- movw %di, (%rax) +- movb %dil, -1(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ movb %sil, (%rdi) ++ movb %sil, 1(%rdi) ++ movb %sil, -1(%rdi, %rdx) ++#else ++ movw %cx, (%LESS_VEC_REG) ++ movb %sil, -1(%LESS_VEC_REG, %rdx) ++#endif ++ ret + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-79.patch b/glibc-RHEL-15696-79.patch new file mode 100644 index 0000000..91e850f --- /dev/null +++ b/glibc-RHEL-15696-79.patch @@ -0,0 +1,40 @@ +From 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 7 Feb 2022 00:32:23 -0600 +Subject: [PATCH] x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 + Only) +Content-type: text/plain; charset=UTF-8 + +commit b62ace2740a106222e124cc86956448fa07abf4d +Author: Noah Goldstein +Date: Sun Feb 6 00:54:18 2022 -0600 + + x86: Improve vec generation in memset-vec-unaligned-erms.S + +Revert usage of 'pshufb' in broadcast logic as it is an SSSE3 +instruction and memset.S is restricted to only SSE2 instructions. +--- + sysdeps/x86_64/memset.S | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 27debd2b..4cb4aa71 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -30,9 +30,10 @@ + + # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- pxor %xmm1, %xmm1; \ +- pshufb %xmm1, %xmm0; \ +- movq r, %rax ++ movq r, %rax; \ ++ punpcklbw %xmm0, %xmm0; \ ++ punpcklwd %xmm0, %xmm0; \ ++ pshufd $0, %xmm0, %xmm0 + + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +-- +GitLab + diff --git a/glibc-RHEL-15696-8.patch b/glibc-RHEL-15696-8.patch new file mode 100644 index 0000000..5cf7633 --- /dev/null +++ b/glibc-RHEL-15696-8.patch @@ -0,0 +1,218 @@ +From 5165de69c0908e28a380cbd4bb054e55ea4abc95 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:36:36 -0800 +Subject: [PATCH] x86-64 strnlen/wcsnlen: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes strnlen/wcsnlen for x32. Tested on x86-64 and x32. On +x86-64, libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length. + Clear the upper 32 bits of RSI register. + * sysdeps/x86_64/strlen.S: Use RSI_LP for length. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen + and tst-size_t-wcsnlen. + * sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file. + * sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise. 
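
The contract the new tst-size_t-strnlen.c locks down is simply that strnlen stops at the length limit even when no terminator is present; a plain C harness can only exercise that visible behaviour, since planting non-zero bits in the upper half of the length register needs the x32 calling-convention setup in test-size_t.h. A minimal sketch of that contract check (the buffer size is illustrative, not taken from the patch):

#include <assert.h>
#include <string.h>

int
main (void)
{
  enum { LEN = 4096 };                /* illustrative size, not from the patch */
  static char buf[LEN];

  memset (buf, 'a', LEN);             /* no NUL inside the first LEN bytes */
  assert (strnlen (buf, LEN) == (size_t) LEN);   /* must stop at the limit */

  buf[10] = '\0';                     /* terminator inside the window */
  assert (strnlen (buf, LEN) == 10);
  return 0;
}
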
+--- + sysdeps/x86_64/multiarch/strlen-avx2.S | 9 ++-- + sysdeps/x86_64/strlen.S | 12 ++--- + sysdeps/x86_64/x32/Makefile | 4 +- + sysdeps/x86_64/x32/tst-size_t-strnlen.c | 72 +++++++++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wcsnlen.c | 20 +++++++ + 5 files changed, 106 insertions(+), 11 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-strnlen.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsnlen.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S +index fb2418cd..645e0446 100644 +--- a/sysdeps/x86_64/multiarch/strlen-avx2.S ++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S +@@ -42,12 +42,15 @@ + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN + /* Check for zero length. */ +- testq %rsi, %rsi ++ test %RSI_LP, %RSI_LP + jz L(zero) + # ifdef USE_AS_WCSLEN +- shl $2, %rsi ++ shl $2, %RSI_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %esi, %esi + # endif +- movq %rsi, %r8 ++ mov %RSI_LP, %R8_LP + # endif + movl %edi, %ecx + movq %rdi, %rdx +diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S +index 01cb5fa8..f845f3d4 100644 +--- a/sysdeps/x86_64/strlen.S ++++ b/sysdeps/x86_64/strlen.S +@@ -59,21 +59,21 @@ ENTRY(strlen) + + #ifdef AS_STRNLEN + /* Do not read anything when n==0. */ +- test %rsi, %rsi ++ test %RSI_LP, %RSI_LP + jne L(n_nonzero) + xor %rax, %rax + ret + L(n_nonzero): + # ifdef AS_WCSLEN +- shlq $2, %rsi ++ shl $2, %RSI_LP + # endif + + /* Initialize long lived registers. */ + +- add %rdi, %rsi +- mov %rsi, %r10 +- and $-64, %r10 +- mov %rsi, %r11 ++ add %RDI_LP, %RSI_LP ++ mov %RSI_LP, %R10_LP ++ and $-64, %R10_LP ++ mov %RSI_LP, %R11_LP + #endif + + pxor %xmm0, %xmm0 +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 2a9e20a9..1557724b 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -8,10 +8,10 @@ endif + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ + tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ +- tst-size_t-strncmp tst-size_t-strncpy ++ tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen + endif + + ifeq ($(subdir),wcsmbs) + tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \ +- tst-size_t-wcsncmp ++ tst-size_t-wcsncmp tst-size_t-wcsnlen + endif +diff --git a/sysdeps/x86_64/x32/tst-size_t-strnlen.c b/sysdeps/x86_64/x32/tst-size_t-strnlen.c +new file mode 100644 +index 00000000..690a4a8a +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-strnlen.c +@@ -0,0 +1,72 @@ ++/* Test strnlen with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifdef WIDE ++# define TEST_NAME "wcsnlen" ++#else ++# define TEST_NAME "strnlen" ++#endif /* WIDE */ ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++# define STRNLEN wcsnlen ++# define CHAR wchar_t ++#else ++# define STRNLEN strnlen ++# define CHAR char ++#endif /* WIDE */ ++ ++IMPL (STRNLEN, 1) ++ ++typedef size_t (*proto_t) (const CHAR *, size_t); ++ ++static size_t ++__attribute__ ((noinline, noclone)) ++do_strnlen (parameter_t a, parameter_t b) ++{ ++ return CALL (&a, a.p, b.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ size_t size = page_size / sizeof (CHAR); ++ parameter_t src = { { 0 }, buf2 }; ++ parameter_t c = { { size }, (void *) (uintptr_t) 'a' }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ size_t res = do_strnlen (src, c); ++ if (res != size) ++ { ++ error (0, 0, "Wrong result in function %s: 0x%x != 0x%x", ++ impl->name, res, size); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c +new file mode 100644 +index 00000000..093b4bbe +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c +@@ -0,0 +1,20 @@ ++/* Test wcsnlen with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-strnlen.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-80.patch b/glibc-RHEL-15696-80.patch new file mode 100644 index 0000000..53a3e7e --- /dev/null +++ b/glibc-RHEL-15696-80.patch @@ -0,0 +1,753 @@ +From 3d9f171bfb5325bd5f427e9fc386453358c6e840 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 7 Feb 2022 05:55:15 -0800 +Subject: [PATCH] x86-64: Optimize bzero +Content-type: text/plain; charset=UTF-8 + +memset with zero as the value to set is by far the majority value (99%+ +for Python3 and GCC). + +bzero can be slightly more optimized for this case by using a zero-idiom +xor for broadcasting the set value to a register (vector or GPR). 
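
The user-visible contract here is just that bzero (p, n) matches memset (p, 0, n); the saving is entirely in how the all-zero fill value is materialised (a single xor zero idiom instead of the movd-plus-broadcast sequence an arbitrary fill byte needs). A small equivalence check, assuming only ISO C plus <strings.h> (the compiler may legitimately expand bzero to a memset call before the new __bzero IFUNC is ever reached):

#include <assert.h>
#include <string.h>
#include <strings.h>        /* bzero */

int
main (void)
{
  char a[64], b[64];

  memset (a, 0xff, sizeof a);
  memset (b, 0xff, sizeof b);

  bzero (a, sizeof a);       /* __bzero path on a glibc carrying this patch */
  memset (b, 0, sizeof b);   /* ordinary memset with an explicit zero */

  assert (memcmp (a, b, sizeof a) == 0);
  return 0;
}
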
+ +Co-developed-by: Noah Goldstein +--- + sysdeps/generic/ifunc-init.h | 5 +- + sysdeps/x86_64/memset.S | 8 + + sysdeps/x86_64/multiarch/Makefile | 205 +++++++++++------- + sysdeps/x86_64/multiarch/bzero.c | 106 +++++++++ + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 42 ++++ + .../memset-avx2-unaligned-erms-rtm.S | 1 + + .../multiarch/memset-avx2-unaligned-erms.S | 6 + + .../multiarch/memset-avx512-unaligned-erms.S | 3 + + .../multiarch/memset-evex-unaligned-erms.S | 3 + + .../multiarch/memset-sse2-unaligned-erms.S | 1 + + .../multiarch/memset-vec-unaligned-erms.S | 110 +++++++--- + 11 files changed, 384 insertions(+), 106 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/bzero.c + +Conflicts: + sysdeps/generic/ifunc-init.h + (needs macros from cf4fd28ea453d1a9cec93939bc88b58ccef5437a (memcmpeq)) + sysdeps/x86_64/multiarch/Makefile + (file ordering) + +diff --git a/sysdeps/generic/ifunc-init.h b/sysdeps/generic/ifunc-init.h +index 241e4161..f7a72375 100644 +--- a/sysdeps/generic/ifunc-init.h ++++ b/sysdeps/generic/ifunc-init.h +@@ -50,5 +50,8 @@ + '___' as the optimized implementation and + '_ifunc_selector' as the IFUNC selector. */ + #define REDIRECT_NAME EVALUATOR1 (__redirect, SYMBOL_NAME) +-#define OPTIMIZE(name) EVALUATOR2 (SYMBOL_NAME, name) ++#define OPTIMIZE1(name) EVALUATOR1 (SYMBOL_NAME, name) ++#define OPTIMIZE2(name) EVALUATOR2 (SYMBOL_NAME, name) ++/* Default is to use OPTIMIZE2. */ ++#define OPTIMIZE(name) OPTIMIZE2(name) + #define IFUNC_SELECTOR EVALUATOR1 (SYMBOL_NAME, ifunc_selector) +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 4cb4aa71..a1353f89 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -35,6 +35,9 @@ + punpcklwd %xmm0, %xmm0; \ + pshufd $0, %xmm0, %xmm0 + ++# define BZERO_ZERO_VEC0() \ ++ pxor %xmm0, %xmm0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + pshufd $0, %xmm0, %xmm0; \ +@@ -53,6 +56,10 @@ + # define MEMSET_SYMBOL(p,s) memset + #endif + ++#ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) __bzero ++#endif ++ + #ifndef WMEMSET_SYMBOL + # define WMEMSET_CHK_SYMBOL(p,s) p + # define WMEMSET_SYMBOL(p,s) __wmemset +@@ -63,6 +70,7 @@ + libc_hidden_builtin_def (memset) + + #if IS_IN (libc) ++weak_alias (__bzero, bzero) + libc_hidden_def (__wmemset) + weak_alias (__wmemset, wmemset) + libc_hidden_weak (wmemset) +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 26be4095..37d8d6f0 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -1,85 +1,130 @@ + ifeq ($(subdir),string) + +-sysdep_routines += strncat-c stpncpy-c strncpy-c \ +- strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3 \ +- strcmp-sse4_2 strcmp-avx2 \ +- strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \ +- memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \ +- memrchr-sse2 memrchr-avx2 \ +- memcmp-sse2 \ +- memcmp-avx2-movbe \ +- memcmp-sse4 memcpy-ssse3 \ +- memmove-ssse3 \ +- memcpy-ssse3-back \ +- memmove-ssse3-back \ +- memmove-avx512-no-vzeroupper \ +- strcasecmp_l-sse2 strcasecmp_l-ssse3 \ +- strcasecmp_l-sse4_2 strcasecmp_l-avx \ +- strncase_l-sse2 strncase_l-ssse3 \ +- strncase_l-sse4_2 strncase_l-avx \ +- strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \ +- strrchr-sse2 strrchr-avx2 \ +- strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \ +- strcat-avx2 strncat-avx2 \ +- strcat-ssse3 strncat-ssse3\ +- strcpy-avx2 strncpy-avx2 \ +- strcpy-sse2 stpcpy-sse2 \ +- strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ 
+- strcpy-sse2-unaligned strncpy-sse2-unaligned \ +- stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ +- stpcpy-avx2 stpncpy-avx2 \ +- strcat-sse2 \ +- strcat-sse2-unaligned strncat-sse2-unaligned \ +- strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ +- strcspn-sse2 strpbrk-sse2 strspn-sse2 \ +- strcspn-c strpbrk-c strspn-c varshift \ +- memset-avx512-no-vzeroupper \ +- memmove-sse2-unaligned-erms \ +- memmove-avx-unaligned-erms \ +- memmove-avx512-unaligned-erms \ +- memset-sse2-unaligned-erms \ +- memset-avx2-unaligned-erms \ +- memset-avx512-unaligned-erms \ +- memchr-avx2-rtm \ +- memcmp-avx2-movbe-rtm \ +- memmove-avx-unaligned-erms-rtm \ +- memrchr-avx2-rtm \ +- memset-avx2-unaligned-erms-rtm \ +- rawmemchr-avx2-rtm \ +- strchr-avx2-rtm \ +- strcmp-avx2-rtm \ +- strchrnul-avx2-rtm \ +- stpcpy-avx2-rtm \ +- stpncpy-avx2-rtm \ +- strcat-avx2-rtm \ +- strcpy-avx2-rtm \ +- strlen-avx2-rtm \ +- strncat-avx2-rtm \ +- strncmp-avx2-rtm \ +- strncpy-avx2-rtm \ +- strnlen-avx2-rtm \ +- strrchr-avx2-rtm \ +- memchr-evex \ +- memcmp-evex-movbe \ +- memmove-evex-unaligned-erms \ +- memrchr-evex \ +- memset-evex-unaligned-erms \ +- rawmemchr-evex \ +- stpcpy-evex \ +- stpncpy-evex \ +- strcat-evex \ +- strchr-evex \ +- strchrnul-evex \ +- strcmp-evex \ +- strcpy-evex \ +- strlen-evex \ +- strncat-evex \ +- strncmp-evex \ +- strncpy-evex \ +- strnlen-evex \ +- strrchr-evex \ +- memchr-evex-rtm \ +- rawmemchr-evex-rtm ++sysdep_routines += \ ++ bzero \ ++ memchr-avx2 \ ++ memchr-avx2-rtm \ ++ memchr-evex \ ++ memchr-evex-rtm \ ++ memchr-sse2 \ ++ memcmp-avx2-movbe \ ++ memcmp-avx2-movbe-rtm \ ++ memcmp-evex-movbe \ ++ memcmp-sse2 \ ++ memcmp-sse4 \ ++ memcmp-ssse3 \ ++ memcpy-ssse3 \ ++ memcpy-ssse3-back \ ++ memmove-avx-unaligned-erms \ ++ memmove-avx-unaligned-erms-rtm \ ++ memmove-avx512-no-vzeroupper \ ++ memmove-avx512-unaligned-erms \ ++ memmove-evex-unaligned-erms \ ++ memmove-sse2-unaligned-erms \ ++ memmove-ssse3 \ ++ memmove-ssse3-back \ ++ memrchr-avx2 \ ++ memrchr-avx2-rtm \ ++ memrchr-evex \ ++ memrchr-sse2 \ ++ memset-avx2-unaligned-erms \ ++ memset-avx2-unaligned-erms-rtm \ ++ memset-avx512-no-vzeroupper \ ++ memset-avx512-unaligned-erms \ ++ memset-evex-unaligned-erms \ ++ memset-sse2-unaligned-erms \ ++ rawmemchr-avx2 \ ++ rawmemchr-avx2-rtm \ ++ rawmemchr-evex \ ++ rawmemchr-evex-rtm \ ++ rawmemchr-sse2 \ ++ stpcpy-avx2 \ ++ stpcpy-avx2-rtm \ ++ stpcpy-evex \ ++ stpcpy-sse2 \ ++ stpcpy-sse2-unaligned \ ++ stpcpy-ssse3 \ ++ stpncpy-avx2 \ ++ stpncpy-avx2-rtm \ ++ stpncpy-c \ ++ stpncpy-evex \ ++ stpncpy-sse2-unaligned \ ++ stpncpy-ssse3 \ ++ strcasecmp_l-avx \ ++ strcasecmp_l-sse2 \ ++ strcasecmp_l-sse4_2 \ ++ strcasecmp_l-ssse3 \ ++ strcat-avx2 \ ++ strcat-avx2-rtm \ ++ strcat-evex \ ++ strcat-sse2 \ ++ strcat-sse2-unaligned \ ++ strcat-ssse3 \ ++ strchr-avx2 \ ++ strchr-avx2-rtm \ ++ strchr-evex \ ++ strchr-sse2 \ ++ strchr-sse2-no-bsf \ ++ strchrnul-avx2 \ ++ strchrnul-avx2-rtm \ ++ strchrnul-evex \ ++ strchrnul-sse2 \ ++ strcmp-avx2 \ ++ strcmp-avx2-rtm \ ++ strcmp-evex \ ++ strcmp-sse2 \ ++ strcmp-sse2-unaligned \ ++ strcmp-sse4_2 \ ++ strcmp-ssse3 \ ++ strcpy-avx2 \ ++ strcpy-avx2-rtm \ ++ strcpy-evex \ ++ strcpy-sse2 \ ++ strcpy-sse2-unaligned \ ++ strcpy-ssse3 \ ++ strcspn-c \ ++ strcspn-sse2 \ ++ strlen-avx2 \ ++ strlen-avx2-rtm \ ++ strlen-evex \ ++ strlen-sse2 \ ++ strncase_l-avx \ ++ strncase_l-sse2 \ ++ strncase_l-sse4_2 \ ++ strncase_l-ssse3 \ ++ strncat-avx2 \ ++ strncat-avx2-rtm \ ++ strncat-c \ ++ strncat-evex \ ++ strncat-sse2-unaligned \ ++ strncat-ssse3 \ ++ 
strncmp-avx2 \ ++ strncmp-avx2-rtm \ ++ strncmp-evex \ ++ strncmp-sse2 \ ++ strncmp-sse4_2 \ ++ strncmp-ssse3 \ ++ strncpy-avx2 \ ++ strncpy-avx2-rtm \ ++ strncpy-c \ ++ strncpy-evex \ ++ strncpy-sse2-unaligned \ ++ strncpy-ssse3 \ ++ strnlen-avx2 \ ++ strnlen-avx2-rtm \ ++ strnlen-evex \ ++ strnlen-sse2 \ ++ strpbrk-c \ ++ strpbrk-sse2 \ ++ strrchr-avx2 \ ++ strrchr-avx2-rtm \ ++ strrchr-evex \ ++ strrchr-sse2 \ ++ strspn-c \ ++ strspn-sse2 \ ++ strstr-sse2-unaligned \ ++ varshift \ ++# sysdep_routines + CFLAGS-varshift.c += -msse4 + CFLAGS-strcspn-c.c += -msse4 + CFLAGS-strpbrk-c.c += -msse4 +diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c +new file mode 100644 +index 00000000..58a14b2c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/bzero.c +@@ -0,0 +1,106 @@ ++/* Multiple versions of bzero. ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* Define multiple versions only for the definition in libc. */ ++#if IS_IN (libc) ++# define __bzero __redirect___bzero ++# include ++# undef __bzero ++ ++# define SYMBOL_NAME __bzero ++# include ++ ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms) ++ attribute_hidden; ++ ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++ const struct cpu_features* cpu_features = __get_cpu_features (); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx512_unaligned_erms); ++ ++ return OPTIMIZE1 (avx512_unaligned); ++ } ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 
(evex_unaligned_erms); ++ ++ return OPTIMIZE1 (evex_unaligned); ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx2_unaligned_erms_rtm); ++ ++ return OPTIMIZE1 (avx2_unaligned_rtm); ++ } ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx2_unaligned_erms); ++ ++ return OPTIMIZE1 (avx2_unaligned); ++ } ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (sse2_unaligned_erms); ++ ++ return OPTIMIZE1 (sse2_unaligned); ++} ++ ++libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ()); ++ ++weak_alias (__bzero, bzero) ++#endif +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 8be0d78a..c963d391 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -282,6 +282,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __memset_avx512_no_vzeroupper) + ) + ++ /* Support sysdeps/x86_64/multiarch/bzero.c. */ ++ IFUNC_IMPL (i, name, bzero, ++ IFUNC_IMPL_ADD (array, i, bzero, 1, ++ __bzero_sse2_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, 1, ++ __bzero_sse2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ CPU_FEATURE_USABLE (AVX2), ++ __bzero_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ CPU_FEATURE_USABLE (AVX2), ++ __bzero_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __bzero_avx2_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __bzero_avx2_unaligned_erms_rtm) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_evex_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_avx512_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_avx512_unaligned) ++ ) ++ + /* Support sysdeps/x86_64/multiarch/rawmemchr.c. 
*/ + IFUNC_IMPL (i, name, rawmemchr, + IFUNC_IMPL_ADD (array, i, rawmemchr, +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +index 8ac3e479..5a5ee6f6 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +@@ -5,6 +5,7 @@ + + #define SECTION(p) p##.avx.rtm + #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm ++#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm + #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm + + #include "memset-avx2-unaligned-erms.S" +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index c0bf2875..a093a283 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -14,6 +14,9 @@ + vmovd d, %xmm0; \ + movq r, %rax; + ++# define BZERO_ZERO_VEC0() \ ++ vpxor %xmm0, %xmm0, %xmm0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + MEMSET_SET_VEC0_AND_SET_RETURN(d, r) + +@@ -29,6 +32,9 @@ + # ifndef MEMSET_SYMBOL + # define MEMSET_SYMBOL(p,s) p##_avx2_##s + # endif ++# ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) p##_avx2_##s ++# endif + # ifndef WMEMSET_SYMBOL + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 5241216a..727c9213 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -19,6 +19,9 @@ + vpbroadcastb d, %VEC0; \ + movq r, %rax + ++# define BZERO_ZERO_VEC0() \ ++ vpxorq %XMM0, %XMM0, %XMM0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 63700215..5d8fa78f 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -19,6 +19,9 @@ + vpbroadcastb d, %VEC0; \ + movq r, %rax + ++# define BZERO_ZERO_VEC0() \ ++ vpxorq %XMM0, %XMM0, %XMM0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +index 56b81f5c..8f579ad6 100644 +--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +@@ -22,6 +22,7 @@ + + #if IS_IN (libc) + # define MEMSET_SYMBOL(p,s) p##_sse2_##s ++# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) + # define WMEMSET_SYMBOL(p,s) p##_sse2_##s + + # ifdef SHARED +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index a67f9833..06f5f5d7 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -26,6 +26,10 @@ + + #include + ++#ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) ++#endif ++ + #ifndef MEMSET_CHK_SYMBOL + # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) + #endif +@@ -87,6 +91,18 @@ + # define XMM_SMALL 0 + #endif + ++#ifdef USE_LESS_VEC_MASK_STORE ++# define SET_REG64 rcx ++# define SET_REG32 ecx ++# define SET_REG16 cx ++# define SET_REG8 cl ++#else ++# define SET_REG64 rsi ++# define 
SET_REG32 esi ++# define SET_REG16 si ++# define SET_REG8 sil ++#endif ++ + #define PAGE_SIZE 4096 + + /* Macro to calculate size of small memset block for aligning +@@ -96,18 +112,6 @@ + + #ifndef SECTION + # error SECTION is not defined! +-#endif +- +- .section SECTION(.text),"ax",@progbits +-#if VEC_SIZE == 16 && IS_IN (libc) +-ENTRY (__bzero) +- mov %RDI_LP, %RAX_LP /* Set return value. */ +- mov %RSI_LP, %RDX_LP /* Set n. */ +- xorl %esi, %esi +- pxor %XMM0, %XMM0 +- jmp L(entry_from_bzero) +-END (__bzero) +-weak_alias (__bzero, bzero) + #endif + + #if IS_IN (libc) +@@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + WMEMSET_VDUP_TO_VEC0_LOW() + cmpq $VEC_SIZE, %rdx +- jb L(less_vec_no_vdup) ++ jb L(less_vec_from_wmemset) + WMEMSET_VDUP_TO_VEC0_HIGH() + jmp L(entry_from_wmemset) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + ++ENTRY (BZERO_SYMBOL(__bzero, unaligned)) ++#if VEC_SIZE > 16 ++ BZERO_ZERO_VEC0 () ++#endif ++ mov %RDI_LP, %RAX_LP ++ mov %RSI_LP, %RDX_LP ++#ifndef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++#endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec_no_vdup) ++#ifdef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++#endif ++#if VEC_SIZE <= 16 ++ BZERO_ZERO_VEC0 () ++#endif ++ cmp $(VEC_SIZE * 2), %RDX_LP ++ ja L(more_2x_vec) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++END (BZERO_SYMBOL(__bzero, unaligned)) ++ + #if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + cmp %RDX_LP, %RCX_LP +@@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + /* Clear the upper 32 bits. */ + mov %edx, %edx + # endif +-L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + MEMSET_VDUP_TO_VEC0_HIGH() +@@ -187,6 +215,31 @@ END (__memset_erms) + END (MEMSET_SYMBOL (__memset, erms)) + # endif + ++ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6) ++# if VEC_SIZE > 16 ++ BZERO_ZERO_VEC0 () ++# endif ++ mov %RDI_LP, %RAX_LP ++ mov %RSI_LP, %RDX_LP ++# ifndef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++# endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec_no_vdup) ++# ifdef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++# endif ++# if VEC_SIZE <= 16 ++ BZERO_ZERO_VEC0 () ++# endif ++ cmp $(VEC_SIZE * 2), %RDX_LP ++ ja L(stosb_more_2x_vec) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++END (BZERO_SYMBOL(__bzero, unaligned_erms)) ++ + # if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + cmp %RDX_LP, %RCX_LP +@@ -229,6 +282,7 @@ L(last_2x_vec): + .p2align 4,, 10 + L(less_vec): + L(less_vec_no_vdup): ++L(less_vec_from_wmemset): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! +@@ -374,8 +428,11 @@ L(less_vec): + /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to + xmm). This is only does anything for AVX2. 
*/ + MEMSET_VDUP_TO_VEC0_LOW () ++L(less_vec_from_wmemset): ++#if VEC_SIZE > 16 + L(less_vec_no_vdup): + #endif ++#endif + L(cross_page): + #if VEC_SIZE > 32 + cmpl $32, %edx +@@ -386,7 +443,10 @@ L(cross_page): + jge L(between_16_31) + #endif + #ifndef USE_XMM_LESS_VEC +- MOVQ %XMM0, %rcx ++ MOVQ %XMM0, %SET_REG64 ++#endif ++#if VEC_SIZE <= 16 ++L(less_vec_no_vdup): + #endif + cmpl $8, %edx + jge L(between_8_15) +@@ -395,7 +455,7 @@ L(cross_page): + cmpl $1, %edx + jg L(between_2_3) + jl L(between_0_0) +- movb %sil, (%LESS_VEC_REG) ++ movb %SET_REG8, (%LESS_VEC_REG) + L(between_0_0): + ret + +@@ -428,8 +488,8 @@ L(between_8_15): + MOVQ %XMM0, (%rdi) + MOVQ %XMM0, -8(%rdi, %rdx) + #else +- movq %rcx, (%LESS_VEC_REG) +- movq %rcx, -8(%LESS_VEC_REG, %rdx) ++ movq %SET_REG64, (%LESS_VEC_REG) ++ movq %SET_REG64, -8(%LESS_VEC_REG, %rdx) + #endif + ret + +@@ -442,8 +502,8 @@ L(between_4_7): + MOVD %XMM0, (%rdi) + MOVD %XMM0, -4(%rdi, %rdx) + #else +- movl %ecx, (%LESS_VEC_REG) +- movl %ecx, -4(%LESS_VEC_REG, %rdx) ++ movl %SET_REG32, (%LESS_VEC_REG) ++ movl %SET_REG32, -4(%LESS_VEC_REG, %rdx) + #endif + ret + +@@ -452,12 +512,12 @@ L(between_4_7): + L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + #ifdef USE_XMM_LESS_VEC +- movb %sil, (%rdi) +- movb %sil, 1(%rdi) +- movb %sil, -1(%rdi, %rdx) ++ movb %SET_REG8, (%rdi) ++ movb %SET_REG8, 1(%rdi) ++ movb %SET_REG8, -1(%rdi, %rdx) + #else +- movw %cx, (%LESS_VEC_REG) +- movb %sil, -1(%LESS_VEC_REG, %rdx) ++ movw %SET_REG16, (%LESS_VEC_REG) ++ movb %SET_REG8, -1(%LESS_VEC_REG, %rdx) + #endif + ret + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-81.patch b/glibc-RHEL-15696-81.patch new file mode 100644 index 0000000..960a4cc --- /dev/null +++ b/glibc-RHEL-15696-81.patch @@ -0,0 +1,33 @@ +From 7912236f4a597deb092650ca79f33504ddb4af28 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sat, 12 Feb 2022 00:45:00 -0600 +Subject: [PATCH] x86: Set .text section in memset-vec-unaligned-erms +Content-type: text/plain; charset=UTF-8 + +commit 3d9f171bfb5325bd5f427e9fc386453358c6e840 +Author: H.J. Lu +Date: Mon Feb 7 05:55:15 2022 -0800 + + x86-64: Optimize bzero + +Remove setting the .text section for the code. This commit +adds that back. +--- + sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 06f5f5d7..4fb475c0 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -114,6 +114,7 @@ + # error SECTION is not defined! + #endif + ++ .section SECTION(.text), "ax", @progbits + #if IS_IN (libc) + # if defined SHARED + ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) +-- +GitLab + diff --git a/glibc-RHEL-15696-82.patch b/glibc-RHEL-15696-82.patch new file mode 100644 index 0000000..23ee46e --- /dev/null +++ b/glibc-RHEL-15696-82.patch @@ -0,0 +1,90 @@ +From e108c02a5e23c8c88ce66d8705d4a24bb6b9a8bf Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 15 Feb 2022 20:27:21 -0600 +Subject: [PATCH] x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] +Content-type: text/plain; charset=UTF-8 + +Logic can read before the start of `s1` / `s2` if both `s1` and `s2` +are near the start of a page. To avoid having the result contimated by +these comparisons the `strcmp` variants would mask off these +comparisons. 
This was missing in the `strncmp` variants causing +the bug. This commit adds the masking to `strncmp` so that out of +range comparisons don't affect the result. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass as +well a full xcheck on x86_64 linux. +Reviewed-by: H.J. Lu +--- + string/test-strncmp.c | 23 +++++++++++++++++++++++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 1 + + sysdeps/x86_64/multiarch/strcmp-evex.S | 1 + + 3 files changed, 25 insertions(+) + +diff --git a/string/test-strncmp.c b/string/test-strncmp.c +index 927a6daa..e61fffd9 100644 +--- a/string/test-strncmp.c ++++ b/string/test-strncmp.c +@@ -403,6 +403,28 @@ check2 (void) + free (s2); + } + ++static void ++check4 (void) ++{ ++ /* To trigger bug 28895; We need 1) both s1 and s2 to be within 32 bytes of ++ the end of the page. 2) For there to be no mismatch/null byte before the ++ first page cross. 3) For length (`n`) to be large enough for one string to ++ cross the page. And 4) for there to be either mismatch/null bytes before ++ the start of the strings. */ ++ ++ size_t size = 10; ++ size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1); ++ CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa)); ++ CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed)); ++ int exp_result; ++ ++ STRCPY (s1, L ("tst-tlsmod%")); ++ STRCPY (s2, L ("tst-tls-manydynamic73mod")); ++ exp_result = SIMPLE_STRNCMP (s1, s2, size); ++ FOR_EACH_IMPL (impl, 0) ++ check_result (impl, s1, s2, size, exp_result); ++} ++ + static void + check3 (void) + { +@@ -445,6 +467,7 @@ test_main (void) + check1 (); + check2 (); + check3 (); ++ check4 (); + + printf ("%23s", ""); + FOR_EACH_IMPL (impl, 0) +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 04675aa4..179cc0e3 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -661,6 +661,7 @@ L(ret8): + # ifdef USE_AS_STRNCMP + .p2align 4,, 10 + L(return_page_cross_end_check): ++ andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx), %ecx + cmpl %ecx, %edx +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index ed56af8e..0dfa62bd 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -689,6 +689,7 @@ L(ret8): + # ifdef USE_AS_STRNCMP + .p2align 4,, 10 + L(return_page_cross_end_check): ++ andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx + # ifdef USE_AS_WCSCMP +-- +GitLab + diff --git a/glibc-RHEL-15696-83.patch b/glibc-RHEL-15696-83.patch new file mode 100644 index 0000000..e7475a8 --- /dev/null +++ b/glibc-RHEL-15696-83.patch @@ -0,0 +1,77 @@ +From 9fef7039a7d04947bc89296ee0d187bc8d89b772 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 15:50:33 -0500 +Subject: [PATCH] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ + #28896] +Content-type: text/plain; charset=UTF-8 + +Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not +__wcscmp_avx2. + +commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87 +Author: Noah Goldstein +Date: Sun Jan 9 16:02:21 2022 -0600 + + x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755] + +Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set +to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which +can cause spurious aborts. + +This change will need to be backported. + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86/tst-strncmp-rtm.c | 15 +++++++++++++++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +- + 2 files changed, 16 insertions(+), 1 deletion(-) + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index aef9866c..ba6543be 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -70,6 +70,16 @@ function_overflow (void) + return 1; + } + ++__attribute__ ((noinline, noclone)) ++static int ++function_overflow2 (void) ++{ ++ if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0) ++ return 0; ++ else ++ return 1; ++} ++ + static int + do_test (void) + { +@@ -77,5 +87,10 @@ do_test (void) + if (status != EXIT_SUCCESS) + return status; + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); ++ if (status != EXIT_SUCCESS) ++ return status; ++ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2); ++ if (status != EXIT_SUCCESS) ++ return status; + return status; + } +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 179cc0e3..782f9472 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -122,7 +122,7 @@ ENTRY(STRCMP) + are cases where length is large enough that it can never be a + bound on valid memory so just use wcscmp. */ + shrq $56, %rcx +- jnz __wcscmp_avx2 ++ jnz OVERFLOW_STRCMP + + leaq (, %rdx, 4), %rdx + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-84.patch b/glibc-RHEL-15696-84.patch new file mode 100644 index 0000000..e998eff --- /dev/null +++ b/glibc-RHEL-15696-84.patch @@ -0,0 +1,27 @@ +From 1283948f236f209b7d3f44b69a42b96806fa6da0 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sat, 5 Feb 2022 11:06:01 -0800 +Subject: [PATCH] x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ)) +Content-type: text/plain; charset=UTF-8 + +--- + sysdeps/x86/sysdep.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index a70bb3a2..49b0efe2 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -111,7 +111,8 @@ enum cf_protection_level + /* Local label name for asm code. */ + #ifndef L + /* ELF-like local names start with `.L'. */ +-# define L(name) .L##name ++# define LOCAL_LABEL(name) .L##name ++# define L(name) LOCAL_LABEL(name) + #endif + + #define atom_text_section .section ".text.atom", "ax" +-- +GitLab + diff --git a/glibc-RHEL-15696-85.patch b/glibc-RHEL-15696-85.patch new file mode 100644 index 0000000..18f8a47 --- /dev/null +++ b/glibc-RHEL-15696-85.patch @@ -0,0 +1,108 @@ +From c328d0152d4b14cca58407ec68143894c8863004 Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Sat, 5 Feb 2022 11:52:33 -0800 +Subject: [PATCH] x86_64/multiarch: Sort sysdep_routines and put one entry per + line +Content-type: text/plain; charset=UTF-8 + +Conflicts: + sysdeps/x86_64/multiarch/Makefile + (test order changed) + +--- + sysdeps/x86_64/multiarch/Makefile | 78 +++++++++++++++++++------------ + 1 file changed, 48 insertions(+), 30 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 37d8d6f0..8c9e7812 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -132,37 +132,55 @@ CFLAGS-strspn-c.c += -msse4 + endif + + ifeq ($(subdir),wcsmbs) +-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ +- wmemcmp-avx2-movbe \ +- wmemchr-sse2 wmemchr-avx2 \ +- wcscmp-sse2 wcscmp-avx2 \ +- wcsncmp-sse2 wcsncmp-avx2 \ +- wcscpy-ssse3 wcscpy-c \ +- wcschr-sse2 wcschr-avx2 \ +- wcsrchr-sse2 wcsrchr-avx2 \ +- wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \ +- wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \ +- wcschr-avx2-rtm \ +- wcscmp-avx2-rtm \ +- wcslen-avx2-rtm \ +- wcsncmp-avx2-rtm \ +- wcsnlen-avx2-rtm \ +- wcsrchr-avx2-rtm \ +- wmemchr-avx2-rtm \ +- wmemcmp-avx2-movbe-rtm \ +- wcschr-evex \ +- wcscmp-evex \ +- wcslen-evex \ +- wcsncmp-evex \ +- wcsnlen-evex \ +- wcsrchr-evex \ +- wmemchr-evex \ +- wmemcmp-evex-movbe \ +- wmemchr-evex-rtm ++sysdep_routines += \ ++ wcschr-avx2 \ ++ wcschr-avx2-rtm \ ++ wcschr-evex \ ++ wcschr-sse2 \ ++ wcscmp-avx2 \ ++ wcscmp-avx2-rtm \ ++ wcscmp-evex \ ++ wcscmp-sse2 \ ++ wcscpy-c \ ++ wcscpy-ssse3 \ ++ wcslen-avx2 \ ++ wcslen-avx2-rtm \ ++ wcslen-evex \ ++ wcslen-sse2 \ ++ wcslen-sse4_1 \ ++ wcsncmp-avx2 \ ++ wcsncmp-avx2-rtm \ ++ wcsncmp-evex \ ++ wcsncmp-sse2 \ ++ wcsnlen-avx2 \ ++ wcsnlen-avx2-rtm \ ++ wcsnlen-c \ ++ wcsnlen-evex \ ++ wcsnlen-sse4_1 \ ++ wcsrchr-avx2 \ ++ wcsrchr-avx2-rtm \ ++ wcsrchr-evex \ ++ wcsrchr-sse2 \ ++ wmemchr-avx2 \ ++ wmemchr-avx2-rtm \ ++ wmemchr-evex \ ++ wmemchr-evex-rtm \ ++ wmemchr-sse2 \ ++ wmemcmp-avx2-movbe \ ++ wmemcmp-avx2-movbe-rtm \ ++ wmemcmp-c \ ++ wmemcmp-evex-movbe \ ++ wmemcmp-sse4 \ ++ wmemcmp-ssse3 \ ++# sysdep_routines + endif + + ifeq ($(subdir),debug) +-sysdep_routines += memcpy_chk-nonshared mempcpy_chk-nonshared \ +- memmove_chk-nonshared memset_chk-nonshared \ +- wmemset_chk-nonshared ++sysdep_routines += \ ++ memcpy_chk-nonshared \ ++ memmove_chk-nonshared \ ++ mempcpy_chk-nonshared \ ++ memset_chk-nonshared \ ++ wmemset_chk-nonshared \ ++# sysdep_routines + endif +-- +GitLab + diff --git a/glibc-RHEL-15696-86.patch b/glibc-RHEL-15696-86.patch new file mode 100644 index 0000000..d4fb42f --- /dev/null +++ b/glibc-RHEL-15696-86.patch @@ -0,0 +1,36 @@ +From 0fb8800029d230b3711bf722b2a47db92d0e273f Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 10 Feb 2022 11:52:50 -0800 +Subject: [PATCH] x86-64: Remove bzero weak alias in SS2 memset +Content-type: text/plain; charset=UTF-8 + +commit 3d9f171bfb5325bd5f427e9fc386453358c6e840 +Author: H.J. Lu +Date: Mon Feb 7 05:55:15 2022 -0800 + + x86-64: Optimize bzero + +added the optimized bzero. Remove bzero weak alias in SS2 memset to +avoid undefined __bzero in memset-sse2-unaligned-erms. 
+--- + sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +index 8f579ad6..af51362b 100644 +--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +@@ -31,9 +31,7 @@ + # endif + + # undef weak_alias +-# define weak_alias(original, alias) \ +- .weak bzero; bzero = __bzero +- ++# define weak_alias(original, alias) + # undef strong_alias + # define strong_alias(ignored1, ignored2) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-87.patch b/glibc-RHEL-15696-87.patch new file mode 100644 index 0000000..4882613 --- /dev/null +++ b/glibc-RHEL-15696-87.patch @@ -0,0 +1,29 @@ +From bf92893a14ebc161b08b28acc24fa06ae6be19cb Mon Sep 17 00:00:00 2001 +From: Adhemerval Zanella +Date: Thu, 10 Feb 2022 11:23:24 -0300 +Subject: [PATCH] x86_64: Remove bcopy optimizations +Content-type: text/plain; charset=UTF-8 + +The symbols is not present in current POSIX specification and compiler +already generates memmove call. +--- + sysdeps/x86_64/multiarch/bcopy.S | 7 ------- + 1 file changed, 7 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/bcopy.S + +diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S +deleted file mode 100644 +index 639f02bd..00000000 +--- a/sysdeps/x86_64/multiarch/bcopy.S ++++ /dev/null +@@ -1,7 +0,0 @@ +-#include +- +- .text +-ENTRY(bcopy) +- xchg %rdi, %rsi +- jmp __libc_memmove /* Branch to IFUNC memmove. */ +-END(bcopy) +-- +GitLab + diff --git a/glibc-RHEL-15696-88.patch b/glibc-RHEL-15696-88.patch new file mode 100644 index 0000000..d075f80 --- /dev/null +++ b/glibc-RHEL-15696-88.patch @@ -0,0 +1,372 @@ +From a6fbf4d51e9ba8063c4f8331564892ead9c67344 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:16 -0500 +Subject: [PATCH] x86: Code cleanup in strchr-avx2 and comment justifying + branch +Content-type: text/plain; charset=UTF-8 + +Small code cleanup for size: -53 bytes. + +Add comment justifying using a branch to do NULL/non-null return. + +All string/memory tests pass and no regressions in benchtests. + +geometric_mean(N=20) of all benchmarks Original / New: 1.00 +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------ + 1 file changed, 107 insertions(+), 97 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index 5884726b..89dd2bf7 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -48,13 +48,13 @@ + # define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits +-ENTRY (STRCHR) ++ENTRY_P2ALIGN (STRCHR, 5) + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + VPBROADCAST %xmm0, %ymm0 +- vpxor %xmm9, %xmm9, %xmm9 ++ vpxor %xmm1, %xmm1, %xmm1 + + /* Check if we cross page boundary with one vector load. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax +@@ -62,37 +62,29 @@ ENTRY (STRCHR) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null byte. 
*/ +- vmovdqu (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqu (%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (%rdi, %rax), %CHAR_REG +- jne L(zero) +-# endif +- addq %rdi, %rax +- VZEROUPPER_RETURN +- +- /* .p2align 5 helps keep performance more consistent if ENTRY() +- alignment % 32 was either 16 or 0. As well this makes the +- alignment % 32 of the loop_4x_vec fixed which makes tuning it +- easier. */ +- .p2align 5 +-L(first_vec_x4): +- tzcntl %eax, %eax +- addq $(VEC_SIZE * 3 + 1), %rdi +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ ++ /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG ++ /* NB: Use a branch instead of cmovcc here. The expectation is ++ that with strchr the user will branch based on input being ++ null. Since this branch will be 100% predictive of the user ++ branch a branch miss here should save what otherwise would ++ be branch miss in the user code. Otherwise using a branch 1) ++ saves code size and 2) is faster in highly predictable ++ environments. */ + jne L(zero) + # endif + addq %rdi, %rax +- VZEROUPPER_RETURN ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + # ifndef USE_AS_STRCHRNUL + L(zero): +@@ -103,7 +95,8 @@ L(zero): + + .p2align 4 + L(first_vec_x1): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + incq %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -113,9 +106,10 @@ L(first_vec_x1): + addq %rdi, %rax + VZEROUPPER_RETURN + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x2): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + addq $(VEC_SIZE + 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -125,9 +119,10 @@ L(first_vec_x2): + addq %rdi, %rax + VZEROUPPER_RETURN + +- .p2align 4 ++ .p2align 4,, 8 + L(first_vec_x3): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + addq $(VEC_SIZE * 2 + 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -137,6 +132,21 @@ L(first_vec_x3): + addq %rdi, %rax + VZEROUPPER_RETURN + ++ .p2align 4,, 10 ++L(first_vec_x4): ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax ++ addq $(VEC_SIZE * 3 + 1), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ ++ + .p2align 4 + L(aligned_more): + /* Align data to VEC_SIZE - 1. This is the same number of +@@ -146,90 +156,92 @@ L(aligned_more): + L(cross_page_continue): + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. 
*/ +- vmovdqa 1(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa 1(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x1) + +- vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x4) +- /* Align data to VEC_SIZE * 4 - 1. */ +- addq $(VEC_SIZE * 4 + 1), %rdi +- andq $-(VEC_SIZE * 4), %rdi ++ /* Align data to VEC_SIZE * 4 - 1. */ ++ incq %rdi ++ orq $(VEC_SIZE * 4 - 1), %rdi + .p2align 4 + L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ +- vmovdqa (%rdi), %ymm5 +- vmovdqa (VEC_SIZE)(%rdi), %ymm6 +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 ++ vmovdqa 1(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7 + + /* Leaves only CHARS matching esi as 0. */ +- vpxor %ymm5, %ymm0, %ymm1 + vpxor %ymm6, %ymm0, %ymm2 + vpxor %ymm7, %ymm0, %ymm3 +- vpxor %ymm8, %ymm0, %ymm4 + +- VPMINU %ymm1, %ymm5, %ymm1 + VPMINU %ymm2, %ymm6, %ymm2 + VPMINU %ymm3, %ymm7, %ymm3 +- VPMINU %ymm4, %ymm8, %ymm4 + +- VPMINU %ymm1, %ymm2, %ymm5 +- VPMINU %ymm3, %ymm4, %ymm6 ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7 ++ ++ vpxor %ymm6, %ymm0, %ymm4 ++ vpxor %ymm7, %ymm0, %ymm5 ++ ++ VPMINU %ymm4, %ymm6, %ymm4 ++ VPMINU %ymm5, %ymm7, %ymm5 + +- VPMINU %ymm5, %ymm6, %ymm6 ++ VPMINU %ymm2, %ymm3, %ymm6 ++ VPMINU %ymm4, %ymm5, %ymm7 + +- VPCMPEQ %ymm6, %ymm9, %ymm6 +- vpmovmskb %ymm6, %ecx ++ VPMINU %ymm6, %ymm7, %ymm7 ++ ++ VPCMPEQ %ymm7, %ymm1, %ymm7 ++ vpmovmskb %ymm7, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) + +- +- VPCMPEQ %ymm1, %ymm9, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + +- VPCMPEQ %ymm5, %ymm9, %ymm2 +- vpmovmskb %ymm2, %eax ++ VPCMPEQ %ymm3, %ymm1, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(last_vec_x1) + +- VPCMPEQ %ymm3, %ymm9, %ymm3 +- vpmovmskb %ymm3, %eax ++ VPCMPEQ %ymm4, %ymm1, %ymm4 ++ vpmovmskb %ymm4, %eax + /* rcx has combined result from all 4 VEC. It will only be used + if the first 3 other VEC all did not contain a match. */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax +- subq $(VEC_SIZE * 2), %rdi ++ subq $(VEC_SIZE * 2 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. 
*/ + cmp (%rdi, %rax), %CHAR_REG +@@ -239,10 +251,11 @@ L(loop_4x_vec): + VZEROUPPER_RETURN + + +- .p2align 4 ++ .p2align 4,, 10 + L(last_vec_x0): +- tzcntl %eax, %eax +- addq $-(VEC_SIZE * 4), %rdi ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax ++ addq $-(VEC_SIZE * 4 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -251,16 +264,11 @@ L(last_vec_x0): + addq %rdi, %rax + VZEROUPPER_RETURN + +-# ifndef USE_AS_STRCHRNUL +-L(zero_end): +- xorl %eax, %eax +- VZEROUPPER_RETURN +-# endif + +- .p2align 4 ++ .p2align 4,, 10 + L(last_vec_x1): + tzcntl %eax, %eax +- subq $(VEC_SIZE * 3), %rdi ++ subq $(VEC_SIZE * 3 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -269,18 +277,23 @@ L(last_vec_x1): + addq %rdi, %rax + VZEROUPPER_RETURN + ++# ifndef USE_AS_STRCHRNUL ++L(zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN ++# endif + + /* Cold case for crossing page with first load. */ +- .p2align 4 ++ .p2align 4,, 8 + L(cross_page_boundary): + movq %rdi, %rdx + /* Align rdi to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi +- vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod edx. */ + sarxl %edx, %eax, %eax +@@ -291,13 +304,10 @@ L(cross_page_boundary): + xorl %ecx, %ecx + /* Found CHAR or the null byte. */ + cmp (%rdx, %rax), %CHAR_REG +- leaq (%rdx, %rax), %rax +- cmovne %rcx, %rax +-# else +- addq %rdx, %rax ++ jne L(zero_end) + # endif +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ addq %rdx, %rax ++ VZEROUPPER_RETURN + + END (STRCHR) +-# endif ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-89.patch b/glibc-RHEL-15696-89.patch new file mode 100644 index 0000000..45ee946 --- /dev/null +++ b/glibc-RHEL-15696-89.patch @@ -0,0 +1,343 @@ +From ec285ea90415458225623ddc0492ae3f705af043 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:18 -0500 +Subject: [PATCH] x86: Code cleanup in strchr-evex and comment justifying + branch +Content-type: text/plain; charset=UTF-8 + +Small code cleanup for size: -81 bytes. + +Add comment justifying using a branch to do NULL/non-null return. + +All string/memory tests pass and no regressions in benchtests. + +geometric_mean(N=20) of all benchmarks New / Original: .985 +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++----------- + 1 file changed, 80 insertions(+), 66 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S +index 7f9d4ee4..0b49e0ac 100644 +--- a/sysdeps/x86_64/multiarch/strchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strchr-evex.S +@@ -30,6 +30,7 @@ + # ifdef USE_AS_WCSCHR + # define VPBROADCAST vpbroadcastd + # define VPCMP vpcmpd ++# define VPTESTN vptestnmd + # define VPMINU vpminud + # define CHAR_REG esi + # define SHIFT_REG ecx +@@ -37,6 +38,7 @@ + # else + # define VPBROADCAST vpbroadcastb + # define VPCMP vpcmpb ++# define VPTESTN vptestnmb + # define VPMINU vpminub + # define CHAR_REG sil + # define SHIFT_REG edx +@@ -61,13 +63,11 @@ + # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section .text.evex,"ax",@progbits +-ENTRY (STRCHR) ++ENTRY_P2ALIGN (STRCHR, 5) + /* Broadcast CHAR to YMM0. */ + VPBROADCAST %esi, %YMM0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- + /* Check if we cross page boundary with one vector load. + Otherwise it is safe to use an unaligned load. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax +@@ -81,49 +81,35 @@ ENTRY (STRCHR) + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ /* NB: Use a branch instead of cmovcc here. The expectation is ++ that with strchr the user will branch based on input being ++ null. Since this branch will be 100% predictive of the user ++ branch a branch miss here should save what otherwise would ++ be branch miss in the user code. Otherwise using a branch 1) ++ saves code size and 2) is faster in highly predictable ++ environments. */ ++ jne L(zero) ++# endif + # ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. + */ + leaq (%rdi, %rax, CHAR_SIZE), %rax + # else + addq %rdi, %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (%rax), %CHAR_REG +- jne L(zero) + # endif + ret + +- /* .p2align 5 helps keep performance more consistent if ENTRY() +- alignment % 32 was either 16 or 0. As well this makes the +- alignment % 32 of the loop_4x_vec fixed which makes tuning it +- easier. */ +- .p2align 5 +-L(first_vec_x3): +- tzcntl %eax, %eax +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +- jne L(zero) +-# endif +- /* NB: Multiply sizeof char type (1 or 4) to get the number of +- bytes. */ +- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax +- ret + +-# ifndef USE_AS_STRCHRNUL +-L(zero): +- xorl %eax, %eax +- ret +-# endif + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x4): + # ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ +@@ -144,9 +130,18 @@ L(first_vec_x4): + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + ret + ++# ifndef USE_AS_STRCHRNUL ++L(zero): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ + .p2align 4 + L(first_vec_x1): +- tzcntl %eax, %eax ++ /* Use bsf here to save 1-byte keeping keeping the block in 1x ++ fetch block. eax guranteed non-zero. */ ++ bsfl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. 
*/ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +@@ -158,7 +153,7 @@ L(first_vec_x1): + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x2): + # ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ +@@ -179,6 +174,21 @@ L(first_vec_x2): + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + ++ .p2align 4,, 10 ++L(first_vec_x3): ++ /* Use bsf here to save 1-byte keeping keeping the block in 1x ++ fetch block. eax guranteed non-zero. */ ++ bsfl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero) ++# endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ + .p2align 4 + L(aligned_more): + /* Align data to VEC_SIZE. */ +@@ -195,7 +205,7 @@ L(cross_page_continue): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) +@@ -206,7 +216,7 @@ L(cross_page_continue): + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMM1, %YMMZERO, %k1 ++ VPTESTN %YMM1, %YMM1, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x2) + +@@ -215,7 +225,7 @@ L(cross_page_continue): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) +@@ -224,7 +234,7 @@ L(cross_page_continue): + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMM1, %YMMZERO, %k1 ++ VPTESTN %YMM1, %YMM1, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x4) + +@@ -265,33 +275,33 @@ L(loop_4x_vec): + VPMINU %YMM3, %YMM4, %YMM4 + VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} + +- VPCMP $0, %YMMZERO, %YMM4, %k1 ++ VPTESTN %YMM4, %YMM4, %k1 + kmovd %k1, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) + +- VPCMP $0, %YMMZERO, %YMM1, %k0 ++ VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x1) + +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x2) + +- VPCMP $0, %YMMZERO, %YMM3, %k0 ++ VPTESTN %YMM3, %YMM3, %k0 + kmovd %k0, %eax + /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ + # ifdef USE_AS_WCSCHR + sall $8, %ecx + orl %ecx, %eax +- tzcntl %eax, %eax ++ bsfl %eax, %eax + # else + salq $32, %rcx + orq %rcx, %rax +- tzcntq %rax, %rax ++ bsfq %rax, %rax + # endif + # ifndef USE_AS_STRCHRNUL + /* Check if match was CHAR or null. */ +@@ -303,28 +313,28 @@ L(loop_4x_vec): + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +-# ifndef USE_AS_STRCHRNUL +-L(zero_end): +- xorl %eax, %eax +- ret ++ .p2align 4,, 8 ++L(last_vec_x1): ++ bsfl %eax, %eax ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. ++ */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rdi, %rax + # endif + +- .p2align 4 +-L(last_vec_x1): +- tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Check if match was null. 
*/ +- cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ cmp (%rax), %CHAR_REG + jne L(zero_end) + # endif +- /* NB: Multiply sizeof char type (1 or 4) to get the number of +- bytes. */ +- leaq (%rdi, %rax, CHAR_SIZE), %rax ++ + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(last_vec_x2): +- tzcntl %eax, %eax ++ bsfl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Check if match was null. */ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +@@ -336,7 +346,7 @@ L(last_vec_x2): + ret + + /* Cold case for crossing page with first load. */ +- .p2align 4 ++ .p2align 4,, 8 + L(cross_page_boundary): + movq %rdi, %rdx + /* Align rdi. */ +@@ -346,9 +356,9 @@ L(cross_page_boundary): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax +- /* Remove the leading bits. */ ++ /* Remove the leading bits. */ + # ifdef USE_AS_WCSCHR + movl %edx, %SHIFT_REG + /* NB: Divide shift count by 4 since each bit in K1 represent 4 +@@ -360,20 +370,24 @@ L(cross_page_boundary): + /* If eax is zero continue. */ + testl %eax, %eax + jz L(cross_page_continue) +- tzcntl %eax, %eax +-# ifndef USE_AS_STRCHRNUL +- /* Check to see if match was CHAR or null. */ +- cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG +- jne L(zero_end) +-# endif ++ bsfl %eax, %eax ++ + # ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of + bytes. */ + leaq (%rdx, %rax, CHAR_SIZE), %rax + # else + addq %rdx, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if match was CHAR or null. */ ++ cmp (%rax), %CHAR_REG ++ je L(cross_page_ret) ++L(zero_end): ++ xorl %eax, %eax ++L(cross_page_ret): + # endif + ret + + END (STRCHR) +-# endif ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-9.patch b/glibc-RHEL-15696-9.patch new file mode 100644 index 0000000..5aa3e7b --- /dev/null +++ b/glibc-RHEL-15696-9.patch @@ -0,0 +1,206 @@ +From 3f635fb43389b54f682fc9ed2acc0b2aaf4a923d Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 4 Feb 2019 06:31:01 -0800 +Subject: [PATCH] x86-64 memcmp: Use unsigned Jcc instructions on size [BZ + #24155] +Content-type: text/plain; charset=UTF-8 + +Since the size argument is unsigned. we should use unsigned Jcc +instructions, instead of signed, to check size. + +Tested on x86-64 and x32, with and without --disable-multi-arch. + + [BZ #24155] + CVE-2019-7309 + * NEWS: Updated for CVE-2019-7309. + * sysdeps/x86_64/memcmp.S: Use RDX_LP for size. Clear the + upper 32 bits of RDX register for x32. Use unsigned Jcc + instructions, instead of signed. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp-2. + * sysdeps/x86_64/x32/tst-size_t-memcmp-2.c: New test. +--- + sysdeps/x86_64/memcmp.S | 20 +++--- + sysdeps/x86_64/x32/Makefile | 3 +- + sysdeps/x86_64/x32/tst-size_t-memcmp-2.c | 79 ++++++++++++++++++++++++ + 3 files changed, 93 insertions(+), 9 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp-2.c + +Conflics: + ChangeLog + (removed) + NEWS + (removed) + +diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S +index bcb4a2e8..45918d37 100644 +--- a/sysdeps/x86_64/memcmp.S ++++ b/sysdeps/x86_64/memcmp.S +@@ -21,14 +21,18 @@ + + .text + ENTRY (memcmp) +- test %rdx, %rdx ++#ifdef __ILP32__ ++ /* Clear the upper 32 bits. 
*/ ++ movl %edx, %edx ++#endif ++ test %RDX_LP, %RDX_LP + jz L(finz) + cmpq $1, %rdx +- jle L(finr1b) ++ jbe L(finr1b) + subq %rdi, %rsi + movq %rdx, %r10 + cmpq $32, %r10 +- jge L(gt32) ++ jae L(gt32) + /* Handle small chunks and last block of less than 32 bytes. */ + L(small): + testq $1, %r10 +@@ -156,7 +160,7 @@ L(A32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi +- jge L(mt16) ++ jae L(mt16) + /* Pre-unroll to be ready for unrolled 64B loop. */ + testq $32, %rdi + jz L(A64) +@@ -178,7 +182,7 @@ L(A64): + movq %r11, %r10 + andq $-64, %r10 + cmpq %r10, %rdi +- jge L(mt32) ++ jae L(mt32) + + L(A64main): + movdqu (%rdi,%rsi), %xmm0 +@@ -216,7 +220,7 @@ L(mt32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi +- jge L(mt16) ++ jae L(mt16) + + L(A32main): + movdqu (%rdi,%rsi), %xmm0 +@@ -254,7 +258,7 @@ L(ATR): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi +- jge L(mt16) ++ jae L(mt16) + testq $16, %rdi + jz L(ATR32) + +@@ -325,7 +329,7 @@ L(ATR64main): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi +- jge L(mt16) ++ jae L(mt16) + + L(ATR32res): + movdqa (%rdi,%rsi), %xmm0 +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 1557724b..87489565 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -8,7 +8,8 @@ endif + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ + tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ +- tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen ++ tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen \ ++ tst-size_t-memcmp-2 + endif + + ifeq ($(subdir),wcsmbs) +diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c +new file mode 100644 +index 00000000..d8ae1a08 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c +@@ -0,0 +1,79 @@ ++/* Test memcmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#define TEST_MAIN ++#ifdef WIDE ++# define TEST_NAME "wmemcmp" ++#else ++# define TEST_NAME "memcmp" ++#endif ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++# include ++ ++# define MEMCMP wmemcmp ++# define CHAR wchar_t ++#else ++# define MEMCMP memcmp ++# define CHAR char ++#endif ++ ++IMPL (MEMCMP, 1) ++ ++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); ++ ++static int ++__attribute__ ((noinline, noclone)) ++do_memcmp (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ memcpy (buf1, buf2, page_size); ++ ++ CHAR *p = (CHAR *) buf1; ++ p[page_size / sizeof (CHAR) - 1] = (CHAR) 1; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ int res = do_memcmp (dest, src); ++ if (res >= 0) ++ { ++ error (0, 0, "Wrong result in function %s: %i >= 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +-- +GitLab + diff --git a/glibc-RHEL-15696-90.patch b/glibc-RHEL-15696-90.patch new file mode 100644 index 0000000..11835aa --- /dev/null +++ b/glibc-RHEL-15696-90.patch @@ -0,0 +1,147 @@ +From 30d627d477d7255345a4b713cf352ac32d644d61 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:22 -0500 +Subject: [PATCH] x86: Optimize strcspn and strpbrk in strcspn-c.c +Content-type: text/plain; charset=UTF-8 + +Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of +_mm_cmpistri. Also change offset to unsigned to avoid unnecessary +sign extensions. + +geometric_mean(N=20) of all benchmarks that dont fallback on +sse2/strlen; New / Original: .928 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++--------------- + 1 file changed, 37 insertions(+), 46 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c +index 857af104..6cce4296 100644 +--- a/sysdeps/x86_64/multiarch/strcspn-c.c ++++ b/sysdeps/x86_64/multiarch/strcspn-c.c +@@ -85,83 +85,74 @@ STRCSPN_SSE42 (const char *s, const char *a) + RETURN (NULL, strlen (s)); + + const char *aligned; +- __m128i mask; +- int offset = (int) ((size_t) a & 15); ++ __m128i mask, maskz, zero; ++ unsigned int maskz_bits; ++ unsigned int offset = (unsigned int) ((size_t) a & 15); ++ zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); +- +- mask = __m128i_shift_right (mask0, offset); ++ maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16 - offset) +- { +- /* There is no NULL terminator. */ +- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); +- int index = _mm_cmpistri (mask1, mask1, 0x3a); +- length += index; +- +- /* Don't use SSE4.2 if the length of A > 16. */ +- if (length > 16) +- return STRCSPN_SSE2 (s, a); +- +- if (index != 0) +- { +- /* Combine mask0 and mask1. We could play games with +- palignr, but frankly this data should be in L1 now +- so do the merge via an unaligned load. 
*/ +- mask = _mm_loadu_si128 ((__m128i *) a); +- } +- } ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) ++ { ++ mask = __m128i_shift_right (mask0, offset); ++ offset = (unsigned int) ((size_t) s & 15); ++ if (offset) ++ goto start_unaligned; ++ ++ aligned = s; ++ goto start_loop; ++ } + } +- else +- { +- /* A is aligned. */ +- mask = _mm_load_si128 ((__m128i *) a); + +- /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16) +- { +- /* There is no NULL terminator. Don't use SSE4.2 if the length +- of A > 16. */ +- if (a[16] != 0) +- return STRCSPN_SSE2 (s, a); +- } ++ /* A is aligned. */ ++ mask = _mm_loadu_si128 ((__m128i *) a); ++ /* Find where the NULL terminator is. */ ++ maskz = _mm_cmpeq_epi8 (mask, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz); ++ if (maskz_bits == 0) ++ { ++ /* There is no NULL terminator. Don't use SSE4.2 if the length ++ of A > 16. */ ++ if (a[16] != 0) ++ return STRCSPN_SSE2 (s, a); + } + +- offset = (int) ((size_t) s & 15); ++ aligned = s; ++ offset = (unsigned int) ((size_t) s & 15); + if (offset != 0) + { ++ start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + value = __m128i_shift_right (value, offset); + +- int length = _mm_cmpistri (mask, value, 0x2); ++ unsigned int length = _mm_cmpistri (mask, value, 0x2); + /* No need to check ZFlag since ZFlag is always 1. */ +- int cflag = _mm_cmpistrc (mask, value, 0x2); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); + if (cflag) + RETURN ((char *) (s + length), length); + /* Find where the NULL terminator is. */ +- int index = _mm_cmpistri (value, value, 0x3a); ++ unsigned int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + RETURN (NULL, index); + aligned += 16; + } +- else +- aligned = s; + ++start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); +- int index = _mm_cmpistri (mask, value, 0x2); +- int cflag = _mm_cmpistrc (mask, value, 0x2); +- int zflag = _mm_cmpistrz (mask, value, 0x2); ++ unsigned int index = _mm_cmpistri (mask, value, 0x2); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); ++ unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) +-- +GitLab + diff --git a/glibc-RHEL-15696-91.patch b/glibc-RHEL-15696-91.patch new file mode 100644 index 0000000..de3c8ec --- /dev/null +++ b/glibc-RHEL-15696-91.patch @@ -0,0 +1,147 @@ +From 412d10343168b05b8cf6c3683457cf9711d28046 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:24 -0500 +Subject: [PATCH] x86: Optimize strspn in strspn-c.c +Content-type: text/plain; charset=UTF-8 + +Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of +_mm_cmpistri. Also change offset to unsigned to avoid unnecessary +sign extensions. + +geometric_mean(N=20) of all benchmarks that dont fallback on +sse2; New / Original: .901 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++---------------- + 1 file changed, 39 insertions(+), 47 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c +index 4554cff0..87c5e4bf 100644 +--- a/sysdeps/x86_64/multiarch/strspn-c.c ++++ b/sysdeps/x86_64/multiarch/strspn-c.c +@@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a) + return 0; + + const char *aligned; +- __m128i mask; +- int offset = (int) ((size_t) a & 15); ++ __m128i mask, maskz, zero; ++ unsigned int maskz_bits; ++ unsigned int offset = (int) ((size_t) a & 15); ++ zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); +- +- mask = __m128i_shift_right (mask0, offset); ++ maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16 - offset) +- { +- /* There is no NULL terminator. */ +- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); +- int index = _mm_cmpistri (mask1, mask1, 0x3a); +- length += index; +- +- /* Don't use SSE4.2 if the length of A > 16. */ +- if (length > 16) +- return __strspn_sse2 (s, a); +- +- if (index != 0) +- { +- /* Combine mask0 and mask1. We could play games with +- palignr, but frankly this data should be in L1 now +- so do the merge via an unaligned load. */ +- mask = _mm_loadu_si128 ((__m128i *) a); +- } +- } ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) ++ { ++ mask = __m128i_shift_right (mask0, offset); ++ offset = (unsigned int) ((size_t) s & 15); ++ if (offset) ++ goto start_unaligned; ++ ++ aligned = s; ++ goto start_loop; ++ } + } +- else +- { +- /* A is aligned. */ +- mask = _mm_load_si128 ((__m128i *) a); + +- /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16) +- { +- /* There is no NULL terminator. Don't use SSE4.2 if the length +- of A > 16. */ +- if (a[16] != 0) +- return __strspn_sse2 (s, a); +- } ++ /* A is aligned. */ ++ mask = _mm_loadu_si128 ((__m128i *) a); ++ ++ /* Find where the NULL terminator is. */ ++ maskz = _mm_cmpeq_epi8 (mask, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz); ++ if (maskz_bits == 0) ++ { ++ /* There is no NULL terminator. Don't use SSE4.2 if the length ++ of A > 16. */ ++ if (a[16] != 0) ++ return __strspn_sse2 (s, a); + } ++ aligned = s; ++ offset = (unsigned int) ((size_t) s & 15); + +- offset = (int) ((size_t) s & 15); + if (offset != 0) + { ++ start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); ++ __m128i adj_value = __m128i_shift_right (value, offset); + +- value = __m128i_shift_right (value, offset); +- +- int length = _mm_cmpistri (mask, value, 0x12); ++ unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. 
*/ +- int index = _mm_cmpistri (value, value, 0x3a); +- if (index < 16 - offset) ++ maskz = _mm_cmpeq_epi8 (value, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) + return length; + aligned += 16; + } +- else +- aligned = s; + ++start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); +- int index = _mm_cmpistri (mask, value, 0x12); +- int cflag = _mm_cmpistrc (mask, value, 0x12); ++ unsigned int index = _mm_cmpistri (mask, value, 0x12); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; +-- +GitLab + diff --git a/glibc-RHEL-15696-92.patch b/glibc-RHEL-15696-92.patch new file mode 100644 index 0000000..f19914e --- /dev/null +++ b/glibc-RHEL-15696-92.patch @@ -0,0 +1,175 @@ +From fe28e7d9d9535ebab4081d195c553b4fbf39d9ae Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:26 -0500 +Subject: [PATCH] x86: Remove strcspn-sse2.S and use the generic implementation +Content-type: text/plain; charset=UTF-8 + +The generic implementation is faster. + +geometric_mean(N=20) of all benchmarks New / Original: .678 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + .../{strcspn-sse2.S => strcspn-sse2.c} | 6 +- + sysdeps/x86_64/strcspn.S | 122 ------------------ + 2 files changed, 3 insertions(+), 125 deletions(-) + rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strcspn-sse2.c} (89%) + delete mode 100644 sysdeps/x86_64/strcspn.S + +Conflicts: + sysdeps/x86_64/multiarch/strcspn-sse2.S + (copyright header) + +diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c +similarity index 89% +rename from sysdeps/x86_64/multiarch/strcspn-sse2.S +rename to sysdeps/x86_64/multiarch/strcspn-sse2.c +index 8a0c69d7..32debee4 100644 +--- a/sysdeps/x86_64/multiarch/strcspn-sse2.S ++++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c +@@ -19,10 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strcspn __strcspn_sse2 ++# define STRCSPN __strcspn_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strcspn) ++# define libc_hidden_builtin_def(STRCSPN) + #endif + +-#include ++#include +diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S +deleted file mode 100644 +index 7f9202d6..00000000 +--- a/sysdeps/x86_64/strcspn.S ++++ /dev/null +@@ -1,122 +0,0 @@ +-/* strcspn (str, ss) -- Return the length of the initial segment of STR +- which contains no characters from SS. +- For AMD x86-64. +- Copyright (C) 1994-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- Contributed by Ulrich Drepper . +- Bug fixes by Alan Modra . +- Adopted for x86-64 by Andreas Jaeger . +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +-#include "asm-syntax.h" +- +- .text +-ENTRY (strcspn) +- +- movq %rdi, %rdx /* Save SRC. 
*/ +- +- /* First we create a table with flags for all possible characters. +- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are +- supported by the C string functions we have 256 characters. +- Before inserting marks for the stop characters we clear the whole +- table. */ +- movq %rdi, %r8 /* Save value. */ +- subq $256, %rsp /* Make space for 256 bytes. */ +- cfi_adjust_cfa_offset(256) +- movl $32, %ecx /* 32*8 bytes = 256 bytes. */ +- movq %rsp, %rdi +- xorl %eax, %eax /* We store 0s. */ +- cld +- rep +- stosq +- +- movq %rsi, %rax /* Setup skipset. */ +- +-/* For understanding the following code remember that %rcx == 0 now. +- Although all the following instruction only modify %cl we always +- have a correct zero-extended 64-bit value in %rcx. */ +- +- .p2align 4 +-L(2): movb (%rax), %cl /* get byte from skipset */ +- testb %cl, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 1(%rax), %cl /* get byte from skipset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 2(%rax), %cl /* get byte from skipset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 3(%rax), %cl /* get byte from skipset */ +- addq $4, %rax /* increment skipset pointer */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- testb $0xff, %cl /* is NUL char? */ +- jnz L(2) /* no => process next dword from skipset */ +- +-L(1): leaq -4(%rdx), %rax /* prepare loop */ +- +- /* We use a neat trick for the following loop. Normally we would +- have to test for two termination conditions +- 1. a character in the skipset was found +- and +- 2. the end of the string was found +- But as a sign that the character is in the skipset we store its +- value in the table. But the value of NUL is NUL so the loop +- terminates for NUL in every case. */ +- +- .p2align 4 +-L(3): addq $4, %rax /* adjust pointer for full loop round */ +- +- movb (%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- je L(4) /* yes => return */ +- +- movb 1(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- je L(5) /* yes => return */ +- +- movb 2(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(6) /* yes => return */ +- +- movb 3(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jne L(3) /* no => start loop again */ +- +- incq %rax /* adjust pointer */ +-L(6): incq %rax +-L(5): incq %rax +- +-L(4): addq $256, %rsp /* remove skipset */ +- cfi_adjust_cfa_offset(-256) +-#ifdef USE_AS_STRPBRK +- xorl %edx,%edx +- orb %cl, %cl /* was last character NUL? 
*/ +- cmovzq %rdx, %rax /* Yes: return NULL */ +-#else +- subq %rdx, %rax /* we have to return the number of valid +- characters, so compute distance to first +- non-valid character */ +-#endif +- ret +-END (strcspn) +-libc_hidden_builtin_def (strcspn) +-- +GitLab + diff --git a/glibc-RHEL-15696-93.patch b/glibc-RHEL-15696-93.patch new file mode 100644 index 0000000..45c8527 --- /dev/null +++ b/glibc-RHEL-15696-93.patch @@ -0,0 +1,55 @@ +From 653358535280a599382cb6c77538a187dac6a87f Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:27 -0500 +Subject: [PATCH] x86: Remove strpbrk-sse2.S and use the generic implementation +Content-type: text/plain; charset=UTF-8 + +The generic implementation is faster (see strcspn commit). + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + .../x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} | 7 +++---- + sysdeps/x86_64/strpbrk.S | 3 --- + 2 files changed, 3 insertions(+), 7 deletions(-) + rename sysdeps/x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} (87%) + delete mode 100644 sysdeps/x86_64/strpbrk.S + +Conflicts: + sysdeps/x86_64/multiarch/strpbrk-sse2.S + (copyright header) + +diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c +similarity index 87% +rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S +rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c +index 3c6a74db..ec0b6fda 100644 +--- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S ++++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c +@@ -19,11 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strcspn __strpbrk_sse2 ++# define STRPBRK __strpbrk_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strpbrk) ++# define libc_hidden_builtin_def(STRPBRK) + #endif + +-#define USE_AS_STRPBRK +-#include ++#include +diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S +deleted file mode 100644 +index 21888a5b..00000000 +--- a/sysdeps/x86_64/strpbrk.S ++++ /dev/null +@@ -1,3 +0,0 @@ +-#define strcspn strpbrk +-#define USE_AS_STRPBRK +-#include +-- +GitLab + diff --git a/glibc-RHEL-15696-94.patch b/glibc-RHEL-15696-94.patch new file mode 100644 index 0000000..2fa86da --- /dev/null +++ b/glibc-RHEL-15696-94.patch @@ -0,0 +1,168 @@ +From 9c8a6ad620b49a27120ecdd7049c26bf05900397 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:29 -0500 +Subject: [PATCH] x86: Remove strspn-sse2.S and use the generic implementation +Content-type: text/plain; charset=UTF-8 + +The generic implementation is faster. + +geometric_mean(N=20) of all benchmarks New / Original: .710 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu +--- + .../{strspn-sse2.S => strspn-sse2.c} | 6 +- + sysdeps/x86_64/strspn.S | 115 ------------------ + 2 files changed, 3 insertions(+), 118 deletions(-) + rename sysdeps/x86_64/multiarch/{strspn-sse2.S => strspn-sse2.c} (89%) + delete mode 100644 sysdeps/x86_64/strspn.S + +Conflicts: + sysdeps/x86_64/multiarch/strspn-sse2.c + (copyright header) + +diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c +similarity index 89% +rename from sysdeps/x86_64/multiarch/strspn-sse2.S +rename to sysdeps/x86_64/multiarch/strspn-sse2.c +index 4686cdd5..ab0dae40 100644 +--- a/sysdeps/x86_64/multiarch/strspn-sse2.S ++++ b/sysdeps/x86_64/multiarch/strspn-sse2.c +@@ -19,10 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strspn __strspn_sse2 ++# define STRSPN __strspn_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strspn) ++# define libc_hidden_builtin_def(STRSPN) + #endif + +-#include ++#include +diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S +deleted file mode 100644 +index 635f1bc6..00000000 +--- a/sysdeps/x86_64/strspn.S ++++ /dev/null +@@ -1,115 +0,0 @@ +-/* strspn (str, ss) -- Return the length of the initial segment of STR +- which contains only characters from SS. +- For AMD x86-64. +- Copyright (C) 1994-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- Contributed by Ulrich Drepper . +- Bug fixes by Alan Modra . +- Adopted for x86-64 by Andreas Jaeger . +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +- +- .text +-ENTRY (strspn) +- +- movq %rdi, %rdx /* Save SRC. */ +- +- /* First we create a table with flags for all possible characters. +- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are +- supported by the C string functions we have 256 characters. +- Before inserting marks for the stop characters we clear the whole +- table. */ +- movq %rdi, %r8 /* Save value. */ +- subq $256, %rsp /* Make space for 256 bytes. */ +- cfi_adjust_cfa_offset(256) +- movl $32, %ecx /* 32*8 bytes = 256 bytes. */ +- movq %rsp, %rdi +- xorl %eax, %eax /* We store 0s. */ +- cld +- rep +- stosq +- +- movq %rsi, %rax /* Setup stopset. */ +- +-/* For understanding the following code remember that %rcx == 0 now. +- Although all the following instruction only modify %cl we always +- have a correct zero-extended 64-bit value in %rcx. */ +- +- .p2align 4 +-L(2): movb (%rax), %cl /* get byte from stopset */ +- testb %cl, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 1(%rax), %cl /* get byte from stopset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 2(%rax), %cl /* get byte from stopset */ +- testb $0xff, %cl /* is NUL char? 
*/ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 3(%rax), %cl /* get byte from stopset */ +- addq $4, %rax /* increment stopset pointer */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- testb $0xff, %cl /* is NUL char? */ +- jnz L(2) /* no => process next dword from stopset */ +- +-L(1): leaq -4(%rdx), %rax /* prepare loop */ +- +- /* We use a neat trick for the following loop. Normally we would +- have to test for two termination conditions +- 1. a character in the stopset was found +- and +- 2. the end of the string was found +- But as a sign that the character is in the stopset we store its +- value in the table. But the value of NUL is NUL so the loop +- terminates for NUL in every case. */ +- +- .p2align 4 +-L(3): addq $4, %rax /* adjust pointer for full loop round */ +- +- movb (%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(4) /* no => return */ +- +- movb 1(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(5) /* no => return */ +- +- movb 2(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(6) /* no => return */ +- +- movb 3(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jnz L(3) /* yes => start loop again */ +- +- incq %rax /* adjust pointer */ +-L(6): incq %rax +-L(5): incq %rax +- +-L(4): addq $256, %rsp /* remove stopset */ +- cfi_adjust_cfa_offset(-256) +- subq %rdx, %rax /* we have to return the number of valid +- characters, so compute distance to first +- non-valid character */ +- ret +-END (strspn) +-libc_hidden_builtin_def (strspn) +-- +GitLab + diff --git a/glibc-RHEL-15696-95.patch b/glibc-RHEL-15696-95.patch new file mode 100644 index 0000000..cf21b96 --- /dev/null +++ b/glibc-RHEL-15696-95.patch @@ -0,0 +1,122 @@ +From 670b54bc585ea4a94f3b2e9272ba44aa6b730b73 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:36 -0500 +Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S +Content-type: text/plain; charset=UTF-8 + +Slightly faster method of doing TOLOWER that saves an +instruction. + +Also replace the hard coded 5-byte no with .p2align 4. On builds with +CET enabled this misaligned entry to strcasecmp. + +geometric_mean(N=40) of all benchmarks New / Original: .894 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++---------------------- + 1 file changed, 29 insertions(+), 35 deletions(-) + +diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S +index aa6df898..f454ce5b 100644 +--- a/sysdeps/x86_64/strcmp.S ++++ b/sysdeps/x86_64/strcmp.S +@@ -78,9 +78,8 @@ ENTRY2 (__strcasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END2 (__strcasecmp) + # ifndef NO_NOLOCALE_ALIAS + weak_alias (__strcasecmp, strcasecmp) +@@ -97,9 +96,8 @@ ENTRY2 (__strncasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). 
*/ ++ .p2align 4 + END2 (__strncasecmp) + # ifndef NO_NOLOCALE_ALIAS + weak_alias (__strncasecmp, strncasecmp) +@@ -149,22 +147,22 @@ ENTRY (STRCMP) + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +-.Lbelowupper: +- .quad 0x4040404040404040 +- .quad 0x4040404040404040 +-.Ltopupper: +- .quad 0x5b5b5b5b5b5b5b5b +- .quad 0x5b5b5b5b5b5b5b5b +-.Ltouppermask: ++.Llcase_min: ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++.Llcase_max: ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++.Lcase_add: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous +- movdqa .Lbelowupper(%rip), %xmm5 +-# define UCLOW_reg %xmm5 +- movdqa .Ltopupper(%rip), %xmm6 +-# define UCHIGH_reg %xmm6 +- movdqa .Ltouppermask(%rip), %xmm7 +-# define LCQWORD_reg %xmm7 ++ movdqa .Llcase_min(%rip), %xmm5 ++# define LCASE_MIN_reg %xmm5 ++ movdqa .Llcase_max(%rip), %xmm6 ++# define LCASE_MAX_reg %xmm6 ++ movdqa .Lcase_add(%rip), %xmm7 ++# define CASE_ADD_reg %xmm7 + #endif + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ +@@ -175,22 +173,18 @@ ENTRY (STRCMP) + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +-# define TOLOWER(reg1, reg2) \ +- movdqa reg1, %xmm8; \ +- movdqa UCHIGH_reg, %xmm9; \ +- movdqa reg2, %xmm10; \ +- movdqa UCHIGH_reg, %xmm11; \ +- pcmpgtb UCLOW_reg, %xmm8; \ +- pcmpgtb reg1, %xmm9; \ +- pcmpgtb UCLOW_reg, %xmm10; \ +- pcmpgtb reg2, %xmm11; \ +- pand %xmm9, %xmm8; \ +- pand %xmm11, %xmm10; \ +- pand LCQWORD_reg, %xmm8; \ +- pand LCQWORD_reg, %xmm10; \ +- por %xmm8, reg1; \ +- por %xmm10, reg2 +- TOLOWER (%xmm1, %xmm2) ++# define TOLOWER(reg1, reg2) \ ++ movdqa LCASE_MIN_reg, %xmm8; \ ++ movdqa LCASE_MIN_reg, %xmm9; \ ++ paddb reg1, %xmm8; \ ++ paddb reg2, %xmm9; \ ++ pcmpgtb LCASE_MAX_reg, %xmm8; \ ++ pcmpgtb LCASE_MAX_reg, %xmm9; \ ++ pandn CASE_ADD_reg, %xmm8; \ ++ pandn CASE_ADD_reg, %xmm9; \ ++ paddb %xmm8, reg1; \ ++ paddb %xmm9, reg2 ++ TOLOWER (%xmm1, %xmm2) + #else + # define TOLOWER(reg1, reg2) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-96.patch b/glibc-RHEL-15696-96.patch new file mode 100644 index 0000000..2d3b891 --- /dev/null +++ b/glibc-RHEL-15696-96.patch @@ -0,0 +1,143 @@ +From d154758e618ec9324f5d339c46db0aa27e8b1226 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:38 -0500 +Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S +Content-type: text/plain; charset=UTF-8 + +Slightly faster method of doing TOLOWER that saves an +instruction. + +Also replace the hard coded 5-byte no with .p2align 4. On builds with +CET enabled this misaligned entry to strcasecmp. + +geometric_mean(N=40) of all benchmarks New / Original: .920 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++-------------- + 1 file changed, 35 insertions(+), 48 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index d8fdeb3a..59e8ddfc 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -89,9 +89,8 @@ ENTRY (GLABEL(__strcasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). 
*/ ++ .p2align 4 + END (GLABEL(__strcasecmp)) + /* FALLTHROUGH to strcasecmp_l. */ + #endif +@@ -100,9 +99,8 @@ ENTRY (GLABEL(__strncasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END (GLABEL(__strncasecmp)) + /* FALLTHROUGH to strncasecmp_l. */ + #endif +@@ -170,27 +168,22 @@ STRCMP_SSE42: + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +-LABEL(belowupper): +- .quad 0x4040404040404040 +- .quad 0x4040404040404040 +-LABEL(topupper): +-# ifdef USE_AVX +- .quad 0x5a5a5a5a5a5a5a5a +- .quad 0x5a5a5a5a5a5a5a5a +-# else +- .quad 0x5b5b5b5b5b5b5b5b +- .quad 0x5b5b5b5b5b5b5b5b +-# endif +-LABEL(touppermask): ++LABEL(lcase_min): ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++LABEL(lcase_max): ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++LABEL(case_add): + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous +- movdqa LABEL(belowupper)(%rip), %xmm4 +-# define UCLOW_reg %xmm4 +- movdqa LABEL(topupper)(%rip), %xmm5 +-# define UCHIGH_reg %xmm5 +- movdqa LABEL(touppermask)(%rip), %xmm6 +-# define LCQWORD_reg %xmm6 ++ movdqa LABEL(lcase_min)(%rip), %xmm4 ++# define LCASE_MIN_reg %xmm4 ++ movdqa LABEL(lcase_max)(%rip), %xmm5 ++# define LCASE_MAX_reg %xmm5 ++ movdqa LABEL(case_add)(%rip), %xmm6 ++# define CASE_ADD_reg %xmm6 + #endif + cmp $0x30, %ecx + ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ +@@ -201,32 +194,26 @@ LABEL(touppermask): + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + # ifdef USE_AVX + # define TOLOWER(reg1, reg2) \ +- vpcmpgtb UCLOW_reg, reg1, %xmm7; \ +- vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ +- vpcmpgtb UCLOW_reg, reg2, %xmm9; \ +- vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ +- vpandn %xmm7, %xmm8, %xmm8; \ +- vpandn %xmm9, %xmm10, %xmm10; \ +- vpand LCQWORD_reg, %xmm8, %xmm8; \ +- vpand LCQWORD_reg, %xmm10, %xmm10; \ +- vpor reg1, %xmm8, reg1; \ +- vpor reg2, %xmm10, reg2 ++ vpaddb LCASE_MIN_reg, reg1, %xmm7; \ ++ vpaddb LCASE_MIN_reg, reg2, %xmm8; \ ++ vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ ++ vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ ++ vpandn CASE_ADD_reg, %xmm7, %xmm7; \ ++ vpandn CASE_ADD_reg, %xmm8, %xmm8; \ ++ vpaddb %xmm7, reg1, reg1; \ ++ vpaddb %xmm8, reg2, reg2 + # else + # define TOLOWER(reg1, reg2) \ +- movdqa reg1, %xmm7; \ +- movdqa UCHIGH_reg, %xmm8; \ +- movdqa reg2, %xmm9; \ +- movdqa UCHIGH_reg, %xmm10; \ +- pcmpgtb UCLOW_reg, %xmm7; \ +- pcmpgtb reg1, %xmm8; \ +- pcmpgtb UCLOW_reg, %xmm9; \ +- pcmpgtb reg2, %xmm10; \ +- pand %xmm8, %xmm7; \ +- pand %xmm10, %xmm9; \ +- pand LCQWORD_reg, %xmm7; \ +- pand LCQWORD_reg, %xmm9; \ +- por %xmm7, reg1; \ +- por %xmm9, reg2 ++ movdqa LCASE_MIN_reg, %xmm7; \ ++ movdqa LCASE_MIN_reg, %xmm8; \ ++ paddb reg1, %xmm7; \ ++ paddb reg2, %xmm8; \ ++ pcmpgtb LCASE_MAX_reg, %xmm7; \ ++ pcmpgtb LCASE_MAX_reg, %xmm8; \ ++ pandn CASE_ADD_reg, %xmm7; \ ++ pandn CASE_ADD_reg, %xmm8; \ ++ paddb %xmm7, reg1; \ ++ paddb %xmm8, reg2 + # endif + TOLOWER (%xmm1, %xmm2) + #else +-- +GitLab + diff --git a/glibc-RHEL-15696-97.patch b/glibc-RHEL-15696-97.patch new file mode 100644 index 0000000..9592795 --- /dev/null +++ b/glibc-RHEL-15696-97.patch @@ -0,0 +1,759 @@ +From bbf81222343fed5cd704001a2ae0d86c71544151 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 18:56:12 -0500 +Subject: 
[PATCH] x86: Add AVX2 optimized str{n}casecmp +Content-type: text/plain; charset=UTF-8 + +geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/Makefile | 4 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++ + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 + + .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++ + sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 237 +++++++++++++++--- + .../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++ + sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++ + 8 files changed, 331 insertions(+), 31 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 8c9e7812..711ecf2e 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -51,6 +51,8 @@ sysdep_routines += \ + stpncpy-sse2-unaligned \ + stpncpy-ssse3 \ + strcasecmp_l-avx \ ++ strcasecmp_l-avx2 \ ++ strcasecmp_l-avx2-rtm \ + strcasecmp_l-sse2 \ + strcasecmp_l-sse4_2 \ + strcasecmp_l-ssse3 \ +@@ -89,6 +91,8 @@ sysdep_routines += \ + strlen-evex \ + strlen-sse2 \ + strncase_l-avx \ ++ strncase_l-avx2 \ ++ strncase_l-avx2-rtm \ + strncase_l-sse2 \ + strncase_l-sse4_2 \ + strncase_l-ssse3 \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index c963d391..d873e1be 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -418,6 +418,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strcasecmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcasecmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX), + __strcasecmp_avx) +@@ -431,6 +438,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strcasecmp_l_avx2) ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcasecmp_l_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + CPU_FEATURE_USABLE (AVX), + __strcasecmp_l_avx) +@@ -558,6 +572,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strncasecmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncasecmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX), + __strncasecmp_avx) +@@ -572,6 +593,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. 
*/ + IFUNC_IMPL (i, name, strncasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strncasecmp_l_avx2) ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncasecmp_l_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + CPU_FEATURE_USABLE (AVX), + __strncasecmp_l_avx) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 6a4bb078..926508c4 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } ++ + if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) + return OPTIMIZE (avx); + +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S +new file mode 100644 +index 00000000..09957fc3 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S +@@ -0,0 +1,15 @@ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_avx2_rtm ++#endif ++ ++#define _GLABEL(x) x ## _rtm ++#define GLABEL(x) _GLABEL(x) ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcasecmp_l-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S +new file mode 100644 +index 00000000..e2762f2a +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S +@@ -0,0 +1,23 @@ ++/* strcasecmp_l optimized with AVX2. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_avx2 ++#endif ++#define USE_AS_STRCASECMP_L ++#include "strcmp-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 782f9472..28cc98b6 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -20,6 +20,10 @@ + + # include + ++# if defined USE_AS_STRCASECMP_L ++# include "locale-defines.h" ++# endif ++ + # ifndef STRCMP + # define STRCMP __strcmp_avx2 + # endif +@@ -74,13 +78,88 @@ + # define VEC_OFFSET (-VEC_SIZE) + # endif + ++# ifdef USE_AS_STRCASECMP_L ++# define BYTE_LOOP_REG OFFSET_REG ++# else ++# define BYTE_LOOP_REG ecx ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++# ifdef USE_AS_STRNCMP ++# define STRCASECMP __strncasecmp_avx2 ++# define LOCALE_REG rcx ++# define LOCALE_REG_LP RCX_LP ++# define STRCASECMP_NONASCII __strncasecmp_l_nonascii ++# else ++# define STRCASECMP __strcasecmp_avx2 ++# define LOCALE_REG rdx ++# define LOCALE_REG_LP RDX_LP ++# define STRCASECMP_NONASCII __strcasecmp_l_nonascii ++# endif ++# endif ++ + # define xmmZERO xmm15 + # define ymmZERO ymm15 + ++# define LCASE_MIN_ymm %ymm10 ++# define LCASE_MAX_ymm %ymm11 ++# define CASE_ADD_ymm %ymm12 ++ ++# define LCASE_MIN_xmm %xmm10 ++# define LCASE_MAX_xmm %xmm11 ++# define CASE_ADD_xmm %xmm12 ++ ++ /* r11 is never use elsewhere so this is safe to maintain. */ ++# define TOLOWER_BASE %r11 ++ + # ifndef SECTION + # define SECTION(p) p##.avx + # endif + ++# ifdef USE_AS_STRCASECMP_L ++# define REG(x, y) x ## y ++# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \ ++ vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \ ++ vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \ ++ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \ ++ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \ ++ vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \ ++ vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \ ++ vpaddb REG(%ext, 8), reg1_in, reg1_out; \ ++ vpaddb REG(%ext, 9), reg2_in, reg2_out ++ ++# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst ++# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm) ++# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm) ++ ++# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \ ++ TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \ ++ VPCMPEQ scratch_reg, s2_reg, reg_out ++ ++# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \ ++ VMOVU s2_mem, reg_out; \ ++ CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext) ++ ++# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm) ++# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm) ++ ++# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm) ++# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm) ++ ++# else ++# define TOLOWER_gpr(...) ++# define TOLOWER_ymm(...) ++# define TOLOWER_xmm(...) ++ ++# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \ ++ VPCMPEQ s2_reg, s1_reg, reg_out ++ ++# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__) ++ ++# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__) ++# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__) ++# endif ++ + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. + strcmp/strncmp have to use UNSIGNED comparison for elements. +@@ -102,8 +181,49 @@ + returned. 
*/ + + .section SECTION(.text), "ax", @progbits +-ENTRY(STRCMP) ++ .align 16 ++ .type STRCMP, @function ++ .globl STRCMP ++ .hidden STRCMP ++ ++# ifndef GLABEL ++# define GLABEL(...) __VA_ARGS__ ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++ENTRY (GLABEL(STRCASECMP)) ++ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax ++ mov %fs:(%rax), %LOCALE_REG_LP ++ ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 ++END (GLABEL(STRCASECMP)) ++ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ ++# endif ++ ++ .p2align 4 ++STRCMP: ++ cfi_startproc ++ _CET_ENDBR ++ CALL_MCOUNT ++ ++# if defined USE_AS_STRCASECMP_L ++ /* We have to fall back on the C implementation for locales with ++ encodings not matching ASCII for single bytes. */ ++# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 ++ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP ++# else ++ mov (%LOCALE_REG), %RAX_LP ++# endif ++ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) ++ jne STRCASECMP_NONASCII ++ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE ++# endif ++ + # ifdef USE_AS_STRNCMP ++ /* Don't overwrite LOCALE_REG (rcx) until we have pass ++ L(one_or_less). Otherwise we might use the wrong locale in ++ the OVERFLOW_STRCMP (strcasecmp_l). */ + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -128,6 +248,30 @@ ENTRY(STRCMP) + # endif + # endif + vpxor %xmmZERO, %xmmZERO, %xmmZERO ++# if defined USE_AS_STRCASECMP_L ++ .section .rodata.cst32, "aM", @progbits, 32 ++ .align 32 ++L(lcase_min): ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++L(lcase_max): ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++L(case_add): ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .previous ++ ++ vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm ++ vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm ++ vmovdqa L(case_add)(%rip), CASE_ADD_ymm ++# endif + movl %edi, %eax + orl %esi, %eax + sall $20, %eax +@@ -138,8 +282,10 @@ ENTRY(STRCMP) + L(no_page_cross): + /* Safe to compare 4x vectors. */ + VMOVU (%rdi), %ymm0 +- /* 1s where s1 and s2 equal. */ +- VPCMPEQ (%rsi), %ymm0, %ymm1 ++ /* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp. ++ Otherwise converts ymm0 and load from rsi to lower. ymm2 is ++ scratch and ymm1 is the return. */ ++ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) + /* 1s at null CHAR. */ + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + /* 1s where s1 and s2 equal AND not null CHAR. */ +@@ -172,6 +318,8 @@ L(return_vec_0): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret0): +@@ -192,6 +340,10 @@ L(ret_zero): + + .p2align 4,, 5 + L(one_or_less): ++# ifdef USE_AS_STRCASECMP_L ++ /* Set locale argument for strcasecmp. 
*/ ++ movq %LOCALE_REG, %rdx ++# endif + jb L(ret_zero) + # ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large +@@ -211,6 +363,8 @@ L(one_or_less): + jnbe __strcmp_avx2 + movzbl (%rdi), %eax + movzbl (%rsi), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret1): +@@ -238,6 +392,8 @@ L(return_vec_1): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret2): +@@ -269,6 +425,8 @@ L(return_vec_2): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret3): +@@ -289,6 +447,8 @@ L(return_vec_3): + # else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret4): +@@ -299,7 +459,7 @@ L(ret4): + L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU VEC_SIZE(%rdi), %ymm0 +- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -312,7 +472,7 @@ L(more_3x_vec): + # endif + + VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -320,7 +480,7 @@ L(more_3x_vec): + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -395,12 +555,10 @@ L(loop_skip_page_cross_check): + VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 + + /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ +- VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1 +- +- VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 +- ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1) ++ CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3) ++ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) ++ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) + + /* If any mismatches or null CHAR then 0 CHAR, otherwise non- + zero. 
*/ +@@ -469,6 +627,8 @@ L(return_vec_2_3_end): + # else + movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax + movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -512,6 +672,8 @@ L(return_vec_0_end): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -534,6 +696,8 @@ L(return_vec_1_end): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -560,6 +724,8 @@ L(return_vec_2_end): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -587,7 +753,7 @@ L(page_cross_during_loop): + jle L(less_1x_vec_till_page_cross) + + VMOVA (%rdi), %ymm0 +- VPCMPEQ (%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -609,7 +775,7 @@ L(less_1x_vec_till_page_cross): + here, it means the previous page (rdi - VEC_SIZE) has already + been loaded earlier so must be valid. */ + VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 +- VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -651,6 +817,8 @@ L(return_page_cross_cmp_mem): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -677,7 +845,7 @@ L(more_2x_vec_till_page_cross): + iteration here. */ + + VMOVU VEC_SIZE(%rdi), %ymm0 +- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -693,7 +861,7 @@ L(more_2x_vec_till_page_cross): + + /* Safe to include comparisons from lower bytes. 
*/ + VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 +- VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -701,7 +869,7 @@ L(more_2x_vec_till_page_cross): + jnz L(return_vec_page_cross_0) + + VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 +- VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -719,8 +887,8 @@ L(more_2x_vec_till_page_cross): + VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 + VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 + +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) ++ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) + vpand %ymm4, %ymm5, %ymm5 + vpand %ymm6, %ymm7, %ymm7 + VPMINU %ymm5, %ymm7, %ymm7 +@@ -771,6 +939,8 @@ L(return_vec_page_cross_1): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -826,7 +996,7 @@ L(page_cross): + L(page_cross_loop): + + VMOVU (%rdi, %OFFSET_REG64), %ymm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -844,11 +1014,11 @@ L(page_cross_loop): + subl %eax, %OFFSET_REG + /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed + to not cross page so is safe to load. Since we have already +- loaded at least 1 VEC from rsi it is also guranteed to be safe. +- */ ++ loaded at least 1 VEC from rsi it is also guranteed to be ++ safe. 
*/ + + VMOVU (%rdi, %OFFSET_REG64), %ymm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -881,6 +1051,8 @@ L(ret_vec_page_cross_cont): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -934,7 +1106,7 @@ L(less_1x_vec_till_page): + ja L(less_16_till_page) + + VMOVU (%rdi), %xmm0 +- VPCMPEQ (%rsi), %xmm0, %xmm1 ++ CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1) + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx +@@ -952,7 +1124,7 @@ L(less_1x_vec_till_page): + # endif + + VMOVU (%rdi, %OFFSET_REG64), %xmm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1 ++ CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1) + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx +@@ -990,7 +1162,7 @@ L(less_16_till_page): + vmovq (%rdi), %xmm0 + vmovq (%rsi), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incb %cl +@@ -1010,7 +1182,7 @@ L(less_16_till_page): + vmovq (%rdi, %OFFSET_REG64), %xmm0 + vmovq (%rsi, %OFFSET_REG64), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incb %cl +@@ -1066,7 +1238,7 @@ L(ret_less_8_wcs): + vmovd (%rdi), %xmm0 + vmovd (%rsi), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + subl $0xf, %ecx +@@ -1085,7 +1257,7 @@ L(ret_less_8_wcs): + vmovd (%rdi, %OFFSET_REG64), %xmm0 + vmovd (%rsi, %OFFSET_REG64), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + subl $0xf, %ecx +@@ -1119,7 +1291,9 @@ L(less_4_till_page): + L(less_4_loop): + movzbl (%rdi), %eax + movzbl (%rsi, %rdi), %ecx +- subl %ecx, %eax ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) ++ subl %BYTE_LOOP_REG, %eax + jnz L(ret_less_4_loop) + testl %ecx, %ecx + jz L(ret_zero_4_loop) +@@ -1146,5 +1320,6 @@ L(ret_less_4_loop): + subl %r8d, %eax + ret + # endif +-END(STRCMP) ++ cfi_endproc ++ .size STRCMP, .-STRCMP + #endif +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S +new file mode 100644 +index 00000000..58c05dcf +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S +@@ -0,0 +1,16 @@ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_avx2_rtm ++#endif ++ ++#define _GLABEL(x) x ## _rtm ++#define GLABEL(x) _GLABEL(x) ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++#define OVERFLOW_STRCMP __strcasecmp_l_avx2_rtm ++ ++#include "strncase_l-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S +new file mode 100644 +index 00000000..48c0aa21 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S +@@ -0,0 +1,27 @@ ++/* strncasecmp_l optimized with AVX2. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. 
++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_avx2 ++#endif ++#define USE_AS_STRCASECMP_L ++#define USE_AS_STRNCMP ++#ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __strcasecmp_l_avx2 ++#endif ++#include "strcmp-avx2.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-98.patch b/glibc-RHEL-15696-98.patch new file mode 100644 index 0000000..9941bcc --- /dev/null +++ b/glibc-RHEL-15696-98.patch @@ -0,0 +1,814 @@ +From 84e7c46df4086873eae28a1fb87d2cf5388b1e16 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 18:56:13 -0500 +Subject: [PATCH] x86: Add EVEX optimized str{n}casecmp +Content-type: text/plain; charset=UTF-8 + +geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/Makefile | 2 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 + + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 + + sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++ + sysdeps/x86_64/multiarch/strcmp-evex.S | 290 ++++++++++++++++--- + sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++ + 6 files changed, 321 insertions(+), 40 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 711ecf2e..359712c1 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -53,6 +53,7 @@ sysdep_routines += \ + strcasecmp_l-avx \ + strcasecmp_l-avx2 \ + strcasecmp_l-avx2-rtm \ ++ strcasecmp_l-evex \ + strcasecmp_l-sse2 \ + strcasecmp_l-sse4_2 \ + strcasecmp_l-ssse3 \ +@@ -93,6 +94,7 @@ sysdep_routines += \ + strncase_l-avx \ + strncase_l-avx2 \ + strncase_l-avx2-rtm \ ++ strncase_l-evex \ + strncase_l-sse2 \ + strncase_l-sse4_2 \ + strncase_l-ssse3 \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index d873e1be..1dedc637 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -418,6 +418,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcasecmp_evex) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX2), + __strcasecmp_avx2) +@@ -438,6 +442,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. 
*/ + IFUNC_IMPL (i, name, strcasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcasecmp_l_evex) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX2), + __strcasecmp_l_avx2) +@@ -572,6 +580,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncasecmp_evex) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX2), + __strncasecmp_avx2) +@@ -593,6 +605,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncasecmp_l_evex) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX2), + __strncasecmp_l_avx2) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 926508c4..6dd49a21 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) +@@ -34,6 +35,10 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ return OPTIMIZE (evex); ++ + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S +new file mode 100644 +index 00000000..58642db7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S +@@ -0,0 +1,23 @@ ++/* strcasecmp_l optimized with EVEX. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_evex ++#endif ++#define USE_AS_STRCASECMP_L ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 0dfa62bd..b81b5775 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -19,6 +19,9 @@ + #if IS_IN (libc) + + # include ++# if defined USE_AS_STRCASECMP_L ++# include "locale-defines.h" ++# endif + + # ifndef STRCMP + # define STRCMP __strcmp_evex +@@ -34,19 +37,29 @@ + # define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSCMP +-# define TESTEQ subl $0xff, ++# ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __wcscmp_evex ++# endif ++ ++# define TESTEQ subl $0xff, + /* Compare packed dwords. */ + # define VPCMP vpcmpd + # define VPMINU vpminud + # define VPTESTM vptestmd ++# define VPTESTNM vptestnmd + /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else ++# ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __strcmp_evex ++# endif ++ + # define TESTEQ incl + /* Compare packed bytes. */ + # define VPCMP vpcmpb + # define VPMINU vpminub + # define VPTESTM vptestmb ++# define VPTESTNM vptestnmb + /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif +@@ -73,11 +86,16 @@ + # define VEC_OFFSET (-VEC_SIZE) + # endif + +-# define XMMZERO xmm16 + # define XMM0 xmm17 + # define XMM1 xmm18 + +-# define YMMZERO ymm16 ++# define XMM10 xmm27 ++# define XMM11 xmm28 ++# define XMM12 xmm29 ++# define XMM13 xmm30 ++# define XMM14 xmm31 ++ ++ + # define YMM0 ymm17 + # define YMM1 ymm18 + # define YMM2 ymm19 +@@ -89,6 +107,87 @@ + # define YMM8 ymm25 + # define YMM9 ymm26 + # define YMM10 ymm27 ++# define YMM11 ymm28 ++# define YMM12 ymm29 ++# define YMM13 ymm30 ++# define YMM14 ymm31 ++ ++# ifdef USE_AS_STRCASECMP_L ++# define BYTE_LOOP_REG OFFSET_REG ++# else ++# define BYTE_LOOP_REG ecx ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++# ifdef USE_AS_STRNCMP ++# define STRCASECMP __strncasecmp_evex ++# define LOCALE_REG rcx ++# define LOCALE_REG_LP RCX_LP ++# define STRCASECMP_NONASCII __strncasecmp_l_nonascii ++# else ++# define STRCASECMP __strcasecmp_evex ++# define LOCALE_REG rdx ++# define LOCALE_REG_LP RDX_LP ++# define STRCASECMP_NONASCII __strcasecmp_l_nonascii ++# endif ++# endif ++ ++# define LCASE_MIN_YMM %YMM12 ++# define LCASE_MAX_YMM %YMM13 ++# define CASE_ADD_YMM %YMM14 ++ ++# define LCASE_MIN_XMM %XMM12 ++# define LCASE_MAX_XMM %XMM13 ++# define CASE_ADD_XMM %XMM14 ++ ++ /* NB: wcsncmp uses r11 but strcasecmp is never used in ++ conjunction with wcscmp. */ ++# define TOLOWER_BASE %r11 ++ ++# ifdef USE_AS_STRCASECMP_L ++# define _REG(x, y) x ## y ++# define REG(x, y) _REG(x, y) ++# define TOLOWER(reg1, reg2, ext) \ ++ vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \ ++ vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \ ++ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \ ++ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \ ++ vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \ ++ vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6} ++ ++# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst ++# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM) ++# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM) ++ ++# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \ ++ TOLOWER (s1_reg, s2_reg, ext); \ ++ VPCMP $0, s1_reg, s2_reg, reg_out ++ ++# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \ ++ VMOVU s2_mem, s2_reg; \ ++ CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) ++ ++# define CMP_R1_R2_YMM(...) 
CMP_R1_R2(__VA_ARGS__, YMM) ++# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM) ++ ++# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM) ++# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM) ++ ++# else ++# define TOLOWER_gpr(...) ++# define TOLOWER_YMM(...) ++# define TOLOWER_XMM(...) ++ ++# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \ ++ VPCMP $0, s2_reg, s1_reg, reg_out ++ ++# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__) ++ ++# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \ ++ VPCMP $0, s2_mem, s1_reg, reg_out ++ ++# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__) ++# endif + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -112,8 +211,45 @@ + returned. */ + + .section .text.evex, "ax", @progbits +-ENTRY(STRCMP) ++ .align 16 ++ .type STRCMP, @function ++ .globl STRCMP ++ .hidden STRCMP ++ ++# ifdef USE_AS_STRCASECMP_L ++ENTRY (STRCASECMP) ++ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax ++ mov %fs:(%rax), %LOCALE_REG_LP ++ ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 ++END (STRCASECMP) ++ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ ++# endif ++ ++ .p2align 4 ++STRCMP: ++ cfi_startproc ++ _CET_ENDBR ++ CALL_MCOUNT ++ ++# if defined USE_AS_STRCASECMP_L ++ /* We have to fall back on the C implementation for locales with ++ encodings not matching ASCII for single bytes. */ ++# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 ++ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP ++# else ++ mov (%LOCALE_REG), %RAX_LP ++# endif ++ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) ++ jne STRCASECMP_NONASCII ++ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE ++# endif ++ + # ifdef USE_AS_STRNCMP ++ /* Don't overwrite LOCALE_REG (rcx) until we have pass ++ L(one_or_less). Otherwise we might use the wrong locale in ++ the OVERFLOW_STRCMP (strcasecmp_l). */ + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -125,6 +261,32 @@ ENTRY(STRCMP) + actually bound the buffer. */ + jle L(one_or_less) + # endif ++ ++# if defined USE_AS_STRCASECMP_L ++ .section .rodata.cst32, "aM", @progbits, 32 ++ .align 32 ++L(lcase_min): ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++L(lcase_max): ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++L(case_add): ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .previous ++ ++ vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM ++ vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM ++ vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM ++# endif ++ + movl %edi, %eax + orl %esi, %eax + /* Shift out the bits irrelivant to page boundary ([63:12]). */ +@@ -139,7 +301,7 @@ L(no_page_cross): + VPTESTM %YMM0, %YMM0, %k2 + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi). */ +- VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_STRNCMP + cmpq $CHAR_PER_VEC, %rdx +@@ -169,6 +331,8 @@ L(return_vec_0): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret0): +@@ -188,11 +352,15 @@ L(ret_zero): + + .p2align 4,, 5 + L(one_or_less): ++# ifdef USE_AS_STRCASECMP_L ++ /* Set locale argument for strcasecmp. 
*/ ++ movq %LOCALE_REG, %rdx ++# endif + jb L(ret_zero) +-# ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large + unsigned). */ +- jnbe __wcscmp_evex ++ jnbe OVERFLOW_STRCMP ++# ifdef USE_AS_WCSCMP + movl (%rdi), %edx + xorl %eax, %eax + cmpl (%rsi), %edx +@@ -201,11 +369,10 @@ L(one_or_less): + negl %eax + orl $1, %eax + # else +- /* 'nbe' covers the case where length is negative (large +- unsigned). */ +- jnbe __strcmp_evex + movzbl (%rdi), %eax + movzbl (%rsi), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret1): +@@ -233,6 +400,8 @@ L(return_vec_1): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret2): +@@ -270,6 +439,8 @@ L(return_vec_2): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret3): +@@ -290,6 +461,8 @@ L(return_vec_3): + # else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret4): +@@ -303,7 +476,7 @@ L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU (VEC_SIZE)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_1) +@@ -315,14 +488,14 @@ L(more_3x_vec): + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_3) +@@ -381,7 +554,6 @@ L(prepare_loop_aligned): + subl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + +- vpxorq %YMMZERO, %YMMZERO, %YMMZERO + + /* Loop 4x comparisons at a time. */ + .p2align 4 +@@ -413,22 +585,35 @@ L(loop_skip_page_cross_check): + /* A zero CHAR in YMM9 means that there is a null CHAR. */ + VPMINU %YMM8, %YMM9, %YMM9 + +- /* Each bit set in K1 represents a non-null CHAR in YMM8. */ ++ /* Each bit set in K1 represents a non-null CHAR in YMM9. */ + VPTESTM %YMM9, %YMM9, %k1 +- ++# ifndef USE_AS_STRCASECMP_L + vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 + vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 + vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while + oring with YMM1. Result is stored in YMM6. */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 +- ++# else ++ VMOVU (VEC_SIZE * 0)(%rsi), %YMM1 ++ TOLOWER_YMM (%YMM0, %YMM1) ++ VMOVU (VEC_SIZE * 1)(%rsi), %YMM3 ++ TOLOWER_YMM (%YMM2, %YMM3) ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 ++ TOLOWER_YMM (%YMM4, %YMM5) ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 ++ TOLOWER_YMM (%YMM6, %YMM7) ++ vpxorq %YMM0, %YMM1, %YMM1 ++ vpxorq %YMM2, %YMM3, %YMM3 ++ vpxorq %YMM4, %YMM5, %YMM5 ++ vpternlogd $0xde, %YMM7, %YMM1, %YMM6 ++# endif + /* Or together YMM3, YMM5, and YMM6. */ + vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 + + + /* A non-zero CHAR in YMM6 represents a mismatch. 
*/ +- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ VPTESTNM %YMM6, %YMM6, %k0{%k1} + kmovd %k0, %LOOP_REG + + TESTEQ %LOOP_REG +@@ -437,13 +622,13 @@ L(loop_skip_page_cross_check): + + /* Find which VEC has the mismatch of end of string. */ + VPTESTM %YMM0, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} ++ VPTESTNM %YMM1, %YMM1, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + jnz L(return_vec_0_end) + + VPTESTM %YMM2, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} ++ VPTESTNM %YMM3, %YMM3, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + jnz L(return_vec_1_end) +@@ -457,7 +642,7 @@ L(return_vec_2_3_end): + # endif + + VPTESTM %YMM4, %YMM4, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} ++ VPTESTNM %YMM5, %YMM5, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + # if CHAR_PER_VEC <= 16 +@@ -493,6 +678,8 @@ L(return_vec_3_end): + # else + movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax + movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -545,6 +732,8 @@ L(return_vec_0_end): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + /* Flip `eax` if `rdi` and `rsi` where swapped in page cross + logic. Subtract `r8d` after xor for zero case. */ +@@ -569,6 +758,8 @@ L(return_vec_1_end): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -598,7 +789,7 @@ L(page_cross_during_loop): + + VMOVA (%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_0_end) +@@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross): + been loaded earlier so must be valid. */ + VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2} +- ++ CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2} + /* Mask of potentially valid bits. The lower bits can be out of + range comparisons (but safe regarding page crosses). */ + +@@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross): + + # ifdef USE_AS_STRNCMP + # ifdef USE_AS_WCSCMP ++ /* NB: strcasecmp not used with WCSCMP so this access to r11 is ++ safe. */ + movl %eax, %r11d + shrl $2, %r11d + cmpq %r11, %rdx +@@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross): + + VMOVA VEC_SIZE(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_1_end) +@@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross): + /* Safe to include comparisons from lower bytes. 
*/ + VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_page_cross_0) + + VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_page_cross_1) +@@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross): + /* Must check length here as length might proclude reading next + page. */ + # ifdef USE_AS_WCSCMP ++ /* NB: strcasecmp not used with WCSCMP so this access to r11 is ++ safe. */ + movl %eax, %r11d + shrl $2, %r11d + cmpq %r11, %rdx +@@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross): + VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + VPMINU %YMM4, %YMM6, %YMM9 + VPTESTM %YMM9, %YMM9, %k1 +- ++# ifndef USE_AS_STRCASECMP_L + vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 +- +- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++# else ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 ++ TOLOWER_YMM (%YMM4, %YMM5) ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 ++ TOLOWER_YMM (%YMM6, %YMM7) ++ vpxorq %YMM4, %YMM5, %YMM5 ++ vpternlogd $0xde, %YMM7, %YMM5, %YMM6 ++# endif ++ VPTESTNM %YMM6, %YMM6, %k0{%k1} + kmovd %k0, %LOOP_REG + TESTEQ %LOOP_REG + jnz L(return_vec_2_3_end) +@@ -815,6 +1018,8 @@ L(return_vec_page_cross_1): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -871,7 +1076,7 @@ L(page_cross): + L(page_cross_loop): + VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(check_ret_vec_page_cross) +@@ -895,7 +1100,7 @@ L(page_cross_loop): + */ + VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} + + kmovd %k1, %ecx + # ifdef USE_AS_STRNCMP +@@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont): + # else + movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax + movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -989,7 +1196,7 @@ L(less_1x_vec_till_page): + /* Use 16 byte comparison. 
*/ + vmovdqu (%rdi), %xmm0 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, (%rsi), %xmm0, %k1{%k2} ++ CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0xf, %ecx +@@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page): + # endif + vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2} ++ CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0xf, %ecx +@@ -1048,7 +1255,7 @@ L(less_16_till_page): + vmovq (%rdi), %xmm0 + vmovq (%rsi), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0x3, %ecx +@@ -1068,7 +1275,7 @@ L(less_16_till_page): + vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0x3, %ecx +@@ -1128,7 +1335,7 @@ L(ret_less_8_wcs): + vmovd (%rdi), %xmm0 + vmovd (%rsi), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) +@@ -1143,7 +1350,7 @@ L(ret_less_8_wcs): + vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) +@@ -1176,7 +1383,9 @@ L(less_4_till_page): + L(less_4_loop): + movzbl (%rdi), %eax + movzbl (%rsi, %rdi), %ecx +- subl %ecx, %eax ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) ++ subl %BYTE_LOOP_REG, %eax + jnz L(ret_less_4_loop) + testl %ecx, %ecx + jz L(ret_zero_4_loop) +@@ -1203,5 +1412,6 @@ L(ret_less_4_loop): + subl %r8d, %eax + ret + # endif +-END(STRCMP) ++ cfi_endproc ++ .size STRCMP, .-STRCMP + #endif +diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S +new file mode 100644 +index 00000000..8a5af369 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S +@@ -0,0 +1,25 @@ ++/* strncasecmp_l optimized with EVEX. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_evex ++#endif ++#define OVERFLOW_STRCMP __strcasecmp_l_evex ++#define USE_AS_STRCASECMP_L ++#define USE_AS_STRNCMP ++#include "strcmp-evex.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-99.patch b/glibc-RHEL-15696-99.patch new file mode 100644 index 0000000..06d5d53 --- /dev/null +++ b/glibc-RHEL-15696-99.patch @@ -0,0 +1,913 @@ +From 305769b2a15c2e96f9e1b5195d3c4e0d6f0f4b68 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:46 -0500 +Subject: [PATCH] x86: Remove AVX str{n}casecmp +Content-type: text/plain; charset=UTF-8 + +The rational is: + +1. SSE42 has nearly identical logic so any benefit is minimal (3.4% + regression on Tigerlake using SSE42 versus AVX across the + benchtest suite). +2. AVX2 version covers the majority of targets that previously + prefered it. +3. The targets where AVX would still be best (SnB and IVB) are + becoming outdated. + +All in all the saving the code size is worth it. + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/Makefile | 2 - + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 - + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 - + sysdeps/x86_64/multiarch/strcasecmp_l-avx.S | 22 -- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 240 +++++++++----------- + sysdeps/x86_64/multiarch/strncase_l-avx.S | 22 -- + 6 files changed, 105 insertions(+), 197 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx.S + delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 359712c1..bca82e38 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -50,7 +50,6 @@ sysdep_routines += \ + stpncpy-evex \ + stpncpy-sse2-unaligned \ + stpncpy-ssse3 \ +- strcasecmp_l-avx \ + strcasecmp_l-avx2 \ + strcasecmp_l-avx2-rtm \ + strcasecmp_l-evex \ +@@ -91,7 +90,6 @@ sysdep_routines += \ + strlen-avx2-rtm \ + strlen-evex \ + strlen-sse2 \ +- strncase_l-avx \ + strncase_l-avx2 \ + strncase_l-avx2-rtm \ + strncase_l-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 1dedc637..14314367 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -429,9 +429,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcasecmp_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strcasecmp, +- CPU_FEATURE_USABLE (AVX), +- __strcasecmp_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (SSE4_2), + __strcasecmp_sse42) +@@ -453,9 +450,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcasecmp_l_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strcasecmp_l, +- CPU_FEATURE_USABLE (AVX), +- __strcasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + CPU_FEATURE_USABLE (SSE4_2), + __strcasecmp_l_sse42) +@@ -591,9 +585,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncasecmp_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strncasecmp, +- CPU_FEATURE_USABLE (AVX), +- __strncasecmp_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (SSE4_2), + __strncasecmp_sse42) +@@ -616,9 +607,6 @@ __libc_ifunc_impl_list (const char *name, struct 
libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncasecmp_l_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strncasecmp_l, +- CPU_FEATURE_USABLE (AVX), +- __strncasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + CPU_FEATURE_USABLE (SSE4_2), + __strncasecmp_l_sse42) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 6dd49a21..34cfbb8f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -22,7 +22,6 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; +@@ -46,9 +45,6 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (avx2); + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) +- return OPTIMIZE (avx); +- + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2) + && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) + return OPTIMIZE (sse42); +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S +deleted file mode 100644 +index 56a03547..00000000 +--- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S ++++ /dev/null +@@ -1,22 +0,0 @@ +-/* strcasecmp_l optimized with AVX. +- Copyright (C) 2017-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . 
*/ +- +-#define STRCMP_SSE42 __strcasecmp_l_avx +-#define USE_AVX 1 +-#define USE_AS_STRCASECMP_L +-#include "strcmp-sse42.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index 59e8ddfc..0a42b7a4 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -42,13 +42,8 @@ + # define UPDATE_STRNCMP_COUNTER + #endif + +-#ifdef USE_AVX +-# define SECTION avx +-# define GLABEL(l) l##_avx +-#else +-# define SECTION sse4.2 +-# define GLABEL(l) l##_sse42 +-#endif ++#define SECTION sse4.2 ++#define GLABEL(l) l##_sse42 + + #define LABEL(l) .L##l + +@@ -106,21 +101,7 @@ END (GLABEL(__strncasecmp)) + #endif + + +-#ifdef USE_AVX +-# define movdqa vmovdqa +-# define movdqu vmovdqu +-# define pmovmskb vpmovmskb +-# define pcmpistri vpcmpistri +-# define psubb vpsubb +-# define pcmpeqb vpcmpeqb +-# define psrldq vpsrldq +-# define pslldq vpslldq +-# define palignr vpalignr +-# define pxor vpxor +-# define D(arg) arg, arg +-#else +-# define D(arg) arg +-#endif ++#define arg arg + + STRCMP_SSE42: + cfi_startproc +@@ -192,18 +173,7 @@ LABEL(case_add): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +-# ifdef USE_AVX +-# define TOLOWER(reg1, reg2) \ +- vpaddb LCASE_MIN_reg, reg1, %xmm7; \ +- vpaddb LCASE_MIN_reg, reg2, %xmm8; \ +- vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ +- vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ +- vpandn CASE_ADD_reg, %xmm7, %xmm7; \ +- vpandn CASE_ADD_reg, %xmm8, %xmm8; \ +- vpaddb %xmm7, reg1, reg1; \ +- vpaddb %xmm8, reg2, reg2 +-# else +-# define TOLOWER(reg1, reg2) \ ++# define TOLOWER(reg1, reg2) \ + movdqa LCASE_MIN_reg, %xmm7; \ + movdqa LCASE_MIN_reg, %xmm8; \ + paddb reg1, %xmm7; \ +@@ -214,15 +184,15 @@ LABEL(case_add): + pandn CASE_ADD_reg, %xmm8; \ + paddb %xmm7, reg1; \ + paddb %xmm8, reg2 +-# endif ++ + TOLOWER (%xmm1, %xmm2) + #else + # define TOLOWER(reg1, reg2) + #endif +- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ +- pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ +- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ ++ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ ++ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ ++ psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes)/* If not, find different value or null char */ +@@ -246,7 +216,7 @@ LABEL(crosscache): + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ +- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ ++ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) +@@ -260,7 +230,7 @@ LABEL(bigger): + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + lea (%r10, %r9), %r10 + _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ + +@@ -273,15 +243,15 @@ LABEL(bigger): + LABEL(ashr_0): + + movdqa (%rsi), %xmm1 +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L +- pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ ++ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ + #else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ ++ pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ + #endif +- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ ++ psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ +@@ -361,10 +331,10 @@ LABEL(ashr_0_exit_use): + */ + .p2align 4 + LABEL(ashr_1): +- pslldq $15, D(%xmm2) /* shift first string to align with second */ ++ pslldq $15, %xmm2 /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ +- psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ ++ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ ++ psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ +@@ -392,7 +362,7 @@ LABEL(loop_ashr_1_use): + + LABEL(nibble_ashr_1_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $1, -16(%rdi, %rdx), D(%xmm0) ++ palignr $1, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -411,7 +381,7 @@ LABEL(nibble_ashr_1_restart_use): + jg LABEL(nibble_ashr_1_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $1, -16(%rdi, %rdx), D(%xmm0) ++ palignr $1, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -431,7 +401,7 @@ LABEL(nibble_ashr_1_restart_use): + LABEL(nibble_ashr_1_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $1, D(%xmm0) ++ psrldq $1, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -449,10 +419,10 @@ LABEL(nibble_ashr_1_use): + */ + .p2align 4 + LABEL(ashr_2): +- pslldq $14, D(%xmm2) ++ pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -480,7 +450,7 @@ LABEL(loop_ashr_2_use): + + LABEL(nibble_ashr_2_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $2, -16(%rdi, %rdx), D(%xmm0) ++ palignr $2, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -499,7 +469,7 @@ LABEL(nibble_ashr_2_restart_use): + jg LABEL(nibble_ashr_2_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $2, -16(%rdi, %rdx), D(%xmm0) ++ palignr $2, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -519,7 +489,7 @@ LABEL(nibble_ashr_2_restart_use): + LABEL(nibble_ashr_2_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $2, D(%xmm0) ++ psrldq $2, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -537,10 +507,10 @@ LABEL(nibble_ashr_2_use): + */ + .p2align 4 + LABEL(ashr_3): +- pslldq $13, D(%xmm2) ++ pslldq $13, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb 
%xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -568,7 +538,7 @@ LABEL(loop_ashr_3_use): + + LABEL(nibble_ashr_3_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $3, -16(%rdi, %rdx), D(%xmm0) ++ palignr $3, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -587,7 +557,7 @@ LABEL(nibble_ashr_3_restart_use): + jg LABEL(nibble_ashr_3_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $3, -16(%rdi, %rdx), D(%xmm0) ++ palignr $3, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -607,7 +577,7 @@ LABEL(nibble_ashr_3_restart_use): + LABEL(nibble_ashr_3_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $3, D(%xmm0) ++ psrldq $3, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -625,10 +595,10 @@ LABEL(nibble_ashr_3_use): + */ + .p2align 4 + LABEL(ashr_4): +- pslldq $12, D(%xmm2) ++ pslldq $12, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -657,7 +627,7 @@ LABEL(loop_ashr_4_use): + + LABEL(nibble_ashr_4_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $4, -16(%rdi, %rdx), D(%xmm0) ++ palignr $4, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -676,7 +646,7 @@ LABEL(nibble_ashr_4_restart_use): + jg LABEL(nibble_ashr_4_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $4, -16(%rdi, %rdx), D(%xmm0) ++ palignr $4, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -696,7 +666,7 @@ LABEL(nibble_ashr_4_restart_use): + LABEL(nibble_ashr_4_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $4, D(%xmm0) ++ psrldq $4, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -714,10 +684,10 @@ LABEL(nibble_ashr_4_use): + */ + .p2align 4 + LABEL(ashr_5): +- pslldq $11, D(%xmm2) ++ pslldq $11, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -746,7 +716,7 @@ LABEL(loop_ashr_5_use): + + LABEL(nibble_ashr_5_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $5, -16(%rdi, %rdx), D(%xmm0) ++ palignr $5, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -766,7 +736,7 @@ LABEL(nibble_ashr_5_restart_use): + + movdqa (%rdi, %rdx), %xmm0 + +- palignr $5, -16(%rdi, %rdx), D(%xmm0) ++ palignr $5, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -786,7 +756,7 @@ LABEL(nibble_ashr_5_restart_use): + LABEL(nibble_ashr_5_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $5, D(%xmm0) ++ psrldq $5, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -804,10 +774,10 @@ LABEL(nibble_ashr_5_use): + */ + .p2align 4 + LABEL(ashr_6): +- pslldq $10, D(%xmm2) ++ pslldq $10, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) 
+- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -836,7 +806,7 @@ LABEL(loop_ashr_6_use): + + LABEL(nibble_ashr_6_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $6, -16(%rdi, %rdx), D(%xmm0) ++ palignr $6, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -855,7 +825,7 @@ LABEL(nibble_ashr_6_restart_use): + jg LABEL(nibble_ashr_6_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $6, -16(%rdi, %rdx), D(%xmm0) ++ palignr $6, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -875,7 +845,7 @@ LABEL(nibble_ashr_6_restart_use): + LABEL(nibble_ashr_6_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $6, D(%xmm0) ++ psrldq $6, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -893,10 +863,10 @@ LABEL(nibble_ashr_6_use): + */ + .p2align 4 + LABEL(ashr_7): +- pslldq $9, D(%xmm2) ++ pslldq $9, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -925,7 +895,7 @@ LABEL(loop_ashr_7_use): + + LABEL(nibble_ashr_7_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $7, -16(%rdi, %rdx), D(%xmm0) ++ palignr $7, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -944,7 +914,7 @@ LABEL(nibble_ashr_7_restart_use): + jg LABEL(nibble_ashr_7_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $7, -16(%rdi, %rdx), D(%xmm0) ++ palignr $7, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -964,7 +934,7 @@ LABEL(nibble_ashr_7_restart_use): + LABEL(nibble_ashr_7_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $7, D(%xmm0) ++ psrldq $7, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -982,10 +952,10 @@ LABEL(nibble_ashr_7_use): + */ + .p2align 4 + LABEL(ashr_8): +- pslldq $8, D(%xmm2) ++ pslldq $8, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1014,7 +984,7 @@ LABEL(loop_ashr_8_use): + + LABEL(nibble_ashr_8_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $8, -16(%rdi, %rdx), D(%xmm0) ++ palignr $8, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1033,7 +1003,7 @@ LABEL(nibble_ashr_8_restart_use): + jg LABEL(nibble_ashr_8_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $8, -16(%rdi, %rdx), D(%xmm0) ++ palignr $8, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1053,7 +1023,7 @@ LABEL(nibble_ashr_8_restart_use): + LABEL(nibble_ashr_8_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $8, D(%xmm0) ++ psrldq $8, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1071,10 +1041,10 @@ LABEL(nibble_ashr_8_use): + */ + .p2align 4 + LABEL(ashr_9): +- pslldq $7, D(%xmm2) ++ 
pslldq $7, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1104,7 +1074,7 @@ LABEL(loop_ashr_9_use): + LABEL(nibble_ashr_9_restart_use): + movdqa (%rdi, %rdx), %xmm0 + +- palignr $9, -16(%rdi, %rdx), D(%xmm0) ++ palignr $9, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1123,7 +1093,7 @@ LABEL(nibble_ashr_9_restart_use): + jg LABEL(nibble_ashr_9_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $9, -16(%rdi, %rdx), D(%xmm0) ++ palignr $9, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1143,7 +1113,7 @@ LABEL(nibble_ashr_9_restart_use): + LABEL(nibble_ashr_9_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $9, D(%xmm0) ++ psrldq $9, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1161,10 +1131,10 @@ LABEL(nibble_ashr_9_use): + */ + .p2align 4 + LABEL(ashr_10): +- pslldq $6, D(%xmm2) ++ pslldq $6, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1193,7 +1163,7 @@ LABEL(loop_ashr_10_use): + + LABEL(nibble_ashr_10_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $10, -16(%rdi, %rdx), D(%xmm0) ++ palignr $10, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1212,7 +1182,7 @@ LABEL(nibble_ashr_10_restart_use): + jg LABEL(nibble_ashr_10_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $10, -16(%rdi, %rdx), D(%xmm0) ++ palignr $10, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1232,7 +1202,7 @@ LABEL(nibble_ashr_10_restart_use): + LABEL(nibble_ashr_10_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $10, D(%xmm0) ++ psrldq $10, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1250,10 +1220,10 @@ LABEL(nibble_ashr_10_use): + */ + .p2align 4 + LABEL(ashr_11): +- pslldq $5, D(%xmm2) ++ pslldq $5, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1282,7 +1252,7 @@ LABEL(loop_ashr_11_use): + + LABEL(nibble_ashr_11_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $11, -16(%rdi, %rdx), D(%xmm0) ++ palignr $11, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1301,7 +1271,7 @@ LABEL(nibble_ashr_11_restart_use): + jg LABEL(nibble_ashr_11_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $11, -16(%rdi, %rdx), D(%xmm0) ++ palignr $11, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1321,7 +1291,7 @@ LABEL(nibble_ashr_11_restart_use): + LABEL(nibble_ashr_11_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $11, D(%xmm0) ++ psrldq $11, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, 
%rcx +@@ -1339,10 +1309,10 @@ LABEL(nibble_ashr_11_use): + */ + .p2align 4 + LABEL(ashr_12): +- pslldq $4, D(%xmm2) ++ pslldq $4, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1371,7 +1341,7 @@ LABEL(loop_ashr_12_use): + + LABEL(nibble_ashr_12_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $12, -16(%rdi, %rdx), D(%xmm0) ++ palignr $12, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1390,7 +1360,7 @@ LABEL(nibble_ashr_12_restart_use): + jg LABEL(nibble_ashr_12_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $12, -16(%rdi, %rdx), D(%xmm0) ++ palignr $12, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1410,7 +1380,7 @@ LABEL(nibble_ashr_12_restart_use): + LABEL(nibble_ashr_12_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $12, D(%xmm0) ++ psrldq $12, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1428,10 +1398,10 @@ LABEL(nibble_ashr_12_use): + */ + .p2align 4 + LABEL(ashr_13): +- pslldq $3, D(%xmm2) ++ pslldq $3, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1461,7 +1431,7 @@ LABEL(loop_ashr_13_use): + + LABEL(nibble_ashr_13_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $13, -16(%rdi, %rdx), D(%xmm0) ++ palignr $13, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1480,7 +1450,7 @@ LABEL(nibble_ashr_13_restart_use): + jg LABEL(nibble_ashr_13_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $13, -16(%rdi, %rdx), D(%xmm0) ++ palignr $13, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1500,7 +1470,7 @@ LABEL(nibble_ashr_13_restart_use): + LABEL(nibble_ashr_13_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $13, D(%xmm0) ++ psrldq $13, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1518,10 +1488,10 @@ LABEL(nibble_ashr_13_use): + */ + .p2align 4 + LABEL(ashr_14): +- pslldq $2, D(%xmm2) ++ pslldq $2, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1551,7 +1521,7 @@ LABEL(loop_ashr_14_use): + + LABEL(nibble_ashr_14_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $14, -16(%rdi, %rdx), D(%xmm0) ++ palignr $14, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1570,7 +1540,7 @@ LABEL(nibble_ashr_14_restart_use): + jg LABEL(nibble_ashr_14_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $14, -16(%rdi, %rdx), D(%xmm0) ++ palignr $14, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1590,7 +1560,7 @@ LABEL(nibble_ashr_14_restart_use): + LABEL(nibble_ashr_14_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $14, 
D(%xmm0) ++ psrldq $14, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1608,10 +1578,10 @@ LABEL(nibble_ashr_14_use): + */ + .p2align 4 + LABEL(ashr_15): +- pslldq $1, D(%xmm2) ++ pslldq $1, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1643,7 +1613,7 @@ LABEL(loop_ashr_15_use): + + LABEL(nibble_ashr_15_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $15, -16(%rdi, %rdx), D(%xmm0) ++ palignr $15, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1662,7 +1632,7 @@ LABEL(nibble_ashr_15_restart_use): + jg LABEL(nibble_ashr_15_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $15, -16(%rdi, %rdx), D(%xmm0) ++ palignr $15, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1682,7 +1652,7 @@ LABEL(nibble_ashr_15_restart_use): + LABEL(nibble_ashr_15_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $15, D(%xmm0) ++ psrldq $15, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S +deleted file mode 100644 +index 0c4e525b..00000000 +--- a/sysdeps/x86_64/multiarch/strncase_l-avx.S ++++ /dev/null +@@ -1,22 +0,0 @@ +-/* strncasecmp_l optimized with AVX. +- Copyright (C) 2017-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#define STRCMP_SSE42 __strncasecmp_l_avx +-#define USE_AVX 1 +-#define USE_AS_STRNCASECMP_L +-#include "strcmp-sse42.S" +-- +GitLab + diff --git a/glibc-RHEL-15867.patch b/glibc-RHEL-15867.patch new file mode 100644 index 0000000..7df2fb8 --- /dev/null +++ b/glibc-RHEL-15867.patch @@ -0,0 +1,47 @@ +commit 2337e04e21ba6040926ec871e403533f77043c40 +Author: Siddhesh Poyarekar +Date: Thu Feb 2 07:49:02 2023 -0500 + + cdefs: Limit definition of fortification macros + + Define the __glibc_fortify and other macros only when __FORTIFY_LEVEL > + 0. This has the effect of not defining these macros on older C90 + compilers that do not have support for variable length argument lists. + + Also trim off the trailing backslashes from the definition of + __glibc_fortify and __glibc_fortify_n macros. 
+ + Signed-off-by: Siddhesh Poyarekar + Reviewed-by: Florian Weimer + +diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h +index f3d7efdd2a9320f7..46ec4ef71e14c569 100644 +--- a/misc/sys/cdefs.h ++++ b/misc/sys/cdefs.h +@@ -133,6 +133,7 @@ + # define __glibc_objsize(__o) __bos (__o) + #endif + ++#if __USE_FORTIFY_LEVEL > 0 + /* Compile time conditions to choose between the regular, _chk and _chk_warn + variants. These conditions should get evaluated to constant and optimized + away. */ +@@ -168,7 +169,7 @@ + ? __ ## f ## _alias (__VA_ARGS__) \ + : (__glibc_unsafe_len (__l, __s, __osz) \ + ? __ ## f ## _chk_warn (__VA_ARGS__, __osz) \ +- : __ ## f ## _chk (__VA_ARGS__, __osz))) \ ++ : __ ## f ## _chk (__VA_ARGS__, __osz))) + + /* Fortify function f, where object size argument passed to f is the number of + elements and not total size. */ +@@ -178,7 +179,8 @@ + ? __ ## f ## _alias (__VA_ARGS__) \ + : (__glibc_unsafe_len (__l, __s, __osz) \ + ? __ ## f ## _chk_warn (__VA_ARGS__, (__osz) / (__s)) \ +- : __ ## f ## _chk (__VA_ARGS__, (__osz) / (__s)))) \ ++ : __ ## f ## _chk (__VA_ARGS__, (__osz) / (__s)))) ++#endif + + #if __GNUC_PREREQ (4,3) + # define __warndecl(name, msg) \ diff --git a/glibc-RHEL-21522-1.patch b/glibc-RHEL-16825-1.patch similarity index 95% rename from glibc-RHEL-21522-1.patch rename to glibc-RHEL-16825-1.patch index 9cc6c79..c13cfb3 100644 --- a/glibc-RHEL-21522-1.patch +++ b/glibc-RHEL-16825-1.patch @@ -10,7 +10,7 @@ Date: Mon Nov 27 11:28:07 2023 +0100 Reviewed-by: Carlos O'Donell diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c -index 7a84b1fa8c3a7fdd..a80a54fb013adab5 100644 +index 7d8b2bd2336eecb6..66c9266d7f9d65af 100644 --- a/elf/dl-reloc.c +++ b/elf/dl-reloc.c @@ -165,6 +165,9 @@ void diff --git a/glibc-RHEL-21522-2.patch b/glibc-RHEL-16825-2.patch similarity index 98% rename from glibc-RHEL-21522-2.patch rename to glibc-RHEL-16825-2.patch index f42374b..37d2bf9 100644 --- a/glibc-RHEL-21522-2.patch +++ b/glibc-RHEL-16825-2.patch @@ -9,7 +9,7 @@ Date: Mon Nov 27 11:28:10 2023 +0100 Reviewed-by: Carlos O'Donell diff --git a/elf/dl-open.c b/elf/dl-open.c -index e82e53ff8b38fa11..1505fdb73088dcdb 100644 +index 7dfb6b680c108c0b..160451790bb88447 100644 --- a/elf/dl-open.c +++ b/elf/dl-open.c @@ -466,6 +466,50 @@ activate_nodelete (struct link_map *new) diff --git a/glibc-RHEL-21522-3.patch b/glibc-RHEL-16825-3.patch similarity index 95% rename from glibc-RHEL-21522-3.patch rename to glibc-RHEL-16825-3.patch index 1471f87..758e156 100644 --- a/glibc-RHEL-21522-3.patch +++ b/glibc-RHEL-16825-3.patch @@ -15,11 +15,12 @@ Conflicts: elf/rtld.c (removal of prelink support upstream) + diff --git a/elf/Makefile b/elf/Makefile -index 634c3113227d64a6..6f0f36cdfe3961e8 100644 +index 42dc878209b11d29..ebf46a297d241d8f 100644 --- a/elf/Makefile +++ b/elf/Makefile -@@ -386,6 +386,8 @@ tests += \ +@@ -387,6 +387,8 @@ tests += \ tst-nodelete2 \ tst-nodelete-dlclose \ tst-nodelete-opened \ @@ -28,7 +29,7 @@ index 634c3113227d64a6..6f0f36cdfe3961e8 100644 tst-noload \ tst-null-argv \ tst-relsort1 \ -@@ -740,6 +742,8 @@ modules-names = \ +@@ -743,6 +745,8 @@ modules-names = \ tst-nodelete-dlclose-dso \ tst-nodelete-dlclose-plugin \ tst-nodelete-opened-lib \ @@ -37,7 +38,7 @@ index 634c3113227d64a6..6f0f36cdfe3961e8 100644 tst-null-argv-lib \ tst-relsort1mod1 \ tst-relsort1mod2 \ -@@ -886,8 +890,13 @@ modules-execstack-yes = tst-execstack-mod +@@ -889,8 +893,13 @@ modules-execstack-yes = tst-execstack-mod extra-test-objs += $(addsuffix .os,$(strip $(modules-names))) # filtmod1.so has a 
special rule @@ -53,11 +54,10 @@ index 634c3113227d64a6..6f0f36cdfe3961e8 100644 tests += $(tests-static) -@@ -2697,3 +2706,19 @@ $(objpfx)tst-dlmopen-twice: $(libdl) - $(objpfx)tst-dlmopen-twice.out: \ - $(objpfx)tst-dlmopen-twice-mod1.so \ - $(objpfx)tst-dlmopen-twice-mod2.so -+ +@@ -2707,3 +2716,18 @@ $(objpfx)tst-dlclose-lazy: $(libdl) + $(objpfx)tst-dlclose-lazy.out: \ + $(objpfx)tst-dlclose-lazy-mod1.so $(objpfx)tst-dlclose-lazy-mod2.so + +# The object tst-nodeps1-mod.so has no explicit dependencies on libc.so. +$(objpfx)tst-nodeps1-mod.so: $(objpfx)tst-nodeps1-mod.os + $(LINK.o) -nostartfiles -nostdlib -shared -o $@ $^ @@ -74,7 +74,7 @@ index 634c3113227d64a6..6f0f36cdfe3961e8 100644 +$(objpfx)tst-nodeps2.out: \ + $(objpfx)tst-nodeps1-mod.so $(objpfx)tst-nodeps2-mod.so diff --git a/elf/dl-open.c b/elf/dl-open.c -index 1505fdb73088dcdb..6508b0ea545440b8 100644 +index 160451790bb88447..f32e2fd4ee39db93 100644 --- a/elf/dl-open.c +++ b/elf/dl-open.c @@ -692,6 +692,17 @@ dl_open_worker_begin (void *a) diff --git a/glibc-RHEL-21522-4.patch b/glibc-RHEL-16825-4.patch similarity index 100% rename from glibc-RHEL-21522-4.patch rename to glibc-RHEL-16825-4.patch diff --git a/glibc-RHEL-17468-1.patch b/glibc-RHEL-17468-1.patch new file mode 100644 index 0000000..7d7209a --- /dev/null +++ b/glibc-RHEL-17468-1.patch @@ -0,0 +1,47 @@ +commit 3921c5b40f293c57cb326f58713c924b0662ef59 +Author: Hector Martin +Date: Tue Nov 28 15:23:07 2023 +0900 + + elf: Fix TLS modid reuse generation assignment (BZ 29039) + + _dl_assign_tls_modid() assigns a slotinfo entry for a new module, but + does *not* do anything to the generation counter. The first time this + happens, the generation is zero and map_generation() returns the current + generation to be used during relocation processing. However, if + a slotinfo entry is later reused, it will already have a generation + assigned. If this generation has fallen behind the current global max + generation, then this causes an obsolete generation to be assigned + during relocation processing, as map_generation() returns this + generation if nonzero. _dl_add_to_slotinfo() eventually resets the + generation, but by then it is too late. This causes DTV updates to be + skipped, leading to NULL or broken TLS slot pointers and segfaults. + + Fix this by resetting the generation to zero in _dl_assign_tls_modid(), + so it behaves the same as the first time a slot is assigned. + _dl_add_to_slotinfo() will still assign the correct static generation + later during module load, but relocation processing will no longer use + an obsolete generation. + + Note that slotinfo entry (aka modid) reuse typically happens after a + dlclose and only TLS access via dynamic tlsdesc is affected. Because + tlsdesc is optimized to use the optional part of static TLS, dynamic + tlsdesc can be avoided by increasing the glibc.rtld.optional_static_tls + tunable to a large enough value, or by LD_PRELOAD-ing the affected + modules. + + Fixes bug 29039. + + Reviewed-by: Szabolcs Nagy + +diff --git a/elf/dl-tls.c b/elf/dl-tls.c +index a21276732968d88b..c8104078b2aa0aa2 100644 +--- a/elf/dl-tls.c ++++ b/elf/dl-tls.c +@@ -156,6 +156,7 @@ _dl_assign_tls_modid (struct link_map *l) + { + /* Mark the entry as used, so any dependency see it. 
*/ + atomic_store_relaxed (&runp->slotinfo[result - disp].map, l); ++ atomic_store_relaxed (&runp->slotinfo[result - disp].gen, 0); + break; + } + diff --git a/glibc-RHEL-17468-2.patch b/glibc-RHEL-17468-2.patch new file mode 100644 index 0000000..3722477 --- /dev/null +++ b/glibc-RHEL-17468-2.patch @@ -0,0 +1,198 @@ +commit 980450f12685326729d63ff72e93a996113bf073 +Author: Szabolcs Nagy +Date: Wed Nov 29 11:31:37 2023 +0000 + + elf: Add TLS modid reuse test for bug 29039 + + This is a minimal regression test for bug 29039 which only affects + targets with TLSDESC and a reproducer requires that + + 1) Have modid gaps (closed modules) with old generation. + 2) Update a DTV to a newer generation (needs a newer dlopen). + 3) But do not update the closed gap entry in that DTV. + 4) Reuse the modid gap for a new module (another dlopen). + 5) Use dynamic TLSDESC in that new module with old generation (bug). + 6) Access TLS via this TLSDESC and the now outdated DTV. + + However step (3) in practice rarely happens: during DTV update the + entries for closed modids are initialized to "unallocated" and then + dynamic TLSDESC calls __tls_get_addr independently of its generation. + The only exception to this is DTV setup at thread creation (gaps are + initialized to NULL instead of unallocated) or DTV resize where the + gap entries are outside the previous DTV array (again NULL instead + of unallocated, and this requires loading > DTV_SURPLUS modules). + + So the bug can only cause NULL (+ offset) dereference, not use after + free. And the easiest way to get (3) is via thread creation. + + Note that step (5) requires that the newly loaded module has larger + TLS than the remaining optional static TLS. And for (6) there cannot + be other TLS access or dlopen in the thread that updates the DTV. + + Tested on aarch64-linux-gnu. + + Reviewed-by: Adhemerval Zanella + +Conflicts: + elf/Makefile + (Add $(libdl), Resolve test case ordering conflict.) + +diff --git a/elf/Makefile b/elf/Makefile +index ebf46a297d241d8f..b8fdee7c0d37137e 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -416,6 +416,7 @@ tests += \ + tst-tls-ie \ + tst-tls-ie-dlmopen \ + tst-tls-manydynamic \ ++ tst-tlsgap \ + tst-unique1 \ + tst-unique2 \ + unload3 \ +@@ -759,6 +760,9 @@ modules-names = \ + tst-tls20mod-bad \ + tst-tls21mod \ + tst-tlsalign-lib \ ++ tst-tlsgap-mod0 \ ++ tst-tlsgap-mod1 \ ++ tst-tlsgap-mod2 \ + tst-tls-ie-mod0 \ + tst-tls-ie-mod1 \ + tst-tls-ie-mod2 \ +@@ -2731,3 +2735,14 @@ $(objpfx)tst-nodeps2-mod.so: $(common-objpfx)libc.so \ + $(objpfx)tst-nodeps2: $(libdl) + $(objpfx)tst-nodeps2.out: \ + $(objpfx)tst-nodeps1-mod.so $(objpfx)tst-nodeps2-mod.so ++ ++$(objpfx)tst-tlsgap: $(libdl) $(shared-thread-library) ++$(objpfx)tst-tlsgap.out: \ ++ $(objpfx)tst-tlsgap-mod0.so \ ++ $(objpfx)tst-tlsgap-mod1.so \ ++ $(objpfx)tst-tlsgap-mod2.so ++ifeq (yes,$(have-mtls-dialect-gnu2)) ++CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2 ++CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2 ++CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2 ++endif +diff --git a/elf/tst-tlsgap-mod0.c b/elf/tst-tlsgap-mod0.c +new file mode 100644 +index 0000000000000000..1478b0beac5faf98 +--- /dev/null ++++ b/elf/tst-tlsgap-mod0.c +@@ -0,0 +1,2 @@ ++int __thread tls0; ++int *f0(void) { return &tls0; } +diff --git a/elf/tst-tlsgap-mod1.c b/elf/tst-tlsgap-mod1.c +new file mode 100644 +index 0000000000000000..b10fc3702c43e478 +--- /dev/null ++++ b/elf/tst-tlsgap-mod1.c +@@ -0,0 +1,2 @@ ++int __thread tls1[100]; /* Size > glibc.rtld.optional_static_tls / 2. 
*/ ++int *f1(void) { return tls1; } +diff --git a/elf/tst-tlsgap-mod2.c b/elf/tst-tlsgap-mod2.c +new file mode 100644 +index 0000000000000000..166c27d7f3fac252 +--- /dev/null ++++ b/elf/tst-tlsgap-mod2.c +@@ -0,0 +1,2 @@ ++int __thread tls2; ++int *f2(void) { return &tls2; } +diff --git a/elf/tst-tlsgap.c b/elf/tst-tlsgap.c +new file mode 100644 +index 0000000000000000..49328850769c5609 +--- /dev/null ++++ b/elf/tst-tlsgap.c +@@ -0,0 +1,92 @@ ++/* TLS modid gap reuse regression test for bug 29039. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void *mod[3]; ++#define MOD(i) "tst-tlsgap-mod" #i ".so" ++static const char *modname[3] = { MOD(0), MOD(1), MOD(2) }; ++#undef MOD ++ ++static void ++open_mod (int i) ++{ ++ mod[i] = xdlopen (modname[i], RTLD_LAZY); ++ printf ("open %s\n", modname[i]); ++} ++ ++static void ++close_mod (int i) ++{ ++ xdlclose (mod[i]); ++ mod[i] = NULL; ++ printf ("close %s\n", modname[i]); ++} ++ ++static void ++access_mod (int i, const char *sym) ++{ ++ int *(*f) (void) = xdlsym (mod[i], sym); ++ int *p = f (); ++ printf ("access %s: %s() = %p\n", modname[i], sym, p); ++ TEST_VERIFY_EXIT (p != NULL); ++ ++*p; ++} ++ ++static void * ++start (void *arg) ++{ ++ /* The DTV generation is at the last dlopen of mod0 and the ++ entry for mod1 is NULL. */ ++ ++ open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS. */ ++ ++ /* DTV is unchanged: dlopen only updates the DTV to the latest ++ generation if static TLS is allocated for a loaded module. ++ ++ With bug 29039, the TLSDESC relocation in mod1 uses the old ++ dlclose generation of mod1 instead of the new dlopen one so ++ DTV is not updated on TLS access. */ ++ ++ access_mod (1, "f1"); ++ ++ return arg; ++} ++ ++static int ++do_test (void) ++{ ++ open_mod (0); ++ open_mod (1); ++ open_mod (2); ++ close_mod (0); ++ close_mod (1); /* Create modid gap at mod1. */ ++ open_mod (0); /* Reuse modid of mod0, bump generation count. */ ++ ++ /* Create a thread where DTV of mod1 is NULL. */ ++ pthread_t t = xpthread_create (NULL, start, NULL); ++ xpthread_join (t); ++ return 0; ++} ++ ++#include diff --git a/glibc-RHEL-19445.patch b/glibc-RHEL-19445.patch new file mode 100644 index 0000000..b3769d3 --- /dev/null +++ b/glibc-RHEL-19445.patch @@ -0,0 +1,31 @@ +Based on the following commit, adjusted for glibc-2.28 in RHEL-8: + +commit 5eabdb6a6ac1599d23dd5966a37417215950245f +Author: Andreas Schwab +Date: Wed Dec 6 14:48:22 2023 +0100 + + getaddrinfo: translate ENOMEM to EAI_MEMORY (bug 31163) + + When __resolv_context_get returns NULL due to out of memory, translate it + to a return value of EAI_MEMORY. 
+ +diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c +index 46046504a6858f2e..d0708f3e84e20025 100644 +--- a/sysdeps/posix/getaddrinfo.c ++++ b/sysdeps/posix/getaddrinfo.c +@@ -777,7 +777,14 @@ gaih_inet (const char *name, const struct gaih_service *service, + res_ctx = __resolv_context_get (); + res_enable_inet6 = __resolv_context_disable_inet6 (res_ctx); + if (res_ctx == NULL) +- no_more = 1; ++ { ++ if (errno == ENOMEM) ++ { ++ result = -EAI_MEMORY; ++ goto free_and_return; ++ } ++ no_more = 1; ++ } + + while (!no_more) + { diff --git a/glibc-RHEL-22847.patch b/glibc-RHEL-19824.patch similarity index 100% rename from glibc-RHEL-22847.patch rename to glibc-RHEL-19824.patch diff --git a/glibc-RHEL-2122.patch b/glibc-RHEL-2122.patch new file mode 100644 index 0000000..69a294f --- /dev/null +++ b/glibc-RHEL-2122.patch @@ -0,0 +1,312 @@ +From d2123d68275acc0f061e73d5f86ca504e0d5a344 Mon Sep 17 00:00:00 2001 +From: Szabolcs Nagy +Date: Tue, 16 Feb 2021 12:55:13 +0000 +Subject: elf: Fix slow tls access after dlopen [BZ #19924] + +In short: __tls_get_addr checks the global generation counter and if +the current dtv is older then _dl_update_slotinfo updates dtv up to the +generation of the accessed module. So if the global generation is newer +than generation of the module then __tls_get_addr keeps hitting the +slow dtv update path. The dtv update path includes a number of checks +to see if any update is needed and this already causes measurable tls +access slow down after dlopen. + +It may be possible to detect up-to-date dtv faster. But if there are +many modules loaded (> TLS_SLOTINFO_SURPLUS) then this requires at +least walking the slotinfo list. + +This patch tries to update the dtv to the global generation instead, so +after a dlopen the tls access slow path is only hit once. The modules +with larger generation than the accessed one were not necessarily +synchronized before, so additional synchronization is needed. + +This patch uses acquire/release synchronization when accessing the +generation counter. + +Note: in the x86_64 version of dl-tls.c the generation is only loaded +once, since relaxed mo is not faster than acquire mo load. + +I have not benchmarked this. Tested by Adhemerval Zanella on aarch64, +powerpc, sparc, x86 who reported that it fixes the performance issue +of bug 19924. + +Reviewed-by: Adhemerval Zanella + +[rebased to c8s by DJ] + +diff -rup a/elf/dl-close.c b/elf/dl-close.c +--- a/elf/dl-close.c 2023-10-13 16:24:27.068217519 -0400 ++++ b/elf/dl-close.c 2023-10-13 16:28:59.936019397 -0400 +@@ -739,7 +739,7 @@ _dl_close_worker (struct link_map *map, + if (__glibc_unlikely (newgen == 0)) + _dl_fatal_printf ("TLS generation counter wrapped! Please report as described in "REPORT_BUGS_TO".\n"); + /* Can be read concurrently. */ +- atomic_store_relaxed (&GL(dl_tls_generation), newgen); ++ atomic_store_release (&GL(dl_tls_generation), newgen); + + if (tls_free_end == GL(dl_tls_static_used)) + GL(dl_tls_static_used) = tls_free_start; +diff -rup a/elf/dl-open.c b/elf/dl-open.c +--- a/elf/dl-open.c 2023-10-13 16:24:26.930212160 -0400 ++++ b/elf/dl-open.c 2023-10-13 16:28:59.936019397 -0400 +@@ -403,7 +403,7 @@ update_tls_slotinfo (struct link_map *ne + _dl_fatal_printf (N_("\ + TLS generation counter wrapped! Please report this.")); + /* Can be read concurrently. 
*/ +- atomic_store_relaxed (&GL(dl_tls_generation), newgen); ++ atomic_store_release (&GL(dl_tls_generation), newgen); + + /* We need a second pass for static tls data, because + _dl_update_slotinfo must not be run while calls to +@@ -420,8 +420,8 @@ TLS generation counter wrapped! Please + now, but we can delay updating the DTV. */ + imap->l_need_tls_init = 0; + #ifdef SHARED +- /* Update the slot information data for at least the +- generation of the DSO we are allocating data for. */ ++ /* Update the slot information data for the current ++ generation. */ + + /* FIXME: This can terminate the process on memory + allocation failure. It is not possible to raise +@@ -429,7 +429,7 @@ TLS generation counter wrapped! Please + _dl_update_slotinfo would have to be split into two + operations, similar to resize_scopes and update_scopes + above. This is related to bug 16134. */ +- _dl_update_slotinfo (imap->l_tls_modid); ++ _dl_update_slotinfo (imap->l_tls_modid, newgen); + #endif + + GL(dl_init_static_tls) (imap); +diff -rup a/elf/dl-reloc.c b/elf/dl-reloc.c +--- a/elf/dl-reloc.c 2023-10-13 16:24:26.390191189 -0400 ++++ b/elf/dl-reloc.c 2023-10-13 16:28:59.937019438 -0400 +@@ -111,11 +111,11 @@ _dl_try_allocate_static_tls (struct link + if (map->l_real->l_relocated) + { + #ifdef SHARED ++ /* Update the DTV of the current thread. Note: GL(dl_load_tls_lock) ++ is held here so normal load of the generation counter is valid. */ + if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation), + 0)) +- /* Update the slot information data for at least the generation of +- the DSO we are allocating data for. */ +- (void) _dl_update_slotinfo (map->l_tls_modid); ++ (void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation)); + #endif + + GL(dl_init_static_tls) (map); +diff -rup a/elf/dl-tls.c b/elf/dl-tls.c +--- a/elf/dl-tls.c 2023-10-13 16:24:26.564197946 -0400 ++++ b/elf/dl-tls.c 2023-10-13 16:28:59.937019438 -0400 +@@ -716,57 +716,57 @@ allocate_and_init (struct link_map *map) + + + struct link_map * +-_dl_update_slotinfo (unsigned long int req_modid) ++_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen) + { + struct link_map *the_map = NULL; + dtv_t *dtv = THREAD_DTV (); + +- /* The global dl_tls_dtv_slotinfo array contains for each module +- index the generation counter current when the entry was created. ++ /* CONCURRENCY NOTES: ++ ++ The global dl_tls_dtv_slotinfo_list array contains for each module ++ index the generation counter current when that entry was updated. + This array never shrinks so that all module indices which were +- valid at some time can be used to access it. Before the first +- use of a new module index in this function the array was extended +- appropriately. Access also does not have to be guarded against +- modifications of the array. It is assumed that pointer-size +- values can be read atomically even in SMP environments. It is +- possible that other threads at the same time dynamically load +- code and therefore add to the slotinfo list. This is a problem +- since we must not pick up any information about incomplete work. +- The solution to this is to ignore all dtv slots which were +- created after the one we are currently interested. We know that +- dynamic loading for this module is completed and this is the last +- load operation we know finished. */ +- unsigned long int idx = req_modid; ++ valid at some time can be used to access it. Concurrent loading ++ and unloading of modules can update slotinfo entries or extend ++ the array. 
The updates happen under the GL(dl_load_tls_lock) and ++ finish with the release store of the generation counter to ++ GL(dl_tls_generation) which is synchronized with the load of ++ new_gen in the caller. So updates up to new_gen are synchronized ++ but updates for later generations may not be. ++ ++ Here we update the thread dtv from old_gen (== dtv[0].counter) to ++ new_gen generation. For this, each dtv[i] entry is either set to ++ an unallocated state (set), or left unmodified (nop). Where (set) ++ may resize the dtv first if modid i >= dtv[-1].counter. The rules ++ for the decision between (set) and (nop) are ++ ++ (1) If slotinfo entry i is concurrently updated then either (set) ++ or (nop) is valid: TLS access cannot use dtv[i] unless it is ++ synchronized with a generation > new_gen. ++ ++ Otherwise, if the generation of slotinfo entry i is gen and the ++ loaded module for this entry is map then ++ ++ (2) If gen <= old_gen then do (nop). ++ ++ (3) If old_gen < gen <= new_gen then ++ (3.1) if map != 0 then (set) ++ (3.2) if map == 0 then either (set) or (nop). ++ ++ Note that (1) cannot be reliably detected, but since both actions ++ are valid it does not have to be. Only (2) and (3.1) cases need ++ to be distinguished for which relaxed mo access of gen and map is ++ enough: their value is synchronized when it matters. ++ ++ Note that a relaxed mo load may give an out-of-thin-air value since ++ it is used in decisions that can affect concurrent stores. But this ++ should only happen if the OOTA value causes UB that justifies the ++ concurrent store of the value. This is not expected to be an issue ++ in practice. */ + struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list); + +- while (idx >= listp->len) ++ if (dtv[0].counter < new_gen) + { +- idx -= listp->len; +- listp = listp->next; +- } +- +- if (dtv[0].counter < listp->slotinfo[idx].gen) +- { +- /* CONCURRENCY NOTES: +- +- Here the dtv needs to be updated to new_gen generation count. +- +- This code may be called during TLS access when GL(dl_load_tls_lock) +- is not held. In that case the user code has to synchronize with +- dlopen and dlclose calls of relevant modules. A module m is +- relevant if the generation of m <= new_gen and dlclose of m is +- synchronized: a memory access here happens after the dlopen and +- before the dlclose of relevant modules. The dtv entries for +- relevant modules need to be updated, other entries can be +- arbitrary. +- +- This e.g. means that the first part of the slotinfo list can be +- accessed race free, but the tail may be concurrently extended. +- Similarly relevant slotinfo entries can be read race free, but +- other entries are racy. However updating a non-relevant dtv +- entry does not affect correctness. For a relevant module m, +- max_modid >= modid of m. */ +- size_t new_gen = listp->slotinfo[idx].gen; + size_t total = 0; + size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx)); + assert (max_modid >= req_modid); +@@ -779,31 +779,33 @@ _dl_update_slotinfo (unsigned long int r + { + size_t modid = total + cnt; + +- /* Later entries are not relevant. */ ++ /* Case (1) for all later modids. */ + if (modid > max_modid) + break; + + size_t gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen); + ++ /* Case (1). */ + if (gen > new_gen) +- /* Not relevant. */ + continue; + +- /* If the entry is older than the current dtv layout we +- know we don't have to handle it. */ ++ /* Case (2) or (1). */ + if (gen <= dtv[0].counter) + continue; + ++ /* Case (3) or (1). 
*/ ++ + /* If there is no map this means the entry is empty. */ + struct link_map *map + = atomic_load_relaxed (&listp->slotinfo[cnt].map); + /* Check whether the current dtv array is large enough. */ + if (dtv[-1].counter < modid) + { ++ /* Case (3.2) or (1). */ + if (map == NULL) + continue; + +- /* Resize the dtv. */ ++ /* Resizing the dtv aborts on failure: bug 16134. */ + dtv = _dl_resize_dtv (dtv, max_modid); + + assert (modid <= dtv[-1].counter); +@@ -814,7 +816,7 @@ _dl_update_slotinfo (unsigned long int r + } + + /* If there is currently memory allocate for this +- dtv entry free it. */ ++ dtv entry free it. Note: this is not AS-safe. */ + /* XXX Ideally we will at some point create a memory + pool. */ + free (dtv[modid].pointer.to_free); +@@ -909,9 +911,9 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t + + static struct link_map * + __attribute_noinline__ +-update_get_addr (GET_ADDR_ARGS) ++update_get_addr (GET_ADDR_ARGS, size_t gen) + { +- struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE); ++ struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen); + dtv_t *dtv = THREAD_DTV (); + + void *p = dtv[GET_ADDR_MODULE].pointer.val; +@@ -941,12 +943,17 @@ __tls_get_addr (GET_ADDR_ARGS) + dtv_t *dtv = THREAD_DTV (); + + /* Update is needed if dtv[0].counter < the generation of the accessed +- module. The global generation counter is used here as it is easier +- to check. Synchronization for the relaxed MO access is guaranteed +- by user code, see CONCURRENCY NOTES in _dl_update_slotinfo. */ ++ module, but the global generation counter is easier to check (which ++ must be synchronized up to the generation of the accessed module by ++ user code doing the TLS access so relaxed mo read is enough). */ + size_t gen = atomic_load_relaxed (&GL(dl_tls_generation)); + if (__glibc_unlikely (dtv[0].counter != gen)) +- return update_get_addr (GET_ADDR_PARAM); ++ { ++ /* Update DTV up to the global generation, see CONCURRENCY NOTES ++ in _dl_update_slotinfo. */ ++ gen = atomic_load_acquire (&GL(dl_tls_generation)); ++ return update_get_addr (GET_ADDR_PARAM, gen); ++ } + + void *p = dtv[GET_ADDR_MODULE].pointer.val; + +diff -rup a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h +--- a/sysdeps/generic/ldsodefs.h 2023-10-13 16:24:27.136220160 -0400 ++++ b/sysdeps/generic/ldsodefs.h 2023-10-13 16:28:59.937019438 -0400 +@@ -1231,7 +1231,8 @@ extern void _dl_add_to_slotinfo (struct + + /* Update slot information data for at least the generation of the + module with the given index. 
*/ +-extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid) ++extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid, ++ size_t gen) + attribute_hidden; + + /* Look up the module's TLS block as for __tls_get_addr, +diff -rup a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c +--- a/sysdeps/x86_64/dl-tls.c 2023-10-13 16:24:24.948135189 -0400 ++++ b/sysdeps/x86_64/dl-tls.c 2023-10-13 16:28:59.938019479 -0400 +@@ -40,9 +40,9 @@ __tls_get_addr_slow (GET_ADDR_ARGS) + { + dtv_t *dtv = THREAD_DTV (); + +- size_t gen = atomic_load_relaxed (&GL(dl_tls_generation)); ++ size_t gen = atomic_load_acquire (&GL(dl_tls_generation)); + if (__glibc_unlikely (dtv[0].counter != gen)) +- return update_get_addr (GET_ADDR_PARAM); ++ return update_get_addr (GET_ADDR_PARAM, gen); + + return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL); + } diff --git a/glibc-RHEL-21997.patch b/glibc-RHEL-21997.patch new file mode 100644 index 0000000..865c508 --- /dev/null +++ b/glibc-RHEL-21997.patch @@ -0,0 +1,112 @@ +This downstream-only patch compensates for the missing backport of +commit 2d651eb9265d1366d7b9e881bfddd46db9c1ecc4 ("x86: Move +x86 processor cache info to cpu_features"). Without it, +ld.so --list-diagnostics prints values that have not been properly +initalized from CPUID data. + +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index 10ebadd819d9efff..d8421fab83ab08ac 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -19,31 +19,42 @@ + #include + #include + ++/* When building ld.so, do not export any of the variables. They are ++ only used for diagnostics and are not initialized during regular ++ operation. */ ++#if IS_IN (rtld) ++# define CACHEINFO_VARIABLE(name, initializer) \ ++ static long int name = initializer ++#else ++# define CACHEINFO_VARIABLE(name, initializer) \ ++ long int name attribute_hidden = initializer ++#endif ++ + /* Data cache size for use in memory and string routines, typically + L1 size, rounded to multiple of 256 bytes. */ +-long int __x86_data_cache_size_half attribute_hidden = 32 * 1024 / 2; +-long int __x86_data_cache_size attribute_hidden = 32 * 1024; ++CACHEINFO_VARIABLE (__x86_data_cache_size_half, 32 * 1024 / 2); ++CACHEINFO_VARIABLE (__x86_data_cache_size, 32 * 1024); + /* Similar to __x86_data_cache_size_half, but not rounded. */ +-long int __x86_raw_data_cache_size_half attribute_hidden = 32 * 1024 / 2; ++CACHEINFO_VARIABLE (__x86_raw_data_cache_size_half, 32 * 1024 / 2); + /* Similar to __x86_data_cache_size, but not rounded. */ +-long int __x86_raw_data_cache_size attribute_hidden = 32 * 1024; ++CACHEINFO_VARIABLE (__x86_raw_data_cache_size, 32 * 1024); + /* Shared cache size for use in memory and string routines, typically + L2 or L3 size, rounded to multiple of 256 bytes. */ +-long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; +-long int __x86_shared_cache_size attribute_hidden = 1024 * 1024; ++CACHEINFO_VARIABLE (__x86_shared_cache_size_half, 1024 * 1024 / 2); ++CACHEINFO_VARIABLE (__x86_shared_cache_size, 1024 * 1024); + /* Similar to __x86_shared_cache_size_half, but not rounded. */ +-long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; ++CACHEINFO_VARIABLE (__x86_raw_shared_cache_size_half, 1024 * 1024 / 2); + /* Similar to __x86_shared_cache_size, but not rounded. */ +-long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024; ++CACHEINFO_VARIABLE (__x86_raw_shared_cache_size, 1024 * 1024); + + /* Threshold to use non temporal store. 
*/ +-long int __x86_shared_non_temporal_threshold attribute_hidden; ++CACHEINFO_VARIABLE (__x86_shared_non_temporal_threshold, 0); + + /* Threshold to use Enhanced REP MOVSB. */ +-long int __x86_rep_movsb_threshold attribute_hidden = 2048; ++CACHEINFO_VARIABLE (__x86_rep_movsb_threshold, 2048); + + /* Threshold to use Enhanced REP STOSB. */ +-long int __x86_rep_stosb_threshold attribute_hidden = 2048; ++CACHEINFO_VARIABLE (__x86_rep_stosb_threshold, 2048); + + static void + get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr, +diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c +index 0ba286a828b69937..9215604ecf22344c 100644 +--- a/sysdeps/x86/dl-diagnostics-cpu.c ++++ b/sysdeps/x86/dl-diagnostics-cpu.c +@@ -19,6 +19,13 @@ + #include + #include + ++#include ++#include ++#include ++#include ++#include ++#include ++ + static void + print_cpu_features_value (const char *label, uint64_t value) + { +@@ -81,19 +88,21 @@ _dl_diagnostics_cpu (void) + #include "cpu-features-preferred_feature_index_1.def" + #undef BIT + ++ /* The cache information variables are only used for diagnostics and ++ are not initialized during startup. The values used at run time ++ are only in libc.so.6. */ ++ init_cacheinfo (); ++ + print_cpu_features_value ("xsave_state_size", + cpu_features->xsave_state_size); + print_cpu_features_value ("xsave_state_full_size", + cpu_features->xsave_state_full_size); +- print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size); +- print_cpu_features_value ("shared_cache_size", +- cpu_features->shared_cache_size); ++ print_cpu_features_value ("data_cache_size", __x86_data_cache_size); ++ print_cpu_features_value ("shared_cache_size", __x86_shared_cache_size); + print_cpu_features_value ("non_temporal_threshold", +- cpu_features->non_temporal_threshold); +- print_cpu_features_value ("rep_movsb_threshold", +- cpu_features->rep_movsb_threshold); +- print_cpu_features_value ("rep_stosb_threshold", +- cpu_features->rep_stosb_threshold); ++ __x86_shared_non_temporal_threshold); ++ print_cpu_features_value ("rep_movsb_threshold", __x86_rep_movsb_threshold); ++ print_cpu_features_value ("rep_stosb_threshold", __x86_rep_stosb_threshold); + _Static_assert (offsetof (struct cpu_features, rep_stosb_threshold) + + sizeof (cpu_features->rep_stosb_threshold) + == sizeof (*cpu_features), diff --git a/glibc-RHEL-3010-1.patch b/glibc-RHEL-3010-1.patch new file mode 100644 index 0000000..494ebfd --- /dev/null +++ b/glibc-RHEL-3010-1.patch @@ -0,0 +1,247 @@ +commit 103a469dc7755fd9e8ccf362f3dd4c55dc761908 +Author: Sajan Karumanchi +Date: Wed Jan 18 18:29:04 2023 +0100 + + x86: Cache computation for AMD architecture. + + All AMD architectures cache details will be computed based on + __cpuid__ `0x8000_001D` and the reference to __cpuid__ `0x8000_0006` will be + zeroed out for future architectures. + + Reviewed-by: Premachandra Mallappa + +Conflicts: + sysdeps/x86/dl-cacheinfo.h + (missing backport of commit 2d651eb9265d1366d7b9e881bfddd4 + ("x86: Move x86 processor cache info to cpu_features")) + +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index 572f753474ee0610..b6f111e6668cc212 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -266,10 +266,6 @@ static void + init_cacheinfo (void) + { + /* Find out what brand of processor. 
*/ +- unsigned int ebx; +- unsigned int ecx; +- unsigned int edx; +- int max_cpuid_ex; + long int data = -1; + long int shared = -1; + long int shared_per_thread = -1; +@@ -303,62 +299,14 @@ init_cacheinfo (void) + } + else if (cpu_features->basic.kind == arch_kind_amd) + { +- data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); +- long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE); +- shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); ++ data = handle_amd (_SC_LEVEL1_DCACHE_SIZE, cpu_features); ++ long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE, cpu_features); ++ shared = handle_amd (_SC_LEVEL3_CACHE_SIZE, cpu_features); + shared_per_thread = shared; + +- /* Get maximum extended function. */ +- __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx); +- + if (shared <= 0) + /* No shared L3 cache. All we have is the L2 cache. */ + shared = core; +- else +- { +- /* Figure out the number of logical threads that share L3. */ +- if (max_cpuid_ex >= 0x80000008) +- { +- /* Get width of APIC ID. */ +- __cpuid (0x80000008, max_cpuid_ex, ebx, ecx, edx); +- threads = 1 << ((ecx >> 12) & 0x0f); +- } +- +- if (threads == 0 || cpu_features->basic.family >= 0x17) +- { +- /* If APIC ID width is not available, use logical +- processor count. */ +- __cpuid (0x00000001, max_cpuid_ex, ebx, ecx, edx); +- +- if ((edx & (1 << 28)) != 0) +- threads = (ebx >> 16) & 0xff; +- } +- +- /* Cap usage of highest cache level to the number of +- supported threads. */ +- if (threads > 0) +- shared /= threads; +- +- /* Get shared cache per ccx for Zen architectures. */ +- if (cpu_features->basic.family >= 0x17) +- { +- unsigned int eax; +- +- /* Get number of threads share the L3 cache in CCX. */ +- __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx); +- +- unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1; +- shared *= threads_per_ccx; +- } +- else +- { +- /* Account for exclusive L2 and L3 caches. */ +- shared += core; +- } +- } +- +- if (shared_per_thread <= 0) +- shared_per_thread = shared; + } + + if (cpu_features->data_cache_size != 0) +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index b2b90074b0e98a60..294a7d8bfc564aef 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -311,117 +311,47 @@ handle_intel (int name, const struct cpu_features *cpu_features) + + + static long int __attribute__ ((noinline)) +-handle_amd (int name) ++handle_amd (int name, const struct cpu_features *cpu_features) + { + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; +- __cpuid (0x80000000, eax, ebx, ecx, edx); ++ unsigned int count = 0x1; + + /* No level 4 cache (yet). */ + if (name > _SC_LEVEL3_CACHE_LINESIZE) + return 0; + +- unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE); +- if (eax < fn) +- return 0; ++ if (name >= _SC_LEVEL3_CACHE_SIZE) ++ count = 0x3; ++ else if (name >= _SC_LEVEL2_CACHE_SIZE) ++ count = 0x2; ++ else if (name >= _SC_LEVEL1_DCACHE_SIZE) ++ count = 0x0; + +- __cpuid (fn, eax, ebx, ecx, edx); +- +- if (name < _SC_LEVEL1_DCACHE_SIZE) +- { +- name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE; +- ecx = edx; +- } ++ __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx); + + switch (name) + { +- case _SC_LEVEL1_DCACHE_SIZE: +- return (ecx >> 14) & 0x3fc00; +- +- case _SC_LEVEL1_DCACHE_ASSOC: +- ecx >>= 16; +- if ((ecx & 0xff) == 0xff) +- /* Fully associative. 
*/ +- return (ecx << 2) & 0x3fc00; +- return ecx & 0xff; +- +- case _SC_LEVEL1_DCACHE_LINESIZE: +- return ecx & 0xff; +- +- case _SC_LEVEL2_CACHE_SIZE: +- return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00; +- +- case _SC_LEVEL2_CACHE_ASSOC: +- switch ((ecx >> 12) & 0xf) +- { +- case 0: +- case 1: +- case 2: +- case 4: +- return (ecx >> 12) & 0xf; +- case 6: +- return 8; +- case 8: +- return 16; +- case 10: +- return 32; +- case 11: +- return 48; +- case 12: +- return 64; +- case 13: +- return 96; +- case 14: +- return 128; +- case 15: +- return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff); +- default: +- return 0; +- } +- /* NOTREACHED */ +- +- case _SC_LEVEL2_CACHE_LINESIZE: +- return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff; +- +- case _SC_LEVEL3_CACHE_SIZE: +- return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1; +- +- case _SC_LEVEL3_CACHE_ASSOC: +- switch ((edx >> 12) & 0xf) +- { +- case 0: +- case 1: +- case 2: +- case 4: +- return (edx >> 12) & 0xf; +- case 6: +- return 8; +- case 8: +- return 16; +- case 10: +- return 32; +- case 11: +- return 48; +- case 12: +- return 64; +- case 13: +- return 96; +- case 14: +- return 128; +- case 15: +- return ((edx & 0x3ffc0000) << 1) / (edx & 0xff); +- default: +- return 0; +- } +- /* NOTREACHED */ +- +- case _SC_LEVEL3_CACHE_LINESIZE: +- return (edx & 0xf000) == 0 ? 0 : edx & 0xff; +- +- default: +- assert (! "cannot happen"); ++ case _SC_LEVEL1_ICACHE_ASSOC: ++ case _SC_LEVEL1_DCACHE_ASSOC: ++ case _SC_LEVEL2_CACHE_ASSOC: ++ case _SC_LEVEL3_CACHE_ASSOC: ++ return ecx?((ebx >> 22) & 0x3ff) + 1 : 0; ++ case _SC_LEVEL1_ICACHE_LINESIZE: ++ case _SC_LEVEL1_DCACHE_LINESIZE: ++ case _SC_LEVEL2_CACHE_LINESIZE: ++ case _SC_LEVEL3_CACHE_LINESIZE: ++ return ecx?(ebx & 0xfff) + 1 : 0; ++ case _SC_LEVEL1_ICACHE_SIZE: ++ case _SC_LEVEL1_DCACHE_SIZE: ++ case _SC_LEVEL2_CACHE_SIZE: ++ case _SC_LEVEL3_CACHE_SIZE: ++ return ecx?(((ebx >> 22) & 0x3ff) + 1)*((ebx & 0xfff) + 1)\ ++ *(ecx + 1):0; ++ default: ++ assert (! "cannot happen"); + } + return -1; + } diff --git a/glibc-RHEL-3010-2.patch b/glibc-RHEL-3010-2.patch new file mode 100644 index 0000000..26a42d9 --- /dev/null +++ b/glibc-RHEL-3010-2.patch @@ -0,0 +1,85 @@ +commit 856bab7717ef6d1033fd7cbf7cfb2ddefbfffb07 +Author: Andreas Schwab +Date: Thu Feb 9 14:56:21 2023 +0100 + + x86/dl-cacheinfo: remove unsused parameter from handle_amd + + Also replace an unreachable assert with __builtin_unreachable. 
+ +Conflicts: + sysdeps/x86/dl-cacheinfo.h + (missing backport of commit 2d651eb9265d1366d7b9e881bfddd4 + ("x86: Move x86 processor cache info to cpu_features")) + +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index b6f111e6668cc212..85e5731281c62503 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -299,9 +299,9 @@ init_cacheinfo (void) + } + else if (cpu_features->basic.kind == arch_kind_amd) + { +- data = handle_amd (_SC_LEVEL1_DCACHE_SIZE, cpu_features); +- long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE, cpu_features); +- shared = handle_amd (_SC_LEVEL3_CACHE_SIZE, cpu_features); ++ data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); ++ long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE); ++ shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); + shared_per_thread = shared; + + if (shared <= 0) +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 294a7d8bfc564aef..74cd5072a9d10756 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -311,7 +311,7 @@ handle_intel (int name, const struct cpu_features *cpu_features) + + + static long int __attribute__ ((noinline)) +-handle_amd (int name, const struct cpu_features *cpu_features) ++handle_amd (int name) + { + unsigned int eax; + unsigned int ebx; +@@ -334,24 +334,23 @@ handle_amd (int name, const struct cpu_features *cpu_features) + + switch (name) + { +- case _SC_LEVEL1_ICACHE_ASSOC: +- case _SC_LEVEL1_DCACHE_ASSOC: +- case _SC_LEVEL2_CACHE_ASSOC: +- case _SC_LEVEL3_CACHE_ASSOC: +- return ecx?((ebx >> 22) & 0x3ff) + 1 : 0; +- case _SC_LEVEL1_ICACHE_LINESIZE: +- case _SC_LEVEL1_DCACHE_LINESIZE: +- case _SC_LEVEL2_CACHE_LINESIZE: +- case _SC_LEVEL3_CACHE_LINESIZE: +- return ecx?(ebx & 0xfff) + 1 : 0; +- case _SC_LEVEL1_ICACHE_SIZE: +- case _SC_LEVEL1_DCACHE_SIZE: +- case _SC_LEVEL2_CACHE_SIZE: +- case _SC_LEVEL3_CACHE_SIZE: +- return ecx?(((ebx >> 22) & 0x3ff) + 1)*((ebx & 0xfff) + 1)\ +- *(ecx + 1):0; +- default: +- assert (! "cannot happen"); ++ case _SC_LEVEL1_ICACHE_ASSOC: ++ case _SC_LEVEL1_DCACHE_ASSOC: ++ case _SC_LEVEL2_CACHE_ASSOC: ++ case _SC_LEVEL3_CACHE_ASSOC: ++ return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0; ++ case _SC_LEVEL1_ICACHE_LINESIZE: ++ case _SC_LEVEL1_DCACHE_LINESIZE: ++ case _SC_LEVEL2_CACHE_LINESIZE: ++ case _SC_LEVEL3_CACHE_LINESIZE: ++ return ecx ? (ebx & 0xfff) + 1 : 0; ++ case _SC_LEVEL1_ICACHE_SIZE: ++ case _SC_LEVEL1_DCACHE_SIZE: ++ case _SC_LEVEL2_CACHE_SIZE: ++ case _SC_LEVEL3_CACHE_SIZE: ++ return ecx ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1): 0; ++ default: ++ __builtin_unreachable (); + } + return -1; + } diff --git a/glibc-RHEL-3010-3.patch b/glibc-RHEL-3010-3.patch new file mode 100644 index 0000000..05022a2 --- /dev/null +++ b/glibc-RHEL-3010-3.patch @@ -0,0 +1,280 @@ +commit dcad5c8578130dec7f35fd5b0885304b59f9f543 +Author: Sajan Karumanchi +Date: Tue Aug 1 15:20:55 2023 +0000 + + x86: Fix for cache computation on AMD legacy cpus. + + Some legacy AMD CPUs and hypervisors have the _cpuid_ '0x8000_001D' + set to Zero, thus resulting in zeroed-out computed cache values. + This patch reintroduces the old way of cache computation as a + fail-safe option to handle these exceptions. + Fixed 'level4_cache_size' value through handle_amd(). 
+ + Reviewed-by: Premachandra Mallappa + Tested-by: Florian Weimer + +Conflicts: + sysdeps/x86/dl-cacheinfo.h + (missing backport of commit 2d651eb9265d1366d7b9e881bfddd4 + ("x86: Move x86 processor cache info to cpu_features")) + +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index 85e5731281c62503..10ebadd819d9efff 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -302,11 +302,19 @@ init_cacheinfo (void) + data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); + long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE); + shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); +- shared_per_thread = shared; + + if (shared <= 0) +- /* No shared L3 cache. All we have is the L2 cache. */ +- shared = core; ++ { ++ /* No shared L3 cache. All we have is the L2 cache. */ ++ shared = core; ++ } ++ else if (cpu_features->basic.family < 0x17) ++ { ++ /* Account for exclusive L2 and L3 caches. */ ++ shared += core; ++ } ++ ++ shared_per_thread = shared; + } + + if (cpu_features->data_cache_size != 0) +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 74cd5072a9d10756..75a6b1dfde199dd7 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -315,40 +315,206 @@ handle_amd (int name) + { + unsigned int eax; + unsigned int ebx; +- unsigned int ecx; ++ unsigned int ecx = 0; + unsigned int edx; +- unsigned int count = 0x1; ++ unsigned int max_cpuid = 0; ++ unsigned int fn = 0; + + /* No level 4 cache (yet). */ + if (name > _SC_LEVEL3_CACHE_LINESIZE) + return 0; + +- if (name >= _SC_LEVEL3_CACHE_SIZE) +- count = 0x3; +- else if (name >= _SC_LEVEL2_CACHE_SIZE) +- count = 0x2; +- else if (name >= _SC_LEVEL1_DCACHE_SIZE) +- count = 0x0; ++ __cpuid (0x80000000, max_cpuid, ebx, ecx, edx); ++ ++ if (max_cpuid >= 0x8000001D) ++ /* Use __cpuid__ '0x8000_001D' to compute cache details. */ ++ { ++ unsigned int count = 0x1; ++ ++ if (name >= _SC_LEVEL3_CACHE_SIZE) ++ count = 0x3; ++ else if (name >= _SC_LEVEL2_CACHE_SIZE) ++ count = 0x2; ++ else if (name >= _SC_LEVEL1_DCACHE_SIZE) ++ count = 0x0; ++ ++ __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx); ++ ++ if (ecx != 0) ++ { ++ switch (name) ++ { ++ case _SC_LEVEL1_ICACHE_ASSOC: ++ case _SC_LEVEL1_DCACHE_ASSOC: ++ case _SC_LEVEL2_CACHE_ASSOC: ++ case _SC_LEVEL3_CACHE_ASSOC: ++ return ((ebx >> 22) & 0x3ff) + 1; ++ case _SC_LEVEL1_ICACHE_LINESIZE: ++ case _SC_LEVEL1_DCACHE_LINESIZE: ++ case _SC_LEVEL2_CACHE_LINESIZE: ++ case _SC_LEVEL3_CACHE_LINESIZE: ++ return (ebx & 0xfff) + 1; ++ case _SC_LEVEL1_ICACHE_SIZE: ++ case _SC_LEVEL1_DCACHE_SIZE: ++ case _SC_LEVEL2_CACHE_SIZE: ++ case _SC_LEVEL3_CACHE_SIZE: ++ return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1); ++ default: ++ __builtin_unreachable (); ++ } ++ return -1; ++ } ++ } + +- __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx); ++ /* Legacy cache computation for CPUs prior to Bulldozer family. ++ This is also a fail-safe mechanism for some hypervisors that ++ accidentally configure __cpuid__ '0x8000_001D' to Zero. 
*/ ++ ++ fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE); ++ ++ if (max_cpuid < fn) ++ return 0; ++ ++ __cpuid (fn, eax, ebx, ecx, edx); ++ ++ if (name < _SC_LEVEL1_DCACHE_SIZE) ++ { ++ name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE; ++ ecx = edx; ++ } + + switch (name) + { +- case _SC_LEVEL1_ICACHE_ASSOC: +- case _SC_LEVEL1_DCACHE_ASSOC: +- case _SC_LEVEL2_CACHE_ASSOC: ++ case _SC_LEVEL1_DCACHE_SIZE: ++ return (ecx >> 14) & 0x3fc00; ++ ++ case _SC_LEVEL1_DCACHE_ASSOC: ++ ecx >>= 16; ++ if ((ecx & 0xff) == 0xff) ++ { ++ /* Fully associative. */ ++ return (ecx << 2) & 0x3fc00; ++ } ++ return ecx & 0xff; ++ ++ case _SC_LEVEL1_DCACHE_LINESIZE: ++ return ecx & 0xff; ++ ++ case _SC_LEVEL2_CACHE_SIZE: ++ return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00; ++ ++ case _SC_LEVEL2_CACHE_ASSOC: ++ switch ((ecx >> 12) & 0xf) ++ { ++ case 0: ++ case 1: ++ case 2: ++ case 4: ++ return (ecx >> 12) & 0xf; ++ case 6: ++ return 8; ++ case 8: ++ return 16; ++ case 10: ++ return 32; ++ case 11: ++ return 48; ++ case 12: ++ return 64; ++ case 13: ++ return 96; ++ case 14: ++ return 128; ++ case 15: ++ return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff); ++ default: ++ return 0; ++ } ++ ++ case _SC_LEVEL2_CACHE_LINESIZE: ++ return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff; ++ ++ case _SC_LEVEL3_CACHE_SIZE: ++ { ++ long int total_l3_cache = 0, l3_cache_per_thread = 0; ++ unsigned int threads = 0; ++ const struct cpu_features *cpu_features; ++ ++ if ((edx & 0xf000) == 0) ++ return 0; ++ ++ total_l3_cache = (edx & 0x3ffc0000) << 1; ++ cpu_features = __get_cpu_features (); ++ ++ /* Figure out the number of logical threads that share L3. */ ++ if (max_cpuid >= 0x80000008) ++ { ++ /* Get width of APIC ID. */ ++ __cpuid (0x80000008, eax, ebx, ecx, edx); ++ threads = (ecx & 0xff) + 1; ++ } ++ ++ if (threads == 0) ++ { ++ /* If APIC ID width is not available, use logical ++ processor count. */ ++ __cpuid (0x00000001, eax, ebx, ecx, edx); ++ if ((edx & (1 << 28)) != 0) ++ threads = (ebx >> 16) & 0xff; ++ } ++ ++ /* Cap usage of highest cache level to the number of ++ supported threads. */ ++ if (threads > 0) ++ l3_cache_per_thread = total_l3_cache/threads; ++ ++ /* Get shared cache per ccx for Zen architectures. */ ++ if (cpu_features->basic.family >= 0x17) ++ { ++ long int l3_cache_per_ccx = 0; ++ /* Get number of threads share the L3 cache in CCX. */ ++ __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx); ++ unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1; ++ l3_cache_per_ccx = l3_cache_per_thread * threads_per_ccx; ++ return l3_cache_per_ccx; ++ } ++ else ++ { ++ return l3_cache_per_thread; ++ } ++ } ++ + case _SC_LEVEL3_CACHE_ASSOC: +- return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0; +- case _SC_LEVEL1_ICACHE_LINESIZE: +- case _SC_LEVEL1_DCACHE_LINESIZE: +- case _SC_LEVEL2_CACHE_LINESIZE: ++ switch ((edx >> 12) & 0xf) ++ { ++ case 0: ++ case 1: ++ case 2: ++ case 4: ++ return (edx >> 12) & 0xf; ++ case 6: ++ return 8; ++ case 8: ++ return 16; ++ case 10: ++ return 32; ++ case 11: ++ return 48; ++ case 12: ++ return 64; ++ case 13: ++ return 96; ++ case 14: ++ return 128; ++ case 15: ++ return ((edx & 0x3ffc0000) << 1) / (edx & 0xff); ++ default: ++ return 0; ++ } ++ + case _SC_LEVEL3_CACHE_LINESIZE: +- return ecx ? (ebx & 0xfff) + 1 : 0; +- case _SC_LEVEL1_ICACHE_SIZE: +- case _SC_LEVEL1_DCACHE_SIZE: +- case _SC_LEVEL2_CACHE_SIZE: +- case _SC_LEVEL3_CACHE_SIZE: +- return ecx ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1): 0; ++ return (edx & 0xf000) == 0 ? 
0 : edx & 0xff; ++ + default: + __builtin_unreachable (); + } diff --git a/glibc-RHEL-31804.patch b/glibc-RHEL-31804.patch new file mode 100644 index 0000000..21f8672 --- /dev/null +++ b/glibc-RHEL-31804.patch @@ -0,0 +1,203 @@ +Author: Charles Fol +Date: Thu Mar 28 12:25:38 2024 -0300 + + iconv: ISO-2022-CN-EXT: fix out-of-bound writes when writing escape sequence (CVE-2024-2961) + + ISO-2022-CN-EXT uses escape sequences to indicate character set changes + (as specified by RFC 1922). While the SOdesignation has the expected + bounds checks, neither SS2designation nor SS3designation have its; + allowing a write overflow of 1, 2, or 3 bytes with fixed values: + '$+I', '$+J', '$+K', '$+L', '$+M', or '$*H'. + + Checked on aarch64-linux-gnu. + + Co-authored-by: Adhemerval Zanella + Reviewed-by: Carlos O'Donell + Tested-by: Carlos O'Donell + +diff --git a/iconvdata/Makefile b/iconvdata/Makefile +index 646e2ccd11478646..c959758a90ed954f 100644 +--- a/iconvdata/Makefile ++++ b/iconvdata/Makefile +@@ -75,7 +75,7 @@ ifeq (yes,$(build-shared)) + tests = bug-iconv1 bug-iconv2 tst-loading tst-e2big tst-iconv4 bug-iconv4 \ + tst-iconv6 bug-iconv5 bug-iconv6 tst-iconv7 bug-iconv8 bug-iconv9 \ + bug-iconv10 bug-iconv11 bug-iconv12 bug-iconv13 bug-iconv14 \ +- bug-iconv15 ++ bug-iconv15 tst-iconv-iso-2022-cn-ext + ifeq ($(have-thread-library),yes) + tests += bug-iconv3 + endif +@@ -325,6 +325,8 @@ $(objpfx)bug-iconv14.out: $(addprefix $(objpfx), $(gconv-modules)) \ + $(addprefix $(objpfx),$(modules.so)) + $(objpfx)bug-iconv15.out: $(addprefix $(objpfx), $(gconv-modules)) \ + $(addprefix $(objpfx),$(modules.so)) ++$(objpfx)tst-iconv-iso-2022-cn-ext.out: $(addprefix $(objpfx), $(gconv-modules)) \ ++ $(addprefix $(objpfx),$(modules.so)) + + $(objpfx)iconv-test.out: run-iconv-test.sh \ + $(addprefix $(objpfx), $(gconv-modules)) \ +diff --git a/iconvdata/iso-2022-cn-ext.c b/iconvdata/iso-2022-cn-ext.c +index c21a7187b4d7808e..bd9493c12d95070b 100644 +--- a/iconvdata/iso-2022-cn-ext.c ++++ b/iconvdata/iso-2022-cn-ext.c +@@ -575,6 +575,12 @@ DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized"); + { \ + const char *escseq; \ + \ ++ if (outptr + 4 > outend) \ ++ { \ ++ result = __GCONV_FULL_OUTPUT; \ ++ break; \ ++ } \ ++ \ + assert (used == CNS11643_2_set); /* XXX */ \ + escseq = "*H"; \ + *outptr++ = ESC; \ +@@ -588,6 +594,12 @@ DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized"); + { \ + const char *escseq; \ + \ ++ if (outptr + 4 > outend) \ ++ { \ ++ result = __GCONV_FULL_OUTPUT; \ ++ break; \ ++ } \ ++ \ + assert ((used >> 5) >= 3 && (used >> 5) <= 7); \ + escseq = "+I+J+K+L+M" + ((used >> 5) - 3) * 2; \ + *outptr++ = ESC; \ +diff --git a/iconvdata/tst-iconv-iso-2022-cn-ext.c b/iconvdata/tst-iconv-iso-2022-cn-ext.c +new file mode 100644 +index 0000000000000000..96a8765fd5369681 +--- /dev/null ++++ b/iconvdata/tst-iconv-iso-2022-cn-ext.c +@@ -0,0 +1,128 @@ ++/* Verify ISO-2022-CN-EXT does not write out of the bounds. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++/* The test sets up a two memory page buffer with the second page marked ++ PROT_NONE to trigger a fault if the conversion writes beyond the exact ++ expected amount. Then we carry out various conversions and precisely ++ place the start of the output buffer in order to trigger a SIGSEGV if the ++ process writes anywhere between 1 and page sized bytes more (only one ++ PROT_NONE page is setup as a canary) than expected. These tests exercise ++ all three of the cases in ISO-2022-CN-EXT where the converter must switch ++ character sets and may run out of buffer space while doing the ++ operation. */ ++ ++static int ++do_test (void) ++{ ++ iconv_t cd = iconv_open ("ISO-2022-CN-EXT", "UTF-8"); ++ TEST_VERIFY_EXIT (cd != (iconv_t) -1); ++ ++ char *ntf; ++ size_t ntfsize; ++ char *outbufbase; ++ { ++ int pgz = getpagesize (); ++ TEST_VERIFY_EXIT (pgz > 0); ++ ntfsize = 2 * pgz; ++ ++ ntf = xmmap (NULL, ntfsize, PROT_READ | PROT_WRITE, MAP_PRIVATE ++ | MAP_ANONYMOUS, -1); ++ xmprotect (ntf + pgz, pgz, PROT_NONE); ++ ++ outbufbase = ntf + pgz; ++ } ++ ++ /* Check if SOdesignation escape sequence does not trigger an OOB write. */ ++ { ++ char inbuf[] = "\xe4\xba\xa4\xe6\x8d\xa2"; ++ ++ for (int i = 0; i < 9; i++) ++ { ++ char *inp = inbuf; ++ size_t inleft = sizeof (inbuf) - 1; ++ ++ char *outp = outbufbase - i; ++ size_t outleft = i; ++ ++ TEST_VERIFY_EXIT (iconv (cd, &inp, &inleft, &outp, &outleft) ++ == (size_t) -1); ++ TEST_COMPARE (errno, E2BIG); ++ ++ TEST_VERIFY_EXIT (iconv (cd, NULL, NULL, NULL, NULL) == 0); ++ } ++ } ++ ++ /* Same as before for SS2designation. */ ++ { ++ char inbuf[] = "㴽 \xe3\xb4\xbd"; ++ ++ for (int i = 0; i < 14; i++) ++ { ++ char *inp = inbuf; ++ size_t inleft = sizeof (inbuf) - 1; ++ ++ char *outp = outbufbase - i; ++ size_t outleft = i; ++ ++ TEST_VERIFY_EXIT (iconv (cd, &inp, &inleft, &outp, &outleft) ++ == (size_t) -1); ++ TEST_COMPARE (errno, E2BIG); ++ ++ TEST_VERIFY_EXIT (iconv (cd, NULL, NULL, NULL, NULL) == 0); ++ } ++ } ++ ++ /* Same as before for SS3designation. */ ++ { ++ char inbuf[] = "劄 \xe5\x8a\x84"; ++ ++ for (int i = 0; i < 14; i++) ++ { ++ char *inp = inbuf; ++ size_t inleft = sizeof (inbuf) - 1; ++ ++ char *outp = outbufbase - i; ++ size_t outleft = i; ++ ++ TEST_VERIFY_EXIT (iconv (cd, &inp, &inleft, &outp, &outleft) ++ == (size_t) -1); ++ TEST_COMPARE (errno, E2BIG); ++ ++ TEST_VERIFY_EXIT (iconv (cd, NULL, NULL, NULL, NULL) == 0); ++ } ++ } ++ ++ TEST_VERIFY_EXIT (iconv_close (cd) != -1); ++ ++ xmunmap (ntf, ntfsize); ++ ++ return 0; ++} ++ ++#include diff --git a/glibc-RHEL-34264.patch b/glibc-RHEL-34264.patch new file mode 100644 index 0000000..550ae38 --- /dev/null +++ b/glibc-RHEL-34264.patch @@ -0,0 +1,31 @@ +commit 87801a8fd06db1d654eea3e4f7626ff476a9bdaa +Author: Florian Weimer +Date: Thu Apr 25 15:00:45 2024 +0200 + + CVE-2024-33599: nscd: Stack-based buffer overflow in netgroup cache (bug 31677) + + Using alloca matches what other caches do. The request length is + bounded by MAXKEYLEN. 
+ + Reviewed-by: Carlos O'Donell + +diff --git a/nscd/netgroupcache.c b/nscd/netgroupcache.c +index 5ee4413ef9384ec9..60c8225639a33b6b 100644 +--- a/nscd/netgroupcache.c ++++ b/nscd/netgroupcache.c +@@ -503,12 +503,13 @@ addinnetgrX (struct database_dyn *db, int fd, request_header *req, + = (struct indataset *) mempool_alloc (db, + sizeof (*dataset) + req->key_len, + 1); +- struct indataset dataset_mem; + bool cacheable = true; + if (__glibc_unlikely (dataset == NULL)) + { + cacheable = false; +- dataset = &dataset_mem; ++ /* The alloca is safe because nscd_run_worker verfies that ++ key_len is not larger than MAXKEYLEN. */ ++ dataset = alloca (sizeof (*dataset) + req->key_len); + } + + datahead_init_pos (&dataset->head, sizeof (*dataset) + req->key_len, diff --git a/glibc-RHEL-34267-1.patch b/glibc-RHEL-34267-1.patch new file mode 100644 index 0000000..8cd4d05 --- /dev/null +++ b/glibc-RHEL-34267-1.patch @@ -0,0 +1,52 @@ +commit 7835b00dbce53c3c87bbbb1754a95fb5e58187aa +Author: Florian Weimer +Date: Thu Apr 25 15:01:07 2024 +0200 + + CVE-2024-33600: nscd: Do not send missing not-found response in addgetnetgrentX (bug 31678) + + If we failed to add a not-found response to the cache, the dataset + point can be null, resulting in a null pointer dereference. + + Reviewed-by: Siddhesh Poyarekar + +diff --git a/nscd/netgroupcache.c b/nscd/netgroupcache.c +index 60c8225639a33b6b..a3e04b4c43e6acae 100644 +--- a/nscd/netgroupcache.c ++++ b/nscd/netgroupcache.c +@@ -148,7 +148,7 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + /* No such service. */ + cacheable = do_notfound (db, fd, req, key, &dataset, &total, &timeout, + &key_copy); +- goto writeout; ++ goto maybe_cache_add; + } + + memset (&data, '\0', sizeof (data)); +@@ -349,7 +349,7 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + { + cacheable = do_notfound (db, fd, req, key, &dataset, &total, &timeout, + &key_copy); +- goto writeout; ++ goto maybe_cache_add; + } + + total = buffilled; +@@ -411,14 +411,12 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + } + + if (he == NULL && fd != -1) +- { +- /* We write the dataset before inserting it to the database +- since while inserting this thread might block and so would +- unnecessarily let the receiver wait. */ +- writeout: ++ /* We write the dataset before inserting it to the database since ++ while inserting this thread might block and so would ++ unnecessarily let the receiver wait. */ + writeall (fd, &dataset->resp, dataset->head.recsize); +- } + ++ maybe_cache_add: + if (cacheable) + { + /* If necessary, we also propagate the data to disk. */ diff --git a/glibc-RHEL-34267-2.patch b/glibc-RHEL-34267-2.patch new file mode 100644 index 0000000..6f1b7a1 --- /dev/null +++ b/glibc-RHEL-34267-2.patch @@ -0,0 +1,53 @@ +commit b048a482f088e53144d26a61c390bed0210f49f2 +Author: Florian Weimer +Date: Thu Apr 25 15:01:07 2024 +0200 + + CVE-2024-33600: nscd: Avoid null pointer crashes after notfound response (bug 31678) + + The addgetnetgrentX call in addinnetgrX may have failed to produce + a result, so the result variable in addinnetgrX can be NULL. + Use db->negtimeout as the fallback value if there is no result data; + the timeout is also overwritten below. + + Also avoid sending a second not-found response. (The client + disconnects after receiving the first response, so the data stream did + not go out of sync even without this fix.) 
It is still beneficial to + add the negative response to the mapping, so that the client can get + it from there in the future, instead of going through the socket. + + Reviewed-by: Siddhesh Poyarekar + +diff --git a/nscd/netgroupcache.c b/nscd/netgroupcache.c +index a3e04b4c43e6acae..f656872ae8c3b888 100644 +--- a/nscd/netgroupcache.c ++++ b/nscd/netgroupcache.c +@@ -512,14 +512,15 @@ addinnetgrX (struct database_dyn *db, int fd, request_header *req, + + datahead_init_pos (&dataset->head, sizeof (*dataset) + req->key_len, + sizeof (innetgroup_response_header), +- he == NULL ? 0 : dh->nreloads + 1, result->head.ttl); ++ he == NULL ? 0 : dh->nreloads + 1, ++ result == NULL ? db->negtimeout : result->head.ttl); + /* Set the notfound status and timeout based on the result from + getnetgrent. */ +- dataset->head.notfound = result->head.notfound; ++ dataset->head.notfound = result == NULL || result->head.notfound; + dataset->head.timeout = timeout; + + dataset->resp.version = NSCD_VERSION; +- dataset->resp.found = result->resp.found; ++ dataset->resp.found = result != NULL && result->resp.found; + /* Until we find a matching entry the result is 0. */ + dataset->resp.result = 0; + +@@ -567,7 +568,9 @@ addinnetgrX (struct database_dyn *db, int fd, request_header *req, + goto out; + } + +- if (he == NULL) ++ /* addgetnetgrentX may have already sent a notfound response. Do ++ not send another one. */ ++ if (he == NULL && dataset->resp.found) + { + /* We write the dataset before inserting it to the database + since while inserting this thread might block and so would diff --git a/glibc-RHEL-34273.patch b/glibc-RHEL-34273.patch new file mode 100644 index 0000000..f855206 --- /dev/null +++ b/glibc-RHEL-34273.patch @@ -0,0 +1,383 @@ +commit c04a21e050d64a1193a6daab872bca2528bda44b +Author: Florian Weimer +Date: Thu Apr 25 15:01:07 2024 +0200 + + CVE-2024-33601, CVE-2024-33602: nscd: netgroup: Use two buffers in addgetnetgrentX (bug 31680) + + This avoids potential memory corruption when the underlying NSS + callback function does not use the buffer space to store all strings + (e.g., for constant strings). + + Instead of custom buffer management, two scratch buffers are used. + This increases stack usage somewhat. + + Scratch buffer allocation failure is handled by return -1 + (an invalid timeout value) instead of terminating the process. + This fixes bug 31679. + + Reviewed-by: Siddhesh Poyarekar + +diff --git a/nscd/netgroupcache.c b/nscd/netgroupcache.c +index f656872ae8c3b888..dd180f8083e7c9f9 100644 +--- a/nscd/netgroupcache.c ++++ b/nscd/netgroupcache.c +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + + #include "../inet/netgroup.h" + #include "nscd.h" +@@ -66,6 +67,16 @@ struct dataset + char strdata[0]; + }; + ++/* Send a notfound response to FD. Always returns -1 to indicate an ++ ephemeral error. */ ++static time_t ++send_notfound (int fd) ++{ ++ if (fd != -1) ++ TEMP_FAILURE_RETRY (send (fd, ¬found, sizeof (notfound), MSG_NOSIGNAL)); ++ return -1; ++} ++ + /* Sends a notfound message and prepares a notfound dataset to write to the + cache. 
Returns true if there was enough memory to allocate the dataset and + returns the dataset in DATASETP, total bytes to write in TOTALP and the +@@ -84,8 +95,7 @@ do_notfound (struct database_dyn *db, int fd, request_header *req, + total = sizeof (notfound); + timeout = time (NULL) + db->negtimeout; + +- if (fd != -1) +- TEMP_FAILURE_RETRY (send (fd, ¬found, total, MSG_NOSIGNAL)); ++ send_notfound (fd); + + dataset = mempool_alloc (db, sizeof (struct dataset) + req->key_len, 1); + /* If we cannot permanently store the result, so be it. */ +@@ -110,11 +120,78 @@ do_notfound (struct database_dyn *db, int fd, request_header *req, + return cacheable; + } + ++struct addgetnetgrentX_scratch ++{ ++ /* This is the result that the caller should use. It can be NULL, ++ point into buffer, or it can be in the cache. */ ++ struct dataset *dataset; ++ ++ struct scratch_buffer buffer; ++ ++ /* Used internally in addgetnetgrentX as a staging area. */ ++ struct scratch_buffer tmp; ++ ++ /* Number of bytes in buffer that are actually used. */ ++ size_t buffer_used; ++}; ++ ++static void ++addgetnetgrentX_scratch_init (struct addgetnetgrentX_scratch *scratch) ++{ ++ scratch->dataset = NULL; ++ scratch_buffer_init (&scratch->buffer); ++ scratch_buffer_init (&scratch->tmp); ++ ++ /* Reserve space for the header. */ ++ scratch->buffer_used = sizeof (struct dataset); ++ static_assert (sizeof (struct dataset) < sizeof (scratch->tmp.__space), ++ "initial buffer space"); ++ memset (scratch->tmp.data, 0, sizeof (struct dataset)); ++} ++ ++static void ++addgetnetgrentX_scratch_free (struct addgetnetgrentX_scratch *scratch) ++{ ++ scratch_buffer_free (&scratch->buffer); ++ scratch_buffer_free (&scratch->tmp); ++} ++ ++/* Copy LENGTH bytes from S into SCRATCH. Returns NULL if SCRATCH ++ could not be resized, otherwise a pointer to the copy. */ ++static char * ++addgetnetgrentX_append_n (struct addgetnetgrentX_scratch *scratch, ++ const char *s, size_t length) ++{ ++ while (true) ++ { ++ size_t remaining = scratch->buffer.length - scratch->buffer_used; ++ if (remaining >= length) ++ break; ++ if (!scratch_buffer_grow_preserve (&scratch->buffer)) ++ return NULL; ++ } ++ char *copy = scratch->buffer.data + scratch->buffer_used; ++ memcpy (copy, s, length); ++ scratch->buffer_used += length; ++ return copy; ++} ++ ++/* Copy S into SCRATCH, including its null terminator. Returns false ++ if SCRATCH could not be resized. */ ++static bool ++addgetnetgrentX_append (struct addgetnetgrentX_scratch *scratch, const char *s) ++{ ++ if (s == NULL) ++ s = ""; ++ return addgetnetgrentX_append_n (scratch, s, strlen (s) + 1) != NULL; ++} ++ ++/* Caller must initialize and free *SCRATCH. If the return value is ++ negative, this function has sent a notfound response. 
*/ + static time_t + addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + const char *key, uid_t uid, struct hashentry *he, +- struct datahead *dh, struct dataset **resultp, +- void **tofreep) ++ struct datahead *dh, struct addgetnetgrentX_scratch *scratch) + { + if (__glibc_unlikely (debug_level > 0)) + { +@@ -133,14 +210,10 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + + char *key_copy = NULL; + struct __netgrent data; +- size_t buflen = MAX (1024, sizeof (*dataset) + req->key_len); +- size_t buffilled = sizeof (*dataset); +- char *buffer = NULL; + size_t nentries = 0; + size_t group_len = strlen (key) + 1; + struct name_list *first_needed + = alloca (sizeof (struct name_list) + group_len); +- *tofreep = NULL; + + if (netgroup_database == NULL + && __nss_database_lookup2 ("netgroup", NULL, NULL, &netgroup_database)) +@@ -152,8 +225,6 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + } + + memset (&data, '\0', sizeof (data)); +- buffer = xmalloc (buflen); +- *tofreep = buffer; + first_needed->next = first_needed; + memcpy (first_needed->name, key, group_len); + data.needed_groups = first_needed; +@@ -196,8 +267,8 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + while (1) + { + int e; +- status = getfct.f (&data, buffer + buffilled, +- buflen - buffilled - req->key_len, &e); ++ status = getfct.f (&data, scratch->tmp.data, ++ scratch->tmp.length, &e); + if (status == NSS_STATUS_SUCCESS) + { + if (data.type == triple_val) +@@ -205,68 +276,10 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + const char *nhost = data.val.triple.host; + const char *nuser = data.val.triple.user; + const char *ndomain = data.val.triple.domain; +- +- size_t hostlen = strlen (nhost ?: "") + 1; +- size_t userlen = strlen (nuser ?: "") + 1; +- size_t domainlen = strlen (ndomain ?: "") + 1; +- +- if (nhost == NULL || nuser == NULL || ndomain == NULL +- || nhost > nuser || nuser > ndomain) +- { +- const char *last = nhost; +- if (last == NULL +- || (nuser != NULL && nuser > last)) +- last = nuser; +- if (last == NULL +- || (ndomain != NULL && ndomain > last)) +- last = ndomain; +- +- size_t bufused +- = (last == NULL +- ? buffilled +- : last + strlen (last) + 1 - buffer); +- +- /* We have to make temporary copies. */ +- size_t needed = hostlen + userlen + domainlen; +- +- if (buflen - req->key_len - bufused < needed) +- { +- buflen += MAX (buflen, 2 * needed); +- /* Save offset in the old buffer. We don't +- bother with the NULL check here since +- we'll do that later anyway. */ +- size_t nhostdiff = nhost - buffer; +- size_t nuserdiff = nuser - buffer; +- size_t ndomaindiff = ndomain - buffer; +- +- char *newbuf = xrealloc (buffer, buflen); +- /* Fix up the triplet pointers into the new +- buffer. */ +- nhost = (nhost ? newbuf + nhostdiff +- : NULL); +- nuser = (nuser ? newbuf + nuserdiff +- : NULL); +- ndomain = (ndomain ? 
newbuf + ndomaindiff +- : NULL); +- *tofreep = buffer = newbuf; +- } +- +- nhost = memcpy (buffer + bufused, +- nhost ?: "", hostlen); +- nuser = memcpy ((char *) nhost + hostlen, +- nuser ?: "", userlen); +- ndomain = memcpy ((char *) nuser + userlen, +- ndomain ?: "", domainlen); +- } +- +- char *wp = buffer + buffilled; +- wp = memmove (wp, nhost ?: "", hostlen); +- wp += hostlen; +- wp = memmove (wp, nuser ?: "", userlen); +- wp += userlen; +- wp = memmove (wp, ndomain ?: "", domainlen); +- wp += domainlen; +- buffilled = wp - buffer; ++ if (!(addgetnetgrentX_append (scratch, nhost) ++ && addgetnetgrentX_append (scratch, nuser) ++ && addgetnetgrentX_append (scratch, ndomain))) ++ return send_notfound (fd); + ++nentries; + } + else +@@ -318,8 +331,8 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + } + else if (status == NSS_STATUS_TRYAGAIN && e == ERANGE) + { +- buflen *= 2; +- *tofreep = buffer = xrealloc (buffer, buflen); ++ if (!scratch_buffer_grow (&scratch->tmp)) ++ return send_notfound (fd); + } + else if (status == NSS_STATUS_RETURN + || status == NSS_STATUS_NOTFOUND +@@ -352,10 +365,17 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + goto maybe_cache_add; + } + +- total = buffilled; ++ /* Capture the result size without the key appended. */ ++ total = scratch->buffer_used; ++ ++ /* Make a copy of the key. The scratch buffer must not move after ++ this point. */ ++ key_copy = addgetnetgrentX_append_n (scratch, key, req->key_len); ++ if (key_copy == NULL) ++ return send_notfound (fd); + + /* Fill in the dataset. */ +- dataset = (struct dataset *) buffer; ++ dataset = scratch->buffer.data; + timeout = datahead_init_pos (&dataset->head, total + req->key_len, + total - offsetof (struct dataset, resp), + he == NULL ? 0 : dh->nreloads + 1, +@@ -364,11 +384,7 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + dataset->resp.version = NSCD_VERSION; + dataset->resp.found = 1; + dataset->resp.nresults = nentries; +- dataset->resp.result_len = buffilled - sizeof (*dataset); +- +- assert (buflen - buffilled >= req->key_len); +- key_copy = memcpy (buffer + buffilled, key, req->key_len); +- buffilled += req->key_len; ++ dataset->resp.result_len = total - sizeof (*dataset); + + /* Now we can determine whether on refill we have to create a new + record or not. */ +@@ -399,7 +415,7 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + if (__glibc_likely (newp != NULL)) + { + /* Adjust pointer into the memory block. */ +- key_copy = (char *) newp + (key_copy - buffer); ++ key_copy = (char *) newp + (key_copy - (char *) dataset); + + dataset = memcpy (newp, dataset, total + req->key_len); + cacheable = true; +@@ -440,7 +456,7 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + } + + out: +- *resultp = dataset; ++ scratch->dataset = dataset; + + return timeout; + } +@@ -461,6 +477,9 @@ addinnetgrX (struct database_dyn *db, int fd, request_header *req, + if (user != NULL) + key = (char *) rawmemchr (key, '\0') + 1; + const char *domain = *key++ ? 
key : NULL; ++ struct addgetnetgrentX_scratch scratch; ++ ++ addgetnetgrentX_scratch_init (&scratch); + + if (__glibc_unlikely (debug_level > 0)) + { +@@ -476,12 +495,8 @@ addinnetgrX (struct database_dyn *db, int fd, request_header *req, + group, group_len, + db, uid); + time_t timeout; +- void *tofree; + if (result != NULL) +- { +- timeout = result->head.timeout; +- tofree = NULL; +- } ++ timeout = result->head.timeout; + else + { + request_header req_get = +@@ -490,7 +505,10 @@ addinnetgrX (struct database_dyn *db, int fd, request_header *req, + .key_len = group_len + }; + timeout = addgetnetgrentX (db, -1, &req_get, group, uid, NULL, NULL, +- &result, &tofree); ++ &scratch); ++ result = scratch.dataset; ++ if (timeout < 0) ++ goto out; + } + + struct indataset +@@ -604,7 +622,7 @@ addinnetgrX (struct database_dyn *db, int fd, request_header *req, + } + + out: +- free (tofree); ++ addgetnetgrentX_scratch_free (&scratch); + return timeout; + } + +@@ -614,11 +632,12 @@ addgetnetgrentX_ignore (struct database_dyn *db, int fd, request_header *req, + const char *key, uid_t uid, struct hashentry *he, + struct datahead *dh) + { +- struct dataset *ignore; +- void *tofree; +- time_t timeout = addgetnetgrentX (db, fd, req, key, uid, he, dh, +- &ignore, &tofree); +- free (tofree); ++ struct addgetnetgrentX_scratch scratch; ++ addgetnetgrentX_scratch_init (&scratch); ++ time_t timeout = addgetnetgrentX (db, fd, req, key, uid, he, dh, &scratch); ++ addgetnetgrentX_scratch_free (&scratch); ++ if (timeout < 0) ++ timeout = 0; + return timeout; + } + +@@ -662,5 +681,9 @@ readdinnetgr (struct database_dyn *db, struct hashentry *he, + .key_len = he->len + }; + +- return addinnetgrX (db, -1, &req, db->data + he->key, he->owner, he, dh); ++ int timeout = addinnetgrX (db, -1, &req, db->data + he->key, he->owner, ++ he, dh); ++ if (timeout < 0) ++ timeout = 0; ++ return timeout; + } diff --git a/glibc-RHEL-22846.patch b/glibc-RHEL-3639.patch similarity index 96% rename from glibc-RHEL-22846.patch rename to glibc-RHEL-3639.patch index 4179f74..e51ab13 100644 --- a/glibc-RHEL-22846.patch +++ b/glibc-RHEL-3639.patch @@ -20,7 +20,7 @@ Conflicts: (usual test differences, link test with -ldl) diff --git a/elf/Makefile b/elf/Makefile -index 6f0f36cdfe3961e8..ebf46a297d241d8f 100644 +index 634c3113227d64a6..42dc878209b11d29 100644 --- a/elf/Makefile +++ b/elf/Makefile @@ -362,6 +362,7 @@ tests += \ @@ -31,7 +31,7 @@ index 6f0f36cdfe3961e8..ebf46a297d241d8f 100644 tst-dlmodcount \ tst-dlmopen1 \ tst-dlmopen3 \ -@@ -711,6 +712,8 @@ modules-names = \ +@@ -709,6 +710,8 @@ modules-names = \ tst-deep1mod2 \ tst-deep1mod3 \ tst-dlmopen1mod \ @@ -40,19 +40,17 @@ index 6f0f36cdfe3961e8..ebf46a297d241d8f 100644 tst-dlmopen-dlerror-mod \ tst-dlmopen-gethostbyname-mod \ tst-dlmopen-twice-mod1 \ -@@ -2707,6 +2710,12 @@ $(objpfx)tst-dlmopen-twice.out: \ +@@ -2697,3 +2700,10 @@ $(objpfx)tst-dlmopen-twice: $(libdl) + $(objpfx)tst-dlmopen-twice.out: \ $(objpfx)tst-dlmopen-twice-mod1.so \ $(objpfx)tst-dlmopen-twice-mod2.so - ++ +LDFLAGS-tst-dlclose-lazy-mod1.so = -Wl,-z,lazy,--no-as-needed +$(objpfx)tst-dlclose-lazy-mod1.so: $(objpfx)tst-dlclose-lazy-mod2.so +$(objpfx)tst-dlclose-lazy: $(libdl) +$(objpfx)tst-dlclose-lazy.out: \ + $(objpfx)tst-dlclose-lazy-mod1.so $(objpfx)tst-dlclose-lazy-mod2.so + - # The object tst-nodeps1-mod.so has no explicit dependencies on libc.so. 
- $(objpfx)tst-nodeps1-mod.so: $(objpfx)tst-nodeps1-mod.os - $(LINK.o) -nostartfiles -nostdlib -shared -o $@ $^ diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c index 47acd134600b44b5..9e8f14b8483f5eba 100644 --- a/elf/dl-lookup.c diff --git a/glibc-2.28-Add-run-one-test-convenience-target-and-m.patch b/glibc-RHEL-3757.patch similarity index 65% rename from glibc-2.28-Add-run-one-test-convenience-target-and-m.patch rename to glibc-RHEL-3757.patch index 7cd6393..8902823 100644 --- a/glibc-2.28-Add-run-one-test-convenience-target-and-m.patch +++ b/glibc-RHEL-3757.patch @@ -1,33 +1,17 @@ -From bbc404e8f6e59aa808642c2a40e24a81744967e3 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Mon, 15 May 2023 12:00:50 +0800 -Subject: [PATCH 04/14] glibc-2.28: Add run-one-test convenience target and - makefile help text - -Reference: - - commit 2ac579f9c25388a7734948d77b03e4dd10f35334 - Author: DJ Delorie - Date: Mon Sep 30 16:04:52 2019 -0400 +commit 2ac579f9c25388a7734948d77b03e4dd10f35334 +Author: DJ Delorie +Date: Mon Sep 30 16:04:52 2019 -0400 Add run-one-test convenience target and makefile help text - + Adds "make test" for re-running just one test. Also adds "make help" for help with our Makefile targets, and adds a mini-help when you just run "make". - + Reviewed-by: Carlos O'Donell -Change-Id: I8c7ccf9a5ec4dc4afd4901d2f8f693677d0d94ea -Signed-off-by: ticat_fp ---- - Makefile | 22 ++++++++++++++++++++-- - Makefile.help | 42 ++++++++++++++++++++++++++++++++++++++++++ - 2 files changed, 62 insertions(+), 2 deletions(-) - create mode 100644 Makefile.help - diff --git a/Makefile b/Makefile -index 6d73241b..6518f62e 100644 +index 6d73241bbc811c13..6518f62ee0676b0d 100644 --- a/Makefile +++ b/Makefile @@ -26,8 +26,17 @@ include Makeconfig @@ -65,7 +49,7 @@ index 6d73241b..6518f62e 100644 + @cat $(objpfx)$t.out diff --git a/Makefile.help b/Makefile.help new file mode 100644 -index 00000000..319fdaa1 +index 0000000000000000..3b043bce013cc2b4 --- /dev/null +++ b/Makefile.help @@ -0,0 +1,42 @@ @@ -92,25 +76,22 @@ index 00000000..319fdaa1 +help-starts-here + +all -+ The usual default; builds everything but doesn't run the -+ tests. ++ The usual default; builds everything but doesn't run the ++ tests. + +check (or tests) -+ Runs the standard set of tests. ++ Runs the standard set of tests. + +test -+ Runs one test. Use like this: -+ make test t=wcsmbs/test-wcsnlen -+ Note that this will rebuild the test if needed, but will not -+ rebuild what "make all" would have rebuilt. ++ Runs one test. Use like this: ++ make test t=wcsmbs/test-wcsnlen ++ Note that this will rebuild the test if needed, but will not ++ rebuild what "make all" would have rebuilt. + +-- +Other useful hints: + +builddir$ rm testroot.pristine/install.stamp -+ Forces the testroot to be reinstalled the next time you run -+ the testsuite (or just rm -rf testroot.pristine) ++ Forces the testroot to be reinstalled the next time you run ++ the testsuite (or just rm -rf testroot.pristine) + --- -2.33.0 - diff --git a/glibc-Support-target-specific-ALIGN-for-variable-alignment-4.patch b/glibc-Support-target-specific-ALIGN-for-variable-alignment-4.patch deleted file mode 100644 index d7552e1..0000000 --- a/glibc-Support-target-specific-ALIGN-for-variable-alignment-4.patch +++ /dev/null @@ -1,171 +0,0 @@ -From 2e86602d21fcaa8353c529f2f6768125396da39f Mon Sep 17 00:00:00 2001 -From: "H.J. 
Lu" -Date: Wed, 19 Jul 2023 23:12:30 +0800 -Subject: [PATCH 5/6] Support target specific ALIGN for variable alignment test - [BZ #28676] - -Add to support target specific ALIGN for variable -alignment test: - -1. Alpha: Use 0x10000. -2. MicroBlaze and Nios II: Use 0x8000. -3. All others: Use 0x200000. - -Backport from master commit: 4435c29 - -Reviewed-by: Adhemerval Zanella -Signed-off-by: Rongwei Wang ---- - elf/tst-align3.c | 4 +--- - elf/tst-alignmod3.c | 4 +--- - sysdeps/alpha/tst-file-align.h | 20 ++++++++++++++++++++ - sysdeps/generic/tst-file-align.h | 20 ++++++++++++++++++++ - sysdeps/microblaze/tst-file-align.h | 20 ++++++++++++++++++++ - sysdeps/nios2/tst-file-align.h | 20 ++++++++++++++++++++ - 6 files changed, 82 insertions(+), 6 deletions(-) - create mode 100644 sysdeps/alpha/tst-file-align.h - create mode 100644 sysdeps/generic/tst-file-align.h - create mode 100644 sysdeps/microblaze/tst-file-align.h - create mode 100644 sysdeps/nios2/tst-file-align.h - -diff --git a/elf/tst-align3.c b/elf/tst-align3.c -index ac86d623..87a8ff81 100644 ---- a/elf/tst-align3.c -+++ b/elf/tst-align3.c -@@ -17,11 +17,9 @@ - . */ - - #include -+#include - #include - --/* This should cover all possible page sizes we currently support. */ --#define ALIGN 0x200000 -- - int bar __attribute__ ((aligned (ALIGN))) = 1; - - extern int do_load_test (void); -diff --git a/elf/tst-alignmod3.c b/elf/tst-alignmod3.c -index 0d33f237..9520c352 100644 ---- a/elf/tst-alignmod3.c -+++ b/elf/tst-alignmod3.c -@@ -17,11 +17,9 @@ - . */ - - #include -+#include - #include - --/* This should cover all possible page sizes we currently support. */ --#define ALIGN 0x200000 -- - int foo __attribute__ ((aligned (ALIGN))) = 1; - - void -diff --git a/sysdeps/alpha/tst-file-align.h b/sysdeps/alpha/tst-file-align.h -new file mode 100644 -index 00000000..8fc3c940 ---- /dev/null -+++ b/sysdeps/alpha/tst-file-align.h -@@ -0,0 +1,20 @@ -+/* Check file alignment. Alpha version. -+ Copyright (C) 2021 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* This should cover all possible alignments we currently support. */ -+#define ALIGN 0x10000 -diff --git a/sysdeps/generic/tst-file-align.h b/sysdeps/generic/tst-file-align.h -new file mode 100644 -index 00000000..6ee6783a ---- /dev/null -+++ b/sysdeps/generic/tst-file-align.h -@@ -0,0 +1,20 @@ -+/* Check file alignment. Generic version. -+ Copyright (C) 2021 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. 
-+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* This should cover all possible page sizes we currently support. */ -+#define ALIGN 0x200000 -diff --git a/sysdeps/microblaze/tst-file-align.h b/sysdeps/microblaze/tst-file-align.h -new file mode 100644 -index 00000000..43c58b29 ---- /dev/null -+++ b/sysdeps/microblaze/tst-file-align.h -@@ -0,0 +1,20 @@ -+/* Check file alignment. MicroBlaze version. -+ Copyright (C) 2021 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* This should cover all possible alignments we currently support. */ -+#define ALIGN 0x8000 -diff --git a/sysdeps/nios2/tst-file-align.h b/sysdeps/nios2/tst-file-align.h -new file mode 100644 -index 00000000..589a2d5a ---- /dev/null -+++ b/sysdeps/nios2/tst-file-align.h -@@ -0,0 +1,20 @@ -+/* Check file alignment. Nios II version. -+ Copyright (C) 2021 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* This should cover all possible alignments we currently support. 
*/ -+#define ALIGN 0x8000 --- -2.27.0 - diff --git a/glibc-Sync-to-lnd-35-for-LoongArch.patch b/glibc-Sync-to-lnd-35-for-LoongArch.patch deleted file mode 100644 index e1e1615..0000000 --- a/glibc-Sync-to-lnd-35-for-LoongArch.patch +++ /dev/null @@ -1,26506 +0,0 @@ -From 4bcb0bf4f727666ba875302baf52d60f65bd7cb1 Mon Sep 17 00:00:00 2001 -From: Lixing -Date: Wed, 19 Jul 2023 11:59:19 +0800 -Subject: [PATCH] glibc Sync to vec.35 for LoongArch - -dl-machine.h: scope used - #define PLTREL ElfW(Rela) -dl-tunables.list added -ld.abilist changed -localplt.data changed ---- - elf/dl-reloc.c | 13 +- - elf/elf.h | 85 +- - scripts/config.guess | 3 + - scripts/config.sub | 7 +- - sysdeps/loongarch/Implies | 5 + - sysdeps/loongarch/Makefile | 36 + - sysdeps/loongarch/Versions | 5 + - sysdeps/loongarch/__longjmp.S | 50 + - sysdeps/loongarch/abort-instr.h | 2 + - sysdeps/loongarch/at_quick_exit.c | 1 + - sysdeps/loongarch/atexit.c | 1 + - sysdeps/loongarch/bits/endian.h | 9 + - sysdeps/loongarch/bits/fenv.h | 93 + - sysdeps/loongarch/bits/link.h | 56 + - sysdeps/loongarch/bits/setjmp.h | 39 + - sysdeps/loongarch/bits/wordsize.h | 22 + - sysdeps/loongarch/bsd-_setjmp.c | 1 + - sysdeps/loongarch/bsd-setjmp.c | 1 + - sysdeps/loongarch/configure | 4 + - sysdeps/loongarch/configure.ac | 6 + - sysdeps/loongarch/cpu-tunables.c | 94 + - sysdeps/loongarch/dl-get-cpu-features.c | 25 + - sysdeps/loongarch/dl-irel.h | 51 + - sysdeps/loongarch/dl-machine.h | 410 +++ - sysdeps/loongarch/dl-tls.h | 49 + - sysdeps/loongarch/dl-trampoline.S | 31 + - sysdeps/loongarch/dl-trampoline.h | 153 ++ - sysdeps/loongarch/dl-tunables.list | 25 + - sysdeps/loongarch/e_sqrtl.c | 39 + - sysdeps/loongarch/elf-init.c | 1 + - sysdeps/loongarch/fenv_private.h | 328 +++ - sysdeps/loongarch/fpu/e_ilogb.c | 39 + - sysdeps/loongarch/fpu/e_ilogbf.c | 39 + - sysdeps/loongarch/fpu/e_sqrt.c | 29 + - sysdeps/loongarch/fpu/e_sqrtf.c | 28 + - sysdeps/loongarch/fpu/fclrexcpt.c | 47 + - sysdeps/loongarch/fpu/fedisblxcpt.c | 40 + - sysdeps/loongarch/fpu/feenablxcpt.c | 40 + - sysdeps/loongarch/fpu/fegetenv.c | 33 + - sysdeps/loongarch/fpu/fegetexcept.c | 33 + - sysdeps/loongarch/fpu/fegetmode.c | 27 + - sysdeps/loongarch/fpu/fegetround.c | 35 + - sysdeps/loongarch/fpu/feholdexcpt.c | 41 + - sysdeps/loongarch/fpu/fenv_libc.h | 31 + - sysdeps/loongarch/fpu/fesetenv.c | 44 + - sysdeps/loongarch/fpu/fesetexcept.c | 32 + - sysdeps/loongarch/fpu/fesetmode.c | 38 + - sysdeps/loongarch/fpu/fesetround.c | 46 + - sysdeps/loongarch/fpu/feupdateenv.c | 45 + - sysdeps/loongarch/fpu/fgetexcptflg.c | 39 + - sysdeps/loongarch/fpu/fraiseexcpt.c | 84 + - sysdeps/loongarch/fpu/fsetexcptflg.c | 42 + - sysdeps/loongarch/fpu/ftestexcept.c | 33 + - sysdeps/loongarch/fpu/s_copysign.c | 30 + - sysdeps/loongarch/fpu/s_copysignf.c | 30 + - sysdeps/loongarch/fpu/s_finite.c | 30 + - sysdeps/loongarch/fpu/s_finitef.c | 30 + - sysdeps/loongarch/fpu/s_fmax.c | 30 + - sysdeps/loongarch/fpu/s_fmaxf.c | 30 + - sysdeps/loongarch/fpu/s_fmaxmag.c | 29 + - sysdeps/loongarch/fpu/s_fmaxmagf.c | 29 + - sysdeps/loongarch/fpu/s_fmin.c | 30 + - sysdeps/loongarch/fpu/s_fminf.c | 30 + - sysdeps/loongarch/fpu/s_fminmag.c | 29 + - sysdeps/loongarch/fpu/s_fminmagf.c | 29 + - sysdeps/loongarch/fpu/s_fpclassify.c | 38 + - sysdeps/loongarch/fpu/s_fpclassifyf.c | 38 + - sysdeps/loongarch/fpu/s_isinf.c | 30 + - sysdeps/loongarch/fpu/s_isinff.c | 30 + - sysdeps/loongarch/fpu/s_isnan.c | 31 + - sysdeps/loongarch/fpu/s_isnanf.c | 31 + - sysdeps/loongarch/fpu/s_issignaling.c | 29 + - 
sysdeps/loongarch/fpu/s_issignalingf.c | 29 + - sysdeps/loongarch/fpu/s_llrint.c | 31 + - sysdeps/loongarch/fpu/s_llrintf.c | 31 + - sysdeps/loongarch/fpu/s_logb.c | 30 + - sysdeps/loongarch/fpu/s_logbf.c | 30 + - sysdeps/loongarch/fpu/s_lrint.c | 31 + - sysdeps/loongarch/fpu/s_lrintf.c | 31 + - sysdeps/loongarch/fpu/s_rint.c | 29 + - sysdeps/loongarch/fpu/s_rintf.c | 29 + - sysdeps/loongarch/fpu/s_scalbn.c | 29 + - sysdeps/loongarch/fpu/s_scalbnf.c | 29 + - sysdeps/loongarch/fpu_control.h | 128 + - sysdeps/loongarch/fstat.c | 1 + - sysdeps/loongarch/fstat64.c | 1 + - sysdeps/loongarch/fstatat.c | 1 + - sysdeps/loongarch/fstatat64.c | 1 + - sysdeps/loongarch/gccframe.h | 21 + - sysdeps/loongarch/hp-timing.h | 40 + - sysdeps/loongarch/init-arch.h | 24 + - sysdeps/loongarch/jmpbuf-offsets.h | 23 + - sysdeps/loongarch/jmpbuf-unwind.h | 46 + - sysdeps/loongarch/ldsodefs.h | 48 + - sysdeps/loongarch/libc-start.h | 25 + - sysdeps/loongarch/libc-tls.c | 32 + - sysdeps/loongarch/linkmap.h | 4 + - sysdeps/loongarch/lp64/Implies-after | 1 + - sysdeps/loongarch/lp64/libm-test-ulps | 2206 +++++++++++++++++ - sysdeps/loongarch/lp64/libm-test-ulps-name | 1 + - sysdeps/loongarch/lp64/memchr.S | 99 + - sysdeps/loongarch/lp64/memcmp.S | 281 +++ - sysdeps/loongarch/lp64/memcpy.S | 818 ++++++ - sysdeps/loongarch/lp64/memmove.S | 2 + - sysdeps/loongarch/lp64/memset.S | 173 ++ - sysdeps/loongarch/lp64/multiarch/Makefile | 18 + - .../lp64/multiarch/ifunc-impl-list.c | 142 ++ - sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h | 40 + - sysdeps/loongarch/lp64/multiarch/ifunc-lsx.h | 37 + - .../loongarch/lp64/multiarch/ifunc-memchr.h | 37 + - .../loongarch/lp64/multiarch/ifunc-memrchr.h | 37 + - .../loongarch/lp64/multiarch/ifunc-stpcpy.h | 34 + - .../loongarch/lp64/multiarch/memchr-aligned.S | 7 + - .../loongarch/lp64/multiarch/memchr-lasx.S | 108 + - sysdeps/loongarch/lp64/multiarch/memchr-lsx.S | 93 + - sysdeps/loongarch/lp64/multiarch/memchr.c | 39 + - .../loongarch/lp64/multiarch/memcmp-aligned.S | 11 + - .../loongarch/lp64/multiarch/memcmp-lasx.S | 199 ++ - sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S | 255 ++ - sysdeps/loongarch/lp64/multiarch/memcmp.c | 41 + - .../loongarch/lp64/multiarch/memcpy-aligned.S | 11 + - .../loongarch/lp64/multiarch/memcpy-lasx.S | 1 + - sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S | 1 + - .../lp64/multiarch/memcpy-unaligned.S | 259 ++ - sysdeps/loongarch/lp64/multiarch/memcpy.c | 39 + - .../lp64/multiarch/memmove-aligned.S | 1 + - .../loongarch/lp64/multiarch/memmove-lasx.S | 279 +++ - .../loongarch/lp64/multiarch/memmove-lsx.S | 524 ++++ - .../lp64/multiarch/memmove-unaligned.S | 478 ++++ - sysdeps/loongarch/lp64/multiarch/memmove.c | 39 + - .../lp64/multiarch/memrchr-generic.c | 9 + - .../loongarch/lp64/multiarch/memrchr-lasx.S | 114 + - .../loongarch/lp64/multiarch/memrchr-lsx.S | 96 + - sysdeps/loongarch/lp64/multiarch/memrchr.c | 39 + - .../loongarch/lp64/multiarch/memset-aligned.S | 9 + - .../loongarch/lp64/multiarch/memset-lasx.S | 132 + - sysdeps/loongarch/lp64/multiarch/memset-lsx.S | 125 + - .../lp64/multiarch/memset-unaligned.S | 177 ++ - sysdeps/loongarch/lp64/multiarch/memset.c | 39 + - .../lp64/multiarch/rawmemchr-aligned.S | 7 + - .../loongarch/lp64/multiarch/rawmemchr-lasx.S | 51 + - .../loongarch/lp64/multiarch/rawmemchr-lsx.S | 56 + - sysdeps/loongarch/lp64/multiarch/rawmemchr.c | 37 + - .../loongarch/lp64/multiarch/stpcpy-aligned.S | 8 + - sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S | 178 ++ - sysdeps/loongarch/lp64/multiarch/stpcpy.c | 43 + - 
.../loongarch/lp64/multiarch/strchr-aligned.S | 10 + - .../loongarch/lp64/multiarch/strchr-lasx.S | 81 + - sysdeps/loongarch/lp64/multiarch/strchr-lsx.S | 61 + - .../lp64/multiarch/strchr-unaligned.S | 132 + - sysdeps/loongarch/lp64/multiarch/strchr.c | 39 + - .../lp64/multiarch/strchrnul-aligned.S | 8 + - .../loongarch/lp64/multiarch/strchrnul-lasx.S | 4 + - .../loongarch/lp64/multiarch/strchrnul-lsx.S | 3 + - .../lp64/multiarch/strchrnul-unaligned.S | 146 ++ - sysdeps/loongarch/lp64/multiarch/strchrnul.c | 34 + - .../loongarch/lp64/multiarch/strcmp-aligned.S | 8 + - sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 147 ++ - .../lp64/multiarch/strcmp-unaligned.S | 191 ++ - sysdeps/loongarch/lp64/multiarch/strcmp.c | 35 + - .../loongarch/lp64/multiarch/strcpy-aligned.S | 8 + - sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 174 ++ - .../lp64/multiarch/strcpy-unaligned.S | 199 ++ - sysdeps/loongarch/lp64/multiarch/strcpy.c | 36 + - .../loongarch/lp64/multiarch/strlen-aligned.S | 8 + - .../loongarch/lp64/multiarch/strlen-lasx.S | 55 + - sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 63 + - .../lp64/multiarch/strlen-unaligned.S | 116 + - sysdeps/loongarch/lp64/multiarch/strlen.c | 39 + - .../lp64/multiarch/strncmp-aligned.S | 8 + - .../loongarch/lp64/multiarch/strncmp-lsx.S | 197 ++ - .../lp64/multiarch/strncmp-unaligned.S | 257 ++ - sysdeps/loongarch/lp64/multiarch/strncmp.c | 35 + - .../lp64/multiarch/strnlen-aligned.S | 8 + - .../loongarch/lp64/multiarch/strnlen-lasx.S | 92 + - .../loongarch/lp64/multiarch/strnlen-lsx.S | 81 + - .../lp64/multiarch/strnlen-unaligned.S | 145 ++ - sysdeps/loongarch/lp64/multiarch/strnlen.c | 40 + - .../lp64/multiarch/strrchr-aligned.S | 12 + - .../loongarch/lp64/multiarch/strrchr-lasx.S | 113 + - .../loongarch/lp64/multiarch/strrchr-lsx.S | 93 + - sysdeps/loongarch/lp64/multiarch/strrchr.c | 39 + - sysdeps/loongarch/lp64/rawmemchr.S | 114 + - sysdeps/loongarch/lp64/s_cosf.S | 409 +++ - sysdeps/loongarch/lp64/s_sinf.S | 392 +++ - sysdeps/loongarch/lp64/stpcpy.S | 180 ++ - sysdeps/loongarch/lp64/strchr.S | 90 + - sysdeps/loongarch/lp64/strchrnul.S | 95 + - sysdeps/loongarch/lp64/strcmp.S | 228 ++ - sysdeps/loongarch/lp64/strcpy.S | 174 ++ - sysdeps/loongarch/lp64/strlen.S | 86 + - sysdeps/loongarch/lp64/strncmp.S | 257 ++ - sysdeps/loongarch/lp64/strnlen.S | 83 + - sysdeps/loongarch/lp64/strrchr.S | 106 + - sysdeps/loongarch/lstat.c | 1 + - sysdeps/loongarch/lstat64.c | 1 + - sysdeps/loongarch/machine-gmon.h | 37 + - sysdeps/loongarch/math_private.h | 245 ++ - sysdeps/loongarch/memusage.h | 21 + - sysdeps/loongarch/mknod.c | 1 + - sysdeps/loongarch/mknodat.c | 1 + - sysdeps/loongarch/nptl/Makefile | 26 + - .../loongarch/nptl/bits/pthreadtypes-arch.h | 68 + - sysdeps/loongarch/nptl/bits/semaphore.h | 33 + - sysdeps/loongarch/nptl/libc-lowlevellock.c | 8 + - sysdeps/loongarch/nptl/nptl-sysdep.S | 2 + - sysdeps/loongarch/nptl/pthread-offsets.h | 23 + - sysdeps/loongarch/nptl/pthreaddef.h | 32 + - sysdeps/loongarch/nptl/tcb-offsets.sym | 6 + - sysdeps/loongarch/nptl/tls.h | 147 ++ - sysdeps/loongarch/preconfigure | 9 + - sysdeps/loongarch/pthread_atfork.c | 1 + - sysdeps/loongarch/setjmp.S | 62 + - sysdeps/loongarch/sfp-machine.h | 79 + - sysdeps/loongarch/sotruss-lib.c | 51 + - sysdeps/loongarch/stack_chk_fail_local.c | 1 + - sysdeps/loongarch/stackinfo.h | 33 + - sysdeps/loongarch/start.S | 51 + - sysdeps/loongarch/stat.c | 1 + - sysdeps/loongarch/stat64.c | 1 + - sysdeps/loongarch/sys/asm.h | 58 + - sysdeps/loongarch/sys/regdef.h | 83 + - 
sysdeps/loongarch/tininess.h | 1 + - sysdeps/loongarch/tls-macros.h | 46 + - sysdeps/loongarch/tst-audit.h | 23 + - sysdeps/loongarch/warning-nop.c | 1 + - sysdeps/unix/sysv/linux/loongarch/Implies | 1 + - sysdeps/unix/sysv/linux/loongarch/Makefile | 17 + - sysdeps/unix/sysv/linux/loongarch/Versions | 44 + - .../sysv/linux/loongarch/atomic-machine.h | 188 ++ - .../unix/sysv/linux/loongarch/bits/fcntl.h | 62 + - .../unix/sysv/linux/loongarch/bits/hwcap.h | 37 + - .../sysv/linux/loongarch/bits/local_lim.h | 99 + - sysdeps/unix/sysv/linux/loongarch/bits/mman.h | 41 + - sysdeps/unix/sysv/linux/loongarch/bits/shm.h | 112 + - .../sysv/linux/loongarch/bits/sigcontext.h | 47 + - .../unix/sysv/linux/loongarch/bits/signum.h | 58 + - sysdeps/unix/sysv/linux/loongarch/clone.S | 98 + - sysdeps/unix/sysv/linux/loongarch/configure | 199 ++ - .../unix/sysv/linux/loongarch/configure.ac | 27 + - .../unix/sysv/linux/loongarch/cpu-features.c | 32 + - .../unix/sysv/linux/loongarch/cpu-features.h | 53 + - .../unix/sysv/linux/loongarch/dl-procinfo.c | 60 + - sysdeps/unix/sysv/linux/loongarch/dl-static.c | 84 + - sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c | 21 + - .../sysv/linux/loongarch/dl-tunables.list | 27 + - .../unix/sysv/linux/loongarch/getcontext.S | 72 + - sysdeps/unix/sysv/linux/loongarch/getpid.c | 54 + - .../unix/sysv/linux/loongarch/gettimeofday.c | 58 + - sysdeps/unix/sysv/linux/loongarch/getuid.c | 60 + - .../unix/sysv/linux/loongarch/init-first.c | 57 + - sysdeps/unix/sysv/linux/loongarch/ipc_priv.h | 21 + - .../sysv/linux/loongarch/kernel-features.h | 24 + - .../unix/sysv/linux/loongarch/ldd-rewrite.sed | 1 + - sysdeps/unix/sysv/linux/loongarch/ldsodefs.h | 32 + - .../unix/sysv/linux/loongarch/libc-start.c | 28 + - sysdeps/unix/sysv/linux/loongarch/libc-vdso.h | 37 + - .../unix/sysv/linux/loongarch/localplt.data | 13 + - .../unix/sysv/linux/loongarch/lp64/Implies | 3 + - .../sysv/linux/loongarch/lp64/c++-types.data | 67 + - .../linux/loongarch/lp64/jmp_buf-macros.h | 41 + - .../unix/sysv/linux/loongarch/lp64/ld.abilist | 5 + - .../loongarch/lp64/libBrokenLocale.abilist | 1 + - .../sysv/linux/loongarch/lp64/libanl.abilist | 4 + - .../sysv/linux/loongarch/lp64/libc.abilist | 2101 ++++++++++++++++ - .../linux/loongarch/lp64/libcrypt.abilist | 7 + - .../sysv/linux/loongarch/lp64/libdl.abilist | 9 + - .../sysv/linux/loongarch/lp64/libm.abilist | 1021 ++++++++ - .../sysv/linux/loongarch/lp64/libnsl.abilist | 120 + - .../linux/loongarch/lp64/libpthread.abilist | 264 ++ - .../linux/loongarch/lp64/libresolv.abilist | 79 + - .../sysv/linux/loongarch/lp64/librt.abilist | 35 + - .../linux/loongarch/lp64/libthread_db.abilist | 40 + - .../sysv/linux/loongarch/lp64/libutil.abilist | 6 + - .../unix/sysv/linux/loongarch/makecontext.c | 78 + - .../sysv/linux/loongarch/profil-counter.h | 31 + - sysdeps/unix/sysv/linux/loongarch/pt-vfork.S | 1 + - .../unix/sysv/linux/loongarch/register-dump.h | 63 + - .../unix/sysv/linux/loongarch/setcontext.S | 111 + - .../unix/sysv/linux/loongarch/shlib-versions | 2 + - .../sysv/linux/loongarch/sigcontextinfo.h | 22 + - .../unix/sysv/linux/loongarch/swapcontext.S | 120 + - .../unix/sysv/linux/loongarch/sys/procfs.h | 122 + - .../unix/sysv/linux/loongarch/sys/ucontext.h | 81 + - sysdeps/unix/sysv/linux/loongarch/sys/user.h | 31 + - sysdeps/unix/sysv/linux/loongarch/syscall.c | 36 + - sysdeps/unix/sysv/linux/loongarch/sysdep.S | 52 + - sysdeps/unix/sysv/linux/loongarch/sysdep.h | 333 +++ - .../sysv/linux/loongarch/ucontext-macros.h | 44 + - 
.../unix/sysv/linux/loongarch/ucontext_i.sym | 33 + - sysdeps/unix/sysv/linux/loongarch/vfork.S | 49 + - 291 files changed, 24100 insertions(+), 8 deletions(-) - create mode 100644 sysdeps/loongarch/Implies - create mode 100644 sysdeps/loongarch/Makefile - create mode 100644 sysdeps/loongarch/Versions - create mode 100644 sysdeps/loongarch/__longjmp.S - create mode 100644 sysdeps/loongarch/abort-instr.h - create mode 100644 sysdeps/loongarch/at_quick_exit.c - create mode 100644 sysdeps/loongarch/atexit.c - create mode 100644 sysdeps/loongarch/bits/endian.h - create mode 100644 sysdeps/loongarch/bits/fenv.h - create mode 100644 sysdeps/loongarch/bits/link.h - create mode 100644 sysdeps/loongarch/bits/setjmp.h - create mode 100644 sysdeps/loongarch/bits/wordsize.h - create mode 100644 sysdeps/loongarch/bsd-_setjmp.c - create mode 100644 sysdeps/loongarch/bsd-setjmp.c - create mode 100755 sysdeps/loongarch/configure - create mode 100644 sysdeps/loongarch/configure.ac - create mode 100644 sysdeps/loongarch/cpu-tunables.c - create mode 100644 sysdeps/loongarch/dl-get-cpu-features.c - create mode 100644 sysdeps/loongarch/dl-irel.h - create mode 100644 sysdeps/loongarch/dl-machine.h - create mode 100644 sysdeps/loongarch/dl-tls.h - create mode 100644 sysdeps/loongarch/dl-trampoline.S - create mode 100644 sysdeps/loongarch/dl-trampoline.h - create mode 100644 sysdeps/loongarch/dl-tunables.list - create mode 100644 sysdeps/loongarch/e_sqrtl.c - create mode 100644 sysdeps/loongarch/elf-init.c - create mode 100644 sysdeps/loongarch/fenv_private.h - create mode 100644 sysdeps/loongarch/fpu/e_ilogb.c - create mode 100644 sysdeps/loongarch/fpu/e_ilogbf.c - create mode 100644 sysdeps/loongarch/fpu/e_sqrt.c - create mode 100644 sysdeps/loongarch/fpu/e_sqrtf.c - create mode 100644 sysdeps/loongarch/fpu/fclrexcpt.c - create mode 100644 sysdeps/loongarch/fpu/fedisblxcpt.c - create mode 100644 sysdeps/loongarch/fpu/feenablxcpt.c - create mode 100644 sysdeps/loongarch/fpu/fegetenv.c - create mode 100644 sysdeps/loongarch/fpu/fegetexcept.c - create mode 100644 sysdeps/loongarch/fpu/fegetmode.c - create mode 100644 sysdeps/loongarch/fpu/fegetround.c - create mode 100644 sysdeps/loongarch/fpu/feholdexcpt.c - create mode 100644 sysdeps/loongarch/fpu/fenv_libc.h - create mode 100644 sysdeps/loongarch/fpu/fesetenv.c - create mode 100644 sysdeps/loongarch/fpu/fesetexcept.c - create mode 100644 sysdeps/loongarch/fpu/fesetmode.c - create mode 100644 sysdeps/loongarch/fpu/fesetround.c - create mode 100644 sysdeps/loongarch/fpu/feupdateenv.c - create mode 100644 sysdeps/loongarch/fpu/fgetexcptflg.c - create mode 100644 sysdeps/loongarch/fpu/fraiseexcpt.c - create mode 100644 sysdeps/loongarch/fpu/fsetexcptflg.c - create mode 100644 sysdeps/loongarch/fpu/ftestexcept.c - create mode 100644 sysdeps/loongarch/fpu/s_copysign.c - create mode 100644 sysdeps/loongarch/fpu/s_copysignf.c - create mode 100644 sysdeps/loongarch/fpu/s_finite.c - create mode 100644 sysdeps/loongarch/fpu/s_finitef.c - create mode 100644 sysdeps/loongarch/fpu/s_fmax.c - create mode 100644 sysdeps/loongarch/fpu/s_fmaxf.c - create mode 100644 sysdeps/loongarch/fpu/s_fmaxmag.c - create mode 100644 sysdeps/loongarch/fpu/s_fmaxmagf.c - create mode 100644 sysdeps/loongarch/fpu/s_fmin.c - create mode 100644 sysdeps/loongarch/fpu/s_fminf.c - create mode 100644 sysdeps/loongarch/fpu/s_fminmag.c - create mode 100644 sysdeps/loongarch/fpu/s_fminmagf.c - create mode 100644 sysdeps/loongarch/fpu/s_fpclassify.c - create mode 100644 
sysdeps/loongarch/fpu/s_fpclassifyf.c - create mode 100644 sysdeps/loongarch/fpu/s_isinf.c - create mode 100644 sysdeps/loongarch/fpu/s_isinff.c - create mode 100644 sysdeps/loongarch/fpu/s_isnan.c - create mode 100644 sysdeps/loongarch/fpu/s_isnanf.c - create mode 100644 sysdeps/loongarch/fpu/s_issignaling.c - create mode 100644 sysdeps/loongarch/fpu/s_issignalingf.c - create mode 100644 sysdeps/loongarch/fpu/s_llrint.c - create mode 100644 sysdeps/loongarch/fpu/s_llrintf.c - create mode 100644 sysdeps/loongarch/fpu/s_logb.c - create mode 100644 sysdeps/loongarch/fpu/s_logbf.c - create mode 100644 sysdeps/loongarch/fpu/s_lrint.c - create mode 100644 sysdeps/loongarch/fpu/s_lrintf.c - create mode 100644 sysdeps/loongarch/fpu/s_rint.c - create mode 100644 sysdeps/loongarch/fpu/s_rintf.c - create mode 100644 sysdeps/loongarch/fpu/s_scalbn.c - create mode 100644 sysdeps/loongarch/fpu/s_scalbnf.c - create mode 100644 sysdeps/loongarch/fpu_control.h - create mode 100644 sysdeps/loongarch/fstat.c - create mode 100644 sysdeps/loongarch/fstat64.c - create mode 100644 sysdeps/loongarch/fstatat.c - create mode 100644 sysdeps/loongarch/fstatat64.c - create mode 100644 sysdeps/loongarch/gccframe.h - create mode 100644 sysdeps/loongarch/hp-timing.h - create mode 100644 sysdeps/loongarch/init-arch.h - create mode 100644 sysdeps/loongarch/jmpbuf-offsets.h - create mode 100644 sysdeps/loongarch/jmpbuf-unwind.h - create mode 100644 sysdeps/loongarch/ldsodefs.h - create mode 100644 sysdeps/loongarch/libc-start.h - create mode 100644 sysdeps/loongarch/libc-tls.c - create mode 100644 sysdeps/loongarch/linkmap.h - create mode 100644 sysdeps/loongarch/lp64/Implies-after - create mode 100644 sysdeps/loongarch/lp64/libm-test-ulps - create mode 100644 sysdeps/loongarch/lp64/libm-test-ulps-name - create mode 100644 sysdeps/loongarch/lp64/memchr.S - create mode 100644 sysdeps/loongarch/lp64/memcmp.S - create mode 100644 sysdeps/loongarch/lp64/memcpy.S - create mode 100644 sysdeps/loongarch/lp64/memmove.S - create mode 100644 sysdeps/loongarch/lp64/memset.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/Makefile - create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h - create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-lsx.h - create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h - create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h - create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-stpcpy.h - create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/memmove-aligned.S - create mode 100644 
sysdeps/loongarch/lp64/multiarch/memmove-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memmove-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memmove.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr-generic.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memset.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S - create mode 100644 
sysdeps/loongarch/lp64/multiarch/strnlen.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr.c - create mode 100644 sysdeps/loongarch/lp64/rawmemchr.S - create mode 100644 sysdeps/loongarch/lp64/s_cosf.S - create mode 100644 sysdeps/loongarch/lp64/s_sinf.S - create mode 100644 sysdeps/loongarch/lp64/stpcpy.S - create mode 100644 sysdeps/loongarch/lp64/strchr.S - create mode 100644 sysdeps/loongarch/lp64/strchrnul.S - create mode 100644 sysdeps/loongarch/lp64/strcmp.S - create mode 100644 sysdeps/loongarch/lp64/strcpy.S - create mode 100644 sysdeps/loongarch/lp64/strlen.S - create mode 100644 sysdeps/loongarch/lp64/strncmp.S - create mode 100644 sysdeps/loongarch/lp64/strnlen.S - create mode 100644 sysdeps/loongarch/lp64/strrchr.S - create mode 100644 sysdeps/loongarch/lstat.c - create mode 100644 sysdeps/loongarch/lstat64.c - create mode 100644 sysdeps/loongarch/machine-gmon.h - create mode 100644 sysdeps/loongarch/math_private.h - create mode 100644 sysdeps/loongarch/memusage.h - create mode 100644 sysdeps/loongarch/mknod.c - create mode 100644 sysdeps/loongarch/mknodat.c - create mode 100644 sysdeps/loongarch/nptl/Makefile - create mode 100644 sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h - create mode 100644 sysdeps/loongarch/nptl/bits/semaphore.h - create mode 100644 sysdeps/loongarch/nptl/libc-lowlevellock.c - create mode 100644 sysdeps/loongarch/nptl/nptl-sysdep.S - create mode 100644 sysdeps/loongarch/nptl/pthread-offsets.h - create mode 100644 sysdeps/loongarch/nptl/pthreaddef.h - create mode 100644 sysdeps/loongarch/nptl/tcb-offsets.sym - create mode 100644 sysdeps/loongarch/nptl/tls.h - create mode 100644 sysdeps/loongarch/preconfigure - create mode 100644 sysdeps/loongarch/pthread_atfork.c - create mode 100644 sysdeps/loongarch/setjmp.S - create mode 100644 sysdeps/loongarch/sfp-machine.h - create mode 100644 sysdeps/loongarch/sotruss-lib.c - create mode 100644 sysdeps/loongarch/stack_chk_fail_local.c - create mode 100644 sysdeps/loongarch/stackinfo.h - create mode 100644 sysdeps/loongarch/start.S - create mode 100644 sysdeps/loongarch/stat.c - create mode 100644 sysdeps/loongarch/stat64.c - create mode 100644 sysdeps/loongarch/sys/asm.h - create mode 100644 sysdeps/loongarch/sys/regdef.h - create mode 100644 sysdeps/loongarch/tininess.h - create mode 100644 sysdeps/loongarch/tls-macros.h - create mode 100644 sysdeps/loongarch/tst-audit.h - create mode 100644 sysdeps/loongarch/warning-nop.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/Implies - create mode 100644 sysdeps/unix/sysv/linux/loongarch/Makefile - create mode 100644 sysdeps/unix/sysv/linux/loongarch/Versions - create mode 100644 sysdeps/unix/sysv/linux/loongarch/atomic-machine.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/bits/fcntl.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/bits/local_lim.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/bits/mman.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/bits/shm.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/bits/sigcontext.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/bits/signum.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/clone.S - create mode 100644 sysdeps/unix/sysv/linux/loongarch/configure - create 
mode 100644 sysdeps/unix/sysv/linux/loongarch/configure.ac - create mode 100644 sysdeps/unix/sysv/linux/loongarch/cpu-features.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/cpu-features.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-static.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-tunables.list - create mode 100644 sysdeps/unix/sysv/linux/loongarch/getcontext.S - create mode 100644 sysdeps/unix/sysv/linux/loongarch/getpid.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/gettimeofday.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/getuid.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/init-first.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/ipc_priv.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/kernel-features.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/ldd-rewrite.sed - create mode 100644 sysdeps/unix/sysv/linux/loongarch/ldsodefs.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/libc-start.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/libc-vdso.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/localplt.data - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/Implies - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/c++-types.data - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/jmp_buf-macros.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/ld.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libBrokenLocale.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libanl.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libc.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libcrypt.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libdl.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libm.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libnsl.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libpthread.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libresolv.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/librt.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libthread_db.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libutil.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/makecontext.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/profil-counter.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/pt-vfork.S - create mode 100644 sysdeps/unix/sysv/linux/loongarch/register-dump.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/setcontext.S - create mode 100644 sysdeps/unix/sysv/linux/loongarch/shlib-versions - create mode 100644 sysdeps/unix/sysv/linux/loongarch/sigcontextinfo.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/swapcontext.S - create mode 100644 sysdeps/unix/sysv/linux/loongarch/sys/procfs.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/sys/ucontext.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/sys/user.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/syscall.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/sysdep.S - create mode 100644 sysdeps/unix/sysv/linux/loongarch/sysdep.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/ucontext-macros.h - create mode 100644 
sysdeps/unix/sysv/linux/loongarch/ucontext_i.sym - create mode 100644 sysdeps/unix/sysv/linux/loongarch/vfork.S - -diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c -index 7a84b1fa..47342c76 100644 ---- a/elf/dl-reloc.c -+++ b/elf/dl-reloc.c -@@ -235,12 +235,6 @@ _dl_relocate_object (struct link_map *l, struct r_scope_elem *scope[], - newp->start = PTR_ALIGN_DOWN (ph->p_vaddr, GLRO(dl_pagesize)) - + (caddr_t) l->l_addr; - -- if (__mprotect (newp->start, newp->len, PROT_READ|PROT_WRITE) < 0) -- { -- errstring = N_("cannot make segment writable for relocation"); -- call_error: -- _dl_signal_error (errno, l->l_name, NULL, errstring); -- } - - #if (PF_R | PF_W | PF_X) == 7 && (PROT_READ | PROT_WRITE | PROT_EXEC) == 7 - newp->prot = (PF_TO_PROT -@@ -254,6 +248,13 @@ _dl_relocate_object (struct link_map *l, struct r_scope_elem *scope[], - if (ph->p_flags & PF_X) - newp->prot |= PROT_EXEC; - #endif -+ if (__mprotect (newp->start, newp->len, PROT_READ|PROT_WRITE) < 0) -+ { -+ errstring = N_("cannot make segment writable for relocation"); -+ call_error: -+ _dl_signal_error (errno, l->l_name, NULL, errstring); -+ } -+ - newp->next = textrels; - textrels = newp; - } -diff --git a/elf/elf.h b/elf/elf.h -index ec09040b..65d1fb46 100644 ---- a/elf/elf.h -+++ b/elf/elf.h -@@ -360,8 +360,9 @@ typedef struct - #define EM_RISCV 243 /* RISC-V */ - - #define EM_BPF 247 /* Linux BPF -- in-kernel virtual machine */ -+#define EM_LOONGARCH 258 /* Loongson Loongarch */ - --#define EM_NUM 248 -+#define EM_NUM 259 - - /* Old spellings/synonyms. */ - -@@ -3932,6 +3933,88 @@ enum - #define R_NDS32_TLS_TPOFF 102 - #define R_NDS32_TLS_DESC 119 - -+/* LoongISA ELF Flags */ -+#define EF_LARCH_ABI 0x0003 -+#define EF_LARCH_ABI_LP64 0x0003 -+#define EF_LARCH_ABI_LPX32 0x0002 -+#define EF_LARCH_ABI_LP32 0x0001 -+ -+/* Loongarch specific dynamic relocations. */ -+#define R_LARCH_NONE 0 -+#define R_LARCH_32 1 -+#define R_LARCH_64 2 -+#define R_LARCH_RELATIVE 3 -+#define R_LARCH_COPY 4 -+#define R_LARCH_JUMP_SLOT 5 -+#define R_LARCH_TLS_DTPMOD32 6 -+#define R_LARCH_TLS_DTPMOD64 7 -+#define R_LARCH_TLS_DTPREL32 8 -+#define R_LARCH_TLS_DTPREL64 9 -+#define R_LARCH_TLS_TPREL32 10 -+#define R_LARCH_TLS_TPREL64 11 -+#define R_LARCH_IRELATIVE 12 -+ -+/* Reserved for future relocs that the dynamic linker must understand. 
*/
-+
-+/* used by the static linker for relocating .text */
-+#define R_LARCH_MARK_LA 20
-+#define R_LARCH_MARK_PCREL 21
-+
-+/* This relocation type pushes the PC-relative offset of the symbol from the
-+   relocation site. It is against a symbol: if it were a constant the result
-+   could be computed in the no-pic case, but the offset of the relocation site
-+   from that constant is almost certainly too large to fit, and in the pic case
-+   the offset cannot be determined at static link time. So we stipulate that
-+   this relocation is never against a constant. */
-+#define R_LARCH_SOP_PUSH_PCREL 22
-+
-+/* This relocation is against a symbol or a constant. It pushes the run-time
-+   absolute address of the symbol, or the constant, so it is an error in the
-+   pic case. I am not sure how constants relate to the ABS section. */
-+#define R_LARCH_SOP_PUSH_ABSOLUTE 23
-+#define R_LARCH_SOP_PUSH_DUP 24
-+#define R_LARCH_SOP_PUSH_GPREL 25
-+#define R_LARCH_SOP_PUSH_TLS_TPREL 26
-+#define R_LARCH_SOP_PUSH_TLS_GOT 27
-+#define R_LARCH_SOP_PUSH_TLS_GD 28
-+#define R_LARCH_SOP_PUSH_PLT_PCREL 29
-+
-+#define R_LARCH_SOP_ASSERT 30
-+#define R_LARCH_SOP_NOT 31
-+#define R_LARCH_SOP_SUB 32
-+#define R_LARCH_SOP_SL 33
-+#define R_LARCH_SOP_SR 34
-+#define R_LARCH_SOP_ADD 35
-+#define R_LARCH_SOP_AND 36
-+#define R_LARCH_SOP_IF_ELSE 37
-+#define R_LARCH_SOP_POP_32_S_10_5 38
-+#define R_LARCH_SOP_POP_32_U_10_12 39
-+#define R_LARCH_SOP_POP_32_S_10_12 40
-+#define R_LARCH_SOP_POP_32_S_10_16 41
-+#define R_LARCH_SOP_POP_32_S_10_16_S2 42
-+#define R_LARCH_SOP_POP_32_S_5_20 43
-+#define R_LARCH_SOP_POP_32_S_0_5_10_16_S2 44
-+#define R_LARCH_SOP_POP_32_S_0_10_10_16_S2 45
-+#define R_LARCH_SOP_POP_32_U 46
-+
-+/* used by the static linker for relocating non .text */
-+/* These relocation types exist to support difference expressions such as
-+   ".dword sym1 - sym2". They operate on link-time addresses and normally
-+   appear in pairs. For a plain negation such as ".dword - sym1",
-+   R_LARCH_SUBxx appears on its own; note that the value filled in at that
-+   position is the link-time address. */
-+#define R_LARCH_ADD8 47
-+#define R_LARCH_ADD16 48
-+#define R_LARCH_ADD24 49
-+#define R_LARCH_ADD32 50
-+#define R_LARCH_ADD64 51
-+#define R_LARCH_SUB8 52
-+#define R_LARCH_SUB16 53
-+#define R_LARCH_SUB24 54
-+#define R_LARCH_SUB32 55
-+#define R_LARCH_SUB64 56
-+
-+ /* I don't know what it is. Existing in almost all other arch */
-+#define R_LARCH_GNU_VTINHERIT 57
-+#define R_LARCH_GNU_VTENTRY 58
-+
-+
- __END_DECLS
-
- #endif /* elf.h */
-diff --git a/scripts/config.guess b/scripts/config.guess
-index 588fe82a..a1d1cb2a 100755
---- a/scripts/config.guess
-+++ b/scripts/config.guess
-@@ -957,6 +957,9 @@ EOF
- k1om:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
- exit ;;
-+ loongarch32:Linux:*:* | loongarch64:Linux:*:*)
-+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-+ exit ;;
- m32r*:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
- exit ;;
-diff --git a/scripts/config.sub b/scripts/config.sub
-index f2632cd8..429ec408 100755
---- a/scripts/config.sub
-+++ b/scripts/config.sub
-@@ -142,7 +142,7 @@ case $os in
- -sun*os*)
- # Prevent following clause from handling this invalid input.
- ;; -- -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ -+ -dec* | -mips* | -loongarch* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ - -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ - -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ - -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ -@@ -265,6 +265,7 @@ case $basic_machine in - | k1om \ - | le32 | le64 \ - | lm32 \ -+ | loongarch32 | loongarch64 \ - | m32c | m32r | m32rle | m68000 | m68k | m88k \ - | maxq | mb | microblaze | microblazeel | mcore | mep | metag \ - | mips | mipsbe | mipseb | mipsel | mipsle \ -@@ -390,6 +391,7 @@ case $basic_machine in - | k1om-* \ - | le32-* | le64-* \ - | lm32-* \ -+ | loongarch32-* | loongarch64-* \ - | m32c-* | m32r-* | m32rle-* \ - | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ - | m88110-* | m88k-* | maxq-* | mcore-* | metag-* \ -@@ -1339,6 +1341,9 @@ case $basic_machine in - pmac | pmac-mpw) - basic_machine=powerpc-apple - ;; -+ loongarch) -+ basic_machine=loongarch-loongson -+ ;; - *-unknown) - # Make sure to match an already-canonicalized machine name. - ;; -diff --git a/sysdeps/loongarch/Implies b/sysdeps/loongarch/Implies -new file mode 100644 -index 00000000..c88325b8 ---- /dev/null -+++ b/sysdeps/loongarch/Implies -@@ -0,0 +1,5 @@ -+init_array -+ -+ieee754/ldbl-128 -+ieee754/dbl-64 -+ieee754/flt-32 -diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile -new file mode 100644 -index 00000000..286cff67 ---- /dev/null -+++ b/sysdeps/loongarch/Makefile -@@ -0,0 +1,36 @@ -+ifeq ($(subdir),misc) -+sysdep_headers += sys/asm.h -+endif -+ -+ifeq ($(subdir),elf) -+ sysdep-dl-routines += dl-get-cpu-features -+endif -+ -+# LoongArch's assembler also needs to know about PIC as it changes the definition -+# of some assembler macros. -+ASFLAGS-.os += $(pic-ccflag) -+CFLAGS-elf-init.oS += -mcmodel=large -+CFLAGS-atexit.oS += -mcmodel=large -+CFLAGS-at_quick_exit.oS += -mcmodel=large -+CFLAGS-stat.oS += -mcmodel=large -+CFLAGS-fstat.oS += -mcmodel=large -+CFLAGS-lstat.oS += -mcmodel=large -+CFLAGS-stat64.oS += -mcmodel=large -+CFLAGS-fstat64.oS += -mcmodel=large -+CFLAGS-lstat64.oS += -mcmodel=large -+CFLAGS-fstatat.oS += -mcmodel=large -+CFLAGS-fstatat64.oS += -mcmodel=large -+CFLAGS-mknod.oS += -mcmodel=large -+CFLAGS-mknodat.oS += -mcmodel=large -+CFLAGS-pthread_atfork.oS += -mcmodel=large -+CFLAGS-warning-nop.oS += -mcmodel=large -+CFLAGS-stack_chk_fail_local.oS += -mcmodel=large -+ -+abi-variants := lp32 lp64 -+ -+ifeq (,$(filter $(default-abi),$(abi-variants))) -+$(error Unknown ABI $(default-abi), must be one of $(abi-variants)) -+endif -+ -+abi-lp64-condition := defined _ABILP64 -+abi-lp32-condition := defined _ABILP32 -diff --git a/sysdeps/loongarch/Versions b/sysdeps/loongarch/Versions -new file mode 100644 -index 00000000..33ae2cc0 ---- /dev/null -+++ b/sysdeps/loongarch/Versions -@@ -0,0 +1,5 @@ -+ld { -+ GLIBC_PRIVATE { -+ _dl_larch_get_cpu_features; -+ } -+} -diff --git a/sysdeps/loongarch/__longjmp.S b/sysdeps/loongarch/__longjmp.S -new file mode 100644 -index 00000000..68f67639 ---- /dev/null -+++ b/sysdeps/loongarch/__longjmp.S -@@ -0,0 +1,50 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+ENTRY (__longjmp) -+ REG_L ra, a0, 0*SZREG -+ REG_L sp, a0, 1*SZREG -+ REG_L x, a0, 2*SZREG -+ REG_L fp, a0, 3*SZREG -+ REG_L s0, a0, 4*SZREG -+ REG_L s1, a0, 5*SZREG -+ REG_L s2, a0, 6*SZREG -+ REG_L s3, a0, 7*SZREG -+ REG_L s4, a0, 8*SZREG -+ REG_L s5, a0, 9*SZREG -+ REG_L s6, a0, 10*SZREG -+ REG_L s7, a0, 11*SZREG -+ REG_L s8, a0, 12*SZREG -+ -+ FREG_L $f24, a0, 13*SZREG + 0*SZFREG -+ FREG_L $f25, a0, 13*SZREG + 1*SZFREG -+ FREG_L $f26, a0, 13*SZREG + 2*SZFREG -+ FREG_L $f27, a0, 13*SZREG + 3*SZFREG -+ FREG_L $f28, a0, 13*SZREG + 4*SZFREG -+ FREG_L $f29, a0, 13*SZREG + 5*SZFREG -+ FREG_L $f30, a0, 13*SZREG + 6*SZFREG -+ FREG_L $f31, a0, 13*SZREG + 7*SZFREG -+ -+ sltui a0,a1,1 -+ add.d a0, a0, a1 # a0 = (a1 == 0) ? 1 : a1 -+ jirl zero,ra,0 -+ -+END (__longjmp) -diff --git a/sysdeps/loongarch/abort-instr.h b/sysdeps/loongarch/abort-instr.h -new file mode 100644 -index 00000000..46d3ad08 ---- /dev/null -+++ b/sysdeps/loongarch/abort-instr.h -@@ -0,0 +1,2 @@ -+/* An instruction which should crash any program is a breakpoint. */ -+#define ABORT_INSTRUCTION asm ("break 0") -diff --git a/sysdeps/loongarch/at_quick_exit.c b/sysdeps/loongarch/at_quick_exit.c -new file mode 100644 -index 00000000..8d4b44a7 ---- /dev/null -+++ b/sysdeps/loongarch/at_quick_exit.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/atexit.c b/sysdeps/loongarch/atexit.c -new file mode 100644 -index 00000000..fc055a48 ---- /dev/null -+++ b/sysdeps/loongarch/atexit.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/bits/endian.h b/sysdeps/loongarch/bits/endian.h -new file mode 100644 -index 00000000..dc9a3f2e ---- /dev/null -+++ b/sysdeps/loongarch/bits/endian.h -@@ -0,0 +1,9 @@ -+/* The MIPS architecture has selectable endianness. -+ It exists in both little and big endian flavours and we -+ want to be able to share the installed header files between -+ both, so we define __BYTE_ORDER based on GCC's predefines. */ -+ -+#ifndef _ENDIAN_H -+# error "Never use directly; include instead." -+#endif -+# define __BYTE_ORDER __LITTLE_ENDIAN -diff --git a/sysdeps/loongarch/bits/fenv.h b/sysdeps/loongarch/bits/fenv.h -new file mode 100644 -index 00000000..42767412 ---- /dev/null -+++ b/sysdeps/loongarch/bits/fenv.h -@@ -0,0 +1,93 @@ -+/* Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _FENV_H -+# error "Never use directly; include instead." -+#endif -+ -+ -+/* Define bits representing the exception. We use the bit positions -+ of the appropriate bits in the FPU control word. */ -+enum -+ { -+ FE_INEXACT = -+#define FE_INEXACT 0x010000 -+ FE_INEXACT, -+ FE_UNDERFLOW = -+#define FE_UNDERFLOW 0x020000 -+ FE_UNDERFLOW, -+ FE_OVERFLOW = -+#define FE_OVERFLOW 0x040000 -+ FE_OVERFLOW, -+ FE_DIVBYZERO = -+#define FE_DIVBYZERO 0x080000 -+ FE_DIVBYZERO, -+ FE_INVALID = -+#define FE_INVALID 0x100000 -+ FE_INVALID, -+ }; -+ -+#define FE_ALL_EXCEPT \ -+ (FE_INEXACT | FE_DIVBYZERO | FE_UNDERFLOW | FE_OVERFLOW | FE_INVALID) -+ -+/* The MIPS FPU supports all of the four defined rounding modes. We -+ use again the bit positions in the FPU control word as the values -+ for the appropriate macros. */ -+enum -+ { -+ FE_TONEAREST = -+#define FE_TONEAREST 0x000 -+ FE_TONEAREST, -+ FE_TOWARDZERO = -+#define FE_TOWARDZERO 0x100 -+ FE_TOWARDZERO, -+ FE_UPWARD = -+#define FE_UPWARD 0x200 -+ FE_UPWARD, -+ FE_DOWNWARD = -+#define FE_DOWNWARD 0x300 -+ FE_DOWNWARD -+ }; -+ -+ -+/* Type representing exception flags. */ -+typedef unsigned int fexcept_t; -+ -+ -+/* Type representing floating-point environment. This function corresponds -+ to the layout of the block written by the `fstenv'. */ -+typedef struct -+ { -+ unsigned int __fp_control_register; -+ } -+fenv_t; -+ -+/* If the default argument is used we use this value. */ -+#define FE_DFL_ENV ((const fenv_t *) -1) -+ -+#ifdef __USE_GNU -+/* Floating-point environment where none of the exception is masked. */ -+# define FE_NOMASK_ENV ((const fenv_t *) -257) -+#endif -+ -+#if __GLIBC_USE (IEC_60559_BFP_EXT) -+/* Type representing floating-point control modes. */ -+typedef unsigned int femode_t; -+ -+/* Default floating-point control modes. */ -+# define FE_DFL_MODE ((const femode_t *) -1L) -+#endif -diff --git a/sysdeps/loongarch/bits/link.h b/sysdeps/loongarch/bits/link.h -new file mode 100644 -index 00000000..554dfdc0 ---- /dev/null -+++ b/sysdeps/loongarch/bits/link.h -@@ -0,0 +1,56 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LINK_H -+# error "Never include directly; use instead." -+#endif -+ -+typedef struct La_loongarch_regs -+{ -+ unsigned long int lr_reg[8]; /* a0 - a7 */ -+ double lr_fpreg[8]; /* fa0 - fa7 */ -+ unsigned long int lr_ra; -+ unsigned long int lr_sp; -+} La_loongarch_regs; -+ -+/* Return values for calls from PLT on LoongArch. 
*/ -+typedef struct La_loongarch_retval -+{ -+ unsigned long int lrv_a0; -+ unsigned long int lrv_a1; -+ double lrv_fa0; -+ double lrv_fa1; -+} La_loongarch_retval; -+ -+__BEGIN_DECLS -+ -+extern ElfW(Addr) la_loongarch_gnu_pltenter (ElfW(Sym) *__sym, unsigned int __ndx, -+ uintptr_t *__refcook, -+ uintptr_t *__defcook, -+ La_loongarch_regs *__regs, -+ unsigned int *__flags, -+ const char *__symname, -+ long int *__framesizep); -+extern unsigned int la_loongarch_gnu_pltexit (ElfW(Sym) *__sym, unsigned int __ndx, -+ uintptr_t *__refcook, -+ uintptr_t *__defcook, -+ const La_loongarch_regs *__inregs, -+ La_loongarch_retval *__outregs, -+ const char *__symname); -+ -+__END_DECLS -diff --git a/sysdeps/loongarch/bits/setjmp.h b/sysdeps/loongarch/bits/setjmp.h -new file mode 100644 -index 00000000..cc9b6bfd ---- /dev/null -+++ b/sysdeps/loongarch/bits/setjmp.h -@@ -0,0 +1,39 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LOONGARCH_BITS_SETJMP_H -+#define _LOONGARCH_BITS_SETJMP_H -+ -+typedef struct __jmp_buf_internal_tag -+ { -+ /* Program counter. */ -+ long int __pc; -+ /* Stack pointer. */ -+ long int __sp; -+ /* Reserved */ -+ long int __x; -+ /* Frame pointer. */ -+ long int __fp; -+ /* Callee-saved registers. */ -+ long int __regs[9]; -+ -+ /* Callee-saved floating point registers. */ -+ double __fpregs[8]; -+ } __jmp_buf[1]; -+ -+#endif /* _LOONGARCH_BITS_SETJMP_H */ -diff --git a/sysdeps/loongarch/bits/wordsize.h b/sysdeps/loongarch/bits/wordsize.h -new file mode 100644 -index 00000000..8dbaa00d ---- /dev/null -+++ b/sysdeps/loongarch/bits/wordsize.h -@@ -0,0 +1,22 @@ -+/* Copyright (C) 1999-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+#define __loongarch_xlen 64 -+ -+#define __WORDSIZE __loongarch_xlen -+#define __WORDSIZE_TIME64_COMPAT32 0 -+ -diff --git a/sysdeps/loongarch/bsd-_setjmp.c b/sysdeps/loongarch/bsd-_setjmp.c -new file mode 100644 -index 00000000..0d413101 ---- /dev/null -+++ b/sysdeps/loongarch/bsd-_setjmp.c -@@ -0,0 +1 @@ -+/* _setjmp is implemented in setjmp.S */ -diff --git a/sysdeps/loongarch/bsd-setjmp.c b/sysdeps/loongarch/bsd-setjmp.c -new file mode 100644 -index 00000000..ee7c5e34 ---- /dev/null -+++ b/sysdeps/loongarch/bsd-setjmp.c -@@ -0,0 +1 @@ -+/* setjmp is implemented in setjmp.S */ -diff --git a/sysdeps/loongarch/configure b/sysdeps/loongarch/configure -new file mode 100755 -index 00000000..1e5abf81 ---- /dev/null -+++ b/sysdeps/loongarch/configure -@@ -0,0 +1,4 @@ -+# This file is generated from configure.ac by Autoconf. DO NOT EDIT! -+ # Local configure fragment for sysdeps/loongarch/elf. -+ -+#AC_DEFINE(PI_STATIC_AND_HIDDEN) -diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac -new file mode 100644 -index 00000000..67b46ce0 ---- /dev/null -+++ b/sysdeps/loongarch/configure.ac -@@ -0,0 +1,6 @@ -+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. -+# Local configure fragment for sysdeps/loongarch/elf. -+ -+dnl It is always possible to access static and hidden symbols in an -+dnl position independent way. -+#AC_DEFINE(PI_STATIC_AND_HIDDEN) -diff --git a/sysdeps/loongarch/cpu-tunables.c b/sysdeps/loongarch/cpu-tunables.c -new file mode 100644 -index 00000000..840c1b8c ---- /dev/null -+++ b/sysdeps/loongarch/cpu-tunables.c -@@ -0,0 +1,94 @@ -+/* LoongArch CPU feature tuning. -+ This file is part of the GNU C Library. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#if HAVE_TUNABLES -+# define TUNABLE_NAMESPACE cpu -+# include -+# include -+# include /* Get STDOUT_FILENO for _dl_printf. 
*/ -+# include -+# include -+# include -+# include -+# include -+ -+# define HWCAP_LOONGARCH_IFUNC \ -+ (HWCAP_LOONGARCH_UAL | HWCAP_LOONGARCH_LSX | HWCAP_LOONGARCH_LASX) -+ -+# define CHECK_GLIBC_IFUNC_CPU_OFF(f, name, len) \ -+ _Static_assert (sizeof (#name) - 1 == len, #name " != " #len); \ -+ if (!memcmp (f, #name, len) && \ -+ (GLRO (dl_hwcap) & HWCAP_LOONGARCH_##name)) \ -+ { \ -+ hwcap |= (HWCAP_LOONGARCH_##name | (~HWCAP_LOONGARCH_IFUNC)); \ -+ break; \ -+ } \ -+ -+ -+attribute_hidden -+void -+TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) -+{ -+ const char *p = valp->strval; -+ size_t len; -+ unsigned long hwcap = 0; -+ const char *c; -+ -+ do { -+ for (c = p; *c != ','; c++) -+ if (*c == '\0') -+ break; -+ -+ len = c - p; -+ -+ switch(len) -+ { -+ default: -+ _dl_fatal_printf ( -+ "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n" -+ ); -+ break; -+ case 3: -+ { -+ CHECK_GLIBC_IFUNC_CPU_OFF (p, LSX, 3); -+ CHECK_GLIBC_IFUNC_CPU_OFF (p, UAL, 3); -+ _dl_fatal_printf ( -+ "Some features are invalid or not supported on this machine!!\n" -+ "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n" -+ ); -+ } -+ break; -+ case 4: -+ { -+ CHECK_GLIBC_IFUNC_CPU_OFF (p, LASX, 4); -+ _dl_fatal_printf ( -+ "Some features are invalid or not supported on this machine!!\n" -+ "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n" -+ ); -+ } -+ break; -+ } -+ -+ p += len + 1; -+ } -+ while (*c != '\0'); -+ -+ GLRO (dl_hwcap) &= hwcap; -+} -+ -+#endif -diff --git a/sysdeps/loongarch/dl-get-cpu-features.c b/sysdeps/loongarch/dl-get-cpu-features.c -new file mode 100644 -index 00000000..ed71abe0 ---- /dev/null -+++ b/sysdeps/loongarch/dl-get-cpu-features.c -@@ -0,0 +1,25 @@ -+/* Define _dl_larch_get_cpu_features. -+ Copyright (C) 2015-2022 Free Software Foundation, Inc. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+ -+#include -+ -+const struct cpu_features * -+_dl_larch_get_cpu_features (void) -+{ -+ return &GLRO(dl_larch_cpu_features); -+} -diff --git a/sysdeps/loongarch/dl-irel.h b/sysdeps/loongarch/dl-irel.h -new file mode 100644 -index 00000000..4216fec2 ---- /dev/null -+++ b/sysdeps/loongarch/dl-irel.h -@@ -0,0 +1,51 @@ -+/* Machine-dependent ELF indirect relocation inline functions. -+ x86-64 version. -+ Copyright (C) 2009-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef _DL_IREL_H -+#define _DL_IREL_H -+ -+#include -+#include -+ -+#define ELF_MACHINE_IRELA 1 -+ -+static inline ElfW(Addr) -+__attribute ((always_inline)) -+elf_ifunc_invoke (ElfW(Addr) addr) -+{ -+ return ((ElfW(Addr) (*) (void)) (addr)) (); -+} -+ -+static inline void -+__attribute ((always_inline)) -+elf_irela (const ElfW(Rela) *reloc) -+{ -+ ElfW(Addr) *const reloc_addr = (void *) reloc->r_offset; -+ const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info); -+ -+ if (__glibc_likely (r_type == R_LARCH_IRELATIVE)) -+ { -+ ElfW(Addr) value = elf_ifunc_invoke(reloc->r_addend); -+ *reloc_addr = value; -+ } -+ else -+ __libc_fatal ("Unexpected reloc type in static binary.\n"); -+} -+ -+#endif /* dl-irel.h */ -diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h -new file mode 100644 -index 00000000..2d527241 ---- /dev/null -+++ b/sysdeps/loongarch/dl-machine.h -@@ -0,0 +1,410 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef dl_machine_h -+#define dl_machine_h -+ -+#define ELF_MACHINE_NAME "LoongArch" -+ -+#if HAVE_TUNABLES -+#define TUNABLE_NAMESPACE cpu -+#include -+extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *) attribute_hidden; -+#endif -+ -+#include -+#include -+#include -+#include -+#include -+ -+ -+#ifndef _RTLD_PROLOGUE -+# define _RTLD_PROLOGUE(entry) \ -+ ".globl\t" __STRING (entry) "\n\t" \ -+ ".type\t" __STRING (entry) ", @function\n\t" \ -+ CFI_STARTPROC "\n" \ -+ __STRING (entry) ":\n" -+#endif -+ -+#ifndef _RTLD_EPILOGUE -+# define _RTLD_EPILOGUE(entry) \ -+ CFI_ENDPROC "\n\t" \ -+ ".size\t" __STRING (entry) ", . - " __STRING (entry) "\n" -+#endif -+ -+#define ELF_MACHINE_JMP_SLOT R_LARCH_JUMP_SLOT -+#define ELF_MACHINE_IRELATIVE R_LARCH_IRELATIVE -+ -+#define elf_machine_type_class(type) \ -+ ((ELF_RTYPE_CLASS_PLT * ((type) == ELF_MACHINE_JMP_SLOT \ -+ || (__WORDSIZE == 32 && (type) == R_LARCH_TLS_DTPREL32) \ -+ || (__WORDSIZE == 32 && (type) == R_LARCH_TLS_DTPMOD32) \ -+ || (__WORDSIZE == 32 && (type) == R_LARCH_TLS_TPREL32) \ -+ || (__WORDSIZE == 64 && (type) == R_LARCH_TLS_DTPREL64) \ -+ || (__WORDSIZE == 64 && (type) == R_LARCH_TLS_DTPMOD64) \ -+ || (__WORDSIZE == 64 && (type) == R_LARCH_TLS_TPREL64))) \ -+ | (ELF_RTYPE_CLASS_COPY * ((type) == R_LARCH_COPY))) -+ -+#define ELF_MACHINE_NO_REL 1 -+#define ELF_MACHINE_NO_RELA 0 -+#define PLTREL ElfW(Rela) -+ -+#define DL_PLATFORM_INIT dl_platform_init () -+ -+static inline void __attribute__ ((unused)) -+dl_platform_init (void) -+{ -+ if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0') -+ /* Avoid an empty string which would disturb us. 
*/ -+ GLRO(dl_platform) = NULL; -+ -+#ifdef SHARED -+ -+#if HAVE_TUNABLES -+ TUNABLE_GET (hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps)); -+#endif -+ /* init_cpu_features has been called early from __libc_start_main in -+ static executable. */ -+ init_cpu_features (&GLRO(dl_larch_cpu_features)); -+#endif -+} -+ -+ -+/* Return nonzero iff ELF header is compatible with the running host. */ -+static inline int __attribute_used__ -+elf_machine_matches_host (const ElfW(Ehdr) *ehdr) -+{ -+ /* We can only run LoongArch binaries. */ -+ if (ehdr->e_machine != EM_LOONGARCH) -+ return 0; -+ -+#ifdef _ABILP64 -+ if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LP64) -+#elif defined _ABILPX32 -+ if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LPX32) -+#elif defined _ABILP32 -+ if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LP32) -+#else -+# error "Unknown ABI" -+#endif -+ return 0; -+ -+ return 1; -+} -+ -+/* Runtime address of .got */ -+#define _GLOBAL_OFFSET_TABLE_ ({ \ -+ ElfW(Addr) *r; \ -+ asm ("la.pcrel %0, _GLOBAL_OFFSET_TABLE_":"=r" (r)); \ -+ r; \ -+}) -+ -+/* Return the link-time address of _DYNAMIC. */ -+static inline ElfW(Addr) -+elf_machine_dynamic (void) -+{ -+ return _GLOBAL_OFFSET_TABLE_[0]; -+} -+ -+#define STRINGXP(X) __STRING (X) -+#define STRINGXV(X) STRINGV_ (X) -+#define STRINGV_(...) # __VA_ARGS__ -+ -+/* Return the run-time load address of the shared object. */ -+static inline ElfW(Addr) -+elf_machine_load_address (void) -+{ -+ ElfW(Addr) got_linktime_addr; -+ asm ( -+ "la.got %0, _GLOBAL_OFFSET_TABLE_" -+ /* Link-time address in GOT entry before runtime relocation */ -+ : "=r" (got_linktime_addr) -+ ); -+ return (ElfW(Addr))_GLOBAL_OFFSET_TABLE_ - got_linktime_addr; -+} -+ -+/* Initial entry point code for the dynamic linker. -+ The C function `_dl_start' is the real entry point; -+ its return value is the user program's entry point. */ -+ -+#define RTLD_START asm (\ -+ ".text\n\ -+ " _RTLD_PROLOGUE (ENTRY_POINT) "\ -+ .cfi_label .Ldummy\n\ -+ " CFI_UNDEFINED (1) "\n\ -+ or $a0, $sp, $zero\n\ -+ bl _dl_start\n\ -+ # Stash user entry point in s0.\n\ -+ or $s0, $v0, $zero\n\ -+ # See if we were run as a command with the executable file\n\ -+ # name as an extra leading argument.\n\ -+ la $a0, _dl_skip_args\n\ -+ ld.w $a0, $a0, 0\n\ -+ # Load the original argument count.\n\ -+ ld.d $a1, $sp, 0\n\ -+ # Subtract _dl_skip_args from it.\n\ -+ sub.d $a1, $a1, $a0\n\ -+ # Adjust the stack pointer to skip _dl_skip_args words.\n\ -+ slli.d $a0, $a0, 3\n\ -+ add.d $sp, $sp, $a0\n\ -+ # Save back the modified argument count.\n\ -+ st.d $a1, $sp, 0\n\ -+ # Call _dl_init (struct link_map *main_map, int argc, char **argv, char **env) \n\ -+ la $a0, _rtld_local\n\ -+ ld.d $a0, $a0, 0\n\ -+ addi.d $a2, $sp, 8\n\ -+ slli.d $a3, $a1, 3\n\ -+ add.d $a3, $a3, $a2\n\ -+ addi.d $a3, $a3, 8\n\ -+ # Adjust $sp for 16-aligned\n\ -+ srli.d $t0, $sp, 4\n\ -+ slli.d $t0, $t0, 4\n\ -+ ori $t1, $sp, 0\n\ -+ addi.d $sp, $t0, -32\n\ -+ st.d $t1, $sp, 24\n\ -+ # Call the function to run the initializers.\n\ -+ bl _dl_init\n\ -+ # Pass our finalizer function to _start.\n\ -+ ld.d $sp, $sp, 24\n\ -+ la $a0, _dl_fini\n\ -+ # Jump to the user entry point.\n\ -+ jirl $zero, $s0, 0\n\ -+ " _RTLD_EPILOGUE (ENTRY_POINT) "\ -+ .previous" \ -+); -+ -+/* Names of the architecture-specific auditing callback functions. */ -+#define ARCH_LA_PLTENTER loongarch_gnu_pltenter -+#define ARCH_LA_PLTEXIT loongarch_gnu_pltexit -+ -+/* Bias .got.plt entry by the offset requested by the PLT header. 
*/ -+#define elf_machine_plt_value(map, reloc, value) (value) -+ -+static inline ElfW(Addr) -+elf_machine_fixup_plt (struct link_map *map, lookup_t t, -+ const ElfW(Sym) *refsym, const ElfW(Sym) *sym, -+ const ElfW(Rela) *reloc, -+ ElfW(Addr) *reloc_addr, ElfW(Addr) value) -+{ -+ return *reloc_addr = value; -+} -+ -+#endif /* !dl_machine_h */ -+ -+#ifdef RESOLVE_MAP -+ -+/* Perform a relocation described by R_INFO at the location pointed to -+ by RELOC_ADDR. SYM is the relocation symbol specified by R_INFO and -+ MAP is the object containing the reloc. */ -+ -+auto inline void -+__attribute__ ((always_inline)) -+elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[], -+ const ElfW(Rela) *reloc, const ElfW(Sym) *sym, -+ const struct r_found_version *version, -+ void *const reloc_addr, int skip_ifunc) -+{ -+ ElfW(Addr) r_info = reloc->r_info; -+ const unsigned long int r_type = ELFW (R_TYPE) (r_info); -+ ElfW(Addr) *addr_field = (ElfW(Addr) *) reloc_addr; -+ const ElfW(Sym) *const __attribute__ ((unused)) refsym = sym; -+ struct link_map *sym_map = RESOLVE_MAP (map, scope, &sym, version, r_type); -+ ElfW(Addr) value = 0; -+ if (sym_map != NULL) -+ value = SYMBOL_ADDRESS (sym_map, sym, true) + reloc->r_addend; -+ -+ if (sym != NULL -+ && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0) -+ && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1) -+ && __builtin_expect (!skip_ifunc, 1)) -+ value = ((ElfW(Addr) (*) (int)) value) (GLRO(dl_hwcap)); -+ -+ switch (r_type) -+ { -+#ifndef RTLD_BOOTSTRAP -+ case __WORDSIZE == 64 ? R_LARCH_TLS_DTPMOD64 : R_LARCH_TLS_DTPMOD32: -+ if (sym_map) -+ *addr_field = sym_map->l_tls_modid; -+ break; -+ -+ case __WORDSIZE == 64 ? R_LARCH_TLS_DTPREL64 : R_LARCH_TLS_DTPREL32: -+ if (sym != NULL) -+ *addr_field = TLS_DTPREL_VALUE (sym) + reloc->r_addend; -+ break; -+ -+ case __WORDSIZE == 64 ? R_LARCH_TLS_TPREL64 : R_LARCH_TLS_TPREL32: -+ if (sym != NULL) -+ { -+ CHECK_STATIC_TLS (map, sym_map); -+ *addr_field = TLS_TPREL_VALUE (sym_map, sym) + reloc->r_addend; -+ } -+ break; -+ -+ case R_LARCH_COPY: -+ { -+ if (__glibc_unlikely (sym == NULL)) -+ /* This can happen in trace mode if an object could not be -+ found. */ -+ break; -+ -+ /* Handle TLS copy relocations. */ -+ if (__glibc_unlikely (ELFW (ST_TYPE) (sym->st_info) == STT_TLS)) -+ { -+ /* There's nothing to do if the symbol is in .tbss. */ -+ if (__glibc_likely (sym->st_value >= sym_map->l_tls_initimage_size)) -+ break; -+ value += (ElfW(Addr)) sym_map->l_tls_initimage - sym_map->l_addr; -+ } -+ -+ size_t size = sym->st_size; -+ if (__glibc_unlikely (sym->st_size != refsym->st_size)) -+ { -+ const char *strtab = (const void *) D_PTR (map, l_info[DT_STRTAB]); -+ if (sym->st_size > refsym->st_size) -+ size = refsym->st_size; -+ if (sym->st_size > refsym->st_size || GLRO(dl_verbose)) -+ _dl_error_printf ("\ -+ %s: Symbol `%s' has different size in shared object, consider re-linking\n", -+ rtld_progname ?: "", -+ strtab + refsym->st_name); -+ } -+ -+ memcpy (reloc_addr, (void *)value, size); -+ break; -+ } -+#endif -+ -+#if !defined RTLD_BOOTSTRAP || !defined HAVE_Z_COMBRELOC -+ case R_LARCH_RELATIVE: -+ { -+# if !defined RTLD_BOOTSTRAP && !defined HAVE_Z_COMBRELOC -+ /* This is defined in rtld.c, but nowhere in the static libc.a; -+ make the reference weak so static programs can still link. -+ This declaration cannot be done when compiling rtld.c -+ (i.e. 
#ifdef RTLD_BOOTSTRAP) because rtld.c contains the -+ common defn for _dl_rtld_map, which is incompatible with a -+ weak decl in the same file. */ -+# ifndef SHARED -+ weak_extern (GL(dl_rtld_map)); -+# endif -+ if (map != &GL(dl_rtld_map)) /* Already done in rtld itself. */ -+# endif -+ *addr_field = map->l_addr + reloc->r_addend; -+ break; -+ } -+#endif -+ -+ case R_LARCH_JUMP_SLOT: -+ case __WORDSIZE == 64 ? R_LARCH_64 : R_LARCH_32: -+ *addr_field = value; -+ break; -+ -+ case R_LARCH_IRELATIVE: -+ value = map->l_addr + reloc->r_addend; -+ if (__glibc_likely (!skip_ifunc)) -+ value = ((ElfW(Addr) (*) (void)) value) (); -+ *addr_field = value; -+ break; -+ -+ case R_LARCH_NONE: -+ break; -+ -+ default: -+ _dl_reloc_bad_type (map, r_type, 0); -+ break; -+ } -+} -+ -+auto inline void -+__attribute__ ((always_inline)) -+elf_machine_rela_relative (ElfW(Addr) l_addr, const ElfW(Rela) *reloc, -+ void *const reloc_addr) -+{ -+ *(ElfW(Addr) *) reloc_addr = l_addr + reloc->r_addend; -+} -+ -+auto inline void -+__attribute__ ((always_inline)) -+elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[], -+ ElfW(Addr) l_addr, -+ const ElfW(Rela) *reloc, -+ int skip_ifunc) -+{ -+ ElfW(Addr) *const reloc_addr = (void *) (l_addr + reloc->r_offset); -+ const unsigned int r_type = ELFW (R_TYPE) (reloc->r_info); -+ -+ /* Check for unexpected PLT reloc type. */ -+ if (__glibc_likely (r_type == R_LARCH_JUMP_SLOT)) -+ { -+ if (__glibc_unlikely (map->l_mach.plt == 0)) -+ { -+ if (l_addr) -+ *reloc_addr += l_addr; -+ } -+ else -+ *reloc_addr = map->l_mach.plt; -+ } -+ else if (__glibc_unlikely (r_type == R_LARCH_IRELATIVE)) -+ { -+ ElfW(Addr) *value = (void *) (l_addr + reloc->r_addend); -+ if (__glibc_likely (!skip_ifunc)) -+ value = (ElfW(Addr) *)((ElfW(Addr) (*) (void)) value) (); -+ *reloc_addr = (ElfW(Addr))value; -+ } -+ else -+ _dl_reloc_bad_type (map, r_type, 1); -+} -+ -+/* Set up the loaded object described by L so its stub function -+ will jump to the on-demand fixup code __dl_runtime_resolve. */ -+ -+auto inline int -+__attribute__ ((always_inline)) -+elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], -+ int lazy, int profile) -+{ -+#ifndef RTLD_BOOTSTRAP -+ /* If using PLTs, fill in the first two entries of .got.plt. */ -+ if (l->l_info[DT_JMPREL]) -+ { -+ extern void _dl_runtime_resolve (void) __attribute__ ((visibility ("hidden"))); -+ extern void _dl_runtime_resolve_lasx (void) __attribute__ ((visibility ("hidden"))); -+ extern void _dl_runtime_resolve_lsx (void) __attribute__ ((visibility ("hidden"))); -+ ElfW(Addr) *gotplt = (ElfW(Addr) *) D_PTR (l, l_info[DT_PLTGOT]); -+ /* If a library is prelinked but we have to relocate anyway, -+ we have to be able to undo the prelinking of .got.plt. -+ The prelinker saved the address of .plt for us here. */ -+ if (gotplt[1]) -+ l->l_mach.plt = gotplt[1] + l->l_addr; -+ -+ if (SUPPORT_LASX) -+ gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx; -+ else if (SUPPORT_LSX) -+ gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx; -+ else -+ gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve; -+ -+ gotplt[1] = (ElfW(Addr)) l; -+ } -+#endif -+ -+ return lazy; -+} -+ -+#endif /* RESOLVE_MAP */ -diff --git a/sysdeps/loongarch/dl-tls.h b/sysdeps/loongarch/dl-tls.h -new file mode 100644 -index 00000000..70110c50 ---- /dev/null -+++ b/sysdeps/loongarch/dl-tls.h -@@ -0,0 +1,49 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+ -+/* Type used for the representation of TLS information in the GOT. */ -+typedef struct -+{ -+ unsigned long int ti_module; -+ unsigned long int ti_offset; -+} tls_index; -+ -+/* The thread pointer points to the first static TLS block. */ -+#define TLS_TP_OFFSET 0 -+ -+/* Dynamic thread vector pointers point 0x800 past the start of each -+ TLS block. */ -+//#define TLS_DTV_OFFSET 0x800 -+#define TLS_DTV_OFFSET 0 -+ -+/* Compute the value for a GOTTPREL reloc. */ -+#define TLS_TPREL_VALUE(sym_map, sym) \ -+ ((sym_map)->l_tls_offset + (sym)->st_value - TLS_TP_OFFSET) -+ -+/* Compute the value for a DTPREL reloc. */ -+#define TLS_DTPREL_VALUE(sym) \ -+ ((sym)->st_value - TLS_DTV_OFFSET) -+ -+extern void *__tls_get_addr (tls_index *ti); -+ -+#define GET_ADDR_OFFSET (ti->ti_offset + TLS_DTV_OFFSET) -+#define __TLS_GET_ADDR(__ti) (__tls_get_addr (__ti) - TLS_DTV_OFFSET) -+ -+/* Value used for dtv entries for which the allocation is delayed. */ -+#define TLS_DTV_UNALLOCATED ((void *) -1l) -diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S -new file mode 100644 -index 00000000..5f627a63 ---- /dev/null -+++ b/sysdeps/loongarch/dl-trampoline.S -@@ -0,0 +1,31 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#define USE_LASX -+#define _dl_runtime_resolve _dl_runtime_resolve_lasx -+#include "dl-trampoline.h" -+#undef USE_LASX -+#undef _dl_runtime_resolve -+ -+#define USE_LSX -+#define _dl_runtime_resolve _dl_runtime_resolve_lsx -+#include "dl-trampoline.h" -+#undef USE_LSX -+#undef _dl_runtime_resolve -+ -+#include "dl-trampoline.h" -diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h -new file mode 100644 -index 00000000..95639111 ---- /dev/null -+++ b/sysdeps/loongarch/dl-trampoline.h -@@ -0,0 +1,153 @@ -+/* LoongArch PLT trampoline -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+/* Assembler veneer called from the PLT header code for lazy loading. -+ The PLT header passes its own args in t0-t2. */ -+ -+#ifdef __loongarch_soft_float -+# define FRAME_SIZE (-((-10 * SZREG) & ALMASK)) -+#else -+# define FRAME_SIZE (-((-10 * SZREG - 8 * 256) & ALMASK)) -+#endif -+ -+ENTRY (_dl_runtime_resolve) -+ # Save arguments to stack. -+ -+#ifdef __loongarch64 -+ li.d t3, -FRAME_SIZE -+ add.d sp, sp, t3 -+#elif defined __loongarch32 -+ li.w t3, -FRAME_SIZE -+ add.w sp, sp, t3 -+#endif -+ -+ -+ REG_S ra, sp, 9*SZREG -+ REG_S a0, sp, 1*SZREG -+ REG_S a1, sp, 2*SZREG -+ REG_S a2, sp, 3*SZREG -+ REG_S a3, sp, 4*SZREG -+ REG_S a4, sp, 5*SZREG -+ REG_S a5, sp, 6*SZREG -+ REG_S a6, sp, 7*SZREG -+ REG_S a7, sp, 8*SZREG -+ -+#ifndef __loongarch_soft_float -+ FREG_S fa0, sp, 10*SZREG + 0*SZFREG -+ FREG_S fa1, sp, 10*SZREG + 1*SZFREG -+ FREG_S fa2, sp, 10*SZREG + 2*SZFREG -+ FREG_S fa3, sp, 10*SZREG + 3*SZFREG -+ FREG_S fa4, sp, 10*SZREG + 4*SZFREG -+ FREG_S fa5, sp, 10*SZREG + 5*SZFREG -+ FREG_S fa6, sp, 10*SZREG + 6*SZFREG -+ FREG_S fa7, sp, 10*SZREG + 7*SZFREG -+#ifdef USE_LASX -+ xvst $xr0, sp, 10*SZREG + 0*256 -+ xvst $xr1, sp, 10*SZREG + 1*256 -+ xvst $xr2, sp, 10*SZREG + 2*256 -+ xvst $xr3, sp, 10*SZREG + 3*256 -+ xvst $xr4, sp, 10*SZREG + 4*256 -+ xvst $xr5, sp, 10*SZREG + 5*256 -+ xvst $xr6, sp, 10*SZREG + 6*256 -+ xvst $xr7, sp, 10*SZREG + 7*256 -+#elif defined USE_LSX -+ vst $vr0, sp, 10*SZREG + 0*128 -+ vst $vr1, sp, 10*SZREG + 1*128 -+ vst $vr2, sp, 10*SZREG + 2*128 -+ vst $vr3, sp, 10*SZREG + 3*128 -+ vst $vr4, sp, 10*SZREG + 4*128 -+ vst $vr5, sp, 10*SZREG + 5*128 -+ vst $vr6, sp, 10*SZREG + 6*128 -+ vst $vr7, sp, 10*SZREG + 7*128 -+#endif -+#endif -+ -+ # Update .got.plt and obtain runtime address of callee. -+#ifdef __loongarch64 -+ slli.d a1, t1, 1 -+ or a0, t0, zero -+ add.d a1, a1, t1 -+ la a2, _dl_fixup -+ jirl ra, a2, 0 -+ or t1, v0, zero -+#elif defined __loongarch32 -+ slli.w a1, t1, 1 -+ or a0, t0, zero -+ add.w a1, a1, t1 -+ la a2, _dl_fixup -+ jirl ra, a2, 0 -+ or t1, v0, zero -+#endif -+ -+ # Restore arguments from stack. 
-+ REG_L ra, sp, 9*SZREG -+ REG_L a0, sp, 1*SZREG -+ REG_L a1, sp, 2*SZREG -+ REG_L a2, sp, 3*SZREG -+ REG_L a3, sp, 4*SZREG -+ REG_L a4, sp, 5*SZREG -+ REG_L a5, sp, 6*SZREG -+ REG_L a6, sp, 7*SZREG -+ REG_L a7, sp, 8*SZREG -+ -+#ifndef __loongarch_soft_float -+ FREG_L fa0, sp, 10*SZREG + 0*SZFREG -+ FREG_L fa1, sp, 10*SZREG + 1*SZFREG -+ FREG_L fa2, sp, 10*SZREG + 2*SZFREG -+ FREG_L fa3, sp, 10*SZREG + 3*SZFREG -+ FREG_L fa4, sp, 10*SZREG + 4*SZFREG -+ FREG_L fa5, sp, 10*SZREG + 5*SZFREG -+ FREG_L fa6, sp, 10*SZREG + 6*SZFREG -+ FREG_L fa7, sp, 10*SZREG + 7*SZFREG -+#ifdef USE_LASX -+ xvld $xr0, sp, 10*SZREG + 0*256 -+ xvld $xr1, sp, 10*SZREG + 1*256 -+ xvld $xr2, sp, 10*SZREG + 2*256 -+ xvld $xr3, sp, 10*SZREG + 3*256 -+ xvld $xr4, sp, 10*SZREG + 4*256 -+ xvld $xr5, sp, 10*SZREG + 5*256 -+ xvld $xr6, sp, 10*SZREG + 6*256 -+ xvld $xr7, sp, 10*SZREG + 7*256 -+#elif defined USE_LSX -+ vld $vr0, sp, 10*SZREG + 0*128 -+ vld $vr1, sp, 10*SZREG + 1*128 -+ vld $vr2, sp, 10*SZREG + 2*128 -+ vld $vr3, sp, 10*SZREG + 3*128 -+ vld $vr4, sp, 10*SZREG + 4*128 -+ vld $vr5, sp, 10*SZREG + 5*128 -+ vld $vr6, sp, 10*SZREG + 6*128 -+ vld $vr7, sp, 10*SZREG + 7*128 -+#endif -+#endif -+ -+#ifdef __loongarch64 -+ li.d t3, FRAME_SIZE -+ add.d sp, sp, t3 -+#elif defined __loongarch32 -+ li.w t3, FRAME_SIZE -+ addi.w sp, sp, FRAME_SIZE -+#endif -+ -+ -+ # Invoke the callee. -+ jirl zero, t1, 0 -+END (_dl_runtime_resolve) -diff --git a/sysdeps/loongarch/dl-tunables.list b/sysdeps/loongarch/dl-tunables.list -new file mode 100644 -index 00000000..22c43611 ---- /dev/null -+++ b/sysdeps/loongarch/dl-tunables.list -@@ -0,0 +1,25 @@ -+# LoongArch specific tunables. -+# Copyright (C) 2017-2018 Free Software Foundation, Inc. -+# This file is part of the GNU C Library. -+ -+# The GNU C Library is free software; you can redistribute it and/or -+# modify it under the terms of the GNU Lesser General Public -+# License as published by the Free Software Foundation; either -+# version 2.1 of the License, or (at your option) any later version. -+ -+# The GNU C Library is distributed in the hope that it will be useful, -+# but WITHOUT ANY WARRANTY; without even the implied warranty of -+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+# Lesser General Public License for more details. -+ -+# You should have received a copy of the GNU Lesser General Public -+# License along with the GNU C Library; if not, see -+# . -+ -+glibc { -+ cpu { -+ hwcaps { -+ type: STRING -+ } -+ } -+} -diff --git a/sysdeps/loongarch/e_sqrtl.c b/sysdeps/loongarch/e_sqrtl.c -new file mode 100644 -index 00000000..65ae7ad8 ---- /dev/null -+++ b/sysdeps/loongarch/e_sqrtl.c -@@ -0,0 +1,39 @@ -+/* long double square root in software floating-point emulation. -+ Copyright (C) 1997-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Richard Henderson (rth@cygnus.com) and -+ Jakub Jelinek (jj@ultra.linux.cz). -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include -+ -+long double -+__ieee754_sqrtl (const long double a) -+{ -+ FP_DECL_EX; -+ FP_DECL_Q(A); FP_DECL_Q(C); -+ long double c; -+ -+ FP_INIT_ROUNDMODE; -+ FP_UNPACK_Q(A, a); -+ FP_SQRT_Q(C, A); -+ FP_PACK_Q(c, C); -+ FP_HANDLE_EXCEPTIONS; -+ return c; -+} -+strong_alias (__ieee754_sqrtl, __sqrtl_finite) -diff --git a/sysdeps/loongarch/elf-init.c b/sysdeps/loongarch/elf-init.c -new file mode 100644 -index 00000000..5f261a9d ---- /dev/null -+++ b/sysdeps/loongarch/elf-init.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/fenv_private.h b/sysdeps/loongarch/fenv_private.h -new file mode 100644 -index 00000000..416377f6 ---- /dev/null -+++ b/sysdeps/loongarch/fenv_private.h -@@ -0,0 +1,328 @@ -+/* Optimized inline fenv.h functions for libm. Generic version. -+ Copyright (C) 2011-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef _FENV_PRIVATE_H -+#define _FENV_PRIVATE_H 1 -+ -+#include -+#include -+ -+/* The standards only specify one variant of the fenv.h interfaces. -+ But at least for some architectures we can be more efficient if we -+ know what operations are going to be performed. Therefore we -+ define additional interfaces. By default they refer to the normal -+ interfaces. 
*/ -+ -+static __always_inline void -+default_libc_feholdexcept (fenv_t *e) -+{ -+ (void) __feholdexcept (e); -+} -+ -+#ifndef libc_feholdexcept -+# define libc_feholdexcept default_libc_feholdexcept -+#endif -+#ifndef libc_feholdexceptf -+# define libc_feholdexceptf default_libc_feholdexcept -+#endif -+#ifndef libc_feholdexceptl -+# define libc_feholdexceptl default_libc_feholdexcept -+#endif -+ -+static __always_inline void -+default_libc_fesetround (int r) -+{ -+ (void) __fesetround (r); -+} -+ -+#ifndef libc_fesetround -+# define libc_fesetround default_libc_fesetround -+#endif -+#ifndef libc_fesetroundf -+# define libc_fesetroundf default_libc_fesetround -+#endif -+#ifndef libc_fesetroundl -+# define libc_fesetroundl default_libc_fesetround -+#endif -+ -+static __always_inline void -+default_libc_feholdexcept_setround (fenv_t *e, int r) -+{ -+ __feholdexcept (e); -+ __fesetround (r); -+} -+ -+#ifndef libc_feholdexcept_setround -+# define libc_feholdexcept_setround default_libc_feholdexcept_setround -+#endif -+#ifndef libc_feholdexcept_setroundf -+# define libc_feholdexcept_setroundf default_libc_feholdexcept_setround -+#endif -+#ifndef libc_feholdexcept_setroundl -+# define libc_feholdexcept_setroundl default_libc_feholdexcept_setround -+#endif -+ -+#ifndef libc_feholdsetround_53bit -+# define libc_feholdsetround_53bit libc_feholdsetround -+#endif -+ -+#ifndef libc_fetestexcept -+# define libc_fetestexcept fetestexcept -+#endif -+#ifndef libc_fetestexceptf -+# define libc_fetestexceptf fetestexcept -+#endif -+#ifndef libc_fetestexceptl -+# define libc_fetestexceptl fetestexcept -+#endif -+ -+static __always_inline void -+default_libc_fesetenv (fenv_t *e) -+{ -+ (void) __fesetenv (e); -+} -+ -+#ifndef libc_fesetenv -+# define libc_fesetenv default_libc_fesetenv -+#endif -+#ifndef libc_fesetenvf -+# define libc_fesetenvf default_libc_fesetenv -+#endif -+#ifndef libc_fesetenvl -+# define libc_fesetenvl default_libc_fesetenv -+#endif -+ -+static __always_inline void -+default_libc_feupdateenv (fenv_t *e) -+{ -+ (void) __feupdateenv (e); -+} -+ -+#ifndef libc_feupdateenv -+# define libc_feupdateenv default_libc_feupdateenv -+#endif -+#ifndef libc_feupdateenvf -+# define libc_feupdateenvf default_libc_feupdateenv -+#endif -+#ifndef libc_feupdateenvl -+# define libc_feupdateenvl default_libc_feupdateenv -+#endif -+ -+#ifndef libc_feresetround_53bit -+# define libc_feresetround_53bit libc_feresetround -+#endif -+ -+static __always_inline int -+default_libc_feupdateenv_test (fenv_t *e, int ex) -+{ -+ int ret = fetestexcept (ex); -+ __feupdateenv (e); -+ return ret; -+} -+ -+#ifndef libc_feupdateenv_test -+# define libc_feupdateenv_test default_libc_feupdateenv_test -+#endif -+#ifndef libc_feupdateenv_testf -+# define libc_feupdateenv_testf default_libc_feupdateenv_test -+#endif -+#ifndef libc_feupdateenv_testl -+# define libc_feupdateenv_testl default_libc_feupdateenv_test -+#endif -+ -+/* Save and set the rounding mode. The use of fenv_t to store the old mode -+ allows a target-specific version of this function to avoid converting the -+ rounding mode from the fpu format. By default we have no choice but to -+ manipulate the entire env. */ -+ -+#ifndef libc_feholdsetround -+# define libc_feholdsetround libc_feholdexcept_setround -+#endif -+#ifndef libc_feholdsetroundf -+# define libc_feholdsetroundf libc_feholdexcept_setroundf -+#endif -+#ifndef libc_feholdsetroundl -+# define libc_feholdsetroundl libc_feholdexcept_setroundl -+#endif -+ -+/* ... and the reverse. 
*/ -+ -+#ifndef libc_feresetround -+# define libc_feresetround libc_feupdateenv -+#endif -+#ifndef libc_feresetroundf -+# define libc_feresetroundf libc_feupdateenvf -+#endif -+#ifndef libc_feresetroundl -+# define libc_feresetroundl libc_feupdateenvl -+#endif -+ -+/* ... and a version that also discards exceptions. */ -+ -+#ifndef libc_feresetround_noex -+# define libc_feresetround_noex libc_fesetenv -+#endif -+#ifndef libc_feresetround_noexf -+# define libc_feresetround_noexf libc_fesetenvf -+#endif -+#ifndef libc_feresetround_noexl -+# define libc_feresetround_noexl libc_fesetenvl -+#endif -+ -+#ifndef HAVE_RM_CTX -+# define HAVE_RM_CTX 0 -+#endif -+ -+ -+/* Default implementation using standard fenv functions. -+ Avoid unnecessary rounding mode changes by first checking the -+ current rounding mode. Note the use of __glibc_unlikely is -+ important for performance. */ -+ -+static __always_inline void -+default_libc_feholdsetround_ctx (struct rm_ctx *ctx, int round) -+{ -+ ctx->updated_status = false; -+ -+ /* Update rounding mode only if different. */ -+ if (__glibc_unlikely (round != get_rounding_mode ())) -+ { -+ ctx->updated_status = true; -+ __fegetenv (&ctx->env); -+ __fesetround (round); -+ } -+} -+ -+static __always_inline void -+default_libc_feresetround_ctx (struct rm_ctx *ctx) -+{ -+ /* Restore the rounding mode if updated. */ -+ if (__glibc_unlikely (ctx->updated_status)) -+ __feupdateenv (&ctx->env); -+} -+ -+static __always_inline void -+default_libc_feholdsetround_noex_ctx (struct rm_ctx *ctx, int round) -+{ -+ /* Save exception flags and rounding mode, and disable exception -+ traps. */ -+ __feholdexcept (&ctx->env); -+ -+ /* Update rounding mode only if different. */ -+ if (__glibc_unlikely (round != get_rounding_mode ())) -+ __fesetround (round); -+} -+ -+static __always_inline void -+default_libc_feresetround_noex_ctx (struct rm_ctx *ctx) -+{ -+ /* Restore exception flags and rounding mode. */ -+ __fesetenv (&ctx->env); -+} -+ -+#if HAVE_RM_CTX -+/* Set/Restore Rounding Modes only when necessary. If defined, these functions -+ set/restore floating point state only if the state needed within the lexical -+ block is different from the current state. This saves a lot of time when -+ the floating point unit is much slower than the fixed point units. 
*/ -+ -+# ifndef libc_feholdsetround_noex_ctx -+# define libc_feholdsetround_noex_ctx libc_feholdsetround_ctx -+# endif -+# ifndef libc_feholdsetround_noexf_ctx -+# define libc_feholdsetround_noexf_ctx libc_feholdsetroundf_ctx -+# endif -+# ifndef libc_feholdsetround_noexl_ctx -+# define libc_feholdsetround_noexl_ctx libc_feholdsetroundl_ctx -+# endif -+ -+# ifndef libc_feresetround_noex_ctx -+# define libc_feresetround_noex_ctx libc_fesetenv_ctx -+# endif -+# ifndef libc_feresetround_noexf_ctx -+# define libc_feresetround_noexf_ctx libc_fesetenvf_ctx -+# endif -+# ifndef libc_feresetround_noexl_ctx -+# define libc_feresetround_noexl_ctx libc_fesetenvl_ctx -+# endif -+ -+#else -+ -+# define libc_feholdsetround_ctx default_libc_feholdsetround_ctx -+# define libc_feresetround_ctx default_libc_feresetround_ctx -+# define libc_feholdsetround_noex_ctx default_libc_feholdsetround_noex_ctx -+# define libc_feresetround_noex_ctx default_libc_feresetround_noex_ctx -+ -+# define libc_feholdsetroundf_ctx libc_feholdsetround_ctx -+# define libc_feholdsetroundl_ctx libc_feholdsetround_ctx -+# define libc_feresetroundf_ctx libc_feresetround_ctx -+# define libc_feresetroundl_ctx libc_feresetround_ctx -+ -+# define libc_feholdsetround_noexf_ctx libc_feholdsetround_noex_ctx -+# define libc_feholdsetround_noexl_ctx libc_feholdsetround_noex_ctx -+# define libc_feresetround_noexf_ctx libc_feresetround_noex_ctx -+# define libc_feresetround_noexl_ctx libc_feresetround_noex_ctx -+ -+#endif -+ -+#ifndef libc_feholdsetround_53bit_ctx -+# define libc_feholdsetround_53bit_ctx libc_feholdsetround_ctx -+#endif -+#ifndef libc_feresetround_53bit_ctx -+# define libc_feresetround_53bit_ctx libc_feresetround_ctx -+#endif -+ -+#define SET_RESTORE_ROUND_GENERIC(RM,ROUNDFUNC,CLEANUPFUNC) \ -+ struct rm_ctx ctx __attribute__((cleanup (CLEANUPFUNC ## _ctx))); \ -+ ROUNDFUNC ## _ctx (&ctx, (RM)) -+ -+/* Set the rounding mode within a lexical block. Restore the rounding mode to -+ the value at the start of the block. The exception mode must be preserved. -+ Exceptions raised within the block must be set in the exception flags. -+ Non-stop mode may be enabled inside the block. */ -+ -+#define SET_RESTORE_ROUND(RM) \ -+ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround) -+#define SET_RESTORE_ROUNDF(RM) \ -+ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundf, libc_feresetroundf) -+#define SET_RESTORE_ROUNDL(RM) \ -+ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetroundl) -+ -+/* Set the rounding mode within a lexical block. Restore the rounding mode to -+ the value at the start of the block. The exception mode must be preserved. -+ Exceptions raised within the block must be discarded, and exception flags -+ are restored to the value at the start of the block. -+ Non-stop mode must be enabled inside the block. */ -+ -+#define SET_RESTORE_ROUND_NOEX(RM) \ -+ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_noex, \ -+ libc_feresetround_noex) -+#define SET_RESTORE_ROUND_NOEXF(RM) \ -+ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_noexf, \ -+ libc_feresetround_noexf) -+#define SET_RESTORE_ROUND_NOEXL(RM) \ -+ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_noexl, \ -+ libc_feresetround_noexl) -+ -+/* Like SET_RESTORE_ROUND, but also set rounding precision to 53 bits. */ -+#define SET_RESTORE_ROUND_53BIT(RM) \ -+ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_53bit, \ -+ libc_feresetround_53bit) -+ -+#endif /* fenv_private.h. 
*/ -+ -diff --git a/sysdeps/loongarch/fpu/e_ilogb.c b/sysdeps/loongarch/fpu/e_ilogb.c -new file mode 100644 -index 00000000..f9ada692 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/e_ilogb.c -@@ -0,0 +1,39 @@ -+/* __ieee754_ilogb(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+int -+__ieee754_ilogb (double x) -+{ -+ int x_cond; -+ asm volatile ("fclass.d \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ -+ if (__glibc_unlikely (x_cond & _FCLASS_ZERO)) -+ return FP_ILOGB0; -+ else if (__glibc_unlikely (x_cond & ( _FCLASS_NAN | _FCLASS_INF))) -+ return FP_ILOGBNAN; -+ else -+ { -+ asm volatile ("fabs.d \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("flogb.d \t%0, %1" : "=f" (x) : "f" (x)); -+ return x; -+ } -+} -diff --git a/sysdeps/loongarch/fpu/e_ilogbf.c b/sysdeps/loongarch/fpu/e_ilogbf.c -new file mode 100644 -index 00000000..e1da48ec ---- /dev/null -+++ b/sysdeps/loongarch/fpu/e_ilogbf.c -@@ -0,0 +1,39 @@ -+/* __ieee754_ilogbf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+int -+__ieee754_ilogbf (float x) -+{ -+ int x_cond; -+ asm volatile ("fclass.s \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ -+ if (__glibc_unlikely (x_cond & _FCLASS_ZERO)) -+ return FP_ILOGB0; -+ else if (__glibc_unlikely (x_cond & ( _FCLASS_NAN | _FCLASS_INF))) -+ return FP_ILOGBNAN; -+ else -+ { -+ asm volatile ("fabs.s \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("flogb.s \t%0, %1" : "=f" (x) : "f" (x)); -+ return x; -+ } -+} -diff --git a/sysdeps/loongarch/fpu/e_sqrt.c b/sysdeps/loongarch/fpu/e_sqrt.c -new file mode 100644 -index 00000000..dac8696a ---- /dev/null -+++ b/sysdeps/loongarch/fpu/e_sqrt.c -@@ -0,0 +1,29 @@ -+/* Copyright (C) 2002-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Hartvig Ekner , 2002. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+ -+ -+double -+__ieee754_sqrt (double x) -+{ -+ double z; -+ __asm__ ("fsqrt.d %0,%1" : "=f" (z) : "f" (x)); -+ return z; -+} -+strong_alias (__ieee754_sqrt, __sqrt_finite) -+ -diff --git a/sysdeps/loongarch/fpu/e_sqrtf.c b/sysdeps/loongarch/fpu/e_sqrtf.c -new file mode 100644 -index 00000000..706c0494 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/e_sqrtf.c -@@ -0,0 +1,28 @@ -+/* Copyright (C) 2002-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Hartvig Ekner , 2002. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+ -+ -+float -+__ieee754_sqrtf (float x) -+{ -+ float z; -+ __asm__ ("fsqrt.s %0,%1" : "=f" (z) : "f" (x)); -+ return z; -+} -+strong_alias (__ieee754_sqrtf, __sqrtf_finite) -diff --git a/sysdeps/loongarch/fpu/fclrexcpt.c b/sysdeps/loongarch/fpu/fclrexcpt.c -new file mode 100644 -index 00000000..51310d93 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fclrexcpt.c -@@ -0,0 +1,47 @@ -+/* Clear given exceptions in current floating-point environment. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include -+ -+int -+feclearexcept (int excepts) -+{ -+ int cw; -+ -+ /* Mask out unsupported bits/exceptions. */ -+ excepts &= FE_ALL_EXCEPT; -+ -+ /* Read the complete control word. */ -+ _FPU_GETCW (cw); -+ -+ /* Clear exception flag bits and cause bits. If the cause bit is not -+ cleared, the next CTC instruction (just below) will re-generate the -+ exception. 
*/ -+ -+ cw &= ~(excepts | (excepts << CAUSE_SHIFT)); -+ -+ /* Put the new data in effect. */ -+ _FPU_SETCW (cw); -+ -+ /* Success. */ -+ return 0; -+} -+libm_hidden_def (feclearexcept) -diff --git a/sysdeps/loongarch/fpu/fedisblxcpt.c b/sysdeps/loongarch/fpu/fedisblxcpt.c -new file mode 100644 -index 00000000..004b0ecb ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fedisblxcpt.c -@@ -0,0 +1,40 @@ -+/* Disable floating-point exceptions. -+ Copyright (C) 2000-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 2000. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include -+ -+int -+fedisableexcept (int excepts) -+{ -+ unsigned int new_exc, old_exc; -+ -+ /* Get the current control word. */ -+ _FPU_GETCW (new_exc); -+ -+ old_exc = (new_exc & ENABLE_MASK) << ENABLE_SHIFT; -+ -+ excepts &= FE_ALL_EXCEPT; -+ -+ new_exc &= ~(excepts >> ENABLE_SHIFT); -+ _FPU_SETCW (new_exc); -+ -+ return old_exc; -+} -diff --git a/sysdeps/loongarch/fpu/feenablxcpt.c b/sysdeps/loongarch/fpu/feenablxcpt.c -new file mode 100644 -index 00000000..b8f56625 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/feenablxcpt.c -@@ -0,0 +1,40 @@ -+/* Enable floating-point exceptions. -+ Copyright (C) 2000-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 2000. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include -+ -+int -+feenableexcept (int excepts) -+{ -+ unsigned int new_exc, old_exc; -+ -+ /* Get the current control word. */ -+ _FPU_GETCW (new_exc); -+ -+ old_exc = (new_exc & ENABLE_MASK) << ENABLE_SHIFT; -+ -+ excepts &= FE_ALL_EXCEPT; -+ -+ new_exc |= excepts >> ENABLE_SHIFT; -+ _FPU_SETCW (new_exc); -+ -+ return old_exc; -+} -diff --git a/sysdeps/loongarch/fpu/fegetenv.c b/sysdeps/loongarch/fpu/fegetenv.c -new file mode 100644 -index 00000000..8e8fa2c5 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fegetenv.c -@@ -0,0 +1,33 @@ -+/* Store current floating-point environment. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__fegetenv (fenv_t *envp) -+{ -+ _FPU_GETCW (*envp); -+ -+ /* Success. */ -+ return 0; -+} -+libm_hidden_def (__fegetenv) -+weak_alias (__fegetenv, fegetenv) -+libm_hidden_weak (fegetenv) -diff --git a/sysdeps/loongarch/fpu/fegetexcept.c b/sysdeps/loongarch/fpu/fegetexcept.c -new file mode 100644 -index 00000000..2c0a1208 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fegetexcept.c -@@ -0,0 +1,33 @@ -+/* Get enabled floating-point exceptions. -+ Copyright (C) 2000-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 2000. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include -+ -+int -+fegetexcept (void) -+{ -+ unsigned int exc; -+ -+ /* Get the current control word. */ -+ _FPU_GETCW (exc); -+ -+ return (exc & ENABLE_MASK) << ENABLE_SHIFT; -+} -diff --git a/sysdeps/loongarch/fpu/fegetmode.c b/sysdeps/loongarch/fpu/fegetmode.c -new file mode 100644 -index 00000000..e0a5180f ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fegetmode.c -@@ -0,0 +1,27 @@ -+/* Store current floating-point control modes. MIPS version. -+ Copyright (C) 2016-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+int -+fegetmode (femode_t *modep) -+{ -+ _FPU_GETCW (*modep); -+ return 0; -+} -diff --git a/sysdeps/loongarch/fpu/fegetround.c b/sysdeps/loongarch/fpu/fegetround.c -new file mode 100644 -index 00000000..a7ac444a ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fegetround.c -@@ -0,0 +1,35 @@ -+/* Return current rounding direction. 
-+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__fegetround (void) -+{ -+ int cw; -+ -+ /* Get control word. */ -+ _FPU_GETCW (cw); -+ -+ return cw & _FPU_RC_MASK; -+} -+libm_hidden_def (__fegetround) -+weak_alias (__fegetround, fegetround) -+libm_hidden_weak (fegetround) -diff --git a/sysdeps/loongarch/fpu/feholdexcpt.c b/sysdeps/loongarch/fpu/feholdexcpt.c -new file mode 100644 -index 00000000..eb9d4764 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/feholdexcpt.c -@@ -0,0 +1,41 @@ -+/* Store current floating-point environment and clear exceptions. -+ Copyright (C) 2000-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 2000. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__feholdexcept (fenv_t *envp) -+{ -+ fpu_control_t cw; -+ -+ /* Save the current state. */ -+ _FPU_GETCW (cw); -+ envp->__fp_control_register = cw; -+ -+ /* Clear all exception enable bits and flags. */ -+ cw &= ~(_FPU_MASK_V|_FPU_MASK_Z|_FPU_MASK_O|_FPU_MASK_U|_FPU_MASK_I|FE_ALL_EXCEPT); -+ _FPU_SETCW (cw); -+ -+ return 0; -+} -+ -+libm_hidden_def (__feholdexcept) -+weak_alias (__feholdexcept, feholdexcept) -+libm_hidden_weak (feholdexcept) -diff --git a/sysdeps/loongarch/fpu/fenv_libc.h b/sysdeps/loongarch/fpu/fenv_libc.h -new file mode 100644 -index 00000000..f5dd1678 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fenv_libc.h -@@ -0,0 +1,31 @@ -+/* Copyright (C) 2000-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger . -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _FENV_LIBC_H -+#define _FENV_LIBC_H 1 -+ -+/* Mask for enabling exceptions and for the CAUSE bits. */ -+#define ENABLE_MASK 0x0000001FU -+#define CAUSE_MASK 0x1F000000U -+ -+/* Shift for FE_* flags to get up to the ENABLE bits and the CAUSE bits. */ -+#define ENABLE_SHIFT 16 -+#define CAUSE_SHIFT 8 -+ -+ -+#endif /* _FENV_LIBC_H */ -diff --git a/sysdeps/loongarch/fpu/fesetenv.c b/sysdeps/loongarch/fpu/fesetenv.c -new file mode 100644 -index 00000000..8dee8782 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fesetenv.c -@@ -0,0 +1,44 @@ -+/* Install given floating-point environment. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__fesetenv (const fenv_t *envp) -+{ -+ fpu_control_t cw; -+ -+ /* Read first current state to flush fpu pipeline. */ -+ _FPU_GETCW (cw); -+ -+ if (envp == FE_DFL_ENV) -+ _FPU_SETCW (_FPU_DEFAULT); -+ else if (envp == FE_NOMASK_ENV) -+ _FPU_SETCW (_FPU_IEEE); -+ else -+ _FPU_SETCW (envp->__fp_control_register); -+ -+ /* Success. */ -+ return 0; -+} -+ -+libm_hidden_def (__fesetenv) -+weak_alias (__fesetenv, fesetenv) -+libm_hidden_weak (fesetenv) -diff --git a/sysdeps/loongarch/fpu/fesetexcept.c b/sysdeps/loongarch/fpu/fesetexcept.c -new file mode 100644 -index 00000000..d14febca ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fesetexcept.c -@@ -0,0 +1,32 @@ -+/* Set given exception flags. MIPS version. -+ Copyright (C) 2016-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+int -+fesetexcept (int excepts) -+{ -+ fpu_control_t temp; -+ -+ _FPU_GETCW (temp); -+ temp |= excepts & FE_ALL_EXCEPT; -+ _FPU_SETCW (temp); -+ -+ return 0; -+} -diff --git a/sysdeps/loongarch/fpu/fesetmode.c b/sysdeps/loongarch/fpu/fesetmode.c -new file mode 100644 -index 00000000..8cc5d0b1 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fesetmode.c -@@ -0,0 +1,38 @@ -+/* Install given floating-point control modes. MIPS version. 
-+ Copyright (C) 2016-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+#define FCSR_STATUS 0x1f1f0000 -+ -+int -+fesetmode (const femode_t *modep) -+{ -+ fpu_control_t cw; -+ -+ _FPU_GETCW (cw); -+ cw &= FCSR_STATUS; -+ if (modep == FE_DFL_MODE) -+ cw |= _FPU_DEFAULT; -+ else -+ cw |= *modep & ~FCSR_STATUS; -+ _FPU_SETCW (cw); -+ -+ return 0; -+} -diff --git a/sysdeps/loongarch/fpu/fesetround.c b/sysdeps/loongarch/fpu/fesetround.c -new file mode 100644 -index 00000000..31fdeab3 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fesetround.c -@@ -0,0 +1,46 @@ -+/* Set current rounding direction. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__fesetround (int round) -+{ -+ fpu_control_t cw; -+ -+ if ((round & ~_FPU_RC_MASK) != 0) -+ /* ROUND is no valid rounding mode. */ -+ return 1; -+ -+ /* Get current state. */ -+ _FPU_GETCW (cw); -+ -+ /* Set rounding bits. */ -+ cw &= ~_FPU_RC_MASK; -+ cw |= round; -+ /* Set new state. */ -+ _FPU_SETCW (cw); -+ -+ return 0; -+} -+ -+libm_hidden_def (__fesetround) -+weak_alias (__fesetround, fesetround) -+libm_hidden_weak (fesetround) -diff --git a/sysdeps/loongarch/fpu/feupdateenv.c b/sysdeps/loongarch/fpu/feupdateenv.c -new file mode 100644 -index 00000000..669bfc3c ---- /dev/null -+++ b/sysdeps/loongarch/fpu/feupdateenv.c -@@ -0,0 +1,45 @@ -+/* Install given floating-point environment and raise exceptions. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
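The files above wire the ISO C <fenv.h> rounding and environment entry points (fegetround, fesetround, feholdexcept, fesetenv) to the raw FCSR via _FPU_GETCW/_FPU_SETCW. A minimal usage sketch, not part of the patch, assuming nothing beyond the standard <fenv.h> interface (link with -lm):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  fenv_t env;

  /* Save the environment and disable traps, as __feholdexcept above
     does by clearing the enable bits and flags in the control word.  */
  feholdexcept (&env);

  /* Switch the rounding direction; __fesetround above returns non-zero
     for values that do not fit in _FPU_RC_MASK.  */
  if (fesetround (FE_TOWARDZERO) != 0)
    puts ("FE_TOWARDZERO not supported");
  printf ("rounding mode is now %d\n", fegetround ());

  /* Back to the defaults: the FE_DFL_ENV branch of __fesetenv above.  */
  fesetenv (FE_DFL_ENV);
  return 0;
}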
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__feupdateenv (const fenv_t *envp) -+{ -+ int temp; -+ -+ /* Save current exceptions. */ -+ _FPU_GETCW (temp); -+ temp &= FE_ALL_EXCEPT; -+ -+ /* Install new environment. */ -+ __fesetenv (envp); -+ -+ /* Raise the safed exception. Incidently for us the implementation -+ defined format of the values in objects of type fexcept_t is the -+ same as the ones specified using the FE_* constants. */ -+ __feraiseexcept (temp); -+ -+ /* Success. */ -+ return 0; -+} -+libm_hidden_def (__feupdateenv) -+weak_alias (__feupdateenv, feupdateenv) -+libm_hidden_weak (feupdateenv) -diff --git a/sysdeps/loongarch/fpu/fgetexcptflg.c b/sysdeps/loongarch/fpu/fgetexcptflg.c -new file mode 100644 -index 00000000..1e594e14 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fgetexcptflg.c -@@ -0,0 +1,39 @@ -+/* Store current representation for exceptions. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+fegetexceptflag (fexcept_t *flagp, int excepts) -+{ -+ fpu_control_t temp; -+ -+ /* Get the current exceptions. */ -+ _FPU_GETCW (temp); -+ -+ /* We only save the relevant bits here. In particular, care has to be -+ taken with the CAUSE bits, as an inadvertent restore later on could -+ generate unexpected exceptions. */ -+ -+ *flagp = temp & excepts & FE_ALL_EXCEPT; -+ -+ /* Success. */ -+ return 0; -+} -diff --git a/sysdeps/loongarch/fpu/fraiseexcpt.c b/sysdeps/loongarch/fpu/fraiseexcpt.c -new file mode 100644 -index 00000000..2eec053a ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fraiseexcpt.c -@@ -0,0 +1,84 @@ -+/* Raise given exceptions. -+ Copyright (C) 2000-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 2000. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
*/ -+ -+#include -+#include -+#include -+ -+int -+__feraiseexcept (int excepts) -+{ -+ -+ const float fp_zero = 0.0, fp_one = 1.0, fp_max = FLT_MAX, -+ fp_min = FLT_MIN, fp_1e32 = 1.0e32f, fp_two = 2.0, -+ fp_three = 3.0; -+ -+ /* Raise exceptions represented by EXPECTS. But we must raise only -+ one signal at a time. It is important that if the overflow/underflow -+ exception and the inexact exception are given at the same time, -+ the overflow/underflow exception follows the inexact exception.*/ -+ -+ /* First: invalid exception. */ -+ if (FE_INVALID & excepts) -+ __asm__ __volatile__ ( -+ "fdiv.s $f0,%0,%0\n\t" -+ : -+ : "f" (fp_zero) -+ :"$f0"); -+ -+ /* Next: division by zero. */ -+ if (FE_DIVBYZERO & excepts) -+ __asm__ __volatile__ ( -+ "fdiv.s $f0,%0,%1\n\t" -+ : -+ : "f" (fp_one), "f" (fp_zero) -+ :"$f0"); -+ -+ /* Next: overflow. */ -+ if (FE_OVERFLOW & excepts) -+ /* There's no way to raise overflow without also raising inexact. */ -+ __asm__ __volatile__ ( -+ "fadd.s $f0,%0,%1\n\t" -+ : -+ : "f" (fp_max), "f" (fp_1e32) -+ : "$f0"); -+ -+ /* Next: underflow. */ -+ if (FE_UNDERFLOW & excepts) -+ __asm__ __volatile__ ( -+ "fdiv.s $f0,%0,%1\n\t" -+ : -+ : "f" (fp_min), "f" (fp_three) -+ : "$f0"); -+ -+ /* Last: inexact. */ -+ if (FE_INEXACT & excepts) -+ __asm__ __volatile__ ( -+ "fdiv.s $f0, %0, %1\n\t" -+ : -+ : "f" (fp_two), "f" (fp_three) -+ : "$f0"); -+ -+ /* Success. */ -+ return 0; -+} -+ -+libm_hidden_def (__feraiseexcept) -+weak_alias (__feraiseexcept, feraiseexcept) -+libm_hidden_weak (feraiseexcept) -diff --git a/sysdeps/loongarch/fpu/fsetexcptflg.c b/sysdeps/loongarch/fpu/fsetexcptflg.c -new file mode 100644 -index 00000000..dc447a77 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fsetexcptflg.c -@@ -0,0 +1,42 @@ -+/* Set floating-point environment exception handling. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Hartvig Ekner , 2002. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+fesetexceptflag (const fexcept_t *flagp, int excepts) -+{ -+ fpu_control_t temp; -+ -+ /* Get the current exceptions. */ -+ _FPU_GETCW (temp); -+ -+ /* Make sure the flags we want restored are legal. */ -+ excepts &= FE_ALL_EXCEPT; -+ -+ /* Now clear the bits called for, and copy them in from flagp. Note that -+ we ignore all non-flag bits from *flagp, so they don't matter. */ -+ temp = (temp & ~excepts) | (*flagp & excepts); -+ -+ _FPU_SETCW (temp); -+ -+ /* Success. */ -+ return 0; -+} -diff --git a/sysdeps/loongarch/fpu/ftestexcept.c b/sysdeps/loongarch/fpu/ftestexcept.c -new file mode 100644 -index 00000000..fa645b26 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/ftestexcept.c -@@ -0,0 +1,33 @@ -+/* Test exception in current environment. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+fetestexcept (int excepts) -+{ -+ int cw; -+ -+ /* Get current control word. */ -+ _FPU_GETCW (cw); -+ -+ return cw & excepts & FE_ALL_EXCEPT; -+} -+libm_hidden_def (fetestexcept) -diff --git a/sysdeps/loongarch/fpu/s_copysign.c b/sysdeps/loongarch/fpu/s_copysign.c -new file mode 100644 -index 00000000..861c4610 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_copysign.c -@@ -0,0 +1,30 @@ -+/* copysign(). LoongArch version. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+double -+__copysign (double x, double y) -+{ -+ asm ("fcopysign.d %0, %1, %2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_double (__copysign, copysign) -diff --git a/sysdeps/loongarch/fpu/s_copysignf.c b/sysdeps/loongarch/fpu/s_copysignf.c -new file mode 100644 -index 00000000..c680b1fd ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_copysignf.c -@@ -0,0 +1,30 @@ -+/* copysignf(). LoongArch version. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
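For the exception-flag functions added above (__feraiseexcept, fegetexceptflag, fesetexceptflag, fetestexcept), here is a short sketch of the save/clear/restore round trip they support; only standard <fenv.h> calls are assumed, nothing LoongArch-specific:

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  fexcept_t saved;

  /* Raise divide-by-zero the way __feraiseexcept above does with a
     real fdiv.s, then capture the flag.  */
  feraiseexcept (FE_DIVBYZERO);
  fegetexceptflag (&saved, FE_ALL_EXCEPT);

  /* Clear everything and confirm the flag is gone...  */
  feclearexcept (FE_ALL_EXCEPT);
  printf ("after clear: %d\n", fetestexcept (FE_DIVBYZERO) != 0);

  /* ...then restore it from the saved representation.  */
  fesetexceptflag (&saved, FE_ALL_EXCEPT);
  printf ("after restore: %d\n", fetestexcept (FE_DIVBYZERO) != 0);
  return 0;
}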
*/ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+float -+__copysignf (float x, float y) -+{ -+ asm ("fcopysign.s %0, %1, %2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_float (__copysign, copysign) -diff --git a/sysdeps/loongarch/fpu/s_finite.c b/sysdeps/loongarch/fpu/s_finite.c -new file mode 100644 -index 00000000..a2e98f0b ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_finite.c -@@ -0,0 +1,30 @@ -+/* finite(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__finite (double x) -+{ -+ int x_cond; -+ asm volatile ("fclass.d \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ return x_cond & ~(_FCLASS_INF | _FCLASS_NAN); -+} -+hidden_def (__finite) -+weak_alias (__finite, finite) -diff --git a/sysdeps/loongarch/fpu/s_finitef.c b/sysdeps/loongarch/fpu/s_finitef.c -new file mode 100644 -index 00000000..9ffab38a ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_finitef.c -@@ -0,0 +1,30 @@ -+/* finitef(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__finitef (float x) -+{ -+ int x_cond; -+ asm volatile ("fclass.s \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ return x_cond & ~(_FCLASS_INF | _FCLASS_NAN); -+} -+hidden_def (__finitef) -+weak_alias (__finitef, finitef) -diff --git a/sysdeps/loongarch/fpu/s_fmax.c b/sysdeps/loongarch/fpu/s_fmax.c -new file mode 100644 -index 00000000..fe7265af ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fmax.c -@@ -0,0 +1,30 @@ -+/* fmax(). LoongArch version. -+ Copyright (C) 2021-2022 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
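The copysign and finite wrappers above each boil down to a single fcopysign.{s,d} or fclass.{s,d} instruction. A quick portable check of the semantics they implement, illustration only:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* fcopysign.d: magnitude of the first operand, sign of the second.  */
  printf ("%g %g\n", copysign (3.5, -0.0), copysign (-2.0, 1.0));  /* -3.5 2 */

  /* fclass.d with the INF and NAN bits masked out, i.e. finite().  */
  printf ("finite? %d %d %d\n", isfinite (1.0) != 0,
          isfinite (INFINITY) != 0, isfinite (NAN) != 0);  /* 1 0 0 */
  return 0;
}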
See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+double -+__fmax (double x, double y) -+{ -+ asm volatile("fmax.d\t%0,%1,%2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_double (__fmax, fmax) -diff --git a/sysdeps/loongarch/fpu/s_fmaxf.c b/sysdeps/loongarch/fpu/s_fmaxf.c -new file mode 100644 -index 00000000..3defa7de ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fmaxf.c -@@ -0,0 +1,30 @@ -+/* fmaxf(). LoongArch version. -+ Copyright (C) 2021-2022 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+float -+__fmaxf (float x, float y) -+{ -+ asm volatile("fmax.s\t%0,%1,%2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_float (__fmax, fmax) -diff --git a/sysdeps/loongarch/fpu/s_fmaxmag.c b/sysdeps/loongarch/fpu/s_fmaxmag.c -new file mode 100644 -index 00000000..8570a3ba ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fmaxmag.c -@@ -0,0 +1,29 @@ -+/* fmaxmag(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+double -+__fmaxmag (double x, double y) -+{ -+ asm volatile ("fmaxa.d \t%0, %1, %2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_double (__fmaxmag, fmaxmag) -diff --git a/sysdeps/loongarch/fpu/s_fmaxmagf.c b/sysdeps/loongarch/fpu/s_fmaxmagf.c -new file mode 100644 -index 00000000..413e7683 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fmaxmagf.c -@@ -0,0 +1,29 @@ -+/* fmaxmagf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. 
-+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+float -+__fmaxmagf (float x, float y) -+{ -+ asm volatile ("fmaxa.s \t%0, %1, %2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_float (__fmaxmag, fmaxmag) -diff --git a/sysdeps/loongarch/fpu/s_fmin.c b/sysdeps/loongarch/fpu/s_fmin.c -new file mode 100644 -index 00000000..cc9d0cd1 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fmin.c -@@ -0,0 +1,30 @@ -+/* fmin(). LoongArch version. -+ Copyright (C) 2021-2022 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+double -+__fmin (double x, double y) -+{ -+ asm volatile("fmin.d\t%0,%1,%2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_double (__fmin, fmin) -diff --git a/sysdeps/loongarch/fpu/s_fminf.c b/sysdeps/loongarch/fpu/s_fminf.c -new file mode 100644 -index 00000000..40efbd71 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fminf.c -@@ -0,0 +1,30 @@ -+/* fminf(). LoongArch version. -+ Copyright (C) 2021-2022 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+float -+__fminf (float x, float y) -+{ -+ asm volatile("fmin.s\t%0,%1,%2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_float (__fmin, fmin) -diff --git a/sysdeps/loongarch/fpu/s_fminmag.c b/sysdeps/loongarch/fpu/s_fminmag.c -new file mode 100644 -index 00000000..2badf3d3 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fminmag.c -@@ -0,0 +1,29 @@ -+/* fminmag(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+double -+__fminmag (double x, double y) -+{ -+ asm volatile ("fmina.d \t%0, %1, %2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_double (__fminmag, fminmag) -diff --git a/sysdeps/loongarch/fpu/s_fminmagf.c b/sysdeps/loongarch/fpu/s_fminmagf.c -new file mode 100644 -index 00000000..4d625312 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fminmagf.c -@@ -0,0 +1,29 @@ -+/* fminmagf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+float -+__fminmagf (float x, float y) -+{ -+ asm volatile ("fmina.s \t%0, %1, %2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_float (__fminmag, fminmag) -diff --git a/sysdeps/loongarch/fpu/s_fpclassify.c b/sysdeps/loongarch/fpu/s_fpclassify.c -new file mode 100644 -index 00000000..3f4d95da ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fpclassify.c -@@ -0,0 +1,38 @@ -+/* fpclassify(). LoongArch version. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
*/ -+ -+#include -+#include -+ -+int -+__fpclassify (double x) -+{ -+ int cls; -+ asm volatile ("fclass.d \t%0, %1" : "=f" (cls) : "f" (x)); -+ -+ if (__glibc_likely (!!(cls & _FCLASS_NORM))) -+ return FP_NORMAL; -+ if (__glibc_likely (!!(cls & _FCLASS_ZERO))) -+ return FP_ZERO; -+ if (__glibc_likely (!!(cls & _FCLASS_SUBNORM))) -+ return FP_SUBNORMAL; -+ if (__glibc_likely (!!(cls & _FCLASS_INF))) -+ return FP_INFINITE; -+ return FP_NAN; -+} -+libm_hidden_def (__fpclassify) -diff --git a/sysdeps/loongarch/fpu/s_fpclassifyf.c b/sysdeps/loongarch/fpu/s_fpclassifyf.c -new file mode 100644 -index 00000000..b7c8b253 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fpclassifyf.c -@@ -0,0 +1,38 @@ -+/* Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__fpclassifyf (float x) -+{ -+ int cls; -+ asm volatile ("fclass.s \t%0, %1" : "=f" (cls) : "f" (x)); -+ -+ if (__glibc_likely (!!(cls & _FCLASS_NORM))) -+ return FP_NORMAL; -+ if (__glibc_likely (!!(cls & _FCLASS_ZERO))) -+ return FP_ZERO; -+ if (__glibc_likely (!!(cls & _FCLASS_SUBNORM))) -+ return FP_SUBNORMAL; -+ if (__glibc_likely (!!(cls & _FCLASS_INF))) -+ return FP_INFINITE; -+ return FP_NAN; -+} -+libm_hidden_def (__fpclassifyf) -diff --git a/sysdeps/loongarch/fpu/s_isinf.c b/sysdeps/loongarch/fpu/s_isinf.c -new file mode 100644 -index 00000000..c7a67841 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_isinf.c -@@ -0,0 +1,30 @@ -+/* isinf(). LoongArch version. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__isinf (double x) -+{ -+ int x_cond; -+ asm volatile ("fclass.d \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ return -((x_cond & _FCLASS_MINF) ? 1 : 0) | ((x_cond & _FCLASS_PINF) ? 1 : 0); -+} -+hidden_def (__isinf) -+weak_alias (__isinf, isinf) -diff --git a/sysdeps/loongarch/fpu/s_isinff.c b/sysdeps/loongarch/fpu/s_isinff.c -new file mode 100644 -index 00000000..dcb4e04e ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_isinff.c -@@ -0,0 +1,30 @@ -+/* isinff(). LoongArch version. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
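__fpclassify and __fpclassifyf above decode the fclass.{d,s} bit mask (_FCLASS_NORM, _FCLASS_ZERO, ...) into the standard FP_* classes. A compact sketch of the observable behaviour, using only <math.h>:

#include <float.h>
#include <math.h>
#include <stdio.h>

static const char *
class_name (double x)
{
  switch (fpclassify (x))
    {
    case FP_NAN:       return "nan";
    case FP_INFINITE:  return "inf";
    case FP_ZERO:      return "zero";
    case FP_SUBNORMAL: return "subnormal";
    default:           return "normal";
    }
}

int
main (void)
{
  /* Expected output: zero inf subnormal normal  */
  printf ("%s %s %s %s\n", class_name (0.0), class_name (INFINITY),
          class_name (DBL_MIN / 2), class_name (1.0));
  return 0;
}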
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__isinff (float x) -+{ -+ int x_cond; -+ asm volatile ("fclass.s \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ return -((x_cond & _FCLASS_MINF) ? 1 : 0) | ((x_cond & _FCLASS_PINF) ? 1 : 0); -+} -+hidden_def (__isinff) -+weak_alias (__isinff, isinff) -diff --git a/sysdeps/loongarch/fpu/s_isnan.c b/sysdeps/loongarch/fpu/s_isnan.c -new file mode 100644 -index 00000000..62bb2e2f ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_isnan.c -@@ -0,0 +1,31 @@ -+/* isnan(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__isnan (double x) -+{ -+ int x_cond; -+ asm volatile ("fclass.d \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ -+ return (x_cond & _FCLASS_NAN) != 0; -+} -+hidden_def (__isnan) -+weak_alias (__isnan, isnan) -diff --git a/sysdeps/loongarch/fpu/s_isnanf.c b/sysdeps/loongarch/fpu/s_isnanf.c -new file mode 100644 -index 00000000..bbdedb84 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_isnanf.c -@@ -0,0 +1,31 @@ -+/* isnanf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
*/ -+ -+#include -+#include -+ -+int -+__isnanf (float x) -+{ -+ int x_cond; -+ asm volatile ("fclass.s \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ -+ return (x_cond & _FCLASS_NAN) != 0; -+} -+hidden_def (__isnanf) -+weak_alias (__isnanf, isnanf) -diff --git a/sysdeps/loongarch/fpu/s_issignaling.c b/sysdeps/loongarch/fpu/s_issignaling.c -new file mode 100644 -index 00000000..4fe0e2b7 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_issignaling.c -@@ -0,0 +1,29 @@ -+/* issignaling(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__issignaling (double x) -+{ -+ int x_cond; -+ asm volatile ("fclass.d \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ return (x_cond & _FCLASS_SNAN) != 0; -+} -+libm_hidden_def (__issignaling) -diff --git a/sysdeps/loongarch/fpu/s_issignalingf.c b/sysdeps/loongarch/fpu/s_issignalingf.c -new file mode 100644 -index 00000000..d82abb0e ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_issignalingf.c -@@ -0,0 +1,29 @@ -+/* issignalingf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__issignalingf (float x) -+{ -+ int x_cond; -+ asm volatile ("fclass.s \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ return (x_cond & _FCLASS_SNAN) != 0; -+} -+libm_hidden_def (__issignalingf) -diff --git a/sysdeps/loongarch/fpu/s_llrint.c b/sysdeps/loongarch/fpu/s_llrint.c -new file mode 100644 -index 00000000..4a8e46ec ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_llrint.c -@@ -0,0 +1,31 @@ -+/* llrint(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+long long int -+__llrint (double x) -+{ -+ long long int result; -+ asm volatile ("ftint.l.d \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("movfr2gr.d \t%0, %1" : "=r" (result) : "f" (x)); -+ return result; -+} -+libm_alias_double (__llrint, llrint) -diff --git a/sysdeps/loongarch/fpu/s_llrintf.c b/sysdeps/loongarch/fpu/s_llrintf.c -new file mode 100644 -index 00000000..f3a874a0 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_llrintf.c -@@ -0,0 +1,31 @@ -+/* llrintf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+long long int -+__llrintf (float x) -+{ -+ long long int result; -+ asm volatile ("ftint.l.s \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("movfr2gr.d \t%0, %1" : "=r" (result) : "f" (x)); -+ return result; -+} -+libm_alias_float (__llrint, llrint) -diff --git a/sysdeps/loongarch/fpu/s_logb.c b/sysdeps/loongarch/fpu/s_logb.c -new file mode 100644 -index 00000000..31bb3be5 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_logb.c -@@ -0,0 +1,30 @@ -+/* logb(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+double -+__logb (double x) -+{ -+ asm volatile ("fabs.d \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("flogb.d \t%0, %1" : "=f" (x) : "f" (x)); -+ return x; -+} -+libm_alias_double (__logb, logb) -diff --git a/sysdeps/loongarch/fpu/s_logbf.c b/sysdeps/loongarch/fpu/s_logbf.c -new file mode 100644 -index 00000000..f5166bca ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_logbf.c -@@ -0,0 +1,30 @@ -+/* logbf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+float -+__logbf (float x) -+{ -+ asm volatile ("fabs.s \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("flogb.s \t%0, %1" : "=f" (x) : "f" (x)); -+ return x; -+} -+libm_alias_float (__logb, logb) -diff --git a/sysdeps/loongarch/fpu/s_lrint.c b/sysdeps/loongarch/fpu/s_lrint.c -new file mode 100644 -index 00000000..db446111 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_lrint.c -@@ -0,0 +1,31 @@ -+/* lrint(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+long int -+__lrint (double x) -+{ -+ long int result; -+ asm volatile ("ftint.l.d \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("movfr2gr.d \t%0, %1" : "=r" (result) : "f" (x)); -+ return result; -+} -+libm_alias_double (__lrint, lrint) -diff --git a/sysdeps/loongarch/fpu/s_lrintf.c b/sysdeps/loongarch/fpu/s_lrintf.c -new file mode 100644 -index 00000000..cde60b88 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_lrintf.c -@@ -0,0 +1,31 @@ -+/* lrintf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+long int -+__lrintf (float x) -+{ -+ long int result; -+ asm volatile ("ftint.l.s \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("movfr2gr.d \t%0, %1" : "=r" (result) : "f" (x)); -+ return result; -+} -+libm_alias_float (__lrint, lrint) -diff --git a/sysdeps/loongarch/fpu/s_rint.c b/sysdeps/loongarch/fpu/s_rint.c -new file mode 100644 -index 00000000..429d5d11 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_rint.c -@@ -0,0 +1,29 @@ -+/* rint(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+double -+__rint (double x) -+{ -+ asm volatile ("frint.d \t%0, %1" : "=f" (x) : "f" (x)); -+ return x; -+} -+libm_alias_double (__rint, rint) -diff --git a/sysdeps/loongarch/fpu/s_rintf.c b/sysdeps/loongarch/fpu/s_rintf.c -new file mode 100644 -index 00000000..b3faba20 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_rintf.c -@@ -0,0 +1,29 @@ -+/* rintf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+float -+__rintf (float x) -+{ -+ asm volatile ("frint.s \t%0, %1" : "=f" (x) : "f" (x)); -+ return x; -+} -+libm_alias_float (__rint, rint) -diff --git a/sysdeps/loongarch/fpu/s_scalbn.c b/sysdeps/loongarch/fpu/s_scalbn.c -new file mode 100644 -index 00000000..c03e81a3 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_scalbn.c -@@ -0,0 +1,29 @@ -+/* scalbn(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
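The rint/lrint/llrint family above uses frint.{s,d} and ftint.l.{s,d}, so the results follow the current dynamic rounding mode, as ISO C specifies for these functions. A small demonstration of that dependency (illustrative, not from the patch):

#include <fenv.h>
#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* Round toward minus infinity: 2.5 -> 2, -2.5 -> -3.  */
  fesetround (FE_DOWNWARD);
  printf ("%ld %ld\n", lrint (2.5), lrint (-2.5));

  /* Round to nearest, ties to even: 2.5 -> 2, -2.5 -> -2.  */
  fesetround (FE_TONEAREST);
  printf ("%ld %ld\n", lrint (2.5), lrint (-2.5));
  return 0;
}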
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+ -+double -+__scalbn (double x, int fn) -+{ -+ double tmp; -+ asm volatile ("movgr2fr.d \t%0, %1" : "=f" (tmp) : "r" (fn)); -+ asm volatile ("fscaleb.d \t%0, %1, %2" : "=f" (x) : "f" (x), "f" (tmp)); -+ return x; -+} -diff --git a/sysdeps/loongarch/fpu/s_scalbnf.c b/sysdeps/loongarch/fpu/s_scalbnf.c -new file mode 100644 -index 00000000..15e64280 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_scalbnf.c -@@ -0,0 +1,29 @@ -+/* scalbnf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+ -+float -+__scalbnf (float x, int fn) -+{ -+ float tmp; -+ asm volatile ("movgr2fr.w \t%0, %1" : "=f" (tmp) : "r" (fn)); -+ asm volatile ("fscaleb.s \t%0, %1, %2" : "=f" (x) : "f" (x), "f" (tmp)); -+ return x; -+} -diff --git a/sysdeps/loongarch/fpu_control.h b/sysdeps/loongarch/fpu_control.h -new file mode 100644 -index 00000000..8f688592 ---- /dev/null -+++ b/sysdeps/loongarch/fpu_control.h -@@ -0,0 +1,128 @@ -+/* FPU control word bits. Mips version. -+ Copyright (C) 1996-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Olaf Flebbe and Ralf Baechle. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _FPU_CONTROL_H -+#define _FPU_CONTROL_H -+ -+/* MIPS FPU floating point control register bits. -+ * -+ * 31-25 -> floating point conditions code bits 7-1. These bits are only -+ * available in MIPS IV. -+ * 24 -> flush denormalized results to zero instead of -+ * causing unimplemented operation exception. This bit is only -+ * available for MIPS III and newer. 
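__scalbn/__scalbnf above move the integer exponent into an FPR (movgr2fr.{d,w}) and scale with fscaleb.{d,s}; the C-level contract is simply x * 2^n (FLT_RADIX is 2 here). A minimal check in portable C:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* 0.75 * 2^5 = 24, 48 * 2^-4 = 3.  */
  printf ("%g %g\n", scalbn (0.75, 5), scalbn (48.0, -4));

  /* 2^140 exceeds FLT_MAX, so the float variant overflows to inf.  */
  printf ("%g\n", scalbnf (1.0f, 140));
  return 0;
}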
-+ * 23 -> Condition bit -+ * 22-21 -> reserved for architecture implementers -+ * 20 -> reserved (read as 0, write with 0) -+ * 19 -> IEEE 754-2008 non-arithmetic ABS.fmt and NEG.fmt enable -+ * 18 -> IEEE 754-2008 recommended NaN encoding enable -+ * 17 -> cause bit for unimplemented operation -+ * 28 -> cause bit for invalid exception -+ * 27 -> cause bit for division by zero exception -+ * 26 -> cause bit for overflow exception -+ * 25 -> cause bit for underflow exception -+ * 24 -> cause bit for inexact exception -+ * 4 -> enable exception for invalid exception -+ * 3 -> enable exception for division by zero exception -+ * 2 -> enable exception for overflow exception -+ * 1 -> enable exception for underflow exception -+ * 0 -> enable exception for inexact exception -+ * 20 -> flag invalid exception -+ * 19 -> flag division by zero exception -+ * 18 -> flag overflow exception -+ * 17 -> flag underflow exception -+ * 16 -> flag inexact exception -+ * 9-8 -> rounding control -+ * -+ * -+ * Rounding Control: -+ * 00 - rounding to nearest (RN) -+ * 01 - rounding toward zero (RZ) -+ * 10 - rounding (up) toward plus infinity (RP) -+ * 11 - rounding (down)toward minus infinity (RM) -+ */ -+ -+#include -+ -+#ifdef __loongarch_soft_float -+ -+#define _FPU_RESERVED 0xffffffff -+#define _FPU_DEFAULT 0x00000000 -+typedef unsigned int fpu_control_t; -+#define _FPU_GETCW(cw) (cw) = 0 -+#define _FPU_SETCW(cw) (void) (cw) -+extern fpu_control_t __fpu_control; -+ -+#else /* __loongarch_soft_float */ -+ -+/* Masks for interrupts. */ -+#define _FPU_MASK_V 0x10 /* Invalid operation */ -+#define _FPU_MASK_Z 0x08 /* Division by zero */ -+#define _FPU_MASK_O 0x04 /* Overflow */ -+#define _FPU_MASK_U 0x02 /* Underflow */ -+#define _FPU_MASK_I 0x01 /* Inexact operation */ -+ -+/* Flush denormalized numbers to zero. */ -+#define _FPU_FLUSH_TZ 0x1000000 -+ -+/* Rounding control. */ -+#define _FPU_RC_NEAREST 0x000 /* RECOMMENDED */ -+#define _FPU_RC_ZERO 0x100 -+#define _FPU_RC_UP 0x200 -+#define _FPU_RC_DOWN 0x300 -+/* Mask for rounding control. */ -+#define _FPU_RC_MASK 0x300 -+ -+#define _FPU_RESERVED 0x0 -+ -+#define _FPU_DEFAULT 0x0 -+#define _FPU_IEEE 0x1F -+ -+/* Type of the control word. */ -+typedef unsigned int fpu_control_t __attribute__ ((__mode__ (__SI__))); -+ -+/* Macros for accessing the hardware control word. */ -+extern fpu_control_t __mips_fpu_getcw (void) __THROW; -+extern void __mips_fpu_setcw (fpu_control_t) __THROW; -+#define _FPU_GETCW(cw) __asm__ volatile ("movfcsr2gr %0,$r0" : "=r" (cw)) -+#define _FPU_SETCW(cw) __asm__ volatile ("movgr2fcsr $r0,%0" : : "r" (cw)) -+ -+/* Default control word set at startup. 
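The fpu_control.h hunk above exposes the FCSR through _FPU_GETCW/_FPU_SETCW (movfcsr2gr/movgr2fcsr) plus the _FPU_RC_* and _FPU_MASK_* constants that the fe* functions build on. A LoongArch-only sketch relating the two layers; it assumes the <fpu_control.h> installed by this patch:

#include <fenv.h>
#include <fpu_control.h>
#include <stdio.h>

int
main (void)
{
  fpu_control_t cw;

  _FPU_GETCW (cw);   /* expands to movfcsr2gr on this port */
  printf ("FCSR = %#x\n", (unsigned int) cw);

  /* __fegetround in the patch is literally cw & _FPU_RC_MASK, so the
     two values below should match.  */
  printf ("RC field %#x vs fegetround() %#x\n",
          (unsigned int) (cw & _FPU_RC_MASK), (unsigned int) fegetround ());
  return 0;
}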
*/ -+extern fpu_control_t __fpu_control; -+ -+# define _FCLASS_SNAN (1 << 0) -+# define _FCLASS_QNAN (1 << 1) -+# define _FCLASS_MINF (1 << 2) -+# define _FCLASS_MNORM (1 << 3) -+# define _FCLASS_MSUBNORM (1 << 4) -+# define _FCLASS_MZERO (1 << 5) -+# define _FCLASS_PINF (1 << 6) -+# define _FCLASS_PNORM (1 << 7) -+# define _FCLASS_PSUBNORM (1 << 8) -+# define _FCLASS_PZERO (1 << 9) -+ -+# define _FCLASS_ZERO (_FCLASS_MZERO | _FCLASS_PZERO) -+# define _FCLASS_SUBNORM (_FCLASS_MSUBNORM | _FCLASS_PSUBNORM) -+# define _FCLASS_NORM (_FCLASS_MNORM | _FCLASS_PNORM) -+# define _FCLASS_INF (_FCLASS_MINF | _FCLASS_PINF) -+# define _FCLASS_NAN (_FCLASS_SNAN | _FCLASS_QNAN) -+ -+#endif /* __loongarch_soft_float */ -+ -+#endif /* fpu_control.h */ -diff --git a/sysdeps/loongarch/fstat.c b/sysdeps/loongarch/fstat.c -new file mode 100644 -index 00000000..c4504eeb ---- /dev/null -+++ b/sysdeps/loongarch/fstat.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/fstat64.c b/sysdeps/loongarch/fstat64.c -new file mode 100644 -index 00000000..143ca2b0 ---- /dev/null -+++ b/sysdeps/loongarch/fstat64.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/fstatat.c b/sysdeps/loongarch/fstatat.c -new file mode 100644 -index 00000000..0b0a3342 ---- /dev/null -+++ b/sysdeps/loongarch/fstatat.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/fstatat64.c b/sysdeps/loongarch/fstatat64.c -new file mode 100644 -index 00000000..e82b9274 ---- /dev/null -+++ b/sysdeps/loongarch/fstatat64.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/gccframe.h b/sysdeps/loongarch/gccframe.h -new file mode 100644 -index 00000000..5c799c64 ---- /dev/null -+++ b/sysdeps/loongarch/gccframe.h -@@ -0,0 +1,21 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#define FIRST_PSEUDO_REGISTER 74 -+ -+#include -diff --git a/sysdeps/loongarch/hp-timing.h b/sysdeps/loongarch/hp-timing.h -new file mode 100644 -index 00000000..2d006540 ---- /dev/null -+++ b/sysdeps/loongarch/hp-timing.h -@@ -0,0 +1,40 @@ -+/* High precision, low overhead timing functions. x86-64 version. -+ Copyright (C) 2002-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef _HP_TIMING_H -+#define _HP_TIMING_H 1 -+ -+/* We always assume having the timestamp register. */ -+#define HP_TIMING_AVAIL (1) -+#define HP_SMALL_TIMING_AVAIL (1) -+ -+/* We indeed have inlined functions. */ -+#define HP_TIMING_INLINE (1) -+ -+/* We use 64bit values for the times. */ -+typedef unsigned long long int hp_timing_t; -+ -+/* Read the cp0 count, this maybe inaccurate. */ -+#define HP_TIMING_NOW(Var) \ -+ ({ unsigned long long int _count; \ -+ asm volatile ("rdtime.d\t%0,$r0" : "=r" (_count)); \ -+ (Var) = _count; }) -+ -+#include -+ -+#endif /* hp-timing.h */ -diff --git a/sysdeps/loongarch/init-arch.h b/sysdeps/loongarch/init-arch.h -new file mode 100644 -index 00000000..7db7b7b3 ---- /dev/null -+++ b/sysdeps/loongarch/init-arch.h -@@ -0,0 +1,24 @@ -+/* This file is part of the GNU C Library. -+ Copyright (C) 2008-2022 Free Software Foundation, Inc. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+#define INIT_ARCH() \ -+ uint64_t __attribute__((unused)) prid = \ -+ GLRO(dl_larch_cpu_features).cpucfg_prid; \ -+ -diff --git a/sysdeps/loongarch/jmpbuf-offsets.h b/sysdeps/loongarch/jmpbuf-offsets.h -new file mode 100644 -index 00000000..bc4c1523 ---- /dev/null -+++ b/sysdeps/loongarch/jmpbuf-offsets.h -@@ -0,0 +1,23 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public License as -+ published by the Free Software Foundation; either version 2.1 of the -+ License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+ -+/* Helper for generic ____longjmp_chk(). */ -+#define JB_FRAME_ADDRESS(buf) \ -+ ((void *) _jmpbuf_sp (buf)) -diff --git a/sysdeps/loongarch/jmpbuf-unwind.h b/sysdeps/loongarch/jmpbuf-unwind.h -new file mode 100644 -index 00000000..c866d910 ---- /dev/null -+++ b/sysdeps/loongarch/jmpbuf-unwind.h -@@ -0,0 +1,46 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. 
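HP_TIMING_NOW above is internal to glibc, but the counter read it wraps is a single rdtime.d instruction. A standalone LoongArch-only sketch of the same read; the helper name below is the editor's, not something defined by the patch:

#include <stdio.h>

/* Mirrors the inline asm used by HP_TIMING_NOW in the hunk above.  */
static inline unsigned long long
read_stable_counter (void)
{
  unsigned long long count;
  asm volatile ("rdtime.d\t%0,$r0" : "=r" (count));
  return count;
}

int
main (void)
{
  unsigned long long a = read_stable_counter ();
  unsigned long long b = read_stable_counter ();
  printf ("stable counter advanced by %llu ticks\n", b - a);
  return 0;
}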
-+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include -+#include -+ -+/* Test if longjmp to JMPBUF would unwind the frame -+ containing a local variable at ADDRESS. */ -+#define _JMPBUF_UNWINDS(jmpbuf, address, demangle) \ -+ ((void *) (address) < (void *) demangle ((jmpbuf)[0].__sp)) -+ -+#define _JMPBUF_CFA_UNWINDS_ADJ(_jmpbuf, _context, _adj) \ -+ _JMPBUF_UNWINDS_ADJ (_jmpbuf, (void *) _Unwind_GetCFA (_context), _adj) -+ -+static inline uintptr_t __attribute__ ((unused)) -+_jmpbuf_sp (__jmp_buf regs) -+{ -+ uintptr_t sp = regs[0].__sp; -+#ifdef PTR_DEMANGLE -+ PTR_DEMANGLE (sp); -+#endif -+ return sp; -+} -+ -+#define _JMPBUF_UNWINDS_ADJ(_jmpbuf, _address, _adj) \ -+ ((uintptr_t) (_address) - (_adj) < _jmpbuf_sp (_jmpbuf) - (_adj)) -+ -+/* We use the normal longjmp for unwinding. */ -+#define __libc_unwind_longjmp(buf, val) __libc_longjmp (buf, val) -diff --git a/sysdeps/loongarch/ldsodefs.h b/sysdeps/loongarch/ldsodefs.h -new file mode 100644 -index 00000000..f3c07709 ---- /dev/null -+++ b/sysdeps/loongarch/ldsodefs.h -@@ -0,0 +1,48 @@ -+/* Run-time dynamic linker data structures for loaded ELF shared objects. -+ Copyright (C) 2011-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LOONGARCH_LDSODEFS_H -+#define _LOONGARCH_LDSODEFS_H 1 -+ -+#include -+#include -+ -+struct La_loongarch_regs; -+struct La_loongarch_retval; -+ -+#define ARCH_PLTENTER_MEMBERS \ -+ ElfW(Addr) (*loongarch_gnu_pltenter) (ElfW(Sym) *, unsigned int, \ -+ uintptr_t *, uintptr_t *, \ -+ const struct La_loongarch_regs *, \ -+ unsigned int *, const char *name, \ -+ long int *framesizep); -+ -+#define ARCH_PLTEXIT_MEMBERS \ -+ unsigned int (*loongarch_gnu_pltexit) (ElfW(Sym) *, unsigned int, \ -+ uintptr_t *, uintptr_t *, \ -+ const struct La_loongarch_regs *, \ -+ struct La_loongarch_retval *, \ -+ const char *); -+ -+/* The LoongArch ABI specifies that the dynamic section has to be read-only. */ -+ -+#define DL_RO_DYN_SECTION 1 -+ -+#include_next -+ -+#endif -diff --git a/sysdeps/loongarch/libc-start.h b/sysdeps/loongarch/libc-start.h -new file mode 100644 -index 00000000..7bbc658f ---- /dev/null -+++ b/sysdeps/loongarch/libc-start.h -@@ -0,0 +1,25 @@ -+/* LoongArch definitions for libc main startup. -+ Copyright (C) 2023 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef SHARED -+# define ARCH_SETUP_IREL() apply_irel () -+# define ARCH_APPLY_IREL() -+# ifndef ARCH_SETUP_TLS -+# define ARCH_SETUP_TLS() __libc_setup_tls () -+# endif -+#endif /* !SHARED */ -diff --git a/sysdeps/loongarch/libc-tls.c b/sysdeps/loongarch/libc-tls.c -new file mode 100644 -index 00000000..0b0590d1 ---- /dev/null -+++ b/sysdeps/loongarch/libc-tls.c -@@ -0,0 +1,32 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+/* On LoongArch, linker optimizations are not required, so __tls_get_addr -+ can be called even in statically linked binaries. In this case module -+ must be always 1 and PT_TLS segment exist in the binary, otherwise it -+ would not link. */ -+ -+void * -+__tls_get_addr (tls_index *ti) -+{ -+ dtv_t *dtv = THREAD_DTV (); -+ return (char *) dtv[1].pointer.val + GET_ADDR_OFFSET; -+} -diff --git a/sysdeps/loongarch/linkmap.h b/sysdeps/loongarch/linkmap.h -new file mode 100644 -index 00000000..ac170bb3 ---- /dev/null -+++ b/sysdeps/loongarch/linkmap.h -@@ -0,0 +1,4 @@ -+struct link_map_machine -+ { -+ ElfW(Addr) plt; /* Address of .plt. 
*/ -+ }; -diff --git a/sysdeps/loongarch/lp64/Implies-after b/sysdeps/loongarch/lp64/Implies-after -new file mode 100644 -index 00000000..a8cae95f ---- /dev/null -+++ b/sysdeps/loongarch/lp64/Implies-after -@@ -0,0 +1 @@ -+wordsize-64 -diff --git a/sysdeps/loongarch/lp64/libm-test-ulps b/sysdeps/loongarch/lp64/libm-test-ulps -new file mode 100644 -index 00000000..61be2df6 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/libm-test-ulps -@@ -0,0 +1,2206 @@ -+# Begin of automatic generation -+ -+# Maximal error of functions: -+Function: "acos": -+float: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "acos_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "acos_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "acos_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "acosh": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "acosh_downward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "acosh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "acosh_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "asin": -+float: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "asin_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "asin_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "asin_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "asinh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "asinh_downward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "asinh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "asinh_upward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "atan": -+float: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "atan2": -+float: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "atan2_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "atan2_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "atan2_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "atan_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "atan_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "atan_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "atanh": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "atanh_downward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "atanh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "atanh_upward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "cabs": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cabs_downward": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: 
"cabs_towardzero": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cabs_upward": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "cacos": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "cacos": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "cacos_downward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "cacos_downward": -+double: 5 -+float: 3 -+idouble: 5 -+ifloat: 3 -+ildouble: 6 -+ldouble: 6 -+ -+Function: Real part of "cacos_towardzero": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "cacos_towardzero": -+double: 4 -+float: 2 -+idouble: 4 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Real part of "cacos_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "cacos_upward": -+double: 5 -+float: 5 -+idouble: 5 -+ifloat: 5 -+ildouble: 7 -+ldouble: 7 -+ -+Function: Real part of "cacosh": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "cacosh": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "cacosh_downward": -+double: 4 -+float: 2 -+idouble: 4 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Imaginary part of "cacosh_downward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Real part of "cacosh_towardzero": -+double: 4 -+float: 2 -+idouble: 4 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Imaginary part of "cacosh_towardzero": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "cacosh_upward": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 6 -+ldouble: 6 -+ -+Function: Imaginary part of "cacosh_upward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "carg": -+float: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "carg_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "carg_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "carg_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "casin": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "casin": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "casin_downward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "casin_downward": -+double: 5 -+float: 3 -+idouble: 5 -+ifloat: 3 -+ildouble: 6 -+ldouble: 6 -+ -+Function: Real part of "casin_towardzero": -+double: 3 -+float: 1 -+idouble: 3 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "casin_towardzero": -+double: 4 -+float: 2 -+idouble: 4 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Real part of "casin_upward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "casin_upward": -+double: 5 -+float: 5 -+idouble: 5 -+ifloat: 5 -+ildouble: 7 -+ldouble: 7 -+ -+Function: Real part of "casinh": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 
-+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "casinh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "casinh_downward": -+double: 5 -+float: 3 -+idouble: 5 -+ifloat: 3 -+ildouble: 6 -+ldouble: 6 -+ -+Function: Imaginary part of "casinh_downward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "casinh_towardzero": -+double: 4 -+float: 2 -+idouble: 4 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Imaginary part of "casinh_towardzero": -+double: 3 -+float: 1 -+idouble: 3 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "casinh_upward": -+double: 5 -+float: 5 -+idouble: 5 -+ifloat: 5 -+ildouble: 7 -+ldouble: 7 -+ -+Function: Imaginary part of "casinh_upward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "catan": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Imaginary part of "catan": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "catan_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "catan_downward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "catan_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "catan_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "catan_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "catan_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "catanh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Imaginary part of "catanh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "catanh_downward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "catanh_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "catanh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "catanh_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "catanh_upward": -+double: 4 -+float: 4 -+idouble: 4 -+ifloat: 4 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "catanh_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "cbrt": -+double: 3 -+float: 1 -+idouble: 3 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cbrt_downward": -+double: 4 -+float: 1 -+idouble: 4 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cbrt_towardzero": -+double: 3 -+float: 1 -+idouble: 3 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cbrt_upward": -+double: 5 -+float: 1 -+idouble: 5 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "ccos": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Imaginary part of "ccos": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "ccos_downward": -+double: 1 -+float: 1 
-+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "ccos_downward": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "ccos_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "ccos_towardzero": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "ccos_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "ccos_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "ccosh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Imaginary part of "ccosh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "ccosh_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "ccosh_downward": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "ccosh_towardzero": -+double: 1 -+float: 3 -+idouble: 1 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "ccosh_towardzero": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "ccosh_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "ccosh_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "cexp": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Imaginary part of "cexp": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "cexp_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "cexp_downward": -+double: 1 -+float: 3 -+idouble: 1 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "cexp_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "cexp_towardzero": -+double: 1 -+float: 3 -+idouble: 1 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "cexp_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "cexp_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "clog": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "clog": -+float: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "clog10": -+double: 3 -+float: 4 -+idouble: 3 -+ifloat: 4 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "clog10": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "clog10_downward": -+double: 5 -+float: 5 -+idouble: 5 -+ifloat: 5 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "clog10_downward": -+double: 2 -+float: 4 -+idouble: 2 -+ifloat: 4 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "clog10_towardzero": -+double: 5 -+float: 5 -+idouble: 5 -+ifloat: 5 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "clog10_towardzero": -+double: 2 -+float: 4 -+idouble: 2 -+ifloat: 4 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real 
part of "clog10_upward": -+double: 6 -+float: 5 -+idouble: 6 -+ifloat: 5 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "clog10_upward": -+double: 2 -+float: 4 -+idouble: 2 -+ifloat: 4 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "clog_downward": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "clog_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "clog_towardzero": -+double: 4 -+float: 4 -+idouble: 4 -+ifloat: 4 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "clog_towardzero": -+double: 1 -+float: 3 -+idouble: 1 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "clog_upward": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "clog_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "cos": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cos_downward": -+double: 1 -+idouble: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "cos_towardzero": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cos_upward": -+double: 1 -+idouble: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "cosh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cosh_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 2 -+ -+Function: "cosh_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 2 -+ -+Function: "cosh_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 1 -+ldouble: 3 -+ -+Function: Real part of "cpow": -+double: 2 -+float: 5 -+idouble: 2 -+ifloat: 5 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "cpow": -+float: 2 -+ifloat: 2 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "cpow_downward": -+double: 4 -+float: 8 -+idouble: 4 -+ifloat: 8 -+ildouble: 6 -+ldouble: 6 -+ -+Function: Imaginary part of "cpow_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "cpow_towardzero": -+double: 4 -+float: 8 -+idouble: 4 -+ifloat: 8 -+ildouble: 6 -+ldouble: 6 -+ -+Function: Imaginary part of "cpow_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "cpow_upward": -+double: 4 -+float: 1 -+idouble: 4 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "cpow_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "csin": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Imaginary part of "csin": -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "csin_downward": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "csin_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "csin_towardzero": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "csin_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "csin_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "csin_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 
-+ -+Function: Real part of "csinh": -+float: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Imaginary part of "csinh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "csinh_downward": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "csinh_downward": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "csinh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "csinh_towardzero": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "csinh_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "csinh_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "csqrt": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "csqrt": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "csqrt_downward": -+double: 5 -+float: 4 -+idouble: 5 -+ifloat: 4 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "csqrt_downward": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "csqrt_towardzero": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "csqrt_towardzero": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "csqrt_upward": -+double: 5 -+float: 4 -+idouble: 5 -+ifloat: 4 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "csqrt_upward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "ctan": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "ctan": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "ctan_downward": -+double: 6 -+float: 5 -+idouble: 6 -+ifloat: 5 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "ctan_downward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Real part of "ctan_towardzero": -+double: 5 -+float: 2 -+idouble: 5 -+ifloat: 2 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "ctan_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Real part of "ctan_upward": -+double: 2 -+float: 4 -+idouble: 2 -+ifloat: 4 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Imaginary part of "ctan_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Real part of "ctanh": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "ctanh": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "ctanh_downward": -+double: 4 -+float: 2 -+idouble: 4 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Imaginary part of "ctanh_downward": -+double: 6 -+float: 5 -+idouble: 6 -+ifloat: 5 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Real part of "ctanh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Imaginary part of "ctanh_towardzero": -+double: 5 -+float: 2 
-+idouble: 5 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "ctanh_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Imaginary part of "ctanh_upward": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "erf": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "erf_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "erf_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "erf_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "erfc": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "erfc_downward": -+double: 3 -+float: 4 -+idouble: 3 -+ifloat: 4 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "erfc_towardzero": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "erfc_upward": -+double: 3 -+float: 4 -+idouble: 3 -+ifloat: 4 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "exp": -+ildouble: 1 -+ldouble: 1 -+ -+Function: "exp10": -+double: 2 -+idouble: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "exp10_downward": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "exp10_towardzero": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "exp10_upward": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "exp2": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "exp2_downward": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "exp2_towardzero": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "exp2_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "exp_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ -+Function: "exp_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ -+Function: "exp_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ -+Function: "expm1": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "expm1_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "expm1_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "expm1_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "gamma": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "gamma_downward": -+double: 4 -+float: 4 -+idouble: 4 -+ifloat: 4 -+ildouble: 8 -+ldouble: 8 -+ -+Function: "gamma_towardzero": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "gamma_upward": -+double: 4 -+float: 5 -+idouble: 4 -+ifloat: 5 -+ildouble: 8 -+ldouble: 8 -+ -+Function: "hypot": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "hypot_downward": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "hypot_towardzero": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "hypot_upward": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "j0": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "j0_downward": -+double: 2 -+float: 4 -+idouble: 2 -+ifloat: 4 -+ildouble: 4 -+ldouble: 4 -+ 
-+Function: "j0_towardzero": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "j0_upward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "j1": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "j1_downward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "j1_towardzero": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "j1_upward": -+double: 3 -+float: 4 -+idouble: 3 -+ifloat: 4 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "jn": -+double: 4 -+float: 4 -+idouble: 4 -+ifloat: 4 -+ildouble: 7 -+ldouble: 7 -+ -+Function: "jn_downward": -+double: 4 -+float: 5 -+idouble: 4 -+ifloat: 5 -+ildouble: 8 -+ldouble: 8 -+ -+Function: "jn_towardzero": -+double: 4 -+float: 5 -+idouble: 4 -+ifloat: 5 -+ildouble: 8 -+ldouble: 8 -+ -+Function: "jn_upward": -+double: 5 -+float: 4 -+idouble: 5 -+ifloat: 4 -+ildouble: 7 -+ldouble: 7 -+ -+Function: "lgamma": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "lgamma_downward": -+double: 4 -+float: 4 -+idouble: 4 -+ifloat: 4 -+ildouble: 8 -+ldouble: 8 -+ -+Function: "lgamma_towardzero": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "lgamma_upward": -+double: 4 -+float: 5 -+idouble: 4 -+ifloat: 5 -+ildouble: 8 -+ldouble: 8 -+ -+Function: "log": -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log10": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log10_downward": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log10_towardzero": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log10_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log1p": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "log1p_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "log1p_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "log1p_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "log2": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "log2_downward": -+double: 3 -+idouble: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "log2_towardzero": -+double: 2 -+idouble: 2 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log2_upward": -+double: 3 -+idouble: 3 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log_downward": -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log_towardzero": -+ildouble: 2 -+ldouble: 2 -+ -+Function: "log_upward": -+double: 1 -+idouble: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "pow": -+double: 1 -+idouble: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "pow_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "pow_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "pow_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "sin": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "sin_downward": -+double: 1 -+idouble: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "sin_towardzero": -+double: 1 -+idouble: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: 
"sin_upward": -+double: 1 -+idouble: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "sincos": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "sincos_downward": -+double: 1 -+idouble: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "sincos_towardzero": -+double: 1 -+idouble: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "sincos_upward": -+double: 1 -+idouble: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "sinh": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "sinh_downward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "sinh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "sinh_upward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "tan": -+float: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "tan_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "tan_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "tan_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "tanh": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "tanh_downward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "tanh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "tanh_upward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "tgamma": -+double: 5 -+float: 4 -+idouble: 5 -+ifloat: 4 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "tgamma_downward": -+double: 5 -+float: 5 -+idouble: 5 -+ifloat: 5 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "tgamma_towardzero": -+double: 5 -+float: 4 -+idouble: 5 -+ifloat: 4 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "tgamma_upward": -+double: 4 -+float: 4 -+idouble: 4 -+ifloat: 4 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "y0": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "y0_downward": -+double: 3 -+float: 4 -+idouble: 3 -+ifloat: 4 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "y0_towardzero": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "y0_upward": -+double: 2 -+float: 5 -+idouble: 2 -+ifloat: 5 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "y1": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "y1_downward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "y1_towardzero": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "y1_upward": -+double: 5 -+float: 2 -+idouble: 5 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "yn": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "yn_downward": -+double: 3 -+float: 4 -+idouble: 3 -+ifloat: 4 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "yn_towardzero": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "yn_upward": -+double: 4 -+float: 5 -+idouble: 4 -+ifloat: 5 -+ildouble: 5 -+ldouble: 5 -+ -+# end of automatic generation -diff --git a/sysdeps/loongarch/lp64/libm-test-ulps-name b/sysdeps/loongarch/lp64/libm-test-ulps-name -new file mode 100644 -index 00000000..ce02281e ---- /dev/null -+++ b/sysdeps/loongarch/lp64/libm-test-ulps-name -@@ -0,0 +1 @@ 
-+LoongArch 64-bit -diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S -new file mode 100644 -index 00000000..ec34b1af ---- /dev/null -+++ b/sysdeps/loongarch/lp64/memchr.S -@@ -0,0 +1,99 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef MEMCHR_NAME -+#define MEMCHR_NAME memchr -+#endif -+ -+#ifdef ANDROID_CHANGES -+LEAF(MEMCHR_NAME, 0) -+#else -+LEAF(MEMCHR_NAME) -+#endif -+ .align 6 -+ beqz a2, L(out) -+ andi t1, a0, 0x7 -+ lu12i.w a3, 0x01010 -+ sub.d a5, a0, t1 -+ -+ bstrins.d a1, a1, 15, 8 -+ ld.d t0, a5, 0 -+ slli.d t2, t1, 3 -+ ori a3, a3, 0x101 -+ -+ bstrins.d a1, a1, 31, 16 -+ li.w t7, -1 -+ li.w t8, 9 -+ bstrins.d a3, a3, 63, 32 -+ -+ srl.d t3, t7, t2 -+ bstrins.d a1, a1, 63, 32 -+ sub.d t4, t8, t1 -+ orn t3, a1, t3 -+ -+ srl.d t0, t0, t2 -+ slli.d a4, a3, 7 # 0x8080808080808080 -+ sltu t4, a2, t4 -+ xor t2, t0, t3 -+ -+ sub.d a6, t2, a3 -+ andn a7, a4, t2 -+ and t2, a6, a7 -+ or t3, t2, t4 -+ -+ bnez t3, L(count_pos) -+ addi.d a2, a2, -8 -+ addi.d a0, a5, 8 -+ add.d a2, a2, t1 -+ -+L(loop): -+ ld.d t0, a0, 0 -+ sltui t4, a2, 9 -+ xor t2, t0, a1 -+ sub.d a6, t2, a3 -+ -+ andn a7, a4, t2 -+ and t2, a6, a7 -+ or t3, t2, t4 -+ bnez t3, L(count_pos) -+ -+ ld.d t1, a0, 8 -+ addi.d a0, a0, 16 -+ sltui t4, a2, 17 -+ xor t2, t1, a1 -+ -+ sub.d a6, t2, a3 -+ andn a7, a4, t2 -+ and t2, a6, a7 -+ addi.d a2, a2, -16 -+ -+ or t3, t2, t4 -+ beqz t3, L(loop) -+ addi.d a0, a0, -8 -+ addi.d a2, a2, 8 -+ -+L(count_pos): -+ ctz.d t0, t2 -+ srli.d t0, t0, 3 -+ sltu t1, t0, a2 -+ add.d a0, a0, t0 -+ -+ maskeqz a0, a0, t1 -+ jr ra -+ -+L(out): -+ move a0, zero -+ jr ra -+END(MEMCHR_NAME) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCHR_NAME) -+#endif -+#endif -diff --git a/sysdeps/loongarch/lp64/memcmp.S b/sysdeps/loongarch/lp64/memcmp.S -new file mode 100644 -index 00000000..9e57a924 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/memcmp.S -@@ -0,0 +1,281 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef MEMCMP_NAME -+#define MEMCMP_NAME memcmp -+#endif -+ -+LEAF(MEMCMP_NAME) -+ .align 6 -+ beqz a2, L(ret) -+ andi a4, a1, 0x7 -+ andi a3, a0, 0x7 -+ sltu a5, a4, a3 -+ -+ xor t0, a0, a1 -+ li.w t8, 8 -+ maskeqz t0, t0, a5 -+ li.w t7, -1 -+ -+ xor a0, a0, t0 // a0 hold smaller one -+ xor a1, a1, t0 // a1 hold larger one -+ andi a3, a0, 0x7 // a3 hold small offset -+ andi a4, a1, 0x7 // a4 hold larger offset -+ -+ xor a0, a0, a3 -+ xor a1, a1, a4 -+ ld.d t2, a0, 0 // t2 = "fedcbaXX" -+ ld.d t1, a1, 0 // t1 = "54321YYY" -+ -+ slli.d t3, a3, 3 -+ slli.d t4, a4, 3 -+ sub.d a6, t3, t4 // a6 = 0xfffffffffffffff8 -+ srl.d t1, t1, t4 // t1 = "00054321" -+ -+ srl.d t0, t2, t3 // t0 = "00fedcba" -+ srl.d t5, t7, t4 // t5 = 0x000000FFFFFFFFFF -+ sub.d t6, t0, t1 // t6 hold diff -+ and t6, t6, t5 // t6 = "000xxxxx" -+ -+ sub.d t5, t8, a4 // t5 hold margin 8 - 3 = 5 -+ bnez t6, L(first_out) -+ bgeu t5, a2, L(ret) -+ sub.d a2, a2, t5 -+ -+ bnez a6, L(unaligned) -+ blt a2, t8, L(al_less_8bytes) -+ andi t1, a2, 31 -+ beq t1, a2, L(al_less_32bytes) -+ -+ sub.d t2, a2, t1 -+ add.d a4, a0, t2 -+ move a2, t1 -+ -+L(al_loop): -+ ld.d t0, a0, 8 -+ -+ ld.d t1, a1, 8 -+ ld.d t2, a0, 16 -+ ld.d t3, a1, 16 -+ ld.d t4, a0, 24 -+ -+ ld.d t5, a1, 24 -+ ld.d t6, a0, 32 -+ ld.d t7, a1, 32 -+ addi.d a0, a0, 32 -+ -+ addi.d a1, a1, 32 -+ bne t0, t1, L(out1) -+ bne t2, t3, L(out2) -+ bne t4, t5, L(out3) -+ -+ bne t6, t7, L(out4) -+ bne a0, a4, L(al_loop) -+ -+L(al_less_32bytes): 
-+ srai.d a4, a2, 4 -+ beqz a4, L(al_less_16bytes) -+ -+ ld.d t0, a0, 8 -+ ld.d t1, a1, 8 -+ ld.d t2, a0, 16 -+ ld.d t3, a1, 16 -+ -+ addi.d a0, a0, 16 -+ addi.d a1, a1, 16 -+ addi.d a2, a2, -16 -+ bne t0, t1, L(out1) -+ -+ bne t2, t3, L(out2) -+ -+L(al_less_16bytes): -+ srai.d a4, a2, 3 -+ beqz a4, L(al_less_8bytes) -+ ld.d t0, a0, 8 -+ -+ ld.d t1, a1, 8 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ bne t0, t1, L(out1) -+ -+L(al_less_8bytes): -+ beqz a2, L(ret) -+ ld.d t0, a0, 8 -+ ld.d t1, a1, 8 -+ -+ li.d t7, -1 -+ slli.d t2, a2, 3 -+ sll.d t2, t7, t2 -+ sub.d t3, t0, t1 -+ -+ andn t6, t3, t2 -+ bnez t6, L(count_diff) -+ -+L(ret): -+ move a0, zero -+ jr ra -+ -+L(out4): -+ move t0, t6 -+ move t1, t7 -+ sub.d t6, t6, t7 -+ b L(count_diff) -+ -+L(out3): -+ move t0, t4 -+ move t1, t5 -+ sub.d t6, t4, t5 -+ b L(count_diff) -+ -+L(out2): -+ move t0, t2 -+ move t1, t3 -+L(out1): -+ sub.d t6, t0, t1 -+ b L(count_diff) -+ -+L(first_out): -+ slli.d t4, a2, 3 -+ slt t3, a2, t5 -+ sll.d t4, t7, t4 -+ maskeqz t4, t4, t3 -+ -+ andn t6, t6, t4 -+ -+L(count_diff): -+ ctz.d t2, t6 -+ bstrins.d t2, zero, 2, 0 -+ srl.d t0, t0, t2 -+ -+ srl.d t1, t1, t2 -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ sub.d t2, t0, t1 -+ -+ sub.d t3, t1, t0 -+ masknez t2, t2, a5 -+ maskeqz t3, t3, a5 -+ or a0, t2, t3 -+ -+ jr ra -+ -+L(unaligned): -+ sub.d a7, zero, a6 -+ srl.d t0, t2, a6 -+ blt a2, t8, L(un_less_8bytes) -+ -+ andi t1, a2, 31 -+ beq t1, a2, L(un_less_32bytes) -+ sub.d t2, a2, t1 -+ add.d a4, a0, t2 -+ -+ move a2, t1 -+ -+L(un_loop): -+ ld.d t2, a0, 8 -+ ld.d t1, a1, 8 -+ ld.d t4, a0, 16 -+ -+ ld.d t3, a1, 16 -+ ld.d t6, a0, 24 -+ ld.d t5, a1, 24 -+ ld.d t8, a0, 32 -+ -+ ld.d t7, a1, 32 -+ addi.d a0, a0, 32 -+ addi.d a1, a1, 32 -+ sll.d a3, t2, a7 -+ -+ or t0, a3, t0 -+ bne t0, t1, L(out1) -+ srl.d t0, t2, a6 -+ sll.d a3, t4, a7 -+ -+ or t2, a3, t0 -+ bne t2, t3, L(out2) -+ srl.d t0, t4, a6 -+ sll.d a3, t6, a7 -+ -+ or t4, a3, t0 -+ bne t4, t5, L(out3) -+ srl.d t0, t6, a6 -+ sll.d a3, t8, a7 -+ -+ or t6, t0, a3 -+ bne t6, t7, L(out4) -+ srl.d t0, t8, a6 -+ bne a0, a4, L(un_loop) -+ -+L(un_less_32bytes): -+ srai.d a4, a2, 4 -+ beqz a4, L(un_less_16bytes) -+ ld.d t2, a0, 8 -+ ld.d t1, a1, 8 -+ -+ ld.d t4, a0, 16 -+ ld.d t3, a1, 16 -+ addi.d a0, a0, 16 -+ addi.d a1, a1, 16 -+ -+ addi.d a2, a2, -16 -+ sll.d a3, t2, a7 -+ or t0, a3, t0 -+ bne t0, t1, L(out1) -+ -+ srl.d t0, t2, a6 -+ sll.d a3, t4, a7 -+ or t2, a3, t0 -+ bne t2, t3, L(out2) -+ -+ srl.d t0, t4, a6 -+ -+L(un_less_16bytes): -+ srai.d a4, a2, 3 -+ beqz a4, L(un_less_8bytes) -+ ld.d t2, a0, 8 -+ -+ ld.d t1, a1, 8 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ sll.d a3, t2, a7 -+ or t0, a3, t0 -+ bne t0, t1, L(out1) -+ srl.d t0, t2, a6 -+ -+L(un_less_8bytes): -+ beqz a2, L(ret) -+ andi a7, a7, 63 -+ slli.d a4, a2, 3 -+ bgeu a7, a4, L(last_cmp) -+ -+ ld.d t2, a0, 8 -+ sll.d a3, t2, a7 -+ or t0, a3, t0 -+ -+L(last_cmp): -+ ld.d t1, a1, 8 -+ -+ li.d t7, -1 -+ sll.d t2, t7, a4 -+ sub.d t3, t0, t1 -+ andn t6, t3, t2 -+ -+ bnez t6, L(count_diff) -+ move a0, zero -+ jr ra -+ -+END(MEMCMP_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCMP_NAME) -+#endif -diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S -new file mode 100644 -index 00000000..1076e678 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/memcpy.S -@@ -0,0 +1,818 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef MEMCPY_NAME -+#define MEMCPY_NAME memcpy -+#endif -+ -+#ifndef 
MEMMOVE_NAME -+#define MEMMOVE_NAME memmove -+#endif -+ -+#define LD_64(reg, n) \ -+ ld.d t0, reg, n; \ -+ ld.d t1, reg, n+8; \ -+ ld.d t2, reg, n+16; \ -+ ld.d t3, reg, n+24; \ -+ ld.d t4, reg, n+32; \ -+ ld.d t5, reg, n+40; \ -+ ld.d t6, reg, n+48; \ -+ ld.d t7, reg, n+56; -+ -+#define ST_64(reg, n) \ -+ st.d t0, reg, n; \ -+ st.d t1, reg, n+8; \ -+ st.d t2, reg, n+16; \ -+ st.d t3, reg, n+24; \ -+ st.d t4, reg, n+32; \ -+ st.d t5, reg, n+40; \ -+ st.d t6, reg, n+48; \ -+ st.d t7, reg, n+56; -+ -+#ifdef ANDROID_CHANGES -+LEAF(MEMMOVE_NAME, 0) -+#else -+LEAF(MEMMOVE_NAME) -+#endif -+ -+ .align 6 -+ sub.d t0, a0, a1 -+ bltu t0, a2, L(copy_back) -+ -+END(MEMMOVE_NAME) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMMOVE_NAME) -+#endif -+#endif -+ -+#ifdef ANDROID_CHANGES -+LEAF(MEMCPY_NAME, 0) -+#else -+LEAF(MEMCPY_NAME) -+#endif -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(short_data) # less than 16 bytes -+ -+ move a4, a0 -+ andi a5, a0, 0x7 -+ andi a6, a1, 0x7 -+ li.d t8, 8 -+ beqz a5, L(check_align) -+ -+ # make dest aligned 8 bytes -+ sub.d t2, t8, a5 -+ sub.d a2, a2, t2 -+ -+ pcaddi t1, 20 -+ slli.d t3, t2, 3 -+ add.d a1, a1, t2 -+ sub.d t1, t1, t3 -+ add.d a4, a4, t2 -+ jr t1 -+ -+L(al7): -+ ld.b t0, a1, -7 -+ st.b t0, a4, -7 -+L(al6): -+ ld.b t0, a1, -6 -+ st.b t0, a4, -6 -+L(al5): -+ ld.b t0, a1, -5 -+ st.b t0, a4, -5 -+L(al4): -+ ld.b t0, a1, -4 -+ st.b t0, a4, -4 -+L(al3): -+ ld.b t0, a1, -3 -+ st.b t0, a4, -3 -+L(al2): -+ ld.b t0, a1, -2 -+ st.b t0, a4, -2 -+L(al1): -+ ld.b t0, a1, -1 -+ st.b t0, a4, -1 -+ -+L(check_align): -+ bne a5, a6, L(unalign) -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(al_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(al_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ add.d a5, a1, t0 -+ -+L(loop_64bytes): -+ LD_64(a1, 0) -+ addi.d a1, a1, 64 -+ ST_64(a4, 0) -+ -+ addi.d a4, a4, 64 -+ bne a1, a5, L(loop_64bytes) -+ -+L(al_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(al_less_32bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ -+ addi.d a1, a1, 32 -+ addi.d a2, a2, -32 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ st.d t2, a4, 16 -+ st.d t3, a4, 24 -+ -+ addi.d a4, a4, 32 -+ -+L(al_less_32bytes): -+ srai.d a3, a2, 4 -+ beqz a3, L(al_less_16bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ addi.d a1, a1, 16 -+ addi.d a2, a2, -16 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ addi.d a4, a4, 16 -+ -+L(al_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(al_less_8bytes) -+ -+ ld.d t0, a1, 0 -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ st.d t0, a4, 0 -+ addi.d a4, a4, 8 -+ -+L(al_less_8bytes): -+ srai.d a3, a2, 2 -+ beqz a3, L(al_less_4bytes) -+ -+ ld.w t0, a1, 0 -+ addi.d a1, a1, 4 -+ addi.d a2, a2, -4 -+ -+ st.w t0, a4, 0 -+ addi.d a4, a4, 4 -+ -+L(al_less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(al_less_2bytes) -+ -+ ld.h t0, a1, 0 -+ addi.d a1, a1, 2 -+ addi.d a2, a2, -2 -+ -+ st.h t0, a4, 0 -+ addi.d a4, a4, 2 -+ -+L(al_less_2bytes): -+ beqz a2, L(al_less_1byte) -+ -+ ld.b t0, a1, 0 -+ st.b t0, a4, 0 -+ -+L(al_less_1byte): -+ jr ra -+ -+L(unalign): -+ andi a5, a1, 0x7 -+ bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned -+ -+ sub.d t8, t8, a5 # use t8 to save count of bytes for aligning -+ slli.d a5, a5, 3 -+ -+ ld.d t0, a1, 0 -+ addi.d a1, a1, 8 -+ -+ slli.d a6, t8, 3 -+ srl.d a7, t0, a5 -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(un_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(un_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ add.d a3, a1, t0 -+ -+# a5 shift right num -+# a6 shift 
left num -+# a7 remaining part -+L(un_long_bytes): -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ -+ srl.d t4, t0, a5 -+ sll.d t0, t0, a6 -+ -+ srl.d t5, t1, a5 -+ sll.d t1, t1, a6 -+ -+ srl.d t6, t2, a5 -+ sll.d t2, t2, a6 -+ -+ srl.d t7, t3, a5 -+ sll.d t3, t3, a6 -+ -+ or t0, a7, t0 -+ or t1, t4, t1 -+ or t2, t5, t2 -+ or t3, t6, t3 -+ -+ ld.d t4, a1, 32 -+ ld.d t5, a1, 40 -+ ld.d t6, a1, 48 -+ ld.d a7, a1, 56 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ st.d t2, a4, 16 -+ st.d t3, a4, 24 -+ -+ addi.d a1, a1, 64 -+ -+ srl.d t0, t4, a5 -+ sll.d t4, t4, a6 -+ -+ srl.d t1, t5, a5 -+ sll.d t5, t5, a6 -+ -+ srl.d t2, t6, a5 -+ sll.d t6, t6, a6 -+ -+ sll.d t3, a7, a6 -+ srl.d a7, a7, a5 -+ -+ or t4, t7, t4 -+ or t5, t0, t5 -+ or t6, t1, t6 -+ or t3, t2, t3 -+ -+ st.d t4, a4, 32 -+ st.d t5, a4, 40 -+ st.d t6, a4, 48 -+ st.d t3, a4, 56 -+ -+ addi.d a4, a4, 64 -+ bne a3, a1, L(un_long_bytes) -+ -+L(un_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(un_less_32bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ -+ addi.d a1, a1, 32 -+ addi.d a2, a2, -32 -+ -+ srl.d t4, t0, a5 -+ sll.d t0, t0, a6 -+ -+ srl.d t5, t1, a5 -+ sll.d t1, t1, a6 -+ -+ srl.d t6, t2, a5 -+ sll.d t2, t2, a6 -+ -+ or t0, a7, t0 -+ -+ srl.d a7, t3, a5 -+ sll.d t3, t3, a6 -+ -+ or t1, t4, t1 -+ or t2, t5, t2 -+ or t3, t6, t3 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ st.d t2, a4, 16 -+ st.d t3, a4, 24 -+ -+ addi.d a4, a4, 32 -+ -+L(un_less_32bytes): -+ srai.d a3, a2, 4 -+ beqz a3, L(un_less_16bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ -+ addi.d a1, a1, 16 -+ addi.d a2, a2, -16 -+ -+ srl.d t2, t0, a5 -+ sll.d t3, t0, a6 -+ -+ sll.d t4, t1, a6 -+ or t3, a7, t3 -+ or t4, t2, t4 -+ srl.d a7, t1, a5 -+ -+ st.d t3, a4, 0 -+ st.d t4, a4, 8 -+ -+ addi.d a4, a4, 16 -+ -+L(un_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(un_less_8bytes) -+ -+ ld.d t0, a1, 0 -+ -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ sll.d t1, t0, a6 -+ or t2, a7, t1 -+ srl.d a7, t0, a5 -+ -+ st.d t2, a4, 0 -+ addi.d a4, a4, 8 -+ -+L(un_less_8bytes): -+ beqz a2, L(un_less_1byte) -+ bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 -+ -+ # combine data in memory and a7(remaining part) -+ ld.d t0, a1, 0 -+ sll.d t0, t0, a6 -+ or a7, a7, t0 -+ -+1: -+ srai.d a3, a2, 2 -+ beqz a3, L(un_less_4bytes) -+ -+ addi.d a2, a2, -4 -+ st.w a7, a4, 0 -+ addi.d a4, a4, 4 -+ srai.d a7, a7, 32 -+ -+L(un_less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(un_less_2bytes) -+ -+ addi.d a2, a2, -2 -+ st.h a7, a4, 0 -+ addi.d a4, a4, 2 -+ srai.d a7, a7, 16 -+ -+L(un_less_2bytes): -+ beqz a2, L(un_less_1byte) -+ st.b a7, a4, 0 -+ -+L(un_less_1byte): -+ jr ra -+ -+# Bytes copying for data less than 16 bytes -+L(short_data): -+ pcaddi t1, 36 -+ slli.d t2, a2, 3 -+ add.d a4, a0, a2 -+ sub.d t1, t1, t2 -+ add.d a1, a1, a2 -+ jr t1 -+ -+L(short_15_bytes): -+ ld.b t0, a1, -15 -+ st.b t0, a4, -15 -+L(short_14_bytes): -+ ld.b t0, a1, -14 -+ st.b t0, a4, -14 -+L(short_13_bytes): -+ ld.b t0, a1, -13 -+ st.b t0, a4, -13 -+L(short_12_bytes): -+ ld.b t0, a1, -12 -+ st.b t0, a4, -12 -+L(short_11_bytes): -+ ld.b t0, a1, -11 -+ st.b t0, a4, -11 -+L(short_10_bytes): -+ ld.b t0, a1, -10 -+ st.b t0, a4, -10 -+L(short_9_bytes): -+ ld.b t0, a1, -9 -+ st.b t0, a4, -9 -+L(short_8_bytes): -+ ld.b t0, a1, -8 -+ st.b t0, a4, -8 -+L(short_7_bytes): -+ ld.b t0, a1, -7 -+ st.b t0, a4, -7 -+L(short_6_bytes): -+ ld.b t0, a1, -6 -+ st.b t0, a4, -6 -+L(short_5_bytes): -+ ld.b t0, a1, -5 -+ st.b t0, a4, -5 -+L(short_4_bytes): -+ 
ld.b t0, a1, -4 -+ st.b t0, a4, -4 -+L(short_3_bytes): -+ ld.b t0, a1, -3 -+ st.b t0, a4, -3 -+L(short_2_bytes): -+ ld.b t0, a1, -2 -+ st.b t0, a4, -2 -+L(short_1_bytes): -+ ld.b t0, a1, -1 -+ st.b t0, a4, -1 -+ jr ra -+ -+L(copy_back): -+ srai.d a3, a2, 4 -+ beqz a3, L(back_short_data) # less than 16 bytes -+ -+ add.d a4, a0, a2 # store the tail of dest -+ add.d a1, a1, a2 # store the tail of src -+ -+ andi a5, a4, 0x7 -+ andi a6, a1, 0x7 -+ beqz a5, L(back_check_align) -+ -+ # make dest aligned 8 bytes -+ sub.d a2, a2, a5 -+ sub.d a1, a1, a5 -+ sub.d a4, a4, a5 -+ -+ pcaddi t1, 18 -+ slli.d t3, a5, 3 -+ sub.d t1, t1, t3 -+ jr t1 -+ -+ ld.b t0, a1, 6 -+ st.b t0, a4, 6 -+ ld.b t0, a1, 5 -+ st.b t0, a4, 5 -+ ld.b t0, a1, 4 -+ st.b t0, a4, 4 -+ ld.b t0, a1, 3 -+ st.b t0, a4, 3 -+ ld.b t0, a1, 2 -+ st.b t0, a4, 2 -+ ld.b t0, a1, 1 -+ st.b t0, a4, 1 -+ ld.b t0, a1, 0 -+ st.b t0, a4, 0 -+ -+L(back_check_align): -+ bne a5, a6, L(back_unalign) -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(back_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(back_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ sub.d a5, a1, t0 -+ -+L(back_loop_64bytes): -+ LD_64(a1, -64) -+ addi.d a1, a1, -64 -+ ST_64(a4, -64) -+ -+ addi.d a4, a4, -64 -+ bne a1, a5, L(back_loop_64bytes) -+ -+L(back_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(back_less_32bytes) -+ -+ ld.d t0, a1, -32 -+ ld.d t1, a1, -24 -+ ld.d t2, a1, -16 -+ ld.d t3, a1, -8 -+ -+ addi.d a1, a1, -32 -+ addi.d a2, a2, -32 -+ -+ st.d t0, a4, -32 -+ st.d t1, a4, -24 -+ st.d t2, a4, -16 -+ st.d t3, a4, -8 -+ -+ addi.d a4, a4, -32 -+ -+L(back_less_32bytes): -+ srai.d a3, a2, 4 -+ beqz a3, L(back_less_16bytes) -+ -+ ld.d t0, a1, -16 -+ ld.d t1, a1, -8 -+ -+ addi.d a2, a2, -16 -+ addi.d a1, a1, -16 -+ -+ st.d t0, a4, -16 -+ st.d t1, a4, -8 -+ addi.d a4, a4, -16 -+ -+L(back_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(back_less_8bytes) -+ -+ ld.d t0, a1, -8 -+ addi.d a2, a2, -8 -+ addi.d a1, a1, -8 -+ -+ st.d t0, a4, -8 -+ addi.d a4, a4, -8 -+ -+L(back_less_8bytes): -+ srai.d a3, a2, 2 -+ beqz a3, L(back_less_4bytes) -+ -+ ld.w t0, a1, -4 -+ addi.d a2, a2, -4 -+ addi.d a1, a1, -4 -+ -+ st.w t0, a4, -4 -+ addi.d a4, a4, -4 -+ -+L(back_less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(back_less_2bytes) -+ -+ ld.h t0, a1, -2 -+ addi.d a2, a2, -2 -+ addi.d a1, a1, -2 -+ -+ st.h t0, a4, -2 -+ addi.d a4, a4, -2 -+ -+L(back_less_2bytes): -+ beqz a2, L(back_less_1byte) -+ -+ ld.b t0, a1, -1 -+ st.b t0, a4, -1 -+ -+L(back_less_1byte): -+ jr ra -+ -+L(back_unalign): -+ andi t8, a1, 0x7 -+ bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned -+ -+ sub.d a6, zero, t8 -+ -+ ld.d t0, a1, 0 -+ slli.d a6, a6, 3 -+ slli.d a5, t8, 3 -+ sll.d a7, t0, a6 -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(back_un_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(back_un_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ sub.d a3, a1, t0 -+ -+L(back_un_long_bytes): -+ ld.d t0, a1, -8 -+ ld.d t1, a1, -16 -+ ld.d t2, a1, -24 -+ ld.d t3, a1, -32 -+ -+ sll.d t4, t0, a6 -+ srl.d t0, t0, a5 -+ -+ sll.d t5, t1, a6 -+ srl.d t1, t1, a5 -+ -+ sll.d t6, t2, a6 -+ srl.d t2, t2, a5 -+ -+ sll.d t7, t3, a6 -+ srl.d t3, t3, a5 -+ -+ or t0, t0, a7 -+ or t1, t1, t4 -+ or t2, t2, t5 -+ or t3, t3, t6 -+ -+ ld.d t4, a1, -40 -+ ld.d t5, a1, -48 -+ ld.d t6, a1, -56 -+ ld.d a7, a1, -64 -+ st.d t0, a4, -8 -+ st.d t1, a4, -16 -+ st.d t2, a4, -24 -+ st.d t3, a4, -32 -+ -+ addi.d a1, a1, -64 -+ -+ sll.d t0, t4, a6 -+ srl.d t4, t4, a5 -+ -+ sll.d t1, t5, a6 -+ srl.d t5, t5, a5 -+ -+ sll.d t2, t6, a6 -+ srl.d t6, t6, 
a5 -+ -+ srl.d t3, a7, a5 -+ sll.d a7, a7, a6 -+ -+ or t4, t7, t4 -+ or t5, t0, t5 -+ or t6, t1, t6 -+ or t3, t2, t3 -+ -+ st.d t4, a4, -40 -+ st.d t5, a4, -48 -+ st.d t6, a4, -56 -+ st.d t3, a4, -64 -+ -+ addi.d a4, a4, -64 -+ bne a3, a1, L(back_un_long_bytes) -+ -+L(back_un_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(back_un_less_32bytes) -+ -+ ld.d t0, a1, -8 -+ ld.d t1, a1, -16 -+ ld.d t2, a1, -24 -+ ld.d t3, a1, -32 -+ -+ addi.d a1, a1, -32 -+ addi.d a2, a2, -32 -+ -+ sll.d t4, t0, a6 -+ srl.d t0, t0, a5 -+ -+ sll.d t5, t1, a6 -+ srl.d t1, t1, a5 -+ -+ sll.d t6, t2, a6 -+ srl.d t2, t2, a5 -+ -+ or t0, a7, t0 -+ -+ sll.d a7, t3, a6 -+ srl.d t3, t3, a5 -+ -+ or t1, t4, t1 -+ or t2, t5, t2 -+ or t3, t6, t3 -+ -+ st.d t0, a4, -8 -+ st.d t1, a4, -16 -+ st.d t2, a4, -24 -+ st.d t3, a4, -32 -+ -+ addi.d a4, a4, -32 -+ -+L(back_un_less_32bytes): -+ srai.d a3, a2, 4 -+ beqz a3, L(back_un_less_16bytes) -+ -+ ld.d t0, a1, -8 -+ ld.d t1, a1, -16 -+ -+ addi.d a1, a1, -16 -+ addi.d a2, a2, -16 -+ -+ sll.d t2, t0, a6 -+ srl.d t3, t0, a5 -+ -+ srl.d t4, t1, a5 -+ or t3, a7, t3 -+ or t4, t2, t4 -+ sll.d a7, t1, a6 -+ -+ st.d t3, a4, -8 -+ st.d t4, a4, -16 -+ -+ addi.d a4, a4, -16 -+ -+L(back_un_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(back_un_less_8bytes) -+ -+ ld.d t0, a1, -8 -+ -+ addi.d a1, a1, -8 -+ addi.d a2, a2, -8 -+ -+ srl.d t1, t0, a5 -+ or t2, a7, t1 -+ sll.d a7, t0, a6 -+ -+ st.d t2, a4, -8 -+ addi.d a4, a4, -8 -+ -+L(back_un_less_8bytes): -+ beqz a2, L(back_end) -+ bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 -+ -+ # combine data in memory and a7(remaining part) -+ ld.d t0, a1, -8 -+ srl.d t0, t0, a5 -+ or a7, a7, t0 -+ -+1: -+ srai.d a3, a2, 2 -+ beqz a3, L(back_un_less_4bytes) -+ -+ srai.d t0, a7, 32 -+ addi.d a2, a2, -4 -+ st.w t0, a4, -4 -+ addi.d a4, a4, -4 -+ slli.d a7, a7, 32 -+ -+L(back_un_less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(back_un_less_2bytes) -+ srai.d t0, a7, 48 -+ addi.d a2, a2, -2 -+ st.h t0, a4, -2 -+ addi.d a4, a4, -2 -+ slli.d a7, a7, 16 -+L(back_un_less_2bytes): -+ beqz a2, L(back_un_less_1byte) -+ srai.d t0, a7, 56 -+ st.b t0, a4, -1 -+L(back_un_less_1byte): -+ jr ra -+ -+L(back_short_data): -+ pcaddi t1, 34 -+ slli.d t2, a2, 3 -+ sub.d t1, t1, t2 -+ jr t1 -+ -+ ld.b t0, a1, 14 -+ st.b t0, a0, 14 -+ ld.b t0, a1, 13 -+ st.b t0, a0, 13 -+ ld.b t0, a1, 12 -+ st.b t0, a0, 12 -+ ld.b t0, a1, 11 -+ st.b t0, a0, 11 -+ ld.b t0, a1, 10 -+ st.b t0, a0, 10 -+ ld.b t0, a1, 9 -+ st.b t0, a0, 9 -+ ld.b t0, a1, 8 -+ st.b t0, a0, 8 -+ ld.b t0, a1, 7 -+ st.b t0, a0, 7 -+ ld.b t0, a1, 6 -+ st.b t0, a0, 6 -+ ld.b t0, a1, 5 -+ st.b t0, a0, 5 -+ ld.b t0, a1, 4 -+ st.b t0, a0, 4 -+ ld.b t0, a1, 3 -+ st.b t0, a0, 3 -+ ld.b t0, a1, 2 -+ st.b t0, a0, 2 -+ ld.b t0, a1, 1 -+ st.b t0, a0, 1 -+ ld.b t0, a1, 0 -+ st.b t0, a0, 0 -+L(back_end): -+ jr ra -+ -+END(MEMCPY_NAME) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCPY_NAME) -+#endif -+#endif -diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S -new file mode 100644 -index 00000000..6d1922c4 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/memmove.S -@@ -0,0 +1,2 @@ -+/* DONT DELETE THIS FILE, OTHERWIES MEMCPY.C WILL BE COMPILED. */ -+/* There are too many common code in memcpy and memmove. 
See memcpy.S */ -diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S -new file mode 100644 -index 00000000..9fe42b24 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/memset.S -@@ -0,0 +1,173 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef MEMSET_NAME -+#define MEMSET_NAME memset -+#endif -+ -+#define ST_64(n) \ -+ st.d a1, a0, n; \ -+ st.d a1, a0, n+8; \ -+ st.d a1, a0, n+16; \ -+ st.d a1, a0, n+24; \ -+ st.d a1, a0, n+32; \ -+ st.d a1, a0, n+40; \ -+ st.d a1, a0, n+48; \ -+ st.d a1, a0, n+56; -+ -+#ifdef ANDROID_CHANGES -+LEAF(MEMSET_NAME, 0) -+#else -+LEAF(MEMSET_NAME) -+#endif -+ .align 6 -+ move t0, a0 -+ andi a3, a0, 0x7 -+ li.w t6, 16 -+ beqz a3, L(align) -+ blt a2, t6, L(short_data) -+ -+L(make_align): -+ li.w t8, 8 -+ sub.d t2, t8, a3 -+ pcaddi t1, 11 -+ slli.d t3, t2, 2 -+ sub.d t1, t1, t3 -+ jirl zero, t1, 0 -+ -+L(al7): -+ st.b a1, t0, 6 -+L(al6): -+ st.b a1, t0, 5 -+L(al5): -+ st.b a1, t0, 4 -+L(al4): -+ st.b a1, t0, 3 -+L(al3): -+ st.b a1, t0, 2 -+L(al2): -+ st.b a1, t0, 1 -+L(al1): -+ st.b a1, t0, 0 -+L(al0): -+ add.d t0, t0, t2 -+ sub.d a2, a2, t2 -+ -+L(align): -+ bstrins.d a1, a1, 15, 8 -+ bstrins.d a1, a1, 31, 16 -+ bstrins.d a1, a1, 63, 32 -+ -+ blt a2, t6, L(less_16bytes) -+ -+ andi a4, a2, 0x3f -+ beq a4, a2, L(less_64bytes) -+ -+ sub.d t1, a2, a4 -+ move a2, a4 -+ add.d a5, t0, t1 -+ -+L(loop_64bytes): -+ addi.d t0, t0, 64 -+ st.d a1, t0, -64 -+ st.d a1, t0, -56 -+ st.d a1, t0, -48 -+ st.d a1, t0, -40 -+ st.d a1, t0, -32 -+ st.d a1, t0, -24 -+ st.d a1, t0, -16 -+ st.d a1, t0, -8 -+ bne t0, a5, L(loop_64bytes) -+ -+L(less_64bytes): -+ srai.d a4, a2, 5 -+ beqz a4, L(less_32bytes) -+ addi.d a2, a2, -32 -+ st.d a1, t0, 0 -+ st.d a1, t0, 8 -+ st.d a1, t0, 16 -+ st.d a1, t0, 24 -+ addi.d t0, t0, 32 -+L(less_32bytes): -+ blt a2, t6, L(less_16bytes) -+ addi.d a2, a2, -16 -+ st.d a1, t0, 0 -+ st.d a1, t0, 8 -+ addi.d t0, t0, 16 -+L(less_16bytes): -+ srai.d a4, a2, 3 -+ beqz a4, L(less_8bytes) -+ addi.d a2, a2, -8 -+ st.d a1, t0, 0 -+ addi.d t0, t0, 8 -+L(less_8bytes): -+ beqz a2, L(less_1byte) -+ srai.d a4, a2, 2 -+ beqz a4, L(less_4bytes) -+ addi.d a2, a2, -4 -+ st.w a1, t0, 0 -+ addi.d t0, t0, 4 -+L(less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(less_2bytes) -+ addi.d a2, a2, -2 -+ st.h a1, t0, 0 -+ addi.d t0, t0, 2 -+L(less_2bytes): -+ beqz a2, L(less_1byte) -+ st.b a1, t0, 0 -+L(less_1byte): -+ jr ra -+ -+L(short_data): -+ pcaddi t1, 19 -+ slli.d t3, a2, 2 -+ sub.d t1, t1, t3 -+ jirl zero, t1, 0 -+L(short_15): -+ st.b a1, a0, 14 -+ -+L(short_14): -+ st.b a1, a0, 13 -+L(short_13): -+ st.b a1, a0, 12 -+L(short_12): -+ st.b a1, a0, 11 -+L(short_11): -+ st.b a1, a0, 10 -+L(short_10): -+ st.b a1, a0, 9 -+L(short_9): -+ st.b a1, a0, 8 -+L(short_8): -+ st.b a1, a0, 7 -+L(short_7): -+ st.b a1, a0, 6 -+L(short_6): -+ st.b a1, a0, 5 -+L(short_5): -+ st.b a1, a0, 4 -+L(short_4): -+ st.b a1, a0, 3 -+L(short_3): -+ st.b a1, a0, 2 -+L(short_2): -+ st.b a1, a0, 1 -+L(short_1): -+ st.b a1, a0, 0 -+L(short_0): -+ jr ra -+ -+END(MEMSET_NAME) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMSET_NAME) -+#endif -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile -new file mode 100644 -index 00000000..6bd48f0e ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/Makefile -@@ -0,0 +1,18 @@ -+ifeq ($(subdir),string) -+sysdep_routines += memcpy-aligned memcpy-unaligned memcpy-lasx \ -+ memset-aligned memset-unaligned memset-lsx 
memset-lasx \ -+ memmove-unaligned memmove-lsx memmove-lasx \ -+ memchr-aligned memchr-lsx memchr-lasx \ -+ memrchr-generic memrchr-lsx memrchr-lasx \ -+ memcmp-aligned memcmp-lsx memcmp-lasx \ -+ rawmemchr-aligned rawmemchr-lsx rawmemchr-lasx \ -+ strchr-aligned strchr-unaligned strchr-lsx strchr-lasx \ -+ strrchr-aligned strrchr-lsx strrchr-lasx \ -+ strlen-aligned strlen-unaligned strlen-lsx strlen-lasx \ -+ strnlen-aligned strnlen-unaligned strnlen-lsx strnlen-lasx \ -+ strchrnul-aligned strchrnul-unaligned strchrnul-lsx strchrnul-lasx \ -+ strncmp-aligned strncmp-unaligned strncmp-lsx \ -+ strcpy-aligned strcpy-unaligned strcpy-lsx \ -+ stpcpy-aligned stpcpy-lsx \ -+ strcmp-aligned strcmp-unaligned strcmp-lsx -+endif -diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c -new file mode 100644 -index 00000000..c2b6bbf7 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c -@@ -0,0 +1,142 @@ -+/* Enumerate available IFUNC implementations of a function. LoongArch64 version. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* Maximum number of IFUNC implementations. 
*/ -+#define MAX_IFUNC 4 -+ -+size_t -+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, -+ size_t max) -+{ -+ assert (max >= MAX_IFUNC); -+ -+ size_t i = 0; -+ -+ IFUNC_IMPL (i, name, memcpy, -+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_lasx) -+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_aligned) -+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, memmove, -+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_lasx) -+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_lsx) -+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_aligned) -+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, memset, -+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_lasx) -+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_lsx) -+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned) -+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, memchr, -+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_lasx) -+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_lsx) -+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_aligned) -+ ) -+ -+ IFUNC_IMPL (i, name, memrchr, -+ IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_lasx) -+ IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_lsx) -+ IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_generic) -+ ) -+ -+ IFUNC_IMPL (i, name, memcmp, -+ IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_lasx) -+ IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_lsx) -+ IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_aligned) -+ ) -+ -+ IFUNC_IMPL (i, name, rawmemchr, -+ IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_lasx) -+ IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_lsx) -+ IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_aligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strchr, -+ IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_lasx) -+ IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_lsx) -+ IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_aligned) -+ IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strrchr, -+ IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_lasx) -+ IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_lsx) -+ IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_aligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strlen, -+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_lasx) -+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_lsx) -+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned) -+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strnlen, -+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_lasx) -+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_lsx) -+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_aligned) -+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strchrnul, -+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_lasx) -+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_lsx) -+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_aligned) -+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strncmp, -+ IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_lsx) -+ IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_aligned) -+ IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strcpy, -+ IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_aligned) -+ IFUNC_IMPL_ADD 
(array, i, strcpy, 1, __strcpy_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, stpcpy, -+ IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_aligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strcmp, -+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_lsx) -+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_aligned) -+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_unaligned) -+ ) -+ -+ return i; -+} -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h b/sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h -new file mode 100644 -index 00000000..61c00978 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h -@@ -0,0 +1,40 @@ -+/* Common definition for memcpy, and memset implementation. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+ -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (unaligned) attribute_hidden; -+ -+static inline void * -+IFUNC_SELECTOR (void) -+{ -+ INIT_ARCH(); -+ -+ if (SUPPORT_LASX) -+ return OPTIMIZE (lasx); -+ else if (SUPPORT_LSX) -+ return OPTIMIZE (lsx); -+ else if (SUPPORT_UAL) -+ return OPTIMIZE (unaligned); -+ else -+ return OPTIMIZE (aligned); -+} -diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-lsx.h b/sysdeps/loongarch/lp64/multiarch/ifunc-lsx.h -new file mode 100644 -index 00000000..771312f6 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-lsx.h -@@ -0,0 +1,37 @@ -+/* Common definition for strchr implementation. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+#include -+ -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (unaligned) attribute_hidden; -+ -+static inline void * -+IFUNC_SELECTOR (void) -+{ -+ INIT_ARCH(); -+ -+ if (SUPPORT_LSX) -+ return OPTIMIZE (lsx); -+ if (SUPPORT_UAL) -+ return OPTIMIZE (unaligned); -+ else -+ return OPTIMIZE (aligned); -+} -diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h -new file mode 100644 -index 00000000..5c01e1af ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h -@@ -0,0 +1,37 @@ -+/* Common definition for memchr implementation. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+ -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden; -+ -+static inline void * -+IFUNC_SELECTOR (void) -+{ -+ INIT_ARCH(); -+ -+ if (SUPPORT_LASX) -+ return OPTIMIZE (lasx); -+ else if (SUPPORT_LSX) -+ return OPTIMIZE (lsx); -+ else -+ return OPTIMIZE (aligned); -+} -diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h -new file mode 100644 -index 00000000..d264944c ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h -@@ -0,0 +1,37 @@ -+/* Common definition for memrchr implementation. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+#include -+ -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; -+ -+static inline void * -+IFUNC_SELECTOR (void) -+{ -+ INIT_ARCH(); -+ -+ if (SUPPORT_LASX) -+ return OPTIMIZE (lasx); -+ else if (SUPPORT_LSX) -+ return OPTIMIZE (lsx); -+ else -+ return OPTIMIZE (generic); -+} -diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-stpcpy.h b/sysdeps/loongarch/lp64/multiarch/ifunc-stpcpy.h -new file mode 100644 -index 00000000..9093f08c ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-stpcpy.h -@@ -0,0 +1,34 @@ -+/* Common definition for memchr implementation. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+ -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden; -+ -+static inline void * -+IFUNC_SELECTOR (void) -+{ -+ INIT_ARCH(); -+ -+ if (SUPPORT_LSX) -+ return OPTIMIZE (lsx); -+ else -+ return OPTIMIZE (aligned); -+} -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S -new file mode 100644 -index 00000000..4677c912 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S -@@ -0,0 +1,7 @@ -+ -+#if IS_IN (libc) -+#define MEMCHR_NAME __memchr_aligned -+#endif -+ -+#include "../memchr.S" -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -new file mode 100644 -index 00000000..e63e34ae ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -@@ -0,0 +1,108 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMCHR __memchr_lasx -+ -+LEAF(MEMCHR) -+ .align 6 -+ beqz a2, L(ret0) -+ add.d a3, a0, a2 -+ andi t0, a0, 0x3f -+ bstrins.d a0, zero, 5, 0 -+ -+ xvld $xr0, a0, 0 -+ xvld $xr1, a0, 32 -+ li.d t1, -1 -+ li.d t2, 64 -+ -+ xvreplgr2vr.b $xr2, a1 -+ sll.d t3, t1, t0 -+ sub.d t2, t2, t0 -+ xvseq.b $xr0, $xr0, $xr2 -+ -+ xvseq.b $xr1, $xr1, $xr2 -+ xvmsknz.b $xr0, $xr0 -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr3, $xr0, 4 -+ -+ -+ xvpickve.w $xr4, $xr1, 4 -+ vilvl.h $vr0, $vr3, $vr0 -+ vilvl.h $vr1, $vr4, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ -+ movfr2gr.d t0, $f0 -+ and t0, t0, t3 -+ bgeu t2, a2, L(end) -+ bnez t0, L(found) -+ -+ addi.d a4, a3, -1 -+ bstrins.d a4, zero, 5, 0 -+L(loop): -+ xvld $xr0, a0, 64 -+ xvld $xr1, a0, 96 -+ -+ addi.d a0, a0, 64 -+ xvseq.b $xr0, $xr0, $xr2 -+ xvseq.b $xr1, $xr1, $xr2 -+ beq a0, a4, L(out) -+ -+ -+ xvmax.bu $xr3, $xr0, $xr1 -+ xvseteqz.v $fcc0, $xr3 -+ bcnez $fcc0, L(loop) -+ xvmsknz.b $xr0, $xr0 -+ -+ xvmsknz.b $xr1, 
$xr1 -+ xvpickve.w $xr3, $xr0, 4 -+ xvpickve.w $xr4, $xr1, 4 -+ vilvl.h $vr0, $vr3, $vr0 -+ -+ vilvl.h $vr1, $vr4, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+L(found): -+ ctz.d t1, t0 -+ -+ add.d a0, a0, t1 -+ jr ra -+L(ret0): -+ move a0, zero -+ jr ra -+ -+ -+L(out): -+ xvmsknz.b $xr0, $xr0 -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr3, $xr0, 4 -+ xvpickve.w $xr4, $xr1, 4 -+ -+ vilvl.h $vr0, $vr3, $vr0 -+ vilvl.h $vr1, $vr4, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+ -+L(end): -+ sub.d t2, zero, a3 -+ srl.d t1, t1, t2 -+ and t0, t0, t1 -+ ctz.d t1, t0 -+ -+ add.d a0, a0, t1 -+ maskeqz a0, a0, t0 -+ jr ra -+END(MEMCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -new file mode 100644 -index 00000000..441db534 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -@@ -0,0 +1,93 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMCHR __memchr_lsx -+ -+LEAF(MEMCHR) -+ .align 6 -+ beqz a2, L(ret0) -+ add.d a3, a0, a2 -+ andi t0, a0, 0x1f -+ bstrins.d a0, zero, 4, 0 -+ -+ vld $vr0, a0, 0 -+ vld $vr1, a0, 16 -+ li.d t1, -1 -+ li.d t2, 32 -+ -+ vreplgr2vr.b $vr2, a1 -+ sll.d t3, t1, t0 -+ sub.d t2, t2, t0 -+ vseq.b $vr0, $vr0, $vr2 -+ -+ vseq.b $vr1, $vr1, $vr2 -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ -+ -+ movfr2gr.s t0, $f0 -+ and t0, t0, t3 -+ bgeu t2, a2, L(end) -+ bnez t0, L(found) -+ -+ addi.d a4, a3, -1 -+ bstrins.d a4, zero, 4, 0 -+L(loop): -+ vld $vr0, a0, 32 -+ vld $vr1, a0, 48 -+ -+ addi.d a0, a0, 32 -+ vseq.b $vr0, $vr0, $vr2 -+ vseq.b $vr1, $vr1, $vr2 -+ beq a0, a4, L(out) -+ -+ vmax.bu $vr3, $vr0, $vr1 -+ vseteqz.v $fcc0, $vr3 -+ bcnez $fcc0, L(loop) -+ vmsknz.b $vr0, $vr0 -+ -+ -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+L(found): -+ ctz.w t0, t0 -+ -+ add.d a0, a0, t0 -+ jr ra -+L(ret0): -+ move a0, zero -+ jr ra -+ -+L(out): -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+ -+L(end): -+ sub.d t2, zero, a3 -+ srl.w t1, t1, t2 -+ and t0, t0, t1 -+ ctz.w t1, t0 -+ -+ -+ add.d a0, a0, t1 -+ maskeqz a0, a0, t0 -+ jr ra -+END(MEMCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr.c b/sysdeps/loongarch/lp64/multiarch/memchr.c -new file mode 100644 -index 00000000..18b0e2ef ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memchr.c -@@ -0,0 +1,39 @@ -+/* Multiple versions of memchr. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define memchr __redirect_memchr -+# include -+# undef memchr -+ -+# define SYMBOL_NAME memchr -+# include "ifunc-memchr.h" -+ -+libc_ifunc_redirected (__redirect_memchr, __new_memchr, -+ IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (__new_memchr, __GI_memchr, __redirect_memchr) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_memchr, memchr, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S -new file mode 100644 -index 00000000..512eabca ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S -@@ -0,0 +1,11 @@ -+ -+#if IS_IN (libc) -+ -+#define MEMCMP_NAME __memcmp_aligned -+ -+#endif -+ -+#include "../memcmp.S" -+# undef bcmp -+weak_alias (MEMCMP_NAME, bcmp) -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -new file mode 100644 -index 00000000..30e2dbe6 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -@@ -0,0 +1,199 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMCMP __memcmp_lasx -+ -+LEAF(MEMCMP) -+ .align 6 -+ li.d t2, 32 -+ add.d a3, a0, a2 -+ add.d a4, a1, a2 -+ bgeu t2, a2, L(less32) # a2 <= 32 -+ -+ li.d t1, 160 -+ bgeu a2, t1, L(make_aligned) # a2 >= 160 -+L(loop32): -+ xvld $xr0, a0, 0 -+ xvld $xr1, a1, 0 -+ -+ addi.d a0, a0, 32 -+ addi.d a1, a1, 32 -+ addi.d a2, a2, -32 -+ xvseq.b $xr2, $xr0, $xr1 -+ -+ xvsetanyeqz.b $fcc0, $xr2 -+ bcnez $fcc0, L(end) -+L(last_bytes): -+ bltu t2, a2, L(loop32) -+ xvld $xr0, a3, -32 -+ -+ -+ xvld $xr1, a4, -32 -+ xvseq.b $xr2, $xr0, $xr1 -+L(end): -+ xvmsknz.b $xr2, $xr2 -+ xvpermi.q $xr4, $xr0, 1 -+ -+ xvpickve.w $xr3, $xr2, 4 -+ xvpermi.q $xr5, $xr1, 1 -+ vilvl.h $vr2, $vr3, $vr2 -+ movfr2gr.s t0, $f2 -+ -+ cto.w t0, t0 -+ vreplgr2vr.b $vr2, t0 -+ vshuf.b $vr0, $vr4, $vr0, $vr2 -+ vshuf.b $vr1, $vr5, $vr1, $vr2 -+ -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, $vr1, 0 -+ sub.d a0, t0, t1 -+ jr ra -+ -+ -+L(less32): -+ srli.d t0, a2, 4 -+ beqz t0, L(less16) -+ vld $vr0, a0, 0 -+ vld $vr1, a1, 0 -+ -+ vld $vr2, a3, -16 -+ vld $vr3, a4, -16 -+L(short_ret): -+ vseq.b $vr4, $vr0, $vr1 -+ vseq.b $vr5, $vr2, $vr3 -+ -+ vmsknz.b $vr4, $vr4 -+ vmsknz.b $vr5, $vr5 -+ vilvl.h $vr4, $vr5, $vr4 -+ movfr2gr.s t0, $f4 -+ -+ cto.w t0, t0 -+ vreplgr2vr.b $vr4, t0 -+ vshuf.b $vr0, $vr2, $vr0, $vr4 -+ vshuf.b $vr1, $vr3, $vr1, $vr4 -+ -+ -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, $vr1, 0 -+ sub.d a0, t0, t1 -+ jr ra -+ -+L(less16): -+ srli.d t0, a2, 3 -+ beqz t0, L(less8) -+ vldrepl.d $vr0, a0, 0 -+ vldrepl.d $vr1, a1, 0 -+ -+ vldrepl.d $vr2, a3, -8 -+ vldrepl.d $vr3, a4, -8 -+ b L(short_ret) -+L(less8): -+ srli.d t0, a2, 2 -+ -+ beqz t0, L(less4) -+ vldrepl.w $vr0, a0, 0 -+ vldrepl.w $vr1, a1, 0 -+ vldrepl.w $vr2, a3, -4 -+ -+ -+ vldrepl.w $vr3, a4, -4 -+ b L(short_ret) -+L(less4): -+ srli.d t0, a2, 1 -+ beqz t0, L(less2) -+ -+ vldrepl.h $vr0, a0, 0 -+ vldrepl.h $vr1, a1, 0 -+ vldrepl.h $vr2, a3, -2 -+ vldrepl.h $vr3, a4, -2 -+ -+ b L(short_ret) -+L(less2): -+ beqz a2, L(ret0) -+ ld.bu t0, a0, 0 -+ ld.bu t1, a1, 0 -+ -+ sub.d a0, t0, t1 -+ jr ra -+L(ret0): -+ move a0, zero -+ jr ra -+ -+ -+ nop -+ nop -+ nop -+/* make src1 aligned, and adjust scr2 and length. 
*/ -+L(make_aligned): -+ xvld $xr0, a0, 0 -+ -+ xvld $xr1, a1, 0 -+ xvseq.b $xr2, $xr0, $xr1 -+ xvsetanyeqz.b $fcc0, $xr2 -+ bcnez $fcc0, L(end) -+ -+ andi t0, a0, 0x1f -+ sub.d t0, t2, t0 -+ sub.d t1, a2, t0 -+ add.d a0, a0, t0 -+ -+ add.d a1, a1, t0 -+ andi a2, t1, 0x3f -+ sub.d t0, t1, a2 -+ add.d a5, a0, t0 -+ -+ -+L(loop_align): -+ xvld $xr0, a0, 0 -+ xvld $xr1, a1, 0 -+ xvld $xr2, a0, 32 -+ xvld $xr3, a1, 32 -+ -+ xvseq.b $xr0, $xr0, $xr1 -+ xvseq.b $xr1, $xr2, $xr3 -+ xvmin.bu $xr2, $xr1, $xr0 -+ xvsetanyeqz.b $fcc0, $xr2 -+ -+ bcnez $fcc0, L(pair_end) -+ addi.d a0, a0, 64 -+ addi.d a1, a1, 64 -+ bne a0, a5, L(loop_align) -+ -+ bnez a2, L(last_bytes) -+ move a0, zero -+ jr ra -+ nop -+ -+ -+L(pair_end): -+ xvmsknz.b $xr0, $xr0 -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr2, $xr0, 4 -+ xvpickve.w $xr3, $xr1, 4 -+ -+ vilvl.h $vr0, $vr2, $vr0 -+ vilvl.h $vr1, $vr3, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+ -+ cto.d t0, t0 -+ ldx.bu t1, a0, t0 -+ ldx.bu t2, a1, t0 -+ sub.d a0, t1, t2 -+ -+ jr ra -+END(MEMCMP) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCMP) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -new file mode 100644 -index 00000000..7fd349b6 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -@@ -0,0 +1,255 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMCMP __memcmp_lsx -+ -+L(magic_num): -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+ nop -+ nop -+ENTRY_NO_ALIGN(MEMCMP) -+ beqz a2, L(out) -+ pcaddi t0, -7 -+ -+ andi a3, a0, 0xf -+ vld $vr5, t0, 0 -+ andi a4, a1, 0xf -+ bne a3, a4, L(unaligned) -+ -+ bstrins.d a0, zero, 3, 0 -+ xor a1, a1, a4 -+ vld $vr0, a0, 0 -+ vld $vr1, a1, 0 -+ -+ -+ li.d t0, 16 -+ vreplgr2vr.b $vr3, a3 -+ sub.d t1, t0, a3 -+ vadd.b $vr3, $vr3, $vr5 -+ -+ vshuf.b $vr0, $vr3, $vr0, $vr3 -+ vshuf.b $vr1, $vr3, $vr1, $vr3 -+ vseq.b $vr4, $vr0, $vr1 -+ bgeu t1, a2, L(al_end) -+ -+ vsetanyeqz.b $fcc0, $vr4 -+ bcnez $fcc0, L(al_found) -+ sub.d a2, a2, t1 -+ andi t1, a2, 31 -+ -+ beq a2, t1, L(al_less_32bytes) -+ sub.d t2, a2, t1 -+ move a2, t1 -+ add.d a4, a0, t2 -+ -+ -+L(al_loop): -+ vld $vr0, a0, 16 -+ vld $vr1, a1, 16 -+ vld $vr2, a0, 32 -+ vld $vr3, a1, 32 -+ -+ addi.d a0, a0, 32 -+ addi.d a1, a1, 32 -+ vseq.b $vr4, $vr0, $vr1 -+ vseq.b $vr6, $vr2, $vr3 -+ -+ vand.v $vr6, $vr4, $vr6 -+ vsetanyeqz.b $fcc0, $vr6 -+ bcnez $fcc0, L(al_pair_end) -+ bne a0, a4, L(al_loop) -+ -+L(al_less_32bytes): -+ bgeu t0, a2, L(al_less_16bytes) -+ vld $vr0, a0, 16 -+ vld $vr1, a1, 16 -+ vld $vr2, a0, 32 -+ -+ -+ vld $vr3, a1, 32 -+ addi.d a2, a2, -16 -+ vreplgr2vr.b $vr6, a2 -+ vslt.b $vr5, $vr5, $vr6 -+ -+ vseq.b $vr4, $vr0, $vr1 -+ vseq.b $vr6, $vr2, $vr3 -+ vorn.v $vr6, $vr6, $vr5 -+L(al_pair_end): -+ vsetanyeqz.b $fcc0, $vr4 -+ -+ bcnez $fcc0, L(al_found) -+ vnori.b $vr4, $vr6, 0 -+ vfrstpi.b $vr4, $vr4, 0 -+ vshuf.b $vr0, $vr2, $vr2, $vr4 -+ -+ vshuf.b $vr1, $vr3, $vr3, $vr4 -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, $vr1, 0 -+ sub.d a0, t0, t1 -+ -+ -+ jr ra -+L(al_less_16bytes): -+ beqz a2, L(out) -+ vld $vr0, a0, 16 -+ vld $vr1, a1, 16 -+ -+ vseq.b $vr4, $vr0, $vr1 -+L(al_end): -+ vreplgr2vr.b $vr6, a2 -+ vslt.b $vr5, $vr5, $vr6 -+ vorn.v $vr4, $vr4, $vr5 -+ -+L(al_found): -+ vnori.b $vr4, $vr4, 0 -+ vfrstpi.b $vr4, $vr4, 0 -+ vshuf.b $vr0, $vr0, $vr0, $vr4 -+ vshuf.b $vr1, $vr1, $vr1, $vr4 -+ -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, 
$vr1, 0 -+ sub.d a0, t0, t1 -+ jr ra -+ -+ -+L(unaligned): -+ xor t2, a0, a1 -+ sltu a5, a3, a4 -+ masknez t2, t2, a5 -+ xor a0, a0, t2 # a0 point to string with smaller offset 2 -+ -+ xor a1, a1, t2 # a1 point to string with larger 4 -+ andi a3, a0, 0xf # a3 = 2 -+ andi a4, a1, 0xf # a4 = 4 -+ bstrins.d a0, zero, 3, 0 -+ -+ xor a1, a1, a4 -+ vld $vr4, a0, 0 -+ vld $vr1, a1, 0 -+ li.d t0, 16 -+ -+ vreplgr2vr.b $vr2, a4 -+ sub.d a6, a4, a3 # a6 hold the diff -+ sub.d t1, t0, a4 -+ sub.d t2, t0, a6 -+ -+ -+ vadd.b $vr2, $vr2, $vr5 # [4, 5, 6, ...] -+ vreplgr2vr.b $vr6, t2 -+ vadd.b $vr6, $vr6, $vr5 # [14, 15, 16, ... ] -+ vshuf.b $vr0, $vr4, $vr4, $vr6 # make data be in the same position -+ -+ vshuf.b $vr1, $vr2, $vr1, $vr2 -+ vshuf.b $vr0, $vr2, $vr0, $vr2 -+ vseq.b $vr7, $vr0, $vr1 -+ bgeu t1, a2, L(un_end) -+ -+ vsetanyeqz.b $fcc0, $vr7 -+ bcnez $fcc0, L(un_found) -+ sub.d a2, a2, t1 -+ andi t1, a2, 31 -+ -+ beq a2, t1, L(un_less_32bytes) -+ sub.d t2, a2, t1 -+ move a2, t1 -+ add.d a4, a1, t2 -+ -+ -+L(un_loop): -+ vld $vr2, a0, 16 -+ vld $vr1, a1, 16 -+ vld $vr3, a1, 32 -+ addi.d a1, a1, 32 -+ -+ addi.d a0, a0, 32 -+ vshuf.b $vr0, $vr2, $vr4, $vr6 -+ vld $vr4, a0, 0 -+ vseq.b $vr7, $vr0, $vr1 -+ -+ vshuf.b $vr2, $vr4, $vr2, $vr6 -+ vseq.b $vr8, $vr2, $vr3 -+ vand.v $vr8, $vr7, $vr8 -+ vsetanyeqz.b $fcc0, $vr8 -+ -+ bcnez $fcc0, L(un_pair_end) -+ bne a1, a4, L(un_loop) -+L(un_less_32bytes): -+ bltu a2, t0, L(un_less_16bytes) -+ vld $vr2, a0, 16 -+ -+ -+ vld $vr1, a1, 16 -+ addi.d a0, a0, 16 -+ addi.d a1, a1, 16 -+ addi.d a2, a2, -16 -+ -+ vshuf.b $vr0, $vr2, $vr4, $vr6 -+ vor.v $vr4, $vr2, $vr2 -+ vseq.b $vr7, $vr0, $vr1 -+ vsetanyeqz.b $fcc0, $vr7 -+ -+ bcnez $fcc0, L(un_found) -+L(un_less_16bytes): -+ beqz a2, L(out) -+ vld $vr1, a1, 16 -+ bgeu a6, a2, 1f -+ -+ vld $vr2, a0, 16 -+1: -+ vshuf.b $vr0, $vr2, $vr4, $vr6 -+ vseq.b $vr7, $vr0, $vr1 -+L(un_end): -+ vreplgr2vr.b $vr3, a2 -+ -+ -+ vslt.b $vr3, $vr5, $vr3 -+ vorn.v $vr7, $vr7, $vr3 -+L(un_found): -+ vnori.b $vr7, $vr7, 0 -+ vfrstpi.b $vr7, $vr7, 0 -+ -+ vshuf.b $vr0, $vr0, $vr0, $vr7 -+ vshuf.b $vr1, $vr1, $vr1, $vr7 -+L(calc_result): -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, $vr1, 0 -+ -+ sub.d t2, t0, t1 -+ sub.d t3, t1, t0 -+ masknez t0, t3, a5 -+ maskeqz t1, t2, a5 -+ -+ or a0, t0, t1 -+ jr ra -+L(un_pair_end): -+ vsetanyeqz.b $fcc0, $vr7 -+ bcnez $fcc0, L(un_found) -+ -+ -+ vnori.b $vr7, $vr8, 0 -+ vfrstpi.b $vr7, $vr7, 0 -+ vshuf.b $vr0, $vr2, $vr2, $vr7 -+ vshuf.b $vr1, $vr3, $vr3, $vr7 -+ -+ b L(calc_result) -+L(out): -+ move a0, zero -+ jr ra -+ -+END(MEMCMP) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCMP) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp.c b/sysdeps/loongarch/lp64/multiarch/memcmp.c -new file mode 100644 -index 00000000..a956761e ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp.c -@@ -0,0 +1,41 @@ -+/* Multiple versions of memcmp. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define memcmp __redirect_memcmp -+# include -+# undef memcmp -+ -+# define SYMBOL_NAME memcmp -+# include "ifunc-memchr.h" -+ -+libc_ifunc_redirected (__redirect_memcmp, __new_memcmp, -+ IFUNC_SELECTOR ()); -+# undef bcmp -+weak_alias (__new_memcmp, bcmp) -+ -+# ifdef SHARED -+__hidden_ver1 (__new_memcmp, __GI_memcmp, __redirect_memcmp) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_memcmp, memcmp, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S -new file mode 100644 -index 00000000..5ff8b4e6 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S -@@ -0,0 +1,11 @@ -+ -+ -+#if IS_IN (libc) -+ -+#define MEMCPY_NAME __memcpy_aligned -+#define MEMMOVE_NAME __memmove_aligned -+ -+#endif -+ -+#include "../memcpy.S" -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S -new file mode 100644 -index 00000000..99d2cc71 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S -@@ -0,0 +1 @@ -+/* memcpy is part of memmove.S */ -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S -new file mode 100644 -index 00000000..99d2cc71 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S -@@ -0,0 +1 @@ -+/* memcpy is part of memmove.S */ -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -new file mode 100644 -index 00000000..5e38df0d ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -@@ -0,0 +1,259 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMCPY_NAME __memcpy_unaligned -+ -+#define LD_64(reg, n) \ -+ ld.d t0, reg, n; \ -+ ld.d t1, reg, n+8; \ -+ ld.d t2, reg, n+16; \ -+ ld.d t3, reg, n+24; \ -+ ld.d t4, reg, n+32; \ -+ ld.d t5, reg, n+40; \ -+ ld.d t6, reg, n+48; \ -+ ld.d t7, reg, n+56; -+ -+#define ST_64(reg, n) \ -+ st.d t0, reg, n; \ -+ st.d t1, reg, n+8; \ -+ st.d t2, reg, n+16; \ -+ st.d t3, reg, n+24; \ -+ st.d t4, reg, n+32; \ -+ st.d t5, reg, n+40; \ -+ st.d t6, reg, n+48; \ -+ st.d t7, reg, n+56; -+ -+#ifdef ANDROID_CHANGES -+LEAF(MEMCPY_NAME, 0) -+#else -+LEAF(MEMCPY_NAME) -+#endif -+ -+//1st var: dst ptr: void *a1 $r4 a0 -+//2nd var: src ptr: void *a2 $r5 a1 -+//3rd var: size_t len $r6 a2 -+//t0~t9 registers as temp -+ -+ add.d a4, a1, a2 -+ add.d a3, a0, a2 -+ li.w a6, 16 -+ bge a6, a2, less_16bytes -+ li.w a6, 128 -+ blt a6, a2, long_bytes -+ li.w a6, 64 -+ blt a6, a2, more_64bytes -+ li.w a6, 32 -+ blt a6, a2, more_32bytes -+ -+ /* 17...32 */ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a4, -16 -+ ld.d t3, a4, -8 -+ st.d t0, a0, 0 -+ st.d t1, a0, 8 -+ st.d t2, a3, -16 -+ st.d t3, a3, -8 -+ jr ra -+ -+more_64bytes: -+ srli.d t8, a0, 3 -+ slli.d t8, t8, 3 -+ addi.d t8, t8, 0x8 -+ sub.d a7, a0, t8 -+ ld.d t0, a1, 0 -+ sub.d a1, a1, a7 -+ st.d t0, a0, 0 -+ -+ add.d a7, a7, a2 -+ addi.d a7, a7, -0x20 -+loop_32: -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ st.d t0, t8, 0 -+ st.d t1, t8, 8 -+ st.d t2, t8, 16 -+ st.d 
t3, t8, 24 -+ -+ addi.d t8, t8, 0x20 -+ addi.d a1, a1, 0x20 -+ addi.d a7, a7, -0x20 -+ blt zero, a7, loop_32 -+ -+ ld.d t4, a4, -32 -+ ld.d t5, a4, -24 -+ ld.d t6, a4, -16 -+ ld.d t7, a4, -8 -+ st.d t4, a3, -32 -+ st.d t5, a3, -24 -+ st.d t6, a3, -16 -+ st.d t7, a3, -8 -+ -+ jr ra -+ -+more_32bytes: -+ /* 33...64 */ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ ld.d t4, a4, -32 -+ ld.d t5, a4, -24 -+ ld.d t6, a4, -16 -+ ld.d t7, a4, -8 -+ st.d t0, a0, 0 -+ st.d t1, a0, 8 -+ st.d t2, a0, 16 -+ st.d t3, a0, 24 -+ st.d t4, a3, -32 -+ st.d t5, a3, -24 -+ st.d t6, a3, -16 -+ st.d t7, a3, -8 -+ jr ra -+ -+less_16bytes: -+ srai.d a6, a2, 3 -+ beqz a6, less_8bytes -+ -+ /* 8...16 */ -+ ld.d t0, a1, 0 -+ ld.d t1, a4, -8 -+ st.d t0, a0, 0 -+ st.d t1, a3, -8 -+ -+ jr ra -+ -+less_8bytes: -+ srai.d a6, a2, 2 -+ beqz a6, less_4bytes -+ -+ /* 4...7 */ -+ ld.w t0, a1, 0 -+ ld.w t1, a4, -4 -+ st.w t0, a0, 0 -+ st.w t1, a3, -4 -+ jr ra -+ -+less_4bytes: -+ srai.d a6, a2, 1 -+ beqz a6, less_2bytes -+ -+ /* 2...3 */ -+ ld.h t0, a1, 0 -+ ld.h t1, a4, -2 -+ st.h t0, a0, 0 -+ st.h t1, a3, -2 -+ jr ra -+ -+less_2bytes: -+ beqz a2, less_1bytes -+ -+ ld.b t0, a1, 0 -+ st.b t0, a0, 0 -+ jr ra -+ -+less_1bytes: -+ jr ra -+ -+long_bytes: -+ srli.d t8, a0, 3 -+ slli.d t8, t8, 3 -+ beq a0, t8, start -+ -+ ld.d t0, a1, 0 -+ addi.d t8, t8, 0x8 -+ st.d t0, a0, 0 -+ sub.d a7, a0, t8 -+ sub.d a1, a1, a7 -+ -+start: -+ addi.d a5, a3, -0x80 -+ blt a5, t8, align_end_proc -+ -+loop_128: -+ LD_64(a1, 0) -+ ST_64(t8, 0) -+ LD_64(a1, 64) -+ addi.d a1, a1, 0x80 -+ ST_64(t8, 64) -+ addi.d t8, t8, 0x80 -+ bge a5, t8, loop_128 -+ -+align_end_proc: -+ sub.d a2, a3, t8 -+ -+ pcaddi t1, 34 -+ andi t2, a2, 0x78 -+ sub.d t1, t1, t2 -+ jirl zero, t1, 0 -+ -+end_120_128_unalign: -+ ld.d t0, a1, 112 -+ st.d t0, t8, 112 -+end_112_120_unalign: -+ ld.d t0, a1, 104 -+ st.d t0, t8, 104 -+end_104_112_unalign: -+ ld.d t0, a1, 96 -+ st.d t0, t8, 96 -+end_96_104_unalign: -+ ld.d t0, a1, 88 -+ st.d t0, t8, 88 -+end_88_96_unalign: -+ ld.d t0, a1, 80 -+ st.d t0, t8, 80 -+end_80_88_unalign: -+ ld.d t0, a1, 72 -+ st.d t0, t8, 72 -+end_72_80_unalign: -+ ld.d t0, a1, 64 -+ st.d t0, t8, 64 -+end_64_72_unalign: -+ ld.d t0, a1, 56 -+ st.d t0, t8, 56 -+end_56_64_unalign: -+ ld.d t0, a1, 48 -+ st.d t0, t8, 48 -+end_48_56_unalign: -+ ld.d t0, a1, 40 -+ st.d t0, t8, 40 -+end_40_48_unalign: -+ ld.d t0, a1, 32 -+ st.d t0, t8, 32 -+end_32_40_unalign: -+ ld.d t0, a1, 24 -+ st.d t0, t8, 24 -+end_24_32_unalign: -+ ld.d t0, a1, 16 -+ st.d t0, t8, 16 -+end_16_24_unalign: -+ ld.d t0, a1, 8 -+ st.d t0, t8, 8 -+end_8_16_unalign: -+ ld.d t0, a1, 0 -+ st.d t0, t8, 0 -+end_0_8_unalign: -+ ld.d t0, a4, -8 -+ st.d t0, a3, -8 -+ -+ jr ra -+ -+END(MEMCPY_NAME) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCPY_NAME) -+#endif -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy.c b/sysdeps/loongarch/lp64/multiarch/memcpy.c -new file mode 100644 -index 00000000..0ba8254a ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy.c -@@ -0,0 +1,39 @@ -+/* Multiple versions of memcpy. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. 
-+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define memcpy __redirect_memcpy -+# include -+# undef memcpy -+ -+# define SYMBOL_NAME memcpy -+# include "ifunc-lasx.h" -+ -+libc_ifunc_redirected (__redirect_memcpy, __new_memcpy, -+ IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (__new_memcpy, __GI_memcpy, __redirect_memcpy) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S -new file mode 100644 -index 00000000..bcd37a0e ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S -@@ -0,0 +1 @@ -+/* memmove_aligned is part of memcpy_aligned, see memcpy-aligned.S. */ -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -new file mode 100644 -index 00000000..9537a35a ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -@@ -0,0 +1,279 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#ifndef MEMCPY_NAME -+#define MEMCPY_NAME __memcpy_lasx -+#endif -+ -+#ifndef MEMMOVE_NAME -+#define MEMMOVE_NAME __memmove_lasx -+#endif -+ -+LEAF(MEMCPY_NAME) -+ .align 6 -+ -+ li.d t0, 32 -+ add.d a3, a0, a2 -+ add.d a4, a1, a2 -+ bgeu t0, a2, L(less_32bytes) # a2 <= 32 -+ -+ li.d t1, 64 -+ bltu t1, a2, L(copy_long) # a2 > 64 -+ xvld $xr0, a1, 0 -+ xvld $xr1, a4, -32 -+ -+ xvst $xr0, a0, 0 -+ xvst $xr1, a3, -32 -+ jr ra -+L(less_32bytes): -+ srli.d t0, a2, 4 -+ -+ beqz t0, L(less_16bytes) -+ vld $vr0, a1, 0 -+ vld $vr1, a4, -16 -+ vst $vr0, a0, 0 -+ -+ -+ vst $vr1, a3, -16 -+ jr ra -+L(less_16bytes): -+ srli.d t0, a2, 3 -+ beqz t0, L(less_8bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a4, -8 -+ st.d t0, a0, 0 -+ st.d t1, a3, -8 -+ -+ jr ra -+L(less_8bytes): -+ srli.d t0, a2, 2 -+ beqz t0, L(less_4bytes) -+ ld.w t0, a1, 0 -+ -+ ld.w t1, a4, -4 -+ st.w t0, a0, 0 -+ st.w t1, a3, -4 -+ jr ra -+ -+ -+L(less_4bytes): -+ srli.d t0, a2, 1 -+ beqz t0, L(less_2bytes) -+ ld.h t0, a1, 0 -+ ld.h t1, a4, -2 -+ -+ st.h t0, a0, 0 -+ st.h t1, a3, -2 -+ jr ra -+L(less_2bytes): -+ beqz a2, L(less_1bytes) -+ -+ ld.b t0, a1, 0 -+ st.b t0, a0, 0 -+L(less_1bytes): -+ jr ra -+END(MEMCPY_NAME) -+ -+LEAF(MEMMOVE_NAME) -+ .align 6 -+ -+ li.d t0, 32 -+ add.d a3, a0, a2 -+ add.d a4, a1, a2 -+ bgeu t0, a2, L(less_32bytes) # a2 <= 32 -+ -+ li.d t1, 64 -+ bltu t1, a2, L(move_long) # a2 > 64 -+ xvld $xr0, a1, 0 -+ xvld $xr1, a4, -32 -+ -+ xvst $xr0, a0, 0 -+ xvst $xr1, a3, -32 -+ jr ra -+L(move_long): -+ sub.d t2, a0, a1 -+ -+ bltu t2, a2, L(copy_back) -+L(copy_long): -+ andi t2, a0, 0x1f -+ addi.d a2, a2, -1 -+ sub.d t2, t0, t2 -+ -+ -+ xvld $xr8, a1, 0 -+ xvld $xr9, a4, -32 -+ sub.d t3, a2, t2 -+ add.d a5, a0, t2 -+ -+ andi a2, t3, 0xff -+ add.d a1, a1, t2 -+ beq a2, t3, L(lt256) -+ sub.d a6, a4, a2 -+ -+ addi.d a6, a6, -1 -+L(loop_256): -+ xvld $xr0, a1, 0 -+ xvld $xr1, a1, 32 -+ xvld $xr2, a1, 64 -+ -+ xvld $xr3, a1, 96 -+ xvld $xr4, a1, 128 -+ xvld 
$xr5, a1, 160 -+ xvld $xr6, a1, 192 -+ -+ -+ xvld $xr7, a1, 224 -+ addi.d a1, a1, 256 -+ xvst $xr0, a5, 0 -+ xvst $xr1, a5, 32 -+ -+ xvst $xr2, a5, 64 -+ xvst $xr3, a5, 96 -+ xvst $xr4, a5, 128 -+ xvst $xr5, a5, 160 -+ -+ xvst $xr6, a5, 192 -+ xvst $xr7, a5, 224 -+ addi.d a5, a5, 256 -+ bne a1, a6, L(loop_256) -+ -+L(lt256): -+ srli.d t2, a2, 7 -+ beqz t2, L(lt128) -+ xvld $xr0, a1, 0 -+ xvld $xr1, a1, 32 -+ -+ -+ xvld $xr2, a1, 64 -+ xvld $xr3, a1, 96 -+ addi.d a1, a1, 128 -+ addi.d a2, a2, -128 -+ -+ xvst $xr0, a5, 0 -+ xvst $xr1, a5, 32 -+ xvst $xr2, a5, 64 -+ xvst $xr3, a5, 96 -+ -+ addi.d a5, a5, 128 -+L(lt128): -+ bltu a2, t1, L(lt64) -+ xvld $xr0, a1, 0 -+ xvld $xr1, a1, 32 -+ -+ addi.d a1, a1, 64 -+ addi.d a2, a2, -64 -+ xvst $xr0, a5, 0 -+ xvst $xr1, a5, 32 -+ -+ -+ addi.d a5, a5, 64 -+L(lt64): -+ bltu a2, t0, L(lt32) -+ xvld $xr0, a1, 0 -+ xvst $xr0, a5, 0 -+ -+L(lt32): -+ xvst $xr8, a0, 0 -+ xvst $xr9, a3, -32 -+ jr ra -+ nop -+ -+L(copy_back): -+ addi.d a3, a3, -1 -+ addi.d a2, a2, -2 -+ andi t2, a3, 0x1f -+ xvld $xr8, a1, 0 -+ -+ xvld $xr9, a4, -32 -+ sub.d t3, a2, t2 -+ sub.d a5, a3, t2 -+ sub.d a4, a4, t2 -+ -+ -+ andi a2, t3, 0xff -+ beq a2, t3, L(back_lt256) -+ add.d a6, a1, a2 -+ addi.d a6, a6, 2 -+ -+L(back_loop_256): -+ xvld $xr0, a4, -33 -+ xvld $xr1, a4, -65 -+ xvld $xr2, a4, -97 -+ xvld $xr3, a4, -129 -+ -+ xvld $xr4, a4, -161 -+ xvld $xr5, a4, -193 -+ xvld $xr6, a4, -225 -+ xvld $xr7, a4, -257 -+ -+ addi.d a4, a4, -256 -+ xvst $xr0, a5, -32 -+ xvst $xr1, a5, -64 -+ xvst $xr2, a5, -96 -+ -+ -+ xvst $xr3, a5, -128 -+ xvst $xr4, a5, -160 -+ xvst $xr5, a5, -192 -+ xvst $xr6, a5, -224 -+ -+ xvst $xr7, a5, -256 -+ addi.d a5, a5, -256 -+ bne a4, a6, L(back_loop_256) -+L(back_lt256): -+ srli.d t2, a2, 7 -+ -+ beqz t2, L(back_lt128) -+ xvld $xr0, a4, -33 -+ xvld $xr1, a4, -65 -+ xvld $xr2, a4, -97 -+ -+ xvld $xr3, a4, -129 -+ addi.d a2, a2, -128 -+ addi.d a4, a4, -128 -+ xvst $xr0, a5, -32 -+ -+ -+ xvst $xr1, a5, -64 -+ xvst $xr2, a5, -96 -+ xvst $xr3, a5, -128 -+ addi.d a5, a5, -128 -+ -+L(back_lt128): -+ blt a2, t1, L(back_lt64) -+ xvld $xr0, a4, -33 -+ xvld $xr1, a4, -65 -+ addi.d a2, a2, -64 -+ -+ addi.d a4, a4, -64 -+ xvst $xr0, a5, -32 -+ xvst $xr1, a5, -64 -+ addi.d a5, a5, -64 -+ -+L(back_lt64): -+ bltu a2, t0, L(back_lt32) -+ xvld $xr0, a4, -33 -+ xvst $xr0, a5, -32 -+L(back_lt32): -+ xvst $xr8, a0, 0 -+ -+ -+ xvst $xr9, a3, -31 -+ jr ra -+END(MEMMOVE_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCPY_NAME) -+libc_hidden_builtin_def (MEMMOVE_NAME) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -new file mode 100644 -index 00000000..26babad4 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -@@ -0,0 +1,524 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMCPY_NAME __memcpy_lsx -+#define MEMMOVE_NAME __memmove_lsx -+ -+LEAF(MEMCPY_NAME) -+ .align 6 -+ li.d t6, 16 -+ add.d a3, a0, a2 -+ add.d a4, a1, a2 -+ bgeu t6, a2, L(less_16bytes) # a2 <= 16 -+ -+ li.d t8, 64 -+ li.d t7, 32 -+ bltu t8, a2, L(copy_long) # a2 > 64 -+ bltu t7, a2, L(more_32bytes) # a2 > 32 -+ -+ vld $vr0, a1, 0 -+ vld $vr1, a4, -16 -+ vst $vr0, a0, 0 -+ vst $vr1, a3, -16 -+ -+ jr ra -+L(more_32bytes): -+ vld $vr0, a1, 0 -+ vld $vr1, a1, 16 -+ vld $vr2, a4, -32 -+ -+ -+ vld $vr3, a4, -16 -+ vst $vr0, a0, 0 -+ vst $vr1, a0, 16 -+ vst $vr2, a3, -32 -+ -+ vst $vr3, a3, -16 -+ jr ra -+L(less_16bytes): -+ srli.d t0, a2, 3 
-+ beqz t0, L(less_8bytes) -+ -+ vldrepl.d $vr0, a1, 0 -+ vldrepl.d $vr1, a4, -8 -+ vstelm.d $vr0, a0, 0, 0 -+ vstelm.d $vr1, a3, -8, 0 -+ -+ jr ra -+L(less_8bytes): -+ srli.d t0, a2, 2 -+ beqz t0, L(less_4bytes) -+ vldrepl.w $vr0, a1, 0 -+ -+ -+ vldrepl.w $vr1, a4, -4 -+ vstelm.w $vr0, a0, 0, 0 -+ vstelm.w $vr1, a3, -4, 0 -+ jr ra -+ -+L(less_4bytes): -+ srli.d t0, a2, 1 -+ beqz t0, L(less_2bytes) -+ vldrepl.h $vr0, a1, 0 -+ vldrepl.h $vr1, a4, -2 -+ -+ vstelm.h $vr0, a0, 0, 0 -+ vstelm.h $vr1, a3, -2, 0 -+ jr ra -+L(less_2bytes): -+ beqz a2, L(less_1bytes) -+ -+ ld.b t0, a1, 0 -+ st.b t0, a0, 0 -+L(less_1bytes): -+ jr ra -+ nop -+END(MEMCPY_NAME) -+ -+LEAF(MEMMOVE_NAME) -+ li.d t6, 16 -+ add.d a3, a0, a2 -+ add.d a4, a1, a2 -+ bgeu t6, a2, L(less_16bytes) # a2 <= 16 -+ -+ li.d t8, 64 -+ li.d t7, 32 -+ bltu t8, a2, L(move_long) # a2 > 64 -+ bltu t7, a2, L(more_32bytes) # a2 > 32 -+ -+ vld $vr0, a1, 0 -+ vld $vr1, a4, -16 -+ vst $vr0, a0, 0 -+ vst $vr1, a3, -16 -+ -+ jr ra -+ nop -+L(move_long): -+ sub.d t0, a0, a1 -+ bltu t0, a2, L(copy_back) -+ -+ -+L(copy_long): -+ vld $vr2, a1, 0 -+ andi t0, a0, 0xf -+ sub.d t0, t6, t0 -+ add.d a1, a1, t0 -+ -+ sub.d a2, a2, t0 -+ andi t1, a1, 0xf -+ bnez t1, L(unaligned) -+ vld $vr0, a1, 0 -+ -+ addi.d a2, a2, -16 -+ vst $vr2, a0, 0 -+ andi t2, a2, 0x7f -+ add.d a5, a0, t0 -+ -+ beq a2, t2, L(al_less_128) -+ sub.d t3, a2, t2 -+ move a2, t2 -+ add.d a6, a1, t3 -+ -+ -+L(al_loop): -+ vld $vr1, a1, 16 -+ vld $vr2, a1, 32 -+ vld $vr3, a1, 48 -+ vld $vr4, a1, 64 -+ -+ vld $vr5, a1, 80 -+ vld $vr6, a1, 96 -+ vld $vr7, a1, 112 -+ vst $vr0, a5, 0 -+ -+ vld $vr0, a1, 128 -+ addi.d a1, a1, 128 -+ vst $vr1, a5, 16 -+ vst $vr2, a5, 32 -+ -+ vst $vr3, a5, 48 -+ vst $vr4, a5, 64 -+ vst $vr5, a5, 80 -+ vst $vr6, a5, 96 -+ -+ -+ vst $vr7, a5, 112 -+ addi.d a5, a5, 128 -+ bne a1, a6, L(al_loop) -+L(al_less_128): -+ blt a2, t8, L(al_less_64) -+ -+ vld $vr1, a1, 16 -+ vld $vr2, a1, 32 -+ vld $vr3, a1, 48 -+ addi.d a2, a2, -64 -+ -+ vst $vr0, a5, 0 -+ vld $vr0, a1, 64 -+ addi.d a1, a1, 64 -+ vst $vr1, a5, 16 -+ -+ vst $vr2, a5, 32 -+ vst $vr3, a5, 48 -+ addi.d a5, a5, 64 -+L(al_less_64): -+ blt a2, t7, L(al_less_32) -+ -+ -+ vld $vr1, a1, 16 -+ addi.d a2, a2, -32 -+ vst $vr0, a5, 0 -+ vld $vr0, a1, 32 -+ -+ addi.d a1, a1, 32 -+ vst $vr1, a5, 16 -+ addi.d a5, a5, 32 -+L(al_less_32): -+ blt a2, t6, L(al_less_16) -+ -+ vst $vr0, a5, 0 -+ vld $vr0, a1, 16 -+ addi.d a5, a5, 16 -+L(al_less_16): -+ vld $vr1, a4, -16 -+ -+ vst $vr0, a5, 0 -+ vst $vr1, a3, -16 -+ jr ra -+ nop -+ -+ -+L(magic_num): -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+L(unaligned): -+ pcaddi t2, -4 -+ bstrins.d a1, zero, 3, 0 -+ vld $vr8, t2, 0 -+ vld $vr0, a1, 0 -+ -+ vld $vr1, a1, 16 -+ addi.d a2, a2, -16 -+ vst $vr2, a0, 0 -+ add.d a5, a0, t0 -+ -+ vreplgr2vr.b $vr9, t1 -+ andi t2, a2, 0x7f -+ vadd.b $vr9, $vr9, $vr8 -+ addi.d a1, a1, 32 -+ -+ -+ beq t2, a2, L(un_less_128) -+ sub.d t3, a2, t2 -+ move a2, t2 -+ add.d a6, a1, t3 -+ -+L(un_loop): -+ vld $vr2, a1, 0 -+ vld $vr3, a1, 16 -+ vld $vr4, a1, 32 -+ vld $vr5, a1, 48 -+ -+ vld $vr6, a1, 64 -+ vld $vr7, a1, 80 -+ vshuf.b $vr8, $vr1, $vr0, $vr9 -+ vld $vr0, a1, 96 -+ -+ vst $vr8, a5, 0 -+ vshuf.b $vr8, $vr2, $vr1, $vr9 -+ vld $vr1, a1, 112 -+ vst $vr8, a5, 16 -+ -+ -+ addi.d a1, a1, 128 -+ vshuf.b $vr2, $vr3, $vr2, $vr9 -+ vshuf.b $vr3, $vr4, $vr3, $vr9 -+ vst $vr2, a5, 32 -+ -+ vshuf.b $vr4, $vr5, $vr4, $vr9 -+ vst $vr3, a5, 48 -+ vshuf.b $vr5, $vr6, $vr5, $vr9 -+ vst $vr4, a5, 64 -+ -+ vshuf.b $vr6, $vr7, $vr6, $vr9 -+ vst $vr5, a5, 
80 -+ vshuf.b $vr7, $vr0, $vr7, $vr9 -+ vst $vr6, a5, 96 -+ -+ vst $vr7, a5, 112 -+ addi.d a5, a5, 128 -+ bne a1, a6, L(un_loop) -+L(un_less_128): -+ blt a2, t8, L(un_less_64) -+ -+ -+ vld $vr2, a1, 0 -+ vld $vr3, a1, 16 -+ vshuf.b $vr4, $vr1, $vr0, $vr9 -+ vld $vr0, a1, 32 -+ -+ vst $vr4, a5, 0 -+ addi.d a2, a2, -64 -+ vshuf.b $vr4, $vr2, $vr1, $vr9 -+ vld $vr1, a1, 48 -+ -+ addi.d a1, a1, 64 -+ vst $vr4, a5, 16 -+ vshuf.b $vr2, $vr3, $vr2, $vr9 -+ vshuf.b $vr3, $vr0, $vr3, $vr9 -+ -+ vst $vr2, a5, 32 -+ vst $vr3, a5, 48 -+ addi.d a5, a5, 64 -+L(un_less_64): -+ blt a2, t7, L(un_less_32) -+ -+ -+ vshuf.b $vr3, $vr1, $vr0, $vr9 -+ vld $vr0, a1, 0 -+ vst $vr3, a5, 0 -+ addi.d a2, a2, -32 -+ -+ vshuf.b $vr3, $vr0, $vr1, $vr9 -+ vld $vr1, a1, 16 -+ addi.d a1, a1, 32 -+ vst $vr3, a5, 16 -+ -+ addi.d a5, a5, 32 -+L(un_less_32): -+ blt a2, t6, L(un_less_16) -+ vshuf.b $vr2, $vr1, $vr0, $vr9 -+ vor.v $vr0, $vr1, $vr1 -+ -+ vld $vr1, a1, 0 -+ vst $vr2, a5, 0 -+ addi.d a5, a5, 16 -+L(un_less_16): -+ vld $vr2, a4, -16 -+ -+ -+ vshuf.b $vr0, $vr1, $vr0, $vr9 -+ vst $vr0, a5, 0 -+ vst $vr2, a3, -16 -+ jr ra -+ -+L(copy_back): -+ addi.d t0, a3, -1 -+ vld $vr2, a4, -16 -+ andi t0, t0, 0xf -+ addi.d t0, t0, 1 # in case a3 is already aligned, load 16bytes and store 16bytes -+ -+ sub.d a4, a4, t0 -+ sub.d a2, a2, t0 -+ andi t1, a4, 0xf -+ bnez t1, L(back_unaligned) -+ -+ vld $vr0, a4, -16 -+ addi.d a2, a2, -16 -+ vst $vr2, a3, -16 -+ andi t2, a2, 0x7f -+ -+ -+ sub.d a3, a3, t0 -+ beq t2, a2, L(back_al_less_128) -+ sub.d t3, a2, t2 -+ move a2, t2 -+ -+ sub.d a6, a4, t3 -+L(back_al_loop): -+ vld $vr1, a4, -32 -+ vld $vr2, a4, -48 -+ vld $vr3, a4, -64 -+ -+ vld $vr4, a4, -80 -+ vld $vr5, a4, -96 -+ vld $vr6, a4, -112 -+ vld $vr7, a4, -128 -+ -+ vst $vr0, a3, -16 -+ vld $vr0, a4, -144 -+ addi.d a4, a4, -128 -+ vst $vr1, a3, -32 -+ -+ -+ vst $vr2, a3, -48 -+ vst $vr3, a3, -64 -+ vst $vr4, a3, -80 -+ vst $vr5, a3, -96 -+ -+ vst $vr6, a3, -112 -+ vst $vr7, a3, -128 -+ addi.d a3, a3, -128 -+ bne a4, a6, L(back_al_loop) -+ -+L(back_al_less_128): -+ blt a2, t8, L(back_al_less_64) -+ vld $vr1, a4, -32 -+ vld $vr2, a4, -48 -+ vld $vr3, a4, -64 -+ -+ addi.d a2, a2, -64 -+ vst $vr0, a3, -16 -+ vld $vr0, a4, -80 -+ addi.d a4, a4, -64 -+ -+ -+ vst $vr1, a3, -32 -+ vst $vr2, a3, -48 -+ vst $vr3, a3, -64 -+ addi.d a3, a3, -64 -+ -+L(back_al_less_64): -+ blt a2, t7, L(back_al_less_32) -+ vld $vr1, a4, -32 -+ addi.d a2, a2, -32 -+ vst $vr0, a3, -16 -+ -+ vld $vr0, a4, -48 -+ vst $vr1, a3, -32 -+ addi.d a3, a3, -32 -+ addi.d a4, a4, -32 -+ -+L(back_al_less_32): -+ blt a2, t6, L(back_al_less_16) -+ vst $vr0, a3, -16 -+ vld $vr0, a4, -32 -+ addi.d a3, a3, -16 -+ -+ -+L(back_al_less_16): -+ vld $vr1, a1, 0 -+ vst $vr0, a3, -16 -+ vst $vr1, a0, 0 -+ jr ra -+ -+L(magic_num_2): -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+L(back_unaligned): -+ pcaddi t2, -4 -+ bstrins.d a4, zero, 3, 0 -+ vld $vr8, t2, 0 -+ vld $vr0, a4, 0 -+ -+ vld $vr1, a4, -16 -+ addi.d a2, a2, -16 -+ vst $vr2, a3, -16 -+ sub.d a3, a3, t0 -+ -+ -+ vreplgr2vr.b $vr9, t1 -+ andi t2, a2, 0x7f -+ vadd.b $vr9, $vr9, $vr8 -+ addi.d a4, a4, -16 -+ -+ beq t2, a2, L(back_un_less_128) -+ sub.d t3, a2, t2 -+ move a2, t2 -+ sub.d a6, a4, t3 -+ -+L(back_un_loop): -+ vld $vr2, a4, -16 -+ vld $vr3, a4, -32 -+ vld $vr4, a4, -48 -+ -+ vld $vr5, a4, -64 -+ vld $vr6, a4, -80 -+ vld $vr7, a4, -96 -+ vshuf.b $vr8, $vr0, $vr1, $vr9 -+ -+ -+ vld $vr0, a4, -112 -+ vst $vr8, a3, -16 -+ vshuf.b $vr8, $vr1, $vr2, $vr9 -+ vld $vr1, a4, -128 -+ -+ vst $vr8, a3, -32 -+ addi.d a4, 
a4, -128 -+ vshuf.b $vr2, $vr2, $vr3, $vr9 -+ vshuf.b $vr3, $vr3, $vr4, $vr9 -+ -+ vst $vr2, a3, -48 -+ vshuf.b $vr4, $vr4, $vr5, $vr9 -+ vst $vr3, a3, -64 -+ vshuf.b $vr5, $vr5, $vr6, $vr9 -+ -+ vst $vr4, a3, -80 -+ vshuf.b $vr6, $vr6, $vr7, $vr9 -+ vst $vr5, a3, -96 -+ vshuf.b $vr7, $vr7, $vr0, $vr9 -+ -+ -+ vst $vr6, a3, -112 -+ vst $vr7, a3, -128 -+ addi.d a3, a3, -128 -+ bne a4, a6, L(back_un_loop) -+ -+L(back_un_less_128): -+ blt a2, t8, L(back_un_less_64) -+ vld $vr2, a4, -16 -+ vld $vr3, a4, -32 -+ vshuf.b $vr4, $vr0, $vr1, $vr9 -+ -+ vld $vr0, a4, -48 -+ vst $vr4, a3, -16 -+ addi.d a2, a2, -64 -+ vshuf.b $vr4, $vr1, $vr2, $vr9 -+ -+ vld $vr1, a4, -64 -+ addi.d a4, a4, -64 -+ vst $vr4, a3, -32 -+ vshuf.b $vr2, $vr2, $vr3, $vr9 -+ -+ -+ vshuf.b $vr3, $vr3, $vr0, $vr9 -+ vst $vr2, a3, -48 -+ vst $vr3, a3, -64 -+ addi.d a3, a3, -64 -+ -+L(back_un_less_64): -+ blt a2, t7, L(back_un_less_32) -+ vshuf.b $vr3, $vr0, $vr1, $vr9 -+ vld $vr0, a4, -16 -+ vst $vr3, a3, -16 -+ -+ addi.d a2, a2, -32 -+ vshuf.b $vr3, $vr1, $vr0, $vr9 -+ vld $vr1, a4, -32 -+ addi.d a4, a4, -32 -+ -+ vst $vr3, a3, -32 -+ addi.d a3, a3, -32 -+L(back_un_less_32): -+ blt a2, t6, L(back_un_less_16) -+ vshuf.b $vr2, $vr0, $vr1, $vr9 -+ -+ -+ vor.v $vr0, $vr1, $vr1 -+ vld $vr1, a4, -16 -+ vst $vr2, a3, -16 -+ addi.d a3, a3, -16 -+ -+L(back_un_less_16): -+ vld $vr2, a1, 0 -+ vshuf.b $vr0, $vr0, $vr1, $vr9 -+ vst $vr0, a3, -16 -+ vst $vr2, a0, 0 -+ -+ jr ra -+END(MEMMOVE_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCPY_NAME) -+libc_hidden_builtin_def (MEMMOVE_NAME) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -new file mode 100644 -index 00000000..27ed0c9c ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -@@ -0,0 +1,478 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMMOVE_NAME __memmove_unaligned -+ -+#define LD_64(reg, n) \ -+ ld.d t0, reg, n; \ -+ ld.d t1, reg, n+8; \ -+ ld.d t2, reg, n+16; \ -+ ld.d t3, reg, n+24; \ -+ ld.d t4, reg, n+32; \ -+ ld.d t5, reg, n+40; \ -+ ld.d t6, reg, n+48; \ -+ ld.d t7, reg, n+56; -+ -+ -+#define ST_64(reg, n) \ -+ st.d t0, reg, n; \ -+ st.d t1, reg, n+8; \ -+ st.d t2, reg, n+16; \ -+ st.d t3, reg, n+24; \ -+ st.d t4, reg, n+32; \ -+ st.d t5, reg, n+40; \ -+ st.d t6, reg, n+48; \ -+ st.d t7, reg, n+56; -+ -+#define LDST_1024 \ -+ LD_64(a1, 0); \ -+ ST_64(a0, 0); \ -+ LD_64(a1, 64); \ -+ ST_64(a0, 64); \ -+ LD_64(a1, 128); \ -+ ST_64(a0, 128); \ -+ LD_64(a1, 192); \ -+ ST_64(a0, 192); \ -+ LD_64(a1, 256); \ -+ ST_64(a0, 256); \ -+ LD_64(a1, 320); \ -+ ST_64(a0, 320); \ -+ LD_64(a1, 384); \ -+ ST_64(a0, 384); \ -+ LD_64(a1, 448); \ -+ ST_64(a0, 448); \ -+ LD_64(a1, 512); \ -+ ST_64(a0, 512); \ -+ LD_64(a1, 576); \ -+ ST_64(a0, 576); \ -+ LD_64(a1, 640); \ -+ ST_64(a0, 640); \ -+ LD_64(a1, 704); \ -+ ST_64(a0, 704); \ -+ LD_64(a1, 768); \ -+ ST_64(a0, 768); \ -+ LD_64(a1, 832); \ -+ ST_64(a0, 832); \ -+ LD_64(a1, 896); \ -+ ST_64(a0, 896); \ -+ LD_64(a1, 960); \ -+ ST_64(a0, 960); -+ -+#define LDST_1024_BACK \ -+ LD_64(a4, -64); \ -+ ST_64(a3, -64); \ -+ LD_64(a4, -128); \ -+ ST_64(a3, -128); \ -+ LD_64(a4, -192); \ -+ ST_64(a3, -192); \ -+ LD_64(a4, -256); \ -+ ST_64(a3, -256); \ -+ LD_64(a4, -320); \ -+ ST_64(a3, -320); \ -+ LD_64(a4, -384); \ -+ ST_64(a3, -384); \ -+ LD_64(a4, -448); \ -+ ST_64(a3, -448); \ -+ LD_64(a4, -512); \ -+ ST_64(a3, -512); \ -+ LD_64(a4, -576); \ -+ 
ST_64(a3, -576); \ -+ LD_64(a4, -640); \ -+ ST_64(a3, -640); \ -+ LD_64(a4, -704); \ -+ ST_64(a3, -704); \ -+ LD_64(a4, -768); \ -+ ST_64(a3, -768); \ -+ LD_64(a4, -832); \ -+ ST_64(a3, -832); \ -+ LD_64(a4, -896); \ -+ ST_64(a3, -896); \ -+ LD_64(a4, -960); \ -+ ST_64(a3, -960); \ -+ LD_64(a4, -1024); \ -+ ST_64(a3, -1024); -+ -+#ifdef ANDROID_CHANGES -+LEAF(MEMMOVE_NAME, 0) -+#else -+LEAF(MEMMOVE_NAME) -+#endif -+ -+//1st var: dest ptr: void *str1 $r4 a0 -+//2nd var: src ptr: void *str2 $r5 a1 -+//3rd var: size_t num -+//t0~t9 registers as temp -+ -+ add.d a4, a1, a2 -+ add.d a3, a0, a2 -+ beq a1, a0, less_1bytes -+ move t8, a0 -+ srai.d a6, a2, 4 #num/16 -+ beqz a6, less_16bytes #num<16 -+ srai.d a6, a2, 6 #num/64 -+ bnez a6, more_64bytes #num>64 -+ srai.d a6, a2, 5 -+ beqz a6, less_32bytes #num<32 -+ -+ ld.d t0, a1, 0 #32. */ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define memmove __redirect_memmove -+# include -+# undef memmove -+ -+# define SYMBOL_NAME memmove -+# include "ifunc-lasx.h" -+ -+libc_ifunc_redirected (__redirect_memmove, __new_memmove, -+ IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (__new_memmove, __GI_memmove, __redirect_memmove) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_memmove, memmove, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-generic.c b/sysdeps/loongarch/lp64/multiarch/memrchr-generic.c -new file mode 100644 -index 00000000..ee7ab39c ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-generic.c -@@ -0,0 +1,9 @@ -+ -+#if IS_IN (libc) -+ -+#define MEMRCHR __memrchr_generic -+ -+#endif -+ -+#include -+weak_alias (__memrchr_generic, __memrchr) -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -new file mode 100644 -index 00000000..57e1035f ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -@@ -0,0 +1,114 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#ifndef MEMRCHR -+#define MEMRCHR __memrchr_lasx -+#endif -+ -+LEAF(MEMRCHR) -+ .align 6 -+ beqz a2, L(ret0) -+ addi.d a2, a2, -1 -+ add.d a3, a0, a2 -+ andi t1, a3, 0x3f -+ -+ bstrins.d a3, zero, 5, 0 -+ addi.d t1, t1, 1 # len for unaligned address -+ xvld $xr0, a3, 0 -+ xvld $xr1, a3, 32 -+ -+ sub.d t2, zero, t1 -+ li.d t3, -1 -+ xvreplgr2vr.b $xr2, a1 -+ andi t4, a0, 0x3f -+ -+ srl.d t2, t3, t2 -+ xvseq.b $xr0, $xr0, $xr2 -+ xvseq.b $xr1, $xr1, $xr2 -+ xvmsknz.b $xr0, $xr0 -+ -+ -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr3, $xr0, 4 -+ xvpickve.w $xr4, $xr1, 4 -+ vilvl.h $vr0, $vr3, $vr0 -+ -+ vilvl.h $vr1, $vr4, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+ and t0, t0, t2 -+ -+ bltu a2, t1, L(end) -+ bnez t0, L(found) -+ bstrins.d a0, zero, 5, 0 -+L(loop): -+ xvld $xr0, a3, -64 -+ -+ xvld $xr1, a3, -32 -+ addi.d a3, a3, -64 -+ xvseq.b $xr0, $xr0, $xr2 -+ xvseq.b $xr1, $xr1, $xr2 -+ -+ -+ beq a0, a3, L(out) -+ xvmax.bu $xr3, $xr0, $xr1 -+ xvseteqz.v $fcc0, $xr3 -+ bcnez $fcc0, L(loop) -+ -+ xvmsknz.b $xr0, $xr0 -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr3, $xr0, 4 -+ xvpickve.w $xr4, $xr1, 4 -+ -+ vilvl.h $vr0, $vr3, $vr0 -+ vilvl.h $vr1, $vr4, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+ -+L(found): -+ addi.d a0, a3, 63 -+ clz.d t1, t0 -+ sub.d a0, a0, t1 -+ jr ra -+ -+ -+L(out): -+ xvmsknz.b $xr0, $xr0 -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr3, $xr0, 4 -+ xvpickve.w $xr4, $xr1, 4 -+ 
-+ vilvl.h $vr0, $vr3, $vr0 -+ vilvl.h $vr1, $vr4, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+ -+L(end): -+ sll.d t2, t3, t4 -+ and t0, t0, t2 -+ addi.d a0, a3, 63 -+ clz.d t1, t0 -+ -+ sub.d a0, a0, t1 -+ maskeqz a0, a0, t0 -+ jr ra -+L(ret0): -+ move a0, zero -+ -+ -+ jr ra -+END(MEMRCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMRCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -new file mode 100644 -index 00000000..eac2059a ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -@@ -0,0 +1,96 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMRCHR __memrchr_lsx -+ -+LEAF(MEMRCHR) -+ .align 6 -+ beqz a2, L(ret0) -+ addi.d a2, a2, -1 -+ add.d a3, a0, a2 -+ andi t1, a3, 0x1f -+ -+ bstrins.d a3, zero, 4, 0 -+ addi.d t1, t1, 1 # len for unaligned address -+ vld $vr0, a3, 0 -+ vld $vr1, a3, 16 -+ -+ sub.d t2, zero, t1 -+ li.d t3, -1 -+ vreplgr2vr.b $vr2, a1 -+ andi t4, a0, 0x1f -+ -+ srl.d t2, t3, t2 -+ vseq.b $vr0, $vr0, $vr2 -+ vseq.b $vr1, $vr1, $vr2 -+ vmsknz.b $vr0, $vr0 -+ -+ -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+ and t0, t0, t2 -+ -+ bltu a2, t1, L(end) -+ bnez t0, L(found) -+ bstrins.d a0, zero, 4, 0 -+L(loop): -+ vld $vr0, a3, -32 -+ -+ vld $vr1, a3, -16 -+ addi.d a3, a3, -32 -+ vseq.b $vr0, $vr0, $vr2 -+ vseq.b $vr1, $vr1, $vr2 -+ -+ beq a0, a3, L(out) -+ vmax.bu $vr3, $vr0, $vr1 -+ vseteqz.v $fcc0, $vr3 -+ bcnez $fcc0, L(loop) -+ -+ -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+ -+L(found): -+ addi.d a0, a3, 31 -+ clz.w t1, t0 -+ sub.d a0, a0, t1 -+ jr ra -+ -+L(out): -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+ -+L(end): -+ sll.d t2, t3, t4 -+ and t0, t0, t2 -+ addi.d a0, a3, 31 -+ clz.w t1, t0 -+ -+ -+ sub.d a0, a0, t1 -+ maskeqz a0, a0, t0 -+ jr ra -+L(ret0): -+ move a0, zero -+ -+ jr ra -+END(MEMRCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMRCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr.c b/sysdeps/loongarch/lp64/multiarch/memrchr.c -new file mode 100644 -index 00000000..675c3115 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr.c -@@ -0,0 +1,39 @@ -+/* Multiple versions of memrchr. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. 
*/ -+#if IS_IN (libc) -+# define memrchr __redirect_memrchr -+# include -+# undef memrchr -+ -+# define SYMBOL_NAME memrchr -+# include "ifunc-memrchr.h" -+ -+libc_ifunc_redirected (__redirect_memrchr, __new_memrchr, -+ IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (__new_memrchr, __GI_memrchr, __redirect_memrchr) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_memrchr, memrchr, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S -new file mode 100644 -index 00000000..da2f5ada ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S -@@ -0,0 +1,9 @@ -+ -+#if IS_IN (libc) -+ -+#define MEMSET_NAME __memset_aligned -+ -+#endif -+ -+#include "../memset.S" -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -new file mode 100644 -index 00000000..1bd2dda9 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -@@ -0,0 +1,132 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMSET __memset_lasx -+ -+LEAF(MEMSET) -+ .align 6 -+ li.d t1, 32 -+ move a3, a0 -+ xvreplgr2vr.b $xr0, a1 -+ add.d a4, a0, a2 -+ -+ bgeu t1, a2, L(less_32bytes) # len <= 32 -+ li.d t3, 128 -+ li.d t2, 64 -+ blt t3, a2, L(long_bytes) # len > 128 -+ -+L(less_128bytes): -+ bgeu t2, a2, L(less_64bytes) # len <= 64 -+ xvst $xr0, a3, 0 -+ xvst $xr0, a3, 32 -+ xvst $xr0, a4, -32 -+ -+ xvst $xr0, a4, -64 -+ jr ra -+L(less_64bytes): -+ xvst $xr0, a3, 0 -+ xvst $xr0, a4, -32 -+ -+ -+ jr ra -+L(less_32bytes): -+ srli.d t0, a2, 4 -+ beqz t0, L(less_16bytes) -+ vst $vr0, a3, 0 -+ -+ vst $vr0, a4, -16 -+ jr ra -+L(less_16bytes): -+ srli.d t0, a2, 3 -+ beqz t0, L(less_8bytes) -+ -+ vstelm.d $vr0, a3, 0, 0 -+ vstelm.d $vr0, a4, -8, 0 -+ jr ra -+L(less_8bytes): -+ srli.d t0, a2, 2 -+ -+ beqz t0, L(less_4bytes) -+ vstelm.w $vr0, a3, 0, 0 -+ vstelm.w $vr0, a4, -4, 0 -+ jr ra -+ -+ -+L(less_4bytes): -+ srli.d t0, a2, 1 -+ beqz t0, L(less_2bytes) -+ vstelm.h $vr0, a3, 0, 0 -+ vstelm.h $vr0, a4, -2, 0 -+ -+ jr ra -+L(less_2bytes): -+ beqz a2, L(less_1bytes) -+ st.b a1, a3, 0 -+L(less_1bytes): -+ jr ra -+ -+L(long_bytes): -+ xvst $xr0, a3, 0 -+ bstrins.d a3, zero, 4, 0 -+ addi.d a3, a3, 32 -+ sub.d a2, a4, a3 -+ -+ andi t0, a2, 0xff -+ beq t0, a2, L(long_end) -+ move a2, t0 -+ sub.d t0, a4, t0 -+ -+ -+L(loop_256): -+ xvst $xr0, a3, 0 -+ xvst $xr0, a3, 32 -+ xvst $xr0, a3, 64 -+ xvst $xr0, a3, 96 -+ -+ xvst $xr0, a3, 128 -+ xvst $xr0, a3, 160 -+ xvst $xr0, a3, 192 -+ xvst $xr0, a3, 224 -+ -+ addi.d a3, a3, 256 -+ bne a3, t0, L(loop_256) -+L(long_end): -+ bltu a2, t3, L(end_less_128) -+ addi.d a2, a2, -128 -+ -+ xvst $xr0, a3, 0 -+ xvst $xr0, a3, 32 -+ xvst $xr0, a3, 64 -+ xvst $xr0, a3, 96 -+ -+ -+ addi.d a3, a3, 128 -+L(end_less_128): -+ bltu a2, t2, L(end_less_64) -+ addi.d a2, a2, -64 -+ xvst $xr0, a3, 0 -+ -+ xvst $xr0, a3, 32 -+ addi.d a3, a3, 64 -+L(end_less_64): -+ bltu a2, t1, L(end_less_32) -+ xvst $xr0, a3, 0 -+ -+L(end_less_32): -+ xvst $xr0, a4, -32 -+ jr ra -+END(MEMSET) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMSET) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -new file mode 100644 -index 00000000..a3bbadb7 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -@@ -0,0 +1,125 @@ -+#ifdef _LIBC -+#include -+#include 
-+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMSET __memset_lsx -+ -+LEAF(MEMSET) -+ .align 6 -+ li.d t1, 16 -+ move a3, a0 -+ vreplgr2vr.b $vr0, a1 -+ add.d a4, a0, a2 -+ -+ bgeu t1, a2, L(less_16bytes) # len <= 16 -+ li.d t3, 64 -+ li.d t2, 32 -+ bgeu a2, t3, L(long_bytes) # len > 64 -+ -+L(less_64bytes): -+ bgeu t2, a2, L(less_32bytes) # len <= 32 -+ vst $vr0, a3, 0 -+ vst $vr0, a3, 16 -+ vst $vr0, a4, -32 -+ -+ vst $vr0, a4, -16 -+ jr ra -+L(less_32bytes): -+ vst $vr0, a3, 0 -+ vst $vr0, a4, -16 -+ -+ -+ jr ra -+L(less_16bytes): -+ srli.d t0, a2, 3 -+ beqz t0, L(less_8bytes) -+ vstelm.d $vr0, a3, 0, 0 -+ -+ vstelm.d $vr0, a4, -8, 0 -+ jr ra -+L(less_8bytes): -+ srli.d t0, a2, 2 -+ beqz t0, L(less_4bytes) -+ -+ vstelm.w $vr0, a3, 0, 0 -+ vstelm.w $vr0, a4, -4, 0 -+ jr ra -+L(less_4bytes): -+ srli.d t0, a2, 1 -+ -+ beqz t0, L(less_2bytes) -+ vstelm.h $vr0, a3, 0, 0 -+ vstelm.h $vr0, a4, -2, 0 -+ jr ra -+ -+ -+L(less_2bytes): -+ beqz a2, L(less_1bytes) -+ vstelm.b $vr0, a3, 0, 0 -+L(less_1bytes): -+ jr ra -+L(long_bytes): -+ vst $vr0, a3, 0 -+ -+ bstrins.d a3, zero, 3, 0 -+ addi.d a3, a3, 16 -+ sub.d a2, a4, a3 -+ andi t0, a2, 0x7f -+ -+ beq t0, a2, L(long_end) -+ move a2, t0 -+ sub.d t0, a4, t0 -+ -+L(loop_128): -+ vst $vr0, a3, 0 -+ -+ vst $vr0, a3, 16 -+ vst $vr0, a3, 32 -+ vst $vr0, a3, 48 -+ vst $vr0, a3, 64 -+ -+ -+ vst $vr0, a3, 80 -+ vst $vr0, a3, 96 -+ vst $vr0, a3, 112 -+ addi.d a3, a3, 128 -+ -+ bne a3, t0, L(loop_128) -+L(long_end): -+ bltu a2, t3, L(end_less_64) -+ addi.d a2, a2, -64 -+ vst $vr0, a3, 0 -+ -+ vst $vr0, a3, 16 -+ vst $vr0, a3, 32 -+ vst $vr0, a3, 48 -+ addi.d a3, a3, 64 -+ -+L(end_less_64): -+ bltu a2, t2, L(end_less_32) -+ addi.d a2, a2, -32 -+ vst $vr0, a3, 0 -+ vst $vr0, a3, 16 -+ -+ addi.d a3, a3, 32 -+L(end_less_32): -+ bltu a2, t1, L(end_less_16) -+ vst $vr0, a3, 0 -+ -+L(end_less_16): -+ vst $vr0, a4, -16 -+ jr ra -+END(MEMSET) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMSET) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -new file mode 100644 -index 00000000..16ff2ef7 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -@@ -0,0 +1,177 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMSET_NAME __memset_unaligned -+ -+#define ST_128(n) \ -+ st.d a1, a0, n; \ -+ st.d a1, a0, n+8 ; \ -+ st.d a1, a0, n+16 ; \ -+ st.d a1, a0, n+24 ; \ -+ st.d a1, a0, n+32 ; \ -+ st.d a1, a0, n+40 ; \ -+ st.d a1, a0, n+48 ; \ -+ st.d a1, a0, n+56 ; \ -+ st.d a1, a0, n+64 ; \ -+ st.d a1, a0, n+72 ; \ -+ st.d a1, a0, n+80 ; \ -+ st.d a1, a0, n+88 ; \ -+ st.d a1, a0, n+96 ; \ -+ st.d a1, a0, n+104; \ -+ st.d a1, a0, n+112; \ -+ st.d a1, a0, n+120; \ -+ -+//1st var: void *str $4 a0 -+//2nd var: int val $5 a1 -+//3rd var: size_t num $6 a2 -+ -+#ifdef ANDROID_CHANGES -+LEAF(MEMSET_NAME, 0) -+#else -+LEAF(MEMSET_NAME) -+#endif -+ -+ .align 6 -+ bstrins.d a1, a1, 15, 8 -+ add.d t7, a0, a2 -+ bstrins.d a1, a1, 31, 16 -+ move t0, a0 -+ bstrins.d a1, a1, 63, 32 -+ srai.d t8, a2, 4 #num/16 -+ beqz t8, less_16bytes #num<16 -+ srai.d t8, a2, 6 #num/64 -+ bnez t8, more_64bytes #num>64 -+ srai.d t8, a2, 5 #num/32 -+ beqz t8, less_32bytes #num<32 -+ st.d a1, a0, 0 #32. */ -+ -+/* Define multiple versions only for the definition in libc. 
*/ -+#if IS_IN (libc) -+# define memset __redirect_memset -+# include -+# undef memset -+ -+# define SYMBOL_NAME memset -+# include "ifunc-lasx.h" -+ -+libc_ifunc_redirected (__redirect_memset, __new_memset, -+ IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (__new_memset, __GI_memset, __redirect_memset) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_memset, memset, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S -new file mode 100644 -index 00000000..0b46b4ca ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S -@@ -0,0 +1,7 @@ -+ -+#if IS_IN (libc) -+#define RAWMEMCHR_NAME __rawmemchr_aligned -+#endif -+ -+#include "../rawmemchr.S" -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -new file mode 100644 -index 00000000..bff92969 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -@@ -0,0 +1,51 @@ -+#include -+#include -+ -+#if IS_IN (libc) -+ -+# define RAWMEMCHR __rawmemchr_lasx -+ -+LEAF(RAWMEMCHR) -+ .align 6 -+ move a2, a0 -+ bstrins.d a0, zero, 4, 0 -+ xvld $xr0, a0, 0 -+ xvreplgr2vr.b $xr1, a1 -+ -+ xvseq.b $xr0, $xr0, $xr1 -+ xvmsknz.b $xr0, $xr0 -+ xvpickve.w $xr2, $xr0, 4 -+ vilvl.h $vr0, $vr2, $vr0 -+ -+ movfr2gr.s t0, $f0 -+ sra.w t0, t0, a2 -+ beqz t0, L(loop) -+ ctz.w t0, t0 -+ -+ add.d a0, a2, t0 -+ jr ra -+ nop -+ nop -+ -+L(loop): -+ xvld $xr0, a0, 32 -+ addi.d a0, a0, 32 -+ xvseq.b $xr0, $xr0, $xr1 -+ xvseteqz.v $fcc0, $xr0 -+ -+ bcnez $fcc0, L(loop) -+ xvmsknz.b $xr0, $xr0 -+ xvpickve.w $xr1, $xr0, 4 -+ vilvl.h $vr0, $vr1, $vr0 -+ -+ movfr2gr.s t0, $f0 -+ ctz.w t0, t0 -+ add.d a0, a0, t0 -+ jr ra -+END(RAWMEMCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (RAWMEMCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -new file mode 100644 -index 00000000..11a19c1d ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -@@ -0,0 +1,56 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+# define RAWMEMCHR __rawmemchr_lsx -+ -+LEAF(RAWMEMCHR) -+ .align 6 -+ move a2, a0 -+ bstrins.d a0, zero, 4, 0 -+ vld $vr0, a0, 0 -+ vld $vr1, a0, 16 -+ -+ vreplgr2vr.b $vr2, a1 -+ vseq.b $vr0, $vr0, $vr2 -+ vseq.b $vr1, $vr1, $vr2 -+ vmsknz.b $vr0, $vr0 -+ -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+ sra.w t0, t0, a2 -+ -+ beqz t0, L(loop) -+ ctz.w t0, t0 -+ add.d a0, a2, t0 -+ jr ra -+ -+ -+L(loop): -+ vld $vr0, a0, 32 -+ addi.d a0, a0, 16 -+ vseq.b $vr0, $vr0, $vr2 -+ vseteqz.v $fcc0, $vr0 -+ -+ bcnez $fcc0, L(loop) -+ addi.d a0, a0, 16 -+ vfrstpi.b $vr0, $vr0, 0 -+ vpickve2gr.bu t0, $vr0, 0 -+ -+ add.d a0, a0, t0 -+ jr ra -+END(RAWMEMCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (RAWMEMCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr.c b/sysdeps/loongarch/lp64/multiarch/rawmemchr.c -new file mode 100644 -index 00000000..1e514139 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr.c -@@ -0,0 +1,37 @@ -+/* Multiple versions of rawmemchr. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#if IS_IN (libc) -+# define rawmemchr __redirect_rawmemchr -+# define __rawmemchr __redirect___rawmemchr -+# include -+# undef rawmemchr -+# undef __rawmemchr -+ -+# define SYMBOL_NAME rawmemchr -+# include "ifunc-memchr.h" -+ -+libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr, -+ IFUNC_SELECTOR ()); -+weak_alias (__rawmemchr, rawmemchr) -+# ifdef SHARED -+__hidden_ver1 (__rawmemchr, __GI___rawmemchr, __redirect___rawmemchr) -+ __attribute__((visibility ("hidden"))); -+# endif -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S -new file mode 100644 -index 00000000..3d134e3f ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S -@@ -0,0 +1,8 @@ -+ -+#if IS_IN (libc) -+ -+#define STPCPY_NAME __stpcpy_aligned -+ -+#endif -+ -+#include "../stpcpy.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -new file mode 100644 -index 00000000..bf0eed43 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -@@ -0,0 +1,178 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STPCPY __stpcpy_lsx -+ -+L(magic_num): -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+ENTRY_NO_ALIGN(STPCPY) -+ pcaddi t0, -4 -+ andi a4, a1, 0xf -+ vld $vr1, t0, 0 -+ beqz a4, L(load_start) -+ -+ xor t0, a1, a4 -+ vld $vr0, t0, 0 -+ vreplgr2vr.b $vr2, a4 -+ vadd.b $vr2, $vr2, $vr1 -+ -+ vshuf.b $vr0, $vr2, $vr0, $vr2 -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(end) -+L(load_start): -+ vld $vr0, a1, 0 -+ -+ -+ li.d t1, 16 -+ andi a3, a0, 0xf -+ vsetanyeqz.b $fcc0, $vr0 -+ sub.d t0, t1, a3 -+ -+ bcnez $fcc0, L(end) -+ add.d a1, a1, t0 -+ vst $vr0, a0, 0 -+ add.d a0, a0, t0 -+ -+ bne a3, a4, L(unaligned) -+ vld $vr0, a1, 0 -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(end) -+ -+L(loop): -+ vst $vr0, a0, 0 -+ vld $vr0, a1, 16 -+ addi.d a0, a0, 16 -+ addi.d a1, a1, 16 -+ -+ -+ vsetanyeqz.b $fcc0, $vr0 -+ bceqz $fcc0, L(loop) -+ vmsknz.b $vr1, $vr0 -+ movfr2gr.s t0, $f1 -+ -+ cto.w t0, t0 -+ add.d a1, a1, t0 -+ vld $vr0, a1, -15 -+ add.d a0, a0, t0 -+ -+ vst $vr0, a0, -15 -+ jr ra -+L(end): -+ vseqi.b $vr1, $vr0, 0 -+ vfrstpi.b $vr1, $vr1, 0 -+ -+ vpickve2gr.bu t0, $vr1, 0 -+ addi.d t0, t0, 1 -+L(end_16): -+ andi t1, t0, 16 -+ beqz t1, L(end_8) -+ -+ -+ vst $vr0, a0, 0 -+ addi.d a0, a0, 15 -+ jr ra -+L(end_8): -+ andi t2, t0, 8 -+ -+ andi t3, t0, 4 -+ andi t4, t0, 2 -+ andi t5, t0, 1 -+ beqz t2, L(end_4) -+ -+ vstelm.d $vr0, a0, 0, 0 -+ addi.d a0, a0, 8 -+ vbsrl.v $vr0, $vr0, 8 -+L(end_4): -+ beqz t3, L(end_2) -+ -+ vstelm.w $vr0, a0, 0, 0 -+ addi.d a0, a0, 4 -+ vbsrl.v $vr0, $vr0, 4 -+L(end_2): -+ beqz t4, L(end_1) -+ -+ -+ vstelm.h $vr0, a0, 0, 0 -+ addi.d a0, a0, 2 -+ vbsrl.v $vr0, $vr0, 2 -+L(end_1): -+ beqz t5, 
L(out) -+ -+ vstelm.b $vr0, a0, 0, 0 -+ addi.d a0, a0, 1 -+L(out): -+ addi.d a0, a0, -1 -+ jr ra -+ -+ nop -+ nop -+L(unaligned): -+ andi a3, a1, 0xf -+ bstrins.d a1, zero, 3, 0 -+ -+ vld $vr2, a1, 0 -+ vreplgr2vr.b $vr3, a3 -+ vslt.b $vr4, $vr1, $vr3 -+ vor.v $vr0, $vr2, $vr4 -+ -+ -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(un_first_end) -+ vld $vr0, a1, 16 -+ vadd.b $vr3, $vr3, $vr1 -+ -+ addi.d a1, a1, 16 -+ vshuf.b $vr4, $vr0, $vr2, $vr3 -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(un_end) -+ -+L(un_loop): -+ vor.v $vr2, $vr0, $vr0 -+ vld $vr0, a1, 16 -+ vst $vr4, a0, 0 -+ addi.d a1, a1, 16 -+ -+ addi.d a0, a0, 16 -+ vshuf.b $vr4, $vr0, $vr2, $vr3 -+ vsetanyeqz.b $fcc0, $vr0 -+ bceqz $fcc0, L(un_loop) -+ -+ -+L(un_end): -+ vsetanyeqz.b $fcc0, $vr4 -+ bcnez $fcc0, 1f -+ vst $vr4, a0, 0 -+1: -+ vmsknz.b $vr1, $vr0 -+ -+ movfr2gr.s t0, $f1 -+ cto.w t0, t0 -+ add.d a1, a1, t0 -+ vld $vr0, a1, -15 -+ -+ add.d a0, a0, t0 -+ sub.d a0, a0, a3 -+ vst $vr0, a0, 1 -+ addi.d a0, a0, 16 -+ -+ jr ra -+L(un_first_end): -+ addi.d a0, a0, -16 -+ b 1b -+END(STPCPY) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STPCPY) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy.c b/sysdeps/loongarch/lp64/multiarch/stpcpy.c -new file mode 100644 -index 00000000..531a3ed6 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy.c -@@ -0,0 +1,43 @@ -+/* Multiple versions of stpcpy. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2023 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. 
*/ -+#if IS_IN (libc) -+# define stpcpy __redirect_stpcpy -+# define __stpcpy __redirect___stpcpy -+# define NO_MEMPCPY_STPCPY_REDIRECT -+# define __NO_STRING_INLINES -+# include -+# undef stpcpy -+# undef __stpcpy -+ -+# define SYMBOL_NAME stpcpy -+# include "ifunc-stpcpy.h" -+ -+libc_ifunc_redirected (__redirect_stpcpy, __stpcpy, IFUNC_SELECTOR ()); -+ -+weak_alias (__stpcpy, stpcpy) -+# ifdef SHARED -+__hidden_ver1 (__stpcpy, __GI___stpcpy, __redirect___stpcpy) -+ __attribute__ ((visibility ("hidden"))); -+__hidden_ver1 (stpcpy, __GI_stpcpy, __redirect_stpcpy) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+#endif -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S -new file mode 100644 -index 00000000..92365658 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S -@@ -0,0 +1,10 @@ -+ -+#if IS_IN (libc) -+ -+#define STRCHR_NAME __strchr_aligned -+ -+#endif -+ -+#include "../strchr.S" -+ -+weak_alias (STRCHR_NAME, index) -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -new file mode 100644 -index 00000000..ea7eb9d2 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -@@ -0,0 +1,81 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#ifndef AS_STRCHRNUL -+#define STRCHR __strchr_lasx -+#endif -+ -+LEAF(STRCHR) -+ .align 6 -+ andi t1, a0, 0x1f -+ bstrins.d a0, zero, 4, 0 -+ xvld $xr0, a0, 0 -+ li.d t2, -1 -+ -+ xvreplgr2vr.b $xr1, a1 -+ sll.d t1, t2, t1 -+ xvxor.v $xr2, $xr0, $xr1 -+ xvmin.bu $xr0, $xr0, $xr2 -+ -+ xvmsknz.b $xr0, $xr0 -+ xvpickve.w $xr3, $xr0, 4 -+ vilvl.h $vr0, $vr3, $vr0 -+ movfr2gr.s t0, $f0 -+ -+ orn t0, t0, t1 -+ bne t0, t2, L(end) -+ addi.d a0, a0, 32 -+ nop -+ -+ -+L(loop): -+ xvld $xr0, a0, 0 -+ xvxor.v $xr2, $xr0, $xr1 -+ xvmin.bu $xr0, $xr0, $xr2 -+ xvsetanyeqz.b $fcc0, $xr0 -+ -+ bcnez $fcc0, L(loop_end) -+ xvld $xr0, a0, 32 -+ addi.d a0, a0, 64 -+ xvxor.v $xr2, $xr0, $xr1 -+ -+ xvmin.bu $xr0, $xr0, $xr2 -+ xvsetanyeqz.b $fcc0, $xr0 -+ bceqz $fcc0, L(loop) -+ addi.d a0, a0, -32 -+ -+L(loop_end): -+ xvmsknz.b $xr0, $xr0 -+ xvpickve.w $xr1, $xr0, 4 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+ -+ -+L(end): -+ cto.w t0, t0 -+ add.d a0, a0, t0 -+#ifndef AS_STRCHRNUL -+ vreplgr2vr.b $vr0, t0 -+ xvpermi.q $xr3, $xr2, 1 -+ -+ vshuf.b $vr0, $vr3, $vr2, $vr0 -+ vpickve2gr.bu t0, $vr0, 0 -+ masknez a0, a0, t0 -+#endif -+ jr ra -+ -+END(STRCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def(STRCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -new file mode 100644 -index 00000000..64ead00b ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -@@ -0,0 +1,61 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#ifndef AS_STRCHRNUL -+#define STRCHR __strchr_lsx -+#endif -+ -+LEAF(STRCHR) -+ .align 6 -+ andi t1, a0, 0xf -+ bstrins.d a0, zero, 3, 0 -+ vld $vr0, a0, 0 -+ li.d t2, -1 -+ -+ vreplgr2vr.b $vr1, a1 -+ sll.d t3, t2, t1 -+ vxor.v $vr2, $vr0, $vr1 -+ vmin.bu $vr0, $vr0, $vr2 -+ -+ vmsknz.b $vr0, $vr0 -+ movfr2gr.s t0, $f0 -+ ext.w.h t0, t0 -+ orn t0, t0, t3 -+ -+ beq t0, t2, L(loop) -+L(found): -+ cto.w t0, t0 -+ add.d a0, a0, t0 -+#ifndef AS_STRCHRNUL -+ vreplve.b $vr2, $vr2, t0 -+ vpickve2gr.bu t1, $vr2, 0 -+ masknez a0, a0, t1 -+#endif -+ jr ra -+ -+ -+L(loop): -+ vld 
$vr0, a0, 16 -+ addi.d a0, a0, 16 -+ vxor.v $vr2, $vr0, $vr1 -+ vmin.bu $vr0, $vr0, $vr2 -+ -+ vsetanyeqz.b $fcc0, $vr0 -+ bceqz $fcc0, L(loop) -+ vmsknz.b $vr0, $vr0 -+ movfr2gr.s t0, $f0 -+ -+ b L(found) -+END(STRCHR) -+ -+libc_hidden_builtin_def (STRCHR) -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -new file mode 100644 -index 00000000..1d5e56c5 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -@@ -0,0 +1,132 @@ -+/* Copyright 2016 Loongson Technology Corporation Limited */ -+ -+/* Author: songyuekun songyuekun@loongson.cn */ -+ -+/* basic algorithm : -+ +. use ld.d and mask for the first 8 bytes or less; -+ +. build a1 with 8c with dins; -+ +. use xor from a1 and v0 to check if is found; -+ +. if (v0 - 0x0101010101010101) & (~(v0 | 0x7f7f7f7f7f7f7f7f)!= 0, v0 has -+ one byte is \0, else has no \0 -+*/ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+ -+#if IS_IN (libc) -+ -+#define L_ADDIU addi.d -+#define L_ADDU add.d -+#define L_SUBU sub.d -+ -+#define MOVN(rd,rs,rt) \ -+ maskeqz t6, rs, rt;\ -+ masknez rd, rd, rt;\ -+ or rd, rd, t6 -+ -+#define MOVN2(rd,rt) \ -+ masknez rd, rd, rt;\ -+ or rd, rd, rt -+ -+#define STRCHR_NAME __strchr_unaligned -+ -+/* char * strchr (const char *s1, int c); */ -+LEAF(STRCHR_NAME) -+ .align 6 -+ -+ li.w t4, 0x7 -+ lu12i.w a2, 0x01010 -+ bstrins.d a1, a1, 15, 8 -+ andi t0, a0, 0x7 -+ -+ ori a2, a2, 0x101 -+ andn t4, a0, t4 -+ slli.w t1, t0, 3 -+ -+ ld.d t4, t4, 0 -+ -+ -+ nor t8, zero, zero -+ bstrins.d a1, a1, 31, 16 -+ srl.d t4, t4, t1 -+ -+ bstrins.d a1, a1, 63, 32 -+ bstrins.d a2, a2, 63, 32 -+ srl.d a7, t8, t1 -+ -+ li.w t1, 8 -+ nor t8, a7, zero -+ slli.d a3, a2, 7 -+ or t5, t8, t4 -+ and t3, a7, a1 -+ -+ sub.w t1, t1, t0 -+ nor a3, a3, zero -+ xor t2, t5, t3 -+ sub.d a7, t5, a2 -+ nor a6, t5, a3 -+ -+ sub.d a5, t2, a2 -+ nor a4, t2, a3 -+ -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ bnez a7, L(_mc8_a) -+ -+ L_ADDU a0, a0, t1 -+L(_aloop): -+ ld.d t4, a0, 0 -+ -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ nor a6, t4, a3 -+ sub.d a5, t2, a2 -+ -+ nor a4, t2, a3 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ bnez a7, L(_mc8_a) -+ -+ ld.d t4, a0, 8 -+ L_ADDIU a0, a0, 16 -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ nor a6, t4, a3 -+ sub.d a5, t2, a2 -+ -+ nor a4, t2, a3 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ beqz a7, L(_aloop) -+ -+ L_ADDIU a0, a0, -8 -+L(_mc8_a): -+ -+ ctz.d t0, a5 -+ ctz.d t2, a6 -+ -+ srli.w t0, t0, 3 -+ srli.w t2, t2, 3 -+ sltu t1, t2, t0 -+ L_ADDU v0, a0, t0 -+ masknez v0, v0, t1 -+ jr ra -+END(STRCHR_NAME) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCHR_NAME) -+#endif -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr.c b/sysdeps/loongarch/lp64/multiarch/strchr.c -new file mode 100644 -index 00000000..c6b069ed ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchr.c -@@ -0,0 +1,39 @@ -+/* Multiple versions of strchr. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. 
-+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define strchr __redirect_strchr -+# include -+# undef strchr -+ -+# define SYMBOL_NAME strchr -+# include "ifunc-lasx.h" -+ -+libc_ifunc_redirected (__redirect_strchr, __new_strchr, -+ IFUNC_SELECTOR ()); -+weak_alias(__new_strchr, index) -+# ifdef SHARED -+__hidden_ver1 (__new_strchr, __GI_strchr, __redirect_strchr) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_strchr, strchr, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S -new file mode 100644 -index 00000000..4fa63ecc ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S -@@ -0,0 +1,8 @@ -+ -+#if IS_IN (libc) -+ -+#define STRCHRNUL_NAME __strchrnul_aligned -+ -+#endif -+ -+#include "../strchrnul.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S -new file mode 100644 -index 00000000..f8765413 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S -@@ -0,0 +1,4 @@ -+#define STRCHR __strchrnul_lasx -+#define AS_STRCHRNUL -+#include "strchr-lasx.S" -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S -new file mode 100644 -index 00000000..d363f11f ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S -@@ -0,0 +1,3 @@ -+#define STRCHR __strchrnul_lsx -+#define AS_STRCHRNUL -+#include "strchr-lsx.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -new file mode 100644 -index 00000000..6338d005 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -@@ -0,0 +1,146 @@ -+/* Copyright 2016 Loongson Technology Corporation Limited. */ -+ -+/* Author: Songyuekun songyuekun@loongson.cn -+ * ISA: MIPS64R2 -+ * ABI: N64 -+ * basic algorithm : -+ +. use ld.d and mask for the first 8 bytes or less; -+ +. build a1 with 8c with dins; -+ +. use xor from a1 and v0 to check if is found; -+ +. 
if (v0 - 0x0101010101010101) & (~(v0 | 0x7f7f7f7f7f7f7f7f)!= 0, v0 has -+ one byte is \0, else has no \0 -+*/ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define L_ADDIU addi.d -+#define L_ADDU add.d -+#define L_SUBU sub.d -+ -+#define STRCHRNUL_NAME __strchrnul_unaligned -+ -+#define MOVN(rd,rs,rt) \ -+ maskeqz t6, rs, rt;\ -+ masknez rd, rd, rt;\ -+ or rd, rd, t6 -+ -+#define MOVZ(rd,rs,rt) \ -+ masknez t6, rs, rt;\ -+ maskeqz rd, rd, rt;\ -+ or rd, rd, t6 -+ -+ -+#define MOVN2(rd,rt) \ -+ masknez rd, rd, rt;\ -+ or rd, rd, rt -+ -+ -+/* char * strchrnul (const char *s1, int c); */ -+ -+LEAF(STRCHRNUL_NAME) -+ .align 6 -+ li.w t4, 0x7 -+ lu12i.w a2, 0x01010 -+ bstrins.d a1, a1, 15, 8 -+ andi t0, a0, 0x7 -+ -+ ori a2, a2, 0x101 -+ andn t4, a0, t4 -+ slli.w t1, t0, 3 -+ ld.d t4, t4, 0 -+ -+ -+ nor t8, zero, zero -+ bstrins.d a1, a1, 31, 16 -+ srl.d t4, t4, t1 -+ -+ preld 0, a0, 32 -+ bstrins.d a1, a1, 63, 32 -+ bstrins.d a2, a2, 63, 32 -+ srl.d a7, t8, t1 -+ -+ nor t8, a7, zero -+ slli.d a3, a2, 7 -+ or t5, t8, t4 -+ and t3, a7, a1 -+ -+ nor a3, a3, zero -+ xor t2, t5, t3 -+ sub.d a7, t5, a2 -+ nor a6, t5, a3 -+ -+ li.w t1, 8 -+ sub.d a5, t2, a2 -+ nor a4, t2, a3 -+ -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ bnez a7, L(_mc8_a) -+ -+ -+ sub.w t1, t1, t0 -+ L_ADDU a0, a0, t1 -+L(_aloop): -+ ld.d t4, a0, 0 -+ -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ nor a6, t4, a3 -+ sub.d a5, t2, a2 -+ -+ nor a4, t2, a3 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ -+ or a7, a6, a5 -+ bnez a7, L(_mc8_a) -+ -+ ld.d t4, a0, 8 -+ L_ADDIU a0, a0, 16 -+ -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ nor a6, t4, a3 -+ sub.d a5, t2, a2 -+ -+ nor a4, t2, a3 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ -+ or a7, a6, a5 -+ beqz a7, L(_aloop) -+ -+ L_ADDIU a0, a0, -8 -+L(_mc8_a): -+ -+ ctz.d t0, a5 -+ ctz.d t2, a6 -+ -+ srli.w t0, t0, 3 -+ srli.w t2, t2, 3 -+ slt t1, t0, t2 -+ -+ MOVZ(t0,t2,t1) -+ -+ L_ADDU v0, a0, t0 -+ jr ra -+END(STRCHRNUL_NAME) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+weak_alias(STRCHRNUL_NAME, strchrnul) -+libc_hidden_builtin_def (STRCHRNUL_NAME) -+#endif -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul.c b/sysdeps/loongarch/lp64/multiarch/strchrnul.c -new file mode 100644 -index 00000000..53a7273a ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul.c -@@ -0,0 +1,34 @@ -+/* Multiple versions of strchrnul. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. 
*/ -+#if IS_IN (libc) -+# define strchrnul __redirect_strchrnul -+# define __strchrnul __redirect___strchrnul -+# include -+# undef __strchrnul -+# undef strchrnul -+ -+# define SYMBOL_NAME strchrnul -+# include "ifunc-lasx.h" -+ -+libc_ifunc_redirected (__redirect_strchrnul, __strchrnul, -+ IFUNC_SELECTOR ()); -+weak_alias (__strchrnul, strchrnul) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S -new file mode 100644 -index 00000000..f84f52b8 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S -@@ -0,0 +1,8 @@ -+ -+#if IS_IN (libc) -+ -+#define STRCMP_NAME __strcmp_aligned -+ -+#endif -+ -+#include "../strcmp.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -new file mode 100644 -index 00000000..226b1d63 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -@@ -0,0 +1,147 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRCMP __strcmp_lsx -+ -+/* int strcmp (const char *s1, const char *s2); */ -+L(magic_num): -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+ -+ENTRY_NO_ALIGN(STRCMP) -+ pcaddi t0, -4 -+ andi a2, a0, 0xf -+ vld $vr2, t0, 0 -+ andi a3, a1, 0xf -+ -+ bne a2, a3, L(unaligned) -+ bstrins.d a0, zero, 3, 0 -+ bstrins.d a1, zero, 3, 0 -+ vld $vr0, a0, 0 -+ -+ vld $vr1, a1, 0 -+ vreplgr2vr.b $vr3, a2 -+ vslt.b $vr2, $vr2, $vr3 -+ vseq.b $vr3, $vr0, $vr1 -+ -+ -+ vmin.bu $vr3, $vr0, $vr3 -+ vor.v $vr3, $vr3, $vr2 -+ vsetanyeqz.b $fcc0, $vr3 -+ bcnez $fcc0, L(al_out) -+ -+L(al_loop): -+ vld $vr0, a0, 16 -+ vld $vr1, a1, 16 -+ addi.d a0, a0, 16 -+ addi.d a1, a1, 16 -+ -+ vseq.b $vr3, $vr0, $vr1 -+ vmin.bu $vr3, $vr0, $vr3 -+ vsetanyeqz.b $fcc0, $vr3 -+ bceqz $fcc0, L(al_loop) -+ -+L(al_out): -+ vseqi.b $vr3, $vr3, 0 -+ vfrstpi.b $vr3, $vr3, 0 -+ vshuf.b $vr0, $vr0, $vr0, $vr3 -+ vshuf.b $vr1, $vr1, $vr1, $vr3 -+ -+ -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, $vr1, 0 -+ sub.d a0, t0, t1 -+ jr ra -+ -+ nop -+ nop -+ nop -+L(unaligned): -+ slt a4, a2, a3 -+ -+ xor t0, a0, a1 -+ maskeqz t0, t0, a4 -+ xor a0, a0, t0 # a0 hold the larger one -+ xor a1, a1, t0 # a1 hold the small one -+ -+ andi a2, a0, 0xf -+ andi a3, a1, 0xf -+ bstrins.d a0, zero, 3, 0 -+ bstrins.d a1, zero, 3, 0 -+ -+ -+ vld $vr0, a0, 0 -+ vld $vr3, a1, 0 -+ vreplgr2vr.b $vr4, a2 -+ vreplgr2vr.b $vr5, a3 -+ -+ vslt.b $vr7, $vr2, $vr4 -+ vsub.b $vr4, $vr4, $vr5 -+ vaddi.bu $vr6, $vr2, 16 -+ vsub.b $vr6, $vr6, $vr4 -+ -+ vshuf.b $vr1, $vr3, $vr3, $vr6 -+ vseq.b $vr4, $vr0, $vr1 -+ vmin.bu $vr4, $vr0, $vr4 -+ vor.v $vr4, $vr4, $vr7 -+ -+ vsetanyeqz.b $fcc0, $vr4 -+ bcnez $fcc0, L(un_end) -+ vslt.b $vr5, $vr2, $vr5 -+ vor.v $vr3, $vr3, $vr5 -+ -+ -+L(un_loop): -+ vld $vr0, a0, 16 -+ vsetanyeqz.b $fcc0, $vr3 -+ bcnez $fcc0, L(remaining_end) -+ vor.v $vr1, $vr3, $vr3 -+ -+ vld $vr3, a1, 16 -+ addi.d a0, a0, 16 -+ addi.d a1, a1, 16 -+ vshuf.b $vr1, $vr3, $vr1, $vr6 -+ -+ vseq.b $vr4, $vr0, $vr1 -+ vmin.bu $vr4, $vr0, $vr4 -+ vsetanyeqz.b $fcc0, $vr4 -+ bceqz $fcc0, L(un_loop) -+ -+L(un_end): -+ vseqi.b $vr4, $vr4, 0 -+ vfrstpi.b $vr4, $vr4, 0 -+ vshuf.b $vr0, $vr0, $vr0, $vr4 -+ vshuf.b $vr1, $vr1, $vr1, $vr4 -+ -+ -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, $vr1, 0 -+ sub.d t3, t0, t1 -+ sub.d t4, t1, t0 -+ -+ masknez t0, t3, a4 -+ maskeqz t1, t4, a4 -+ or a0, t0, t1 -+ jr ra -+ -+L(remaining_end): -+ vshuf.b $vr1, $vr3, $vr3, $vr6 -+ vseq.b $vr4, 
$vr0, $vr1 -+ vmin.bu $vr4, $vr4, $vr0 -+ b L(un_end) -+END(STRCMP) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCMP) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S -new file mode 100644 -index 00000000..e29d872f ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S -@@ -0,0 +1,191 @@ -+/* Copyright 2016 Loongson Technology Corporation Limited */ -+ -+/* Author: songyuekun songyuekun@loongson.cn */ -+ -+/* -+ * ISA: MIPS64R2 -+ * ABI: N64 -+ */ -+ -+/* basic algorithm : -+ +. let t0, t1 point to a0, a1, if a0 has smaller low 3 bit of a0 and a1, -+ set a4 to 1 and let t0 point to the larger of lower 3bit of a0 and a1 -+ +. if low 3 bit of a0 equal low 3 bit of a0, use a ldr one time and more ld other times; -+ +. if not, load partial t2 and t3, check if t2 has \0; -+ +. then use use ld for t0, ldr for t1, -+ +. if partial 8 byte from t1 has \0, compare partial 8 byte from t1 with 8 -+ byte from t0 with a mask in a7 -+ +. if not, ldl other part of t1, compare 8 byte from t1 with 8 byte from t0 -+ +. if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has -+ one byte is \0, else has no \0 -+ +. for partial 8 byte from ldr t3, 0(a0), preload t3 with 0xffffffffffffffff -+*/ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+ -+#if IS_IN (libc) -+ -+ -+#define STRCMP_NAME __strcmp_unaligned -+ -+#define REP8_01 0x0101010101010101 -+#define REP8_7f 0x7f7f7f7f7f7f7f7f -+#define REP8_80 0x8080808080808080 -+ -+/* Parameters and Results */ -+#define src1 a0 -+#define src2 a1 -+#define result v0 -+// Note: v0 = a0 in N64 ABI -+ -+ -+/* Internal variable */ -+#define data1 t0 -+#define data2 t1 -+#define has_nul t2 -+#define diff t3 -+#define syndrome t4 -+#define zeroones t5 -+#define sevenf t6 -+#define pos t7 -+#define exchange t8 -+#define tmp1 a4 -+#define tmp2 a5 -+#define tmp3 a6 -+#define src1_off a2 -+#define src2_off a3 -+#define tmp4 a7 -+ -+/* rd <- if rc then ra else rb -+ will destroy tmp3. 
*/ -+#define CONDITIONSEL(rd,rc,ra,rb)\ -+ masknez tmp3, rb, rc;\ -+ maskeqz rd, ra, rc;\ -+ or rd, rd, tmp3 -+ -+/* int strcmp (const char *s1, const char *s2); */ -+ -+LEAF(STRCMP_NAME) -+ .align 4 -+ -+ xor tmp1, src1, src2 -+ lu12i.w zeroones, 0x01010 -+ lu12i.w sevenf, 0x7f7f7 -+ andi src1_off, src1, 0x7 -+ ori zeroones, zeroones, 0x101 -+ ori sevenf, sevenf, 0xf7f -+ andi tmp1, tmp1, 0x7 -+ bstrins.d zeroones, zeroones, 63, 32 -+ bstrins.d sevenf, sevenf, 63, 32 -+ bnez tmp1, strcmp_misaligned8 -+ bnez src1_off, strcmp_mutual_align -+strcmp_loop_aligned: -+ ld.d data1, src1, 0 -+ addi.d src1, src1, 8 -+ ld.d data2, src2, 0 -+ addi.d src2, src2, 8 -+strcmp_start_realigned: -+ sub.d tmp1, data1, zeroones -+ or tmp2, data1, sevenf -+ xor diff, data1, data2 -+ andn has_nul, tmp1, tmp2 -+ or syndrome, diff, has_nul -+ beqz syndrome, strcmp_loop_aligned -+ -+strcmp_end: -+ ctz.d pos, syndrome -+ bstrins.d pos, zero, 2, 0 -+ srl.d data1, data1, pos -+ srl.d data2, data2, pos -+ andi data1, data1, 0xff -+ andi data2, data2, 0xff -+ sub.d result, data1, data2 -+ jr ra -+strcmp_mutual_align: -+ bstrins.d src1, zero, 2, 0 -+ bstrins.d src2, zero, 2, 0 -+ slli.d tmp1, src1_off, 0x3 -+ ld.d data1, src1, 0 -+ sub.d tmp1, zero, tmp1 -+ ld.d data2, src2, 0 -+ addi.d src1, src1, 8 -+ addi.d src2, src2, 8 -+ nor tmp2, zero, zero -+ srl.d tmp2, tmp2, tmp1 -+ or data1, data1, tmp2 -+ or data2, data2, tmp2 -+ b strcmp_start_realigned -+ -+strcmp_misaligned8: -+ -+/* check if ((src1 != 0) && ((src2 == 0 ) || (src1 < src2))) -+ then exchange(src1,src2). */ -+ andi src2_off, src2, 0x7 -+ slt tmp2, src1_off, src2_off -+ CONDITIONSEL(tmp2,src2_off,tmp2,tmp1) -+ maskeqz exchange, tmp2, src1_off -+ xor tmp3, src1, src2 -+ maskeqz tmp3, tmp3, exchange -+ xor src1, src1, tmp3 -+ xor src2, src2, tmp3 -+ -+ andi src1_off, src1, 0x7 -+ beqz src1_off, strcmp_loop_misaligned -+strcmp_do_misaligned: -+ ld.bu data1, src1, 0 -+ ld.bu data2, src2, 0 -+ xor tmp3, data1, data2 -+ addi.d src1, src1, 1 -+ masknez tmp3, data1, tmp3 -+ addi.d src2, src2, 1 -+ beqz tmp3, strcmp_done -+ andi src1_off, src1, 0x7 -+ bnez src1_off, strcmp_do_misaligned -+ -+strcmp_loop_misaligned: -+ andi tmp1, src2, 0xff8 -+ xori tmp1, tmp1, 0xff8 -+ beqz tmp1, strcmp_do_misaligned -+ ld.d data1, src1, 0 -+ ld.d data2, src2, 0 -+ addi.d src1, src1, 8 -+ addi.d src2, src2, 8 -+ -+ sub.d tmp1, data1, zeroones -+ or tmp2, data1, sevenf -+ xor diff, data1, data2 -+ andn has_nul, tmp1, tmp2 -+ or syndrome, diff, has_nul -+ beqz syndrome, strcmp_loop_misaligned -+strcmp_misalign_end: -+ ctz.d pos, syndrome -+ bstrins.d pos, zero, 2, 0 -+ srl.d data1, data1, pos -+ srl.d data2, data2, pos -+ andi data1, data1, 0xff -+ andi data2, data2, 0xff -+ sub.d tmp1, data1, data2 -+ sub.d tmp2, data2, data1 -+ CONDITIONSEL(result,exchange,tmp2,tmp1) -+ jr ra -+ -+strcmp_done: -+ sub.d tmp1, data1, data2 -+ sub.d tmp2, data2, data1 -+ CONDITIONSEL(result,exchange,tmp2,tmp1) -+ jr ra -+END(STRCMP_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCMP_NAME) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp.c b/sysdeps/loongarch/lp64/multiarch/strcmp.c -new file mode 100644 -index 00000000..0b20e6f0 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp.c -@@ -0,0 +1,35 @@ -+/* Multiple versions of strcmp. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define strcmp __redirect_strcmp -+# include -+# undef strcmp -+ -+# define SYMBOL_NAME strcmp -+#include -+ -+libc_ifunc_redirected (__redirect_strcmp, strcmp, IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (strcmp, __GI_strcmp, __redirect_strcmp) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S -new file mode 100644 -index 00000000..4860398b ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S -@@ -0,0 +1,8 @@ -+ -+#if IS_IN (libc) -+ -+#define STRCPY __strcpy_aligned -+ -+#endif -+ -+#include "../strcpy.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -new file mode 100644 -index 00000000..76db561a ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -@@ -0,0 +1,174 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRCPY __strcpy_lsx -+ -+/* int strcpy (const char *s1, const char *s2); */ -+ -+L(magic_num): -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+ENTRY_NO_ALIGN(STRCPY) -+ pcaddi t0, -4 -+ andi a4, a1, 0xf -+ vld $vr1, t0, 0 -+ move a2, a0 -+ -+ beqz a4, L(load_start) -+ xor t0, a1, a4 -+ vld $vr0, t0, 0 -+ vreplgr2vr.b $vr2, a4 -+ -+ vadd.b $vr2, $vr2, $vr1 -+ vshuf.b $vr0, $vr2, $vr0, $vr2 -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(end) -+ -+ -+L(load_start): -+ vld $vr0, a1, 0 -+ li.d t1, 16 -+ andi a3, a2, 0xf -+ vsetanyeqz.b $fcc0, $vr0 -+ -+ sub.d t0, t1, a3 -+ bcnez $fcc0, L(end) -+ add.d a1, a1, t0 -+ vst $vr0, a2, 0 -+ -+ andi a3, a1, 0xf -+ add.d a2, a2, t0 -+ bnez a3, L(unaligned) -+ vld $vr0, a1, 0 -+ -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(end) -+L(loop): -+ vst $vr0, a2, 0 -+ vld $vr0, a1, 16 -+ -+ -+ addi.d a2, a2, 16 -+ addi.d a1, a1, 16 -+ vsetanyeqz.b $fcc0, $vr0 -+ bceqz $fcc0, L(loop) -+ -+ vmsknz.b $vr1, $vr0 -+ movfr2gr.s t0, $f1 -+ cto.w t0, t0 -+ add.d a1, a1, t0 -+ -+ vld $vr0, a1, -15 -+ add.d a2, a2, t0 -+ vst $vr0, a2, -15 -+ jr ra -+ -+L(end): -+ vmsknz.b $vr1, $vr0 -+ movfr2gr.s t0, $f1 -+ cto.w t0, t0 -+ addi.d t0, t0, 1 -+ -+ -+L(end_16): -+ andi t1, t0, 16 -+ beqz t1, L(end_8) -+ vst $vr0, a2, 0 -+ jr ra -+ -+L(end_8): -+ andi t2, t0, 8 -+ andi t3, t0, 4 -+ andi t4, t0, 2 -+ andi t5, t0, 1 -+ -+ beqz t2, L(end_4) -+ vstelm.d $vr0, a2, 0, 0 -+ addi.d a2, a2, 8 -+ vbsrl.v $vr0, $vr0, 8 -+ -+L(end_4): -+ beqz t3, L(end_2) -+ vstelm.w $vr0, a2, 0, 0 -+ addi.d a2, a2, 4 -+ vbsrl.v $vr0, $vr0, 4 -+ -+ -+L(end_2): -+ beqz t4, L(end_1) -+ vstelm.h $vr0, a2, 0, 0 -+ addi.d a2, a2, 2 -+ vbsrl.v $vr0, $vr0, 2 -+ -+L(end_1): -+ beqz t5, L(out) -+ vstelm.b $vr0, a2, 0, 0 -+L(out): -+ 
jr ra -+L(unaligned): -+ bstrins.d a1, zero, 3, 0 -+ -+ vld $vr2, a1, 0 -+ vreplgr2vr.b $vr3, a3 -+ vslt.b $vr4, $vr1, $vr3 -+ vor.v $vr0, $vr2, $vr4 -+ -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(un_first_end) -+ vld $vr0, a1, 16 -+ vadd.b $vr3, $vr3, $vr1 -+ -+ -+ addi.d a1, a1, 16 -+ vshuf.b $vr4, $vr0, $vr2, $vr3 -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(un_end) -+ -+L(un_loop): -+ vor.v $vr2, $vr0, $vr0 -+ vld $vr0, a1, 16 -+ vst $vr4, a2, 0 -+ addi.d a1, a1, 16 -+ -+ addi.d a2, a2, 16 -+ vshuf.b $vr4, $vr0, $vr2, $vr3 -+ vsetanyeqz.b $fcc0, $vr0 -+ bceqz $fcc0, L(un_loop) -+ -+L(un_end): -+ vsetanyeqz.b $fcc0, $vr4 -+ bcnez $fcc0, 1f -+ vst $vr4, a2, 0 -+1: -+ vmsknz.b $vr1, $vr0 -+ -+ -+ movfr2gr.s t0, $f1 -+ cto.w t0, t0 -+ add.d a1, a1, t0 -+ vld $vr0, a1, -15 -+ -+ add.d a2, a2, t0 -+ sub.d a2, a2, a3 -+ vst $vr0, a2, 1 -+ jr ra -+ -+L(un_first_end): -+ addi.d a2, a2, -16 -+ b 1b -+END(STRCPY) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCPY) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -new file mode 100644 -index 00000000..449733cb ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -@@ -0,0 +1,199 @@ -+/* Copyright 2016 Loongson Technology Corporation Limited */ -+ -+/* Author: Huang Pei huangpei@loongson.cn. -+ * ISA: MIPS64R2 -+ * ABI: N64 -+ * basic algorithm : -+ +. if src aligned. just do the copy loop. if not, do the cross page check and copy one double word. -+ Then move src to aligned. -+ +. if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has -+ one byte is \0, else has no \0 -+*/ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRCPY __strcpy_unaligned -+ -+#define REP8_01 0x0101010101010101 -+#define REP8_7f 0x7f7f7f7f7f7f7f7f -+#define REP8_80 0x8080808080808080 -+ -+/* Parameters and Results */ -+#define dest a0 -+#define src a1 -+#define result v0 -+// Note: v0 = a0 in N64 ABI -+ -+ -+/* Internal variable */ -+#define data t0 -+#define data1 t1 -+#define has_nul t2 -+#define diff t3 -+#define syndrome t4 -+#define zeroones t5 -+#define sevenf t6 -+#define pos t7 -+#define dest_backup t8 -+#define tmp1 a4 -+#define tmp2 a5 -+#define tmp3 a6 -+#define dest_off a2 -+#define src_off a3 -+#define tmp4 a7 -+ -+/* rd <- if rc then ra else rb -+ will destroy tmp3 -+*/ -+#define CONDITIONSEL(rd,rc,ra,rb)\ -+ masknez tmp3, rb, rc;\ -+ maskeqz rd, ra, rc;\ -+ or rd, rd, tmp3 -+ -+/* int strcpy (const char *s1, const char *s2); */ -+ -+LEAF(STRCPY) -+ .align 4 -+ move dest_backup, dest -+ lu12i.w zeroones, 0x01010 -+ lu12i.w sevenf, 0x7f7f7 -+ ori zeroones, zeroones, 0x101 -+ ori sevenf, sevenf, 0xf7f -+ bstrins.d zeroones, zeroones, 63, 32 -+ bstrins.d sevenf, sevenf, 63, 32 -+ andi src_off, src, 0x7 -+ beqz src_off, strcpy_loop_aligned_1 -+ b strcpy_mutual_align -+strcpy_loop_aligned: -+ st.d data, dest, 0 -+ addi.d dest, dest, 8 -+strcpy_loop_aligned_1: -+ ld.d data, src, 0 -+ addi.d src, src, 8 -+strcpy_start_realigned: -+ sub.d tmp1, data, zeroones -+ or tmp2, data, sevenf -+ andn has_nul, tmp1, tmp2 -+ beqz has_nul, strcpy_loop_aligned -+ -+strcpy_end: -+ ctz.d pos, has_nul -+ srli.d pos, pos, 3 -+ addi.d pos, pos, 1 -+/* Do 8/4/2/1 strcpy based on pos value. -+ pos value is the number of bytes to be copied -+ the bytes include the final \0 so the max length is 8 and the min length is 1. 
-+ */ -+ -+strcpy_end_8: -+ andi tmp1, pos, 0x8 -+ beqz tmp1, strcpy_end_4 -+ st.d data, dest, 0 -+ move dest, dest_backup -+ jr ra -+strcpy_end_4: -+ andi tmp1, pos, 0x4 -+ beqz tmp1, strcpy_end_2 -+ st.w data, dest, 0 -+ srli.d data, data, 32 -+ addi.d dest, dest, 4 -+strcpy_end_2: -+ andi tmp1, pos, 0x2 -+ beqz tmp1, strcpy_end_1 -+ st.h data, dest, 0 -+ srli.d data, data, 16 -+ addi.d dest, dest, 2 -+strcpy_end_1: -+ andi tmp1, pos, 0x1 -+ beqz tmp1, strcpy_end_ret -+ st.b data, dest, 0 -+strcpy_end_ret: -+ move result, dest_backup -+ jr ra -+ -+ -+strcpy_mutual_align: -+/* Check if around src page bound. -+ if not go to page cross ok. -+ if it is, do further check. -+ use tmp2 to accelerate. */ -+ -+ li.w tmp2, 0xff8 -+ andi tmp1, src, 0xff8 -+ beq tmp1, tmp2, strcpy_page_cross -+ -+strcpy_page_cross_ok: -+/* -+ Load a misaligned double word and check if has \0 -+ If no, do a misaligned double word paste. -+ If yes, calculate the number of avaliable bytes, -+ then jump to 4/2/1 end. -+*/ -+ ld.d data, src, 0 -+ sub.d tmp1, data, zeroones -+ or tmp2, data, sevenf -+ andn has_nul, tmp1, tmp2 -+ bnez has_nul, strcpy_end -+strcpy_mutual_align_finish: -+/* -+ Before jump back to align loop, make dest/src aligned. -+ This will cause a duplicated paste for several bytes between -+ the first double word and the second double word, -+ but should not bring a problem. -+*/ -+ li.w tmp1, 8 -+ st.d data, dest, 0 -+ sub.d tmp1, tmp1, src_off -+ add.d src, src, tmp1 -+ add.d dest, dest, tmp1 -+ -+ b strcpy_loop_aligned_1 -+ -+strcpy_page_cross: -+/* -+ ld.d from aligned address(src & ~0x7). -+ check if high bytes have \0. -+ it not, go back to page cross ok, -+ since the string is supposed to cross the page bound in such situation. -+ if it is, do a srl for data to make it seems like a direct double word from src, -+ then go to 4/2/1 strcpy end. -+ -+ tmp4 is 0xffff...ffff mask -+ tmp2 demonstrate the bytes to be masked -+ tmp2 = src_off << 3 -+ data = data >> (src_off * 8) | -1 << (64 - src_off * 8) -+ and -+ -1 << (64 - src_off * 8) -> ~(-1 >> (src_off * 8)) -+*/ -+ -+ li.w tmp1, 0x7 -+ andn tmp3, src, tmp1 -+ ld.d data, tmp3, 0 -+ li.w tmp4, -1 -+ slli.d tmp2, src_off, 3 -+ srl.d tmp4, tmp4, tmp2 -+ srl.d data, data, tmp2 -+ nor tmp4, tmp4, zero -+ or data, data, tmp4 -+ sub.d tmp1, data, zeroones -+ or tmp2, data, sevenf -+ andn has_nul, tmp1, tmp2 -+ beqz has_nul, strcpy_page_cross_ok -+ b strcpy_end -+END(STRCPY) -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCPY) -+#endif -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy.c b/sysdeps/loongarch/lp64/multiarch/strcpy.c -new file mode 100644 -index 00000000..48fecf66 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy.c -@@ -0,0 +1,36 @@ -+/* Multiple versions of strcpy. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2023 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define strcpy __redirect_strcpy -+# include -+# undef strcpy -+ -+# define SYMBOL_NAME strcpy -+# include "ifunc-lsx.h" -+ -+libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (strcpy, __GI_strcpy, __redirect_strcpy) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+#endif -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S -new file mode 100644 -index 00000000..d31875fd ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S -@@ -0,0 +1,8 @@ -+ -+#if IS_IN (libc) -+ -+#define STRLEN __strlen_aligned -+ -+#endif -+ -+#include "../strlen.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -new file mode 100644 -index 00000000..cb276aa0 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -@@ -0,0 +1,55 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRLEN __strlen_lasx -+ -+/* size_t strlen(const char *s1); */ -+ -+LEAF(STRLEN) -+ .align 6 -+ move a1, a0 -+ bstrins.d a0, zero, 4, 0 -+ li.d t1, -1 -+ xvld $xr0, a0, 0 -+ -+ xvmsknz.b $xr0, $xr0 -+ xvpickve.w $xr1, $xr0, 4 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 # sign extend -+ -+ sra.w t0, t0, a1 -+ beq t0, t1, L(loop) -+ cto.w a0, t0 -+ jr ra -+ -+L(loop): -+ xvld $xr0, a0, 32 -+ addi.d a0, a0, 32 -+ xvsetanyeqz.b $fcc0, $xr0 -+ bceqz $fcc0, L(loop) -+ -+ -+ xvmsknz.b $xr0, $xr0 -+ sub.d a0, a0, a1 -+ xvpickve.w $xr1, $xr0, 4 -+ vilvl.h $vr0, $vr1, $vr0 -+ -+ movfr2gr.s t0, $f0 -+ cto.w t0, t0 -+ add.d a0, a0, t0 -+ jr ra -+END(STRLEN) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRLEN) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -new file mode 100644 -index 00000000..6edcac8c ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -@@ -0,0 +1,63 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRLEN __strlen_lsx -+ -+/* size_t strlen(const char *s1); */ -+ -+LEAF(STRLEN) -+ .align 6 -+ move a1, a0 -+ bstrins.d a0, zero, 4, 0 -+ vld $vr0, a0, 0 -+ vld $vr1, a0, 16 -+ -+ li.d t1, -1 -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ -+ movfr2gr.s t0, $f0 -+ sra.w t0, t0, a1 -+ beq t0, t1, L(loop) -+ cto.w a0, t0 -+ -+ jr ra -+ nop -+ nop -+ nop -+ -+ -+L(loop): -+ vld $vr0, a0, 32 -+ vld $vr1, a0, 48 -+ addi.d a0, a0, 32 -+ vmin.bu $vr2, $vr0, $vr1 -+ -+ vsetanyeqz.b $fcc0, $vr2 -+ bceqz $fcc0, L(loop) -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ -+ vilvl.h $vr0, $vr1, $vr0 -+ sub.d a0, a0, a1 -+ movfr2gr.s t0, $f0 -+ cto.w t0, t0 -+ -+ add.d a0, a0, t0 -+ jr ra -+END(STRLEN) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRLEN) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -new file mode 100644 -index 00000000..e9b7cf67 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -@@ -0,0 +1,116 @@ -+/* Copyright 2016 Loongson Technology Corporation Limited. */ -+ -+/* Author: Songyuekun songyuekun@loongson.cn. 
*/ -+ -+/* algorithm: -+ #. use ld/ldr to access word/partial word in the string -+ #. use (x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) != 0 to -+ judge if x has zero byte -+ #. use dctz((x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) >> 3 -+ to get the index of first rightmost zero byte in dword x; -+ #. use dctz(x) = 64 - dclz(~x & (x-1)); -+ #. use pointer to the last non zero byte minus pointer to the start -+ of the string to get the length of string. */ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define L_ADDIU addi.d -+#define L_ADDU add.d -+#define L_SUBU sub.d -+ -+#define STRLEN __strlen_unaligned -+ -+/* size_t strlen (const char *s1); */ -+ -+LEAF(STRLEN) -+ .align 5 -+ nor t4, zero, zero -+ lu12i.w a2, 0x01010 -+ andi t5, a0, 0x7 -+ -+ li.w t7, 0x7 -+ slli.d t6, t5, 0x3 -+ andn t7, a0, t7 -+ ld.d a1, t7, 0 -+ sub.d t7, zero, t6 -+ sll.d t4, t4, t7 -+ maskeqz t4, t4, t6 -+ srl.d a1, a1, t6 -+ or a1, a1, t4 -+ -+ ori a2, a2, 0x101 -+ nor t1, a1, zero -+ li.w a4, 8 -+ -+ bstrins.d a2, a2, 63, 32 -+ sub.d a5, a4, t5 -+ move t5, a0 -+ -+ sub.d t0, a1, a2 -+ slli.d t4, a2, 7 -+ nor a3, zero, t4 -+ nor t1, a1, a3 -+ -+ and t0, t0, t1 -+ bnez t0, strlen_count1 /* instead of use bnel with daddu a0, a0, a5 in branch slot */ -+ L_ADDU a0, a0, a5 -+strlen_loop: -+ ld.d a1, a0, 0 -+ sub.d t0, a1, a2 -+ and t1, t0, t4 -+ bnez t1, strlen_count_pre -+ ld.d a1, a0, 8 -+ sub.d t0, a1, a2 -+ and t1, t0, t4 -+ L_ADDIU a0, a0, 16 -+ beqz t1, strlen_loop -+strlen_count: -+ addi.d a0, a0, -8 -+strlen_count_pre: -+ nor t1, a1, a3 -+ and t0, t0, t1 -+ beqz t0, strlen_noascii_start -+strlen_count1: -+ ctz.d t1, t0 -+ L_SUBU v0, a0, t5 -+ srli.w t1, t1, 3 -+ L_ADDU v0, v0, t1 -+ jr ra -+strlen_noascii_start: -+ addi.d a0, a0, 8 -+strlen_loop_noascii: -+ ld.d a1, a0, 0 -+ sub.d t0, a1, a2 -+ nor t1, a1, a3 -+ and t0, t0, t1 -+ bnez t0, strlen_count1 -+ ld.d a1, a0, 8 -+ sub.d t0, a1, a2 -+ nor t1, a1, a3 -+ and t0, t0, t1 -+ L_ADDIU a0, a0, 16 -+ beqz t0, strlen_loop_noascii -+ addi.d a0, a0, -8 -+ ctz.d t1, t0 -+ L_SUBU v0, a0, t5 -+ srli.w t1, t1, 3 -+ L_ADDU v0, v0, t1 -+ jr ra -+END(STRLEN) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (STRLEN) -+#endif -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen.c b/sysdeps/loongarch/lp64/multiarch/strlen.c -new file mode 100644 -index 00000000..e8454404 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strlen.c -@@ -0,0 +1,39 @@ -+/* Multiple versions of strlen. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. 
*/ -+#if IS_IN (libc) -+# define strlen __redirect_strlen -+# include -+# undef strlen -+ -+# define SYMBOL_NAME strlen -+# include "ifunc-lasx.h" -+ -+libc_ifunc_redirected (__redirect_strlen, __new_strlen, -+ IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (__new_strlen, __GI_strlen, __redirect_strlen) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_strlen, strlen, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S -new file mode 100644 -index 00000000..f371b19e ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S -@@ -0,0 +1,8 @@ -+ -+#if IS_IN (libc) -+ -+#define STRNCMP __strncmp_aligned -+ -+#endif -+ -+#include "../strncmp.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -new file mode 100644 -index 00000000..3399bf77 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -@@ -0,0 +1,197 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRNCMP __strncmp_lsx -+ -+/* int strncmp (const char *s1, const char *s2); */ -+ -+L(magic_num): -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+ENTRY_NO_ALIGN(STRNCMP) -+ beqz a2, L(ret0) -+ pcaddi t0, -5 -+ andi a3, a0, 0xf -+ vld $vr2, t0, 0 -+ -+ andi a4, a1, 0xf -+ li.d t2, 16 -+ bne a3, a4, L(unaligned) -+ xor t0, a0, a3 -+ -+ xor t1, a1, a4 -+ vld $vr0, t0, 0 -+ vld $vr1, t1, 0 -+ vreplgr2vr.b $vr3, a3 -+ -+ -+ sub.d t2, t2, a3 -+ vadd.b $vr3, $vr3, $vr2 -+ vshuf.b $vr0, $vr3, $vr0, $vr3 -+ vshuf.b $vr1, $vr3, $vr1, $vr3 -+ -+ vseq.b $vr3, $vr0, $vr1 -+ vmin.bu $vr3, $vr0, $vr3 -+ bgeu t2, a2, L(al_early_end) -+ vsetanyeqz.b $fcc0, $vr3 -+ -+ bcnez $fcc0, L(al_end) -+ add.d a3, a0, a2 -+ addi.d a4, a3, -1 -+ bstrins.d a4, zero, 3, 0 -+ -+ sub.d a2, a3, a4 -+L(al_loop): -+ vld $vr0, t0, 16 -+ vld $vr1, t1, 16 -+ addi.d t0, t0, 16 -+ -+ -+ addi.d t1, t1, 16 -+ vseq.b $vr3, $vr0, $vr1 -+ vmin.bu $vr3, $vr0, $vr3 -+ beq t0, a4, L(al_early_end) -+ -+ vsetanyeqz.b $fcc0, $vr3 -+ bceqz $fcc0, L(al_loop) -+L(al_end): -+ vseqi.b $vr3, $vr3, 0 -+ vfrstpi.b $vr3, $vr3, 0 -+ -+ vshuf.b $vr0, $vr0, $vr0, $vr3 -+ vshuf.b $vr1, $vr1, $vr1, $vr3 -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, $vr1, 0 -+ -+ sub.d a0, t0, t1 -+ jr ra -+L(al_early_end): -+ vreplgr2vr.b $vr4, a2 -+ vslt.b $vr4, $vr2, $vr4 -+ -+ -+ vorn.v $vr3, $vr3, $vr4 -+ b L(al_end) -+L(unaligned): -+ slt a5, a3, a4 -+ xor t0, a0, a1 -+ -+ maskeqz t0, t0, a5 -+ xor a0, a0, t0 # a0 hold the larger one -+ xor a1, a1, t0 # a1 hold the small one -+ andi a3, a0, 0xf -+ -+ andi a4, a1, 0xf -+ xor t0, a0, a3 -+ xor t1, a1, a4 -+ vld $vr0, t0, 0 -+ -+ vld $vr3, t1, 0 -+ sub.d t2, t2, a3 -+ vreplgr2vr.b $vr4, a3 -+ vreplgr2vr.b $vr5, a4 -+ -+ -+ vaddi.bu $vr6, $vr2, 16 -+ vsub.b $vr7, $vr4, $vr5 -+ vsub.b $vr6, $vr6, $vr7 -+ vadd.b $vr4, $vr2, $vr4 -+ -+ vshuf.b $vr1, $vr3, $vr3, $vr6 -+ vshuf.b $vr0, $vr7, $vr0, $vr4 -+ vshuf.b $vr1, $vr7, $vr1, $vr4 -+ vseq.b $vr4, $vr0, $vr1 -+ -+ vmin.bu $vr4, $vr0, $vr4 -+ bgeu t2, a2, L(un_early_end) -+ vsetanyeqz.b $fcc0, $vr4 -+ bcnez $fcc0, L(un_end) -+ -+ add.d a6, a0, a2 -+ vslt.b $vr5, $vr2, $vr5 -+ addi.d a7, a6, -1 -+ vor.v $vr3, $vr3, $vr5 -+ -+ -+ bstrins.d a7, zero, 3, 0 -+ sub.d a2, a6, a7 -+L(un_loop): -+ vld $vr0, t0, 16 -+ addi.d t0, t0, 16 -+ -+ vsetanyeqz.b $fcc0, $vr3 -+ bcnez $fcc0, L(has_zero) -+ beq 
t0, a7, L(end_with_len) -+ vor.v $vr1, $vr3, $vr3 -+ -+ vld $vr3, t1, 16 -+ addi.d t1, t1, 16 -+ vshuf.b $vr1, $vr3, $vr1, $vr6 -+ vseq.b $vr4, $vr0, $vr1 -+ -+ vmin.bu $vr4, $vr0, $vr4 -+ vsetanyeqz.b $fcc0, $vr4 -+ bceqz $fcc0, L(un_loop) -+L(un_end): -+ vseqi.b $vr4, $vr4, 0 -+ -+ -+ vfrstpi.b $vr4, $vr4, 0 -+ vshuf.b $vr0, $vr0, $vr0, $vr4 -+ vshuf.b $vr1, $vr1, $vr1, $vr4 -+ vpickve2gr.bu t0, $vr0, 0 -+ -+ vpickve2gr.bu t1, $vr1, 0 -+ sub.d t2, t0, t1 -+ sub.d t3, t1, t0 -+ masknez t0, t2, a5 -+ -+ maskeqz t1, t3, a5 -+ or a0, t0, t1 -+ jr ra -+L(has_zero): -+ vshuf.b $vr1, $vr3, $vr3, $vr6 -+ -+ vseq.b $vr4, $vr0, $vr1 -+ vmin.bu $vr4, $vr0, $vr4 -+ bne t0, a7, L(un_end) -+L(un_early_end): -+ vreplgr2vr.b $vr5, a2 -+ -+ vslt.b $vr5, $vr2, $vr5 -+ vorn.v $vr4, $vr4, $vr5 -+ b L(un_end) -+L(end_with_len): -+ sub.d a6, a3, a4 -+ -+ bgeu a6, a2, 1f -+ vld $vr4, t1, 16 -+1: -+ vshuf.b $vr1, $vr4, $vr3, $vr6 -+ vseq.b $vr4, $vr0, $vr1 -+ -+ vmin.bu $vr4, $vr0, $vr4 -+ vreplgr2vr.b $vr5, a2 -+ vslt.b $vr5, $vr2, $vr5 -+ vorn.v $vr4, $vr4, $vr5 -+ -+ b L(un_end) -+L(ret0): -+ move a0, zero -+ jr ra -+END(STRNCMP) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRNCMP) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -new file mode 100644 -index 00000000..558df29b ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -@@ -0,0 +1,257 @@ -+/* Copyright 2016 Loongson Technology Corporation Limited. */ -+ -+/* Author: songyuekun songyuekun@loongson.cn. -+ * ISA: MIPS64R2 -+ * ABI: N64 -+ * basic algorithm : -+ +. let t0, t1 point to a0, a1, if a0 has smaller low 3 bit of a0 and a1, -+ set a4 to 1 and let t0 point to the larger of lower 3bit of a0 and a1 -+ +. if low 3 bit of a0 equal low 3 bit of a0, use a ldr one time and more ld other times; -+ +. if not, load partial t2 and t3, check if t2 has \0; -+ +. then use use ld for t0, ldr for t1, -+ +. if partial 8 byte from t1 has \0, compare partial 8 byte from t1 with 8 -+ byte from t0 with a mask in a7 -+ +. if not, ldl other part of t1, compare 8 byte from t1 with 8 byte from t0 -+ +. if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has -+ one byte is \0, else has no \0 -+ +. 
for partial 8 byte from ldr t3, 0(a0), preload t3 with 0xffffffffffffffff -+*/ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRNCMP __strncmp_unaligned -+ -+#define REP8_01 0x0101010101010101 -+#define REP8_7f 0x7f7f7f7f7f7f7f7f -+#define REP8_80 0x8080808080808080 -+ -+/* Parameters and Results */ -+#define src1 a0 -+#define src2 a1 -+#define limit a2 -+#define result v0 -+// Note: v0 = a0 in N64 ABI -+ -+ -+/* Internal variable */ -+#define data1 t0 -+#define data2 t1 -+#define has_nul t2 -+#define diff t3 -+#define syndrome t4 -+#define zeroones t5 -+#define sevenf t6 -+#define pos t7 -+#define exchange t8 -+#define tmp1 a5 -+#define tmp2 a6 -+#define tmp3 a7 -+#define src1_off a3 -+#define limit_wd a4 -+ -+/* int strncmp (const char *s1, const char *s2); */ -+ -+LEAF(STRNCMP) -+ .align 4 -+ beqz limit, strncmp_ret0 -+ -+ xor tmp1, src1, src2 -+ lu12i.w zeroones, 0x01010 -+ lu12i.w sevenf, 0x7f7f7 -+ andi src1_off, src1, 0x7 -+ ori zeroones, zeroones, 0x101 -+ andi tmp1, tmp1, 0x7 -+ ori sevenf, sevenf, 0xf7f -+ bstrins.d zeroones, zeroones, 63, 32 -+ bstrins.d sevenf, sevenf, 63, 32 -+ bnez tmp1, strncmp_misaligned8 -+ bnez src1_off, strncmp_mutual_align -+ -+ addi.d limit_wd, limit, -1 -+ srli.d limit_wd, limit_wd, 3 -+ -+strncmp_loop_aligned: -+ ld.d data1, src1, 0 -+ addi.d src1, src1, 8 -+ ld.d data2, src2, 0 -+ addi.d src2, src2, 8 -+ -+strncmp_start_realigned: -+ addi.d limit_wd, limit_wd, -1 -+ sub.d tmp1, data1, zeroones -+ or tmp2, data1, sevenf -+ xor diff, data1, data2 -+ andn has_nul, tmp1, tmp2 -+ srli.d tmp1, limit_wd, 63 -+ or syndrome, diff, has_nul -+ or tmp2, syndrome, tmp1 -+ beqz tmp2, strncmp_loop_aligned -+ -+ /* if not reach limit. */ -+ bge limit_wd, zero, strncmp_not_limit -+ -+ /* if reach limit. 
*/ -+ andi limit, limit, 0x7 -+ li.w tmp1, 0x8 -+ sub.d limit, tmp1, limit -+ slli.d limit, limit, 0x3 -+ li.d tmp1, -1 -+ srl.d tmp1, tmp1, limit -+ and data1, data1, tmp1 -+ and data2, data2, tmp1 -+ orn syndrome, syndrome, tmp1 -+ -+ -+strncmp_not_limit: -+ ctz.d pos, syndrome -+ bstrins.d pos, zero, 2, 0 -+ srl.d data1, data1, pos -+ srl.d data2, data2, pos -+ andi data1, data1, 0xff -+ andi data2, data2, 0xff -+ sub.d result, data1, data2 -+ jr ra -+ -+strncmp_mutual_align: -+ bstrins.d src1, zero, 2, 0 -+ bstrins.d src2, zero, 2, 0 -+ slli.d tmp1, src1_off, 0x3 -+ ld.d data1, src1, 0 -+ ld.d data2, src2, 0 -+ addi.d src2, src2, 8 -+ addi.d src1, src1, 8 -+ -+ addi.d limit_wd, limit, -1 -+ andi tmp3, limit_wd, 0x7 -+ srli.d limit_wd, limit_wd, 3 -+ add.d limit, limit, src1_off -+ add.d tmp3, tmp3, src1_off -+ srli.d tmp3, tmp3, 3 -+ add.d limit_wd, limit_wd, tmp3 -+ -+ sub.d tmp1, zero, tmp1 -+ nor tmp2, zero, zero -+ srl.d tmp2, tmp2, tmp1 -+ or data1, data1, tmp2 -+ or data2, data2, tmp2 -+ b strncmp_start_realigned -+ -+strncmp_misaligned8: -+ -+ li.w tmp1, 0x10 -+ bge limit, tmp1, strncmp_try_words -+strncmp_byte_loop: -+ ld.bu data1, src1, 0 -+ ld.bu data2, src2, 0 -+ addi.d limit, limit, -1 -+ xor tmp1, data1, data2 -+ masknez tmp1, data1, tmp1 -+ maskeqz tmp1, limit, tmp1 -+ beqz tmp1, strncmp_done -+ -+ ld.bu data1, src1, 1 -+ ld.bu data2, src2, 1 -+ addi.d src1, src1, 2 -+ addi.d src2, src2, 2 -+ addi.d limit, limit, -1 -+ xor tmp1, data1, data2 -+ masknez tmp1, data1, tmp1 -+ maskeqz tmp1, limit, tmp1 -+ bnez tmp1, strncmp_byte_loop -+ -+ -+strncmp_done: -+ sub.d result, data1, data2 -+ jr ra -+ -+strncmp_try_words: -+ srli.d limit_wd, limit, 3 -+ beqz src1_off, strncmp_do_misaligned -+ -+ sub.d src1_off, zero, src1_off -+ andi src1_off, src1_off, 0x7 -+ sub.d limit, limit, src1_off -+ srli.d limit_wd, limit, 0x3 -+ -+ -+strncmp_page_end_loop: -+ ld.bu data1, src1, 0 -+ ld.bu data2, src2, 0 -+ addi.d src1, src1, 1 -+ addi.d src2, src2, 1 -+ xor tmp1, data1, data2 -+ masknez tmp1, data1, tmp1 -+ beqz tmp1, strncmp_done -+ andi tmp1, src1, 0x7 -+ bnez tmp1, strncmp_page_end_loop -+strncmp_do_misaligned: -+ li.w src1_off, 0x8 -+ addi.d limit_wd, limit_wd, -1 -+ blt limit_wd, zero, strncmp_done_loop -+ -+strncmp_loop_misaligned: -+ andi tmp2, src2, 0xff8 -+ xori tmp2, tmp2, 0xff8 -+ beqz tmp2, strncmp_page_end_loop -+ -+ ld.d data1, src1, 0 -+ ld.d data2, src2, 0 -+ addi.d src1, src1, 8 -+ addi.d src2, src2, 8 -+ sub.d tmp1, data1, zeroones -+ or tmp2, data1, sevenf -+ xor diff, data1, data2 -+ andn has_nul, tmp1, tmp2 -+ or syndrome, diff, has_nul -+ bnez syndrome, strncmp_not_limit -+ addi.d limit_wd, limit_wd, -1 -+ bge limit_wd, zero, strncmp_loop_misaligned -+ -+strncmp_done_loop: -+ andi limit, limit, 0x7 -+ beqz limit, strncmp_not_limit -+ -+ /* Read the last double word */ -+ /* check if the final part is about to exceed the page */ -+ andi tmp1, src2, 0x7 -+ andi tmp2, src2, 0xff8 -+ add.d tmp1, tmp1, limit -+ xori tmp2, tmp2, 0xff8 -+ andi tmp1, tmp1, 0x8 -+ masknez tmp1, tmp1, tmp2 -+ bnez tmp1, strncmp_byte_loop -+ addi.d src1, src1, -8 -+ addi.d src2, src2, -8 -+ ldx.d data1, src1, limit -+ ldx.d data2, src2, limit -+ sub.d tmp1, data1, zeroones -+ or tmp2, data1, sevenf -+ xor diff, data1, data2 -+ andn has_nul, tmp1, tmp2 -+ or syndrome, diff, has_nul -+ bnez syndrome, strncmp_not_limit -+ -+strncmp_ret0: -+ move result, zero -+ jr ra -+ -+/* check if ((src1 != 0) && ((src2 == 0 ) || (src1 < src2))) -+ then exchange(src1,src2). 
*/ -+ -+END(STRNCMP) -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (STRNCMP) -+#endif -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp.c b/sysdeps/loongarch/lp64/multiarch/strncmp.c -new file mode 100644 -index 00000000..80ab8c8c ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp.c -@@ -0,0 +1,35 @@ -+/* Multiple versions of strncmp. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define strncmp __redirect_strncmp -+# include -+# undef strncmp -+ -+# define SYMBOL_NAME strncmp -+# include "ifunc-lsx.h" -+ -+libc_ifunc_redirected (__redirect_strncmp, strncmp, IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (strncmp, __GI_strncmp, __redirect_strncmp) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S -new file mode 100644 -index 00000000..503442b3 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S -@@ -0,0 +1,8 @@ -+ -+#if IS_IN (libc) -+ -+#define STRNLEN __strnlen_aligned -+ -+#endif -+ -+#include "../strnlen.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -new file mode 100644 -index 00000000..8c30f10c ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -@@ -0,0 +1,92 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRNLEN __strnlen_lasx -+ -+/* size_t strnlen (const char *s1, size_t maxlen); */ -+ -+LEAF(STRNLEN) -+ .align 6 -+ beqz a1, L(ret0) -+ andi t1, a0, 0x3f -+ li.d t3, 65 -+ sub.d a2, a0, t1 -+ -+ xvld $xr0, a2, 0 -+ xvld $xr1, a2, 32 -+ sub.d t1, t3, t1 -+ move a3, a0 -+ -+ sltu t1, a1, t1 -+ xvmsknz.b $xr0, $xr0 -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr2, $xr0, 4 -+ -+ xvpickve.w $xr3, $xr1, 4 -+ vilvl.h $vr0, $vr2, $vr0 -+ vilvl.h $vr1, $vr3, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ -+ -+ movfr2gr.d t0, $f0 -+ sra.d t0, t0, a0 -+ orn t1, t1, t0 -+ bnez t1, L(end) -+ -+ add.d a4, a0, a1 -+ move a0, a2 -+ addi.d a4, a4, -1 -+ bstrins.d a4, zero, 5, 0 -+ -+L(loop): -+ xvld $xr0, a0, 64 -+ xvld $xr1, a0, 96 -+ addi.d a0, a0, 64 -+ beq a0, a4, L(out) -+ -+ xvmin.bu $xr2, $xr0, $xr1 -+ xvsetanyeqz.b $fcc0, $xr2 -+ bceqz $fcc0, L(loop) -+L(out): -+ xvmsknz.b $xr0, $xr0 -+ -+ -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr2, $xr0, 4 -+ xvpickve.w $xr3, $xr1, 4 -+ vilvl.h $vr0, $vr2, $vr0 -+ -+ vilvl.h $vr1, $vr3, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+L(end): -+ sub.d a0, a0, a3 -+ -+ cto.d t0, t0 -+ add.d a0, a0, t0 -+ sltu t1, a0, a1 -+ masknez t0, a1, t1 -+ 
-+ maskeqz t1, a0, t1 -+ or a0, t0, t1 -+ jr ra -+L(ret0): -+ move a0, zero -+ -+ -+ jr ra -+END(STRNLEN) -+ -+#ifdef _LIBC -+libc_hidden_def (STRNLEN) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -new file mode 100644 -index 00000000..388c239a ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -@@ -0,0 +1,81 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRNLEN __strnlen_lsx -+ -+/* size_t strnlen (const char *s1, size_t maxlen); */ -+ -+LEAF(STRNLEN) -+ .align 6 -+ beqz a1, L(ret0) -+ andi t1, a0, 0x1f -+ li.d t3, 33 -+ sub.d a2, a0, t1 -+ -+ vld $vr0, a2, 0 -+ vld $vr1, a2, 16 -+ sub.d t1, t3, t1 -+ move a3, a0 -+ -+ sltu t1, a1, t1 -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ -+ movfr2gr.s t0, $f0 -+ sra.w t0, t0, a0 -+ orn t1, t1, t0 -+ bnez t1, L(end) -+ -+ -+ add.d a4, a0, a1 -+ move a0, a2 -+ addi.d a4, a4, -1 -+ bstrins.d a4, zero, 4, 0 -+ -+L(loop): -+ vld $vr0, a0, 32 -+ vld $vr1, a0, 48 -+ addi.d a0, a0, 32 -+ beq a0, a4, L(out) -+ -+ vmin.bu $vr2, $vr0, $vr1 -+ vsetanyeqz.b $fcc0, $vr2 -+ bceqz $fcc0, L(loop) -+L(out): -+ vmsknz.b $vr0, $vr0 -+ -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+L(end): -+ sub.d a0, a0, a3 -+ -+ -+ cto.w t0, t0 -+ add.d a0, a0, t0 -+ sltu t1, a0, a1 -+ masknez t0, a1, t1 -+ -+ maskeqz t1, a0, t1 -+ or a0, t0, t1 -+ jr ra -+L(ret0): -+ move a0, zero -+ -+ jr ra -+END(STRNLEN) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRNLEN) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -new file mode 100644 -index 00000000..60eccf00 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -@@ -0,0 +1,145 @@ -+/* Copyright 2016 Loongson Technology Corporation Limited. */ -+ -+/* Author: Songyuekun songyuekun@loongson.cn -+ * ISA: MIPS64R2 -+ * ABI: N64. -+ * algorithm: -+ #. use ld/ldr to access word/partial word in the string -+ #. use (x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) != 0 to -+ judge if x has zero byte -+ #. use dctz((x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) >> 3 -+ to get the index of first rightmost zero byte in dword x; -+ #. use dctz(x) = 64 - dclz(~x & (x-1)); -+ #. use pointer to the last non zero byte minus pointer to the start -+ of the string to get the length of string. */ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define L_ADDIU addi.d -+#define L_ADDU add.d -+#define L_SUBU sub.d -+ -+#define STRNLEN __strnlen_unaligned -+ -+/* rd <- if rc then ra else rb -+ will destroy t6. 
*/ -+ -+#define CONDITIONSEL(rd,ra,rb,rc)\ -+ masknez a5, rb, rc;\ -+ maskeqz rd, ra, rc;\ -+ or rd, rd, a5 -+ -+/* Parameters and Results */ -+#define srcin a0 -+#define limit a1 -+#define len v0 -+ -+/* Internal variable */ -+#define data1 t0 -+#define data2 t1 -+#define has_nul1 t2 -+#define has_nul2 t3 -+#define src t4 -+#define zeroones t5 -+#define sevenf t6 -+#define data2a t7 -+#define tmp6 t7 -+#define pos t8 -+#define tmp1 a2 -+#define tmp2 a3 -+#define tmp3 a4 -+#define tmp4 a5 -+#define tmp5 a6 -+#define limit_wd a7 -+ -+/* size_t strnlen (const char *s1,size_t maxlen); */ -+ -+LEAF(STRNLEN) -+ -+ .align 4 -+ beqz limit, L(_hit_limit) -+ lu12i.w zeroones, 0x01010 -+ lu12i.w sevenf, 0x7f7f7 -+ ori zeroones, zeroones, 0x101 -+ ori sevenf, sevenf, 0xf7f -+ bstrins.d zeroones, zeroones, 63, 32 -+ bstrins.d sevenf, sevenf, 63, 32 -+ andi tmp1, srcin, 15 -+ sub.d src, srcin, tmp1 -+ bnez tmp1, L(misaligned) -+ addi.d limit_wd, limit, -1 -+ srli.d limit_wd, limit_wd, 4 -+L(_loop): -+ ld.d data1, src, 0 -+ ld.d data2, src, 8 -+ addi.d src, src, 16 -+L(_realigned): -+ sub.d tmp1, data1, zeroones -+ or tmp2, data1, sevenf -+ sub.d tmp3, data2, zeroones -+ or tmp4, data2, sevenf -+ andn has_nul1, tmp1, tmp2 -+ andn has_nul2, tmp3, tmp4 -+ addi.d limit_wd, limit_wd, -1 -+ srli.d tmp1, limit_wd, 63 -+ or tmp2, has_nul1, has_nul2 -+ or tmp3, tmp1, tmp2 -+ beqz tmp3, L(_loop) -+ beqz tmp2, L(_hit_limit) -+ sub.d len, src, srcin -+ beqz has_nul1, L(_nul_in_data2) -+ move has_nul2, has_nul1 -+ addi.d len, len, -8 -+L(_nul_in_data2): -+ ctz.d pos, has_nul2 -+ srli.d pos, pos, 3 -+ addi.d len, len, -8 -+ add.d len, len, pos -+ sltu tmp1, len, limit -+ CONDITIONSEL(len,len,limit,tmp1) -+ jr ra -+ -+ -+L(misaligned): -+ addi.d limit_wd, limit, -1 -+ sub.d tmp4, zero, tmp1 -+ andi tmp3, limit_wd, 15 -+ srli.d limit_wd, limit_wd, 4 -+ li.d tmp5, -1 -+ ld.d data1, src, 0 -+ ld.d data2, src, 8 -+ addi.d src, src, 16 -+ slli.d tmp4, tmp4, 3 -+ add.d tmp3, tmp3, tmp1 -+ srl.d tmp2, tmp5, tmp4 -+ srli.d tmp3, tmp3, 4 -+ add.d limit_wd, limit_wd, tmp3 -+ or data1, data1, tmp2 -+ or data2a, data2, tmp2 -+ li.w tmp3, 9 -+ sltu tmp1, tmp1, tmp3 -+ CONDITIONSEL(data1,data1,tmp5,tmp1) -+ CONDITIONSEL(data2,data2,data2a,tmp1) -+ b L(_realigned) -+ -+ -+L(_hit_limit): -+ move len, limit -+ jr ra -+END(STRNLEN) -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (STRNLEN) -+#endif -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen.c b/sysdeps/loongarch/lp64/multiarch/strnlen.c -new file mode 100644 -index 00000000..6fc406d2 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen.c -@@ -0,0 +1,40 @@ -+/* Multiple versions of strnlen. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define strnlen __redirect_strnlen -+# define __strnlen __redirect___strnlen -+# include -+# undef __strnlen -+# undef strnlen -+ -+# define SYMBOL_NAME strnlen -+# include "ifunc-lasx.h" -+ -+libc_ifunc_redirected (__redirect_strnlen, __strnlen, IFUNC_SELECTOR ()); -+weak_alias (__strnlen, strnlen); -+# ifdef SHARED -+__hidden_ver1 (__strnlen, __GI___strnlen, __redirect___strnlen) -+ __attribute__((visibility ("hidden"))); -+__hidden_ver1 (strnlen, __GI_strnlen, __redirect_strnlen) -+ __attribute__((weak, visibility ("hidden"))); -+# endif -+#endif -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S -new file mode 100644 -index 00000000..a58ddde8 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S -@@ -0,0 +1,12 @@ -+ -+#if IS_IN (libc) -+ -+#define STRRCHR_NAME __strrchr_aligned -+ -+#endif -+ -+#include "../strrchr.S" -+ -+#undef rindex -+weak_alias(STRRCHR_NAME, rindex) -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -new file mode 100644 -index 00000000..6f7a5618 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -@@ -0,0 +1,113 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRRCHR __strrchr_lasx -+ -+LEAF(STRRCHR) -+ .align 6 -+ andi t1, a0, 0x3f -+ bstrins.d a0, zero, 5, 0 -+ xvld $xr0, a0, 0 -+ xvld $xr1, a0, 32 -+ -+ li.d t2, -1 -+ xvreplgr2vr.b $xr4, a1 -+ move a2, zero -+ sll.d t3, t2, t1 -+ -+ addi.d a0, a0, 63 -+ xvseq.b $xr2, $xr0, $xr4 -+ xvseq.b $xr3, $xr1, $xr4 -+ xvmsknz.b $xr0, $xr0 -+ -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr5, $xr0, 4 -+ xvpickve.w $xr6, $xr1, 4 -+ vilvl.h $vr0, $vr5, $vr0 -+ -+ -+ vilvl.h $vr1, $vr6, $vr1 -+ xvmsknz.b $xr2, $xr2 -+ xvmsknz.b $xr3, $xr3 -+ xvpickve.w $xr5, $xr2, 4 -+ -+ xvpickve.w $xr6, $xr3, 4 -+ vilvl.h $vr2, $vr5, $vr2 -+ vilvl.h $vr3, $vr6, $vr3 -+ vilvl.w $vr0, $vr1, $vr0 -+ -+ vilvl.w $vr1, $vr3, $vr2 -+ movfr2gr.d t0, $f0 -+ movfr2gr.d t1, $f1 -+ orn t0, t0, t3 -+ -+ and t1, t1, t3 -+ bne t0, t2, L(end) -+L(loop): -+ xvld $xr0, a0, 1 -+ xvld $xr1, a0, 33 -+ -+ -+ clz.d t0, t1 -+ sub.d t0, a0, t0 -+ addi.d a0, a0, 64 -+ maskeqz t0, t0, t1 -+ -+ masknez t1, a2, t1 -+ or a2, t0, t1 -+ xvseq.b $xr2, $xr0, $xr4 -+ xvseq.b $xr3, $xr1, $xr4 -+ -+ xvmsknz.b $xr2, $xr2 -+ xvmsknz.b $xr3, $xr3 -+ xvpickve.w $xr5, $xr2, 4 -+ xvpickve.w $xr6, $xr3, 4 -+ -+ vilvl.h $vr2, $vr5, $vr2 -+ vilvl.h $vr3, $vr6, $vr3 -+ xvmin.bu $xr5, $xr0, $xr1 -+ vilvl.w $vr2, $vr3, $vr2 -+ -+ -+ xvsetanyeqz.b $fcc0, $xr5 -+ movfr2gr.d t1, $f2 -+ bceqz $fcc0, L(loop) -+ xvmsknz.b $xr0, $xr0 -+ -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr5, $xr0, 4 -+ xvpickve.w $xr6, $xr1, 4 -+ vilvl.h $vr0, $vr5, $vr0 -+ -+ vilvl.h $vr1, $vr6, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+L(end): -+ slli.d t3, t2, 1 # shift one more for the last '\0' -+ -+ cto.d t0, t0 -+ sll.d t3, t3, t0 -+ andn t1, t1, t3 -+ clz.d t0, t1 -+ -+ sub.d a0, a0, t0 -+ maskeqz t0, a0, t1 -+ masknez t1, a2, t1 -+ or a0, t0, t1 -+ -+ jr ra -+END(STRRCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def(STRRCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S -new file mode 100644 -index 00000000..e9228a2e ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S 
-@@ -0,0 +1,93 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRRCHR __strrchr_lsx -+ -+LEAF(STRRCHR) -+ .align 6 -+ andi t1, a0, 0x1f -+ bstrins.d a0, zero, 4, 0 -+ vld $vr0, a0, 0 -+ vld $vr1, a0, 16 -+ -+ vreplgr2vr.b $vr4, a1 -+ li.d t2, -1 -+ move a2, zero -+ addi.d a0, a0, 31 -+ -+ vseq.b $vr2, $vr0, $vr4 -+ vseq.b $vr3, $vr1, $vr4 -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ -+ vmsknz.b $vr2, $vr2 -+ vmsknz.b $vr3, $vr3 -+ vilvl.h $vr0, $vr1, $vr0 -+ vilvl.h $vr1, $vr3, $vr2 -+ -+ -+ movfr2gr.s t0, $f0 -+ sll.d t3, t2, t1 -+ movfr2gr.s t1, $f1 -+ orn t0, t0, t3 -+ -+ and t1, t1, t3 -+ bne t0, t2, L(end) -+L(loop): -+ vld $vr0, a0, 1 -+ vld $vr1, a0, 17 -+ -+ clz.w t0, t1 -+ sub.d t0, a0, t0 -+ addi.d a0, a0, 32 -+ maskeqz t0, t0, t1 -+ -+ masknez t1, a2, t1 -+ or a2, t0, t1 -+ vseq.b $vr2, $vr0, $vr4 -+ vseq.b $vr3, $vr1, $vr4 -+ -+ -+ vmsknz.b $vr2, $vr2 -+ vmsknz.b $vr3, $vr3 -+ vmin.bu $vr5, $vr0, $vr1 -+ vilvl.h $vr2, $vr3, $vr2 -+ -+ vsetanyeqz.b $fcc0, $vr5 -+ movfr2gr.s t1, $f2 -+ bceqz $fcc0, L(loop) -+ vmsknz.b $vr0, $vr0 -+ -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+L(end): -+ slli.d t3, t2, 1 # shift one more for the last '\0' -+ -+ cto.w t0, t0 -+ sll.d t3, t3, t0 -+ andn t1, t1, t3 -+ clz.w t0, t1 -+ -+ -+ sub.d a0, a0, t0 -+ maskeqz t0, a0, t1 -+ masknez t1, a2, t1 -+ or a0, t0, t1 -+ -+ jr ra -+END(STRRCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def(STRRCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr.c b/sysdeps/loongarch/lp64/multiarch/strrchr.c -new file mode 100644 -index 00000000..32eb6ea6 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr.c -@@ -0,0 +1,39 @@ -+/* Multiple versions of strrchr. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. 
*/ -+#if IS_IN (libc) -+# define strrchr __redirect_strrchr -+# include -+# undef strrchr -+ -+# define SYMBOL_NAME strrchr -+# include "ifunc-memchr.h" -+ -+libc_ifunc_redirected (__redirect_strrchr, __new_strrchr, -+ IFUNC_SELECTOR ()); -+weak_alias(__new_strrchr, rindex) -+# ifdef SHARED -+__hidden_ver1 (__new_strrchr, __GI_strrchr, __redirect_strrchr) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_strrchr, strrchr, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/rawmemchr.S b/sysdeps/loongarch/lp64/rawmemchr.S -new file mode 100644 -index 00000000..94b70f2d ---- /dev/null -+++ b/sysdeps/loongarch/lp64/rawmemchr.S -@@ -0,0 +1,114 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef RAWMEMCHR_NAME -+# define RAWMEMCHR_NAME __rawmemchr -+#endif -+ -+ -+LEAF(RAWMEMCHR_NAME) -+ .align 6 -+ andi t1, a0, 0x7 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ bstrins.d a1, a1, 15, 8 -+ -+ ld.d t0, a0, 0 -+ slli.d t1, t1, 3 -+ ori a2, a2, 0x101 -+ bstrins.d a1, a1, 31, 16 -+ -+ li.w t8, -1 -+ bstrins.d a1, a1, 63, 32 -+ bstrins.d a2, a2, 63, 32 -+ sll.d t2, t8, t1 -+ -+ sll.d t3, a1, t1 -+ orn t0, t0, t2 -+ slli.d a3, a2, 7 -+ beqz a1, L(find_zero) -+ -+ xor t0, t0, t3 -+ sub.d t1, t0, a2 -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ -+ bnez t3, L(count_pos) -+ addi.d a0, a0, 8 -+ -+L(loop): -+ ld.d t0, a0, 0 -+ xor t0, t0, a1 -+ -+ sub.d t1, t0, a2 -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ bnez t3, L(count_pos) -+ -+ ld.d t0, a0, 8 -+ addi.d a0, a0, 16 -+ xor t0, t0, a1 -+ sub.d t1, t0, a2 -+ -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ beqz t3, L(loop) -+ addi.d a0, a0, -8 -+L(count_pos): -+ ctz.d t0, t3 -+ srli.d t0, t0, 3 -+ add.d a0, a0, t0 -+ jr ra -+ -+L(loop_7bit): -+ ld.d t0, a0, 0 -+L(find_zero): -+ sub.d t1, t0, a2 -+ and t2, t1, a3 -+ bnez t2, L(more_check) -+ -+ ld.d t0, a0, 8 -+ addi.d a0, a0, 16 -+ sub.d t1, t0, a2 -+ and t2, t1, a3 -+ -+ beqz t2, L(loop_7bit) -+ addi.d a0, a0, -8 -+ -+L(more_check): -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ bnez t3, L(count_pos) -+ addi.d a0, a0, 8 -+ -+L(loop_8bit): -+ ld.d t0, a0, 0 -+ -+ sub.d t1, t0, a2 -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ bnez t3, L(count_pos) -+ -+ ld.d t0, a0, 8 -+ addi.d a0, a0, 16 -+ sub.d t1, t0, a2 -+ -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ beqz t3, L(loop_8bit) -+ -+ addi.d a0, a0, -8 -+ b L(count_pos) -+ -+END(RAWMEMCHR_NAME) -+ -+#ifdef _LIBC -+weak_alias (__rawmemchr, rawmemchr) -+libc_hidden_builtin_def (__rawmemchr) -+#endif -diff --git a/sysdeps/loongarch/lp64/s_cosf.S b/sysdeps/loongarch/lp64/s_cosf.S -new file mode 100644 -index 00000000..5bfabefb ---- /dev/null -+++ b/sysdeps/loongarch/lp64/s_cosf.S -@@ -0,0 +1,409 @@ -+#include -+#include -+#include -+ -+/* Short algorithm description: -+ * -+ * 1) if |x|==0: sin(x)=x, -+ * cos(x)=1. -+ * 2) if |x|<2^-27: sin(x)=x-x*DP_SMALL, raising underflow only when needed, -+ * cos(x)=1-|x|. -+ * 3) if |x|<2^-5 : sin(x)=x+x*x^2*DP_SIN2_0+x^5*DP_SIN2_1, -+ * cos(x)=1+1*x^2*DP_COS2_0+x^5*DP_COS2_1 -+ * 4) if |x|< Pi/4: sin(x)=x+x*x^2*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))), -+ * cos(x)=1+1*x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). -+ * 5) if |x| < 9*Pi/4: -+ * 5.1) Range reduction: -+ * k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, t=|x|-j*Pi/4. 
-+ * 5.2) Reconstruction: -+ * sign_sin = sign(x) * (-1.0)^(( n >>2)&1) -+ * sign_cos = (-1.0)^(((n+2)>>2)&1) -+ * poly_sin = ((((S4*t^2 + S3)*t^2 + S2)*t^2 + S1)*t^2 + S0)*t^2*t+t -+ * poly_cos = ((((C4*t^2 + C3)*t^2 + C2)*t^2 + C1)*t^2 + C0)*t^2*s+s -+ * if(n&2 != 0) { -+ * using cos(t) and sin(t) polynomials for |t|= 2^23, very large args: -+ * 7.1) Range reduction: -+ * k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4. -+ * 7.2) Reconstruction same as (5.2). -+ * 8) if x is Inf, return x-x, and set errno=EDOM. -+ * 9) if x is NaN, return x-x. -+ * -+ * Special cases: -+ * sin/cos(+-0) = +-0/1 not raising inexact/underflow, -+ * sin/cos(subnormal) raises inexact/underflow, -+ * sin/cos(min_normalized) raises inexact/underflow, -+ * sin/cos(normalized) raises inexact, -+ * sin/cos(Inf) = NaN, raises invalid, sets errno to EDOM, -+ * sin/cos(NaN) = NaN. -+ */ -+ -+#define COSF __cosf -+ -+#define LOADFD(rd, rs, label) \ -+ la.local rs, label;\ -+ fld.d rd, rs, 0 -+ -+#define LOADFS(rd, rs, label) \ -+ la.local rs, label;\ -+ fld.s rd, rs, 0 -+ -+#define FTOL(rd, rs, tmp) \ -+ ftintrz.l.d tmp, rs;\ -+ movfr2gr.d rd, tmp -+ -+#define FTOW(rd, rs, tmp) \ -+ ftintrz.w.d tmp, rs;\ -+ movfr2gr.s rd, tmp -+ -+#define WTOF(rd, rs, tmp) \ -+ movgr2fr.w tmp, rs;\ -+ ffint.d.w rd, tmp -+ -+#define LTOF(rd, rs, tmp) \ -+ movgr2fr.d tmp, rs;\ -+ ffint.d.l rd, tmp -+ -+LEAF(COSF) -+ .align 2 -+ .align 3 -+ /* fa0 is SP x; fa1 is DP x */ -+ movfr2gr.s t0, fa0 /* Bits of x */ -+ fcvt.d.s fa1, fa0 /* DP x */ -+ li.w t1, 0x7fffffff -+ and t0, t0, t1 /* |x| */ -+ li.w t1, 0x3f490fdb /* const Pi/4 */ -+ bltu t0, t1, L(arg_less_pio4) /* |x| < Pi/4 branch */ -+ li.w t1, 0x40e231d6 /* 9*Pi/4 */ -+ la.local t4, L(DP_) /*DP_ base addr*/ -+ bgeu t0, t1, L(greater_or_equal_9pio4) /* |x| >= 9*Pi/4 branch */ -+/* L(median_args): */ -+ /* Here if Pi/4<=|x|<9*Pi/4 */ -+ fabs.d fa0, fa1 /* DP |x| */ -+ fld.d fa1, t4, 56 /* 4/Pi */ -+ fmul.d fa1, fa1, fa0 /* DP |x|/(Pi/4) */ -+ FTOW( t0, fa1, fa1 ) /* k=trunc(|x|/(Pi/4)) */ -+ la.local t1, L(PIO2J) /* base addr of PIO2J table */ -+ addi.w t0, t0, 1 /* k+1 */ -+ bstrpick.d t2, t0, 3, 1 /* j=n/2 */ -+ alsl.d t1, t2, t1, 3 -+ fld.d fa1, t1, 0 /* j*Pi/2 */ -+ addi.w t0, t0, 2 /* n = k+3 */ -+ fsub.d fa0, fa0, fa1 /* t = |x| - j * Pi/2 */ -+/* Input: t0=n fa0=t*/ -+L(reduced): -+ /* Here if cos(x) calculated using cos(t) polynomial for |t|>2)&1) -+ * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))) -+ -+ * Here if cos(x) calculated using sin(t) polynomial for |t|>2)&1) -+ * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))) -+ */ -+ /* TODO: what is the best order ??? 
*/ -+ /* load-to-use latency, hardware module usage, integer pipeline & float pipeline */ -+ /* cancel branch */ -+ slli.w t0, t0, 1 /* (n << 1) */ -+ andi t1, t0, 4 /* (n << 1) & 4 */ -+ alsl.d t2, t1, t4, 4 /* adjust to DP_C or DP_S */ -+ fld.d fa3, t2, 32 /* C4 */ -+ andi t0, t0, 8 /* =====> (n << 1) & 8 */ -+ fmul.d fa1, fa0, fa0 /* y=x^2 */ -+ fld.d fa4, t2, 16 /* C2 */ -+ fmul.d fa2, fa1, fa1 /* z=x^4 */ -+ fld.d fa5, t2, 24 /* C3 */ -+ la.local t3, L(DP_ONES) /* =====> DP_ONES */ -+ fld.d fa6, t2, 8 /* C1 */ -+ fmadd.d fa4, fa2, fa3, fa4 /* cx = C2+z*C4 */ -+ fld.d fa3, t2, 0 /* C0 */ -+ fmadd.d fa5, fa2, fa5, fa6 /* cy = C1+z*C3 */ -+ fld.d fa6, t3, 0 /* one */ -+ fmadd.d fa4, fa2, fa4, fa3 /* cx = C0+z*cx */ -+ add.d t0, t0, t3 /* =====> addr */ -+ fmadd.d fa4, fa1, fa5, fa4 /* cx = cx+y*cy */ -+ fld.d fa2, t0, 0 /* sign */ -+ fmadd.d fa4, fa4, fa1, fa6 /* 1.0+y*cx */ -+ fmul.d fa1, fa2, fa4 /* sign * cx */ -+ bnez t1, L_return -+ fmul.d fa1, fa1, fa0 /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */ -+L_return: -+ fcvt.s.d fa0, fa1 /* SP result */ -+ jr ra -+ -+L(greater_or_equal_9pio4): -+ /* Here if |x|>=9*Pi/4 */ -+ li.w t1, 0x7f800000 /* x is Inf or NaN? */ -+ bgeu t0, t1, L(inf_or_nan) /* |x| >= Inf branch */ -+ /* Here if finite |x|>=9*Pi/4 */ -+ li.w t1, 0x4b000000 /* 2^23 */ -+ bgeu t0, t1, L(greater_or_equal_2p23) /* |x| >= 2^23 branch */ -+ /* Here if 9*Pi/4<=|x|<2^23 */ -+ fabs.d fa0, fa1 /* DP |x| */ -+ fld.d fa1, t4, 56 -+ fmul.d fa1, fa1, fa0 /* |x|/(Pi/4) */ -+ FTOW( t0, fa1, fa1 ) /* k=trunc(|x|/(Pi/4)) */ -+ addi.w t0, t0, 1 /* k+1 */ -+ srli.w t1, t0, 1 /* x=n/2 */ -+ WTOF( fa1, t1, fa1 ) /* DP x */ -+ fld.d fa2, t4, 104 /* -PIO2HI = high part of -Pi/2 */ -+ fld.d fa3, t4, 112 /* -PIO2LO = low part of -Pi/2 */ -+ fmadd.d fa0, fa2, fa1, fa0 /* |x| - x*PIO2HI */ -+ addi.w t0, t0, 2 /* n = k+3 */ -+ fmadd.d fa0, fa3, fa1, fa0 /* |x| - x*PIO2HI - x*PIO2LO */ -+ b L(reduced) -+ -+L(greater_or_equal_2p23): -+ /* Here if finite |x|>=2^23 */ -+ fabs.s fa5, fa0 /* SP |x| */ -+ /* bitpos = (ix>>23) - BIAS_32; */ -+ srli.w t0, t0, 23 /*TODO???srai.w eb = biased exponent of x */ -+ /* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */ -+ addi.w t0, t0, -124 /* t0 = bitpos */ -+ /* t3= j = bitpos/28 */ -+ /* x/28 = (x * ((0x100000000 / 28) + 1)) >> 32 */ -+ li.w t1, 0x924924a -+ mulh.wu t0, t1, t0 -+ fcvt.d.s fa5, fa5 /* Convert to double */ -+ /* TODO: what is the best order ??? */ -+ la.local t1, L(invpio4_table) /* t2 */ -+ alsl.d t1, t0, t1, 3 -+ fld.d fa0, t1, 0 /* invpio4_table[j] */ -+ fld.d fa1, t1, 8 /* invpio4_table[j+1] */ -+ fmul.d fa0, fa0, fa5 /* a = invpio4_table[j]*|x| */ -+ fld.d fa2, t1, 16 /* invpio4_table[j+2] */ -+ fmul.d fa1, fa1, fa5 /* b = invpio4_table[j+1]*|x| */ -+ fld.d fa3, t1, 24 /* invpio4_table[j+3] */ -+ fmul.d fa2, fa2, fa5 /* c = invpio4_table[j+2]*|x| */ -+ fmul.d fa3, fa3, fa5 /* d = invpio4_table[j+3]*|x| */ -+/*TODO: overflow check*/ -+ FTOL( t0, fa0, fa4 ) /*uint64_t l = a; TODO: change the order*/ -+ li.w t1, -8 /* 0xfffffffffffffff8 */ -+ and t0, t0, t1 /* l &= ~0x7; */ -+ LTOF( fa4, t0, fa4 ) /* DP l*/ -+ fsub.d fa0, fa0, fa4 /* a -= l; */ -+ fadd.d fa4, fa0, fa1 /* fa4 double e = a + b; */ -+/*TODO: overflow check*/ -+ FTOL( t0, fa4, fa4 ) /*uint64_t l = e;*/ -+ andi t2, t0, 1 /* l & 1 TODO: change the order*/ -+ LOADFD( fa5, t1, L(DP_ONES) ) /* fa5 = 1.0 */ -+ LTOF( fa4, t0, fa4 ) /* fa4 DP l*/ -+/* critical!!!! 
the order */ -+ fsub.d fa0, fa0, fa4 -+ fld.d fa4, t4, 120 /* PI_4 */ -+ beqz t2, L_even_integer -+/*L_odd_integer:*/ -+ fsub.d fa0, fa0, fa5 -+ fadd.d fa0, fa0, fa1 -+ fadd.d fa2, fa2, fa3 -+ fadd.d fa0, fa0, fa2 -+ addi.d t0, t0, 3 -+ fmul.d fa0, fa0, fa4 -+ b L(reduced) -+L_even_integer: -+ fadd.d fa0, fa0, fa1 -+ fadd.d fa2, fa2, fa3 -+ fadd.d fa0, fa0, fa2 -+ fcmp.sle.d $fcc0, fa0, fa5 -+ addi.d t0, t0, 3 -+ bcnez $fcc0, L_leq_one -+/*L_gt_one:*/ -+ fld.d fa2, t1, 16 /* 2.0 */ -+ addi.d t0, t0, 1 -+ fsub.d fa0, fa0, fa2 -+L_leq_one: -+ fmul.d fa0, fa0, fa4 -+ b L(reduced) -+ -+L(arg_less_pio4): -+ /* Here if |x| -+#include -+#include -+ -+/* Short algorithm description: -+ * -+ * 1) if |x|==0: sin(x)=x, -+ * cos(x)=1. -+ * 2) if |x|<2^-27: sin(x)=x-x*DP_SMALL, raising underflow only when needed, -+ * cos(x)=1-|x|. -+ * 3) if |x|<2^-5 : sin(x)=x+x*x^2*DP_SIN2_0+x^5*DP_SIN2_1, -+ * cos(x)=1+1*x^2*DP_COS2_0+x^5*DP_COS2_1 -+ * 4) if |x|< Pi/4: sin(x)=x+x*x^2*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))), -+ * cos(x)=1+1*x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). -+ * 5) if |x| < 9*Pi/4: -+ * 5.1) Range reduction: -+ * k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, t=|x|-j*Pi/4. -+ * 5.2) Reconstruction: -+ * sign_sin = sign(x) * (-1.0)^(( n >>2)&1) -+ * sign_cos = (-1.0)^(((n+2)>>2)&1) -+ * poly_sin = ((((S4*t^2 + S3)*t^2 + S2)*t^2 + S1)*t^2 + S0)*t^2*t+t -+ * poly_cos = ((((C4*t^2 + C3)*t^2 + C2)*t^2 + C1)*t^2 + C0)*t^2*s+s -+ * if(n&2 != 0) { -+ * using cos(t) and sin(t) polynomials for |t|= 2^23, very large args: -+ * 7.1) Range reduction: -+ * k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4. -+ * 7.2) Reconstruction same as (5.2). -+ * 8) if x is Inf, return x-x, and set errno=EDOM. -+ * 9) if x is NaN, return x-x. -+ * -+ * Special cases: -+ * sin/cos(+-0) = +-0/1 not raising inexact/underflow, -+ * sin/cos(subnormal) raises inexact/underflow, -+ * sin/cos(min_normalized) raises inexact/underflow, -+ * sin/cos(normalized) raises inexact, -+ * sin/cos(Inf) = NaN, raises invalid, sets errno to EDOM, -+ * sin/cos(NaN) = NaN. 
-+ */ -+ -+#define SINF __sinf -+ -+#define LOADFD(rd, rs, label) \ -+ la.local rs, label;\ -+ fld.d rd, rs, 0 -+ -+#define LOADFS(rd, rs, label) \ -+ la.local rs, label;\ -+ fld.s rd, rs, 0 -+ -+#define FTOL(rd, rs, tmp) \ -+ ftintrz.l.d tmp, rs;\ -+ movfr2gr.d rd, tmp -+ -+#define FTOW(rd, rs, tmp) \ -+ ftintrz.w.d tmp, rs;\ -+ movfr2gr.s rd, tmp -+ -+#define WTOF(rd, rs, tmp) \ -+ movgr2fr.w tmp, rs;\ -+ ffint.d.w rd, tmp -+ -+#define LTOF(rd, rs, tmp) \ -+ movgr2fr.d tmp, rs;\ -+ ffint.d.l rd, tmp -+ -+LEAF(SINF) -+ .align 2 -+ .align 3 -+ /* fa0 is SP x; fa1 is DP x */ -+ movfr2gr.s t2, fa0 /* Bits of x */ -+ fcvt.d.s fa1, fa0 /* DP x */ -+ li.w t1, 0x7fffffff -+ and t0, t2, t1 /* |x| */ -+ li.w t1, 0x3f490fdb /* const Pi/4 */ -+ bltu t0, t1, L(arg_less_pio4) /* |x| < Pi/4 branch */ -+ li.w t1, 0x40e231d6 /* 9*Pi/4 */ -+ la.local t4, L(DP_) /*DP_ base addr*/ -+ bstrpick.d t5, t2, 31, 31 /* sign of x */ -+ slli.w t5, t5, 3 -+ bgeu t0, t1, L(greater_or_equal_9pio4) /* |x| >= 9*Pi/4 branch */ -+/* L(median_args): */ -+ /* Here if Pi/4<=|x|<9*Pi/4 */ -+ fabs.d fa0, fa1 /* DP |x| */ -+ fld.d fa1, t4, 56 /* 4/Pi */ -+ fmul.d fa1, fa1, fa0 /* DP |x|/(Pi/4) */ -+ FTOW( t0, fa1, fa1 ) /* k=trunc(|x|/(Pi/4)) */ -+ la.local t1, L(PIO2J) /* base addr of PIO2J table */ -+ addi.w t0, t0, 1 /* k+1 */ -+ bstrpick.d t2, t0, 3, 1 /* j=n/2 */ -+ alsl.d t1, t2, t1, 3 -+ fld.d fa1, t1, 0 /* j*Pi/2 */ -+ fsub.d fa0, fa0, fa1 /* t = |x| - j * Pi/2 */ -+/* Input: t0=n fa0=t*/ -+/* Input: t0=n fa0=t, t5=sign(x) */ -+L(reduced): -+ /* Here if cos(x) calculated using cos(t) polynomial for |t|>2)&1) -+ * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))) -+ -+ * Here if cos(x) calculated using sin(t) polynomial for |t|>2)&1) -+ * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))) -+ */ -+ /* TODO: what is the best order ??? */ -+ /* load-to-use latency, hardware module usage, integer pipeline & float pipeline */ -+ /* cancel branch */ -+ slli.w t0, t0, 1 /* (n << 1) */ -+ andi t1, t0, 4 /* (n << 1) & 4 */ -+ alsl.d t2, t1, t4, 4 /* adjust to DP_C or DP_S */ -+ fld.d fa3, t2, 32 /* C4 */ -+ andi t0, t0, 8 /* =====> (n << 1) & 8 */ -+ fmul.d fa1, fa0, fa0 /* y=x^2 */ -+ xor t0, t0, t5 /* (-1.0)^((n>>2)&1) XOR sign(x) */ -+ fld.d fa4, t2, 16 /* C2 */ -+ fmul.d fa2, fa1, fa1 /* z=x^4 */ -+ fld.d fa5, t2, 24 /* C3 */ -+ la.local t3, L(DP_ONES) /* =====> DP_ONES */ -+ fld.d fa6, t2, 8 /* C1 */ -+ fmadd.d fa4, fa2, fa3, fa4 /* cx = C2+z*C4 */ -+ fld.d fa3, t2, 0 /* C0 */ -+ fmadd.d fa5, fa2, fa5, fa6 /* cy = C1+z*C3 */ -+ fld.d fa6, t3, 0 /* 1.0 */ -+ fmadd.d fa4, fa2, fa4, fa3 /* cx = C0+z*cx */ -+ add.d t0, t0, t3 /* =====> addr */ -+ fmadd.d fa4, fa1, fa5, fa4 /* cx = cx+y*cy */ -+ fld.d fa2, t0, 0 /* sign */ -+ fmadd.d fa4, fa4, fa1, fa6 /* 1.0+y*cx */ -+ fmul.d fa1, fa2, fa4 /* sign * cx */ -+ bnez t1, L_return -+ fmul.d fa1, fa1, fa0 /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */ -+L_return: -+ fcvt.s.d fa0, fa1 /* SP result */ -+ jr ra -+ -+L(greater_or_equal_9pio4): -+ /* Here if |x|>=9*Pi/4 */ -+ li.w t1, 0x7f800000 /* x is Inf or NaN? 
*/ -+ bgeu t0, t1, L(inf_or_nan) /* |x| >= Inf branch */ -+ /* Here if finite |x|>=9*Pi/4 */ -+ li.w t1, 0x4b000000 /* 2^23 */ -+ bgeu t0, t1, L(greater_or_equal_2p23) /* |x| >= 2^23 branch */ -+ /* Here if 9*Pi/4<=|x|<2^23 */ -+ fabs.d fa0, fa1 /* DP |x| */ -+ fld.d fa1, t4, 56 -+ fmul.d fa1, fa1, fa0 /* |x|/(Pi/4) */ -+ FTOW( t0, fa1, fa1 ) /* k=trunc(|x|/(Pi/4)) */ -+ addi.w t0, t0, 1 /* k+1 */ -+ srli.w t1, t0, 1 /* x=n/2 */ -+ WTOF( fa1, t1, fa1 ) /* DP x */ -+ fld.d fa2, t4, 104 /* -PIO2HI = high part of -Pi/2 */ -+ fld.d fa3, t4, 112 /* -PIO2LO = low part of -Pi/2 */ -+ fmadd.d fa0, fa2, fa1, fa0 /* |x| - x*PIO2HI */ -+ fmadd.d fa0, fa3, fa1, fa0 /* |x| - x*PIO2HI - x*PIO2LO */ -+ b L(reduced) -+ -+L(greater_or_equal_2p23): -+ /* Here if finite |x|>=2^23 */ -+ fabs.s fa5, fa0 /* SP |x| */ -+ /* bitpos = (ix>>23) - BIAS_32; */ -+ srli.w t0, t0, 23 /*TODO???srai.w eb = biased exponent of x */ -+ /* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */ -+ addi.w t0, t0, -124 /* t0 = bitpos */ -+ /* t3= j = bitpos/28 */ -+ /* x/28 = (x * ((0x100000000 / 28) + 1)) >> 32 */ -+ li.w t1, 0x924924a -+ mulh.wu t0, t1, t0 -+ fcvt.d.s fa5, fa5 /* Convert to double */ -+ /* TODO: what is the best order ??? */ -+ la.local t1, L(invpio4_table) /* t2 */ -+ alsl.d t1, t0, t1, 3 -+ fld.d fa0, t1, 0 /* invpio4_table[j] */ -+ fld.d fa1, t1, 8 /* invpio4_table[j+1] */ -+ fmul.d fa0, fa0, fa5 /* a = invpio4_table[j]*|x| */ -+ fld.d fa2, t1, 16 /* invpio4_table[j+2] */ -+ fmul.d fa1, fa1, fa5 /* b = invpio4_table[j+1]*|x| */ -+ fld.d fa3, t1, 24 /* invpio4_table[j+3] */ -+ fmul.d fa2, fa2, fa5 /* c = invpio4_table[j+2]*|x| */ -+ fmul.d fa3, fa3, fa5 /* d = invpio4_table[j+3]*|x| */ -+/*TODO: overflow check*/ -+ FTOL( t0, fa0, fa4 ) /*uint64_t l = a; TODO: change the order*/ -+ li.w t1, -8 /* 0xfffffffffffffff8 */ -+ and t0, t0, t1 /* l &= ~0x7; */ -+ LTOF( fa4, t0, fa4 ) /* DP l*/ -+ fsub.d fa0, fa0, fa4 /* a -= l; */ -+ fadd.d fa4, fa0, fa1 /* fa4 double e = a + b; */ -+/*TODO: overflow check*/ -+ FTOL( t0, fa4, fa4 ) /*uint64_t l = e;*/ -+ andi t2, t0, 1 /* l & 1 TODO: change the order*/ -+ LOADFD( fa5, t1, L(DP_ONES) ) /* fa5 = 1.0 */ -+ LTOF( fa4, t0, fa4 ) /* fa4 DP l*/ -+/* critical!!!! 
the order */ -+ fsub.d fa0, fa0, fa4 -+ fld.d fa4, t4, 120 /* PI_4 */ -+ beqz t2, L_even_integer -+/*L_odd_integer:*/ -+ fsub.d fa0, fa0, fa5 -+ fadd.d fa0, fa0, fa1 -+ fadd.d fa2, fa2, fa3 -+ fadd.d fa0, fa0, fa2 -+ addi.d t0, t0, 1 -+ fmul.d fa0, fa0, fa4 -+ b L(reduced) -+L_even_integer: -+ fadd.d fa0, fa0, fa1 -+ fadd.d fa2, fa2, fa3 -+ fadd.d fa0, fa0, fa2 -+ fcmp.sle.d $fcc0, fa0, fa5 -+ addi.d t0, t0, 1 -+ bcnez $fcc0, L_leq_one -+/*L_gt_one:*/ -+ fld.d fa2, t1, 16 /* 2.0 */ -+ addi.d t0, t0, 1 -+ fsub.d fa0, fa0, fa2 -+L_leq_one: -+ fmul.d fa0, fa0, fa4 -+ b L(reduced) -+ -+L(arg_less_pio4): -+ /* Here if |x| -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STPCPY_NAME -+#define STPCPY_NAME __stpcpy -+#endif -+ -+LEAF(STPCPY_NAME) -+ .align 6 -+ andi a3, a0, 0x7 -+ beqz a3, L(dest_align) -+ sub.d a5, a1, a3 -+ addi.d a5, a5, 8 -+ -+L(make_dest_align): -+ ld.b t0, a1, 0 -+ addi.d a1, a1, 1 -+ st.b t0, a0, 0 -+ addi.d a0, a0, 1 -+ -+ beqz t0, L(al_out) -+ bne a1, a5, L(make_dest_align) -+ -+L(dest_align): -+ andi a4, a1, 7 -+ bstrins.d a1, zero, 2, 0 -+ -+ lu12i.w t5, 0x1010 -+ ld.d t0, a1, 0 -+ ori t5, t5, 0x101 -+ bstrins.d t5, t5, 63, 32 -+ -+ slli.d t6, t5, 0x7 -+ bnez a4, L(unalign) -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ bnez t3, L(al_end) -+ -+L(al_loop): -+ st.d t0, a0, 0 -+ ld.d t0, a1, 8 -+ -+ addi.d a1, a1, 8 -+ addi.d a0, a0, 8 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ beqz t3, L(al_loop) -+ -+L(al_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest -+ -+ andi a3, t1, 8 -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+ -+L(al_end_8): -+ beqz a3, L(al_end_4) -+ st.d t0, a0, 0 -+ addi.d a0, a0, 7 -+ jr ra -+L(al_end_4): -+ beqz a4, L(al_end_2) -+ st.w t0, a0, 0 -+ addi.d a0, a0, 4 -+ srli.d t0, t0, 32 -+L(al_end_2): -+ beqz a5, L(al_end_1) -+ st.h t0, a0, 0 -+ addi.d a0, a0, 2 -+ srli.d t0, t0, 16 -+L(al_end_1): -+ beqz a6, L(al_out) -+ st.b t0, a0, 0 -+ addi.d a0, a0, 1 -+L(al_out): -+ addi.d a0, a0, -1 -+ jr ra -+ -+L(unalign): -+ slli.d a5, a4, 3 -+ li.d t1, -1 -+ sub.d a6, zero, a5 -+ -+ srl.d a7, t0, a5 -+ sll.d t7, t1, a6 -+ -+ or t0, a7, t7 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ and t3, t1, t2 -+ -+ bnez t3, L(un_end) -+ -+ ld.d t4, a1, 8 -+ addi.d a1, a1, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ bnez t3, L(un_end_with_remaining) -+ -+L(un_loop): -+ srl.d a7, t4, a5 -+ -+ ld.d t4, a1, 8 -+ addi.d a1, a1, 8 -+ -+ st.d t0, a0, 0 -+ addi.d a0, a0, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ beqz t3, L(un_loop) -+ -+L(un_end_with_remaining): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ sub.d t1, t1, a4 -+ -+ blt t1, zero, L(un_end_less_8) -+ st.d t0, a0, 0 -+ addi.d a0, a0, 8 -+ beqz t1, L(un_out) -+ srl.d t0, t4, a5 # get the remaining part -+ b L(un_end_less_8) -+ -+L(un_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ -+L(un_end_less_8): -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+L(un_end_4): -+ beqz a4, L(un_end_2) -+ st.w t0, a0, 0 -+ addi.d a0, a0, 4 -+ srli.d t0, t0, 32 -+L(un_end_2): -+ beqz a5, L(un_end_1) -+ st.h t0, a0, 0 -+ addi.d a0, a0, 2 -+ srli.d t0, t0, 16 -+L(un_end_1): -+ beqz a6, L(un_out) -+ st.b t0, a0, 0 -+ addi.d a0, a0, 1 -+L(un_out): -+ addi.d a0, a0, -1 -+ jr ra -+ -+END(STPCPY_NAME) -+ -+#ifdef _LIBC -+weak_alias (STPCPY_NAME, stpcpy) 
-+libc_hidden_builtin_def (STPCPY_NAME) -+#endif -diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S -new file mode 100644 -index 00000000..63454c17 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/strchr.S -@@ -0,0 +1,90 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRCHR_NAME -+#define STRCHR_NAME strchr -+#endif -+ -+/* char * strchr (const char *s1, int c); */ -+ -+LEAF(STRCHR_NAME) -+ .align 6 -+ slli.d t1, a0, 3 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ ld.d t2, a0, 0 -+ -+ ori a2, a2, 0x101 -+ andi a1, a1, 0xff -+ bstrins.d a2, a2, 63, 32 -+ li.w t0, -1 -+ -+ mul.d a1, a1, a2 # "cccccccc" -+ sll.d t0, t0, t1 -+ slli.d a3, a2, 7 # 0x8080808080808080 -+ orn t2, t2, t0 -+ -+ sll.d t3, a1, t1 -+ xor t4, t2, t3 -+ sub.d a7, t2, a2 -+ andn a6, a3, t2 -+ -+ -+ sub.d a5, t4, a2 -+ andn a4, a3, t4 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ -+ or t0, a6, a5 -+ bnez t0, L(_mc8_a) -+ addi.d a0, a0, 8 -+L(_aloop): -+ ld.d t4, a0, 0 -+ -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ -+ andn a4, a3, t2 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ -+ -+ bnez a7, L(_mc8_a) -+ ld.d t4, a0, 8 -+ addi.d a0, a0, 16 -+ xor t2, t4, a1 -+ -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ andn a4, a3, t2 -+ -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ beqz a7, L(_aloop) -+ -+ addi.d a0, a0, -8 -+ -+L(_mc8_a): -+ ctz.d t0, a5 -+ ctz.d t2, a6 -+ srli.w t0, t0, 3 -+ -+ -+ srli.w t2, t2, 3 -+ sltu t1, t2, t0 -+ add.d a0, a0, t0 -+ masknez a0, a0, t1 -+ -+ jr ra -+END(STRCHR_NAME) -diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S -new file mode 100644 -index 00000000..c4532e11 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/strchrnul.S -@@ -0,0 +1,95 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRCHRNUL_NAME -+#define STRCHRNUL_NAME __strchrnul -+#endif -+ -+/* char * strchrnul (const char *s1, int c); */ -+ -+LEAF(STRCHRNUL_NAME) -+ .align 6 -+ slli.d t1, a0, 3 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ ld.d t2, a0, 0 -+ -+ ori a2, a2, 0x101 -+ andi a1, a1, 0xff -+ bstrins.d a2, a2, 63, 32 -+ li.w t0, -1 -+ -+ mul.d a1, a1, a2 # "cccccccc" -+ sll.d t0, t0, t1 -+ slli.d a3, a2, 7 # 0x8080808080808080 -+ orn t2, t2, t0 -+ -+ sll.d t3, a1, t1 -+ xor t4, t2, t3 -+ sub.d a7, t2, a2 -+ andn a6, a3, t2 -+ -+ -+ sub.d a5, t4, a2 -+ andn a4, a3, t4 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ -+ or t0, a6, a5 -+ bnez t0, L(_mc8_a) -+ addi.d a0, a0, 8 -+L(_aloop): -+ ld.d t4, a0, 0 -+ -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ -+ andn a4, a3, t2 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ -+ -+ bnez a7, L(_mc8_a) -+ ld.d t4, a0, 8 -+ addi.d a0, a0, 16 -+ xor t2, t4, a1 -+ -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ andn a4, a3, t2 -+ -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ beqz a7, L(_aloop) -+ -+ addi.d a0, a0, -8 -+L(_mc8_a): -+ ctz.d t0, a5 -+ ctz.d t2, a6 -+ srli.w t0, t0, 3 -+ -+ srli.w t2, t2, 3 -+ slt t1, t0, t2 -+ masknez t3, t2, t1 -+ maskeqz t4, t0, t1 -+ -+ or t0, t3, t4 -+ add.d a0, a0, t0 -+ jr ra -+END(STRCHRNUL_NAME) -+ -+#ifdef _LIBC -+weak_alias(STRCHRNUL_NAME, strchrnul) -+libc_hidden_builtin_def (STRCHRNUL_NAME) -+#endif -diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S -new file mode 100644 -index 00000000..22c261a3 ---- /dev/null -+++ 
b/sysdeps/loongarch/lp64/strcmp.S -@@ -0,0 +1,228 @@ -+/* 2022\06\15 loongarch64 author: chenxiaolong. */ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRCMP_NAME -+#define STRCMP_NAME strcmp -+#endif -+ -+/* int strcmp (const char *s1, const char *s2); */ -+ -+/* Parameters and Results */ -+#define src1 a0 -+#define src2 a1 -+#define result v0 -+LEAF(STRCMP_NAME) -+ .align 6 -+ xor a4, src1, src2 -+ lu12i.w t5, 0x01010 -+ lu12i.w t6, 0x7f7f7 -+ andi a2, src1, 0x7 -+ -+ ori t5, t5, 0x101 -+ andi a4, a4, 0x7 -+ ori t6, t6, 0xf7f -+ bstrins.d t5, t5, 63, 32 -+ bstrins.d t6, t6, 63, 32 -+ -+ bnez a4, 3f // unaligned -+ beqz a2, 1f // loop aligned -+ -+// mutual aligned -+ bstrins.d src1, zero, 2, 0 -+ bstrins.d src2, zero, 2, 0 -+ slli.d a4, a2, 0x3 -+ ld.d t0, src1, 0 -+ -+ sub.d a4, zero, a4 -+ ld.d t1, src2, 0 -+ addi.d src1, src1, 8 -+ addi.d src2, src2, 8 -+ -+ nor a5, zero, zero -+ srl.d a5, a5, a4 -+ or t0, t0, a5 -+ -+ or t1, t1, a5 -+ b 2f //start realigned -+ -+// loop aligned -+1: -+ ld.d t0, src1, 0 -+ addi.d src1, src1, 8 -+ ld.d t1, src2, 0 -+ addi.d src2, src2, 8 -+ -+// start realigned: -+2: -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ beqz t2, 1b -+ -+ ctz.d t7, t2 -+ bstrins.d t7, zero, 2, 0 -+ srl.d t0, t0, t7 -+ srl.d t1, t1, t7 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ sub.d v0, t0, t1 -+ jr ra -+ -+// unaligned -+3: -+ andi a3, src2, 0x7 -+ slt a5, a2, a3 -+ masknez t8, a2, a5 -+ xor a6, src1, src2 -+ maskeqz a6, a6, t8 -+ xor src1, src1, a6 -+ xor src2, src2, a6 -+ -+ andi a2, src1, 0x7 -+ beqz a2, 4f // src1 is aligned -+ -+//strcmp_unaligned: -+ andi a3, src2, 0x7 -+ bstrins.d src1, zero, 2, 0 -+ bstrins.d src2, zero, 2, 0 -+ nor t3, zero, zero -+ -+ ld.d t0, src1, 0 -+ ld.d t1, src2, 0 -+ sub.d a2, a3, a2 -+ addi.d t2, zero, 8 -+ -+ sub.d a5, t2, a2 -+ sub.d a6, t2, a3 -+ slli.d a5, a5, 0x3 -+ slli.d a6, a6, 0x3 -+ -+ srl.d t4, t3, a6 -+ srl.d a4, t3, a5 -+ rotr.d a7, t0, a5 -+ -+ addi.d src2, src2, 8 -+ addi.d src1, src1, 8 -+ or t1, t1, t4 -+ or t0, a7, t4 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ bnez t2, 7f -+ -+ and a7, a7, a4 -+ slli.d a6, a2, 0x3 -+ nor a4, zero, a4 -+ b 5f -+ -+// src1 is aligned -+4: -+ andi a3, src2, 0x7 -+ ld.d t0, src1, 0 -+ -+ bstrins.d src2, zero, 2, 0 -+ nor t2, zero, zero -+ ld.d t1, src2, 0 -+ -+ addi.d t3, zero, 0x8 -+ sub.d a5, t3, a3 -+ slli.d a5, a5, 0x3 -+ srl.d a4, t2, a5 -+ rotr.d t4, t0, a5 -+ -+ addi.d src2, src2, 8 -+ addi.d src1, src1, 8 -+ or t1, t1, a4 -+ or t0, t4, a4 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ -+ bnez t2, 7f -+ -+ and a7, t4, a4 -+ slli.d a6, a3, 0x3 -+ nor a4, zero, a4 -+ -+// unaligned loop -+// a7: remaining number -+// a6: shift left number -+// a5: shift right number -+// a4: mask for checking remaining number -+5: -+ or t0, a7, a4 -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ bnez t2, 6f -+ -+ ld.d t0, src1, 0 -+ addi.d src1, src1, 8 -+ ld.d t1, src2, 0 -+ addi.d src2, src2, 8 -+ -+ srl.d t7, t0, a5 -+ sll.d t0, t0, a6 -+ or t0, a7, t0 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ bnez t2, 7f -+ -+ or a7, t7, zero -+ b 5b -+ -+6: -+ ld.bu t1, src2, 0 -+ andi t0, a7, 0xff -+ xor t2, t0, t1 -+ srli.d a7, a7, 0x8 -+ masknez t2, t0, t2 -+ addi.d src2, src2, 1 -+ beqz t2, 8f -+ b 6b -+ -+7: -+ ctz.d t7, t2 -+ bstrins.d 
t7, zero, 2, 0 -+ srl.d t0, t0, t7 -+ srl.d t1, t1, t7 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ -+8: -+ sub.d a4, t0, t1 -+ sub.d a5, t1, t0 -+ maskeqz a6, a5, t8 -+ masknez result, a4, t8 -+ or result, result, a6 -+ jr ra -+ -+END(STRCMP_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCMP_NAME) -+#endif -+ -diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S -new file mode 100644 -index 00000000..c6fe74cb ---- /dev/null -+++ b/sysdeps/loongarch/lp64/strcpy.S -@@ -0,0 +1,174 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRCPY -+#define STRCPY strcpy -+#endif -+ -+LEAF(STRCPY) -+ .align 6 -+ andi a3, a0, 0x7 -+ move a2, a0 -+ beqz a3, L(dest_align) -+ sub.d a5, a1, a3 -+ addi.d a5, a5, 8 -+ -+L(make_dest_align): -+ ld.b t0, a1, 0 -+ addi.d a1, a1, 1 -+ st.b t0, a2, 0 -+ beqz t0, L(al_out) -+ -+ addi.d a2, a2, 1 -+ bne a1, a5, L(make_dest_align) -+ -+L(dest_align): -+ andi a4, a1, 7 -+ bstrins.d a1, zero, 2, 0 -+ -+ lu12i.w t5, 0x1010 -+ ld.d t0, a1, 0 -+ ori t5, t5, 0x101 -+ bstrins.d t5, t5, 63, 32 -+ -+ slli.d t6, t5, 0x7 -+ bnez a4, L(unalign) -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ bnez t3, L(al_end) -+ -+L(al_loop): -+ st.d t0, a2, 0 -+ ld.d t0, a1, 8 -+ -+ addi.d a1, a1, 8 -+ addi.d a2, a2, 8 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ beqz t3, L(al_loop) -+ -+L(al_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest -+ -+ andi a3, t1, 8 -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+ -+L(al_end_8): -+ beqz a3, L(al_end_4) -+ st.d t0, a2, 0 -+ jr ra -+L(al_end_4): -+ beqz a4, L(al_end_2) -+ st.w t0, a2, 0 -+ addi.d a2, a2, 4 -+ srli.d t0, t0, 32 -+L(al_end_2): -+ beqz a5, L(al_end_1) -+ st.h t0, a2, 0 -+ addi.d a2, a2, 2 -+ srli.d t0, t0, 16 -+L(al_end_1): -+ beqz a6, L(al_out) -+ st.b t0, a2, 0 -+L(al_out): -+ jr ra -+ -+L(unalign): -+ slli.d a5, a4, 3 -+ li.d t1, -1 -+ sub.d a6, zero, a5 -+ -+ srl.d a7, t0, a5 -+ sll.d t7, t1, a6 -+ -+ or t0, a7, t7 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ and t3, t1, t2 -+ -+ bnez t3, L(un_end) -+ -+ ld.d t4, a1, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ bnez t3, L(un_end_with_remaining) -+ -+L(un_loop): -+ srl.d a7, t4, a5 -+ -+ ld.d t4, a1, 16 -+ addi.d a1, a1, 8 -+ -+ st.d t0, a2, 0 -+ addi.d a2, a2, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ beqz t3, L(un_loop) -+ -+L(un_end_with_remaining): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ sub.d t1, t1, a4 -+ -+ blt t1, zero, L(un_end_less_8) -+ st.d t0, a2, 0 -+ addi.d a2, a2, 8 -+ beqz t1, L(un_out) -+ srl.d t0, t4, a5 # get the remaining part -+ b L(un_end_less_8) -+ -+L(un_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ -+L(un_end_less_8): -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+L(un_end_4): -+ beqz a4, L(un_end_2) -+ st.w t0, a2, 0 -+ addi.d a2, a2, 4 -+ srli.d t0, t0, 32 -+L(un_end_2): -+ beqz a5, L(un_end_1) -+ st.h t0, a2, 0 -+ addi.d a2, a2, 2 -+ srli.d t0, t0, 16 -+L(un_end_1): -+ beqz a6, L(un_out) -+ st.b t0, a2, 0 -+L(un_out): -+ jr ra -+ -+END(STRCPY) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCPY) -+#endif -diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S -new file mode 100644 -index 00000000..dd5a8da3 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/strlen.S -@@ -0,0 +1,86 @@ 
-+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRLEN -+#define STRLEN strlen -+#endif -+ -+LEAF(STRLEN) -+ .align 6 -+ move a1, a0 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ li.w t0, -1 -+ -+ ld.d t2, a0, 0 -+ andi t1, a1, 0x7 -+ ori a2, a2, 0x101 -+ slli.d t1, t1, 3 -+ -+ bstrins.d a2, a2, 63, 32 -+ sll.d t1, t0, t1 -+ slli.d t3, a2, 7 -+ nor a3, zero, t3 -+ -+ orn t2, t2, t1 -+ sub.d t0, t2, a2 -+ nor t1, t2, a3 -+ and t0, t0, t1 -+ -+ -+ bnez t0, L(count_pos) -+ addi.d a0, a0, 8 -+L(loop_16_7bit): -+ ld.d t2, a0, 0 -+ sub.d t1, t2, a2 -+ -+ and t0, t1, t3 -+ bnez t0, L(more_check) -+ ld.d t2, a0, 8 -+ addi.d a0, a0, 16 -+ -+ sub.d t1, t2, a2 -+ and t0, t1, t3 -+ beqz t0, L(loop_16_7bit) -+ addi.d a0, a0, -8 -+L(more_check): -+ nor t0, t2, a3 -+ -+ and t0, t1, t0 -+ bnez t0, L(count_pos) -+ addi.d a0, a0, 8 -+L(loop_16_8bit): -+ ld.d t2, a0, 0 -+ -+ sub.d t1, t2, a2 -+ nor t0, t2, a3 -+ and t0, t0, t1 -+ bnez t0, L(count_pos) -+ -+ ld.d t2, a0, 8 -+ addi.d a0, a0, 16 -+ sub.d t1, t2, a2 -+ nor t0, t2, a3 -+ -+ and t0, t0, t1 -+ beqz t0, L(loop_16_8bit) -+ addi.d a0, a0, -8 -+L(count_pos): -+ ctz.d t1, t0 -+ sub.d a0, a0, a1 -+ -+ srli.d t1, t1, 3 -+ add.d a0, a0, t1 -+ jr ra -+ -+END(STRLEN) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRLEN) -+#endif -diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S -new file mode 100644 -index 00000000..dcb15350 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/strncmp.S -@@ -0,0 +1,257 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRNCMP -+#define STRNCMP strncmp -+#endif -+ -+/* int strncmp (const char *s1, const char *s2); */ -+ -+LEAF(STRNCMP) -+ .align 6 -+ beqz a2, L(ret0) -+ xor a4, a0, a1 -+ lu12i.w t5, 0x01010 -+ lu12i.w t6, 0x7f7f7 -+ -+ andi a3, a0, 0x7 -+ ori t5, t5, 0x101 -+ andi a4, a4, 0x7 -+ ori t6, t6, 0xf7f -+ -+ bstrins.d t5, t5, 63, 32 -+ bstrins.d t6, t6, 63, 32 -+ -+ bnez a4, L(unalign) -+ bnez a3, L(mutual_align) -+ -+L(a_loop): -+ ld.d t0, a0, 0 -+ ld.d t1, a1, 0 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ -+ -+ sltui t7, a2, 9 -+ -+L(start_realign): -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ xor t4, t0, t1 -+ -+ and t2, t2, t3 -+ addi.d a2, a2, -8 -+ -+ or t2, t2, t4 -+ or t3, t2, t7 -+ beqz t3, L(a_loop) -+ -+L(end): -+ bge zero, t7, L(out) -+ andi t4, a2, 7 -+ li.d t3, -1 -+ addi.d t4, t4, -1 -+ slli.d t4, t4, 3 -+ sll.d t3, t3, t4 -+ or t2, t2, t3 -+ -+ -+L(out): -+ ctz.d t3, t2 -+ bstrins.d t3, zero, 2, 0 -+ srl.d t0, t0, t3 -+ srl.d t1, t1, t3 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ sub.d a0, t0, t1 -+ jr ra -+ -+L(mutual_align): -+ bstrins.d a0, zero, 2, 0 -+ bstrins.d a1, zero, 2, 0 -+ slli.d a5, a3, 0x3 -+ li.d t2, -1 -+ -+ ld.d t0, a0, 0 -+ ld.d t1, a1, 0 -+ -+ li.d t3, 9 -+ sll.d t2, t2, a5 -+ -+ sub.d t3, t3, a3 -+ addi.d a0, a0, 8 -+ -+ sltu t7, a2, t3 -+ addi.d a1, a1, 8 -+ -+ add.d a2, a2, a3 -+ orn t0, t0, t2 -+ orn t1, t1, t2 -+ b L(start_realign) -+ -+L(ret0): -+ move a0, zero -+ jr ra -+ -+L(unalign): -+ li.d t8, 8 -+ blt a2, t8, L(short_cmp) -+ -+ # swap a0 and a1 in case a3 > a4 -+ andi a4, a1, 0x7 -+ sltu t8, a4, a3 -+ xor a6, a0, a1 -+ maskeqz a6, a6, t8 -+ xor a0, a0, a6 -+ xor a1, a1, a6 -+ -+ andi a3, a0, 0x7 -+ andi a4, a1, 0x7 -+ -+ bstrins.d a0, zero, 2, 0 -+ bstrins.d a1, zero, 2, 0 -+ -+ li.d t2, -1 -+ li.d t3, 9 -+ -+ ld.d t0, a0, 0 -+ ld.d t1, a1, 0 -+ -+ sub.d t3, t3, a4 -+ sub.d a3, a4, a3 -+ -+ slli.d t4, a4, 3 -+ slli.d a6, a3, 3 -+ -+ 
sub.d a5, zero, a6 -+ sltu t7, a2, t3 -+ -+ rotr.d a7, t0, a5 -+ sll.d t4, t2, t4 # mask for first num -+ -+ add.d a2, a2, a4 -+ sll.d a4, t2, a6 # mask for a7 -+ -+ orn t0, a7, t4 -+ orn t1, t1, t4 -+ -+ sub.d t2, t0, t5 -+ nor t4, t0, t6 -+ and t2, t2, t4 -+ -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ -+ or t3, t2, t7 -+ bnez t3, L(un_end) -+ -+ andn a7, a7, a4 -+ addi.d a3, a3, 1 -+ -+L(un_loop): -+ addi.d a2, a2, -8 -+ # in case remaining part has '\0', no more load instructions should be executed on a0 address -+ or t0, a7, a4 -+ sltu t7, a2, a3 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ -+ or t3, t2, t7 -+ bnez t3, L(check_remaining) -+ -+ ld.d t7, a0, 8 -+ ld.d t1, a1, 8 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ -+ sll.d t4, t7, a6 -+ sub.d t2, t1, t5 -+ nor t3, t1, t6 -+ -+ or t0, t4, a7 -+ srl.d a7, t7, a5 -+ -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ -+ sltui t7, a2, 9 -+ or t2, t2, t3 -+ -+ or t3, t2, t7 -+ beqz t3, L(un_loop) -+ b L(un_end) -+ -+L(check_remaining): -+ ld.d t1, a1, 8 -+ xor t3, t1, a7 -+ or t2, t2, t3 -+ -+L(un_end): -+ bge zero, t7, L(un_out) -+ andi t4, a2, 7 -+ li.d t3, -1 -+ -+ addi.d t4, t4, -1 -+ slli.d t4, t4, 3 -+ sll.d t3, t3, t4 -+ or t2, t2, t3 -+ -+L(un_out): -+ ctz.d t3, t2 -+ bstrins.d t3, zero, 2, 0 -+ srl.d t0, t0, t3 -+ srl.d t1, t1, t3 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ -+ sub.d a4, t0, t1 -+ sub.d a5, t1, t0 -+ -+ maskeqz a6, a5, t8 -+ masknez a0, a4, t8 -+ -+ or a0, a0, a6 -+ jr ra -+ -+L(short_cmp): -+ ld.bu t0, a0, 0 -+ ld.bu t1, a1, 0 -+ addi.d a2, a2, -1 -+ -+ xor t2, t0, t1 -+ masknez t2, t0, t2 -+ maskeqz t2, a2, t2 -+ -+ beqz t2, L(short_out) -+ -+ ld.bu t0, a0, 1 -+ ld.bu t1, a1, 1 -+ -+ addi.d a2, a2, -1 -+ addi.d a0, a0, 2 -+ -+ addi.d a1, a1, 2 -+ xor t2, t0, t1 -+ masknez t2, t0, t2 -+ maskeqz t2, a2, t2 -+ -+ bnez t2, L(short_cmp) -+ -+L(short_out): -+ sub.d a0, t0, t1 -+ jr ra -+ -+END(STRNCMP) -+#ifdef _LIBC -+libc_hidden_builtin_def (STRNCMP) -+#endif -diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S -new file mode 100644 -index 00000000..0517e206 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/strnlen.S -@@ -0,0 +1,83 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRNLEN -+#define STRNLEN __strnlen -+#endif -+ -+#. before every load, a1(t5) must > 0; -+#. first load with t1 != 0, need to adjust t5; -+#. 
return the less one of both strlen(s) and a1; -+ -+LEAF(STRNLEN) -+ .align 6 -+ beqz a1, L(out) -+ lu12i.w a2, 0x01010 -+ andi t1, a0, 0x7 -+ move t4, a0 -+ -+ bstrins.d a0, zero, 2, 0 -+ ori a2, a2, 0x101 -+ li.w t0, -1 -+ ld.d t2, a0, 0 -+ -+ slli.d t3, t1, 3 -+ bstrins.d a2, a2, 63, 32 -+ li.w t5, 8 -+ slli.d a3, a2, 7 -+ -+ sub.w t1, t5, t1 -+ sll.d t0, t0, t3 -+ nor a3, zero, a3 -+ orn t2, t2, t0 -+ -+ -+ sub.d t0, t2, a2 -+ nor t3, t2, a3 -+ and t0, t0, t3 -+ bnez t0, L(count_pos) -+ -+ sub.d t5, a1, t1 -+ bgeu t1, a1, L(out) -+L(loop_8bytes): -+ ld.d t2, a0, 8 -+ addi.d a0, a0, 8 -+ -+ sub.d t0, t2, a2 -+ nor t1, t2, a3 -+ sltui t6, t5, 9 -+ and t0, t0, t1 -+ -+ addi.d t5, t5, -8 -+ or t7, t0, t6 -+ beqz t7, L(loop_8bytes) -+L(count_pos): -+ ctz.d t1, t0 -+ -+ -+ sub.d a0, a0, t4 -+ srli.d t1, t1, 3 -+ add.d a0, t1, a0 -+ sltu t0, a0, a1 -+ -+ masknez t1, a1, t0 -+ maskeqz a0, a0, t0 -+ or a0, a0, t1 -+ jr ra -+ -+L(out): -+ move a0, a1 -+ jr ra -+ -+END(STRNLEN) -+ -+#ifdef _LIBC -+weak_alias (STRNLEN, strnlen) -+libc_hidden_builtin_def (STRNLEN) -+#endif -diff --git a/sysdeps/loongarch/lp64/strrchr.S b/sysdeps/loongarch/lp64/strrchr.S -new file mode 100644 -index 00000000..3bf92ecd ---- /dev/null -+++ b/sysdeps/loongarch/lp64/strrchr.S -@@ -0,0 +1,106 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRRCHR_NAME -+#define STRRCHR_NAME strrchr -+#endif -+ -+LEAF(STRRCHR_NAME) -+ .align 6 -+ slli.d t1, a0, 3 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ ld.d t2, a0, 0 // t2 = "5ZZ21abc" -+ -+ ori a2, a2, 0x101 -+ andi a1, a1, 0xff // a1 = "0000000Z" -+ li.d a5, -1 -+ bstrins.d a2, a2, 63, 32 // a2 = 0x0101010101010101 -+ -+ sll.d t1, a5, t1 // t1 = 0xffffffffff000000 -+ mul.d a1, a1, a2 // a1 = "ZZZZZZZZ" -+ orn t2, t2, t1 // t2 = "5ZZ21YYY" -+ slli.d a3, a2, 7 // a3 = 0x8080808080808080 -+ -+ sub.d a4, t2, a2 -+ andn t0, a3, t2 -+ move t3, zero -+ and t0, a4, t0 -+ -+ -+ xor a4, t2, a1 -+ move t5, zero -+ orn a4, a4, t1 -+ bnez t0, L(found_end) -+ -+ sub.d t1, a4, a2 -+ andn t0, a3, a4 -+ and t1, t1, t0 -+ -+L(loop_8bytes): -+ masknez t4, t3, t1 -+ -+ maskeqz t3, t2, t1 -+ ld.d t2, a0, 8 -+ masknez t0, t5, t1 -+ maskeqz t5, a0, t1 -+ -+ or t3, t3, t4 -+ or t5, t0, t5 -+ sub.d t0, t2, a2 -+ andn t1, a3, t2 -+ -+ -+ xor a4, t2, a1 -+ and t0, t0, t1 //t0 hold diff pattern for '\0' -+ sub.d t1, a4, a2 -+ andn t4, a3, a4 -+ -+ and t1, t1, t4 //t1 hold diff pattern for 'a1' -+ addi.d a0, a0, 8 -+ beqz t0, L(loop_8bytes) //ok, neither \0 nor found -+L(found_end): -+ ctz.d t1, t0 -+ -+ xor t3, t3, a1 -+ orn t1, zero, t1 -+ revb.d t3, t3 -+ srl.d t1, a5, t1 // mask for '\0' -+ -+ sub.d t4, t3, a2 -+ orn a4, a4, t1 -+ andn t3, a3, t3 -+ revb.d t2, a4 -+ -+ sub.d t0, t2, a2 -+ andn t1, a3, t2 -+ and t3, t3, t4 -+ and t1, t0, t1 -+ -+ li.d t7, 7 -+ masknez t4, t3, t1 -+ maskeqz t3, t1, t1 -+ masknez t5, t5, t1 -+ -+ or t3, t3, t4 -+ maskeqz t6, a0, t1 -+ ctz.d t0, t3 -+ or t5, t6, t5 -+ -+ srli.d t0, t0, 3 -+ sub.d t0, t7, t0 -+ add.d a0, t5, t0 -+ maskeqz a0, a0, t3 -+ -+ jr ra -+END(STRRCHR_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def(STRRCHR_NAME) -+#endif -diff --git a/sysdeps/loongarch/lstat.c b/sysdeps/loongarch/lstat.c -new file mode 100644 -index 00000000..f47a56af ---- /dev/null -+++ b/sysdeps/loongarch/lstat.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/lstat64.c b/sysdeps/loongarch/lstat64.c -new file mode 100644 -index 00000000..d6811656 ---- /dev/null -+++ b/sysdeps/loongarch/lstat64.c -@@ -0,0 +1 @@ 
-+#include -diff --git a/sysdeps/loongarch/machine-gmon.h b/sysdeps/loongarch/machine-gmon.h -new file mode 100644 -index 00000000..0b49082d ---- /dev/null -+++ b/sysdeps/loongarch/machine-gmon.h -@@ -0,0 +1,37 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Accept 'frompc' address as argument from the function that calls -+ _mcount for profiling. Use __builtin_return_address (0) -+ for the 'selfpc' address. */ -+ -+#include -+ -+static void mcount_internal (unsigned long int frompc, -+ unsigned long int selfpc); -+ -+#define _MCOUNT_DECL(frompc, selfpc) \ -+static inline void mcount_internal (unsigned long int frompc, \ -+unsigned long int selfpc) -+ -+#define MCOUNT \ -+void _mcount (void *frompc) \ -+{ \ -+ mcount_internal ((unsigned long int) frompc, \ -+ (unsigned long int) RETURN_ADDRESS (0)); \ -+} -diff --git a/sysdeps/loongarch/math_private.h b/sysdeps/loongarch/math_private.h -new file mode 100644 -index 00000000..140eef07 ---- /dev/null -+++ b/sysdeps/loongarch/math_private.h -@@ -0,0 +1,245 @@ -+/* Internal math stuff. LOONGARCH version. -+ Copyright (C) 2013-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef LOONGARCH_MATH_PRIVATE_H -+#define LOONGARCH_MATH_PRIVATE_H 1 -+ -+/* Inline functions to speed up the math library implementation. The -+ default versions of these routines are in generic/math_private.h -+ and call fesetround, feholdexcept, etc. These routines use inlined -+ code instead. */ -+ -+#ifdef __loongarch_hard_float -+ -+# include -+# include -+# include -+ -+# define _FPU_MASK_ALL (_FPU_MASK_V | _FPU_MASK_Z | _FPU_MASK_O \ -+ |_FPU_MASK_U | _FPU_MASK_I | FE_ALL_EXCEPT) -+ -+static __always_inline void -+libc_feholdexcept_loongarch (fenv_t *envp) -+{ -+ fpu_control_t cw; -+ -+ /* Save the current state. */ -+ _FPU_GETCW (cw); -+ envp->__fp_control_register = cw; -+ -+ /* Clear all exception enable bits and flags. 
*/ -+ cw &= ~(_FPU_MASK_ALL); -+ _FPU_SETCW (cw); -+} -+# define libc_feholdexcept libc_feholdexcept_loongarch -+# define libc_feholdexceptf libc_feholdexcept_loongarch -+# define libc_feholdexceptl libc_feholdexcept_loongarch -+ -+static __always_inline void -+libc_fesetround_loongarch (int round) -+{ -+ fpu_control_t cw; -+ -+ /* Get current state. */ -+ _FPU_GETCW (cw); -+ -+ /* Set rounding bits. */ -+ cw &= ~_FPU_RC_MASK; -+ cw |= round; -+ -+ /* Set new state. */ -+ _FPU_SETCW (cw); -+} -+# define libc_fesetround libc_fesetround_loongarch -+# define libc_fesetroundf libc_fesetround_loongarch -+# define libc_fesetroundl libc_fesetround_loongarch -+ -+static __always_inline void -+libc_feholdexcept_setround_loongarch (fenv_t *envp, int round) -+{ -+ fpu_control_t cw; -+ -+ /* Save the current state. */ -+ _FPU_GETCW (cw); -+ envp->__fp_control_register = cw; -+ -+ /* Clear all exception enable bits and flags. */ -+ cw &= ~(_FPU_MASK_ALL); -+ -+ /* Set rounding bits. */ -+ cw &= ~_FPU_RC_MASK; -+ cw |= round; -+ -+ /* Set new state. */ -+ _FPU_SETCW (cw); -+} -+# define libc_feholdexcept_setround libc_feholdexcept_setround_loongarch -+# define libc_feholdexcept_setroundf libc_feholdexcept_setround_loongarch -+# define libc_feholdexcept_setroundl libc_feholdexcept_setround_loongarch -+ -+# define libc_feholdsetround libc_feholdexcept_setround_loongarch -+# define libc_feholdsetroundf libc_feholdexcept_setround_loongarch -+# define libc_feholdsetroundl libc_feholdexcept_setround_loongarch -+ -+static __always_inline void -+libc_fesetenv_loongarch (fenv_t *envp) -+{ -+ fpu_control_t cw __attribute__ ((unused)); -+ -+ /* Read current state to flush fpu pipeline. */ -+ _FPU_GETCW (cw); -+ -+ _FPU_SETCW (envp->__fp_control_register); -+} -+# define libc_fesetenv libc_fesetenv_loongarch -+# define libc_fesetenvf libc_fesetenv_loongarch -+# define libc_fesetenvl libc_fesetenv_loongarch -+ -+static __always_inline int -+libc_feupdateenv_test_loongarch (fenv_t *envp, int excepts) -+{ -+ /* int ret = fetestexcept (excepts); feupdateenv (envp); return ret; */ -+ int cw, temp; -+ -+ /* Get current control word. */ -+ _FPU_GETCW (cw); -+ -+ /* Set flag bits (which are accumulative), and *also* set the -+ cause bits. The setting of the cause bits is what actually causes -+ the hardware to generate the exception, if the corresponding enable -+ bit is set as well. */ -+ temp = cw & FE_ALL_EXCEPT; -+ temp |= envp->__fp_control_register | (temp << CAUSE_SHIFT); -+ -+ /* Set new state. */ -+ _FPU_SETCW (temp); -+ -+ return cw & excepts & FE_ALL_EXCEPT; -+} -+# define libc_feupdateenv_test libc_feupdateenv_test_loongarch -+# define libc_feupdateenv_testf libc_feupdateenv_test_loongarch -+# define libc_feupdateenv_testl libc_feupdateenv_test_loongarch -+ -+static __always_inline void -+libc_feupdateenv_loongarch (fenv_t *envp) -+{ -+ libc_feupdateenv_test_loongarch (envp, 0); -+} -+# define libc_feupdateenv libc_feupdateenv_loongarch -+# define libc_feupdateenvf libc_feupdateenv_loongarch -+# define libc_feupdateenvl libc_feupdateenv_loongarch -+ -+# define libc_feresetround libc_feupdateenv_loongarch -+# define libc_feresetroundf libc_feupdateenv_loongarch -+# define libc_feresetroundl libc_feupdateenv_loongarch -+ -+static __always_inline int -+libc_fetestexcept_loongarch (int excepts) -+{ -+ int cw; -+ -+ /* Get current control word. 
*/ -+ _FPU_GETCW (cw); -+ -+ return cw & excepts & FE_ALL_EXCEPT; -+} -+# define libc_fetestexcept libc_fetestexcept_loongarch -+# define libc_fetestexceptf libc_fetestexcept_loongarch -+# define libc_fetestexceptl libc_fetestexcept_loongarch -+ -+/* Enable support for rounding mode context. */ -+# define HAVE_RM_CTX 1 -+ -+static __always_inline void -+libc_feholdexcept_setround_loongarch_ctx (struct rm_ctx *ctx, int round) -+{ -+ fpu_control_t old, new; -+ -+ /* Save the current state. */ -+ _FPU_GETCW (old); -+ ctx->env.__fp_control_register = old; -+ -+ /* Clear all exception enable bits and flags. */ -+ new = old & ~(_FPU_MASK_ALL); -+ -+ /* Set rounding bits. */ -+ new = (new & ~_FPU_RC_MASK) | round; -+ -+ if (__glibc_unlikely (new != old)) -+ { -+ _FPU_SETCW (new); -+ ctx->updated_status = true; -+ } -+ else -+ ctx->updated_status = false; -+} -+# define libc_feholdexcept_setround_ctx libc_feholdexcept_setround_loongarch_ctx -+# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_loongarch_ctx -+# define libc_feholdexcept_setroundl_ctx libc_feholdexcept_setround_loongarch_ctx -+ -+static __always_inline void -+libc_fesetenv_loongarch_ctx (struct rm_ctx *ctx) -+{ -+ libc_fesetenv_loongarch (&ctx->env); -+} -+# define libc_fesetenv_ctx libc_fesetenv_loongarch_ctx -+# define libc_fesetenvf_ctx libc_fesetenv_loongarch_ctx -+# define libc_fesetenvl_ctx libc_fesetenv_loongarch_ctx -+ -+static __always_inline void -+libc_feupdateenv_loongarch_ctx (struct rm_ctx *ctx) -+{ -+ if (__glibc_unlikely (ctx->updated_status)) -+ libc_feupdateenv_test_loongarch (&ctx->env, 0); -+} -+# define libc_feupdateenv_ctx libc_feupdateenv_loongarch_ctx -+# define libc_feupdateenvf_ctx libc_feupdateenv_loongarch_ctx -+# define libc_feupdateenvl_ctx libc_feupdateenv_loongarch_ctx -+# define libc_feresetround_ctx libc_feupdateenv_loongarch_ctx -+# define libc_feresetroundf_ctx libc_feupdateenv_loongarch_ctx -+# define libc_feresetroundl_ctx libc_feupdateenv_loongarch_ctx -+ -+static __always_inline void -+libc_feholdsetround_loongarch_ctx (struct rm_ctx *ctx, int round) -+{ -+ fpu_control_t old, new; -+ -+ /* Save the current state. */ -+ _FPU_GETCW (old); -+ ctx->env.__fp_control_register = old; -+ -+ /* Set rounding bits. */ -+ new = (old & ~_FPU_RC_MASK) | round; -+ -+ if (__glibc_unlikely (new != old)) -+ { -+ _FPU_SETCW (new); -+ ctx->updated_status = true; -+ } -+ else -+ ctx->updated_status = false; -+} -+# define libc_feholdsetround_ctx libc_feholdsetround_loongarch_ctx -+# define libc_feholdsetroundf_ctx libc_feholdsetround_loongarch_ctx -+# define libc_feholdsetroundl_ctx libc_feholdsetround_loongarch_ctx -+ -+#endif -+ -+#include_next -+ -+#endif -diff --git a/sysdeps/loongarch/memusage.h b/sysdeps/loongarch/memusage.h -new file mode 100644 -index 00000000..bdf99f8a ---- /dev/null -+++ b/sysdeps/loongarch/memusage.h -@@ -0,0 +1,21 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#define GETSP() ({ register uintptr_t stack_ptr asm ("$sp"); stack_ptr; }) -+ -+#include -diff --git a/sysdeps/loongarch/mknod.c b/sysdeps/loongarch/mknod.c -new file mode 100644 -index 00000000..1ed3681f ---- /dev/null -+++ b/sysdeps/loongarch/mknod.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/mknodat.c b/sysdeps/loongarch/mknodat.c -new file mode 100644 -index 00000000..82bc6ee6 ---- /dev/null -+++ b/sysdeps/loongarch/mknodat.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/nptl/Makefile b/sysdeps/loongarch/nptl/Makefile -new file mode 100644 -index 00000000..a1d5768a ---- /dev/null -+++ b/sysdeps/loongarch/nptl/Makefile -@@ -0,0 +1,26 @@ -+# Makefile for sysdeps/loongarch/nptl. -+# Copyright (C) 2005-2018 Free Software Foundation, Inc. -+# This file is part of the GNU C Library. -+# -+# The GNU C Library is free software; you can redistribute it and/or -+# modify it under the terms of the GNU Lesser General Public -+# License as published by the Free Software Foundation; either -+# version 2.1 of the License, or (at your option) any later version. -+# -+# The GNU C Library is distributed in the hope that it will be useful, -+# but WITHOUT ANY WARRANTY; without even the implied warranty of -+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+# Lesser General Public License for more details. -+# -+# You should have received a copy of the GNU Lesser General Public -+# License along with the GNU C Library; if not, see -+# . -+ -+ifeq ($(subdir),csu) -+gen-as-const-headers += tcb-offsets.sym -+endif -+ -+ifeq ($(subdir),nptl) -+libpthread-sysdep_routines += nptl-sysdep -+libpthread-shared-only-routines += nptl-sysdep -+endif -diff --git a/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h b/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h -new file mode 100644 -index 00000000..5a761355 ---- /dev/null -+++ b/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h -@@ -0,0 +1,68 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
*/ -+ -+#ifndef _BITS_PTHREADTYPES_ARCH_H -+#define _BITS_PTHREADTYPES_ARCH_H 1 -+ -+#include -+ -+#if __loongarch_xlen == 64 -+# define __SIZEOF_PTHREAD_ATTR_T 56 -+# define __SIZEOF_PTHREAD_MUTEX_T 40 -+# define __SIZEOF_PTHREAD_MUTEXATTR_T 4 -+# define __SIZEOF_PTHREAD_COND_T 48 -+# define __SIZEOF_PTHREAD_CONDATTR_T 4 -+# define __SIZEOF_PTHREAD_RWLOCK_T 56 -+# define __SIZEOF_PTHREAD_RWLOCKATTR_T 8 -+# define __SIZEOF_PTHREAD_BARRIER_T 32 -+# define __SIZEOF_PTHREAD_BARRIERATTR_T 4 -+#else -+# error "rv32i-based systems are not supported" -+#endif -+ -+#define __PTHREAD_COMPAT_PADDING_MID -+#define __PTHREAD_COMPAT_PADDING_END -+#define __PTHREAD_MUTEX_LOCK_ELISION 0 -+#define __PTHREAD_MUTEX_USE_UNION 0 -+#define __PTHREAD_MUTEX_NUSERS_AFTER_KIND 0 -+ -+#define __LOCK_ALIGNMENT -+#define __ONCE_ALIGNMENT -+ -+/* There is a lot of padding in this structure. While it's not strictly -+ necessary on LoongArch, we're going to leave it in to be on the safe side in -+ case it's needed in the future. Most other architectures have the padding, -+ so this gives us the same extensibility as everyone else has. */ -+struct __pthread_rwlock_arch_t -+{ -+ unsigned int __readers; -+ unsigned int __writers; -+ unsigned int __wrphase_futex; -+ unsigned int __writers_futex; -+ unsigned int __pad3; -+ unsigned int __pad4; -+ int __cur_writer; -+ int __shared; -+ unsigned long int __pad1; -+ unsigned long int __pad2; -+ unsigned int __flags; -+}; -+ -+#define __PTHREAD_RWLOCK_ELISION_EXTRA 0 -+ -+#endif /* bits/pthreadtypes.h */ -diff --git a/sysdeps/loongarch/nptl/bits/semaphore.h b/sysdeps/loongarch/nptl/bits/semaphore.h -new file mode 100644 -index 00000000..a9ddefb2 ---- /dev/null -+++ b/sysdeps/loongarch/nptl/bits/semaphore.h -@@ -0,0 +1,33 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _SEMAPHORE_H -+# error "Never use directly; include instead." -+#endif -+ -+#define __SIZEOF_SEM_T (4 * __SIZEOF_POINTER__) -+ -+/* Value returned if `sem_open' failed. */ -+#define SEM_FAILED ((sem_t *) 0) -+ -+ -+typedef union -+{ -+ char __size[__SIZEOF_SEM_T]; -+ long int __align; -+} sem_t; -diff --git a/sysdeps/loongarch/nptl/libc-lowlevellock.c b/sysdeps/loongarch/nptl/libc-lowlevellock.c -new file mode 100644 -index 00000000..9523fb46 ---- /dev/null -+++ b/sysdeps/loongarch/nptl/libc-lowlevellock.c -@@ -0,0 +1,8 @@ -+/* This kludge works around a libpthread static linking problem: -+ https://sourceware.org/bugzilla/show_bug.cgi?id=15648. 
*/ -+ -+#ifndef SHARED -+# define __lll_lock_wait_private weak_function __lll_lock_wait_private -+#endif -+ -+#include -diff --git a/sysdeps/loongarch/nptl/nptl-sysdep.S b/sysdeps/loongarch/nptl/nptl-sysdep.S -new file mode 100644 -index 00000000..3f5c2a36 ---- /dev/null -+++ b/sysdeps/loongarch/nptl/nptl-sysdep.S -@@ -0,0 +1,2 @@ -+/* Pull in __syscall_error. */ -+#include -diff --git a/sysdeps/loongarch/nptl/pthread-offsets.h b/sysdeps/loongarch/nptl/pthread-offsets.h -new file mode 100644 -index 00000000..04130879 ---- /dev/null -+++ b/sysdeps/loongarch/nptl/pthread-offsets.h -@@ -0,0 +1,23 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#define __PTHREAD_MUTEX_NUSERS_OFFSET 12 -+#define __PTHREAD_MUTEX_KIND_OFFSET 16 -+#define __PTHREAD_MUTEX_SPINS_OFFSET 20 -+#define __PTHREAD_MUTEX_ELISION_OFFSET 22 -+#define __PTHREAD_MUTEX_LIST_OFFSET 24 -diff --git a/sysdeps/loongarch/nptl/pthreaddef.h b/sysdeps/loongarch/nptl/pthreaddef.h -new file mode 100644 -index 00000000..87c407bc ---- /dev/null -+++ b/sysdeps/loongarch/nptl/pthreaddef.h -@@ -0,0 +1,32 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+/* Default stack size. */ -+#define ARCH_STACK_DEFAULT_SIZE (2 * 1024 * 1024) -+ -+/* Required stack pointer alignment at beginning. */ -+#define STACK_ALIGN 16 -+ -+/* Minimal stack size after allocating thread descriptor and guard size. */ -+#define MINIMAL_REST_STACK 2048 -+ -+/* Alignment requirement for TCB. */ -+#define TCB_ALIGNMENT 16 -+ -+/* Location of current stack frame. 
*/ -+#define CURRENT_STACK_FRAME __builtin_frame_address (0) -diff --git a/sysdeps/loongarch/nptl/tcb-offsets.sym b/sysdeps/loongarch/nptl/tcb-offsets.sym -new file mode 100644 -index 00000000..ab4981f2 ---- /dev/null -+++ b/sysdeps/loongarch/nptl/tcb-offsets.sym -@@ -0,0 +1,6 @@ -+#include -+#include -+ -+#define thread_offsetof(mem) (long)(offsetof (struct pthread, mem) - TLS_TCB_OFFSET - TLS_PRE_TCB_SIZE) -+ -+MULTIPLE_THREADS_OFFSET thread_offsetof (header.multiple_threads) -diff --git a/sysdeps/loongarch/nptl/tls.h b/sysdeps/loongarch/nptl/tls.h -new file mode 100644 -index 00000000..8d2d4ca2 ---- /dev/null -+++ b/sysdeps/loongarch/nptl/tls.h -@@ -0,0 +1,147 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LOONGARCH_TLS_H -+#define _LOONGARCH_TLS_H 1 -+ -+#include -+ -+#ifndef __ASSEMBLER__ -+# include -+# include -+# include -+# include -+ -+register void *__thread_self asm ("$tp"); /* FIXME */ -+# define READ_THREAD_POINTER() ({ __thread_self; }) -+ -+/* Get system call information. */ -+# include -+ -+/* The TP points to the start of the thread blocks. */ -+# define TLS_DTV_AT_TP 1 -+# define TLS_TCB_AT_TP 0 -+ -+/* Get the thread descriptor definition. */ -+# include -+ -+typedef struct -+{ -+ dtv_t *dtv; -+ void *private; -+} tcbhead_t; -+ -+/* This is the size of the initial TCB. Because our TCB is before the thread -+ pointer, we don't need this. */ -+# define TLS_INIT_TCB_SIZE 0 -+ -+/* Alignment requirements for the initial TCB. */ -+# define TLS_INIT_TCB_ALIGN __alignof__ (struct pthread) -+ -+/* This is the size of the TCB. Because our TCB is before the thread -+ pointer, we don't need this. */ -+# define TLS_TCB_SIZE 0 -+ -+/* Alignment requirements for the TCB. */ -+# define TLS_TCB_ALIGN __alignof__ (struct pthread) -+ -+/* This is the size we need before TCB - actually, it includes the TCB. */ -+# define TLS_PRE_TCB_SIZE \ -+ (sizeof (struct pthread) \ -+ + ((sizeof (tcbhead_t) + TLS_TCB_ALIGN - 1) & ~(TLS_TCB_ALIGN - 1))) -+ -+/* The thread pointer tp points to the end of the TCB. -+ The pthread_descr structure is immediately in front of the TCB. */ -+# define TLS_TCB_OFFSET 0 -+ -+/* Install the dtv pointer. The pointer passed is to the element with -+ index -1 which contain the length. */ -+# define INSTALL_DTV(tcbp, dtvp) \ -+ (((tcbhead_t *) (tcbp))[-1].dtv = (dtvp) + 1) -+ -+/* Install new dtv for current thread. */ -+# define INSTALL_NEW_DTV(dtv) \ -+ (THREAD_DTV() = (dtv)) -+ -+/* Return dtv of given thread descriptor. */ -+# define GET_DTV(tcbp) \ -+ (((tcbhead_t *) (tcbp))[-1].dtv) -+ -+/* Code to initially initialize the thread pointer. */ -+# define TLS_INIT_TP(tcbp) \ -+ ({ __thread_self = (char*)tcbp + TLS_TCB_OFFSET; NULL; }) -+ -+/* Return the address of the dtv for the current thread. 
*/ -+# define THREAD_DTV() \ -+ (((tcbhead_t *) (READ_THREAD_POINTER () - TLS_TCB_OFFSET))[-1].dtv) -+ -+/* Return the thread descriptor for the current thread. */ -+# define THREAD_SELF \ -+ ((struct pthread *) (READ_THREAD_POINTER () \ -+ - TLS_TCB_OFFSET - TLS_PRE_TCB_SIZE)) -+ -+/* Value passed to 'clone' for initialization of the thread register. */ -+# define TLS_DEFINE_INIT_TP(tp, pd) \ -+ void *tp = (void *) (pd) + TLS_TCB_OFFSET + TLS_PRE_TCB_SIZE -+ -+/* Informs libthread_db that the thread pointer is register 2, which is used -+ * to know how to do THREAD_SELF. */ -+# define DB_THREAD_SELF \ -+ REGISTER (64, 64, 2 * 8, - TLS_TCB_OFFSET - TLS_PRE_TCB_SIZE) -+ -+/* Access to data in the thread descriptor is easy. */ -+# define THREAD_GETMEM(descr, member) \ -+ descr->member -+# define THREAD_GETMEM_NC(descr, member, idx) \ -+ descr->member[idx] -+# define THREAD_SETMEM(descr, member, value) \ -+ descr->member = (value) -+# define THREAD_SETMEM_NC(descr, member, idx, value) \ -+ descr->member[idx] = (value) -+ -+/* l_tls_offset == 0 is perfectly valid, so we have to use some different -+ value to mean unset l_tls_offset. */ -+# define NO_TLS_OFFSET -1 -+ -+/* Get and set the global scope generation counter in struct pthread. */ -+# define THREAD_GSCOPE_IN_TCB 1 -+# define THREAD_GSCOPE_FLAG_UNUSED 0 -+# define THREAD_GSCOPE_FLAG_USED 1 -+# define THREAD_GSCOPE_FLAG_WAIT 2 -+# define THREAD_GSCOPE_RESET_FLAG() \ -+ do \ -+ { int __res \ -+ = atomic_exchange_rel (&THREAD_SELF->header.gscope_flag, \ -+ THREAD_GSCOPE_FLAG_UNUSED); \ -+ if (__res == THREAD_GSCOPE_FLAG_WAIT) \ -+ lll_futex_wake (&THREAD_SELF->header.gscope_flag, 1, LLL_PRIVATE); \ -+ } \ -+ while (0) -+# define THREAD_GSCOPE_SET_FLAG() \ -+ do \ -+ { \ -+ THREAD_SELF->header.gscope_flag = THREAD_GSCOPE_FLAG_USED; \ -+ atomic_write_barrier (); \ -+ } \ -+ while (0) -+# define THREAD_GSCOPE_WAIT() \ -+ GL(dl_wait_lookup_done) () -+ -+#endif /* __ASSEMBLER__ */ -+ -+#endif /* tls.h */ -diff --git a/sysdeps/loongarch/preconfigure b/sysdeps/loongarch/preconfigure -new file mode 100644 -index 00000000..26ffe884 ---- /dev/null -+++ b/sysdeps/loongarch/preconfigure -@@ -0,0 +1,9 @@ -+case "$machine" in -+loongarch*) -+ base_machine=loongarch -+ machine=loongarch/lp64 -+ ;; -+esac -+ -+#TODO: this file is useless now. -+#Maybe we can make use of it to get arch info from GCC to set env -diff --git a/sysdeps/loongarch/pthread_atfork.c b/sysdeps/loongarch/pthread_atfork.c -new file mode 100644 -index 00000000..0f01d805 ---- /dev/null -+++ b/sysdeps/loongarch/pthread_atfork.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/setjmp.S b/sysdeps/loongarch/setjmp.S -new file mode 100644 -index 00000000..da09a93c ---- /dev/null -+++ b/sysdeps/loongarch/setjmp.S -@@ -0,0 +1,62 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
*/ -+ -+#include -+#include -+ -+ENTRY (_setjmp) -+ li.w a1,0 -+ b __sigsetjmp -+END (_setjmp) -+ENTRY (setjmp) -+ li.w a1,1 -+END (setjmp) -+ENTRY (__sigsetjmp) -+ REG_S ra, a0, 0*SZREG -+ REG_S sp, a0, 1*SZREG -+ REG_S x, a0, 2*SZREG -+ REG_S fp, a0, 3*SZREG -+ REG_S s0, a0, 4*SZREG -+ REG_S s1, a0, 5*SZREG -+ REG_S s2, a0, 6*SZREG -+ REG_S s3, a0, 7*SZREG -+ REG_S s4, a0, 8*SZREG -+ REG_S s5, a0, 9*SZREG -+ REG_S s6, a0, 10*SZREG -+ REG_S s7, a0, 11*SZREG -+ REG_S s8, a0, 12*SZREG -+ -+ FREG_S $f24, a0, 13*SZREG + 0*SZFREG -+ FREG_S $f25, a0, 13*SZREG + 1*SZFREG -+ FREG_S $f26, a0, 13*SZREG + 2*SZFREG -+ FREG_S $f27, a0, 13*SZREG + 3*SZFREG -+ FREG_S $f28, a0, 13*SZREG + 4*SZFREG -+ FREG_S $f29, a0, 13*SZREG + 5*SZFREG -+ FREG_S $f30, a0, 13*SZREG + 6*SZFREG -+ FREG_S $f31, a0, 13*SZREG + 7*SZFREG -+ -+#if !IS_IN (libc) && IS_IN(rtld) -+ li.w v0, 0 -+ jirl zero,ra,0 -+#else -+ b __sigjmp_save -+#endif -+END (__sigsetjmp) -+ -+hidden_def (__sigsetjmp) -+weak_alias (_setjmp, __GI__setjmp) -diff --git a/sysdeps/loongarch/sfp-machine.h b/sysdeps/loongarch/sfp-machine.h -new file mode 100644 -index 00000000..b5c79bc0 ---- /dev/null -+++ b/sysdeps/loongarch/sfp-machine.h -@@ -0,0 +1,79 @@ -+#include -+#include -+ -+#define _FP_W_TYPE_SIZE 64 -+#define _FP_W_TYPE unsigned long long -+#define _FP_WS_TYPE signed long long -+#define _FP_I_TYPE long long -+ -+#define _FP_MUL_MEAT_S(R,X,Y) \ -+ _FP_MUL_MEAT_1_imm(_FP_WFRACBITS_S,R,X,Y) -+#define _FP_MUL_MEAT_D(R,X,Y) \ -+ _FP_MUL_MEAT_1_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm) -+#define _FP_MUL_MEAT_Q(R,X,Y) \ -+ _FP_MUL_MEAT_2_wide_3mul(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm) -+ -+#define _FP_MUL_MEAT_DW_S(R,X,Y) \ -+ _FP_MUL_MEAT_DW_1_imm(_FP_WFRACBITS_S,R,X,Y) -+#define _FP_MUL_MEAT_DW_D(R,X,Y) \ -+ _FP_MUL_MEAT_DW_1_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm) -+#define _FP_MUL_MEAT_DW_Q(R,X,Y) \ -+ _FP_MUL_MEAT_DW_2_wide_3mul(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm) -+ -+#define _FP_DIV_MEAT_S(R,X,Y) _FP_DIV_MEAT_1_imm(S,R,X,Y,_FP_DIV_HELP_imm) -+#define _FP_DIV_MEAT_D(R,X,Y) _FP_DIV_MEAT_1_udiv_norm(D,R,X,Y) -+#define _FP_DIV_MEAT_Q(R,X,Y) _FP_DIV_MEAT_2_udiv(Q,R,X,Y) -+ -+# define _FP_NANFRAC_S _FP_QNANBIT_S -+# define _FP_NANFRAC_D _FP_QNANBIT_D -+# define _FP_NANFRAC_Q _FP_QNANBIT_Q, 0 -+ -+#define _FP_NANSIGN_S 0 -+#define _FP_NANSIGN_D 0 -+#define _FP_NANSIGN_Q 0 -+ -+#define _FP_KEEPNANFRACP 1 -+#define _FP_QNANNEGATEDP 0 -+ -+/* NaN payloads should be preserved for NAN2008. 
*/ -+# define _FP_CHOOSENAN(fs, wc, R, X, Y, OP) \ -+ do \ -+ { \ -+ R##_s = X##_s; \ -+ _FP_FRAC_COPY_##wc (R, X); \ -+ R##_c = FP_CLS_NAN; \ -+ } \ -+ while (0) -+ -+#define _FP_DECL_EX fpu_control_t _fcw -+ -+#define FP_ROUNDMODE (_fcw & 0x300) -+ -+#define FP_RND_NEAREST FE_TONEAREST -+#define FP_RND_ZERO FE_TOWARDZERO -+#define FP_RND_PINF FE_UPWARD -+#define FP_RND_MINF FE_DOWNWARD -+ -+#define FP_EX_INVALID FE_INVALID -+#define FP_EX_OVERFLOW FE_OVERFLOW -+#define FP_EX_UNDERFLOW FE_UNDERFLOW -+#define FP_EX_DIVZERO FE_DIVBYZERO -+#define FP_EX_INEXACT FE_INEXACT -+ -+#define _FP_TININESS_AFTER_ROUNDING 1 -+ -+#ifdef __loongarch_hard_float -+#define FP_INIT_ROUNDMODE \ -+do { \ -+ _FPU_GETCW (_fcw); \ -+} while (0) -+ -+#define FP_HANDLE_EXCEPTIONS \ -+do { \ -+ if (__builtin_expect (_fex, 0)) \ -+ _FPU_SETCW (_fcw | _fex | (_fex << 8)); \ -+} while (0) -+#define FP_TRAPPING_EXCEPTIONS ((_fcw << 16) & 0x1f0000) -+#else -+#define FP_INIT_ROUNDMODE _fcw = FP_RND_NEAREST -+#endif -diff --git a/sysdeps/loongarch/sotruss-lib.c b/sysdeps/loongarch/sotruss-lib.c -new file mode 100644 -index 00000000..124db440 ---- /dev/null -+++ b/sysdeps/loongarch/sotruss-lib.c -@@ -0,0 +1,51 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#define HAVE_ARCH_PLTENTER -+#define HAVE_ARCH_PLTEXIT -+ -+#include -+ -+ElfW(Addr) -+la_loongarch_gnu_pltenter (ElfW(Sym) *sym __attribute__ ((unused)), -+ unsigned int ndx __attribute__ ((unused)), -+ uintptr_t *refcook, uintptr_t *defcook, -+ La_loongarch_regs *regs, unsigned int *flags, -+ const char *symname, long int *framesizep) -+{ -+ print_enter (refcook, defcook, symname, -+ regs->lr_reg[0], regs->lr_reg[1], regs->lr_reg[2], -+ *flags); -+ -+ /* No need to copy anything, we will not need the parameters in any case. */ -+ *framesizep = 0; -+ -+ return sym->st_value; -+} -+ -+unsigned int -+la_loongarch_gnu_pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, -+ uintptr_t *defcook, -+ const struct La_loongarch_regs *inregs, -+ struct La_loongarch_retval *outregs, -+ const char *symname) -+{ -+ print_exit (refcook, defcook, symname, outregs->lrv_a0); -+ -+ return 0; -+} -diff --git a/sysdeps/loongarch/stack_chk_fail_local.c b/sysdeps/loongarch/stack_chk_fail_local.c -new file mode 100644 -index 00000000..305871fb ---- /dev/null -+++ b/sysdeps/loongarch/stack_chk_fail_local.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/stackinfo.h b/sysdeps/loongarch/stackinfo.h -new file mode 100644 -index 00000000..5f5e6ad1 ---- /dev/null -+++ b/sysdeps/loongarch/stackinfo.h -@@ -0,0 +1,33 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+/* This file contains a bit of information about the stack allocation -+ of the processor. */ -+ -+#ifndef _STACKINFO_H -+#define _STACKINFO_H 1 -+ -+#include -+ -+/* On LoongArch the stack grows down. */ -+#define _STACK_GROWS_DOWN 1 -+ -+/* Default to a non-executable stack. */ -+#define DEFAULT_STACK_PERMS (PF_R | PF_W) -+ -+#endif /* stackinfo.h */ -diff --git a/sysdeps/loongarch/start.S b/sysdeps/loongarch/start.S -new file mode 100644 -index 00000000..cf0a14b5 ---- /dev/null -+++ b/sysdeps/loongarch/start.S -@@ -0,0 +1,51 @@ -+#define __ASSEMBLY__ 1 -+#include -+#include -+ -+/* The entry point's job is to call __libc_start_main. Per the ABI, -+ a0 contains the address of a function to be passed to atexit. -+ __libc_start_main wants this in a5. */ -+ -+/* -+int -+__libc_start_main (int (*main) (int, char **, char **), -+ int argc, -+ char **argv, -+ __typeof (main) init, -+ void (*fini) (void), -+ void (*rtld_fini) (void), -+ void *stack_end); -+*/ -+ -+ENTRY (ENTRY_POINT) -+ /* Terminate call stack by noting ra is undefined. Use a dummy -+ .cfi_label to force starting the FDE. */ -+ .cfi_label .Ldummy -+ cfi_undefined (1) -+ or a5, a0, zero /* rtld_fini */ -+ -+ /* 这个main必须要走GOT表拿到。因为main不一定是local的。 -+ 比如googletest就把main定义在动态库里了。 */ -+ la.got a0, t0, main -+#ifdef __loongarch64 -+ ld.d a1, sp, 0 -+ addi.d a2, sp, SZREG -+#elif defined __loongarch32 -+ ld.w a1, sp, 0 -+ addi.w a2, sp, SZREG -+#endif -+ /* Adjust $sp for 16-aligned */ -+ srli.d sp, sp, 4 -+ slli.d sp, sp, 4 -+ -+ la.got a3, t0, __libc_csu_init -+ la.got a4, t0, __libc_csu_fini -+ or a6, sp, zero /* stack_end. */ -+ -+ la.got ra, t0, __libc_start_main -+ jirl ra, ra, 0 -+ -+ la.got ra, t0, abort -+ jirl ra, ra, 0 -+END (ENTRY_POINT) -+ -diff --git a/sysdeps/loongarch/stat.c b/sysdeps/loongarch/stat.c -new file mode 100644 -index 00000000..36461b87 ---- /dev/null -+++ b/sysdeps/loongarch/stat.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/stat64.c b/sysdeps/loongarch/stat64.c -new file mode 100644 -index 00000000..0897282e ---- /dev/null -+++ b/sysdeps/loongarch/stat64.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h -new file mode 100644 -index 00000000..f64bfb2b ---- /dev/null -+++ b/sysdeps/loongarch/sys/asm.h -@@ -0,0 +1,58 @@ -+#ifndef _SYS_ASM_H -+#define _SYS_ASM_H -+ -+#include -+#include -+ -+/* Macros to handle different pointer/register sizes for 32/64-bit code. 
*/ -+#ifdef __loongarch64 -+# define PTRLOG 3 -+# define SZREG 8 -+# define SZFREG 8 -+# define REG_L ld.d -+# define REG_S st.d -+# define FREG_L fld.d -+# define FREG_S fst.d -+#elif defined __loongarch32 -+# define PTRLOG 2 -+# define SZREG 4 -+# define SZFREG 4 -+# define REG_L ld.w -+# define REG_S st.w -+# define FREG_L fld.w -+# define FREG_S fst.w -+#else -+# error __loongarch_xlen must equal 32 or 64 -+#endif -+ -+ -+/* Declare leaf routine. */ -+#define LEAF(symbol) \ -+ .text; \ -+ .globl symbol; \ -+ .align 3; \ -+ .type symbol, @function; \ -+symbol: \ -+ cfi_startproc; \ -+ -+# define ENTRY(symbol) LEAF(symbol) -+ -+#define LEAF_NO_ALIGN(symbol) \ -+ .text; \ -+ .globl symbol; \ -+ .type symbol, @function; \ -+symbol: \ -+ cfi_startproc; -+ -+# define ENTRY_NO_ALIGN(symbol) LEAF_NO_ALIGN(symbol) -+ -+/* Mark end of function. */ -+#undef END -+#define END(function) \ -+ cfi_endproc ; \ -+ .size function,.-function; -+ -+/* Stack alignment. */ -+#define ALMASK ~15 -+ -+#endif /* sys/asm.h */ -diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h -new file mode 100644 -index 00000000..769784b8 ---- /dev/null -+++ b/sysdeps/loongarch/sys/regdef.h -@@ -0,0 +1,83 @@ -+#ifndef _SYS_REGDEF_H -+#define _SYS_REGDEF_H -+ -+#if _LOONGARCH_SIM == _ABILP64 -+# define zero $r0 -+# define ra $r1 -+# define tp $r2 -+# define sp $r3 -+# define a0 $r4 -+# define a1 $r5 -+# define a2 $r6 -+# define a3 $r7 -+# define a4 $r8 -+# define a5 $r9 -+# define a6 $r10 -+# define a7 $r11 -+# define v0 $r4 -+# define v1 $r5 -+# define t0 $r12 -+# define t1 $r13 -+# define t2 $r14 -+# define t3 $r15 -+# define t4 $r16 -+# define t5 $r17 -+# define t6 $r18 -+# define t7 $r19 -+# define t8 $r20 -+# define x $r21 -+# define fp $r22 -+# define s0 $r23 -+# define s1 $r24 -+# define s2 $r25 -+# define s3 $r26 -+# define s4 $r27 -+# define s5 $r28 -+# define s6 $r29 -+# define s7 $r30 -+# define s8 $r31 -+ -+# define fa0 $f0 -+# define fa1 $f1 -+# define fa2 $f2 -+# define fa3 $f3 -+# define fa4 $f4 -+# define fa5 $f5 -+# define fa6 $f6 -+# define fa7 $f7 -+# define fv0 $f0 -+# define fv1 $f1 -+# define ft0 $f8 -+# define ft1 $f9 -+# define ft2 $f10 -+# define ft3 $f11 -+# define ft4 $f12 -+# define ft5 $f13 -+# define ft6 $f14 -+# define ft7 $f15 -+# define ft8 $f16 -+# define ft9 $f17 -+# define ft10 $f18 -+# define ft11 $f19 -+# define ft12 $f20 -+# define ft13 $f21 -+# define ft14 $f22 -+# define ft15 $f23 -+# define fs0 $f24 -+# define fs1 $f25 -+# define fs2 $f26 -+# define fs3 $f27 -+# define fs4 $f28 -+# define fs5 $f29 -+# define fs6 $f30 -+# define fs7 $f31 -+ -+#elif _LOONGARCH_SIM == _ABILPX32 -+# error ABILPX32 -+#elif _LOONGARCH_SIM == _ABILP32 -+# error ABILP32 -+#else -+# error noABI -+#endif -+ -+#endif /* _SYS_REGDEF_H */ -diff --git a/sysdeps/loongarch/tininess.h b/sysdeps/loongarch/tininess.h -new file mode 100644 -index 00000000..1db37790 ---- /dev/null -+++ b/sysdeps/loongarch/tininess.h -@@ -0,0 +1 @@ -+#define TININESS_AFTER_ROUNDING 1 -diff --git a/sysdeps/loongarch/tls-macros.h b/sysdeps/loongarch/tls-macros.h -new file mode 100644 -index 00000000..f0ad55ac ---- /dev/null -+++ b/sysdeps/loongarch/tls-macros.h -@@ -0,0 +1,46 @@ -+/* Macros to support TLS testing in times of missing compiler support. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+ -+#include -+#include -+#include -+#include "dl-tls.h" -+ -+#define TLS_GD(x) \ -+ ({ void *__result; \ -+ asm ("la.tls.gd %0, " #x "\n\t" \ -+ : "=r" (__result)); \ -+ __tls_get_addr (__result); }) -+ -+#define TLS_LD(x) TLS_GD(x) -+ -+#define TLS_IE(x) \ -+ ({ void *__result; \ -+ asm ("la.tls.ie %0, " #x "\n\t" \ -+ "add.d %0, %0, $tp\n\t" \ -+ : "=r" (__result)); \ -+ __result; }) -+ -+#define TLS_LE(x) \ -+ ({ void *__result; \ -+ asm ("la.tls.le %0, " #x "\n\t" \ -+ "add.d %0, %0, $tp\n\t" \ -+ : "=r" (__result)); \ -+ __result; }) -diff --git a/sysdeps/loongarch/tst-audit.h b/sysdeps/loongarch/tst-audit.h -new file mode 100644 -index 00000000..d8d260eb ---- /dev/null -+++ b/sysdeps/loongarch/tst-audit.h -@@ -0,0 +1,23 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#define pltenter la_loongarch_gnu_pltenter -+#define pltexit la_loongarch_gnu_pltexit -+#define La_regs La_loongarch_regs -+#define La_retval La_loongarch_retval -+#define int_retval lrv_a0 -diff --git a/sysdeps/loongarch/warning-nop.c b/sysdeps/loongarch/warning-nop.c -new file mode 100644 -index 00000000..b76aae79 ---- /dev/null -+++ b/sysdeps/loongarch/warning-nop.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/unix/sysv/linux/loongarch/Implies b/sysdeps/unix/sysv/linux/loongarch/Implies -new file mode 100644 -index 00000000..e52b1ac3 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/Implies -@@ -0,0 +1 @@ -+loongarch/nptl -diff --git a/sysdeps/unix/sysv/linux/loongarch/Makefile b/sysdeps/unix/sysv/linux/loongarch/Makefile -new file mode 100644 -index 00000000..6f049aa9 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/Makefile -@@ -0,0 +1,17 @@ -+ifeq ($(subdir),elf) -+sysdep_routines += dl-vdso -+ifeq ($(build-shared),yes) -+# This is needed for DSO loading from static binaries. 
-+sysdep-dl-routines += dl-static -+endif -+endif -+ -+#ifeq ($(subdir),misc) -+#sysdep_headers += sys/cachectl.h -+#sysdep_routines += flush-icache -+#endif -+ -+ifeq ($(subdir),stdlib) -+gen-as-const-headers += ucontext_i.sym -+endif -+ -diff --git a/sysdeps/unix/sysv/linux/loongarch/Versions b/sysdeps/unix/sysv/linux/loongarch/Versions -new file mode 100644 -index 00000000..453f276a ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/Versions -@@ -0,0 +1,44 @@ -+ld { -+ GLIBC_PRIVATE { -+ # used for loading by static libraries -+ _dl_var_init; -+ } -+} -+libc { -+ # The comment lines with "#errlist-compat" are magic; see errlist-compat.awk. -+ # When you get an error from errlist-compat.awk, you need to add a new -+ # version here. Don't do this blindly, since this means changing the ABI -+ # for all GNU/Linux configurations. -+ -+ GLIBC_2.0 { -+ #errlist-compat 123 -+ _sys_errlist; sys_errlist; _sys_nerr; sys_nerr; -+ -+ # Exception handling support functions from libgcc -+ __register_frame; __register_frame_table; __deregister_frame; -+ __frame_state_for; __register_frame_info_table; -+ -+ # Needed by gcc: -+ _flush_cache; -+ -+ # c* -+ cachectl; cacheflush; -+ -+ # s* -+ sysmips; -+ } -+ GLIBC_2.2 { -+ #errlist-compat 1134 -+ _sys_errlist; sys_errlist; _sys_nerr; sys_nerr; -+ -+ # _* -+ _test_and_set; -+ } -+ GLIBC_2.11 { -+ fallocate64; -+ } -+ GLIBC_PRIVATE { -+ # nptl/pthread_cond_timedwait.c uses INTERNAL_VSYSCALL(clock_gettime). -+ __vdso_clock_gettime; -+ } -+} -diff --git a/sysdeps/unix/sysv/linux/loongarch/atomic-machine.h b/sysdeps/unix/sysv/linux/loongarch/atomic-machine.h -new file mode 100644 -index 00000000..ac1948ea ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/atomic-machine.h -@@ -0,0 +1,188 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LINUX_LOONGARCH_BITS_ATOMIC_H -+#define _LINUX_LOONGARCH_BITS_ATOMIC_H 1 -+ -+#include -+ -+typedef int32_t atomic32_t; -+typedef uint32_t uatomic32_t; -+ -+typedef int64_t atomic64_t; -+typedef uint64_t uatomic64_t; -+ -+typedef intptr_t atomicptr_t; -+typedef uintptr_t uatomicptr_t; -+typedef intmax_t atomic_max_t; -+typedef uintmax_t uatomic_max_t; -+ -+#define atomic_full_barrier() __sync_synchronize () -+ -+# define __HAVE_64B_ATOMICS (__loongarch_xlen >= 64) -+# define USE_ATOMIC_COMPILER_BUILTINS 1 -+# define ATOMIC_EXCHANGE_USES_CAS 0 -+ -+/* Compare and exchange. -+ For all "bool" routines, we return FALSE if exchange succesful. 
*/ -+ -+# define __arch_compare_and_exchange_bool_8_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ !__atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ }) -+ -+# define __arch_compare_and_exchange_bool_16_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ !__atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ }) -+ -+# define __arch_compare_and_exchange_bool_32_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ !__atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ }) -+ -+# define __arch_compare_and_exchange_bool_64_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ !__atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ }) -+ -+# define __arch_compare_and_exchange_val_8_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ __atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ __oldval; \ -+ }) -+ -+# define __arch_compare_and_exchange_val_16_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ __atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ __oldval; \ -+ }) -+ -+# define __arch_compare_and_exchange_val_32_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ __atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ __oldval; \ -+ }) -+ -+# define __arch_compare_and_exchange_val_64_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ __atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ __oldval; \ -+ }) -+ -+/* Atomic compare and exchange. */ -+ -+# define atomic_compare_and_exchange_bool_acq(mem, new, old) \ -+ __atomic_bool_bysize (__arch_compare_and_exchange_bool, int, \ -+ mem, new, old, __ATOMIC_ACQUIRE) -+ -+# define atomic_compare_and_exchange_val_acq(mem, new, old) \ -+ __atomic_val_bysize (__arch_compare_and_exchange_val, int, \ -+ mem, new, old, __ATOMIC_ACQUIRE) -+ -+# define atomic_compare_and_exchange_val_rel(mem, new, old) \ -+ __atomic_val_bysize (__arch_compare_and_exchange_val, int, \ -+ mem, new, old, __ATOMIC_RELEASE) -+ -+/* Atomic exchange (without compare). */ -+ -+# define __arch_exchange_8_int(mem, newval, model) \ -+ __atomic_exchange_n (mem, newval, model) -+ -+# define __arch_exchange_16_int(mem, newval, model) \ -+ __atomic_exchange_n (mem, newval, model) -+ -+# define __arch_exchange_32_int(mem, newval, model) \ -+ __atomic_exchange_n (mem, newval, model) -+ -+# define __arch_exchange_64_int(mem, newval, model) \ -+ __atomic_exchange_n (mem, newval, model) -+ -+# define atomic_exchange_acq(mem, value) \ -+ __atomic_val_bysize (__arch_exchange, int, mem, value, __ATOMIC_ACQUIRE) -+ -+# define atomic_exchange_rel(mem, value) \ -+ __atomic_val_bysize (__arch_exchange, int, mem, value, __ATOMIC_RELEASE) -+ -+/* Atomically add value and return the previous (unincremented) value. 
*/ -+ -+# define __arch_exchange_and_add_8_int(mem, value, model) \ -+ __atomic_fetch_add (mem, value, model) -+ -+# define __arch_exchange_and_add_16_int(mem, value, model) \ -+ __atomic_fetch_add (mem, value, model) -+ -+# define __arch_exchange_and_add_32_int(mem, value, model) \ -+ __atomic_fetch_add (mem, value, model) -+ -+# define __arch_exchange_and_add_64_int(mem, value, model) \ -+ __atomic_fetch_add (mem, value, model) -+ -+# define atomic_exchange_and_add_acq(mem, value) \ -+ __atomic_val_bysize (__arch_exchange_and_add, int, mem, value, \ -+ __ATOMIC_ACQUIRE) -+ -+# define atomic_exchange_and_add_rel(mem, value) \ -+ __atomic_val_bysize (__arch_exchange_and_add, int, mem, value, \ -+ __ATOMIC_RELEASE) -+ -+/* Miscellaneous. */ -+ -+# define asm_amo(which, mem, value) ({ \ -+ __atomic_check_size (mem); \ -+ typeof (*mem) __tmp; \ -+ if (sizeof (__tmp) == 4) \ -+ asm volatile (which ".w""\t%0, %z2, %1" \ -+ : "=&r" (__tmp), "+ZB" (* (mem)) \ -+ : "rJ" (value)); \ -+ else if (sizeof (__tmp) == 8) \ -+ asm volatile (which ".d""\t%0, %z2, %1" \ -+ : "=&r" (__tmp), "+ZB" (* (mem)) \ -+ : "rJ" (value)); \ -+ else \ -+ abort (); \ -+ __tmp; }) -+ -+# define atomic_max(mem, value) asm_amo ("ammax_db", mem, value) -+# define atomic_min(mem, value) asm_amo ("ammin_db", mem, value) -+ -+# define atomic_bit_test_set(mem, bit) \ -+ ({ typeof (*mem) __mask = (typeof (*mem))1 << (bit); \ -+ asm_amo("amor_db", mem, __mask) & __mask; }) -+ -+# define catomic_exchange_and_add(mem, value) \ -+ atomic_exchange_and_add (mem, value) -+# define catomic_max(mem, value) atomic_max (mem, value) -+ -+#endif /* bits/atomic.h */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/fcntl.h b/sysdeps/unix/sysv/linux/loongarch/bits/fcntl.h -new file mode 100644 -index 00000000..5ee2e976 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/bits/fcntl.h -@@ -0,0 +1,62 @@ -+/* O_*, F_*, FD_* bit values for the generic Linux ABI. -+ Copyright (C) 2011-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Chris Metcalf , 2011. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _FCNTL_H -+# error "Never use directly; include instead." -+#endif -+ -+#include -+ -+/* In 64-bit ISA files are always with 64bit off_t and F_*LK64 are the same as -+ non-64-bit versions. It will need to be revised for 128-bit. */ -+#if __WORDSIZE == 64 -+# define __O_LARGEFILE 0 -+ -+# define F_GETLK64 5 /* Get record locking info. */ -+# define F_SETLK64 6 /* Set record locking info (non-blocking). */ -+# define F_SETLKW64 7 /* Set record locking info (blocking). */ -+#endif -+ -+struct flock -+ { -+ short int l_type; /* Type of lock: F_RDLCK, F_WRLCK, or F_UNLCK. */ -+ short int l_whence; /* Where `l_start' is relative to (like `lseek'). */ -+#ifndef __USE_FILE_OFFSET64 -+ __off_t l_start; /* Offset where the lock begins. 
*/ -+ __off_t l_len; /* Size of the locked area; zero means until EOF. */ -+#else -+ __off64_t l_start; /* Offset where the lock begins. */ -+ __off64_t l_len; /* Size of the locked area; zero means until EOF. */ -+#endif -+ __pid_t l_pid; /* Process holding the lock. */ -+ }; -+ -+#ifdef __USE_LARGEFILE64 -+struct flock64 -+ { -+ short int l_type; /* Type of lock: F_RDLCK, F_WRLCK, or F_UNLCK. */ -+ short int l_whence; /* Where `l_start' is relative to (like `lseek'). */ -+ __off64_t l_start; /* Offset where the lock begins. */ -+ __off64_t l_len; /* Size of the locked area; zero means until EOF. */ -+ __pid_t l_pid; /* Process holding the lock. */ -+ }; -+#endif -+ -+/* Include generic Linux declarations. */ -+#include -diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h b/sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h -new file mode 100644 -index 00000000..5104b69c ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h -@@ -0,0 +1,37 @@ -+/* Defines for bits in AT_HWCAP. LoongArch64 Linux version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#if !defined (_SYS_AUXV_H) -+# error "Never include directly; use instead." -+#endif -+ -+/* The following must match the kernel's . */ -+/* HWCAP flags */ -+#define HWCAP_LOONGARCH_CPUCFG (1 << 0) -+#define HWCAP_LOONGARCH_LAM (1 << 1) -+#define HWCAP_LOONGARCH_UAL (1 << 2) -+#define HWCAP_LOONGARCH_FPU (1 << 3) -+#define HWCAP_LOONGARCH_LSX (1 << 4) -+#define HWCAP_LOONGARCH_LASX (1 << 5) -+#define HWCAP_LOONGARCH_CRC32 (1 << 6) -+#define HWCAP_LOONGARCH_COMPLEX (1 << 7) -+#define HWCAP_LOONGARCH_CRYPTO (1 << 8) -+#define HWCAP_LOONGARCH_LVZ (1 << 9) -+#define HWCAP_LOONGARCH_LBT_X86 (1 << 10) -+#define HWCAP_LOONGARCH_LBT_ARM (1 << 11) -+#define HWCAP_LOONGARCH_LBT_MIPS (1 << 12) -diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/local_lim.h b/sysdeps/unix/sysv/linux/loongarch/bits/local_lim.h -new file mode 100644 -index 00000000..a8cd6df8 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/bits/local_lim.h -@@ -0,0 +1,99 @@ -+/* Minimum guaranteed maximum values for system limits. Linux version. -+ Copyright (C) 1993-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public License as -+ published by the Free Software Foundation; either version 2.1 of the -+ License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; see the file COPYING.LIB. If -+ not, see . */ -+ -+/* The kernel header pollutes the namespace with the NR_OPEN symbol -+ and defines LINK_MAX although filesystems have different maxima. A -+ similar thing is true for OPEN_MAX: the limit can be changed at -+ runtime and therefore the macro must not be defined. Remove this -+ after including the header if necessary. */ -+#ifndef NR_OPEN -+# define __undef_NR_OPEN -+#endif -+#ifndef LINK_MAX -+# define __undef_LINK_MAX -+#endif -+#ifndef OPEN_MAX -+# define __undef_OPEN_MAX -+#endif -+#ifndef ARG_MAX -+# define __undef_ARG_MAX -+#endif -+ -+/* The kernel sources contain a file with all the needed information. */ -+#include -+ -+/* Have to remove NR_OPEN? */ -+#ifdef __undef_NR_OPEN -+# undef NR_OPEN -+# undef __undef_NR_OPEN -+#endif -+/* Have to remove LINK_MAX? */ -+#ifdef __undef_LINK_MAX -+# undef LINK_MAX -+# undef __undef_LINK_MAX -+#endif -+/* Have to remove OPEN_MAX? */ -+#ifdef __undef_OPEN_MAX -+# undef OPEN_MAX -+# undef __undef_OPEN_MAX -+#endif -+/* Have to remove ARG_MAX? */ -+#ifdef __undef_ARG_MAX -+# undef ARG_MAX -+# undef __undef_ARG_MAX -+#endif -+ -+/* The number of data keys per process. */ -+#define _POSIX_THREAD_KEYS_MAX 128 -+/* This is the value this implementation supports. */ -+#define PTHREAD_KEYS_MAX 1024 -+ -+/* Controlling the iterations of destructors for thread-specific data. */ -+#define _POSIX_THREAD_DESTRUCTOR_ITERATIONS 4 -+/* Number of iterations this implementation does. */ -+#define PTHREAD_DESTRUCTOR_ITERATIONS _POSIX_THREAD_DESTRUCTOR_ITERATIONS -+ -+/* The number of threads per process. */ -+#define _POSIX_THREAD_THREADS_MAX 64 -+/* We have no predefined limit on the number of threads. */ -+#undef PTHREAD_THREADS_MAX -+ -+/* Maximum amount by which a process can descrease its asynchronous I/O -+ priority level. */ -+#define AIO_PRIO_DELTA_MAX 20 -+ -+/* Minimum size for a thread. At least two pages with 64k pages. */ -+#define PTHREAD_STACK_MIN 131072 -+ -+/* Maximum number of timer expiration overruns. */ -+#define DELAYTIMER_MAX 2147483647 -+ -+/* Maximum tty name length. */ -+#define TTY_NAME_MAX 32 -+ -+/* Maximum login name length. This is arbitrary. */ -+#define LOGIN_NAME_MAX 256 -+ -+/* Maximum host name length. */ -+#define HOST_NAME_MAX 64 -+ -+/* Maximum message queue priority level. */ -+#define MQ_PRIO_MAX 32768 -+ -+/* Maximum value the semaphore can have. */ -+#define SEM_VALUE_MAX (2147483647) -diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/mman.h b/sysdeps/unix/sysv/linux/loongarch/bits/mman.h -new file mode 100644 -index 00000000..5a16f8ac ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/bits/mman.h -@@ -0,0 +1,41 @@ -+/* Definitions for POSIX memory map interface. Linux/MIPS version. -+ Copyright (C) 1997-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+ -+#ifndef _SYS_MMAN_H -+# error "Never use directly; include instead." -+#endif -+ -+#ifdef __USE_MISC -+# define MAP_GROWSDOWN 0x00100 /* Stack-like segment. */ -+# define MAP_DENYWRITE 0x00800 /* ETXTBSY. */ -+# define MAP_EXECUTABLE 0x01000 /* Mark it as an executable. */ -+# define MAP_LOCKED 0x02000 /* Lock the mapping. */ -+# define MAP_NORESERVE 0x04000 /* Don't check for reservations. */ -+# define MAP_POPULATE 0x08000 /* Populate (prefault) pagetables. */ -+# define MAP_NONBLOCK 0x10000 /* Do not block on IO. */ -+# define MAP_STACK 0x20000 /* Allocation is for a stack. */ -+# define MAP_HUGETLB 0x40000 /* Create huge page mapping. */ -+# define MAP_SYNC 0x80000 /* Perform synchronous page -+ faults for the mapping. */ -+# define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED but do not unmap -+ underlying mapping. */ -+#endif -+ -+/* Include generic Linux declarations. */ -+#include -diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/shm.h b/sysdeps/unix/sysv/linux/loongarch/bits/shm.h -new file mode 100644 -index 00000000..9e23092d ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/bits/shm.h -@@ -0,0 +1,112 @@ -+/* Copyright (C) 2011-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Chris Metcalf , 2011. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _SYS_SHM_H -+# error "Never include directly; use instead." -+#endif -+ -+#include -+#include -+ -+/* Permission flag for shmget. */ -+#define SHM_R 0400 /* or S_IRUGO from */ -+#define SHM_W 0200 /* or S_IWUGO from */ -+ -+/* Flags for `shmat'. */ -+#define SHM_RDONLY 010000 /* attach read-only else read-write */ -+#define SHM_RND 020000 /* round attach address to SHMLBA */ -+#define SHM_REMAP 040000 /* take-over region on attach */ -+#define SHM_EXEC 0100000 /* execution access */ -+ -+/* Commands for `shmctl'. */ -+#define SHM_LOCK 11 /* lock segment (root only) */ -+#define SHM_UNLOCK 12 /* unlock segment (root only) */ -+ -+__BEGIN_DECLS -+ -+/* Segment low boundary address multiple. */ -+#define SHMLBA (__getpagesize () << 2) -+extern int __getpagesize (void) __THROW __attribute__ ((__const__)); -+ -+ -+/* Type to count number of attaches. */ -+typedef unsigned long int shmatt_t; -+ -+/* Data structure describing a shared memory segment. 
*/ -+struct shmid_ds -+ { -+ struct ipc_perm shm_perm; /* operation permission struct */ -+ size_t shm_segsz; /* size of segment in bytes */ -+ __time_t shm_atime; /* time of last shmat() */ -+#if __WORDSIZE == 32 -+ unsigned long int __glibc_reserved1; -+#endif -+ __time_t shm_dtime; /* time of last shmdt() */ -+#if __WORDSIZE == 32 -+ unsigned long int __glibc_reserved2; -+#endif -+ __time_t shm_ctime; /* time of last change by shmctl() */ -+#if __WORDSIZE == 32 -+ unsigned long int __glibc_reserved3; -+#endif -+ __pid_t shm_cpid; /* pid of creator */ -+ __pid_t shm_lpid; /* pid of last shmop */ -+ shmatt_t shm_nattch; /* number of current attaches */ -+ unsigned long int __glibc_reserved4; -+ unsigned long int __glibc_reserved5; -+ }; -+ -+#ifdef __USE_MISC -+ -+/* ipcs ctl commands */ -+# define SHM_STAT 13 -+# define SHM_INFO 14 -+# define SHM_STAT_ANY 15 -+ -+/* shm_mode upper byte flags */ -+# define SHM_DEST 01000 /* segment will be destroyed on last detach */ -+# define SHM_LOCKED 02000 /* segment will not be swapped */ -+# define SHM_HUGETLB 04000 /* segment is mapped via hugetlb */ -+# define SHM_NORESERVE 010000 /* don't check for reservations */ -+ -+struct shminfo -+ { -+ unsigned long int shmmax; -+ unsigned long int shmmin; -+ unsigned long int shmmni; -+ unsigned long int shmseg; -+ unsigned long int shmall; -+ unsigned long int __glibc_reserved1; -+ unsigned long int __glibc_reserved2; -+ unsigned long int __glibc_reserved3; -+ unsigned long int __glibc_reserved4; -+ }; -+ -+struct shm_info -+ { -+ int used_ids; -+ unsigned long int shm_tot; /* total allocated shm */ -+ unsigned long int shm_rss; /* total resident shm */ -+ unsigned long int shm_swp; /* total swapped shm */ -+ unsigned long int swap_attempts; -+ unsigned long int swap_successes; -+ }; -+ -+#endif /* __USE_MISC */ -+ -+__END_DECLS -diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/sigcontext.h b/sysdeps/unix/sysv/linux/loongarch/bits/sigcontext.h -new file mode 100644 -index 00000000..0f925b4c ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/bits/sigcontext.h -@@ -0,0 +1,47 @@ -+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -+/* -+ * This file is subject to the terms and conditions of the GNU General Public -+ * License. See the file "COPYING" in the main directory of this archive -+ * for more details. -+ * -+ * Copyright (C) 1996, 1997, 1999 by Ralf Baechle -+ * Copyright (C) 1999 Silicon Graphics, Inc. -+ */ -+#ifndef _BITS_SIGCONTEXT_H -+#define _BITS_SIGCONTEXT_H -+ -+/* -+ * Keep this struct definition in sync with the sigcontext fragment -+ * in arch/mips/kernel/asm-offsets.c -+ * -+ * Warning: this structure illdefined with sc_badvaddr being just an unsigned -+ * int so it was changed to unsigned long in 2.6.0-test1. This may break -+ * binary compatibility - no prisoners. -+ * DSP ASE in 2.6.12-rc4. Turn sc_mdhi and sc_mdlo into an array of four -+ * entries, add sc_dsp and sc_reserved for padding. No prisoners. 
-+ */ -+ -+#define FPU_REG_WIDTH 256 -+#define FPU_ALIGN __attribute__((aligned(32))) -+ -+struct sigcontext { -+ unsigned long long sc_pc; -+ unsigned long long sc_regs[32]; -+ unsigned int sc_flags; -+ -+ unsigned int sc_fcsr; -+ unsigned int sc_vcsr; -+ unsigned long long sc_fcc; -+ -+ unsigned long long sc_scr[4]; -+ -+ union { -+ unsigned int val32[FPU_REG_WIDTH / 32]; -+ unsigned long long val64[FPU_REG_WIDTH / 64]; -+ } sc_fpregs[32] FPU_ALIGN; -+ unsigned char sc_reserved[4096] __attribute__((__aligned__(16))); -+ -+}; -+ -+ -+#endif /* _BITS_SIGCONTEXT_H */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/signum.h b/sysdeps/unix/sysv/linux/loongarch/bits/signum.h -new file mode 100644 -index 00000000..3cad0b19 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/bits/signum.h -@@ -0,0 +1,58 @@ -+/* Signal number definitions. Linux version. -+ Copyright (C) 1995-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef _BITS_SIGNUM_H -+#define _BITS_SIGNUM_H 1 -+ -+#ifndef _SIGNAL_H -+#error "Never include directly; use instead." -+#endif -+ -+#include -+ -+/* Adjustments and additions to the signal number constants for -+ most Linux systems. */ -+ -+#define SIGSTKFLT 16 /* Stack fault (obsolete). */ -+#define SIGPWR 30 /* Power failure imminent. */ -+ -+#undef SIGBUS -+#define SIGBUS 7 -+#undef SIGUSR1 -+#define SIGUSR1 10 -+#undef SIGUSR2 -+#define SIGUSR2 12 -+#undef SIGCHLD -+#define SIGCHLD 17 -+#undef SIGCONT -+#define SIGCONT 18 -+#undef SIGSTOP -+#define SIGSTOP 19 -+#undef SIGTSTP -+#define SIGTSTP 20 -+#undef SIGURG -+#define SIGURG 23 -+#undef SIGPOLL -+#define SIGPOLL 29 -+#undef SIGSYS -+#define SIGSYS 31 -+ -+#undef __SIGRTMAX -+#define __SIGRTMAX 127 -+ -+#endif /* included. */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/clone.S b/sysdeps/unix/sysv/linux/loongarch/clone.S -new file mode 100644 -index 00000000..f0fc566e ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/clone.S -@@ -0,0 +1,98 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
*/ -+ -+/* clone() is even more special than fork() as it mucks with stacks -+ and invokes a function in the right context after its all over. */ -+ -+#include -+#include -+#define _ERRNO_H 1 -+#include -+#include -+#include "tcb-offsets.h" -+ -+/* int clone(int (*fn)(void *arg), void *child_stack, int flags, void *arg, -+ void *parent_tidptr, void *tls, void *child_tidptr) */ -+ -+ENTRY (__clone) -+ -+ /* Align stack to 16 or 8 bytes per the ABI. */ -+#if _LOONGARCH_SIM == _ABILP64 -+ addi.d t0, zero, -16 -+#elif _LOONGARCH_SIM == _ABILP32 -+ addi.w t0, zero, -8 -+#endif -+ and a1, a1, t0 -+ -+ /* Sanity check arguments. */ -+ beqz a0, L (invalid) /* No NULL function pointers. */ -+ beqz a1, L (invalid) /* No NULL stack pointers. */ -+ -+ addi.d a1, a1, -16 /* Reserve argument save space. */ -+ st.d a0, a1, 0 /* Save function pointer. */ -+ st.d a3, a1, SZREG /* Save argument pointer. */ -+ -+ /* The syscall expects the args to be in different slots. */ -+ or a0, a2, zero -+ or a2, a4, zero -+ or a3, a6, zero -+ or a4, a5, zero -+ -+ /* Do the system call. */ -+ li.d a7,__NR_clone -+ syscall 0 -+ -+ blt a0, zero ,L (error) -+ beqz a0,L (thread_start) -+ -+ /* Successful return from the parent. */ -+ ret -+ -+L (invalid): -+ li.d a0, -EINVAL -+ /* Something bad happened -- no child created. */ -+L (error): -+ b __syscall_error -+ END (__clone) -+ -+/* Load up the arguments to the function. Put this block of code in -+ its own function so that we can terminate the stack trace with our -+ debug info. */ -+ -+ENTRY (__thread_start) -+L (thread_start): -+ /* Terminate call stack by noting ra is undefined. Use a dummy -+ .cfi_label to force starting the FDE. */ -+ .cfi_label .Ldummy -+ cfi_undefined (1) -+ -+ /* Restore the arg for user's function. */ -+ ld.d a1, sp, 0 /* Function pointer. */ -+ ld.d a0, sp, SZREG /* Argument pointer. */ -+ -+ /* Call the user's function. */ -+ jirl ra, a1, 0 -+ -+ /* Call exit with the function's return value. */ -+ li.d a7, __NR_exit -+ syscall 0 -+ -+ END (__thread_start) -+ -+libc_hidden_def (__clone) -+weak_alias (__clone, clone) -diff --git a/sysdeps/unix/sysv/linux/loongarch/configure b/sysdeps/unix/sysv/linux/loongarch/configure -new file mode 100644 -index 00000000..a402323a ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/configure -@@ -0,0 +1,199 @@ -+# This file is generated from configure.ac by Autoconf. DO NOT EDIT! -+ # Local configure fragment for sysdeps/unix/sysv/linux/loongarch. -+ -+arch_minimum_kernel=4.15.0 -+ -+libc_cv_loongarch_int_abi=no -+ -+ -+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5 -+$as_echo_n "checking for grep that handles long lines and -e... " >&6; } -+if ${ac_cv_path_GREP+:} false; then : -+ $as_echo_n "(cached) " >&6 -+else -+ if test -z "$GREP"; then -+ ac_path_GREP_found=false -+ # Loop through the user's path and test for each of PROGNAME-LIST -+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin -+do -+ IFS=$as_save_IFS -+ test -z "$as_dir" && as_dir=. -+ for ac_prog in grep ggrep; do -+ for ac_exec_ext in '' $ac_executable_extensions; do -+ ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext" -+ as_fn_executable_p "$ac_path_GREP" || continue -+# Check for GNU ac_path_GREP and select it if it is found. 
-+ # Check for GNU $ac_path_GREP -+case `"$ac_path_GREP" --version 2>&1` in -+*GNU*) -+ ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;; -+*) -+ ac_count=0 -+ $as_echo_n 0123456789 >"conftest.in" -+ while : -+ do -+ cat "conftest.in" "conftest.in" >"conftest.tmp" -+ mv "conftest.tmp" "conftest.in" -+ cp "conftest.in" "conftest.nl" -+ $as_echo 'GREP' >> "conftest.nl" -+ "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break -+ diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break -+ as_fn_arith $ac_count + 1 && ac_count=$as_val -+ if test $ac_count -gt ${ac_path_GREP_max-0}; then -+ # Best one so far, save it but keep looking for a better one -+ ac_cv_path_GREP="$ac_path_GREP" -+ ac_path_GREP_max=$ac_count -+ fi -+ # 10*(2^10) chars as input seems more than enough -+ test $ac_count -gt 10 && break -+ done -+ rm -f conftest.in conftest.tmp conftest.nl conftest.out;; -+esac -+ -+ $ac_path_GREP_found && break 3 -+ done -+ done -+ done -+IFS=$as_save_IFS -+ if test -z "$ac_cv_path_GREP"; then -+ as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 -+ fi -+else -+ ac_cv_path_GREP=$GREP -+fi -+ -+fi -+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5 -+$as_echo "$ac_cv_path_GREP" >&6; } -+ GREP="$ac_cv_path_GREP" -+ -+ -+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5 -+$as_echo_n "checking for egrep... " >&6; } -+if ${ac_cv_path_EGREP+:} false; then : -+ $as_echo_n "(cached) " >&6 -+else -+ if echo a | $GREP -E '(a|b)' >/dev/null 2>&1 -+ then ac_cv_path_EGREP="$GREP -E" -+ else -+ if test -z "$EGREP"; then -+ ac_path_EGREP_found=false -+ # Loop through the user's path and test for each of PROGNAME-LIST -+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin -+do -+ IFS=$as_save_IFS -+ test -z "$as_dir" && as_dir=. -+ for ac_prog in egrep; do -+ for ac_exec_ext in '' $ac_executable_extensions; do -+ ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext" -+ as_fn_executable_p "$ac_path_EGREP" || continue -+# Check for GNU ac_path_EGREP and select it if it is found. -+ # Check for GNU $ac_path_EGREP -+case `"$ac_path_EGREP" --version 2>&1` in -+*GNU*) -+ ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;; -+*) -+ ac_count=0 -+ $as_echo_n 0123456789 >"conftest.in" -+ while : -+ do -+ cat "conftest.in" "conftest.in" >"conftest.tmp" -+ mv "conftest.tmp" "conftest.in" -+ cp "conftest.in" "conftest.nl" -+ $as_echo 'EGREP' >> "conftest.nl" -+ "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break -+ diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break -+ as_fn_arith $ac_count + 1 && ac_count=$as_val -+ if test $ac_count -gt ${ac_path_EGREP_max-0}; then -+ # Best one so far, save it but keep looking for a better one -+ ac_cv_path_EGREP="$ac_path_EGREP" -+ ac_path_EGREP_max=$ac_count -+ fi -+ # 10*(2^10) chars as input seems more than enough -+ test $ac_count -gt 10 && break -+ done -+ rm -f conftest.in conftest.tmp conftest.nl conftest.out;; -+esac -+ -+ $ac_path_EGREP_found && break 3 -+ done -+ done -+ done -+IFS=$as_save_IFS -+ if test -z "$ac_cv_path_EGREP"; then -+ as_fn_error $? 
"no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 -+ fi -+else -+ ac_cv_path_EGREP=$EGREP -+fi -+ -+ fi -+fi -+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5 -+$as_echo "$ac_cv_path_EGREP" >&6; } -+ EGREP="$ac_cv_path_EGREP" -+ -+ -+cat confdefs.h - <<_ACEOF >conftest.$ac_ext -+/* end confdefs.h. */ -+__SIZEOF_INT__ __SIZEOF_LONG__ __SIZEOF_POINTER__ -+ -+_ACEOF -+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | -+ $EGREP "4 4 4" >/dev/null 2>&1; then : -+ libc_cv_loongarch_int_abi=lp32 -+fi -+rm -f conftest* -+ -+cat confdefs.h - <<_ACEOF >conftest.$ac_ext -+/* end confdefs.h. */ -+__SIZEOF_INT__ __SIZEOF_LONG__ __SIZEOF_POINTER__ -+ -+_ACEOF -+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | -+ $EGREP "4 8 8" >/dev/null 2>&1; then : -+ libc_cv_loongarch_int_abi=lp64 -+fi -+rm -f conftest* -+ -+if test $libc_cv_loongarch_int_abi = no; then -+ as_fn_error $? "Unable to determine integer ABI" "$LINENO" 5 -+fi -+ -+config_vars="$config_vars -+default-abi = $libc_cv_loongarch_int_abi" -+ -+case $libc_cv_loongarch_int_abi in -+lp32) -+ test -n "$libc_cv_slibdir" || -+case "$prefix" in -+/usr | /usr/) -+ libc_cv_slibdir='/lib32' -+ libc_cv_rtlddir='/lib32' -+ if test "$libdir" = '${exec_prefix}/lib'; then -+ libdir='${exec_prefix}/lib32'; -+ # Locale data can be shared between 32-bit and 64-bit libraries. -+ libc_cv_complocaledir='${exec_prefix}/lib/locale' -+ fi -+ ;; -+esac -+ ;; -+lp64) -+ test -n "$libc_cv_slibdir" || -+case "$prefix" in -+/usr | /usr/) -+ libc_cv_slibdir='/lib64' -+ libc_cv_rtlddir='/lib64' -+ if test "$libdir" = '${exec_prefix}/lib'; then -+ libdir='${exec_prefix}/lib64'; -+ # Locale data can be shared between 32-bit and 64-bit libraries. -+ libc_cv_complocaledir='${exec_prefix}/lib/locale' -+ fi -+ ;; -+esac -+ ;; -+esac -+ -+ldd_rewrite_script=sysdeps/unix/sysv/linux/loongarch/ldd-rewrite.sed -diff --git a/sysdeps/unix/sysv/linux/loongarch/configure.ac b/sysdeps/unix/sysv/linux/loongarch/configure.ac -new file mode 100644 -index 00000000..fef4f4d2 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/configure.ac -@@ -0,0 +1,27 @@ -+sinclude(./aclocal.m4)dnl Autoconf lossage -+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. -+# Local configure fragment for sysdeps/unix/sysv/linux/loongarch. -+ -+arch_minimum_kernel=4.15.0 -+ -+libc_cv_loongarch_int_abi=no -+AC_EGREP_CPP(4 4 4, [__SIZEOF_INT__ __SIZEOF_LONG__ __SIZEOF_POINTER__ -+ ], libc_cv_loongarch_int_abi=lp32) -+AC_EGREP_CPP(4 8 8, [__SIZEOF_INT__ __SIZEOF_LONG__ __SIZEOF_POINTER__ -+ ], libc_cv_loongarch_int_abi=lp64) -+if test $libc_cv_loongarch_int_abi = no; then -+ AC_MSG_ERROR([Unable to determine integer ABI]) -+fi -+ -+LIBC_CONFIG_VAR([default-abi], [$libc_cv_loongarch_int_abi]) -+ -+case $libc_cv_loongarch_int_abi in -+lp32) -+ LIBC_SLIBDIR_RTLDDIR([lib32], [lib32]) -+ ;; -+lp64) -+ LIBC_SLIBDIR_RTLDDIR([lib64], [lib]) -+ ;; -+esac -+ -+ldd_rewrite_script=sysdeps/unix/sysv/linux/loongarch/ldd-rewrite.sed -diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c -new file mode 100644 -index 00000000..80870f3c ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c -@@ -0,0 +1,32 @@ -+/* Initialize CPU feature data. LoongArch64 version. -+ This file is part of the GNU C Library. -+ Copyright (C) 2022 Free Software Foundation, Inc. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+static inline void -+init_cpu_features (struct cpu_features *cpu_features) -+{ -+ register uint64_t cpucfg_word = UINT64_MAX; -+ -+ __cpucfg(cpucfg_word, 0); -+ cpu_features->cpucfg_prid = cpucfg_word; -+ -+ __cpucfg(cpucfg_word, 2); -+ cpu_features->cpucfg_word_idx2 = cpucfg_word; -+} -diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -new file mode 100644 -index 00000000..b46a8489 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -@@ -0,0 +1,53 @@ -+/* Initialize CPU feature data. LoongArch64 version. -+ This file is part of the GNU C Library. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef _CPU_FEATURES_LOONGARCH64_H -+#define _CPU_FEATURES_LOONGARCH64_H -+ -+#include -+#include -+ -+#define LA264 0x14a000 -+#define LA364 0x14b000 -+#define LA464 0x14c011 -+ -+struct cpu_features -+{ -+ uint64_t cpucfg_prid; -+ uint64_t cpucfg_word_idx2; -+}; -+ -+/* Get a pointer to the CPU features structure. */ -+extern const struct cpu_features *_dl_larch_get_cpu_features (void) -+ __attribute__ ((pure)); -+ -+#define __cpucfg(ret, index) \ -+ asm volatile ("or %1, %0, $zero\n" \ -+ "cpucfg %0, %0\n" \ -+ :"=r"(ret) \ -+ :"r"(index)); -+ -+#define IS_LA264(prid) (prid == LA264) -+#define IS_LA364(prid) (prid == LA364) -+#define IS_LA464(prid) (prid == LA464) -+#define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL) -+#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) -+#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) -+ -+#endif /* _CPU_FEATURES_LOONGARCH64_H */ -+ -diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c b/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c -new file mode 100644 -index 00000000..31e92898 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c -@@ -0,0 +1,60 @@ -+/* Data for LoongArch64 version of processor capability information. -+ Linux version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* If anything should be added here check whether the size of each string -+ is still ok with the given array size. -+ -+ All the #ifdefs in the definitions are quite irritating but -+ necessary if we want to avoid duplicating the information. There -+ are three different modes: -+ -+ - PROCINFO_DECL is defined. This means we are only interested in -+ declarations. -+ -+ - PROCINFO_DECL is not defined: -+ -+ + if SHARED is defined the file is included in an array -+ initializer. The .element = { ... } syntax is needed. -+ -+ + if SHARED is not defined a normal array initialization is -+ needed. -+ */ -+ -+#ifndef PROCINFO_CLASS -+# define PROCINFO_CLASS -+#endif -+ -+#if !IS_IN (ldconfig) -+# if !defined PROCINFO_DECL && defined SHARED -+ ._dl_larch_cpu_features -+# else -+PROCINFO_CLASS struct cpu_features _dl_larch_cpu_features -+# endif -+# ifndef PROCINFO_DECL -+= { } -+# endif -+# if !defined SHARED || defined PROCINFO_DECL -+; -+# else -+, -+# endif -+#endif -+ -+#undef PROCINFO_DECL -+#undef PROCINFO_CLASS -diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-static.c b/sysdeps/unix/sysv/linux/loongarch/dl-static.c -new file mode 100644 -index 00000000..12b030f0 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/dl-static.c -@@ -0,0 +1,84 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+ -+#ifdef SHARED -+ -+void -+_dl_var_init (void *array[]) -+{ -+ /* It has to match "variables" below. 
*/ -+ enum -+ { -+ DL_PAGESIZE = 0 -+ }; -+ -+ GLRO(dl_pagesize) = *((size_t *) array[DL_PAGESIZE]); -+} -+ -+#else -+ -+static void *variables[] = -+{ -+ &GLRO(dl_pagesize) -+}; -+ -+static void -+_dl_unprotect_relro (struct link_map *l) -+{ -+ ElfW(Addr) start = ((l->l_addr + l->l_relro_addr) -+ & ~(GLRO(dl_pagesize) - 1)); -+ ElfW(Addr) end = ((l->l_addr + l->l_relro_addr + l->l_relro_size) -+ & ~(GLRO(dl_pagesize) - 1)); -+ -+ if (start != end) -+ __mprotect ((void *) start, end - start, PROT_READ | PROT_WRITE); -+} -+ -+void -+_dl_static_init (struct link_map *l) -+{ -+ struct link_map *rtld_map = l; -+ struct r_scope_elem **scope; -+ const ElfW(Sym) *ref = NULL; -+ lookup_t loadbase; -+ void (*f) (void *[]); -+ size_t i; -+ -+ loadbase = _dl_lookup_symbol_x ("_dl_var_init", l, &ref, l->l_local_scope, -+ NULL, 0, 1, NULL); -+ -+ for (scope = l->l_local_scope; *scope != NULL; scope++) -+ for (i = 0; i < (*scope)->r_nlist; i++) -+ if ((*scope)->r_list[i] == loadbase) -+ { -+ rtld_map = (*scope)->r_list[i]; -+ break; -+ } -+ -+ if (ref != NULL) -+ { -+ f = (void (*) (void *[])) DL_SYMBOL_ADDRESS (loadbase, ref); -+ _dl_unprotect_relro (rtld_map); -+ f (variables); -+ _dl_protect_relro (rtld_map); -+ } -+} -+ -+#endif -diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c b/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c -new file mode 100644 -index 00000000..1fe72410 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c -@@ -0,0 +1,21 @@ -+/* Operating system support for run-time dynamic linker. LoongArch version. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+#include -diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-tunables.list b/sysdeps/unix/sysv/linux/loongarch/dl-tunables.list -new file mode 100644 -index 00000000..c8f9793e ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/dl-tunables.list -@@ -0,0 +1,29 @@ -+# Order of tunables in RHEL 8.7.0. 
-+@order glibc.rtld.nns -+@order glibc.elision.skip_lock_after_retries -+@order glibc.malloc.trim_threshold -+@order glibc.malloc.perturb -+@order glibc.elision.tries -+@order glibc.elision.enable -+@order glibc.malloc.mxfast -+@order glibc.elision.skip_lock_busy -+@order glibc.malloc.top_pad -+@order glibc.cpu.hwcaps -+@order glibc.cpu.hwcap_mask -+@order glibc.malloc.mmap_max -+@order glibc.elision.skip_trylock_internal_abort -+@order glibc.malloc.tcache_unsorted_limit -+@order glibc.elision.skip_lock_internal_abort -+@order glibc.malloc.arena_max -+@order glibc.malloc.mmap_threshold -+@order glibc.malloc.tcache_count -+@order glibc.malloc.arena_test -+@order glibc.rtld.optional_static_tls -+@order glibc.malloc.tcache_max -+@order glibc.malloc.check -+ -+# Tunables added in RHEL 8.8.0 -+@order glibc.rtld.dynamic_sort -+ -+@order glibc.gmon.minarcs -+@order glibc.gmon.maxarcs -diff --git a/sysdeps/unix/sysv/linux/loongarch/getcontext.S b/sysdeps/unix/sysv/linux/loongarch/getcontext.S -new file mode 100644 -index 00000000..9c28d958 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/getcontext.S -@@ -0,0 +1,72 @@ -+/* Save current context. -+ Copyright (C) 2009-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include "ucontext-macros.h" -+ -+/* int getcontext (ucontext_t *ucp) */ -+ -+ .text -+LEAF (__getcontext) -+ SAVE_INT_REG (ra, 1, a0) -+ SAVE_INT_REG (sp, 3, a0) -+ SAVE_INT_REG (zero, 4, a0) /* return 0 by overwriting a0. */ -+ SAVE_INT_REG (x, 21, a0) -+ SAVE_INT_REG (fp, 22, a0) -+ SAVE_INT_REG (s0, 23, a0) -+ SAVE_INT_REG (s1, 24, a0) -+ SAVE_INT_REG (s2, 25, a0) -+ SAVE_INT_REG (s3, 26, a0) -+ SAVE_INT_REG (s4, 27, a0) -+ SAVE_INT_REG (s5, 28, a0) -+ SAVE_INT_REG (s6, 29, a0) -+ SAVE_INT_REG (s7, 30, a0) -+ SAVE_INT_REG (s8, 31, a0) -+ st.d ra, a0, MCONTEXT_PC -+ -+#ifndef __loongarch_soft_float -+ movfcsr2gr a1, $r0 -+ -+ SAVE_FP_REG (fs0, 24, a0) -+ SAVE_FP_REG (fs1, 25, a0) -+ SAVE_FP_REG (fs2, 26, a0) -+ SAVE_FP_REG (fs3, 27, a0) -+ SAVE_FP_REG (fs4, 28, a0) -+ SAVE_FP_REG (fs5, 29, a0) -+ SAVE_FP_REG (fs6, 30, a0) -+ SAVE_FP_REG (fs7, 31, a0) -+ -+ st.w a1, a0, MCONTEXT_FCSR -+#endif /* __loongarch_soft_float */ -+ -+/* rt_sigprocmask (SIG_BLOCK, NULL, &ucp->uc_sigmask, _NSIG8) */ -+ li.d a3, _NSIG8 -+ addi.d a2, a0, UCONTEXT_SIGMASK -+ ori a1, zero,0 -+ li.d a0, SIG_BLOCK -+ -+ li.d a7, SYS_ify (rt_sigprocmask) -+ syscall 0 -+ blt a0, zero, 99f -+ -+ jirl $r0, $r1, 0 -+ -+99: b __syscall_error -+ -+PSEUDO_END (__getcontext) -+ -+weak_alias (__getcontext, getcontext) -diff --git a/sysdeps/unix/sysv/linux/loongarch/getpid.c b/sysdeps/unix/sysv/linux/loongarch/getpid.c -new file mode 100644 -index 00000000..5b4edb2b ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/getpid.c -@@ -0,0 +1,54 @@ -+/* getpid - get the pid. Linux/Loongarch version. 
-+ Copyright (C) 2015-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+ -+#ifdef SHARED -+# include -+# include -+ -+static pid_t -+__getpid_syscall (void) -+{ -+ return INLINE_SYSCALL (getpid, 0); -+} -+ -+# ifndef __getpid_type -+# define __getpid_type __getpid -+# endif -+ -+# undef INIT_ARCH -+# define INIT_ARCH() PREPARE_VERSION_KNOWN (linux26, LINUX_2_6) -+libc_ifunc_hidden (__getpid_type, __getpid, (_dl_vdso_vsym ("__vdso_getpid", &linux26) ?: &__getpid_syscall)) -+libc_hidden_def (__getpid) -+ -+#else -+ -+# include -+# include -+ -+pid_t -+__getpid (void) -+{ -+ return INLINE_SYSCALL (getpid, 0); -+} -+libc_hidden_def (__getpid); -+ -+#endif -+weak_alias (__getpid, getpid) -+libc_hidden_weak (getpid) -diff --git a/sysdeps/unix/sysv/linux/loongarch/gettimeofday.c b/sysdeps/unix/sysv/linux/loongarch/gettimeofday.c -new file mode 100644 -index 00000000..902b1a5d ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/gettimeofday.c -@@ -0,0 +1,58 @@ -+/* gettimeofday - get the time. Linux/LoongArch version. -+ Copyright (C) 2015-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+ -+#ifdef SHARED -+ -+# include -+# include -+ -+static int -+__gettimeofday_syscall (struct timeval *tv, struct timezone *tz) -+{ -+ return INLINE_SYSCALL (gettimeofday, 2, tv, tz); -+} -+ -+# ifndef __gettimeofday_type -+# define __gettimeofday_type __gettimeofday -+# endif -+ -+# undef INIT_ARCH -+# define INIT_ARCH() PREPARE_VERSION_KNOWN (linux26, LINUX_2_6) -+/* If the vDSO is not available we fall back to syscall. 
*/ -+libc_ifunc_hidden (__gettimeofday_type, __gettimeofday, -+ (_dl_vdso_vsym ("__vdso_gettimeofday", &linux26) -+ ?: &__gettimeofday_syscall)) -+libc_hidden_def (__gettimeofday) -+ -+#else -+ -+# include -+# include -+ -+int -+__gettimeofday (struct timeval *tv, struct timezone *tz) -+{ -+ return INLINE_SYSCALL (gettimeofday, 2, tv, tz); -+} -+libc_hidden_def (__gettimeofday) -+ -+#endif -+weak_alias (__gettimeofday, gettimeofday) -+libc_hidden_weak (gettimeofday) -diff --git a/sysdeps/unix/sysv/linux/loongarch/getuid.c b/sysdeps/unix/sysv/linux/loongarch/getuid.c -new file mode 100644 -index 00000000..4b3f95eb ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/getuid.c -@@ -0,0 +1,60 @@ -+/* getuid - get the uid. Linux/Loongarch version. -+ Copyright (C) 2015-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+ -+#ifdef SHARED -+ -+# include -+# include -+ -+libc_hidden_proto (getuid) -+ -+extern __uid_t __getuid (void); -+libc_hidden_proto (__getuid) -+ -+static uid_t -+__getuid_syscall(void) -+{ -+ return INLINE_SYSCALL (getuid, 0); -+} -+ -+# ifndef __getuid_type -+# define __getuid_type __getuid -+# endif -+ -+# undef INIT_ARCH -+# define INIT_ARCH() PREPARE_VERSION_KNOWN (linux26, LINUX_2_6) -+libc_ifunc_hidden (__getuid_type, __getuid, (_dl_vdso_vsym ("__vdso_getuid", &linux26) ?: &__getuid_syscall)) -+libc_hidden_def (__getuid) -+ -+#else -+ -+# include -+# include -+ -+uid_t -+__getuid(void) -+{ -+ return INLINE_SYSCALL (getuid, 0); -+} -+libc_hidden_def (__getuid) -+ -+#endif -+weak_alias (__getuid, getuid) -+libc_hidden_weak (getuid) -diff --git a/sysdeps/unix/sysv/linux/loongarch/init-first.c b/sysdeps/unix/sysv/linux/loongarch/init-first.c -new file mode 100644 -index 00000000..5185a831 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/init-first.c -@@ -0,0 +1,57 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public License as -+ published by the Free Software Foundation; either version 2.1 of the -+ License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+#ifdef SHARED -+# include -+# include -+ -+long int (*VDSO_SYMBOL (getcpu)) (unsigned int *, unsigned int *, void *) -+ attribute_hidden; -+long int (*VDSO_SYMBOL (gettimeofday)) (struct timeval *, void *) -+ attribute_hidden; -+long int (*VDSO_SYMBOL (clock_gettime)) (clockid_t, struct timespec *) -+ attribute_hidden; -+long int (*VDSO_SYMBOL (clock_getres)) (clockid_t, struct timespec *) -+ attribute_hidden; -+ -+static inline void -+_libc_vdso_platform_setup (void) -+{ -+ PREPARE_VERSION_KNOWN (linux_version, LINUX_2_6); -+ -+ void *p = _dl_vdso_vsym ("__vdso_getcpu", &linux_version); -+ PTR_MANGLE (p); -+ VDSO_SYMBOL (getcpu) = p; -+ -+ p = _dl_vdso_vsym ("__vdso_gettimeofday", &linux_version); -+ PTR_MANGLE (p); -+ VDSO_SYMBOL (gettimeofday) = p; -+ -+ p = _dl_vdso_vsym ("__vdso_clock_gettime", &linux_version); -+ PTR_MANGLE (p); -+ VDSO_SYMBOL (clock_gettime) = p; -+ -+ p = _dl_vdso_vsym ("__vdso_clock_getres", &linux_version); -+ PTR_MANGLE (p); -+ VDSO_SYMBOL (clock_getres) = p; -+} -+ -+# define VDSO_SETUP _libc_vdso_platform_setup -+#endif -+ -+#include -diff --git a/sysdeps/unix/sysv/linux/loongarch/ipc_priv.h b/sysdeps/unix/sysv/linux/loongarch/ipc_priv.h -new file mode 100644 -index 00000000..51583429 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/ipc_priv.h -@@ -0,0 +1,21 @@ -+/* Old SysV permission definition for Linux. LoongArch version. -+ Copyright (C) 2020 Loongson Technology, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include /* For __key_t */ -+ -+#define __IPC_64 0x0 -diff --git a/sysdeps/unix/sysv/linux/loongarch/kernel-features.h b/sysdeps/unix/sysv/linux/loongarch/kernel-features.h -new file mode 100644 -index 00000000..c87c7967 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/kernel-features.h -@@ -0,0 +1,24 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ * -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include_next -+ -+/* No support for PI mutexes or robust futexes before 4.20. 
*/ -+#if __LINUX_KERNEL_VERSION < 0x041400 -+# undef __ASSUME_SET_ROBUST_LIST -+#endif -diff --git a/sysdeps/unix/sysv/linux/loongarch/ldd-rewrite.sed b/sysdeps/unix/sysv/linux/loongarch/ldd-rewrite.sed -new file mode 100644 -index 00000000..131c5f14 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/ldd-rewrite.sed -@@ -0,0 +1 @@ -+s_^\(RTLDLIST=\)\(.*lib/\)\(ld-linux\)-\(loongarch64\)-\(lp64\)\(d*\)\(\.so\.[0-9.]*\)_\1"\2\3-\4-\5\7 \2\3-\4-\5d\7"_ -diff --git a/sysdeps/unix/sysv/linux/loongarch/ldsodefs.h b/sysdeps/unix/sysv/linux/loongarch/ldsodefs.h -new file mode 100644 -index 00000000..c0fc7046 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/ldsodefs.h -@@ -0,0 +1,32 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LDSODEFS_H -+ -+/* Get the real definitions. */ -+#include_next -+ -+/* Now define our stuff. */ -+ -+/* We need special support to initialize DSO loaded for statically linked -+ binaries. */ -+extern void _dl_static_init (struct link_map *map); -+#undef DL_STATIC_INIT -+#define DL_STATIC_INIT(map) _dl_static_init (map) -+ -+#endif /* ldsodefs.h */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/libc-start.c b/sysdeps/unix/sysv/linux/loongarch/libc-start.c -new file mode 100644 -index 00000000..047ad751 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/libc-start.c -@@ -0,0 +1,28 @@ -+/* Override csu/libc-start.c on LoongArch64. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef SHARED -+# include -+# include -+ -+extern struct cpu_features _dl_larch_cpu_features; -+ -+# define ARCH_INIT_CPU_FEATURES() init_cpu_features (&_dl_larch_cpu_features) -+ -+#endif -+#include -diff --git a/sysdeps/unix/sysv/linux/loongarch/libc-vdso.h b/sysdeps/unix/sysv/linux/loongarch/libc-vdso.h -new file mode 100644 -index 00000000..658c27a5 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/libc-vdso.h -@@ -0,0 +1,37 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LIBC_VDSO_H -+#define _LIBC_VDSO_H -+ -+#ifdef SHARED -+ -+# include -+ -+extern long int (*VDSO_SYMBOL (getcpu)) (unsigned int *, unsigned int *, void *) -+ attribute_hidden; -+extern long int (*VDSO_SYMBOL (gettimeofday)) (struct timeval *, void *) -+ attribute_hidden; -+extern long int (*VDSO_SYMBOL (clock_gettime)) (clockid_t, struct timespec *) -+ attribute_hidden; -+extern long int (*VDSO_SYMBOL (clock_getres)) (clockid_t, struct timespec *) -+ attribute_hidden; -+ -+#endif -+ -+#endif /* _LIBC_VDSO_H */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/localplt.data b/sysdeps/unix/sysv/linux/loongarch/localplt.data -new file mode 100644 -index 00000000..0ed8650b ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/localplt.data -@@ -0,0 +1,13 @@ -+# See scripts/check-localplt.awk for how this file is processed. -+# PLT use is required for the malloc family and for matherr because -+# users can define their own functions and have library internals call them. -+libc.so: calloc -+libc.so: free -+libc.so: malloc -+libc.so: memalign -+libc.so: realloc -+# The TLS-enabled version of these functions is interposed from libc.so. 
-+ld.so: _dl_signal_error -+ld.so: _dl_catch_error -+ld.so: _dl_signal_exception -+ld.so: _dl_catch_exception -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/Implies b/sysdeps/unix/sysv/linux/loongarch/lp64/Implies -new file mode 100644 -index 00000000..117c2b8e ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/Implies -@@ -0,0 +1,3 @@ -+unix/sysv/linux/loongarch -+unix/sysv/linux/generic -+unix/sysv/linux/wordsize-64 -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/c++-types.data b/sysdeps/unix/sysv/linux/loongarch/lp64/c++-types.data -new file mode 100644 -index 00000000..ac925ccb ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/c++-types.data -@@ -0,0 +1,67 @@ -+blkcnt64_t:l -+blkcnt_t:l -+blksize_t:i -+caddr_t:Pc -+clockid_t:i -+clock_t:l -+daddr_t:i -+dev_t:m -+fd_mask:l -+fsblkcnt64_t:m -+fsblkcnt_t:m -+fsfilcnt64_t:m -+fsfilcnt_t:m -+fsid_t:8__fsid_t -+gid_t:j -+id_t:j -+ino64_t:m -+ino_t:m -+int16_t:s -+int32_t:i -+int64_t:l -+int8_t:a -+intptr_t:l -+key_t:i -+loff_t:l -+mode_t:j -+nlink_t:j -+off64_t:l -+off_t:l -+pid_t:i -+pthread_attr_t:14pthread_attr_t -+pthread_barrier_t:17pthread_barrier_t -+pthread_barrierattr_t:21pthread_barrierattr_t -+pthread_cond_t:14pthread_cond_t -+pthread_condattr_t:18pthread_condattr_t -+pthread_key_t:j -+pthread_mutex_t:15pthread_mutex_t -+pthread_mutexattr_t:19pthread_mutexattr_t -+pthread_once_t:i -+pthread_rwlock_t:16pthread_rwlock_t -+pthread_rwlockattr_t:20pthread_rwlockattr_t -+pthread_spinlock_t:i -+pthread_t:m -+quad_t:l -+register_t:l -+rlim64_t:m -+rlim_t:m -+sigset_t:10__sigset_t -+size_t:m -+socklen_t:j -+ssize_t:l -+suseconds_t:l -+time_t:l -+u_char:h -+uid_t:j -+uint:j -+u_int:j -+u_int16_t:t -+u_int32_t:j -+u_int64_t:m -+u_int8_t:h -+ulong:m -+u_long:m -+u_quad_t:m -+useconds_t:j -+ushort:t -+u_short:t -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/jmp_buf-macros.h b/sysdeps/unix/sysv/linux/loongarch/lp64/jmp_buf-macros.h -new file mode 100644 -index 00000000..e1c96e67 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/jmp_buf-macros.h -@@ -0,0 +1,41 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
*/ -+ -+/* Produced by this program: -+ -+ #include -+ #include -+ #include -+ #include -+ -+ int main (int argc, char **argv) -+ { -+ printf ("#define JMP_BUF_SIZE %d\n", sizeof (jmp_buf)); -+ printf ("#define JMP_BUF_ALIGN %d\n", __alignof__ (jmp_buf)); -+ printf ("#define SIGJMP_BUF_SIZE %d\n", sizeof (sigjmp_buf)); -+ printf ("#define SIGJMP_BUF_ALIGN %d\n", __alignof__ (sigjmp_buf)); -+ printf ("#define MASK_WAS_SAVED_OFFSET %d\n", offsetof (struct __jmp_buf_tag, __mask_was_saved)); -+ printf ("#define SAVED_MASK_OFFSET %d\n", offsetof (struct __jmp_buf_tag, __saved_mask)); -+ } */ -+ -+# define JMP_BUF_SIZE 304 -+# define JMP_BUF_ALIGN 8 -+# define SIGJMP_BUF_SIZE 304 -+# define SIGJMP_BUF_ALIGN 8 -+# define MASK_WAS_SAVED_OFFSET 168 -+# define SAVED_MASK_OFFSET 176 -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/ld.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/ld.abilist -new file mode 100644 -index 00000000..845f356c ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/ld.abilist -@@ -0,0 +1,5 @@ -+GLIBC_2.27 __libc_stack_end D 0x8 -+GLIBC_2.27 __stack_chk_guard D 0x8 -+GLIBC_2.27 __tls_get_addr F -+GLIBC_2.27 _dl_mcount F -+GLIBC_2.27 _r_debug D 0x28 -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libBrokenLocale.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libBrokenLocale.abilist -new file mode 100644 -index 00000000..18968d3c ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libBrokenLocale.abilist -@@ -0,0 +1 @@ -+GLIBC_2.27 __ctype_get_mb_cur_max F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libanl.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libanl.abilist -new file mode 100644 -index 00000000..711fc87c ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libanl.abilist -@@ -0,0 +1,4 @@ -+GLIBC_2.27 gai_cancel F -+GLIBC_2.27 gai_error F -+GLIBC_2.27 gai_suspend F -+GLIBC_2.27 getaddrinfo_a F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libc.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libc.abilist -new file mode 100644 -index 00000000..4d8733f2 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libc.abilist -@@ -0,0 +1,2101 @@ -+GLIBC_2.27 _Exit F -+GLIBC_2.27 _IO_2_1_stderr_ D 0xe0 -+GLIBC_2.27 _IO_2_1_stdin_ D 0xe0 -+GLIBC_2.27 _IO_2_1_stdout_ D 0xe0 -+GLIBC_2.27 _IO_adjust_column F -+GLIBC_2.27 _IO_adjust_wcolumn F -+GLIBC_2.27 _IO_default_doallocate F -+GLIBC_2.27 _IO_default_finish F -+GLIBC_2.27 _IO_default_pbackfail F -+GLIBC_2.27 _IO_default_uflow F -+GLIBC_2.27 _IO_default_xsgetn F -+GLIBC_2.27 _IO_default_xsputn F -+GLIBC_2.27 _IO_do_write F -+GLIBC_2.27 _IO_doallocbuf F -+GLIBC_2.27 _IO_fclose F -+GLIBC_2.27 _IO_fdopen F -+GLIBC_2.27 _IO_feof F -+GLIBC_2.27 _IO_ferror F -+GLIBC_2.27 _IO_fflush F -+GLIBC_2.27 _IO_fgetpos F -+GLIBC_2.27 _IO_fgetpos64 F -+GLIBC_2.27 _IO_fgets F -+GLIBC_2.27 _IO_file_attach F -+GLIBC_2.27 _IO_file_close F -+GLIBC_2.27 _IO_file_close_it F -+GLIBC_2.27 _IO_file_doallocate F -+GLIBC_2.27 _IO_file_finish F -+GLIBC_2.27 _IO_file_fopen F -+GLIBC_2.27 _IO_file_init F -+GLIBC_2.27 _IO_file_jumps D 0xa8 -+GLIBC_2.27 _IO_file_open F -+GLIBC_2.27 _IO_file_overflow F -+GLIBC_2.27 _IO_file_read F -+GLIBC_2.27 _IO_file_seek F -+GLIBC_2.27 _IO_file_seekoff F -+GLIBC_2.27 _IO_file_setbuf F -+GLIBC_2.27 _IO_file_stat F -+GLIBC_2.27 _IO_file_sync F -+GLIBC_2.27 _IO_file_underflow F -+GLIBC_2.27 _IO_file_write F -+GLIBC_2.27 _IO_file_xsputn F -+GLIBC_2.27 _IO_flockfile F -+GLIBC_2.27 _IO_flush_all F -+GLIBC_2.27 _IO_flush_all_linebuffered F -+GLIBC_2.27 _IO_fopen F 
-+GLIBC_2.27 _IO_fprintf F -+GLIBC_2.27 _IO_fputs F -+GLIBC_2.27 _IO_fread F -+GLIBC_2.27 _IO_free_backup_area F -+GLIBC_2.27 _IO_free_wbackup_area F -+GLIBC_2.27 _IO_fsetpos F -+GLIBC_2.27 _IO_fsetpos64 F -+GLIBC_2.27 _IO_ftell F -+GLIBC_2.27 _IO_ftrylockfile F -+GLIBC_2.27 _IO_funlockfile F -+GLIBC_2.27 _IO_fwrite F -+GLIBC_2.27 _IO_getc F -+GLIBC_2.27 _IO_getline F -+GLIBC_2.27 _IO_getline_info F -+GLIBC_2.27 _IO_gets F -+GLIBC_2.27 _IO_init F -+GLIBC_2.27 _IO_init_marker F -+GLIBC_2.27 _IO_init_wmarker F -+GLIBC_2.27 _IO_iter_begin F -+GLIBC_2.27 _IO_iter_end F -+GLIBC_2.27 _IO_iter_file F -+GLIBC_2.27 _IO_iter_next F -+GLIBC_2.27 _IO_least_wmarker F -+GLIBC_2.27 _IO_link_in F -+GLIBC_2.27 _IO_list_all D 0x8 -+GLIBC_2.27 _IO_list_lock F -+GLIBC_2.27 _IO_list_resetlock F -+GLIBC_2.27 _IO_list_unlock F -+GLIBC_2.27 _IO_marker_delta F -+GLIBC_2.27 _IO_marker_difference F -+GLIBC_2.27 _IO_padn F -+GLIBC_2.27 _IO_peekc_locked F -+GLIBC_2.27 _IO_popen F -+GLIBC_2.27 _IO_printf F -+GLIBC_2.27 _IO_proc_close F -+GLIBC_2.27 _IO_proc_open F -+GLIBC_2.27 _IO_putc F -+GLIBC_2.27 _IO_puts F -+GLIBC_2.27 _IO_remove_marker F -+GLIBC_2.27 _IO_seekmark F -+GLIBC_2.27 _IO_seekoff F -+GLIBC_2.27 _IO_seekpos F -+GLIBC_2.27 _IO_seekwmark F -+GLIBC_2.27 _IO_setb F -+GLIBC_2.27 _IO_setbuffer F -+GLIBC_2.27 _IO_setvbuf F -+GLIBC_2.27 _IO_sgetn F -+GLIBC_2.27 _IO_sprintf F -+GLIBC_2.27 _IO_sputbackc F -+GLIBC_2.27 _IO_sputbackwc F -+GLIBC_2.27 _IO_sscanf F -+GLIBC_2.27 _IO_str_init_readonly F -+GLIBC_2.27 _IO_str_init_static F -+GLIBC_2.27 _IO_str_overflow F -+GLIBC_2.27 _IO_str_pbackfail F -+GLIBC_2.27 _IO_str_seekoff F -+GLIBC_2.27 _IO_str_underflow F -+GLIBC_2.27 _IO_sungetc F -+GLIBC_2.27 _IO_sungetwc F -+GLIBC_2.27 _IO_switch_to_get_mode F -+GLIBC_2.27 _IO_switch_to_main_wget_area F -+GLIBC_2.27 _IO_switch_to_wbackup_area F -+GLIBC_2.27 _IO_switch_to_wget_mode F -+GLIBC_2.27 _IO_un_link F -+GLIBC_2.27 _IO_ungetc F -+GLIBC_2.27 _IO_unsave_markers F -+GLIBC_2.27 _IO_unsave_wmarkers F -+GLIBC_2.27 _IO_vfprintf F -+GLIBC_2.27 _IO_vfscanf F -+GLIBC_2.27 _IO_vsprintf F -+GLIBC_2.27 _IO_wdefault_doallocate F -+GLIBC_2.27 _IO_wdefault_finish F -+GLIBC_2.27 _IO_wdefault_pbackfail F -+GLIBC_2.27 _IO_wdefault_uflow F -+GLIBC_2.27 _IO_wdefault_xsgetn F -+GLIBC_2.27 _IO_wdefault_xsputn F -+GLIBC_2.27 _IO_wdo_write F -+GLIBC_2.27 _IO_wdoallocbuf F -+GLIBC_2.27 _IO_wfile_jumps D 0xa8 -+GLIBC_2.27 _IO_wfile_overflow F -+GLIBC_2.27 _IO_wfile_seekoff F -+GLIBC_2.27 _IO_wfile_sync F -+GLIBC_2.27 _IO_wfile_underflow F -+GLIBC_2.27 _IO_wfile_xsputn F -+GLIBC_2.27 _IO_wmarker_delta F -+GLIBC_2.27 _IO_wsetb F -+GLIBC_2.27 ___brk_addr D 0x8 -+GLIBC_2.27 __adjtimex F -+GLIBC_2.27 __after_morecore_hook D 0x8 -+GLIBC_2.27 __argz_count F -+GLIBC_2.27 __argz_next F -+GLIBC_2.27 __argz_stringify F -+GLIBC_2.27 __asprintf F -+GLIBC_2.27 __asprintf_chk F -+GLIBC_2.27 __assert F -+GLIBC_2.27 __assert_fail F -+GLIBC_2.27 __assert_perror_fail F -+GLIBC_2.27 __backtrace F -+GLIBC_2.27 __backtrace_symbols F -+GLIBC_2.27 __backtrace_symbols_fd F -+GLIBC_2.27 __bsd_getpgrp F -+GLIBC_2.27 __bzero F -+GLIBC_2.27 __check_rhosts_file D 0x4 -+GLIBC_2.27 __chk_fail F -+GLIBC_2.27 __clone F -+GLIBC_2.27 __close F -+GLIBC_2.27 __cmsg_nxthdr F -+GLIBC_2.27 __confstr_chk F -+GLIBC_2.27 __connect F -+GLIBC_2.27 __ctype_b_loc F -+GLIBC_2.27 __ctype_get_mb_cur_max F -+GLIBC_2.27 __ctype_tolower_loc F -+GLIBC_2.27 __ctype_toupper_loc F -+GLIBC_2.27 __curbrk D 0x8 -+GLIBC_2.27 __cxa_at_quick_exit F -+GLIBC_2.27 __cxa_atexit F -+GLIBC_2.27 __cxa_finalize F 
-+GLIBC_2.27 __cxa_thread_atexit_impl F -+GLIBC_2.27 __cyg_profile_func_enter F -+GLIBC_2.27 __cyg_profile_func_exit F -+GLIBC_2.27 __daylight D 0x4 -+GLIBC_2.27 __dcgettext F -+GLIBC_2.27 __default_morecore F -+GLIBC_2.27 __dgettext F -+GLIBC_2.27 __dprintf_chk F -+GLIBC_2.27 __dup2 F -+GLIBC_2.27 __duplocale F -+GLIBC_2.27 __endmntent F -+GLIBC_2.27 __environ D 0x8 -+GLIBC_2.27 __errno_location F -+GLIBC_2.27 __explicit_bzero_chk F -+GLIBC_2.27 __fbufsize F -+GLIBC_2.27 __fcntl F -+GLIBC_2.27 __fdelt_chk F -+GLIBC_2.27 __fdelt_warn F -+GLIBC_2.27 __ffs F -+GLIBC_2.27 __fgets_chk F -+GLIBC_2.27 __fgets_unlocked_chk F -+GLIBC_2.27 __fgetws_chk F -+GLIBC_2.27 __fgetws_unlocked_chk F -+GLIBC_2.27 __finite F -+GLIBC_2.27 __finitef F -+GLIBC_2.27 __finitel F -+GLIBC_2.27 __flbf F -+GLIBC_2.27 __fork F -+GLIBC_2.27 __fpending F -+GLIBC_2.27 __fprintf_chk F -+GLIBC_2.27 __fpu_control D 0x4 -+GLIBC_2.27 __fpurge F -+GLIBC_2.27 __fread_chk F -+GLIBC_2.27 __fread_unlocked_chk F -+GLIBC_2.27 __freadable F -+GLIBC_2.27 __freading F -+GLIBC_2.27 __free_hook D 0x8 -+GLIBC_2.27 __freelocale F -+GLIBC_2.27 __fsetlocking F -+GLIBC_2.27 __fwprintf_chk F -+GLIBC_2.27 __fwritable F -+GLIBC_2.27 __fwriting F -+GLIBC_2.27 __fxstat F -+GLIBC_2.27 __fxstat64 F -+GLIBC_2.27 __fxstatat F -+GLIBC_2.27 __fxstatat64 F -+GLIBC_2.27 __getauxval F -+GLIBC_2.27 __getcwd_chk F -+GLIBC_2.27 __getdelim F -+GLIBC_2.27 __getdomainname_chk F -+GLIBC_2.27 __getgroups_chk F -+GLIBC_2.27 __gethostname_chk F -+GLIBC_2.27 __getlogin_r_chk F -+GLIBC_2.27 __getmntent_r F -+GLIBC_2.27 __getpagesize F -+GLIBC_2.27 __getpgid F -+GLIBC_2.27 __getpid F -+GLIBC_2.27 __gets_chk F -+GLIBC_2.27 __gettimeofday F -+GLIBC_2.27 __getwd_chk F -+GLIBC_2.27 __gmtime_r F -+GLIBC_2.27 __h_errno_location F -+GLIBC_2.27 __isalnum_l F -+GLIBC_2.27 __isalpha_l F -+GLIBC_2.27 __isascii_l F -+GLIBC_2.27 __isblank_l F -+GLIBC_2.27 __iscntrl_l F -+GLIBC_2.27 __isctype F -+GLIBC_2.27 __isdigit_l F -+GLIBC_2.27 __isgraph_l F -+GLIBC_2.27 __isinf F -+GLIBC_2.27 __isinff F -+GLIBC_2.27 __isinfl F -+GLIBC_2.27 __islower_l F -+GLIBC_2.27 __isnan F -+GLIBC_2.27 __isnanf F -+GLIBC_2.27 __isnanl F -+GLIBC_2.27 __isoc99_fscanf F -+GLIBC_2.27 __isoc99_fwscanf F -+GLIBC_2.27 __isoc99_scanf F -+GLIBC_2.27 __isoc99_sscanf F -+GLIBC_2.27 __isoc99_swscanf F -+GLIBC_2.27 __isoc99_vfscanf F -+GLIBC_2.27 __isoc99_vfwscanf F -+GLIBC_2.27 __isoc99_vscanf F -+GLIBC_2.27 __isoc99_vsscanf F -+GLIBC_2.27 __isoc99_vswscanf F -+GLIBC_2.27 __isoc99_vwscanf F -+GLIBC_2.27 __isoc99_wscanf F -+GLIBC_2.27 __isprint_l F -+GLIBC_2.27 __ispunct_l F -+GLIBC_2.27 __isspace_l F -+GLIBC_2.27 __isupper_l F -+GLIBC_2.27 __iswalnum_l F -+GLIBC_2.27 __iswalpha_l F -+GLIBC_2.27 __iswblank_l F -+GLIBC_2.27 __iswcntrl_l F -+GLIBC_2.27 __iswctype F -+GLIBC_2.27 __iswctype_l F -+GLIBC_2.27 __iswdigit_l F -+GLIBC_2.27 __iswgraph_l F -+GLIBC_2.27 __iswlower_l F -+GLIBC_2.27 __iswprint_l F -+GLIBC_2.27 __iswpunct_l F -+GLIBC_2.27 __iswspace_l F -+GLIBC_2.27 __iswupper_l F -+GLIBC_2.27 __iswxdigit_l F -+GLIBC_2.27 __isxdigit_l F -+GLIBC_2.27 __ivaliduser F -+GLIBC_2.27 __key_decryptsession_pk_LOCAL D 0x8 -+GLIBC_2.27 __key_encryptsession_pk_LOCAL D 0x8 -+GLIBC_2.27 __key_gendes_LOCAL D 0x8 -+GLIBC_2.27 __libc_allocate_rtsig F -+GLIBC_2.27 __libc_calloc F -+GLIBC_2.27 __libc_current_sigrtmax F -+GLIBC_2.27 __libc_current_sigrtmin F -+GLIBC_2.27 __libc_free F -+GLIBC_2.27 __libc_freeres F -+GLIBC_2.27 __libc_init_first F -+GLIBC_2.27 __libc_mallinfo F -+GLIBC_2.27 __libc_malloc F -+GLIBC_2.27 __libc_mallopt F 
-+GLIBC_2.27 __libc_memalign F -+GLIBC_2.27 __libc_pvalloc F -+GLIBC_2.27 __libc_realloc F -+GLIBC_2.27 __libc_sa_len F -+GLIBC_2.27 __libc_start_main F -+GLIBC_2.27 __libc_valloc F -+GLIBC_2.27 __longjmp_chk F -+GLIBC_2.27 __lseek F -+GLIBC_2.27 __lxstat F -+GLIBC_2.27 __lxstat64 F -+GLIBC_2.27 __malloc_hook D 0x8 -+GLIBC_2.27 __mbrlen F -+GLIBC_2.27 __mbrtowc F -+GLIBC_2.27 __mbsnrtowcs_chk F -+GLIBC_2.27 __mbsrtowcs_chk F -+GLIBC_2.27 __mbstowcs_chk F -+GLIBC_2.27 __memalign_hook D 0x8 -+GLIBC_2.27 __memcpy_chk F -+GLIBC_2.27 __memmove_chk F -+GLIBC_2.27 __mempcpy F -+GLIBC_2.27 __mempcpy_chk F -+GLIBC_2.27 __memset_chk F -+GLIBC_2.27 __monstartup F -+GLIBC_2.27 __morecore D 0x8 -+GLIBC_2.27 __nanosleep F -+GLIBC_2.27 __newlocale F -+GLIBC_2.27 __nl_langinfo_l F -+GLIBC_2.27 __nss_configure_lookup F -+GLIBC_2.27 __nss_database_lookup F -+GLIBC_2.27 __nss_hostname_digits_dots F -+GLIBC_2.27 __nss_next F -+GLIBC_2.27 __obstack_printf_chk F -+GLIBC_2.27 __obstack_vprintf_chk F -+GLIBC_2.27 __open F -+GLIBC_2.27 __open64 F -+GLIBC_2.27 __open64_2 F -+GLIBC_2.27 __open_2 F -+GLIBC_2.27 __openat64_2 F -+GLIBC_2.27 __openat_2 F -+GLIBC_2.27 __overflow F -+GLIBC_2.27 __pipe F -+GLIBC_2.27 __poll F -+GLIBC_2.27 __poll_chk F -+GLIBC_2.27 __posix_getopt F -+GLIBC_2.27 __ppoll_chk F -+GLIBC_2.27 __pread64 F -+GLIBC_2.27 __pread64_chk F -+GLIBC_2.27 __pread_chk F -+GLIBC_2.27 __printf_chk F -+GLIBC_2.27 __printf_fp F -+GLIBC_2.27 __profile_frequency F -+GLIBC_2.27 __progname D 0x8 -+GLIBC_2.27 __progname_full D 0x8 -+GLIBC_2.27 __ptsname_r_chk F -+GLIBC_2.27 __pwrite64 F -+GLIBC_2.27 __rawmemchr F -+GLIBC_2.27 __rcmd_errstr D 0x8 -+GLIBC_2.27 __read F -+GLIBC_2.27 __read_chk F -+GLIBC_2.27 __readlink_chk F -+GLIBC_2.27 __readlinkat_chk F -+GLIBC_2.27 __realloc_hook D 0x8 -+GLIBC_2.27 __realpath_chk F -+GLIBC_2.27 __recv_chk F -+GLIBC_2.27 __recvfrom_chk F -+GLIBC_2.27 __register_atfork F -+GLIBC_2.27 __res_init F -+GLIBC_2.27 __res_nclose F -+GLIBC_2.27 __res_ninit F -+GLIBC_2.27 __res_randomid F -+GLIBC_2.27 __res_state F -+GLIBC_2.27 __rpc_thread_createerr F -+GLIBC_2.27 __rpc_thread_svc_fdset F -+GLIBC_2.27 __rpc_thread_svc_max_pollfd F -+GLIBC_2.27 __rpc_thread_svc_pollfd F -+GLIBC_2.27 __sbrk F -+GLIBC_2.27 __sched_cpualloc F -+GLIBC_2.27 __sched_cpucount F -+GLIBC_2.27 __sched_cpufree F -+GLIBC_2.27 __sched_get_priority_max F -+GLIBC_2.27 __sched_get_priority_min F -+GLIBC_2.27 __sched_getparam F -+GLIBC_2.27 __sched_getscheduler F -+GLIBC_2.27 __sched_setscheduler F -+GLIBC_2.27 __sched_yield F -+GLIBC_2.27 __select F -+GLIBC_2.27 __send F -+GLIBC_2.27 __setmntent F -+GLIBC_2.27 __setpgid F -+GLIBC_2.27 __sigaction F -+GLIBC_2.27 __signbit F -+GLIBC_2.27 __signbitf F -+GLIBC_2.27 __signbitl F -+GLIBC_2.27 __sigpause F -+GLIBC_2.27 __sigsetjmp F -+GLIBC_2.27 __sigsuspend F -+GLIBC_2.27 __snprintf_chk F -+GLIBC_2.27 __sprintf_chk F -+GLIBC_2.27 __stack_chk_fail F -+GLIBC_2.27 __statfs F -+GLIBC_2.27 __stpcpy F -+GLIBC_2.27 __stpcpy_chk F -+GLIBC_2.27 __stpncpy F -+GLIBC_2.27 __stpncpy_chk F -+GLIBC_2.27 __strcasecmp F -+GLIBC_2.27 __strcasecmp_l F -+GLIBC_2.27 __strcasestr F -+GLIBC_2.27 __strcat_chk F -+GLIBC_2.27 __strcoll_l F -+GLIBC_2.27 __strcpy_chk F -+GLIBC_2.27 __strdup F -+GLIBC_2.27 __strerror_r F -+GLIBC_2.27 __strfmon_l F -+GLIBC_2.27 __strftime_l F -+GLIBC_2.27 __strncasecmp_l F -+GLIBC_2.27 __strncat_chk F -+GLIBC_2.27 __strncpy_chk F -+GLIBC_2.27 __strndup F -+GLIBC_2.27 __strsep_g F -+GLIBC_2.27 __strtod_internal F -+GLIBC_2.27 __strtod_l F -+GLIBC_2.27 __strtof_internal F 
-+GLIBC_2.27 __strtof_l F -+GLIBC_2.27 __strtok_r F -+GLIBC_2.27 __strtol_internal F -+GLIBC_2.27 __strtol_l F -+GLIBC_2.27 __strtold_internal F -+GLIBC_2.27 __strtold_l F -+GLIBC_2.27 __strtoll_internal F -+GLIBC_2.27 __strtoll_l F -+GLIBC_2.27 __strtoul_internal F -+GLIBC_2.27 __strtoul_l F -+GLIBC_2.27 __strtoull_internal F -+GLIBC_2.27 __strtoull_l F -+GLIBC_2.27 __strverscmp F -+GLIBC_2.27 __strxfrm_l F -+GLIBC_2.27 __swprintf_chk F -+GLIBC_2.27 __sysconf F -+GLIBC_2.27 __syslog_chk F -+GLIBC_2.27 __sysv_signal F -+GLIBC_2.27 __timezone D 0x8 -+GLIBC_2.27 __toascii_l F -+GLIBC_2.27 __tolower_l F -+GLIBC_2.27 __toupper_l F -+GLIBC_2.27 __towctrans F -+GLIBC_2.27 __towctrans_l F -+GLIBC_2.27 __towlower_l F -+GLIBC_2.27 __towupper_l F -+GLIBC_2.27 __ttyname_r_chk F -+GLIBC_2.27 __tzname D 0x10 -+GLIBC_2.27 __uflow F -+GLIBC_2.27 __underflow F -+GLIBC_2.27 __uselocale F -+GLIBC_2.27 __vasprintf_chk F -+GLIBC_2.27 __vdprintf_chk F -+GLIBC_2.27 __vfork F -+GLIBC_2.27 __vfprintf_chk F -+GLIBC_2.27 __vfscanf F -+GLIBC_2.27 __vfwprintf_chk F -+GLIBC_2.27 __vprintf_chk F -+GLIBC_2.27 __vsnprintf F -+GLIBC_2.27 __vsnprintf_chk F -+GLIBC_2.27 __vsprintf_chk F -+GLIBC_2.27 __vsscanf F -+GLIBC_2.27 __vswprintf_chk F -+GLIBC_2.27 __vsyslog_chk F -+GLIBC_2.27 __vwprintf_chk F -+GLIBC_2.27 __wait F -+GLIBC_2.27 __waitpid F -+GLIBC_2.27 __wcpcpy_chk F -+GLIBC_2.27 __wcpncpy_chk F -+GLIBC_2.27 __wcrtomb_chk F -+GLIBC_2.27 __wcscasecmp_l F -+GLIBC_2.27 __wcscat_chk F -+GLIBC_2.27 __wcscoll_l F -+GLIBC_2.27 __wcscpy_chk F -+GLIBC_2.27 __wcsftime_l F -+GLIBC_2.27 __wcsncasecmp_l F -+GLIBC_2.27 __wcsncat_chk F -+GLIBC_2.27 __wcsncpy_chk F -+GLIBC_2.27 __wcsnrtombs_chk F -+GLIBC_2.27 __wcsrtombs_chk F -+GLIBC_2.27 __wcstod_internal F -+GLIBC_2.27 __wcstod_l F -+GLIBC_2.27 __wcstof_internal F -+GLIBC_2.27 __wcstof_l F -+GLIBC_2.27 __wcstol_internal F -+GLIBC_2.27 __wcstol_l F -+GLIBC_2.27 __wcstold_internal F -+GLIBC_2.27 __wcstold_l F -+GLIBC_2.27 __wcstoll_internal F -+GLIBC_2.27 __wcstoll_l F -+GLIBC_2.27 __wcstombs_chk F -+GLIBC_2.27 __wcstoul_internal F -+GLIBC_2.27 __wcstoul_l F -+GLIBC_2.27 __wcstoull_internal F -+GLIBC_2.27 __wcstoull_l F -+GLIBC_2.27 __wcsxfrm_l F -+GLIBC_2.27 __wctomb_chk F -+GLIBC_2.27 __wctrans_l F -+GLIBC_2.27 __wctype_l F -+GLIBC_2.27 __wmemcpy_chk F -+GLIBC_2.27 __wmemmove_chk F -+GLIBC_2.27 __wmempcpy_chk F -+GLIBC_2.27 __wmemset_chk F -+GLIBC_2.27 __woverflow F -+GLIBC_2.27 __wprintf_chk F -+GLIBC_2.27 __write F -+GLIBC_2.27 __wuflow F -+GLIBC_2.27 __wunderflow F -+GLIBC_2.27 __xmknod F -+GLIBC_2.27 __xmknodat F -+GLIBC_2.27 __xpg_basename F -+GLIBC_2.27 __xpg_sigpause F -+GLIBC_2.27 __xpg_strerror_r F -+GLIBC_2.27 __xstat F -+GLIBC_2.27 __xstat64 F -+GLIBC_2.27 _authenticate F -+GLIBC_2.27 _dl_mcount_wrapper F -+GLIBC_2.27 _dl_mcount_wrapper_check F -+GLIBC_2.27 _environ D 0x8 -+GLIBC_2.27 _exit F -+GLIBC_2.27 _flushlbf F -+GLIBC_2.27 _libc_intl_domainname D 0x5 -+GLIBC_2.27 _longjmp F -+GLIBC_2.27 _mcleanup F -+GLIBC_2.27 _mcount F -+GLIBC_2.27 _nl_default_dirname D 0x12 -+GLIBC_2.27 _nl_domain_bindings D 0x8 -+GLIBC_2.27 _nl_msg_cat_cntr D 0x4 -+GLIBC_2.27 _null_auth D 0x18 -+GLIBC_2.27 _obstack_allocated_p F -+GLIBC_2.27 _obstack_begin F -+GLIBC_2.27 _obstack_begin_1 F -+GLIBC_2.27 _obstack_free F -+GLIBC_2.27 _obstack_memory_used F -+GLIBC_2.27 _obstack_newchunk F -+GLIBC_2.27 _res D 0x238 -+GLIBC_2.27 _res_hconf D 0x48 -+GLIBC_2.27 _rpc_dtablesize F -+GLIBC_2.27 _seterr_reply F -+GLIBC_2.27 _setjmp F -+GLIBC_2.27 _sys_errlist D 0x2370 -+GLIBC_2.27 _sys_nerr D 0x4 
-+GLIBC_2.27 _sys_siglist D 0x400 -+GLIBC_2.27 _tolower F -+GLIBC_2.27 _toupper F -+GLIBC_2.27 a64l F -+GLIBC_2.27 abort F -+GLIBC_2.27 abs F -+GLIBC_2.27 accept F -+GLIBC_2.27 accept4 F -+GLIBC_2.27 access F -+GLIBC_2.27 acct F -+GLIBC_2.27 addmntent F -+GLIBC_2.27 addseverity F -+GLIBC_2.27 adjtime F -+GLIBC_2.27 adjtimex F -+GLIBC_2.27 alarm F -+GLIBC_2.27 aligned_alloc F -+GLIBC_2.27 alphasort F -+GLIBC_2.27 alphasort64 F -+GLIBC_2.27 argp_err_exit_status D 0x4 -+GLIBC_2.27 argp_error F -+GLIBC_2.27 argp_failure F -+GLIBC_2.27 argp_help F -+GLIBC_2.27 argp_parse F -+GLIBC_2.27 argp_program_bug_address D 0x8 -+GLIBC_2.27 argp_program_version D 0x8 -+GLIBC_2.27 argp_program_version_hook D 0x8 -+GLIBC_2.27 argp_state_help F -+GLIBC_2.27 argp_usage F -+GLIBC_2.27 argz_add F -+GLIBC_2.27 argz_add_sep F -+GLIBC_2.27 argz_append F -+GLIBC_2.27 argz_count F -+GLIBC_2.27 argz_create F -+GLIBC_2.27 argz_create_sep F -+GLIBC_2.27 argz_delete F -+GLIBC_2.27 argz_extract F -+GLIBC_2.27 argz_insert F -+GLIBC_2.27 argz_next F -+GLIBC_2.27 argz_replace F -+GLIBC_2.27 argz_stringify F -+GLIBC_2.27 asctime F -+GLIBC_2.27 asctime_r F -+GLIBC_2.27 asprintf F -+GLIBC_2.27 atof F -+GLIBC_2.27 atoi F -+GLIBC_2.27 atol F -+GLIBC_2.27 atoll F -+GLIBC_2.27 authdes_create F -+GLIBC_2.27 authdes_getucred F -+GLIBC_2.27 authdes_pk_create F -+GLIBC_2.27 authnone_create F -+GLIBC_2.27 authunix_create F -+GLIBC_2.27 authunix_create_default F -+GLIBC_2.27 backtrace F -+GLIBC_2.27 backtrace_symbols F -+GLIBC_2.27 backtrace_symbols_fd F -+GLIBC_2.27 basename F -+GLIBC_2.27 bcmp F -+GLIBC_2.27 bcopy F -+GLIBC_2.27 bind F -+GLIBC_2.27 bind_textdomain_codeset F -+GLIBC_2.27 bindresvport F -+GLIBC_2.27 bindtextdomain F -+GLIBC_2.27 brk F -+GLIBC_2.27 bsd_signal F -+GLIBC_2.27 bsearch F -+GLIBC_2.27 btowc F -+GLIBC_2.27 bzero F -+GLIBC_2.27 c16rtomb F -+GLIBC_2.27 c32rtomb F -+GLIBC_2.27 calloc F -+GLIBC_2.27 callrpc F -+GLIBC_2.27 canonicalize_file_name F -+GLIBC_2.27 capget F -+GLIBC_2.27 capset F -+GLIBC_2.27 catclose F -+GLIBC_2.27 catgets F -+GLIBC_2.27 catopen F -+GLIBC_2.27 cbc_crypt F -+GLIBC_2.27 cfgetispeed F -+GLIBC_2.27 cfgetospeed F -+GLIBC_2.27 cfmakeraw F -+GLIBC_2.27 cfsetispeed F -+GLIBC_2.27 cfsetospeed F -+GLIBC_2.27 cfsetspeed F -+GLIBC_2.27 chdir F -+GLIBC_2.27 chflags F -+GLIBC_2.27 chmod F -+GLIBC_2.27 chown F -+GLIBC_2.27 chroot F -+GLIBC_2.27 clearenv F -+GLIBC_2.27 clearerr F -+GLIBC_2.27 clearerr_unlocked F -+GLIBC_2.27 clnt_broadcast F -+GLIBC_2.27 clnt_create F -+GLIBC_2.27 clnt_pcreateerror F -+GLIBC_2.27 clnt_perrno F -+GLIBC_2.27 clnt_perror F -+GLIBC_2.27 clnt_spcreateerror F -+GLIBC_2.27 clnt_sperrno F -+GLIBC_2.27 clnt_sperror F -+GLIBC_2.27 clntraw_create F -+GLIBC_2.27 clnttcp_create F -+GLIBC_2.27 clntudp_bufcreate F -+GLIBC_2.27 clntudp_create F -+GLIBC_2.27 clntunix_create F -+GLIBC_2.27 clock F -+GLIBC_2.27 clock_adjtime F -+GLIBC_2.27 clock_getcpuclockid F -+GLIBC_2.27 clock_getres F -+GLIBC_2.27 clock_gettime F -+GLIBC_2.27 clock_nanosleep F -+GLIBC_2.27 clock_settime F -+GLIBC_2.27 clone F -+GLIBC_2.27 close F -+GLIBC_2.27 closedir F -+GLIBC_2.27 closelog F -+GLIBC_2.27 confstr F -+GLIBC_2.27 connect F -+GLIBC_2.27 copy_file_range F -+GLIBC_2.27 copysign F -+GLIBC_2.27 copysignf F -+GLIBC_2.27 copysignl F -+GLIBC_2.27 creat F -+GLIBC_2.27 creat64 F -+GLIBC_2.27 ctermid F -+GLIBC_2.27 ctime F -+GLIBC_2.27 ctime_r F -+GLIBC_2.27 cuserid F -+GLIBC_2.27 daemon F -+GLIBC_2.27 daylight D 0x4 -+GLIBC_2.27 dcgettext F -+GLIBC_2.27 dcngettext F -+GLIBC_2.27 delete_module F -+GLIBC_2.27 
des_setparity F -+GLIBC_2.27 dgettext F -+GLIBC_2.27 difftime F -+GLIBC_2.27 dirfd F -+GLIBC_2.27 dirname F -+GLIBC_2.27 div F -+GLIBC_2.27 dl_iterate_phdr F -+GLIBC_2.27 dngettext F -+GLIBC_2.27 dprintf F -+GLIBC_2.27 drand48 F -+GLIBC_2.27 drand48_r F -+GLIBC_2.27 dup F -+GLIBC_2.27 dup2 F -+GLIBC_2.27 dup3 F -+GLIBC_2.27 duplocale F -+GLIBC_2.27 dysize F -+GLIBC_2.27 eaccess F -+GLIBC_2.27 ecb_crypt F -+GLIBC_2.27 ecvt F -+GLIBC_2.27 ecvt_r F -+GLIBC_2.27 endaliasent F -+GLIBC_2.27 endfsent F -+GLIBC_2.27 endgrent F -+GLIBC_2.27 endhostent F -+GLIBC_2.27 endmntent F -+GLIBC_2.27 endnetent F -+GLIBC_2.27 endnetgrent F -+GLIBC_2.27 endprotoent F -+GLIBC_2.27 endpwent F -+GLIBC_2.27 endrpcent F -+GLIBC_2.27 endservent F -+GLIBC_2.27 endsgent F -+GLIBC_2.27 endspent F -+GLIBC_2.27 endttyent F -+GLIBC_2.27 endusershell F -+GLIBC_2.27 endutent F -+GLIBC_2.27 endutxent F -+GLIBC_2.27 environ D 0x8 -+GLIBC_2.27 envz_add F -+GLIBC_2.27 envz_entry F -+GLIBC_2.27 envz_get F -+GLIBC_2.27 envz_merge F -+GLIBC_2.27 envz_remove F -+GLIBC_2.27 envz_strip F -+GLIBC_2.27 epoll_create F -+GLIBC_2.27 epoll_create1 F -+GLIBC_2.27 epoll_ctl F -+GLIBC_2.27 epoll_pwait F -+GLIBC_2.27 epoll_wait F -+GLIBC_2.27 erand48 F -+GLIBC_2.27 erand48_r F -+GLIBC_2.27 err F -+GLIBC_2.27 error F -+GLIBC_2.27 error_at_line F -+GLIBC_2.27 error_message_count D 0x4 -+GLIBC_2.27 error_one_per_line D 0x4 -+GLIBC_2.27 error_print_progname D 0x8 -+GLIBC_2.27 errx F -+GLIBC_2.27 ether_aton F -+GLIBC_2.27 ether_aton_r F -+GLIBC_2.27 ether_hostton F -+GLIBC_2.27 ether_line F -+GLIBC_2.27 ether_ntoa F -+GLIBC_2.27 ether_ntoa_r F -+GLIBC_2.27 ether_ntohost F -+GLIBC_2.27 euidaccess F -+GLIBC_2.27 eventfd F -+GLIBC_2.27 eventfd_read F -+GLIBC_2.27 eventfd_write F -+GLIBC_2.27 execl F -+GLIBC_2.27 execle F -+GLIBC_2.27 execlp F -+GLIBC_2.27 execv F -+GLIBC_2.27 execve F -+GLIBC_2.27 execvp F -+GLIBC_2.27 execvpe F -+GLIBC_2.27 exit F -+GLIBC_2.27 explicit_bzero F -+GLIBC_2.27 faccessat F -+GLIBC_2.27 fallocate F -+GLIBC_2.27 fallocate64 F -+GLIBC_2.27 fanotify_init F -+GLIBC_2.27 fanotify_mark F -+GLIBC_2.27 fattach F -+GLIBC_2.27 fchdir F -+GLIBC_2.27 fchflags F -+GLIBC_2.27 fchmod F -+GLIBC_2.27 fchmodat F -+GLIBC_2.27 fchown F -+GLIBC_2.27 fchownat F -+GLIBC_2.27 fclose F -+GLIBC_2.27 fcloseall F -+GLIBC_2.27 fcntl F -+GLIBC_2.27 fcvt F -+GLIBC_2.27 fcvt_r F -+GLIBC_2.27 fdatasync F -+GLIBC_2.27 fdetach F -+GLIBC_2.27 fdopen F -+GLIBC_2.27 fdopendir F -+GLIBC_2.27 feof F -+GLIBC_2.27 feof_unlocked F -+GLIBC_2.27 ferror F -+GLIBC_2.27 ferror_unlocked F -+GLIBC_2.27 fexecve F -+GLIBC_2.27 fflush F -+GLIBC_2.27 fflush_unlocked F -+GLIBC_2.27 ffs F -+GLIBC_2.27 ffsl F -+GLIBC_2.27 ffsll F -+GLIBC_2.27 fgetc F -+GLIBC_2.27 fgetc_unlocked F -+GLIBC_2.27 fgetgrent F -+GLIBC_2.27 fgetgrent_r F -+GLIBC_2.27 fgetpos F -+GLIBC_2.27 fgetpos64 F -+GLIBC_2.27 fgetpwent F -+GLIBC_2.27 fgetpwent_r F -+GLIBC_2.27 fgets F -+GLIBC_2.27 fgets_unlocked F -+GLIBC_2.27 fgetsgent F -+GLIBC_2.27 fgetsgent_r F -+GLIBC_2.27 fgetspent F -+GLIBC_2.27 fgetspent_r F -+GLIBC_2.27 fgetwc F -+GLIBC_2.27 fgetwc_unlocked F -+GLIBC_2.27 fgetws F -+GLIBC_2.27 fgetws_unlocked F -+GLIBC_2.27 fgetxattr F -+GLIBC_2.27 fileno F -+GLIBC_2.27 fileno_unlocked F -+GLIBC_2.27 finite F -+GLIBC_2.27 finitef F -+GLIBC_2.27 finitel F -+GLIBC_2.27 flistxattr F -+GLIBC_2.27 flock F -+GLIBC_2.27 flockfile F -+GLIBC_2.27 fmemopen F -+GLIBC_2.27 fmtmsg F -+GLIBC_2.27 fnmatch F -+GLIBC_2.27 fopen F -+GLIBC_2.27 fopen64 F -+GLIBC_2.27 fopencookie F -+GLIBC_2.27 fork F -+GLIBC_2.27 fpathconf 
F -+GLIBC_2.27 fprintf F -+GLIBC_2.27 fputc F -+GLIBC_2.27 fputc_unlocked F -+GLIBC_2.27 fputs F -+GLIBC_2.27 fputs_unlocked F -+GLIBC_2.27 fputwc F -+GLIBC_2.27 fputwc_unlocked F -+GLIBC_2.27 fputws F -+GLIBC_2.27 fputws_unlocked F -+GLIBC_2.27 fread F -+GLIBC_2.27 fread_unlocked F -+GLIBC_2.27 free F -+GLIBC_2.27 freeaddrinfo F -+GLIBC_2.27 freeifaddrs F -+GLIBC_2.27 freelocale F -+GLIBC_2.27 fremovexattr F -+GLIBC_2.27 freopen F -+GLIBC_2.27 freopen64 F -+GLIBC_2.27 frexp F -+GLIBC_2.27 frexpf F -+GLIBC_2.27 frexpl F -+GLIBC_2.27 fscanf F -+GLIBC_2.27 fseek F -+GLIBC_2.27 fseeko F -+GLIBC_2.27 fseeko64 F -+GLIBC_2.27 fsetpos F -+GLIBC_2.27 fsetpos64 F -+GLIBC_2.27 fsetxattr F -+GLIBC_2.27 fstatfs F -+GLIBC_2.27 fstatfs64 F -+GLIBC_2.27 fstatvfs F -+GLIBC_2.27 fstatvfs64 F -+GLIBC_2.27 fsync F -+GLIBC_2.27 ftell F -+GLIBC_2.27 ftello F -+GLIBC_2.27 ftello64 F -+GLIBC_2.27 ftime F -+GLIBC_2.27 ftok F -+GLIBC_2.27 ftruncate F -+GLIBC_2.27 ftruncate64 F -+GLIBC_2.27 ftrylockfile F -+GLIBC_2.27 fts64_children F -+GLIBC_2.27 fts64_close F -+GLIBC_2.27 fts64_open F -+GLIBC_2.27 fts64_read F -+GLIBC_2.27 fts64_set F -+GLIBC_2.27 fts_children F -+GLIBC_2.27 fts_close F -+GLIBC_2.27 fts_open F -+GLIBC_2.27 fts_read F -+GLIBC_2.27 fts_set F -+GLIBC_2.27 ftw F -+GLIBC_2.27 ftw64 F -+GLIBC_2.27 funlockfile F -+GLIBC_2.27 futimens F -+GLIBC_2.27 futimes F -+GLIBC_2.27 futimesat F -+GLIBC_2.27 fwide F -+GLIBC_2.27 fwprintf F -+GLIBC_2.27 fwrite F -+GLIBC_2.27 fwrite_unlocked F -+GLIBC_2.27 fwscanf F -+GLIBC_2.27 gai_strerror F -+GLIBC_2.27 gcvt F -+GLIBC_2.27 get_avphys_pages F -+GLIBC_2.27 get_current_dir_name F -+GLIBC_2.27 get_myaddress F -+GLIBC_2.27 get_nprocs F -+GLIBC_2.27 get_nprocs_conf F -+GLIBC_2.27 get_phys_pages F -+GLIBC_2.27 getaddrinfo F -+GLIBC_2.27 getaliasbyname F -+GLIBC_2.27 getaliasbyname_r F -+GLIBC_2.27 getaliasent F -+GLIBC_2.27 getaliasent_r F -+GLIBC_2.27 getauxval F -+GLIBC_2.27 getc F -+GLIBC_2.27 getc_unlocked F -+GLIBC_2.27 getchar F -+GLIBC_2.27 getchar_unlocked F -+GLIBC_2.27 getcontext F -+GLIBC_2.27 getcwd F -+GLIBC_2.27 getdate F -+GLIBC_2.27 getdate_err D 0x4 -+GLIBC_2.27 getdate_r F -+GLIBC_2.27 getdelim F -+GLIBC_2.27 getdirentries F -+GLIBC_2.27 getdirentries64 F -+GLIBC_2.27 getdomainname F -+GLIBC_2.27 getdtablesize F -+GLIBC_2.27 getegid F -+GLIBC_2.27 getentropy F -+GLIBC_2.27 getenv F -+GLIBC_2.27 geteuid F -+GLIBC_2.27 getfsent F -+GLIBC_2.27 getfsfile F -+GLIBC_2.27 getfsspec F -+GLIBC_2.27 getgid F -+GLIBC_2.27 getgrent F -+GLIBC_2.27 getgrent_r F -+GLIBC_2.27 getgrgid F -+GLIBC_2.27 getgrgid_r F -+GLIBC_2.27 getgrnam F -+GLIBC_2.27 getgrnam_r F -+GLIBC_2.27 getgrouplist F -+GLIBC_2.27 getgroups F -+GLIBC_2.27 gethostbyaddr F -+GLIBC_2.27 gethostbyaddr_r F -+GLIBC_2.27 gethostbyname F -+GLIBC_2.27 gethostbyname2 F -+GLIBC_2.27 gethostbyname2_r F -+GLIBC_2.27 gethostbyname_r F -+GLIBC_2.27 gethostent F -+GLIBC_2.27 gethostent_r F -+GLIBC_2.27 gethostid F -+GLIBC_2.27 gethostname F -+GLIBC_2.27 getifaddrs F -+GLIBC_2.27 getipv4sourcefilter F -+GLIBC_2.27 getitimer F -+GLIBC_2.27 getline F -+GLIBC_2.27 getloadavg F -+GLIBC_2.27 getlogin F -+GLIBC_2.27 getlogin_r F -+GLIBC_2.27 getmntent F -+GLIBC_2.27 getmntent_r F -+GLIBC_2.27 getmsg F -+GLIBC_2.27 getnameinfo F -+GLIBC_2.27 getnetbyaddr F -+GLIBC_2.27 getnetbyaddr_r F -+GLIBC_2.27 getnetbyname F -+GLIBC_2.27 getnetbyname_r F -+GLIBC_2.27 getnetent F -+GLIBC_2.27 getnetent_r F -+GLIBC_2.27 getnetgrent F -+GLIBC_2.27 getnetgrent_r F -+GLIBC_2.27 getnetname F -+GLIBC_2.27 getopt F -+GLIBC_2.27 getopt_long F 
-+GLIBC_2.27 getopt_long_only F -+GLIBC_2.27 getpagesize F -+GLIBC_2.27 getpass F -+GLIBC_2.27 getpeername F -+GLIBC_2.27 getpgid F -+GLIBC_2.27 getpgrp F -+GLIBC_2.27 getpid F -+GLIBC_2.27 getpmsg F -+GLIBC_2.27 getppid F -+GLIBC_2.27 getpriority F -+GLIBC_2.27 getprotobyname F -+GLIBC_2.27 getprotobyname_r F -+GLIBC_2.27 getprotobynumber F -+GLIBC_2.27 getprotobynumber_r F -+GLIBC_2.27 getprotoent F -+GLIBC_2.27 getprotoent_r F -+GLIBC_2.27 getpt F -+GLIBC_2.27 getpublickey F -+GLIBC_2.27 getpw F -+GLIBC_2.27 getpwent F -+GLIBC_2.27 getpwent_r F -+GLIBC_2.27 getpwnam F -+GLIBC_2.27 getpwnam_r F -+GLIBC_2.27 getpwuid F -+GLIBC_2.27 getpwuid_r F -+GLIBC_2.27 getrandom F -+GLIBC_2.27 getresgid F -+GLIBC_2.27 getresuid F -+GLIBC_2.27 getrlimit F -+GLIBC_2.27 getrlimit64 F -+GLIBC_2.27 getrpcbyname F -+GLIBC_2.27 getrpcbyname_r F -+GLIBC_2.27 getrpcbynumber F -+GLIBC_2.27 getrpcbynumber_r F -+GLIBC_2.27 getrpcent F -+GLIBC_2.27 getrpcent_r F -+GLIBC_2.27 getrpcport F -+GLIBC_2.27 getrusage F -+GLIBC_2.27 gets F -+GLIBC_2.27 getsecretkey F -+GLIBC_2.27 getservbyname F -+GLIBC_2.27 getservbyname_r F -+GLIBC_2.27 getservbyport F -+GLIBC_2.27 getservbyport_r F -+GLIBC_2.27 getservent F -+GLIBC_2.27 getservent_r F -+GLIBC_2.27 getsgent F -+GLIBC_2.27 getsgent_r F -+GLIBC_2.27 getsgnam F -+GLIBC_2.27 getsgnam_r F -+GLIBC_2.27 getsid F -+GLIBC_2.27 getsockname F -+GLIBC_2.27 getsockopt F -+GLIBC_2.27 getsourcefilter F -+GLIBC_2.27 getspent F -+GLIBC_2.27 getspent_r F -+GLIBC_2.27 getspnam F -+GLIBC_2.27 getspnam_r F -+GLIBC_2.27 getsubopt F -+GLIBC_2.27 gettext F -+GLIBC_2.27 gettimeofday F -+GLIBC_2.27 getttyent F -+GLIBC_2.27 getttynam F -+GLIBC_2.27 getuid F -+GLIBC_2.27 getusershell F -+GLIBC_2.27 getutent F -+GLIBC_2.27 getutent_r F -+GLIBC_2.27 getutid F -+GLIBC_2.27 getutid_r F -+GLIBC_2.27 getutline F -+GLIBC_2.27 getutline_r F -+GLIBC_2.27 getutmp F -+GLIBC_2.27 getutmpx F -+GLIBC_2.27 getutxent F -+GLIBC_2.27 getutxid F -+GLIBC_2.27 getutxline F -+GLIBC_2.27 getw F -+GLIBC_2.27 getwc F -+GLIBC_2.27 getwc_unlocked F -+GLIBC_2.27 getwchar F -+GLIBC_2.27 getwchar_unlocked F -+GLIBC_2.27 getwd F -+GLIBC_2.27 getxattr F -+GLIBC_2.27 glob F -+GLIBC_2.27 glob64 F -+GLIBC_2.27 glob_pattern_p F -+GLIBC_2.27 globfree F -+GLIBC_2.27 globfree64 F -+GLIBC_2.27 gmtime F -+GLIBC_2.27 gmtime_r F -+GLIBC_2.27 gnu_dev_major F -+GLIBC_2.27 gnu_dev_makedev F -+GLIBC_2.27 gnu_dev_minor F -+GLIBC_2.27 gnu_get_libc_release F -+GLIBC_2.27 gnu_get_libc_version F -+GLIBC_2.27 grantpt F -+GLIBC_2.27 group_member F -+GLIBC_2.27 gsignal F -+GLIBC_2.27 gtty F -+GLIBC_2.27 h_errlist D 0x28 -+GLIBC_2.27 h_nerr D 0x4 -+GLIBC_2.27 hasmntopt F -+GLIBC_2.27 hcreate F -+GLIBC_2.27 hcreate_r F -+GLIBC_2.27 hdestroy F -+GLIBC_2.27 hdestroy_r F -+GLIBC_2.27 herror F -+GLIBC_2.27 host2netname F -+GLIBC_2.27 hsearch F -+GLIBC_2.27 hsearch_r F -+GLIBC_2.27 hstrerror F -+GLIBC_2.27 htonl F -+GLIBC_2.27 htons F -+GLIBC_2.27 iconv F -+GLIBC_2.27 iconv_close F -+GLIBC_2.27 iconv_open F -+GLIBC_2.27 if_freenameindex F -+GLIBC_2.27 if_indextoname F -+GLIBC_2.27 if_nameindex F -+GLIBC_2.27 if_nametoindex F -+GLIBC_2.27 imaxabs F -+GLIBC_2.27 imaxdiv F -+GLIBC_2.27 in6addr_any D 0x10 -+GLIBC_2.27 in6addr_loopback D 0x10 -+GLIBC_2.27 index F -+GLIBC_2.27 inet6_opt_append F -+GLIBC_2.27 inet6_opt_find F -+GLIBC_2.27 inet6_opt_finish F -+GLIBC_2.27 inet6_opt_get_val F -+GLIBC_2.27 inet6_opt_init F -+GLIBC_2.27 inet6_opt_next F -+GLIBC_2.27 inet6_opt_set_val F -+GLIBC_2.27 inet6_option_alloc F -+GLIBC_2.27 inet6_option_append F -+GLIBC_2.27 
inet6_option_find F -+GLIBC_2.27 inet6_option_init F -+GLIBC_2.27 inet6_option_next F -+GLIBC_2.27 inet6_option_space F -+GLIBC_2.27 inet6_rth_add F -+GLIBC_2.27 inet6_rth_getaddr F -+GLIBC_2.27 inet6_rth_init F -+GLIBC_2.27 inet6_rth_reverse F -+GLIBC_2.27 inet6_rth_segments F -+GLIBC_2.27 inet6_rth_space F -+GLIBC_2.27 inet_addr F -+GLIBC_2.27 inet_aton F -+GLIBC_2.27 inet_lnaof F -+GLIBC_2.27 inet_makeaddr F -+GLIBC_2.27 inet_netof F -+GLIBC_2.27 inet_network F -+GLIBC_2.27 inet_nsap_addr F -+GLIBC_2.27 inet_nsap_ntoa F -+GLIBC_2.27 inet_ntoa F -+GLIBC_2.27 inet_ntop F -+GLIBC_2.27 inet_pton F -+GLIBC_2.27 init_module F -+GLIBC_2.27 initgroups F -+GLIBC_2.27 initstate F -+GLIBC_2.27 initstate_r F -+GLIBC_2.27 innetgr F -+GLIBC_2.27 inotify_add_watch F -+GLIBC_2.27 inotify_init F -+GLIBC_2.27 inotify_init1 F -+GLIBC_2.27 inotify_rm_watch F -+GLIBC_2.27 insque F -+GLIBC_2.27 ioctl F -+GLIBC_2.27 iruserok F -+GLIBC_2.27 iruserok_af F -+GLIBC_2.27 isalnum F -+GLIBC_2.27 isalnum_l F -+GLIBC_2.27 isalpha F -+GLIBC_2.27 isalpha_l F -+GLIBC_2.27 isascii F -+GLIBC_2.27 isastream F -+GLIBC_2.27 isatty F -+GLIBC_2.27 isblank F -+GLIBC_2.27 isblank_l F -+GLIBC_2.27 iscntrl F -+GLIBC_2.27 iscntrl_l F -+GLIBC_2.27 isctype F -+GLIBC_2.27 isdigit F -+GLIBC_2.27 isdigit_l F -+GLIBC_2.27 isfdtype F -+GLIBC_2.27 isgraph F -+GLIBC_2.27 isgraph_l F -+GLIBC_2.27 isinf F -+GLIBC_2.27 isinff F -+GLIBC_2.27 isinfl F -+GLIBC_2.27 islower F -+GLIBC_2.27 islower_l F -+GLIBC_2.27 isnan F -+GLIBC_2.27 isnanf F -+GLIBC_2.27 isnanl F -+GLIBC_2.27 isprint F -+GLIBC_2.27 isprint_l F -+GLIBC_2.27 ispunct F -+GLIBC_2.27 ispunct_l F -+GLIBC_2.27 isspace F -+GLIBC_2.27 isspace_l F -+GLIBC_2.27 isupper F -+GLIBC_2.27 isupper_l F -+GLIBC_2.27 iswalnum F -+GLIBC_2.27 iswalnum_l F -+GLIBC_2.27 iswalpha F -+GLIBC_2.27 iswalpha_l F -+GLIBC_2.27 iswblank F -+GLIBC_2.27 iswblank_l F -+GLIBC_2.27 iswcntrl F -+GLIBC_2.27 iswcntrl_l F -+GLIBC_2.27 iswctype F -+GLIBC_2.27 iswctype_l F -+GLIBC_2.27 iswdigit F -+GLIBC_2.27 iswdigit_l F -+GLIBC_2.27 iswgraph F -+GLIBC_2.27 iswgraph_l F -+GLIBC_2.27 iswlower F -+GLIBC_2.27 iswlower_l F -+GLIBC_2.27 iswprint F -+GLIBC_2.27 iswprint_l F -+GLIBC_2.27 iswpunct F -+GLIBC_2.27 iswpunct_l F -+GLIBC_2.27 iswspace F -+GLIBC_2.27 iswspace_l F -+GLIBC_2.27 iswupper F -+GLIBC_2.27 iswupper_l F -+GLIBC_2.27 iswxdigit F -+GLIBC_2.27 iswxdigit_l F -+GLIBC_2.27 isxdigit F -+GLIBC_2.27 isxdigit_l F -+GLIBC_2.27 jrand48 F -+GLIBC_2.27 jrand48_r F -+GLIBC_2.27 key_decryptsession F -+GLIBC_2.27 key_decryptsession_pk F -+GLIBC_2.27 key_encryptsession F -+GLIBC_2.27 key_encryptsession_pk F -+GLIBC_2.27 key_gendes F -+GLIBC_2.27 key_get_conv F -+GLIBC_2.27 key_secretkey_is_set F -+GLIBC_2.27 key_setnet F -+GLIBC_2.27 key_setsecret F -+GLIBC_2.27 kill F -+GLIBC_2.27 killpg F -+GLIBC_2.27 klogctl F -+GLIBC_2.27 l64a F -+GLIBC_2.27 labs F -+GLIBC_2.27 lchmod F -+GLIBC_2.27 lchown F -+GLIBC_2.27 lckpwdf F -+GLIBC_2.27 lcong48 F -+GLIBC_2.27 lcong48_r F -+GLIBC_2.27 ldexp F -+GLIBC_2.27 ldexpf F -+GLIBC_2.27 ldexpl F -+GLIBC_2.27 ldiv F -+GLIBC_2.27 lfind F -+GLIBC_2.27 lgetxattr F -+GLIBC_2.27 link F -+GLIBC_2.27 linkat F -+GLIBC_2.27 listen F -+GLIBC_2.27 listxattr F -+GLIBC_2.27 llabs F -+GLIBC_2.27 lldiv F -+GLIBC_2.27 llistxattr F -+GLIBC_2.27 llseek F -+GLIBC_2.27 localeconv F -+GLIBC_2.27 localtime F -+GLIBC_2.27 localtime_r F -+GLIBC_2.27 lockf F -+GLIBC_2.27 lockf64 F -+GLIBC_2.27 longjmp F -+GLIBC_2.27 lrand48 F -+GLIBC_2.27 lrand48_r F -+GLIBC_2.27 lremovexattr F -+GLIBC_2.27 lsearch F -+GLIBC_2.27 lseek F 
-+GLIBC_2.27 lseek64 F -+GLIBC_2.27 lsetxattr F -+GLIBC_2.27 lutimes F -+GLIBC_2.27 madvise F -+GLIBC_2.27 makecontext F -+GLIBC_2.27 mallinfo F -+GLIBC_2.27 malloc F -+GLIBC_2.27 malloc_info F -+GLIBC_2.27 malloc_stats F -+GLIBC_2.27 malloc_trim F -+GLIBC_2.27 malloc_usable_size F -+GLIBC_2.27 mallopt F -+GLIBC_2.27 mallwatch D 0x8 -+GLIBC_2.27 mblen F -+GLIBC_2.27 mbrlen F -+GLIBC_2.27 mbrtoc16 F -+GLIBC_2.27 mbrtoc32 F -+GLIBC_2.27 mbrtowc F -+GLIBC_2.27 mbsinit F -+GLIBC_2.27 mbsnrtowcs F -+GLIBC_2.27 mbsrtowcs F -+GLIBC_2.27 mbstowcs F -+GLIBC_2.27 mbtowc F -+GLIBC_2.27 mcheck F -+GLIBC_2.27 mcheck_check_all F -+GLIBC_2.27 mcheck_pedantic F -+GLIBC_2.27 memalign F -+GLIBC_2.27 memccpy F -+GLIBC_2.27 memchr F -+GLIBC_2.27 memcmp F -+GLIBC_2.27 memcpy F -+GLIBC_2.27 memfd_create F -+GLIBC_2.27 memfrob F -+GLIBC_2.27 memmem F -+GLIBC_2.27 memmove F -+GLIBC_2.27 mempcpy F -+GLIBC_2.27 memrchr F -+GLIBC_2.27 memset F -+GLIBC_2.27 mincore F -+GLIBC_2.27 mkdir F -+GLIBC_2.27 mkdirat F -+GLIBC_2.27 mkdtemp F -+GLIBC_2.27 mkfifo F -+GLIBC_2.27 mkfifoat F -+GLIBC_2.27 mkostemp F -+GLIBC_2.27 mkostemp64 F -+GLIBC_2.27 mkostemps F -+GLIBC_2.27 mkostemps64 F -+GLIBC_2.27 mkstemp F -+GLIBC_2.27 mkstemp64 F -+GLIBC_2.27 mkstemps F -+GLIBC_2.27 mkstemps64 F -+GLIBC_2.27 mktemp F -+GLIBC_2.27 mktime F -+GLIBC_2.27 mlock F -+GLIBC_2.27 mlock2 F -+GLIBC_2.27 mlockall F -+GLIBC_2.27 mmap F -+GLIBC_2.27 mmap64 F -+GLIBC_2.27 modf F -+GLIBC_2.27 modff F -+GLIBC_2.27 modfl F -+GLIBC_2.27 moncontrol F -+GLIBC_2.27 monstartup F -+GLIBC_2.27 mount F -+GLIBC_2.27 mprobe F -+GLIBC_2.27 mprotect F -+GLIBC_2.27 mrand48 F -+GLIBC_2.27 mrand48_r F -+GLIBC_2.27 mremap F -+GLIBC_2.27 msgctl F -+GLIBC_2.27 msgget F -+GLIBC_2.27 msgrcv F -+GLIBC_2.27 msgsnd F -+GLIBC_2.27 msync F -+GLIBC_2.27 mtrace F -+GLIBC_2.27 munlock F -+GLIBC_2.27 munlockall F -+GLIBC_2.27 munmap F -+GLIBC_2.27 muntrace F -+GLIBC_2.27 name_to_handle_at F -+GLIBC_2.27 nanosleep F -+GLIBC_2.27 netname2host F -+GLIBC_2.27 netname2user F -+GLIBC_2.27 newlocale F -+GLIBC_2.27 nfsservctl F -+GLIBC_2.27 nftw F -+GLIBC_2.27 nftw64 F -+GLIBC_2.27 ngettext F -+GLIBC_2.27 nice F -+GLIBC_2.27 nl_langinfo F -+GLIBC_2.27 nl_langinfo_l F -+GLIBC_2.27 nrand48 F -+GLIBC_2.27 nrand48_r F -+GLIBC_2.27 ntohl F -+GLIBC_2.27 ntohs F -+GLIBC_2.27 ntp_adjtime F -+GLIBC_2.27 ntp_gettime F -+GLIBC_2.27 ntp_gettimex F -+GLIBC_2.27 obstack_alloc_failed_handler D 0x8 -+GLIBC_2.27 obstack_exit_failure D 0x4 -+GLIBC_2.27 obstack_free F -+GLIBC_2.27 obstack_printf F -+GLIBC_2.27 obstack_vprintf F -+GLIBC_2.27 on_exit F -+GLIBC_2.27 open F -+GLIBC_2.27 open64 F -+GLIBC_2.27 open_by_handle_at F -+GLIBC_2.27 open_memstream F -+GLIBC_2.27 open_wmemstream F -+GLIBC_2.27 openat F -+GLIBC_2.27 openat64 F -+GLIBC_2.27 opendir F -+GLIBC_2.27 openlog F -+GLIBC_2.27 optarg D 0x8 -+GLIBC_2.27 opterr D 0x4 -+GLIBC_2.27 optind D 0x4 -+GLIBC_2.27 optopt D 0x4 -+GLIBC_2.27 parse_printf_format F -+GLIBC_2.27 passwd2des F -+GLIBC_2.27 pathconf F -+GLIBC_2.27 pause F -+GLIBC_2.27 pclose F -+GLIBC_2.27 perror F -+GLIBC_2.27 personality F -+GLIBC_2.27 pipe F -+GLIBC_2.27 pipe2 F -+GLIBC_2.27 pivot_root F -+GLIBC_2.27 pkey_alloc F -+GLIBC_2.27 pkey_free F -+GLIBC_2.27 pkey_get F -+GLIBC_2.27 pkey_mprotect F -+GLIBC_2.27 pkey_set F -+GLIBC_2.27 pmap_getmaps F -+GLIBC_2.27 pmap_getport F -+GLIBC_2.27 pmap_rmtcall F -+GLIBC_2.27 pmap_set F -+GLIBC_2.27 pmap_unset F -+GLIBC_2.27 poll F -+GLIBC_2.27 popen F -+GLIBC_2.27 posix_fadvise F -+GLIBC_2.27 posix_fadvise64 F -+GLIBC_2.27 posix_fallocate F 
-+GLIBC_2.27 posix_fallocate64 F -+GLIBC_2.27 posix_madvise F -+GLIBC_2.27 posix_memalign F -+GLIBC_2.27 posix_openpt F -+GLIBC_2.27 posix_spawn F -+GLIBC_2.27 posix_spawn_file_actions_addclose F -+GLIBC_2.27 posix_spawn_file_actions_adddup2 F -+GLIBC_2.27 posix_spawn_file_actions_addopen F -+GLIBC_2.27 posix_spawn_file_actions_destroy F -+GLIBC_2.27 posix_spawn_file_actions_init F -+GLIBC_2.27 posix_spawnattr_destroy F -+GLIBC_2.27 posix_spawnattr_getflags F -+GLIBC_2.27 posix_spawnattr_getpgroup F -+GLIBC_2.27 posix_spawnattr_getschedparam F -+GLIBC_2.27 posix_spawnattr_getschedpolicy F -+GLIBC_2.27 posix_spawnattr_getsigdefault F -+GLIBC_2.27 posix_spawnattr_getsigmask F -+GLIBC_2.27 posix_spawnattr_init F -+GLIBC_2.27 posix_spawnattr_setflags F -+GLIBC_2.27 posix_spawnattr_setpgroup F -+GLIBC_2.27 posix_spawnattr_setschedparam F -+GLIBC_2.27 posix_spawnattr_setschedpolicy F -+GLIBC_2.27 posix_spawnattr_setsigdefault F -+GLIBC_2.27 posix_spawnattr_setsigmask F -+GLIBC_2.27 posix_spawnp F -+GLIBC_2.27 ppoll F -+GLIBC_2.27 prctl F -+GLIBC_2.27 pread F -+GLIBC_2.27 pread64 F -+GLIBC_2.27 preadv F -+GLIBC_2.27 preadv2 F -+GLIBC_2.27 preadv64 F -+GLIBC_2.27 preadv64v2 F -+GLIBC_2.27 printf F -+GLIBC_2.27 printf_size F -+GLIBC_2.27 printf_size_info F -+GLIBC_2.27 prlimit F -+GLIBC_2.27 prlimit64 F -+GLIBC_2.27 process_vm_readv F -+GLIBC_2.27 process_vm_writev F -+GLIBC_2.27 profil F -+GLIBC_2.27 program_invocation_name D 0x8 -+GLIBC_2.27 program_invocation_short_name D 0x8 -+GLIBC_2.27 pselect F -+GLIBC_2.27 psiginfo F -+GLIBC_2.27 psignal F -+GLIBC_2.27 pthread_attr_destroy F -+GLIBC_2.27 pthread_attr_getdetachstate F -+GLIBC_2.27 pthread_attr_getinheritsched F -+GLIBC_2.27 pthread_attr_getschedparam F -+GLIBC_2.27 pthread_attr_getschedpolicy F -+GLIBC_2.27 pthread_attr_getscope F -+GLIBC_2.27 pthread_attr_init F -+GLIBC_2.27 pthread_attr_setdetachstate F -+GLIBC_2.27 pthread_attr_setinheritsched F -+GLIBC_2.27 pthread_attr_setschedparam F -+GLIBC_2.27 pthread_attr_setschedpolicy F -+GLIBC_2.27 pthread_attr_setscope F -+GLIBC_2.27 pthread_cond_broadcast F -+GLIBC_2.27 pthread_cond_destroy F -+GLIBC_2.27 pthread_cond_init F -+GLIBC_2.27 pthread_cond_signal F -+GLIBC_2.27 pthread_cond_timedwait F -+GLIBC_2.27 pthread_cond_wait F -+GLIBC_2.27 pthread_condattr_destroy F -+GLIBC_2.27 pthread_condattr_init F -+GLIBC_2.27 pthread_equal F -+GLIBC_2.27 pthread_exit F -+GLIBC_2.27 pthread_getschedparam F -+GLIBC_2.27 pthread_mutex_destroy F -+GLIBC_2.27 pthread_mutex_init F -+GLIBC_2.27 pthread_mutex_lock F -+GLIBC_2.27 pthread_mutex_unlock F -+GLIBC_2.27 pthread_self F -+GLIBC_2.27 pthread_setcancelstate F -+GLIBC_2.27 pthread_setcanceltype F -+GLIBC_2.27 pthread_setschedparam F -+GLIBC_2.27 ptrace F -+GLIBC_2.27 ptsname F -+GLIBC_2.27 ptsname_r F -+GLIBC_2.27 putc F -+GLIBC_2.27 putc_unlocked F -+GLIBC_2.27 putchar F -+GLIBC_2.27 putchar_unlocked F -+GLIBC_2.27 putenv F -+GLIBC_2.27 putgrent F -+GLIBC_2.27 putmsg F -+GLIBC_2.27 putpmsg F -+GLIBC_2.27 putpwent F -+GLIBC_2.27 puts F -+GLIBC_2.27 putsgent F -+GLIBC_2.27 putspent F -+GLIBC_2.27 pututline F -+GLIBC_2.27 pututxline F -+GLIBC_2.27 putw F -+GLIBC_2.27 putwc F -+GLIBC_2.27 putwc_unlocked F -+GLIBC_2.27 putwchar F -+GLIBC_2.27 putwchar_unlocked F -+GLIBC_2.27 pvalloc F -+GLIBC_2.27 pwrite F -+GLIBC_2.27 pwrite64 F -+GLIBC_2.27 pwritev F -+GLIBC_2.27 pwritev2 F -+GLIBC_2.27 pwritev64 F -+GLIBC_2.27 pwritev64v2 F -+GLIBC_2.27 qecvt F -+GLIBC_2.27 qecvt_r F -+GLIBC_2.27 qfcvt F -+GLIBC_2.27 qfcvt_r F -+GLIBC_2.27 qgcvt F -+GLIBC_2.27 qsort F 
-+GLIBC_2.27 qsort_r F -+GLIBC_2.27 quick_exit F -+GLIBC_2.27 quotactl F -+GLIBC_2.27 raise F -+GLIBC_2.27 rand F -+GLIBC_2.27 rand_r F -+GLIBC_2.27 random F -+GLIBC_2.27 random_r F -+GLIBC_2.27 rawmemchr F -+GLIBC_2.27 rcmd F -+GLIBC_2.27 rcmd_af F -+GLIBC_2.27 re_comp F -+GLIBC_2.27 re_compile_fastmap F -+GLIBC_2.27 re_compile_pattern F -+GLIBC_2.27 re_exec F -+GLIBC_2.27 re_match F -+GLIBC_2.27 re_match_2 F -+GLIBC_2.27 re_search F -+GLIBC_2.27 re_search_2 F -+GLIBC_2.27 re_set_registers F -+GLIBC_2.27 re_set_syntax F -+GLIBC_2.27 re_syntax_options D 0x8 -+GLIBC_2.27 read F -+GLIBC_2.27 readahead F -+GLIBC_2.27 readdir F -+GLIBC_2.27 readdir64 F -+GLIBC_2.27 readdir64_r F -+GLIBC_2.27 readdir_r F -+GLIBC_2.27 readlink F -+GLIBC_2.27 readlinkat F -+GLIBC_2.27 readv F -+GLIBC_2.27 realloc F -+GLIBC_2.27 reallocarray F -+GLIBC_2.27 realpath F -+GLIBC_2.27 reboot F -+GLIBC_2.27 recv F -+GLIBC_2.27 recvfrom F -+GLIBC_2.27 recvmmsg F -+GLIBC_2.27 recvmsg F -+GLIBC_2.27 regcomp F -+GLIBC_2.27 regerror F -+GLIBC_2.27 regexec F -+GLIBC_2.27 regfree F -+GLIBC_2.27 register_printf_function F -+GLIBC_2.27 register_printf_modifier F -+GLIBC_2.27 register_printf_specifier F -+GLIBC_2.27 register_printf_type F -+GLIBC_2.27 registerrpc F -+GLIBC_2.27 remap_file_pages F -+GLIBC_2.27 remove F -+GLIBC_2.27 removexattr F -+GLIBC_2.27 remque F -+GLIBC_2.27 rename F -+GLIBC_2.27 renameat F -+GLIBC_2.27 revoke F -+GLIBC_2.27 rewind F -+GLIBC_2.27 rewinddir F -+GLIBC_2.27 rexec F -+GLIBC_2.27 rexec_af F -+GLIBC_2.27 rexecoptions D 0x4 -+GLIBC_2.27 rindex F -+GLIBC_2.27 rmdir F -+GLIBC_2.27 rpc_createerr D 0x20 -+GLIBC_2.27 rpmatch F -+GLIBC_2.27 rresvport F -+GLIBC_2.27 rresvport_af F -+GLIBC_2.27 rtime F -+GLIBC_2.27 ruserok F -+GLIBC_2.27 ruserok_af F -+GLIBC_2.27 ruserpass F -+GLIBC_2.27 sbrk F -+GLIBC_2.27 scalbn F -+GLIBC_2.27 scalbnf F -+GLIBC_2.27 scalbnl F -+GLIBC_2.27 scandir F -+GLIBC_2.27 scandir64 F -+GLIBC_2.27 scandirat F -+GLIBC_2.27 scandirat64 F -+GLIBC_2.27 scanf F -+GLIBC_2.27 sched_get_priority_max F -+GLIBC_2.27 sched_get_priority_min F -+GLIBC_2.27 sched_getaffinity F -+GLIBC_2.27 sched_getcpu F -+GLIBC_2.27 sched_getparam F -+GLIBC_2.27 sched_getscheduler F -+GLIBC_2.27 sched_rr_get_interval F -+GLIBC_2.27 sched_setaffinity F -+GLIBC_2.27 sched_setparam F -+GLIBC_2.27 sched_setscheduler F -+GLIBC_2.27 sched_yield F -+GLIBC_2.27 secure_getenv F -+GLIBC_2.27 seed48 F -+GLIBC_2.27 seed48_r F -+GLIBC_2.27 seekdir F -+GLIBC_2.27 select F -+GLIBC_2.27 semctl F -+GLIBC_2.27 semget F -+GLIBC_2.27 semop F -+GLIBC_2.27 semtimedop F -+GLIBC_2.27 send F -+GLIBC_2.27 sendfile F -+GLIBC_2.27 sendfile64 F -+GLIBC_2.27 sendmmsg F -+GLIBC_2.27 sendmsg F -+GLIBC_2.27 sendto F -+GLIBC_2.27 setaliasent F -+GLIBC_2.27 setbuf F -+GLIBC_2.27 setbuffer F -+GLIBC_2.27 setcontext F -+GLIBC_2.27 setdomainname F -+GLIBC_2.27 setegid F -+GLIBC_2.27 setenv F -+GLIBC_2.27 seteuid F -+GLIBC_2.27 setfsent F -+GLIBC_2.27 setfsgid F -+GLIBC_2.27 setfsuid F -+GLIBC_2.27 setgid F -+GLIBC_2.27 setgrent F -+GLIBC_2.27 setgroups F -+GLIBC_2.27 sethostent F -+GLIBC_2.27 sethostid F -+GLIBC_2.27 sethostname F -+GLIBC_2.27 setipv4sourcefilter F -+GLIBC_2.27 setitimer F -+GLIBC_2.27 setjmp F -+GLIBC_2.27 setlinebuf F -+GLIBC_2.27 setlocale F -+GLIBC_2.27 setlogin F -+GLIBC_2.27 setlogmask F -+GLIBC_2.27 setmntent F -+GLIBC_2.27 setnetent F -+GLIBC_2.27 setnetgrent F -+GLIBC_2.27 setns F -+GLIBC_2.27 setpgid F -+GLIBC_2.27 setpgrp F -+GLIBC_2.27 setpriority F -+GLIBC_2.27 setprotoent F -+GLIBC_2.27 setpwent F -+GLIBC_2.27 setregid 
F -+GLIBC_2.27 setresgid F -+GLIBC_2.27 setresuid F -+GLIBC_2.27 setreuid F -+GLIBC_2.27 setrlimit F -+GLIBC_2.27 setrlimit64 F -+GLIBC_2.27 setrpcent F -+GLIBC_2.27 setservent F -+GLIBC_2.27 setsgent F -+GLIBC_2.27 setsid F -+GLIBC_2.27 setsockopt F -+GLIBC_2.27 setsourcefilter F -+GLIBC_2.27 setspent F -+GLIBC_2.27 setstate F -+GLIBC_2.27 setstate_r F -+GLIBC_2.27 settimeofday F -+GLIBC_2.27 setttyent F -+GLIBC_2.27 setuid F -+GLIBC_2.27 setusershell F -+GLIBC_2.27 setutent F -+GLIBC_2.27 setutxent F -+GLIBC_2.27 setvbuf F -+GLIBC_2.27 setxattr F -+GLIBC_2.27 sgetsgent F -+GLIBC_2.27 sgetsgent_r F -+GLIBC_2.27 sgetspent F -+GLIBC_2.27 sgetspent_r F -+GLIBC_2.27 shmat F -+GLIBC_2.27 shmctl F -+GLIBC_2.27 shmdt F -+GLIBC_2.27 shmget F -+GLIBC_2.27 shutdown F -+GLIBC_2.27 sigaction F -+GLIBC_2.27 sigaddset F -+GLIBC_2.27 sigaltstack F -+GLIBC_2.27 sigandset F -+GLIBC_2.27 sigblock F -+GLIBC_2.27 sigdelset F -+GLIBC_2.27 sigemptyset F -+GLIBC_2.27 sigfillset F -+GLIBC_2.27 siggetmask F -+GLIBC_2.27 sighold F -+GLIBC_2.27 sigignore F -+GLIBC_2.27 siginterrupt F -+GLIBC_2.27 sigisemptyset F -+GLIBC_2.27 sigismember F -+GLIBC_2.27 siglongjmp F -+GLIBC_2.27 signal F -+GLIBC_2.27 signalfd F -+GLIBC_2.27 sigorset F -+GLIBC_2.27 sigpause F -+GLIBC_2.27 sigpending F -+GLIBC_2.27 sigprocmask F -+GLIBC_2.27 sigqueue F -+GLIBC_2.27 sigrelse F -+GLIBC_2.27 sigreturn F -+GLIBC_2.27 sigset F -+GLIBC_2.27 sigsetmask F -+GLIBC_2.27 sigstack F -+GLIBC_2.27 sigsuspend F -+GLIBC_2.27 sigtimedwait F -+GLIBC_2.27 sigwait F -+GLIBC_2.27 sigwaitinfo F -+GLIBC_2.27 sleep F -+GLIBC_2.27 snprintf F -+GLIBC_2.27 sockatmark F -+GLIBC_2.27 socket F -+GLIBC_2.27 socketpair F -+GLIBC_2.27 splice F -+GLIBC_2.27 sprintf F -+GLIBC_2.27 sprofil F -+GLIBC_2.27 srand F -+GLIBC_2.27 srand48 F -+GLIBC_2.27 srand48_r F -+GLIBC_2.27 srandom F -+GLIBC_2.27 srandom_r F -+GLIBC_2.27 sscanf F -+GLIBC_2.27 ssignal F -+GLIBC_2.27 sstk F -+GLIBC_2.27 statfs F -+GLIBC_2.27 statfs64 F -+GLIBC_2.27 statvfs F -+GLIBC_2.27 statvfs64 F -+GLIBC_2.27 stderr D 0x8 -+GLIBC_2.27 stdin D 0x8 -+GLIBC_2.27 stdout D 0x8 -+GLIBC_2.27 stime F -+GLIBC_2.27 stpcpy F -+GLIBC_2.27 stpncpy F -+GLIBC_2.27 strcasecmp F -+GLIBC_2.27 strcasecmp_l F -+GLIBC_2.27 strcasestr F -+GLIBC_2.27 strcat F -+GLIBC_2.27 strchr F -+GLIBC_2.27 strchrnul F -+GLIBC_2.27 strcmp F -+GLIBC_2.27 strcoll F -+GLIBC_2.27 strcoll_l F -+GLIBC_2.27 strcpy F -+GLIBC_2.27 strcspn F -+GLIBC_2.27 strdup F -+GLIBC_2.27 strerror F -+GLIBC_2.27 strerror_l F -+GLIBC_2.27 strerror_r F -+GLIBC_2.27 strfmon F -+GLIBC_2.27 strfmon_l F -+GLIBC_2.27 strfromd F -+GLIBC_2.27 strfromf F -+GLIBC_2.27 strfromf128 F -+GLIBC_2.27 strfromf32 F -+GLIBC_2.27 strfromf32x F -+GLIBC_2.27 strfromf64 F -+GLIBC_2.27 strfromf64x F -+GLIBC_2.27 strfroml F -+GLIBC_2.27 strfry F -+GLIBC_2.27 strftime F -+GLIBC_2.27 strftime_l F -+GLIBC_2.27 strlen F -+GLIBC_2.27 strncasecmp F -+GLIBC_2.27 strncasecmp_l F -+GLIBC_2.27 strncat F -+GLIBC_2.27 strncmp F -+GLIBC_2.27 strncpy F -+GLIBC_2.27 strndup F -+GLIBC_2.27 strnlen F -+GLIBC_2.27 strpbrk F -+GLIBC_2.27 strptime F -+GLIBC_2.27 strptime_l F -+GLIBC_2.27 strrchr F -+GLIBC_2.27 strsep F -+GLIBC_2.27 strsignal F -+GLIBC_2.27 strspn F -+GLIBC_2.27 strstr F -+GLIBC_2.27 strtod F -+GLIBC_2.27 strtod_l F -+GLIBC_2.27 strtof F -+GLIBC_2.27 strtof128 F -+GLIBC_2.27 strtof128_l F -+GLIBC_2.27 strtof32 F -+GLIBC_2.27 strtof32_l F -+GLIBC_2.27 strtof32x F -+GLIBC_2.27 strtof32x_l F -+GLIBC_2.27 strtof64 F -+GLIBC_2.27 strtof64_l F -+GLIBC_2.27 strtof64x F -+GLIBC_2.27 strtof64x_l F 
-+GLIBC_2.27 strtof_l F -+GLIBC_2.27 strtoimax F -+GLIBC_2.27 strtok F -+GLIBC_2.27 strtok_r F -+GLIBC_2.27 strtol F -+GLIBC_2.27 strtol_l F -+GLIBC_2.27 strtold F -+GLIBC_2.27 strtold_l F -+GLIBC_2.27 strtoll F -+GLIBC_2.27 strtoll_l F -+GLIBC_2.27 strtoq F -+GLIBC_2.27 strtoul F -+GLIBC_2.27 strtoul_l F -+GLIBC_2.27 strtoull F -+GLIBC_2.27 strtoull_l F -+GLIBC_2.27 strtoumax F -+GLIBC_2.27 strtouq F -+GLIBC_2.27 strverscmp F -+GLIBC_2.27 strxfrm F -+GLIBC_2.27 strxfrm_l F -+GLIBC_2.27 stty F -+GLIBC_2.27 svc_exit F -+GLIBC_2.27 svc_fdset D 0x80 -+GLIBC_2.27 svc_getreq F -+GLIBC_2.27 svc_getreq_common F -+GLIBC_2.27 svc_getreq_poll F -+GLIBC_2.27 svc_getreqset F -+GLIBC_2.27 svc_max_pollfd D 0x4 -+GLIBC_2.27 svc_pollfd D 0x8 -+GLIBC_2.27 svc_register F -+GLIBC_2.27 svc_run F -+GLIBC_2.27 svc_sendreply F -+GLIBC_2.27 svc_unregister F -+GLIBC_2.27 svcauthdes_stats D 0x18 -+GLIBC_2.27 svcerr_auth F -+GLIBC_2.27 svcerr_decode F -+GLIBC_2.27 svcerr_noproc F -+GLIBC_2.27 svcerr_noprog F -+GLIBC_2.27 svcerr_progvers F -+GLIBC_2.27 svcerr_systemerr F -+GLIBC_2.27 svcerr_weakauth F -+GLIBC_2.27 svcfd_create F -+GLIBC_2.27 svcraw_create F -+GLIBC_2.27 svctcp_create F -+GLIBC_2.27 svcudp_bufcreate F -+GLIBC_2.27 svcudp_create F -+GLIBC_2.27 svcudp_enablecache F -+GLIBC_2.27 svcunix_create F -+GLIBC_2.27 svcunixfd_create F -+GLIBC_2.27 swab F -+GLIBC_2.27 swapcontext F -+GLIBC_2.27 swapoff F -+GLIBC_2.27 swapon F -+GLIBC_2.27 swprintf F -+GLIBC_2.27 swscanf F -+GLIBC_2.27 symlink F -+GLIBC_2.27 symlinkat F -+GLIBC_2.27 sync F -+GLIBC_2.27 sync_file_range F -+GLIBC_2.27 syncfs F -+GLIBC_2.27 sys_errlist D 0x2370 -+GLIBC_2.27 sys_nerr D 0x4 -+GLIBC_2.27 sys_sigabbrev D 0x400 -+GLIBC_2.27 sys_siglist D 0x400 -+GLIBC_2.27 syscall F -+GLIBC_2.27 sysconf F -+GLIBC_2.27 sysctl F -+GLIBC_2.27 sysinfo F -+GLIBC_2.27 syslog F -+GLIBC_2.27 system F -+GLIBC_2.27 sysv_signal F -+GLIBC_2.27 tcdrain F -+GLIBC_2.27 tcflow F -+GLIBC_2.27 tcflush F -+GLIBC_2.27 tcgetattr F -+GLIBC_2.27 tcgetpgrp F -+GLIBC_2.27 tcgetsid F -+GLIBC_2.27 tcsendbreak F -+GLIBC_2.27 tcsetattr F -+GLIBC_2.27 tcsetpgrp F -+GLIBC_2.27 tdelete F -+GLIBC_2.27 tdestroy F -+GLIBC_2.27 tee F -+GLIBC_2.27 telldir F -+GLIBC_2.27 tempnam F -+GLIBC_2.27 textdomain F -+GLIBC_2.27 tfind F -+GLIBC_2.27 time F -+GLIBC_2.27 timegm F -+GLIBC_2.27 timelocal F -+GLIBC_2.27 timerfd_create F -+GLIBC_2.27 timerfd_gettime F -+GLIBC_2.27 timerfd_settime F -+GLIBC_2.27 times F -+GLIBC_2.27 timespec_get F -+GLIBC_2.27 timezone D 0x8 -+GLIBC_2.27 tmpfile F -+GLIBC_2.27 tmpfile64 F -+GLIBC_2.27 tmpnam F -+GLIBC_2.27 tmpnam_r F -+GLIBC_2.27 toascii F -+GLIBC_2.27 tolower F -+GLIBC_2.27 tolower_l F -+GLIBC_2.27 toupper F -+GLIBC_2.27 toupper_l F -+GLIBC_2.27 towctrans F -+GLIBC_2.27 towctrans_l F -+GLIBC_2.27 towlower F -+GLIBC_2.27 towlower_l F -+GLIBC_2.27 towupper F -+GLIBC_2.27 towupper_l F -+GLIBC_2.27 tr_break F -+GLIBC_2.27 truncate F -+GLIBC_2.27 truncate64 F -+GLIBC_2.27 tsearch F -+GLIBC_2.27 ttyname F -+GLIBC_2.27 ttyname_r F -+GLIBC_2.27 ttyslot F -+GLIBC_2.27 twalk F -+GLIBC_2.27 tzname D 0x10 -+GLIBC_2.27 tzset F -+GLIBC_2.27 ualarm F -+GLIBC_2.27 ulckpwdf F -+GLIBC_2.27 ulimit F -+GLIBC_2.27 umask F -+GLIBC_2.27 umount F -+GLIBC_2.27 umount2 F -+GLIBC_2.27 uname F -+GLIBC_2.27 ungetc F -+GLIBC_2.27 ungetwc F -+GLIBC_2.27 unlink F -+GLIBC_2.27 unlinkat F -+GLIBC_2.27 unlockpt F -+GLIBC_2.27 unsetenv F -+GLIBC_2.27 unshare F -+GLIBC_2.27 updwtmp F -+GLIBC_2.27 updwtmpx F -+GLIBC_2.27 uselocale F -+GLIBC_2.27 user2netname F -+GLIBC_2.27 usleep F -+GLIBC_2.27 
ustat F -+GLIBC_2.27 utime F -+GLIBC_2.27 utimensat F -+GLIBC_2.27 utimes F -+GLIBC_2.27 utmpname F -+GLIBC_2.27 utmpxname F -+GLIBC_2.27 valloc F -+GLIBC_2.27 vasprintf F -+GLIBC_2.27 vdprintf F -+GLIBC_2.27 verr F -+GLIBC_2.27 verrx F -+GLIBC_2.27 versionsort F -+GLIBC_2.27 versionsort64 F -+GLIBC_2.27 vfork F -+GLIBC_2.27 vfprintf F -+GLIBC_2.27 vfscanf F -+GLIBC_2.27 vfwprintf F -+GLIBC_2.27 vfwscanf F -+GLIBC_2.27 vhangup F -+GLIBC_2.27 vlimit F -+GLIBC_2.27 vmsplice F -+GLIBC_2.27 vprintf F -+GLIBC_2.27 vscanf F -+GLIBC_2.27 vsnprintf F -+GLIBC_2.27 vsprintf F -+GLIBC_2.27 vsscanf F -+GLIBC_2.27 vswprintf F -+GLIBC_2.27 vswscanf F -+GLIBC_2.27 vsyslog F -+GLIBC_2.27 vtimes F -+GLIBC_2.27 vwarn F -+GLIBC_2.27 vwarnx F -+GLIBC_2.27 vwprintf F -+GLIBC_2.27 vwscanf F -+GLIBC_2.27 wait F -+GLIBC_2.27 wait3 F -+GLIBC_2.27 wait4 F -+GLIBC_2.27 waitid F -+GLIBC_2.27 waitpid F -+GLIBC_2.27 warn F -+GLIBC_2.27 warnx F -+GLIBC_2.27 wcpcpy F -+GLIBC_2.27 wcpncpy F -+GLIBC_2.27 wcrtomb F -+GLIBC_2.27 wcscasecmp F -+GLIBC_2.27 wcscasecmp_l F -+GLIBC_2.27 wcscat F -+GLIBC_2.27 wcschr F -+GLIBC_2.27 wcschrnul F -+GLIBC_2.27 wcscmp F -+GLIBC_2.27 wcscoll F -+GLIBC_2.27 wcscoll_l F -+GLIBC_2.27 wcscpy F -+GLIBC_2.27 wcscspn F -+GLIBC_2.27 wcsdup F -+GLIBC_2.27 wcsftime F -+GLIBC_2.27 wcsftime_l F -+GLIBC_2.27 wcslen F -+GLIBC_2.27 wcsncasecmp F -+GLIBC_2.27 wcsncasecmp_l F -+GLIBC_2.27 wcsncat F -+GLIBC_2.27 wcsncmp F -+GLIBC_2.27 wcsncpy F -+GLIBC_2.27 wcsnlen F -+GLIBC_2.27 wcsnrtombs F -+GLIBC_2.27 wcspbrk F -+GLIBC_2.27 wcsrchr F -+GLIBC_2.27 wcsrtombs F -+GLIBC_2.27 wcsspn F -+GLIBC_2.27 wcsstr F -+GLIBC_2.27 wcstod F -+GLIBC_2.27 wcstod_l F -+GLIBC_2.27 wcstof F -+GLIBC_2.27 wcstof128 F -+GLIBC_2.27 wcstof128_l F -+GLIBC_2.27 wcstof32 F -+GLIBC_2.27 wcstof32_l F -+GLIBC_2.27 wcstof32x F -+GLIBC_2.27 wcstof32x_l F -+GLIBC_2.27 wcstof64 F -+GLIBC_2.27 wcstof64_l F -+GLIBC_2.27 wcstof64x F -+GLIBC_2.27 wcstof64x_l F -+GLIBC_2.27 wcstof_l F -+GLIBC_2.27 wcstoimax F -+GLIBC_2.27 wcstok F -+GLIBC_2.27 wcstol F -+GLIBC_2.27 wcstol_l F -+GLIBC_2.27 wcstold F -+GLIBC_2.27 wcstold_l F -+GLIBC_2.27 wcstoll F -+GLIBC_2.27 wcstoll_l F -+GLIBC_2.27 wcstombs F -+GLIBC_2.27 wcstoq F -+GLIBC_2.27 wcstoul F -+GLIBC_2.27 wcstoul_l F -+GLIBC_2.27 wcstoull F -+GLIBC_2.27 wcstoull_l F -+GLIBC_2.27 wcstoumax F -+GLIBC_2.27 wcstouq F -+GLIBC_2.27 wcswcs F -+GLIBC_2.27 wcswidth F -+GLIBC_2.27 wcsxfrm F -+GLIBC_2.27 wcsxfrm_l F -+GLIBC_2.27 wctob F -+GLIBC_2.27 wctomb F -+GLIBC_2.27 wctrans F -+GLIBC_2.27 wctrans_l F -+GLIBC_2.27 wctype F -+GLIBC_2.27 wctype_l F -+GLIBC_2.27 wcwidth F -+GLIBC_2.27 wmemchr F -+GLIBC_2.27 wmemcmp F -+GLIBC_2.27 wmemcpy F -+GLIBC_2.27 wmemmove F -+GLIBC_2.27 wmempcpy F -+GLIBC_2.27 wmemset F -+GLIBC_2.27 wordexp F -+GLIBC_2.27 wordfree F -+GLIBC_2.27 wprintf F -+GLIBC_2.27 write F -+GLIBC_2.27 writev F -+GLIBC_2.27 wscanf F -+GLIBC_2.27 xdecrypt F -+GLIBC_2.27 xdr_accepted_reply F -+GLIBC_2.27 xdr_array F -+GLIBC_2.27 xdr_authdes_cred F -+GLIBC_2.27 xdr_authdes_verf F -+GLIBC_2.27 xdr_authunix_parms F -+GLIBC_2.27 xdr_bool F -+GLIBC_2.27 xdr_bytes F -+GLIBC_2.27 xdr_callhdr F -+GLIBC_2.27 xdr_callmsg F -+GLIBC_2.27 xdr_char F -+GLIBC_2.27 xdr_cryptkeyarg F -+GLIBC_2.27 xdr_cryptkeyarg2 F -+GLIBC_2.27 xdr_cryptkeyres F -+GLIBC_2.27 xdr_des_block F -+GLIBC_2.27 xdr_double F -+GLIBC_2.27 xdr_enum F -+GLIBC_2.27 xdr_float F -+GLIBC_2.27 xdr_free F -+GLIBC_2.27 xdr_getcredres F -+GLIBC_2.27 xdr_hyper F -+GLIBC_2.27 xdr_int F -+GLIBC_2.27 xdr_int16_t F -+GLIBC_2.27 xdr_int32_t F -+GLIBC_2.27 
xdr_int64_t F -+GLIBC_2.27 xdr_int8_t F -+GLIBC_2.27 xdr_key_netstarg F -+GLIBC_2.27 xdr_key_netstres F -+GLIBC_2.27 xdr_keybuf F -+GLIBC_2.27 xdr_keystatus F -+GLIBC_2.27 xdr_long F -+GLIBC_2.27 xdr_longlong_t F -+GLIBC_2.27 xdr_netnamestr F -+GLIBC_2.27 xdr_netobj F -+GLIBC_2.27 xdr_opaque F -+GLIBC_2.27 xdr_opaque_auth F -+GLIBC_2.27 xdr_pmap F -+GLIBC_2.27 xdr_pmaplist F -+GLIBC_2.27 xdr_pointer F -+GLIBC_2.27 xdr_quad_t F -+GLIBC_2.27 xdr_reference F -+GLIBC_2.27 xdr_rejected_reply F -+GLIBC_2.27 xdr_replymsg F -+GLIBC_2.27 xdr_rmtcall_args F -+GLIBC_2.27 xdr_rmtcallres F -+GLIBC_2.27 xdr_short F -+GLIBC_2.27 xdr_sizeof F -+GLIBC_2.27 xdr_string F -+GLIBC_2.27 xdr_u_char F -+GLIBC_2.27 xdr_u_hyper F -+GLIBC_2.27 xdr_u_int F -+GLIBC_2.27 xdr_u_long F -+GLIBC_2.27 xdr_u_longlong_t F -+GLIBC_2.27 xdr_u_quad_t F -+GLIBC_2.27 xdr_u_short F -+GLIBC_2.27 xdr_uint16_t F -+GLIBC_2.27 xdr_uint32_t F -+GLIBC_2.27 xdr_uint64_t F -+GLIBC_2.27 xdr_uint8_t F -+GLIBC_2.27 xdr_union F -+GLIBC_2.27 xdr_unixcred F -+GLIBC_2.27 xdr_vector F -+GLIBC_2.27 xdr_void F -+GLIBC_2.27 xdr_wrapstring F -+GLIBC_2.27 xdrmem_create F -+GLIBC_2.27 xdrrec_create F -+GLIBC_2.27 xdrrec_endofrecord F -+GLIBC_2.27 xdrrec_eof F -+GLIBC_2.27 xdrrec_skiprecord F -+GLIBC_2.27 xdrstdio_create F -+GLIBC_2.27 xencrypt F -+GLIBC_2.27 xprt_register F -+GLIBC_2.27 xprt_unregister F -+GLIBC_2.28 fcntl64 F -+GLIBC_2.28 renameat2 F -+GLIBC_2.28 statx F -+GLIBC_2.28 thrd_current F -+GLIBC_2.28 thrd_equal F -+GLIBC_2.28 thrd_sleep F -+GLIBC_2.28 thrd_yield F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libcrypt.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libcrypt.abilist -new file mode 100644 -index 00000000..9484dca7 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libcrypt.abilist -@@ -0,0 +1,7 @@ -+GLIBC_2.27 crypt F -+GLIBC_2.27 crypt_r F -+GLIBC_2.27 encrypt F -+GLIBC_2.27 encrypt_r F -+GLIBC_2.27 fcrypt F -+GLIBC_2.27 setkey F -+GLIBC_2.27 setkey_r F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libdl.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libdl.abilist -new file mode 100644 -index 00000000..16adcae5 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libdl.abilist -@@ -0,0 +1,9 @@ -+GLIBC_2.27 dladdr F -+GLIBC_2.27 dladdr1 F -+GLIBC_2.27 dlclose F -+GLIBC_2.27 dlerror F -+GLIBC_2.27 dlinfo F -+GLIBC_2.27 dlmopen F -+GLIBC_2.27 dlopen F -+GLIBC_2.27 dlsym F -+GLIBC_2.27 dlvsym F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libm.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libm.abilist -new file mode 100644 -index 00000000..361fce20 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libm.abilist -@@ -0,0 +1,1021 @@ -+GLIBC_2.27 __acos_finite F -+GLIBC_2.27 __acosf_finite F -+GLIBC_2.27 __acosh_finite F -+GLIBC_2.27 __acoshf_finite F -+GLIBC_2.27 __acoshl_finite F -+GLIBC_2.27 __acosl_finite F -+GLIBC_2.27 __asin_finite F -+GLIBC_2.27 __asinf_finite F -+GLIBC_2.27 __asinl_finite F -+GLIBC_2.27 __atan2_finite F -+GLIBC_2.27 __atan2f_finite F -+GLIBC_2.27 __atan2l_finite F -+GLIBC_2.27 __atanh_finite F -+GLIBC_2.27 __atanhf_finite F -+GLIBC_2.27 __atanhl_finite F -+GLIBC_2.27 __clog10 F -+GLIBC_2.27 __clog10f F -+GLIBC_2.27 __clog10l F -+GLIBC_2.27 __cosh_finite F -+GLIBC_2.27 __coshf_finite F -+GLIBC_2.27 __coshl_finite F -+GLIBC_2.27 __exp10_finite F -+GLIBC_2.27 __exp10f_finite F -+GLIBC_2.27 __exp10l_finite F -+GLIBC_2.27 __exp2_finite F -+GLIBC_2.27 __exp2f_finite F -+GLIBC_2.27 __exp2l_finite F -+GLIBC_2.27 __exp_finite F -+GLIBC_2.27 __expf_finite F 
-+GLIBC_2.27 __expl_finite F -+GLIBC_2.27 __finite F -+GLIBC_2.27 __finitef F -+GLIBC_2.27 __finitel F -+GLIBC_2.27 __fmod_finite F -+GLIBC_2.27 __fmodf_finite F -+GLIBC_2.27 __fmodl_finite F -+GLIBC_2.27 __fpclassify F -+GLIBC_2.27 __fpclassifyf F -+GLIBC_2.27 __fpclassifyl F -+GLIBC_2.27 __gamma_r_finite F -+GLIBC_2.27 __gammaf_r_finite F -+GLIBC_2.27 __gammal_r_finite F -+GLIBC_2.27 __hypot_finite F -+GLIBC_2.27 __hypotf_finite F -+GLIBC_2.27 __hypotl_finite F -+GLIBC_2.27 __iseqsig F -+GLIBC_2.27 __iseqsigf F -+GLIBC_2.27 __iseqsigl F -+GLIBC_2.27 __issignaling F -+GLIBC_2.27 __issignalingf F -+GLIBC_2.27 __issignalingl F -+GLIBC_2.27 __j0_finite F -+GLIBC_2.27 __j0f_finite F -+GLIBC_2.27 __j0l_finite F -+GLIBC_2.27 __j1_finite F -+GLIBC_2.27 __j1f_finite F -+GLIBC_2.27 __j1l_finite F -+GLIBC_2.27 __jn_finite F -+GLIBC_2.27 __jnf_finite F -+GLIBC_2.27 __jnl_finite F -+GLIBC_2.27 __lgamma_r_finite F -+GLIBC_2.27 __lgammaf_r_finite F -+GLIBC_2.27 __lgammal_r_finite F -+GLIBC_2.27 __log10_finite F -+GLIBC_2.27 __log10f_finite F -+GLIBC_2.27 __log10l_finite F -+GLIBC_2.27 __log2_finite F -+GLIBC_2.27 __log2f_finite F -+GLIBC_2.27 __log2l_finite F -+GLIBC_2.27 __log_finite F -+GLIBC_2.27 __logf_finite F -+GLIBC_2.27 __logl_finite F -+GLIBC_2.27 __pow_finite F -+GLIBC_2.27 __powf_finite F -+GLIBC_2.27 __powl_finite F -+GLIBC_2.27 __remainder_finite F -+GLIBC_2.27 __remainderf_finite F -+GLIBC_2.27 __remainderl_finite F -+GLIBC_2.27 __scalb_finite F -+GLIBC_2.27 __scalbf_finite F -+GLIBC_2.27 __scalbl_finite F -+GLIBC_2.27 __signbit F -+GLIBC_2.27 __signbitf F -+GLIBC_2.27 __signbitl F -+GLIBC_2.27 __signgam D 0x4 -+GLIBC_2.27 __sinh_finite F -+GLIBC_2.27 __sinhf_finite F -+GLIBC_2.27 __sinhl_finite F -+GLIBC_2.27 __sqrt_finite F -+GLIBC_2.27 __sqrtf_finite F -+GLIBC_2.27 __sqrtl_finite F -+GLIBC_2.27 __y0_finite F -+GLIBC_2.27 __y0f_finite F -+GLIBC_2.27 __y0l_finite F -+GLIBC_2.27 __y1_finite F -+GLIBC_2.27 __y1f_finite F -+GLIBC_2.27 __y1l_finite F -+GLIBC_2.27 __yn_finite F -+GLIBC_2.27 __ynf_finite F -+GLIBC_2.27 __ynl_finite F -+GLIBC_2.27 acos F -+GLIBC_2.27 acosf F -+GLIBC_2.27 acosf128 F -+GLIBC_2.27 acosf32 F -+GLIBC_2.27 acosf32x F -+GLIBC_2.27 acosf64 F -+GLIBC_2.27 acosf64x F -+GLIBC_2.27 acosh F -+GLIBC_2.27 acoshf F -+GLIBC_2.27 acoshf128 F -+GLIBC_2.27 acoshf32 F -+GLIBC_2.27 acoshf32x F -+GLIBC_2.27 acoshf64 F -+GLIBC_2.27 acoshf64x F -+GLIBC_2.27 acoshl F -+GLIBC_2.27 acosl F -+GLIBC_2.27 asin F -+GLIBC_2.27 asinf F -+GLIBC_2.27 asinf128 F -+GLIBC_2.27 asinf32 F -+GLIBC_2.27 asinf32x F -+GLIBC_2.27 asinf64 F -+GLIBC_2.27 asinf64x F -+GLIBC_2.27 asinh F -+GLIBC_2.27 asinhf F -+GLIBC_2.27 asinhf128 F -+GLIBC_2.27 asinhf32 F -+GLIBC_2.27 asinhf32x F -+GLIBC_2.27 asinhf64 F -+GLIBC_2.27 asinhf64x F -+GLIBC_2.27 asinhl F -+GLIBC_2.27 asinl F -+GLIBC_2.27 atan F -+GLIBC_2.27 atan2 F -+GLIBC_2.27 atan2f F -+GLIBC_2.27 atan2f128 F -+GLIBC_2.27 atan2f32 F -+GLIBC_2.27 atan2f32x F -+GLIBC_2.27 atan2f64 F -+GLIBC_2.27 atan2f64x F -+GLIBC_2.27 atan2l F -+GLIBC_2.27 atanf F -+GLIBC_2.27 atanf128 F -+GLIBC_2.27 atanf32 F -+GLIBC_2.27 atanf32x F -+GLIBC_2.27 atanf64 F -+GLIBC_2.27 atanf64x F -+GLIBC_2.27 atanh F -+GLIBC_2.27 atanhf F -+GLIBC_2.27 atanhf128 F -+GLIBC_2.27 atanhf32 F -+GLIBC_2.27 atanhf32x F -+GLIBC_2.27 atanhf64 F -+GLIBC_2.27 atanhf64x F -+GLIBC_2.27 atanhl F -+GLIBC_2.27 atanl F -+GLIBC_2.27 cabs F -+GLIBC_2.27 cabsf F -+GLIBC_2.27 cabsf128 F -+GLIBC_2.27 cabsf32 F -+GLIBC_2.27 cabsf32x F -+GLIBC_2.27 cabsf64 F -+GLIBC_2.27 cabsf64x F -+GLIBC_2.27 cabsl F -+GLIBC_2.27 
cacos F -+GLIBC_2.27 cacosf F -+GLIBC_2.27 cacosf128 F -+GLIBC_2.27 cacosf32 F -+GLIBC_2.27 cacosf32x F -+GLIBC_2.27 cacosf64 F -+GLIBC_2.27 cacosf64x F -+GLIBC_2.27 cacosh F -+GLIBC_2.27 cacoshf F -+GLIBC_2.27 cacoshf128 F -+GLIBC_2.27 cacoshf32 F -+GLIBC_2.27 cacoshf32x F -+GLIBC_2.27 cacoshf64 F -+GLIBC_2.27 cacoshf64x F -+GLIBC_2.27 cacoshl F -+GLIBC_2.27 cacosl F -+GLIBC_2.27 canonicalize F -+GLIBC_2.27 canonicalizef F -+GLIBC_2.27 canonicalizef128 F -+GLIBC_2.27 canonicalizef32 F -+GLIBC_2.27 canonicalizef32x F -+GLIBC_2.27 canonicalizef64 F -+GLIBC_2.27 canonicalizef64x F -+GLIBC_2.27 canonicalizel F -+GLIBC_2.27 carg F -+GLIBC_2.27 cargf F -+GLIBC_2.27 cargf128 F -+GLIBC_2.27 cargf32 F -+GLIBC_2.27 cargf32x F -+GLIBC_2.27 cargf64 F -+GLIBC_2.27 cargf64x F -+GLIBC_2.27 cargl F -+GLIBC_2.27 casin F -+GLIBC_2.27 casinf F -+GLIBC_2.27 casinf128 F -+GLIBC_2.27 casinf32 F -+GLIBC_2.27 casinf32x F -+GLIBC_2.27 casinf64 F -+GLIBC_2.27 casinf64x F -+GLIBC_2.27 casinh F -+GLIBC_2.27 casinhf F -+GLIBC_2.27 casinhf128 F -+GLIBC_2.27 casinhf32 F -+GLIBC_2.27 casinhf32x F -+GLIBC_2.27 casinhf64 F -+GLIBC_2.27 casinhf64x F -+GLIBC_2.27 casinhl F -+GLIBC_2.27 casinl F -+GLIBC_2.27 catan F -+GLIBC_2.27 catanf F -+GLIBC_2.27 catanf128 F -+GLIBC_2.27 catanf32 F -+GLIBC_2.27 catanf32x F -+GLIBC_2.27 catanf64 F -+GLIBC_2.27 catanf64x F -+GLIBC_2.27 catanh F -+GLIBC_2.27 catanhf F -+GLIBC_2.27 catanhf128 F -+GLIBC_2.27 catanhf32 F -+GLIBC_2.27 catanhf32x F -+GLIBC_2.27 catanhf64 F -+GLIBC_2.27 catanhf64x F -+GLIBC_2.27 catanhl F -+GLIBC_2.27 catanl F -+GLIBC_2.27 cbrt F -+GLIBC_2.27 cbrtf F -+GLIBC_2.27 cbrtf128 F -+GLIBC_2.27 cbrtf32 F -+GLIBC_2.27 cbrtf32x F -+GLIBC_2.27 cbrtf64 F -+GLIBC_2.27 cbrtf64x F -+GLIBC_2.27 cbrtl F -+GLIBC_2.27 ccos F -+GLIBC_2.27 ccosf F -+GLIBC_2.27 ccosf128 F -+GLIBC_2.27 ccosf32 F -+GLIBC_2.27 ccosf32x F -+GLIBC_2.27 ccosf64 F -+GLIBC_2.27 ccosf64x F -+GLIBC_2.27 ccosh F -+GLIBC_2.27 ccoshf F -+GLIBC_2.27 ccoshf128 F -+GLIBC_2.27 ccoshf32 F -+GLIBC_2.27 ccoshf32x F -+GLIBC_2.27 ccoshf64 F -+GLIBC_2.27 ccoshf64x F -+GLIBC_2.27 ccoshl F -+GLIBC_2.27 ccosl F -+GLIBC_2.27 ceil F -+GLIBC_2.27 ceilf F -+GLIBC_2.27 ceilf128 F -+GLIBC_2.27 ceilf32 F -+GLIBC_2.27 ceilf32x F -+GLIBC_2.27 ceilf64 F -+GLIBC_2.27 ceilf64x F -+GLIBC_2.27 ceill F -+GLIBC_2.27 cexp F -+GLIBC_2.27 cexpf F -+GLIBC_2.27 cexpf128 F -+GLIBC_2.27 cexpf32 F -+GLIBC_2.27 cexpf32x F -+GLIBC_2.27 cexpf64 F -+GLIBC_2.27 cexpf64x F -+GLIBC_2.27 cexpl F -+GLIBC_2.27 cimag F -+GLIBC_2.27 cimagf F -+GLIBC_2.27 cimagf128 F -+GLIBC_2.27 cimagf32 F -+GLIBC_2.27 cimagf32x F -+GLIBC_2.27 cimagf64 F -+GLIBC_2.27 cimagf64x F -+GLIBC_2.27 cimagl F -+GLIBC_2.27 clog F -+GLIBC_2.27 clog10 F -+GLIBC_2.27 clog10f F -+GLIBC_2.27 clog10f128 F -+GLIBC_2.27 clog10f32 F -+GLIBC_2.27 clog10f32x F -+GLIBC_2.27 clog10f64 F -+GLIBC_2.27 clog10f64x F -+GLIBC_2.27 clog10l F -+GLIBC_2.27 clogf F -+GLIBC_2.27 clogf128 F -+GLIBC_2.27 clogf32 F -+GLIBC_2.27 clogf32x F -+GLIBC_2.27 clogf64 F -+GLIBC_2.27 clogf64x F -+GLIBC_2.27 clogl F -+GLIBC_2.27 conj F -+GLIBC_2.27 conjf F -+GLIBC_2.27 conjf128 F -+GLIBC_2.27 conjf32 F -+GLIBC_2.27 conjf32x F -+GLIBC_2.27 conjf64 F -+GLIBC_2.27 conjf64x F -+GLIBC_2.27 conjl F -+GLIBC_2.27 copysign F -+GLIBC_2.27 copysignf F -+GLIBC_2.27 copysignf128 F -+GLIBC_2.27 copysignf32 F -+GLIBC_2.27 copysignf32x F -+GLIBC_2.27 copysignf64 F -+GLIBC_2.27 copysignf64x F -+GLIBC_2.27 copysignl F -+GLIBC_2.27 cos F -+GLIBC_2.27 cosf F -+GLIBC_2.27 cosf128 F -+GLIBC_2.27 cosf32 F -+GLIBC_2.27 cosf32x F -+GLIBC_2.27 cosf64 
F -+GLIBC_2.27 cosf64x F -+GLIBC_2.27 cosh F -+GLIBC_2.27 coshf F -+GLIBC_2.27 coshf128 F -+GLIBC_2.27 coshf32 F -+GLIBC_2.27 coshf32x F -+GLIBC_2.27 coshf64 F -+GLIBC_2.27 coshf64x F -+GLIBC_2.27 coshl F -+GLIBC_2.27 cosl F -+GLIBC_2.27 cpow F -+GLIBC_2.27 cpowf F -+GLIBC_2.27 cpowf128 F -+GLIBC_2.27 cpowf32 F -+GLIBC_2.27 cpowf32x F -+GLIBC_2.27 cpowf64 F -+GLIBC_2.27 cpowf64x F -+GLIBC_2.27 cpowl F -+GLIBC_2.27 cproj F -+GLIBC_2.27 cprojf F -+GLIBC_2.27 cprojf128 F -+GLIBC_2.27 cprojf32 F -+GLIBC_2.27 cprojf32x F -+GLIBC_2.27 cprojf64 F -+GLIBC_2.27 cprojf64x F -+GLIBC_2.27 cprojl F -+GLIBC_2.27 creal F -+GLIBC_2.27 crealf F -+GLIBC_2.27 crealf128 F -+GLIBC_2.27 crealf32 F -+GLIBC_2.27 crealf32x F -+GLIBC_2.27 crealf64 F -+GLIBC_2.27 crealf64x F -+GLIBC_2.27 creall F -+GLIBC_2.27 csin F -+GLIBC_2.27 csinf F -+GLIBC_2.27 csinf128 F -+GLIBC_2.27 csinf32 F -+GLIBC_2.27 csinf32x F -+GLIBC_2.27 csinf64 F -+GLIBC_2.27 csinf64x F -+GLIBC_2.27 csinh F -+GLIBC_2.27 csinhf F -+GLIBC_2.27 csinhf128 F -+GLIBC_2.27 csinhf32 F -+GLIBC_2.27 csinhf32x F -+GLIBC_2.27 csinhf64 F -+GLIBC_2.27 csinhf64x F -+GLIBC_2.27 csinhl F -+GLIBC_2.27 csinl F -+GLIBC_2.27 csqrt F -+GLIBC_2.27 csqrtf F -+GLIBC_2.27 csqrtf128 F -+GLIBC_2.27 csqrtf32 F -+GLIBC_2.27 csqrtf32x F -+GLIBC_2.27 csqrtf64 F -+GLIBC_2.27 csqrtf64x F -+GLIBC_2.27 csqrtl F -+GLIBC_2.27 ctan F -+GLIBC_2.27 ctanf F -+GLIBC_2.27 ctanf128 F -+GLIBC_2.27 ctanf32 F -+GLIBC_2.27 ctanf32x F -+GLIBC_2.27 ctanf64 F -+GLIBC_2.27 ctanf64x F -+GLIBC_2.27 ctanh F -+GLIBC_2.27 ctanhf F -+GLIBC_2.27 ctanhf128 F -+GLIBC_2.27 ctanhf32 F -+GLIBC_2.27 ctanhf32x F -+GLIBC_2.27 ctanhf64 F -+GLIBC_2.27 ctanhf64x F -+GLIBC_2.27 ctanhl F -+GLIBC_2.27 ctanl F -+GLIBC_2.27 drem F -+GLIBC_2.27 dremf F -+GLIBC_2.27 dreml F -+GLIBC_2.27 erf F -+GLIBC_2.27 erfc F -+GLIBC_2.27 erfcf F -+GLIBC_2.27 erfcf128 F -+GLIBC_2.27 erfcf32 F -+GLIBC_2.27 erfcf32x F -+GLIBC_2.27 erfcf64 F -+GLIBC_2.27 erfcf64x F -+GLIBC_2.27 erfcl F -+GLIBC_2.27 erff F -+GLIBC_2.27 erff128 F -+GLIBC_2.27 erff32 F -+GLIBC_2.27 erff32x F -+GLIBC_2.27 erff64 F -+GLIBC_2.27 erff64x F -+GLIBC_2.27 erfl F -+GLIBC_2.27 exp F -+GLIBC_2.27 exp10 F -+GLIBC_2.27 exp10f F -+GLIBC_2.27 exp10f128 F -+GLIBC_2.27 exp10f32 F -+GLIBC_2.27 exp10f32x F -+GLIBC_2.27 exp10f64 F -+GLIBC_2.27 exp10f64x F -+GLIBC_2.27 exp10l F -+GLIBC_2.27 exp2 F -+GLIBC_2.27 exp2f F -+GLIBC_2.27 exp2f128 F -+GLIBC_2.27 exp2f32 F -+GLIBC_2.27 exp2f32x F -+GLIBC_2.27 exp2f64 F -+GLIBC_2.27 exp2f64x F -+GLIBC_2.27 exp2l F -+GLIBC_2.27 expf F -+GLIBC_2.27 expf128 F -+GLIBC_2.27 expf32 F -+GLIBC_2.27 expf32x F -+GLIBC_2.27 expf64 F -+GLIBC_2.27 expf64x F -+GLIBC_2.27 expl F -+GLIBC_2.27 expm1 F -+GLIBC_2.27 expm1f F -+GLIBC_2.27 expm1f128 F -+GLIBC_2.27 expm1f32 F -+GLIBC_2.27 expm1f32x F -+GLIBC_2.27 expm1f64 F -+GLIBC_2.27 expm1f64x F -+GLIBC_2.27 expm1l F -+GLIBC_2.27 fabs F -+GLIBC_2.27 fabsf F -+GLIBC_2.27 fabsf128 F -+GLIBC_2.27 fabsf32 F -+GLIBC_2.27 fabsf32x F -+GLIBC_2.27 fabsf64 F -+GLIBC_2.27 fabsf64x F -+GLIBC_2.27 fabsl F -+GLIBC_2.27 fdim F -+GLIBC_2.27 fdimf F -+GLIBC_2.27 fdimf128 F -+GLIBC_2.27 fdimf32 F -+GLIBC_2.27 fdimf32x F -+GLIBC_2.27 fdimf64 F -+GLIBC_2.27 fdimf64x F -+GLIBC_2.27 fdiml F -+GLIBC_2.27 feclearexcept F -+GLIBC_2.27 fedisableexcept F -+GLIBC_2.27 feenableexcept F -+GLIBC_2.27 fegetenv F -+GLIBC_2.27 fegetexcept F -+GLIBC_2.27 fegetexceptflag F -+GLIBC_2.27 fegetmode F -+GLIBC_2.27 fegetround F -+GLIBC_2.27 feholdexcept F -+GLIBC_2.27 feraiseexcept F -+GLIBC_2.27 fesetenv F -+GLIBC_2.27 fesetexcept F -+GLIBC_2.27 
fesetexceptflag F -+GLIBC_2.27 fesetmode F -+GLIBC_2.27 fesetround F -+GLIBC_2.27 fetestexcept F -+GLIBC_2.27 fetestexceptflag F -+GLIBC_2.27 feupdateenv F -+GLIBC_2.27 finite F -+GLIBC_2.27 finitef F -+GLIBC_2.27 finitel F -+GLIBC_2.27 floor F -+GLIBC_2.27 floorf F -+GLIBC_2.27 floorf128 F -+GLIBC_2.27 floorf32 F -+GLIBC_2.27 floorf32x F -+GLIBC_2.27 floorf64 F -+GLIBC_2.27 floorf64x F -+GLIBC_2.27 floorl F -+GLIBC_2.27 fma F -+GLIBC_2.27 fmaf F -+GLIBC_2.27 fmaf128 F -+GLIBC_2.27 fmaf32 F -+GLIBC_2.27 fmaf32x F -+GLIBC_2.27 fmaf64 F -+GLIBC_2.27 fmaf64x F -+GLIBC_2.27 fmal F -+GLIBC_2.27 fmax F -+GLIBC_2.27 fmaxf F -+GLIBC_2.27 fmaxf128 F -+GLIBC_2.27 fmaxf32 F -+GLIBC_2.27 fmaxf32x F -+GLIBC_2.27 fmaxf64 F -+GLIBC_2.27 fmaxf64x F -+GLIBC_2.27 fmaxl F -+GLIBC_2.27 fmaxmag F -+GLIBC_2.27 fmaxmagf F -+GLIBC_2.27 fmaxmagf128 F -+GLIBC_2.27 fmaxmagf32 F -+GLIBC_2.27 fmaxmagf32x F -+GLIBC_2.27 fmaxmagf64 F -+GLIBC_2.27 fmaxmagf64x F -+GLIBC_2.27 fmaxmagl F -+GLIBC_2.27 fmin F -+GLIBC_2.27 fminf F -+GLIBC_2.27 fminf128 F -+GLIBC_2.27 fminf32 F -+GLIBC_2.27 fminf32x F -+GLIBC_2.27 fminf64 F -+GLIBC_2.27 fminf64x F -+GLIBC_2.27 fminl F -+GLIBC_2.27 fminmag F -+GLIBC_2.27 fminmagf F -+GLIBC_2.27 fminmagf128 F -+GLIBC_2.27 fminmagf32 F -+GLIBC_2.27 fminmagf32x F -+GLIBC_2.27 fminmagf64 F -+GLIBC_2.27 fminmagf64x F -+GLIBC_2.27 fminmagl F -+GLIBC_2.27 fmod F -+GLIBC_2.27 fmodf F -+GLIBC_2.27 fmodf128 F -+GLIBC_2.27 fmodf32 F -+GLIBC_2.27 fmodf32x F -+GLIBC_2.27 fmodf64 F -+GLIBC_2.27 fmodf64x F -+GLIBC_2.27 fmodl F -+GLIBC_2.27 frexp F -+GLIBC_2.27 frexpf F -+GLIBC_2.27 frexpf128 F -+GLIBC_2.27 frexpf32 F -+GLIBC_2.27 frexpf32x F -+GLIBC_2.27 frexpf64 F -+GLIBC_2.27 frexpf64x F -+GLIBC_2.27 frexpl F -+GLIBC_2.27 fromfp F -+GLIBC_2.27 fromfpf F -+GLIBC_2.27 fromfpf128 F -+GLIBC_2.27 fromfpf32 F -+GLIBC_2.27 fromfpf32x F -+GLIBC_2.27 fromfpf64 F -+GLIBC_2.27 fromfpf64x F -+GLIBC_2.27 fromfpl F -+GLIBC_2.27 fromfpx F -+GLIBC_2.27 fromfpxf F -+GLIBC_2.27 fromfpxf128 F -+GLIBC_2.27 fromfpxf32 F -+GLIBC_2.27 fromfpxf32x F -+GLIBC_2.27 fromfpxf64 F -+GLIBC_2.27 fromfpxf64x F -+GLIBC_2.27 fromfpxl F -+GLIBC_2.27 gamma F -+GLIBC_2.27 gammaf F -+GLIBC_2.27 gammal F -+GLIBC_2.27 getpayload F -+GLIBC_2.27 getpayloadf F -+GLIBC_2.27 getpayloadf128 F -+GLIBC_2.27 getpayloadf32 F -+GLIBC_2.27 getpayloadf32x F -+GLIBC_2.27 getpayloadf64 F -+GLIBC_2.27 getpayloadf64x F -+GLIBC_2.27 getpayloadl F -+GLIBC_2.27 hypot F -+GLIBC_2.27 hypotf F -+GLIBC_2.27 hypotf128 F -+GLIBC_2.27 hypotf32 F -+GLIBC_2.27 hypotf32x F -+GLIBC_2.27 hypotf64 F -+GLIBC_2.27 hypotf64x F -+GLIBC_2.27 hypotl F -+GLIBC_2.27 ilogb F -+GLIBC_2.27 ilogbf F -+GLIBC_2.27 ilogbf128 F -+GLIBC_2.27 ilogbf32 F -+GLIBC_2.27 ilogbf32x F -+GLIBC_2.27 ilogbf64 F -+GLIBC_2.27 ilogbf64x F -+GLIBC_2.27 ilogbl F -+GLIBC_2.27 j0 F -+GLIBC_2.27 j0f F -+GLIBC_2.27 j0f128 F -+GLIBC_2.27 j0f32 F -+GLIBC_2.27 j0f32x F -+GLIBC_2.27 j0f64 F -+GLIBC_2.27 j0f64x F -+GLIBC_2.27 j0l F -+GLIBC_2.27 j1 F -+GLIBC_2.27 j1f F -+GLIBC_2.27 j1f128 F -+GLIBC_2.27 j1f32 F -+GLIBC_2.27 j1f32x F -+GLIBC_2.27 j1f64 F -+GLIBC_2.27 j1f64x F -+GLIBC_2.27 j1l F -+GLIBC_2.27 jn F -+GLIBC_2.27 jnf F -+GLIBC_2.27 jnf128 F -+GLIBC_2.27 jnf32 F -+GLIBC_2.27 jnf32x F -+GLIBC_2.27 jnf64 F -+GLIBC_2.27 jnf64x F -+GLIBC_2.27 jnl F -+GLIBC_2.27 ldexp F -+GLIBC_2.27 ldexpf F -+GLIBC_2.27 ldexpf128 F -+GLIBC_2.27 ldexpf32 F -+GLIBC_2.27 ldexpf32x F -+GLIBC_2.27 ldexpf64 F -+GLIBC_2.27 ldexpf64x F -+GLIBC_2.27 ldexpl F -+GLIBC_2.27 lgamma F -+GLIBC_2.27 lgamma_r F -+GLIBC_2.27 lgammaf F -+GLIBC_2.27 
lgammaf128 F -+GLIBC_2.27 lgammaf128_r F -+GLIBC_2.27 lgammaf32 F -+GLIBC_2.27 lgammaf32_r F -+GLIBC_2.27 lgammaf32x F -+GLIBC_2.27 lgammaf32x_r F -+GLIBC_2.27 lgammaf64 F -+GLIBC_2.27 lgammaf64_r F -+GLIBC_2.27 lgammaf64x F -+GLIBC_2.27 lgammaf64x_r F -+GLIBC_2.27 lgammaf_r F -+GLIBC_2.27 lgammal F -+GLIBC_2.27 lgammal_r F -+GLIBC_2.27 llogb F -+GLIBC_2.27 llogbf F -+GLIBC_2.27 llogbf128 F -+GLIBC_2.27 llogbf32 F -+GLIBC_2.27 llogbf32x F -+GLIBC_2.27 llogbf64 F -+GLIBC_2.27 llogbf64x F -+GLIBC_2.27 llogbl F -+GLIBC_2.27 llrint F -+GLIBC_2.27 llrintf F -+GLIBC_2.27 llrintf128 F -+GLIBC_2.27 llrintf32 F -+GLIBC_2.27 llrintf32x F -+GLIBC_2.27 llrintf64 F -+GLIBC_2.27 llrintf64x F -+GLIBC_2.27 llrintl F -+GLIBC_2.27 llround F -+GLIBC_2.27 llroundf F -+GLIBC_2.27 llroundf128 F -+GLIBC_2.27 llroundf32 F -+GLIBC_2.27 llroundf32x F -+GLIBC_2.27 llroundf64 F -+GLIBC_2.27 llroundf64x F -+GLIBC_2.27 llroundl F -+GLIBC_2.27 log F -+GLIBC_2.27 log10 F -+GLIBC_2.27 log10f F -+GLIBC_2.27 log10f128 F -+GLIBC_2.27 log10f32 F -+GLIBC_2.27 log10f32x F -+GLIBC_2.27 log10f64 F -+GLIBC_2.27 log10f64x F -+GLIBC_2.27 log10l F -+GLIBC_2.27 log1p F -+GLIBC_2.27 log1pf F -+GLIBC_2.27 log1pf128 F -+GLIBC_2.27 log1pf32 F -+GLIBC_2.27 log1pf32x F -+GLIBC_2.27 log1pf64 F -+GLIBC_2.27 log1pf64x F -+GLIBC_2.27 log1pl F -+GLIBC_2.27 log2 F -+GLIBC_2.27 log2f F -+GLIBC_2.27 log2f128 F -+GLIBC_2.27 log2f32 F -+GLIBC_2.27 log2f32x F -+GLIBC_2.27 log2f64 F -+GLIBC_2.27 log2f64x F -+GLIBC_2.27 log2l F -+GLIBC_2.27 logb F -+GLIBC_2.27 logbf F -+GLIBC_2.27 logbf128 F -+GLIBC_2.27 logbf32 F -+GLIBC_2.27 logbf32x F -+GLIBC_2.27 logbf64 F -+GLIBC_2.27 logbf64x F -+GLIBC_2.27 logbl F -+GLIBC_2.27 logf F -+GLIBC_2.27 logf128 F -+GLIBC_2.27 logf32 F -+GLIBC_2.27 logf32x F -+GLIBC_2.27 logf64 F -+GLIBC_2.27 logf64x F -+GLIBC_2.27 logl F -+GLIBC_2.27 lrint F -+GLIBC_2.27 lrintf F -+GLIBC_2.27 lrintf128 F -+GLIBC_2.27 lrintf32 F -+GLIBC_2.27 lrintf32x F -+GLIBC_2.27 lrintf64 F -+GLIBC_2.27 lrintf64x F -+GLIBC_2.27 lrintl F -+GLIBC_2.27 lround F -+GLIBC_2.27 lroundf F -+GLIBC_2.27 lroundf128 F -+GLIBC_2.27 lroundf32 F -+GLIBC_2.27 lroundf32x F -+GLIBC_2.27 lroundf64 F -+GLIBC_2.27 lroundf64x F -+GLIBC_2.27 lroundl F -+GLIBC_2.27 modf F -+GLIBC_2.27 modff F -+GLIBC_2.27 modff128 F -+GLIBC_2.27 modff32 F -+GLIBC_2.27 modff32x F -+GLIBC_2.27 modff64 F -+GLIBC_2.27 modff64x F -+GLIBC_2.27 modfl F -+GLIBC_2.27 nan F -+GLIBC_2.27 nanf F -+GLIBC_2.27 nanf128 F -+GLIBC_2.27 nanf32 F -+GLIBC_2.27 nanf32x F -+GLIBC_2.27 nanf64 F -+GLIBC_2.27 nanf64x F -+GLIBC_2.27 nanl F -+GLIBC_2.27 nearbyint F -+GLIBC_2.27 nearbyintf F -+GLIBC_2.27 nearbyintf128 F -+GLIBC_2.27 nearbyintf32 F -+GLIBC_2.27 nearbyintf32x F -+GLIBC_2.27 nearbyintf64 F -+GLIBC_2.27 nearbyintf64x F -+GLIBC_2.27 nearbyintl F -+GLIBC_2.27 nextafter F -+GLIBC_2.27 nextafterf F -+GLIBC_2.27 nextafterf128 F -+GLIBC_2.27 nextafterf32 F -+GLIBC_2.27 nextafterf32x F -+GLIBC_2.27 nextafterf64 F -+GLIBC_2.27 nextafterf64x F -+GLIBC_2.27 nextafterl F -+GLIBC_2.27 nextdown F -+GLIBC_2.27 nextdownf F -+GLIBC_2.27 nextdownf128 F -+GLIBC_2.27 nextdownf32 F -+GLIBC_2.27 nextdownf32x F -+GLIBC_2.27 nextdownf64 F -+GLIBC_2.27 nextdownf64x F -+GLIBC_2.27 nextdownl F -+GLIBC_2.27 nexttoward F -+GLIBC_2.27 nexttowardf F -+GLIBC_2.27 nexttowardl F -+GLIBC_2.27 nextup F -+GLIBC_2.27 nextupf F -+GLIBC_2.27 nextupf128 F -+GLIBC_2.27 nextupf32 F -+GLIBC_2.27 nextupf32x F -+GLIBC_2.27 nextupf64 F -+GLIBC_2.27 nextupf64x F -+GLIBC_2.27 nextupl F -+GLIBC_2.27 pow F -+GLIBC_2.27 powf F -+GLIBC_2.27 powf128 F 
-+GLIBC_2.27 powf32 F -+GLIBC_2.27 powf32x F -+GLIBC_2.27 powf64 F -+GLIBC_2.27 powf64x F -+GLIBC_2.27 powl F -+GLIBC_2.27 remainder F -+GLIBC_2.27 remainderf F -+GLIBC_2.27 remainderf128 F -+GLIBC_2.27 remainderf32 F -+GLIBC_2.27 remainderf32x F -+GLIBC_2.27 remainderf64 F -+GLIBC_2.27 remainderf64x F -+GLIBC_2.27 remainderl F -+GLIBC_2.27 remquo F -+GLIBC_2.27 remquof F -+GLIBC_2.27 remquof128 F -+GLIBC_2.27 remquof32 F -+GLIBC_2.27 remquof32x F -+GLIBC_2.27 remquof64 F -+GLIBC_2.27 remquof64x F -+GLIBC_2.27 remquol F -+GLIBC_2.27 rint F -+GLIBC_2.27 rintf F -+GLIBC_2.27 rintf128 F -+GLIBC_2.27 rintf32 F -+GLIBC_2.27 rintf32x F -+GLIBC_2.27 rintf64 F -+GLIBC_2.27 rintf64x F -+GLIBC_2.27 rintl F -+GLIBC_2.27 round F -+GLIBC_2.27 roundeven F -+GLIBC_2.27 roundevenf F -+GLIBC_2.27 roundevenf128 F -+GLIBC_2.27 roundevenf32 F -+GLIBC_2.27 roundevenf32x F -+GLIBC_2.27 roundevenf64 F -+GLIBC_2.27 roundevenf64x F -+GLIBC_2.27 roundevenl F -+GLIBC_2.27 roundf F -+GLIBC_2.27 roundf128 F -+GLIBC_2.27 roundf32 F -+GLIBC_2.27 roundf32x F -+GLIBC_2.27 roundf64 F -+GLIBC_2.27 roundf64x F -+GLIBC_2.27 roundl F -+GLIBC_2.27 scalb F -+GLIBC_2.27 scalbf F -+GLIBC_2.27 scalbl F -+GLIBC_2.27 scalbln F -+GLIBC_2.27 scalblnf F -+GLIBC_2.27 scalblnf128 F -+GLIBC_2.27 scalblnf32 F -+GLIBC_2.27 scalblnf32x F -+GLIBC_2.27 scalblnf64 F -+GLIBC_2.27 scalblnf64x F -+GLIBC_2.27 scalblnl F -+GLIBC_2.27 scalbn F -+GLIBC_2.27 scalbnf F -+GLIBC_2.27 scalbnf128 F -+GLIBC_2.27 scalbnf32 F -+GLIBC_2.27 scalbnf32x F -+GLIBC_2.27 scalbnf64 F -+GLIBC_2.27 scalbnf64x F -+GLIBC_2.27 scalbnl F -+GLIBC_2.27 setpayload F -+GLIBC_2.27 setpayloadf F -+GLIBC_2.27 setpayloadf128 F -+GLIBC_2.27 setpayloadf32 F -+GLIBC_2.27 setpayloadf32x F -+GLIBC_2.27 setpayloadf64 F -+GLIBC_2.27 setpayloadf64x F -+GLIBC_2.27 setpayloadl F -+GLIBC_2.27 setpayloadsig F -+GLIBC_2.27 setpayloadsigf F -+GLIBC_2.27 setpayloadsigf128 F -+GLIBC_2.27 setpayloadsigf32 F -+GLIBC_2.27 setpayloadsigf32x F -+GLIBC_2.27 setpayloadsigf64 F -+GLIBC_2.27 setpayloadsigf64x F -+GLIBC_2.27 setpayloadsigl F -+GLIBC_2.27 signgam D 0x4 -+GLIBC_2.27 significand F -+GLIBC_2.27 significandf F -+GLIBC_2.27 significandl F -+GLIBC_2.27 sin F -+GLIBC_2.27 sincos F -+GLIBC_2.27 sincosf F -+GLIBC_2.27 sincosf128 F -+GLIBC_2.27 sincosf32 F -+GLIBC_2.27 sincosf32x F -+GLIBC_2.27 sincosf64 F -+GLIBC_2.27 sincosf64x F -+GLIBC_2.27 sincosl F -+GLIBC_2.27 sinf F -+GLIBC_2.27 sinf128 F -+GLIBC_2.27 sinf32 F -+GLIBC_2.27 sinf32x F -+GLIBC_2.27 sinf64 F -+GLIBC_2.27 sinf64x F -+GLIBC_2.27 sinh F -+GLIBC_2.27 sinhf F -+GLIBC_2.27 sinhf128 F -+GLIBC_2.27 sinhf32 F -+GLIBC_2.27 sinhf32x F -+GLIBC_2.27 sinhf64 F -+GLIBC_2.27 sinhf64x F -+GLIBC_2.27 sinhl F -+GLIBC_2.27 sinl F -+GLIBC_2.27 sqrt F -+GLIBC_2.27 sqrtf F -+GLIBC_2.27 sqrtf128 F -+GLIBC_2.27 sqrtf32 F -+GLIBC_2.27 sqrtf32x F -+GLIBC_2.27 sqrtf64 F -+GLIBC_2.27 sqrtf64x F -+GLIBC_2.27 sqrtl F -+GLIBC_2.27 tan F -+GLIBC_2.27 tanf F -+GLIBC_2.27 tanf128 F -+GLIBC_2.27 tanf32 F -+GLIBC_2.27 tanf32x F -+GLIBC_2.27 tanf64 F -+GLIBC_2.27 tanf64x F -+GLIBC_2.27 tanh F -+GLIBC_2.27 tanhf F -+GLIBC_2.27 tanhf128 F -+GLIBC_2.27 tanhf32 F -+GLIBC_2.27 tanhf32x F -+GLIBC_2.27 tanhf64 F -+GLIBC_2.27 tanhf64x F -+GLIBC_2.27 tanhl F -+GLIBC_2.27 tanl F -+GLIBC_2.27 tgamma F -+GLIBC_2.27 tgammaf F -+GLIBC_2.27 tgammaf128 F -+GLIBC_2.27 tgammaf32 F -+GLIBC_2.27 tgammaf32x F -+GLIBC_2.27 tgammaf64 F -+GLIBC_2.27 tgammaf64x F -+GLIBC_2.27 tgammal F -+GLIBC_2.27 totalorder F -+GLIBC_2.27 totalorderf F -+GLIBC_2.27 totalorderf128 F -+GLIBC_2.27 totalorderf32 
F -+GLIBC_2.27 totalorderf32x F -+GLIBC_2.27 totalorderf64 F -+GLIBC_2.27 totalorderf64x F -+GLIBC_2.27 totalorderl F -+GLIBC_2.27 totalordermag F -+GLIBC_2.27 totalordermagf F -+GLIBC_2.27 totalordermagf128 F -+GLIBC_2.27 totalordermagf32 F -+GLIBC_2.27 totalordermagf32x F -+GLIBC_2.27 totalordermagf64 F -+GLIBC_2.27 totalordermagf64x F -+GLIBC_2.27 totalordermagl F -+GLIBC_2.27 trunc F -+GLIBC_2.27 truncf F -+GLIBC_2.27 truncf128 F -+GLIBC_2.27 truncf32 F -+GLIBC_2.27 truncf32x F -+GLIBC_2.27 truncf64 F -+GLIBC_2.27 truncf64x F -+GLIBC_2.27 truncl F -+GLIBC_2.27 ufromfp F -+GLIBC_2.27 ufromfpf F -+GLIBC_2.27 ufromfpf128 F -+GLIBC_2.27 ufromfpf32 F -+GLIBC_2.27 ufromfpf32x F -+GLIBC_2.27 ufromfpf64 F -+GLIBC_2.27 ufromfpf64x F -+GLIBC_2.27 ufromfpl F -+GLIBC_2.27 ufromfpx F -+GLIBC_2.27 ufromfpxf F -+GLIBC_2.27 ufromfpxf128 F -+GLIBC_2.27 ufromfpxf32 F -+GLIBC_2.27 ufromfpxf32x F -+GLIBC_2.27 ufromfpxf64 F -+GLIBC_2.27 ufromfpxf64x F -+GLIBC_2.27 ufromfpxl F -+GLIBC_2.27 y0 F -+GLIBC_2.27 y0f F -+GLIBC_2.27 y0f128 F -+GLIBC_2.27 y0f32 F -+GLIBC_2.27 y0f32x F -+GLIBC_2.27 y0f64 F -+GLIBC_2.27 y0f64x F -+GLIBC_2.27 y0l F -+GLIBC_2.27 y1 F -+GLIBC_2.27 y1f F -+GLIBC_2.27 y1f128 F -+GLIBC_2.27 y1f32 F -+GLIBC_2.27 y1f32x F -+GLIBC_2.27 y1f64 F -+GLIBC_2.27 y1f64x F -+GLIBC_2.27 y1l F -+GLIBC_2.27 yn F -+GLIBC_2.27 ynf F -+GLIBC_2.27 ynf128 F -+GLIBC_2.27 ynf32 F -+GLIBC_2.27 ynf32x F -+GLIBC_2.27 ynf64 F -+GLIBC_2.27 ynf64x F -+GLIBC_2.27 ynl F -+GLIBC_2.28 daddl F -+GLIBC_2.28 ddivl F -+GLIBC_2.28 dmull F -+GLIBC_2.28 dsubl F -+GLIBC_2.28 f32addf128 F -+GLIBC_2.28 f32addf32x F -+GLIBC_2.28 f32addf64 F -+GLIBC_2.28 f32addf64x F -+GLIBC_2.28 f32divf128 F -+GLIBC_2.28 f32divf32x F -+GLIBC_2.28 f32divf64 F -+GLIBC_2.28 f32divf64x F -+GLIBC_2.28 f32mulf128 F -+GLIBC_2.28 f32mulf32x F -+GLIBC_2.28 f32mulf64 F -+GLIBC_2.28 f32mulf64x F -+GLIBC_2.28 f32subf128 F -+GLIBC_2.28 f32subf32x F -+GLIBC_2.28 f32subf64 F -+GLIBC_2.28 f32subf64x F -+GLIBC_2.28 f32xaddf128 F -+GLIBC_2.28 f32xaddf64 F -+GLIBC_2.28 f32xaddf64x F -+GLIBC_2.28 f32xdivf128 F -+GLIBC_2.28 f32xdivf64 F -+GLIBC_2.28 f32xdivf64x F -+GLIBC_2.28 f32xmulf128 F -+GLIBC_2.28 f32xmulf64 F -+GLIBC_2.28 f32xmulf64x F -+GLIBC_2.28 f32xsubf128 F -+GLIBC_2.28 f32xsubf64 F -+GLIBC_2.28 f32xsubf64x F -+GLIBC_2.28 f64addf128 F -+GLIBC_2.28 f64addf64x F -+GLIBC_2.28 f64divf128 F -+GLIBC_2.28 f64divf64x F -+GLIBC_2.28 f64mulf128 F -+GLIBC_2.28 f64mulf64x F -+GLIBC_2.28 f64subf128 F -+GLIBC_2.28 f64subf64x F -+GLIBC_2.28 f64xaddf128 F -+GLIBC_2.28 f64xdivf128 F -+GLIBC_2.28 f64xmulf128 F -+GLIBC_2.28 f64xsubf128 F -+GLIBC_2.28 fadd F -+GLIBC_2.28 faddl F -+GLIBC_2.28 fdiv F -+GLIBC_2.28 fdivl F -+GLIBC_2.28 fmul F -+GLIBC_2.28 fmull F -+GLIBC_2.28 fsub F -+GLIBC_2.28 fsubl F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libnsl.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libnsl.abilist -new file mode 100644 -index 00000000..0767472d ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libnsl.abilist -@@ -0,0 +1,120 @@ -+GLIBC_2.27 __free_fdresult F -+GLIBC_2.27 __nis_default_access F -+GLIBC_2.27 __nis_default_group F -+GLIBC_2.27 __nis_default_owner F -+GLIBC_2.27 __nis_default_ttl F -+GLIBC_2.27 __nis_finddirectory F -+GLIBC_2.27 __nisbind_connect F -+GLIBC_2.27 __nisbind_create F -+GLIBC_2.27 __nisbind_destroy F -+GLIBC_2.27 __nisbind_next F -+GLIBC_2.27 __yp_check F -+GLIBC_2.27 nis_add F -+GLIBC_2.27 nis_add_entry F -+GLIBC_2.27 nis_addmember F -+GLIBC_2.27 nis_checkpoint F -+GLIBC_2.27 nis_clone_directory F -+GLIBC_2.27 
nis_clone_object F -+GLIBC_2.27 nis_clone_result F -+GLIBC_2.27 nis_creategroup F -+GLIBC_2.27 nis_destroy_object F -+GLIBC_2.27 nis_destroygroup F -+GLIBC_2.27 nis_dir_cmp F -+GLIBC_2.27 nis_domain_of F -+GLIBC_2.27 nis_domain_of_r F -+GLIBC_2.27 nis_first_entry F -+GLIBC_2.27 nis_free_directory F -+GLIBC_2.27 nis_free_object F -+GLIBC_2.27 nis_free_request F -+GLIBC_2.27 nis_freenames F -+GLIBC_2.27 nis_freeresult F -+GLIBC_2.27 nis_freeservlist F -+GLIBC_2.27 nis_freetags F -+GLIBC_2.27 nis_getnames F -+GLIBC_2.27 nis_getservlist F -+GLIBC_2.27 nis_ismember F -+GLIBC_2.27 nis_leaf_of F -+GLIBC_2.27 nis_leaf_of_r F -+GLIBC_2.27 nis_lerror F -+GLIBC_2.27 nis_list F -+GLIBC_2.27 nis_local_directory F -+GLIBC_2.27 nis_local_group F -+GLIBC_2.27 nis_local_host F -+GLIBC_2.27 nis_local_principal F -+GLIBC_2.27 nis_lookup F -+GLIBC_2.27 nis_mkdir F -+GLIBC_2.27 nis_modify F -+GLIBC_2.27 nis_modify_entry F -+GLIBC_2.27 nis_name_of F -+GLIBC_2.27 nis_name_of_r F -+GLIBC_2.27 nis_next_entry F -+GLIBC_2.27 nis_perror F -+GLIBC_2.27 nis_ping F -+GLIBC_2.27 nis_print_directory F -+GLIBC_2.27 nis_print_entry F -+GLIBC_2.27 nis_print_group F -+GLIBC_2.27 nis_print_group_entry F -+GLIBC_2.27 nis_print_link F -+GLIBC_2.27 nis_print_object F -+GLIBC_2.27 nis_print_result F -+GLIBC_2.27 nis_print_rights F -+GLIBC_2.27 nis_print_table F -+GLIBC_2.27 nis_read_obj F -+GLIBC_2.27 nis_remove F -+GLIBC_2.27 nis_remove_entry F -+GLIBC_2.27 nis_removemember F -+GLIBC_2.27 nis_rmdir F -+GLIBC_2.27 nis_servstate F -+GLIBC_2.27 nis_sperrno F -+GLIBC_2.27 nis_sperror F -+GLIBC_2.27 nis_sperror_r F -+GLIBC_2.27 nis_stats F -+GLIBC_2.27 nis_verifygroup F -+GLIBC_2.27 nis_write_obj F -+GLIBC_2.27 readColdStartFile F -+GLIBC_2.27 writeColdStartFile F -+GLIBC_2.27 xdr_cback_data F -+GLIBC_2.27 xdr_domainname F -+GLIBC_2.27 xdr_keydat F -+GLIBC_2.27 xdr_mapname F -+GLIBC_2.27 xdr_obj_p F -+GLIBC_2.27 xdr_peername F -+GLIBC_2.27 xdr_valdat F -+GLIBC_2.27 xdr_yp_buf F -+GLIBC_2.27 xdr_ypall F -+GLIBC_2.27 xdr_ypbind_binding F -+GLIBC_2.27 xdr_ypbind_resp F -+GLIBC_2.27 xdr_ypbind_resptype F -+GLIBC_2.27 xdr_ypbind_setdom F -+GLIBC_2.27 xdr_ypdelete_args F -+GLIBC_2.27 xdr_ypmap_parms F -+GLIBC_2.27 xdr_ypmaplist F -+GLIBC_2.27 xdr_yppush_status F -+GLIBC_2.27 xdr_yppushresp_xfr F -+GLIBC_2.27 xdr_ypreq_key F -+GLIBC_2.27 xdr_ypreq_nokey F -+GLIBC_2.27 xdr_ypreq_xfr F -+GLIBC_2.27 xdr_ypresp_all F -+GLIBC_2.27 xdr_ypresp_key_val F -+GLIBC_2.27 xdr_ypresp_maplist F -+GLIBC_2.27 xdr_ypresp_master F -+GLIBC_2.27 xdr_ypresp_order F -+GLIBC_2.27 xdr_ypresp_val F -+GLIBC_2.27 xdr_ypresp_xfr F -+GLIBC_2.27 xdr_ypstat F -+GLIBC_2.27 xdr_ypupdate_args F -+GLIBC_2.27 xdr_ypxfrstat F -+GLIBC_2.27 yp_all F -+GLIBC_2.27 yp_bind F -+GLIBC_2.27 yp_first F -+GLIBC_2.27 yp_get_default_domain F -+GLIBC_2.27 yp_maplist F -+GLIBC_2.27 yp_master F -+GLIBC_2.27 yp_match F -+GLIBC_2.27 yp_next F -+GLIBC_2.27 yp_order F -+GLIBC_2.27 yp_unbind F -+GLIBC_2.27 yp_update F -+GLIBC_2.27 ypbinderr_string F -+GLIBC_2.27 yperr_string F -+GLIBC_2.27 ypprot_err F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libpthread.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libpthread.abilist -new file mode 100644 -index 00000000..f60b22ef ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libpthread.abilist -@@ -0,0 +1,264 @@ -+GLIBC_2.0 _IO_flockfile F -+GLIBC_2.0 _IO_ftrylockfile F -+GLIBC_2.0 _IO_funlockfile F -+GLIBC_2.0 __close F -+GLIBC_2.0 __connect F -+GLIBC_2.0 __errno_location F -+GLIBC_2.0 __fcntl F -+GLIBC_2.0 __fork F -+GLIBC_2.0 
__h_errno_location F -+GLIBC_2.0 __lseek F -+GLIBC_2.0 __open F -+GLIBC_2.0 __pthread_getspecific F -+GLIBC_2.0 __pthread_key_create F -+GLIBC_2.0 __pthread_mutex_destroy F -+GLIBC_2.0 __pthread_mutex_init F -+GLIBC_2.0 __pthread_mutex_lock F -+GLIBC_2.0 __pthread_mutex_trylock F -+GLIBC_2.0 __pthread_mutex_unlock F -+GLIBC_2.0 __pthread_mutexattr_destroy F -+GLIBC_2.0 __pthread_mutexattr_init F -+GLIBC_2.0 __pthread_mutexattr_settype F -+GLIBC_2.0 __pthread_once F -+GLIBC_2.0 __pthread_setspecific F -+GLIBC_2.0 __read F -+GLIBC_2.0 __send F -+GLIBC_2.0 __sigaction F -+GLIBC_2.0 __wait F -+GLIBC_2.0 __write F -+GLIBC_2.0 _pthread_cleanup_pop F -+GLIBC_2.0 _pthread_cleanup_pop_restore F -+GLIBC_2.0 _pthread_cleanup_push F -+GLIBC_2.0 _pthread_cleanup_push_defer F -+GLIBC_2.0 accept F -+GLIBC_2.0 close F -+GLIBC_2.0 connect F -+GLIBC_2.0 fcntl F -+GLIBC_2.0 flockfile F -+GLIBC_2.0 fork F -+GLIBC_2.0 fsync F -+GLIBC_2.0 ftrylockfile F -+GLIBC_2.0 funlockfile F -+GLIBC_2.0 longjmp F -+GLIBC_2.0 lseek F -+GLIBC_2.0 msync F -+GLIBC_2.0 nanosleep F -+GLIBC_2.0 open F -+GLIBC_2.0 pause F -+GLIBC_2.0 pthread_atfork F -+GLIBC_2.0 pthread_attr_destroy F -+GLIBC_2.0 pthread_attr_getdetachstate F -+GLIBC_2.0 pthread_attr_getinheritsched F -+GLIBC_2.0 pthread_attr_getschedparam F -+GLIBC_2.0 pthread_attr_getschedpolicy F -+GLIBC_2.0 pthread_attr_getscope F -+GLIBC_2.0 pthread_attr_init F -+GLIBC_2.0 pthread_attr_setdetachstate F -+GLIBC_2.0 pthread_attr_setinheritsched F -+GLIBC_2.0 pthread_attr_setschedparam F -+GLIBC_2.0 pthread_attr_setschedpolicy F -+GLIBC_2.0 pthread_attr_setscope F -+GLIBC_2.0 pthread_cancel F -+GLIBC_2.0 pthread_cond_broadcast F -+GLIBC_2.0 pthread_cond_destroy F -+GLIBC_2.0 pthread_cond_init F -+GLIBC_2.0 pthread_cond_signal F -+GLIBC_2.0 pthread_cond_timedwait F -+GLIBC_2.0 pthread_cond_wait F -+GLIBC_2.0 pthread_condattr_destroy F -+GLIBC_2.0 pthread_condattr_init F -+GLIBC_2.0 pthread_create F -+GLIBC_2.0 pthread_detach F -+GLIBC_2.0 pthread_equal F -+GLIBC_2.0 pthread_exit F -+GLIBC_2.0 pthread_getschedparam F -+GLIBC_2.0 pthread_getspecific F -+GLIBC_2.0 pthread_join F -+GLIBC_2.0 pthread_key_create F -+GLIBC_2.0 pthread_key_delete F -+GLIBC_2.0 pthread_kill F -+GLIBC_2.0 pthread_kill_other_threads_np F -+GLIBC_2.0 pthread_mutex_destroy F -+GLIBC_2.0 pthread_mutex_init F -+GLIBC_2.0 pthread_mutex_lock F -+GLIBC_2.0 pthread_mutex_trylock F -+GLIBC_2.0 pthread_mutex_unlock F -+GLIBC_2.0 pthread_mutexattr_destroy F -+GLIBC_2.0 pthread_mutexattr_getkind_np F -+GLIBC_2.0 pthread_mutexattr_init F -+GLIBC_2.0 pthread_mutexattr_setkind_np F -+GLIBC_2.0 pthread_once F -+GLIBC_2.0 pthread_self F -+GLIBC_2.0 pthread_setcancelstate F -+GLIBC_2.0 pthread_setcanceltype F -+GLIBC_2.0 pthread_setschedparam F -+GLIBC_2.0 pthread_setspecific F -+GLIBC_2.0 pthread_sigmask F -+GLIBC_2.0 pthread_testcancel F -+GLIBC_2.0 raise F -+GLIBC_2.0 read F -+GLIBC_2.0 recv F -+GLIBC_2.0 recvfrom F -+GLIBC_2.0 recvmsg F -+GLIBC_2.0 sem_destroy F -+GLIBC_2.0 sem_getvalue F -+GLIBC_2.0 sem_init F -+GLIBC_2.0 sem_post F -+GLIBC_2.0 sem_trywait F -+GLIBC_2.0 sem_wait F -+GLIBC_2.0 send F -+GLIBC_2.0 sendmsg F -+GLIBC_2.0 sendto F -+GLIBC_2.0 sigaction F -+GLIBC_2.0 siglongjmp F -+GLIBC_2.0 sigwait F -+GLIBC_2.0 system F -+GLIBC_2.0 tcdrain F -+GLIBC_2.0 wait F -+GLIBC_2.0 waitpid F -+GLIBC_2.0 write F -+GLIBC_2.11 pthread_sigqueue F -+GLIBC_2.12 pthread_getname_np F -+GLIBC_2.12 pthread_mutex_consistent F -+GLIBC_2.12 pthread_mutexattr_getrobust F -+GLIBC_2.12 pthread_mutexattr_setrobust F -+GLIBC_2.12 
pthread_setname_np F -+GLIBC_2.18 pthread_getattr_default_np F -+GLIBC_2.18 pthread_setattr_default_np F -+GLIBC_2.2 __libc_allocate_rtsig F -+GLIBC_2.2 __libc_current_sigrtmax F -+GLIBC_2.2 __libc_current_sigrtmin F -+GLIBC_2.2 __open64 F -+GLIBC_2.2 __pread64 F -+GLIBC_2.2 __pthread_rwlock_destroy F -+GLIBC_2.2 __pthread_rwlock_init F -+GLIBC_2.2 __pthread_rwlock_rdlock F -+GLIBC_2.2 __pthread_rwlock_tryrdlock F -+GLIBC_2.2 __pthread_rwlock_trywrlock F -+GLIBC_2.2 __pthread_rwlock_unlock F -+GLIBC_2.2 __pthread_rwlock_wrlock F -+GLIBC_2.2 __pwrite64 F -+GLIBC_2.2 __res_state F -+GLIBC_2.2 lseek64 F -+GLIBC_2.2 open64 F -+GLIBC_2.2 pread F -+GLIBC_2.2 pread64 F -+GLIBC_2.2 pthread_attr_getguardsize F -+GLIBC_2.2 pthread_attr_getstack F -+GLIBC_2.2 pthread_attr_getstackaddr F -+GLIBC_2.2 pthread_attr_getstacksize F -+GLIBC_2.2 pthread_attr_init F -+GLIBC_2.2 pthread_attr_setguardsize F -+GLIBC_2.2 pthread_attr_setstack F -+GLIBC_2.2 pthread_attr_setstackaddr F -+GLIBC_2.2 pthread_attr_setstacksize F -+GLIBC_2.2 pthread_barrier_destroy F -+GLIBC_2.2 pthread_barrier_init F -+GLIBC_2.2 pthread_barrier_wait F -+GLIBC_2.2 pthread_barrierattr_destroy F -+GLIBC_2.2 pthread_barrierattr_init F -+GLIBC_2.2 pthread_barrierattr_setpshared F -+GLIBC_2.2 pthread_condattr_getpshared F -+GLIBC_2.2 pthread_condattr_setpshared F -+GLIBC_2.2 pthread_create F -+GLIBC_2.2 pthread_getconcurrency F -+GLIBC_2.2 pthread_getcpuclockid F -+GLIBC_2.2 pthread_mutex_timedlock F -+GLIBC_2.2 pthread_mutexattr_getpshared F -+GLIBC_2.2 pthread_mutexattr_gettype F -+GLIBC_2.2 pthread_mutexattr_setpshared F -+GLIBC_2.2 pthread_mutexattr_settype F -+GLIBC_2.2 pthread_rwlock_destroy F -+GLIBC_2.2 pthread_rwlock_init F -+GLIBC_2.2 pthread_rwlock_rdlock F -+GLIBC_2.2 pthread_rwlock_timedrdlock F -+GLIBC_2.2 pthread_rwlock_timedwrlock F -+GLIBC_2.2 pthread_rwlock_tryrdlock F -+GLIBC_2.2 pthread_rwlock_trywrlock F -+GLIBC_2.2 pthread_rwlock_unlock F -+GLIBC_2.2 pthread_rwlock_wrlock F -+GLIBC_2.2 pthread_rwlockattr_destroy F -+GLIBC_2.2 pthread_rwlockattr_getkind_np F -+GLIBC_2.2 pthread_rwlockattr_getpshared F -+GLIBC_2.2 pthread_rwlockattr_init F -+GLIBC_2.2 pthread_rwlockattr_setkind_np F -+GLIBC_2.2 pthread_rwlockattr_setpshared F -+GLIBC_2.2 pthread_setconcurrency F -+GLIBC_2.2 pthread_spin_destroy F -+GLIBC_2.2 pthread_spin_init F -+GLIBC_2.2 pthread_spin_lock F -+GLIBC_2.2 pthread_spin_trylock F -+GLIBC_2.2 pthread_spin_unlock F -+GLIBC_2.2 pthread_yield F -+GLIBC_2.2 pwrite F -+GLIBC_2.2 pwrite64 F -+GLIBC_2.2 sem_close F -+GLIBC_2.2 sem_destroy F -+GLIBC_2.2 sem_getvalue F -+GLIBC_2.2 sem_init F -+GLIBC_2.2 sem_open F -+GLIBC_2.2 sem_post F -+GLIBC_2.2 sem_timedwait F -+GLIBC_2.2 sem_trywait F -+GLIBC_2.2 sem_unlink F -+GLIBC_2.2 sem_wait F -+GLIBC_2.2.3 pthread_getattr_np F -+GLIBC_2.2.6 __nanosleep F -+GLIBC_2.28 call_once F -+GLIBC_2.28 cnd_broadcast F -+GLIBC_2.28 cnd_destroy F -+GLIBC_2.28 cnd_init F -+GLIBC_2.28 cnd_signal F -+GLIBC_2.28 cnd_timedwait F -+GLIBC_2.28 cnd_wait F -+GLIBC_2.28 mtx_destroy F -+GLIBC_2.28 mtx_init F -+GLIBC_2.28 mtx_lock F -+GLIBC_2.28 mtx_timedlock F -+GLIBC_2.28 mtx_trylock F -+GLIBC_2.28 mtx_unlock F -+GLIBC_2.28 thrd_create F -+GLIBC_2.28 thrd_detach F -+GLIBC_2.28 thrd_exit F -+GLIBC_2.28 thrd_join F -+GLIBC_2.28 tss_create F -+GLIBC_2.28 tss_delete F -+GLIBC_2.28 tss_get F -+GLIBC_2.28 tss_set F -+GLIBC_2.3.2 pthread_cond_broadcast F -+GLIBC_2.3.2 pthread_cond_destroy F -+GLIBC_2.3.2 pthread_cond_init F -+GLIBC_2.3.2 pthread_cond_signal F -+GLIBC_2.3.2 pthread_cond_timedwait F 
-+GLIBC_2.3.2 pthread_cond_wait F -+GLIBC_2.3.3 __pthread_cleanup_routine F -+GLIBC_2.3.3 __pthread_register_cancel F -+GLIBC_2.3.3 __pthread_register_cancel_defer F -+GLIBC_2.3.3 __pthread_unregister_cancel F -+GLIBC_2.3.3 __pthread_unregister_cancel_restore F -+GLIBC_2.3.3 __pthread_unwind_next F -+GLIBC_2.3.3 pthread_attr_getaffinity_np F -+GLIBC_2.3.3 pthread_attr_setaffinity_np F -+GLIBC_2.3.3 pthread_attr_setstack F -+GLIBC_2.3.3 pthread_attr_setstacksize F -+GLIBC_2.3.3 pthread_barrierattr_getpshared F -+GLIBC_2.3.3 pthread_condattr_getclock F -+GLIBC_2.3.3 pthread_condattr_setclock F -+GLIBC_2.3.3 pthread_getaffinity_np F -+GLIBC_2.3.3 pthread_setaffinity_np F -+GLIBC_2.3.3 pthread_timedjoin_np F -+GLIBC_2.3.3 pthread_tryjoin_np F -+GLIBC_2.3.4 pthread_attr_getaffinity_np F -+GLIBC_2.3.4 pthread_attr_setaffinity_np F -+GLIBC_2.3.4 pthread_getaffinity_np F -+GLIBC_2.3.4 pthread_setaffinity_np F -+GLIBC_2.3.4 pthread_setschedprio F -+GLIBC_2.4 pthread_mutex_consistent_np F -+GLIBC_2.4 pthread_mutex_getprioceiling F -+GLIBC_2.4 pthread_mutex_setprioceiling F -+GLIBC_2.4 pthread_mutexattr_getprioceiling F -+GLIBC_2.4 pthread_mutexattr_getprotocol F -+GLIBC_2.4 pthread_mutexattr_getrobust_np F -+GLIBC_2.4 pthread_mutexattr_setprioceiling F -+GLIBC_2.4 pthread_mutexattr_setprotocol F -+GLIBC_2.4 pthread_mutexattr_setrobust_np F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libresolv.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libresolv.abilist -new file mode 100644 -index 00000000..eb9c1cb7 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libresolv.abilist -@@ -0,0 +1,79 @@ -+GLIBC_2.27 __b64_ntop F -+GLIBC_2.27 __b64_pton F -+GLIBC_2.27 __dn_comp F -+GLIBC_2.27 __dn_count_labels F -+GLIBC_2.27 __dn_expand F -+GLIBC_2.27 __dn_skipname F -+GLIBC_2.27 __fp_nquery F -+GLIBC_2.27 __fp_query F -+GLIBC_2.27 __fp_resstat F -+GLIBC_2.27 __hostalias F -+GLIBC_2.27 __loc_aton F -+GLIBC_2.27 __loc_ntoa F -+GLIBC_2.27 __p_cdname F -+GLIBC_2.27 __p_cdnname F -+GLIBC_2.27 __p_class F -+GLIBC_2.27 __p_class_syms D 0xa8 -+GLIBC_2.27 __p_fqname F -+GLIBC_2.27 __p_fqnname F -+GLIBC_2.27 __p_option F -+GLIBC_2.27 __p_query F -+GLIBC_2.27 __p_rcode F -+GLIBC_2.27 __p_time F -+GLIBC_2.27 __p_type F -+GLIBC_2.27 __p_type_syms D 0x450 -+GLIBC_2.27 __putlong F -+GLIBC_2.27 __putshort F -+GLIBC_2.27 __res_close F -+GLIBC_2.27 __res_dnok F -+GLIBC_2.27 __res_hnok F -+GLIBC_2.27 __res_hostalias F -+GLIBC_2.27 __res_isourserver F -+GLIBC_2.27 __res_mailok F -+GLIBC_2.27 __res_mkquery F -+GLIBC_2.27 __res_nameinquery F -+GLIBC_2.27 __res_nmkquery F -+GLIBC_2.27 __res_nquery F -+GLIBC_2.27 __res_nquerydomain F -+GLIBC_2.27 __res_nsearch F -+GLIBC_2.27 __res_nsend F -+GLIBC_2.27 __res_ownok F -+GLIBC_2.27 __res_queriesmatch F -+GLIBC_2.27 __res_query F -+GLIBC_2.27 __res_querydomain F -+GLIBC_2.27 __res_search F -+GLIBC_2.27 __res_send F -+GLIBC_2.27 __sym_ntop F -+GLIBC_2.27 __sym_ntos F -+GLIBC_2.27 __sym_ston F -+GLIBC_2.27 _getlong F -+GLIBC_2.27 _getshort F -+GLIBC_2.27 inet_net_ntop F -+GLIBC_2.27 inet_net_pton F -+GLIBC_2.27 inet_neta F -+GLIBC_2.27 ns_datetosecs F -+GLIBC_2.27 ns_format_ttl F -+GLIBC_2.27 ns_get16 F -+GLIBC_2.27 ns_get32 F -+GLIBC_2.27 ns_initparse F -+GLIBC_2.27 ns_makecanon F -+GLIBC_2.27 ns_msg_getflag F -+GLIBC_2.27 ns_name_compress F -+GLIBC_2.27 ns_name_ntol F -+GLIBC_2.27 ns_name_ntop F -+GLIBC_2.27 ns_name_pack F -+GLIBC_2.27 ns_name_pton F -+GLIBC_2.27 ns_name_rollback F -+GLIBC_2.27 ns_name_skip F -+GLIBC_2.27 ns_name_uncompress F -+GLIBC_2.27 ns_name_unpack F 
-+GLIBC_2.27 ns_parse_ttl F -+GLIBC_2.27 ns_parserr F -+GLIBC_2.27 ns_put16 F -+GLIBC_2.27 ns_put32 F -+GLIBC_2.27 ns_samedomain F -+GLIBC_2.27 ns_samename F -+GLIBC_2.27 ns_skiprr F -+GLIBC_2.27 ns_sprintrr F -+GLIBC_2.27 ns_sprintrrf F -+GLIBC_2.27 ns_subdomain F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/librt.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/librt.abilist -new file mode 100644 -index 00000000..bfd262ec ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/librt.abilist -@@ -0,0 +1,35 @@ -+GLIBC_2.27 __mq_open_2 F -+GLIBC_2.27 aio_cancel F -+GLIBC_2.27 aio_cancel64 F -+GLIBC_2.27 aio_error F -+GLIBC_2.27 aio_error64 F -+GLIBC_2.27 aio_fsync F -+GLIBC_2.27 aio_fsync64 F -+GLIBC_2.27 aio_init F -+GLIBC_2.27 aio_read F -+GLIBC_2.27 aio_read64 F -+GLIBC_2.27 aio_return F -+GLIBC_2.27 aio_return64 F -+GLIBC_2.27 aio_suspend F -+GLIBC_2.27 aio_suspend64 F -+GLIBC_2.27 aio_write F -+GLIBC_2.27 aio_write64 F -+GLIBC_2.27 lio_listio F -+GLIBC_2.27 lio_listio64 F -+GLIBC_2.27 mq_close F -+GLIBC_2.27 mq_getattr F -+GLIBC_2.27 mq_notify F -+GLIBC_2.27 mq_open F -+GLIBC_2.27 mq_receive F -+GLIBC_2.27 mq_send F -+GLIBC_2.27 mq_setattr F -+GLIBC_2.27 mq_timedreceive F -+GLIBC_2.27 mq_timedsend F -+GLIBC_2.27 mq_unlink F -+GLIBC_2.27 shm_open F -+GLIBC_2.27 shm_unlink F -+GLIBC_2.27 timer_create F -+GLIBC_2.27 timer_delete F -+GLIBC_2.27 timer_getoverrun F -+GLIBC_2.27 timer_gettime F -+GLIBC_2.27 timer_settime F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libthread_db.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libthread_db.abilist -new file mode 100644 -index 00000000..4122e563 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libthread_db.abilist -@@ -0,0 +1,40 @@ -+GLIBC_2.27 td_init F -+GLIBC_2.27 td_log F -+GLIBC_2.27 td_symbol_list F -+GLIBC_2.27 td_ta_clear_event F -+GLIBC_2.27 td_ta_delete F -+GLIBC_2.27 td_ta_enable_stats F -+GLIBC_2.27 td_ta_event_addr F -+GLIBC_2.27 td_ta_event_getmsg F -+GLIBC_2.27 td_ta_get_nthreads F -+GLIBC_2.27 td_ta_get_ph F -+GLIBC_2.27 td_ta_get_stats F -+GLIBC_2.27 td_ta_map_id2thr F -+GLIBC_2.27 td_ta_map_lwp2thr F -+GLIBC_2.27 td_ta_new F -+GLIBC_2.27 td_ta_reset_stats F -+GLIBC_2.27 td_ta_set_event F -+GLIBC_2.27 td_ta_setconcurrency F -+GLIBC_2.27 td_ta_thr_iter F -+GLIBC_2.27 td_ta_tsd_iter F -+GLIBC_2.27 td_thr_clear_event F -+GLIBC_2.27 td_thr_dbresume F -+GLIBC_2.27 td_thr_dbsuspend F -+GLIBC_2.27 td_thr_event_enable F -+GLIBC_2.27 td_thr_event_getmsg F -+GLIBC_2.27 td_thr_get_info F -+GLIBC_2.27 td_thr_getfpregs F -+GLIBC_2.27 td_thr_getgregs F -+GLIBC_2.27 td_thr_getxregs F -+GLIBC_2.27 td_thr_getxregsize F -+GLIBC_2.27 td_thr_set_event F -+GLIBC_2.27 td_thr_setfpregs F -+GLIBC_2.27 td_thr_setgregs F -+GLIBC_2.27 td_thr_setprio F -+GLIBC_2.27 td_thr_setsigpending F -+GLIBC_2.27 td_thr_setxregs F -+GLIBC_2.27 td_thr_sigsetmask F -+GLIBC_2.27 td_thr_tls_get_addr F -+GLIBC_2.27 td_thr_tlsbase F -+GLIBC_2.27 td_thr_tsd F -+GLIBC_2.27 td_thr_validate F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libutil.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libutil.abilist -new file mode 100644 -index 00000000..cbfec8d4 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libutil.abilist -@@ -0,0 +1,6 @@ -+GLIBC_2.27 forkpty F -+GLIBC_2.27 login F -+GLIBC_2.27 login_tty F -+GLIBC_2.27 logout F -+GLIBC_2.27 logwtmp F -+GLIBC_2.27 openpty F -diff --git a/sysdeps/unix/sysv/linux/loongarch/makecontext.c b/sysdeps/unix/sysv/linux/loongarch/makecontext.c -new file mode 100644 -index 
00000000..55d509ab ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/makecontext.c -@@ -0,0 +1,78 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include -+#include -+#include -+ -+void -+__makecontext (ucontext_t *ucp, void (*func) (void), int argc, -+ long int a0, long int a1, long int a2, long int a3, long int a4, -+ ...) -+{ -+ extern void __start_context (void) attribute_hidden; -+ long int i, sp; -+ -+ _Static_assert (LARCH_REG_NARGS == 8, "__makecontext assumes 8 argument registers"); -+ -+ /* Set up the stack. */ -+ sp = ((long int) ucp->uc_stack.ss_sp + ucp->uc_stack.ss_size) & ALMASK; -+ -+ /* Set up the register context. -+ ra = s0 = 0, terminating the stack for backtracing purposes. -+ s1 = the function we must call. -+ s2 = the subsequent context to run. */ -+ ucp->uc_mcontext.__gregs[LARCH_REG_RA] = 0; -+ ucp->uc_mcontext.__gregs[LARCH_REG_S0] = 0; -+ ucp->uc_mcontext.__gregs[LARCH_REG_S1] = (long int) func; -+ ucp->uc_mcontext.__gregs[LARCH_REG_S2] = (long int) ucp->uc_link; -+ ucp->uc_mcontext.__gregs[LARCH_REG_SP] = sp; -+ ucp->uc_mcontext.__pc = (long int) &__start_context; -+ -+ /* Put args in a0-a7, then put any remaining args on the stack. */ -+ ucp->uc_mcontext.__gregs[LARCH_REG_A0 + 0] = a0; -+ ucp->uc_mcontext.__gregs[LARCH_REG_A0 + 1] = a1; -+ ucp->uc_mcontext.__gregs[LARCH_REG_A0 + 2] = a2; -+ ucp->uc_mcontext.__gregs[LARCH_REG_A0 + 3] = a3; -+ ucp->uc_mcontext.__gregs[LARCH_REG_A0 + 4] = a4; -+ -+ if (__glibc_unlikely (argc > 5)) -+ { -+ va_list vl; -+ va_start (vl, a4); -+ -+ long reg_args = argc < LARCH_REG_NARGS ? argc : LARCH_REG_NARGS; -+ for (i = 5; i < reg_args; i++) -+ ucp->uc_mcontext.__gregs[LARCH_REG_A0 + i] = va_arg (vl, long); -+ -+ long int stack_args = argc - reg_args; -+ if (stack_args > 0) -+ { -+ sp = (sp - stack_args * sizeof (long int)) & ALMASK; -+ ucp->uc_mcontext.__gregs[LARCH_REG_SP] = sp; -+ for (i = 0; i < stack_args; i++) -+ ((long int *) sp)[i] = va_arg (vl, long int); -+ } -+ -+ va_end (vl); -+ } -+} -+ -+weak_alias (__makecontext, makecontext) -diff --git a/sysdeps/unix/sysv/linux/loongarch/profil-counter.h b/sysdeps/unix/sysv/linux/loongarch/profil-counter.h -new file mode 100644 -index 00000000..6a3cc201 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/profil-counter.h -@@ -0,0 +1,31 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. 
-+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+static void -+__profil_counter (int signo, const SIGCONTEXT scp) -+{ -+ profil_count ((void *) GET_PC (scp)); -+ -+ /* This is a hack to prevent the compiler from implementing the -+ above function call as a sibcall. The sibcall would overwrite -+ the signal context. */ -+ asm volatile (""); -+} -diff --git a/sysdeps/unix/sysv/linux/loongarch/pt-vfork.S b/sysdeps/unix/sysv/linux/loongarch/pt-vfork.S -new file mode 100644 -index 00000000..1cc89317 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/pt-vfork.S -@@ -0,0 +1 @@ -+/* Not needed. */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/register-dump.h b/sysdeps/unix/sysv/linux/loongarch/register-dump.h -new file mode 100644 -index 00000000..5e45d5c7 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/register-dump.h -@@ -0,0 +1,63 @@ -+/* Dump registers. -+ Copyright (C) 2000-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include <_itoa.h> -+ -+static void -+hexvalue (unsigned long int value, char *buf, size_t len) -+{ -+ char *cp = _itoa_word (value, buf + len, 16, 0); -+ while (cp > buf) -+ *--cp = '0'; -+} -+ -+#define REGDUMP_NREGS 32 -+#define REGDUMP_PER_LINE (80 / (__WORDSIZE / 4 + 4)) -+ -+static void -+register_dump (int fd, ucontext_t *ctx) -+{ -+ int i; -+ char regvalue[__WORDSIZE / 4 + 1]; -+ char str[82 * ((REGDUMP_NREGS + REGDUMP_PER_LINE - 1) / REGDUMP_PER_LINE)]; -+ -+ static const char names[REGDUMP_NREGS][4] = { -+ "pc", "ra", "tp", "sp", "a0", "a1", "a2", "a3", -+ "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", -+ "t4", "t5", "t6", "t7", "t8", "x" , "fp", "s0", -+ "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8" -+ }; -+ -+ str[0] = 0; -+ for (i = 0; i < REGDUMP_NREGS; i++) -+ { -+ strcat (str, names[i]); -+ strcat (str, " "); -+ hexvalue (ctx->uc_mcontext.__gregs[i], regvalue, __WORDSIZE / 4); -+ strcat (str, regvalue); -+ -+ if ((i + 1) % REGDUMP_PER_LINE == 0) -+ strcat (str, "\n"); -+ } -+ -+ write (fd, str, strlen (str)); -+} -+ -+#define REGISTER_DUMP register_dump (fd, ctx) -diff --git a/sysdeps/unix/sysv/linux/loongarch/setcontext.S b/sysdeps/unix/sysv/linux/loongarch/setcontext.S -new file mode 100644 -index 00000000..c96ec43c ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/setcontext.S -@@ -0,0 +1,111 @@ -+/* Set current context. -+ Copyright (C) 2009-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+#include "sys/regdef.h" -+#include "ucontext-macros.h" -+ -+/* int __setcontext (const ucontext_t *ucp) -+ -+ Restores the machine context in UCP and thereby resumes execution -+ in that context. -+ -+ This implementation is intended to be used for *synchronous* context -+ switches only. Therefore, it does not have to restore anything -+ other than the PRESERVED state. */ -+ -+ .text -+LEAF (__setcontext) -+ -+ addi.d sp, sp, -16 -+ st.d a0, sp, 0 /* Save ucp to stack. */ -+/* rt_sigprocmask (SIG_SETMASK, &ucp->uc_sigmask, NULL, _NSIG8) */ -+ li.d a3, _NSIG8 -+ li.d a2, 0 -+ addi.d a1, a0, UCONTEXT_SIGMASK -+ li.d a0, SIG_SETMASK -+ -+ li.d a7, SYS_ify (rt_sigprocmask) -+ syscall 0 -+ -+ blt a0, $r0, 99f -+ -+ ld.d t0, sp, 0 /* Load ucp to t0. */ -+ cfi_def_cfa (12, 0) -+ -+#ifndef __loongarch_soft_float -+ ld.w t1, t0, MCONTEXT_FCSR -+ -+ RESTORE_FP_REG (fs0, 24, t0) -+ RESTORE_FP_REG (fs1, 25, t0) -+ RESTORE_FP_REG (fs2, 26, t0) -+ RESTORE_FP_REG (fs3, 27, t0) -+ RESTORE_FP_REG (fs4, 28, t0) -+ RESTORE_FP_REG (fs5, 29, t0) -+ RESTORE_FP_REG (fs6, 30, t0) -+ RESTORE_FP_REG (fs7, 31, t0) -+ -+ movgr2fcsr $r0, t1 -+#endif /* __loongarch_soft_float */ -+ -+ /* Note the contents of argument registers will be random -+ unless makecontext() has been called. */ -+ RESTORE_INT_REG (ra, 1, t0) -+ RESTORE_INT_REG (sp, 3, t0) -+ RESTORE_INT_REG (a0, 4, t0) -+ RESTORE_INT_REG (a1, 5, t0) -+ RESTORE_INT_REG (a2, 6, t0) -+ RESTORE_INT_REG (a3, 7, t0) -+ RESTORE_INT_REG (a4, 8, t0) -+ RESTORE_INT_REG (a5, 9, t0) -+ RESTORE_INT_REG (a6, 10, t0) -+ RESTORE_INT_REG (a7, 11, t0) -+ RESTORE_INT_REG (x, 21, t0) -+ RESTORE_INT_REG (fp, 22, t0) -+ RESTORE_INT_REG (s0, 23, t0) -+ RESTORE_INT_REG (s1, 24, t0) -+ RESTORE_INT_REG (s2, 25, t0) -+ RESTORE_INT_REG (s3, 26, t0) -+ RESTORE_INT_REG (s4, 27, t0) -+ RESTORE_INT_REG (s5, 28, t0) -+ RESTORE_INT_REG (s6, 29, t0) -+ RESTORE_INT_REG (s7, 30, t0) -+ RESTORE_INT_REG (s8, 31, t0) -+ ld.d t1, t0, MCONTEXT_PC -+ jirl $r0,t1,0 -+ -+99: -+ addi.d sp, sp, 16 -+ b __syscall_error -+ -+PSEUDO_END (__setcontext) -+weak_alias (__setcontext, setcontext) -+ -+LEAF (__start_context) -+ -+ /* Terminate call stack by noting ra == 0. Happily, s0 == 0 here. */ -+ cfi_register (1, 23) -+ -+ /* Call the function passed to makecontext. */ -+ jirl $r1,s1,0 -+ -+ /* Invoke subsequent context if present, else exit(0). 
*/ -+ ori a0, s2, 0 -+ beqz s2, 1f -+ bl __setcontext -+1: b exit -+ -+PSEUDO_END (__start_context) -diff --git a/sysdeps/unix/sysv/linux/loongarch/shlib-versions b/sysdeps/unix/sysv/linux/loongarch/shlib-versions -new file mode 100644 -index 00000000..2a67fe71 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/shlib-versions -@@ -0,0 +1,2 @@ -+DEFAULT GLIBC_2.27 -+libpthread=0 GLIBC_2.0 GLIBC_2.2 -diff --git a/sysdeps/unix/sysv/linux/loongarch/sigcontextinfo.h b/sysdeps/unix/sysv/linux/loongarch/sigcontextinfo.h -new file mode 100644 -index 00000000..2a864795 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/sigcontextinfo.h -@@ -0,0 +1,22 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+ -+#define SIGCONTEXT siginfo_t *_si, ucontext_t * -+#define GET_PC(ctx) ((void *) ctx->uc_mcontext.__pc) -diff --git a/sysdeps/unix/sysv/linux/loongarch/swapcontext.S b/sysdeps/unix/sysv/linux/loongarch/swapcontext.S -new file mode 100644 -index 00000000..d839dd87 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/swapcontext.S -@@ -0,0 +1,120 @@ -+/* Save and set current context. -+ Copyright (C) 2009-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include "ucontext-macros.h" -+ -+/* int swapcontext (ucontext_t *oucp, const ucontext_t *ucp) */ -+ -+LEAF (__swapcontext) -+ ori a2, sp, 0 /* Save sp to a2. */ -+ addi.d sp, sp, -16 -+ st.d a1, sp, 0 -+ ori t0, a1, 0 -+ -+ SAVE_INT_REG (ra, 1, a0) -+ SAVE_INT_REG (a2, 3, a0) /* Store sp .*/ -+ SAVE_INT_REG (zero, 4, a0) /* return 0 by overwriting a0. 
*/ -+ SAVE_INT_REG (x, 21, a0) -+ SAVE_INT_REG (fp, 22, a0) -+ SAVE_INT_REG (s0, 23, a0) -+ SAVE_INT_REG (s1, 24, a0) -+ SAVE_INT_REG (s2, 25, a0) -+ SAVE_INT_REG (s3, 26, a0) -+ SAVE_INT_REG (s4, 27, a0) -+ SAVE_INT_REG (s5, 28, a0) -+ SAVE_INT_REG (s6, 29, a0) -+ SAVE_INT_REG (s7, 30, a0) -+ SAVE_INT_REG (s8, 31, a0) -+ st.d ra, a0, MCONTEXT_PC -+#ifndef __loongarch_soft_float -+ movfcsr2gr a1, $r0 -+ -+ SAVE_FP_REG (fs0, 24, a0) -+ SAVE_FP_REG (fs1, 25, a0) -+ SAVE_FP_REG (fs2, 26, a0) -+ SAVE_FP_REG (fs3, 27, a0) -+ SAVE_FP_REG (fs4, 28, a0) -+ SAVE_FP_REG (fs5, 29, a0) -+ SAVE_FP_REG (fs6, 30, a0) -+ SAVE_FP_REG (fs7, 31, a0) -+ -+ st.w a1, a0, MCONTEXT_FCSR -+#endif /* __loongarch_soft_float */ -+ -+/* rt_sigprocmask (SIG_SETMASK, &ucp->uc_sigmask, &oucp->uc_sigmask, _NSIG8) */ -+ li.d a3, _NSIG8 -+ addi.d a2, a0, UCONTEXT_SIGMASK -+ addi.d a1, t0, UCONTEXT_SIGMASK -+ li.d a0, SIG_SETMASK -+ -+ li.d a7, SYS_ify (rt_sigprocmask) -+ syscall 0 -+ -+ blt a0, zero, 99f -+ -+#ifndef __loongarch_soft_float -+ ld.d t0, sp, 0 /* Load a1 to t0. */ -+ ld.w t1, t0, MCONTEXT_FCSR -+ -+ RESTORE_FP_REG (fs0, 24, t0) -+ RESTORE_FP_REG (fs1, 25, t0) -+ RESTORE_FP_REG (fs2, 26, t0) -+ RESTORE_FP_REG (fs3, 27, t0) -+ RESTORE_FP_REG (fs4, 28, t0) -+ RESTORE_FP_REG (fs5, 29, t0) -+ RESTORE_FP_REG (fs6, 30, t0) -+ RESTORE_FP_REG (fs7, 31, t0) -+ -+ movgr2fcsr $r0, t1 -+#endif /* __loongarch_soft_float */ -+ -+ /* Note the contents of argument registers will be random -+ unless makecontext() has been called. */ -+ RESTORE_INT_REG (ra, 1, t0) -+ RESTORE_INT_REG (sp, 3, t0) -+ RESTORE_INT_REG (a0, 4, t0) -+ RESTORE_INT_REG (a1, 5, t0) -+ RESTORE_INT_REG (a2, 6, t0) -+ RESTORE_INT_REG (a3, 7, t0) -+ RESTORE_INT_REG (a4, 8, t0) -+ RESTORE_INT_REG (a5, 9, t0) -+ RESTORE_INT_REG (a6, 10, t0) -+ RESTORE_INT_REG (a7, 11, t0) -+ RESTORE_INT_REG (x, 21, t0) -+ RESTORE_INT_REG (fp, 22, t0) -+ RESTORE_INT_REG (s0, 23, t0) -+ RESTORE_INT_REG (s1, 24, t0) -+ RESTORE_INT_REG (s2, 25, t0) -+ RESTORE_INT_REG (s3, 26, t0) -+ RESTORE_INT_REG (s4, 27, t0) -+ RESTORE_INT_REG (s5, 28, t0) -+ RESTORE_INT_REG (s6, 29, t0) -+ RESTORE_INT_REG (s7, 30, t0) -+ RESTORE_INT_REG (s8, 31, t0) -+ ld.d t1, t0, MCONTEXT_PC -+ -+ jirl $r0, t1, 0 -+ -+ -+99: -+ addi.d sp, sp, 16 -+ b __syscall_error -+ -+PSEUDO_END (__swapcontext) -+ -+weak_alias (__swapcontext, swapcontext) -diff --git a/sysdeps/unix/sysv/linux/loongarch/sys/procfs.h b/sysdeps/unix/sysv/linux/loongarch/sys/procfs.h -new file mode 100644 -index 00000000..9ae06b40 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/sys/procfs.h -@@ -0,0 +1,122 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _SYS_PROCFS_H -+#define _SYS_PROCFS_H 1 -+ -+/* This is somehow modelled after the file of the same name on SysVr4 -+ systems. 
It provides a definition of the core file format for ELF -+ used on Linux. */ -+ -+#include -+#include -+#include -+#include -+#include -+ -+__BEGIN_DECLS -+ -+/* Type for a general-purpose register. */ -+typedef uint64_t elf_greg_t; -+ -+/* And the whole bunch of them. We could have used `struct -+ pt_regs' directly in the typedef, but tradition says that -+ the register set is an array, which does have some peculiar -+ semantics, so leave it that way. */ -+#define ELF_NGREG (sizeof (struct user_regs_struct) / sizeof(elf_greg_t)) -+typedef elf_greg_t elf_gregset_t[ELF_NGREG]; -+ -+#define ELF_NFPREG 34 /* 32 FPRs + 8-byte byte-vec for fcc + 4-byte FCR */ -+typedef union { double d; float f; } elf_fpreg_t; -+typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; -+ -+typedef union { double d[2]; float f[4]; } __attribute__((__aligned__ (16))) elf_lsxregset_t[32]; -+typedef union { double d[4]; float f[8]; } __attribute__((__aligned__ (32))) elf_lasxregset_t[32]; -+ -+struct elf_siginfo -+ { -+ int si_signo; /* Signal number. */ -+ int si_code; /* Extra code. */ -+ int si_errno; /* Errno. */ -+ }; -+ -+/* Definitions to generate Intel SVR4-like core files. These mostly -+ have the same names as the SVR4 types with "elf_" tacked on the -+ front to prevent clashes with linux definitions, and the typedef -+ forms have been avoided. This is mostly like the SVR4 structure, -+ but more Linuxy, with things that Linux does not support and which -+ gdb doesn't really use excluded. Fields present but not used are -+ marked with "XXX". */ -+struct elf_prstatus -+ { -+ struct elf_siginfo pr_info; /* Info associated with signal. */ -+ short int pr_cursig; /* Current signal. */ -+ unsigned long int pr_sigpend; /* Set of pending signals. */ -+ unsigned long int pr_sighold; /* Set of held signals. */ -+ __pid_t pr_pid; -+ __pid_t pr_ppid; -+ __pid_t pr_pgrp; -+ __pid_t pr_sid; -+ struct timeval pr_utime; /* User time. */ -+ struct timeval pr_stime; /* System time. */ -+ struct timeval pr_cutime; /* Cumulative user time. */ -+ struct timeval pr_cstime; /* Cumulative system time. */ -+ elf_gregset_t pr_reg; /* GP registers. */ -+ int pr_fpvalid; /* True if math copro being used. */ -+ }; -+ -+ -+#define ELF_PRARGSZ (80) /* Number of chars for args */ -+ -+struct elf_prpsinfo -+ { -+ char pr_state; /* Numeric process state. */ -+ char pr_sname; /* Char for pr_state. */ -+ char pr_zomb; /* Zombie. */ -+ char pr_nice; /* Nice val. */ -+ unsigned long int pr_flag; /* Flags. */ -+ unsigned int pr_uid; -+ unsigned int pr_gid; -+ int pr_pid, pr_ppid, pr_pgrp, pr_sid; -+ /* Lots missing */ -+ char pr_fname[16]; /* Filename of executable. */ -+ char pr_psargs[ELF_PRARGSZ]; /* Initial part of arg list. */ -+ }; -+ -+/* The rest of this file provides the types for emulation of the -+ Solaris interfaces that should be implemented by -+ users of libthread_db. */ -+ -+/* Addresses. */ -+typedef void *psaddr_t; -+ -+/* Register sets. Linux has different names. */ -+typedef elf_gregset_t prgregset_t; -+typedef elf_fpregset_t prfpregset_t; -+ -+/* We don't have any differences between processes and threads, -+ therefore habe only ine PID type. */ -+typedef __pid_t lwpid_t; -+ -+/* Process status and info. In the end we do provide typedefs for them. 
*/ -+typedef struct elf_prstatus prstatus_t; -+typedef struct elf_prpsinfo prpsinfo_t; -+ -+__END_DECLS -+ -+#endif /* sys/procfs.h */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/sys/ucontext.h b/sysdeps/unix/sysv/linux/loongarch/sys/ucontext.h -new file mode 100644 -index 00000000..e52a46c9 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/sys/ucontext.h -@@ -0,0 +1,81 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+/* Don't rely on this, the interface is currently messed up and may need to -+ be broken to be fixed. */ -+#ifndef _SYS_UCONTEXT_H -+#define _SYS_UCONTEXT_H 1 -+ -+#include -+ -+#include -+#include -+ -+typedef unsigned long int __loongarch_mc_gp_state[32]; -+ -+#ifdef __USE_MISC -+# define LARCH_NGREG 32 -+ -+# define LARCH_REG_RA 1 -+# define LARCH_REG_SP 3 -+# define LARCH_REG_S0 23 -+# define LARCH_REG_S1 24 -+# define LARCH_REG_A0 4 -+# define LARCH_REG_S2 25 -+# define LARCH_REG_NARGS 8 -+ -+typedef unsigned long int greg_t; -+ -+/* Container for all general registers. */ -+typedef __loongarch_mc_gp_state gregset_t; -+ -+/* Container for floating-point state. */ -+typedef union __loongarch_mc_fp_state fpregset_t; -+#endif -+ -+ -+ -+union __loongarch_mc_fp_state { -+ unsigned int __val32[256 / 32]; -+ unsigned long long __val64[256 / 64]; -+}; -+ -+typedef struct mcontext_t { -+ unsigned long long __pc; -+ unsigned long long __gregs[32]; -+ unsigned int __flags; -+ -+ unsigned int __fcsr; -+ unsigned int __vcsr; -+ unsigned long long __fcc; -+ union __loongarch_mc_fp_state __fpregs[32] __attribute__((__aligned__ (32))); -+ -+ unsigned int __reserved; -+} mcontext_t; -+ -+/* Userlevel context. */ -+typedef struct ucontext_t -+ { -+ unsigned long int __uc_flags; -+ struct ucontext_t *uc_link; -+ stack_t uc_stack; -+ mcontext_t uc_mcontext; -+ sigset_t uc_sigmask; -+ } ucontext_t; -+ -+#endif /* sys/ucontext.h */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/sys/user.h b/sysdeps/unix/sysv/linux/loongarch/sys/user.h -new file mode 100644 -index 00000000..f9108350 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/sys/user.h -@@ -0,0 +1,31 @@ -+/* Copyright (C) 2001-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef _SYS_USER_H -+#define _SYS_USER_H 1 -+ -+#include -+ -+struct user_regs_struct -+{ -+ uint64_t gpr[32]; -+ uint64_t pc; -+ uint64_t badvaddr; -+ uint64_t reserved[11]; -+}; -+ -+#endif /* _SYS_USER_H */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/syscall.c b/sysdeps/unix/sysv/linux/loongarch/syscall.c -new file mode 100644 -index 00000000..b06a528e ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/syscall.c -@@ -0,0 +1,36 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+ -+long int -+syscall (long int syscall_number, long int arg1, long int arg2, long int arg3, -+ long int arg4, long int arg5, long int arg6, long int arg7) -+{ -+ long int ret; -+ INTERNAL_SYSCALL_DECL (err); -+ -+ ret = INTERNAL_SYSCALL_NCS (syscall_number, err, 7, arg1, arg2, arg3, arg4, -+ arg5, arg6, arg7); -+ -+ if (INTERNAL_SYSCALL_ERROR_P (ret, err)) -+ return __syscall_error (ret); -+ -+ return ret; -+} -+ -diff --git a/sysdeps/unix/sysv/linux/loongarch/sysdep.S b/sysdeps/unix/sysv/linux/loongarch/sysdep.S -new file mode 100644 -index 00000000..a8094283 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/sysdep.S -@@ -0,0 +1,52 @@ -+/* syscall error handlers -+ Copyright (C) 2011-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+ -+#if IS_IN (libc) -+# define errno __libc_errno -+#endif -+ -+ENTRY (__syscall_error) -+ /* Fall through to __syscall_set_errno. */ -+END (__syscall_error) -+ -+/* Non-standard calling convention: argument in a0, return address in t0, -+ and clobber only t1. */ -+ENTRY (__syscall_set_errno) -+ /* We got here because a0 < 0, but only codes in the range [-4095, -1] -+ represent errors. Otherwise, just return the result normally. 
*/ -+ -+ li.d t1, -4096 -+ bgeu t1, a0, L (out) -+ sub.w a0, zero, a0 -+ -+#if RTLD_PRIVATE_ERRNO -+ la t1, rtld_errno -+#elif defined(__PIC__) -+ la.tls.ie t1, errno -+ add.d t1, tp, t1 -+#else -+ la.tls.le t1, errno -+ add.d t1, tp, t1 -+#endif -+ st.w a0, t1, 0 -+ li.d a0, -1 -+L (out): -+ ret -+END (__syscall_set_errno) -diff --git a/sysdeps/unix/sysv/linux/loongarch/sysdep.h b/sysdeps/unix/sysv/linux/loongarch/sysdep.h -new file mode 100644 -index 00000000..f50946d4 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/sysdep.h -@@ -0,0 +1,333 @@ -+#ifndef _LINUX_LOONGARCH_SYSDEP_H -+#define _LINUX_LOONGARCH_SYSDEP_H 1 -+ -+#include -+#include -+ -+#ifdef __ASSEMBLER__ -+ -+# include -+# define ret jirl zero, ra, 0 -+# define L(label) .L ## label -+ -+/* Performs a system call, handling errors by setting errno. Linux indicates -+ errors by setting a0 to a value between -1 and -4095. */ -+# undef PSEUDO -+# define PSEUDO(name, syscall_name, args) \ -+ ENTRY (name); \ -+ li.d a7, SYS_ify (syscall_name); \ -+ syscall 0; \ -+ li.d a7, -4096; \ -+ bltu a7, a0, .Lsyscall_error ## name; -+ -+# undef PSEUDO_END -+# define PSEUDO_END(sym) \ -+ SYSCALL_ERROR_HANDLER (sym); \ -+ ret; \ -+ END (sym); -+ -+# if !IS_IN (libc) -+# if RTLD_PRIVATE_ERRNO -+ -+# define SYSCALL_ERROR_HANDLER(name) \ -+.Lsyscall_error ## name: \ -+ la t0, rtld_errno; \ -+ sub.w a0, zero, a0; \ -+ st.w a0, t0, 0; \ -+ li.d a0, -1; -+ -+# else -+ -+# define SYSCALL_ERROR_HANDLER(name) \ -+.Lsyscall_error ## name: \ -+ la.tls.ie t0, errno; \ -+ add.d t0, tp, t0; \ -+ sub.w a0, zero, a0; \ -+ st.w a0, t0, 0; \ -+ li.d a0, -1; -+ -+# endif -+# else -+ -+# define SYSCALL_ERROR_HANDLER(name) \ -+.Lsyscall_error ## name: \ -+ b __syscall_error; -+ -+# endif -+ -+/* Performs a system call, not setting errno. */ -+# undef PSEUDO_NEORRNO -+# define PSEUDO_NOERRNO(name, syscall_name, args) \ -+ ENTRY (name); \ -+ li.d a7, SYS_ify (syscall_name); \ -+ syscall 0; -+ -+# undef PSEUDO_END_NOERRNO -+# define PSEUDO_END_NOERRNO(name) \ -+ END (name); -+ -+# undef ret_NOERRNO -+# define ret_NOERRNO ret -+ -+/* Perfroms a system call, returning the error code. */ -+# undef PSEUDO_ERRVAL -+# define PSEUDO_ERRVAL(name, syscall_name, args) \ -+ PSEUDO_NOERRNO (name, syscall_name, args); \ -+ slli.d a0, a0, 32; \ -+ srai.d a0, a0, 32; /* sign_ext */ \ -+ sub.d a0, zero, a0; -+ -+# undef PSEUDO_END_ERRVAL -+# define PSEUDO_END_ERRVAL(name) \ -+ END (name); -+ -+# undef ret_ERRVAL -+# define ret_ERRVAL ret -+ -+#endif /* __ASSEMBLER__ */ -+ -+/* In order to get __set_errno() definition in INLINE_SYSCALL. */ -+#ifndef __ASSEMBLER__ -+# include -+#endif -+ -+#include -+ -+#undef SYS_ify -+#define SYS_ify(syscall_name) __NR_##syscall_name -+ -+#ifndef __ASSEMBLER__ -+ -+/* List of system calls which are supported as vsyscalls. */ -+# define HAVE_CLOCK_GETRES_VSYSCALL 1 -+# define HAVE_CLOCK_GETTIME_VSYSCALL 1 -+# define HAVE_GETTIMEOFDAY_VSYSCALL 1 -+# define HAVE_GETCPU_VSYSCALL 1 -+ -+/* Define a macro which expands into the inline wrapper code for a system -+ call. */ -+# undef INLINE_SYSCALL -+# define INLINE_SYSCALL(name, nr, args...) 
\ -+ ({ INTERNAL_SYSCALL_DECL (err); \ -+ long int __sys_result = INTERNAL_SYSCALL (name, err, nr, args); \ -+ if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (__sys_result, ))) \ -+ { \ -+ __set_errno (INTERNAL_SYSCALL_ERRNO (__sys_result, )); \ -+ __sys_result = (unsigned long) -1; \ -+ } \ -+ __sys_result; }) -+ -+ -+# define INTERNAL_SYSCALL_DECL(err) do { } while (0) -+ -+# define INTERNAL_SYSCALL_ERROR_P(val, err) \ -+ ((unsigned long int) (val) > -4096UL) -+ -+# define INTERNAL_SYSCALL_ERRNO(val, err) (-(val)) -+ -+# define INTERNAL_SYSCALL(name, err, nr, args...) \ -+ internal_syscall##nr (SYS_ify (name), err, args) -+ -+# define INTERNAL_SYSCALL_NCS(number, err, nr, args...) \ -+ internal_syscall##nr (number, err, args) -+ -+# define internal_syscall0(number, err, dummy...) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0"); \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "=r" (__a0) \ -+ : "r" (__a7) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define internal_syscall1(number, err, arg0) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ long int _arg0 = (long int) (arg0); \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0") = _arg0; \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "+r" (__a0) \ -+ : "r" (__a7) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define internal_syscall2(number, err, arg0, arg1) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ long int _arg0 = (long int) (arg0); \ -+ long int _arg1 = (long int) (arg1); \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0") = _arg0; \ -+ register long int __a1 asm ("$a1") = _arg1; \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "+r" (__a0) \ -+ : "r" (__a7), "r" (__a1) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define internal_syscall3(number, err, arg0, arg1, arg2) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ long int _arg0 = (long int) (arg0); \ -+ long int _arg1 = (long int) (arg1); \ -+ long int _arg2 = (long int) (arg2); \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0") = _arg0; \ -+ register long int __a1 asm ("$a1") = _arg1; \ -+ register long int __a2 asm ("$a2") = _arg2; \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "+r" (__a0) \ -+ : "r" (__a7), "r" (__a1), "r" (__a2) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define internal_syscall4(number, err, arg0, arg1, arg2, arg3) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ long int _arg0 = (long int) (arg0); \ -+ long int _arg1 = (long int) (arg1); \ -+ long int _arg2 = (long int) (arg2); \ -+ long int _arg3 = (long int) (arg3); \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0") = _arg0; \ -+ register long int __a1 asm ("$a1") = _arg1; \ -+ register long int __a2 asm ("$a2") = _arg2; \ -+ register long int __a3 asm ("$a3") = _arg3; \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "+r" (__a0) \ -+ : "r" (__a7), "r" (__a1), "r" (__a2), "r" (__a3) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define internal_syscall5(number, err, arg0, arg1, arg2, arg3, arg4) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ long int _arg0 = (long int) (arg0); \ -+ long int _arg1 = (long 
int) (arg1); \ -+ long int _arg2 = (long int) (arg2); \ -+ long int _arg3 = (long int) (arg3); \ -+ long int _arg4 = (long int) (arg4); \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0") = _arg0; \ -+ register long int __a1 asm ("$a1") = _arg1; \ -+ register long int __a2 asm ("$a2") = _arg2; \ -+ register long int __a3 asm ("$a3") = _arg3; \ -+ register long int __a4 asm ("$a4") = _arg4; \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "+r" (__a0) \ -+ : "r" (__a7), "r"(__a1), "r"(__a2), "r"(__a3), "r" (__a4) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define internal_syscall6(number, err, arg0, arg1, arg2, arg3, arg4, arg5) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ long int _arg0 = (long int) (arg0); \ -+ long int _arg1 = (long int) (arg1); \ -+ long int _arg2 = (long int) (arg2); \ -+ long int _arg3 = (long int) (arg3); \ -+ long int _arg4 = (long int) (arg4); \ -+ long int _arg5 = (long int) (arg5); \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0") = _arg0; \ -+ register long int __a1 asm ("$a1") = _arg1; \ -+ register long int __a2 asm ("$a2") = _arg2; \ -+ register long int __a3 asm ("$a3") = _arg3; \ -+ register long int __a4 asm ("$a4") = _arg4; \ -+ register long int __a5 asm ("$a5") = _arg5; \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "+r" (__a0) \ -+ : "r" (__a7), "r" (__a1), "r" (__a2), "r" (__a3), \ -+ "r" (__a4), "r" (__a5) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define internal_syscall7(number, err, arg0, arg1, arg2, arg3, arg4, arg5, arg6) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ long int _arg0 = (long int) (arg0); \ -+ long int _arg1 = (long int) (arg1); \ -+ long int _arg2 = (long int) (arg2); \ -+ long int _arg3 = (long int) (arg3); \ -+ long int _arg4 = (long int) (arg4); \ -+ long int _arg5 = (long int) (arg5); \ -+ long int _arg6 = (long int) (arg6); \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0") = _arg0; \ -+ register long int __a1 asm ("$a1") = _arg1; \ -+ register long int __a2 asm ("$a2") = _arg2; \ -+ register long int __a3 asm ("$a3") = _arg3; \ -+ register long int __a4 asm ("$a4") = _arg4; \ -+ register long int __a5 asm ("$a5") = _arg5; \ -+ register long int __a6 asm ("$a6") = _arg6; \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "+r" (__a0) \ -+ : "r" (__a7), "r" (__a1), "r" (__a2), "r" (__a3), \ -+ "r" (__a4), "r" (__a5), "r" (__a6) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define __SYSCALL_CLOBBERS \ -+ "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t8",\ -+ "memory" -+ -+extern long int __syscall_error (long int neg_errno); -+ -+#endif /* ! __ASSEMBLER__ */ -+ -+/* Pointer mangling is not supported. */ -+#define PTR_MANGLE(var) (void) (var) -+#define PTR_DEMANGLE(var) (void) (var) -+ -+#endif /* linux/loongarch/sysdep.h */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/ucontext-macros.h b/sysdeps/unix/sysv/linux/loongarch/ucontext-macros.h -new file mode 100644 -index 00000000..abd22247 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/ucontext-macros.h -@@ -0,0 +1,44 @@ -+/* Macros for ucontext routines. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LINUX_LOONGARCH_UCONTEXT_MACROS_H -+#define _LINUX_LOONGARCH_UCONTEXT_MACROS_H -+ -+#include -+#include -+ -+#include "ucontext_i.h" -+ -+#define SAVE_FP_REG(name, num, base) \ -+ FREG_S name, base, ((num) * SZFREG + MCONTEXT_FPREGS) -+ -+#define RESTORE_FP_REG(name, num, base) \ -+ FREG_L name, base, ((num) * SZFREG + MCONTEXT_FPREGS) -+ -+#define SAVE_INT_REG(name, num, base) \ -+ REG_S name, base, ((num) * SZREG + MCONTEXT_GREGS) -+ -+#define RESTORE_INT_REG(name, num, base) \ -+ REG_L name, base, ((num) * SZREG + MCONTEXT_GREGS) -+ -+#define SAVE_REG(name, offset, base) \ -+ REG_S name, base, (offset) -+ -+#define RESTORE_REG(name, offset, base) \ -+ REG_L name, base, (offset) -+#endif /* _LINUX_LOONGARCH_UCONTEXT_MACROS_H */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/ucontext_i.sym b/sysdeps/unix/sysv/linux/loongarch/ucontext_i.sym -new file mode 100644 -index 00000000..d7f612fe ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/ucontext_i.sym -@@ -0,0 +1,33 @@ -+#include -+#include -+#include -+#include -+ -+-- Constants used by the rt_sigprocmask call. -+ -+SIG_BLOCK -+SIG_SETMASK -+ -+_NSIG8 (_NSIG / 8) -+ -+-- Offsets of the fields in the ucontext_t structure. -+#define ucontext(member) offsetof (ucontext_t, member) -+#define stack(member) ucontext (uc_stack.member) -+#define mcontext(member) ucontext (uc_mcontext.member) -+ -+UCONTEXT_FLAGS ucontext (__uc_flags) -+UCONTEXT_LINK ucontext (uc_link) -+UCONTEXT_STACK ucontext (uc_stack) -+UCONTEXT_MCONTEXT ucontext (uc_mcontext) -+UCONTEXT_SIGMASK ucontext (uc_sigmask) -+ -+STACK_SP stack (ss_sp) -+STACK_SIZE stack (ss_size) -+STACK_FLAGS stack (ss_flags) -+ -+MCONTEXT_PC mcontext (__pc) -+MCONTEXT_FCSR mcontext (__fcsr) -+MCONTEXT_GREGS mcontext (__gregs) -+MCONTEXT_FPREGS mcontext (__fpregs) -+ -+UCONTEXT_SIZE sizeof (ucontext_t) -diff --git a/sysdeps/unix/sysv/linux/loongarch/vfork.S b/sysdeps/unix/sysv/linux/loongarch/vfork.S -new file mode 100644 -index 00000000..83cf141f ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/vfork.S -@@ -0,0 +1,49 @@ -+/* Copyright (C) 1999-2018 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public License as -+ published by the Free Software Foundation; either version 2.1 of the -+ License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+#include -+#define _ERRNO_H 1 -+#include -+ -+/* Clone the calling process, but without copying the whole address space. -+ The calling process is suspended until the new process exits or is -+ replaced by a call to `execve'. Return -1 for errors, 0 to the new process, -+ and the process ID of the new process to the old process. */ -+ -+ENTRY (__vfork) -+ -+ -+ li.d a0, 0x4111 /* CLONE_VM | CLONE_VFORK | SIGCHLD */ -+ add.d a1, zero, sp -+ -+ /* Do the system call. */ -+ li.d a7, __NR_clone -+ syscall 0 -+ -+ blt a0, zero ,L (error) -+ -+ ret -+ -+L (error): -+ b __syscall_error -+ END (__vfork) -+ -+libc_hidden_def (__vfork) -+ -+weak_alias (__vfork, vfork) -+strong_alias (__vfork, __libc_vfork) --- -2.39.3 - diff --git a/glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch b/glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch deleted file mode 100644 index b6fbf73..0000000 --- a/glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch +++ /dev/null @@ -1,183 +0,0 @@ -From b9f145df85145506f8e61bac38b792584a38d88f Mon Sep 17 00:00:00 2001 -From: Krzysztof Koch -Date: Tue, 5 Nov 2019 17:35:18 +0000 -Subject: [PATCH 02/14] aarch64: Increase small and medium cases for - __memcpy_generic - -Increase the upper bound on medium cases from 96 to 128 bytes. -Now, up to 128 bytes are copied unrolled. - -Increase the upper bound on small cases from 16 to 32 bytes so that -copies of 17-32 bytes are not impacted by the larger medium case. - -Benchmarking: -The attached figures show relative timing difference with respect -to 'memcpy_generic', which is the existing implementation. -'memcpy_med_128' denotes the the version of memcpy_generic with -only the medium case enlarged. The 'memcpy_med_128_small_32' numbers -are for the version of memcpy_generic submitted in this patch, which -has both medium and small cases enlarged. The figures were generated -using the script from: -https://www.sourceware.org/ml/libc-alpha/2019-10/msg00563.html - -Depending on the platform, the performance improvement in the -bench-memcpy-random.c benchmark ranges from 6% to 20% between -the original and final version of memcpy.S - -Tested against GLIBC testsuite and randomized tests. ---- - sysdeps/aarch64/memcpy.S | 82 +++++++++++++++++++++++----------------- - 1 file changed, 47 insertions(+), 35 deletions(-) - -diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S -index 6e4f4a74bd..10801aa0f4 100644 ---- a/sysdeps/aarch64/memcpy.S -+++ b/sysdeps/aarch64/memcpy.S -@@ -41,17 +41,19 @@ - #define C_h x11 - #define D_l x12 - #define D_h x13 --#define E_l src --#define E_h count --#define F_l srcend --#define F_h dst -+#define E_l x14 -+#define E_h x15 -+#define F_l x16 -+#define F_h x17 - #define G_l count - #define G_h dst -+#define H_l src -+#define H_h srcend - #define tmp1 x14 - --/* Copies are split into 3 main cases: small copies of up to 16 bytes, -- medium copies of 17..96 bytes which are fully unrolled. Large copies -- of more than 96 bytes align the destination and use an unrolled loop -+/* Copies are split into 3 main cases: small copies of up to 32 bytes, -+ medium copies of 33..128 bytes which are fully unrolled. Large copies -+ of more than 128 bytes align the destination and use an unrolled loop - processing 64 bytes per iteration. - In order to share code with memmove, small and medium copies read all - data before writing, allowing any kind of overlap. 
So small, medium -@@ -73,7 +75,7 @@ ENTRY_ALIGN (MEMMOVE, 6) - DELOUSE (2) - - sub tmp1, dstin, src -- cmp count, 96 -+ cmp count, 128 - ccmp tmp1, count, 2, hi - b.lo L(move_long) - -@@ -89,31 +91,39 @@ ENTRY (MEMCPY) - prfm PLDL1KEEP, [src] - add srcend, src, count - add dstend, dstin, count -- cmp count, 16 -- b.ls L(copy16) -- cmp count, 96 -+ cmp count, 32 -+ b.ls L(copy32) -+ cmp count, 128 - b.hi L(copy_long) - -- /* Medium copies: 17..96 bytes. */ -- sub tmp1, count, 1 -+ /* Medium copies: 33..128 bytes. */ - ldp A_l, A_h, [src] -- tbnz tmp1, 6, L(copy96) -- ldp D_l, D_h, [srcend, -16] -- tbz tmp1, 5, 1f - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [srcend, -32] -+ ldp D_l, D_h, [srcend, -16] -+ cmp count, 64 -+ b.hi L(copy128) -+ stp A_l, A_h, [dstin] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] --1: -- stp A_l, A_h, [dstin] - stp D_l, D_h, [dstend, -16] - ret - - .p2align 4 -- /* Small copies: 0..16 bytes. */ --L(copy16): -- cmp count, 8 -+ /* Small copies: 0..32 bytes. */ -+L(copy32): -+ /* 16-32 bytes. */ -+ cmp count, 16 - b.lo 1f -+ ldp A_l, A_h, [src] -+ ldp B_l, B_h, [srcend, -16] -+ stp A_l, A_h, [dstin] -+ stp B_l, B_h, [dstend, -16] -+ ret -+ .p2align 4 -+1: -+ /* 8-15 bytes. */ -+ tbz count, 3, 1f - ldr A_l, [src] - ldr A_h, [srcend, -8] - str A_l, [dstin] -@@ -121,6 +131,7 @@ L(copy16): - ret - .p2align 4 - 1: -+ /* 4-7 bytes. */ - tbz count, 2, 1f - ldr A_lw, [src] - ldr A_hw, [srcend, -4] -@@ -142,24 +153,25 @@ L(copy16): - 2: ret - - .p2align 4 -- /* Copy 64..96 bytes. Copy 64 bytes from the start and -- 32 bytes from the end. */ --L(copy96): -- ldp B_l, B_h, [src, 16] -- ldp C_l, C_h, [src, 32] -- ldp D_l, D_h, [src, 48] -- ldp E_l, E_h, [srcend, -32] -- ldp F_l, F_h, [srcend, -16] -+ /* Copy 65..128 bytes. Copy 64 bytes from the start and -+ 64 bytes from the end. */ -+L(copy128): -+ ldp E_l, E_h, [src, 32] -+ ldp F_l, F_h, [src, 48] -+ ldp G_l, G_h, [srcend, -64] -+ ldp H_l, H_h, [srcend, -48] - stp A_l, A_h, [dstin] - stp B_l, B_h, [dstin, 16] -- stp C_l, C_h, [dstin, 32] -- stp D_l, D_h, [dstin, 48] -- stp E_l, E_h, [dstend, -32] -- stp F_l, F_h, [dstend, -16] -+ stp E_l, E_h, [dstin, 32] -+ stp F_l, F_h, [dstin, 48] -+ stp G_l, G_h, [dstend, -64] -+ stp H_l, H_h, [dstend, -48] -+ stp C_l, C_h, [dstend, -32] -+ stp D_l, D_h, [dstend, -16] - ret - - /* Align DST to 16 byte alignment so that we don't cross cache line -- boundaries on both loads and stores. There are at least 96 bytes -+ boundaries on both loads and stores. There are at least 128 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ - -@@ -215,7 +227,7 @@ L(move_long): - add dstend, dstin, count - - /* Align dstend to 16 byte alignment so that we don't cross cache line -- boundaries on both loads and stores. There are at least 96 bytes -+ boundaries on both loads and stores. There are at least 128 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ - --- -2.39.3 - diff --git a/glibc-elf-Align-argument-of-__munmap-to-page-size-BZ-28676-3.patch b/glibc-elf-Align-argument-of-__munmap-to-page-size-BZ-28676-3.patch deleted file mode 100644 index 11417e6..0000000 --- a/glibc-elf-Align-argument-of-__munmap-to-page-size-BZ-28676-3.patch +++ /dev/null @@ -1,36 +0,0 @@ -From bf126f79dff0370d1e52ef8193da7fd593c37833 Mon Sep 17 00:00:00 2001 -From: "H.J. 
Lu" -Date: Wed, 19 Jul 2023 23:10:48 +0800 -Subject: [PATCH 4/6] elf: Align argument of __munmap to page size [BZ #28676] - -On Linux/x86-64, for elf/tst-align3, we now get - -munmap(0x7f88f9401000, 1126424) = 0 - -instead of - -munmap(0x7f1615200018, 544768) = -1 EINVAL (Invalid argument) - -Backport from master commit: fd6062e - -Reviewed-by: Florian Weimer -Signed-off-by: Rongwei Wang ---- - elf/dl-map-segments.h | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/elf/dl-map-segments.h b/elf/dl-map-segments.h -index 61ba04cd..f1f7ad88 100644 ---- a/elf/dl-map-segments.h -+++ b/elf/dl-map-segments.h -@@ -55,6 +55,7 @@ _dl_map_segment (const struct loadcmd *c, ElfW(Addr) mappref, - if (delta) - __munmap ((void *) map_start, delta); - ElfW(Addr) map_end = map_start_aligned + maplength; -+ map_end = ALIGN_UP (map_end, GLRO(dl_pagesize)); - delta = map_start + maplen - map_end; - if (delta) - __munmap ((void *) map_end, delta); --- -2.27.0 - diff --git a/glibc-elf-Fix-tst-align3.patch b/glibc-elf-Fix-tst-align3.patch deleted file mode 100644 index 194d142..0000000 --- a/glibc-elf-Fix-tst-align3.patch +++ /dev/null @@ -1,40 +0,0 @@ -From 8b39d3b4bf2fc49ab31f31cf30aa80104afa3432 Mon Sep 17 00:00:00 2001 -From: Adhemerval Zanella -Date: Wed, 19 Jul 2023 23:14:33 +0800 -Subject: [PATCH 6/6] elf: Fix tst-align3 - -The elf/tst-align3.c declares the function using a wrong prototype. - -Checked on aarch64-linux-gnu. - -Signed-off-by: Rongwei Wang ---- - elf/tst-align3.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/elf/tst-align3.c b/elf/tst-align3.c -index 87a8ff81..731dd59f 100644 ---- a/elf/tst-align3.c -+++ b/elf/tst-align3.c -@@ -22,7 +22,7 @@ - - int bar __attribute__ ((aligned (ALIGN))) = 1; - --extern int do_load_test (void); -+extern void do_load_test (void); - - static int - do_test (void) -@@ -30,7 +30,8 @@ do_test (void) - printf ("bar: %p\n", &bar); - TEST_VERIFY (is_aligned (&bar, ALIGN) == 0); - -- return do_load_test (); -+ do_load_test (); -+ return 0; - } - - #include --- -2.27.0 - diff --git a/glibc-elf-Properly-align-PT_LOAD-segments-BZ-28676-1.patch b/glibc-elf-Properly-align-PT_LOAD-segments-BZ-28676-1.patch deleted file mode 100644 index a1b6c6b..0000000 --- a/glibc-elf-Properly-align-PT_LOAD-segments-BZ-28676-1.patch +++ /dev/null @@ -1,137 +0,0 @@ -From fe5893121176136b0ae3a5f9198536feeb6f64f8 Mon Sep 17 00:00:00 2001 -From: Rongwei Wang -Date: Wed, 19 Jul 2023 23:05:39 +0800 -Subject: [PATCH 2/6] elf: Properly align PT_LOAD segments [BZ #28676] - -When PT_LOAD segment alignment > the page size, allocate enough space to -ensure that the segment can be properly aligned. This change helps code -segments use huge pages become simple and available. - -This fixes [BZ #28676]. - -Backport from master commit: 718fdd8 - -Signed-off-by: Xu Yu -Signed-off-by: Rongwei Wang ---- - elf/dl-load.c | 2 ++ - elf/dl-load.h | 3 ++- - elf/dl-map-segments.h | 50 +++++++++++++++++++++++++++++++++++++++---- - 3 files changed, 50 insertions(+), 5 deletions(-) - -diff --git a/elf/dl-load.c b/elf/dl-load.c -index 0b45e6e3..132e4233 100644 ---- a/elf/dl-load.c -+++ b/elf/dl-load.c -@@ -1,5 +1,6 @@ - /* Map in a shared object's segments from the file. - Copyright (C) 1995-2018 Free Software Foundation, Inc. -+ Copyright The GNU Toolchain Authors. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or -@@ -1076,6 +1077,7 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd, - c->mapend = ALIGN_UP (ph->p_vaddr + ph->p_filesz, GLRO(dl_pagesize)); - c->dataend = ph->p_vaddr + ph->p_filesz; - c->allocend = ph->p_vaddr + ph->p_memsz; -+ c->mapalign = ph->p_align; - c->mapoff = ALIGN_DOWN (ph->p_offset, GLRO(dl_pagesize)); - - /* Determine whether there is a gap between the last segment -diff --git a/elf/dl-load.h b/elf/dl-load.h -index 66ea2e92..d9f648ea 100644 ---- a/elf/dl-load.h -+++ b/elf/dl-load.h -@@ -1,5 +1,6 @@ - /* Map in a shared object's segments from the file. - Copyright (C) 1995-2018 Free Software Foundation, Inc. -+ Copyright The GNU Toolchain Authors. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or -@@ -74,7 +75,7 @@ ELF_PREFERRED_ADDRESS_DATA; - Its details have been expanded out and converted. */ - struct loadcmd - { -- ElfW(Addr) mapstart, mapend, dataend, allocend; -+ ElfW(Addr) mapstart, mapend, dataend, allocend, mapalign; - ElfW(Off) mapoff; - int prot; /* PROT_* bits. */ - }; -diff --git a/elf/dl-map-segments.h b/elf/dl-map-segments.h -index 084076a2..61ba04cd 100644 ---- a/elf/dl-map-segments.h -+++ b/elf/dl-map-segments.h -@@ -1,5 +1,6 @@ - /* Map in a shared object's segments. Generic version. - Copyright (C) 1995-2018 Free Software Foundation, Inc. -+ Copyright The GNU Toolchain Authors. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or -@@ -18,6 +19,50 @@ - - #include - -+/* Map a segment and align it properly. */ -+ -+static __always_inline ElfW(Addr) -+_dl_map_segment (const struct loadcmd *c, ElfW(Addr) mappref, -+ const size_t maplength, int fd) -+{ -+ if (__glibc_likely (c->mapalign <= GLRO(dl_pagesize))) -+ return (ElfW(Addr)) __mmap ((void *) mappref, maplength, c->prot, -+ MAP_COPY|MAP_FILE, fd, c->mapoff); -+ -+ /* If the segment alignment > the page size, allocate enough space to -+ ensure that the segment can be properly aligned. */ -+ ElfW(Addr) maplen = (maplength >= c->mapalign -+ ? (maplength + c->mapalign) -+ : (2 * c->mapalign)); -+ ElfW(Addr) map_start = (ElfW(Addr)) __mmap ((void *) mappref, maplen, -+ PROT_NONE, -+ MAP_ANONYMOUS|MAP_PRIVATE, -+ -1, 0); -+ if (__glibc_unlikely ((void *) map_start == MAP_FAILED)) -+ return map_start; -+ -+ ElfW(Addr) map_start_aligned = ALIGN_UP (map_start, c->mapalign); -+ map_start_aligned = (ElfW(Addr)) __mmap ((void *) map_start_aligned, -+ maplength, c->prot, -+ MAP_COPY|MAP_FILE|MAP_FIXED, -+ fd, c->mapoff); -+ if (__glibc_unlikely ((void *) map_start_aligned == MAP_FAILED)) -+ __munmap ((void *) map_start, maplen); -+ else -+ { -+ /* Unmap the unused regions. */ -+ ElfW(Addr) delta = map_start_aligned - map_start; -+ if (delta) -+ __munmap ((void *) map_start, delta); -+ ElfW(Addr) map_end = map_start_aligned + maplength; -+ delta = map_start + maplen - map_end; -+ if (delta) -+ __munmap ((void *) map_end, delta); -+ } -+ -+ return map_start_aligned; -+} -+ - /* This implementation assumes (as does the corresponding implementation - of _dl_unmap_segments, in dl-unmap-segments.h) that shared objects - are always laid out with all segments contiguous (or with gaps -@@ -53,10 +98,7 @@ _dl_map_segments (struct link_map *l, int fd, - - MAP_BASE_ADDR (l)); - - /* Remember which part of the address space this object uses. 
*/ -- l->l_map_start = (ElfW(Addr)) __mmap ((void *) mappref, maplength, -- c->prot, -- MAP_COPY|MAP_FILE, -- fd, c->mapoff); -+ l->l_map_start = _dl_map_segment (c, mappref, maplength, fd); - if (__glibc_unlikely ((void *) l->l_map_start == MAP_FAILED)) - return DL_MAP_SEGMENTS_ERROR_MAP_SEGMENT; - --- -2.27.0 - diff --git a/glibc.spec b/glibc.spec index b5eb36a..91e70d1 100644 --- a/glibc.spec +++ b/glibc.spec @@ -1,7 +1,6 @@ -%global anolis_release .0.1 %define glibcsrcdir glibc-2.28 %define glibcversion 2.28 -%define glibcrelease 236%{anolis_release}%{?dist} +%define glibcrelease 251%{?dist} # Pre-release tarballs are pulled in from git using a command that is # effectively: # @@ -133,7 +132,7 @@ end \ Summary: The GNU libc libraries Name: glibc Version: %{glibcversion} -Release: %{glibcrelease}.13 +Release: %{glibcrelease}.2 # In general, GPLv2+ is used by programs, LGPLv2+ is used for # libraries. @@ -1054,40 +1053,142 @@ Patch865: glibc-RHEL-2435.patch Patch866: glibc-RHEL-2435-2.patch Patch867: glibc-RHEL-2423.patch Patch868: glibc-RHEL-3036.patch -Patch869: glibc-RHEL-21522-1.patch -Patch870: glibc-RHEL-21522-2.patch -Patch871: glibc-RHEL-21522-3.patch -Patch872: glibc-RHEL-21522-4.patch -Patch873: glibc-RHEL-21519.patch -Patch874: glibc-RHEL-22441.patch -Patch875: glibc-RHEL-22846.patch -Patch876: glibc-RHEL-22847.patch +Patch869: glibc-RHEL-3757.patch +Patch870: glibc-RHEL-2122.patch +Patch871: glibc-RHEL-1192.patch +Patch872: glibc-RHEL-3639.patch +Patch873: glibc-RHEL-10481.patch +Patch874: glibc-RHEL-13720-1.patch +Patch875: glibc-RHEL-13720-2.patch +Patch876: glibc-RHEL-15867.patch +Patch877: glibc-RHEL-16825-1.patch +Patch878: glibc-RHEL-16825-2.patch +Patch879: glibc-RHEL-16825-3.patch +Patch880: glibc-RHEL-16825-4.patch +Patch881: glibc-RHEL-15696-1.patch +Patch882: glibc-RHEL-15696-2.patch +Patch883: glibc-RHEL-15696-3.patch +Patch884: glibc-RHEL-15696-4.patch +Patch885: glibc-RHEL-15696-5.patch +Patch886: glibc-RHEL-15696-6.patch +Patch887: glibc-RHEL-15696-7.patch +Patch888: glibc-RHEL-15696-8.patch +Patch889: glibc-RHEL-15696-9.patch +Patch890: glibc-RHEL-15696-10.patch +Patch891: glibc-RHEL-15696-11.patch +Patch892: glibc-RHEL-15696-12.patch +Patch893: glibc-RHEL-15696-13.patch +Patch894: glibc-RHEL-15696-14.patch +Patch895: glibc-RHEL-15696-15.patch +Patch896: glibc-RHEL-15696-16.patch +Patch897: glibc-RHEL-15696-17.patch +Patch898: glibc-RHEL-15696-18.patch +Patch899: glibc-RHEL-15696-19.patch +Patch900: glibc-RHEL-15696-20.patch +Patch901: glibc-RHEL-15696-21.patch +Patch902: glibc-RHEL-15696-22.patch +Patch903: glibc-RHEL-15696-23.patch +Patch904: glibc-RHEL-15696-24.patch +Patch905: glibc-RHEL-15696-25.patch +Patch906: glibc-RHEL-15696-26.patch +Patch907: glibc-RHEL-15696-27.patch +Patch908: glibc-RHEL-15696-28.patch +Patch909: glibc-RHEL-15696-29.patch +Patch910: glibc-RHEL-15696-30.patch +Patch911: glibc-RHEL-15696-31.patch +Patch912: glibc-RHEL-15696-32.patch +Patch913: glibc-RHEL-15696-33.patch +Patch914: glibc-RHEL-15696-34.patch +Patch915: glibc-RHEL-15696-35.patch +Patch916: glibc-RHEL-15696-36.patch +Patch917: glibc-RHEL-15696-37.patch +Patch918: glibc-RHEL-15696-38.patch +Patch919: glibc-RHEL-15696-39.patch +Patch920: glibc-RHEL-15696-40.patch +Patch921: glibc-RHEL-15696-41.patch +Patch922: glibc-RHEL-15696-42.patch +Patch923: glibc-RHEL-15696-43.patch +Patch924: glibc-RHEL-15696-44.patch +Patch925: glibc-RHEL-15696-45.patch +Patch926: glibc-RHEL-15696-46.patch +Patch927: glibc-RHEL-15696-47.patch +Patch928: glibc-RHEL-15696-48.patch +Patch929: 
glibc-RHEL-15696-49.patch +Patch930: glibc-RHEL-15696-50.patch +Patch931: glibc-RHEL-15696-51.patch +Patch932: glibc-RHEL-15696-52.patch +Patch933: glibc-RHEL-15696-53.patch +Patch934: glibc-RHEL-15696-54.patch +Patch935: glibc-RHEL-15696-55.patch +Patch936: glibc-RHEL-15696-56.patch +Patch937: glibc-RHEL-15696-57.patch +Patch938: glibc-RHEL-15696-58.patch +Patch939: glibc-RHEL-15696-59.patch +Patch940: glibc-RHEL-15696-60.patch +Patch941: glibc-RHEL-15696-61.patch +Patch942: glibc-RHEL-15696-62.patch +Patch943: glibc-RHEL-15696-63.patch +Patch944: glibc-RHEL-15696-64.patch +Patch945: glibc-RHEL-15696-65.patch +Patch946: glibc-RHEL-15696-66.patch +Patch947: glibc-RHEL-15696-67.patch +Patch948: glibc-RHEL-15696-68.patch +Patch949: glibc-RHEL-15696-69.patch +Patch950: glibc-RHEL-15696-70.patch +Patch951: glibc-RHEL-15696-71.patch +Patch952: glibc-RHEL-15696-72.patch +Patch953: glibc-RHEL-15696-73.patch +Patch954: glibc-RHEL-15696-74.patch +Patch955: glibc-RHEL-15696-75.patch +Patch956: glibc-RHEL-15696-76.patch +Patch957: glibc-RHEL-15696-77.patch +Patch958: glibc-RHEL-15696-78.patch +Patch959: glibc-RHEL-15696-79.patch +Patch960: glibc-RHEL-15696-80.patch +Patch961: glibc-RHEL-15696-81.patch +Patch962: glibc-RHEL-15696-82.patch +Patch963: glibc-RHEL-15696-83.patch +Patch964: glibc-RHEL-15696-84.patch +Patch965: glibc-RHEL-15696-85.patch +Patch966: glibc-RHEL-15696-86.patch +Patch967: glibc-RHEL-15696-87.patch +Patch968: glibc-RHEL-15696-88.patch +Patch969: glibc-RHEL-15696-89.patch +Patch970: glibc-RHEL-15696-90.patch +Patch971: glibc-RHEL-15696-91.patch +Patch972: glibc-RHEL-15696-92.patch +Patch973: glibc-RHEL-15696-93.patch +Patch974: glibc-RHEL-15696-94.patch +Patch975: glibc-RHEL-15696-95.patch +Patch976: glibc-RHEL-15696-96.patch +Patch977: glibc-RHEL-15696-97.patch +Patch978: glibc-RHEL-15696-98.patch +Patch979: glibc-RHEL-15696-99.patch +Patch980: glibc-RHEL-15696-100.patch +Patch981: glibc-RHEL-15696-101.patch +Patch982: glibc-RHEL-15696-102.patch +Patch983: glibc-RHEL-15696-103.patch +Patch984: glibc-RHEL-15696-104.patch +Patch985: glibc-RHEL-15696-105.patch +Patch986: glibc-RHEL-15696-106.patch +Patch987: glibc-RHEL-15696-107.patch +Patch988: glibc-RHEL-15696-108.patch +Patch989: glibc-RHEL-15696-109.patch +Patch990: glibc-RHEL-15696-110.patch +Patch991: glibc-RHEL-17468-1.patch +Patch992: glibc-RHEL-17468-2.patch +Patch993: glibc-RHEL-19824.patch +Patch994: glibc-RHEL-3010-1.patch +Patch995: glibc-RHEL-3010-2.patch +Patch996: glibc-RHEL-3010-3.patch +Patch997: glibc-RHEL-19445.patch +Patch998: glibc-RHEL-21997.patch +Patch999: glibc-RHEL-31804.patch +Patch1000: glibc-RHEL-34264.patch +Patch1001: glibc-RHEL-34267-1.patch +Patch1002: glibc-RHEL-34267-2.patch +Patch1003: glibc-RHEL-34273.patch -Patch2000: glibc-Properly-check-stack-alignment-BZ-27901.patch -Patch2001: glibc-elf-Properly-align-PT_LOAD-segments-BZ-28676-1.patch -Patch2002: glibc-Add-a-testcase-to-check-alignment-of-PT_LOAD-segment-2.patch -Patch2003: glibc-elf-Align-argument-of-__munmap-to-page-size-BZ-28676-3.patch -Patch2004: glibc-Support-target-specific-ALIGN-for-variable-alignment-4.patch -Patch2005: glibc-elf-Fix-tst-align3.patch - -Patch2006: glibc-Sync-to-lnd-35-for-LoongArch.patch -Patch2007: Fix-tst-cancel21.c-to-suit-kernel-struct-sigcontext-.patch -Patch2008: glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch -Patch2009: glibc-Add-Hygon-Support.patch -Patch2010: glibc-2.28-Remove-unseless-ANDROID_CHANGES-and-relat.patch -Patch2011: glibc-2.28-use-new-macro-LEAF-and-ENTRY-and-modify-r.patch 
-Patch2012: glibc-2.28-Fix-ifunc-str-mem-functions-xfail-problem.patch -Patch2013: glibc-2.28-Add-run-one-test-convenience-target-and-m.patch -Patch2014: glibc-2.28-remove-ABILPX32-related-code.patch -Patch2015: glibc-2.28-Refactor-code-of-raw-mem-functions.patch -Patch2016: glibc-2.28-Refactor-code-of-st-r-p-functions.patch -Patch2017: glibc-2.28-Add-new-struct-user_fp_state-in-user.h.patch -Patch2018: glibc-2.28-Redefine-macro-LEAF-ENTRY.patch -Patch2019: glibc-2.28-config-Added-HAVE_LOONGARCH_VEC_ASM.patch -Patch2020: glibc-2.28-Add-macro-defination-of-lasx-lsx-and-fcc-.patch -Patch2021: glibc-2.28-Refactor-code-and-fix-bug-in-_dl_runtime_.patch -Patch2022: glibc-2.28-Remove-useless-IS_LA-264-364-464-and-IS_L.patch -Patch2023: glibc-2.28-Use-RTLD_SUPPORT_-LSX-LASX-to-choose-_dl_.patch ############################################################################## # Continued list of core "glibc" package information: ############################################################################## @@ -1767,11 +1868,6 @@ build() %ifarch %{ix86} --disable-multi-arch \ %endif -%ifarch loongarch64 - --enable-stackguard-randomization \ - --with-selinux \ - --enable-shared \ -%endif %if %{without werror} --disable-werror \ %endif @@ -1951,6 +2047,7 @@ gzip -9nvf %{glibc_sysroot}%{_infodir}/libc* # Copy the debugger interface documentation over to the right location mkdir -p %{glibc_sysroot}%{_docdir}/glibc cp elf/rtld-debugger-interface.txt %{glibc_sysroot}%{_docdir}/glibc +cp posix/gai.conf %{glibc_sysroot}%{_docdir}/glibc %else rm -f %{glibc_sysroot}%{_infodir}/dir rm -f %{glibc_sysroot}%{_infodir}/libc.info* @@ -2922,30 +3019,59 @@ fi %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared %changelog -* Sat May 11 2024 Peng Fan - 2.28-236.0.1.13 -- Sync loongarch64 code to lnd.36. +* Fri Apr 26 2024 Florian Weimer - 2.28-251.2 +- CVE-2024-33599: nscd: buffer overflow in netgroup cache (RHEL-34264) +- CVE-2024-33600: nscd: null pointer dereferences in netgroup cache (RHEL-34267) +- CVE-2024-33601: nscd: crash on out-of-memory condition (RHEL-34271) +- CVE-2024-33602: nscd: memory corruption with NSS netgroup modules (RHEL-34273) -* Mon May 06 2024 Rongwei Wang - 2.28-236.0.1.12 -- elf: Properly align PT_LOAD segments -- Sync loongarch64 code to lnd.35. 
(lixing@loongson.cn) -- Add patch for gb18030-2022 from upstream bug#30243 (fundawang@yeah.net) -- aarch64: Increase small and medium cases for __memcpy_generic (bug#7060) (Kaiqiang Wang) -- Add Hygon Support (Jing Li) +* Mon Apr 15 2024 Florian Weimer - 2.28-251.1 +- CVE-2024-2961: Out of bounds write in iconv conversion to ISO-2022-CN-EXT (RHEL-31804) -* Mon Jan 29 2024 Florian Weimer - 2.28-236.12 -- Re-enable output buffering for wide stdio streams (RHEL-22847) +* Thu Jan 18 2024 Florian Weimer - 2.28-251 +- Cache information in x86_64 ld.so --list-diagnostics output (RHEL-21997) -* Mon Jan 29 2024 Florian Weimer - 2.28-236.11 -- Avoid lazy binding failures during dlclose (RHEL-22846) +* Wed Jan 10 2024 Arjun Shankar - 2.28-250 +- getaddrinfo: Return correct error EAI_MEMORY when out-of-memory (RHEL-19445) -* Fri Jan 26 2024 Florian Weimer - 2.28-236.10 -- nscd: Skip unusable entries in first pass in prune_cache (RHEL-22441) +* Wed Jan 3 2024 Florian Weimer - 2.28-249 +- Updates for AMD cache size computation (RHEL-3010) -* Fri Jan 26 2024 Florian Weimer - 2.28-236.9 -- Fix force-first handling in dlclose (RHEL-21519) +* Tue Jan 2 2024 Florian Weimer - 2.28-248 +- Re-enable output buffering for wide stdio streams (RHEL-19824) -* Fri Jan 26 2024 Florian Weimer - 2.28-236.8 -- Improve compatibility between underlinking and IFUNC resolvers (RHEL-21522) +* Thu Dec 21 2023 Carlos O'Donell - 2.28-247 +- Fix TLS corruption during dlopen()/dlclose() sequences (RHEL-17468) + +* Thu Dec 14 2023 DJ Delorie - 2.28-246 +- Include CentOS Hyperscaler SIG patches backported by Intel (RHEL-15696) + +* Fri Dec 8 2023 Florian Weimer - 2.28-245 +- Improve compatibility between underlinking and IFUNC resolvers (RHEL-16825) + +* Fri Nov 24 2023 Florian Weimer - 2.28-244 +- Restore compatibility with C90 compilers (RHEL-15867) + +* Tue Nov 21 2023 Florian Weimer - 2.28-243 +- ldconfig should skip temporary files created by RPM (RHEL-13720) + +* Mon Nov 20 2023 Florian Weimer - 2.28-242 +- Fix force-first handling in dlclose (RHEL-10481) + +* Fri Nov 10 2023 Florian Weimer - 2.28-241 +- Avoid lazy binding failures during dlclose (RHEL-3639) + +* Tue Oct 24 2023 Arjun Shankar - 2.28-240 +- Add /usr/share/doc/glibc/gai.conf to glibc-doc (RHEL-12894) + +* Fri Oct 20 2023 Florian Weimer - 2.28-239 +- nscd: Skip unusable entries in first pass in prune_cache (RHEL-1192) + +* Mon Oct 16 2023 DJ Delorie - 2.28-238 +- Fix slow tls access after dlopen (RHEL-2122) + +* Mon Oct 16 2023 Arjun Shankar - 2.28-237 +- Enable running a single test from the testsuite (RHEL-3757) * Wed Sep 20 2023 Siddhesh Poyarekar - 2.28-236.7 - CVE-2023-4911 glibc: buffer overflow in ld.so leading to privilege escalation (RHEL-3036)