diff --git a/Fix-tst-cancel21.c-to-suit-kernel-struct-sigcontext-.patch b/Fix-tst-cancel21.c-to-suit-kernel-struct-sigcontext-.patch deleted file mode 100644 index 62b5dab..0000000 --- a/Fix-tst-cancel21.c-to-suit-kernel-struct-sigcontext-.patch +++ /dev/null @@ -1,34 +0,0 @@ -From c5de7c407853b807e8d0c764e6325bb1311f39cd Mon Sep 17 00:00:00 2001 -From: Xing Li -Date: Tue, 4 Jul 2023 15:10:03 +0800 -Subject: [PATCH 2/2] Fix tst-cancel21.c to suit kernel struct sigcontext - change. * nptl/tst-cancel21.c - ---- - nptl/tst-cancel21.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/nptl/tst-cancel21.c b/nptl/tst-cancel21.c -index b10fdbc1..a3653f21 100644 ---- a/nptl/tst-cancel21.c -+++ b/nptl/tst-cancel21.c -@@ -217,14 +217,14 @@ static int - do_test (void) - { - stack_t ss; -- ss.ss_sp = malloc (2 * SIGSTKSZ); -+ ss.ss_sp = malloc (4 * SIGSTKSZ); - if (ss.ss_sp == NULL) - { - puts ("failed to allocate alternate stack"); - return 1; - } - ss.ss_flags = 0; -- ss.ss_size = 2 * SIGSTKSZ; -+ ss.ss_size = 4 * SIGSTKSZ; - if (sigaltstack (&ss, NULL) < 0) - { - printf ("sigaltstack failed %m\n"); --- -2.27.0 - diff --git a/dist b/dist index 37a6f9c..1fe92cf 100644 --- a/dist +++ b/dist @@ -1 +1 @@ -an8_9 +an8_10 diff --git a/glibc-2.28-Add-macro-defination-of-lasx-lsx-and-fcc-.patch b/glibc-2.28-Add-macro-defination-of-lasx-lsx-and-fcc-.patch deleted file mode 100644 index 86f142d..0000000 --- a/glibc-2.28-Add-macro-defination-of-lasx-lsx-and-fcc-.patch +++ /dev/null @@ -1,3946 +0,0 @@ -From d97d963796b092b9c0bd4712f992a08dd20bf5ed Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Tue, 11 Jul 2023 15:40:15 +0800 -Subject: [PATCH 11/14] glibc-2.28: Add macro defination of lasx lsx and fcc - registers. - -Change-Id: Ic723521775a0133e25bf1d568c588f930ec5ff49 -Signed-off-by: ticat_fp ---- - sysdeps/loongarch/dl-trampoline.h | 64 +-- - .../loongarch/lp64/multiarch/memchr-lasx.S | 74 +-- - sysdeps/loongarch/lp64/multiarch/memchr-lsx.S | 48 +- - .../loongarch/lp64/multiarch/memcmp-lasx.S | 138 +++--- - sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S | 194 ++++---- - .../loongarch/lp64/multiarch/memmove-lasx.S | 160 +++---- - .../loongarch/lp64/multiarch/memmove-lsx.S | 424 +++++++++--------- - .../loongarch/lp64/multiarch/memrchr-lasx.S | 74 +-- - .../loongarch/lp64/multiarch/memrchr-lsx.S | 48 +- - .../loongarch/lp64/multiarch/memset-lasx.S | 64 +-- - sysdeps/loongarch/lp64/multiarch/memset-lsx.S | 62 +-- - .../loongarch/lp64/multiarch/rawmemchr-lasx.S | 30 +- - .../loongarch/lp64/multiarch/rawmemchr-lsx.S | 30 +- - sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S | 114 ++--- - .../loongarch/lp64/multiarch/strchr-lasx.S | 52 +-- - sysdeps/loongarch/lp64/multiarch/strchr-lsx.S | 30 +- - sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 114 ++--- - sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 112 ++--- - .../loongarch/lp64/multiarch/strlen-lasx.S | 24 +- - sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 30 +- - .../loongarch/lp64/multiarch/strncmp-lsx.S | 144 +++--- - .../loongarch/lp64/multiarch/strnlen-lasx.S | 46 +- - .../loongarch/lp64/multiarch/strnlen-lsx.S | 30 +- - .../loongarch/lp64/multiarch/strrchr-lasx.S | 88 ++-- - .../loongarch/lp64/multiarch/strrchr-lsx.S | 56 +-- - sysdeps/loongarch/lp64/s_cosf.S | 4 +- - sysdeps/loongarch/lp64/s_sinf.S | 4 +- - sysdeps/loongarch/sys/regdef.h | 74 +++ - 28 files changed, 1203 insertions(+), 1129 deletions(-) - -diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h -index fb15983f..96f41f1d 100644 ---- 
a/sysdeps/loongarch/dl-trampoline.h -+++ b/sysdeps/loongarch/dl-trampoline.h -@@ -61,23 +61,23 @@ ENTRY (_dl_runtime_resolve, 3) - FREG_S fa6, sp, 10*SZREG + 6*SZFREG - FREG_S fa7, sp, 10*SZREG + 7*SZFREG - #ifdef USE_LASX -- xvst $xr0, sp, 10*SZREG + 0*256 -- xvst $xr1, sp, 10*SZREG + 1*256 -- xvst $xr2, sp, 10*SZREG + 2*256 -- xvst $xr3, sp, 10*SZREG + 3*256 -- xvst $xr4, sp, 10*SZREG + 4*256 -- xvst $xr5, sp, 10*SZREG + 5*256 -- xvst $xr6, sp, 10*SZREG + 6*256 -- xvst $xr7, sp, 10*SZREG + 7*256 -+ xvst xr0, sp, 10*SZREG + 0*256 -+ xvst xr1, sp, 10*SZREG + 1*256 -+ xvst xr2, sp, 10*SZREG + 2*256 -+ xvst xr3, sp, 10*SZREG + 3*256 -+ xvst xr4, sp, 10*SZREG + 4*256 -+ xvst xr5, sp, 10*SZREG + 5*256 -+ xvst xr6, sp, 10*SZREG + 6*256 -+ xvst xr7, sp, 10*SZREG + 7*256 - #elif defined USE_LSX -- vst $vr0, sp, 10*SZREG + 0*128 -- vst $vr1, sp, 10*SZREG + 1*128 -- vst $vr2, sp, 10*SZREG + 2*128 -- vst $vr3, sp, 10*SZREG + 3*128 -- vst $vr4, sp, 10*SZREG + 4*128 -- vst $vr5, sp, 10*SZREG + 5*128 -- vst $vr6, sp, 10*SZREG + 6*128 -- vst $vr7, sp, 10*SZREG + 7*128 -+ vst vr0, sp, 10*SZREG + 0*128 -+ vst vr1, sp, 10*SZREG + 1*128 -+ vst vr2, sp, 10*SZREG + 2*128 -+ vst vr3, sp, 10*SZREG + 3*128 -+ vst vr4, sp, 10*SZREG + 4*128 -+ vst vr5, sp, 10*SZREG + 5*128 -+ vst vr6, sp, 10*SZREG + 6*128 -+ vst vr7, sp, 10*SZREG + 7*128 - #endif - #endif - -@@ -119,23 +119,23 @@ ENTRY (_dl_runtime_resolve, 3) - FREG_L fa6, sp, 10*SZREG + 6*SZFREG - FREG_L fa7, sp, 10*SZREG + 7*SZFREG - #ifdef USE_LASX -- xvld $xr0, sp, 10*SZREG + 0*256 -- xvld $xr1, sp, 10*SZREG + 1*256 -- xvld $xr2, sp, 10*SZREG + 2*256 -- xvld $xr3, sp, 10*SZREG + 3*256 -- xvld $xr4, sp, 10*SZREG + 4*256 -- xvld $xr5, sp, 10*SZREG + 5*256 -- xvld $xr6, sp, 10*SZREG + 6*256 -- xvld $xr7, sp, 10*SZREG + 7*256 -+ xvld xr0, sp, 10*SZREG + 0*256 -+ xvld xr1, sp, 10*SZREG + 1*256 -+ xvld xr2, sp, 10*SZREG + 2*256 -+ xvld xr3, sp, 10*SZREG + 3*256 -+ xvld xr4, sp, 10*SZREG + 4*256 -+ xvld xr5, sp, 10*SZREG + 5*256 -+ xvld xr6, sp, 10*SZREG + 6*256 -+ xvld xr7, sp, 10*SZREG + 7*256 - #elif defined USE_LSX -- vld $vr0, sp, 10*SZREG + 0*128 -- vld $vr1, sp, 10*SZREG + 1*128 -- vld $vr2, sp, 10*SZREG + 2*128 -- vld $vr3, sp, 10*SZREG + 3*128 -- vld $vr4, sp, 10*SZREG + 4*128 -- vld $vr5, sp, 10*SZREG + 5*128 -- vld $vr6, sp, 10*SZREG + 6*128 -- vld $vr7, sp, 10*SZREG + 7*128 -+ vld vr0, sp, 10*SZREG + 0*128 -+ vld vr1, sp, 10*SZREG + 1*128 -+ vld vr2, sp, 10*SZREG + 2*128 -+ vld vr3, sp, 10*SZREG + 3*128 -+ vld vr4, sp, 10*SZREG + 4*128 -+ vld vr5, sp, 10*SZREG + 5*128 -+ vld vr6, sp, 10*SZREG + 6*128 -+ vld vr7, sp, 10*SZREG + 7*128 - #endif - #endif - -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -index 387a35fe..425fcede 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -@@ -17,28 +17,28 @@ LEAF(MEMCHR, 6) - andi t0, a0, 0x3f - bstrins.d a0, zero, 5, 0 - -- xvld $xr0, a0, 0 -- xvld $xr1, a0, 32 -+ xvld xr0, a0, 0 -+ xvld xr1, a0, 32 - li.d t1, -1 - li.d t2, 64 - -- xvreplgr2vr.b $xr2, a1 -+ xvreplgr2vr.b xr2, a1 - sll.d t3, t1, t0 - sub.d t2, t2, t0 -- xvseq.b $xr0, $xr0, $xr2 -+ xvseq.b xr0, xr0, xr2 - -- xvseq.b $xr1, $xr1, $xr2 -- xvmsknz.b $xr0, $xr0 -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr3, $xr0, 4 -+ xvseq.b xr1, xr1, xr2 -+ xvmsknz.b xr0, xr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr3, xr0, 4 - - -- xvpickve.w $xr4, $xr1, 4 -- vilvl.h $vr0, $vr3, $vr0 -- vilvl.h $vr1, $vr4, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -+ xvpickve.w 
xr4, xr1, 4 -+ vilvl.h vr0, vr3, vr0 -+ vilvl.h vr1, vr4, vr1 -+ vilvl.w vr0, vr1, vr0 - -- movfr2gr.d t0, $f0 -+ movfr2gr.d t0, fa0 - and t0, t0, t3 - bgeu t2, a2, L(end) - bnez t0, L(found) -@@ -46,28 +46,28 @@ LEAF(MEMCHR, 6) - addi.d a4, a3, -1 - bstrins.d a4, zero, 5, 0 - L(loop): -- xvld $xr0, a0, 64 -- xvld $xr1, a0, 96 -+ xvld xr0, a0, 64 -+ xvld xr1, a0, 96 - - addi.d a0, a0, 64 -- xvseq.b $xr0, $xr0, $xr2 -- xvseq.b $xr1, $xr1, $xr2 -+ xvseq.b xr0, xr0, xr2 -+ xvseq.b xr1, xr1, xr2 - beq a0, a4, L(out) - - -- xvmax.bu $xr3, $xr0, $xr1 -- xvseteqz.v $fcc0, $xr3 -- bcnez $fcc0, L(loop) -- xvmsknz.b $xr0, $xr0 -+ xvmax.bu xr3, xr0, xr1 -+ xvseteqz.v fcc0, xr3 -+ bcnez fcc0, L(loop) -+ xvmsknz.b xr0, xr0 - -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr3, $xr0, 4 -- xvpickve.w $xr4, $xr1, 4 -- vilvl.h $vr0, $vr3, $vr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr3, xr0, 4 -+ xvpickve.w xr4, xr1, 4 -+ vilvl.h vr0, vr3, vr0 - -- vilvl.h $vr1, $vr4, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -- movfr2gr.d t0, $f0 -+ vilvl.h vr1, vr4, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - L(found): - ctz.d t1, t0 - -@@ -79,15 +79,15 @@ L(ret0): - - - L(out): -- xvmsknz.b $xr0, $xr0 -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr3, $xr0, 4 -- xvpickve.w $xr4, $xr1, 4 -- -- vilvl.h $vr0, $vr3, $vr0 -- vilvl.h $vr1, $vr4, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -- movfr2gr.d t0, $f0 -+ xvmsknz.b xr0, xr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr3, xr0, 4 -+ xvpickve.w xr4, xr1, 4 -+ -+ vilvl.h vr0, vr3, vr0 -+ vilvl.h vr1, vr4, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - - L(end): - sub.d t2, zero, a3 -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -index c6952657..08a630d3 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -@@ -17,23 +17,23 @@ LEAF(MEMCHR, 6) - andi t0, a0, 0x1f - bstrins.d a0, zero, 4, 0 - -- vld $vr0, a0, 0 -- vld $vr1, a0, 16 -+ vld vr0, a0, 0 -+ vld vr1, a0, 16 - li.d t1, -1 - li.d t2, 32 - -- vreplgr2vr.b $vr2, a1 -+ vreplgr2vr.b vr2, a1 - sll.d t3, t1, t0 - sub.d t2, t2, t0 -- vseq.b $vr0, $vr0, $vr2 -+ vseq.b vr0, vr0, vr2 - -- vseq.b $vr1, $vr1, $vr2 -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -+ vseq.b vr1, vr1, vr2 -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 - - -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - and t0, t0, t3 - bgeu t2, a2, L(end) - bnez t0, L(found) -@@ -41,23 +41,23 @@ LEAF(MEMCHR, 6) - addi.d a4, a3, -1 - bstrins.d a4, zero, 4, 0 - L(loop): -- vld $vr0, a0, 32 -- vld $vr1, a0, 48 -+ vld vr0, a0, 32 -+ vld vr1, a0, 48 - - addi.d a0, a0, 32 -- vseq.b $vr0, $vr0, $vr2 -- vseq.b $vr1, $vr1, $vr2 -+ vseq.b vr0, vr0, vr2 -+ vseq.b vr1, vr1, vr2 - beq a0, a4, L(out) - -- vmax.bu $vr3, $vr0, $vr1 -- vseteqz.v $fcc0, $vr3 -- bcnez $fcc0, L(loop) -- vmsknz.b $vr0, $vr0 -+ vmax.bu vr3, vr0, vr1 -+ vseteqz.v fcc0, vr3 -+ bcnez fcc0, L(loop) -+ vmsknz.b vr0, vr0 - - -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - L(found): - ctz.w t0, t0 - -@@ -68,10 +68,10 @@ L(ret0): - jr ra - - L(out): -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - - L(end): - sub.d t2, zero, a3 -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -index 
9151d38d..2c192954 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -@@ -20,39 +20,39 @@ LEAF(MEMCMP, 6) - li.d t1, 160 - bgeu a2, t1, L(make_aligned) # a2 >= 160 - L(loop32): -- xvld $xr0, a0, 0 -- xvld $xr1, a1, 0 -+ xvld xr0, a0, 0 -+ xvld xr1, a1, 0 - - addi.d a0, a0, 32 - addi.d a1, a1, 32 - addi.d a2, a2, -32 -- xvseq.b $xr2, $xr0, $xr1 -+ xvseq.b xr2, xr0, xr1 - -- xvsetanyeqz.b $fcc0, $xr2 -- bcnez $fcc0, L(end) -+ xvsetanyeqz.b fcc0, xr2 -+ bcnez fcc0, L(end) - L(last_bytes): - bltu t2, a2, L(loop32) -- xvld $xr0, a3, -32 -+ xvld xr0, a3, -32 - - -- xvld $xr1, a4, -32 -- xvseq.b $xr2, $xr0, $xr1 -+ xvld xr1, a4, -32 -+ xvseq.b xr2, xr0, xr1 - L(end): -- xvmsknz.b $xr2, $xr2 -- xvpermi.q $xr4, $xr0, 1 -+ xvmsknz.b xr2, xr2 -+ xvpermi.q xr4, xr0, 1 - -- xvpickve.w $xr3, $xr2, 4 -- xvpermi.q $xr5, $xr1, 1 -- vilvl.h $vr2, $vr3, $vr2 -- movfr2gr.s t0, $f2 -+ xvpickve.w xr3, xr2, 4 -+ xvpermi.q xr5, xr1, 1 -+ vilvl.h vr2, vr3, vr2 -+ movfr2gr.s t0, fa2 - - cto.w t0, t0 -- vreplgr2vr.b $vr2, t0 -- vshuf.b $vr0, $vr4, $vr0, $vr2 -- vshuf.b $vr1, $vr5, $vr1, $vr2 -+ vreplgr2vr.b vr2, t0 -+ vshuf.b vr0, vr4, vr0, vr2 -+ vshuf.b vr1, vr5, vr1, vr2 - -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - sub.d a0, t0, t1 - jr ra - -@@ -60,59 +60,59 @@ L(end): - L(less32): - srli.d t0, a2, 4 - beqz t0, L(less16) -- vld $vr0, a0, 0 -- vld $vr1, a1, 0 -+ vld vr0, a0, 0 -+ vld vr1, a1, 0 - -- vld $vr2, a3, -16 -- vld $vr3, a4, -16 -+ vld vr2, a3, -16 -+ vld vr3, a4, -16 - L(short_ret): -- vseq.b $vr4, $vr0, $vr1 -- vseq.b $vr5, $vr2, $vr3 -+ vseq.b vr4, vr0, vr1 -+ vseq.b vr5, vr2, vr3 - -- vmsknz.b $vr4, $vr4 -- vmsknz.b $vr5, $vr5 -- vilvl.h $vr4, $vr5, $vr4 -- movfr2gr.s t0, $f4 -+ vmsknz.b vr4, vr4 -+ vmsknz.b vr5, vr5 -+ vilvl.h vr4, vr5, vr4 -+ movfr2gr.s t0, fa4 - - cto.w t0, t0 -- vreplgr2vr.b $vr4, t0 -- vshuf.b $vr0, $vr2, $vr0, $vr4 -- vshuf.b $vr1, $vr3, $vr1, $vr4 -+ vreplgr2vr.b vr4, t0 -+ vshuf.b vr0, vr2, vr0, vr4 -+ vshuf.b vr1, vr3, vr1, vr4 - - -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - sub.d a0, t0, t1 - jr ra - - L(less16): - srli.d t0, a2, 3 - beqz t0, L(less8) -- vldrepl.d $vr0, a0, 0 -- vldrepl.d $vr1, a1, 0 -+ vldrepl.d vr0, a0, 0 -+ vldrepl.d vr1, a1, 0 - -- vldrepl.d $vr2, a3, -8 -- vldrepl.d $vr3, a4, -8 -+ vldrepl.d vr2, a3, -8 -+ vldrepl.d vr3, a4, -8 - b L(short_ret) - L(less8): - srli.d t0, a2, 2 - - beqz t0, L(less4) -- vldrepl.w $vr0, a0, 0 -- vldrepl.w $vr1, a1, 0 -- vldrepl.w $vr2, a3, -4 -+ vldrepl.w vr0, a0, 0 -+ vldrepl.w vr1, a1, 0 -+ vldrepl.w vr2, a3, -4 - - -- vldrepl.w $vr3, a4, -4 -+ vldrepl.w vr3, a4, -4 - b L(short_ret) - L(less4): - srli.d t0, a2, 1 - beqz t0, L(less2) - -- vldrepl.h $vr0, a0, 0 -- vldrepl.h $vr1, a1, 0 -- vldrepl.h $vr2, a3, -2 -- vldrepl.h $vr3, a4, -2 -+ vldrepl.h vr0, a0, 0 -+ vldrepl.h vr1, a1, 0 -+ vldrepl.h vr2, a3, -2 -+ vldrepl.h vr3, a4, -2 - - b L(short_ret) - L(less2): -@@ -132,12 +132,12 @@ L(ret0): - nop - /* make src1 aligned, and adjust scr2 and length. 
*/ - L(make_aligned): -- xvld $xr0, a0, 0 -+ xvld xr0, a0, 0 - -- xvld $xr1, a1, 0 -- xvseq.b $xr2, $xr0, $xr1 -- xvsetanyeqz.b $fcc0, $xr2 -- bcnez $fcc0, L(end) -+ xvld xr1, a1, 0 -+ xvseq.b xr2, xr0, xr1 -+ xvsetanyeqz.b fcc0, xr2 -+ bcnez fcc0, L(end) - - andi t0, a0, 0x1f - sub.d t0, t2, t0 -@@ -151,17 +151,17 @@ L(make_aligned): - - - L(loop_align): -- xvld $xr0, a0, 0 -- xvld $xr1, a1, 0 -- xvld $xr2, a0, 32 -- xvld $xr3, a1, 32 -+ xvld xr0, a0, 0 -+ xvld xr1, a1, 0 -+ xvld xr2, a0, 32 -+ xvld xr3, a1, 32 - -- xvseq.b $xr0, $xr0, $xr1 -- xvseq.b $xr1, $xr2, $xr3 -- xvmin.bu $xr2, $xr1, $xr0 -- xvsetanyeqz.b $fcc0, $xr2 -+ xvseq.b xr0, xr0, xr1 -+ xvseq.b xr1, xr2, xr3 -+ xvmin.bu xr2, xr1, xr0 -+ xvsetanyeqz.b fcc0, xr2 - -- bcnez $fcc0, L(pair_end) -+ bcnez fcc0, L(pair_end) - addi.d a0, a0, 64 - addi.d a1, a1, 64 - bne a0, a5, L(loop_align) -@@ -173,15 +173,15 @@ L(loop_align): - - - L(pair_end): -- xvmsknz.b $xr0, $xr0 -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr2, $xr0, 4 -- xvpickve.w $xr3, $xr1, 4 -- -- vilvl.h $vr0, $vr2, $vr0 -- vilvl.h $vr1, $vr3, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -- movfr2gr.d t0, $f0 -+ xvmsknz.b xr0, xr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr2, xr0, 4 -+ xvpickve.w xr3, xr1, 4 -+ -+ vilvl.h vr0, vr2, vr0 -+ vilvl.h vr1, vr3, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - - cto.d t0, t0 - ldx.bu t1, a0, t0 -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -index 8535aa22..b407275f 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -@@ -21,28 +21,28 @@ ENTRY_NO_ALIGN(MEMCMP) - pcaddi t0, -7 - - andi a3, a0, 0xf -- vld $vr5, t0, 0 -+ vld vr5, t0, 0 - andi a4, a1, 0xf - bne a3, a4, L(unaligned) - - bstrins.d a0, zero, 3, 0 - xor a1, a1, a4 -- vld $vr0, a0, 0 -- vld $vr1, a1, 0 -+ vld vr0, a0, 0 -+ vld vr1, a1, 0 - - - li.d t0, 16 -- vreplgr2vr.b $vr3, a3 -+ vreplgr2vr.b vr3, a3 - sub.d t1, t0, a3 -- vadd.b $vr3, $vr3, $vr5 -+ vadd.b vr3, vr3, vr5 - -- vshuf.b $vr0, $vr3, $vr0, $vr3 -- vshuf.b $vr1, $vr3, $vr1, $vr3 -- vseq.b $vr4, $vr0, $vr1 -+ vshuf.b vr0, vr3, vr0, vr3 -+ vshuf.b vr1, vr3, vr1, vr3 -+ vseq.b vr4, vr0, vr1 - bgeu t1, a2, L(al_end) - -- vsetanyeqz.b $fcc0, $vr4 -- bcnez $fcc0, L(al_found) -+ vsetanyeqz.b fcc0, vr4 -+ bcnez fcc0, L(al_found) - sub.d a2, a2, t1 - andi t1, a2, 31 - -@@ -53,70 +53,70 @@ ENTRY_NO_ALIGN(MEMCMP) - - - L(al_loop): -- vld $vr0, a0, 16 -- vld $vr1, a1, 16 -- vld $vr2, a0, 32 -- vld $vr3, a1, 32 -+ vld vr0, a0, 16 -+ vld vr1, a1, 16 -+ vld vr2, a0, 32 -+ vld vr3, a1, 32 - - addi.d a0, a0, 32 - addi.d a1, a1, 32 -- vseq.b $vr4, $vr0, $vr1 -- vseq.b $vr6, $vr2, $vr3 -+ vseq.b vr4, vr0, vr1 -+ vseq.b vr6, vr2, vr3 - -- vand.v $vr6, $vr4, $vr6 -- vsetanyeqz.b $fcc0, $vr6 -- bcnez $fcc0, L(al_pair_end) -+ vand.v vr6, vr4, vr6 -+ vsetanyeqz.b fcc0, vr6 -+ bcnez fcc0, L(al_pair_end) - bne a0, a4, L(al_loop) - - L(al_less_32bytes): - bgeu t0, a2, L(al_less_16bytes) -- vld $vr0, a0, 16 -- vld $vr1, a1, 16 -- vld $vr2, a0, 32 -+ vld vr0, a0, 16 -+ vld vr1, a1, 16 -+ vld vr2, a0, 32 - - -- vld $vr3, a1, 32 -+ vld vr3, a1, 32 - addi.d a2, a2, -16 -- vreplgr2vr.b $vr6, a2 -- vslt.b $vr5, $vr5, $vr6 -+ vreplgr2vr.b vr6, a2 -+ vslt.b vr5, vr5, vr6 - -- vseq.b $vr4, $vr0, $vr1 -- vseq.b $vr6, $vr2, $vr3 -- vorn.v $vr6, $vr6, $vr5 -+ vseq.b vr4, vr0, vr1 -+ vseq.b vr6, vr2, vr3 -+ vorn.v vr6, vr6, vr5 - L(al_pair_end): -- vsetanyeqz.b $fcc0, $vr4 -+ vsetanyeqz.b fcc0, vr4 - -- bcnez $fcc0, L(al_found) -- vnori.b 
$vr4, $vr6, 0 -- vfrstpi.b $vr4, $vr4, 0 -- vshuf.b $vr0, $vr2, $vr2, $vr4 -+ bcnez fcc0, L(al_found) -+ vnori.b vr4, vr6, 0 -+ vfrstpi.b vr4, vr4, 0 -+ vshuf.b vr0, vr2, vr2, vr4 - -- vshuf.b $vr1, $vr3, $vr3, $vr4 -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vshuf.b vr1, vr3, vr3, vr4 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - sub.d a0, t0, t1 - - - jr ra - L(al_less_16bytes): - beqz a2, L(out) -- vld $vr0, a0, 16 -- vld $vr1, a1, 16 -+ vld vr0, a0, 16 -+ vld vr1, a1, 16 - -- vseq.b $vr4, $vr0, $vr1 -+ vseq.b vr4, vr0, vr1 - L(al_end): -- vreplgr2vr.b $vr6, a2 -- vslt.b $vr5, $vr5, $vr6 -- vorn.v $vr4, $vr4, $vr5 -+ vreplgr2vr.b vr6, a2 -+ vslt.b vr5, vr5, vr6 -+ vorn.v vr4, vr4, vr5 - - L(al_found): -- vnori.b $vr4, $vr4, 0 -- vfrstpi.b $vr4, $vr4, 0 -- vshuf.b $vr0, $vr0, $vr0, $vr4 -- vshuf.b $vr1, $vr1, $vr1, $vr4 -+ vnori.b vr4, vr4, 0 -+ vfrstpi.b vr4, vr4, 0 -+ vshuf.b vr0, vr0, vr0, vr4 -+ vshuf.b vr1, vr1, vr1, vr4 - -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - sub.d a0, t0, t1 - jr ra - -@@ -133,28 +133,28 @@ L(unaligned): - bstrins.d a0, zero, 3, 0 - - xor a1, a1, a4 -- vld $vr4, a0, 0 -- vld $vr1, a1, 0 -+ vld vr4, a0, 0 -+ vld vr1, a1, 0 - li.d t0, 16 - -- vreplgr2vr.b $vr2, a4 -+ vreplgr2vr.b vr2, a4 - sub.d a6, a4, a3 # a6 hold the diff - sub.d t1, t0, a4 - sub.d t2, t0, a6 - - -- vadd.b $vr2, $vr2, $vr5 # [4, 5, 6, ...] -- vreplgr2vr.b $vr6, t2 -- vadd.b $vr6, $vr6, $vr5 # [14, 15, 16, ... ] -- vshuf.b $vr0, $vr4, $vr4, $vr6 # make data be in the same position -+ vadd.b vr2, vr2, vr5 # [4, 5, 6, ...] -+ vreplgr2vr.b vr6, t2 -+ vadd.b vr6, vr6, vr5 # [14, 15, 16, ... ] -+ vshuf.b vr0, vr4, vr4, vr6 # make data be in the same position - -- vshuf.b $vr1, $vr2, $vr1, $vr2 -- vshuf.b $vr0, $vr2, $vr0, $vr2 -- vseq.b $vr7, $vr0, $vr1 -+ vshuf.b vr1, vr2, vr1, vr2 -+ vshuf.b vr0, vr2, vr0, vr2 -+ vseq.b vr7, vr0, vr1 - bgeu t1, a2, L(un_end) - -- vsetanyeqz.b $fcc0, $vr7 -- bcnez $fcc0, L(un_found) -+ vsetanyeqz.b fcc0, vr7 -+ bcnez fcc0, L(un_found) - sub.d a2, a2, t1 - andi t1, a2, 31 - -@@ -165,63 +165,63 @@ L(unaligned): - - - L(un_loop): -- vld $vr2, a0, 16 -- vld $vr1, a1, 16 -- vld $vr3, a1, 32 -+ vld vr2, a0, 16 -+ vld vr1, a1, 16 -+ vld vr3, a1, 32 - addi.d a1, a1, 32 - - addi.d a0, a0, 32 -- vshuf.b $vr0, $vr2, $vr4, $vr6 -- vld $vr4, a0, 0 -- vseq.b $vr7, $vr0, $vr1 -+ vshuf.b vr0, vr2, vr4, vr6 -+ vld vr4, a0, 0 -+ vseq.b vr7, vr0, vr1 - -- vshuf.b $vr2, $vr4, $vr2, $vr6 -- vseq.b $vr8, $vr2, $vr3 -- vand.v $vr8, $vr7, $vr8 -- vsetanyeqz.b $fcc0, $vr8 -+ vshuf.b vr2, vr4, vr2, vr6 -+ vseq.b vr8, vr2, vr3 -+ vand.v vr8, vr7, vr8 -+ vsetanyeqz.b fcc0, vr8 - -- bcnez $fcc0, L(un_pair_end) -+ bcnez fcc0, L(un_pair_end) - bne a1, a4, L(un_loop) - L(un_less_32bytes): - bltu a2, t0, L(un_less_16bytes) -- vld $vr2, a0, 16 -+ vld vr2, a0, 16 - - -- vld $vr1, a1, 16 -+ vld vr1, a1, 16 - addi.d a0, a0, 16 - addi.d a1, a1, 16 - addi.d a2, a2, -16 - -- vshuf.b $vr0, $vr2, $vr4, $vr6 -- vor.v $vr4, $vr2, $vr2 -- vseq.b $vr7, $vr0, $vr1 -- vsetanyeqz.b $fcc0, $vr7 -+ vshuf.b vr0, vr2, vr4, vr6 -+ vor.v vr4, vr2, vr2 -+ vseq.b vr7, vr0, vr1 -+ vsetanyeqz.b fcc0, vr7 - -- bcnez $fcc0, L(un_found) -+ bcnez fcc0, L(un_found) - L(un_less_16bytes): - beqz a2, L(out) -- vld $vr1, a1, 16 -+ vld vr1, a1, 16 - bgeu a6, a2, 1f - -- vld $vr2, a0, 16 -+ vld vr2, a0, 16 - 1: -- vshuf.b $vr0, $vr2, $vr4, $vr6 -- vseq.b $vr7, $vr0, $vr1 -+ vshuf.b vr0, vr2, vr4, vr6 -+ vseq.b vr7, vr0, vr1 - 
L(un_end): -- vreplgr2vr.b $vr3, a2 -+ vreplgr2vr.b vr3, a2 - - -- vslt.b $vr3, $vr5, $vr3 -- vorn.v $vr7, $vr7, $vr3 -+ vslt.b vr3, vr5, vr3 -+ vorn.v vr7, vr7, vr3 - L(un_found): -- vnori.b $vr7, $vr7, 0 -- vfrstpi.b $vr7, $vr7, 0 -+ vnori.b vr7, vr7, 0 -+ vfrstpi.b vr7, vr7, 0 - -- vshuf.b $vr0, $vr0, $vr0, $vr7 -- vshuf.b $vr1, $vr1, $vr1, $vr7 -+ vshuf.b vr0, vr0, vr0, vr7 -+ vshuf.b vr1, vr1, vr1, vr7 - L(calc_result): -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - - sub.d t2, t0, t1 - sub.d t3, t1, t0 -@@ -231,14 +231,14 @@ L(calc_result): - or a0, t0, t1 - jr ra - L(un_pair_end): -- vsetanyeqz.b $fcc0, $vr7 -- bcnez $fcc0, L(un_found) -+ vsetanyeqz.b fcc0, vr7 -+ bcnez fcc0, L(un_found) - - -- vnori.b $vr7, $vr8, 0 -- vfrstpi.b $vr7, $vr7, 0 -- vshuf.b $vr0, $vr2, $vr2, $vr7 -- vshuf.b $vr1, $vr3, $vr3, $vr7 -+ vnori.b vr7, vr8, 0 -+ vfrstpi.b vr7, vr7, 0 -+ vshuf.b vr0, vr2, vr2, vr7 -+ vshuf.b vr1, vr3, vr3, vr7 - - b L(calc_result) - L(out): -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -index e8b2c441..c317592f 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -@@ -26,22 +26,22 @@ LEAF(MEMCPY_NAME, 6) - - li.d t1, 64 - bltu t1, a2, L(copy_long) # a2 > 64 -- xvld $xr0, a1, 0 -- xvld $xr1, a4, -32 -+ xvld xr0, a1, 0 -+ xvld xr1, a4, -32 - -- xvst $xr0, a0, 0 -- xvst $xr1, a3, -32 -+ xvst xr0, a0, 0 -+ xvst xr1, a3, -32 - jr ra - L(less_32bytes): - srli.d t0, a2, 4 - - beqz t0, L(less_16bytes) -- vld $vr0, a1, 0 -- vld $vr1, a4, -16 -- vst $vr0, a0, 0 -+ vld vr0, a1, 0 -+ vld vr1, a4, -16 -+ vst vr0, a0, 0 - - -- vst $vr1, a3, -16 -+ vst vr1, a3, -16 - jr ra - L(less_16bytes): - srli.d t0, a2, 3 -@@ -91,11 +91,11 @@ LEAF(MEMMOVE_NAME, 6) - - li.d t1, 64 - bltu t1, a2, L(move_long) # a2 > 64 -- xvld $xr0, a1, 0 -- xvld $xr1, a4, -32 -+ xvld xr0, a1, 0 -+ xvld xr1, a4, -32 - -- xvst $xr0, a0, 0 -- xvst $xr1, a3, -32 -+ xvst xr0, a0, 0 -+ xvst xr1, a3, -32 - jr ra - L(move_long): - sub.d t2, a0, a1 -@@ -107,8 +107,8 @@ L(copy_long): - sub.d t2, t0, t2 - - -- xvld $xr8, a1, 0 -- xvld $xr9, a4, -32 -+ xvld xr8, a1, 0 -+ xvld xr9, a4, -32 - sub.d t3, a2, t2 - add.d a5, a0, t2 - -@@ -119,69 +119,69 @@ L(copy_long): - - addi.d a6, a6, -1 - L(loop_256): -- xvld $xr0, a1, 0 -- xvld $xr1, a1, 32 -- xvld $xr2, a1, 64 -+ xvld xr0, a1, 0 -+ xvld xr1, a1, 32 -+ xvld xr2, a1, 64 - -- xvld $xr3, a1, 96 -- xvld $xr4, a1, 128 -- xvld $xr5, a1, 160 -- xvld $xr6, a1, 192 -+ xvld xr3, a1, 96 -+ xvld xr4, a1, 128 -+ xvld xr5, a1, 160 -+ xvld xr6, a1, 192 - - -- xvld $xr7, a1, 224 -+ xvld xr7, a1, 224 - addi.d a1, a1, 256 -- xvst $xr0, a5, 0 -- xvst $xr1, a5, 32 -+ xvst xr0, a5, 0 -+ xvst xr1, a5, 32 - -- xvst $xr2, a5, 64 -- xvst $xr3, a5, 96 -- xvst $xr4, a5, 128 -- xvst $xr5, a5, 160 -+ xvst xr2, a5, 64 -+ xvst xr3, a5, 96 -+ xvst xr4, a5, 128 -+ xvst xr5, a5, 160 - -- xvst $xr6, a5, 192 -- xvst $xr7, a5, 224 -+ xvst xr6, a5, 192 -+ xvst xr7, a5, 224 - addi.d a5, a5, 256 - bne a1, a6, L(loop_256) - - L(lt256): - srli.d t2, a2, 7 - beqz t2, L(lt128) -- xvld $xr0, a1, 0 -- xvld $xr1, a1, 32 -+ xvld xr0, a1, 0 -+ xvld xr1, a1, 32 - - -- xvld $xr2, a1, 64 -- xvld $xr3, a1, 96 -+ xvld xr2, a1, 64 -+ xvld xr3, a1, 96 - addi.d a1, a1, 128 - addi.d a2, a2, -128 - -- xvst $xr0, a5, 0 -- xvst $xr1, a5, 32 -- xvst $xr2, a5, 64 -- xvst $xr3, a5, 96 -+ xvst xr0, a5, 0 -+ xvst xr1, a5, 32 -+ xvst xr2, a5, 64 -+ xvst 
xr3, a5, 96 - - addi.d a5, a5, 128 - L(lt128): - bltu a2, t1, L(lt64) -- xvld $xr0, a1, 0 -- xvld $xr1, a1, 32 -+ xvld xr0, a1, 0 -+ xvld xr1, a1, 32 - - addi.d a1, a1, 64 - addi.d a2, a2, -64 -- xvst $xr0, a5, 0 -- xvst $xr1, a5, 32 -+ xvst xr0, a5, 0 -+ xvst xr1, a5, 32 - - - addi.d a5, a5, 64 - L(lt64): - bltu a2, t0, L(lt32) -- xvld $xr0, a1, 0 -- xvst $xr0, a5, 0 -+ xvld xr0, a1, 0 -+ xvst xr0, a5, 0 - - L(lt32): -- xvst $xr8, a0, 0 -- xvst $xr9, a3, -32 -+ xvst xr8, a0, 0 -+ xvst xr9, a3, -32 - jr ra - nop - -@@ -189,9 +189,9 @@ L(copy_back): - addi.d a3, a3, -1 - addi.d a2, a2, -2 - andi t2, a3, 0x1f -- xvld $xr8, a1, 0 -+ xvld xr8, a1, 0 - -- xvld $xr9, a4, -32 -+ xvld xr9, a4, -32 - sub.d t3, a2, t2 - sub.d a5, a3, t2 - sub.d a4, a4, t2 -@@ -203,69 +203,69 @@ L(copy_back): - addi.d a6, a6, 2 - - L(back_loop_256): -- xvld $xr0, a4, -33 -- xvld $xr1, a4, -65 -- xvld $xr2, a4, -97 -- xvld $xr3, a4, -129 -+ xvld xr0, a4, -33 -+ xvld xr1, a4, -65 -+ xvld xr2, a4, -97 -+ xvld xr3, a4, -129 - -- xvld $xr4, a4, -161 -- xvld $xr5, a4, -193 -- xvld $xr6, a4, -225 -- xvld $xr7, a4, -257 -+ xvld xr4, a4, -161 -+ xvld xr5, a4, -193 -+ xvld xr6, a4, -225 -+ xvld xr7, a4, -257 - - addi.d a4, a4, -256 -- xvst $xr0, a5, -32 -- xvst $xr1, a5, -64 -- xvst $xr2, a5, -96 -+ xvst xr0, a5, -32 -+ xvst xr1, a5, -64 -+ xvst xr2, a5, -96 - - -- xvst $xr3, a5, -128 -- xvst $xr4, a5, -160 -- xvst $xr5, a5, -192 -- xvst $xr6, a5, -224 -+ xvst xr3, a5, -128 -+ xvst xr4, a5, -160 -+ xvst xr5, a5, -192 -+ xvst xr6, a5, -224 - -- xvst $xr7, a5, -256 -+ xvst xr7, a5, -256 - addi.d a5, a5, -256 - bne a4, a6, L(back_loop_256) - L(back_lt256): - srli.d t2, a2, 7 - - beqz t2, L(back_lt128) -- xvld $xr0, a4, -33 -- xvld $xr1, a4, -65 -- xvld $xr2, a4, -97 -+ xvld xr0, a4, -33 -+ xvld xr1, a4, -65 -+ xvld xr2, a4, -97 - -- xvld $xr3, a4, -129 -+ xvld xr3, a4, -129 - addi.d a2, a2, -128 - addi.d a4, a4, -128 -- xvst $xr0, a5, -32 -+ xvst xr0, a5, -32 - - -- xvst $xr1, a5, -64 -- xvst $xr2, a5, -96 -- xvst $xr3, a5, -128 -+ xvst xr1, a5, -64 -+ xvst xr2, a5, -96 -+ xvst xr3, a5, -128 - addi.d a5, a5, -128 - - L(back_lt128): - blt a2, t1, L(back_lt64) -- xvld $xr0, a4, -33 -- xvld $xr1, a4, -65 -+ xvld xr0, a4, -33 -+ xvld xr1, a4, -65 - addi.d a2, a2, -64 - - addi.d a4, a4, -64 -- xvst $xr0, a5, -32 -- xvst $xr1, a5, -64 -+ xvst xr0, a5, -32 -+ xvst xr1, a5, -64 - addi.d a5, a5, -64 - - L(back_lt64): - bltu a2, t0, L(back_lt32) -- xvld $xr0, a4, -33 -- xvst $xr0, a5, -32 -+ xvld xr0, a4, -33 -+ xvst xr0, a5, -32 - L(back_lt32): -- xvst $xr8, a0, 0 -+ xvst xr8, a0, 0 - - -- xvst $xr9, a3, -31 -+ xvst xr9, a3, -31 - jr ra - END(MEMMOVE_NAME) - -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -index 90f89c7a..77f1b4ab 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -@@ -23,54 +23,54 @@ LEAF(MEMCPY_NAME, 6) - bltu t8, a2, L(copy_long) # a2 > 64 - bltu t7, a2, L(more_32bytes) # a2 > 32 - -- vld $vr0, a1, 0 -- vld $vr1, a4, -16 -- vst $vr0, a0, 0 -- vst $vr1, a3, -16 -+ vld vr0, a1, 0 -+ vld vr1, a4, -16 -+ vst vr0, a0, 0 -+ vst vr1, a3, -16 - - jr ra - L(more_32bytes): -- vld $vr0, a1, 0 -- vld $vr1, a1, 16 -- vld $vr2, a4, -32 -+ vld vr0, a1, 0 -+ vld vr1, a1, 16 -+ vld vr2, a4, -32 - - -- vld $vr3, a4, -16 -- vst $vr0, a0, 0 -- vst $vr1, a0, 16 -- vst $vr2, a3, -32 -+ vld vr3, a4, -16 -+ vst vr0, a0, 0 -+ vst vr1, a0, 16 -+ vst vr2, a3, -32 - -- vst $vr3, a3, -16 -+ vst vr3, a3, -16 - jr ra - 
L(less_16bytes): - srli.d t0, a2, 3 - beqz t0, L(less_8bytes) - -- vldrepl.d $vr0, a1, 0 -- vldrepl.d $vr1, a4, -8 -- vstelm.d $vr0, a0, 0, 0 -- vstelm.d $vr1, a3, -8, 0 -+ vldrepl.d vr0, a1, 0 -+ vldrepl.d vr1, a4, -8 -+ vstelm.d vr0, a0, 0, 0 -+ vstelm.d vr1, a3, -8, 0 - - jr ra - L(less_8bytes): - srli.d t0, a2, 2 - beqz t0, L(less_4bytes) -- vldrepl.w $vr0, a1, 0 -+ vldrepl.w vr0, a1, 0 - - -- vldrepl.w $vr1, a4, -4 -- vstelm.w $vr0, a0, 0, 0 -- vstelm.w $vr1, a3, -4, 0 -+ vldrepl.w vr1, a4, -4 -+ vstelm.w vr0, a0, 0, 0 -+ vstelm.w vr1, a3, -4, 0 - jr ra - - L(less_4bytes): - srli.d t0, a2, 1 - beqz t0, L(less_2bytes) -- vldrepl.h $vr0, a1, 0 -- vldrepl.h $vr1, a4, -2 -+ vldrepl.h vr0, a1, 0 -+ vldrepl.h vr1, a4, -2 - -- vstelm.h $vr0, a0, 0, 0 -- vstelm.h $vr1, a3, -2, 0 -+ vstelm.h vr0, a0, 0, 0 -+ vstelm.h vr1, a3, -2, 0 - jr ra - L(less_2bytes): - beqz a2, L(less_1bytes) -@@ -93,10 +93,10 @@ LEAF(MEMMOVE_NAME, 6) - bltu t8, a2, L(move_long) # a2 > 64 - bltu t7, a2, L(more_32bytes) # a2 > 32 - -- vld $vr0, a1, 0 -- vld $vr1, a4, -16 -- vst $vr0, a0, 0 -- vst $vr1, a3, -16 -+ vld vr0, a1, 0 -+ vld vr1, a4, -16 -+ vst vr0, a0, 0 -+ vst vr1, a3, -16 - - jr ra - nop -@@ -106,7 +106,7 @@ L(move_long): - - - L(copy_long): -- vld $vr2, a1, 0 -+ vld vr2, a1, 0 - andi t0, a0, 0xf - sub.d t0, t6, t0 - add.d a1, a1, t0 -@@ -114,10 +114,10 @@ L(copy_long): - sub.d a2, a2, t0 - andi t1, a1, 0xf - bnez t1, L(unaligned) -- vld $vr0, a1, 0 -+ vld vr0, a1, 0 - - addi.d a2, a2, -16 -- vst $vr2, a0, 0 -+ vst vr2, a0, 0 - andi t2, a2, 0x7f - add.d a5, a0, t0 - -@@ -128,69 +128,69 @@ L(copy_long): - - - L(al_loop): -- vld $vr1, a1, 16 -- vld $vr2, a1, 32 -- vld $vr3, a1, 48 -- vld $vr4, a1, 64 -+ vld vr1, a1, 16 -+ vld vr2, a1, 32 -+ vld vr3, a1, 48 -+ vld vr4, a1, 64 - -- vld $vr5, a1, 80 -- vld $vr6, a1, 96 -- vld $vr7, a1, 112 -- vst $vr0, a5, 0 -+ vld vr5, a1, 80 -+ vld vr6, a1, 96 -+ vld vr7, a1, 112 -+ vst vr0, a5, 0 - -- vld $vr0, a1, 128 -+ vld vr0, a1, 128 - addi.d a1, a1, 128 -- vst $vr1, a5, 16 -- vst $vr2, a5, 32 -+ vst vr1, a5, 16 -+ vst vr2, a5, 32 - -- vst $vr3, a5, 48 -- vst $vr4, a5, 64 -- vst $vr5, a5, 80 -- vst $vr6, a5, 96 -+ vst vr3, a5, 48 -+ vst vr4, a5, 64 -+ vst vr5, a5, 80 -+ vst vr6, a5, 96 - - -- vst $vr7, a5, 112 -+ vst vr7, a5, 112 - addi.d a5, a5, 128 - bne a1, a6, L(al_loop) - L(al_less_128): - blt a2, t8, L(al_less_64) - -- vld $vr1, a1, 16 -- vld $vr2, a1, 32 -- vld $vr3, a1, 48 -+ vld vr1, a1, 16 -+ vld vr2, a1, 32 -+ vld vr3, a1, 48 - addi.d a2, a2, -64 - -- vst $vr0, a5, 0 -- vld $vr0, a1, 64 -+ vst vr0, a5, 0 -+ vld vr0, a1, 64 - addi.d a1, a1, 64 -- vst $vr1, a5, 16 -+ vst vr1, a5, 16 - -- vst $vr2, a5, 32 -- vst $vr3, a5, 48 -+ vst vr2, a5, 32 -+ vst vr3, a5, 48 - addi.d a5, a5, 64 - L(al_less_64): - blt a2, t7, L(al_less_32) - - -- vld $vr1, a1, 16 -+ vld vr1, a1, 16 - addi.d a2, a2, -32 -- vst $vr0, a5, 0 -- vld $vr0, a1, 32 -+ vst vr0, a5, 0 -+ vld vr0, a1, 32 - - addi.d a1, a1, 32 -- vst $vr1, a5, 16 -+ vst vr1, a5, 16 - addi.d a5, a5, 32 - L(al_less_32): - blt a2, t6, L(al_less_16) - -- vst $vr0, a5, 0 -- vld $vr0, a1, 16 -+ vst vr0, a5, 0 -+ vld vr0, a1, 16 - addi.d a5, a5, 16 - L(al_less_16): -- vld $vr1, a4, -16 -+ vld vr1, a4, -16 - -- vst $vr0, a5, 0 -- vst $vr1, a3, -16 -+ vst vr0, a5, 0 -+ vst vr1, a3, -16 - jr ra - nop - -@@ -201,17 +201,17 @@ L(magic_num): - L(unaligned): - pcaddi t2, -4 - bstrins.d a1, zero, 3, 0 -- vld $vr8, t2, 0 -- vld $vr0, a1, 0 -+ vld vr8, t2, 0 -+ vld vr0, a1, 0 - -- vld $vr1, a1, 16 -+ vld vr1, a1, 16 - addi.d a2, a2, -16 -- 
vst $vr2, a0, 0 -+ vst vr2, a0, 0 - add.d a5, a0, t0 - -- vreplgr2vr.b $vr9, t1 -+ vreplgr2vr.b vr9, t1 - andi t2, a2, 0x7f -- vadd.b $vr9, $vr9, $vr8 -+ vadd.b vr9, vr9, vr8 - addi.d a1, a1, 32 - - -@@ -221,97 +221,97 @@ L(unaligned): - add.d a6, a1, t3 - - L(un_loop): -- vld $vr2, a1, 0 -- vld $vr3, a1, 16 -- vld $vr4, a1, 32 -- vld $vr5, a1, 48 -+ vld vr2, a1, 0 -+ vld vr3, a1, 16 -+ vld vr4, a1, 32 -+ vld vr5, a1, 48 - -- vld $vr6, a1, 64 -- vld $vr7, a1, 80 -- vshuf.b $vr8, $vr1, $vr0, $vr9 -- vld $vr0, a1, 96 -+ vld vr6, a1, 64 -+ vld vr7, a1, 80 -+ vshuf.b vr8, vr1, vr0, vr9 -+ vld vr0, a1, 96 - -- vst $vr8, a5, 0 -- vshuf.b $vr8, $vr2, $vr1, $vr9 -- vld $vr1, a1, 112 -- vst $vr8, a5, 16 -+ vst vr8, a5, 0 -+ vshuf.b vr8, vr2, vr1, vr9 -+ vld vr1, a1, 112 -+ vst vr8, a5, 16 - - - addi.d a1, a1, 128 -- vshuf.b $vr2, $vr3, $vr2, $vr9 -- vshuf.b $vr3, $vr4, $vr3, $vr9 -- vst $vr2, a5, 32 -+ vshuf.b vr2, vr3, vr2, vr9 -+ vshuf.b vr3, vr4, vr3, vr9 -+ vst vr2, a5, 32 - -- vshuf.b $vr4, $vr5, $vr4, $vr9 -- vst $vr3, a5, 48 -- vshuf.b $vr5, $vr6, $vr5, $vr9 -- vst $vr4, a5, 64 -+ vshuf.b vr4, vr5, vr4, vr9 -+ vst vr3, a5, 48 -+ vshuf.b vr5, vr6, vr5, vr9 -+ vst vr4, a5, 64 - -- vshuf.b $vr6, $vr7, $vr6, $vr9 -- vst $vr5, a5, 80 -- vshuf.b $vr7, $vr0, $vr7, $vr9 -- vst $vr6, a5, 96 -+ vshuf.b vr6, vr7, vr6, vr9 -+ vst vr5, a5, 80 -+ vshuf.b vr7, vr0, vr7, vr9 -+ vst vr6, a5, 96 - -- vst $vr7, a5, 112 -+ vst vr7, a5, 112 - addi.d a5, a5, 128 - bne a1, a6, L(un_loop) - L(un_less_128): - blt a2, t8, L(un_less_64) - - -- vld $vr2, a1, 0 -- vld $vr3, a1, 16 -- vshuf.b $vr4, $vr1, $vr0, $vr9 -- vld $vr0, a1, 32 -+ vld vr2, a1, 0 -+ vld vr3, a1, 16 -+ vshuf.b vr4, vr1, vr0, vr9 -+ vld vr0, a1, 32 - -- vst $vr4, a5, 0 -+ vst vr4, a5, 0 - addi.d a2, a2, -64 -- vshuf.b $vr4, $vr2, $vr1, $vr9 -- vld $vr1, a1, 48 -+ vshuf.b vr4, vr2, vr1, vr9 -+ vld vr1, a1, 48 - - addi.d a1, a1, 64 -- vst $vr4, a5, 16 -- vshuf.b $vr2, $vr3, $vr2, $vr9 -- vshuf.b $vr3, $vr0, $vr3, $vr9 -+ vst vr4, a5, 16 -+ vshuf.b vr2, vr3, vr2, vr9 -+ vshuf.b vr3, vr0, vr3, vr9 - -- vst $vr2, a5, 32 -- vst $vr3, a5, 48 -+ vst vr2, a5, 32 -+ vst vr3, a5, 48 - addi.d a5, a5, 64 - L(un_less_64): - blt a2, t7, L(un_less_32) - - -- vshuf.b $vr3, $vr1, $vr0, $vr9 -- vld $vr0, a1, 0 -- vst $vr3, a5, 0 -+ vshuf.b vr3, vr1, vr0, vr9 -+ vld vr0, a1, 0 -+ vst vr3, a5, 0 - addi.d a2, a2, -32 - -- vshuf.b $vr3, $vr0, $vr1, $vr9 -- vld $vr1, a1, 16 -+ vshuf.b vr3, vr0, vr1, vr9 -+ vld vr1, a1, 16 - addi.d a1, a1, 32 -- vst $vr3, a5, 16 -+ vst vr3, a5, 16 - - addi.d a5, a5, 32 - L(un_less_32): - blt a2, t6, L(un_less_16) -- vshuf.b $vr2, $vr1, $vr0, $vr9 -- vor.v $vr0, $vr1, $vr1 -+ vshuf.b vr2, vr1, vr0, vr9 -+ vor.v vr0, vr1, vr1 - -- vld $vr1, a1, 0 -- vst $vr2, a5, 0 -+ vld vr1, a1, 0 -+ vst vr2, a5, 0 - addi.d a5, a5, 16 - L(un_less_16): -- vld $vr2, a4, -16 -+ vld vr2, a4, -16 - - -- vshuf.b $vr0, $vr1, $vr0, $vr9 -- vst $vr0, a5, 0 -- vst $vr2, a3, -16 -+ vshuf.b vr0, vr1, vr0, vr9 -+ vst vr0, a5, 0 -+ vst vr2, a3, -16 - jr ra - - L(copy_back): - addi.d t0, a3, -1 -- vld $vr2, a4, -16 -+ vld vr2, a4, -16 - andi t0, t0, 0xf - addi.d t0, t0, 1 # in case a3 is already aligned, load 16bytes and store 16bytes - -@@ -320,9 +320,9 @@ L(copy_back): - andi t1, a4, 0xf - bnez t1, L(back_unaligned) - -- vld $vr0, a4, -16 -+ vld vr0, a4, -16 - addi.d a2, a2, -16 -- vst $vr2, a3, -16 -+ vst vr2, a3, -16 - andi t2, a2, 0x7f - - -@@ -333,70 +333,70 @@ L(copy_back): - - sub.d a6, a4, t3 - L(back_al_loop): -- vld $vr1, a4, -32 -- vld $vr2, a4, -48 -- vld 
$vr3, a4, -64 -+ vld vr1, a4, -32 -+ vld vr2, a4, -48 -+ vld vr3, a4, -64 - -- vld $vr4, a4, -80 -- vld $vr5, a4, -96 -- vld $vr6, a4, -112 -- vld $vr7, a4, -128 -+ vld vr4, a4, -80 -+ vld vr5, a4, -96 -+ vld vr6, a4, -112 -+ vld vr7, a4, -128 - -- vst $vr0, a3, -16 -- vld $vr0, a4, -144 -+ vst vr0, a3, -16 -+ vld vr0, a4, -144 - addi.d a4, a4, -128 -- vst $vr1, a3, -32 -+ vst vr1, a3, -32 - - -- vst $vr2, a3, -48 -- vst $vr3, a3, -64 -- vst $vr4, a3, -80 -- vst $vr5, a3, -96 -+ vst vr2, a3, -48 -+ vst vr3, a3, -64 -+ vst vr4, a3, -80 -+ vst vr5, a3, -96 - -- vst $vr6, a3, -112 -- vst $vr7, a3, -128 -+ vst vr6, a3, -112 -+ vst vr7, a3, -128 - addi.d a3, a3, -128 - bne a4, a6, L(back_al_loop) - - L(back_al_less_128): - blt a2, t8, L(back_al_less_64) -- vld $vr1, a4, -32 -- vld $vr2, a4, -48 -- vld $vr3, a4, -64 -+ vld vr1, a4, -32 -+ vld vr2, a4, -48 -+ vld vr3, a4, -64 - - addi.d a2, a2, -64 -- vst $vr0, a3, -16 -- vld $vr0, a4, -80 -+ vst vr0, a3, -16 -+ vld vr0, a4, -80 - addi.d a4, a4, -64 - - -- vst $vr1, a3, -32 -- vst $vr2, a3, -48 -- vst $vr3, a3, -64 -+ vst vr1, a3, -32 -+ vst vr2, a3, -48 -+ vst vr3, a3, -64 - addi.d a3, a3, -64 - - L(back_al_less_64): - blt a2, t7, L(back_al_less_32) -- vld $vr1, a4, -32 -+ vld vr1, a4, -32 - addi.d a2, a2, -32 -- vst $vr0, a3, -16 -+ vst vr0, a3, -16 - -- vld $vr0, a4, -48 -- vst $vr1, a3, -32 -+ vld vr0, a4, -48 -+ vst vr1, a3, -32 - addi.d a3, a3, -32 - addi.d a4, a4, -32 - - L(back_al_less_32): - blt a2, t6, L(back_al_less_16) -- vst $vr0, a3, -16 -- vld $vr0, a4, -32 -+ vst vr0, a3, -16 -+ vld vr0, a4, -32 - addi.d a3, a3, -16 - - - L(back_al_less_16): -- vld $vr1, a1, 0 -- vst $vr0, a3, -16 -- vst $vr1, a0, 0 -+ vld vr1, a1, 0 -+ vst vr0, a3, -16 -+ vst vr1, a0, 0 - jr ra - - L(magic_num_2): -@@ -405,18 +405,18 @@ L(magic_num_2): - L(back_unaligned): - pcaddi t2, -4 - bstrins.d a4, zero, 3, 0 -- vld $vr8, t2, 0 -- vld $vr0, a4, 0 -+ vld vr8, t2, 0 -+ vld vr0, a4, 0 - -- vld $vr1, a4, -16 -+ vld vr1, a4, -16 - addi.d a2, a2, -16 -- vst $vr2, a3, -16 -+ vst vr2, a3, -16 - sub.d a3, a3, t0 - - -- vreplgr2vr.b $vr9, t1 -+ vreplgr2vr.b vr9, t1 - andi t2, a2, 0x7f -- vadd.b $vr9, $vr9, $vr8 -+ vadd.b vr9, vr9, vr8 - addi.d a4, a4, -16 - - beq t2, a2, L(back_un_less_128) -@@ -425,92 +425,92 @@ L(back_unaligned): - sub.d a6, a4, t3 - - L(back_un_loop): -- vld $vr2, a4, -16 -- vld $vr3, a4, -32 -- vld $vr4, a4, -48 -+ vld vr2, a4, -16 -+ vld vr3, a4, -32 -+ vld vr4, a4, -48 - -- vld $vr5, a4, -64 -- vld $vr6, a4, -80 -- vld $vr7, a4, -96 -- vshuf.b $vr8, $vr0, $vr1, $vr9 -+ vld vr5, a4, -64 -+ vld vr6, a4, -80 -+ vld vr7, a4, -96 -+ vshuf.b vr8, vr0, vr1, vr9 - - -- vld $vr0, a4, -112 -- vst $vr8, a3, -16 -- vshuf.b $vr8, $vr1, $vr2, $vr9 -- vld $vr1, a4, -128 -+ vld vr0, a4, -112 -+ vst vr8, a3, -16 -+ vshuf.b vr8, vr1, vr2, vr9 -+ vld vr1, a4, -128 - -- vst $vr8, a3, -32 -+ vst vr8, a3, -32 - addi.d a4, a4, -128 -- vshuf.b $vr2, $vr2, $vr3, $vr9 -- vshuf.b $vr3, $vr3, $vr4, $vr9 -+ vshuf.b vr2, vr2, vr3, vr9 -+ vshuf.b vr3, vr3, vr4, vr9 - -- vst $vr2, a3, -48 -- vshuf.b $vr4, $vr4, $vr5, $vr9 -- vst $vr3, a3, -64 -- vshuf.b $vr5, $vr5, $vr6, $vr9 -+ vst vr2, a3, -48 -+ vshuf.b vr4, vr4, vr5, vr9 -+ vst vr3, a3, -64 -+ vshuf.b vr5, vr5, vr6, vr9 - -- vst $vr4, a3, -80 -- vshuf.b $vr6, $vr6, $vr7, $vr9 -- vst $vr5, a3, -96 -- vshuf.b $vr7, $vr7, $vr0, $vr9 -+ vst vr4, a3, -80 -+ vshuf.b vr6, vr6, vr7, vr9 -+ vst vr5, a3, -96 -+ vshuf.b vr7, vr7, vr0, vr9 - - -- vst $vr6, a3, -112 -- vst $vr7, a3, -128 -+ vst vr6, a3, -112 -+ vst vr7, a3, -128 - 
addi.d a3, a3, -128 - bne a4, a6, L(back_un_loop) - - L(back_un_less_128): - blt a2, t8, L(back_un_less_64) -- vld $vr2, a4, -16 -- vld $vr3, a4, -32 -- vshuf.b $vr4, $vr0, $vr1, $vr9 -+ vld vr2, a4, -16 -+ vld vr3, a4, -32 -+ vshuf.b vr4, vr0, vr1, vr9 - -- vld $vr0, a4, -48 -- vst $vr4, a3, -16 -+ vld vr0, a4, -48 -+ vst vr4, a3, -16 - addi.d a2, a2, -64 -- vshuf.b $vr4, $vr1, $vr2, $vr9 -+ vshuf.b vr4, vr1, vr2, vr9 - -- vld $vr1, a4, -64 -+ vld vr1, a4, -64 - addi.d a4, a4, -64 -- vst $vr4, a3, -32 -- vshuf.b $vr2, $vr2, $vr3, $vr9 -+ vst vr4, a3, -32 -+ vshuf.b vr2, vr2, vr3, vr9 - - -- vshuf.b $vr3, $vr3, $vr0, $vr9 -- vst $vr2, a3, -48 -- vst $vr3, a3, -64 -+ vshuf.b vr3, vr3, vr0, vr9 -+ vst vr2, a3, -48 -+ vst vr3, a3, -64 - addi.d a3, a3, -64 - - L(back_un_less_64): - blt a2, t7, L(back_un_less_32) -- vshuf.b $vr3, $vr0, $vr1, $vr9 -- vld $vr0, a4, -16 -- vst $vr3, a3, -16 -+ vshuf.b vr3, vr0, vr1, vr9 -+ vld vr0, a4, -16 -+ vst vr3, a3, -16 - - addi.d a2, a2, -32 -- vshuf.b $vr3, $vr1, $vr0, $vr9 -- vld $vr1, a4, -32 -+ vshuf.b vr3, vr1, vr0, vr9 -+ vld vr1, a4, -32 - addi.d a4, a4, -32 - -- vst $vr3, a3, -32 -+ vst vr3, a3, -32 - addi.d a3, a3, -32 - L(back_un_less_32): - blt a2, t6, L(back_un_less_16) -- vshuf.b $vr2, $vr0, $vr1, $vr9 -+ vshuf.b vr2, vr0, vr1, vr9 - - -- vor.v $vr0, $vr1, $vr1 -- vld $vr1, a4, -16 -- vst $vr2, a3, -16 -+ vor.v vr0, vr1, vr1 -+ vld vr1, a4, -16 -+ vst vr2, a3, -16 - addi.d a3, a3, -16 - - L(back_un_less_16): -- vld $vr2, a1, 0 -- vshuf.b $vr0, $vr0, $vr1, $vr9 -- vst $vr0, a3, -16 -- vst $vr2, a0, 0 -+ vld vr2, a1, 0 -+ vshuf.b vr0, vr0, vr1, vr9 -+ vst vr0, a3, -16 -+ vst vr2, a0, 0 - - jr ra - END(MEMMOVE_NAME) -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -index 9ecd0257..41554552 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -@@ -21,56 +21,56 @@ LEAF(MEMRCHR, 6) - - bstrins.d a3, zero, 5, 0 - addi.d t1, t1, 1 # len for unaligned address -- xvld $xr0, a3, 0 -- xvld $xr1, a3, 32 -+ xvld xr0, a3, 0 -+ xvld xr1, a3, 32 - - sub.d t2, zero, t1 - li.d t3, -1 -- xvreplgr2vr.b $xr2, a1 -+ xvreplgr2vr.b xr2, a1 - andi t4, a0, 0x3f - - srl.d t2, t3, t2 -- xvseq.b $xr0, $xr0, $xr2 -- xvseq.b $xr1, $xr1, $xr2 -- xvmsknz.b $xr0, $xr0 -+ xvseq.b xr0, xr0, xr2 -+ xvseq.b xr1, xr1, xr2 -+ xvmsknz.b xr0, xr0 - - -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr3, $xr0, 4 -- xvpickve.w $xr4, $xr1, 4 -- vilvl.h $vr0, $vr3, $vr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr3, xr0, 4 -+ xvpickve.w xr4, xr1, 4 -+ vilvl.h vr0, vr3, vr0 - -- vilvl.h $vr1, $vr4, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -- movfr2gr.d t0, $f0 -+ vilvl.h vr1, vr4, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - and t0, t0, t2 - - bltu a2, t1, L(end) - bnez t0, L(found) - bstrins.d a0, zero, 5, 0 - L(loop): -- xvld $xr0, a3, -64 -+ xvld xr0, a3, -64 - -- xvld $xr1, a3, -32 -+ xvld xr1, a3, -32 - addi.d a3, a3, -64 -- xvseq.b $xr0, $xr0, $xr2 -- xvseq.b $xr1, $xr1, $xr2 -+ xvseq.b xr0, xr0, xr2 -+ xvseq.b xr1, xr1, xr2 - - - beq a0, a3, L(out) -- xvmax.bu $xr3, $xr0, $xr1 -- xvseteqz.v $fcc0, $xr3 -- bcnez $fcc0, L(loop) -+ xvmax.bu xr3, xr0, xr1 -+ xvseteqz.v fcc0, xr3 -+ bcnez fcc0, L(loop) - -- xvmsknz.b $xr0, $xr0 -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr3, $xr0, 4 -- xvpickve.w $xr4, $xr1, 4 -+ xvmsknz.b xr0, xr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr3, xr0, 4 -+ xvpickve.w xr4, xr1, 4 - -- vilvl.h $vr0, $vr3, $vr0 -- vilvl.h $vr1, $vr4, $vr1 -- vilvl.w $vr0, $vr1, 
$vr0 -- movfr2gr.d t0, $f0 -+ vilvl.h vr0, vr3, vr0 -+ vilvl.h vr1, vr4, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - - L(found): - addi.d a0, a3, 63 -@@ -80,15 +80,15 @@ L(found): - - - L(out): -- xvmsknz.b $xr0, $xr0 -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr3, $xr0, 4 -- xvpickve.w $xr4, $xr1, 4 -- -- vilvl.h $vr0, $vr3, $vr0 -- vilvl.h $vr1, $vr4, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -- movfr2gr.d t0, $f0 -+ xvmsknz.b xr0, xr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr3, xr0, 4 -+ xvpickve.w xr4, xr1, 4 -+ -+ vilvl.h vr0, vr3, vr0 -+ vilvl.h vr1, vr4, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - - L(end): - sll.d t2, t3, t4 -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -index 4bdc18d8..4a302cac 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -@@ -19,46 +19,46 @@ LEAF(MEMRCHR, 6) - - bstrins.d a3, zero, 4, 0 - addi.d t1, t1, 1 # len for unaligned address -- vld $vr0, a3, 0 -- vld $vr1, a3, 16 -+ vld vr0, a3, 0 -+ vld vr1, a3, 16 - - sub.d t2, zero, t1 - li.d t3, -1 -- vreplgr2vr.b $vr2, a1 -+ vreplgr2vr.b vr2, a1 - andi t4, a0, 0x1f - - srl.d t2, t3, t2 -- vseq.b $vr0, $vr0, $vr2 -- vseq.b $vr1, $vr1, $vr2 -- vmsknz.b $vr0, $vr0 -+ vseq.b vr0, vr0, vr2 -+ vseq.b vr1, vr1, vr2 -+ vmsknz.b vr0, vr0 - - -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - and t0, t0, t2 - - bltu a2, t1, L(end) - bnez t0, L(found) - bstrins.d a0, zero, 4, 0 - L(loop): -- vld $vr0, a3, -32 -+ vld vr0, a3, -32 - -- vld $vr1, a3, -16 -+ vld vr1, a3, -16 - addi.d a3, a3, -32 -- vseq.b $vr0, $vr0, $vr2 -- vseq.b $vr1, $vr1, $vr2 -+ vseq.b vr0, vr0, vr2 -+ vseq.b vr1, vr1, vr2 - - beq a0, a3, L(out) -- vmax.bu $vr3, $vr0, $vr1 -- vseteqz.v $fcc0, $vr3 -- bcnez $fcc0, L(loop) -+ vmax.bu vr3, vr0, vr1 -+ vseteqz.v fcc0, vr3 -+ bcnez fcc0, L(loop) - - -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - - L(found): - addi.d a0, a3, 31 -@@ -67,10 +67,10 @@ L(found): - jr ra - - L(out): -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - - L(end): - sll.d t2, t3, t4 -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -index b53c0b7b..5e4908dc 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -@@ -14,7 +14,7 @@ - LEAF(MEMSET, 6) - li.d t1, 32 - move a3, a0 -- xvreplgr2vr.b $xr0, a1 -+ xvreplgr2vr.b xr0, a1 - add.d a4, a0, a2 - - bgeu t1, a2, L(less_32bytes) # len <= 32 -@@ -24,46 +24,46 @@ LEAF(MEMSET, 6) - - L(less_128bytes): - bgeu t2, a2, L(less_64bytes) # len <= 64 -- xvst $xr0, a3, 0 -- xvst $xr0, a3, 32 -- xvst $xr0, a4, -32 -+ xvst xr0, a3, 0 -+ xvst xr0, a3, 32 -+ xvst xr0, a4, -32 - -- xvst $xr0, a4, -64 -+ xvst xr0, a4, -64 - jr ra - L(less_64bytes): -- xvst $xr0, a3, 0 -- xvst $xr0, a4, -32 -+ xvst xr0, a3, 0 -+ xvst xr0, a4, -32 - - - jr ra - L(less_32bytes): - srli.d t0, a2, 4 - beqz t0, L(less_16bytes) -- vst $vr0, a3, 0 -+ vst vr0, a3, 0 - -- vst $vr0, a4, -16 -+ vst vr0, a4, -16 - jr ra - L(less_16bytes): - srli.d t0, a2, 3 - beqz t0, L(less_8bytes) - -- vstelm.d $vr0, a3, 0, 0 -- vstelm.d $vr0, 
a4, -8, 0 -+ vstelm.d vr0, a3, 0, 0 -+ vstelm.d vr0, a4, -8, 0 - jr ra - L(less_8bytes): - srli.d t0, a2, 2 - - beqz t0, L(less_4bytes) -- vstelm.w $vr0, a3, 0, 0 -- vstelm.w $vr0, a4, -4, 0 -+ vstelm.w vr0, a3, 0, 0 -+ vstelm.w vr0, a4, -4, 0 - jr ra - - - L(less_4bytes): - srli.d t0, a2, 1 - beqz t0, L(less_2bytes) -- vstelm.h $vr0, a3, 0, 0 -- vstelm.h $vr0, a4, -2, 0 -+ vstelm.h vr0, a3, 0, 0 -+ vstelm.h vr0, a4, -2, 0 - - jr ra - L(less_2bytes): -@@ -73,7 +73,7 @@ L(less_1bytes): - jr ra - - L(long_bytes): -- xvst $xr0, a3, 0 -+ xvst xr0, a3, 0 - bstrins.d a3, zero, 4, 0 - addi.d a3, a3, 32 - sub.d a2, a4, a3 -@@ -85,15 +85,15 @@ L(long_bytes): - - - L(loop_256): -- xvst $xr0, a3, 0 -- xvst $xr0, a3, 32 -- xvst $xr0, a3, 64 -- xvst $xr0, a3, 96 -+ xvst xr0, a3, 0 -+ xvst xr0, a3, 32 -+ xvst xr0, a3, 64 -+ xvst xr0, a3, 96 - -- xvst $xr0, a3, 128 -- xvst $xr0, a3, 160 -- xvst $xr0, a3, 192 -- xvst $xr0, a3, 224 -+ xvst xr0, a3, 128 -+ xvst xr0, a3, 160 -+ xvst xr0, a3, 192 -+ xvst xr0, a3, 224 - - addi.d a3, a3, 256 - bne a3, t0, L(loop_256) -@@ -101,26 +101,26 @@ L(long_end): - bltu a2, t3, L(end_less_128) - addi.d a2, a2, -128 - -- xvst $xr0, a3, 0 -- xvst $xr0, a3, 32 -- xvst $xr0, a3, 64 -- xvst $xr0, a3, 96 -+ xvst xr0, a3, 0 -+ xvst xr0, a3, 32 -+ xvst xr0, a3, 64 -+ xvst xr0, a3, 96 - - - addi.d a3, a3, 128 - L(end_less_128): - bltu a2, t2, L(end_less_64) - addi.d a2, a2, -64 -- xvst $xr0, a3, 0 -+ xvst xr0, a3, 0 - -- xvst $xr0, a3, 32 -+ xvst xr0, a3, 32 - addi.d a3, a3, 64 - L(end_less_64): - bltu a2, t1, L(end_less_32) -- xvst $xr0, a3, 0 -+ xvst xr0, a3, 0 - - L(end_less_32): -- xvst $xr0, a4, -32 -+ xvst xr0, a4, -32 - jr ra - END(MEMSET) - -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -index 7ab85283..67b279c8 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -@@ -14,7 +14,7 @@ - LEAF(MEMSET, 6) - li.d t1, 16 - move a3, a0 -- vreplgr2vr.b $vr0, a1 -+ vreplgr2vr.b vr0, a1 - add.d a4, a0, a2 - - bgeu t1, a2, L(less_16bytes) # len <= 16 -@@ -24,48 +24,48 @@ LEAF(MEMSET, 6) - - L(less_64bytes): - bgeu t2, a2, L(less_32bytes) # len <= 32 -- vst $vr0, a3, 0 -- vst $vr0, a3, 16 -- vst $vr0, a4, -32 -+ vst vr0, a3, 0 -+ vst vr0, a3, 16 -+ vst vr0, a4, -32 - -- vst $vr0, a4, -16 -+ vst vr0, a4, -16 - jr ra - L(less_32bytes): -- vst $vr0, a3, 0 -- vst $vr0, a4, -16 -+ vst vr0, a3, 0 -+ vst vr0, a4, -16 - - - jr ra - L(less_16bytes): - srli.d t0, a2, 3 - beqz t0, L(less_8bytes) -- vstelm.d $vr0, a3, 0, 0 -+ vstelm.d vr0, a3, 0, 0 - -- vstelm.d $vr0, a4, -8, 0 -+ vstelm.d vr0, a4, -8, 0 - jr ra - L(less_8bytes): - srli.d t0, a2, 2 - beqz t0, L(less_4bytes) - -- vstelm.w $vr0, a3, 0, 0 -- vstelm.w $vr0, a4, -4, 0 -+ vstelm.w vr0, a3, 0, 0 -+ vstelm.w vr0, a4, -4, 0 - jr ra - L(less_4bytes): - srli.d t0, a2, 1 - - beqz t0, L(less_2bytes) -- vstelm.h $vr0, a3, 0, 0 -- vstelm.h $vr0, a4, -2, 0 -+ vstelm.h vr0, a3, 0, 0 -+ vstelm.h vr0, a4, -2, 0 - jr ra - - - L(less_2bytes): - beqz a2, L(less_1bytes) -- vstelm.b $vr0, a3, 0, 0 -+ vstelm.b vr0, a3, 0, 0 - L(less_1bytes): - jr ra - L(long_bytes): -- vst $vr0, a3, 0 -+ vst vr0, a3, 0 - - bstrins.d a3, zero, 3, 0 - addi.d a3, a3, 16 -@@ -77,43 +77,43 @@ L(long_bytes): - sub.d t0, a4, t0 - - L(loop_128): -- vst $vr0, a3, 0 -+ vst vr0, a3, 0 - -- vst $vr0, a3, 16 -- vst $vr0, a3, 32 -- vst $vr0, a3, 48 -- vst $vr0, a3, 64 -+ vst vr0, a3, 16 -+ vst vr0, a3, 32 -+ vst vr0, a3, 48 -+ vst vr0, a3, 64 - - -- vst $vr0, a3, 80 
-- vst $vr0, a3, 96 -- vst $vr0, a3, 112 -+ vst vr0, a3, 80 -+ vst vr0, a3, 96 -+ vst vr0, a3, 112 - addi.d a3, a3, 128 - - bne a3, t0, L(loop_128) - L(long_end): - bltu a2, t3, L(end_less_64) - addi.d a2, a2, -64 -- vst $vr0, a3, 0 -+ vst vr0, a3, 0 - -- vst $vr0, a3, 16 -- vst $vr0, a3, 32 -- vst $vr0, a3, 48 -+ vst vr0, a3, 16 -+ vst vr0, a3, 32 -+ vst vr0, a3, 48 - addi.d a3, a3, 64 - - L(end_less_64): - bltu a2, t2, L(end_less_32) - addi.d a2, a2, -32 -- vst $vr0, a3, 0 -- vst $vr0, a3, 16 -+ vst vr0, a3, 0 -+ vst vr0, a3, 16 - - addi.d a3, a3, 32 - L(end_less_32): - bltu a2, t1, L(end_less_16) -- vst $vr0, a3, 0 -+ vst vr0, a3, 0 - - L(end_less_16): -- vst $vr0, a4, -16 -+ vst vr0, a4, -16 - jr ra - END(MEMSET) - -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -index 1e94aa50..856f99ce 100644 ---- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -@@ -8,15 +8,15 @@ - LEAF(RAWMEMCHR, 6) - move a2, a0 - bstrins.d a0, zero, 4, 0 -- xvld $xr0, a0, 0 -- xvreplgr2vr.b $xr1, a1 -+ xvld xr0, a0, 0 -+ xvreplgr2vr.b xr1, a1 - -- xvseq.b $xr0, $xr0, $xr1 -- xvmsknz.b $xr0, $xr0 -- xvpickve.w $xr2, $xr0, 4 -- vilvl.h $vr0, $vr2, $vr0 -+ xvseq.b xr0, xr0, xr1 -+ xvmsknz.b xr0, xr0 -+ xvpickve.w xr2, xr0, 4 -+ vilvl.h vr0, vr2, vr0 - -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - sra.w t0, t0, a2 - beqz t0, L(loop) - ctz.w t0, t0 -@@ -27,17 +27,17 @@ LEAF(RAWMEMCHR, 6) - nop - - L(loop): -- xvld $xr0, a0, 32 -+ xvld xr0, a0, 32 - addi.d a0, a0, 32 -- xvseq.b $xr0, $xr0, $xr1 -- xvseteqz.v $fcc0, $xr0 -+ xvseq.b xr0, xr0, xr1 -+ xvseteqz.v fcc0, xr0 - -- bcnez $fcc0, L(loop) -- xvmsknz.b $xr0, $xr0 -- xvpickve.w $xr1, $xr0, 4 -- vilvl.h $vr0, $vr1, $vr0 -+ bcnez fcc0, L(loop) -+ xvmsknz.b xr0, xr0 -+ xvpickve.w xr1, xr0, 4 -+ vilvl.h vr0, vr1, vr0 - -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - ctz.w t0, t0 - add.d a0, a0, t0 - jr ra -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -index 40bf0cda..7e864e96 100644 ---- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -@@ -14,17 +14,17 @@ - LEAF(RAWMEMCHR, 6) - move a2, a0 - bstrins.d a0, zero, 4, 0 -- vld $vr0, a0, 0 -- vld $vr1, a0, 16 -+ vld vr0, a0, 0 -+ vld vr1, a0, 16 - -- vreplgr2vr.b $vr2, a1 -- vseq.b $vr0, $vr0, $vr2 -- vseq.b $vr1, $vr1, $vr2 -- vmsknz.b $vr0, $vr0 -+ vreplgr2vr.b vr2, a1 -+ vseq.b vr0, vr0, vr2 -+ vseq.b vr1, vr1, vr2 -+ vmsknz.b vr0, vr0 - -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - sra.w t0, t0, a2 - - beqz t0, L(loop) -@@ -34,15 +34,15 @@ LEAF(RAWMEMCHR, 6) - - - L(loop): -- vld $vr0, a0, 32 -+ vld vr0, a0, 32 - addi.d a0, a0, 16 -- vseq.b $vr0, $vr0, $vr2 -- vseteqz.v $fcc0, $vr0 -+ vseq.b vr0, vr0, vr2 -+ vseteqz.v fcc0, vr0 - -- bcnez $fcc0, L(loop) -+ bcnez fcc0, L(loop) - addi.d a0, a0, 16 -- vfrstpi.b $vr0, $vr0, 0 -- vpickve2gr.bu t0, $vr0, 0 -+ vfrstpi.b vr0, vr0, 0 -+ vpickve2gr.bu t0, vr0, 0 - - add.d a0, a0, t0 - jr ra -diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -index 0836f590..53832de7 100644 ---- a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -@@ -18,67 +18,67 @@ L(magic_num): - ENTRY_NO_ALIGN(STPCPY) - pcaddi t0, -4 - andi a4, a1, 0xf -- vld 
$vr1, t0, 0 -+ vld vr1, t0, 0 - beqz a4, L(load_start) - - xor t0, a1, a4 -- vld $vr0, t0, 0 -- vreplgr2vr.b $vr2, a4 -- vadd.b $vr2, $vr2, $vr1 -+ vld vr0, t0, 0 -+ vreplgr2vr.b vr2, a4 -+ vadd.b vr2, vr2, vr1 - -- vshuf.b $vr0, $vr2, $vr0, $vr2 -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(end) -+ vshuf.b vr0, vr2, vr0, vr2 -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(end) - L(load_start): -- vld $vr0, a1, 0 -+ vld vr0, a1, 0 - - - li.d t1, 16 - andi a3, a0, 0xf -- vsetanyeqz.b $fcc0, $vr0 -+ vsetanyeqz.b fcc0, vr0 - sub.d t0, t1, a3 - -- bcnez $fcc0, L(end) -+ bcnez fcc0, L(end) - add.d a1, a1, t0 -- vst $vr0, a0, 0 -+ vst vr0, a0, 0 - add.d a0, a0, t0 - - bne a3, a4, L(unaligned) -- vld $vr0, a1, 0 -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(end) -+ vld vr0, a1, 0 -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(end) - - L(loop): -- vst $vr0, a0, 0 -- vld $vr0, a1, 16 -+ vst vr0, a0, 0 -+ vld vr0, a1, 16 - addi.d a0, a0, 16 - addi.d a1, a1, 16 - - -- vsetanyeqz.b $fcc0, $vr0 -- bceqz $fcc0, L(loop) -- vmsknz.b $vr1, $vr0 -- movfr2gr.s t0, $f1 -+ vsetanyeqz.b fcc0, vr0 -+ bceqz fcc0, L(loop) -+ vmsknz.b vr1, vr0 -+ movfr2gr.s t0, fa1 - - cto.w t0, t0 - add.d a1, a1, t0 -- vld $vr0, a1, -15 -+ vld vr0, a1, -15 - add.d a0, a0, t0 - -- vst $vr0, a0, -15 -+ vst vr0, a0, -15 - jr ra - L(end): -- vseqi.b $vr1, $vr0, 0 -- vfrstpi.b $vr1, $vr1, 0 -+ vseqi.b vr1, vr0, 0 -+ vfrstpi.b vr1, vr1, 0 - -- vpickve2gr.bu t0, $vr1, 0 -+ vpickve2gr.bu t0, vr1, 0 - addi.d t0, t0, 1 - L(end_16): - andi t1, t0, 16 - beqz t1, L(end_8) - - -- vst $vr0, a0, 0 -+ vst vr0, a0, 0 - addi.d a0, a0, 15 - jr ra - L(end_8): -@@ -89,26 +89,26 @@ L(end_8): - andi t5, t0, 1 - beqz t2, L(end_4) - -- vstelm.d $vr0, a0, 0, 0 -+ vstelm.d vr0, a0, 0, 0 - addi.d a0, a0, 8 -- vbsrl.v $vr0, $vr0, 8 -+ vbsrl.v vr0, vr0, 8 - L(end_4): - beqz t3, L(end_2) - -- vstelm.w $vr0, a0, 0, 0 -+ vstelm.w vr0, a0, 0, 0 - addi.d a0, a0, 4 -- vbsrl.v $vr0, $vr0, 4 -+ vbsrl.v vr0, vr0, 4 - L(end_2): - beqz t4, L(end_1) - - -- vstelm.h $vr0, a0, 0, 0 -+ vstelm.h vr0, a0, 0, 0 - addi.d a0, a0, 2 -- vbsrl.v $vr0, $vr0, 2 -+ vbsrl.v vr0, vr0, 2 - L(end_1): - beqz t5, L(out) - -- vstelm.b $vr0, a0, 0, 0 -+ vstelm.b vr0, a0, 0, 0 - addi.d a0, a0, 1 - L(out): - addi.d a0, a0, -1 -@@ -120,49 +120,49 @@ L(unaligned): - andi a3, a1, 0xf - bstrins.d a1, zero, 3, 0 - -- vld $vr2, a1, 0 -- vreplgr2vr.b $vr3, a3 -- vslt.b $vr4, $vr1, $vr3 -- vor.v $vr0, $vr2, $vr4 -+ vld vr2, a1, 0 -+ vreplgr2vr.b vr3, a3 -+ vslt.b vr4, vr1, vr3 -+ vor.v vr0, vr2, vr4 - - -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(un_first_end) -- vld $vr0, a1, 16 -- vadd.b $vr3, $vr3, $vr1 -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(un_first_end) -+ vld vr0, a1, 16 -+ vadd.b vr3, vr3, vr1 - - addi.d a1, a1, 16 -- vshuf.b $vr4, $vr0, $vr2, $vr3 -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(un_end) -+ vshuf.b vr4, vr0, vr2, vr3 -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(un_end) - - L(un_loop): -- vor.v $vr2, $vr0, $vr0 -- vld $vr0, a1, 16 -- vst $vr4, a0, 0 -+ vor.v vr2, vr0, vr0 -+ vld vr0, a1, 16 -+ vst vr4, a0, 0 - addi.d a1, a1, 16 - - addi.d a0, a0, 16 -- vshuf.b $vr4, $vr0, $vr2, $vr3 -- vsetanyeqz.b $fcc0, $vr0 -- bceqz $fcc0, L(un_loop) -+ vshuf.b vr4, vr0, vr2, vr3 -+ vsetanyeqz.b fcc0, vr0 -+ bceqz fcc0, L(un_loop) - - - L(un_end): -- vsetanyeqz.b $fcc0, $vr4 -- bcnez $fcc0, 1f -- vst $vr4, a0, 0 -+ vsetanyeqz.b fcc0, vr4 -+ bcnez fcc0, 1f -+ vst vr4, a0, 0 - 1: -- vmsknz.b $vr1, $vr0 -+ vmsknz.b vr1, vr0 - -- movfr2gr.s t0, $f1 -+ movfr2gr.s t0, fa1 - cto.w t0, t0 - add.d a1, a1, t0 -- 
vld $vr0, a1, -15 -+ vld vr0, a1, -15 - - add.d a0, a0, t0 - sub.d a0, a0, a3 -- vst $vr0, a0, 1 -+ vst vr0, a0, 1 - addi.d a0, a0, 16 - - jr ra -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -index 3f6ad915..fab6edc7 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -@@ -16,18 +16,18 @@ - LEAF(STRCHR, 6) - andi t1, a0, 0x1f - bstrins.d a0, zero, 4, 0 -- xvld $xr0, a0, 0 -+ xvld xr0, a0, 0 - li.d t2, -1 - -- xvreplgr2vr.b $xr1, a1 -+ xvreplgr2vr.b xr1, a1 - sll.d t1, t2, t1 -- xvxor.v $xr2, $xr0, $xr1 -- xvmin.bu $xr0, $xr0, $xr2 -+ xvxor.v xr2, xr0, xr1 -+ xvmin.bu xr0, xr0, xr2 - -- xvmsknz.b $xr0, $xr0 -- xvpickve.w $xr3, $xr0, 4 -- vilvl.h $vr0, $vr3, $vr0 -- movfr2gr.s t0, $f0 -+ xvmsknz.b xr0, xr0 -+ xvpickve.w xr3, xr0, 4 -+ vilvl.h vr0, vr3, vr0 -+ movfr2gr.s t0, fa0 - - orn t0, t0, t1 - bne t0, t2, L(end) -@@ -36,37 +36,37 @@ LEAF(STRCHR, 6) - - - L(loop): -- xvld $xr0, a0, 0 -- xvxor.v $xr2, $xr0, $xr1 -- xvmin.bu $xr0, $xr0, $xr2 -- xvsetanyeqz.b $fcc0, $xr0 -+ xvld xr0, a0, 0 -+ xvxor.v xr2, xr0, xr1 -+ xvmin.bu xr0, xr0, xr2 -+ xvsetanyeqz.b fcc0, xr0 - -- bcnez $fcc0, L(loop_end) -- xvld $xr0, a0, 32 -+ bcnez fcc0, L(loop_end) -+ xvld xr0, a0, 32 - addi.d a0, a0, 64 -- xvxor.v $xr2, $xr0, $xr1 -+ xvxor.v xr2, xr0, xr1 - -- xvmin.bu $xr0, $xr0, $xr2 -- xvsetanyeqz.b $fcc0, $xr0 -- bceqz $fcc0, L(loop) -+ xvmin.bu xr0, xr0, xr2 -+ xvsetanyeqz.b fcc0, xr0 -+ bceqz fcc0, L(loop) - addi.d a0, a0, -32 - - L(loop_end): -- xvmsknz.b $xr0, $xr0 -- xvpickve.w $xr1, $xr0, 4 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ xvmsknz.b xr0, xr0 -+ xvpickve.w xr1, xr0, 4 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - - - L(end): - cto.w t0, t0 - add.d a0, a0, t0 - #ifndef AS_STRCHRNUL -- vreplgr2vr.b $vr0, t0 -- xvpermi.q $xr3, $xr2, 1 -+ vreplgr2vr.b vr0, t0 -+ xvpermi.q xr3, xr2, 1 - -- vshuf.b $vr0, $vr3, $vr2, $vr0 -- vpickve2gr.bu t0, $vr0, 0 -+ vshuf.b vr0, vr3, vr2, vr0 -+ vpickve2gr.bu t0, vr0, 0 - masknez a0, a0, t0 - #endif - jr ra -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -index 4ad9a4ad..ebeb332e 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -@@ -16,16 +16,16 @@ - LEAF(STRCHR, 6) - andi t1, a0, 0xf - bstrins.d a0, zero, 3, 0 -- vld $vr0, a0, 0 -+ vld vr0, a0, 0 - li.d t2, -1 - -- vreplgr2vr.b $vr1, a1 -+ vreplgr2vr.b vr1, a1 - sll.d t3, t2, t1 -- vxor.v $vr2, $vr0, $vr1 -- vmin.bu $vr0, $vr0, $vr2 -+ vxor.v vr2, vr0, vr1 -+ vmin.bu vr0, vr0, vr2 - -- vmsknz.b $vr0, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr0, vr0 -+ movfr2gr.s t0, fa0 - ext.w.h t0, t0 - orn t0, t0, t3 - -@@ -34,23 +34,23 @@ L(found): - cto.w t0, t0 - add.d a0, a0, t0 - #ifndef AS_STRCHRNUL -- vreplve.b $vr2, $vr2, t0 -- vpickve2gr.bu t1, $vr2, 0 -+ vreplve.b vr2, vr2, t0 -+ vpickve2gr.bu t1, vr2, 0 - masknez a0, a0, t1 - #endif - jr ra - - - L(loop): -- vld $vr0, a0, 16 -+ vld vr0, a0, 16 - addi.d a0, a0, 16 -- vxor.v $vr2, $vr0, $vr1 -- vmin.bu $vr0, $vr0, $vr2 -+ vxor.v vr2, vr0, vr1 -+ vmin.bu vr0, vr0, vr2 - -- vsetanyeqz.b $fcc0, $vr0 -- bceqz $fcc0, L(loop) -- vmsknz.b $vr0, $vr0 -- movfr2gr.s t0, $f0 -+ vsetanyeqz.b fcc0, vr0 -+ bceqz fcc0, L(loop) -+ vmsknz.b vr0, vr0 -+ movfr2gr.s t0, fa0 - - b L(found) - END(STRCHR) -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -index 
c86e3ecd..c6e1110c 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -@@ -20,45 +20,45 @@ L(magic_num): - ENTRY_NO_ALIGN(STRCMP) - pcaddi t0, -4 - andi a2, a0, 0xf -- vld $vr2, t0, 0 -+ vld vr2, t0, 0 - andi a3, a1, 0xf - - bne a2, a3, L(unaligned) - bstrins.d a0, zero, 3, 0 - bstrins.d a1, zero, 3, 0 -- vld $vr0, a0, 0 -+ vld vr0, a0, 0 - -- vld $vr1, a1, 0 -- vreplgr2vr.b $vr3, a2 -- vslt.b $vr2, $vr2, $vr3 -- vseq.b $vr3, $vr0, $vr1 -+ vld vr1, a1, 0 -+ vreplgr2vr.b vr3, a2 -+ vslt.b vr2, vr2, vr3 -+ vseq.b vr3, vr0, vr1 - - -- vmin.bu $vr3, $vr0, $vr3 -- vor.v $vr3, $vr3, $vr2 -- vsetanyeqz.b $fcc0, $vr3 -- bcnez $fcc0, L(al_out) -+ vmin.bu vr3, vr0, vr3 -+ vor.v vr3, vr3, vr2 -+ vsetanyeqz.b fcc0, vr3 -+ bcnez fcc0, L(al_out) - - L(al_loop): -- vld $vr0, a0, 16 -- vld $vr1, a1, 16 -+ vld vr0, a0, 16 -+ vld vr1, a1, 16 - addi.d a0, a0, 16 - addi.d a1, a1, 16 - -- vseq.b $vr3, $vr0, $vr1 -- vmin.bu $vr3, $vr0, $vr3 -- vsetanyeqz.b $fcc0, $vr3 -- bceqz $fcc0, L(al_loop) -+ vseq.b vr3, vr0, vr1 -+ vmin.bu vr3, vr0, vr3 -+ vsetanyeqz.b fcc0, vr3 -+ bceqz fcc0, L(al_loop) - - L(al_out): -- vseqi.b $vr3, $vr3, 0 -- vfrstpi.b $vr3, $vr3, 0 -- vshuf.b $vr0, $vr0, $vr0, $vr3 -- vshuf.b $vr1, $vr1, $vr1, $vr3 -+ vseqi.b vr3, vr3, 0 -+ vfrstpi.b vr3, vr3, 0 -+ vshuf.b vr0, vr0, vr0, vr3 -+ vshuf.b vr1, vr1, vr1, vr3 - - -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - sub.d a0, t0, t1 - jr ra - -@@ -79,52 +79,52 @@ L(unaligned): - bstrins.d a1, zero, 3, 0 - - -- vld $vr0, a0, 0 -- vld $vr3, a1, 0 -- vreplgr2vr.b $vr4, a2 -- vreplgr2vr.b $vr5, a3 -+ vld vr0, a0, 0 -+ vld vr3, a1, 0 -+ vreplgr2vr.b vr4, a2 -+ vreplgr2vr.b vr5, a3 - -- vslt.b $vr7, $vr2, $vr4 -- vsub.b $vr4, $vr4, $vr5 -- vaddi.bu $vr6, $vr2, 16 -- vsub.b $vr6, $vr6, $vr4 -+ vslt.b vr7, vr2, vr4 -+ vsub.b vr4, vr4, vr5 -+ vaddi.bu vr6, vr2, 16 -+ vsub.b vr6, vr6, vr4 - -- vshuf.b $vr1, $vr3, $vr3, $vr6 -- vseq.b $vr4, $vr0, $vr1 -- vmin.bu $vr4, $vr0, $vr4 -- vor.v $vr4, $vr4, $vr7 -+ vshuf.b vr1, vr3, vr3, vr6 -+ vseq.b vr4, vr0, vr1 -+ vmin.bu vr4, vr0, vr4 -+ vor.v vr4, vr4, vr7 - -- vsetanyeqz.b $fcc0, $vr4 -- bcnez $fcc0, L(un_end) -- vslt.b $vr5, $vr2, $vr5 -- vor.v $vr3, $vr3, $vr5 -+ vsetanyeqz.b fcc0, vr4 -+ bcnez fcc0, L(un_end) -+ vslt.b vr5, vr2, vr5 -+ vor.v vr3, vr3, vr5 - - - L(un_loop): -- vld $vr0, a0, 16 -- vsetanyeqz.b $fcc0, $vr3 -- bcnez $fcc0, L(remaining_end) -- vor.v $vr1, $vr3, $vr3 -+ vld vr0, a0, 16 -+ vsetanyeqz.b fcc0, vr3 -+ bcnez fcc0, L(remaining_end) -+ vor.v vr1, vr3, vr3 - -- vld $vr3, a1, 16 -+ vld vr3, a1, 16 - addi.d a0, a0, 16 - addi.d a1, a1, 16 -- vshuf.b $vr1, $vr3, $vr1, $vr6 -+ vshuf.b vr1, vr3, vr1, vr6 - -- vseq.b $vr4, $vr0, $vr1 -- vmin.bu $vr4, $vr0, $vr4 -- vsetanyeqz.b $fcc0, $vr4 -- bceqz $fcc0, L(un_loop) -+ vseq.b vr4, vr0, vr1 -+ vmin.bu vr4, vr0, vr4 -+ vsetanyeqz.b fcc0, vr4 -+ bceqz fcc0, L(un_loop) - - L(un_end): -- vseqi.b $vr4, $vr4, 0 -- vfrstpi.b $vr4, $vr4, 0 -- vshuf.b $vr0, $vr0, $vr0, $vr4 -- vshuf.b $vr1, $vr1, $vr1, $vr4 -+ vseqi.b vr4, vr4, 0 -+ vfrstpi.b vr4, vr4, 0 -+ vshuf.b vr0, vr0, vr0, vr4 -+ vshuf.b vr1, vr1, vr1, vr4 - - -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - sub.d t3, t0, t1 - sub.d t4, t1, t0 - -@@ -134,9 +134,9 @@ L(un_end): - jr ra - - L(remaining_end): -- vshuf.b $vr1, $vr3, $vr3, $vr6 -- vseq.b $vr4, $vr0, $vr1 -- vmin.bu $vr4, $vr4, $vr0 -+ 
vshuf.b vr1, vr3, vr3, vr6 -+ vseq.b vr4, vr0, vr1 -+ vmin.bu vr4, vr4, vr0 - b L(un_end) - END(STRCMP) - -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -index dbc061ad..52d77fa3 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -@@ -21,61 +21,61 @@ L(magic_num): - ENTRY_NO_ALIGN(STRCPY) - pcaddi t0, -4 - andi a4, a1, 0xf -- vld $vr1, t0, 0 -+ vld vr1, t0, 0 - move a2, a0 - - beqz a4, L(load_start) - xor t0, a1, a4 -- vld $vr0, t0, 0 -- vreplgr2vr.b $vr2, a4 -+ vld vr0, t0, 0 -+ vreplgr2vr.b vr2, a4 - -- vadd.b $vr2, $vr2, $vr1 -- vshuf.b $vr0, $vr2, $vr0, $vr2 -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(end) -+ vadd.b vr2, vr2, vr1 -+ vshuf.b vr0, vr2, vr0, vr2 -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(end) - - - L(load_start): -- vld $vr0, a1, 0 -+ vld vr0, a1, 0 - li.d t1, 16 - andi a3, a2, 0xf -- vsetanyeqz.b $fcc0, $vr0 -+ vsetanyeqz.b fcc0, vr0 - - sub.d t0, t1, a3 -- bcnez $fcc0, L(end) -+ bcnez fcc0, L(end) - add.d a1, a1, t0 -- vst $vr0, a2, 0 -+ vst vr0, a2, 0 - - andi a3, a1, 0xf - add.d a2, a2, t0 - bnez a3, L(unaligned) -- vld $vr0, a1, 0 -+ vld vr0, a1, 0 - -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(end) -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(end) - L(loop): -- vst $vr0, a2, 0 -- vld $vr0, a1, 16 -+ vst vr0, a2, 0 -+ vld vr0, a1, 16 - - - addi.d a2, a2, 16 - addi.d a1, a1, 16 -- vsetanyeqz.b $fcc0, $vr0 -- bceqz $fcc0, L(loop) -+ vsetanyeqz.b fcc0, vr0 -+ bceqz fcc0, L(loop) - -- vmsknz.b $vr1, $vr0 -- movfr2gr.s t0, $f1 -+ vmsknz.b vr1, vr0 -+ movfr2gr.s t0, fa1 - cto.w t0, t0 - add.d a1, a1, t0 - -- vld $vr0, a1, -15 -+ vld vr0, a1, -15 - add.d a2, a2, t0 -- vst $vr0, a2, -15 -+ vst vr0, a2, -15 - jr ra - - L(end): -- vmsknz.b $vr1, $vr0 -- movfr2gr.s t0, $f1 -+ vmsknz.b vr1, vr0 -+ movfr2gr.s t0, fa1 - cto.w t0, t0 - addi.d t0, t0, 1 - -@@ -83,7 +83,7 @@ L(end): - L(end_16): - andi t1, t0, 16 - beqz t1, L(end_8) -- vst $vr0, a2, 0 -+ vst vr0, a2, 0 - jr ra - - L(end_8): -@@ -93,74 +93,74 @@ L(end_8): - andi t5, t0, 1 - - beqz t2, L(end_4) -- vstelm.d $vr0, a2, 0, 0 -+ vstelm.d vr0, a2, 0, 0 - addi.d a2, a2, 8 -- vbsrl.v $vr0, $vr0, 8 -+ vbsrl.v vr0, vr0, 8 - - L(end_4): - beqz t3, L(end_2) -- vstelm.w $vr0, a2, 0, 0 -+ vstelm.w vr0, a2, 0, 0 - addi.d a2, a2, 4 -- vbsrl.v $vr0, $vr0, 4 -+ vbsrl.v vr0, vr0, 4 - - - L(end_2): - beqz t4, L(end_1) -- vstelm.h $vr0, a2, 0, 0 -+ vstelm.h vr0, a2, 0, 0 - addi.d a2, a2, 2 -- vbsrl.v $vr0, $vr0, 2 -+ vbsrl.v vr0, vr0, 2 - - L(end_1): - beqz t5, L(out) -- vstelm.b $vr0, a2, 0, 0 -+ vstelm.b vr0, a2, 0, 0 - L(out): - jr ra - L(unaligned): - bstrins.d a1, zero, 3, 0 - -- vld $vr2, a1, 0 -- vreplgr2vr.b $vr3, a3 -- vslt.b $vr4, $vr1, $vr3 -- vor.v $vr0, $vr2, $vr4 -+ vld vr2, a1, 0 -+ vreplgr2vr.b vr3, a3 -+ vslt.b vr4, vr1, vr3 -+ vor.v vr0, vr2, vr4 - -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(un_first_end) -- vld $vr0, a1, 16 -- vadd.b $vr3, $vr3, $vr1 -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(un_first_end) -+ vld vr0, a1, 16 -+ vadd.b vr3, vr3, vr1 - - - addi.d a1, a1, 16 -- vshuf.b $vr4, $vr0, $vr2, $vr3 -- vsetanyeqz.b $fcc0, $vr0 -- bcnez $fcc0, L(un_end) -+ vshuf.b vr4, vr0, vr2, vr3 -+ vsetanyeqz.b fcc0, vr0 -+ bcnez fcc0, L(un_end) - - L(un_loop): -- vor.v $vr2, $vr0, $vr0 -- vld $vr0, a1, 16 -- vst $vr4, a2, 0 -+ vor.v vr2, vr0, vr0 -+ vld vr0, a1, 16 -+ vst vr4, a2, 0 - addi.d a1, a1, 16 - - addi.d a2, a2, 16 -- vshuf.b $vr4, $vr0, $vr2, $vr3 -- vsetanyeqz.b $fcc0, $vr0 -- bceqz $fcc0, 
L(un_loop) -+ vshuf.b vr4, vr0, vr2, vr3 -+ vsetanyeqz.b fcc0, vr0 -+ bceqz fcc0, L(un_loop) - - L(un_end): -- vsetanyeqz.b $fcc0, $vr4 -- bcnez $fcc0, 1f -- vst $vr4, a2, 0 -+ vsetanyeqz.b fcc0, vr4 -+ bcnez fcc0, 1f -+ vst vr4, a2, 0 - 1: -- vmsknz.b $vr1, $vr0 -+ vmsknz.b vr1, vr0 - - -- movfr2gr.s t0, $f1 -+ movfr2gr.s t0, fa1 - cto.w t0, t0 - add.d a1, a1, t0 -- vld $vr0, a1, -15 -+ vld vr0, a1, -15 - - add.d a2, a2, t0 - sub.d a2, a2, a3 -- vst $vr0, a2, 1 -+ vst vr0, a2, 1 - jr ra - - L(un_first_end): -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -index fd6c002d..fc25dd50 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -@@ -17,12 +17,12 @@ LEAF(STRLEN, 6) - move a1, a0 - bstrins.d a0, zero, 4, 0 - li.d t1, -1 -- xvld $xr0, a0, 0 -+ xvld xr0, a0, 0 - -- xvmsknz.b $xr0, $xr0 -- xvpickve.w $xr1, $xr0, 4 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 # sign extend -+ xvmsknz.b xr0, xr0 -+ xvpickve.w xr1, xr0, 4 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 # sign extend - - sra.w t0, t0, a1 - beq t0, t1, L(loop) -@@ -30,18 +30,18 @@ LEAF(STRLEN, 6) - jr ra - - L(loop): -- xvld $xr0, a0, 32 -+ xvld xr0, a0, 32 - addi.d a0, a0, 32 -- xvsetanyeqz.b $fcc0, $xr0 -- bceqz $fcc0, L(loop) -+ xvsetanyeqz.b fcc0, xr0 -+ bceqz fcc0, L(loop) - - -- xvmsknz.b $xr0, $xr0 -+ xvmsknz.b xr0, xr0 - sub.d a0, a0, a1 -- xvpickve.w $xr1, $xr0, 4 -- vilvl.h $vr0, $vr1, $vr0 -+ xvpickve.w xr1, xr0, 4 -+ vilvl.h vr0, vr1, vr0 - -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - cto.w t0, t0 - add.d a0, a0, t0 - jr ra -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -index 6f311506..45c3db93 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -@@ -16,15 +16,15 @@ - LEAF(STRLEN, 6) - move a1, a0 - bstrins.d a0, zero, 4, 0 -- vld $vr0, a0, 0 -- vld $vr1, a0, 16 -+ vld vr0, a0, 0 -+ vld vr1, a0, 16 - - li.d t1, -1 -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 - -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - sra.w t0, t0, a1 - beq t0, t1, L(loop) - cto.w a0, t0 -@@ -36,19 +36,19 @@ LEAF(STRLEN, 6) - - - L(loop): -- vld $vr0, a0, 32 -- vld $vr1, a0, 48 -+ vld vr0, a0, 32 -+ vld vr1, a0, 48 - addi.d a0, a0, 32 -- vmin.bu $vr2, $vr0, $vr1 -+ vmin.bu vr2, vr0, vr1 - -- vsetanyeqz.b $fcc0, $vr2 -- bceqz $fcc0, L(loop) -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -+ vsetanyeqz.b fcc0, vr2 -+ bceqz fcc0, L(loop) -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 - -- vilvl.h $vr0, $vr1, $vr0 -+ vilvl.h vr0, vr1, vr0 - sub.d a0, a0, a1 -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - cto.w t0, t0 - - add.d a0, a0, t0 -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -index 2c6f9614..21f3e689 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -@@ -22,7 +22,7 @@ ENTRY_NO_ALIGN(STRNCMP) - beqz a2, L(ret0) - pcaddi t0, -5 - andi a3, a0, 0xf -- vld $vr2, t0, 0 -+ vld vr2, t0, 0 - - andi a4, a1, 0xf - li.d t2, 16 -@@ -30,57 +30,57 @@ ENTRY_NO_ALIGN(STRNCMP) - xor t0, a0, a3 - - xor t1, a1, a4 -- vld $vr0, t0, 0 -- vld $vr1, t1, 0 -- vreplgr2vr.b $vr3, a3 -+ vld vr0, t0, 0 -+ vld vr1, t1, 0 -+ vreplgr2vr.b vr3, a3 - - - sub.d t2, t2, a3 -- vadd.b $vr3, $vr3, $vr2 -- vshuf.b $vr0, 
$vr3, $vr0, $vr3 -- vshuf.b $vr1, $vr3, $vr1, $vr3 -+ vadd.b vr3, vr3, vr2 -+ vshuf.b vr0, vr3, vr0, vr3 -+ vshuf.b vr1, vr3, vr1, vr3 - -- vseq.b $vr3, $vr0, $vr1 -- vmin.bu $vr3, $vr0, $vr3 -+ vseq.b vr3, vr0, vr1 -+ vmin.bu vr3, vr0, vr3 - bgeu t2, a2, L(al_early_end) -- vsetanyeqz.b $fcc0, $vr3 -+ vsetanyeqz.b fcc0, vr3 - -- bcnez $fcc0, L(al_end) -+ bcnez fcc0, L(al_end) - add.d a3, a0, a2 - addi.d a4, a3, -1 - bstrins.d a4, zero, 3, 0 - - sub.d a2, a3, a4 - L(al_loop): -- vld $vr0, t0, 16 -- vld $vr1, t1, 16 -+ vld vr0, t0, 16 -+ vld vr1, t1, 16 - addi.d t0, t0, 16 - - - addi.d t1, t1, 16 -- vseq.b $vr3, $vr0, $vr1 -- vmin.bu $vr3, $vr0, $vr3 -+ vseq.b vr3, vr0, vr1 -+ vmin.bu vr3, vr0, vr3 - beq t0, a4, L(al_early_end) - -- vsetanyeqz.b $fcc0, $vr3 -- bceqz $fcc0, L(al_loop) -+ vsetanyeqz.b fcc0, vr3 -+ bceqz fcc0, L(al_loop) - L(al_end): -- vseqi.b $vr3, $vr3, 0 -- vfrstpi.b $vr3, $vr3, 0 -+ vseqi.b vr3, vr3, 0 -+ vfrstpi.b vr3, vr3, 0 - -- vshuf.b $vr0, $vr0, $vr0, $vr3 -- vshuf.b $vr1, $vr1, $vr1, $vr3 -- vpickve2gr.bu t0, $vr0, 0 -- vpickve2gr.bu t1, $vr1, 0 -+ vshuf.b vr0, vr0, vr0, vr3 -+ vshuf.b vr1, vr1, vr1, vr3 -+ vpickve2gr.bu t0, vr0, 0 -+ vpickve2gr.bu t1, vr1, 0 - - sub.d a0, t0, t1 - jr ra - L(al_early_end): -- vreplgr2vr.b $vr4, a2 -- vslt.b $vr4, $vr2, $vr4 -+ vreplgr2vr.b vr4, a2 -+ vslt.b vr4, vr2, vr4 - - -- vorn.v $vr3, $vr3, $vr4 -+ vorn.v vr3, vr3, vr4 - b L(al_end) - L(unaligned): - slt a5, a3, a4 -@@ -94,64 +94,64 @@ L(unaligned): - andi a4, a1, 0xf - xor t0, a0, a3 - xor t1, a1, a4 -- vld $vr0, t0, 0 -+ vld vr0, t0, 0 - -- vld $vr3, t1, 0 -+ vld vr3, t1, 0 - sub.d t2, t2, a3 -- vreplgr2vr.b $vr4, a3 -- vreplgr2vr.b $vr5, a4 -+ vreplgr2vr.b vr4, a3 -+ vreplgr2vr.b vr5, a4 - - -- vaddi.bu $vr6, $vr2, 16 -- vsub.b $vr7, $vr4, $vr5 -- vsub.b $vr6, $vr6, $vr7 -- vadd.b $vr4, $vr2, $vr4 -+ vaddi.bu vr6, vr2, 16 -+ vsub.b vr7, vr4, vr5 -+ vsub.b vr6, vr6, vr7 -+ vadd.b vr4, vr2, vr4 - -- vshuf.b $vr1, $vr3, $vr3, $vr6 -- vshuf.b $vr0, $vr7, $vr0, $vr4 -- vshuf.b $vr1, $vr7, $vr1, $vr4 -- vseq.b $vr4, $vr0, $vr1 -+ vshuf.b vr1, vr3, vr3, vr6 -+ vshuf.b vr0, vr7, vr0, vr4 -+ vshuf.b vr1, vr7, vr1, vr4 -+ vseq.b vr4, vr0, vr1 - -- vmin.bu $vr4, $vr0, $vr4 -+ vmin.bu vr4, vr0, vr4 - bgeu t2, a2, L(un_early_end) -- vsetanyeqz.b $fcc0, $vr4 -- bcnez $fcc0, L(un_end) -+ vsetanyeqz.b fcc0, vr4 -+ bcnez fcc0, L(un_end) - - add.d a6, a0, a2 -- vslt.b $vr5, $vr2, $vr5 -+ vslt.b vr5, vr2, vr5 - addi.d a7, a6, -1 -- vor.v $vr3, $vr3, $vr5 -+ vor.v vr3, vr3, vr5 - - - bstrins.d a7, zero, 3, 0 - sub.d a2, a6, a7 - L(un_loop): -- vld $vr0, t0, 16 -+ vld vr0, t0, 16 - addi.d t0, t0, 16 - -- vsetanyeqz.b $fcc0, $vr3 -- bcnez $fcc0, L(has_zero) -+ vsetanyeqz.b fcc0, vr3 -+ bcnez fcc0, L(has_zero) - beq t0, a7, L(end_with_len) -- vor.v $vr1, $vr3, $vr3 -+ vor.v vr1, vr3, vr3 - -- vld $vr3, t1, 16 -+ vld vr3, t1, 16 - addi.d t1, t1, 16 -- vshuf.b $vr1, $vr3, $vr1, $vr6 -- vseq.b $vr4, $vr0, $vr1 -+ vshuf.b vr1, vr3, vr1, vr6 -+ vseq.b vr4, vr0, vr1 - -- vmin.bu $vr4, $vr0, $vr4 -- vsetanyeqz.b $fcc0, $vr4 -- bceqz $fcc0, L(un_loop) -+ vmin.bu vr4, vr0, vr4 -+ vsetanyeqz.b fcc0, vr4 -+ bceqz fcc0, L(un_loop) - L(un_end): -- vseqi.b $vr4, $vr4, 0 -+ vseqi.b vr4, vr4, 0 - - -- vfrstpi.b $vr4, $vr4, 0 -- vshuf.b $vr0, $vr0, $vr0, $vr4 -- vshuf.b $vr1, $vr1, $vr1, $vr4 -- vpickve2gr.bu t0, $vr0, 0 -+ vfrstpi.b vr4, vr4, 0 -+ vshuf.b vr0, vr0, vr0, vr4 -+ vshuf.b vr1, vr1, vr1, vr4 -+ vpickve2gr.bu t0, vr0, 0 - -- vpickve2gr.bu t1, $vr1, 0 -+ vpickve2gr.bu t1, vr1, 0 - sub.d t2, t0, t1 - 
sub.d t3, t1, t0 - masknez t0, t2, a5 -@@ -160,30 +160,30 @@ L(un_end): - or a0, t0, t1 - jr ra - L(has_zero): -- vshuf.b $vr1, $vr3, $vr3, $vr6 -+ vshuf.b vr1, vr3, vr3, vr6 - -- vseq.b $vr4, $vr0, $vr1 -- vmin.bu $vr4, $vr0, $vr4 -+ vseq.b vr4, vr0, vr1 -+ vmin.bu vr4, vr0, vr4 - bne t0, a7, L(un_end) - L(un_early_end): -- vreplgr2vr.b $vr5, a2 -+ vreplgr2vr.b vr5, a2 - -- vslt.b $vr5, $vr2, $vr5 -- vorn.v $vr4, $vr4, $vr5 -+ vslt.b vr5, vr2, vr5 -+ vorn.v vr4, vr4, vr5 - b L(un_end) - L(end_with_len): - sub.d a6, a3, a4 - - bgeu a6, a2, 1f -- vld $vr4, t1, 16 -+ vld vr4, t1, 16 - 1: -- vshuf.b $vr1, $vr4, $vr3, $vr6 -- vseq.b $vr4, $vr0, $vr1 -+ vshuf.b vr1, vr4, vr3, vr6 -+ vseq.b vr4, vr0, vr1 - -- vmin.bu $vr4, $vr0, $vr4 -- vreplgr2vr.b $vr5, a2 -- vslt.b $vr5, $vr2, $vr5 -- vorn.v $vr4, $vr4, $vr5 -+ vmin.bu vr4, vr0, vr4 -+ vreplgr2vr.b vr5, a2 -+ vslt.b vr5, vr2, vr5 -+ vorn.v vr4, vr4, vr5 - - b L(un_end) - L(ret0): -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -index 910b52fe..6410a907 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -@@ -19,23 +19,23 @@ LEAF(STRNLEN, 6) - li.d t3, 65 - sub.d a2, a0, t1 - -- xvld $xr0, a2, 0 -- xvld $xr1, a2, 32 -+ xvld xr0, a2, 0 -+ xvld xr1, a2, 32 - sub.d t1, t3, t1 - move a3, a0 - - sltu t1, a1, t1 -- xvmsknz.b $xr0, $xr0 -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr2, $xr0, 4 -+ xvmsknz.b xr0, xr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr2, xr0, 4 - -- xvpickve.w $xr3, $xr1, 4 -- vilvl.h $vr0, $vr2, $vr0 -- vilvl.h $vr1, $vr3, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -+ xvpickve.w xr3, xr1, 4 -+ vilvl.h vr0, vr2, vr0 -+ vilvl.h vr1, vr3, vr1 -+ vilvl.w vr0, vr1, vr0 - - -- movfr2gr.d t0, $f0 -+ movfr2gr.d t0, fa0 - sra.d t0, t0, a0 - orn t1, t1, t0 - bnez t1, L(end) -@@ -46,26 +46,26 @@ LEAF(STRNLEN, 6) - bstrins.d a4, zero, 5, 0 - - L(loop): -- xvld $xr0, a0, 64 -- xvld $xr1, a0, 96 -+ xvld xr0, a0, 64 -+ xvld xr1, a0, 96 - addi.d a0, a0, 64 - beq a0, a4, L(out) - -- xvmin.bu $xr2, $xr0, $xr1 -- xvsetanyeqz.b $fcc0, $xr2 -- bceqz $fcc0, L(loop) -+ xvmin.bu xr2, xr0, xr1 -+ xvsetanyeqz.b fcc0, xr2 -+ bceqz fcc0, L(loop) - L(out): -- xvmsknz.b $xr0, $xr0 -+ xvmsknz.b xr0, xr0 - - -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr2, $xr0, 4 -- xvpickve.w $xr3, $xr1, 4 -- vilvl.h $vr0, $vr2, $vr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr2, xr0, 4 -+ xvpickve.w xr3, xr1, 4 -+ vilvl.h vr0, vr2, vr0 - -- vilvl.h $vr1, $vr3, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -- movfr2gr.d t0, $f0 -+ vilvl.h vr1, vr3, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - L(end): - sub.d a0, a0, a3 - -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -index db0e90ff..9250a0cd 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -@@ -19,17 +19,17 @@ LEAF(STRNLEN, 6) - li.d t3, 33 - sub.d a2, a0, t1 - -- vld $vr0, a2, 0 -- vld $vr1, a2, 16 -+ vld vr0, a2, 0 -+ vld vr1, a2, 16 - sub.d t1, t3, t1 - move a3, a0 - - sltu t1, a1, t1 -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 - -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - sra.w t0, t0, a0 - orn t1, t1, t0 - bnez t1, L(end) -@@ -41,20 +41,20 @@ LEAF(STRNLEN, 6) - bstrins.d a4, zero, 4, 0 - - L(loop): -- vld $vr0, a0, 32 -- vld $vr1, a0, 48 -+ vld vr0, a0, 32 -+ vld vr1, a0, 48 - addi.d a0, a0, 32 - beq 
a0, a4, L(out) - -- vmin.bu $vr2, $vr0, $vr1 -- vsetanyeqz.b $fcc0, $vr2 -- bceqz $fcc0, L(loop) -+ vmin.bu vr2, vr0, vr1 -+ vsetanyeqz.b fcc0, vr2 -+ bceqz fcc0, L(loop) - L(out): -- vmsknz.b $vr0, $vr0 -+ vmsknz.b vr0, vr0 - -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - L(end): - sub.d a0, a0, a3 - -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -index 325458ff..990be973 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -@@ -14,45 +14,45 @@ - LEAF(STRRCHR, 6) - andi t1, a0, 0x3f - bstrins.d a0, zero, 5, 0 -- xvld $xr0, a0, 0 -- xvld $xr1, a0, 32 -+ xvld xr0, a0, 0 -+ xvld xr1, a0, 32 - - li.d t2, -1 -- xvreplgr2vr.b $xr4, a1 -+ xvreplgr2vr.b xr4, a1 - move a2, zero - sll.d t3, t2, t1 - - addi.d a0, a0, 63 -- xvseq.b $xr2, $xr0, $xr4 -- xvseq.b $xr3, $xr1, $xr4 -- xvmsknz.b $xr0, $xr0 -+ xvseq.b xr2, xr0, xr4 -+ xvseq.b xr3, xr1, xr4 -+ xvmsknz.b xr0, xr0 - -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr5, $xr0, 4 -- xvpickve.w $xr6, $xr1, 4 -- vilvl.h $vr0, $vr5, $vr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr5, xr0, 4 -+ xvpickve.w xr6, xr1, 4 -+ vilvl.h vr0, vr5, vr0 - - -- vilvl.h $vr1, $vr6, $vr1 -- xvmsknz.b $xr2, $xr2 -- xvmsknz.b $xr3, $xr3 -- xvpickve.w $xr5, $xr2, 4 -+ vilvl.h vr1, vr6, vr1 -+ xvmsknz.b xr2, xr2 -+ xvmsknz.b xr3, xr3 -+ xvpickve.w xr5, xr2, 4 - -- xvpickve.w $xr6, $xr3, 4 -- vilvl.h $vr2, $vr5, $vr2 -- vilvl.h $vr3, $vr6, $vr3 -- vilvl.w $vr0, $vr1, $vr0 -+ xvpickve.w xr6, xr3, 4 -+ vilvl.h vr2, vr5, vr2 -+ vilvl.h vr3, vr6, vr3 -+ vilvl.w vr0, vr1, vr0 - -- vilvl.w $vr1, $vr3, $vr2 -- movfr2gr.d t0, $f0 -- movfr2gr.d t1, $f1 -+ vilvl.w vr1, vr3, vr2 -+ movfr2gr.d t0, fa0 -+ movfr2gr.d t1, fa1 - orn t0, t0, t3 - - and t1, t1, t3 - bne t0, t2, L(end) - L(loop): -- xvld $xr0, a0, 1 -- xvld $xr1, a0, 33 -+ xvld xr0, a0, 1 -+ xvld xr1, a0, 33 - - - clz.d t0, t1 -@@ -62,33 +62,33 @@ L(loop): - - masknez t1, a2, t1 - or a2, t0, t1 -- xvseq.b $xr2, $xr0, $xr4 -- xvseq.b $xr3, $xr1, $xr4 -+ xvseq.b xr2, xr0, xr4 -+ xvseq.b xr3, xr1, xr4 - -- xvmsknz.b $xr2, $xr2 -- xvmsknz.b $xr3, $xr3 -- xvpickve.w $xr5, $xr2, 4 -- xvpickve.w $xr6, $xr3, 4 -+ xvmsknz.b xr2, xr2 -+ xvmsknz.b xr3, xr3 -+ xvpickve.w xr5, xr2, 4 -+ xvpickve.w xr6, xr3, 4 - -- vilvl.h $vr2, $vr5, $vr2 -- vilvl.h $vr3, $vr6, $vr3 -- xvmin.bu $xr5, $xr0, $xr1 -- vilvl.w $vr2, $vr3, $vr2 -+ vilvl.h vr2, vr5, vr2 -+ vilvl.h vr3, vr6, vr3 -+ xvmin.bu xr5, xr0, xr1 -+ vilvl.w vr2, vr3, vr2 - - -- xvsetanyeqz.b $fcc0, $xr5 -- movfr2gr.d t1, $f2 -- bceqz $fcc0, L(loop) -- xvmsknz.b $xr0, $xr0 -+ xvsetanyeqz.b fcc0, xr5 -+ movfr2gr.d t1, fa2 -+ bceqz fcc0, L(loop) -+ xvmsknz.b xr0, xr0 - -- xvmsknz.b $xr1, $xr1 -- xvpickve.w $xr5, $xr0, 4 -- xvpickve.w $xr6, $xr1, 4 -- vilvl.h $vr0, $vr5, $vr0 -+ xvmsknz.b xr1, xr1 -+ xvpickve.w xr5, xr0, 4 -+ xvpickve.w xr6, xr1, 4 -+ vilvl.h vr0, vr5, vr0 - -- vilvl.h $vr1, $vr6, $vr1 -- vilvl.w $vr0, $vr1, $vr0 -- movfr2gr.d t0, $f0 -+ vilvl.h vr1, vr6, vr1 -+ vilvl.w vr0, vr1, vr0 -+ movfr2gr.d t0, fa0 - L(end): - slli.d t3, t2, 1 # shift one more for the last '\0' - -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S -index e082eaab..6aede6ae 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S -@@ -14,35 +14,35 @@ - LEAF(STRRCHR, 6) - andi t1, 
a0, 0x1f - bstrins.d a0, zero, 4, 0 -- vld $vr0, a0, 0 -- vld $vr1, a0, 16 -+ vld vr0, a0, 0 -+ vld vr1, a0, 16 - -- vreplgr2vr.b $vr4, a1 -+ vreplgr2vr.b vr4, a1 - li.d t2, -1 - move a2, zero - addi.d a0, a0, 31 - -- vseq.b $vr2, $vr0, $vr4 -- vseq.b $vr3, $vr1, $vr4 -- vmsknz.b $vr0, $vr0 -- vmsknz.b $vr1, $vr1 -+ vseq.b vr2, vr0, vr4 -+ vseq.b vr3, vr1, vr4 -+ vmsknz.b vr0, vr0 -+ vmsknz.b vr1, vr1 - -- vmsknz.b $vr2, $vr2 -- vmsknz.b $vr3, $vr3 -- vilvl.h $vr0, $vr1, $vr0 -- vilvl.h $vr1, $vr3, $vr2 -+ vmsknz.b vr2, vr2 -+ vmsknz.b vr3, vr3 -+ vilvl.h vr0, vr1, vr0 -+ vilvl.h vr1, vr3, vr2 - - -- movfr2gr.s t0, $f0 -+ movfr2gr.s t0, fa0 - sll.d t3, t2, t1 -- movfr2gr.s t1, $f1 -+ movfr2gr.s t1, fa1 - orn t0, t0, t3 - - and t1, t1, t3 - bne t0, t2, L(end) - L(loop): -- vld $vr0, a0, 1 -- vld $vr1, a0, 17 -+ vld vr0, a0, 1 -+ vld vr1, a0, 17 - - clz.w t0, t1 - sub.d t0, a0, t0 -@@ -51,23 +51,23 @@ L(loop): - - masknez t1, a2, t1 - or a2, t0, t1 -- vseq.b $vr2, $vr0, $vr4 -- vseq.b $vr3, $vr1, $vr4 -+ vseq.b vr2, vr0, vr4 -+ vseq.b vr3, vr1, vr4 - - -- vmsknz.b $vr2, $vr2 -- vmsknz.b $vr3, $vr3 -- vmin.bu $vr5, $vr0, $vr1 -- vilvl.h $vr2, $vr3, $vr2 -+ vmsknz.b vr2, vr2 -+ vmsknz.b vr3, vr3 -+ vmin.bu vr5, vr0, vr1 -+ vilvl.h vr2, vr3, vr2 - -- vsetanyeqz.b $fcc0, $vr5 -- movfr2gr.s t1, $f2 -- bceqz $fcc0, L(loop) -- vmsknz.b $vr0, $vr0 -+ vsetanyeqz.b fcc0, vr5 -+ movfr2gr.s t1, fa2 -+ bceqz fcc0, L(loop) -+ vmsknz.b vr0, vr0 - -- vmsknz.b $vr1, $vr1 -- vilvl.h $vr0, $vr1, $vr0 -- movfr2gr.s t0, $f0 -+ vmsknz.b vr1, vr1 -+ vilvl.h vr0, vr1, vr0 -+ movfr2gr.s t0, fa0 - L(end): - slli.d t3, t2, 1 # shift one more for the last '\0' - -diff --git a/sysdeps/loongarch/lp64/s_cosf.S b/sysdeps/loongarch/lp64/s_cosf.S -index 9fcbe6ca..cb3a4faa 100644 ---- a/sysdeps/loongarch/lp64/s_cosf.S -+++ b/sysdeps/loongarch/lp64/s_cosf.S -@@ -213,9 +213,9 @@ L_even_integer: - fadd.d fa0, fa0, fa1 - fadd.d fa2, fa2, fa3 - fadd.d fa0, fa0, fa2 -- fcmp.sle.d $fcc0, fa0, fa5 -+ fcmp.sle.d fcc0, fa0, fa5 - addi.d t0, t0, 3 -- bcnez $fcc0, L_leq_one -+ bcnez fcc0, L_leq_one - /*L_gt_one:*/ - fld.d fa2, t1, 16 /* 2.0 */ - addi.d t0, t0, 1 -diff --git a/sysdeps/loongarch/lp64/s_sinf.S b/sysdeps/loongarch/lp64/s_sinf.S -index 45d1c4b5..1e77282d 100644 ---- a/sysdeps/loongarch/lp64/s_sinf.S -+++ b/sysdeps/loongarch/lp64/s_sinf.S -@@ -215,9 +215,9 @@ L_even_integer: - fadd.d fa0, fa0, fa1 - fadd.d fa2, fa2, fa3 - fadd.d fa0, fa0, fa2 -- fcmp.sle.d $fcc0, fa0, fa5 -+ fcmp.sle.d fcc0, fa0, fa5 - addi.d t0, t0, 1 -- bcnez $fcc0, L_leq_one -+ bcnez fcc0, L_leq_one - /*L_gt_one:*/ - fld.d fa2, t1, 16 /* 2.0 */ - addi.d t0, t0, 1 -diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h -index 36f00939..b5ee57cf 100644 ---- a/sysdeps/loongarch/sys/regdef.h -+++ b/sysdeps/loongarch/sys/regdef.h -@@ -71,6 +71,14 @@ - # define fs5 $f29 - # define fs6 $f30 - # define fs7 $f31 -+# define fcc0 $fcc0 -+# define fcc1 $fcc1 -+# define fcc2 $fcc2 -+# define fcc3 $fcc3 -+# define fcc4 $fcc4 -+# define fcc5 $fcc5 -+# define fcc6 $fcc6 -+# define fcc7 $fcc7 - - #elif _LOONGARCH_SIM == _ABILP32 - # error ABILP32 not support yet -@@ -78,4 +86,70 @@ - # error noABI - #endif - -+#define vr0 $vr0 -+#define vr1 $vr1 -+#define vr2 $vr2 -+#define vr3 $vr3 -+#define vr4 $vr4 -+#define vr5 $vr5 -+#define vr6 $vr6 -+#define vr7 $vr7 -+#define vr8 $vr8 -+#define vr9 $vr9 -+#define vr10 $vr10 -+#define vr11 $vr11 -+#define vr12 $vr12 -+#define vr13 $vr13 -+#define vr14 $vr14 -+#define vr15 $vr15 -+#define vr16 $vr16 -+#define 
vr17 $vr17 -+#define vr18 $vr18 -+#define vr19 $vr19 -+#define vr20 $vr20 -+#define vr21 $vr21 -+#define vr22 $vr22 -+#define vr23 $vr23 -+#define vr24 $vr24 -+#define vr25 $vr25 -+#define vr26 $vr26 -+#define vr27 $vr27 -+#define vr28 $vr28 -+#define vr29 $vr29 -+#define vr30 $vr30 -+#define vr31 $vr31 -+ -+#define xr0 $xr0 -+#define xr1 $xr1 -+#define xr2 $xr2 -+#define xr3 $xr3 -+#define xr4 $xr4 -+#define xr5 $xr5 -+#define xr6 $xr6 -+#define xr7 $xr7 -+#define xr8 $xr8 -+#define xr9 $xr9 -+#define xr10 $xr10 -+#define xr11 $xr11 -+#define xr12 $xr12 -+#define xr13 $xr13 -+#define xr14 $xr14 -+#define xr15 $xr15 -+#define xr16 $xr16 -+#define xr17 $xr17 -+#define xr18 $xr18 -+#define xr19 $xr19 -+#define xr20 $xr20 -+#define xr21 $xr21 -+#define xr22 $xr22 -+#define xr23 $xr23 -+#define xr24 $xr24 -+#define xr25 $xr25 -+#define xr26 $xr26 -+#define xr27 $xr27 -+#define xr28 $xr28 -+#define xr29 $xr29 -+#define xr30 $xr30 -+#define xr31 $xr31 -+ - #endif /* _SYS_REGDEF_H */ --- -2.33.0 - diff --git a/glibc-2.28-Add-new-struct-user_fp_state-in-user.h.patch b/glibc-2.28-Add-new-struct-user_fp_state-in-user.h.patch deleted file mode 100644 index b7ae1ad..0000000 --- a/glibc-2.28-Add-new-struct-user_fp_state-in-user.h.patch +++ /dev/null @@ -1,29 +0,0 @@ -From dc2d26d52c129c47fa1f16bd0157cd20c6d9a958 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Wed, 21 Jun 2023 11:55:02 +0800 -Subject: [PATCH 08/14] glibc-2.28: Add new struct user_fp_state in user.h - -Change-Id: Idc233cc11c8f76b624dc2891b432f4d02a53cebc -Signed-off-by: ticat_fp ---- - sysdeps/unix/sysv/linux/loongarch/sys/user.h | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/sysdeps/unix/sysv/linux/loongarch/sys/user.h b/sysdeps/unix/sysv/linux/loongarch/sys/user.h -index f9108350..21e340f6 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/sys/user.h -+++ b/sysdeps/unix/sysv/linux/loongarch/sys/user.h -@@ -28,4 +28,10 @@ struct user_regs_struct - uint64_t reserved[11]; - }; - -+struct user_fp_struct { -+ uint64_t fpr[32]; -+ uint64_t fcc; -+ uint32_t fcsr; -+}; -+ - #endif /* _SYS_USER_H */ --- -2.33.0 - diff --git a/glibc-2.28-Fix-ifunc-str-mem-functions-xfail-problem.patch b/glibc-2.28-Fix-ifunc-str-mem-functions-xfail-problem.patch deleted file mode 100644 index ff87ba3..0000000 --- a/glibc-2.28-Fix-ifunc-str-mem-functions-xfail-problem.patch +++ /dev/null @@ -1,162 +0,0 @@ -From 647a0a28e5c9aed2f1fa59bbb7595133e7a4e62f Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Mon, 24 Apr 2023 18:09:55 +0800 -Subject: [PATCH 03/14] glibc-2.28: Fix ifunc str/mem functions xfail problems. 
- -Change-Id: Ibff4229fcfef23c0b19fb94b21a4d17b49eceec6 -Signed-off-by: ticat_fp ---- - .../lp64/multiarch/ifunc-impl-list.c | 76 +++++++++---------- - 1 file changed, 38 insertions(+), 38 deletions(-) - -diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c -index c2b6bbf7..fdeae797 100644 ---- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c -+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c -@@ -36,105 +36,105 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, - size_t i = 0; - - IFUNC_IMPL (i, name, memcpy, -- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_lasx) -- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx) -+ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LSX, __memcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_UAL, __memcpy_unaligned) - IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_aligned) -- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_unaligned) - ) - - IFUNC_IMPL (i, name, memmove, -- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_lasx) -- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_lsx) -+ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_LASX, __memmove_lasx) -+ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_LSX, __memmove_lsx) -+ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_UAL, __memmove_unaligned) - IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_aligned) -- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_unaligned) - ) - - IFUNC_IMPL (i, name, memset, -- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_lasx) -- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_lsx) -+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LASX, __memset_lasx) -+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LSX, __memset_lsx) -+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_UAL, __memset_unaligned) - IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned) -- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_unaligned) - ) - - IFUNC_IMPL (i, name, memchr, -- IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_lasx) -- IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_lsx) -+ IFUNC_IMPL_ADD (array, i, memchr, SUPPORT_LASX, __memchr_lasx) -+ IFUNC_IMPL_ADD (array, i, memchr, SUPPORT_LSX, __memchr_lsx) - IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_aligned) - ) - - IFUNC_IMPL (i, name, memrchr, -- IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_lasx) -- IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_lsx) -+ IFUNC_IMPL_ADD (array, i, memrchr, SUPPORT_LASX, __memrchr_lasx) -+ IFUNC_IMPL_ADD (array, i, memrchr, SUPPORT_LSX, __memrchr_lsx) - IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_generic) - ) - - IFUNC_IMPL (i, name, memcmp, -- IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_lasx) -- IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_lsx) -+ IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LASX, __memcmp_lasx) -+ IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LSX, __memcmp_lsx) - IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_aligned) - ) - - IFUNC_IMPL (i, name, rawmemchr, -- IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_lasx) -- IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_lsx) -+ IFUNC_IMPL_ADD (array, i, rawmemchr, SUPPORT_LASX, __rawmemchr_lasx) -+ IFUNC_IMPL_ADD (array, i, rawmemchr, SUPPORT_LSX, __rawmemchr_lsx) - IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_aligned) - ) - - IFUNC_IMPL (i, name, strchr, -- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_lasx) -- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_lsx) -+ IFUNC_IMPL_ADD (array, i, strchr, 
SUPPORT_LASX, __strchr_lasx) -+ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LSX, __strchr_lsx) -+ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_UAL, __strchr_unaligned) - IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_aligned) -- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_unaligned) - ) - - IFUNC_IMPL (i, name, strrchr, -- IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_lasx) -- IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_lsx) -+ IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LASX, __strrchr_lasx) -+ IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LSX, __strrchr_lsx) - IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_aligned) - ) - - IFUNC_IMPL (i, name, strlen, -- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_lasx) -- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_lsx) -+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, __strlen_lasx) -+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx) -+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_UAL, __strlen_unaligned) - IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned) -- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_unaligned) - ) - - IFUNC_IMPL (i, name, strnlen, -- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_lasx) -- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_lsx) -+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LASX, __strnlen_lasx) -+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LSX, __strnlen_lsx) -+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_UAL, __strnlen_unaligned) - IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_aligned) -- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_unaligned) - ) - - IFUNC_IMPL (i, name, strchrnul, -- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_lasx) -- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_lsx) -+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_LASX, __strchrnul_lasx) -+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_LSX, __strchrnul_lsx) -+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_UAL, __strchrnul_unaligned) - IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_aligned) -- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_unaligned) - ) - - IFUNC_IMPL (i, name, strncmp, -- IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_lsx) -+ IFUNC_IMPL_ADD (array, i, strncmp, SUPPORT_LSX, __strncmp_lsx) -+ IFUNC_IMPL_ADD (array, i, strncmp, SUPPORT_UAL, __strncmp_unaligned) - IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_aligned) -- IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_unaligned) - ) - - IFUNC_IMPL (i, name, strcpy, -- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_LSX, __strcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_UAL, __strcpy_unaligned) - IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_aligned) -- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_unaligned) - ) - - IFUNC_IMPL (i, name, stpcpy, -- IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, stpcpy, SUPPORT_LSX, __stpcpy_lsx) - IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_aligned) - ) - - IFUNC_IMPL (i, name, strcmp, -- IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_lsx) -+ IFUNC_IMPL_ADD (array, i, strcmp, SUPPORT_LSX, __strcmp_lsx) -+ IFUNC_IMPL_ADD (array, i, strcmp, SUPPORT_UAL, __strcmp_unaligned) - IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_aligned) -- IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_unaligned) - ) - - return i; --- -2.33.0 - diff --git a/glibc-2.28-Redefine-macro-LEAF-ENTRY.patch b/glibc-2.28-Redefine-macro-LEAF-ENTRY.patch deleted file mode 100644 index 42b4200..0000000 --- a/glibc-2.28-Redefine-macro-LEAF-ENTRY.patch 
+++ /dev/null @@ -1,57 +0,0 @@ -From 00537d6945e71af8c9b0b1e7c2695f6a9a1ef1f5 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Sun, 25 Jun 2023 16:23:25 +0800 -Subject: [PATCH 09/14] glibc-2.28: Redefine macro LEAF/ENTRY. - - The following usage of macro LEAF/ENTRY are all feasible: - 1. LEAF(fcn) -- the align value of fcn is .align 3 (default value) - 2. LEAF(fcn, 6) -- the align value of fcn is .align 6 - -Change-Id: Ie3df4df8dba5259b665bd0e4702aaab0a09a5f65 -Signed-off-by: ticat_fp ---- - sysdeps/loongarch/sys/asm.h | 15 ++++++++++----- - 1 file changed, 10 insertions(+), 5 deletions(-) - -diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h -index 357a5ba3..734e45ae 100644 ---- a/sysdeps/loongarch/sys/asm.h -+++ b/sysdeps/loongarch/sys/asm.h -@@ -26,16 +26,21 @@ - #endif - - --/* Declare leaf routine. */ --#define LEAF(symbol, aln) \ -+/* Declare leaf routine. -+ The usage of macro LEAF/ENTRY is as follows: -+ 1. LEAF(fcn) -- the align value of fcn is .align 3 (default value) -+ 2. LEAF(fcn, 6) -- the align value of fcn is .align 6 -+*/ -+#define LEAF_IMPL(symbol, aln, ...) \ - .text; \ - .globl symbol; \ - .align aln; \ - .type symbol, @function; \ - symbol: \ -- cfi_startproc; \ -+ cfi_startproc; - --# define ENTRY(symbol, aln) LEAF(symbol, aln) -+#define LEAF(...) LEAF_IMPL(__VA_ARGS__, 3) -+#define ENTRY(...) LEAF(__VA_ARGS__) - - #define LEAF_NO_ALIGN(symbol) \ - .text; \ -@@ -44,7 +49,7 @@ symbol: \ - symbol: \ - cfi_startproc; - --# define ENTRY_NO_ALIGN(symbol) LEAF_NO_ALIGN(symbol) -+#define ENTRY_NO_ALIGN(symbol) LEAF_NO_ALIGN(symbol) - - /* Mark end of function. */ - #undef END --- -2.33.0 - diff --git a/glibc-2.28-Refactor-code-and-fix-bug-in-_dl_runtime_.patch b/glibc-2.28-Refactor-code-and-fix-bug-in-_dl_runtime_.patch deleted file mode 100644 index 075149b..0000000 --- a/glibc-2.28-Refactor-code-and-fix-bug-in-_dl_runtime_.patch +++ /dev/null @@ -1,306 +0,0 @@ -From 27a004c9777340afd86fc0d129f6ffad508bf090 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Tue, 11 Jul 2023 16:09:55 +0800 -Subject: [PATCH 12/14] glibc-2.28: Refactor code and fix bug in - _dl_runtime_resolve. - -Change-Id: I4907e6643ef25b87d7862e957ce9bf6d201da816 -Signed-off-by: ticat_fp ---- - sysdeps/loongarch/dl-machine.h | 8 +- - sysdeps/loongarch/dl-trampoline.S | 7 ++ - sysdeps/loongarch/dl-trampoline.h | 159 +++++++++++++----------------- - sysdeps/loongarch/sys/asm.h | 9 ++ - 4 files changed, 90 insertions(+), 93 deletions(-) - -diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h -index 6e9c6258..ff520a07 100644 ---- a/sysdeps/loongarch/dl-machine.h -+++ b/sysdeps/loongarch/dl-machine.h -@@ -381,9 +381,13 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], - /* If using PLTs, fill in the first two entries of .got.plt. */ - if (l->l_info[DT_JMPREL]) - { -- extern void _dl_runtime_resolve (void) __attribute__ ((visibility ("hidden"))); -+ -+#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float - extern void _dl_runtime_resolve_lasx (void) __attribute__ ((visibility ("hidden"))); - extern void _dl_runtime_resolve_lsx (void) __attribute__ ((visibility ("hidden"))); -+#endif -+ extern void _dl_runtime_resolve (void) __attribute__ ((visibility ("hidden"))); -+ - ElfW(Addr) *gotplt = (ElfW(Addr) *) D_PTR (l, l_info[DT_PLTGOT]); - /* If a library is prelinked but we have to relocate anyway, - we have to be able to undo the prelinking of .got.plt. 
-@@ -391,11 +395,13 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], - if (gotplt[1]) - l->l_mach.plt = gotplt[1] + l->l_addr; - -+#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float - if (SUPPORT_LASX) - gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx; - else if (SUPPORT_LSX) - gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx; - else -+#endif - gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve; - - gotplt[1] = (ElfW(Addr)) l; -diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S -index 5f627a63..78d741f3 100644 ---- a/sysdeps/loongarch/dl-trampoline.S -+++ b/sysdeps/loongarch/dl-trampoline.S -@@ -16,16 +16,23 @@ - License along with the GNU C Library. If not, see - . */ - -+#include -+#include -+ -+#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float - #define USE_LASX - #define _dl_runtime_resolve _dl_runtime_resolve_lasx - #include "dl-trampoline.h" -+#undef FRAME_SIZE - #undef USE_LASX - #undef _dl_runtime_resolve - - #define USE_LSX - #define _dl_runtime_resolve _dl_runtime_resolve_lsx - #include "dl-trampoline.h" -+#undef FRAME_SIZE - #undef USE_LSX - #undef _dl_runtime_resolve -+#endif - - #include "dl-trampoline.h" -diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h -index 96f41f1d..9a6d9b6c 100644 ---- a/sysdeps/loongarch/dl-trampoline.h -+++ b/sysdeps/loongarch/dl-trampoline.h -@@ -17,31 +17,24 @@ - License along with the GNU C Library. If not, see - . */ - --#include --#include -- - /* Assembler veneer called from the PLT header code for lazy loading. - The PLT header passes its own args in t0-t2. */ -- --#ifdef __loongarch_soft_float --# define FRAME_SIZE (-((-10 * SZREG) & ALMASK)) -+#ifdef USE_LASX -+# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG - 8 * SZXREG) & ALMASK)) -+#elif defined USE_LSX -+# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG - 8 * SZVREG) & ALMASK)) -+#elif !defined __loongarch_soft_float -+# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG) & ALMASK)) - #else --# define FRAME_SIZE (-((-10 * SZREG - 8 * 256) & ALMASK)) -+# define FRAME_SIZE (-((-9 * SZREG) & ALMASK)) - #endif - - ENTRY (_dl_runtime_resolve, 3) -- # Save arguments to stack. -- --#ifdef __loongarch64 -- li.d t3, -FRAME_SIZE -- add.d sp, sp, t3 --#elif defined __loongarch32 -- li.w t3, -FRAME_SIZE -- add.w sp, sp, t3 --#endif - -+ /* Save arguments to stack. 
*/ -+ ADDI sp, sp, -FRAME_SIZE - -- REG_S ra, sp, 9*SZREG -+ REG_S ra, sp, 0*SZREG - REG_S a0, sp, 1*SZREG - REG_S a1, sp, 2*SZREG - REG_S a2, sp, 3*SZREG -@@ -51,55 +44,45 @@ ENTRY (_dl_runtime_resolve, 3) - REG_S a6, sp, 7*SZREG - REG_S a7, sp, 8*SZREG - --#ifndef __loongarch_soft_float -- FREG_S fa0, sp, 10*SZREG + 0*SZFREG -- FREG_S fa1, sp, 10*SZREG + 1*SZFREG -- FREG_S fa2, sp, 10*SZREG + 2*SZFREG -- FREG_S fa3, sp, 10*SZREG + 3*SZFREG -- FREG_S fa4, sp, 10*SZREG + 4*SZFREG -- FREG_S fa5, sp, 10*SZREG + 5*SZFREG -- FREG_S fa6, sp, 10*SZREG + 6*SZFREG -- FREG_S fa7, sp, 10*SZREG + 7*SZFREG - #ifdef USE_LASX -- xvst xr0, sp, 10*SZREG + 0*256 -- xvst xr1, sp, 10*SZREG + 1*256 -- xvst xr2, sp, 10*SZREG + 2*256 -- xvst xr3, sp, 10*SZREG + 3*256 -- xvst xr4, sp, 10*SZREG + 4*256 -- xvst xr5, sp, 10*SZREG + 5*256 -- xvst xr6, sp, 10*SZREG + 6*256 -- xvst xr7, sp, 10*SZREG + 7*256 -+ xvst xr0, sp, 9*SZREG + 8*SZFREG + 0*SZXREG -+ xvst xr1, sp, 9*SZREG + 8*SZFREG + 1*SZXREG -+ xvst xr2, sp, 9*SZREG + 8*SZFREG + 2*SZXREG -+ xvst xr3, sp, 9*SZREG + 8*SZFREG + 3*SZXREG -+ xvst xr4, sp, 9*SZREG + 8*SZFREG + 4*SZXREG -+ xvst xr5, sp, 9*SZREG + 8*SZFREG + 5*SZXREG -+ xvst xr6, sp, 9*SZREG + 8*SZFREG + 6*SZXREG -+ xvst xr7, sp, 9*SZREG + 8*SZFREG + 7*SZXREG - #elif defined USE_LSX -- vst vr0, sp, 10*SZREG + 0*128 -- vst vr1, sp, 10*SZREG + 1*128 -- vst vr2, sp, 10*SZREG + 2*128 -- vst vr3, sp, 10*SZREG + 3*128 -- vst vr4, sp, 10*SZREG + 4*128 -- vst vr5, sp, 10*SZREG + 5*128 -- vst vr6, sp, 10*SZREG + 6*128 -- vst vr7, sp, 10*SZREG + 7*128 --#endif -+ vst vr0, sp, 9*SZREG + 8*SZFREG + 0*SZVREG -+ vst vr1, sp, 9*SZREG + 8*SZFREG + 1*SZVREG -+ vst vr2, sp, 9*SZREG + 8*SZFREG + 2*SZVREG -+ vst vr3, sp, 9*SZREG + 8*SZFREG + 3*SZVREG -+ vst vr4, sp, 9*SZREG + 8*SZFREG + 4*SZVREG -+ vst vr5, sp, 9*SZREG + 8*SZFREG + 5*SZVREG -+ vst vr6, sp, 9*SZREG + 8*SZFREG + 6*SZVREG -+ vst vr7, sp, 9*SZREG + 8*SZFREG + 7*SZVREG -+#elif !defined __loongarch_soft_float -+ FREG_S fa0, sp, 9*SZREG + 0*SZFREG -+ FREG_S fa1, sp, 9*SZREG + 1*SZFREG -+ FREG_S fa2, sp, 9*SZREG + 2*SZFREG -+ FREG_S fa3, sp, 9*SZREG + 3*SZFREG -+ FREG_S fa4, sp, 9*SZREG + 4*SZFREG -+ FREG_S fa5, sp, 9*SZREG + 5*SZFREG -+ FREG_S fa6, sp, 9*SZREG + 6*SZFREG -+ FREG_S fa7, sp, 9*SZREG + 7*SZFREG - #endif - -- # Update .got.plt and obtain runtime address of callee. --#ifdef __loongarch64 -- slli.d a1, t1, 1 -+ /* Update .got.plt and obtain runtime address of callee */ -+ SLLI a1, t1, 1 - or a0, t0, zero -- add.d a1, a1, t1 -+ ADD a1, a1, t1 - la a2, _dl_fixup - jirl ra, a2, 0 - or t1, v0, zero --#elif defined __loongarch32 -- slli.w a1, t1, 1 -- or a0, t0, zero -- add.w a1, a1, t1 -- la a2, _dl_fixup -- jirl ra, a2, 0 -- or t1, v0, zero --#endif - -- # Restore arguments from stack. -- REG_L ra, sp, 9*SZREG -+ /* Restore arguments from stack. 
*/ -+ REG_L ra, sp, 0*SZREG - REG_L a0, sp, 1*SZREG - REG_L a1, sp, 2*SZREG - REG_L a2, sp, 3*SZREG -@@ -109,45 +92,37 @@ ENTRY (_dl_runtime_resolve, 3) - REG_L a6, sp, 7*SZREG - REG_L a7, sp, 8*SZREG - --#ifndef __loongarch_soft_float -- FREG_L fa0, sp, 10*SZREG + 0*SZFREG -- FREG_L fa1, sp, 10*SZREG + 1*SZFREG -- FREG_L fa2, sp, 10*SZREG + 2*SZFREG -- FREG_L fa3, sp, 10*SZREG + 3*SZFREG -- FREG_L fa4, sp, 10*SZREG + 4*SZFREG -- FREG_L fa5, sp, 10*SZREG + 5*SZFREG -- FREG_L fa6, sp, 10*SZREG + 6*SZFREG -- FREG_L fa7, sp, 10*SZREG + 7*SZFREG - #ifdef USE_LASX -- xvld xr0, sp, 10*SZREG + 0*256 -- xvld xr1, sp, 10*SZREG + 1*256 -- xvld xr2, sp, 10*SZREG + 2*256 -- xvld xr3, sp, 10*SZREG + 3*256 -- xvld xr4, sp, 10*SZREG + 4*256 -- xvld xr5, sp, 10*SZREG + 5*256 -- xvld xr6, sp, 10*SZREG + 6*256 -- xvld xr7, sp, 10*SZREG + 7*256 -+ xvld xr0, sp, 9*SZREG + 8*SZFREG + 0*SZXREG -+ xvld xr1, sp, 9*SZREG + 8*SZFREG + 1*SZXREG -+ xvld xr2, sp, 9*SZREG + 8*SZFREG + 2*SZXREG -+ xvld xr3, sp, 9*SZREG + 8*SZFREG + 3*SZXREG -+ xvld xr4, sp, 9*SZREG + 8*SZFREG + 4*SZXREG -+ xvld xr5, sp, 9*SZREG + 8*SZFREG + 5*SZXREG -+ xvld xr6, sp, 9*SZREG + 8*SZFREG + 6*SZXREG -+ xvld xr7, sp, 9*SZREG + 8*SZFREG + 7*SZXREG - #elif defined USE_LSX -- vld vr0, sp, 10*SZREG + 0*128 -- vld vr1, sp, 10*SZREG + 1*128 -- vld vr2, sp, 10*SZREG + 2*128 -- vld vr3, sp, 10*SZREG + 3*128 -- vld vr4, sp, 10*SZREG + 4*128 -- vld vr5, sp, 10*SZREG + 5*128 -- vld vr6, sp, 10*SZREG + 6*128 -- vld vr7, sp, 10*SZREG + 7*128 --#endif --#endif -- --#ifdef __loongarch64 -- li.d t3, FRAME_SIZE -- add.d sp, sp, t3 --#elif defined __loongarch32 -- li.w t3, FRAME_SIZE -- addi.w sp, sp, FRAME_SIZE -+ vld vr0, sp, 9*SZREG + 8*SZFREG + 0*SZVREG -+ vld vr1, sp, 9*SZREG + 8*SZFREG + 1*SZVREG -+ vld vr2, sp, 9*SZREG + 8*SZFREG + 2*SZVREG -+ vld vr3, sp, 9*SZREG + 8*SZFREG + 3*SZVREG -+ vld vr4, sp, 9*SZREG + 8*SZFREG + 4*SZVREG -+ vld vr5, sp, 9*SZREG + 8*SZFREG + 5*SZVREG -+ vld vr6, sp, 9*SZREG + 8*SZFREG + 6*SZVREG -+ vld vr7, sp, 9*SZREG + 8*SZFREG + 7*SZVREG -+#elif !defined __loongarch_soft_float -+ FREG_L fa0, sp, 9*SZREG + 0*SZFREG -+ FREG_L fa1, sp, 9*SZREG + 1*SZFREG -+ FREG_L fa2, sp, 9*SZREG + 2*SZFREG -+ FREG_L fa3, sp, 9*SZREG + 3*SZFREG -+ FREG_L fa4, sp, 9*SZREG + 4*SZFREG -+ FREG_L fa5, sp, 9*SZREG + 5*SZFREG -+ FREG_L fa6, sp, 9*SZREG + 6*SZFREG -+ FREG_L fa7, sp, 9*SZREG + 7*SZFREG - #endif - -+ ADDI sp, sp, FRAME_SIZE - -- # Invoke the callee. -+ /* Invoke the callee. 
*/ - jirl zero, t1, 0 - END (_dl_runtime_resolve) -diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h -index 734e45ae..e80c6245 100644 ---- a/sysdeps/loongarch/sys/asm.h -+++ b/sysdeps/loongarch/sys/asm.h -@@ -9,8 +9,17 @@ - # define PTRLOG 3 - # define SZREG 8 - # define SZFREG 8 -+# define SZVREG 16 -+# define SZXREG 32 - # define REG_L ld.d - # define REG_S st.d -+# define SRLI srli.d -+# define SLLI slli.d -+# define ADDI addi.d -+# define ADD add.d -+# define SUB sub.d -+# define BSTRINS bstrins.d -+# define LI li.d - # define FREG_L fld.d - # define FREG_S fst.d - #elif defined __loongarch32 --- -2.33.0 - diff --git a/glibc-2.28-Refactor-code-of-raw-mem-functions.patch b/glibc-2.28-Refactor-code-of-raw-mem-functions.patch deleted file mode 100644 index 0db95f8..0000000 --- a/glibc-2.28-Refactor-code-of-raw-mem-functions.patch +++ /dev/null @@ -1,3031 +0,0 @@ -From 4879bd4e0aff7d884d9b026b6081a0e8cffc491c Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Wed, 21 Jun 2023 09:30:54 +0800 -Subject: [PATCH 06/14] glibc-2.28: Refactor code of {raw,}mem* functions. - -Change-Id: Icafaf6bc8216f48be64cf25a40b9fe28ce127914 -Signed-off-by: ticat_fp ---- - sysdeps/loongarch/lp64/memchr.S | 92 -- - sysdeps/loongarch/lp64/memcmp.S | 280 ------ - sysdeps/loongarch/lp64/memcpy.S | 804 ------------------ - sysdeps/loongarch/lp64/memmove.S | 2 - - sysdeps/loongarch/lp64/memset.S | 166 ---- - .../loongarch/lp64/multiarch/memchr-aligned.S | 91 +- - .../loongarch/lp64/multiarch/memcmp-aligned.S | 282 +++++- - .../loongarch/lp64/multiarch/memcpy-aligned.S | 799 ++++++++++++++++- - .../loongarch/lp64/multiarch/memset-aligned.S | 166 +++- - .../lp64/multiarch/rawmemchr-aligned.S | 110 ++- - sysdeps/loongarch/lp64/rawmemchr.S | 113 --- - 11 files changed, 1438 insertions(+), 1467 deletions(-) - delete mode 100644 sysdeps/loongarch/lp64/memchr.S - delete mode 100644 sysdeps/loongarch/lp64/memcmp.S - delete mode 100644 sysdeps/loongarch/lp64/memcpy.S - delete mode 100644 sysdeps/loongarch/lp64/memmove.S - delete mode 100644 sysdeps/loongarch/lp64/memset.S - delete mode 100644 sysdeps/loongarch/lp64/rawmemchr.S - -diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S -deleted file mode 100644 -index 23f1fd13..00000000 ---- a/sysdeps/loongarch/lp64/memchr.S -+++ /dev/null -@@ -1,92 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef MEMCHR_NAME --#define MEMCHR_NAME memchr --#endif -- --LEAF(MEMCHR_NAME, 6) -- beqz a2, L(out) -- andi t1, a0, 0x7 -- lu12i.w a3, 0x01010 -- sub.d a5, a0, t1 -- -- bstrins.d a1, a1, 15, 8 -- ld.d t0, a5, 0 -- slli.d t2, t1, 3 -- ori a3, a3, 0x101 -- -- bstrins.d a1, a1, 31, 16 -- li.w t7, -1 -- li.w t8, 9 -- bstrins.d a3, a3, 63, 32 -- -- srl.d t3, t7, t2 -- bstrins.d a1, a1, 63, 32 -- sub.d t4, t8, t1 -- orn t3, a1, t3 -- -- srl.d t0, t0, t2 -- slli.d a4, a3, 7 # 0x8080808080808080 -- sltu t4, a2, t4 -- xor t2, t0, t3 -- -- sub.d a6, t2, a3 -- andn a7, a4, t2 -- and t2, a6, a7 -- or t3, t2, t4 -- -- bnez t3, L(count_pos) -- addi.d a2, a2, -8 -- addi.d a0, a5, 8 -- add.d a2, a2, t1 -- --L(loop): -- ld.d t0, a0, 0 -- sltui t4, a2, 9 -- xor t2, t0, a1 -- sub.d a6, t2, a3 -- -- andn a7, a4, t2 -- and t2, a6, a7 -- or t3, t2, t4 -- bnez t3, L(count_pos) -- -- ld.d t1, a0, 8 -- addi.d a0, a0, 16 -- sltui t4, a2, 17 -- xor t2, t1, a1 -- -- sub.d a6, t2, a3 -- andn a7, a4, t2 -- and t2, a6, a7 -- addi.d a2, a2, -16 -- -- or t3, t2, t4 -- beqz t3, L(loop) -- addi.d a0, a0, -8 -- addi.d a2, a2, 
8 -- --L(count_pos): -- ctz.d t0, t2 -- srli.d t0, t0, 3 -- sltu t1, t0, a2 -- add.d a0, a0, t0 -- -- maskeqz a0, a0, t1 -- jr ra -- --L(out): -- move a0, zero -- jr ra --END(MEMCHR_NAME) -- --#ifdef _LIBC --libc_hidden_builtin_def (MEMCHR_NAME) --#endif -diff --git a/sysdeps/loongarch/lp64/memcmp.S b/sysdeps/loongarch/lp64/memcmp.S -deleted file mode 100644 -index 457a4dc7..00000000 ---- a/sysdeps/loongarch/lp64/memcmp.S -+++ /dev/null -@@ -1,280 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef MEMCMP_NAME --#define MEMCMP_NAME memcmp --#endif -- --LEAF(MEMCMP_NAME, 6) -- beqz a2, L(ret) -- andi a4, a1, 0x7 -- andi a3, a0, 0x7 -- sltu a5, a4, a3 -- -- xor t0, a0, a1 -- li.w t8, 8 -- maskeqz t0, t0, a5 -- li.w t7, -1 -- -- xor a0, a0, t0 // a0 hold smaller one -- xor a1, a1, t0 // a1 hold larger one -- andi a3, a0, 0x7 // a3 hold small offset -- andi a4, a1, 0x7 // a4 hold larger offset -- -- xor a0, a0, a3 -- xor a1, a1, a4 -- ld.d t2, a0, 0 // t2 = "fedcbaXX" -- ld.d t1, a1, 0 // t1 = "54321YYY" -- -- slli.d t3, a3, 3 -- slli.d t4, a4, 3 -- sub.d a6, t3, t4 // a6 = 0xfffffffffffffff8 -- srl.d t1, t1, t4 // t1 = "00054321" -- -- srl.d t0, t2, t3 // t0 = "00fedcba" -- srl.d t5, t7, t4 // t5 = 0x000000FFFFFFFFFF -- sub.d t6, t0, t1 // t6 hold diff -- and t6, t6, t5 // t6 = "000xxxxx" -- -- sub.d t5, t8, a4 // t5 hold margin 8 - 3 = 5 -- bnez t6, L(first_out) -- bgeu t5, a2, L(ret) -- sub.d a2, a2, t5 -- -- bnez a6, L(unaligned) -- blt a2, t8, L(al_less_8bytes) -- andi t1, a2, 31 -- beq t1, a2, L(al_less_32bytes) -- -- sub.d t2, a2, t1 -- add.d a4, a0, t2 -- move a2, t1 -- --L(al_loop): -- ld.d t0, a0, 8 -- -- ld.d t1, a1, 8 -- ld.d t2, a0, 16 -- ld.d t3, a1, 16 -- ld.d t4, a0, 24 -- -- ld.d t5, a1, 24 -- ld.d t6, a0, 32 -- ld.d t7, a1, 32 -- addi.d a0, a0, 32 -- -- addi.d a1, a1, 32 -- bne t0, t1, L(out1) -- bne t2, t3, L(out2) -- bne t4, t5, L(out3) -- -- bne t6, t7, L(out4) -- bne a0, a4, L(al_loop) -- --L(al_less_32bytes): -- srai.d a4, a2, 4 -- beqz a4, L(al_less_16bytes) -- -- ld.d t0, a0, 8 -- ld.d t1, a1, 8 -- ld.d t2, a0, 16 -- ld.d t3, a1, 16 -- -- addi.d a0, a0, 16 -- addi.d a1, a1, 16 -- addi.d a2, a2, -16 -- bne t0, t1, L(out1) -- -- bne t2, t3, L(out2) -- --L(al_less_16bytes): -- srai.d a4, a2, 3 -- beqz a4, L(al_less_8bytes) -- ld.d t0, a0, 8 -- -- ld.d t1, a1, 8 -- addi.d a0, a0, 8 -- addi.d a1, a1, 8 -- addi.d a2, a2, -8 -- -- bne t0, t1, L(out1) -- --L(al_less_8bytes): -- beqz a2, L(ret) -- ld.d t0, a0, 8 -- ld.d t1, a1, 8 -- -- li.d t7, -1 -- slli.d t2, a2, 3 -- sll.d t2, t7, t2 -- sub.d t3, t0, t1 -- -- andn t6, t3, t2 -- bnez t6, L(count_diff) -- --L(ret): -- move a0, zero -- jr ra -- --L(out4): -- move t0, t6 -- move t1, t7 -- sub.d t6, t6, t7 -- b L(count_diff) -- --L(out3): -- move t0, t4 -- move t1, t5 -- sub.d t6, t4, t5 -- b L(count_diff) -- --L(out2): -- move t0, t2 -- move t1, t3 --L(out1): -- sub.d t6, t0, t1 -- b L(count_diff) -- --L(first_out): -- slli.d t4, a2, 3 -- slt t3, a2, t5 -- sll.d t4, t7, t4 -- maskeqz t4, t4, t3 -- -- andn t6, t6, t4 -- --L(count_diff): -- ctz.d t2, t6 -- bstrins.d t2, zero, 2, 0 -- srl.d t0, t0, t2 -- -- srl.d t1, t1, t2 -- andi t0, t0, 0xff -- andi t1, t1, 0xff -- sub.d t2, t0, t1 -- -- sub.d t3, t1, t0 -- masknez t2, t2, a5 -- maskeqz t3, t3, a5 -- or a0, t2, t3 -- -- jr ra -- --L(unaligned): -- sub.d a7, zero, a6 -- srl.d t0, t2, a6 -- blt a2, t8, L(un_less_8bytes) -- -- andi t1, a2, 31 -- beq t1, a2, L(un_less_32bytes) -- sub.d t2, a2, t1 -- add.d a4, a0, t2 -- -- move 
a2, t1 -- --L(un_loop): -- ld.d t2, a0, 8 -- ld.d t1, a1, 8 -- ld.d t4, a0, 16 -- -- ld.d t3, a1, 16 -- ld.d t6, a0, 24 -- ld.d t5, a1, 24 -- ld.d t8, a0, 32 -- -- ld.d t7, a1, 32 -- addi.d a0, a0, 32 -- addi.d a1, a1, 32 -- sll.d a3, t2, a7 -- -- or t0, a3, t0 -- bne t0, t1, L(out1) -- srl.d t0, t2, a6 -- sll.d a3, t4, a7 -- -- or t2, a3, t0 -- bne t2, t3, L(out2) -- srl.d t0, t4, a6 -- sll.d a3, t6, a7 -- -- or t4, a3, t0 -- bne t4, t5, L(out3) -- srl.d t0, t6, a6 -- sll.d a3, t8, a7 -- -- or t6, t0, a3 -- bne t6, t7, L(out4) -- srl.d t0, t8, a6 -- bne a0, a4, L(un_loop) -- --L(un_less_32bytes): -- srai.d a4, a2, 4 -- beqz a4, L(un_less_16bytes) -- ld.d t2, a0, 8 -- ld.d t1, a1, 8 -- -- ld.d t4, a0, 16 -- ld.d t3, a1, 16 -- addi.d a0, a0, 16 -- addi.d a1, a1, 16 -- -- addi.d a2, a2, -16 -- sll.d a3, t2, a7 -- or t0, a3, t0 -- bne t0, t1, L(out1) -- -- srl.d t0, t2, a6 -- sll.d a3, t4, a7 -- or t2, a3, t0 -- bne t2, t3, L(out2) -- -- srl.d t0, t4, a6 -- --L(un_less_16bytes): -- srai.d a4, a2, 3 -- beqz a4, L(un_less_8bytes) -- ld.d t2, a0, 8 -- -- ld.d t1, a1, 8 -- addi.d a0, a0, 8 -- addi.d a1, a1, 8 -- addi.d a2, a2, -8 -- -- sll.d a3, t2, a7 -- or t0, a3, t0 -- bne t0, t1, L(out1) -- srl.d t0, t2, a6 -- --L(un_less_8bytes): -- beqz a2, L(ret) -- andi a7, a7, 63 -- slli.d a4, a2, 3 -- bgeu a7, a4, L(last_cmp) -- -- ld.d t2, a0, 8 -- sll.d a3, t2, a7 -- or t0, a3, t0 -- --L(last_cmp): -- ld.d t1, a1, 8 -- -- li.d t7, -1 -- sll.d t2, t7, a4 -- sub.d t3, t0, t1 -- andn t6, t3, t2 -- -- bnez t6, L(count_diff) -- move a0, zero -- jr ra -- --END(MEMCMP_NAME) -- --#ifdef _LIBC --libc_hidden_builtin_def (MEMCMP_NAME) --#endif -diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S -deleted file mode 100644 -index 4791e1a4..00000000 ---- a/sysdeps/loongarch/lp64/memcpy.S -+++ /dev/null -@@ -1,804 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef MEMCPY_NAME --#define MEMCPY_NAME memcpy --#endif -- --#ifndef MEMMOVE_NAME --#define MEMMOVE_NAME memmove --#endif -- --#define LD_64(reg, n) \ -- ld.d t0, reg, n; \ -- ld.d t1, reg, n+8; \ -- ld.d t2, reg, n+16; \ -- ld.d t3, reg, n+24; \ -- ld.d t4, reg, n+32; \ -- ld.d t5, reg, n+40; \ -- ld.d t6, reg, n+48; \ -- ld.d t7, reg, n+56; -- --#define ST_64(reg, n) \ -- st.d t0, reg, n; \ -- st.d t1, reg, n+8; \ -- st.d t2, reg, n+16; \ -- st.d t3, reg, n+24; \ -- st.d t4, reg, n+32; \ -- st.d t5, reg, n+40; \ -- st.d t6, reg, n+48; \ -- st.d t7, reg, n+56; -- --LEAF(MEMMOVE_NAME, 6) -- sub.d t0, a0, a1 -- bltu t0, a2, L(copy_back) -- --END(MEMMOVE_NAME) -- --#ifdef _LIBC --libc_hidden_builtin_def (MEMMOVE_NAME) --#endif -- --LEAF_NO_ALIGN(MEMCPY_NAME) -- -- srai.d a3, a2, 4 -- beqz a3, L(short_data) # less than 16 bytes -- -- move a4, a0 -- andi a5, a0, 0x7 -- andi a6, a1, 0x7 -- li.d t8, 8 -- beqz a5, L(check_align) -- -- # make dest aligned 8 bytes -- sub.d t2, t8, a5 -- sub.d a2, a2, t2 -- -- pcaddi t1, 20 -- slli.d t3, t2, 3 -- add.d a1, a1, t2 -- sub.d t1, t1, t3 -- add.d a4, a4, t2 -- jr t1 -- --L(al7): -- ld.b t0, a1, -7 -- st.b t0, a4, -7 --L(al6): -- ld.b t0, a1, -6 -- st.b t0, a4, -6 --L(al5): -- ld.b t0, a1, -5 -- st.b t0, a4, -5 --L(al4): -- ld.b t0, a1, -4 -- st.b t0, a4, -4 --L(al3): -- ld.b t0, a1, -3 -- st.b t0, a4, -3 --L(al2): -- ld.b t0, a1, -2 -- st.b t0, a4, -2 --L(al1): -- ld.b t0, a1, -1 -- st.b t0, a4, -1 -- --L(check_align): -- bne a5, a6, L(unalign) -- -- srai.d a3, a2, 4 -- beqz a3, L(al_less_16bytes) -- -- andi a3, a2, 0x3f -- beq a3, a2, 
L(al_less_64bytes) -- -- sub.d t0, a2, a3 -- move a2, a3 -- add.d a5, a1, t0 -- --L(loop_64bytes): -- LD_64(a1, 0) -- addi.d a1, a1, 64 -- ST_64(a4, 0) -- -- addi.d a4, a4, 64 -- bne a1, a5, L(loop_64bytes) -- --L(al_less_64bytes): -- srai.d a3, a2, 5 -- beqz a3, L(al_less_32bytes) -- -- ld.d t0, a1, 0 -- ld.d t1, a1, 8 -- ld.d t2, a1, 16 -- ld.d t3, a1, 24 -- -- addi.d a1, a1, 32 -- addi.d a2, a2, -32 -- -- st.d t0, a4, 0 -- st.d t1, a4, 8 -- st.d t2, a4, 16 -- st.d t3, a4, 24 -- -- addi.d a4, a4, 32 -- --L(al_less_32bytes): -- srai.d a3, a2, 4 -- beqz a3, L(al_less_16bytes) -- -- ld.d t0, a1, 0 -- ld.d t1, a1, 8 -- addi.d a1, a1, 16 -- addi.d a2, a2, -16 -- -- st.d t0, a4, 0 -- st.d t1, a4, 8 -- addi.d a4, a4, 16 -- --L(al_less_16bytes): -- srai.d a3, a2, 3 -- beqz a3, L(al_less_8bytes) -- -- ld.d t0, a1, 0 -- addi.d a1, a1, 8 -- addi.d a2, a2, -8 -- -- st.d t0, a4, 0 -- addi.d a4, a4, 8 -- --L(al_less_8bytes): -- srai.d a3, a2, 2 -- beqz a3, L(al_less_4bytes) -- -- ld.w t0, a1, 0 -- addi.d a1, a1, 4 -- addi.d a2, a2, -4 -- -- st.w t0, a4, 0 -- addi.d a4, a4, 4 -- --L(al_less_4bytes): -- srai.d a3, a2, 1 -- beqz a3, L(al_less_2bytes) -- -- ld.h t0, a1, 0 -- addi.d a1, a1, 2 -- addi.d a2, a2, -2 -- -- st.h t0, a4, 0 -- addi.d a4, a4, 2 -- --L(al_less_2bytes): -- beqz a2, L(al_less_1byte) -- -- ld.b t0, a1, 0 -- st.b t0, a4, 0 -- --L(al_less_1byte): -- jr ra -- --L(unalign): -- andi a5, a1, 0x7 -- bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned -- -- sub.d t8, t8, a5 # use t8 to save count of bytes for aligning -- slli.d a5, a5, 3 -- -- ld.d t0, a1, 0 -- addi.d a1, a1, 8 -- -- slli.d a6, t8, 3 -- srl.d a7, t0, a5 -- -- srai.d a3, a2, 4 -- beqz a3, L(un_less_16bytes) -- -- andi a3, a2, 0x3f -- beq a3, a2, L(un_less_64bytes) -- -- sub.d t0, a2, a3 -- move a2, a3 -- add.d a3, a1, t0 -- --# a5 shift right num --# a6 shift left num --# a7 remaining part --L(un_long_bytes): -- ld.d t0, a1, 0 -- ld.d t1, a1, 8 -- ld.d t2, a1, 16 -- ld.d t3, a1, 24 -- -- srl.d t4, t0, a5 -- sll.d t0, t0, a6 -- -- srl.d t5, t1, a5 -- sll.d t1, t1, a6 -- -- srl.d t6, t2, a5 -- sll.d t2, t2, a6 -- -- srl.d t7, t3, a5 -- sll.d t3, t3, a6 -- -- or t0, a7, t0 -- or t1, t4, t1 -- or t2, t5, t2 -- or t3, t6, t3 -- -- ld.d t4, a1, 32 -- ld.d t5, a1, 40 -- ld.d t6, a1, 48 -- ld.d a7, a1, 56 -- -- st.d t0, a4, 0 -- st.d t1, a4, 8 -- st.d t2, a4, 16 -- st.d t3, a4, 24 -- -- addi.d a1, a1, 64 -- -- srl.d t0, t4, a5 -- sll.d t4, t4, a6 -- -- srl.d t1, t5, a5 -- sll.d t5, t5, a6 -- -- srl.d t2, t6, a5 -- sll.d t6, t6, a6 -- -- sll.d t3, a7, a6 -- srl.d a7, a7, a5 -- -- or t4, t7, t4 -- or t5, t0, t5 -- or t6, t1, t6 -- or t3, t2, t3 -- -- st.d t4, a4, 32 -- st.d t5, a4, 40 -- st.d t6, a4, 48 -- st.d t3, a4, 56 -- -- addi.d a4, a4, 64 -- bne a3, a1, L(un_long_bytes) -- --L(un_less_64bytes): -- srai.d a3, a2, 5 -- beqz a3, L(un_less_32bytes) -- -- ld.d t0, a1, 0 -- ld.d t1, a1, 8 -- ld.d t2, a1, 16 -- ld.d t3, a1, 24 -- -- addi.d a1, a1, 32 -- addi.d a2, a2, -32 -- -- srl.d t4, t0, a5 -- sll.d t0, t0, a6 -- -- srl.d t5, t1, a5 -- sll.d t1, t1, a6 -- -- srl.d t6, t2, a5 -- sll.d t2, t2, a6 -- -- or t0, a7, t0 -- -- srl.d a7, t3, a5 -- sll.d t3, t3, a6 -- -- or t1, t4, t1 -- or t2, t5, t2 -- or t3, t6, t3 -- -- st.d t0, a4, 0 -- st.d t1, a4, 8 -- st.d t2, a4, 16 -- st.d t3, a4, 24 -- -- addi.d a4, a4, 32 -- --L(un_less_32bytes): -- srai.d a3, a2, 4 -- beqz a3, L(un_less_16bytes) -- -- ld.d t0, a1, 0 -- ld.d t1, a1, 8 -- -- addi.d a1, a1, 16 -- addi.d a2, a2, -16 -- -- srl.d t2, t0, a5 -- sll.d t3, t0, a6 -- -- sll.d t4, t1, a6 
-- or t3, a7, t3 -- or t4, t2, t4 -- srl.d a7, t1, a5 -- -- st.d t3, a4, 0 -- st.d t4, a4, 8 -- -- addi.d a4, a4, 16 -- --L(un_less_16bytes): -- srai.d a3, a2, 3 -- beqz a3, L(un_less_8bytes) -- -- ld.d t0, a1, 0 -- -- addi.d a1, a1, 8 -- addi.d a2, a2, -8 -- -- sll.d t1, t0, a6 -- or t2, a7, t1 -- srl.d a7, t0, a5 -- -- st.d t2, a4, 0 -- addi.d a4, a4, 8 -- --L(un_less_8bytes): -- beqz a2, L(un_less_1byte) -- bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 -- -- # combine data in memory and a7(remaining part) -- ld.d t0, a1, 0 -- sll.d t0, t0, a6 -- or a7, a7, t0 -- --1: -- srai.d a3, a2, 2 -- beqz a3, L(un_less_4bytes) -- -- addi.d a2, a2, -4 -- st.w a7, a4, 0 -- addi.d a4, a4, 4 -- srai.d a7, a7, 32 -- --L(un_less_4bytes): -- srai.d a3, a2, 1 -- beqz a3, L(un_less_2bytes) -- -- addi.d a2, a2, -2 -- st.h a7, a4, 0 -- addi.d a4, a4, 2 -- srai.d a7, a7, 16 -- --L(un_less_2bytes): -- beqz a2, L(un_less_1byte) -- st.b a7, a4, 0 -- --L(un_less_1byte): -- jr ra -- --# Bytes copying for data less than 16 bytes --L(short_data): -- pcaddi t1, 36 -- slli.d t2, a2, 3 -- add.d a4, a0, a2 -- sub.d t1, t1, t2 -- add.d a1, a1, a2 -- jr t1 -- --L(short_15_bytes): -- ld.b t0, a1, -15 -- st.b t0, a4, -15 --L(short_14_bytes): -- ld.b t0, a1, -14 -- st.b t0, a4, -14 --L(short_13_bytes): -- ld.b t0, a1, -13 -- st.b t0, a4, -13 --L(short_12_bytes): -- ld.b t0, a1, -12 -- st.b t0, a4, -12 --L(short_11_bytes): -- ld.b t0, a1, -11 -- st.b t0, a4, -11 --L(short_10_bytes): -- ld.b t0, a1, -10 -- st.b t0, a4, -10 --L(short_9_bytes): -- ld.b t0, a1, -9 -- st.b t0, a4, -9 --L(short_8_bytes): -- ld.b t0, a1, -8 -- st.b t0, a4, -8 --L(short_7_bytes): -- ld.b t0, a1, -7 -- st.b t0, a4, -7 --L(short_6_bytes): -- ld.b t0, a1, -6 -- st.b t0, a4, -6 --L(short_5_bytes): -- ld.b t0, a1, -5 -- st.b t0, a4, -5 --L(short_4_bytes): -- ld.b t0, a1, -4 -- st.b t0, a4, -4 --L(short_3_bytes): -- ld.b t0, a1, -3 -- st.b t0, a4, -3 --L(short_2_bytes): -- ld.b t0, a1, -2 -- st.b t0, a4, -2 --L(short_1_bytes): -- ld.b t0, a1, -1 -- st.b t0, a4, -1 -- jr ra -- --L(copy_back): -- srai.d a3, a2, 4 -- beqz a3, L(back_short_data) # less than 16 bytes -- -- add.d a4, a0, a2 # store the tail of dest -- add.d a1, a1, a2 # store the tail of src -- -- andi a5, a4, 0x7 -- andi a6, a1, 0x7 -- beqz a5, L(back_check_align) -- -- # make dest aligned 8 bytes -- sub.d a2, a2, a5 -- sub.d a1, a1, a5 -- sub.d a4, a4, a5 -- -- pcaddi t1, 18 -- slli.d t3, a5, 3 -- sub.d t1, t1, t3 -- jr t1 -- -- ld.b t0, a1, 6 -- st.b t0, a4, 6 -- ld.b t0, a1, 5 -- st.b t0, a4, 5 -- ld.b t0, a1, 4 -- st.b t0, a4, 4 -- ld.b t0, a1, 3 -- st.b t0, a4, 3 -- ld.b t0, a1, 2 -- st.b t0, a4, 2 -- ld.b t0, a1, 1 -- st.b t0, a4, 1 -- ld.b t0, a1, 0 -- st.b t0, a4, 0 -- --L(back_check_align): -- bne a5, a6, L(back_unalign) -- -- srai.d a3, a2, 4 -- beqz a3, L(back_less_16bytes) -- -- andi a3, a2, 0x3f -- beq a3, a2, L(back_less_64bytes) -- -- sub.d t0, a2, a3 -- move a2, a3 -- sub.d a5, a1, t0 -- --L(back_loop_64bytes): -- LD_64(a1, -64) -- addi.d a1, a1, -64 -- ST_64(a4, -64) -- -- addi.d a4, a4, -64 -- bne a1, a5, L(back_loop_64bytes) -- --L(back_less_64bytes): -- srai.d a3, a2, 5 -- beqz a3, L(back_less_32bytes) -- -- ld.d t0, a1, -32 -- ld.d t1, a1, -24 -- ld.d t2, a1, -16 -- ld.d t3, a1, -8 -- -- addi.d a1, a1, -32 -- addi.d a2, a2, -32 -- -- st.d t0, a4, -32 -- st.d t1, a4, -24 -- st.d t2, a4, -16 -- st.d t3, a4, -8 -- -- addi.d a4, a4, -32 -- --L(back_less_32bytes): -- srai.d a3, a2, 4 -- beqz a3, L(back_less_16bytes) -- -- ld.d t0, a1, -16 -- ld.d 
t1, a1, -8 -- -- addi.d a2, a2, -16 -- addi.d a1, a1, -16 -- -- st.d t0, a4, -16 -- st.d t1, a4, -8 -- addi.d a4, a4, -16 -- --L(back_less_16bytes): -- srai.d a3, a2, 3 -- beqz a3, L(back_less_8bytes) -- -- ld.d t0, a1, -8 -- addi.d a2, a2, -8 -- addi.d a1, a1, -8 -- -- st.d t0, a4, -8 -- addi.d a4, a4, -8 -- --L(back_less_8bytes): -- srai.d a3, a2, 2 -- beqz a3, L(back_less_4bytes) -- -- ld.w t0, a1, -4 -- addi.d a2, a2, -4 -- addi.d a1, a1, -4 -- -- st.w t0, a4, -4 -- addi.d a4, a4, -4 -- --L(back_less_4bytes): -- srai.d a3, a2, 1 -- beqz a3, L(back_less_2bytes) -- -- ld.h t0, a1, -2 -- addi.d a2, a2, -2 -- addi.d a1, a1, -2 -- -- st.h t0, a4, -2 -- addi.d a4, a4, -2 -- --L(back_less_2bytes): -- beqz a2, L(back_less_1byte) -- -- ld.b t0, a1, -1 -- st.b t0, a4, -1 -- --L(back_less_1byte): -- jr ra -- --L(back_unalign): -- andi t8, a1, 0x7 -- bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned -- -- sub.d a6, zero, t8 -- -- ld.d t0, a1, 0 -- slli.d a6, a6, 3 -- slli.d a5, t8, 3 -- sll.d a7, t0, a6 -- -- srai.d a3, a2, 4 -- beqz a3, L(back_un_less_16bytes) -- -- andi a3, a2, 0x3f -- beq a3, a2, L(back_un_less_64bytes) -- -- sub.d t0, a2, a3 -- move a2, a3 -- sub.d a3, a1, t0 -- --L(back_un_long_bytes): -- ld.d t0, a1, -8 -- ld.d t1, a1, -16 -- ld.d t2, a1, -24 -- ld.d t3, a1, -32 -- -- sll.d t4, t0, a6 -- srl.d t0, t0, a5 -- -- sll.d t5, t1, a6 -- srl.d t1, t1, a5 -- -- sll.d t6, t2, a6 -- srl.d t2, t2, a5 -- -- sll.d t7, t3, a6 -- srl.d t3, t3, a5 -- -- or t0, t0, a7 -- or t1, t1, t4 -- or t2, t2, t5 -- or t3, t3, t6 -- -- ld.d t4, a1, -40 -- ld.d t5, a1, -48 -- ld.d t6, a1, -56 -- ld.d a7, a1, -64 -- st.d t0, a4, -8 -- st.d t1, a4, -16 -- st.d t2, a4, -24 -- st.d t3, a4, -32 -- -- addi.d a1, a1, -64 -- -- sll.d t0, t4, a6 -- srl.d t4, t4, a5 -- -- sll.d t1, t5, a6 -- srl.d t5, t5, a5 -- -- sll.d t2, t6, a6 -- srl.d t6, t6, a5 -- -- srl.d t3, a7, a5 -- sll.d a7, a7, a6 -- -- or t4, t7, t4 -- or t5, t0, t5 -- or t6, t1, t6 -- or t3, t2, t3 -- -- st.d t4, a4, -40 -- st.d t5, a4, -48 -- st.d t6, a4, -56 -- st.d t3, a4, -64 -- -- addi.d a4, a4, -64 -- bne a3, a1, L(back_un_long_bytes) -- --L(back_un_less_64bytes): -- srai.d a3, a2, 5 -- beqz a3, L(back_un_less_32bytes) -- -- ld.d t0, a1, -8 -- ld.d t1, a1, -16 -- ld.d t2, a1, -24 -- ld.d t3, a1, -32 -- -- addi.d a1, a1, -32 -- addi.d a2, a2, -32 -- -- sll.d t4, t0, a6 -- srl.d t0, t0, a5 -- -- sll.d t5, t1, a6 -- srl.d t1, t1, a5 -- -- sll.d t6, t2, a6 -- srl.d t2, t2, a5 -- -- or t0, a7, t0 -- -- sll.d a7, t3, a6 -- srl.d t3, t3, a5 -- -- or t1, t4, t1 -- or t2, t5, t2 -- or t3, t6, t3 -- -- st.d t0, a4, -8 -- st.d t1, a4, -16 -- st.d t2, a4, -24 -- st.d t3, a4, -32 -- -- addi.d a4, a4, -32 -- --L(back_un_less_32bytes): -- srai.d a3, a2, 4 -- beqz a3, L(back_un_less_16bytes) -- -- ld.d t0, a1, -8 -- ld.d t1, a1, -16 -- -- addi.d a1, a1, -16 -- addi.d a2, a2, -16 -- -- sll.d t2, t0, a6 -- srl.d t3, t0, a5 -- -- srl.d t4, t1, a5 -- or t3, a7, t3 -- or t4, t2, t4 -- sll.d a7, t1, a6 -- -- st.d t3, a4, -8 -- st.d t4, a4, -16 -- -- addi.d a4, a4, -16 -- --L(back_un_less_16bytes): -- srai.d a3, a2, 3 -- beqz a3, L(back_un_less_8bytes) -- -- ld.d t0, a1, -8 -- -- addi.d a1, a1, -8 -- addi.d a2, a2, -8 -- -- srl.d t1, t0, a5 -- or t2, a7, t1 -- sll.d a7, t0, a6 -- -- st.d t2, a4, -8 -- addi.d a4, a4, -8 -- --L(back_un_less_8bytes): -- beqz a2, L(back_end) -- bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 -- -- # combine data in memory and a7(remaining part) -- ld.d t0, a1, -8 -- srl.d t0, t0, a5 -- or a7, a7, t0 -- --1: 
-- srai.d a3, a2, 2 -- beqz a3, L(back_un_less_4bytes) -- -- srai.d t0, a7, 32 -- addi.d a2, a2, -4 -- st.w t0, a4, -4 -- addi.d a4, a4, -4 -- slli.d a7, a7, 32 -- --L(back_un_less_4bytes): -- srai.d a3, a2, 1 -- beqz a3, L(back_un_less_2bytes) -- srai.d t0, a7, 48 -- addi.d a2, a2, -2 -- st.h t0, a4, -2 -- addi.d a4, a4, -2 -- slli.d a7, a7, 16 --L(back_un_less_2bytes): -- beqz a2, L(back_un_less_1byte) -- srai.d t0, a7, 56 -- st.b t0, a4, -1 --L(back_un_less_1byte): -- jr ra -- --L(back_short_data): -- pcaddi t1, 34 -- slli.d t2, a2, 3 -- sub.d t1, t1, t2 -- jr t1 -- -- ld.b t0, a1, 14 -- st.b t0, a0, 14 -- ld.b t0, a1, 13 -- st.b t0, a0, 13 -- ld.b t0, a1, 12 -- st.b t0, a0, 12 -- ld.b t0, a1, 11 -- st.b t0, a0, 11 -- ld.b t0, a1, 10 -- st.b t0, a0, 10 -- ld.b t0, a1, 9 -- st.b t0, a0, 9 -- ld.b t0, a1, 8 -- st.b t0, a0, 8 -- ld.b t0, a1, 7 -- st.b t0, a0, 7 -- ld.b t0, a1, 6 -- st.b t0, a0, 6 -- ld.b t0, a1, 5 -- st.b t0, a0, 5 -- ld.b t0, a1, 4 -- st.b t0, a0, 4 -- ld.b t0, a1, 3 -- st.b t0, a0, 3 -- ld.b t0, a1, 2 -- st.b t0, a0, 2 -- ld.b t0, a1, 1 -- st.b t0, a0, 1 -- ld.b t0, a1, 0 -- st.b t0, a0, 0 --L(back_end): -- jr ra -- --END(MEMCPY_NAME) -- --#ifdef _LIBC --libc_hidden_builtin_def (MEMCPY_NAME) --#endif -diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S -deleted file mode 100644 -index 6d1922c4..00000000 ---- a/sysdeps/loongarch/lp64/memmove.S -+++ /dev/null -@@ -1,2 +0,0 @@ --/* DONT DELETE THIS FILE, OTHERWIES MEMCPY.C WILL BE COMPILED. */ --/* There are too many common code in memcpy and memmove. See memcpy.S */ -diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S -deleted file mode 100644 -index eabd7d23..00000000 ---- a/sysdeps/loongarch/lp64/memset.S -+++ /dev/null -@@ -1,166 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef MEMSET_NAME --#define MEMSET_NAME memset --#endif -- --#define ST_64(n) \ -- st.d a1, a0, n; \ -- st.d a1, a0, n+8; \ -- st.d a1, a0, n+16; \ -- st.d a1, a0, n+24; \ -- st.d a1, a0, n+32; \ -- st.d a1, a0, n+40; \ -- st.d a1, a0, n+48; \ -- st.d a1, a0, n+56; -- --LEAF(MEMSET_NAME, 6) -- move t0, a0 -- andi a3, a0, 0x7 -- li.w t6, 16 -- beqz a3, L(align) -- blt a2, t6, L(short_data) -- --L(make_align): -- li.w t8, 8 -- sub.d t2, t8, a3 -- pcaddi t1, 11 -- slli.d t3, t2, 2 -- sub.d t1, t1, t3 -- jirl zero, t1, 0 -- --L(al7): -- st.b a1, t0, 6 --L(al6): -- st.b a1, t0, 5 --L(al5): -- st.b a1, t0, 4 --L(al4): -- st.b a1, t0, 3 --L(al3): -- st.b a1, t0, 2 --L(al2): -- st.b a1, t0, 1 --L(al1): -- st.b a1, t0, 0 --L(al0): -- add.d t0, t0, t2 -- sub.d a2, a2, t2 -- --L(align): -- bstrins.d a1, a1, 15, 8 -- bstrins.d a1, a1, 31, 16 -- bstrins.d a1, a1, 63, 32 -- -- blt a2, t6, L(less_16bytes) -- -- andi a4, a2, 0x3f -- beq a4, a2, L(less_64bytes) -- -- sub.d t1, a2, a4 -- move a2, a4 -- add.d a5, t0, t1 -- --L(loop_64bytes): -- addi.d t0, t0, 64 -- st.d a1, t0, -64 -- st.d a1, t0, -56 -- st.d a1, t0, -48 -- st.d a1, t0, -40 -- st.d a1, t0, -32 -- st.d a1, t0, -24 -- st.d a1, t0, -16 -- st.d a1, t0, -8 -- bne t0, a5, L(loop_64bytes) -- --L(less_64bytes): -- srai.d a4, a2, 5 -- beqz a4, L(less_32bytes) -- addi.d a2, a2, -32 -- st.d a1, t0, 0 -- st.d a1, t0, 8 -- st.d a1, t0, 16 -- st.d a1, t0, 24 -- addi.d t0, t0, 32 --L(less_32bytes): -- blt a2, t6, L(less_16bytes) -- addi.d a2, a2, -16 -- st.d a1, t0, 0 -- st.d a1, t0, 8 -- addi.d t0, t0, 16 --L(less_16bytes): -- srai.d a4, a2, 3 -- beqz a4, L(less_8bytes) -- addi.d a2, a2, -8 -- st.d a1, 
t0, 0 -- addi.d t0, t0, 8 --L(less_8bytes): -- beqz a2, L(less_1byte) -- srai.d a4, a2, 2 -- beqz a4, L(less_4bytes) -- addi.d a2, a2, -4 -- st.w a1, t0, 0 -- addi.d t0, t0, 4 --L(less_4bytes): -- srai.d a3, a2, 1 -- beqz a3, L(less_2bytes) -- addi.d a2, a2, -2 -- st.h a1, t0, 0 -- addi.d t0, t0, 2 --L(less_2bytes): -- beqz a2, L(less_1byte) -- st.b a1, t0, 0 --L(less_1byte): -- jr ra -- --L(short_data): -- pcaddi t1, 19 -- slli.d t3, a2, 2 -- sub.d t1, t1, t3 -- jirl zero, t1, 0 --L(short_15): -- st.b a1, a0, 14 -- --L(short_14): -- st.b a1, a0, 13 --L(short_13): -- st.b a1, a0, 12 --L(short_12): -- st.b a1, a0, 11 --L(short_11): -- st.b a1, a0, 10 --L(short_10): -- st.b a1, a0, 9 --L(short_9): -- st.b a1, a0, 8 --L(short_8): -- st.b a1, a0, 7 --L(short_7): -- st.b a1, a0, 6 --L(short_6): -- st.b a1, a0, 5 --L(short_5): -- st.b a1, a0, 4 --L(short_4): -- st.b a1, a0, 3 --L(short_3): -- st.b a1, a0, 2 --L(short_2): -- st.b a1, a0, 1 --L(short_1): -- st.b a1, a0, 0 --L(short_0): -- jr ra -- --END(MEMSET_NAME) -- --#ifdef _LIBC --libc_hidden_builtin_def (MEMSET_NAME) --#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S -index 4677c912..7dfa3ade 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S -@@ -1,7 +1,96 @@ - -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ - #if IS_IN (libc) - #define MEMCHR_NAME __memchr_aligned -+#else -+#define MEMCHR_NAME memchr - #endif - --#include "../memchr.S" -+LEAF(MEMCHR_NAME, 6) -+ beqz a2, L(out) -+ andi t1, a0, 0x7 -+ lu12i.w a3, 0x01010 -+ sub.d a5, a0, t1 -+ -+ bstrins.d a1, a1, 15, 8 -+ ld.d t0, a5, 0 -+ slli.d t2, t1, 3 -+ ori a3, a3, 0x101 -+ -+ bstrins.d a1, a1, 31, 16 -+ li.w t7, -1 -+ li.w t8, 9 -+ bstrins.d a3, a3, 63, 32 -+ -+ srl.d t3, t7, t2 -+ bstrins.d a1, a1, 63, 32 -+ sub.d t4, t8, t1 -+ orn t3, a1, t3 -+ -+ srl.d t0, t0, t2 -+ slli.d a4, a3, 7 # 0x8080808080808080 -+ sltu t4, a2, t4 -+ xor t2, t0, t3 -+ -+ sub.d a6, t2, a3 -+ andn a7, a4, t2 -+ and t2, a6, a7 -+ or t3, t2, t4 -+ -+ bnez t3, L(count_pos) -+ addi.d a2, a2, -8 -+ addi.d a0, a5, 8 -+ add.d a2, a2, t1 -+ -+L(loop): -+ ld.d t0, a0, 0 -+ sltui t4, a2, 9 -+ xor t2, t0, a1 -+ sub.d a6, t2, a3 -+ -+ andn a7, a4, t2 -+ and t2, a6, a7 -+ or t3, t2, t4 -+ bnez t3, L(count_pos) -+ -+ ld.d t1, a0, 8 -+ addi.d a0, a0, 16 -+ sltui t4, a2, 17 -+ xor t2, t1, a1 -+ -+ sub.d a6, t2, a3 -+ andn a7, a4, t2 -+ and t2, a6, a7 -+ addi.d a2, a2, -16 -+ -+ or t3, t2, t4 -+ beqz t3, L(loop) -+ addi.d a0, a0, -8 -+ addi.d a2, a2, 8 -+ -+L(count_pos): -+ ctz.d t0, t2 -+ srli.d t0, t0, 3 -+ sltu t1, t0, a2 -+ add.d a0, a0, t0 -+ -+ maskeqz a0, a0, t1 -+ jr ra -+ -+L(out): -+ move a0, zero -+ jr ra -+END(MEMCHR_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCHR_NAME) -+#endif - -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S -index 512eabca..9505dfce 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S -@@ -1,11 +1,289 @@ - --#if IS_IN (libc) - -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) - #define MEMCMP_NAME __memcmp_aligned -+#else -+#define MEMCMP_NAME memcmp -+#endif -+ -+LEAF(MEMCMP_NAME, 6) -+ beqz a2, L(ret) -+ andi a4, a1, 0x7 -+ andi a3, a0, 0x7 -+ sltu a5, a4, a3 -+ -+ xor t0, a0, a1 -+ li.w t8, 8 -+ maskeqz 
t0, t0, a5 -+ li.w t7, -1 -+ -+ xor a0, a0, t0 // a0 hold smaller one -+ xor a1, a1, t0 // a1 hold larger one -+ andi a3, a0, 0x7 // a3 hold small offset -+ andi a4, a1, 0x7 // a4 hold larger offset -+ -+ xor a0, a0, a3 -+ xor a1, a1, a4 -+ ld.d t2, a0, 0 // t2 = "fedcbaXX" -+ ld.d t1, a1, 0 // t1 = "54321YYY" -+ -+ slli.d t3, a3, 3 -+ slli.d t4, a4, 3 -+ sub.d a6, t3, t4 // a6 = 0xfffffffffffffff8 -+ srl.d t1, t1, t4 // t1 = "00054321" -+ -+ srl.d t0, t2, t3 // t0 = "00fedcba" -+ srl.d t5, t7, t4 // t5 = 0x000000FFFFFFFFFF -+ sub.d t6, t0, t1 // t6 hold diff -+ and t6, t6, t5 // t6 = "000xxxxx" -+ -+ sub.d t5, t8, a4 // t5 hold margin 8 - 3 = 5 -+ bnez t6, L(first_out) -+ bgeu t5, a2, L(ret) -+ sub.d a2, a2, t5 -+ -+ bnez a6, L(unaligned) -+ blt a2, t8, L(al_less_8bytes) -+ andi t1, a2, 31 -+ beq t1, a2, L(al_less_32bytes) -+ -+ sub.d t2, a2, t1 -+ add.d a4, a0, t2 -+ move a2, t1 -+ -+L(al_loop): -+ ld.d t0, a0, 8 -+ -+ ld.d t1, a1, 8 -+ ld.d t2, a0, 16 -+ ld.d t3, a1, 16 -+ ld.d t4, a0, 24 -+ -+ ld.d t5, a1, 24 -+ ld.d t6, a0, 32 -+ ld.d t7, a1, 32 -+ addi.d a0, a0, 32 -+ -+ addi.d a1, a1, 32 -+ bne t0, t1, L(out1) -+ bne t2, t3, L(out2) -+ bne t4, t5, L(out3) -+ -+ bne t6, t7, L(out4) -+ bne a0, a4, L(al_loop) -+ -+L(al_less_32bytes): -+ srai.d a4, a2, 4 -+ beqz a4, L(al_less_16bytes) -+ -+ ld.d t0, a0, 8 -+ ld.d t1, a1, 8 -+ ld.d t2, a0, 16 -+ ld.d t3, a1, 16 -+ -+ addi.d a0, a0, 16 -+ addi.d a1, a1, 16 -+ addi.d a2, a2, -16 -+ bne t0, t1, L(out1) -+ -+ bne t2, t3, L(out2) -+ -+L(al_less_16bytes): -+ srai.d a4, a2, 3 -+ beqz a4, L(al_less_8bytes) -+ ld.d t0, a0, 8 -+ -+ ld.d t1, a1, 8 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ bne t0, t1, L(out1) -+ -+L(al_less_8bytes): -+ beqz a2, L(ret) -+ ld.d t0, a0, 8 -+ ld.d t1, a1, 8 -+ -+ li.d t7, -1 -+ slli.d t2, a2, 3 -+ sll.d t2, t7, t2 -+ sub.d t3, t0, t1 -+ -+ andn t6, t3, t2 -+ bnez t6, L(count_diff) -+ -+L(ret): -+ move a0, zero -+ jr ra -+ -+L(out4): -+ move t0, t6 -+ move t1, t7 -+ sub.d t6, t6, t7 -+ b L(count_diff) -+ -+L(out3): -+ move t0, t4 -+ move t1, t5 -+ sub.d t6, t4, t5 -+ b L(count_diff) -+ -+L(out2): -+ move t0, t2 -+ move t1, t3 -+L(out1): -+ sub.d t6, t0, t1 -+ b L(count_diff) -+ -+L(first_out): -+ slli.d t4, a2, 3 -+ slt t3, a2, t5 -+ sll.d t4, t7, t4 -+ maskeqz t4, t4, t3 -+ -+ andn t6, t6, t4 -+ -+L(count_diff): -+ ctz.d t2, t6 -+ bstrins.d t2, zero, 2, 0 -+ srl.d t0, t0, t2 -+ -+ srl.d t1, t1, t2 -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ sub.d t2, t0, t1 -+ -+ sub.d t3, t1, t0 -+ masknez t2, t2, a5 -+ maskeqz t3, t3, a5 -+ or a0, t2, t3 -+ -+ jr ra -+ -+L(unaligned): -+ sub.d a7, zero, a6 -+ srl.d t0, t2, a6 -+ blt a2, t8, L(un_less_8bytes) -+ -+ andi t1, a2, 31 -+ beq t1, a2, L(un_less_32bytes) -+ sub.d t2, a2, t1 -+ add.d a4, a0, t2 -+ -+ move a2, t1 -+ -+L(un_loop): -+ ld.d t2, a0, 8 -+ ld.d t1, a1, 8 -+ ld.d t4, a0, 16 -+ -+ ld.d t3, a1, 16 -+ ld.d t6, a0, 24 -+ ld.d t5, a1, 24 -+ ld.d t8, a0, 32 -+ -+ ld.d t7, a1, 32 -+ addi.d a0, a0, 32 -+ addi.d a1, a1, 32 -+ sll.d a3, t2, a7 -+ -+ or t0, a3, t0 -+ bne t0, t1, L(out1) -+ srl.d t0, t2, a6 -+ sll.d a3, t4, a7 -+ -+ or t2, a3, t0 -+ bne t2, t3, L(out2) -+ srl.d t0, t4, a6 -+ sll.d a3, t6, a7 -+ -+ or t4, a3, t0 -+ bne t4, t5, L(out3) -+ srl.d t0, t6, a6 -+ sll.d a3, t8, a7 -+ -+ or t6, t0, a3 -+ bne t6, t7, L(out4) -+ srl.d t0, t8, a6 -+ bne a0, a4, L(un_loop) -+ -+L(un_less_32bytes): -+ srai.d a4, a2, 4 -+ beqz a4, L(un_less_16bytes) -+ ld.d t2, a0, 8 -+ ld.d t1, a1, 8 -+ -+ ld.d t4, a0, 16 -+ ld.d t3, a1, 16 -+ addi.d a0, a0, 16 -+ 
addi.d a1, a1, 16 -+ -+ addi.d a2, a2, -16 -+ sll.d a3, t2, a7 -+ or t0, a3, t0 -+ bne t0, t1, L(out1) -+ -+ srl.d t0, t2, a6 -+ sll.d a3, t4, a7 -+ or t2, a3, t0 -+ bne t2, t3, L(out2) -+ -+ srl.d t0, t4, a6 -+ -+L(un_less_16bytes): -+ srai.d a4, a2, 3 -+ beqz a4, L(un_less_8bytes) -+ ld.d t2, a0, 8 -+ -+ ld.d t1, a1, 8 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ sll.d a3, t2, a7 -+ or t0, a3, t0 -+ bne t0, t1, L(out1) -+ srl.d t0, t2, a6 -+ -+L(un_less_8bytes): -+ beqz a2, L(ret) -+ andi a7, a7, 63 -+ slli.d a4, a2, 3 -+ bgeu a7, a4, L(last_cmp) -+ -+ ld.d t2, a0, 8 -+ sll.d a3, t2, a7 -+ or t0, a3, t0 -+ -+L(last_cmp): -+ ld.d t1, a1, 8 -+ -+ li.d t7, -1 -+ sll.d t2, t7, a4 -+ sub.d t3, t0, t1 -+ andn t6, t3, t2 -+ -+ bnez t6, L(count_diff) -+ move a0, zero -+ jr ra -+ -+END(MEMCMP_NAME) - -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCMP_NAME) - #endif - --#include "../memcmp.S" - # undef bcmp - weak_alias (MEMCMP_NAME, bcmp) - -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S -index 5ff8b4e6..3fc86a7f 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S -@@ -1,11 +1,804 @@ -- -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define MEMCPY_NAME __memcpy_aligned - #define MEMMOVE_NAME __memmove_aligned -+#else -+#define MEMCPY_NAME memcpy -+#define MEMMOVE_NAME memmove -+#endif -+ -+#define LD_64(reg, n) \ -+ ld.d t0, reg, n; \ -+ ld.d t1, reg, n+8; \ -+ ld.d t2, reg, n+16; \ -+ ld.d t3, reg, n+24; \ -+ ld.d t4, reg, n+32; \ -+ ld.d t5, reg, n+40; \ -+ ld.d t6, reg, n+48; \ -+ ld.d t7, reg, n+56; -+ -+#define ST_64(reg, n) \ -+ st.d t0, reg, n; \ -+ st.d t1, reg, n+8; \ -+ st.d t2, reg, n+16; \ -+ st.d t3, reg, n+24; \ -+ st.d t4, reg, n+32; \ -+ st.d t5, reg, n+40; \ -+ st.d t6, reg, n+48; \ -+ st.d t7, reg, n+56; - -+LEAF(MEMMOVE_NAME, 6) -+ sub.d t0, a0, a1 -+ bltu t0, a2, L(copy_back) -+ -+END(MEMMOVE_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMMOVE_NAME) - #endif - --#include "../memcpy.S" -+LEAF_NO_ALIGN(MEMCPY_NAME) -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(short_data) # less than 16 bytes -+ -+ move a4, a0 -+ andi a5, a0, 0x7 -+ andi a6, a1, 0x7 -+ li.d t8, 8 -+ beqz a5, L(check_align) -+ -+ # make dest aligned 8 bytes -+ sub.d t2, t8, a5 -+ sub.d a2, a2, t2 -+ -+ pcaddi t1, 20 -+ slli.d t3, t2, 3 -+ add.d a1, a1, t2 -+ sub.d t1, t1, t3 -+ add.d a4, a4, t2 -+ jr t1 -+ -+L(al7): -+ ld.b t0, a1, -7 -+ st.b t0, a4, -7 -+L(al6): -+ ld.b t0, a1, -6 -+ st.b t0, a4, -6 -+L(al5): -+ ld.b t0, a1, -5 -+ st.b t0, a4, -5 -+L(al4): -+ ld.b t0, a1, -4 -+ st.b t0, a4, -4 -+L(al3): -+ ld.b t0, a1, -3 -+ st.b t0, a4, -3 -+L(al2): -+ ld.b t0, a1, -2 -+ st.b t0, a4, -2 -+L(al1): -+ ld.b t0, a1, -1 -+ st.b t0, a4, -1 -+ -+L(check_align): -+ bne a5, a6, L(unalign) -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(al_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(al_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ add.d a5, a1, t0 -+ -+L(loop_64bytes): -+ LD_64(a1, 0) -+ addi.d a1, a1, 64 -+ ST_64(a4, 0) -+ -+ addi.d a4, a4, 64 -+ bne a1, a5, L(loop_64bytes) -+ -+L(al_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(al_less_32bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ -+ addi.d a1, a1, 32 -+ addi.d a2, a2, -32 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ st.d t2, a4, 16 -+ st.d t3, a4, 24 -+ -+ addi.d a4, a4, 32 -+ -+L(al_less_32bytes): 
-+ srai.d a3, a2, 4 -+ beqz a3, L(al_less_16bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ addi.d a1, a1, 16 -+ addi.d a2, a2, -16 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ addi.d a4, a4, 16 -+ -+L(al_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(al_less_8bytes) -+ -+ ld.d t0, a1, 0 -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ st.d t0, a4, 0 -+ addi.d a4, a4, 8 -+ -+L(al_less_8bytes): -+ srai.d a3, a2, 2 -+ beqz a3, L(al_less_4bytes) -+ -+ ld.w t0, a1, 0 -+ addi.d a1, a1, 4 -+ addi.d a2, a2, -4 -+ -+ st.w t0, a4, 0 -+ addi.d a4, a4, 4 -+ -+L(al_less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(al_less_2bytes) -+ -+ ld.h t0, a1, 0 -+ addi.d a1, a1, 2 -+ addi.d a2, a2, -2 -+ -+ st.h t0, a4, 0 -+ addi.d a4, a4, 2 -+ -+L(al_less_2bytes): -+ beqz a2, L(al_less_1byte) -+ -+ ld.b t0, a1, 0 -+ st.b t0, a4, 0 -+ -+L(al_less_1byte): -+ jr ra -+ -+L(unalign): -+ andi a5, a1, 0x7 -+ bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned -+ -+ sub.d t8, t8, a5 # use t8 to save count of bytes for aligning -+ slli.d a5, a5, 3 -+ -+ ld.d t0, a1, 0 -+ addi.d a1, a1, 8 -+ -+ slli.d a6, t8, 3 -+ srl.d a7, t0, a5 -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(un_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(un_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ add.d a3, a1, t0 -+ -+# a5 shift right num -+# a6 shift left num -+# a7 remaining part -+L(un_long_bytes): -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ -+ srl.d t4, t0, a5 -+ sll.d t0, t0, a6 -+ -+ srl.d t5, t1, a5 -+ sll.d t1, t1, a6 -+ -+ srl.d t6, t2, a5 -+ sll.d t2, t2, a6 -+ -+ srl.d t7, t3, a5 -+ sll.d t3, t3, a6 -+ -+ or t0, a7, t0 -+ or t1, t4, t1 -+ or t2, t5, t2 -+ or t3, t6, t3 -+ -+ ld.d t4, a1, 32 -+ ld.d t5, a1, 40 -+ ld.d t6, a1, 48 -+ ld.d a7, a1, 56 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ st.d t2, a4, 16 -+ st.d t3, a4, 24 -+ -+ addi.d a1, a1, 64 -+ -+ srl.d t0, t4, a5 -+ sll.d t4, t4, a6 -+ -+ srl.d t1, t5, a5 -+ sll.d t5, t5, a6 -+ -+ srl.d t2, t6, a5 -+ sll.d t6, t6, a6 -+ -+ sll.d t3, a7, a6 -+ srl.d a7, a7, a5 -+ -+ or t4, t7, t4 -+ or t5, t0, t5 -+ or t6, t1, t6 -+ or t3, t2, t3 -+ -+ st.d t4, a4, 32 -+ st.d t5, a4, 40 -+ st.d t6, a4, 48 -+ st.d t3, a4, 56 -+ -+ addi.d a4, a4, 64 -+ bne a3, a1, L(un_long_bytes) -+ -+L(un_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(un_less_32bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ -+ addi.d a1, a1, 32 -+ addi.d a2, a2, -32 -+ -+ srl.d t4, t0, a5 -+ sll.d t0, t0, a6 -+ -+ srl.d t5, t1, a5 -+ sll.d t1, t1, a6 -+ -+ srl.d t6, t2, a5 -+ sll.d t2, t2, a6 -+ -+ or t0, a7, t0 -+ -+ srl.d a7, t3, a5 -+ sll.d t3, t3, a6 -+ -+ or t1, t4, t1 -+ or t2, t5, t2 -+ or t3, t6, t3 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ st.d t2, a4, 16 -+ st.d t3, a4, 24 -+ -+ addi.d a4, a4, 32 -+ -+L(un_less_32bytes): -+ srai.d a3, a2, 4 -+ beqz a3, L(un_less_16bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ -+ addi.d a1, a1, 16 -+ addi.d a2, a2, -16 -+ -+ srl.d t2, t0, a5 -+ sll.d t3, t0, a6 -+ -+ sll.d t4, t1, a6 -+ or t3, a7, t3 -+ or t4, t2, t4 -+ srl.d a7, t1, a5 -+ -+ st.d t3, a4, 0 -+ st.d t4, a4, 8 -+ -+ addi.d a4, a4, 16 -+ -+L(un_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(un_less_8bytes) -+ -+ ld.d t0, a1, 0 -+ -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ sll.d t1, t0, a6 -+ or t2, a7, t1 -+ srl.d a7, t0, a5 -+ -+ st.d t2, a4, 0 -+ addi.d a4, a4, 8 -+ -+L(un_less_8bytes): -+ beqz a2, L(un_less_1byte) -+ bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 -+ -+ # combine data in memory and 
a7(remaining part) -+ ld.d t0, a1, 0 -+ sll.d t0, t0, a6 -+ or a7, a7, t0 -+ -+1: -+ srai.d a3, a2, 2 -+ beqz a3, L(un_less_4bytes) -+ -+ addi.d a2, a2, -4 -+ st.w a7, a4, 0 -+ addi.d a4, a4, 4 -+ srai.d a7, a7, 32 -+ -+L(un_less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(un_less_2bytes) -+ -+ addi.d a2, a2, -2 -+ st.h a7, a4, 0 -+ addi.d a4, a4, 2 -+ srai.d a7, a7, 16 - -+L(un_less_2bytes): -+ beqz a2, L(un_less_1byte) -+ st.b a7, a4, 0 -+ -+L(un_less_1byte): -+ jr ra -+ -+# Bytes copying for data less than 16 bytes -+L(short_data): -+ pcaddi t1, 36 -+ slli.d t2, a2, 3 -+ add.d a4, a0, a2 -+ sub.d t1, t1, t2 -+ add.d a1, a1, a2 -+ jr t1 -+ -+L(short_15_bytes): -+ ld.b t0, a1, -15 -+ st.b t0, a4, -15 -+L(short_14_bytes): -+ ld.b t0, a1, -14 -+ st.b t0, a4, -14 -+L(short_13_bytes): -+ ld.b t0, a1, -13 -+ st.b t0, a4, -13 -+L(short_12_bytes): -+ ld.b t0, a1, -12 -+ st.b t0, a4, -12 -+L(short_11_bytes): -+ ld.b t0, a1, -11 -+ st.b t0, a4, -11 -+L(short_10_bytes): -+ ld.b t0, a1, -10 -+ st.b t0, a4, -10 -+L(short_9_bytes): -+ ld.b t0, a1, -9 -+ st.b t0, a4, -9 -+L(short_8_bytes): -+ ld.b t0, a1, -8 -+ st.b t0, a4, -8 -+L(short_7_bytes): -+ ld.b t0, a1, -7 -+ st.b t0, a4, -7 -+L(short_6_bytes): -+ ld.b t0, a1, -6 -+ st.b t0, a4, -6 -+L(short_5_bytes): -+ ld.b t0, a1, -5 -+ st.b t0, a4, -5 -+L(short_4_bytes): -+ ld.b t0, a1, -4 -+ st.b t0, a4, -4 -+L(short_3_bytes): -+ ld.b t0, a1, -3 -+ st.b t0, a4, -3 -+L(short_2_bytes): -+ ld.b t0, a1, -2 -+ st.b t0, a4, -2 -+L(short_1_bytes): -+ ld.b t0, a1, -1 -+ st.b t0, a4, -1 -+ jr ra -+ -+L(copy_back): -+ srai.d a3, a2, 4 -+ beqz a3, L(back_short_data) # less than 16 bytes -+ -+ add.d a4, a0, a2 # store the tail of dest -+ add.d a1, a1, a2 # store the tail of src -+ -+ andi a5, a4, 0x7 -+ andi a6, a1, 0x7 -+ beqz a5, L(back_check_align) -+ -+ # make dest aligned 8 bytes -+ sub.d a2, a2, a5 -+ sub.d a1, a1, a5 -+ sub.d a4, a4, a5 -+ -+ pcaddi t1, 18 -+ slli.d t3, a5, 3 -+ sub.d t1, t1, t3 -+ jr t1 -+ -+ ld.b t0, a1, 6 -+ st.b t0, a4, 6 -+ ld.b t0, a1, 5 -+ st.b t0, a4, 5 -+ ld.b t0, a1, 4 -+ st.b t0, a4, 4 -+ ld.b t0, a1, 3 -+ st.b t0, a4, 3 -+ ld.b t0, a1, 2 -+ st.b t0, a4, 2 -+ ld.b t0, a1, 1 -+ st.b t0, a4, 1 -+ ld.b t0, a1, 0 -+ st.b t0, a4, 0 -+ -+L(back_check_align): -+ bne a5, a6, L(back_unalign) -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(back_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(back_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ sub.d a5, a1, t0 -+ -+L(back_loop_64bytes): -+ LD_64(a1, -64) -+ addi.d a1, a1, -64 -+ ST_64(a4, -64) -+ -+ addi.d a4, a4, -64 -+ bne a1, a5, L(back_loop_64bytes) -+ -+L(back_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(back_less_32bytes) -+ -+ ld.d t0, a1, -32 -+ ld.d t1, a1, -24 -+ ld.d t2, a1, -16 -+ ld.d t3, a1, -8 -+ -+ addi.d a1, a1, -32 -+ addi.d a2, a2, -32 -+ -+ st.d t0, a4, -32 -+ st.d t1, a4, -24 -+ st.d t2, a4, -16 -+ st.d t3, a4, -8 -+ -+ addi.d a4, a4, -32 -+ -+L(back_less_32bytes): -+ srai.d a3, a2, 4 -+ beqz a3, L(back_less_16bytes) -+ -+ ld.d t0, a1, -16 -+ ld.d t1, a1, -8 -+ -+ addi.d a2, a2, -16 -+ addi.d a1, a1, -16 -+ -+ st.d t0, a4, -16 -+ st.d t1, a4, -8 -+ addi.d a4, a4, -16 -+ -+L(back_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(back_less_8bytes) -+ -+ ld.d t0, a1, -8 -+ addi.d a2, a2, -8 -+ addi.d a1, a1, -8 -+ -+ st.d t0, a4, -8 -+ addi.d a4, a4, -8 -+ -+L(back_less_8bytes): -+ srai.d a3, a2, 2 -+ beqz a3, L(back_less_4bytes) -+ -+ ld.w t0, a1, -4 -+ addi.d a2, a2, -4 -+ addi.d a1, a1, -4 -+ -+ st.w t0, a4, -4 -+ addi.d a4, a4, -4 -+ -+L(back_less_4bytes): -+ srai.d 
a3, a2, 1 -+ beqz a3, L(back_less_2bytes) -+ -+ ld.h t0, a1, -2 -+ addi.d a2, a2, -2 -+ addi.d a1, a1, -2 -+ -+ st.h t0, a4, -2 -+ addi.d a4, a4, -2 -+ -+L(back_less_2bytes): -+ beqz a2, L(back_less_1byte) -+ -+ ld.b t0, a1, -1 -+ st.b t0, a4, -1 -+ -+L(back_less_1byte): -+ jr ra -+ -+L(back_unalign): -+ andi t8, a1, 0x7 -+ bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned -+ -+ sub.d a6, zero, t8 -+ -+ ld.d t0, a1, 0 -+ slli.d a6, a6, 3 -+ slli.d a5, t8, 3 -+ sll.d a7, t0, a6 -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(back_un_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(back_un_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ sub.d a3, a1, t0 -+ -+L(back_un_long_bytes): -+ ld.d t0, a1, -8 -+ ld.d t1, a1, -16 -+ ld.d t2, a1, -24 -+ ld.d t3, a1, -32 -+ -+ sll.d t4, t0, a6 -+ srl.d t0, t0, a5 -+ -+ sll.d t5, t1, a6 -+ srl.d t1, t1, a5 -+ -+ sll.d t6, t2, a6 -+ srl.d t2, t2, a5 -+ -+ sll.d t7, t3, a6 -+ srl.d t3, t3, a5 -+ -+ or t0, t0, a7 -+ or t1, t1, t4 -+ or t2, t2, t5 -+ or t3, t3, t6 -+ -+ ld.d t4, a1, -40 -+ ld.d t5, a1, -48 -+ ld.d t6, a1, -56 -+ ld.d a7, a1, -64 -+ st.d t0, a4, -8 -+ st.d t1, a4, -16 -+ st.d t2, a4, -24 -+ st.d t3, a4, -32 -+ -+ addi.d a1, a1, -64 -+ -+ sll.d t0, t4, a6 -+ srl.d t4, t4, a5 -+ -+ sll.d t1, t5, a6 -+ srl.d t5, t5, a5 -+ -+ sll.d t2, t6, a6 -+ srl.d t6, t6, a5 -+ -+ srl.d t3, a7, a5 -+ sll.d a7, a7, a6 -+ -+ or t4, t7, t4 -+ or t5, t0, t5 -+ or t6, t1, t6 -+ or t3, t2, t3 -+ -+ st.d t4, a4, -40 -+ st.d t5, a4, -48 -+ st.d t6, a4, -56 -+ st.d t3, a4, -64 -+ -+ addi.d a4, a4, -64 -+ bne a3, a1, L(back_un_long_bytes) -+ -+L(back_un_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(back_un_less_32bytes) -+ -+ ld.d t0, a1, -8 -+ ld.d t1, a1, -16 -+ ld.d t2, a1, -24 -+ ld.d t3, a1, -32 -+ -+ addi.d a1, a1, -32 -+ addi.d a2, a2, -32 -+ -+ sll.d t4, t0, a6 -+ srl.d t0, t0, a5 -+ -+ sll.d t5, t1, a6 -+ srl.d t1, t1, a5 -+ -+ sll.d t6, t2, a6 -+ srl.d t2, t2, a5 -+ -+ or t0, a7, t0 -+ -+ sll.d a7, t3, a6 -+ srl.d t3, t3, a5 -+ -+ or t1, t4, t1 -+ or t2, t5, t2 -+ or t3, t6, t3 -+ -+ st.d t0, a4, -8 -+ st.d t1, a4, -16 -+ st.d t2, a4, -24 -+ st.d t3, a4, -32 -+ -+ addi.d a4, a4, -32 -+ -+L(back_un_less_32bytes): -+ srai.d a3, a2, 4 -+ beqz a3, L(back_un_less_16bytes) -+ -+ ld.d t0, a1, -8 -+ ld.d t1, a1, -16 -+ -+ addi.d a1, a1, -16 -+ addi.d a2, a2, -16 -+ -+ sll.d t2, t0, a6 -+ srl.d t3, t0, a5 -+ -+ srl.d t4, t1, a5 -+ or t3, a7, t3 -+ or t4, t2, t4 -+ sll.d a7, t1, a6 -+ -+ st.d t3, a4, -8 -+ st.d t4, a4, -16 -+ -+ addi.d a4, a4, -16 -+ -+L(back_un_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(back_un_less_8bytes) -+ -+ ld.d t0, a1, -8 -+ -+ addi.d a1, a1, -8 -+ addi.d a2, a2, -8 -+ -+ srl.d t1, t0, a5 -+ or t2, a7, t1 -+ sll.d a7, t0, a6 -+ -+ st.d t2, a4, -8 -+ addi.d a4, a4, -8 -+ -+L(back_un_less_8bytes): -+ beqz a2, L(back_end) -+ bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 -+ -+ # combine data in memory and a7(remaining part) -+ ld.d t0, a1, -8 -+ srl.d t0, t0, a5 -+ or a7, a7, t0 -+ -+1: -+ srai.d a3, a2, 2 -+ beqz a3, L(back_un_less_4bytes) -+ -+ srai.d t0, a7, 32 -+ addi.d a2, a2, -4 -+ st.w t0, a4, -4 -+ addi.d a4, a4, -4 -+ slli.d a7, a7, 32 -+ -+L(back_un_less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(back_un_less_2bytes) -+ srai.d t0, a7, 48 -+ addi.d a2, a2, -2 -+ st.h t0, a4, -2 -+ addi.d a4, a4, -2 -+ slli.d a7, a7, 16 -+L(back_un_less_2bytes): -+ beqz a2, L(back_un_less_1byte) -+ srai.d t0, a7, 56 -+ st.b t0, a4, -1 -+L(back_un_less_1byte): -+ jr ra -+ -+L(back_short_data): -+ pcaddi t1, 34 -+ 
slli.d t2, a2, 3 -+ sub.d t1, t1, t2 -+ jr t1 -+ -+ ld.b t0, a1, 14 -+ st.b t0, a0, 14 -+ ld.b t0, a1, 13 -+ st.b t0, a0, 13 -+ ld.b t0, a1, 12 -+ st.b t0, a0, 12 -+ ld.b t0, a1, 11 -+ st.b t0, a0, 11 -+ ld.b t0, a1, 10 -+ st.b t0, a0, 10 -+ ld.b t0, a1, 9 -+ st.b t0, a0, 9 -+ ld.b t0, a1, 8 -+ st.b t0, a0, 8 -+ ld.b t0, a1, 7 -+ st.b t0, a0, 7 -+ ld.b t0, a1, 6 -+ st.b t0, a0, 6 -+ ld.b t0, a1, 5 -+ st.b t0, a0, 5 -+ ld.b t0, a1, 4 -+ st.b t0, a0, 4 -+ ld.b t0, a1, 3 -+ st.b t0, a0, 3 -+ ld.b t0, a1, 2 -+ st.b t0, a0, 2 -+ ld.b t0, a1, 1 -+ st.b t0, a0, 1 -+ ld.b t0, a1, 0 -+ st.b t0, a0, 0 -+L(back_end): -+ jr ra -+ -+END(MEMCPY_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCPY_NAME) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S -index da2f5ada..412ee849 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S -@@ -1,9 +1,169 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define MEMSET_NAME __memset_aligned -- -+#else -+#define MEMSET_NAME memset - #endif - --#include "../memset.S" -+#define ST_64(n) \ -+ st.d a1, a0, n; \ -+ st.d a1, a0, n+8; \ -+ st.d a1, a0, n+16; \ -+ st.d a1, a0, n+24; \ -+ st.d a1, a0, n+32; \ -+ st.d a1, a0, n+40; \ -+ st.d a1, a0, n+48; \ -+ st.d a1, a0, n+56; -+ -+LEAF(MEMSET_NAME, 6) -+ move t0, a0 -+ andi a3, a0, 0x7 -+ li.w t6, 16 -+ beqz a3, L(align) -+ blt a2, t6, L(short_data) -+ -+L(make_align): -+ li.w t8, 8 -+ sub.d t2, t8, a3 -+ pcaddi t1, 11 -+ slli.d t3, t2, 2 -+ sub.d t1, t1, t3 -+ jirl zero, t1, 0 -+ -+L(al7): -+ st.b a1, t0, 6 -+L(al6): -+ st.b a1, t0, 5 -+L(al5): -+ st.b a1, t0, 4 -+L(al4): -+ st.b a1, t0, 3 -+L(al3): -+ st.b a1, t0, 2 -+L(al2): -+ st.b a1, t0, 1 -+L(al1): -+ st.b a1, t0, 0 -+L(al0): -+ add.d t0, t0, t2 -+ sub.d a2, a2, t2 -+ -+L(align): -+ bstrins.d a1, a1, 15, 8 -+ bstrins.d a1, a1, 31, 16 -+ bstrins.d a1, a1, 63, 32 -+ -+ blt a2, t6, L(less_16bytes) -+ -+ andi a4, a2, 0x3f -+ beq a4, a2, L(less_64bytes) -+ -+ sub.d t1, a2, a4 -+ move a2, a4 -+ add.d a5, t0, t1 -+ -+L(loop_64bytes): -+ addi.d t0, t0, 64 -+ st.d a1, t0, -64 -+ st.d a1, t0, -56 -+ st.d a1, t0, -48 -+ st.d a1, t0, -40 -+ st.d a1, t0, -32 -+ st.d a1, t0, -24 -+ st.d a1, t0, -16 -+ st.d a1, t0, -8 -+ bne t0, a5, L(loop_64bytes) -+ -+L(less_64bytes): -+ srai.d a4, a2, 5 -+ beqz a4, L(less_32bytes) -+ addi.d a2, a2, -32 -+ st.d a1, t0, 0 -+ st.d a1, t0, 8 -+ st.d a1, t0, 16 -+ st.d a1, t0, 24 -+ addi.d t0, t0, 32 -+L(less_32bytes): -+ blt a2, t6, L(less_16bytes) -+ addi.d a2, a2, -16 -+ st.d a1, t0, 0 -+ st.d a1, t0, 8 -+ addi.d t0, t0, 16 -+L(less_16bytes): -+ srai.d a4, a2, 3 -+ beqz a4, L(less_8bytes) -+ addi.d a2, a2, -8 -+ st.d a1, t0, 0 -+ addi.d t0, t0, 8 -+L(less_8bytes): -+ beqz a2, L(less_1byte) -+ srai.d a4, a2, 2 -+ beqz a4, L(less_4bytes) -+ addi.d a2, a2, -4 -+ st.w a1, t0, 0 -+ addi.d t0, t0, 4 -+L(less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(less_2bytes) -+ addi.d a2, a2, -2 -+ st.h a1, t0, 0 -+ addi.d t0, t0, 2 -+L(less_2bytes): -+ beqz a2, L(less_1byte) -+ st.b a1, t0, 0 -+L(less_1byte): -+ jr ra -+ -+L(short_data): -+ pcaddi t1, 19 -+ slli.d t3, a2, 2 -+ sub.d t1, t1, t3 -+ jirl zero, t1, 0 -+L(short_15): -+ st.b a1, a0, 14 -+ -+L(short_14): -+ st.b a1, a0, 13 -+L(short_13): -+ st.b a1, a0, 12 -+L(short_12): -+ st.b a1, a0, 11 -+L(short_11): -+ st.b a1, a0, 10 -+L(short_10): -+ st.b a1, a0, 9 -+L(short_9): -+ st.b a1, a0, 8 
-+L(short_8): -+ st.b a1, a0, 7 -+L(short_7): -+ st.b a1, a0, 6 -+L(short_6): -+ st.b a1, a0, 5 -+L(short_5): -+ st.b a1, a0, 4 -+L(short_4): -+ st.b a1, a0, 3 -+L(short_3): -+ st.b a1, a0, 2 -+L(short_2): -+ st.b a1, a0, 1 -+L(short_1): -+ st.b a1, a0, 0 -+L(short_0): -+ jr ra -+ -+END(MEMSET_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMSET_NAME) -+#endif - -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S -index 0b46b4ca..a13e293f 100644 ---- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S -@@ -1,7 +1,115 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) - #define RAWMEMCHR_NAME __rawmemchr_aligned -+#else -+#define RAWMEMCHR_NAME __rawmemchr - #endif - --#include "../rawmemchr.S" -+LEAF(RAWMEMCHR_NAME, 6) -+ andi t1, a0, 0x7 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ bstrins.d a1, a1, 15, 8 -+ -+ ld.d t0, a0, 0 -+ slli.d t1, t1, 3 -+ ori a2, a2, 0x101 -+ bstrins.d a1, a1, 31, 16 -+ -+ li.w t8, -1 -+ bstrins.d a1, a1, 63, 32 -+ bstrins.d a2, a2, 63, 32 -+ sll.d t2, t8, t1 -+ -+ sll.d t3, a1, t1 -+ orn t0, t0, t2 -+ slli.d a3, a2, 7 -+ beqz a1, L(find_zero) -+ -+ xor t0, t0, t3 -+ sub.d t1, t0, a2 -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ -+ bnez t3, L(count_pos) -+ addi.d a0, a0, 8 -+ -+L(loop): -+ ld.d t0, a0, 0 -+ xor t0, t0, a1 -+ -+ sub.d t1, t0, a2 -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ bnez t3, L(count_pos) -+ -+ ld.d t0, a0, 8 -+ addi.d a0, a0, 16 -+ xor t0, t0, a1 -+ sub.d t1, t0, a2 -+ -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ beqz t3, L(loop) -+ addi.d a0, a0, -8 -+L(count_pos): -+ ctz.d t0, t3 -+ srli.d t0, t0, 3 -+ add.d a0, a0, t0 -+ jr ra -+ -+L(loop_7bit): -+ ld.d t0, a0, 0 -+L(find_zero): -+ sub.d t1, t0, a2 -+ and t2, t1, a3 -+ bnez t2, L(more_check) -+ -+ ld.d t0, a0, 8 -+ addi.d a0, a0, 16 -+ sub.d t1, t0, a2 -+ and t2, t1, a3 -+ -+ beqz t2, L(loop_7bit) -+ addi.d a0, a0, -8 -+ -+L(more_check): -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ bnez t3, L(count_pos) -+ addi.d a0, a0, 8 -+ -+L(loop_8bit): -+ ld.d t0, a0, 0 -+ -+ sub.d t1, t0, a2 -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ bnez t3, L(count_pos) -+ -+ ld.d t0, a0, 8 -+ addi.d a0, a0, 16 -+ sub.d t1, t0, a2 -+ -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ beqz t3, L(loop_8bit) -+ -+ addi.d a0, a0, -8 -+ b L(count_pos) -+ -+END(RAWMEMCHR_NAME) -+ -+#ifdef _LIBC -+weak_alias (__rawmemchr, rawmemchr) -+libc_hidden_builtin_def (__rawmemchr) -+#endif - -diff --git a/sysdeps/loongarch/lp64/rawmemchr.S b/sysdeps/loongarch/lp64/rawmemchr.S -deleted file mode 100644 -index ef1db7ed..00000000 ---- a/sysdeps/loongarch/lp64/rawmemchr.S -+++ /dev/null -@@ -1,113 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef RAWMEMCHR_NAME --# define RAWMEMCHR_NAME __rawmemchr --#endif -- -- --LEAF(RAWMEMCHR_NAME, 6) -- andi t1, a0, 0x7 -- bstrins.d a0, zero, 2, 0 -- lu12i.w a2, 0x01010 -- bstrins.d a1, a1, 15, 8 -- -- ld.d t0, a0, 0 -- slli.d t1, t1, 3 -- ori a2, a2, 0x101 -- bstrins.d a1, a1, 31, 16 -- -- li.w t8, -1 -- bstrins.d a1, a1, 63, 32 -- bstrins.d a2, a2, 63, 32 -- sll.d t2, t8, t1 -- -- sll.d t3, a1, t1 -- orn t0, t0, t2 -- slli.d a3, a2, 7 -- beqz a1, L(find_zero) -- -- xor t0, t0, t3 -- sub.d t1, t0, a2 -- andn t2, a3, t0 -- and t3, t1, t2 -- -- bnez t3, L(count_pos) -- addi.d a0, a0, 8 -- --L(loop): -- ld.d t0, a0, 0 -- xor t0, t0, a1 -- -- sub.d t1, t0, a2 -- andn 
t2, a3, t0 -- and t3, t1, t2 -- bnez t3, L(count_pos) -- -- ld.d t0, a0, 8 -- addi.d a0, a0, 16 -- xor t0, t0, a1 -- sub.d t1, t0, a2 -- -- andn t2, a3, t0 -- and t3, t1, t2 -- beqz t3, L(loop) -- addi.d a0, a0, -8 --L(count_pos): -- ctz.d t0, t3 -- srli.d t0, t0, 3 -- add.d a0, a0, t0 -- jr ra -- --L(loop_7bit): -- ld.d t0, a0, 0 --L(find_zero): -- sub.d t1, t0, a2 -- and t2, t1, a3 -- bnez t2, L(more_check) -- -- ld.d t0, a0, 8 -- addi.d a0, a0, 16 -- sub.d t1, t0, a2 -- and t2, t1, a3 -- -- beqz t2, L(loop_7bit) -- addi.d a0, a0, -8 -- --L(more_check): -- andn t2, a3, t0 -- and t3, t1, t2 -- bnez t3, L(count_pos) -- addi.d a0, a0, 8 -- --L(loop_8bit): -- ld.d t0, a0, 0 -- -- sub.d t1, t0, a2 -- andn t2, a3, t0 -- and t3, t1, t2 -- bnez t3, L(count_pos) -- -- ld.d t0, a0, 8 -- addi.d a0, a0, 16 -- sub.d t1, t0, a2 -- -- andn t2, a3, t0 -- and t3, t1, t2 -- beqz t3, L(loop_8bit) -- -- addi.d a0, a0, -8 -- b L(count_pos) -- --END(RAWMEMCHR_NAME) -- --#ifdef _LIBC --weak_alias (__rawmemchr, rawmemchr) --libc_hidden_builtin_def (__rawmemchr) --#endif --- -2.33.0 - diff --git a/glibc-2.28-Refactor-code-of-st-r-p-functions.patch b/glibc-2.28-Refactor-code-of-st-r-p-functions.patch deleted file mode 100644 index 7c453e7..0000000 --- a/glibc-2.28-Refactor-code-of-st-r-p-functions.patch +++ /dev/null @@ -1,2770 +0,0 @@ -From b720fd44df475685ea164491d76c42e127aab3ea Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Wed, 21 Jun 2023 10:49:39 +0800 -Subject: [PATCH 07/14] glibc-2.28: Refactor code of st{r,p}* functions. - -Change-Id: Ife977373e9ba071b284ee19ca4ba121bc27d5834 -Signed-off-by: ticat_fp ---- - .../loongarch/lp64/multiarch/stpcpy-aligned.S | 179 +++++++++++- - .../loongarch/lp64/multiarch/strchr-aligned.S | 91 ++++++- - .../lp64/multiarch/strchrnul-aligned.S | 94 ++++++- - .../loongarch/lp64/multiarch/strcmp-aligned.S | 225 ++++++++++++++- - .../loongarch/lp64/multiarch/strcpy-aligned.S | 173 +++++++++++- - .../loongarch/lp64/multiarch/strlen-aligned.S | 85 +++++- - .../lp64/multiarch/strncmp-aligned.S | 256 +++++++++++++++++- - .../lp64/multiarch/strnlen-aligned.S | 82 +++++- - .../lp64/multiarch/strrchr-aligned.S | 105 ++++++- - sysdeps/loongarch/lp64/stpcpy.S | 179 ------------ - sysdeps/loongarch/lp64/strchr.S | 89 ------ - sysdeps/loongarch/lp64/strchrnul.S | 94 ------- - sysdeps/loongarch/lp64/strcmp.S | 227 ---------------- - sysdeps/loongarch/lp64/strcpy.S | 173 ------------ - sysdeps/loongarch/lp64/strlen.S | 85 ------ - sysdeps/loongarch/lp64/strncmp.S | 256 ------------------ - sysdeps/loongarch/lp64/strnlen.S | 82 ------ - sysdeps/loongarch/lp64/strrchr.S | 105 ------- - 18 files changed, 1264 insertions(+), 1316 deletions(-) - delete mode 100644 sysdeps/loongarch/lp64/stpcpy.S - delete mode 100644 sysdeps/loongarch/lp64/strchr.S - delete mode 100644 sysdeps/loongarch/lp64/strchrnul.S - delete mode 100644 sysdeps/loongarch/lp64/strcmp.S - delete mode 100644 sysdeps/loongarch/lp64/strcpy.S - delete mode 100644 sysdeps/loongarch/lp64/strlen.S - delete mode 100644 sysdeps/loongarch/lp64/strncmp.S - delete mode 100644 sysdeps/loongarch/lp64/strnlen.S - delete mode 100644 sysdeps/loongarch/lp64/strrchr.S - -diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S -index 3d134e3f..7109b0f0 100644 ---- a/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S -@@ -1,8 +1,181 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN 
(libc) -- - #define STPCPY_NAME __stpcpy_aligned -- -+#else -+#define STPCPY_NAME __stpcpy - #endif - --#include "../stpcpy.S" -+LEAF(STPCPY_NAME, 6) -+ andi a3, a0, 0x7 -+ beqz a3, L(dest_align) -+ sub.d a5, a1, a3 -+ addi.d a5, a5, 8 -+ -+L(make_dest_align): -+ ld.b t0, a1, 0 -+ addi.d a1, a1, 1 -+ st.b t0, a0, 0 -+ addi.d a0, a0, 1 -+ -+ beqz t0, L(al_out) -+ bne a1, a5, L(make_dest_align) -+ -+L(dest_align): -+ andi a4, a1, 7 -+ bstrins.d a1, zero, 2, 0 -+ -+ lu12i.w t5, 0x1010 -+ ld.d t0, a1, 0 -+ ori t5, t5, 0x101 -+ bstrins.d t5, t5, 63, 32 -+ -+ slli.d t6, t5, 0x7 -+ bnez a4, L(unalign) -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ bnez t3, L(al_end) -+ -+L(al_loop): -+ st.d t0, a0, 0 -+ ld.d t0, a1, 8 -+ -+ addi.d a1, a1, 8 -+ addi.d a0, a0, 8 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ beqz t3, L(al_loop) -+ -+L(al_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest -+ -+ andi a3, t1, 8 -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+ -+L(al_end_8): -+ beqz a3, L(al_end_4) -+ st.d t0, a0, 0 -+ addi.d a0, a0, 7 -+ jr ra -+L(al_end_4): -+ beqz a4, L(al_end_2) -+ st.w t0, a0, 0 -+ addi.d a0, a0, 4 -+ srli.d t0, t0, 32 -+L(al_end_2): -+ beqz a5, L(al_end_1) -+ st.h t0, a0, 0 -+ addi.d a0, a0, 2 -+ srli.d t0, t0, 16 -+L(al_end_1): -+ beqz a6, L(al_out) -+ st.b t0, a0, 0 -+ addi.d a0, a0, 1 -+L(al_out): -+ addi.d a0, a0, -1 -+ jr ra -+ -+L(unalign): -+ slli.d a5, a4, 3 -+ li.d t1, -1 -+ sub.d a6, zero, a5 -+ -+ srl.d a7, t0, a5 -+ sll.d t7, t1, a6 -+ -+ or t0, a7, t7 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ and t3, t1, t2 -+ -+ bnez t3, L(un_end) -+ -+ ld.d t4, a1, 8 -+ addi.d a1, a1, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ bnez t3, L(un_end_with_remaining) -+ -+L(un_loop): -+ srl.d a7, t4, a5 -+ -+ ld.d t4, a1, 8 -+ addi.d a1, a1, 8 -+ -+ st.d t0, a0, 0 -+ addi.d a0, a0, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ beqz t3, L(un_loop) -+ -+L(un_end_with_remaining): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ sub.d t1, t1, a4 -+ -+ blt t1, zero, L(un_end_less_8) -+ st.d t0, a0, 0 -+ addi.d a0, a0, 8 -+ beqz t1, L(un_out) -+ srl.d t0, t4, a5 # get the remaining part -+ b L(un_end_less_8) -+ -+L(un_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ -+L(un_end_less_8): -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+L(un_end_4): -+ beqz a4, L(un_end_2) -+ st.w t0, a0, 0 -+ addi.d a0, a0, 4 -+ srli.d t0, t0, 32 -+L(un_end_2): -+ beqz a5, L(un_end_1) -+ st.h t0, a0, 0 -+ addi.d a0, a0, 2 -+ srli.d t0, t0, 16 -+L(un_end_1): -+ beqz a6, L(un_out) -+ st.b t0, a0, 0 -+ addi.d a0, a0, 1 -+L(un_out): -+ addi.d a0, a0, -1 -+ jr ra -+ -+END(STPCPY_NAME) -+ -+#ifdef _LIBC -+weak_alias (STPCPY_NAME, stpcpy) -+libc_hidden_builtin_def (STPCPY_NAME) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S -index 92365658..d9bd4587 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S -@@ -1,10 +1,95 @@ - --#if IS_IN (libc) - --#define STRCHR_NAME __strchr_aligned -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - -+#if IS_IN (libc) -+#define STRCHR_NAME __strchr_aligned -+#else -+#define STRCHR_NAME strchr - #endif - --#include "../strchr.S" -+/* char * strchr (const char 
*s1, int c); */ -+ -+LEAF(STRCHR_NAME, 6) -+ slli.d t1, a0, 3 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ ld.d t2, a0, 0 -+ -+ ori a2, a2, 0x101 -+ andi a1, a1, 0xff -+ bstrins.d a2, a2, 63, 32 -+ li.w t0, -1 -+ -+ mul.d a1, a1, a2 # "cccccccc" -+ sll.d t0, t0, t1 -+ slli.d a3, a2, 7 # 0x8080808080808080 -+ orn t2, t2, t0 -+ -+ sll.d t3, a1, t1 -+ xor t4, t2, t3 -+ sub.d a7, t2, a2 -+ andn a6, a3, t2 -+ -+ -+ sub.d a5, t4, a2 -+ andn a4, a3, t4 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ -+ or t0, a6, a5 -+ bnez t0, L(_mc8_a) -+ addi.d a0, a0, 8 -+L(_aloop): -+ ld.d t4, a0, 0 -+ -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ -+ andn a4, a3, t2 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ -+ -+ bnez a7, L(_mc8_a) -+ ld.d t4, a0, 8 -+ addi.d a0, a0, 16 -+ xor t2, t4, a1 -+ -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ andn a4, a3, t2 -+ -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ beqz a7, L(_aloop) -+ -+ addi.d a0, a0, -8 -+ -+L(_mc8_a): -+ ctz.d t0, a5 -+ ctz.d t2, a6 -+ srli.w t0, t0, 3 -+ -+ -+ srli.w t2, t2, 3 -+ sltu t1, t2, t0 -+ add.d a0, a0, t0 -+ masknez a0, a0, t1 -+ -+ jr ra -+END(STRCHR_NAME) - - weak_alias (STRCHR_NAME, index) -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S -index 4fa63ecc..f18b01a3 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S -@@ -1,8 +1,96 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define STRCHRNUL_NAME __strchrnul_aligned -- -+#else -+#define STRCHRNUL_NAME __strchrnul - #endif - --#include "../strchrnul.S" -+/* char * strchrnul (const char *s1, int c); */ -+ -+LEAF(STRCHRNUL_NAME, 6) -+ slli.d t1, a0, 3 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ ld.d t2, a0, 0 -+ -+ ori a2, a2, 0x101 -+ andi a1, a1, 0xff -+ bstrins.d a2, a2, 63, 32 -+ li.w t0, -1 -+ -+ mul.d a1, a1, a2 # "cccccccc" -+ sll.d t0, t0, t1 -+ slli.d a3, a2, 7 # 0x8080808080808080 -+ orn t2, t2, t0 -+ -+ sll.d t3, a1, t1 -+ xor t4, t2, t3 -+ sub.d a7, t2, a2 -+ andn a6, a3, t2 -+ -+ -+ sub.d a5, t4, a2 -+ andn a4, a3, t4 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ -+ or t0, a6, a5 -+ bnez t0, L(_mc8_a) -+ addi.d a0, a0, 8 -+L(_aloop): -+ ld.d t4, a0, 0 -+ -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ -+ andn a4, a3, t2 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ -+ -+ bnez a7, L(_mc8_a) -+ ld.d t4, a0, 8 -+ addi.d a0, a0, 16 -+ xor t2, t4, a1 -+ -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ andn a4, a3, t2 -+ -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ beqz a7, L(_aloop) -+ -+ addi.d a0, a0, -8 -+L(_mc8_a): -+ ctz.d t0, a5 -+ ctz.d t2, a6 -+ srli.w t0, t0, 3 -+ -+ srli.w t2, t2, 3 -+ slt t1, t0, t2 -+ masknez t3, t2, t1 -+ maskeqz t4, t0, t1 -+ -+ or t0, t3, t4 -+ add.d a0, a0, t0 -+ jr ra -+END(STRCHRNUL_NAME) -+ -+#ifdef _LIBC -+weak_alias(STRCHRNUL_NAME, strchrnul) -+libc_hidden_builtin_def (STRCHRNUL_NAME) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S -index f84f52b8..a9b74b0c 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S -@@ -1,8 +1,229 @@ -+/* 2022\06\15 loongarch64 author: chenxiaolong. 
*/ - --#if IS_IN (libc) -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - -+#if IS_IN (libc) - #define STRCMP_NAME __strcmp_aligned -+#else -+#define STRCMP_NAME strcmp -+#endif -+ -+/* int strcmp (const char *s1, const char *s2); */ -+ -+/* Parameters and Results */ -+#define src1 a0 -+#define src2 a1 -+#define result v0 -+LEAF(STRCMP_NAME, 6) -+ xor a4, src1, src2 -+ lu12i.w t5, 0x01010 -+ lu12i.w t6, 0x7f7f7 -+ andi a2, src1, 0x7 -+ -+ ori t5, t5, 0x101 -+ andi a4, a4, 0x7 -+ ori t6, t6, 0xf7f -+ bstrins.d t5, t5, 63, 32 -+ bstrins.d t6, t6, 63, 32 -+ -+ bnez a4, 3f // unaligned -+ beqz a2, 1f // loop aligned -+ -+// mutual aligned -+ bstrins.d src1, zero, 2, 0 -+ bstrins.d src2, zero, 2, 0 -+ slli.d a4, a2, 0x3 -+ ld.d t0, src1, 0 -+ -+ sub.d a4, zero, a4 -+ ld.d t1, src2, 0 -+ addi.d src1, src1, 8 -+ addi.d src2, src2, 8 -+ -+ nor a5, zero, zero -+ srl.d a5, a5, a4 -+ or t0, t0, a5 -+ -+ or t1, t1, a5 -+ b 2f //start realigned -+ -+// loop aligned -+1: -+ ld.d t0, src1, 0 -+ addi.d src1, src1, 8 -+ ld.d t1, src2, 0 -+ addi.d src2, src2, 8 -+ -+// start realigned: -+2: -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ beqz t2, 1b -+ -+ ctz.d t7, t2 -+ bstrins.d t7, zero, 2, 0 -+ srl.d t0, t0, t7 -+ srl.d t1, t1, t7 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ sub.d v0, t0, t1 -+ jr ra -+ -+// unaligned -+3: -+ andi a3, src2, 0x7 -+ slt a5, a2, a3 -+ masknez t8, a2, a5 -+ xor a6, src1, src2 -+ maskeqz a6, a6, t8 -+ xor src1, src1, a6 -+ xor src2, src2, a6 -+ -+ andi a2, src1, 0x7 -+ beqz a2, 4f // src1 is aligned -+ -+//strcmp_unaligned: -+ andi a3, src2, 0x7 -+ bstrins.d src1, zero, 2, 0 -+ bstrins.d src2, zero, 2, 0 -+ nor t3, zero, zero -+ -+ ld.d t0, src1, 0 -+ ld.d t1, src2, 0 -+ sub.d a2, a3, a2 -+ addi.d t2, zero, 8 -+ -+ sub.d a5, t2, a2 -+ sub.d a6, t2, a3 -+ slli.d a5, a5, 0x3 -+ slli.d a6, a6, 0x3 -+ -+ srl.d t4, t3, a6 -+ srl.d a4, t3, a5 -+ rotr.d a7, t0, a5 -+ -+ addi.d src2, src2, 8 -+ addi.d src1, src1, 8 -+ or t1, t1, t4 -+ or t0, a7, t4 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ bnez t2, 7f -+ -+ and a7, a7, a4 -+ slli.d a6, a2, 0x3 -+ nor a4, zero, a4 -+ b 5f -+ -+// src1 is aligned -+4: -+ andi a3, src2, 0x7 -+ ld.d t0, src1, 0 -+ -+ bstrins.d src2, zero, 2, 0 -+ nor t2, zero, zero -+ ld.d t1, src2, 0 -+ -+ addi.d t3, zero, 0x8 -+ sub.d a5, t3, a3 -+ slli.d a5, a5, 0x3 -+ srl.d a4, t2, a5 -+ rotr.d t4, t0, a5 -+ -+ addi.d src2, src2, 8 -+ addi.d src1, src1, 8 -+ or t1, t1, a4 -+ or t0, t4, a4 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ -+ bnez t2, 7f -+ -+ and a7, t4, a4 -+ slli.d a6, a3, 0x3 -+ nor a4, zero, a4 -+ -+// unaligned loop -+// a7: remaining number -+// a6: shift left number -+// a5: shift right number -+// a4: mask for checking remaining number -+5: -+ or t0, a7, a4 -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ bnez t2, 6f -+ -+ ld.d t0, src1, 0 -+ addi.d src1, src1, 8 -+ ld.d t1, src2, 0 -+ addi.d src2, src2, 8 -+ -+ srl.d t7, t0, a5 -+ sll.d t0, t0, a6 -+ or t0, a7, t0 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ bnez t2, 7f -+ -+ or a7, t7, zero -+ b 5b -+ -+6: -+ ld.bu t1, src2, 0 -+ andi t0, a7, 0xff -+ xor t2, t0, t1 -+ srli.d a7, a7, 0x8 -+ masknez t2, t0, t2 -+ addi.d src2, src2, 1 -+ beqz t2, 8f -+ b 6b -+ -+7: -+ ctz.d t7, t2 -+ bstrins.d t7, zero, 2, 0 -+ srl.d t0, t0, t7 -+ srl.d t1, 
t1, t7 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ -+8: -+ sub.d a4, t0, t1 -+ sub.d a5, t1, t0 -+ maskeqz a6, a5, t8 -+ masknez result, a4, t8 -+ or result, result, a6 -+ jr ra -+ -+END(STRCMP_NAME) - -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCMP_NAME) - #endif - --#include "../strcmp.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S -index 4860398b..80954912 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S -@@ -1,8 +1,175 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define STRCPY __strcpy_aligned -- -+#else -+#define STRCPY strcpy - #endif - --#include "../strcpy.S" -+LEAF(STRCPY, 6) -+ andi a3, a0, 0x7 -+ move a2, a0 -+ beqz a3, L(dest_align) -+ sub.d a5, a1, a3 -+ addi.d a5, a5, 8 -+ -+L(make_dest_align): -+ ld.b t0, a1, 0 -+ addi.d a1, a1, 1 -+ st.b t0, a2, 0 -+ beqz t0, L(al_out) -+ -+ addi.d a2, a2, 1 -+ bne a1, a5, L(make_dest_align) -+ -+L(dest_align): -+ andi a4, a1, 7 -+ bstrins.d a1, zero, 2, 0 -+ -+ lu12i.w t5, 0x1010 -+ ld.d t0, a1, 0 -+ ori t5, t5, 0x101 -+ bstrins.d t5, t5, 63, 32 -+ -+ slli.d t6, t5, 0x7 -+ bnez a4, L(unalign) -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ bnez t3, L(al_end) -+ -+L(al_loop): -+ st.d t0, a2, 0 -+ ld.d t0, a1, 8 -+ -+ addi.d a1, a1, 8 -+ addi.d a2, a2, 8 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ beqz t3, L(al_loop) -+ -+L(al_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest -+ -+ andi a3, t1, 8 -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+ -+L(al_end_8): -+ beqz a3, L(al_end_4) -+ st.d t0, a2, 0 -+ jr ra -+L(al_end_4): -+ beqz a4, L(al_end_2) -+ st.w t0, a2, 0 -+ addi.d a2, a2, 4 -+ srli.d t0, t0, 32 -+L(al_end_2): -+ beqz a5, L(al_end_1) -+ st.h t0, a2, 0 -+ addi.d a2, a2, 2 -+ srli.d t0, t0, 16 -+L(al_end_1): -+ beqz a6, L(al_out) -+ st.b t0, a2, 0 -+L(al_out): -+ jr ra -+ -+L(unalign): -+ slli.d a5, a4, 3 -+ li.d t1, -1 -+ sub.d a6, zero, a5 -+ -+ srl.d a7, t0, a5 -+ sll.d t7, t1, a6 -+ -+ or t0, a7, t7 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ and t3, t1, t2 -+ -+ bnez t3, L(un_end) -+ -+ ld.d t4, a1, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ bnez t3, L(un_end_with_remaining) -+ -+L(un_loop): -+ srl.d a7, t4, a5 -+ -+ ld.d t4, a1, 16 -+ addi.d a1, a1, 8 -+ -+ st.d t0, a2, 0 -+ addi.d a2, a2, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ beqz t3, L(un_loop) -+ -+L(un_end_with_remaining): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ sub.d t1, t1, a4 -+ -+ blt t1, zero, L(un_end_less_8) -+ st.d t0, a2, 0 -+ addi.d a2, a2, 8 -+ beqz t1, L(un_out) -+ srl.d t0, t4, a5 # get the remaining part -+ b L(un_end_less_8) -+ -+L(un_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ -+L(un_end_less_8): -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+L(un_end_4): -+ beqz a4, L(un_end_2) -+ st.w t0, a2, 0 -+ addi.d a2, a2, 4 -+ srli.d t0, t0, 32 -+L(un_end_2): -+ beqz a5, L(un_end_1) -+ st.h t0, a2, 0 -+ addi.d a2, a2, 2 -+ srli.d t0, t0, 16 -+L(un_end_1): -+ beqz a6, L(un_out) -+ st.b t0, a2, 0 -+L(un_out): -+ jr ra -+ -+END(STRCPY) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCPY) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S 
b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S -index d31875fd..fcbc4f6a 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S -@@ -1,8 +1,87 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define STRLEN __strlen_aligned -- -+#else -+#define STRLEN strlen - #endif - --#include "../strlen.S" -+LEAF(STRLEN, 6) -+ move a1, a0 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ li.w t0, -1 -+ -+ ld.d t2, a0, 0 -+ andi t1, a1, 0x7 -+ ori a2, a2, 0x101 -+ slli.d t1, t1, 3 -+ -+ bstrins.d a2, a2, 63, 32 -+ sll.d t1, t0, t1 -+ slli.d t3, a2, 7 -+ nor a3, zero, t3 -+ -+ orn t2, t2, t1 -+ sub.d t0, t2, a2 -+ nor t1, t2, a3 -+ and t0, t0, t1 -+ -+ -+ bnez t0, L(count_pos) -+ addi.d a0, a0, 8 -+L(loop_16_7bit): -+ ld.d t2, a0, 0 -+ sub.d t1, t2, a2 -+ -+ and t0, t1, t3 -+ bnez t0, L(more_check) -+ ld.d t2, a0, 8 -+ addi.d a0, a0, 16 -+ -+ sub.d t1, t2, a2 -+ and t0, t1, t3 -+ beqz t0, L(loop_16_7bit) -+ addi.d a0, a0, -8 -+L(more_check): -+ nor t0, t2, a3 -+ -+ and t0, t1, t0 -+ bnez t0, L(count_pos) -+ addi.d a0, a0, 8 -+L(loop_16_8bit): -+ ld.d t2, a0, 0 -+ -+ sub.d t1, t2, a2 -+ nor t0, t2, a3 -+ and t0, t0, t1 -+ bnez t0, L(count_pos) -+ -+ ld.d t2, a0, 8 -+ addi.d a0, a0, 16 -+ sub.d t1, t2, a2 -+ nor t0, t2, a3 -+ -+ and t0, t0, t1 -+ beqz t0, L(loop_16_8bit) -+ addi.d a0, a0, -8 -+L(count_pos): -+ ctz.d t1, t0 -+ sub.d a0, a0, a1 -+ -+ srli.d t1, t1, 3 -+ add.d a0, a0, t1 -+ jr ra -+ -+END(STRLEN) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRLEN) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S -index f371b19e..2cd56c44 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S -@@ -1,8 +1,258 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define STRNCMP __strncmp_aligned -- -+#else -+#define STRNCMP strncmp - #endif - --#include "../strncmp.S" -+/* int strncmp (const char *s1, const char *s2); */ -+ -+LEAF(STRNCMP, 6) -+ beqz a2, L(ret0) -+ xor a4, a0, a1 -+ lu12i.w t5, 0x01010 -+ lu12i.w t6, 0x7f7f7 -+ -+ andi a3, a0, 0x7 -+ ori t5, t5, 0x101 -+ andi a4, a4, 0x7 -+ ori t6, t6, 0xf7f -+ -+ bstrins.d t5, t5, 63, 32 -+ bstrins.d t6, t6, 63, 32 -+ -+ bnez a4, L(unalign) -+ bnez a3, L(mutual_align) -+ -+L(a_loop): -+ ld.d t0, a0, 0 -+ ld.d t1, a1, 0 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ -+ -+ sltui t7, a2, 9 -+ -+L(start_realign): -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ xor t4, t0, t1 -+ -+ and t2, t2, t3 -+ addi.d a2, a2, -8 -+ -+ or t2, t2, t4 -+ or t3, t2, t7 -+ beqz t3, L(a_loop) -+ -+L(end): -+ bge zero, t7, L(out) -+ andi t4, a2, 7 -+ li.d t3, -1 -+ addi.d t4, t4, -1 -+ slli.d t4, t4, 3 -+ sll.d t3, t3, t4 -+ or t2, t2, t3 -+ -+ -+L(out): -+ ctz.d t3, t2 -+ bstrins.d t3, zero, 2, 0 -+ srl.d t0, t0, t3 -+ srl.d t1, t1, t3 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ sub.d a0, t0, t1 -+ jr ra -+ -+L(mutual_align): -+ bstrins.d a0, zero, 2, 0 -+ bstrins.d a1, zero, 2, 0 -+ slli.d a5, a3, 0x3 -+ li.d t2, -1 -+ -+ ld.d t0, a0, 0 -+ ld.d t1, a1, 0 -+ -+ li.d t3, 9 -+ sll.d t2, t2, a5 -+ -+ sub.d t3, t3, a3 -+ addi.d a0, a0, 8 -+ -+ sltu t7, a2, t3 -+ addi.d a1, a1, 8 -+ -+ add.d a2, a2, a3 -+ orn t0, t0, t2 -+ orn t1, t1, t2 -+ b L(start_realign) -+ -+L(ret0): -+ move a0, zero -+ jr ra -+ -+L(unalign): -+ li.d t8, 8 -+ blt a2, t8, 
L(short_cmp) -+ -+ # swap a0 and a1 in case a3 > a4 -+ andi a4, a1, 0x7 -+ sltu t8, a4, a3 -+ xor a6, a0, a1 -+ maskeqz a6, a6, t8 -+ xor a0, a0, a6 -+ xor a1, a1, a6 -+ -+ andi a3, a0, 0x7 -+ andi a4, a1, 0x7 -+ -+ bstrins.d a0, zero, 2, 0 -+ bstrins.d a1, zero, 2, 0 -+ -+ li.d t2, -1 -+ li.d t3, 9 -+ -+ ld.d t0, a0, 0 -+ ld.d t1, a1, 0 -+ -+ sub.d t3, t3, a4 -+ sub.d a3, a4, a3 -+ -+ slli.d t4, a4, 3 -+ slli.d a6, a3, 3 -+ -+ sub.d a5, zero, a6 -+ sltu t7, a2, t3 -+ -+ rotr.d a7, t0, a5 -+ sll.d t4, t2, t4 # mask for first num -+ -+ add.d a2, a2, a4 -+ sll.d a4, t2, a6 # mask for a7 -+ -+ orn t0, a7, t4 -+ orn t1, t1, t4 -+ -+ sub.d t2, t0, t5 -+ nor t4, t0, t6 -+ and t2, t2, t4 -+ -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ -+ or t3, t2, t7 -+ bnez t3, L(un_end) -+ -+ andn a7, a7, a4 -+ addi.d a3, a3, 1 -+ -+L(un_loop): -+ addi.d a2, a2, -8 -+ # in case remaining part has '\0', no more load instructions should be executed on a0 address -+ or t0, a7, a4 -+ sltu t7, a2, a3 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ -+ or t3, t2, t7 -+ bnez t3, L(check_remaining) -+ -+ ld.d t7, a0, 8 -+ ld.d t1, a1, 8 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ -+ sll.d t4, t7, a6 -+ sub.d t2, t1, t5 -+ nor t3, t1, t6 -+ -+ or t0, t4, a7 -+ srl.d a7, t7, a5 -+ -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ -+ sltui t7, a2, 9 -+ or t2, t2, t3 -+ -+ or t3, t2, t7 -+ beqz t3, L(un_loop) -+ b L(un_end) -+ -+L(check_remaining): -+ ld.d t1, a1, 8 -+ xor t3, t1, a7 -+ or t2, t2, t3 -+ -+L(un_end): -+ bge zero, t7, L(un_out) -+ andi t4, a2, 7 -+ li.d t3, -1 -+ -+ addi.d t4, t4, -1 -+ slli.d t4, t4, 3 -+ sll.d t3, t3, t4 -+ or t2, t2, t3 -+ -+L(un_out): -+ ctz.d t3, t2 -+ bstrins.d t3, zero, 2, 0 -+ srl.d t0, t0, t3 -+ srl.d t1, t1, t3 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ -+ sub.d a4, t0, t1 -+ sub.d a5, t1, t0 -+ -+ maskeqz a6, a5, t8 -+ masknez a0, a4, t8 -+ -+ or a0, a0, a6 -+ jr ra -+ -+L(short_cmp): -+ ld.bu t0, a0, 0 -+ ld.bu t1, a1, 0 -+ addi.d a2, a2, -1 -+ -+ xor t2, t0, t1 -+ masknez t2, t0, t2 -+ maskeqz t2, a2, t2 -+ -+ beqz t2, L(short_out) -+ -+ ld.bu t0, a0, 1 -+ ld.bu t1, a1, 1 -+ -+ addi.d a2, a2, -1 -+ addi.d a0, a0, 2 -+ -+ addi.d a1, a1, 2 -+ xor t2, t0, t1 -+ masknez t2, t0, t2 -+ maskeqz t2, a2, t2 -+ -+ bnez t2, L(short_cmp) -+ -+L(short_out): -+ sub.d a0, t0, t1 -+ jr ra -+ -+END(STRNCMP) -+#ifdef _LIBC -+libc_hidden_builtin_def (STRNCMP) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S -index 503442b3..78c8fd5d 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S -@@ -1,8 +1,84 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define STRNLEN __strnlen_aligned -- -+#else -+#define STRNLEN __strnlen - #endif - --#include "../strnlen.S" -+#. before every load, a1(t5) must > 0; -+#. first load with t1 != 0, need to adjust t5; -+#. 
return the less one of both strlen(s) and a1; -+ -+LEAF(STRNLEN, 6) -+ beqz a1, L(out) -+ lu12i.w a2, 0x01010 -+ andi t1, a0, 0x7 -+ move t4, a0 -+ -+ bstrins.d a0, zero, 2, 0 -+ ori a2, a2, 0x101 -+ li.w t0, -1 -+ ld.d t2, a0, 0 -+ -+ slli.d t3, t1, 3 -+ bstrins.d a2, a2, 63, 32 -+ li.w t5, 8 -+ slli.d a3, a2, 7 -+ -+ sub.w t1, t5, t1 -+ sll.d t0, t0, t3 -+ nor a3, zero, a3 -+ orn t2, t2, t0 -+ -+ -+ sub.d t0, t2, a2 -+ nor t3, t2, a3 -+ and t0, t0, t3 -+ bnez t0, L(count_pos) -+ -+ sub.d t5, a1, t1 -+ bgeu t1, a1, L(out) -+L(loop_8bytes): -+ ld.d t2, a0, 8 -+ addi.d a0, a0, 8 -+ -+ sub.d t0, t2, a2 -+ nor t1, t2, a3 -+ sltui t6, t5, 9 -+ and t0, t0, t1 -+ -+ addi.d t5, t5, -8 -+ or t7, t0, t6 -+ beqz t7, L(loop_8bytes) -+L(count_pos): -+ ctz.d t1, t0 -+ -+ -+ sub.d a0, a0, t4 -+ srli.d t1, t1, 3 -+ add.d a0, t1, a0 -+ sltu t0, a0, a1 -+ -+ masknez t1, a1, t0 -+ maskeqz a0, a0, t0 -+ or a0, a0, t1 -+ jr ra -+ -+L(out): -+ move a0, a1 -+ jr ra -+ -+END(STRNLEN) -+ -+#ifdef _LIBC -+weak_alias (STRNLEN, strnlen) -+libc_hidden_builtin_def (STRNLEN) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S -index a58ddde8..6931045b 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S -@@ -1,11 +1,110 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif - - #if IS_IN (libc) -- - #define STRRCHR_NAME __strrchr_aligned -- -+#else -+#define STRRCHR_NAME strrchr - #endif - --#include "../strrchr.S" -+LEAF(STRRCHR_NAME, 6) -+ slli.d t1, a0, 3 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ ld.d t2, a0, 0 // t2 = "5ZZ21abc" -+ -+ ori a2, a2, 0x101 -+ andi a1, a1, 0xff // a1 = "0000000Z" -+ li.d a5, -1 -+ bstrins.d a2, a2, 63, 32 // a2 = 0x0101010101010101 -+ -+ sll.d t1, a5, t1 // t1 = 0xffffffffff000000 -+ mul.d a1, a1, a2 // a1 = "ZZZZZZZZ" -+ orn t2, t2, t1 // t2 = "5ZZ21YYY" -+ slli.d a3, a2, 7 // a3 = 0x8080808080808080 -+ -+ sub.d a4, t2, a2 -+ andn t0, a3, t2 -+ move t3, zero -+ and t0, a4, t0 -+ -+ -+ xor a4, t2, a1 -+ move t5, zero -+ orn a4, a4, t1 -+ bnez t0, L(found_end) -+ -+ sub.d t1, a4, a2 -+ andn t0, a3, a4 -+ and t1, t1, t0 -+ -+L(loop_8bytes): -+ masknez t4, t3, t1 -+ -+ maskeqz t3, t2, t1 -+ ld.d t2, a0, 8 -+ masknez t0, t5, t1 -+ maskeqz t5, a0, t1 -+ -+ or t3, t3, t4 -+ or t5, t0, t5 -+ sub.d t0, t2, a2 -+ andn t1, a3, t2 -+ -+ -+ xor a4, t2, a1 -+ and t0, t0, t1 //t0 hold diff pattern for '\0' -+ sub.d t1, a4, a2 -+ andn t4, a3, a4 -+ -+ and t1, t1, t4 //t1 hold diff pattern for 'a1' -+ addi.d a0, a0, 8 -+ beqz t0, L(loop_8bytes) //ok, neither \0 nor found -+L(found_end): -+ ctz.d t1, t0 -+ -+ xor t3, t3, a1 -+ orn t1, zero, t1 -+ revb.d t3, t3 -+ srl.d t1, a5, t1 // mask for '\0' -+ -+ sub.d t4, t3, a2 -+ orn a4, a4, t1 -+ andn t3, a3, t3 -+ revb.d t2, a4 -+ -+ sub.d t0, t2, a2 -+ andn t1, a3, t2 -+ and t3, t3, t4 -+ and t1, t0, t1 -+ -+ li.d t7, 7 -+ masknez t4, t3, t1 -+ maskeqz t3, t1, t1 -+ masknez t5, t5, t1 -+ -+ or t3, t3, t4 -+ maskeqz t6, a0, t1 -+ ctz.d t0, t3 -+ or t5, t6, t5 -+ -+ srli.d t0, t0, 3 -+ sub.d t0, t7, t0 -+ add.d a0, t5, t0 -+ maskeqz a0, a0, t3 -+ -+ jr ra -+END(STRRCHR_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def(STRRCHR_NAME) -+#endif - - #undef rindex - weak_alias(STRRCHR_NAME, rindex) -diff --git a/sysdeps/loongarch/lp64/stpcpy.S b/sysdeps/loongarch/lp64/stpcpy.S -deleted file mode 100644 -index b6a367dc..00000000 ---- a/sysdeps/loongarch/lp64/stpcpy.S -+++ 
/dev/null -@@ -1,179 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STPCPY_NAME --#define STPCPY_NAME __stpcpy --#endif -- --LEAF(STPCPY_NAME, 6) -- andi a3, a0, 0x7 -- beqz a3, L(dest_align) -- sub.d a5, a1, a3 -- addi.d a5, a5, 8 -- --L(make_dest_align): -- ld.b t0, a1, 0 -- addi.d a1, a1, 1 -- st.b t0, a0, 0 -- addi.d a0, a0, 1 -- -- beqz t0, L(al_out) -- bne a1, a5, L(make_dest_align) -- --L(dest_align): -- andi a4, a1, 7 -- bstrins.d a1, zero, 2, 0 -- -- lu12i.w t5, 0x1010 -- ld.d t0, a1, 0 -- ori t5, t5, 0x101 -- bstrins.d t5, t5, 63, 32 -- -- slli.d t6, t5, 0x7 -- bnez a4, L(unalign) -- sub.d t1, t0, t5 -- andn t2, t6, t0 -- -- and t3, t1, t2 -- bnez t3, L(al_end) -- --L(al_loop): -- st.d t0, a0, 0 -- ld.d t0, a1, 8 -- -- addi.d a1, a1, 8 -- addi.d a0, a0, 8 -- sub.d t1, t0, t5 -- andn t2, t6, t0 -- -- and t3, t1, t2 -- beqz t3, L(al_loop) -- --L(al_end): -- ctz.d t1, t3 -- srli.d t1, t1, 3 -- addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest -- -- andi a3, t1, 8 -- andi a4, t1, 4 -- andi a5, t1, 2 -- andi a6, t1, 1 -- --L(al_end_8): -- beqz a3, L(al_end_4) -- st.d t0, a0, 0 -- addi.d a0, a0, 7 -- jr ra --L(al_end_4): -- beqz a4, L(al_end_2) -- st.w t0, a0, 0 -- addi.d a0, a0, 4 -- srli.d t0, t0, 32 --L(al_end_2): -- beqz a5, L(al_end_1) -- st.h t0, a0, 0 -- addi.d a0, a0, 2 -- srli.d t0, t0, 16 --L(al_end_1): -- beqz a6, L(al_out) -- st.b t0, a0, 0 -- addi.d a0, a0, 1 --L(al_out): -- addi.d a0, a0, -1 -- jr ra -- --L(unalign): -- slli.d a5, a4, 3 -- li.d t1, -1 -- sub.d a6, zero, a5 -- -- srl.d a7, t0, a5 -- sll.d t7, t1, a6 -- -- or t0, a7, t7 -- sub.d t1, t0, t5 -- andn t2, t6, t0 -- and t3, t1, t2 -- -- bnez t3, L(un_end) -- -- ld.d t4, a1, 8 -- addi.d a1, a1, 8 -- -- sub.d t1, t4, t5 -- andn t2, t6, t4 -- sll.d t0, t4, a6 -- and t3, t1, t2 -- -- or t0, t0, a7 -- bnez t3, L(un_end_with_remaining) -- --L(un_loop): -- srl.d a7, t4, a5 -- -- ld.d t4, a1, 8 -- addi.d a1, a1, 8 -- -- st.d t0, a0, 0 -- addi.d a0, a0, 8 -- -- sub.d t1, t4, t5 -- andn t2, t6, t4 -- sll.d t0, t4, a6 -- and t3, t1, t2 -- -- or t0, t0, a7 -- beqz t3, L(un_loop) -- --L(un_end_with_remaining): -- ctz.d t1, t3 -- srli.d t1, t1, 3 -- addi.d t1, t1, 1 -- sub.d t1, t1, a4 -- -- blt t1, zero, L(un_end_less_8) -- st.d t0, a0, 0 -- addi.d a0, a0, 8 -- beqz t1, L(un_out) -- srl.d t0, t4, a5 # get the remaining part -- b L(un_end_less_8) -- --L(un_end): -- ctz.d t1, t3 -- srli.d t1, t1, 3 -- addi.d t1, t1, 1 -- --L(un_end_less_8): -- andi a4, t1, 4 -- andi a5, t1, 2 -- andi a6, t1, 1 --L(un_end_4): -- beqz a4, L(un_end_2) -- st.w t0, a0, 0 -- addi.d a0, a0, 4 -- srli.d t0, t0, 32 --L(un_end_2): -- beqz a5, L(un_end_1) -- st.h t0, a0, 0 -- addi.d a0, a0, 2 -- srli.d t0, t0, 16 --L(un_end_1): -- beqz a6, L(un_out) -- st.b t0, a0, 0 -- addi.d a0, a0, 1 --L(un_out): -- addi.d a0, a0, -1 -- jr ra -- --END(STPCPY_NAME) -- --#ifdef _LIBC --weak_alias (STPCPY_NAME, stpcpy) --libc_hidden_builtin_def (STPCPY_NAME) --#endif -diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S -deleted file mode 100644 -index fde53a30..00000000 ---- a/sysdeps/loongarch/lp64/strchr.S -+++ /dev/null -@@ -1,89 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRCHR_NAME --#define STRCHR_NAME strchr --#endif -- --/* char * strchr (const char *s1, int c); */ -- --LEAF(STRCHR_NAME, 6) -- slli.d t1, a0, 3 -- bstrins.d a0, zero, 2, 0 -- lu12i.w a2, 0x01010 -- ld.d t2, a0, 0 -- -- ori a2, a2, 
0x101 -- andi a1, a1, 0xff -- bstrins.d a2, a2, 63, 32 -- li.w t0, -1 -- -- mul.d a1, a1, a2 # "cccccccc" -- sll.d t0, t0, t1 -- slli.d a3, a2, 7 # 0x8080808080808080 -- orn t2, t2, t0 -- -- sll.d t3, a1, t1 -- xor t4, t2, t3 -- sub.d a7, t2, a2 -- andn a6, a3, t2 -- -- -- sub.d a5, t4, a2 -- andn a4, a3, t4 -- and a6, a7, a6 -- and a5, a5, a4 -- -- or t0, a6, a5 -- bnez t0, L(_mc8_a) -- addi.d a0, a0, 8 --L(_aloop): -- ld.d t4, a0, 0 -- -- xor t2, t4, a1 -- sub.d a7, t4, a2 -- andn a6, a3, t4 -- sub.d a5, t2, a2 -- -- andn a4, a3, t2 -- and a6, a7, a6 -- and a5, a5, a4 -- or a7, a6, a5 -- -- -- bnez a7, L(_mc8_a) -- ld.d t4, a0, 8 -- addi.d a0, a0, 16 -- xor t2, t4, a1 -- -- sub.d a7, t4, a2 -- andn a6, a3, t4 -- sub.d a5, t2, a2 -- andn a4, a3, t2 -- -- and a6, a7, a6 -- and a5, a5, a4 -- or a7, a6, a5 -- beqz a7, L(_aloop) -- -- addi.d a0, a0, -8 -- --L(_mc8_a): -- ctz.d t0, a5 -- ctz.d t2, a6 -- srli.w t0, t0, 3 -- -- -- srli.w t2, t2, 3 -- sltu t1, t2, t0 -- add.d a0, a0, t0 -- masknez a0, a0, t1 -- -- jr ra --END(STRCHR_NAME) -diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S -deleted file mode 100644 -index a5ee09a3..00000000 ---- a/sysdeps/loongarch/lp64/strchrnul.S -+++ /dev/null -@@ -1,94 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRCHRNUL_NAME --#define STRCHRNUL_NAME __strchrnul --#endif -- --/* char * strchrnul (const char *s1, int c); */ -- --LEAF(STRCHRNUL_NAME, 6) -- slli.d t1, a0, 3 -- bstrins.d a0, zero, 2, 0 -- lu12i.w a2, 0x01010 -- ld.d t2, a0, 0 -- -- ori a2, a2, 0x101 -- andi a1, a1, 0xff -- bstrins.d a2, a2, 63, 32 -- li.w t0, -1 -- -- mul.d a1, a1, a2 # "cccccccc" -- sll.d t0, t0, t1 -- slli.d a3, a2, 7 # 0x8080808080808080 -- orn t2, t2, t0 -- -- sll.d t3, a1, t1 -- xor t4, t2, t3 -- sub.d a7, t2, a2 -- andn a6, a3, t2 -- -- -- sub.d a5, t4, a2 -- andn a4, a3, t4 -- and a6, a7, a6 -- and a5, a5, a4 -- -- or t0, a6, a5 -- bnez t0, L(_mc8_a) -- addi.d a0, a0, 8 --L(_aloop): -- ld.d t4, a0, 0 -- -- xor t2, t4, a1 -- sub.d a7, t4, a2 -- andn a6, a3, t4 -- sub.d a5, t2, a2 -- -- andn a4, a3, t2 -- and a6, a7, a6 -- and a5, a5, a4 -- or a7, a6, a5 -- -- -- bnez a7, L(_mc8_a) -- ld.d t4, a0, 8 -- addi.d a0, a0, 16 -- xor t2, t4, a1 -- -- sub.d a7, t4, a2 -- andn a6, a3, t4 -- sub.d a5, t2, a2 -- andn a4, a3, t2 -- -- and a6, a7, a6 -- and a5, a5, a4 -- or a7, a6, a5 -- beqz a7, L(_aloop) -- -- addi.d a0, a0, -8 --L(_mc8_a): -- ctz.d t0, a5 -- ctz.d t2, a6 -- srli.w t0, t0, 3 -- -- srli.w t2, t2, 3 -- slt t1, t0, t2 -- masknez t3, t2, t1 -- maskeqz t4, t0, t1 -- -- or t0, t3, t4 -- add.d a0, a0, t0 -- jr ra --END(STRCHRNUL_NAME) -- --#ifdef _LIBC --weak_alias(STRCHRNUL_NAME, strchrnul) --libc_hidden_builtin_def (STRCHRNUL_NAME) --#endif -diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S -deleted file mode 100644 -index 3a863992..00000000 ---- a/sysdeps/loongarch/lp64/strcmp.S -+++ /dev/null -@@ -1,227 +0,0 @@ --/* 2022\06\15 loongarch64 author: chenxiaolong. 
*/ -- --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRCMP_NAME --#define STRCMP_NAME strcmp --#endif -- --/* int strcmp (const char *s1, const char *s2); */ -- --/* Parameters and Results */ --#define src1 a0 --#define src2 a1 --#define result v0 --LEAF(STRCMP_NAME, 6) -- xor a4, src1, src2 -- lu12i.w t5, 0x01010 -- lu12i.w t6, 0x7f7f7 -- andi a2, src1, 0x7 -- -- ori t5, t5, 0x101 -- andi a4, a4, 0x7 -- ori t6, t6, 0xf7f -- bstrins.d t5, t5, 63, 32 -- bstrins.d t6, t6, 63, 32 -- -- bnez a4, 3f // unaligned -- beqz a2, 1f // loop aligned -- --// mutual aligned -- bstrins.d src1, zero, 2, 0 -- bstrins.d src2, zero, 2, 0 -- slli.d a4, a2, 0x3 -- ld.d t0, src1, 0 -- -- sub.d a4, zero, a4 -- ld.d t1, src2, 0 -- addi.d src1, src1, 8 -- addi.d src2, src2, 8 -- -- nor a5, zero, zero -- srl.d a5, a5, a4 -- or t0, t0, a5 -- -- or t1, t1, a5 -- b 2f //start realigned -- --// loop aligned --1: -- ld.d t0, src1, 0 -- addi.d src1, src1, 8 -- ld.d t1, src2, 0 -- addi.d src2, src2, 8 -- --// start realigned: --2: -- sub.d t2, t0, t5 -- nor t3, t0, t6 -- and t2, t2, t3 -- -- xor t3, t0, t1 -- or t2, t2, t3 -- beqz t2, 1b -- -- ctz.d t7, t2 -- bstrins.d t7, zero, 2, 0 -- srl.d t0, t0, t7 -- srl.d t1, t1, t7 -- -- andi t0, t0, 0xff -- andi t1, t1, 0xff -- sub.d v0, t0, t1 -- jr ra -- --// unaligned --3: -- andi a3, src2, 0x7 -- slt a5, a2, a3 -- masknez t8, a2, a5 -- xor a6, src1, src2 -- maskeqz a6, a6, t8 -- xor src1, src1, a6 -- xor src2, src2, a6 -- -- andi a2, src1, 0x7 -- beqz a2, 4f // src1 is aligned -- --//strcmp_unaligned: -- andi a3, src2, 0x7 -- bstrins.d src1, zero, 2, 0 -- bstrins.d src2, zero, 2, 0 -- nor t3, zero, zero -- -- ld.d t0, src1, 0 -- ld.d t1, src2, 0 -- sub.d a2, a3, a2 -- addi.d t2, zero, 8 -- -- sub.d a5, t2, a2 -- sub.d a6, t2, a3 -- slli.d a5, a5, 0x3 -- slli.d a6, a6, 0x3 -- -- srl.d t4, t3, a6 -- srl.d a4, t3, a5 -- rotr.d a7, t0, a5 -- -- addi.d src2, src2, 8 -- addi.d src1, src1, 8 -- or t1, t1, t4 -- or t0, a7, t4 -- -- sub.d t2, t0, t5 -- nor t3, t0, t6 -- and t2, t2, t3 -- xor t3, t0, t1 -- or t2, t2, t3 -- bnez t2, 7f -- -- and a7, a7, a4 -- slli.d a6, a2, 0x3 -- nor a4, zero, a4 -- b 5f -- --// src1 is aligned --4: -- andi a3, src2, 0x7 -- ld.d t0, src1, 0 -- -- bstrins.d src2, zero, 2, 0 -- nor t2, zero, zero -- ld.d t1, src2, 0 -- -- addi.d t3, zero, 0x8 -- sub.d a5, t3, a3 -- slli.d a5, a5, 0x3 -- srl.d a4, t2, a5 -- rotr.d t4, t0, a5 -- -- addi.d src2, src2, 8 -- addi.d src1, src1, 8 -- or t1, t1, a4 -- or t0, t4, a4 -- -- sub.d t2, t0, t5 -- nor t3, t0, t6 -- and t2, t2, t3 -- xor t3, t0, t1 -- or t2, t2, t3 -- -- bnez t2, 7f -- -- and a7, t4, a4 -- slli.d a6, a3, 0x3 -- nor a4, zero, a4 -- --// unaligned loop --// a7: remaining number --// a6: shift left number --// a5: shift right number --// a4: mask for checking remaining number --5: -- or t0, a7, a4 -- sub.d t2, t0, t5 -- nor t3, t0, t6 -- and t2, t2, t3 -- bnez t2, 6f -- -- ld.d t0, src1, 0 -- addi.d src1, src1, 8 -- ld.d t1, src2, 0 -- addi.d src2, src2, 8 -- -- srl.d t7, t0, a5 -- sll.d t0, t0, a6 -- or t0, a7, t0 -- -- sub.d t2, t0, t5 -- nor t3, t0, t6 -- and t2, t2, t3 -- xor t3, t0, t1 -- or t2, t2, t3 -- bnez t2, 7f -- -- or a7, t7, zero -- b 5b -- --6: -- ld.bu t1, src2, 0 -- andi t0, a7, 0xff -- xor t2, t0, t1 -- srli.d a7, a7, 0x8 -- masknez t2, t0, t2 -- addi.d src2, src2, 1 -- beqz t2, 8f -- b 6b -- --7: -- ctz.d t7, t2 -- bstrins.d t7, zero, 2, 0 -- srl.d t0, t0, t7 -- srl.d t1, t1, t7 -- -- andi t0, t0, 0xff -- andi t1, t1, 0xff -- --8: -- 
sub.d a4, t0, t1 -- sub.d a5, t1, t0 -- maskeqz a6, a5, t8 -- masknez result, a4, t8 -- or result, result, a6 -- jr ra -- --END(STRCMP_NAME) -- --#ifdef _LIBC --libc_hidden_builtin_def (STRCMP_NAME) --#endif -- -diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S -deleted file mode 100644 -index 08505192..00000000 ---- a/sysdeps/loongarch/lp64/strcpy.S -+++ /dev/null -@@ -1,173 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRCPY --#define STRCPY strcpy --#endif -- --LEAF(STRCPY, 6) -- andi a3, a0, 0x7 -- move a2, a0 -- beqz a3, L(dest_align) -- sub.d a5, a1, a3 -- addi.d a5, a5, 8 -- --L(make_dest_align): -- ld.b t0, a1, 0 -- addi.d a1, a1, 1 -- st.b t0, a2, 0 -- beqz t0, L(al_out) -- -- addi.d a2, a2, 1 -- bne a1, a5, L(make_dest_align) -- --L(dest_align): -- andi a4, a1, 7 -- bstrins.d a1, zero, 2, 0 -- -- lu12i.w t5, 0x1010 -- ld.d t0, a1, 0 -- ori t5, t5, 0x101 -- bstrins.d t5, t5, 63, 32 -- -- slli.d t6, t5, 0x7 -- bnez a4, L(unalign) -- sub.d t1, t0, t5 -- andn t2, t6, t0 -- -- and t3, t1, t2 -- bnez t3, L(al_end) -- --L(al_loop): -- st.d t0, a2, 0 -- ld.d t0, a1, 8 -- -- addi.d a1, a1, 8 -- addi.d a2, a2, 8 -- sub.d t1, t0, t5 -- andn t2, t6, t0 -- -- and t3, t1, t2 -- beqz t3, L(al_loop) -- --L(al_end): -- ctz.d t1, t3 -- srli.d t1, t1, 3 -- addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest -- -- andi a3, t1, 8 -- andi a4, t1, 4 -- andi a5, t1, 2 -- andi a6, t1, 1 -- --L(al_end_8): -- beqz a3, L(al_end_4) -- st.d t0, a2, 0 -- jr ra --L(al_end_4): -- beqz a4, L(al_end_2) -- st.w t0, a2, 0 -- addi.d a2, a2, 4 -- srli.d t0, t0, 32 --L(al_end_2): -- beqz a5, L(al_end_1) -- st.h t0, a2, 0 -- addi.d a2, a2, 2 -- srli.d t0, t0, 16 --L(al_end_1): -- beqz a6, L(al_out) -- st.b t0, a2, 0 --L(al_out): -- jr ra -- --L(unalign): -- slli.d a5, a4, 3 -- li.d t1, -1 -- sub.d a6, zero, a5 -- -- srl.d a7, t0, a5 -- sll.d t7, t1, a6 -- -- or t0, a7, t7 -- sub.d t1, t0, t5 -- andn t2, t6, t0 -- and t3, t1, t2 -- -- bnez t3, L(un_end) -- -- ld.d t4, a1, 8 -- -- sub.d t1, t4, t5 -- andn t2, t6, t4 -- sll.d t0, t4, a6 -- and t3, t1, t2 -- -- or t0, t0, a7 -- bnez t3, L(un_end_with_remaining) -- --L(un_loop): -- srl.d a7, t4, a5 -- -- ld.d t4, a1, 16 -- addi.d a1, a1, 8 -- -- st.d t0, a2, 0 -- addi.d a2, a2, 8 -- -- sub.d t1, t4, t5 -- andn t2, t6, t4 -- sll.d t0, t4, a6 -- and t3, t1, t2 -- -- or t0, t0, a7 -- beqz t3, L(un_loop) -- --L(un_end_with_remaining): -- ctz.d t1, t3 -- srli.d t1, t1, 3 -- addi.d t1, t1, 1 -- sub.d t1, t1, a4 -- -- blt t1, zero, L(un_end_less_8) -- st.d t0, a2, 0 -- addi.d a2, a2, 8 -- beqz t1, L(un_out) -- srl.d t0, t4, a5 # get the remaining part -- b L(un_end_less_8) -- --L(un_end): -- ctz.d t1, t3 -- srli.d t1, t1, 3 -- addi.d t1, t1, 1 -- --L(un_end_less_8): -- andi a4, t1, 4 -- andi a5, t1, 2 -- andi a6, t1, 1 --L(un_end_4): -- beqz a4, L(un_end_2) -- st.w t0, a2, 0 -- addi.d a2, a2, 4 -- srli.d t0, t0, 32 --L(un_end_2): -- beqz a5, L(un_end_1) -- st.h t0, a2, 0 -- addi.d a2, a2, 2 -- srli.d t0, t0, 16 --L(un_end_1): -- beqz a6, L(un_out) -- st.b t0, a2, 0 --L(un_out): -- jr ra -- --END(STRCPY) -- --#ifdef _LIBC --libc_hidden_builtin_def (STRCPY) --#endif -diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S -deleted file mode 100644 -index 71431ce2..00000000 ---- a/sysdeps/loongarch/lp64/strlen.S -+++ /dev/null -@@ -1,85 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRLEN --#define 
STRLEN strlen --#endif -- --LEAF(STRLEN, 6) -- move a1, a0 -- bstrins.d a0, zero, 2, 0 -- lu12i.w a2, 0x01010 -- li.w t0, -1 -- -- ld.d t2, a0, 0 -- andi t1, a1, 0x7 -- ori a2, a2, 0x101 -- slli.d t1, t1, 3 -- -- bstrins.d a2, a2, 63, 32 -- sll.d t1, t0, t1 -- slli.d t3, a2, 7 -- nor a3, zero, t3 -- -- orn t2, t2, t1 -- sub.d t0, t2, a2 -- nor t1, t2, a3 -- and t0, t0, t1 -- -- -- bnez t0, L(count_pos) -- addi.d a0, a0, 8 --L(loop_16_7bit): -- ld.d t2, a0, 0 -- sub.d t1, t2, a2 -- -- and t0, t1, t3 -- bnez t0, L(more_check) -- ld.d t2, a0, 8 -- addi.d a0, a0, 16 -- -- sub.d t1, t2, a2 -- and t0, t1, t3 -- beqz t0, L(loop_16_7bit) -- addi.d a0, a0, -8 --L(more_check): -- nor t0, t2, a3 -- -- and t0, t1, t0 -- bnez t0, L(count_pos) -- addi.d a0, a0, 8 --L(loop_16_8bit): -- ld.d t2, a0, 0 -- -- sub.d t1, t2, a2 -- nor t0, t2, a3 -- and t0, t0, t1 -- bnez t0, L(count_pos) -- -- ld.d t2, a0, 8 -- addi.d a0, a0, 16 -- sub.d t1, t2, a2 -- nor t0, t2, a3 -- -- and t0, t0, t1 -- beqz t0, L(loop_16_8bit) -- addi.d a0, a0, -8 --L(count_pos): -- ctz.d t1, t0 -- sub.d a0, a0, a1 -- -- srli.d t1, t1, 3 -- add.d a0, a0, t1 -- jr ra -- --END(STRLEN) -- --#ifdef _LIBC --libc_hidden_builtin_def (STRLEN) --#endif -diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S -deleted file mode 100644 -index 55450e55..00000000 ---- a/sysdeps/loongarch/lp64/strncmp.S -+++ /dev/null -@@ -1,256 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRNCMP --#define STRNCMP strncmp --#endif -- --/* int strncmp (const char *s1, const char *s2); */ -- --LEAF(STRNCMP, 6) -- beqz a2, L(ret0) -- xor a4, a0, a1 -- lu12i.w t5, 0x01010 -- lu12i.w t6, 0x7f7f7 -- -- andi a3, a0, 0x7 -- ori t5, t5, 0x101 -- andi a4, a4, 0x7 -- ori t6, t6, 0xf7f -- -- bstrins.d t5, t5, 63, 32 -- bstrins.d t6, t6, 63, 32 -- -- bnez a4, L(unalign) -- bnez a3, L(mutual_align) -- --L(a_loop): -- ld.d t0, a0, 0 -- ld.d t1, a1, 0 -- addi.d a0, a0, 8 -- addi.d a1, a1, 8 -- -- -- sltui t7, a2, 9 -- --L(start_realign): -- sub.d t2, t0, t5 -- nor t3, t0, t6 -- xor t4, t0, t1 -- -- and t2, t2, t3 -- addi.d a2, a2, -8 -- -- or t2, t2, t4 -- or t3, t2, t7 -- beqz t3, L(a_loop) -- --L(end): -- bge zero, t7, L(out) -- andi t4, a2, 7 -- li.d t3, -1 -- addi.d t4, t4, -1 -- slli.d t4, t4, 3 -- sll.d t3, t3, t4 -- or t2, t2, t3 -- -- --L(out): -- ctz.d t3, t2 -- bstrins.d t3, zero, 2, 0 -- srl.d t0, t0, t3 -- srl.d t1, t1, t3 -- -- andi t0, t0, 0xff -- andi t1, t1, 0xff -- sub.d a0, t0, t1 -- jr ra -- --L(mutual_align): -- bstrins.d a0, zero, 2, 0 -- bstrins.d a1, zero, 2, 0 -- slli.d a5, a3, 0x3 -- li.d t2, -1 -- -- ld.d t0, a0, 0 -- ld.d t1, a1, 0 -- -- li.d t3, 9 -- sll.d t2, t2, a5 -- -- sub.d t3, t3, a3 -- addi.d a0, a0, 8 -- -- sltu t7, a2, t3 -- addi.d a1, a1, 8 -- -- add.d a2, a2, a3 -- orn t0, t0, t2 -- orn t1, t1, t2 -- b L(start_realign) -- --L(ret0): -- move a0, zero -- jr ra -- --L(unalign): -- li.d t8, 8 -- blt a2, t8, L(short_cmp) -- -- # swap a0 and a1 in case a3 > a4 -- andi a4, a1, 0x7 -- sltu t8, a4, a3 -- xor a6, a0, a1 -- maskeqz a6, a6, t8 -- xor a0, a0, a6 -- xor a1, a1, a6 -- -- andi a3, a0, 0x7 -- andi a4, a1, 0x7 -- -- bstrins.d a0, zero, 2, 0 -- bstrins.d a1, zero, 2, 0 -- -- li.d t2, -1 -- li.d t3, 9 -- -- ld.d t0, a0, 0 -- ld.d t1, a1, 0 -- -- sub.d t3, t3, a4 -- sub.d a3, a4, a3 -- -- slli.d t4, a4, 3 -- slli.d a6, a3, 3 -- -- sub.d a5, zero, a6 -- sltu t7, a2, t3 -- -- rotr.d a7, t0, a5 -- sll.d t4, t2, t4 # mask for first num -- -- add.d a2, a2, a4 -- sll.d 
a4, t2, a6 # mask for a7 -- -- orn t0, a7, t4 -- orn t1, t1, t4 -- -- sub.d t2, t0, t5 -- nor t4, t0, t6 -- and t2, t2, t4 -- -- xor t3, t0, t1 -- or t2, t2, t3 -- -- or t3, t2, t7 -- bnez t3, L(un_end) -- -- andn a7, a7, a4 -- addi.d a3, a3, 1 -- --L(un_loop): -- addi.d a2, a2, -8 -- # in case remaining part has '\0', no more load instructions should be executed on a0 address -- or t0, a7, a4 -- sltu t7, a2, a3 -- -- sub.d t2, t0, t5 -- nor t3, t0, t6 -- and t2, t2, t3 -- -- or t3, t2, t7 -- bnez t3, L(check_remaining) -- -- ld.d t7, a0, 8 -- ld.d t1, a1, 8 -- addi.d a0, a0, 8 -- addi.d a1, a1, 8 -- -- sll.d t4, t7, a6 -- sub.d t2, t1, t5 -- nor t3, t1, t6 -- -- or t0, t4, a7 -- srl.d a7, t7, a5 -- -- and t2, t2, t3 -- xor t3, t0, t1 -- -- sltui t7, a2, 9 -- or t2, t2, t3 -- -- or t3, t2, t7 -- beqz t3, L(un_loop) -- b L(un_end) -- --L(check_remaining): -- ld.d t1, a1, 8 -- xor t3, t1, a7 -- or t2, t2, t3 -- --L(un_end): -- bge zero, t7, L(un_out) -- andi t4, a2, 7 -- li.d t3, -1 -- -- addi.d t4, t4, -1 -- slli.d t4, t4, 3 -- sll.d t3, t3, t4 -- or t2, t2, t3 -- --L(un_out): -- ctz.d t3, t2 -- bstrins.d t3, zero, 2, 0 -- srl.d t0, t0, t3 -- srl.d t1, t1, t3 -- -- andi t0, t0, 0xff -- andi t1, t1, 0xff -- -- sub.d a4, t0, t1 -- sub.d a5, t1, t0 -- -- maskeqz a6, a5, t8 -- masknez a0, a4, t8 -- -- or a0, a0, a6 -- jr ra -- --L(short_cmp): -- ld.bu t0, a0, 0 -- ld.bu t1, a1, 0 -- addi.d a2, a2, -1 -- -- xor t2, t0, t1 -- masknez t2, t0, t2 -- maskeqz t2, a2, t2 -- -- beqz t2, L(short_out) -- -- ld.bu t0, a0, 1 -- ld.bu t1, a1, 1 -- -- addi.d a2, a2, -1 -- addi.d a0, a0, 2 -- -- addi.d a1, a1, 2 -- xor t2, t0, t1 -- masknez t2, t0, t2 -- maskeqz t2, a2, t2 -- -- bnez t2, L(short_cmp) -- --L(short_out): -- sub.d a0, t0, t1 -- jr ra -- --END(STRNCMP) --#ifdef _LIBC --libc_hidden_builtin_def (STRNCMP) --#endif -diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S -deleted file mode 100644 -index 5b5ab585..00000000 ---- a/sysdeps/loongarch/lp64/strnlen.S -+++ /dev/null -@@ -1,82 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRNLEN --#define STRNLEN __strnlen --#endif -- --#. before every load, a1(t5) must > 0; --#. first load with t1 != 0, need to adjust t5; --#. 
return the less one of both strlen(s) and a1; -- --LEAF(STRNLEN, 6) -- beqz a1, L(out) -- lu12i.w a2, 0x01010 -- andi t1, a0, 0x7 -- move t4, a0 -- -- bstrins.d a0, zero, 2, 0 -- ori a2, a2, 0x101 -- li.w t0, -1 -- ld.d t2, a0, 0 -- -- slli.d t3, t1, 3 -- bstrins.d a2, a2, 63, 32 -- li.w t5, 8 -- slli.d a3, a2, 7 -- -- sub.w t1, t5, t1 -- sll.d t0, t0, t3 -- nor a3, zero, a3 -- orn t2, t2, t0 -- -- -- sub.d t0, t2, a2 -- nor t3, t2, a3 -- and t0, t0, t3 -- bnez t0, L(count_pos) -- -- sub.d t5, a1, t1 -- bgeu t1, a1, L(out) --L(loop_8bytes): -- ld.d t2, a0, 8 -- addi.d a0, a0, 8 -- -- sub.d t0, t2, a2 -- nor t1, t2, a3 -- sltui t6, t5, 9 -- and t0, t0, t1 -- -- addi.d t5, t5, -8 -- or t7, t0, t6 -- beqz t7, L(loop_8bytes) --L(count_pos): -- ctz.d t1, t0 -- -- -- sub.d a0, a0, t4 -- srli.d t1, t1, 3 -- add.d a0, t1, a0 -- sltu t0, a0, a1 -- -- masknez t1, a1, t0 -- maskeqz a0, a0, t0 -- or a0, a0, t1 -- jr ra -- --L(out): -- move a0, a1 -- jr ra -- --END(STRNLEN) -- --#ifdef _LIBC --weak_alias (STRNLEN, strnlen) --libc_hidden_builtin_def (STRNLEN) --#endif -diff --git a/sysdeps/loongarch/lp64/strrchr.S b/sysdeps/loongarch/lp64/strrchr.S -deleted file mode 100644 -index df7fcb6b..00000000 ---- a/sysdeps/loongarch/lp64/strrchr.S -+++ /dev/null -@@ -1,105 +0,0 @@ --#ifdef _LIBC --#include --#include --#include --#else --#include --#include --#endif -- --#ifndef STRRCHR_NAME --#define STRRCHR_NAME strrchr --#endif -- --LEAF(STRRCHR_NAME, 6) -- slli.d t1, a0, 3 -- bstrins.d a0, zero, 2, 0 -- lu12i.w a2, 0x01010 -- ld.d t2, a0, 0 // t2 = "5ZZ21abc" -- -- ori a2, a2, 0x101 -- andi a1, a1, 0xff // a1 = "0000000Z" -- li.d a5, -1 -- bstrins.d a2, a2, 63, 32 // a2 = 0x0101010101010101 -- -- sll.d t1, a5, t1 // t1 = 0xffffffffff000000 -- mul.d a1, a1, a2 // a1 = "ZZZZZZZZ" -- orn t2, t2, t1 // t2 = "5ZZ21YYY" -- slli.d a3, a2, 7 // a3 = 0x8080808080808080 -- -- sub.d a4, t2, a2 -- andn t0, a3, t2 -- move t3, zero -- and t0, a4, t0 -- -- -- xor a4, t2, a1 -- move t5, zero -- orn a4, a4, t1 -- bnez t0, L(found_end) -- -- sub.d t1, a4, a2 -- andn t0, a3, a4 -- and t1, t1, t0 -- --L(loop_8bytes): -- masknez t4, t3, t1 -- -- maskeqz t3, t2, t1 -- ld.d t2, a0, 8 -- masknez t0, t5, t1 -- maskeqz t5, a0, t1 -- -- or t3, t3, t4 -- or t5, t0, t5 -- sub.d t0, t2, a2 -- andn t1, a3, t2 -- -- -- xor a4, t2, a1 -- and t0, t0, t1 //t0 hold diff pattern for '\0' -- sub.d t1, a4, a2 -- andn t4, a3, a4 -- -- and t1, t1, t4 //t1 hold diff pattern for 'a1' -- addi.d a0, a0, 8 -- beqz t0, L(loop_8bytes) //ok, neither \0 nor found --L(found_end): -- ctz.d t1, t0 -- -- xor t3, t3, a1 -- orn t1, zero, t1 -- revb.d t3, t3 -- srl.d t1, a5, t1 // mask for '\0' -- -- sub.d t4, t3, a2 -- orn a4, a4, t1 -- andn t3, a3, t3 -- revb.d t2, a4 -- -- sub.d t0, t2, a2 -- andn t1, a3, t2 -- and t3, t3, t4 -- and t1, t0, t1 -- -- li.d t7, 7 -- masknez t4, t3, t1 -- maskeqz t3, t1, t1 -- masknez t5, t5, t1 -- -- or t3, t3, t4 -- maskeqz t6, a0, t1 -- ctz.d t0, t3 -- or t5, t6, t5 -- -- srli.d t0, t0, 3 -- sub.d t0, t7, t0 -- add.d a0, t5, t0 -- maskeqz a0, a0, t3 -- -- jr ra --END(STRRCHR_NAME) -- --#ifdef _LIBC --libc_hidden_builtin_def(STRRCHR_NAME) --#endif --- -2.33.0 - diff --git a/glibc-2.28-Remove-unseless-ANDROID_CHANGES-and-relat.patch b/glibc-2.28-Remove-unseless-ANDROID_CHANGES-and-relat.patch deleted file mode 100644 index ad4b53c..0000000 --- a/glibc-2.28-Remove-unseless-ANDROID_CHANGES-and-relat.patch +++ /dev/null @@ -1,292 +0,0 @@ -From e2dd1f13592fa3b99b70eb54cc61e9f98cdcb123 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: 
Mon, 17 Apr 2023 17:20:04 +0800 -Subject: [PATCH 01/14] glibc-2.28: Remove unseless ANDROID_CHANGES and related - code. - -Change-Id: Ib08e92d435126c7b56096ff6f24f1c6b5ea57f46 -Signed-off-by: ticat_fp ---- - sysdeps/loongarch/lp64/memchr.S | 6 ------ - sysdeps/loongarch/lp64/memcpy.S | 13 ------------- - sysdeps/loongarch/lp64/memset.S | 6 ------ - sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S | 6 ------ - .../loongarch/lp64/multiarch/memmove-unaligned.S | 6 ------ - sysdeps/loongarch/lp64/multiarch/memset-unaligned.S | 7 ------- - sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S | 2 -- - .../loongarch/lp64/multiarch/strchrnul-unaligned.S | 2 -- - sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S | 2 -- - sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S | 2 -- - .../loongarch/lp64/multiarch/strncmp-unaligned.S | 2 -- - .../loongarch/lp64/multiarch/strnlen-unaligned.S | 2 -- - 12 files changed, 56 deletions(-) - -diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S -index ec34b1af..75c4e15c 100644 ---- a/sysdeps/loongarch/lp64/memchr.S -+++ b/sysdeps/loongarch/lp64/memchr.S -@@ -11,11 +11,7 @@ - #define MEMCHR_NAME memchr - #endif - --#ifdef ANDROID_CHANGES --LEAF(MEMCHR_NAME, 0) --#else - LEAF(MEMCHR_NAME) --#endif - .align 6 - beqz a2, L(out) - andi t1, a0, 0x7 -@@ -92,8 +88,6 @@ L(out): - jr ra - END(MEMCHR_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (MEMCHR_NAME) - #endif --#endif -diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S -index 1076e678..b6ca60a1 100644 ---- a/sysdeps/loongarch/lp64/memcpy.S -+++ b/sysdeps/loongarch/lp64/memcpy.S -@@ -35,29 +35,18 @@ - st.d t6, reg, n+48; \ - st.d t7, reg, n+56; - --#ifdef ANDROID_CHANGES --LEAF(MEMMOVE_NAME, 0) --#else - LEAF(MEMMOVE_NAME) --#endif -- - .align 6 - sub.d t0, a0, a1 - bltu t0, a2, L(copy_back) - - END(MEMMOVE_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (MEMMOVE_NAME) - #endif --#endif - --#ifdef ANDROID_CHANGES --LEAF(MEMCPY_NAME, 0) --#else - LEAF(MEMCPY_NAME) --#endif - - srai.d a3, a2, 4 - beqz a3, L(short_data) # less than 16 bytes -@@ -811,8 +800,6 @@ L(back_end): - - END(MEMCPY_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (MEMCPY_NAME) - #endif --#endif -diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S -index 9fe42b24..41629e7e 100644 ---- a/sysdeps/loongarch/lp64/memset.S -+++ b/sysdeps/loongarch/lp64/memset.S -@@ -21,11 +21,7 @@ - st.d a1, a0, n+48; \ - st.d a1, a0, n+56; - --#ifdef ANDROID_CHANGES --LEAF(MEMSET_NAME, 0) --#else - LEAF(MEMSET_NAME) --#endif - .align 6 - move t0, a0 - andi a3, a0, 0x7 -@@ -166,8 +162,6 @@ L(short_0): - - END(MEMSET_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (MEMSET_NAME) - #endif --#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -index 5e38df0d..64b60244 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -@@ -31,11 +31,7 @@ - st.d t6, reg, n+48; \ - st.d t7, reg, n+56; - --#ifdef ANDROID_CHANGES --LEAF(MEMCPY_NAME, 0) --#else - LEAF(MEMCPY_NAME) --#endif - - //1st var: dst ptr: void *a1 $r4 a0 - //2nd var: src ptr: void *a2 $r5 a1 -@@ -250,10 +246,8 @@ end_0_8_unalign: - - END(MEMCPY_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (MEMCPY_NAME) - #endif --#endif - - #endif -diff --git 
a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -index 27ed0c9c..42920a1a 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -@@ -100,11 +100,7 @@ - LD_64(a4, -1024); \ - ST_64(a3, -1024); - --#ifdef ANDROID_CHANGES --LEAF(MEMMOVE_NAME, 0) --#else - LEAF(MEMMOVE_NAME) --#endif - - //1st var: dest ptr: void *str1 $r4 a0 - //2nd var: src ptr: void *str2 $r5 a1 -@@ -469,10 +465,8 @@ end_unalign_proc_back: - - END(MEMMOVE_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (MEMMOVE_NAME) - #endif --#endif - - #endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -index 16ff2ef7..54e51546 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -@@ -33,12 +33,7 @@ - //2nd var: int val $5 a1 - //3rd var: size_t num $6 a2 - --#ifdef ANDROID_CHANGES --LEAF(MEMSET_NAME, 0) --#else - LEAF(MEMSET_NAME) --#endif -- - .align 6 - bstrins.d a1, a1, 15, 8 - add.d t7, a0, a2 -@@ -168,10 +163,8 @@ end_0_8_unalign: - - END(MEMSET_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (MEMSET_NAME) - #endif --#endif - - #endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -index 1d5e56c5..de6c7f4f 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -@@ -123,10 +123,8 @@ L(_mc8_a): - jr ra - END(STRCHR_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (STRCHR_NAME) - #endif --#endif - - #endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -index 6338d005..abc246ca 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -@@ -136,11 +136,9 @@ L(_mc8_a): - jr ra - END(STRCHRNUL_NAME) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - weak_alias(STRCHRNUL_NAME, strchrnul) - libc_hidden_builtin_def (STRCHRNUL_NAME) - #endif --#endif - - #endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -index 449733cb..c77dc1a9 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -@@ -190,10 +190,8 @@ strcpy_page_cross: - beqz has_nul, strcpy_page_cross_ok - b strcpy_end - END(STRCPY) --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (STRCPY) - #endif --#endif - - #endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -index e9b7cf67..2fe0fb34 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -@@ -107,10 +107,8 @@ strlen_loop_noascii: - jr ra - END(STRLEN) - --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (STRLEN) - #endif --#endif - - #endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -index 558df29b..6ec107ca 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -@@ -248,10 +248,8 @@ strncmp_ret0: - then exchange(src1,src2). 
*/ - - END(STRNCMP) --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (STRNCMP) - #endif --#endif - - #endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -index 60eccf00..4a195b7c 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -@@ -136,10 +136,8 @@ L(_hit_limit): - move len, limit - jr ra - END(STRNLEN) --#ifndef ANDROID_CHANGES - #ifdef _LIBC - libc_hidden_builtin_def (STRNLEN) - #endif --#endif - - #endif --- -2.33.0 - diff --git a/glibc-2.28-Remove-useless-IS_LA-264-364-464-and-IS_L.patch b/glibc-2.28-Remove-useless-IS_LA-264-364-464-and-IS_L.patch deleted file mode 100644 index 4880d26..0000000 --- a/glibc-2.28-Remove-useless-IS_LA-264-364-464-and-IS_L.patch +++ /dev/null @@ -1,40 +0,0 @@ -From f4041e5da609a9f5da966fa000c00b150788a948 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Sun, 23 Jul 2023 14:32:08 +0800 -Subject: [PATCH 13/14] glibc-2.28: Remove useless IS_LA{264,364,464} and - IS_LA{264, 364, 464}. - -Change-Id: Id9a573510e2a493151191372d651f381ec2aefe7 -Signed-off-by: ticat_fp ---- - sysdeps/unix/sysv/linux/loongarch/cpu-features.h | 7 ------- - 1 file changed, 7 deletions(-) - -diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -index b46a8489..2703d4f7 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -@@ -22,10 +22,6 @@ - #include - #include - --#define LA264 0x14a000 --#define LA364 0x14b000 --#define LA464 0x14c011 -- - struct cpu_features - { - uint64_t cpucfg_prid; -@@ -42,9 +38,6 @@ extern const struct cpu_features *_dl_larch_get_cpu_features (void) - :"=r"(ret) \ - :"r"(index)); - --#define IS_LA264(prid) (prid == LA264) --#define IS_LA364(prid) (prid == LA364) --#define IS_LA464(prid) (prid == LA464) - #define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL) - #define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) - #define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) --- -2.33.0 - diff --git a/glibc-2.28-Use-RTLD_SUPPORT_-LSX-LASX-to-choose-_dl_.patch b/glibc-2.28-Use-RTLD_SUPPORT_-LSX-LASX-to-choose-_dl_.patch deleted file mode 100644 index 720cd20..0000000 --- a/glibc-2.28-Use-RTLD_SUPPORT_-LSX-LASX-to-choose-_dl_.patch +++ /dev/null @@ -1,123 +0,0 @@ -From c94d9376e241dc52eb9f2a2107313b7836e0e9ad Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Wed, 6 Sep 2023 16:41:09 +0800 -Subject: [PATCH 14/14] glibc-2.28: Use RTLD_SUPPORT_{LSX, LASX} to choose - _dl_runtime_resolve. - -Key Points: -1. On lasx & lsx platforms, use _dl_runtime_resolve_{lsx, lasx} to save vector registers. -2. Via "tunables", users can choose str/mem functions with - `export GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX`. - Note: glibc.cpu.hwcaps doesn't affect _dl_runtime_resolve_{lsx, lasx} selection. - -Usage Notes: -1. Only valid inputs: LASX, LSX, UAL. Case-sensitive, comma-separated, no spaces. -2. Example: `export GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,UAL` turns on LASX & UAL. - Unmentioned features turn off. With default ifunc: lasx > lsx > unaligned > - aligned > generic, effect is: lasx > unaligned > aligned > generic; lsx off. -3. Incorrect GLIBC_TUNABLES settings will show error messages. -4. Valid input examples: - - GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX: lasx > aligned > generic. - - GLIBC_TUNABLES=glibc.cpu.hwcaps=LSX,UAL: lsx > unaligned > aligned > generic. 
- - GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,UAL,LASX,UAL,LSX,LASX,UAL: Repetitions - allowed but not recommended. Results in: lasx > lsx > unaligned > aligned > - generic. - -Change-Id: I555ce2039bc36bf071fc9265d7b0bb7b93b96ae7 -Signed-off-by: ticat_fp ---- - sysdeps/loongarch/cpu-tunables.c | 2 +- - sysdeps/loongarch/dl-machine.h | 11 ++++++----- - sysdeps/unix/sysv/linux/loongarch/cpu-features.c | 2 ++ - sysdeps/unix/sysv/linux/loongarch/cpu-features.h | 10 +++++++--- - 4 files changed, 16 insertions(+), 9 deletions(-) - -diff --git a/sysdeps/loongarch/cpu-tunables.c b/sysdeps/loongarch/cpu-tunables.c -index 840c1b8c..e0799ca9 100644 ---- a/sysdeps/loongarch/cpu-tunables.c -+++ b/sysdeps/loongarch/cpu-tunables.c -@@ -88,7 +88,7 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) - } - while (*c != '\0'); - -- GLRO (dl_hwcap) &= hwcap; -+ GLRO (dl_larch_cpu_features).hwcap &= hwcap; - } - - #endif -diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h -index ff520a07..b5f43c84 100644 ---- a/sysdeps/loongarch/dl-machine.h -+++ b/sysdeps/loongarch/dl-machine.h -@@ -75,13 +75,14 @@ dl_platform_init (void) - GLRO(dl_platform) = NULL; - - #ifdef SHARED -+ /* init_cpu_features has been called early from __libc_start_main in -+ static executable. */ -+ init_cpu_features (&GLRO(dl_larch_cpu_features)); - - #if HAVE_TUNABLES - TUNABLE_GET (hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps)); - #endif -- /* init_cpu_features has been called early from __libc_start_main in -- static executable. */ -- init_cpu_features (&GLRO(dl_larch_cpu_features)); -+ - #endif - } - -@@ -396,9 +397,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], - l->l_mach.plt = gotplt[1] + l->l_addr; - - #if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float -- if (SUPPORT_LASX) -+ if (RTLD_SUPPORT_LASX) - gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx; -- else if (SUPPORT_LSX) -+ else if (RTLD_SUPPORT_LSX) - gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx; - else - #endif -diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c -index 80870f3c..cf015011 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c -+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c -@@ -29,4 +29,6 @@ init_cpu_features (struct cpu_features *cpu_features) - - __cpucfg(cpucfg_word, 2); - cpu_features->cpucfg_word_idx2 = cpucfg_word; -+ -+ GLRO (dl_larch_cpu_features).hwcap = GLRO (dl_hwcap); - } -diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -index 2703d4f7..17c9f5a7 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -@@ -26,6 +26,7 @@ struct cpu_features - { - uint64_t cpucfg_prid; - uint64_t cpucfg_word_idx2; -+ uint64_t hwcap; - }; - - /* Get a pointer to the CPU features structure. 
*/ -@@ -38,9 +39,12 @@ extern const struct cpu_features *_dl_larch_get_cpu_features (void) - :"=r"(ret) \ - :"r"(index)); - --#define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL) --#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) --#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) -+#define SUPPORT_UAL (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_UAL) -+#define SUPPORT_LSX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LSX) -+#define SUPPORT_LASX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LASX) -+ -+#define RTLD_SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) -+#define RTLD_SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) - - #endif /* _CPU_FEATURES_LOONGARCH64_H */ - --- -2.33.0 - diff --git a/glibc-2.28-config-Added-HAVE_LOONGARCH_VEC_ASM.patch b/glibc-2.28-config-Added-HAVE_LOONGARCH_VEC_ASM.patch deleted file mode 100644 index bfbe0e2..0000000 --- a/glibc-2.28-config-Added-HAVE_LOONGARCH_VEC_ASM.patch +++ /dev/null @@ -1,91 +0,0 @@ -From 58b1f882644f839259505dde3205e226a1c649f1 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Tue, 11 Jul 2023 15:42:26 +0800 -Subject: [PATCH 10/14] glibc-2.28: config: Added HAVE_LOONGARCH_VEC_ASM. - -Change-Id: Iea464ea0c975a351682a60f66251167f6c79385b -Signed-off-by: ticat_fp ---- - config.h.in | 5 +++++ - sysdeps/loongarch/configure | 28 ++++++++++++++++++++++++++++ - sysdeps/loongarch/configure.ac | 15 +++++++++++++++ - 3 files changed, 48 insertions(+) - -diff --git a/config.h.in b/config.h.in -index 94d5ea36..fa53cc2d 100644 ---- a/config.h.in -+++ b/config.h.in -@@ -123,6 +123,11 @@ - /* RISC-V floating-point ABI for ld.so. */ - #undef RISCV_ABI_FLEN - -+/* Assembler support LoongArch LASX/LSX vector instructions. -+ This macro becomes obsolete when glibc increased the minimum -+ required version of GNU 'binutils' to 2.41 or later. */ -+#define HAVE_LOONGARCH_VEC_ASM 0 -+ - /* Linux specific: minimum supported kernel version. */ - #undef __LINUX_KERNEL_VERSION - -diff --git a/sysdeps/loongarch/configure b/sysdeps/loongarch/configure -index 1e5abf81..0f0dae3a 100755 ---- a/sysdeps/loongarch/configure -+++ b/sysdeps/loongarch/configure -@@ -2,3 +2,31 @@ - # Local configure fragment for sysdeps/loongarch/elf. - - #AC_DEFINE(PI_STATIC_AND_HIDDEN) -+ -+# Check if asm support vector instructions. -+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vector support in assembler" >&5 -+$as_echo_n "checking for vector support in assembler... " >&6; } -+if ${libc_cv_loongarch_vec_asm+:} false; then : -+ $as_echo_n "(cached) " >&6 -+else -+ cat > conftest.s <<\EOF -+ vld $vr0, $sp, 0 -+EOF -+if { ac_try='${CC-cc} -c $CFLAGS conftest.s -o conftest 1>&5' -+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 -+ (eval $ac_try) 2>&5 -+ ac_status=$? -+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 -+ test $ac_status = 0; }; }; then -+ libc_cv_loongarch_vec_asm=yes -+else -+ libc_cv_loongarch_vec_asm=no -+fi -+rm -f conftest* -+fi -+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_loongarch_vec_asm" >&5 -+$as_echo "$libc_cv_loongarch_vec_asm" >&6; } -+if test $libc_cv_loongarch_vec_asm = yes; then -+ $as_echo "#define HAVE_LOONGARCH_VEC_ASM 1" >>confdefs.h -+ -+fi -diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac -index 67b46ce0..aac0efa9 100644 ---- a/sysdeps/loongarch/configure.ac -+++ b/sysdeps/loongarch/configure.ac -@@ -4,3 +4,18 @@ GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. 
- dnl It is always possible to access static and hidden symbols in an - dnl position independent way. - #AC_DEFINE(PI_STATIC_AND_HIDDEN) -+ -+# Check if asm support vector instructions. -+AC_CACHE_CHECK(for vector support in assembler, libc_cv_loongarch_vec_asm, [dnl -+cat > conftest.s <<\EOF -+ vld $vr0, $sp, 0 -+EOF -+if AC_TRY_COMMAND(${CC-cc} -c $CFLAGS conftest.s -o conftest 1>&AS_MESSAGE_LOG_FD); then -+ libc_cv_loongarch_vec_asm=yes -+else -+ libc_cv_loongarch_vec_asm=no -+fi -+rm -f conftest*]) -+if test $libc_cv_loongarch_vec_asm = yes; then -+ AC_DEFINE(HAVE_LOONGARCH_VEC_ASM) -+fi --- -2.33.0 - diff --git a/glibc-2.28-remove-ABILPX32-related-code.patch b/glibc-2.28-remove-ABILPX32-related-code.patch deleted file mode 100644 index d5ece82..0000000 --- a/glibc-2.28-remove-ABILPX32-related-code.patch +++ /dev/null @@ -1,75 +0,0 @@ -From 0153532f680527c4378a10673518cabda2e02584 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Fri, 26 May 2023 14:58:39 +0800 -Subject: [PATCH 05/14] glibc-2.28: remove ABILPX32 related code. - -Change-Id: I73eb5bc4d4ca12e4d45ed6b533fa38d60a3a633f -Signed-off-by: ticat_fp ---- - elf/elf.h | 3 +-- - sysdeps/loongarch/dl-machine.h | 2 -- - sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h | 2 +- - sysdeps/loongarch/sys/regdef.h | 4 +--- - 4 files changed, 3 insertions(+), 8 deletions(-) - -diff --git a/elf/elf.h b/elf/elf.h -index 65d1fb46..4bfbad61 100644 ---- a/elf/elf.h -+++ b/elf/elf.h -@@ -3933,10 +3933,9 @@ enum - #define R_NDS32_TLS_TPOFF 102 - #define R_NDS32_TLS_DESC 119 - --/* LoongISA ELF Flags */ -+/* LoongArch ELF Flags */ - #define EF_LARCH_ABI 0x0003 - #define EF_LARCH_ABI_LP64 0x0003 --#define EF_LARCH_ABI_LPX32 0x0002 - #define EF_LARCH_ABI_LP32 0x0001 - - /* Loongarch specific dynamic relocations. */ -diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h -index 2d527241..6e9c6258 100644 ---- a/sysdeps/loongarch/dl-machine.h -+++ b/sysdeps/loongarch/dl-machine.h -@@ -96,8 +96,6 @@ elf_machine_matches_host (const ElfW(Ehdr) *ehdr) - - #ifdef _ABILP64 - if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LP64) --#elif defined _ABILPX32 -- if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LPX32) - #elif defined _ABILP32 - if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LP32) - #else -diff --git a/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h b/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h -index 5a761355..aa63bce1 100644 ---- a/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h -+++ b/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h -@@ -32,7 +32,7 @@ - # define __SIZEOF_PTHREAD_BARRIER_T 32 - # define __SIZEOF_PTHREAD_BARRIERATTR_T 4 - #else --# error "rv32i-based systems are not supported" -+# error "32-bit based systems are not supported" - #endif - - #define __PTHREAD_COMPAT_PADDING_MID -diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h -index 769784b8..36f00939 100644 ---- a/sysdeps/loongarch/sys/regdef.h -+++ b/sysdeps/loongarch/sys/regdef.h -@@ -72,10 +72,8 @@ - # define fs6 $f30 - # define fs7 $f31 - --#elif _LOONGARCH_SIM == _ABILPX32 --# error ABILPX32 - #elif _LOONGARCH_SIM == _ABILP32 --# error ABILP32 -+# error ABILP32 not support yet - #else - # error noABI - #endif --- -2.33.0 - diff --git a/glibc-2.28-use-new-macro-LEAF-and-ENTRY-and-modify-r.patch b/glibc-2.28-use-new-macro-LEAF-and-ENTRY-and-modify-r.patch deleted file mode 100644 index fce80c4..0000000 --- a/glibc-2.28-use-new-macro-LEAF-and-ENTRY-and-modify-r.patch +++ /dev/null @@ -1,1033 +0,0 @@ -From 
18331a16d37b191b84296d8a5e96cd069fe45664 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Mon, 17 Apr 2023 17:04:57 +0800 -Subject: [PATCH 02/14] glibc-2.28: use new macro LEAF and ENTRY and modify - related code. - -Change-Id: Iac8a3cc0f57ba39cf364580966c8bfca1b54a7a5 -Signed-off-by: ticat_fp ---- - sysdeps/loongarch/__longjmp.S | 2 +- - sysdeps/loongarch/dl-trampoline.h | 2 +- - sysdeps/loongarch/lp64/memchr.S | 3 +-- - sysdeps/loongarch/lp64/memcmp.S | 3 +-- - sysdeps/loongarch/lp64/memcpy.S | 5 ++--- - sysdeps/loongarch/lp64/memset.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memchr-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memchr-lsx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S | 7 +++---- - sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S | 2 +- - sysdeps/loongarch/lp64/multiarch/memmove-lasx.S | 6 ++---- - sysdeps/loongarch/lp64/multiarch/memmove-lsx.S | 5 ++--- - sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S | 2 +- - sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memset-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memset-lsx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/memset-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S | 6 +++--- - sysdeps/loongarch/lp64/multiarch/strchr-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strchr-lsx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 6 +++--- - sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 7 ++++--- - sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strlen-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S | 7 ++++--- - sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S | 3 +-- - sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S | 3 +-- - sysdeps/loongarch/lp64/rawmemchr.S | 3 +-- - sysdeps/loongarch/lp64/s_cosf.S | 4 +--- - sysdeps/loongarch/lp64/s_sinf.S | 4 +--- - sysdeps/loongarch/lp64/stpcpy.S | 3 +-- - sysdeps/loongarch/lp64/strchr.S | 3 +-- - sysdeps/loongarch/lp64/strchrnul.S | 3 +-- - sysdeps/loongarch/lp64/strcmp.S | 3 +-- - sysdeps/loongarch/lp64/strcpy.S | 3 +-- - sysdeps/loongarch/lp64/strlen.S | 3 +-- - sysdeps/loongarch/lp64/strncmp.S | 3 +-- - sysdeps/loongarch/lp64/strnlen.S | 3 +-- - sysdeps/loongarch/lp64/strrchr.S | 3 +-- - sysdeps/loongarch/setjmp.S | 6 +++--- - sysdeps/loongarch/start.S | 2 +- - sysdeps/loongarch/sys/asm.h | 6 +++--- - sysdeps/unix/sysv/linux/loongarch/clone.S | 4 ++-- - sysdeps/unix/sysv/linux/loongarch/getcontext.S | 2 +- - sysdeps/unix/sysv/linux/loongarch/setcontext.S | 4 ++-- - sysdeps/unix/sysv/linux/loongarch/swapcontext.S | 2 +- - sysdeps/unix/sysv/linux/loongarch/sysdep.S | 4 ++-- - sysdeps/unix/sysv/linux/loongarch/sysdep.h | 4 ++-- - 
sysdeps/unix/sysv/linux/loongarch/vfork.S | 2 +- - 62 files changed, 85 insertions(+), 130 deletions(-) - -diff --git a/sysdeps/loongarch/__longjmp.S b/sysdeps/loongarch/__longjmp.S -index 68f67639..bd06b919 100644 ---- a/sysdeps/loongarch/__longjmp.S -+++ b/sysdeps/loongarch/__longjmp.S -@@ -19,7 +19,7 @@ - #include - #include - --ENTRY (__longjmp) -+ENTRY (__longjmp, 3) - REG_L ra, a0, 0*SZREG - REG_L sp, a0, 1*SZREG - REG_L x, a0, 2*SZREG -diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h -index 95639111..fb15983f 100644 ---- a/sysdeps/loongarch/dl-trampoline.h -+++ b/sysdeps/loongarch/dl-trampoline.h -@@ -29,7 +29,7 @@ - # define FRAME_SIZE (-((-10 * SZREG - 8 * 256) & ALMASK)) - #endif - --ENTRY (_dl_runtime_resolve) -+ENTRY (_dl_runtime_resolve, 3) - # Save arguments to stack. - - #ifdef __loongarch64 -diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S -index 75c4e15c..23f1fd13 100644 ---- a/sysdeps/loongarch/lp64/memchr.S -+++ b/sysdeps/loongarch/lp64/memchr.S -@@ -11,8 +11,7 @@ - #define MEMCHR_NAME memchr - #endif - --LEAF(MEMCHR_NAME) -- .align 6 -+LEAF(MEMCHR_NAME, 6) - beqz a2, L(out) - andi t1, a0, 0x7 - lu12i.w a3, 0x01010 -diff --git a/sysdeps/loongarch/lp64/memcmp.S b/sysdeps/loongarch/lp64/memcmp.S -index 9e57a924..457a4dc7 100644 ---- a/sysdeps/loongarch/lp64/memcmp.S -+++ b/sysdeps/loongarch/lp64/memcmp.S -@@ -11,8 +11,7 @@ - #define MEMCMP_NAME memcmp - #endif - --LEAF(MEMCMP_NAME) -- .align 6 -+LEAF(MEMCMP_NAME, 6) - beqz a2, L(ret) - andi a4, a1, 0x7 - andi a3, a0, 0x7 -diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S -index b6ca60a1..4791e1a4 100644 ---- a/sysdeps/loongarch/lp64/memcpy.S -+++ b/sysdeps/loongarch/lp64/memcpy.S -@@ -35,8 +35,7 @@ - st.d t6, reg, n+48; \ - st.d t7, reg, n+56; - --LEAF(MEMMOVE_NAME) -- .align 6 -+LEAF(MEMMOVE_NAME, 6) - sub.d t0, a0, a1 - bltu t0, a2, L(copy_back) - -@@ -46,7 +45,7 @@ END(MEMMOVE_NAME) - libc_hidden_builtin_def (MEMMOVE_NAME) - #endif - --LEAF(MEMCPY_NAME) -+LEAF_NO_ALIGN(MEMCPY_NAME) - - srai.d a3, a2, 4 - beqz a3, L(short_data) # less than 16 bytes -diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S -index 41629e7e..eabd7d23 100644 ---- a/sysdeps/loongarch/lp64/memset.S -+++ b/sysdeps/loongarch/lp64/memset.S -@@ -21,8 +21,7 @@ - st.d a1, a0, n+48; \ - st.d a1, a0, n+56; - --LEAF(MEMSET_NAME) -- .align 6 -+LEAF(MEMSET_NAME, 6) - move t0, a0 - andi a3, a0, 0x7 - li.w t6, 16 -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -index e63e34ae..387a35fe 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -@@ -11,8 +11,7 @@ - - #define MEMCHR __memchr_lasx - --LEAF(MEMCHR) -- .align 6 -+LEAF(MEMCHR, 6) - beqz a2, L(ret0) - add.d a3, a0, a2 - andi t0, a0, 0x3f -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -index 441db534..c6952657 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -@@ -11,8 +11,7 @@ - - #define MEMCHR __memchr_lsx - --LEAF(MEMCHR) -- .align 6 -+LEAF(MEMCHR, 6) - beqz a2, L(ret0) - add.d a3, a0, a2 - andi t0, a0, 0x1f -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -index 30e2dbe6..9151d38d 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -+++ 
b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -@@ -11,8 +11,7 @@ - - #define MEMCMP __memcmp_lasx - --LEAF(MEMCMP) -- .align 6 -+LEAF(MEMCMP, 6) - li.d t2, 32 - add.d a3, a0, a2 - add.d a4, a1, a2 -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -index 7fd349b6..8535aa22 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -@@ -10,11 +10,10 @@ - #if IS_IN (libc) - - #define MEMCMP __memcmp_lsx -- - L(magic_num): -- .align 6 -- .dword 0x0706050403020100 -- .dword 0x0f0e0d0c0b0a0908 -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 - nop - nop - ENTRY_NO_ALIGN(MEMCMP) -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -index 64b60244..96df7c40 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -@@ -31,7 +31,7 @@ - st.d t6, reg, n+48; \ - st.d t7, reg, n+56; - --LEAF(MEMCPY_NAME) -+LEAF(MEMCPY_NAME, 3) - - //1st var: dst ptr: void *a1 $r4 a0 - //2nd var: src ptr: void *a2 $r5 a1 -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -index 9537a35a..e8b2c441 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -@@ -17,8 +17,7 @@ - #define MEMMOVE_NAME __memmove_lasx - #endif - --LEAF(MEMCPY_NAME) -- .align 6 -+LEAF(MEMCPY_NAME, 6) - - li.d t0, 32 - add.d a3, a0, a2 -@@ -83,8 +82,7 @@ L(less_1bytes): - jr ra - END(MEMCPY_NAME) - --LEAF(MEMMOVE_NAME) -- .align 6 -+LEAF(MEMMOVE_NAME, 6) - - li.d t0, 32 - add.d a3, a0, a2 -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -index 26babad4..90f89c7a 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -@@ -12,8 +12,7 @@ - #define MEMCPY_NAME __memcpy_lsx - #define MEMMOVE_NAME __memmove_lsx - --LEAF(MEMCPY_NAME) -- .align 6 -+LEAF(MEMCPY_NAME, 6) - li.d t6, 16 - add.d a3, a0, a2 - add.d a4, a1, a2 -@@ -83,7 +82,7 @@ L(less_1bytes): - nop - END(MEMCPY_NAME) - --LEAF(MEMMOVE_NAME) -+LEAF(MEMMOVE_NAME, 6) - li.d t6, 16 - add.d a3, a0, a2 - add.d a4, a1, a2 -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -index 42920a1a..712b1c62 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -@@ -100,7 +100,7 @@ - LD_64(a4, -1024); \ - ST_64(a3, -1024); - --LEAF(MEMMOVE_NAME) -+LEAF(MEMMOVE_NAME, 3) - - //1st var: dest ptr: void *str1 $r4 a0 - //2nd var: src ptr: void *str2 $r5 a1 -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -index 57e1035f..9ecd0257 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -@@ -13,8 +13,7 @@ - #define MEMRCHR __memrchr_lasx - #endif - --LEAF(MEMRCHR) -- .align 6 -+LEAF(MEMRCHR, 6) - beqz a2, L(ret0) - addi.d a2, a2, -1 - add.d a3, a0, a2 -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -index eac2059a..4bdc18d8 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -@@ -11,8 +11,7 @@ - - #define MEMRCHR __memrchr_lsx - 
--LEAF(MEMRCHR) -- .align 6 -+LEAF(MEMRCHR, 6) - beqz a2, L(ret0) - addi.d a2, a2, -1 - add.d a3, a0, a2 -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -index 1bd2dda9..b53c0b7b 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -@@ -11,8 +11,7 @@ - - #define MEMSET __memset_lasx - --LEAF(MEMSET) -- .align 6 -+LEAF(MEMSET, 6) - li.d t1, 32 - move a3, a0 - xvreplgr2vr.b $xr0, a1 -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -index a3bbadb7..7ab85283 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -@@ -11,8 +11,7 @@ - - #define MEMSET __memset_lsx - --LEAF(MEMSET) -- .align 6 -+LEAF(MEMSET, 6) - li.d t1, 16 - move a3, a0 - vreplgr2vr.b $vr0, a1 -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -index 54e51546..92b0fab5 100644 ---- a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -@@ -33,8 +33,7 @@ - //2nd var: int val $5 a1 - //3rd var: size_t num $6 a2 - --LEAF(MEMSET_NAME) -- .align 6 -+LEAF(MEMSET_NAME, 6) - bstrins.d a1, a1, 15, 8 - add.d t7, a0, a2 - bstrins.d a1, a1, 31, 16 -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -index bff92969..1e94aa50 100644 ---- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -@@ -5,8 +5,7 @@ - - # define RAWMEMCHR __rawmemchr_lasx - --LEAF(RAWMEMCHR) -- .align 6 -+LEAF(RAWMEMCHR, 6) - move a2, a0 - bstrins.d a0, zero, 4, 0 - xvld $xr0, a0, 0 -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -index 11a19c1d..40bf0cda 100644 ---- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -@@ -11,8 +11,7 @@ - - # define RAWMEMCHR __rawmemchr_lsx - --LEAF(RAWMEMCHR) -- .align 6 -+LEAF(RAWMEMCHR, 6) - move a2, a0 - bstrins.d a0, zero, 4, 0 - vld $vr0, a0, 0 -diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -index bf0eed43..0836f590 100644 ---- a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -@@ -12,9 +12,9 @@ - #define STPCPY __stpcpy_lsx - - L(magic_num): -- .align 6 -- .dword 0x0706050403020100 -- .dword 0x0f0e0d0c0b0a0908 -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 - ENTRY_NO_ALIGN(STPCPY) - pcaddi t0, -4 - andi a4, a1, 0xf -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -index ea7eb9d2..3f6ad915 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -@@ -13,8 +13,7 @@ - #define STRCHR __strchr_lasx - #endif - --LEAF(STRCHR) -- .align 6 -+LEAF(STRCHR, 6) - andi t1, a0, 0x1f - bstrins.d a0, zero, 4, 0 - xvld $xr0, a0, 0 -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -index 64ead00b..4ad9a4ad 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -@@ -13,8 +13,7 @@ - #define STRCHR __strchr_lsx - #endif - --LEAF(STRCHR) -- .align 6 -+LEAF(STRCHR, 6) - andi t1, a0, 0xf - 
bstrins.d a0, zero, 3, 0 - vld $vr0, a0, 0 -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -index de6c7f4f..365818f9 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -@@ -38,8 +38,7 @@ - #define STRCHR_NAME __strchr_unaligned - - /* char * strchr (const char *s1, int c); */ --LEAF(STRCHR_NAME) -- .align 6 -+LEAF(STRCHR_NAME, 6) - - li.w t4, 0x7 - lu12i.w a2, 0x01010 -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -index abc246ca..7b496076 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -@@ -46,8 +46,7 @@ - - /* char * strchrnul (const char *s1, int c); */ - --LEAF(STRCHRNUL_NAME) -- .align 6 -+LEAF(STRCHRNUL_NAME, 6) - li.w t4, 0x7 - lu12i.w a2, 0x01010 - bstrins.d a1, a1, 15, 8 -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -index 226b1d63..c86e3ecd 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -@@ -13,9 +13,9 @@ - - /* int strcmp (const char *s1, const char *s2); */ - L(magic_num): -- .align 6 -- .dword 0x0706050403020100 -- .dword 0x0f0e0d0c0b0a0908 -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 - - ENTRY_NO_ALIGN(STRCMP) - pcaddi t0, -4 -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S -index e29d872f..1e2e44ec 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S -@@ -73,8 +73,7 @@ - - /* int strcmp (const char *s1, const char *s2); */ - --LEAF(STRCMP_NAME) -- .align 4 -+LEAF(STRCMP_NAME, 4) - - xor tmp1, src1, src2 - lu12i.w zeroones, 0x01010 -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -index 76db561a..dbc061ad 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -@@ -14,9 +14,10 @@ - /* int strcpy (const char *s1, const char *s2); */ - - L(magic_num): -- .align 6 -- .dword 0x0706050403020100 -- .dword 0x0f0e0d0c0b0a0908 -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+ - ENTRY_NO_ALIGN(STRCPY) - pcaddi t0, -4 - andi a4, a1, 0xf -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -index c77dc1a9..150dc802 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -@@ -61,8 +61,7 @@ - - /* int strcpy (const char *s1, const char *s2); */ - --LEAF(STRCPY) -- .align 4 -+LEAF(STRCPY, 4) - move dest_backup, dest - lu12i.w zeroones, 0x01010 - lu12i.w sevenf, 0x7f7f7 -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -index cb276aa0..fd6c002d 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -@@ -13,8 +13,7 @@ - - /* size_t strlen(const char *s1); */ - --LEAF(STRLEN) -- .align 6 -+LEAF(STRLEN, 6) - move a1, a0 - bstrins.d a0, zero, 4, 0 - li.d t1, -1 -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -index 6edcac8c..6f311506 100644 ---- 
a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -@@ -13,8 +13,7 @@ - - /* size_t strlen(const char *s1); */ - --LEAF(STRLEN) -- .align 6 -+LEAF(STRLEN, 6) - move a1, a0 - bstrins.d a0, zero, 4, 0 - vld $vr0, a0, 0 -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -index 2fe0fb34..837255e3 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -@@ -31,8 +31,7 @@ - - /* size_t strlen (const char *s1); */ - --LEAF(STRLEN) -- .align 5 -+LEAF(STRLEN, 5) - nor t4, zero, zero - lu12i.w a2, 0x01010 - andi t5, a0, 0x7 -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -index 3399bf77..2c6f9614 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -@@ -14,9 +14,10 @@ - /* int strncmp (const char *s1, const char *s2); */ - - L(magic_num): -- .align 6 -- .dword 0x0706050403020100 -- .dword 0x0f0e0d0c0b0a0908 -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+ - ENTRY_NO_ALIGN(STRNCMP) - beqz a2, L(ret0) - pcaddi t0, -5 -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -index 6ec107ca..88397528 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -@@ -60,8 +60,7 @@ - - /* int strncmp (const char *s1, const char *s2); */ - --LEAF(STRNCMP) -- .align 4 -+LEAF(STRNCMP, 4) - beqz limit, strncmp_ret0 - - xor tmp1, src1, src2 -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -index 8c30f10c..910b52fe 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -@@ -13,8 +13,7 @@ - - /* size_t strnlen (const char *s1, size_t maxlen); */ - --LEAF(STRNLEN) -- .align 6 -+LEAF(STRNLEN, 6) - beqz a1, L(ret0) - andi t1, a0, 0x3f - li.d t3, 65 -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -index 388c239a..db0e90ff 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -@@ -13,8 +13,7 @@ - - /* size_t strnlen (const char *s1, size_t maxlen); */ - --LEAF(STRNLEN) -- .align 6 -+LEAF(STRNLEN, 6) - beqz a1, L(ret0) - andi t1, a0, 0x1f - li.d t3, 33 -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -index 4a195b7c..78e7444d 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -@@ -63,9 +63,8 @@ - - /* size_t strnlen (const char *s1,size_t maxlen); */ - --LEAF(STRNLEN) -+LEAF(STRNLEN, 4) - -- .align 4 - beqz limit, L(_hit_limit) - lu12i.w zeroones, 0x01010 - lu12i.w sevenf, 0x7f7f7 -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -index 6f7a5618..325458ff 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -@@ -11,8 +11,7 @@ - - #define STRRCHR __strrchr_lasx - --LEAF(STRRCHR) -- .align 6 -+LEAF(STRRCHR, 6) - andi t1, a0, 0x3f - bstrins.d a0, zero, 5, 0 - xvld $xr0, a0, 0 -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S 
b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S -index e9228a2e..e082eaab 100644 ---- a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S -@@ -11,8 +11,7 @@ - - #define STRRCHR __strrchr_lsx - --LEAF(STRRCHR) -- .align 6 -+LEAF(STRRCHR, 6) - andi t1, a0, 0x1f - bstrins.d a0, zero, 4, 0 - vld $vr0, a0, 0 -diff --git a/sysdeps/loongarch/lp64/rawmemchr.S b/sysdeps/loongarch/lp64/rawmemchr.S -index 94b70f2d..ef1db7ed 100644 ---- a/sysdeps/loongarch/lp64/rawmemchr.S -+++ b/sysdeps/loongarch/lp64/rawmemchr.S -@@ -12,8 +12,7 @@ - #endif - - --LEAF(RAWMEMCHR_NAME) -- .align 6 -+LEAF(RAWMEMCHR_NAME, 6) - andi t1, a0, 0x7 - bstrins.d a0, zero, 2, 0 - lu12i.w a2, 0x01010 -diff --git a/sysdeps/loongarch/lp64/s_cosf.S b/sysdeps/loongarch/lp64/s_cosf.S -index 5bfabefb..9fcbe6ca 100644 ---- a/sysdeps/loongarch/lp64/s_cosf.S -+++ b/sysdeps/loongarch/lp64/s_cosf.S -@@ -74,9 +74,7 @@ - movgr2fr.d tmp, rs;\ - ffint.d.l rd, tmp - --LEAF(COSF) -- .align 2 -- .align 3 -+LEAF(COSF, 3) - /* fa0 is SP x; fa1 is DP x */ - movfr2gr.s t0, fa0 /* Bits of x */ - fcvt.d.s fa1, fa0 /* DP x */ -diff --git a/sysdeps/loongarch/lp64/s_sinf.S b/sysdeps/loongarch/lp64/s_sinf.S -index 91c9db9e..45d1c4b5 100644 ---- a/sysdeps/loongarch/lp64/s_sinf.S -+++ b/sysdeps/loongarch/lp64/s_sinf.S -@@ -74,9 +74,7 @@ - movgr2fr.d tmp, rs;\ - ffint.d.l rd, tmp - --LEAF(SINF) -- .align 2 -- .align 3 -+LEAF(SINF, 3) - /* fa0 is SP x; fa1 is DP x */ - movfr2gr.s t2, fa0 /* Bits of x */ - fcvt.d.s fa1, fa0 /* DP x */ -diff --git a/sysdeps/loongarch/lp64/stpcpy.S b/sysdeps/loongarch/lp64/stpcpy.S -index 9d4b0c8d..b6a367dc 100644 ---- a/sysdeps/loongarch/lp64/stpcpy.S -+++ b/sysdeps/loongarch/lp64/stpcpy.S -@@ -11,8 +11,7 @@ - #define STPCPY_NAME __stpcpy - #endif - --LEAF(STPCPY_NAME) -- .align 6 -+LEAF(STPCPY_NAME, 6) - andi a3, a0, 0x7 - beqz a3, L(dest_align) - sub.d a5, a1, a3 -diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S -index 63454c17..fde53a30 100644 ---- a/sysdeps/loongarch/lp64/strchr.S -+++ b/sysdeps/loongarch/lp64/strchr.S -@@ -13,8 +13,7 @@ - - /* char * strchr (const char *s1, int c); */ - --LEAF(STRCHR_NAME) -- .align 6 -+LEAF(STRCHR_NAME, 6) - slli.d t1, a0, 3 - bstrins.d a0, zero, 2, 0 - lu12i.w a2, 0x01010 -diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S -index c4532e11..a5ee09a3 100644 ---- a/sysdeps/loongarch/lp64/strchrnul.S -+++ b/sysdeps/loongarch/lp64/strchrnul.S -@@ -13,8 +13,7 @@ - - /* char * strchrnul (const char *s1, int c); */ - --LEAF(STRCHRNUL_NAME) -- .align 6 -+LEAF(STRCHRNUL_NAME, 6) - slli.d t1, a0, 3 - bstrins.d a0, zero, 2, 0 - lu12i.w a2, 0x01010 -diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S -index 22c261a3..3a863992 100644 ---- a/sysdeps/loongarch/lp64/strcmp.S -+++ b/sysdeps/loongarch/lp64/strcmp.S -@@ -19,8 +19,7 @@ - #define src1 a0 - #define src2 a1 - #define result v0 --LEAF(STRCMP_NAME) -- .align 6 -+LEAF(STRCMP_NAME, 6) - xor a4, src1, src2 - lu12i.w t5, 0x01010 - lu12i.w t6, 0x7f7f7 -diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S -index c6fe74cb..08505192 100644 ---- a/sysdeps/loongarch/lp64/strcpy.S -+++ b/sysdeps/loongarch/lp64/strcpy.S -@@ -11,8 +11,7 @@ - #define STRCPY strcpy - #endif - --LEAF(STRCPY) -- .align 6 -+LEAF(STRCPY, 6) - andi a3, a0, 0x7 - move a2, a0 - beqz a3, L(dest_align) -diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S -index dd5a8da3..71431ce2 100644 ---- 
a/sysdeps/loongarch/lp64/strlen.S -+++ b/sysdeps/loongarch/lp64/strlen.S -@@ -11,8 +11,7 @@ - #define STRLEN strlen - #endif - --LEAF(STRLEN) -- .align 6 -+LEAF(STRLEN, 6) - move a1, a0 - bstrins.d a0, zero, 2, 0 - lu12i.w a2, 0x01010 -diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S -index dcb15350..55450e55 100644 ---- a/sysdeps/loongarch/lp64/strncmp.S -+++ b/sysdeps/loongarch/lp64/strncmp.S -@@ -13,8 +13,7 @@ - - /* int strncmp (const char *s1, const char *s2); */ - --LEAF(STRNCMP) -- .align 6 -+LEAF(STRNCMP, 6) - beqz a2, L(ret0) - xor a4, a0, a1 - lu12i.w t5, 0x01010 -diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S -index 0517e206..5b5ab585 100644 ---- a/sysdeps/loongarch/lp64/strnlen.S -+++ b/sysdeps/loongarch/lp64/strnlen.S -@@ -15,8 +15,7 @@ - #. first load with t1 != 0, need to adjust t5; - #. return the less one of both strlen(s) and a1; - --LEAF(STRNLEN) -- .align 6 -+LEAF(STRNLEN, 6) - beqz a1, L(out) - lu12i.w a2, 0x01010 - andi t1, a0, 0x7 -diff --git a/sysdeps/loongarch/lp64/strrchr.S b/sysdeps/loongarch/lp64/strrchr.S -index 3bf92ecd..df7fcb6b 100644 ---- a/sysdeps/loongarch/lp64/strrchr.S -+++ b/sysdeps/loongarch/lp64/strrchr.S -@@ -11,8 +11,7 @@ - #define STRRCHR_NAME strrchr - #endif - --LEAF(STRRCHR_NAME) -- .align 6 -+LEAF(STRRCHR_NAME, 6) - slli.d t1, a0, 3 - bstrins.d a0, zero, 2, 0 - lu12i.w a2, 0x01010 -diff --git a/sysdeps/loongarch/setjmp.S b/sysdeps/loongarch/setjmp.S -index da09a93c..c4e6d01c 100644 ---- a/sysdeps/loongarch/setjmp.S -+++ b/sysdeps/loongarch/setjmp.S -@@ -19,14 +19,14 @@ - #include - #include - --ENTRY (_setjmp) -+ENTRY (_setjmp, 3) - li.w a1,0 - b __sigsetjmp - END (_setjmp) --ENTRY (setjmp) -+ENTRY (setjmp, 3) - li.w a1,1 - END (setjmp) --ENTRY (__sigsetjmp) -+ENTRY (__sigsetjmp, 3) - REG_S ra, a0, 0*SZREG - REG_S sp, a0, 1*SZREG - REG_S x, a0, 2*SZREG -diff --git a/sysdeps/loongarch/start.S b/sysdeps/loongarch/start.S -index cf0a14b5..b83221e4 100644 ---- a/sysdeps/loongarch/start.S -+++ b/sysdeps/loongarch/start.S -@@ -17,7 +17,7 @@ __libc_start_main (int (*main) (int, char **, char **), - void *stack_end); - */ - --ENTRY (ENTRY_POINT) -+ENTRY (ENTRY_POINT, 3) - /* Terminate call stack by noting ra is undefined. Use a dummy - .cfi_label to force starting the FDE. */ - .cfi_label .Ldummy -diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h -index f64bfb2b..357a5ba3 100644 ---- a/sysdeps/loongarch/sys/asm.h -+++ b/sysdeps/loongarch/sys/asm.h -@@ -27,15 +27,15 @@ - - - /* Declare leaf routine. */ --#define LEAF(symbol) \ -+#define LEAF(symbol, aln) \ - .text; \ - .globl symbol; \ -- .align 3; \ -+ .align aln; \ - .type symbol, @function; \ - symbol: \ - cfi_startproc; \ - --# define ENTRY(symbol) LEAF(symbol) -+# define ENTRY(symbol, aln) LEAF(symbol, aln) - - #define LEAF_NO_ALIGN(symbol) \ - .text; \ -diff --git a/sysdeps/unix/sysv/linux/loongarch/clone.S b/sysdeps/unix/sysv/linux/loongarch/clone.S -index f0fc566e..1180a11d 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/clone.S -+++ b/sysdeps/unix/sysv/linux/loongarch/clone.S -@@ -29,7 +29,7 @@ - /* int clone(int (*fn)(void *arg), void *child_stack, int flags, void *arg, - void *parent_tidptr, void *tls, void *child_tidptr) */ - --ENTRY (__clone) -+ENTRY (__clone, 3) - - /* Align stack to 16 or 8 bytes per the ABI. */ - #if _LOONGARCH_SIM == _ABILP64 -@@ -74,7 +74,7 @@ L (error): - its own function so that we can terminate the stack trace with our - debug info. 
*/ - --ENTRY (__thread_start) -+ENTRY (__thread_start, 3) - L (thread_start): - /* Terminate call stack by noting ra is undefined. Use a dummy - .cfi_label to force starting the FDE. */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/getcontext.S b/sysdeps/unix/sysv/linux/loongarch/getcontext.S -index 9c28d958..6391850e 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/getcontext.S -+++ b/sysdeps/unix/sysv/linux/loongarch/getcontext.S -@@ -21,7 +21,7 @@ - /* int getcontext (ucontext_t *ucp) */ - - .text --LEAF (__getcontext) -+LEAF (__getcontext, 3) - SAVE_INT_REG (ra, 1, a0) - SAVE_INT_REG (sp, 3, a0) - SAVE_INT_REG (zero, 4, a0) /* return 0 by overwriting a0. */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/setcontext.S b/sysdeps/unix/sysv/linux/loongarch/setcontext.S -index c96ec43c..3a043a63 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/setcontext.S -+++ b/sysdeps/unix/sysv/linux/loongarch/setcontext.S -@@ -28,7 +28,7 @@ - other than the PRESERVED state. */ - - .text --LEAF (__setcontext) -+LEAF (__setcontext, 3) - - addi.d sp, sp, -16 - st.d a0, sp, 0 /* Save ucp to stack. */ -@@ -94,7 +94,7 @@ LEAF (__setcontext) - PSEUDO_END (__setcontext) - weak_alias (__setcontext, setcontext) - --LEAF (__start_context) -+LEAF (__start_context, 3) - - /* Terminate call stack by noting ra == 0. Happily, s0 == 0 here. */ - cfi_register (1, 23) -diff --git a/sysdeps/unix/sysv/linux/loongarch/swapcontext.S b/sysdeps/unix/sysv/linux/loongarch/swapcontext.S -index d839dd87..c9024d5f 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/swapcontext.S -+++ b/sysdeps/unix/sysv/linux/loongarch/swapcontext.S -@@ -20,7 +20,7 @@ - - /* int swapcontext (ucontext_t *oucp, const ucontext_t *ucp) */ - --LEAF (__swapcontext) -+LEAF (__swapcontext, 3) - ori a2, sp, 0 /* Save sp to a2. */ - addi.d sp, sp, -16 - st.d a1, sp, 0 -diff --git a/sysdeps/unix/sysv/linux/loongarch/sysdep.S b/sysdeps/unix/sysv/linux/loongarch/sysdep.S -index a8094283..19c03fb4 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/sysdep.S -+++ b/sysdeps/unix/sysv/linux/loongarch/sysdep.S -@@ -22,13 +22,13 @@ - # define errno __libc_errno - #endif - --ENTRY (__syscall_error) -+ENTRY (__syscall_error, 3) - /* Fall through to __syscall_set_errno. */ - END (__syscall_error) - - /* Non-standard calling convention: argument in a0, return address in t0, - and clobber only t1. */ --ENTRY (__syscall_set_errno) -+ENTRY (__syscall_set_errno, 3) - /* We got here because a0 < 0, but only codes in the range [-4095, -1] - represent errors. Otherwise, just return the result normally. */ - -diff --git a/sysdeps/unix/sysv/linux/loongarch/sysdep.h b/sysdeps/unix/sysv/linux/loongarch/sysdep.h -index f50946d4..7b45f609 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/sysdep.h -+++ b/sysdeps/unix/sysv/linux/loongarch/sysdep.h -@@ -14,7 +14,7 @@ - errors by setting a0 to a value between -1 and -4095. */ - # undef PSEUDO - # define PSEUDO(name, syscall_name, args) \ -- ENTRY (name); \ -+ ENTRY (name, 3); \ - li.d a7, SYS_ify (syscall_name); \ - syscall 0; \ - li.d a7, -4096; \ -@@ -58,7 +58,7 @@ - /* Performs a system call, not setting errno. 
*/ - # undef PSEUDO_NEORRNO - # define PSEUDO_NOERRNO(name, syscall_name, args) \ -- ENTRY (name); \ -+ ENTRY (name, 3); \ - li.d a7, SYS_ify (syscall_name); \ - syscall 0; - -diff --git a/sysdeps/unix/sysv/linux/loongarch/vfork.S b/sysdeps/unix/sysv/linux/loongarch/vfork.S -index 83cf141f..5db6720a 100644 ---- a/sysdeps/unix/sysv/linux/loongarch/vfork.S -+++ b/sysdeps/unix/sysv/linux/loongarch/vfork.S -@@ -25,7 +25,7 @@ - replaced by a call to `execve'. Return -1 for errors, 0 to the new process, - and the process ID of the new process to the old process. */ - --ENTRY (__vfork) -+ENTRY (__vfork, 3) - - - li.d a0, 0x4111 /* CLONE_VM | CLONE_VFORK | SIGCHLD */ --- -2.33.0 - diff --git a/glibc-Add-Hygon-Support.patch b/glibc-Add-Hygon-Support.patch deleted file mode 100644 index c108f84..0000000 --- a/glibc-Add-Hygon-Support.patch +++ /dev/null @@ -1,28 +0,0 @@ -From ed64d30125f855e25ac6f12d8863857dfd3e2cbe Mon Sep 17 00:00:00 2001 -From: lijing22222 -Date: Fri, 1 Mar 2024 16:00:15 +0800 -Subject: [PATCH] Add Hygon Support - ---- - sysdeps/x86/cpu-features.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c -index 91042505..0ce37a9a 100644 ---- a/sysdeps/x86/cpu-features.c -+++ b/sysdeps/x86/cpu-features.c -@@ -527,8 +527,9 @@ init_cpu_features (struct cpu_features *cpu_features) - cpu_features->preferred[index_arch_Prefer_No_AVX512] - |= bit_arch_Prefer_No_AVX512; - } -- /* This spells out "AuthenticAMD". */ -- else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) -+ /* This spells out "AuthenticAMD" or "HygonGenuine". */ -+ else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)||(ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)) -+ - { - unsigned int extended_model; - --- -2.17.1 - diff --git a/glibc-Add-a-testcase-to-check-alignment-of-PT_LOAD-segment-2.patch b/glibc-Add-a-testcase-to-check-alignment-of-PT_LOAD-segment-2.patch deleted file mode 100644 index 73f1a06..0000000 --- a/glibc-Add-a-testcase-to-check-alignment-of-PT_LOAD-segment-2.patch +++ /dev/null @@ -1,147 +0,0 @@ -From 58f93dff514cc0bdf3c72eff590dcf5fe5bf9e00 Mon Sep 17 00:00:00 2001 -From: "H.J. 
Lu" -Date: Wed, 19 Jul 2023 23:09:09 +0800 -Subject: [PATCH 3/6] Add a testcase to check alignment of PT_LOAD segment [BZ - #28676] - -Backport from master commit: fc2334a - -Signed-off-by: Rongwei Wang ---- - elf/Makefile | 13 ++++++++++++- - elf/tst-align3.c | 38 ++++++++++++++++++++++++++++++++++++++ - elf/tst-alignmod3.c | 32 ++++++++++++++++++++++++++++++++ - 3 files changed, 82 insertions(+), 1 deletion(-) - create mode 100644 elf/tst-align3.c - create mode 100644 elf/tst-alignmod3.c - -diff --git a/elf/Makefile b/elf/Makefile -index 634c3113..442817ca 100644 ---- a/elf/Makefile -+++ b/elf/Makefile -@@ -331,6 +331,7 @@ tests += \ - tst-addr1 \ - tst-align \ - tst-align2 \ -+ tst-align3 \ - tst-audit-tlsdesc \ - tst-audit-tlsdesc-dlopen \ - tst-audit1 \ -@@ -466,7 +467,9 @@ endif - test-srcs = \ - tst-pathopt - # tests-srcs -- -+ifeq (yes,$(have-fpie)) -+tests-pie += tst-align3 -+endif - selinux-enabled := $(shell cat /selinux/enforce 2> /dev/null) - - ifneq ($(selinux-enabled),1) -@@ -647,6 +650,7 @@ modules-names = \ - tst-absolute-zero-lib \ - tst-alignmod \ - tst-alignmod2 \ -+ tst-alignmod3 \ - tst-array2dep \ - tst-array5dep \ - tst-audit-tlsdesc-mod1 \ -@@ -1669,6 +1673,13 @@ CFLAGS-tst-alignmod2.c += $(stack-align-test-flags) - $(objpfx)tst-align: $(libdl) - $(objpfx)tst-align.out: $(objpfx)tst-alignmod.so - $(objpfx)tst-align2: $(objpfx)tst-alignmod2.so -+$(objpfx)tst-align3: $(objpfx)tst-alignmod3.so -+ifeq (yes,$(have-fpie)) -+CFLAGS-tst-align3.c += $(PIE-ccflag) -+endif -+LDFLAGS-tst-align3 += -Wl,-z,max-page-size=0x200000 -+LDFLAGS-tst-alignmod3.so += -Wl,-z,max-page-size=0x200000 -+$(objpfx)tst-alignmod3.so: $(libsupport) - - $(objpfx)unload3: $(libdl) - $(objpfx)unload3.out: $(objpfx)unload3mod1.so $(objpfx)unload3mod2.so \ -diff --git a/elf/tst-align3.c b/elf/tst-align3.c -new file mode 100644 -index 00000000..ac86d623 ---- /dev/null -+++ b/elf/tst-align3.c -@@ -0,0 +1,38 @@ -+/* Check alignment of PT_LOAD segment in a shared library. -+ Copyright (C) 2021 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+/* This should cover all possible page sizes we currently support. */ -+#define ALIGN 0x200000 -+ -+int bar __attribute__ ((aligned (ALIGN))) = 1; -+ -+extern int do_load_test (void); -+ -+static int -+do_test (void) -+{ -+ printf ("bar: %p\n", &bar); -+ TEST_VERIFY (is_aligned (&bar, ALIGN) == 0); -+ -+ return do_load_test (); -+} -+ -+#include -diff --git a/elf/tst-alignmod3.c b/elf/tst-alignmod3.c -new file mode 100644 -index 00000000..0d33f237 ---- /dev/null -+++ b/elf/tst-alignmod3.c -@@ -0,0 +1,32 @@ -+/* Check alignment of PT_LOAD segment in a shared library. -+ Copyright (C) 2021 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+/* This should cover all possible page sizes we currently support. */ -+#define ALIGN 0x200000 -+ -+int foo __attribute__ ((aligned (ALIGN))) = 1; -+ -+void -+do_load_test (void) -+{ -+ printf ("foo: %p\n", &foo); -+ TEST_VERIFY (is_aligned (&foo, ALIGN) == 0); -+} --- -2.27.0 - diff --git a/glibc-Properly-check-stack-alignment-BZ-27901.patch b/glibc-Properly-check-stack-alignment-BZ-27901.patch deleted file mode 100644 index 20dff99..0000000 --- a/glibc-Properly-check-stack-alignment-BZ-27901.patch +++ /dev/null @@ -1,325 +0,0 @@ -From 6152628751bf13f74c9336263a9c22f29ccd8ffb Mon Sep 17 00:00:00 2001 -From: "H.J. Lu" -Date: Wed, 19 Jul 2023 23:01:53 +0800 -Subject: [PATCH 1/6] Properly check stack alignment [BZ #27901] - -1. Replace - -if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) - -which may be optimized out by compiler, with - -int -__attribute__ ((weak, noclone, noinline)) -is_aligned (void *p, int align) -{ - return (((uintptr_t) p) & (align - 1)) != 0; -} - -2. Add TEST_STACK_ALIGN_INIT to TEST_STACK_ALIGN. -3. Add a common TEST_STACK_ALIGN_INIT to check 16-byte stack alignment -for both i386 and x86-64. -4. Update powerpc to use TEST_STACK_ALIGN_INIT. - -Reviewed-by: Carlos O'Donell -Signed-off-by: Rongwei Wang ---- - sysdeps/generic/tst-stack-align.h | 40 ++++++++++++++++--------- - sysdeps/i386/i686/tst-stack-align.h | 44 --------------------------- - sysdeps/i386/tst-stack-align.h | 41 ------------------------- - sysdeps/powerpc/tst-stack-align.h | 27 +++++------------ - sysdeps/x86/tst-stack-align.h | 28 ++++++++++++++++++ - sysdeps/x86_64/tst-stack-align.h | 46 ----------------------------- - 6 files changed, 61 insertions(+), 165 deletions(-) - delete mode 100644 sysdeps/i386/i686/tst-stack-align.h - delete mode 100644 sysdeps/i386/tst-stack-align.h - create mode 100644 sysdeps/x86/tst-stack-align.h - delete mode 100644 sysdeps/x86_64/tst-stack-align.h - -diff --git a/sysdeps/generic/tst-stack-align.h b/sysdeps/generic/tst-stack-align.h -index e5cb3310..e6050901 100644 ---- a/sysdeps/generic/tst-stack-align.h -+++ b/sysdeps/generic/tst-stack-align.h -@@ -1,4 +1,5 @@ --/* Copyright (C) 2003-2018 Free Software Foundation, Inc. -+/* Check stack alignment. Generic version. -+ Copyright (C) 2003-2021 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or -@@ -18,17 +19,28 @@ - #include - #include - -+int -+__attribute__ ((weak, noclone, noinline)) -+is_aligned (void *p, int align) -+{ -+ return (((uintptr_t) p) & (align - 1)) != 0; -+} -+ -+#ifndef TEST_STACK_ALIGN_INIT -+# define TEST_STACK_ALIGN_INIT() 0 -+#endif -+ - #define TEST_STACK_ALIGN() \ -- ({ \ -- double _d = 12.0; \ -- long double _ld = 15.0; \ -- int _ret = 0; \ -- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ -- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \ -- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \ -- _ret = 1; \ -- _ret; \ -- }) -+ ({ \ -+ double _d = 12.0; \ -+ long double _ld = 15.0; \ -+ int _ret = TEST_STACK_ALIGN_INIT (); \ -+ \ -+ printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ -+ _ret += is_aligned (&_d, __alignof (double)); \ -+ \ -+ printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, \ -+ __alignof (long double)); \ -+ _ret += is_aligned (&_ld, __alignof (long double)); \ -+ _ret; \ -+ }) -diff --git a/sysdeps/i386/i686/tst-stack-align.h b/sysdeps/i386/i686/tst-stack-align.h -deleted file mode 100644 -index 975f26ef..00000000 ---- a/sysdeps/i386/i686/tst-stack-align.h -+++ /dev/null -@@ -1,44 +0,0 @@ --/* Copyright (C) 2003-2018 Free Software Foundation, Inc. -- This file is part of the GNU C Library. -- -- The GNU C Library is free software; you can redistribute it and/or -- modify it under the terms of the GNU Lesser General Public -- License as published by the Free Software Foundation; either -- version 2.1 of the License, or (at your option) any later version. -- -- The GNU C Library is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- Lesser General Public License for more details. -- -- You should have received a copy of the GNU Lesser General Public -- License along with the GNU C Library; if not, see -- . */ -- --#include --#include --#ifndef __SSE__ --#include_next --#else --#include -- --#define TEST_STACK_ALIGN() \ -- ({ \ -- __m128 _m; \ -- double _d = 12.0; \ -- long double _ld = 15.0; \ -- int _ret = 0; \ -- printf ("__m128: %p %zu\n", &_m, __alignof (__m128)); \ -- if ((((uintptr_t) &_m) & (__alignof (__m128) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ -- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \ -- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \ -- _ret = 1; \ -- _ret; \ -- }) --#endif -diff --git a/sysdeps/i386/tst-stack-align.h b/sysdeps/i386/tst-stack-align.h -deleted file mode 100644 -index 394ff773..00000000 ---- a/sysdeps/i386/tst-stack-align.h -+++ /dev/null -@@ -1,41 +0,0 @@ --/* Copyright (C) 2004-2018 Free Software Foundation, Inc. -- This file is part of the GNU C Library. -- -- The GNU C Library is free software; you can redistribute it and/or -- modify it under the terms of the GNU Lesser General Public -- License as published by the Free Software Foundation; either -- version 2.1 of the License, or (at your option) any later version. 
-- -- The GNU C Library is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- Lesser General Public License for more details. -- -- You should have received a copy of the GNU Lesser General Public -- License along with the GNU C Library; if not, see -- . */ -- --#include --#include -- --typedef struct { int i[4]; } int_al16 __attribute__((aligned (16))); -- --#define TEST_STACK_ALIGN() \ -- ({ \ -- int_al16 _m; \ -- double _d = 12.0; \ -- long double _ld = 15.0; \ -- int _ret = 0; \ -- printf ("int_al16: %p %zu\n", &_m, __alignof (int_al16)); \ -- if ((((uintptr_t) &_m) & (__alignof (int_al16) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ -- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \ -- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \ -- _ret = 1; \ -- _ret; \ -- }) -diff --git a/sysdeps/powerpc/tst-stack-align.h b/sysdeps/powerpc/tst-stack-align.h -index 7fd7013b..d7400b28 100644 ---- a/sysdeps/powerpc/tst-stack-align.h -+++ b/sysdeps/powerpc/tst-stack-align.h -@@ -1,4 +1,5 @@ --/* Copyright (C) 2005-2018 Free Software Foundation, Inc. -+/* Check stack alignment. PowerPC version. -+ Copyright (C) 2005-2021 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or -@@ -15,10 +16,7 @@ - License along with the GNU C Library; if not, see - . */ - --#include --#include -- --#define TEST_STACK_ALIGN() \ -+#define TEST_STACK_ALIGN_INIT() \ - ({ \ - /* Altivec __vector int etc. needs 16byte aligned stack. \ - Instead of using altivec.h here, use aligned attribute instead. */ \ -@@ -27,20 +25,9 @@ - int _i __attribute__((aligned (16))); \ - int _j[3]; \ - } _s = { ._i = 18, ._j[0] = 19, ._j[1] = 20, ._j[2] = 21 }; \ -- double _d = 12.0; \ -- long double _ld = 15.0; \ -- int _ret = 0; \ - printf ("__vector int: { %d, %d, %d, %d } %p %zu\n", _s._i, _s._j[0], \ - _s._j[1], _s._j[2], &_s, __alignof (_s)); \ -- if ((((uintptr_t) &_s) & (__alignof (_s) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ -- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \ -- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \ -- _ret = 1; \ -- _ret; \ -- }) -+ is_aligned (&_s, __alignof (_s)); \ -+ }) -+ -+#include_next -diff --git a/sysdeps/x86/tst-stack-align.h b/sysdeps/x86/tst-stack-align.h -new file mode 100644 -index 00000000..02ecc72d ---- /dev/null -+++ b/sysdeps/x86/tst-stack-align.h -@@ -0,0 +1,28 @@ -+/* Check stack alignment. X86 version. -+ Copyright (C) 2021 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+typedef struct { int i[16]; } int_al16 __attribute__((aligned (16))); -+ -+#define TEST_STACK_ALIGN_INIT() \ -+ ({ \ -+ int_al16 _m; \ -+ printf ("int_al16: %p %zu\n", &_m, __alignof (int_al16)); \ -+ is_aligned (&_m, __alignof (int_al16)); \ -+ }) -+ -+#include_next -diff --git a/sysdeps/x86_64/tst-stack-align.h b/sysdeps/x86_64/tst-stack-align.h -deleted file mode 100644 -index b2ef77f6..00000000 ---- a/sysdeps/x86_64/tst-stack-align.h -+++ /dev/null -@@ -1,46 +0,0 @@ --/* Copyright (C) 2003-2018 Free Software Foundation, Inc. -- This file is part of the GNU C Library. -- -- The GNU C Library is free software; you can redistribute it and/or -- modify it under the terms of the GNU Lesser General Public -- License as published by the Free Software Foundation; either -- version 2.1 of the License, or (at your option) any later version. -- -- The GNU C Library is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- Lesser General Public License for more details. -- -- You should have received a copy of the GNU Lesser General Public -- License along with the GNU C Library; if not, see -- . */ -- --#include --#include -- --#define TEST_STACK_ALIGN() \ -- ({ \ -- /* AMD64 ABI mandates 16byte aligned stack. \ -- Unfortunately, current GCC doesn't support __int128 or __float128 \ -- types, so use aligned attribute instead. */ \ -- struct _S \ -- { \ -- int _i __attribute__((aligned (16))); \ -- int _pad[3]; \ -- } _s = { ._i = 18 }; \ -- double _d = 12.0; \ -- long double _ld = 15.0; \ -- int _ret = 0; \ -- printf ("__int128: %d %p %zu\n", _s._i, &_s, __alignof (_s)); \ -- if ((((uintptr_t) &_s) & (__alignof (_s) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ -- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \ -- _ret = 1; \ -- \ -- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \ -- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \ -- _ret = 1; \ -- _ret; \ -- }) --- -2.27.0 - diff --git a/glibc-RHEL-21519.patch b/glibc-RHEL-10481.patch similarity index 99% rename from glibc-RHEL-21519.patch rename to glibc-RHEL-10481.patch index 7e22998..825f13c 100644 --- a/glibc-RHEL-21519.patch +++ b/glibc-RHEL-10481.patch @@ -16,7 +16,7 @@ Date: Thu Nov 16 19:55:35 2023 +0100 Reviewed-by: Adhemerval Zanella diff --git a/elf/dl-close.c b/elf/dl-close.c -index 22225efb3226c3e1..16a39f5bf17b440f 100644 +index 66524b6708c59f29..8107c2d5f6ad2bc6 100644 --- a/elf/dl-close.c +++ b/elf/dl-close.c @@ -182,6 +182,16 @@ _dl_close_worker (struct link_map *map, bool force) diff --git a/glibc-RHEL-22441.patch b/glibc-RHEL-1192.patch similarity index 100% rename from glibc-RHEL-22441.patch rename to glibc-RHEL-1192.patch diff --git a/glibc-RHEL-13720-1.patch b/glibc-RHEL-13720-1.patch new file mode 100644 index 0000000..5eab70c --- /dev/null +++ b/glibc-RHEL-13720-1.patch @@ -0,0 +1,72 @@ +commit 2aa0974d2573441bffd596b07bff8698b1f2f18c +Author: Florian Weimer +Date: Fri Oct 20 14:29:50 2023 +0200 + + elf: ldconfig should skip temporary files created by package managers + + This avoids crashes due to partially written files, after a package + update is interrupted. 
+ + Reviewed-by: Adhemerval Zanella + +Conflicts: + elf/ldconfig.c + (missing alloca removal downstream) + +diff --git a/elf/ldconfig.c b/elf/ldconfig.c +index 8c66d7e5426d8cc4..51de08f91fbaf093 100644 +--- a/elf/ldconfig.c ++++ b/elf/ldconfig.c +@@ -771,6 +771,31 @@ struct dlib_entry + struct dlib_entry *next; + }; + ++/* Skip some temporary DSO files. These files may be partially written ++ and lead to ldconfig crashes when examined. */ ++static bool ++skip_dso_based_on_name (const char *name, size_t len) ++{ ++ /* Skip temporary files created by the prelink program. Files with ++ names like these are never really DSOs we want to look at. */ ++ if (len >= sizeof (".#prelink#") - 1) ++ { ++ if (strcmp (name + len - sizeof (".#prelink#") + 1, ++ ".#prelink#") == 0) ++ return true; ++ if (len >= sizeof (".#prelink#.XXXXXX") - 1 ++ && memcmp (name + len - sizeof (".#prelink#.XXXXXX") ++ + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0) ++ return true; ++ } ++ /* Skip temporary files created by RPM. */ ++ if (memchr (name, len, ';') != NULL) ++ return true; ++ /* Skip temporary files created by dpkg. */ ++ if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0) ++ return true; ++ return false; ++} + + static void + search_dir (const struct dir_entry *entry) +@@ -849,18 +874,8 @@ search_dir (const struct dir_entry *entry) + continue; + + size_t len = strlen (direntry->d_name); +- /* Skip temporary files created by the prelink program. Files with +- names like these are never really DSOs we want to look at. */ +- if (len >= sizeof (".#prelink#") - 1) +- { +- if (strcmp (direntry->d_name + len - sizeof (".#prelink#") + 1, +- ".#prelink#") == 0) +- continue; +- if (len >= sizeof (".#prelink#.XXXXXX") - 1 +- && memcmp (direntry->d_name + len - sizeof (".#prelink#.XXXXXX") +- + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0) +- continue; +- } ++ if (skip_dso_based_on_name (direntry->d_name, len)) ++ continue; + len += strlen (entry->path) + 2; + if (len > file_name_len) + { diff --git a/glibc-RHEL-13720-2.patch b/glibc-RHEL-13720-2.patch new file mode 100644 index 0000000..69d5a90 --- /dev/null +++ b/glibc-RHEL-13720-2.patch @@ -0,0 +1,61 @@ +commit cfb5a97a93ea656e3b2263e42142a4032986d9ba +Author: Florian Weimer +Date: Mon Oct 23 12:53:16 2023 +0200 + + ldconfig: Fixes for skipping temporary files. + + Arguments to a memchr call were swapped, causing incorrect skipping + of files. + + Files related to dpkg have different names: they actually end in + .dpkg-new and .dpkg-tmp, not .tmp as I mistakenly assumed. + + Fixes commit 2aa0974d2573441bffd59 ("elf: ldconfig should skip + temporary files created by package managers"). + +diff --git a/elf/ldconfig.c b/elf/ldconfig.c +index 51de08f91fbaf093..fb19dd68d41c07a4 100644 +--- a/elf/ldconfig.c ++++ b/elf/ldconfig.c +@@ -771,6 +771,17 @@ struct dlib_entry + struct dlib_entry *next; + }; + ++/* Return true if the N bytes at NAME end with with the characters in ++ the string SUFFIX. (NAME[N + 1] does not have to be a null byte.) ++ Expected to be called with a string literal for SUFFIX. */ ++static inline bool ++endswithn (const char *name, size_t n, const char *suffix) ++{ ++ return (n >= strlen (suffix) ++ && memcmp (name + n - strlen (suffix), suffix, ++ strlen (suffix)) == 0); ++} ++ + /* Skip some temporary DSO files. These files may be partially written + and lead to ldconfig crashes when examined. 
*/ + static bool +@@ -780,8 +791,7 @@ skip_dso_based_on_name (const char *name, size_t len) + names like these are never really DSOs we want to look at. */ + if (len >= sizeof (".#prelink#") - 1) + { +- if (strcmp (name + len - sizeof (".#prelink#") + 1, +- ".#prelink#") == 0) ++ if (endswithn (name, len, ".#prelink#")) + return true; + if (len >= sizeof (".#prelink#.XXXXXX") - 1 + && memcmp (name + len - sizeof (".#prelink#.XXXXXX") +@@ -789,10 +799,11 @@ skip_dso_based_on_name (const char *name, size_t len) + return true; + } + /* Skip temporary files created by RPM. */ +- if (memchr (name, len, ';') != NULL) ++ if (memchr (name, ';', len) != NULL) + return true; + /* Skip temporary files created by dpkg. */ +- if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0) ++ if (endswithn (name, len, ".dpkg-new") ++ || endswithn (name, len, ".dpkg-tmp")) + return true; + return false; + } diff --git a/glibc-RHEL-15696-1.patch b/glibc-RHEL-15696-1.patch new file mode 100644 index 0000000..804de54 --- /dev/null +++ b/glibc-RHEL-15696-1.patch @@ -0,0 +1,259 @@ +From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:23:59 -0800 +Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes memchr/wmemchr for x32. Tested on x86-64 and x32. On +x86-64, libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/memchr.S: Use RDX_LP for length. Clear the + upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and + tst-size_t-wmemchr. + * sysdeps/x86_64/x32/test-size_t.h: New file. + * sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise. + * sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise. +--- + sysdeps/x86_64/memchr.S | 10 ++-- + sysdeps/x86_64/multiarch/memchr-avx2.S | 8 ++- + sysdeps/x86_64/x32/Makefile | 8 +++ + sysdeps/x86_64/x32/test-size_t.h | 35 ++++++++++++ + sysdeps/x86_64/x32/tst-size_t-memchr.c | 72 +++++++++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++ + 6 files changed, 148 insertions(+), 5 deletions(-) + create mode 100644 sysdeps/x86_64/x32/test-size_t.h + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c + +Conflicts: + ChangeLog + (removed) + NEWS + (removed) + +diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S +index feef5d4f..cb320257 100644 +--- a/sysdeps/x86_64/memchr.S ++++ b/sysdeps/x86_64/memchr.S +@@ -34,12 +34,16 @@ ENTRY(MEMCHR) + mov %edi, %ecx + + #ifdef USE_AS_WMEMCHR +- test %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz L(return_null) +- shl $2, %rdx ++ shl $2, %RDX_LP + #else ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. 
*/ ++ movl %edx, %edx ++# endif + punpcklbw %xmm1, %xmm1 +- test %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz L(return_null) + punpcklbw %xmm1, %xmm1 + #endif +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S +index 5f5e7725..c81da19b 100644 +--- a/sysdeps/x86_64/multiarch/memchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S +@@ -40,16 +40,20 @@ + ENTRY (MEMCHR) + # ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ +- testq %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz L(null) + # endif + movl %edi, %ecx + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + # ifdef USE_AS_WMEMCHR +- shl $2, %rdx ++ shl $2, %RDX_LP + vpbroadcastd %xmm0, %ymm0 + # else ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif + vpbroadcastb %xmm0, %ymm0 + # endif + /* Check if we may cross page boundary with one vector load. */ +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index f2ebc24f..7d528889 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -4,3 +4,11 @@ ifeq ($(subdir),math) + # 64-bit llround. Add -fno-builtin-lround to silence the compiler. + CFLAGS-s_llround.c += -fno-builtin-lround + endif ++ ++ifeq ($(subdir),string) ++tests += tst-size_t-memchr ++endif ++ ++ifeq ($(subdir),wcsmbs) ++tests += tst-size_t-wmemchr ++endif +diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h +new file mode 100644 +index 00000000..78a94086 +--- /dev/null ++++ b/sysdeps/x86_64/x32/test-size_t.h +@@ -0,0 +1,35 @@ ++/* Test string/memory functions with size_t in the lower 32 bits of ++ 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define TEST_MAIN ++#include ++ ++/* On x32, parameter_t may be passed in a 64-bit register with the LEN ++ field in the lower 32 bits. When the LEN field of 64-bit register ++ is passed to string/memory function as the size_t parameter, only ++ the lower 32 bits can be used. */ ++typedef struct ++{ ++ union ++ { ++ size_t len; ++ void (*fn) (void); ++ }; ++ void *p; ++} parameter_t; +diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c +new file mode 100644 +index 00000000..29a3daf1 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c +@@ -0,0 +1,72 @@ ++/* Test memchr with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef WIDE ++# define TEST_NAME "memchr" ++#else ++# define TEST_NAME "wmemchr" ++#endif /* WIDE */ ++#include "test-size_t.h" ++ ++#ifndef WIDE ++# define MEMCHR memchr ++# define CHAR char ++# define UCHAR unsigned char ++#else ++# include ++# define MEMCHR wmemchr ++# define CHAR wchar_t ++# define UCHAR wchar_t ++#endif /* WIDE */ ++ ++IMPL (MEMCHR, 1) ++ ++typedef CHAR * (*proto_t) (const CHAR*, int, size_t); ++ ++static CHAR * ++__attribute__ ((noinline, noclone)) ++do_memchr (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, (uintptr_t) b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 }; ++ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ c.fn = impl->fn; ++ CHAR *res = do_memchr (src, c); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %p != NULL", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c +new file mode 100644 +index 00000000..877801d6 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c +@@ -0,0 +1,20 @@ ++/* Test wmemchr with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-memchr.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-10.patch b/glibc-RHEL-15696-10.patch new file mode 100644 index 0000000..10bd49d --- /dev/null +++ b/glibc-RHEL-15696-10.patch @@ -0,0 +1,41 @@ +From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sun, 9 Jan 2022 16:02:21 -0600 +Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755] +Content-type: text/plain; charset=UTF-8 + +Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to +__wcscmp_avx2. For x86_64 this covers the entire address range so any +length larger could not possibly be used to bound `s1` or `s2`. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. 
+ +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 156c1949..8fb8eedc 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -83,6 +83,16 @@ ENTRY (STRCMP) + je L(char0) + jb L(zero) + # ifdef USE_AS_WCSCMP ++# ifndef __ILP32__ ++ movq %rdx, %rcx ++ /* Check if length could overflow when multiplied by ++ sizeof(wchar_t). Checking top 8 bits will cover all potential ++ overflow cases as well as redirect cases where its impossible to ++ length to bound a valid memory region. In these cases just use ++ 'wcscmp'. */ ++ shrq $56, %rcx ++ jnz __wcscmp_avx2 ++# endif + /* Convert units: from wide to byte char. */ + shl $2, %RDX_LP + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-100.patch b/glibc-RHEL-15696-100.patch new file mode 100644 index 0000000..0e779e4 --- /dev/null +++ b/glibc-RHEL-15696-100.patch @@ -0,0 +1,257 @@ +From 244b415d386487521882debb845a040a4758cb18 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 25 Mar 2022 17:13:33 -0500 +Subject: [PATCH] x86: Small improvements for wcslen +Content-type: text/plain; charset=UTF-8 + +Just a few QOL changes. + 1. Prefer `add` > `lea` as it has high execution units it can run + on. + 2. Don't break macro-fusion between `test` and `jcc` + 3. Reduce code size by removing gratuitous padding bytes (-90 + bytes). + +geometric_mean(N=20) of all benchmarks New / Original: 0.959 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++--------------------- + 1 file changed, 41 insertions(+), 45 deletions(-) + +diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S +index 9f5f7232..254bb030 100644 +--- a/sysdeps/x86_64/wcslen.S ++++ b/sysdeps/x86_64/wcslen.S +@@ -41,82 +41,82 @@ ENTRY (__wcslen) + pxor %xmm0, %xmm0 + + lea 32(%rdi), %rax +- lea 16(%rdi), %rcx ++ addq $16, %rdi + and $-16, %rax + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 
16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax +@@ -133,104 +133,100 @@ L(aligned_64_loop): + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx ++ addq $64, %rax + test %edx, %edx +- lea 64(%rax), %rax + jz L(aligned_64_loop) + + pcmpeqd -64(%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $48, %rdi + test %edx, %edx +- lea 48(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd -32(%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx +- jnz L(exit) +- +- jmp L(aligned_64_loop) ++ jz L(aligned_64_loop) + + .p2align 4 + L(exit): +- sub %rcx, %rax ++ sub %rdi, %rax + shr $2, %rax + test %dl, %dl + jz L(exit_high) + +- mov %dl, %cl +- and $15, %cl ++ andl $15, %edx + jz L(exit_1) + ret + +- .p2align 4 ++ /* No align here. Naturally aligned % 16 == 1. */ + L(exit_high): +- mov %dh, %ch +- and $15, %ch ++ andl $(15 << 8), %edx + jz L(exit_3) + add $2, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_1): + add $1, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_3): + add $3, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail0): +- xor %rax, %rax ++ xorl %eax, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail1): +- mov $1, %rax ++ movl $1, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail2): +- mov $2, %rax ++ movl $2, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail3): +- mov $3, %rax ++ movl $3, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail4): +- mov $4, %rax ++ movl $4, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail5): +- mov $5, %rax ++ movl $5, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail6): +- mov $6, %rax ++ movl $6, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail7): +- mov $7, %rax ++ movl $7, %eax + ret + + END (__wcslen) +-- +GitLab + diff --git a/glibc-RHEL-15696-101.patch b/glibc-RHEL-15696-101.patch new file mode 100644 index 0000000..131ea5b --- /dev/null +++ b/glibc-RHEL-15696-101.patch @@ -0,0 +1,964 @@ +From 7cbc03d03091d5664060924789afe46d30a5477e Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 15 Apr 2022 12:28:00 -0500 +Subject: [PATCH] x86: Remove memcmp-sse4.S +Content-type: text/plain; charset=UTF-8 + +Code didn't actually use any sse4 instructions since `ptest` was +removed in: + +commit 2f9062d7171850451e6044ef78d91ff8c017b9c0 +Author: Noah Goldstein +Date: Wed Nov 10 16:18:56 2021 -0600 + + x86: Shrink memcmp-sse4.S code size + +The new memcmp-sse2 implementation is also faster. + +geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905 + +Note there are two regressions preferring SSE2 for Size = 1 and Size = +65. + +Size = 1: +size, align0, align1, ret, New Time/Old Time + 1, 1, 1, 0, 1.2 + 1, 1, 1, 1, 1.197 + 1, 1, 1, -1, 1.2 + +This is intentional. Size == 1 is significantly less hot based on +profiles of GCC11 and Python3 than sizes [4, 8] (which is made +hotter). 
+ +Python3 Size = 1 -> 13.64% +Python3 Size = [4, 8] -> 60.92% + +GCC11 Size = 1 -> 1.29% +GCC11 Size = [4, 8] -> 33.86% + +size, align0, align1, ret, New Time/Old Time + 4, 4, 4, 0, 0.622 + 4, 4, 4, 1, 0.797 + 4, 4, 4, -1, 0.805 + 5, 5, 5, 0, 0.623 + 5, 5, 5, 1, 0.777 + 5, 5, 5, -1, 0.802 + 6, 6, 6, 0, 0.625 + 6, 6, 6, 1, 0.813 + 6, 6, 6, -1, 0.788 + 7, 7, 7, 0, 0.625 + 7, 7, 7, 1, 0.799 + 7, 7, 7, -1, 0.795 + 8, 8, 8, 0, 0.625 + 8, 8, 8, 1, 0.848 + 8, 8, 8, -1, 0.914 + 9, 9, 9, 0, 0.625 + +Size = 65: +size, align0, align1, ret, New Time/Old Time + 65, 0, 0, 0, 1.103 + 65, 0, 0, 1, 1.216 + 65, 0, 0, -1, 1.227 + 65, 65, 0, 0, 1.091 + 65, 0, 65, 1, 1.19 + 65, 65, 65, -1, 1.215 + +This is because A) the checks in range [65, 96] are now unrolled 2x +and B) because smaller values <= 16 are now given a hotter path. By +contrast the SSE4 version has a branch for Size = 80. The unrolled +version has get better performance for returns which need both +comparisons. + +size, align0, align1, ret, New Time/Old Time + 128, 4, 8, 0, 0.858 + 128, 4, 8, 1, 0.879 + 128, 4, 8, -1, 0.888 + +As well, out of microbenchmark environments that are not full +predictable the branch will have a real-cost. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/Makefile | 2 - + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 - + sysdeps/x86_64/multiarch/memcmp-sse4.S | 804 --------------------- + 4 files changed, 814 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index bca82e38..b503e4b8 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -11,7 +11,6 @@ sysdep_routines += \ + memcmp-avx2-movbe-rtm \ + memcmp-evex-movbe \ + memcmp-sse2 \ +- memcmp-sse4 \ + memcmp-ssse3 \ + memcpy-ssse3 \ + memcpy-ssse3-back \ +@@ -174,7 +173,6 @@ sysdep_routines += \ + wmemcmp-avx2-movbe-rtm \ + wmemcmp-c \ + wmemcmp-evex-movbe \ +- wmemcmp-sse4 \ + wmemcmp-ssse3 \ + # sysdep_routines + endif +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 14314367..450a2917 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_evex_movbe) +- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), +- __memcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), + __memcmp_ssse3) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) +@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_evex_movbe) +- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), +- __wmemcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), + __wmemcmp_ssse3) + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 690dffe8..0bc47a7f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -21,7 +21,6 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE 
(sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; +@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (avx2_movbe); + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) +- return OPTIMIZE (sse4_1); +- + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) + return OPTIMIZE (ssse3); + +diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S +deleted file mode 100644 +index 50060006..00000000 +--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S ++++ /dev/null +@@ -1,804 +0,0 @@ +-/* memcmp with SSE4.1, wmemcmp with SSE4.1 +- Copyright (C) 2010-2018 Free Software Foundation, Inc. +- Contributed by Intel Corporation. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#if IS_IN (libc) +- +-# include +- +-# ifndef MEMCMP +-# define MEMCMP __memcmp_sse4_1 +-# endif +- +-#ifdef USE_AS_WMEMCMP +-# define CMPEQ pcmpeqd +-# define CHAR_SIZE 4 +-#else +-# define CMPEQ pcmpeqb +-# define CHAR_SIZE 1 +-#endif +- +- +-/* Warning! +- wmemcmp has to use SIGNED comparison for elements. +- memcmp has to use UNSIGNED comparison for elemnts. +-*/ +- +- .section .text.sse4.1,"ax",@progbits +-ENTRY (MEMCMP) +-# ifdef USE_AS_WMEMCMP +- shl $2, %RDX_LP +-# elif defined __ILP32__ +- /* Clear the upper 32 bits. */ +- mov %edx, %edx +-# endif +- cmp $79, %RDX_LP +- ja L(79bytesormore) +- +- cmp $CHAR_SIZE, %RDX_LP +- jbe L(firstbyte) +- +- /* N in (CHAR_SIZE, 79) bytes. 
*/ +- cmpl $32, %edx +- ja L(more_32_bytes) +- +- cmpl $16, %edx +- jae L(16_to_32_bytes) +- +-# ifndef USE_AS_WMEMCMP +- cmpl $8, %edx +- jae L(8_to_16_bytes) +- +- cmpl $4, %edx +- jb L(2_to_3_bytes) +- +- movl (%rdi), %eax +- movl (%rsi), %ecx +- +- bswap %eax +- bswap %ecx +- +- shlq $32, %rax +- shlq $32, %rcx +- +- movl -4(%rdi, %rdx), %edi +- movl -4(%rsi, %rdx), %esi +- +- bswap %edi +- bswap %esi +- +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- cmovne %edx, %eax +- sbbl %ecx, %ecx +- orl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(2_to_3_bytes): +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- subl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(8_to_16_bytes): +- movq (%rdi), %rax +- movq (%rsi), %rcx +- +- bswap %rax +- bswap %rcx +- +- subq %rcx, %rax +- jne L(8_to_16_bytes_done) +- +- movq -8(%rdi, %rdx), %rax +- movq -8(%rsi, %rdx), %rcx +- +- bswap %rax +- bswap %rcx +- +- subq %rcx, %rax +- +-L(8_to_16_bytes_done): +- cmovne %edx, %eax +- sbbl %ecx, %ecx +- orl %ecx, %eax +- ret +-# else +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(8_to_16_bytes_done) +- movl 4(%rdi), %ecx +- cmpl 4(%rsi), %ecx +- jne L(8_to_16_bytes_done) +- movl -4(%rdi, %rdx), %ecx +- cmpl -4(%rsi, %rdx), %ecx +- jne L(8_to_16_bytes_done) +- ret +-# endif +- +- .p2align 4,, 3 +-L(ret_zero): +- xorl %eax, %eax +-L(zero): +- ret +- +- .p2align 4,, 8 +-L(firstbyte): +- jb L(ret_zero) +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- je L(zero) +-L(8_to_16_bytes_done): +- setg %al +- leal -1(%rax, %rax), %eax +-# else +- movzbl (%rdi), %eax +- movzbl (%rsi), %ecx +- sub %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_begin_48): +- addq $16, %rdi +- addq $16, %rsi +-L(vec_return_begin_32): +- bsfl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl 32(%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl 32(%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl 32(%rsi, %rax), %ecx +- movzbl 32(%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_begin_16): +- addq $16, %rdi +- addq $16, %rsi +-L(vec_return_begin): +- bsfl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_end_16): +- subl $16, %edx +-L(vec_return_end): +- bsfl %eax, %eax +- addl %edx, %eax +-# ifdef USE_AS_WMEMCMP +- movl -16(%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl -16(%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl -16(%rsi, %rax), %ecx +- movzbl -16(%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4,, 8 +-L(more_32_bytes): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm0 +- movdqu 16(%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- cmpl $64, %edx +- jbe L(32_to_64_bytes) +- movdqu 32(%rdi), %xmm0 +- movdqu 32(%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- .p2align 4,, 6 +-L(32_to_64_bytes): +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), 
%xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(16_to_32_bytes): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- +- .p2align 4 +-L(79bytesormore): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- +- mov %rsi, %rcx +- and $-16, %rsi +- add $16, %rsi +- sub %rsi, %rcx +- +- sub %rcx, %rdi +- add %rcx, %rdx +- test $0xf, %rdi +- jz L(2aligned) +- +- cmp $128, %rdx +- ja L(128bytesormore) +- +- .p2align 4,, 6 +-L(less128bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- cmp $96, %rdx +- jb L(32_to_64_bytes) +- +- addq $64, %rdi +- addq $64, %rsi +- subq $64, %rdx +- +- .p2align 4,, 6 +-L(last_64_bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(128bytesormore): +- cmp $256, %rdx +- ja L(unaligned_loop) +-L(less256bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $64, %rdi +- addq $64, %rsi +- +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $-128, %rdx +- subq $-64, %rsi +- subq $-64, %rdi +- +- cmp $64, %rdx +- ja L(less128bytes) +- +- cmp $32, %rdx +- ja L(last_64_bytes) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, 
%rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(unaligned_loop): +-# ifdef DATA_CACHE_SIZE_HALF +- mov $DATA_CACHE_SIZE_HALF, %R8_LP +-# else +- mov __x86_data_cache_size_half(%rip), %R8_LP +-# endif +- movq %r8, %r9 +- addq %r8, %r8 +- addq %r9, %r8 +- cmpq %r8, %rdx +- ja L(L2_L3_cache_unaligned) +- sub $64, %rdx +- .p2align 4 +-L(64bytesormore_loop): +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- ja L(64bytesormore_loop) +- +- .p2align 4,, 6 +-L(loop_tail): +- addq %rdx, %rdi +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- addq %rdx, %rsi +- movdqu (%rsi), %xmm4 +- movdqu 16(%rsi), %xmm5 +- movdqu 32(%rsi), %xmm6 +- movdqu 48(%rsi), %xmm7 +- +- CMPEQ %xmm4, %xmm0 +- CMPEQ %xmm5, %xmm1 +- CMPEQ %xmm6, %xmm2 +- CMPEQ %xmm7, %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- ret +- +-L(L2_L3_cache_unaligned): +- subq $64, %rdx +- .p2align 4 +-L(L2_L3_unaligned_128bytes_loop): +- prefetchnta 0x1c0(%rdi) +- prefetchnta 0x1c0(%rsi) +- +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- ja L(L2_L3_unaligned_128bytes_loop) +- jmp L(loop_tail) +- +- +- /* This case is for machines which are sensitive for unaligned +- * instructions. 
*/ +- .p2align 4 +-L(2aligned): +- cmp $128, %rdx +- ja L(128bytesormorein2aligned) +-L(less128bytesin2aligned): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- cmp $96, %rdx +- jb L(32_to_64_bytes) +- +- addq $64, %rdi +- addq $64, %rsi +- subq $64, %rdx +- +- .p2align 4,, 6 +-L(aligned_last_64_bytes): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(128bytesormorein2aligned): +- cmp $256, %rdx +- ja L(aligned_loop) +-L(less256bytesin2alinged): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $64, %rdi +- addq $64, %rsi +- +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $-128, %rdx +- subq $-64, %rsi +- subq $-64, %rdi +- +- cmp $64, %rdx +- ja L(less128bytesin2aligned) +- +- cmp $32, %rdx +- ja L(aligned_last_64_bytes) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(aligned_loop): +-# ifdef DATA_CACHE_SIZE_HALF +- mov $DATA_CACHE_SIZE_HALF, %R8_LP +-# else +- mov __x86_data_cache_size_half(%rip), %R8_LP +-# endif +- movq %r8, %r9 +- addq %r8, %r8 +- addq %r9, %r8 +- cmpq %r8, %rdx +- ja L(L2_L3_cache_aligned) +- +- sub $64, %rdx +- .p2align 4 +-L(64bytesormore_loopin2aligned): +- movdqa (%rdi), %xmm0 +- movdqa 16(%rdi), %xmm1 +- movdqa 32(%rdi), %xmm2 +- movdqa 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- add $64, %rsi +- 
add $64, %rdi +- sub $64, %rdx +- ja L(64bytesormore_loopin2aligned) +- jmp L(loop_tail) +- +-L(L2_L3_cache_aligned): +- subq $64, %rdx +- .p2align 4 +-L(L2_L3_aligned_128bytes_loop): +- prefetchnta 0x1c0(%rdi) +- prefetchnta 0x1c0(%rsi) +- movdqa (%rdi), %xmm0 +- movdqa 16(%rdi), %xmm1 +- movdqa 32(%rdi), %xmm2 +- movdqa 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- addq $64, %rsi +- addq $64, %rdi +- subq $64, %rdx +- ja L(L2_L3_aligned_128bytes_loop) +- jmp L(loop_tail) +- +- .p2align 4 +-L(64bytesormore_loop_end): +- pmovmskb %xmm0, %ecx +- incw %cx +- jnz L(loop_end_ret) +- +- pmovmskb %xmm1, %ecx +- notw %cx +- sall $16, %ecx +- jnz L(loop_end_ret) +- +- pmovmskb %xmm2, %ecx +- notw %cx +- shlq $32, %rcx +- jnz L(loop_end_ret) +- +- addq $48, %rdi +- addq $48, %rsi +- movq %rax, %rcx +- +- .p2align 4,, 6 +-L(loop_end_ret): +- bsfq %rcx, %rcx +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rcx), %eax +- xorl %edx, %edx +- cmpl (%rsi, %rcx), %eax +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rdi, %rcx), %eax +- movzbl (%rsi, %rcx), %ecx +- subl %ecx, %eax +-# endif +- ret +-END (MEMCMP) +-#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-102.patch b/glibc-RHEL-15696-102.patch new file mode 100644 index 0000000..8cb20ad --- /dev/null +++ b/glibc-RHEL-15696-102.patch @@ -0,0 +1,263 @@ +From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 15 Apr 2022 12:28:01 -0500 +Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S +Content-type: text/plain; charset=UTF-8 + +Old code was both inefficient and wasted code size. New code (-62 +bytes) and comparable or better performance in the page cross case. 
+ +geometric_mean(N=20) of page cross cases New / Original: 0.960 + +size, align0, align1, ret, New Time/Old Time + 1, 4095, 0, 0, 1.001 + 1, 4095, 0, 1, 0.999 + 1, 4095, 0, -1, 1.0 + 2, 4094, 0, 0, 1.0 + 2, 4094, 0, 1, 1.0 + 2, 4094, 0, -1, 1.0 + 3, 4093, 0, 0, 1.0 + 3, 4093, 0, 1, 1.0 + 3, 4093, 0, -1, 1.0 + 4, 4092, 0, 0, 0.987 + 4, 4092, 0, 1, 1.0 + 4, 4092, 0, -1, 1.0 + 5, 4091, 0, 0, 0.984 + 5, 4091, 0, 1, 1.002 + 5, 4091, 0, -1, 1.005 + 6, 4090, 0, 0, 0.993 + 6, 4090, 0, 1, 1.001 + 6, 4090, 0, -1, 1.003 + 7, 4089, 0, 0, 0.991 + 7, 4089, 0, 1, 1.0 + 7, 4089, 0, -1, 1.001 + 8, 4088, 0, 0, 0.875 + 8, 4088, 0, 1, 0.881 + 8, 4088, 0, -1, 0.888 + 9, 4087, 0, 0, 0.872 + 9, 4087, 0, 1, 0.879 + 9, 4087, 0, -1, 0.883 + 10, 4086, 0, 0, 0.878 + 10, 4086, 0, 1, 0.886 + 10, 4086, 0, -1, 0.873 + 11, 4085, 0, 0, 0.878 + 11, 4085, 0, 1, 0.881 + 11, 4085, 0, -1, 0.879 + 12, 4084, 0, 0, 0.873 + 12, 4084, 0, 1, 0.889 + 12, 4084, 0, -1, 0.875 + 13, 4083, 0, 0, 0.873 + 13, 4083, 0, 1, 0.863 + 13, 4083, 0, -1, 0.863 + 14, 4082, 0, 0, 0.838 + 14, 4082, 0, 1, 0.869 + 14, 4082, 0, -1, 0.877 + 15, 4081, 0, 0, 0.841 + 15, 4081, 0, 1, 0.869 + 15, 4081, 0, -1, 0.876 + 16, 4080, 0, 0, 0.988 + 16, 4080, 0, 1, 0.99 + 16, 4080, 0, -1, 0.989 + 17, 4079, 0, 0, 0.978 + 17, 4079, 0, 1, 0.981 + 17, 4079, 0, -1, 0.98 + 18, 4078, 0, 0, 0.981 + 18, 4078, 0, 1, 0.98 + 18, 4078, 0, -1, 0.985 + 19, 4077, 0, 0, 0.977 + 19, 4077, 0, 1, 0.979 + 19, 4077, 0, -1, 0.986 + 20, 4076, 0, 0, 0.977 + 20, 4076, 0, 1, 0.986 + 20, 4076, 0, -1, 0.984 + 21, 4075, 0, 0, 0.977 + 21, 4075, 0, 1, 0.983 + 21, 4075, 0, -1, 0.988 + 22, 4074, 0, 0, 0.983 + 22, 4074, 0, 1, 0.994 + 22, 4074, 0, -1, 0.993 + 23, 4073, 0, 0, 0.98 + 23, 4073, 0, 1, 0.992 + 23, 4073, 0, -1, 0.995 + 24, 4072, 0, 0, 0.989 + 24, 4072, 0, 1, 0.989 + 24, 4072, 0, -1, 0.991 + 25, 4071, 0, 0, 0.99 + 25, 4071, 0, 1, 0.999 + 25, 4071, 0, -1, 0.996 + 26, 4070, 0, 0, 0.993 + 26, 4070, 0, 1, 0.995 + 26, 4070, 0, -1, 0.998 + 27, 4069, 0, 0, 0.993 + 27, 4069, 0, 1, 0.999 + 27, 4069, 0, -1, 1.0 + 28, 4068, 0, 0, 0.997 + 28, 4068, 0, 1, 1.0 + 28, 4068, 0, -1, 0.999 + 29, 4067, 0, 0, 0.996 + 29, 4067, 0, 1, 0.999 + 29, 4067, 0, -1, 0.999 + 30, 4066, 0, 0, 0.991 + 30, 4066, 0, 1, 1.001 + 30, 4066, 0, -1, 0.999 + 31, 4065, 0, 0, 0.988 + 31, 4065, 0, 1, 0.998 + 31, 4065, 0, -1, 0.998 +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++-------- + 1 file changed, 61 insertions(+), 37 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index 16fc673e..99258cf5 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -429,22 +429,21 @@ L(page_cross_less_vec): + # ifndef USE_AS_WMEMCMP + cmpl $8, %edx + jae L(between_8_15) ++ /* Fall through for [4, 7]. */ + cmpl $4, %edx +- jae L(between_4_7) ++ jb L(between_2_3) + +- /* Load as big endian to avoid branches. */ +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- /* Subtraction is okay because the upper 8 bits are zero. */ +- subl %ecx, %eax ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ /* Fast path for return zero. */ ++ jnz L(ret_nonzero) + /* No ymm register was touched. 
*/ + ret + +@@ -457,9 +456,33 @@ L(one_or_less): + /* No ymm register was touched. */ + ret + ++ .p2align 4,, 5 ++L(ret_nonzero): ++ sbbl %eax, %eax ++ orl $1, %eax ++ /* No ymm register was touched. */ ++ ret ++ ++ .p2align 4,, 2 ++L(zero): ++ xorl %eax, %eax ++ /* No ymm register was touched. */ ++ ret ++ + .p2align 4 + L(between_8_15): +-# endif ++ movbe (%rdi), %rax ++ movbe (%rsi), %rcx ++ subq %rcx, %rax ++ jnz L(ret_nonzero) ++ movbe -8(%rdi, %rdx), %rax ++ movbe -8(%rsi, %rdx), %rcx ++ subq %rcx, %rax ++ /* Fast path for return zero. */ ++ jnz L(ret_nonzero) ++ /* No ymm register was touched. */ ++ ret ++# else + /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 +@@ -475,16 +498,13 @@ L(between_8_15): + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax ++ /* Fast path for return zero. */ + jnz L(return_vec_0) + /* No ymm register was touched. */ + ret ++# endif + +- .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret +- +- .p2align 4 ++ .p2align 4,, 10 + L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + vmovdqu (%rsi), %xmm2 +@@ -501,11 +521,17 @@ L(between_16_31): + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax ++ /* Fast path for return zero. */ + jnz L(return_vec_0) + /* No ymm register was touched. */ + ret + + # ifdef USE_AS_WMEMCMP ++ .p2align 4,, 2 ++L(zero): ++ xorl %eax, %eax ++ ret ++ + .p2align 4 + L(one_or_less): + jb L(zero) +@@ -520,22 +546,20 @@ L(one_or_less): + # else + + .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- jz L(zero_4_7) +- sbbl %eax, %eax +- orl $1, %eax +-L(zero_4_7): ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ bswap %eax ++ bswap %ecx ++ shrl %eax ++ shrl %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ /* Subtraction is okay because the upper bit is zero. */ ++ subl %ecx, %eax + /* No ymm register was touched. */ + ret + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-103.patch b/glibc-RHEL-15696-103.patch new file mode 100644 index 0000000..c080e54 --- /dev/null +++ b/glibc-RHEL-15696-103.patch @@ -0,0 +1,876 @@ +From 5307aa9c1800f36a64c183c091c9af392c1fa75c Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:28 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2 +Content-type: text/plain; charset=UTF-8 + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. + +Geometric Mean of all benchmarks New / Old: 0.741 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +- + sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +- + sysdeps/x86_64/strrchr.S | 510 +++++++++++++++--------- + sysdeps/x86_64/wcsrchr.S | 266 +----------- + 4 files changed, 338 insertions(+), 443 deletions(-) + +Conflicts: + sysdeps/x86_64/wcsrchr.S + (copyright header) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S +index 0ec76fe9..6bb1284b 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S +@@ -17,7 +17,7 @@ + . */ + + #if IS_IN (libc) +-# define strrchr __strrchr_sse2 ++# define STRRCHR __strrchr_sse2 + + # undef weak_alias + # define weak_alias(strrchr, rindex) +diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +index d015e953..f26d53b5 100644 +--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S ++++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +@@ -17,7 +17,6 @@ + . */ + + #if IS_IN (libc) +-# define wcsrchr __wcsrchr_sse2 ++# define STRRCHR __wcsrchr_sse2 + #endif +- + #include "../wcsrchr.S" +diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S +index aca98e7e..a58cc220 100644 +--- a/sysdeps/x86_64/strrchr.S ++++ b/sysdeps/x86_64/strrchr.S +@@ -19,210 +19,360 @@ + + #include + ++#ifndef STRRCHR ++# define STRRCHR strrchr ++#endif ++ ++#ifdef USE_AS_WCSRCHR ++# define PCMPEQ pcmpeqd ++# define CHAR_SIZE 4 ++# define PMINU pminud ++#else ++# define PCMPEQ pcmpeqb ++# define CHAR_SIZE 1 ++# define PMINU pminub ++#endif ++ ++#define PAGE_SIZE 4096 ++#define VEC_SIZE 16 ++ + .text +-ENTRY (strrchr) +- movd %esi, %xmm1 ++ENTRY(STRRCHR) ++ movd %esi, %xmm0 + movq %rdi, %rax +- andl $4095, %eax +- punpcklbw %xmm1, %xmm1 +- cmpq $4032, %rax +- punpcklwd %xmm1, %xmm1 +- pshufd $0, %xmm1, %xmm1 ++ andl $(PAGE_SIZE - 1), %eax ++#ifndef USE_AS_WCSRCHR ++ punpcklbw %xmm0, %xmm0 ++ punpcklwd %xmm0, %xmm0 ++#endif ++ pshufd $0, %xmm0, %xmm0 ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page) +- movdqu (%rdi), %xmm0 ++ ++L(cross_page_continue): ++ movups (%rdi), %xmm1 + pxor %xmm2, %xmm2 +- movdqa %xmm0, %xmm3 +- pcmpeqb %xmm1, %xmm0 +- pcmpeqb %xmm2, %xmm3 +- pmovmskb %xmm0, %ecx +- pmovmskb %xmm3, %edx +- testq %rdx, %rdx +- je L(next_48_bytes) +- leaq -1(%rdx), %rax +- xorq %rdx, %rax +- andq %rcx, %rax +- je L(exit) +- bsrq %rax, %rax ++ PCMPEQ %xmm1, %xmm2 ++ pmovmskb %xmm2, %ecx ++ testl %ecx, %ecx ++ jz L(aligned_more) ++ ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ bsrl %eax, %eax + addq %rdi, %rax ++ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If ++ search CHAR is zero we are correct. Either way `andq ++ -CHAR_SIZE, %rax` gets the correct result. */ ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++L(ret0): + ret + ++ /* Returns for first vec x1/x2 have hard coded backward search ++ path for earlier matches. 
*/ + .p2align 4 +-L(next_48_bytes): +- movdqu 16(%rdi), %xmm4 +- movdqa %xmm4, %xmm5 +- movdqu 32(%rdi), %xmm3 +- pcmpeqb %xmm1, %xmm4 +- pcmpeqb %xmm2, %xmm5 +- movdqu 48(%rdi), %xmm0 +- pmovmskb %xmm5, %edx +- movdqa %xmm3, %xmm5 +- pcmpeqb %xmm1, %xmm3 +- pcmpeqb %xmm2, %xmm5 +- pcmpeqb %xmm0, %xmm2 +- salq $16, %rdx +- pmovmskb %xmm3, %r8d +- pmovmskb %xmm5, %eax +- pmovmskb %xmm2, %esi +- salq $32, %r8 +- salq $32, %rax +- pcmpeqb %xmm1, %xmm0 +- orq %rdx, %rax +- movq %rsi, %rdx +- pmovmskb %xmm4, %esi +- salq $48, %rdx +- salq $16, %rsi +- orq %r8, %rsi +- orq %rcx, %rsi +- pmovmskb %xmm0, %ecx +- salq $48, %rcx +- orq %rcx, %rsi +- orq %rdx, %rax +- je L(loop_header2) +- leaq -1(%rax), %rcx +- xorq %rax, %rcx +- andq %rcx, %rsi +- je L(exit) +- bsrq %rsi, %rsi +- leaq (%rdi,%rsi), %rax ++L(first_vec_x0_test): ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ testl %eax, %eax ++ jz L(ret0) ++ bsrl %eax, %eax ++ addq %r8, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + + .p2align 4 +-L(loop_header2): +- testq %rsi, %rsi +- movq %rdi, %rcx +- je L(no_c_found) +-L(loop_header): +- addq $64, %rdi +- pxor %xmm7, %xmm7 +- andq $-64, %rdi +- jmp L(loop_entry) ++L(first_vec_x1): ++ PCMPEQ %xmm0, %xmm2 ++ pmovmskb %xmm2, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_vec_x0_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret + + .p2align 4 +-L(loop64): +- testq %rdx, %rdx +- cmovne %rdx, %rsi +- cmovne %rdi, %rcx +- addq $64, %rdi +-L(loop_entry): +- movdqa 32(%rdi), %xmm3 +- pxor %xmm6, %xmm6 +- movdqa 48(%rdi), %xmm2 +- movdqa %xmm3, %xmm0 +- movdqa 16(%rdi), %xmm4 +- pminub %xmm2, %xmm0 +- movdqa (%rdi), %xmm5 +- pminub %xmm4, %xmm0 +- pminub %xmm5, %xmm0 +- pcmpeqb %xmm7, %xmm0 +- pmovmskb %xmm0, %eax +- movdqa %xmm5, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %r9d +- movdqa %xmm4, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %edx +- movdqa %xmm3, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- salq $16, %rdx +- pmovmskb %xmm0, %r10d +- movdqa %xmm2, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- salq $32, %r10 +- orq %r10, %rdx +- pmovmskb %xmm0, %r8d +- orq %r9, %rdx +- salq $48, %r8 +- orq %r8, %rdx ++L(first_vec_x1_test): ++ PCMPEQ %xmm0, %xmm2 ++ pmovmskb %xmm2, %eax + testl %eax, %eax +- je L(loop64) +- pcmpeqb %xmm6, %xmm4 +- pcmpeqb %xmm6, %xmm3 +- pcmpeqb %xmm6, %xmm5 +- pmovmskb %xmm4, %eax +- pmovmskb %xmm3, %r10d +- pcmpeqb %xmm6, %xmm2 +- pmovmskb %xmm5, %r9d +- salq $32, %r10 +- salq $16, %rax +- pmovmskb %xmm2, %r8d +- orq %r10, %rax +- orq %r9, %rax +- salq $48, %r8 +- orq %r8, %rax +- leaq -1(%rax), %r8 +- xorq %rax, %r8 +- andq %r8, %rdx +- cmovne %rdi, %rcx +- cmovne %rdx, %rsi +- bsrq %rsi, %rsi +- leaq (%rcx,%rsi), %rax ++ jz L(first_vec_x0_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ PCMPEQ %xmm0, %xmm3 ++ pmovmskb %xmm3, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_vec_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++ /* Save original pointer if match was in VEC 0. 
*/ ++ movq %rdi, %r8 ++ andq $-VEC_SIZE, %rdi ++ ++ movaps VEC_SIZE(%rdi), %xmm2 ++ pxor %xmm3, %xmm3 ++ PCMPEQ %xmm2, %xmm3 ++ pmovmskb %xmm3, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x1) ++ ++ movaps (VEC_SIZE * 2)(%rdi), %xmm3 ++ pxor %xmm4, %xmm4 ++ PCMPEQ %xmm3, %xmm4 ++ pmovmskb %xmm4, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) ++ ++ addq $VEC_SIZE, %rdi ++ /* Save pointer again before realigning. */ ++ movq %rdi, %rsi ++ andq $-(VEC_SIZE * 2), %rdi ++ .p2align 4 ++L(first_loop): ++ /* Do 2x VEC at a time. */ ++ movaps (VEC_SIZE * 2)(%rdi), %xmm4 ++ movaps (VEC_SIZE * 3)(%rdi), %xmm5 ++ /* Since SSE2 no pminud so wcsrchr needs seperate logic for ++ detecting zero. Note if this is found to be a bottleneck it ++ may be worth adding an SSE4.1 wcsrchr implementation. */ ++#ifdef USE_AS_WCSRCHR ++ movaps %xmm5, %xmm6 ++ pxor %xmm8, %xmm8 ++ ++ PCMPEQ %xmm8, %xmm5 ++ PCMPEQ %xmm4, %xmm8 ++ por %xmm5, %xmm8 ++#else ++ movaps %xmm5, %xmm6 ++ PMINU %xmm4, %xmm5 ++#endif ++ ++ movaps %xmm4, %xmm9 ++ PCMPEQ %xmm0, %xmm4 ++ PCMPEQ %xmm0, %xmm6 ++ movaps %xmm6, %xmm7 ++ por %xmm4, %xmm6 ++#ifndef USE_AS_WCSRCHR ++ pxor %xmm8, %xmm8 ++ PCMPEQ %xmm5, %xmm8 ++#endif ++ pmovmskb %xmm8, %ecx ++ pmovmskb %xmm6, %eax ++ ++ addq $(VEC_SIZE * 2), %rdi ++ /* Use `addl` 1) so we can undo it with `subl` and 2) it can ++ macro-fuse with `jz`. */ ++ addl %ecx, %eax ++ jz L(first_loop) ++ ++ /* Check if there is zero match. */ ++ testl %ecx, %ecx ++ jz L(second_loop_match) ++ ++ /* Check if there was a match in last iteration. */ ++ subl %ecx, %eax ++ jnz L(new_match) ++ ++L(first_loop_old_match): ++ PCMPEQ %xmm0, %xmm2 ++ PCMPEQ %xmm0, %xmm3 ++ pmovmskb %xmm2, %ecx ++ pmovmskb %xmm3, %eax ++ addl %eax, %ecx ++ jz L(first_vec_x0_test) ++ /* NB: We could move this shift to before the branch and save a ++ bit of code size / performance on the fall through. The ++ branch leads to the null case which generally seems hotter ++ than char in first 3x VEC. */ ++ sall $16, %eax ++ orl %ecx, %eax ++ ++ bsrl %eax, %eax ++ addq %rsi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(new_match): ++ pxor %xmm6, %xmm6 ++ PCMPEQ %xmm9, %xmm6 ++ pmovmskb %xmm6, %eax ++ sall $16, %ecx ++ orl %eax, %ecx ++ ++ /* We can't reuse either of the old comparisons as since we mask ++ of zeros after first zero (instead of using the full ++ comparison) we can't gurantee no interference between match ++ after end of string and valid match. */ ++ pmovmskb %xmm4, %eax ++ pmovmskb %xmm7, %edx ++ sall $16, %edx ++ orl %edx, %eax ++ ++ leal -1(%ecx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_loop_old_match) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + ++ /* Save minimum state for getting most recent match. We can ++ throw out all previous work. */ + .p2align 4 +-L(no_c_found): +- movl $1, %esi +- xorl %ecx, %ecx +- jmp L(loop_header) ++L(second_loop_match): ++ movq %rdi, %rsi ++ movaps %xmm4, %xmm2 ++ movaps %xmm7, %xmm3 + + .p2align 4 +-L(exit): +- xorl %eax, %eax ++L(second_loop): ++ movaps (VEC_SIZE * 2)(%rdi), %xmm4 ++ movaps (VEC_SIZE * 3)(%rdi), %xmm5 ++ /* Since SSE2 no pminud so wcsrchr needs seperate logic for ++ detecting zero. Note if this is found to be a bottleneck it ++ may be worth adding an SSE4.1 wcsrchr implementation. 
*/ ++#ifdef USE_AS_WCSRCHR ++ movaps %xmm5, %xmm6 ++ pxor %xmm8, %xmm8 ++ ++ PCMPEQ %xmm8, %xmm5 ++ PCMPEQ %xmm4, %xmm8 ++ por %xmm5, %xmm8 ++#else ++ movaps %xmm5, %xmm6 ++ PMINU %xmm4, %xmm5 ++#endif ++ ++ movaps %xmm4, %xmm9 ++ PCMPEQ %xmm0, %xmm4 ++ PCMPEQ %xmm0, %xmm6 ++ movaps %xmm6, %xmm7 ++ por %xmm4, %xmm6 ++#ifndef USE_AS_WCSRCHR ++ pxor %xmm8, %xmm8 ++ PCMPEQ %xmm5, %xmm8 ++#endif ++ ++ pmovmskb %xmm8, %ecx ++ pmovmskb %xmm6, %eax ++ ++ addq $(VEC_SIZE * 2), %rdi ++ /* Either null term or new occurence of CHAR. */ ++ addl %ecx, %eax ++ jz L(second_loop) ++ ++ /* No null term so much be new occurence of CHAR. */ ++ testl %ecx, %ecx ++ jz L(second_loop_match) ++ ++ ++ subl %ecx, %eax ++ jnz L(second_loop_new_match) ++ ++L(second_loop_old_match): ++ pmovmskb %xmm2, %ecx ++ pmovmskb %xmm3, %eax ++ sall $16, %eax ++ orl %ecx, %eax ++ bsrl %eax, %eax ++ addq %rsi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + + .p2align 4 ++L(second_loop_new_match): ++ pxor %xmm6, %xmm6 ++ PCMPEQ %xmm9, %xmm6 ++ pmovmskb %xmm6, %eax ++ sall $16, %ecx ++ orl %eax, %ecx ++ ++ /* We can't reuse either of the old comparisons as since we mask ++ of zeros after first zero (instead of using the full ++ comparison) we can't gurantee no interference between match ++ after end of string and valid match. */ ++ pmovmskb %xmm4, %eax ++ pmovmskb %xmm7, %edx ++ sall $16, %edx ++ orl %edx, %eax ++ ++ leal -1(%ecx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(second_loop_old_match) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4,, 4 + L(cross_page): +- movq %rdi, %rax +- pxor %xmm0, %xmm0 +- andq $-64, %rax +- movdqu (%rax), %xmm5 +- movdqa %xmm5, %xmm6 +- movdqu 16(%rax), %xmm4 +- pcmpeqb %xmm1, %xmm5 +- pcmpeqb %xmm0, %xmm6 +- movdqu 32(%rax), %xmm3 +- pmovmskb %xmm6, %esi +- movdqa %xmm4, %xmm6 +- movdqu 48(%rax), %xmm2 +- pcmpeqb %xmm1, %xmm4 +- pcmpeqb %xmm0, %xmm6 +- pmovmskb %xmm6, %edx +- movdqa %xmm3, %xmm6 +- pcmpeqb %xmm1, %xmm3 +- pcmpeqb %xmm0, %xmm6 +- pcmpeqb %xmm2, %xmm0 +- salq $16, %rdx +- pmovmskb %xmm3, %r9d +- pmovmskb %xmm6, %r8d +- pmovmskb %xmm0, %ecx +- salq $32, %r9 +- salq $32, %r8 +- pcmpeqb %xmm1, %xmm2 +- orq %r8, %rdx +- salq $48, %rcx +- pmovmskb %xmm5, %r8d +- orq %rsi, %rdx +- pmovmskb %xmm4, %esi +- orq %rcx, %rdx +- pmovmskb %xmm2, %ecx +- salq $16, %rsi +- salq $48, %rcx +- orq %r9, %rsi +- orq %r8, %rsi +- orq %rcx, %rsi ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rsi ++ movaps (%rsi), %xmm1 ++ pxor %xmm2, %xmm2 ++ PCMPEQ %xmm1, %xmm2 ++ pmovmskb %xmm2, %edx + movl %edi, %ecx +- subl %eax, %ecx +- shrq %cl, %rdx +- shrq %cl, %rsi +- testq %rdx, %rdx +- je L(loop_header2) +- leaq -1(%rdx), %rax +- xorq %rdx, %rax +- andq %rax, %rsi +- je L(exit) +- bsrq %rsi, %rax ++ andl $(VEC_SIZE - 1), %ecx ++ sarl %cl, %edx ++ jz L(cross_page_continue) ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ sarl %cl, %eax ++ leal -1(%rdx), %ecx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(ret1) ++ bsrl %eax, %eax + addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++L(ret1): + ret +-END (strrchr) ++END(STRRCHR) + +-weak_alias (strrchr, rindex) +-libc_hidden_builtin_def (strrchr) ++#ifndef USE_AS_WCSRCHR ++ weak_alias (STRRCHR, rindex) ++ libc_hidden_builtin_def (STRRCHR) ++#endif +diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S +index 2f388537..ae3cfa7d 100644 +--- a/sysdeps/x86_64/wcsrchr.S ++++ b/sysdeps/x86_64/wcsrchr.S +@@ -17,266 +17,12 @@ + License 
along with the GNU C Library; if not, see + . */ + +-#include + +- .text +-ENTRY (wcsrchr) ++#define USE_AS_WCSRCHR 1 ++#define NO_PMINU 1 + +- movd %rsi, %xmm1 +- mov %rdi, %rcx +- punpckldq %xmm1, %xmm1 +- pxor %xmm2, %xmm2 +- punpckldq %xmm1, %xmm1 +- and $63, %rcx +- cmp $48, %rcx +- ja L(crosscache) ++#ifndef STRRCHR ++# define STRRCHR wcsrchr ++#endif + +- movdqu (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm2 +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm0, %rax +- add $16, %rdi +- +- test %rax, %rax +- jnz L(unaligned_match1) +- +- test %rcx, %rcx +- jnz L(return_null) +- +- and $-16, %rdi +- xor %r8, %r8 +- jmp L(loop) +- +- .p2align 4 +-L(unaligned_match1): +- test %rcx, %rcx +- jnz L(prolog_find_zero_1) +- +- mov %rax, %r8 +- mov %rdi, %rsi +- and $-16, %rdi +- jmp L(loop) +- +- .p2align 4 +-L(crosscache): +- and $15, %rcx +- and $-16, %rdi +- pxor %xmm3, %xmm3 +- movdqa (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm3 +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm3, %rdx +- pmovmskb %xmm0, %rax +- shr %cl, %rdx +- shr %cl, %rax +- add $16, %rdi +- +- test %rax, %rax +- jnz L(unaligned_match) +- +- test %rdx, %rdx +- jnz L(return_null) +- +- xor %r8, %r8 +- jmp L(loop) +- +- .p2align 4 +-L(unaligned_match): +- test %rdx, %rdx +- jnz L(prolog_find_zero) +- +- mov %rax, %r8 +- lea (%rdi, %rcx), %rsi +- +-/* Loop start on aligned string. */ +- .p2align 4 +-L(loop): +- movdqa (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm0, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm3 +- pcmpeqd %xmm3, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm3 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm3, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm4 +- pcmpeqd %xmm4, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm4 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm4, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm5 +- pcmpeqd %xmm5, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm5 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm5, %rax +- or %rax, %rcx +- jz L(loop) +- +- .p2align 4 +-L(matches): +- test %rax, %rax +- jnz L(match) +-L(return_value): +- test %r8, %r8 +- jz L(return_null) +- mov %r8, %rax +- mov %rsi, %rdi +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(match): +- pmovmskb %xmm2, %rcx +- test %rcx, %rcx +- jnz L(find_zero) +- mov %rax, %r8 +- mov %rdi, %rsi +- jmp L(loop) +- +- .p2align 4 +-L(find_zero): +- test $15, %cl +- jnz L(find_zero_in_first_wchar) +- test %cl, %cl +- jnz L(find_zero_in_second_wchar) +- test $15, %ch +- jnz L(find_zero_in_third_wchar) +- +- and $1 << 13 - 1, %rax +- jz L(return_value) +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_first_wchar): +- test $1, %rax +- jz L(return_value) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_second_wchar): +- and $1 << 5 - 1, %rax +- jz L(return_value) +- +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_third_wchar): +- and $1 << 9 - 1, %rax +- jz L(return_value) +- +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero): +- add %rcx, 
%rdi +- mov %rdx, %rcx +-L(prolog_find_zero_1): +- test $15, %cl +- jnz L(prolog_find_zero_in_first_wchar) +- test %cl, %cl +- jnz L(prolog_find_zero_in_second_wchar) +- test $15, %ch +- jnz L(prolog_find_zero_in_third_wchar) +- +- and $1 << 13 - 1, %rax +- jz L(return_null) +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_first_wchar): +- test $1, %rax +- jz L(return_null) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_second_wchar): +- and $1 << 5 - 1, %rax +- jz L(return_null) +- +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_third_wchar): +- and $1 << 9 - 1, %rax +- jz L(return_null) +- +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_second_wchar): +- lea -12(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_third_wchar): +- lea -8(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_fourth_wchar): +- lea -4(%rdi), %rax +- ret +- +- .p2align 4 +-L(return_null): +- xor %rax, %rax +- ret +- +-END (wcsrchr) ++#include "../strrchr.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-104.patch b/glibc-RHEL-15696-104.patch new file mode 100644 index 0000000..1cb312a --- /dev/null +++ b/glibc-RHEL-15696-104.patch @@ -0,0 +1,501 @@ +From df7e295d18ffa34f629578c0017a9881af7620f6 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:29 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2 +Content-type: text/plain; charset=UTF-8 + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. + +Geometric Mean of all benchmarks New / Old: 0.832 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++--------- + 1 file changed, 269 insertions(+), 157 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S +index c949410b..3d26fad4 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S +@@ -27,9 +27,13 @@ + # ifdef USE_AS_WCSRCHR + # define VPBROADCAST vpbroadcastd + # define VPCMPEQ vpcmpeqd ++# define VPMIN vpminud ++# define CHAR_SIZE 4 + # else + # define VPBROADCAST vpbroadcastb + # define VPCMPEQ vpcmpeqb ++# define VPMIN vpminub ++# define CHAR_SIZE 1 + # endif + + # ifndef VZEROUPPER +@@ -41,196 +45,304 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + +- .section SECTION(.text),"ax",@progbits +-ENTRY (STRRCHR) +- movd %esi, %xmm4 +- movl %edi, %ecx ++ .section SECTION(.text), "ax", @progbits ++ENTRY(STRRCHR) ++ movd %esi, %xmm7 ++ movl %edi, %eax + /* Broadcast CHAR to YMM4. */ +- VPBROADCAST %xmm4, %ymm4 ++ VPBROADCAST %xmm7, %ymm7 + vpxor %xmm0, %xmm0, %xmm0 + +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ /* Shift here instead of `andl` to save code size (saves a fetch ++ block). 
*/ ++ sall $20, %eax ++ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax ++ ja L(cross_page) + ++L(page_cross_continue): + vmovdqu (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- addq $VEC_SIZE, %rdi ++ /* Check end of string match. */ ++ VPCMPEQ %ymm1, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ testl %ecx, %ecx ++ jz L(aligned_more) ++ ++ /* Only check match with search CHAR if needed. */ ++ VPCMPEQ %ymm1, %ymm7, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Check if match before first zero. */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If ++ search CHAR is zero we are correct. Either way `andq ++ -CHAR_SIZE, %rax` gets the correct result. */ ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret0): ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++ ++ /* Returns for first vec x1/x2 have hard coded backward search ++ path for earlier matches. */ ++ .p2align 4,, 10 ++L(first_vec_x1): ++ VPCMPEQ %ymm2, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jnz L(first_vec_x1_return) ++ ++ .p2align 4,, 4 ++L(first_vec_x0_test): ++ VPCMPEQ %ymm1, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ testl %eax, %eax ++ jz L(ret1) ++ bsrl %eax, %eax ++ addq %r8, %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret1): ++ VZEROUPPER_RETURN + ++ .p2align 4,, 10 ++L(first_vec_x0_x1_test): ++ VPCMPEQ %ymm2, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ /* Check ymm2 for search CHAR match. If no match then check ymm1 ++ before returning. */ + testl %eax, %eax +- jnz L(first_vec) ++ jz L(first_vec_x0_test) ++ .p2align 4,, 4 ++L(first_vec_x1_return): ++ bsrl %eax, %eax ++ leaq 1(%rdi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN + +- testl %ecx, %ecx +- jnz L(return_null) + +- andq $-VEC_SIZE, %rdi +- xorl %edx, %edx +- jmp L(aligned_loop) ++ .p2align 4,, 10 ++L(first_vec_x2): ++ VPCMPEQ %ymm3, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ blsmskl %ecx, %ecx ++ /* If no in-range search CHAR match in ymm3 then need to check ++ ymm1/ymm2 for an earlier match (we delay checking search ++ CHAR matches until needed). */ ++ andl %ecx, %eax ++ jz L(first_vec_x0_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN ++ + + .p2align 4 +-L(first_vec): +- /* Check if there is a nul CHAR. */ ++L(aligned_more): ++ /* Save original pointer if match was in VEC 0. */ ++ movq %rdi, %r8 ++ ++ /* Align src. */ ++ orq $(VEC_SIZE - 1), %rdi ++ vmovdqu 1(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx + testl %ecx, %ecx +- jnz L(char_and_nul_in_first_vec) ++ jnz L(first_vec_x1) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- movq %rdi, %rsi +- andq $-VEC_SIZE, %rdi +- jmp L(aligned_loop) ++ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3 ++ VPCMPEQ %ymm3, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) + ++ /* Save pointer again before realigning. 
*/ ++ movq %rdi, %rsi ++ addq $(VEC_SIZE + 1), %rdi ++ andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %edx +- vpmovmskb %ymm3, %eax +- shrl %cl, %edx +- shrl %cl, %eax +- addq $VEC_SIZE, %rdi +- +- /* Check if there is a CHAR. */ ++L(first_aligned_loop): ++ /* Do 2x VEC at a time. Any more and the cost of finding the ++ match outweights loop benefit. */ ++ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 ++ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 ++ ++ VPCMPEQ %ymm4, %ymm7, %ymm6 ++ VPMIN %ymm4, %ymm5, %ymm8 ++ VPCMPEQ %ymm5, %ymm7, %ymm10 ++ vpor %ymm6, %ymm10, %ymm5 ++ VPCMPEQ %ymm8, %ymm0, %ymm8 ++ vpor %ymm5, %ymm8, %ymm9 ++ ++ vpmovmskb %ymm9, %eax ++ addq $(VEC_SIZE * 2), %rdi ++ /* No zero or search CHAR. */ + testl %eax, %eax +- jnz L(found_char) +- +- testl %edx, %edx +- jnz L(return_null) ++ jz L(first_aligned_loop) + +- jmp L(aligned_loop) +- +- .p2align 4 +-L(found_char): +- testl %edx, %edx +- jnz L(char_and_nul) ++ /* If no zero CHAR then go to second loop (this allows us to ++ throw away all prior work). */ ++ vpmovmskb %ymm8, %ecx ++ testl %ecx, %ecx ++ jz L(second_aligned_loop_prep) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- leaq (%rdi, %rcx), %rsi ++ /* Search char could be zero so we need to get the true match. ++ */ ++ vpmovmskb %ymm5, %eax ++ testl %eax, %eax ++ jnz L(first_aligned_loop_return) + +- .p2align 4 +-L(aligned_loop): +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- add $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx ++ .p2align 4,, 4 ++L(first_vec_x1_or_x2): ++ VPCMPEQ %ymm3, %ymm7, %ymm3 ++ VPCMPEQ %ymm2, %ymm7, %ymm2 + vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jz L(aligned_loop) +- +- .p2align 4 +-L(char_nor_null): +- /* Find a CHAR or a nul CHAR in a loop. */ +- testl %eax, %eax +- jnz L(match) +-L(return_value): +- testl %edx, %edx +- jz L(return_null) +- movl %edx, %eax +- movq %rsi, %rdi ++ vpmovmskb %ymm2, %edx ++ /* Use add for macro-fusion. */ ++ addq %rax, %rdx ++ jz L(first_vec_x0_test) ++ /* NB: We could move this shift to before the branch and save a ++ bit of code size / performance on the fall through. The ++ branch leads to the null case which generally seems hotter ++ than char in first 3x VEC. 
*/ ++ salq $32, %rax ++ addq %rdx, %rax ++ bsrq %rax, %rax ++ leaq 1(%rsi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN + ++ .p2align 4,, 8 ++L(first_aligned_loop_return): ++ VPCMPEQ %ymm4, %ymm0, %ymm4 ++ vpmovmskb %ymm4, %edx ++ salq $32, %rcx ++ orq %rdx, %rcx ++ ++ vpmovmskb %ymm10, %eax ++ vpmovmskb %ymm6, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ blsmskq %rcx, %rcx ++ andq %rcx, %rax ++ jz L(first_vec_x1_or_x2) ++ ++ bsrq %rax, %rax ++ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax + # ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %eax ++ andq $-CHAR_SIZE, %rax + # endif +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ VZEROUPPER_RETURN + ++ /* Search char cannot be zero. */ + .p2align 4 +-L(match): +- /* Find a CHAR. Check if there is a nul CHAR. */ +- vpmovmskb %ymm2, %ecx +- testl %ecx, %ecx +- jnz L(find_nul) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx ++L(second_aligned_loop_set_furthest_match): ++ /* Save VEC and pointer from most recent match. */ ++L(second_aligned_loop_prep): + movq %rdi, %rsi +- jmp L(aligned_loop) ++ vmovdqu %ymm6, %ymm2 ++ vmovdqu %ymm10, %ymm3 + + .p2align 4 +-L(find_nul): +-# ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %ecx +- andl $0x11111111, %eax +-# endif +- /* Mask out any matching bits after the nul CHAR. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax ++L(second_aligned_loop): ++ /* Search 2x at at time. */ ++ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 ++ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 ++ ++ VPCMPEQ %ymm4, %ymm7, %ymm6 ++ VPMIN %ymm4, %ymm5, %ymm1 ++ VPCMPEQ %ymm5, %ymm7, %ymm10 ++ vpor %ymm6, %ymm10, %ymm5 ++ VPCMPEQ %ymm1, %ymm0, %ymm1 ++ vpor %ymm5, %ymm1, %ymm9 ++ ++ vpmovmskb %ymm9, %eax ++ addq $(VEC_SIZE * 2), %rdi + testl %eax, %eax +- /* If there is no CHAR here, return the remembered one. */ +- jz L(return_value) +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER_RETURN +- +- .p2align 4 +-L(char_and_nul): +- /* Find both a CHAR and a nul CHAR. */ +- addq %rcx, %rdi +- movl %edx, %ecx +-L(char_and_nul_in_first_vec): +-# ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %ecx +- andl $0x11111111, %eax +-# endif +- /* Mask out any matching bits after the nul CHAR. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax ++ jz L(second_aligned_loop) ++ vpmovmskb %ymm1, %ecx ++ testl %ecx, %ecx ++ jz L(second_aligned_loop_set_furthest_match) ++ vpmovmskb %ymm5, %eax + testl %eax, %eax +- /* Return null pointer if the nul CHAR comes first. */ +- jz L(return_null) +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ jnz L(return_new_match) ++ ++ /* This is the hot patch. We know CHAR is inbounds and that ++ ymm3/ymm2 have latest match. */ ++ .p2align 4,, 4 ++L(return_old_match): ++ vpmovmskb %ymm3, %eax ++ vpmovmskb %ymm2, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ bsrq %rax, %rax ++ /* Search char cannot be zero so safe to just use lea for ++ wcsrchr. */ ++ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax + VZEROUPPER_RETURN + +- .p2align 4 +-L(return_null): +- xorl %eax, %eax ++ /* Last iteration also potentially has a match. 
*/ ++ .p2align 4,, 8 ++L(return_new_match): ++ VPCMPEQ %ymm4, %ymm0, %ymm4 ++ vpmovmskb %ymm4, %edx ++ salq $32, %rcx ++ orq %rdx, %rcx ++ ++ vpmovmskb %ymm10, %eax ++ vpmovmskb %ymm6, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ blsmskq %rcx, %rcx ++ andq %rcx, %rax ++ jz L(return_old_match) ++ bsrq %rax, %rax ++ /* Search char cannot be zero so safe to just use lea for ++ wcsrchr. */ ++ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax + VZEROUPPER_RETURN + +-END (STRRCHR) ++ .p2align 4,, 4 ++L(cross_page): ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rsi ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ /* Shift out zero CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %edi, %ecx, %ecx ++ testl %ecx, %ecx ++ jz L(page_cross_continue) ++ VPCMPEQ %ymm1, %ymm7, %ymm1 ++ vpmovmskb %ymm1, %eax ++ ++ /* Shift out search CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %edi, %eax, %eax ++ blsmskl %ecx, %ecx ++ /* Check if any search CHAR match in range. */ ++ andl %ecx, %eax ++ jz L(ret2) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret2): ++ VZEROUPPER_RETURN ++END(STRRCHR) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-105.patch b/glibc-RHEL-15696-105.patch new file mode 100644 index 0000000..e0a157f --- /dev/null +++ b/glibc-RHEL-15696-105.patch @@ -0,0 +1,558 @@ +From c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:30 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex +Content-type: text/plain; charset=UTF-8 + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. + +Geometric Mean of all benchmarks New / Old: 0.755 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++--------- + 1 file changed, 290 insertions(+), 181 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S +index f920b5a5..f5b6d755 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strrchr-evex.S +@@ -24,242 +24,351 @@ + # define STRRCHR __strrchr_evex + # endif + +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSRCHR ++# define SHIFT_REG esi ++ ++# define kunpck kunpckbw ++# define kmov_2x kmovd ++# define maskz_2x ecx ++# define maskm_2x eax ++# define CHAR_SIZE 4 ++# define VPMIN vpminud ++# define VPTESTN vptestnmd + # define VPBROADCAST vpbroadcastd +-# define VPCMP vpcmpd +-# define SHIFT_REG r8d ++# define VPCMP vpcmpd + # else ++# define SHIFT_REG edi ++ ++# define kunpck kunpckdq ++# define kmov_2x kmovq ++# define maskz_2x rcx ++# define maskm_2x rax ++ ++# define CHAR_SIZE 1 ++# define VPMIN vpminub ++# define VPTESTN vptestnmb + # define VPBROADCAST vpbroadcastb +-# define VPCMP vpcmpb +-# define SHIFT_REG ecx ++# define VPCMP vpcmpb + # endif + + # define XMMZERO xmm16 + # define YMMZERO ymm16 + # define YMMMATCH ymm17 +-# define YMM1 ymm18 ++# define YMMSAVE ymm18 ++ ++# define YMM1 ymm19 ++# define YMM2 ymm20 ++# define YMM3 ymm21 ++# define YMM4 ymm22 ++# define YMM5 ymm23 ++# define YMM6 ymm24 ++# define YMM7 ymm25 ++# define YMM8 ymm26 + +-# define VEC_SIZE 32 + +- .section .text.evex,"ax",@progbits +-ENTRY (STRRCHR) +- movl %edi, %ecx ++# define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++ .section .text.evex, "ax", @progbits ++ENTRY(STRRCHR) ++ movl %edi, %eax + /* Broadcast CHAR to YMMMATCH. */ + VPBROADCAST %esi, %YMMMATCH + +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(cross_page_boundary) + ++L(page_cross_continue): + VMOVU (%rdi), %YMM1 +- +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ /* k0 has a 1 for each zero CHAR in YMM1. */ ++ VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- +- addq $VEC_SIZE, %rdi +- +- testl %eax, %eax +- jnz L(first_vec) +- + testl %ecx, %ecx +- jnz L(return_null) +- +- andq $-VEC_SIZE, %rdi +- xorl %edx, %edx +- jmp L(aligned_loop) +- +- .p2align 4 +-L(first_vec): +- /* Check if there is a null byte. */ +- testl %ecx, %ecx +- jnz L(char_and_nul_in_first_vec) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- movq %rdi, %rsi +- andq $-VEC_SIZE, %rdi +- jmp L(aligned_loop) +- +- .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ jz L(aligned_more) ++ /* fallthrough: zero CHAR in first VEC. */ + ++ /* K1 has a 1 for each search CHAR match in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k1, %eax ++ /* Build mask up until first zero CHAR (used to mask of ++ potential search CHAR matches past the end of the string). ++ */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ /* Get last match (the `andl` removed any out of bounds ++ matches). */ ++ bsrl %eax, %eax + # ifdef USE_AS_WCSRCHR +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 +- bytes. 
*/ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rdi, %rax + # endif ++L(ret0): ++ ret + +- VMOVA (%rdi), %YMM1 +- +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ ++ /* Returns for first vec x1/x2/x3 have hard coded backward ++ search path for earlier matches. */ ++ .p2align 4,, 6 ++L(first_vec_x1): ++ VPCMP $0, %YMMMATCH, %YMM2, %k1 ++ kmovd %k1, %eax ++ blsmskl %ecx, %ecx ++ /* eax non-zero if search CHAR in range. */ ++ andl %ecx, %eax ++ jnz L(first_vec_x1_return) ++ ++ /* fallthrough: no match in YMM2 then need to check for earlier ++ matches (in YMM1). */ ++ .p2align 4,, 4 ++L(first_vec_x0_test): + VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %edx + kmovd %k1, %eax +- +- shrxl %SHIFT_REG, %edx, %edx +- shrxl %SHIFT_REG, %eax, %eax +- addq $VEC_SIZE, %rdi +- +- /* Check if there is a CHAR. */ + testl %eax, %eax +- jnz L(found_char) +- +- testl %edx, %edx +- jnz L(return_null) +- +- jmp L(aligned_loop) +- +- .p2align 4 +-L(found_char): +- testl %edx, %edx +- jnz L(char_and_nul) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- leaq (%rdi, %rcx), %rsi ++ jz L(ret1) ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ leaq (%rsi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rsi, %rax ++# endif ++L(ret1): ++ ret + +- .p2align 4 +-L(aligned_loop): +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi ++ .p2align 4,, 10 ++L(first_vec_x1_or_x2): ++ VPCMP $0, %YMM3, %YMMMATCH, %k3 ++ VPCMP $0, %YMM2, %YMMMATCH, %k2 ++ /* K2 and K3 have 1 for any search CHAR match. Test if any ++ matches between either of them. Otherwise check YMM1. */ ++ kortestd %k2, %k3 ++ jz L(first_vec_x0_test) ++ ++ /* Guranteed that YMM2 and YMM3 are within range so merge the ++ two bitmasks then get last result. */ ++ kunpck %k2, %k3, %k3 ++ kmovq %k3, %rax ++ bsrq %rax, %rax ++ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %ecx ++ .p2align 4,, 6 ++L(first_vec_x3): ++ VPCMP $0, %YMMMATCH, %YMM4, %k1 + kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ blsmskl %ecx, %ecx ++ /* If no search CHAR match in range check YMM1/YMM2/YMM3. */ ++ andl %ecx, %eax ++ jz L(first_vec_x1_or_x2) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- VMOVA (%rdi), %YMM1 +- add $VEC_SIZE, %rdi ++ .p2align 4,, 6 ++L(first_vec_x0_x1_test): ++ VPCMP $0, %YMMMATCH, %YMM2, %k1 ++ kmovd %k1, %eax ++ /* Check YMM2 for last match first. If no match try YMM1. */ ++ testl %eax, %eax ++ jz L(first_vec_x0_test) ++ .p2align 4,, 4 ++L(first_vec_x1_return): ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %ecx ++ .p2align 4,, 10 ++L(first_vec_x2): ++ VPCMP $0, %YMMMATCH, %YMM3, %k1 + kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ blsmskl %ecx, %ecx ++ /* Check YMM3 for last match first. If no match try YMM2/YMM1. 
++ */ ++ andl %ecx, %eax ++ jz L(first_vec_x0_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ .p2align 4 ++L(aligned_more): ++ /* Need to keep original pointer incase YMM1 has last match. */ ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rdi ++ VMOVU VEC_SIZE(%rdi), %YMM2 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ testl %ecx, %ecx ++ jnz L(first_vec_x1) + +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi ++ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3 ++ VPTESTN %YMM3, %YMM3, %k0 ++ kmovd %k0, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4 ++ VPTESTN %YMM4, %YMM4, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- orl %eax, %ecx +- jz L(aligned_loop) ++ movq %rdi, %r8 ++ testl %ecx, %ecx ++ jnz L(first_vec_x3) + ++ andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +-L(char_nor_null): +- /* Find a CHAR or a null byte in a loop. */ ++L(first_aligned_loop): ++ /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee ++ they don't store a match. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5 ++ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6 ++ ++ VPCMP $0, %YMM5, %YMMMATCH, %k2 ++ vpxord %YMM6, %YMMMATCH, %YMM7 ++ ++ VPMIN %YMM5, %YMM6, %YMM8 ++ VPMIN %YMM8, %YMM7, %YMM7 ++ ++ VPTESTN %YMM7, %YMM7, %k1 ++ subq $(VEC_SIZE * -2), %rdi ++ kortestd %k1, %k2 ++ jz L(first_aligned_loop) ++ ++ VPCMP $0, %YMM6, %YMMMATCH, %k3 ++ VPTESTN %YMM8, %YMM8, %k1 ++ ktestd %k1, %k1 ++ jz L(second_aligned_loop_prep) ++ ++ kortestd %k2, %k3 ++ jnz L(return_first_aligned_loop) ++ ++ .p2align 4,, 6 ++L(first_vec_x1_or_x2_or_x3): ++ VPCMP $0, %YMM4, %YMMMATCH, %k4 ++ kmovd %k4, %eax + testl %eax, %eax +- jnz L(match) +-L(return_value): +- testl %edx, %edx +- jz L(return_null) +- movl %edx, %eax +- movq %rsi, %rdi ++ jz L(first_vec_x1_or_x2) + bsrl %eax, %eax +-# ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq -VEC_SIZE(%rdi, %rax), %rax +-# endif ++ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax + ret + +- .p2align 4 +-L(match): +- /* Find a CHAR. Check if there is a null byte. */ +- kmovd %k0, %ecx +- testl %ecx, %ecx +- jnz L(find_nul) ++ .p2align 4,, 8 ++L(return_first_aligned_loop): ++ VPTESTN %YMM5, %YMM5, %k0 ++ kunpck %k0, %k1, %k0 ++ kmov_2x %k0, %maskz_2x ++ ++ blsmsk %maskz_2x, %maskz_2x ++ kunpck %k2, %k3, %k3 ++ kmov_2x %k3, %maskm_2x ++ and %maskz_2x, %maskm_2x ++ jz L(first_vec_x1_or_x2_or_x3) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx ++ bsr %maskm_2x, %maskm_2x ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++ .p2align 4 ++ /* We can throw away the work done for the first 4x checks here ++ as we have a later match. This is the 'fast' path persay. ++ */ ++L(second_aligned_loop_prep): ++L(second_aligned_loop_set_furthest_match): + movq %rdi, %rsi +- jmp L(aligned_loop) ++ kunpck %k2, %k3, %k4 + + .p2align 4 +-L(find_nul): +- /* Mask out any matching bits after the null byte. 
*/ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax +- testl %eax, %eax +- /* If there is no CHAR here, return the remembered one. */ +- jz L(return_value) +- bsrl %eax, %eax ++L(second_aligned_loop): ++ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1 ++ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2 ++ ++ VPCMP $0, %YMM1, %YMMMATCH, %k2 ++ vpxord %YMM2, %YMMMATCH, %YMM3 ++ ++ VPMIN %YMM1, %YMM2, %YMM4 ++ VPMIN %YMM3, %YMM4, %YMM3 ++ ++ VPTESTN %YMM3, %YMM3, %k1 ++ subq $(VEC_SIZE * -2), %rdi ++ kortestd %k1, %k2 ++ jz L(second_aligned_loop) ++ ++ VPCMP $0, %YMM2, %YMMMATCH, %k3 ++ VPTESTN %YMM4, %YMM4, %k1 ++ ktestd %k1, %k1 ++ jz L(second_aligned_loop_set_furthest_match) ++ ++ kortestd %k2, %k3 ++ /* branch here because there is a significant advantage interms ++ of output dependency chance in using edx. */ ++ jnz L(return_new_match) ++L(return_old_match): ++ kmovq %k4, %rax ++ bsrq %rax, %rax ++ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax ++ ret ++ ++L(return_new_match): ++ VPTESTN %YMM1, %YMM1, %k0 ++ kunpck %k0, %k1, %k0 ++ kmov_2x %k0, %maskz_2x ++ ++ blsmsk %maskz_2x, %maskz_2x ++ kunpck %k2, %k3, %k3 ++ kmov_2x %k3, %maskm_2x ++ and %maskz_2x, %maskm_2x ++ jz L(return_old_match) ++ ++ bsr %maskm_2x, %maskm_2x ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++L(cross_page_boundary): ++ /* eax contains all the page offset bits of src (rdi). `xor rdi, ++ rax` sets pointer will all page offset bits cleared so ++ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC ++ before page cross (guranteed to be safe to read). Doing this ++ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves ++ a bit of code size. */ ++ xorq %rdi, %rax ++ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1 ++ VPTESTN %YMM1, %YMM1, %k0 ++ kmovd %k0, %ecx ++ ++ /* Shift out zero CHAR matches that are before the begining of ++ src (rdi). */ + # ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ movl %edi, %esi ++ andl $(VEC_SIZE - 1), %esi ++ shrl $2, %esi + # endif +- ret ++ shrxl %SHIFT_REG, %ecx, %ecx + +- .p2align 4 +-L(char_and_nul): +- /* Find both a CHAR and a null byte. */ +- addq %rcx, %rdi +- movl %edx, %ecx +-L(char_and_nul_in_first_vec): +- /* Mask out any matching bits after the null byte. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax +- testl %eax, %eax +- /* Return null pointer if the null byte comes first. */ +- jz L(return_null) ++ testl %ecx, %ecx ++ jz L(page_cross_continue) ++ ++ /* Found zero CHAR so need to test for search CHAR. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k1, %eax ++ /* Shift out search CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %SHIFT_REG, %eax, %eax ++ ++ /* Check if any search CHAR match in range. */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret3) + bsrl %eax, %eax + # ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
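For readers following the assembly, both the removed AVX2 body and the new EVEX body above vectorize the same scalar contract: scan forward once, remember the most recent occurrence of the search character, and report it when the terminating null is reached. A minimal reference sketch of that contract (not part of the patch, narrow-character case only):

/* Scalar model of strrchr: remember the furthest match seen so far,
   stop at the null terminator.  The vectorized code does the same
   thing 32 bytes at a time and only pins down the exact position once
   a zero byte shows up in a block.  */
#include <stddef.h>

static char *
strrchr_ref (const char *s, int c)
{
  const char *last = NULL;

  for (;; s++)
    {
      if (*s == (char) c)
        last = s;              /* furthest match so far */
      if (*s == '\0')
        return (char *) last;  /* NULL if the character never occurred */
    }
}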
*/ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++ leaq (%rdi, %rax, CHAR_SIZE), %rax + # else +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ addq %rdi, %rax + # endif ++L(ret3): + ret + +- .p2align 4 +-L(return_null): +- xorl %eax, %eax +- ret +- +-END (STRRCHR) ++END(STRRCHR) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-106.patch b/glibc-RHEL-15696-106.patch new file mode 100644 index 0000000..f3bdb17 --- /dev/null +++ b/glibc-RHEL-15696-106.patch @@ -0,0 +1,73 @@ +From 911c63a51c690dd1a97dfc587097277029baf00f Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 27 Apr 2022 15:13:02 -0500 +Subject: [PATCH] sysdeps: Add 'get_fast_jitter' interace in fast-jitter.h +Content-type: text/plain; charset=UTF-8 + +'get_fast_jitter' is meant to be used purely for performance +purposes. In all cases it's used it should be acceptable to get no +randomness (see default case). An example use case is in setting +jitter for retries between threads at a lock. There is a +performance benefit to having jitter, but only if the jitter can +be generated very quickly and ultimately there is no serious issue +if no jitter is generated. + +The implementation generally uses 'HP_TIMING_NOW' iff it is +inlined (avoid any potential syscall paths). +Reviewed-by: H.J. Lu +--- + sysdeps/generic/fast-jitter.h | 42 +++++++++++++++++++++++++++++++++++ + 1 file changed, 42 insertions(+) + create mode 100644 sysdeps/generic/fast-jitter.h + +diff --git a/sysdeps/generic/fast-jitter.h b/sysdeps/generic/fast-jitter.h +new file mode 100644 +index 00000000..4dd53e34 +--- /dev/null ++++ b/sysdeps/generic/fast-jitter.h +@@ -0,0 +1,42 @@ ++/* Fallback for fast jitter just return 0. ++ Copyright (C) 2019-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef _FAST_JITTER_H ++# define _FAST_JITTER_H ++ ++# include ++# include ++ ++/* Baseline just return 0. We could create jitter using a clock or ++ 'random_bits' but that may imply a syscall and the goal of ++ 'get_fast_jitter' is minimal overhead "randomness" when such ++ randomness helps performance. Adding high overhead the function ++ defeats the purpose. 
*/ ++static inline uint32_t ++get_fast_jitter (void) ++{ ++# if HP_TIMING_INLINE ++ hp_timing_t jitter; ++ HP_TIMING_NOW (jitter); ++ return (uint32_t) jitter; ++# else ++ return 0; ++# endif ++} ++ ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-107.patch b/glibc-RHEL-15696-107.patch new file mode 100644 index 0000000..738cc23 --- /dev/null +++ b/glibc-RHEL-15696-107.patch @@ -0,0 +1,226 @@ +From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001 +From: Wangyang Guo +Date: Fri, 6 May 2022 01:50:10 +0000 +Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop +Content-type: text/plain; charset=UTF-8 + +When mutiple threads waiting for lock at the same time, once lock owner +releases the lock, waiters will see lock available and all try to lock, +which may cause an expensive CAS storm. + +Binary exponential backoff with random jitter is introduced. As try-lock +attempt increases, there is more likely that a larger number threads +compete for adaptive mutex lock, so increase wait time in exponential. +A random jitter is also added to avoid synchronous try-lock from other +threads. + +v2: Remove read-check before try-lock for performance. + +v3: +1. Restore read-check since it works well in some platform. +2. Make backoff arch dependent, and enable it for x86_64. +3. Limit max backoff to reduce latency in large critical section. + +v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h + +v5: Commit log updated for regression in large critical section. + +Result of pthread-mutex-locks bench + +Test Platform: Xeon 8280L (2 socket, 112 CPUs in total) +First Row: thread number +First Col: critical section length +Values: backoff vs upstream, time based, low is better + +non-critical-length: 1 + 1 2 4 8 16 32 64 112 140 +0 0.99 0.58 0.52 0.49 0.43 0.44 0.46 0.52 0.54 +1 0.98 0.43 0.56 0.50 0.44 0.45 0.50 0.56 0.57 +2 0.99 0.41 0.57 0.51 0.45 0.47 0.48 0.60 0.61 +4 0.99 0.45 0.59 0.53 0.48 0.49 0.52 0.64 0.65 +8 1.00 0.66 0.71 0.63 0.56 0.59 0.66 0.72 0.71 +16 0.97 0.78 0.91 0.73 0.67 0.70 0.79 0.80 0.80 +32 0.95 1.17 0.98 0.87 0.82 0.86 0.89 0.90 0.90 +64 0.96 0.95 1.01 1.01 0.98 1.00 1.03 0.99 0.99 +128 0.99 1.01 1.01 1.17 1.08 1.12 1.02 0.97 1.02 + +non-critical-length: 32 + 1 2 4 8 16 32 64 112 140 +0 1.03 0.97 0.75 0.65 0.58 0.58 0.56 0.70 0.70 +1 0.94 0.95 0.76 0.65 0.58 0.58 0.61 0.71 0.72 +2 0.97 0.96 0.77 0.66 0.58 0.59 0.62 0.74 0.74 +4 0.99 0.96 0.78 0.66 0.60 0.61 0.66 0.76 0.77 +8 0.99 0.99 0.84 0.70 0.64 0.66 0.71 0.80 0.80 +16 0.98 0.97 0.95 0.76 0.70 0.73 0.81 0.85 0.84 +32 1.04 1.12 1.04 0.89 0.82 0.86 0.93 0.91 0.91 +64 0.99 1.15 1.07 1.00 0.99 1.01 1.05 0.99 0.99 +128 1.00 1.21 1.20 1.22 1.25 1.31 1.12 1.10 0.99 + +non-critical-length: 128 + 1 2 4 8 16 32 64 112 140 +0 1.02 1.00 0.99 0.67 0.61 0.61 0.61 0.74 0.73 +1 0.95 0.99 1.00 0.68 0.61 0.60 0.60 0.74 0.74 +2 1.00 1.04 1.00 0.68 0.59 0.61 0.65 0.76 0.76 +4 1.00 0.96 0.98 0.70 0.63 0.63 0.67 0.78 0.77 +8 1.01 1.02 0.89 0.73 0.65 0.67 0.71 0.81 0.80 +16 0.99 0.96 0.96 0.79 0.71 0.73 0.80 0.84 0.84 +32 0.99 0.95 1.05 0.89 0.84 0.85 0.94 0.92 0.91 +64 1.00 0.99 1.16 1.04 1.00 1.02 1.06 0.99 0.99 +128 1.00 1.06 0.98 1.14 1.39 1.26 1.08 1.02 0.98 + +There is regression in large critical section. But adaptive mutex is +aimed for "quick" locks. Small critical section is more common when +users choose to use adaptive pthread_mutex. + +Signed-off-by: Wangyang Guo +Reviewed-by: H.J. 
Lu + +Conflicts: + pthreadP.h + (had been moved) + nptl/pthread_mutex_lock.c + (max_adaptive_count renamed) + +--- + nptl/pthreadP.h | 1 + + nptl/pthread_mutex_lock.c | 16 +++++++-- + sysdeps/nptl/pthread_mutex_backoff.h | 35 ++++++++++++++++++ + sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++ + 4 files changed, 89 insertions(+), 2 deletions(-) + create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h + create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h + +diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h +index 7ddc166c..1550e3b6 100644 +--- a/nptl/pthreadP.h ++++ b/nptl/pthreadP.h +@@ -33,6 +33,7 @@ + #include + #include + #include ++#include + + + /* Atomic operations on TLS memory. */ +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index d96a9933..c7770fc9 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex) + int cnt = 0; + int max_cnt = MIN (MAX_ADAPTIVE_COUNT, + mutex->__data.__spins * 2 + 10); ++ int spin_count, exp_backoff = 1; ++ unsigned int jitter = get_jitter (); + do + { +- if (cnt++ >= max_cnt) ++ /* In each loop, spin count is exponential backoff plus ++ random jitter, random range is [0, exp_backoff-1]. */ ++ spin_count = exp_backoff + (jitter & (exp_backoff - 1)); ++ cnt += spin_count; ++ if (cnt >= max_cnt) + { ++ /* If cnt exceeds max spin count, just go to wait ++ queue. */ + LLL_MUTEX_LOCK (mutex); + break; + } +- atomic_spin_nop (); ++ do ++ atomic_spin_nop (); ++ while (--spin_count > 0); ++ /* Prepare for next loop. */ ++ exp_backoff = get_next_backoff (exp_backoff); + } + while (LLL_MUTEX_READ_LOCK (mutex) != 0 + || LLL_MUTEX_TRYLOCK (mutex) != 0); +diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h +new file mode 100644 +index 00000000..5b26c22a +--- /dev/null ++++ b/sysdeps/nptl/pthread_mutex_backoff.h +@@ -0,0 +1,35 @@ ++/* Pthread mutex backoff configuration. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++#ifndef _PTHREAD_MUTEX_BACKOFF_H ++#define _PTHREAD_MUTEX_BACKOFF_H 1 ++ ++static inline unsigned int ++get_jitter (void) ++{ ++ /* Arch dependent random jitter, return 0 disables random. */ ++ return 0; ++} ++ ++static inline int ++get_next_backoff (int backoff) ++{ ++ /* Next backoff, return 1 disables mutex backoff. */ ++ return 1; ++} ++ ++#endif +diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h +new file mode 100644 +index 00000000..ec74c3d9 +--- /dev/null ++++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h +@@ -0,0 +1,39 @@ ++/* Pthread mutex backoff configuration. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
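The loop change above is easier to see outside the mutex plumbing. A minimal, self-contained sketch of the same idea follows; the lock type, constants and busy-wait body are illustrative stand-ins (the jitter value would come from something like get_fast_jitter (), and the real mutex code additionally gives up after a maximum spin count and falls back to the kernel wait queue):

/* Toy spinlock showing binary exponential backoff plus jitter.  */
#include <stdatomic.h>
#include <stdint.h>

#define MAX_BACKOFF 16   /* cap keeps latency bounded in long critical sections */

static void
spin_lock_backoff (atomic_flag *lock, uint32_t jitter)
{
  int backoff = 1;

  while (atomic_flag_test_and_set_explicit (lock, memory_order_acquire))
    {
      /* Spin count = exponential backoff plus jitter in [0, backoff-1].  */
      int spin = backoff + (int) (jitter & (uint32_t) (backoff - 1));
      for (volatile int i = 0; i < spin; i++)
        ;   /* busy wait; the real code issues a cpu pause hint per iteration */
      if (backoff < MAX_BACKOFF)
        backoff <<= 1;   /* binary exponential backoff */
    }
}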
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++#ifndef _PTHREAD_MUTEX_BACKOFF_H ++#define _PTHREAD_MUTEX_BACKOFF_H 1 ++ ++#include ++ ++static inline unsigned int ++get_jitter (void) ++{ ++ return get_fast_jitter (); ++} ++ ++#define MAX_BACKOFF 16 ++ ++static inline int ++get_next_backoff (int backoff) ++{ ++ /* Binary expontial backoff. Limiting max backoff ++ can reduce latency in large critical section. */ ++ return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff; ++} ++ ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-108.patch b/glibc-RHEL-15696-108.patch new file mode 100644 index 0000000..17bf7d8 --- /dev/null +++ b/glibc-RHEL-15696-108.patch @@ -0,0 +1,55 @@ +From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 15 Feb 2022 08:18:15 -0600 +Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ + #28896] +Content-type: text/plain; charset=UTF-8 + +In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would +call strcmp-avx2 and wcscmp-avx2 respectively. This would have +not checks around vzeroupper and would trigger spurious +aborts. This commit fixes that. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on +AVX2 machines with and without RTM. + +Co-authored-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------ + 1 file changed, 2 insertions(+), 6 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/strcmp-avx2.S + (split into two patches due to upstream bug differences) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 28cc98b6..e267c6cb 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -345,10 +345,10 @@ L(one_or_less): + movq %LOCALE_REG, %rdx + # endif + jb L(ret_zero) +-# ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large + unsigned). */ +- jnbe __wcscmp_avx2 ++ jnbe OVERFLOW_STRCMP ++# ifdef USE_AS_WCSCMP + movl (%rdi), %edx + xorl %eax, %eax + cmpl (%rsi), %edx +@@ -357,10 +357,6 @@ L(one_or_less): + negl %eax + orl $1, %eax + # else +- /* 'nbe' covers the case where length is negative (large +- unsigned). 
*/ +- +- jnbe __strcmp_avx2 + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + TOLOWER_gpr (%rax, %eax) +-- +GitLab + diff --git a/glibc-RHEL-15696-109.patch b/glibc-RHEL-15696-109.patch new file mode 100644 index 0000000..8aaa314 --- /dev/null +++ b/glibc-RHEL-15696-109.patch @@ -0,0 +1,60 @@ +From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001 +From: Stefan Liebler +Date: Mon, 28 Jun 2021 13:01:07 +0200 +Subject: s390x: Update math: redirect roundeven function + +After recent commit +447954a206837b5f153869cfeeeab44631c3fac9 +"math: redirect roundeven function", building on +s390x fails with: +Error: symbol `__roundevenl' is already defined + +Similar to aarch64/riscv fix, this patch redirects target +specific functions for s390x: +commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6 +"Update math: redirect roundeven function" + +diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c +index 40b07e054b..0773adfed0 100644 +--- a/sysdeps/s390/fpu/s_roundeven.c ++++ b/sysdeps/s390/fpu/s_roundeven.c +@@ -18,6 +18,7 @@ + . */ + + #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT ++# define NO_MATH_REDIRECT + # include + # include + +@@ -31,7 +32,6 @@ __roundeven (double x) + __asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x)); + return y; + } +-hidden_def (__roundeven) + libm_alias_double (__roundeven, roundeven) + + #else +diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c +index d2fbf3d2b6..289785bc4a 100644 +--- a/sysdeps/s390/fpu/s_roundevenf.c ++++ b/sysdeps/s390/fpu/s_roundevenf.c +@@ -18,6 +18,7 @@ + . */ + + #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT ++# define NO_MATH_REDIRECT + # include + # include + +diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c +index 29ab7a8616..94b6459ab4 100644 +--- a/sysdeps/s390/fpu/s_roundevenl.c ++++ b/sysdeps/s390/fpu/s_roundevenl.c +@@ -18,6 +18,7 @@ + . */ + + #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT ++# define NO_MATH_REDIRECT + # include + # include + # include diff --git a/glibc-RHEL-15696-11.patch b/glibc-RHEL-15696-11.patch new file mode 100644 index 0000000..54d7eff --- /dev/null +++ b/glibc-RHEL-15696-11.patch @@ -0,0 +1,74 @@ +From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 26 Feb 2021 05:36:59 -0800 +Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP +Content-type: text/plain; charset=UTF-8 + +1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered +by VZEROUPPER inside a transactionally executing RTM region. +2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2 +loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs, +1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp. Add +Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions. 
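The interaction of the two preference bits set above is easier to read as plain control flow. The following is only a rough model of how an ifunc selector consumes them; plain booleans stand in for the CPU_FEATURE_USABLE_P / CPU_FEATURES_ARCH_P checks, and the legacy SSE4.2/SSSE3/SSE2 variants are collapsed into one fallback:

/* Sketch of the strcmp dispatch policy, not the actual selector.  */
typedef enum { STRCMP_LEGACY, STRCMP_AVX2, STRCMP_EVEX } strcmp_impl;

static strcmp_impl
pick_strcmp (int avx2_usable, int avx512vl_bw_usable,
             int prefer_avx2_strcmp, int prefer_no_vzeroupper)
{
  if (avx2_usable)
    {
      /* EVEX needs no VZEROUPPER, but is skipped when Prefer_AVX2_STRCMP
         says the AVX2 compare sequence is faster.  */
      if (avx512vl_bw_usable && !prefer_avx2_strcmp)
        return STRCMP_EVEX;
      /* The AVX2 variant uses VZEROUPPER, so it is avoided when
         Prefer_No_VZEROUPPER is set (e.g. under RTM).  */
      if (!prefer_no_vzeroupper)
        return STRCMP_AVX2;
    }
  return STRCMP_LEGACY;
}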
+--- + sysdeps/x86/cpu-features.c | 20 +++++++++++++++++-- + sysdeps/x86/cpu-tunables.c | 2 ++ + ...cpu-features-preferred_feature_index_1.def | 1 + + 3 files changed, 21 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 91042505..3610ee5c 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features) + cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER] + |= bit_arch_Prefer_No_VZEROUPPER; + else +- cpu_features->preferred[index_arch_Prefer_No_AVX512] +- |= bit_arch_Prefer_No_AVX512; ++ { ++ cpu_features->preferred[index_arch_Prefer_No_AVX512] ++ |= bit_arch_Prefer_No_AVX512; ++ ++ /* Avoid RTM abort triggered by VZEROUPPER inside a ++ transactionally executing RTM region. */ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER] ++ |= bit_arch_Prefer_No_VZEROUPPER; ++ ++ /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp ++ requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp ++ requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB, ++ AVX2 strcmp is faster than EVEX strcmp. */ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) ++ cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP] ++ |= bit_arch_Prefer_AVX2_STRCMP; ++ } + } + /* This spells out "AuthenticAMD". */ + else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index 3173b2b9..73adbaba 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, + Fast_Copy_Backward, + disable, 18); ++ CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH ++ (n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18); + } + break; + case 19: +diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +index 17a5cc42..4ca70b40 100644 +--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def ++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +@@ -32,3 +32,4 @@ BIT (Prefer_ERMS) + BIT (Prefer_FSRM) + BIT (Prefer_No_AVX512) + BIT (MathVec_Prefer_No_AVX512) ++BIT (Prefer_AVX2_STRCMP) +-- +GitLab + diff --git a/glibc-RHEL-15696-110.patch b/glibc-RHEL-15696-110.patch new file mode 100644 index 0000000..c499761 --- /dev/null +++ b/glibc-RHEL-15696-110.patch @@ -0,0 +1,26 @@ +From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Wed, 23 Jun 2021 13:29:41 -0700 +Subject: Update math: redirect roundeven function + +Redirect target specific roundeven functions for aarch64, ldbl-128ibm +and riscv. + +Conflicts: + sysdeps/aarch64/* + (not needed) + sysdeps/riscv/* + (not supported) + +diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c +index 6701970f4a..90eecf496b 100644 +--- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c ++++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . 
*/ + ++#define NO_MATH_REDIRECT + #include + #include + diff --git a/glibc-RHEL-15696-12.patch b/glibc-RHEL-15696-12.patch new file mode 100644 index 0000000..85b568e --- /dev/null +++ b/glibc-RHEL-15696-12.patch @@ -0,0 +1,3410 @@ +From 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 06:24:52 -0800 +Subject: [PATCH] x86-64: Add ifunc-avx2.h functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-avx2.h, strchr.c, strcmp.c, strncmp.c and wcsnlen.c to +select the function optimized with 256-bit EVEX instructions using +YMM16-YMM31 registers to avoid RTM abort with usable AVX512VL, AVX512BW +and BMI2 since VZEROUPPER isn't needed at function exit. + +For strcmp/strncmp, prefer AVX2 strcmp/strncmp if Prefer_AVX2_STRCMP +is set. +--- + sysdeps/x86_64/multiarch/Makefile | 21 +- + sysdeps/x86_64/multiarch/ifunc-avx2.h | 14 +- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 81 ++ + sysdeps/x86_64/multiarch/memchr-evex.S | 381 +++++++ + sysdeps/x86_64/multiarch/memrchr-evex.S | 337 +++++++ + sysdeps/x86_64/multiarch/rawmemchr-evex.S | 4 + + sysdeps/x86_64/multiarch/strchr-evex.S | 335 +++++++ + sysdeps/x86_64/multiarch/strchr.c | 14 +- + sysdeps/x86_64/multiarch/strchrnul-evex.S | 3 + + sysdeps/x86_64/multiarch/strcmp-evex.S | 1043 ++++++++++++++++++++ + sysdeps/x86_64/multiarch/strcmp.c | 15 +- + sysdeps/x86_64/multiarch/strlen-evex.S | 436 ++++++++ + sysdeps/x86_64/multiarch/strncmp-evex.S | 3 + + sysdeps/x86_64/multiarch/strncmp.c | 15 +- + sysdeps/x86_64/multiarch/strnlen-evex.S | 4 + + sysdeps/x86_64/multiarch/strrchr-evex.S | 265 +++++ + sysdeps/x86_64/multiarch/wcschr-evex.S | 3 + + sysdeps/x86_64/multiarch/wcscmp-evex.S | 4 + + sysdeps/x86_64/multiarch/wcslen-evex.S | 4 + + sysdeps/x86_64/multiarch/wcsncmp-evex.S | 5 + + sysdeps/x86_64/multiarch/wcsnlen-evex.S | 5 + + sysdeps/x86_64/multiarch/wcsnlen.c | 14 +- + sysdeps/x86_64/multiarch/wcsrchr-evex.S | 3 + + sysdeps/x86_64/multiarch/wmemchr-evex.S | 4 + + 24 files changed, 2996 insertions(+), 17 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/memrchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strcmp-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strlen-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strncmp-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcscmp-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex.S + +Conflicts: + sysdeps/x86_64/multiarch/wcsnlen.c + (account for missing upstream macros) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 9477538a..5ce85882 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -39,7 +39,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memmove-avx512-unaligned-erms \ + memset-sse2-unaligned-erms \ + 
memset-avx2-unaligned-erms \ +- memset-avx512-unaligned-erms ++ memset-avx512-unaligned-erms \ ++ memchr-evex \ ++ memrchr-evex \ ++ rawmemchr-evex \ ++ strchr-evex \ ++ strchrnul-evex \ ++ strcmp-evex \ ++ strlen-evex \ ++ strncmp-evex \ ++ strnlen-evex \ ++ strrchr-evex + CFLAGS-varshift.c += -msse4 + CFLAGS-strcspn-c.c += -msse4 + CFLAGS-strpbrk-c.c += -msse4 +@@ -56,7 +66,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcschr-sse2 wcschr-avx2 \ + wcsrchr-sse2 wcsrchr-avx2 \ + wcsnlen-sse4_1 wcsnlen-c \ +- wcslen-sse2 wcslen-avx2 wcsnlen-avx2 ++ wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \ ++ wcschr-evex \ ++ wcscmp-evex \ ++ wcslen-evex \ ++ wcsncmp-evex \ ++ wcsnlen-evex \ ++ wcsrchr-evex \ ++ wmemchr-evex + endif + + ifeq ($(subdir),debug) +diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h +index 5c88640a..7081b0c9 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h ++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h +@@ -21,16 +21,24 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + return OPTIMIZE (sse2); + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index fe13505c..bd7d9f19 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -43,6 +43,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memchr, + CPU_FEATURE_USABLE (AVX2), + __memchr_avx2) ++ IFUNC_IMPL_ADD (array, i, memchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __memchr_evex) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/memcmp.c. */ +@@ -121,6 +126,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memrchr, + CPU_FEATURE_USABLE (AVX2), + __memrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, memrchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memrchr_evex) ++ + IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2)) + + #ifdef SHARED +@@ -179,6 +189,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, rawmemchr, + CPU_FEATURE_USABLE (AVX2), + __rawmemchr_avx2) ++ IFUNC_IMPL_ADD (array, i, rawmemchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __rawmemchr_evex) + IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/strlen.c. 
*/ +@@ -186,6 +201,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strlen, + CPU_FEATURE_USABLE (AVX2), + __strlen_avx2) ++ IFUNC_IMPL_ADD (array, i, strlen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strlen_evex) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) + + /* Support sysdeps/x86_64/multiarch/strnlen.c. */ +@@ -193,6 +212,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strnlen, + CPU_FEATURE_USABLE (AVX2), + __strnlen_avx2) ++ IFUNC_IMPL_ADD (array, i, strnlen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strnlen_evex) + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) + + /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ +@@ -255,6 +278,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strchr, + CPU_FEATURE_USABLE (AVX2), + __strchr_avx2) ++ IFUNC_IMPL_ADD (array, i, strchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __strchr_evex) + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf) + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2)) + +@@ -263,6 +291,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strchrnul, + CPU_FEATURE_USABLE (AVX2), + __strchrnul_avx2) ++ IFUNC_IMPL_ADD (array, i, strchrnul, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __strchrnul_evex) + IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2)) + + /* Support sysdeps/x86_64/multiarch/strrchr.c. */ +@@ -270,6 +303,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strrchr, + CPU_FEATURE_USABLE (AVX2), + __strrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, strrchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strrchr_evex) + IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcmp.c. */ +@@ -277,6 +314,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strcmp, + CPU_FEATURE_USABLE (AVX2), + __strcmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strcmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __strcmp_evex) + IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2), + __strcmp_sse42) + IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3), +@@ -370,6 +412,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcschr, + CPU_FEATURE_USABLE (AVX2), + __wcschr_avx2) ++ IFUNC_IMPL_ADD (array, i, wcschr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcschr_evex) + IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsrchr.c. */ +@@ -377,6 +424,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsrchr, + CPU_FEATURE_USABLE (AVX2), + __wcsrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsrchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcsrchr_evex) + IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcscmp.c. 
*/ +@@ -384,6 +436,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcscmp, + CPU_FEATURE_USABLE (AVX2), + __wcscmp_avx2) ++ IFUNC_IMPL_ADD (array, i, wcscmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcscmp_evex) + IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsncmp.c. */ +@@ -391,6 +448,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsncmp, + CPU_FEATURE_USABLE (AVX2), + __wcsncmp_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsncmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcsncmp_evex) + IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcscpy.c. */ +@@ -404,6 +466,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcslen, + CPU_FEATURE_USABLE (AVX2), + __wcslen_avx2) ++ IFUNC_IMPL_ADD (array, i, wcslen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcslen_evex) + IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ +@@ -411,6 +478,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsnlen, + CPU_FEATURE_USABLE (AVX2), + __wcsnlen_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsnlen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcsnlen_evex) + IFUNC_IMPL_ADD (array, i, wcsnlen, + CPU_FEATURE_USABLE (SSE4_1), + __wcsnlen_sse4_1) +@@ -421,6 +493,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wmemchr, + CPU_FEATURE_USABLE (AVX2), + __wmemchr_avx2) ++ IFUNC_IMPL_ADD (array, i, wmemchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wmemchr_evex) + IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ +@@ -568,6 +645,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strncmp, + CPU_FEATURE_USABLE (AVX2), + __strncmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strncmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncmp_evex) + IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2), + __strncmp_sse42) + IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3), +diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S +new file mode 100644 +index 00000000..6dd5d67b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memchr-evex.S +@@ -0,0 +1,381 @@ ++/* memchr/wmemchr optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef MEMCHR ++# define MEMCHR __memchr_evex ++# endif ++ ++# ifdef USE_AS_WMEMCHR ++# define VPBROADCAST vpbroadcastd ++# define VPCMP vpcmpd ++# define SHIFT_REG r8d ++# else ++# define VPBROADCAST vpbroadcastb ++# define VPCMP vpcmpb ++# define SHIFT_REG ecx ++# endif ++ ++# define XMMMATCH xmm16 ++# define YMMMATCH ymm16 ++# define YMM1 ymm17 ++# define YMM2 ymm18 ++# define YMM3 ymm19 ++# define YMM4 ymm20 ++# define YMM5 ymm21 ++# define YMM6 ymm22 ++ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (MEMCHR) ++# ifndef USE_AS_RAWMEMCHR ++ /* Check for zero length. */ ++ test %RDX_LP, %RDX_LP ++ jz L(zero) ++# endif ++ movl %edi, %ecx ++# ifdef USE_AS_WMEMCHR ++ shl $2, %RDX_LP ++# else ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif ++# endif ++ /* Broadcast CHAR to YMMMATCH. */ ++ VPBROADCAST %esi, %YMMMATCH ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. */ ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ ++# ifndef USE_AS_RAWMEMCHR ++ jnz L(first_vec_x0_check) ++ /* Adjust length and check the end of data. */ ++ subq $VEC_SIZE, %rdx ++ jbe L(zero) ++# else ++ jnz L(first_vec_x0) ++# endif ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. */ ++ addq %rcx, %rdx ++ ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ jmp L(more_4x_vec) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. */ ++ movl %ecx, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++# endif ++ andq $-VEC_SIZE, %rdi ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ /* Remove the leading bytes. */ ++ sarxl %SHIFT_REG, %eax, %eax ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++# ifndef USE_AS_RAWMEMCHR ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++# endif ++ addq %rdi, %rax ++ addq %rcx, %rax ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++# ifndef USE_AS_RAWMEMCHR ++ /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" ++ instead of "(rdx + rcx) - VEC_SIZE" to void possible addition ++ overflow. */ ++ negq %rcx ++ addq $VEC_SIZE, %rcx ++ ++ /* Check the end of data. */ ++ subq %rcx, %rdx ++ jbe L(zero) ++# endif ++ ++ addq $VEC_SIZE, %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++L(more_4x_vec): ++ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. 
*/ ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++ /* Align data to 4 * VEC_SIZE. */ ++ movq %rdi, %rcx ++ andl $(4 * VEC_SIZE - 1), %ecx ++ andq $-(4 * VEC_SIZE), %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. */ ++ addq %rcx, %rdx ++# endif ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 ++ kord %k1, %k2, %k5 ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 ++ ++ kord %k3, %k4, %k6 ++ kortestd %k5, %k6 ++ jnz L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifdef USE_AS_RAWMEMCHR ++ jmp L(loop_4x_vec) ++# else ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_4x_vec) ++ ++L(last_4x_vec_or_less): ++ /* Less than 4 * VEC and aligned to VEC_SIZE. */ ++ addl $(VEC_SIZE * 2), %edx ++ jle L(last_2x_vec) ++ ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x2_check) ++ subl $VEC_SIZE, %edx ++ jle L(zero) ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x3_check) ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ addl $(VEC_SIZE * 2), %edx ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x0_check) ++ subl $VEC_SIZE, %edx ++ jle L(zero) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(first_vec_x0_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(first_vec_x1_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(first_vec_x2_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(first_vec_x3_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. 
*/ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4 ++L(first_vec_x0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (%rdi, %rax, 4), %rax ++# else ++ addq %rdi, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax ++# else ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ kmovd %k3, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ kmovd %k4, %eax ++ testl %eax, %eax ++L(first_vec_x3): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax ++# else ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++# endif ++ ret ++ ++END (MEMCHR) ++#endif +diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S +new file mode 100644 +index 00000000..16bf8e02 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memrchr-evex.S +@@ -0,0 +1,337 @@ ++/* memrchr optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# define VMOVA vmovdqa64 ++ ++# define YMMMATCH ymm16 ++ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (__memrchr_evex) ++ /* Broadcast CHAR to YMMMATCH. */ ++ vpbroadcastb %esi, %YMMMATCH ++ ++ sub $VEC_SIZE, %RDX_LP ++ jbe L(last_vec_or_less) ++ ++ add %RDX_LP, %RDI_LP ++ ++ /* Check the last VEC_SIZE bytes. */ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x0) ++ ++ subq $(VEC_SIZE * 4), %rdi ++ movl %edi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ jz L(aligned_more) ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rdx ++ andq $-VEC_SIZE, %rdi ++ subq %rcx, %rdx ++ ++ .p2align 4 ++L(aligned_more): ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++ ++ /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. 
*/ ++ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 ++ kmovd %k3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k4 ++ kmovd %k4, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x0) ++ ++ /* Align data to 4 * VEC_SIZE for loop with fewer branches. ++ There are some overlaps with above if data isn't aligned ++ to 4 * VEC_SIZE. */ ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ jz L(loop_4x_vec) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ addq $(VEC_SIZE * 4), %rdx ++ andq $-(VEC_SIZE * 4), %rdi ++ subq %rcx, %rdx ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ subq $(VEC_SIZE * 4), %rdi ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k1 ++ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 ++ kord %k1, %k2, %k5 ++ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 ++ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 ++ ++ kord %k3, %k4, %k6 ++ kortestd %k5, %k6 ++ jz L(loop_4x_vec) ++ ++ /* There is a match. */ ++ kmovd %k4, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ kmovd %k3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ kmovd %k1, %eax ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_4x_vec_or_less): ++ addl $(VEC_SIZE * 4), %edx ++ cmpl $(VEC_SIZE * 2), %edx ++ jbe L(last_2x_vec) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 ++ kmovd %k3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1_check) ++ cmpl $(VEC_SIZE * 3), %edx ++ jbe L(zero) ++ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k4 ++ kmovd %k4, %eax ++ testl %eax, %eax ++ jz L(zero) ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 4), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3_check) ++ cmpl $VEC_SIZE, %edx ++ jbe L(zero) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jz L(zero) ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 2), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $(VEC_SIZE * 2), %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x0): ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x1): ++ bsrl %eax, %eax ++ addl $VEC_SIZE, %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x2): ++ bsrl %eax, %eax ++ addl $(VEC_SIZE * 2), %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x3): ++ bsrl %eax, %eax ++ addl $(VEC_SIZE * 3), %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x1_check): ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 3), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $VEC_SIZE, %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x3_check): ++ bsrl %eax, %eax ++ subq $VEC_SIZE, %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $(VEC_SIZE * 3), %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(last_vec_or_less_aligned): ++ movl %edx, %ecx ++ ++ vpcmpb $0, 
(%rdi), %YMMMATCH, %k1 ++ ++ movl $1, %edx ++ /* Support rdx << 32. */ ++ salq %cl, %rdx ++ subq $1, %rdx ++ ++ kmovd %k1, %eax ++ ++ /* Remove the trailing bytes. */ ++ andl %edx, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_or_less): ++ addl $VEC_SIZE, %edx ++ ++ /* Check for zero length. */ ++ testl %edx, %edx ++ jz L(zero) ++ ++ movl %edi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ jz L(last_vec_or_less_aligned) ++ ++ movl %ecx, %esi ++ movl %ecx, %r8d ++ addl %edx, %esi ++ andq $-VEC_SIZE, %rdi ++ ++ subl $VEC_SIZE, %esi ++ ja L(last_vec_2x_aligned) ++ ++ /* Check the last VEC. */ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ ++ /* Remove the leading and trailing bytes. */ ++ sarl %cl, %eax ++ movl %edx, %ecx ++ ++ movl $1, %edx ++ sall %cl, %edx ++ subl $1, %edx ++ ++ andl %edx, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ addq %r8, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_2x_aligned): ++ movl %esi, %ecx ++ ++ /* Check the last VEC. */ ++ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 ++ ++ movl $1, %edx ++ sall %cl, %edx ++ subl $1, %edx ++ ++ kmovd %k1, %eax ++ ++ /* Remove the trailing bytes. */ ++ andl %edx, %eax ++ ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ /* Check the second last VEC. */ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k1 ++ ++ movl %r8d, %ecx ++ ++ kmovd %k1, %eax ++ ++ /* Remove the leading bytes. Must use unsigned right shift for ++ bsrl below. */ ++ shrl %cl, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ addq %r8, %rax ++ ret ++END (__memrchr_evex) ++#endif +diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S +new file mode 100644 +index 00000000..ec942b77 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S +@@ -0,0 +1,4 @@ ++#define MEMCHR __rawmemchr_evex ++#define USE_AS_RAWMEMCHR 1 ++ ++#include "memchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S +new file mode 100644 +index 00000000..ddc86a70 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strchr-evex.S +@@ -0,0 +1,335 @@ ++/* strchr/strchrnul optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
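The strchr/strchrnul kernel introduced next leans on one non-obvious idiom: XOR the loaded data with the broadcast character, then take the unsigned minimum with the original data (vpxorq followed by VPMINU). A lane becomes zero exactly when it held either the searched character or the null terminator, so a single compare against zero covers both cases. A byte-wise sketch of the same idea, assuming nothing beyond standard C:

#include <stdint.h>

/* Byte-wise version of the vpxorq + VPMINU + compare-with-zero idiom
   used by strchr-evex: the result is zero iff the byte equals the
   searched character or is the terminating NUL.  */
static inline uint8_t
char_or_nul_lane (uint8_t byte, uint8_t target)
{
  uint8_t x = (uint8_t) (byte ^ target);  /* 0 when byte == target        */
  return x < byte ? x : byte;             /* unsigned min: 0 when byte == 0 */
}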
*/ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRCHR ++# define STRCHR __strchr_evex ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++# ifdef USE_AS_WCSCHR ++# define VPBROADCAST vpbroadcastd ++# define VPCMP vpcmpd ++# define VPMINU vpminud ++# define CHAR_REG esi ++# define SHIFT_REG r8d ++# else ++# define VPBROADCAST vpbroadcastb ++# define VPCMP vpcmpb ++# define VPMINU vpminub ++# define CHAR_REG sil ++# define SHIFT_REG ecx ++# endif ++ ++# define XMMZERO xmm16 ++ ++# define YMMZERO ymm16 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++# define YMM2 ymm19 ++# define YMM3 ymm20 ++# define YMM4 ymm21 ++# define YMM5 ymm22 ++# define YMM6 ymm23 ++# define YMM7 ymm24 ++# define YMM8 ymm25 ++ ++# define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRCHR) ++ movl %edi, %ecx ++# ifndef USE_AS_STRCHRNUL ++ xorl %edx, %edx ++# endif ++ ++ /* Broadcast CHAR to YMM0. */ ++ VPBROADCAST %esi, %YMM0 ++ ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ ++ /* Check if we cross page boundary with one vector load. */ ++ andl $(PAGE_SIZE - 1), %ecx ++ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx ++ ja L(cross_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. Search for both CHAR and the ++ null bytes. */ ++ VMOVU (%rdi), %YMM1 ++ ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ ktestd %k0, %k0 ++ jz L(more_vecs) ++ kmovd %k0, %eax ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (%rdi, %rax, 4), %rax ++# else ++ addq %rdi, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(more_vecs): ++ /* Align data for aligned loads in the loop. */ ++ andq $-VEC_SIZE, %rdi ++L(aligned_more): ++ ++ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ VMOVA VEC_SIZE(%rdi), %YMM1 ++ addq $VEC_SIZE, %rdi ++ ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VMOVA VEC_SIZE(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ ktestd %k0, %k0 ++ jz L(prep_loop_4x) ++ ++ kmovd %k0, %eax ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax ++# else ++ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x0): ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (%rdi, %rax, 4), %rax ++# else ++ addq %rdi, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ leaq VEC_SIZE(%rdi, %rax), %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax ++# else ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++L(prep_loop_4x): ++ /* Align data to 4 * VEC_SIZE. */ ++ andq $-(VEC_SIZE * 4), %rdi ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 ++ VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 ++ VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 ++ VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 ++ ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM5 ++ vpxorq %YMM2, %YMM0, %YMM6 ++ vpxorq %YMM3, %YMM0, %YMM7 ++ vpxorq %YMM4, %YMM0, %YMM8 ++ ++ VPMINU %YMM5, %YMM1, %YMM5 ++ VPMINU %YMM6, %YMM2, %YMM6 ++ VPMINU %YMM7, %YMM3, %YMM7 ++ VPMINU %YMM8, %YMM4, %YMM8 ++ ++ VPMINU %YMM5, %YMM6, %YMM1 ++ VPMINU %YMM7, %YMM8, %YMM2 ++ ++ VPMINU %YMM1, %YMM2, %YMM1 ++ ++ /* Each bit in K0 represents a CHAR or a null byte. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++ ktestd %k0, %k0 ++ jz L(loop_4x_vec) ++ ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM5, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ ++ VPCMP $0, %YMMZERO, %YMM6, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ ++ VPCMP $0, %YMMZERO, %YMM7, %k2 ++ /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ ++ VPCMP $0, %YMMZERO, %YMM8, %k3 ++ ++# ifdef USE_AS_WCSCHR ++ /* NB: Each bit in K2/K3 represents 4-byte element. */ ++ kshiftlw $8, %k3, %k1 ++# else ++ kshiftlq $32, %k3, %k1 ++# endif ++ ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ korq %k1, %k2, %k1 ++ kmovq %k1, %rax ++ ++ tzcntq %rax, %rax ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax ++# else ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ /* Cold case for crossing page with first load. */ ++ .p2align 4 ++L(cross_page_boundary): ++ andq $-VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ ++ VMOVA (%rdi), %YMM1 ++ ++ /* Leaves only CHARS matching esi as 0. 
*/ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ ++# ifdef USE_AS_WCSCHR ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. */ ++ movl %ecx, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++# endif ++ ++ /* Remove the leading bits. */ ++ sarxl %SHIFT_REG, %eax, %eax ++ testl %eax, %eax ++ ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++ addq %rcx, %rdi ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (%rdi, %rax, 4), %rax ++# else ++ addq %rdi, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++END (STRCHR) ++# endif +diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c +index 32954713..be05e197 100644 +--- a/sysdeps/x86_64/multiarch/strchr.c ++++ b/sysdeps/x86_64/multiarch/strchr.c +@@ -29,16 +29,24 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF)) + return OPTIMIZE (sse2_no_bsf); +diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex.S b/sysdeps/x86_64/multiarch/strchrnul-evex.S +new file mode 100644 +index 00000000..064fe7ca +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strchrnul-evex.S +@@ -0,0 +1,3 @@ ++#define STRCHR __strchrnul_evex ++#define USE_AS_STRCHRNUL 1 ++#include "strchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +new file mode 100644 +index 00000000..459eeed0 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -0,0 +1,1043 @@ ++/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
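The strchr.c hunk above illustrates the dispatch policy this patch applies to every routine it touches: use the new EVEX variant only when AVX512VL, AVX512BW and BMI2 are all usable, otherwise keep the existing AVX2/SSE2 fallbacks. Outside glibc the same tiering can be probed roughly as below; this is a hedged sketch using the GCC/Clang __builtin_cpu_supports builtin, and it deliberately omits the glibc-internal tunables (AVX_Fast_Unaligned_Load, Prefer_No_VZEROUPPER, Prefer_AVX2_STRCMP) that have no user-space equivalent:

#include <stdio.h>

/* Rough user-space analogue of the IFUNC selectors added in this
   patch.  glibc itself uses CPU_FEATURE_USABLE_P / CPU_FEATURES_ARCH_P
   on its internal cpu_features structure instead.  */
static const char *
pick_variant (void)
{
  if (__builtin_cpu_supports ("avx2"))
    {
      if (__builtin_cpu_supports ("avx512vl")
          && __builtin_cpu_supports ("avx512bw")
          && __builtin_cpu_supports ("bmi2"))
        return "evex";
      return "avx2";
    }
  return "sse2";
}

int
main (void)
{
  __builtin_cpu_init ();
  printf ("selected variant: %s\n", pick_variant ());
  return 0;
}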
*/ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRCMP ++# define STRCMP __strcmp_evex ++# endif ++ ++# define PAGE_SIZE 4096 ++ ++/* VEC_SIZE = Number of bytes in a ymm register */ ++# define VEC_SIZE 32 ++ ++/* Shift for dividing by (VEC_SIZE * 4). */ ++# define DIVIDE_BY_VEC_4_SHIFT 7 ++# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) ++# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++# ifdef USE_AS_WCSCMP ++/* Compare packed dwords. */ ++# define VPCMP vpcmpd ++# define SHIFT_REG32 r8d ++# define SHIFT_REG64 r8 ++/* 1 dword char == 4 bytes. */ ++# define SIZE_OF_CHAR 4 ++# else ++/* Compare packed bytes. */ ++# define VPCMP vpcmpb ++# define SHIFT_REG32 ecx ++# define SHIFT_REG64 rcx ++/* 1 byte char == 1 byte. */ ++# define SIZE_OF_CHAR 1 ++# endif ++ ++# define XMMZERO xmm16 ++# define XMM0 xmm17 ++# define XMM1 xmm18 ++ ++# define YMMZERO ymm16 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++# define YMM2 ymm19 ++# define YMM3 ymm20 ++# define YMM4 ymm21 ++# define YMM5 ymm22 ++# define YMM6 ymm23 ++# define YMM7 ymm24 ++ ++/* Warning! ++ wcscmp/wcsncmp have to use SIGNED comparison for elements. ++ strcmp/strncmp have to use UNSIGNED comparison for elements. ++*/ ++ ++/* The main idea of the string comparison (byte or dword) using 256-bit ++ EVEX instructions consists of comparing (VPCMP) two ymm vectors. The ++ latter can be on either packed bytes or dwords depending on ++ USE_AS_WCSCMP. In order to check the null char, algorithm keeps the ++ matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2 ++ KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes) ++ are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd ++ instructions. Main loop (away from from page boundary) compares 4 ++ vectors are a time, effectively comparing 4 x VEC_SIZE bytes (128 ++ bytes) on each loop. ++ ++ The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic ++ is the same as strcmp, except that an a maximum offset is tracked. If ++ the maximum offset is reached before a difference is found, zero is ++ returned. */ ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRCMP) ++# ifdef USE_AS_STRNCMP ++ /* Check for simple cases (0 or 1) in offset. */ ++ cmp $1, %RDX_LP ++ je L(char0) ++ jb L(zero) ++# ifdef USE_AS_WCSCMP ++ /* Convert units: from wide to byte char. */ ++ shl $2, %RDX_LP ++# endif ++ /* Register %r11 tracks the maximum offset. */ ++ mov %RDX_LP, %R11_LP ++# endif ++ movl %edi, %eax ++ xorl %edx, %edx ++ /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax ++ jg L(cross_page) ++ /* Start comparing 4 vectors. */ ++ VMOVU (%rdi), %YMM0 ++ VMOVU (%rsi), %YMM1 ++ ++ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ ++ VPCMP $4, %YMM0, %YMM1, %k0 ++ ++ /* Check for NULL in YMM0. */ ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ /* Check for NULL in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k2 ++ /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ ++ kord %k1, %k2, %k1 ++ ++ /* Each bit in K1 represents: ++ 1. A mismatch in YMM0 and YMM1. Or ++ 2. A NULL in YMM0 or YMM1. ++ */ ++ kord %k0, %k1, %k1 ++ ++ ktestd %k1, %k1 ++ je L(next_3_vectors) ++ kmovd %k1, %ecx ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx) is after the maximum ++ offset (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ je L(return) ++L(wcscmp_return): ++ setl %al ++ negl %eax ++ orl $1, %eax ++L(return): ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(return_vec_size): ++ kmovd %k1, %ecx ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after ++ the maximum offset (%r11). */ ++ addq $VEC_SIZE, %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl VEC_SIZE(%rdi, %rdx), %ecx ++ cmpl VEC_SIZE(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl VEC_SIZE(%rdi, %rdx), %eax ++ movzbl VEC_SIZE(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(return_2_vec_size): ++ kmovd %k1, %ecx ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is ++ after the maximum offset (%r11). */ ++ addq $(VEC_SIZE * 2), %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx ++ cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(return_3_vec_size): ++ kmovd %k1, %ecx ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is ++ after the maximum offset (%r11). */ ++ addq $(VEC_SIZE * 3), %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx ++ cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(next_3_vectors): ++ VMOVU VEC_SIZE(%rdi), %YMM0 ++ VMOVU VEC_SIZE(%rsi), %YMM1 ++ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ ++ VPCMP $4, %YMM0, %YMM1, %k0 ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ VPCMP $0, %YMMZERO, %YMM1, %k2 ++ /* Each bit in K1 represents a NULL in YMM0 or YMM1. 
*/ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ ktestd %k1, %k1 ++ jne L(return_vec_size) ++ ++ VMOVU (VEC_SIZE * 2)(%rdi), %YMM2 ++ VMOVU (VEC_SIZE * 3)(%rdi), %YMM3 ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM4 ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM5 ++ ++ /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */ ++ VPCMP $4, %YMM2, %YMM4, %k0 ++ VPCMP $0, %YMMZERO, %YMM2, %k1 ++ VPCMP $0, %YMMZERO, %YMM4, %k2 ++ /* Each bit in K1 represents a NULL in YMM2 or YMM4. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ ktestd %k1, %k1 ++ jne L(return_2_vec_size) ++ ++ /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */ ++ VPCMP $4, %YMM3, %YMM5, %k0 ++ VPCMP $0, %YMMZERO, %YMM3, %k1 ++ VPCMP $0, %YMMZERO, %YMM5, %k2 ++ /* Each bit in K1 represents a NULL in YMM3 or YMM5. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ ktestd %k1, %k1 ++ jne L(return_3_vec_size) ++L(main_loop_header): ++ leaq (VEC_SIZE * 4)(%rdi), %rdx ++ movl $PAGE_SIZE, %ecx ++ /* Align load via RAX. */ ++ andq $-(VEC_SIZE * 4), %rdx ++ subq %rdi, %rdx ++ leaq (%rdi, %rdx), %rax ++# ifdef USE_AS_STRNCMP ++ /* Starting from this point, the maximum offset, or simply the ++ 'offset', DECREASES by the same amount when base pointers are ++ moved forward. Return 0 when: ++ 1) On match: offset <= the matched vector index. ++ 2) On mistmach, offset is before the mistmatched index. ++ */ ++ subq %rdx, %r11 ++ jbe L(zero) ++# endif ++ addq %rsi, %rdx ++ movq %rdx, %rsi ++ andl $(PAGE_SIZE - 1), %esi ++ /* Number of bytes before page crossing. */ ++ subq %rsi, %rcx ++ /* Number of VEC_SIZE * 4 blocks before page crossing. */ ++ shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx ++ /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ ++ movl %ecx, %esi ++ jmp L(loop_start) ++ ++ .p2align 4 ++L(loop): ++# ifdef USE_AS_STRNCMP ++ /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease ++ the maximum offset (%r11) by the same amount. */ ++ subq $(VEC_SIZE * 4), %r11 ++ jbe L(zero) ++# endif ++ addq $(VEC_SIZE * 4), %rax ++ addq $(VEC_SIZE * 4), %rdx ++L(loop_start): ++ testl %esi, %esi ++ leal -1(%esi), %esi ++ je L(loop_cross_page) ++L(back_to_loop): ++ /* Main loop, comparing 4 vectors are a time. */ ++ VMOVA (%rax), %YMM0 ++ VMOVA VEC_SIZE(%rax), %YMM2 ++ VMOVA (VEC_SIZE * 2)(%rax), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rax), %YMM6 ++ VMOVU (%rdx), %YMM1 ++ VMOVU VEC_SIZE(%rdx), %YMM3 ++ VMOVU (VEC_SIZE * 2)(%rdx), %YMM5 ++ VMOVU (VEC_SIZE * 3)(%rdx), %YMM7 ++ ++ VPCMP $4, %YMM0, %YMM1, %k0 ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ VPCMP $0, %YMMZERO, %YMM1, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K4 represents a NULL or a mismatch in YMM0 and ++ YMM1. */ ++ kord %k0, %k1, %k4 ++ ++ VPCMP $4, %YMM2, %YMM3, %k0 ++ VPCMP $0, %YMMZERO, %YMM2, %k1 ++ VPCMP $0, %YMMZERO, %YMM3, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K5 represents a NULL or a mismatch in YMM2 and ++ YMM3. */ ++ kord %k0, %k1, %k5 ++ ++ VPCMP $4, %YMM4, %YMM5, %k0 ++ VPCMP $0, %YMMZERO, %YMM4, %k1 ++ VPCMP $0, %YMMZERO, %YMM5, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K6 represents a NULL or a mismatch in YMM4 and ++ YMM5. */ ++ kord %k0, %k1, %k6 ++ ++ VPCMP $4, %YMM6, %YMM7, %k0 ++ VPCMP $0, %YMMZERO, %YMM6, %k1 ++ VPCMP $0, %YMMZERO, %YMM7, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K7 represents a NULL or a mismatch in YMM6 and ++ YMM7. 
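Stripped of the mask bookkeeping, the main loop above enforces the plain strncmp contract that the file's header comment describes: compare as unsigned bytes, stop at the first difference or at a terminating null in either string, and treat exhaustion of the caller's length limit (held in %r11 here) as equality. A scalar sketch of that contract, not part of the patch:

#include <stddef.h>

/* Scalar reference for the behaviour the vector loop implements.
   Bytes are compared as unsigned values, and reaching the length limit
   before any difference yields 0, which is the role %r11 plays in the
   assembly above.  */
static int
strncmp_ref (const char *a, const char *b, size_t n)
{
  for (; n != 0; --n, ++a, ++b)
    {
      unsigned char ca = (unsigned char) *a;
      unsigned char cb = (unsigned char) *b;
      if (ca != cb)
        return (int) ca - (int) cb;
      if (ca == '\0')
        return 0;
    }
  return 0;
}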
*/ ++ kord %k0, %k1, %k7 ++ ++ kord %k4, %k5, %k0 ++ kord %k6, %k7, %k1 ++ ++ /* Test each mask (32 bits) individually because for VEC_SIZE ++ == 32 is not possible to OR the four masks and keep all bits ++ in a 64-bit integer register, differing from SSE2 strcmp ++ where ORing is possible. */ ++ kortestd %k0, %k1 ++ je L(loop) ++ ktestd %k4, %k4 ++ je L(test_vec) ++ kmovd %k4, %edi ++ tzcntl %edi, %ecx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %ecx ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(test_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first vector matched. Return 0 if the maximum offset ++ (%r11) <= VEC_SIZE. */ ++ cmpq $VEC_SIZE, %r11 ++ jbe L(zero) ++# endif ++ ktestd %k5, %k5 ++ je L(test_2_vec) ++ kmovd %k5, %ecx ++ tzcntl %ecx, %edi ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edi ++# endif ++# ifdef USE_AS_STRNCMP ++ addq $VEC_SIZE, %rdi ++ cmpq %rdi, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rdi), %ecx ++ cmpl (%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rdi), %eax ++ movzbl (%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl VEC_SIZE(%rsi, %rdi), %ecx ++ cmpl VEC_SIZE(%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl VEC_SIZE(%rax, %rdi), %eax ++ movzbl VEC_SIZE(%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(test_2_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first 2 vectors matched. Return 0 if the maximum offset ++ (%r11) <= 2 * VEC_SIZE. */ ++ cmpq $(VEC_SIZE * 2), %r11 ++ jbe L(zero) ++# endif ++ ktestd %k6, %k6 ++ je L(test_3_vec) ++ kmovd %k6, %ecx ++ tzcntl %ecx, %edi ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edi ++# endif ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 2), %rdi ++ cmpq %rdi, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rdi), %ecx ++ cmpl (%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rdi), %eax ++ movzbl (%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx ++ cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax ++ movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(test_3_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first 3 vectors matched. Return 0 if the maximum offset ++ (%r11) <= 3 * VEC_SIZE. */ ++ cmpq $(VEC_SIZE * 3), %r11 ++ jbe L(zero) ++# endif ++ kmovd %k7, %esi ++ tzcntl %esi, %ecx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ sall $2, %ecx ++# endif ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 3), %rcx ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %esi ++ cmpl (%rdx, %rcx), %esi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rsi, %rcx), %esi ++ cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(loop_cross_page): ++ xorl %r10d, %r10d ++ movq %rdx, %rcx ++ /* Align load via RDX. We load the extra ECX bytes which should ++ be ignored. */ ++ andl $((VEC_SIZE * 4) - 1), %ecx ++ /* R10 is -RCX. */ ++ subq %rcx, %r10 ++ ++ /* This works only if VEC_SIZE * 2 == 64. */ ++# if (VEC_SIZE * 2) != 64 ++# error (VEC_SIZE * 2) != 64 ++# endif ++ ++ /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ ++ cmpl $(VEC_SIZE * 2), %ecx ++ jge L(loop_cross_page_2_vec) ++ ++ VMOVU (%rax, %r10), %YMM2 ++ VMOVU VEC_SIZE(%rax, %r10), %YMM3 ++ VMOVU (%rdx, %r10), %YMM4 ++ VMOVU VEC_SIZE(%rdx, %r10), %YMM5 ++ ++ VPCMP $4, %YMM4, %YMM2, %k0 ++ VPCMP $0, %YMMZERO, %YMM2, %k1 ++ VPCMP $0, %YMMZERO, %YMM4, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch in YMM2 and ++ YMM4. */ ++ kord %k0, %k1, %k1 ++ ++ VPCMP $4, %YMM5, %YMM3, %k3 ++ VPCMP $0, %YMMZERO, %YMM3, %k4 ++ VPCMP $0, %YMMZERO, %YMM5, %k5 ++ kord %k4, %k5, %k4 ++ /* Each bit in K3 represents a NULL or a mismatch in YMM3 and ++ YMM5. */ ++ kord %k3, %k4, %k3 ++ ++# ifdef USE_AS_WCSCMP ++ /* NB: Each bit in K1/K3 represents 4-byte element. */ ++ kshiftlw $8, %k3, %k2 ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. */ ++ movl %ecx, %SHIFT_REG32 ++ sarl $2, %SHIFT_REG32 ++# else ++ kshiftlq $32, %k3, %k2 ++# endif ++ ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ korq %k1, %k2, %k1 ++ kmovq %k1, %rdi ++ ++ /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ ++ shrxq %SHIFT_REG64, %rdi, %rdi ++ testq %rdi, %rdi ++ je L(loop_cross_page_2_vec) ++ tzcntq %rdi, %rcx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %ecx ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(loop_cross_page_2_vec): ++ /* The first VEC_SIZE * 2 bytes match or are ignored. */ ++ VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 ++ VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 ++ VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2 ++ VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3 ++ ++ VPCMP $4, %YMM0, %YMM2, %k0 ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ VPCMP $0, %YMMZERO, %YMM2, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch in YMM0 and ++ YMM2. 
*/ ++ kord %k0, %k1, %k1 ++ ++ VPCMP $4, %YMM1, %YMM3, %k3 ++ VPCMP $0, %YMMZERO, %YMM1, %k4 ++ VPCMP $0, %YMMZERO, %YMM3, %k5 ++ kord %k4, %k5, %k4 ++ /* Each bit in K3 represents a NULL or a mismatch in YMM1 and ++ YMM3. */ ++ kord %k3, %k4, %k3 ++ ++# ifdef USE_AS_WCSCMP ++ /* NB: Each bit in K1/K3 represents 4-byte element. */ ++ kshiftlw $8, %k3, %k2 ++# else ++ kshiftlq $32, %k3, %k2 ++# endif ++ ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ korq %k1, %k2, %k1 ++ kmovq %k1, %rdi ++ ++ xorl %r8d, %r8d ++ /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ ++ subl $(VEC_SIZE * 2), %ecx ++ jle 1f ++ /* R8 has number of bytes skipped. */ ++ movl %ecx, %r8d ++# ifdef USE_AS_WCSCMP ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. */ ++ sarl $2, %ecx ++# endif ++ /* Skip ECX bytes. */ ++ shrq %cl, %rdi ++1: ++ /* Before jumping back to the loop, set ESI to the number of ++ VEC_SIZE * 4 blocks before page crossing. */ ++ movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi ++ ++ testq %rdi, %rdi ++# ifdef USE_AS_STRNCMP ++ /* At this point, if %rdi value is 0, it already tested ++ VEC_SIZE*4+%r10 byte starting from %rax. This label ++ checks whether strncmp maximum offset reached or not. */ ++ je L(string_nbyte_offset_check) ++# else ++ je L(back_to_loop) ++# endif ++ tzcntq %rdi, %rcx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %ecx ++# endif ++ addq %r10, %rcx ++ /* Adjust for number of bytes skipped. */ ++ addq %r8, %rcx ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 2), %rcx ++ subq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rsi, %rcx), %edi ++ cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++# ifdef USE_AS_STRNCMP ++L(string_nbyte_offset_check): ++ leaq (VEC_SIZE * 4)(%r10), %r10 ++ cmpq %r10, %r11 ++ jbe L(zero) ++ jmp L(back_to_loop) ++# endif ++ ++ .p2align 4 ++L(cross_page_loop): ++ /* Check one byte/dword at a time. */ ++# ifdef USE_AS_WCSCMP ++ cmpl %ecx, %eax ++# else ++ subl %ecx, %eax ++# endif ++ jne L(different) ++ addl $SIZE_OF_CHAR, %edx ++ cmpl $(VEC_SIZE * 4), %edx ++ je L(main_loop_header) ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rdx), %eax ++ movl (%rsi, %rdx), %ecx ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %ecx ++# endif ++ /* Check null char. */ ++ testl %eax, %eax ++ jne L(cross_page_loop) ++ /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED ++ comparisons. */ ++ subl %ecx, %eax ++# ifndef USE_AS_WCSCMP ++L(different): ++# endif ++ ret ++ ++# ifdef USE_AS_WCSCMP ++ .p2align 4 ++L(different): ++ /* Use movl to avoid modifying EFLAGS. 
*/ ++ movl $0, %eax ++ setl %al ++ negl %eax ++ orl $1, %eax ++ ret ++# endif ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(char0): ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax ++ subl %ecx, %eax ++# endif ++ ret ++# endif ++ ++ .p2align 4 ++L(last_vector): ++ addq %rdx, %rdi ++ addq %rdx, %rsi ++# ifdef USE_AS_STRNCMP ++ subq %rdx, %r11 ++# endif ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++ ret ++ ++ /* Comparing on page boundary region requires special treatment: ++ It must done one vector at the time, starting with the wider ++ ymm vector if possible, if not, with xmm. If fetching 16 bytes ++ (xmm) still passes the boundary, byte comparison must be done. ++ */ ++ .p2align 4 ++L(cross_page): ++ /* Try one ymm vector at a time. */ ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(cross_page_1_vector) ++L(loop_1_vector): ++ VMOVU (%rdi, %rdx), %YMM0 ++ VMOVU (%rsi, %rdx), %YMM1 ++ ++ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ ++ VPCMP $4, %YMM0, %YMM1, %k0 ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ VPCMP $0, %YMMZERO, %YMM1, %k2 ++ /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ kmovd %k1, %ecx ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $VEC_SIZE, %edx ++ ++ addl $VEC_SIZE, %eax ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jle L(loop_1_vector) ++L(cross_page_1_vector): ++ /* Less than 32 bytes to check, try one xmm vector. */ ++ cmpl $(PAGE_SIZE - 16), %eax ++ jg L(cross_page_1_xmm) ++ VMOVU (%rdi, %rdx), %XMM0 ++ VMOVU (%rsi, %rdx), %XMM1 ++ ++ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ ++ VPCMP $4, %XMM0, %XMM1, %k0 ++ VPCMP $0, %XMMZERO, %XMM0, %k1 ++ VPCMP $0, %XMMZERO, %XMM1, %k2 ++ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ ++ korw %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ korw %k0, %k1, %k1 ++ kmovw %k1, %ecx ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $16, %edx ++# ifndef USE_AS_WCSCMP ++ addl $16, %eax ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_1_xmm): ++# ifndef USE_AS_WCSCMP ++ /* Less than 16 bytes to check, try 8 byte vector. NB: No need ++ for wcscmp nor wcsncmp since wide char is 4 bytes. */ ++ cmpl $(PAGE_SIZE - 8), %eax ++ jg L(cross_page_8bytes) ++ vmovq (%rdi, %rdx), %XMM0 ++ vmovq (%rsi, %rdx), %XMM1 ++ ++ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ ++ VPCMP $4, %XMM0, %XMM1, %k0 ++ VPCMP $0, %XMMZERO, %XMM0, %k1 ++ VPCMP $0, %XMMZERO, %XMM1, %k2 ++ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. 
*/ ++ kord %k0, %k1, %k1 ++ kmovd %k1, %ecx ++ ++# ifdef USE_AS_WCSCMP ++ /* Only last 2 bits are valid. */ ++ andl $0x3, %ecx ++# else ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %ecx ++# endif ++ ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $8, %edx ++ addl $8, %eax ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_8bytes): ++ /* Less than 8 bytes to check, try 4 byte vector. */ ++ cmpl $(PAGE_SIZE - 4), %eax ++ jg L(cross_page_4bytes) ++ vmovd (%rdi, %rdx), %XMM0 ++ vmovd (%rsi, %rdx), %XMM1 ++ ++ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ ++ VPCMP $4, %XMM0, %XMM1, %k0 ++ VPCMP $0, %XMMZERO, %XMM0, %k1 ++ VPCMP $0, %XMMZERO, %XMM1, %k2 ++ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ kmovd %k1, %ecx ++ ++# ifdef USE_AS_WCSCMP ++ /* Only the last bit is valid. */ ++ andl $0x1, %ecx ++# else ++ /* Only last 4 bits are valid. */ ++ andl $0xf, %ecx ++# endif ++ ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $4, %edx ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_4bytes): ++# endif ++ /* Less than 4 bytes to check, try one byte/dword at a time. */ ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rdx), %eax ++ movl (%rsi, %rdx), %ecx ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %ecx ++# endif ++ testl %eax, %eax ++ jne L(cross_page_loop) ++ subl %ecx, %eax ++ ret ++END (STRCMP) ++#endif +diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c +index 3f433fbc..c5f38510 100644 +--- a/sysdeps/x86_64/multiarch/strcmp.c ++++ b/sysdeps/x86_64/multiarch/strcmp.c +@@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) + return OPTIMIZE (sse2_unaligned); +diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S +new file mode 100644 +index 00000000..cd022509 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strlen-evex.S +@@ -0,0 +1,436 @@ ++/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. 
++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRLEN ++# define STRLEN __strlen_evex ++# endif ++ ++# define VMOVA vmovdqa64 ++ ++# ifdef USE_AS_WCSLEN ++# define VPCMP vpcmpd ++# define VPMINU vpminud ++# define SHIFT_REG r9d ++# else ++# define VPCMP vpcmpb ++# define VPMINU vpminub ++# define SHIFT_REG ecx ++# endif ++ ++# define XMMZERO xmm16 ++# define YMMZERO ymm16 ++# define YMM1 ymm17 ++# define YMM2 ymm18 ++# define YMM3 ymm19 ++# define YMM4 ymm20 ++# define YMM5 ymm21 ++# define YMM6 ymm22 ++ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRLEN) ++# ifdef USE_AS_STRNLEN ++ /* Check for zero length. */ ++ test %RSI_LP, %RSI_LP ++ jz L(zero) ++# ifdef USE_AS_WCSLEN ++ shl $2, %RSI_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %esi, %esi ++# endif ++ mov %RSI_LP, %R8_LP ++# endif ++ movl %edi, %ecx ++ movq %rdi, %rdx ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. Each bit in K0 represents a ++ null byte. */ ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ ++# ifdef USE_AS_STRNLEN ++ jnz L(first_vec_x0_check) ++ /* Adjust length and check the end of data. */ ++ subq $VEC_SIZE, %rsi ++ jbe L(max) ++# else ++ jnz L(first_vec_x0) ++# endif ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. */ ++ addq %rcx, %rsi ++ ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ jmp L(more_4x_vec) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide shift count by 4 since each bit in K0 represent 4 ++ bytes. */ ++ movl %ecx, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++# endif ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ ++ /* Remove the leading bytes. */ ++ sarxl %SHIFT_REG, %eax, %eax ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++# ifdef USE_AS_STRNLEN ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++# endif ++ addq %rdi, %rax ++ addq %rcx, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++# ifdef USE_AS_STRNLEN ++ /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" ++ with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" ++ to void possible addition overflow. */ ++ negq %rcx ++ addq $VEC_SIZE, %rcx ++ ++ /* Check the end of data. 
*/ ++ subq %rcx, %rsi ++ jbe L(max) ++# endif ++ ++ addq $VEC_SIZE, %rdi ++ ++# ifdef USE_AS_STRNLEN ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++L(more_4x_vec): ++ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifdef USE_AS_STRNLEN ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++ /* Align data to 4 * VEC_SIZE. */ ++ movq %rdi, %rcx ++ andl $(4 * VEC_SIZE - 1), %ecx ++ andq $-(4 * VEC_SIZE), %rdi ++ ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. */ ++ addq %rcx, %rsi ++# endif ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ VMOVA (%rdi), %YMM1 ++ VMOVA VEC_SIZE(%rdi), %YMM2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM3 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM4 ++ ++ VPMINU %YMM1, %YMM2, %YMM5 ++ VPMINU %YMM3, %YMM4, %YMM6 ++ ++ VPMINU %YMM5, %YMM6, %YMM5 ++ VPCMP $0, %YMM5, %YMMZERO, %k0 ++ ktestd %k0, %k0 ++ jnz L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifndef USE_AS_STRNLEN ++ jmp L(loop_4x_vec) ++# else ++ subq $(VEC_SIZE * 4), %rsi ++ ja L(loop_4x_vec) ++ ++L(last_4x_vec_or_less): ++ /* Less than 4 * VEC and aligned to VEC_SIZE. */ ++ addl $(VEC_SIZE * 2), %esi ++ jle L(last_2x_vec) ++ ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2_check) ++ subl $VEC_SIZE, %esi ++ jle L(max) ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3_check) ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ addl $(VEC_SIZE * 2), %esi ++ ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0_check) ++ subl $VEC_SIZE, %esi ++ jle L(max) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x0_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x3_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(max): ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4 ++L(first_vec_x0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ VPCMP $0, %YMM1, %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ VPCMP $0, %YMM2, %YMMZERO, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ VPCMP $0, %YMM3, %YMMZERO, %k2 ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ VPCMP $0, %YMM4, %YMMZERO, %k3 ++ kmovd %k3, %eax ++L(first_vec_x3): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ sall $2, %eax ++# endif ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++END (STRLEN) ++#endif +diff --git a/sysdeps/x86_64/multiarch/strncmp-evex.S b/sysdeps/x86_64/multiarch/strncmp-evex.S +new file mode 100644 +index 00000000..a1d53e8c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncmp-evex.S +@@ -0,0 +1,3 @@ ++#define STRCMP __strncmp_evex ++#define USE_AS_STRNCMP 1 ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c +index 686d654f..4c15542f 100644 +--- a/sysdeps/x86_64/multiarch/strncmp.c ++++ b/sysdeps/x86_64/multiarch/strncmp.c +@@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2) + && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) +diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S +new file mode 100644 +index 00000000..722022f3 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strnlen-evex.S +@@ -0,0 +1,4 @@ ++#define STRLEN __strnlen_evex ++#define USE_AS_STRNLEN 1 ++ ++#include "strlen-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S +new file mode 100644 +index 00000000..f920b5a5 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strrchr-evex.S +@@ -0,0 +1,265 @@ ++/* strrchr/wcsrchr optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
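The strrchr kernel that follows keeps two pieces of state while scanning forward: the most recent block that contained a match ("Remember the match and keep searching") and the position of the first null byte, after which any later match bits are masked off. A scalar sketch of that bookkeeping, illustrative only:

#include <stddef.h>

/* Scalar reference for strrchr: remember the latest match, stop at the
   terminator.  The EVEX code keeps the same state per VEC_SIZE block
   (%rsi/%edx) rather than per byte, and masks out matches that fall
   after the NUL inside the final block.  */
static char *
strrchr_ref (const char *s, int c)
{
  const char *last = NULL;
  for (;; ++s)
    {
      if (*s == (char) c)
        last = s;
      if (*s == '\0')
        return (char *) last;
    }
}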
*/ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRRCHR ++# define STRRCHR __strrchr_evex ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++# ifdef USE_AS_WCSRCHR ++# define VPBROADCAST vpbroadcastd ++# define VPCMP vpcmpd ++# define SHIFT_REG r8d ++# else ++# define VPBROADCAST vpbroadcastb ++# define VPCMP vpcmpb ++# define SHIFT_REG ecx ++# endif ++ ++# define XMMZERO xmm16 ++# define YMMZERO ymm16 ++# define YMMMATCH ymm17 ++# define YMM1 ymm18 ++ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRRCHR) ++ movl %edi, %ecx ++ /* Broadcast CHAR to YMMMATCH. */ ++ VPBROADCAST %esi, %YMMMATCH ++ ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ VMOVU (%rdi), %YMM1 ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ ++ addq $VEC_SIZE, %rdi ++ ++ testl %eax, %eax ++ jnz L(first_vec) ++ ++ testl %ecx, %ecx ++ jnz L(return_null) ++ ++ andq $-VEC_SIZE, %rdi ++ xorl %edx, %edx ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(first_vec): ++ /* Check if there is a null byte. */ ++ testl %ecx, %ecx ++ jnz L(char_and_nul_in_first_vec) ++ ++ /* Remember the match and keep searching. */ ++ movl %eax, %edx ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rdi ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifdef USE_AS_WCSRCHR ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. */ ++ movl %ecx, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++# endif ++ ++ VMOVA (%rdi), %YMM1 ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %edx ++ kmovd %k1, %eax ++ ++ shrxl %SHIFT_REG, %edx, %edx ++ shrxl %SHIFT_REG, %eax, %eax ++ addq $VEC_SIZE, %rdi ++ ++ /* Check if there is a CHAR. */ ++ testl %eax, %eax ++ jnz L(found_char) ++ ++ testl %edx, %edx ++ jnz L(return_null) ++ ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(found_char): ++ testl %edx, %edx ++ jnz L(char_and_nul) ++ ++ /* Remember the match and keep searching. */ ++ movl %eax, %edx ++ leaq (%rdi, %rcx), %rsi ++ ++ .p2align 4 ++L(aligned_loop): ++ VMOVA (%rdi), %YMM1 ++ addq $VEC_SIZE, %rdi ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ VMOVA (%rdi), %YMM1 ++ add $VEC_SIZE, %rdi ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ VMOVA (%rdi), %YMM1 ++ addq $VEC_SIZE, %rdi ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ VMOVA (%rdi), %YMM1 ++ addq $VEC_SIZE, %rdi ++ ++ /* Each bit in K0 represents a null byte in YMM1. 
*/ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ orl %eax, %ecx ++ jz L(aligned_loop) ++ ++ .p2align 4 ++L(char_nor_null): ++ /* Find a CHAR or a null byte in a loop. */ ++ testl %eax, %eax ++ jnz L(match) ++L(return_value): ++ testl %edx, %edx ++ jz L(return_null) ++ movl %edx, %eax ++ movq %rsi, %rdi ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(match): ++ /* Find a CHAR. Check if there is a null byte. */ ++ kmovd %k0, %ecx ++ testl %ecx, %ecx ++ jnz L(find_nul) ++ ++ /* Remember the match and keep searching. */ ++ movl %eax, %edx ++ movq %rdi, %rsi ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(find_nul): ++ /* Mask out any matching bits after the null byte. */ ++ movl %ecx, %r8d ++ subl $1, %r8d ++ xorl %ecx, %r8d ++ andl %r8d, %eax ++ testl %eax, %eax ++ /* If there is no CHAR here, return the remembered one. */ ++ jz L(return_value) ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(char_and_nul): ++ /* Find both a CHAR and a null byte. */ ++ addq %rcx, %rdi ++ movl %edx, %ecx ++L(char_and_nul_in_first_vec): ++ /* Mask out any matching bits after the null byte. */ ++ movl %ecx, %r8d ++ subl $1, %r8d ++ xorl %ecx, %r8d ++ andl %r8d, %eax ++ testl %eax, %eax ++ /* Return null pointer if the null byte comes first. */ ++ jz L(return_null) ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(return_null): ++ xorl %eax, %eax ++ ret ++ ++END (STRRCHR) ++#endif +diff --git a/sysdeps/x86_64/multiarch/wcschr-evex.S b/sysdeps/x86_64/multiarch/wcschr-evex.S +new file mode 100644 +index 00000000..7cb8f1e4 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcschr-evex.S +@@ -0,0 +1,3 @@ ++#define STRCHR __wcschr_evex ++#define USE_AS_WCSCHR 1 ++#include "strchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcscmp-evex.S b/sysdeps/x86_64/multiarch/wcscmp-evex.S +new file mode 100644 +index 00000000..42e73e51 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcscmp-evex.S +@@ -0,0 +1,4 @@ ++#define STRCMP __wcscmp_evex ++#define USE_AS_WCSCMP 1 ++ ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcslen-evex.S b/sysdeps/x86_64/multiarch/wcslen-evex.S +new file mode 100644 +index 00000000..bdafa83b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcslen-evex.S +@@ -0,0 +1,4 @@ ++#define STRLEN __wcslen_evex ++#define USE_AS_WCSLEN 1 ++ ++#include "strlen-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcsncmp-evex.S b/sysdeps/x86_64/multiarch/wcsncmp-evex.S +new file mode 100644 +index 00000000..8a8e3107 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsncmp-evex.S +@@ -0,0 +1,5 @@ ++#define STRCMP __wcsncmp_evex ++#define USE_AS_STRNCMP 1 ++#define USE_AS_WCSCMP 1 ++ ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S +new file mode 100644 +index 00000000..24773bb4 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S +@@ -0,0 +1,5 @@ ++#define STRLEN __wcsnlen_evex ++#define USE_AS_WCSLEN 1 ++#define USE_AS_STRNLEN 1 ++ ++#include "strlen-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c +index b3144c93..84254b83 100644 +--- a/sysdeps/x86_64/multiarch/wcsnlen.c ++++ b/sysdeps/x86_64/multiarch/wcsnlen.c +@@ -29,16 +29,24 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4_1); +diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S +new file mode 100644 +index 00000000..c64602f7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S +@@ -0,0 +1,3 @@ ++#define STRRCHR __wcsrchr_evex ++#define USE_AS_WCSRCHR 1 ++#include "strrchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S +new file mode 100644 +index 00000000..06cd0f9f +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S +@@ 
-0,0 +1,4 @@ ++#define MEMCHR __wmemchr_evex ++#define USE_AS_WMEMCHR 1 ++ ++#include "memchr-evex.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-13.patch b/glibc-RHEL-15696-13.patch new file mode 100644 index 0000000..a88a3bc --- /dev/null +++ b/glibc-RHEL-15696-13.patch @@ -0,0 +1,1488 @@ +From 525bc2a32c9710df40371f951217c6ae7a923aee Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 06:36:50 -0800 +Subject: [PATCH] x86-64: Add strcpy family functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-strcpy.h to select the function optimized with 256-bit EVEX +instructions using YMM16-YMM31 registers to avoid RTM abort with usable +AVX512VL and AVX512BW since VZEROUPPER isn't needed at function exit. +--- + sysdeps/x86_64/multiarch/Makefile | 6 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 24 + + sysdeps/x86_64/multiarch/ifunc-strcpy.h | 13 +- + sysdeps/x86_64/multiarch/stpcpy-evex.S | 3 + + sysdeps/x86_64/multiarch/stpncpy-evex.S | 4 + + sysdeps/x86_64/multiarch/strcat-evex.S | 283 ++++++ + sysdeps/x86_64/multiarch/strcpy-evex.S | 1003 ++++++++++++++++++++ + sysdeps/x86_64/multiarch/strncat-evex.S | 3 + + sysdeps/x86_64/multiarch/strncpy-evex.S | 3 + + 9 files changed, 1339 insertions(+), 3 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/stpcpy-evex.S + create mode 100644 sysdeps/x86_64/multiarch/stpncpy-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strcat-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strcpy-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strncat-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strncpy-evex.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 5ce85882..46783cd1 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -43,11 +43,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memchr-evex \ + memrchr-evex \ + rawmemchr-evex \ ++ stpcpy-evex \ ++ stpncpy-evex \ ++ strcat-evex \ + strchr-evex \ + strchrnul-evex \ + strcmp-evex \ ++ strcpy-evex \ + strlen-evex \ ++ strncat-evex \ + strncmp-evex \ ++ strncpy-evex \ + strnlen-evex \ + strrchr-evex + CFLAGS-varshift.c += -msse4 +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index bd7d9f19..082e4da3 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -224,6 +224,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __stpncpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), + __stpncpy_avx2) ++ IFUNC_IMPL_ADD (array, i, stpncpy, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __stpncpy_evex) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, + __stpncpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2)) +@@ -234,6 +238,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __stpcpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), + __stpcpy_avx2) ++ IFUNC_IMPL_ADD (array, i, stpcpy, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __stpcpy_evex) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2)) + +@@ -268,6 +276,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strcat, + IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2), + __strcat_avx2) ++ 
IFUNC_IMPL_ADD (array, i, strcat, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcat_evex) + IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3), + __strcat_ssse3) + IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) +@@ -330,6 +342,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strcpy, + IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2), + __strcpy_avx2) ++ IFUNC_IMPL_ADD (array, i, strcpy, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcpy_evex) + IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), + __strcpy_ssse3) + IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) +@@ -373,6 +389,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strncat, + IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2), + __strncat_avx2) ++ IFUNC_IMPL_ADD (array, i, strncat, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncat_evex) + IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3), + __strncat_ssse3) + IFUNC_IMPL_ADD (array, i, strncat, 1, +@@ -383,6 +403,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strncpy, + IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2), + __strncpy_avx2) ++ IFUNC_IMPL_ADD (array, i, strncpy, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncpy_evex) + IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), + __strncpy_ssse3) + IFUNC_IMPL_ADD (array, i, strncpy, 1, +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h +index 100dca5c..deae6348 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h +@@ -25,16 +25,23 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) + return OPTIMIZE (sse2_unaligned); +diff --git a/sysdeps/x86_64/multiarch/stpcpy-evex.S b/sysdeps/x86_64/multiarch/stpcpy-evex.S +new file mode 100644 +index 00000000..7c6f26cd +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/stpcpy-evex.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STPCPY ++#define STRCPY __stpcpy_evex ++#include "strcpy-evex.S" +diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S +new file mode 100644 +index 00000000..1570014d +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S +@@ -0,0 +1,4 @@ ++#define USE_AS_STPCPY ++#define USE_AS_STRNCPY ++#define STRCPY __stpncpy_evex ++#include "strcpy-evex.S" 
+diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S +new file mode 100644 +index 00000000..97c3d85b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcat-evex.S +@@ -0,0 +1,283 @@ ++/* strcat with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRCAT ++# define STRCAT __strcat_evex ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++/* zero register */ ++# define XMMZERO xmm16 ++# define YMMZERO ymm16 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++ ++# define USE_AS_STRCAT ++ ++/* Number of bytes in a vector register */ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRCAT) ++ mov %rdi, %r9 ++# ifdef USE_AS_STRNCAT ++ mov %rdx, %r8 ++# endif ++ ++ xor %eax, %eax ++ mov %edi, %ecx ++ and $((VEC_SIZE * 4) - 1), %ecx ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ cmp $(VEC_SIZE * 3), %ecx ++ ja L(fourth_vector_boundary) ++ vpcmpb $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_first_vector) ++ mov %rdi, %rax ++ and $-VEC_SIZE, %rax ++ jmp L(align_vec_size_start) ++L(fourth_vector_boundary): ++ mov %rdi, %rax ++ and $-VEC_SIZE, %rax ++ vpcmpb $0, (%rax), %YMMZERO, %k0 ++ mov $-1, %r10d ++ sub %rax, %rcx ++ shl %cl, %r10d ++ kmovd %k0, %edx ++ and %r10d, %edx ++ jnz L(exit) ++ ++L(align_vec_size_start): ++ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 ++ add $(VEC_SIZE * 4), %rax ++ kmovd %k4, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 ++ kmovd %k4, %edx ++ add $(VEC_SIZE * 4), %rax ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz 
L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 ++ add $(VEC_SIZE * 4), %rax ++ kmovd %k4, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 ++ add $(VEC_SIZE * 5), %rax ++ kmovd %k4, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 ++ add $VEC_SIZE, %rax ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 ++ add $VEC_SIZE, %rax ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1 ++ add $VEC_SIZE, %rax ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ add $VEC_SIZE, %rax ++ ++ .p2align 4 ++L(align_four_vec_loop): ++ VMOVA (%rax), %YMM0 ++ VMOVA (VEC_SIZE * 2)(%rax), %YMM1 ++ vpminub VEC_SIZE(%rax), %YMM0, %YMM0 ++ vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1 ++ vpminub %YMM0, %YMM1, %YMM0 ++ /* If K0 != 0, there is a null byte. */ ++ vpcmpb $0, %YMM0, %YMMZERO, %k0 ++ add $(VEC_SIZE * 4), %rax ++ ktestd %k0, %k0 ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0 ++ sub $(VEC_SIZE * 5), %rax ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 4), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit): ++ sub %rdi, %rax ++L(exit_null_on_first_vector): ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_second_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $VEC_SIZE, %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_third_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 2), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_fourth_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 3), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_fifth_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 4), %rax ++ ++ .p2align 4 ++L(StartStrcpyPart): ++ lea (%r9, %rax), %rdi ++ mov %rsi, %rcx ++ mov %r9, %rax /* save result */ ++ ++# ifdef USE_AS_STRNCAT ++ test %r8, %r8 ++ jz L(ExitZero) ++# define USE_AS_STRNCPY ++# endif ++ ++# include "strcpy-evex.S" ++#endif +diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S 
b/sysdeps/x86_64/multiarch/strcpy-evex.S +new file mode 100644 +index 00000000..a343a1a6 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcpy-evex.S +@@ -0,0 +1,1003 @@ ++/* strcpy with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++# ifndef USE_AS_STRCAT ++# include ++ ++# ifndef STRCPY ++# define STRCPY __strcpy_evex ++# endif ++ ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++/* Number of bytes in a vector register */ ++# ifndef VEC_SIZE ++# define VEC_SIZE 32 ++# endif ++ ++# define XMM2 xmm18 ++# define XMM3 xmm19 ++ ++# define YMM2 ymm18 ++# define YMM3 ymm19 ++# define YMM4 ymm20 ++# define YMM5 ymm21 ++# define YMM6 ymm22 ++# define YMM7 ymm23 ++ ++# ifndef USE_AS_STRCAT ++ ++/* zero register */ ++# define XMMZERO xmm16 ++# define YMMZERO ymm16 ++# define YMM1 ymm17 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRCPY) ++# ifdef USE_AS_STRNCPY ++ mov %RDX_LP, %R8_LP ++ test %R8_LP, %R8_LP ++ jz L(ExitZero) ++# endif ++ mov %rsi, %rcx ++# ifndef USE_AS_STPCPY ++ mov %rdi, %rax /* save result */ ++# endif ++ ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++# endif ++ ++ and $((VEC_SIZE * 4) - 1), %ecx ++ cmp $(VEC_SIZE * 2), %ecx ++ jbe L(SourceStringAlignmentLessTwoVecSize) ++ ++ and $-VEC_SIZE, %rsi ++ and $(VEC_SIZE - 1), %ecx ++ ++ vpcmpb $0, (%rsi), %YMMZERO, %k0 ++ kmovd %k0, %edx ++ shr %cl, %rdx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ mov $VEC_SIZE, %r10 ++ sub %rcx, %r10 ++ cmp %r10, %r8 ++# else ++ mov $(VEC_SIZE + 1), %r10 ++ sub %rcx, %r10 ++ cmp %r10, %r8 ++# endif ++ jbe L(CopyVecSizeTailCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyVecSizeTail) ++ ++ vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ ++# ifdef USE_AS_STRNCPY ++ add $VEC_SIZE, %r10 ++ cmp %r10, %r8 ++ jbe L(CopyTwoVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyTwoVecSize) ++ ++ VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ ++ VMOVU %YMM2, (%rdi) ++ ++/* If source address alignment != destination address alignment */ ++ .p2align 4 ++L(UnalignVecSizeBoth): ++ sub %rcx, %rdi ++# ifdef USE_AS_STRNCPY ++ add %rcx, %r8 ++ sbb %rcx, %rcx ++ or %rcx, %r8 ++# endif ++ mov $VEC_SIZE, %rcx ++ VMOVA (%rsi, %rcx), %YMM2 ++ VMOVU %YMM2, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 ++ vpcmpb $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 3), %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM2, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 ++ vpcmpb $0, %YMM3, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef 
USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec3) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM3, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 ++ vpcmpb $0, %YMM4, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec4) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM4, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 ++ vpcmpb $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM2, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 ++ vpcmpb $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 ++ VMOVU %YMM2, (%rdi, %rcx) ++ vpcmpb $0, %YMM3, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec3) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM3, (%rdi, %rcx) ++ mov %rsi, %rdx ++ lea VEC_SIZE(%rsi, %rcx), %rsi ++ and $-(VEC_SIZE * 4), %rsi ++ sub %rsi, %rdx ++ sub %rdx, %rdi ++# ifdef USE_AS_STRNCPY ++ lea (VEC_SIZE * 8)(%r8, %rdx), %r8 ++# endif ++L(UnalignedFourVecSizeLoop): ++ VMOVA (%rsi), %YMM4 ++ VMOVA VEC_SIZE(%rsi), %YMM5 ++ VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 ++ VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 ++ vpminub %YMM5, %YMM4, %YMM2 ++ vpminub %YMM7, %YMM6, %YMM3 ++ vpminub %YMM2, %YMM3, %YMM2 ++ /* If K7 != 0, there is a null byte. */ ++ vpcmpb $0, %YMM2, %YMMZERO, %k7 ++ kmovd %k7, %edx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 4), %r8 ++ jbe L(UnalignedLeaveCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(UnalignedFourVecSizeLeave) ++ ++L(UnalignedFourVecSizeLoop_start): ++ add $(VEC_SIZE * 4), %rdi ++ add $(VEC_SIZE * 4), %rsi ++ VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) ++ VMOVA (%rsi), %YMM4 ++ VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) ++ VMOVA VEC_SIZE(%rsi), %YMM5 ++ vpminub %YMM5, %YMM4, %YMM2 ++ VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) ++ VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 ++ VMOVU %YMM7, -VEC_SIZE(%rdi) ++ VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 ++ vpminub %YMM7, %YMM6, %YMM3 ++ vpminub %YMM2, %YMM3, %YMM2 ++ /* If K7 != 0, there is a null byte. 
*/ ++ vpcmpb $0, %YMM2, %YMMZERO, %k7 ++ kmovd %k7, %edx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 4), %r8 ++ jbe L(UnalignedLeaveCase2OrCase3) ++# endif ++ test %edx, %edx ++ jz L(UnalignedFourVecSizeLoop_start) ++ ++L(UnalignedFourVecSizeLeave): ++ vpcmpb $0, %YMM4, %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(CopyVecSizeUnaligned_0) ++ ++ vpcmpb $0, %YMM5, %YMMZERO, %k2 ++ kmovd %k2, %ecx ++ test %ecx, %ecx ++ jnz L(CopyVecSizeUnaligned_16) ++ ++ vpcmpb $0, %YMM6, %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(CopyVecSizeUnaligned_32) ++ ++ vpcmpb $0, %YMM7, %YMMZERO, %k4 ++ kmovd %k4, %ecx ++ bsf %ecx, %edx ++ VMOVU %YMM4, (%rdi) ++ VMOVU %YMM5, VEC_SIZE(%rdi) ++ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 3)(%rdi, %rdx), %rax ++# endif ++ VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) ++ add $(VEC_SIZE - 1), %r8 ++ sub %rdx, %r8 ++ lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $(VEC_SIZE * 3), %rsi ++ add $(VEC_SIZE * 3), %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++/* If source address alignment == destination address alignment */ ++ ++L(SourceStringAlignmentLessTwoVecSize): ++ VMOVU (%rsi), %YMM3 ++ VMOVU VEC_SIZE(%rsi), %YMM2 ++ vpcmpb $0, %YMM3, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ cmp $VEC_SIZE, %r8 ++# else ++ cmp $(VEC_SIZE + 1), %r8 ++# endif ++ jbe L(CopyVecSizeTail1Case2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyVecSizeTail1) ++ ++ VMOVU %YMM3, (%rdi) ++ vpcmpb $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ cmp $(VEC_SIZE * 2), %r8 ++# else ++ cmp $((VEC_SIZE * 2) + 1), %r8 ++# endif ++ jbe L(CopyTwoVecSize1Case2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyTwoVecSize1) ++ ++ and $-VEC_SIZE, %rsi ++ and $(VEC_SIZE - 1), %ecx ++ jmp L(UnalignVecSizeBoth) ++ ++/*------End of main part with loops---------------------*/ ++ ++/* Case1 */ ++ ++# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) ++ .p2align 4 ++L(CopyVecSize): ++ add %rcx, %rdi ++# endif ++L(CopyVecSizeTail): ++ add %rcx, %rsi ++L(CopyVecSizeTail1): ++ bsf %edx, %edx ++L(CopyVecSizeExit): ++ cmp $32, %edx ++ jae L(Exit32_63) ++ cmp $16, %edx ++ jae L(Exit16_31) ++ cmp $8, %edx ++ jae L(Exit8_15) ++ cmp $4, %edx ++ jae L(Exit4_7) ++ cmp $3, %edx ++ je L(Exit3) ++ cmp $1, %edx ++ ja L(Exit2) ++ je L(Exit1) ++ movb $0, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea (%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $1, %r8 ++ lea 1(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(CopyTwoVecSize1): ++ add $VEC_SIZE, %rsi ++ add $VEC_SIZE, %rdi ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $VEC_SIZE, %r8 ++# endif ++ jmp L(CopyVecSizeTail1) ++ ++ .p2align 4 ++L(CopyTwoVecSize): ++ bsf %edx, %edx ++ add %rcx, %rsi ++ add $VEC_SIZE, %edx ++ sub %ecx, %edx ++ jmp L(CopyVecSizeExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_0): ++ bsf %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++ VMOVU %YMM4, (%rdi) ++ add $((VEC_SIZE * 4) - 1), %r8 ++ sub %rdx, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ jmp L(CopyVecSizeExit) ++# endif ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_16): ++ bsf %ecx, %edx ++ VMOVU %YMM4, 
(%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea VEC_SIZE(%rdi, %rdx), %rax ++# endif ++ VMOVU %YMM5, VEC_SIZE(%rdi) ++ add $((VEC_SIZE * 3) - 1), %r8 ++ sub %rdx, %r8 ++ lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $VEC_SIZE, %rsi ++ add $VEC_SIZE, %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_32): ++ bsf %edx, %edx ++ VMOVU %YMM4, (%rdi) ++ VMOVU %YMM5, VEC_SIZE(%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 2)(%rdi, %rdx), %rax ++# endif ++ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) ++ add $((VEC_SIZE * 2) - 1), %r8 ++ sub %rdx, %r8 ++ lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $(VEC_SIZE * 2), %rsi ++ add $(VEC_SIZE * 2), %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++# ifdef USE_AS_STRNCPY ++# ifndef USE_AS_STRCAT ++ .p2align 4 ++L(CopyVecSizeUnalignedVec6): ++ VMOVU %YMM6, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec5): ++ VMOVU %YMM5, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec4): ++ VMOVU %YMM4, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec3): ++ VMOVU %YMM3, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++# endif ++ ++/* Case2 */ ++ ++ .p2align 4 ++L(CopyVecSizeCase2): ++ add $VEC_SIZE, %r8 ++ add %rcx, %rdi ++ add %rcx, %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSizeCase2): ++ add %rcx, %rsi ++ bsf %edx, %edx ++ add $VEC_SIZE, %edx ++ sub %ecx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++L(CopyVecSizeTailCase2): ++ add %rcx, %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++L(CopyVecSizeTail1Case2): ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++/* Case2 or Case3, Case3 */ ++ ++ .p2align 4 ++L(CopyVecSizeCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyVecSizeCase2) ++L(CopyVecSizeCase3): ++ add $VEC_SIZE, %r8 ++ add %rcx, %rdi ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSizeCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyTwoVecSizeCase2) ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyVecSizeTailCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyVecSizeTailCase2) ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSize1Case2OrCase3): ++ add $VEC_SIZE, %rdi ++ add $VEC_SIZE, %rsi ++ sub $VEC_SIZE, %r8 ++L(CopyVecSizeTail1Case2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyVecSizeTail1Case2) ++ jmp L(StrncpyExit) ++# endif ++ ++/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ ++ ++ .p2align 4 ++L(Exit1): ++ movzwl (%rsi), %edx ++ mov %dx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 1(%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $2, %r8 ++ lea 2(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit2): ++ movzwl (%rsi), %ecx ++ mov %cx, (%rdi) ++ movb $0, 2(%rdi) ++# ifdef USE_AS_STPCPY ++ lea 2(%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $3, %r8 ++ lea 3(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit3): ++ mov (%rsi), %edx ++ mov %edx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 3(%rdi), %rax ++# endif ++# if defined 
USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $4, %r8 ++ lea 4(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit4_7): ++ mov (%rsi), %ecx ++ mov %ecx, (%rdi) ++ mov -3(%rsi, %rdx), %ecx ++ mov %ecx, -3(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit8_15): ++ mov (%rsi), %rcx ++ mov -7(%rsi, %rdx), %r9 ++ mov %rcx, (%rdi) ++ mov %r9, -7(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit16_31): ++ VMOVU (%rsi), %XMM2 ++ VMOVU -15(%rsi, %rdx), %XMM3 ++ VMOVU %XMM2, (%rdi) ++ VMOVU %XMM3, -15(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit32_63): ++ VMOVU (%rsi), %YMM2 ++ VMOVU -31(%rsi, %rdx), %YMM3 ++ VMOVU %YMM2, (%rdi) ++ VMOVU %YMM3, -31(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++# ifdef USE_AS_STRNCPY ++ ++ .p2align 4 ++L(StrncpyExit1): ++ movzbl (%rsi), %edx ++ mov %dl, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 1(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 1(%rdi) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit2): ++ movzwl (%rsi), %edx ++ mov %dx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 2(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 2(%rdi) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit3_4): ++ movzwl (%rsi), %ecx ++ movzwl -2(%rsi, %r8), %edx ++ mov %cx, (%rdi) ++ mov %dx, -2(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit5_8): ++ mov (%rsi), %ecx ++ mov -4(%rsi, %r8), %edx ++ mov %ecx, (%rdi) ++ mov %edx, -4(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit9_16): ++ mov (%rsi), %rcx ++ mov -8(%rsi, %r8), %rdx ++ mov %rcx, (%rdi) ++ mov %rdx, -8(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit17_32): ++ VMOVU (%rsi), %XMM2 ++ VMOVU -16(%rsi, %r8), %XMM3 ++ VMOVU %XMM2, (%rdi) ++ VMOVU %XMM3, -16(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit33_64): ++ /* 0/32, 31/16 */ ++ VMOVU (%rsi), %YMM2 ++ VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 ++ VMOVU %YMM2, (%rdi) ++ VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit65): ++ /* 0/32, 32/32, 64/1 */ ++ VMOVU (%rsi), %YMM2 ++ VMOVU 32(%rsi), %YMM3 ++ mov 64(%rsi), %cl ++ VMOVU %YMM2, (%rdi) ++ VMOVU %YMM3, 32(%rdi) ++ mov %cl, 64(%rdi) ++# ifdef 
USE_AS_STPCPY ++ lea 65(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 65(%rdi) ++# endif ++ ret ++ ++# ifndef USE_AS_STRCAT ++ ++ .p2align 4 ++L(Fill1): ++ mov %dl, (%rdi) ++ ret ++ ++ .p2align 4 ++L(Fill2): ++ mov %dx, (%rdi) ++ ret ++ ++ .p2align 4 ++L(Fill3_4): ++ mov %dx, (%rdi) ++ mov %dx, -2(%rdi, %r8) ++ ret ++ ++ .p2align 4 ++L(Fill5_8): ++ mov %edx, (%rdi) ++ mov %edx, -4(%rdi, %r8) ++ ret ++ ++ .p2align 4 ++L(Fill9_16): ++ mov %rdx, (%rdi) ++ mov %rdx, -8(%rdi, %r8) ++ ret ++ ++ .p2align 4 ++L(Fill17_32): ++ VMOVU %XMMZERO, (%rdi) ++ VMOVU %XMMZERO, -16(%rdi, %r8) ++ ret ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec2): ++ VMOVU %YMM2, (%rdi, %rcx) ++ ++ .p2align 4 ++L(CopyVecSizeVecExit): ++ bsf %edx, %edx ++ add $(VEC_SIZE - 1), %r8 ++ add %rcx, %rdi ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++ sub %rdx, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ ++ .p2align 4 ++L(StrncpyFillTailWithZero): ++ xor %edx, %edx ++ sub $VEC_SIZE, %r8 ++ jbe L(StrncpyFillExit) ++ ++ VMOVU %YMMZERO, (%rdi) ++ add $VEC_SIZE, %rdi ++ ++ mov %rdi, %rsi ++ and $(VEC_SIZE - 1), %esi ++ sub %rsi, %rdi ++ add %rsi, %r8 ++ sub $(VEC_SIZE * 4), %r8 ++ jb L(StrncpyFillLessFourVecSize) ++ ++L(StrncpyFillLoopVmovdqa): ++ VMOVA %YMMZERO, (%rdi) ++ VMOVA %YMMZERO, VEC_SIZE(%rdi) ++ VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi) ++ VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) ++ add $(VEC_SIZE * 4), %rdi ++ sub $(VEC_SIZE * 4), %r8 ++ jae L(StrncpyFillLoopVmovdqa) ++ ++L(StrncpyFillLessFourVecSize): ++ add $(VEC_SIZE * 2), %r8 ++ jl L(StrncpyFillLessTwoVecSize) ++ VMOVA %YMMZERO, (%rdi) ++ VMOVA %YMMZERO, VEC_SIZE(%rdi) ++ add $(VEC_SIZE * 2), %rdi ++ sub $VEC_SIZE, %r8 ++ jl L(StrncpyFillExit) ++ VMOVA %YMMZERO, (%rdi) ++ add $VEC_SIZE, %rdi ++ jmp L(Fill) ++ ++ .p2align 4 ++L(StrncpyFillLessTwoVecSize): ++ add $VEC_SIZE, %r8 ++ jl L(StrncpyFillExit) ++ VMOVA %YMMZERO, (%rdi) ++ add $VEC_SIZE, %rdi ++ jmp L(Fill) ++ ++ .p2align 4 ++L(StrncpyFillExit): ++ add $VEC_SIZE, %r8 ++L(Fill): ++ cmp $17, %r8d ++ jae L(Fill17_32) ++ cmp $9, %r8d ++ jae L(Fill9_16) ++ cmp $5, %r8d ++ jae L(Fill5_8) ++ cmp $3, %r8d ++ jae L(Fill3_4) ++ cmp $1, %r8d ++ ja L(Fill2) ++ je L(Fill1) ++ ret ++ ++/* end of ifndef USE_AS_STRCAT */ ++# endif ++ ++ .p2align 4 ++L(UnalignedLeaveCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(UnalignedFourVecSizeLeaveCase2) ++L(UnalignedFourVecSizeLeaveCase3): ++ lea (VEC_SIZE * 4)(%r8), %rcx ++ and $-VEC_SIZE, %rcx ++ add $(VEC_SIZE * 3), %r8 ++ jl L(CopyVecSizeCase3) ++ VMOVU %YMM4, (%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ VMOVU %YMM5, VEC_SIZE(%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 4)(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (VEC_SIZE * 4)(%rdi) ++# endif ++ ret ++ ++ .p2align 4 ++L(UnalignedFourVecSizeLeaveCase2): ++ xor %ecx, %ecx ++ vpcmpb $0, %YMM4, %YMMZERO, %k1 ++ kmovd %k1, %edx ++ add $(VEC_SIZE * 3), %r8 ++ jle L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec4) ++# else ++ jnz L(CopyVecSize) ++# endif ++ vpcmpb $0, %YMM5, %YMMZERO, %k2 ++ kmovd %k2, %edx ++ VMOVU %YMM4, (%rdi) ++ add $VEC_SIZE, %rcx ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec5) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vpcmpb $0, %YMM6, %YMMZERO, %k3 ++ kmovd %k3, %edx ++ 
VMOVU %YMM5, VEC_SIZE(%rdi) ++ add $VEC_SIZE, %rcx ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec6) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vpcmpb $0, %YMM7, %YMMZERO, %k4 ++ kmovd %k4, %edx ++ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) ++ lea VEC_SIZE(%rdi, %rcx), %rdi ++ lea VEC_SIZE(%rsi, %rcx), %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++L(StrncpyExit): ++ cmp $65, %r8d ++ je L(StrncpyExit65) ++ cmp $33, %r8d ++ jae L(StrncpyExit33_64) ++ cmp $17, %r8d ++ jae L(StrncpyExit17_32) ++ cmp $9, %r8d ++ jae L(StrncpyExit9_16) ++ cmp $5, %r8d ++ jae L(StrncpyExit5_8) ++ cmp $3, %r8d ++ jae L(StrncpyExit3_4) ++ cmp $1, %r8d ++ ja L(StrncpyExit2) ++ je L(StrncpyExit1) ++# ifdef USE_AS_STPCPY ++ mov %rdi, %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi) ++# endif ++ ret ++ ++ .p2align 4 ++L(ExitZero): ++# ifndef USE_AS_STRCAT ++ mov %rdi, %rax ++# endif ++ ret ++ ++# endif ++ ++# ifndef USE_AS_STRCAT ++END (STRCPY) ++# else ++END (STRCAT) ++# endif ++#endif +diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S +new file mode 100644 +index 00000000..8884f023 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncat-evex.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STRNCAT ++#define STRCAT __strncat_evex ++#include "strcat-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S +new file mode 100644 +index 00000000..40e391f0 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncpy-evex.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STRNCPY ++#define STRCPY __strncpy_evex ++#include "strcpy-evex.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-14.patch b/glibc-RHEL-15696-14.patch new file mode 100644 index 0000000..84a4593 --- /dev/null +++ b/glibc-RHEL-15696-14.patch @@ -0,0 +1,242 @@ +From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 06:46:08 -0800 +Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memmove.h to select the function optimized with 256-bit EVEX +instructions using YMM16-YMM31 registers to avoid RTM abort with usable +AVX512VL since VZEROUPPER isn't needed at function exit. 
+--- + sysdeps/x86_64/multiarch/Makefile | 1 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 36 +++++++++++++++++++ + sysdeps/x86_64/multiarch/ifunc-memmove.h | 21 +++++++++-- + .../multiarch/memmove-evex-unaligned-erms.S | 33 +++++++++++++++++ + .../multiarch/memmove-vec-unaligned-erms.S | 24 ++++++++----- + 5 files changed, 104 insertions(+), 11 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 46783cd1..4563fc56 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memset-avx2-unaligned-erms \ + memset-avx512-unaligned-erms \ + memchr-evex \ ++ memmove-evex-unaligned-erms \ + memrchr-evex \ + rawmemchr-evex \ + stpcpy-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 082e4da3..6bd3abfc 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (AVX), + __memmove_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memmove_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memmove_chk_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, __memmove_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memmove_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (SSSE3), + __memmove_chk_ssse3_back) +@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memmove, + CPU_FEATURE_USABLE (AVX), + __memmove_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memmove, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memmove_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, memmove, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memmove_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, + CPU_FEATURE_USABLE (AVX512F), + __memmove_avx512_no_vzeroupper) +@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (AVX), + __memcpy_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memcpy_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memcpy_chk_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, __memcpy_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memcpy_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (SSSE3), + __memcpy_chk_ssse3_back) +@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memcpy, + CPU_FEATURE_USABLE (AVX), + __memcpy_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memcpy, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memcpy_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, memcpy, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memcpy_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), + __memcpy_ssse3_back) + IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), +@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (AVX), + __mempcpy_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __mempcpy_chk_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, 
__mempcpy_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __mempcpy_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (SSSE3), + __mempcpy_chk_ssse3_back) +@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, mempcpy, + CPU_FEATURE_USABLE (AVX), + __mempcpy_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, mempcpy, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __mempcpy_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, mempcpy, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __mempcpy_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), + __mempcpy_ssse3_back) + IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h +index 5e5f0299..6f8bce5f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h +@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) + attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) +@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void) + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE (avx_unaligned_erms); ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (evex_unaligned_erms); ++ ++ return OPTIMIZE (evex_unaligned); ++ } ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx_unaligned_erms); + +- return OPTIMIZE (avx_unaligned); ++ return OPTIMIZE (avx_unaligned); ++ } + } + + if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3) +diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S +new file mode 100644 +index 00000000..0cbce8f9 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S +@@ -0,0 +1,33 @@ ++#if IS_IN (libc) ++# define VEC_SIZE 32 ++# define XMM0 xmm16 ++# define XMM1 xmm17 ++# define YMM0 ymm16 ++# define YMM1 ymm17 ++# define VEC0 ymm16 ++# define VEC1 ymm17 ++# define VEC2 ymm18 ++# define VEC3 ymm19 ++# define VEC4 ymm20 ++# define VEC5 ymm21 ++# define VEC6 ymm22 ++# define VEC7 ymm23 ++# define VEC8 ymm24 ++# define VEC9 ymm25 ++# define VEC10 ymm26 ++# define VEC11 ymm27 ++# define VEC12 ymm28 ++# define VEC13 ymm29 ++# define VEC14 ymm30 ++# define VEC15 ymm31 ++# define VEC(i) VEC##i ++# define VMOVNT vmovntdq ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++# define VZEROUPPER ++ ++# define SECTION(p) p##.evex ++# define MEMMOVE_SYMBOL(p,s) p##_evex_##s ++ ++# include "memmove-vec-unaligned-erms.S" ++#endif +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 274aa1c7..08e21692 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -48,6 +48,14 
@@ + # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) + #endif + ++#ifndef XMM0 ++# define XMM0 xmm0 ++#endif ++ ++#ifndef YMM0 ++# define YMM0 ymm0 ++#endif ++ + #ifndef VZEROUPPER + # if VEC_SIZE > 16 + # define VZEROUPPER vzeroupper +@@ -277,20 +285,20 @@ L(less_vec): + #if VEC_SIZE > 32 + L(between_32_63): + /* From 32 to 63. No branch when size == 32. */ +- vmovdqu (%rsi), %ymm0 +- vmovdqu -32(%rsi,%rdx), %ymm1 +- vmovdqu %ymm0, (%rdi) +- vmovdqu %ymm1, -32(%rdi,%rdx) ++ VMOVU (%rsi), %YMM0 ++ VMOVU -32(%rsi,%rdx), %YMM1 ++ VMOVU %YMM0, (%rdi) ++ VMOVU %YMM1, -32(%rdi,%rdx) + VZEROUPPER + ret + #endif + #if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ + L(between_16_31): +- vmovdqu (%rsi), %xmm0 +- vmovdqu -16(%rsi,%rdx), %xmm1 +- vmovdqu %xmm0, (%rdi) +- vmovdqu %xmm1, -16(%rdi,%rdx) ++ VMOVU (%rsi), %XMM0 ++ VMOVU -16(%rsi,%rdx), %XMM1 ++ VMOVU %XMM0, (%rdi) ++ VMOVU %XMM1, -16(%rdi,%rdx) + ret + #endif + L(between_8_15): +-- +GitLab + diff --git a/glibc-RHEL-15696-15.patch b/glibc-RHEL-15696-15.patch new file mode 100644 index 0000000..72cd8cf --- /dev/null +++ b/glibc-RHEL-15696-15.patch @@ -0,0 +1,254 @@ +From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 07:15:03 -0800 +Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized +with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM +abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at +function exit. +--- + sysdeps/x86_64/multiarch/Makefile | 1 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 22 +++++++++++++++++ + sysdeps/x86_64/multiarch/ifunc-memset.h | 24 +++++++++++++++---- + sysdeps/x86_64/multiarch/ifunc-wmemset.h | 13 ++++++---- + .../multiarch/memset-evex-unaligned-erms.S | 24 +++++++++++++++++++ + .../multiarch/memset-vec-unaligned-erms.S | 20 +++++++++++----- + 6 files changed, 90 insertions(+), 14 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 4563fc56..1cc0a10e 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memchr-evex \ + memmove-evex-unaligned-erms \ + memrchr-evex \ ++ memset-evex-unaligned-erms \ + rawmemchr-evex \ + stpcpy-evex \ + stpncpy-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 6bd3abfc..7cf83485 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX2), + __memset_chk_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memset_chk, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memset_chk_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, __memset_chk, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memset_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX512F), + __memset_chk_avx512_unaligned_erms) +@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memset, + 
CPU_FEATURE_USABLE (AVX2), + __memset_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memset, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memset_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, memset, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memset_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + CPU_FEATURE_USABLE (AVX512F), + __memset_avx512_unaligned_erms) +@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wmemset, + CPU_FEATURE_USABLE (AVX2), + __wmemset_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, wmemset, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __wmemset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + CPU_FEATURE_USABLE (AVX512F), + __wmemset_avx512_unaligned)) +@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + CPU_FEATURE_USABLE (AVX2), + __wmemset_chk_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, __wmemset_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __wmemset_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + CPU_FEATURE_USABLE (AVX512F), + __wmemset_chk_avx512_unaligned)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 708bd72e..6f31f4dc 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h +@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms) + attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) +@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void) + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE (avx2_unaligned_erms); +- else +- return OPTIMIZE (avx2_unaligned); ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (evex_unaligned_erms); ++ ++ return OPTIMIZE (evex_unaligned); ++ } ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx2_unaligned_erms); ++ ++ return OPTIMIZE (avx2_unaligned); ++ } + } + + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +index eb242210..9290c4bf 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +@@ -20,6 +20,7 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; + + static inline void * +@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- 
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) +- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx512_unaligned); +- else ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) ++ return OPTIMIZE (evex_unaligned); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2_unaligned); + } + +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +new file mode 100644 +index 00000000..ae0a4d6e +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -0,0 +1,24 @@ ++#if IS_IN (libc) ++# define VEC_SIZE 32 ++# define XMM0 xmm16 ++# define YMM0 ymm16 ++# define VEC0 ymm16 ++# define VEC(i) VEC##i ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++# define VZEROUPPER ++ ++# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++ movq r, %rax; \ ++ vpbroadcastb d, %VEC0 ++ ++# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++ movq r, %rax; \ ++ vpbroadcastd d, %VEC0 ++ ++# define SECTION(p) p##.evex ++# define MEMSET_SYMBOL(p,s) p##_evex_##s ++# define WMEMSET_SYMBOL(p,s) p##_evex_##s ++ ++# include "memset-vec-unaligned-erms.S" ++#endif +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 9a0fd818..71e91a8f 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -34,6 +34,14 @@ + # define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s) + #endif + ++#ifndef XMM0 ++# define XMM0 xmm0 ++#endif ++ ++#ifndef YMM0 ++# define YMM0 ymm0 ++#endif ++ + #ifndef VZEROUPPER + # if VEC_SIZE > 16 + # define VZEROUPPER vzeroupper +@@ -67,7 +75,7 @@ + ENTRY (__bzero) + mov %RDI_LP, %RAX_LP /* Set return value. */ + mov %RSI_LP, %RDX_LP /* Set n. */ +- pxor %xmm0, %xmm0 ++ pxor %XMM0, %XMM0 + jmp L(entry_from_bzero) + END (__bzero) + weak_alias (__bzero, bzero) +@@ -223,7 +231,7 @@ L(less_vec): + cmpb $16, %dl + jae L(between_16_31) + # endif +- MOVQ %xmm0, %rcx ++ MOVQ %XMM0, %rcx + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl +@@ -238,16 +246,16 @@ L(less_vec): + # if VEC_SIZE > 32 + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- vmovdqu %ymm0, -32(%rdi,%rdx) +- vmovdqu %ymm0, (%rdi) ++ VMOVU %YMM0, -32(%rdi,%rdx) ++ VMOVU %YMM0, (%rdi) + VZEROUPPER + ret + # endif + # if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ + L(between_16_31): +- vmovdqu %xmm0, -16(%rdi,%rdx) +- vmovdqu %xmm0, (%rdi) ++ VMOVU %XMM0, -16(%rdi,%rdx) ++ VMOVU %XMM0, (%rdi) + VZEROUPPER + ret + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-16.patch b/glibc-RHEL-15696-16.patch new file mode 100644 index 0000000..b3f443d --- /dev/null +++ b/glibc-RHEL-15696-16.patch @@ -0,0 +1,561 @@ +From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Fri, 5 Mar 2021 07:20:28 -0800 +Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX +instructions using YMM16-YMM31 registers to avoid RTM abort with usable +AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function +exit. +--- + sysdeps/x86_64/multiarch/Makefile | 4 +- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 + + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 13 +- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 440 ++++++++++++++++++ + sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S | 4 + + 5 files changed, 467 insertions(+), 4 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S + create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 1cc0a10e..9d79b138 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memset-avx2-unaligned-erms \ + memset-avx512-unaligned-erms \ + memchr-evex \ ++ memcmp-evex-movbe \ + memmove-evex-unaligned-erms \ + memrchr-evex \ + memset-evex-unaligned-erms \ +@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcsncmp-evex \ + wcsnlen-evex \ + wcsrchr-evex \ +- wmemchr-evex ++ wmemchr-evex \ ++ wmemcmp-evex-movbe + endif + + ifeq ($(subdir),debug) +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 7cf83485..c8da910e 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_avx2_movbe) ++ IFUNC_IMPL_ADD (array, i, memcmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (MOVBE)), ++ __memcmp_evex_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), + __memcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), +@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_avx2_movbe) ++ IFUNC_IMPL_ADD (array, i, wmemcmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (MOVBE)), ++ __wmemcmp_evex_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), + __wmemcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 6c1f3153..3ca1f0a6 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && 
CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURE_USABLE_P (cpu_features, MOVBE) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2_movbe); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ return OPTIMIZE (evex_movbe); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2_movbe); ++ } + + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4_1); +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +new file mode 100644 +index 00000000..9c093972 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -0,0 +1,440 @@ ++/* memcmp/wmemcmp optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++/* memcmp/wmemcmp is implemented as: ++ 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap ++ to avoid branches. ++ 2. Use overlapping compare to avoid branch. ++ 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 ++ bytes for wmemcmp. ++ 4. If size is 8 * VEC_SIZE or less, unroll the loop. ++ 5. Compare 4 * VEC_SIZE at a time with the aligned first memory ++ area. ++ 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. ++ 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. ++ 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ ++ ++# include ++ ++# ifndef MEMCMP ++# define MEMCMP __memcmp_evex_movbe ++# endif ++ ++# define VMOVU vmovdqu64 ++ ++# ifdef USE_AS_WMEMCMP ++# define VPCMPEQ vpcmpeqd ++# else ++# define VPCMPEQ vpcmpeqb ++# endif ++ ++# define XMM1 xmm17 ++# define XMM2 xmm18 ++# define YMM1 ymm17 ++# define YMM2 ymm18 ++# define YMM3 ymm19 ++# define YMM4 ymm20 ++# define YMM5 ymm21 ++# define YMM6 ymm22 ++ ++# define VEC_SIZE 32 ++# ifdef USE_AS_WMEMCMP ++# define VEC_MASK 0xff ++# define XMM_MASK 0xf ++# else ++# define VEC_MASK 0xffffffff ++# define XMM_MASK 0xffff ++# endif ++ ++/* Warning! ++ wmemcmp has to use SIGNED comparison for elements. ++ memcmp has to use UNSIGNED comparison for elemnts. ++*/ ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (MEMCMP) ++# ifdef USE_AS_WMEMCMP ++ shl $2, %RDX_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec) ++ ++ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k1 ++ kmovd %k1, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(last_vec) ++ ++ /* More than 2 * VEC. 
*/ ++ cmpq $(VEC_SIZE * 8), %rdx ++ ja L(more_8x_vec) ++ cmpq $(VEC_SIZE * 4), %rdx ++ jb L(last_4x_vec) ++ ++ /* From 4 * VEC to 8 * VEC, inclusively. */ ++ VMOVU (%rsi), %YMM1 ++ VPCMPEQ (%rdi), %YMM1, %k1 ++ ++ VMOVU VEC_SIZE(%rsi), %YMM2 ++ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 ++ ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 ++ ++ kandd %k1, %k2, %k5 ++ kandd %k3, %k4, %k6 ++ kandd %k5, %k6, %k6 ++ ++ kmovd %k6, %eax ++ cmpl $VEC_MASK, %eax ++ jne L(4x_vec_end) ++ ++ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi ++ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi ++ VMOVU (%rsi), %YMM1 ++ VPCMPEQ (%rdi), %YMM1, %k1 ++ ++ VMOVU VEC_SIZE(%rsi), %YMM2 ++ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 ++ kandd %k1, %k2, %k5 ++ ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 ++ kandd %k3, %k5, %k5 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 ++ kandd %k4, %k5, %k5 ++ ++ kmovd %k5, %eax ++ cmpl $VEC_MASK, %eax ++ jne L(4x_vec_end) ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++L(last_vec): ++ /* Use overlapping loads to avoid branches. */ ++ leaq -VEC_SIZE(%rdi, %rdx), %rdi ++ leaq -VEC_SIZE(%rsi, %rdx), %rsi ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ret ++ ++ .p2align 4 ++L(first_vec): ++ /* A byte or int32 is different within 16 or 32 bytes. */ ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (%rdi, %rcx, 4), %edx ++ cmpl (%rsi, %rcx, 4), %edx ++L(wmemcmp_return): ++ setl %al ++ negl %eax ++ orl $1, %eax ++# else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ ret ++ ++# ifdef USE_AS_WMEMCMP ++ .p2align 4 ++L(4): ++ xorl %eax, %eax ++ movl (%rdi), %edx ++ cmpl (%rsi), %edx ++ jne L(wmemcmp_return) ++ ret ++# else ++ .p2align 4 ++L(between_4_7): ++ /* Load as big endian with overlapping movbe to avoid branches. */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ je L(exit) ++ sbbl %eax, %eax ++ orl $1, %eax ++ ret ++ ++ .p2align 4 ++L(exit): ++ ret ++ ++ .p2align 4 ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movb -1(%rdi, %rdx), %al ++ movb -1(%rsi, %rdx), %cl ++ /* Subtraction is okay because the upper 8 bits are zero. */ ++ subl %ecx, %eax ++ ret ++ ++ .p2align 4 ++L(1): ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax ++ ret ++# endif ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(less_vec): ++# ifdef USE_AS_WMEMCMP ++ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ ++ cmpb $4, %dl ++ je L(4) ++ jb L(zero) ++# else ++ cmpb $1, %dl ++ je L(1) ++ jb L(zero) ++ cmpb $4, %dl ++ jb L(between_2_3) ++ cmpb $8, %dl ++ jb L(between_4_7) ++# endif ++ cmpb $16, %dl ++ jae L(between_16_31) ++ /* It is between 8 and 15 bytes. 
*/ ++ vmovq (%rdi), %XMM1 ++ vmovq (%rsi), %XMM2 ++ VPCMPEQ %XMM1, %XMM2, %k2 ++ kmovw %k2, %eax ++ subl $XMM_MASK, %eax ++ jnz L(first_vec) ++ /* Use overlapping loads to avoid branches. */ ++ leaq -8(%rdi, %rdx), %rdi ++ leaq -8(%rsi, %rdx), %rsi ++ vmovq (%rdi), %XMM1 ++ vmovq (%rsi), %XMM2 ++ VPCMPEQ %XMM1, %XMM2, %k2 ++ kmovw %k2, %eax ++ subl $XMM_MASK, %eax ++ jnz L(first_vec) ++ ret ++ ++ .p2align 4 ++L(between_16_31): ++ /* From 16 to 31 bytes. No branch when size == 16. */ ++ VMOVU (%rsi), %XMM2 ++ VPCMPEQ (%rdi), %XMM2, %k2 ++ kmovw %k2, %eax ++ subl $XMM_MASK, %eax ++ jnz L(first_vec) ++ ++ /* Use overlapping loads to avoid branches. */ ++ leaq -16(%rdi, %rdx), %rdi ++ leaq -16(%rsi, %rdx), %rsi ++ VMOVU (%rsi), %XMM2 ++ VPCMPEQ (%rdi), %XMM2, %k2 ++ kmovw %k2, %eax ++ subl $XMM_MASK, %eax ++ jnz L(first_vec) ++ ret ++ ++ .p2align 4 ++L(more_8x_vec): ++ /* More than 8 * VEC. Check the first VEC. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ /* Align the first memory area for aligned loads in the loop. ++ Compute how much the first memory area is misaligned. */ ++ movq %rdi, %rcx ++ andl $(VEC_SIZE - 1), %ecx ++ /* Get the negative of offset for alignment. */ ++ subq $VEC_SIZE, %rcx ++ /* Adjust the second memory area. */ ++ subq %rcx, %rsi ++ /* Adjust the first memory area which should be aligned now. */ ++ subq %rcx, %rdi ++ /* Adjust length. */ ++ addq %rcx, %rdx ++ ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ VMOVU (%rsi), %YMM1 ++ VPCMPEQ (%rdi), %YMM1, %k1 ++ ++ VMOVU VEC_SIZE(%rsi), %YMM2 ++ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 ++ kandd %k2, %k1, %k5 ++ ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 ++ kandd %k3, %k5, %k5 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 ++ kandd %k4, %k5, %k5 ++ ++ kmovd %k5, %eax ++ cmpl $VEC_MASK, %eax ++ jne L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ addq $(VEC_SIZE * 4), %rsi ++ ++ subq $(VEC_SIZE * 4), %rdx ++ cmpq $(VEC_SIZE * 4), %rdx ++ jae L(loop_4x_vec) ++ ++ /* Less than 4 * VEC. */ ++ cmpq $VEC_SIZE, %rdx ++ jbe L(last_vec) ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(last_2x_vec) ++ ++L(last_4x_vec): ++ /* From 2 * VEC to 4 * VEC. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rsi ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ /* Use overlapping loads to avoid branches. 
*/ ++ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi ++ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rsi ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ kmovd %k1, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec_x1) ++ kmovd %k3, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec_x2) ++ kmovd %k4, %eax ++ subl $VEC_MASK, %eax ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl VEC_SIZE(%rdi, %rcx, 4), %edx ++ cmpl VEC_SIZE(%rsi, %rcx, 4), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ ret ++END (MEMCMP) ++#endif +diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S +new file mode 100644 +index 00000000..4726d74a +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S +@@ -0,0 +1,4 @@ ++#define MEMCMP __wmemcmp_evex_movbe ++#define USE_AS_WMEMCMP 1 ++ ++#include "memcmp-evex-movbe.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-17.patch b/glibc-RHEL-15696-17.patch new file mode 100644 index 0000000..3176514 --- /dev/null +++ b/glibc-RHEL-15696-17.patch @@ -0,0 +1,2568 @@ +From 7ebba91361badf7531d4e75050627a88d424872f Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 07:26:42 -0800 +Subject: [PATCH] x86-64: Add AVX optimized string/memory functions for RTM +Content-type: text/plain; charset=UTF-8 + +Since VZEROUPPER triggers RTM abort while VZEROALL won't, select AVX +optimized string/memory functions with + + xtest + jz 1f + vzeroall + ret +1: + vzeroupper + ret + +at function exit on processors with usable RTM, but without 256-bit EVEX +instructions to avoid VZEROUPPER inside a transactionally executing RTM +region. 
+--- + sysdeps/x86_64/multiarch/Makefile | 27 +++ + sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 170 ++++++++++++++++++ + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 + + sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 ++ + sysdeps/x86_64/multiarch/ifunc-memset.h | 12 ++ + sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 + + sysdeps/x86_64/multiarch/ifunc-wmemset.h | 5 + + sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/memchr-avx2.S | 45 +++-- + .../x86_64/multiarch/memcmp-avx2-movbe-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 28 ++- + .../memmove-avx-unaligned-erms-rtm.S | 17 ++ + .../multiarch/memmove-vec-unaligned-erms.S | 33 ++-- + sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/memrchr-avx2.S | 53 +++--- + .../memset-avx2-unaligned-erms-rtm.S | 10 ++ + .../multiarch/memset-avx2-unaligned-erms.S | 12 +- + .../multiarch/memset-vec-unaligned-erms.S | 41 ++--- + sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/strcat-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strcat-avx2.S | 6 +- + sysdeps/x86_64/multiarch/strchr-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strchr-avx2.S | 22 +-- + sysdeps/x86_64/multiarch/strchr.c | 4 + + sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 55 +++--- + sysdeps/x86_64/multiarch/strcmp.c | 4 + + sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strcpy-avx2.S | 85 ++++----- + sysdeps/x86_64/multiarch/strlen-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strlen-avx2.S | 43 ++--- + sysdeps/x86_64/multiarch/strncat-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/strncmp.c | 4 + + sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strrchr-avx2.S | 19 +- + sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S | 5 + + sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S | 5 + + sysdeps/x86_64/multiarch/wcsnlen.c | 4 + + sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S | 4 + + .../x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S | 4 + + sysdeps/x86_64/sysdep.h | 22 +++ + 52 files changed, 668 insertions(+), 244 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strcat-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S + create mode 100644 
sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strlen-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strncat-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S + +Conflicts: + sysdeps/x86_64/multiarch/strchr-avx2.S + (same fix, different location) + + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 9d79b138..491c7698 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -40,6 +40,25 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memset-sse2-unaligned-erms \ + memset-avx2-unaligned-erms \ + memset-avx512-unaligned-erms \ ++ memchr-avx2-rtm \ ++ memcmp-avx2-movbe-rtm \ ++ memmove-avx-unaligned-erms-rtm \ ++ memrchr-avx2-rtm \ ++ memset-avx2-unaligned-erms-rtm \ ++ rawmemchr-avx2-rtm \ ++ strchr-avx2-rtm \ ++ strcmp-avx2-rtm \ ++ strchrnul-avx2-rtm \ ++ stpcpy-avx2-rtm \ ++ stpncpy-avx2-rtm \ ++ strcat-avx2-rtm \ ++ strcpy-avx2-rtm \ ++ strlen-avx2-rtm \ ++ strncat-avx2-rtm \ ++ strncmp-avx2-rtm \ ++ strncpy-avx2-rtm \ ++ strnlen-avx2-rtm \ ++ strrchr-avx2-rtm \ + memchr-evex \ + memcmp-evex-movbe \ + memmove-evex-unaligned-erms \ +@@ -76,6 +95,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcsrchr-sse2 wcsrchr-avx2 \ + wcsnlen-sse4_1 wcsnlen-c \ + wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \ ++ wcschr-avx2-rtm \ ++ wcscmp-avx2-rtm \ ++ wcslen-avx2-rtm \ ++ wcsncmp-avx2-rtm \ ++ wcsnlen-avx2-rtm \ ++ wcsrchr-avx2-rtm \ ++ wmemchr-avx2-rtm \ ++ wmemcmp-avx2-movbe-rtm \ + wcschr-evex \ + wcscmp-evex \ + wcslen-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h +index 7081b0c9..e0f30e61 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h ++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h +@@ -21,6 +21,7 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -36,6 +37,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index c8da910e..c1efeec0 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -43,6 +43,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memchr, + CPU_FEATURE_USABLE 
(AVX2), + __memchr_avx2) ++ IFUNC_IMPL_ADD (array, i, memchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, memchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -56,6 +60,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_avx2_movbe) ++ IFUNC_IMPL_ADD (array, i, memcmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (MOVBE) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -85,6 +94,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (AVX), + __memmove_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memmove_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memmove_chk_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, __memmove_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memmove_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (AVX512VL), + __memmove_chk_evex_unaligned) +@@ -113,6 +130,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memmove, + CPU_FEATURE_USABLE (AVX), + __memmove_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memmove, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memmove_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, memmove, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memmove_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memmove, + CPU_FEATURE_USABLE (AVX512VL), + __memmove_evex_unaligned) +@@ -143,6 +168,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memrchr, + CPU_FEATURE_USABLE (AVX2), + __memrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, memrchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, memrchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -165,6 +194,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX2), + __memset_chk_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memset_chk, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memset_chk_avx2_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, __memset_chk, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memset_chk_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -198,6 +235,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memset, + CPU_FEATURE_USABLE (AVX2), + __memset_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memset, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memset_avx2_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, memset, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memset_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -222,6 +267,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, 
rawmemchr, + CPU_FEATURE_USABLE (AVX2), + __rawmemchr_avx2) ++ IFUNC_IMPL_ADD (array, i, rawmemchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __rawmemchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -234,6 +283,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strlen, + CPU_FEATURE_USABLE (AVX2), + __strlen_avx2) ++ IFUNC_IMPL_ADD (array, i, strlen, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -245,6 +298,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strnlen, + CPU_FEATURE_USABLE (AVX2), + __strnlen_avx2) ++ IFUNC_IMPL_ADD (array, i, strnlen, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strnlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -257,6 +314,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __stpncpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), + __stpncpy_avx2) ++ IFUNC_IMPL_ADD (array, i, stpncpy, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __stpncpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, stpncpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -271,6 +332,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __stpcpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), + __stpcpy_avx2) ++ IFUNC_IMPL_ADD (array, i, stpcpy, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __stpcpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, stpcpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -309,6 +374,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strcat, + IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2), + __strcat_avx2) ++ IFUNC_IMPL_ADD (array, i, strcat, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcat_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcat, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -323,6 +392,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strchr, + CPU_FEATURE_USABLE (AVX2), + __strchr_avx2) ++ IFUNC_IMPL_ADD (array, i, strchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -336,6 +409,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strchrnul, + CPU_FEATURE_USABLE (AVX2), + __strchrnul_avx2) ++ IFUNC_IMPL_ADD (array, i, strchrnul, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strchrnul_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchrnul, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -348,6 +425,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strrchr, + CPU_FEATURE_USABLE (AVX2), + __strrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, strrchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strrchr, + 
(CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -359,6 +440,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strcmp, + CPU_FEATURE_USABLE (AVX2), + __strcmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strcmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -375,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strcpy, + IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2), + __strcpy_avx2) ++ IFUNC_IMPL_ADD (array, i, strcpy, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -422,6 +511,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strncat, + IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2), + __strncat_avx2) ++ IFUNC_IMPL_ADD (array, i, strncat, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncat_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncat, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -436,6 +529,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strncpy, + IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2), + __strncpy_avx2) ++ IFUNC_IMPL_ADD (array, i, strncpy, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -469,6 +566,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcschr, + CPU_FEATURE_USABLE (AVX2), + __wcschr_avx2) ++ IFUNC_IMPL_ADD (array, i, wcschr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcschr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcschr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -481,6 +582,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsrchr, + CPU_FEATURE_USABLE (AVX2), + __wcsrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsrchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcsrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsrchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -493,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcscmp, + CPU_FEATURE_USABLE (AVX2), + __wcscmp_avx2) ++ IFUNC_IMPL_ADD (array, i, wcscmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcscmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcscmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -505,6 +614,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsncmp, + CPU_FEATURE_USABLE (AVX2), + __wcsncmp_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsncmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcsncmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsncmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -523,6 +636,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcslen, + 
CPU_FEATURE_USABLE (AVX2), + __wcslen_avx2) ++ IFUNC_IMPL_ADD (array, i, wcslen, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcslen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcslen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -535,6 +652,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsnlen, + CPU_FEATURE_USABLE (AVX2), + __wcsnlen_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsnlen, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcsnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsnlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -550,6 +671,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wmemchr, + CPU_FEATURE_USABLE (AVX2), + __wmemchr_avx2) ++ IFUNC_IMPL_ADD (array, i, wmemchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wmemchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -563,6 +688,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_avx2_movbe) ++ IFUNC_IMPL_ADD (array, i, wmemcmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (MOVBE) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wmemcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -581,6 +711,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wmemset, + CPU_FEATURE_USABLE (AVX2), + __wmemset_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, wmemset, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wmemset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, wmemset, + CPU_FEATURE_USABLE (AVX512VL), + __wmemset_evex_unaligned) +@@ -606,6 +740,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (AVX), + __memcpy_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memcpy_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcpy_chk_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, __memcpy_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcpy_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (AVX512VL), + __memcpy_chk_evex_unaligned) +@@ -634,6 +776,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memcpy, + CPU_FEATURE_USABLE (AVX), + __memcpy_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memcpy, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcpy_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, memcpy, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcpy_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memcpy, + CPU_FEATURE_USABLE (AVX512VL), + __memcpy_evex_unaligned) +@@ -676,6 +826,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (AVX), + __mempcpy_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __mempcpy_chk_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ 
__mempcpy_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_chk_evex_unaligned) +@@ -713,6 +871,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, mempcpy, + CPU_FEATURE_USABLE (AVX), + __mempcpy_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, mempcpy, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __mempcpy_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, mempcpy, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __mempcpy_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, mempcpy, + CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_evex_unaligned) +@@ -734,6 +900,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strncmp, + CPU_FEATURE_USABLE (AVX2), + __strncmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strncmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 3ca1f0a6..8043c635 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -23,6 +23,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; + + static inline void * +@@ -38,6 +39,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex_movbe); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_movbe_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2_movbe); + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h +index 6f8bce5f..fa09b9fb 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h +@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) + attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) +@@ -71,6 +75,14 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (evex_unaligned); + } + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx_unaligned_erms_rtm); ++ ++ return OPTIMIZE (avx_unaligned_rtm); ++ } ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 6f31f4dc..6f3375cc 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h 
+@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms) + attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) +@@ -69,6 +73,14 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (evex_unaligned); + } + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx2_unaligned_erms_rtm); ++ ++ return OPTIMIZE (avx2_unaligned_rtm); ++ } ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h +index deae6348..a924762e 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h +@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -39,6 +40,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +index 9290c4bf..bdc94c6c 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +@@ -20,6 +20,8 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; + +@@ -39,6 +41,9 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) + return OPTIMIZE (evex_unaligned); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_unaligned_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2_unaligned); + } +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +new file mode 100644 +index 00000000..87b076c7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef MEMCHR ++# define MEMCHR __memchr_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "memchr-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S +index c81da19b..cf893e77 100644 +--- a/sysdeps/x86_64/multiarch/memchr-avx2.S ++++ 
b/sysdeps/x86_64/multiarch/memchr-avx2.S +@@ -34,9 +34,13 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (MEMCHR) + # ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ +@@ -107,8 +111,8 @@ L(cros_page_boundary): + # endif + addq %rdi, %rax + addq %rcx, %rax +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(aligned_more): +@@ -224,8 +228,7 @@ L(last_4x_vec_or_less): + + jnz L(first_vec_x3_check) + xorl %eax, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_2x_vec): +@@ -243,8 +246,7 @@ L(last_2x_vec): + testl %eax, %eax + jnz L(first_vec_x1_check) + xorl %eax, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x0_check): +@@ -253,8 +255,7 @@ L(first_vec_x0_check): + cmpq %rax, %rdx + jbe L(zero) + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1_check): +@@ -264,8 +265,7 @@ L(first_vec_x1_check): + jbe L(zero) + addq $VEC_SIZE, %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2_check): +@@ -275,8 +275,7 @@ L(first_vec_x2_check): + jbe L(zero) + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x3_check): +@@ -286,12 +285,14 @@ L(first_vec_x3_check): + jbe L(zero) + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(zero): +- VZEROUPPER ++ xorl %eax, %eax ++ jmp L(return_vzeroupper) ++ ++ .p2align 4 + L(null): + xorl %eax, %eax + ret +@@ -301,24 +302,21 @@ L(null): + L(first_vec_x0): + tzcntl %eax, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1): + tzcntl %eax, %eax + addq $VEC_SIZE, %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2): + tzcntl %eax, %eax + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(4x_vec_end): +@@ -337,8 +335,7 @@ L(first_vec_x3): + tzcntl %eax, %eax + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + END (MEMCHR) + #endif +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S +new file mode 100644 +index 00000000..cf4eff5d +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef MEMCMP ++# define MEMCMP __memcmp_avx2_movbe_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "memcmp-avx2-movbe.S" +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index e3a35b89..9d5c9c72 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -47,6 +47,10 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + # define VEC_MASK ((1 << VEC_SIZE) - 1) + +@@ -55,7 +59,7 @@ + memcmp has to use UNSIGNED comparison for elemnts. 
+ */ + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (MEMCMP) + # ifdef USE_AS_WMEMCMP + shl $2, %RDX_LP +@@ -123,8 +127,8 @@ ENTRY (MEMCMP) + vptest %ymm0, %ymm5 + jnc L(4x_vec_end) + xorl %eax, %eax +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(last_2x_vec): +@@ -144,8 +148,7 @@ L(last_vec): + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec): +@@ -164,8 +167,7 @@ L(wmemcmp_return): + movzbl (%rsi, %rcx), %edx + sub %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifdef USE_AS_WMEMCMP + .p2align 4 +@@ -367,8 +369,7 @@ L(last_4x_vec): + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(4x_vec_end): +@@ -394,8 +395,7 @@ L(4x_vec_end): + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx + sub %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1): +@@ -410,8 +410,7 @@ L(first_vec_x1): + movzbl VEC_SIZE(%rsi, %rcx), %edx + sub %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2): +@@ -426,7 +425,6 @@ L(first_vec_x2): + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx + sub %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + END (MEMCMP) + #endif +diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S +new file mode 100644 +index 00000000..1ec1962e +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S +@@ -0,0 +1,17 @@ ++#if IS_IN (libc) ++# define VEC_SIZE 32 ++# define VEC(i) ymm##i ++# define VMOVNT vmovntdq ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa ++ ++# define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++# define VZEROUPPER_RETURN jmp L(return) ++ ++# define SECTION(p) p##.avx.rtm ++# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm ++ ++# include "memmove-vec-unaligned-erms.S" ++#endif +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 08e21692..71f5954d 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -140,11 +140,12 @@ L(last_2x_vec): + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) +- VZEROUPPER + #if !defined USE_MULTIARCH || !IS_IN (libc) + L(nop): +-#endif + ret ++#else ++ VZEROUPPER_RETURN ++#endif + #if defined USE_MULTIARCH && IS_IN (libc) + END (MEMMOVE_SYMBOL (__memmove, unaligned)) + +@@ -237,8 +238,11 @@ L(last_2x_vec): + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + L(return): +- VZEROUPPER ++#if VEC_SIZE > 16 ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++#else + ret ++#endif + + L(movsb): + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx +@@ -289,8 +293,7 @@ L(between_32_63): + VMOVU -32(%rsi,%rdx), %YMM1 + VMOVU %YMM0, (%rdi) + VMOVU %YMM1, -32(%rdi,%rdx) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + #endif + #if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ +@@ -299,7 +302,7 @@ L(between_16_31): + VMOVU -16(%rsi,%rdx), %XMM1 + VMOVU %XMM0, (%rdi) + VMOVU %XMM1, -16(%rdi,%rdx) +- ret ++ VZEROUPPER_RETURN + #endif + L(between_8_15): + /* From 8 to 15. No branch when size == 8. 
*/ +@@ -352,8 +355,7 @@ L(more_2x_vec): + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + L(last_4x_vec): + /* Copy from 2 * VEC to 4 * VEC. */ + VMOVU (%rsi), %VEC(0) +@@ -364,8 +366,7 @@ L(last_4x_vec): + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + L(more_8x_vec): + cmpq %rsi, %rdi +@@ -421,8 +422,7 @@ L(loop_4x_vec_forward): + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + L(more_8x_vec_backward): + /* Load the first 4 * VEC and last VEC to support overlapping +@@ -473,8 +473,7 @@ L(loop_4x_vec_backward): + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + L(large_forward): +@@ -509,8 +508,7 @@ L(loop_large_forward): + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + L(large_backward): + /* Don't use non-temporal store if there is overlap between +@@ -544,8 +542,7 @@ L(loop_large_backward): + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + #endif + END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + +diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +new file mode 100644 +index 00000000..cea2d2a7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef MEMRCHR ++# define MEMRCHR __memrchr_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "memrchr-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S +index ce488dd9..20efe7ac 100644 +--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S +@@ -20,14 +20,22 @@ + + # include + ++# ifndef MEMRCHR ++# define MEMRCHR __memrchr_avx2 ++# endif ++ + # ifndef VZEROUPPER + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits +-ENTRY (__memrchr_avx2) ++ .section SECTION(.text),"ax",@progbits ++ENTRY (MEMRCHR) + /* Broadcast CHAR to YMM0. 
*/ + vmovd %esi, %xmm0 + vpbroadcastb %xmm0, %ymm0 +@@ -134,8 +142,8 @@ L(loop_4x_vec): + vpmovmskb %ymm1, %eax + bsrl %eax, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(last_4x_vec_or_less): +@@ -169,8 +177,7 @@ L(last_4x_vec_or_less): + addq %rax, %rdx + jl L(zero) + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_2x_vec): +@@ -191,31 +198,27 @@ L(last_2x_vec): + jl L(zero) + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x0): + bsrl %eax, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x1): + bsrl %eax, %eax + addl $VEC_SIZE, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x2): + bsrl %eax, %eax + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x3): +@@ -232,8 +235,7 @@ L(last_vec_x1_check): + jl L(zero) + addl $VEC_SIZE, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x3_check): +@@ -243,12 +245,14 @@ L(last_vec_x3_check): + jl L(zero) + addl $(VEC_SIZE * 3), %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(zero): +- VZEROUPPER ++ xorl %eax, %eax ++ VZEROUPPER_RETURN ++ ++ .p2align 4 + L(null): + xorl %eax, %eax + ret +@@ -273,8 +277,7 @@ L(last_vec_or_less_aligned): + + bsrl %eax, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_or_less): +@@ -315,8 +318,7 @@ L(last_vec_or_less): + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_2x_aligned): +@@ -353,7 +355,6 @@ L(last_vec_2x_aligned): + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax +- VZEROUPPER +- ret +-END (__memrchr_avx2) ++ VZEROUPPER_RETURN ++END (MEMRCHR) + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +new file mode 100644 +index 00000000..8ac3e479 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +@@ -0,0 +1,10 @@ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return) ++ ++#define SECTION(p) p##.avx.rtm ++#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm ++#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm ++ ++#include "memset-avx2-unaligned-erms.S" +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index 7ab3d898..ae0860f3 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -14,9 +14,15 @@ + movq r, %rax; \ + vpbroadcastd %xmm0, %ymm0 + +-# define SECTION(p) p##.avx +-# define MEMSET_SYMBOL(p,s) p##_avx2_##s +-# define WMEMSET_SYMBOL(p,s) p##_avx2_##s ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++# ifndef MEMSET_SYMBOL ++# define MEMSET_SYMBOL(p,s) p##_avx2_##s ++# endif ++# ifndef WMEMSET_SYMBOL ++# define WMEMSET_SYMBOL(p,s) p##_avx2_##s ++# endif + + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 71e91a8f..bae5cba4 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ 
b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -45,17 +45,14 @@ + #ifndef VZEROUPPER + # if VEC_SIZE > 16 + # define VZEROUPPER vzeroupper ++# define VZEROUPPER_SHORT_RETURN vzeroupper; ret + # else + # define VZEROUPPER + # endif + #endif + + #ifndef VZEROUPPER_SHORT_RETURN +-# if VEC_SIZE > 16 +-# define VZEROUPPER_SHORT_RETURN vzeroupper +-# else +-# define VZEROUPPER_SHORT_RETURN rep +-# endif ++# define VZEROUPPER_SHORT_RETURN rep; ret + #endif + + #ifndef MOVQ +@@ -117,8 +114,7 @@ L(entry_from_bzero): + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + #if defined USE_MULTIARCH && IS_IN (libc) + END (MEMSET_SYMBOL (__memset, unaligned)) + +@@ -141,14 +137,12 @@ ENTRY (__memset_erms) + ENTRY (MEMSET_SYMBOL (__memset, erms)) + # endif + L(stosb): +- /* Issue vzeroupper before rep stosb. */ +- VZEROUPPER + mov %RDX_LP, %RCX_LP + movzbl %sil, %eax + mov %RDI_LP, %RDX_LP + rep stosb + mov %RDX_LP, %RAX_LP +- ret ++ VZEROUPPER_RETURN + # if VEC_SIZE == 16 + END (__memset_erms) + # else +@@ -175,8 +169,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + L(stosb_more_2x_vec): + cmp __x86_rep_stosb_threshold(%rip), %RDX_LP +@@ -190,8 +183,11 @@ L(more_2x_vec): + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) + L(return): +- VZEROUPPER ++#if VEC_SIZE > 16 ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++#else + ret ++#endif + + L(loop_start): + leaq (VEC_SIZE * 4)(%rdi), %rcx +@@ -217,7 +213,6 @@ L(loop): + cmpq %rcx, %rdx + jne L(loop) + VZEROUPPER_SHORT_RETURN +- ret + L(less_vec): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +@@ -241,40 +236,34 @@ L(less_vec): + jb 1f + movb %cl, (%rdi) + 1: +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # if VEC_SIZE > 32 + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): + VMOVU %YMM0, -32(%rdi,%rdx) + VMOVU %YMM0, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # endif + # if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ + L(between_16_31): + VMOVU %XMM0, -16(%rdi,%rdx) + VMOVU %XMM0, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # endif + /* From 8 to 15. No branch when size == 8. */ + L(between_8_15): + movq %rcx, -8(%rdi,%rdx) + movq %rcx, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ + movl %ecx, -4(%rdi,%rdx) + movl %ecx, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + L(between_2_3): + /* From 2 to 3. No branch when size == 2. 
*/ + movw %cx, -2(%rdi,%rdx) + movw %cx, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S +new file mode 100644 +index 00000000..acc5f6e2 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define MEMCHR __rawmemchr_avx2_rtm ++#define USE_AS_RAWMEMCHR 1 ++ ++#include "memchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S +new file mode 100644 +index 00000000..2b9c07a5 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STPCPY ++#define STRCPY __stpcpy_avx2_rtm ++#include "strcpy-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S +new file mode 100644 +index 00000000..60a2ccfe +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define USE_AS_STPCPY ++#define USE_AS_STRNCPY ++#define STRCPY __stpncpy_avx2_rtm ++#include "strcpy-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S +new file mode 100644 +index 00000000..637fb557 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRCAT ++# define STRCAT __strcat_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcat-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S +index b0623564..aa48c058 100644 +--- a/sysdeps/x86_64/multiarch/strcat-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcat-avx2.S +@@ -30,7 +30,11 @@ + /* Number of bytes in a vector register */ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRCAT) + mov %rdi, %r9 + # ifdef USE_AS_STRNCAT +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S +new file mode 100644 +index 00000000..81f20d1d +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRCHR ++# define STRCHR __strchr_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strchr-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index 47bc3c99..da7d2620 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -38,9 +38,13 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRCHR) + movl %edi, %ecx + /* Broadcast CHAR to YMM0. 
*/ +@@ -93,8 +97,8 @@ L(cros_page_boundary): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(aligned_more): +@@ -190,8 +194,7 @@ L(first_vec_x0): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1): +@@ -205,8 +208,7 @@ L(first_vec_x1): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2): +@@ -220,8 +222,7 @@ L(first_vec_x2): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(4x_vec_end): +@@ -247,8 +248,7 @@ L(first_vec_x3): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + END (STRCHR) + #endif +diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c +index be05e197..7e582f02 100644 +--- a/sysdeps/x86_64/multiarch/strchr.c ++++ b/sysdeps/x86_64/multiarch/strchr.c +@@ -29,6 +29,7 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S +new file mode 100644 +index 00000000..cdcf818b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define STRCHR __strchrnul_avx2_rtm ++#define USE_AS_STRCHRNUL 1 ++#include "strchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S +new file mode 100644 +index 00000000..aecd30d9 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRCMP ++# define STRCMP __strcmp_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcmp-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 8fb8eedc..5d1c9d90 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -55,6 +55,10 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. + strcmp/strncmp have to use UNSIGNED comparison for elements. +@@ -75,7 +79,7 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRCMP) + # ifdef USE_AS_STRNCMP + /* Check for simple cases (0 or 1) in offset. 
*/ +@@ -137,8 +141,8 @@ L(return): + movzbl (%rsi, %rdx), %edx + subl %edx, %eax + # endif +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(return_vec_size): +@@ -171,8 +175,7 @@ L(return_vec_size): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(return_2_vec_size): +@@ -205,8 +208,7 @@ L(return_2_vec_size): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(return_3_vec_size): +@@ -239,8 +241,7 @@ L(return_3_vec_size): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(next_3_vectors): +@@ -366,8 +367,7 @@ L(back_to_loop): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(test_vec): +@@ -410,8 +410,7 @@ L(test_vec): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(test_2_vec): +@@ -454,8 +453,7 @@ L(test_2_vec): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(test_3_vec): +@@ -496,8 +494,7 @@ L(test_3_vec): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(loop_cross_page): +@@ -566,8 +563,7 @@ L(loop_cross_page): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(loop_cross_page_2_vec): +@@ -641,8 +637,7 @@ L(loop_cross_page_2_vec): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNCMP + L(string_nbyte_offset_check): +@@ -684,8 +679,7 @@ L(cross_page_loop): + # ifndef USE_AS_WCSCMP + L(different): + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifdef USE_AS_WCSCMP + .p2align 4 +@@ -695,16 +689,14 @@ L(different): + setl %al + negl %eax + orl $1, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # endif + + # ifdef USE_AS_STRNCMP + .p2align 4 + L(zero): + xorl %eax, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(char0): +@@ -718,8 +710,7 @@ L(char0): + movzbl (%rdi), %eax + subl %ecx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # endif + + .p2align 4 +@@ -744,8 +735,7 @@ L(last_vector): + movzbl (%rsi, %rdx), %edx + subl %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + /* Comparing on page boundary region requires special treatment: + It must done one vector at the time, starting with the wider +@@ -866,7 +856,6 @@ L(cross_page_4bytes): + testl %eax, %eax + jne L(cross_page_loop) + subl %ecx, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + END (STRCMP) + #endif +diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c +index c5f38510..11bbea2b 100644 +--- a/sysdeps/x86_64/multiarch/strcmp.c ++++ b/sysdeps/x86_64/multiarch/strcmp.c +@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if 
(!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S +new file mode 100644 +index 00000000..c2c581ec +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRCPY ++# define STRCPY __strcpy_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcpy-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S +index 81677f90..613c59aa 100644 +--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S +@@ -37,6 +37,10 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + /* zero register */ + #define xmmZ xmm0 + #define ymmZ ymm0 +@@ -46,7 +50,7 @@ + + # ifndef USE_AS_STRCAT + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRCPY) + # ifdef USE_AS_STRNCPY + mov %rdx, %r8 +@@ -369,8 +373,8 @@ L(CopyVecSizeExit): + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(CopyTwoVecSize1): +@@ -553,8 +557,7 @@ L(Exit1): + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit2): +@@ -569,8 +572,7 @@ L(Exit2): + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit3): +@@ -584,8 +586,7 @@ L(Exit3): + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit4_7): +@@ -602,8 +603,7 @@ L(Exit4_7): + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit8_15): +@@ -620,8 +620,7 @@ L(Exit8_15): + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit16_31): +@@ -638,8 +637,7 @@ L(Exit16_31): + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit32_63): +@@ -656,8 +654,7 @@ L(Exit32_63): + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNCPY + +@@ -671,8 +668,7 @@ L(StrncpyExit1): + # ifdef USE_AS_STRCAT + movb $0, 1(%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit2): +@@ -684,8 +680,7 @@ L(StrncpyExit2): + # ifdef USE_AS_STRCAT + movb $0, 2(%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit3_4): +@@ -699,8 +694,7 @@ L(StrncpyExit3_4): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit5_8): +@@ -714,8 +708,7 @@ L(StrncpyExit5_8): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit9_16): +@@ -729,8 +722,7 @@ L(StrncpyExit9_16): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit17_32): +@@ -744,8 +736,7 @@ L(StrncpyExit17_32): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, 
%r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit33_64): +@@ -760,8 +751,7 @@ L(StrncpyExit33_64): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit65): +@@ -778,50 +768,43 @@ L(StrncpyExit65): + # ifdef USE_AS_STRCAT + movb $0, 65(%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifndef USE_AS_STRCAT + + .p2align 4 + L(Fill1): + mov %dl, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill2): + mov %dx, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill3_4): + mov %dx, (%rdi) + mov %dx, -2(%rdi, %r8) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill5_8): + mov %edx, (%rdi) + mov %edx, -4(%rdi, %r8) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill9_16): + mov %rdx, (%rdi) + mov %rdx, -8(%rdi, %r8) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill17_32): + vmovdqu %xmmZ, (%rdi) + vmovdqu %xmmZ, -16(%rdi, %r8) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(CopyVecSizeUnalignedVec2): +@@ -898,8 +881,7 @@ L(Fill): + cmp $1, %r8d + ja L(Fill2) + je L(Fill1) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + /* end of ifndef USE_AS_STRCAT */ + # endif +@@ -929,8 +911,7 @@ L(UnalignedFourVecSizeLeaveCase3): + # ifdef USE_AS_STRCAT + movb $0, (VEC_SIZE * 4)(%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(UnalignedFourVecSizeLeaveCase2): +@@ -1001,16 +982,14 @@ L(StrncpyExit): + # ifdef USE_AS_STRCAT + movb $0, (%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(ExitZero): + # ifndef USE_AS_STRCAT + mov %rdi, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # endif + +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S +new file mode 100644 +index 00000000..75b4b761 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRLEN ++# define STRLEN __strlen_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strlen-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S +index 645e0446..82826e10 100644 +--- a/sysdeps/x86_64/multiarch/strlen-avx2.S ++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S +@@ -36,9 +36,13 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN + /* Check for zero length. 
*/ +@@ -111,8 +115,8 @@ L(cros_page_boundary): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(aligned_more): +@@ -231,8 +235,7 @@ L(last_4x_vec_or_less): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_2x_vec): +@@ -253,8 +256,7 @@ L(last_2x_vec): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x0_check): +@@ -267,8 +269,7 @@ L(first_vec_x0_check): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1_check): +@@ -282,8 +283,7 @@ L(first_vec_x1_check): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2_check): +@@ -297,8 +297,7 @@ L(first_vec_x2_check): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x3_check): +@@ -312,8 +311,7 @@ L(first_vec_x3_check): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(max): +@@ -321,8 +319,7 @@ L(max): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(zero): +@@ -338,8 +335,7 @@ L(first_vec_x0): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1): +@@ -350,8 +346,7 @@ L(first_vec_x1): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2): +@@ -362,8 +357,7 @@ L(first_vec_x2): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(4x_vec_end): +@@ -389,8 +383,7 @@ L(first_vec_x3): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + END (STRLEN) + #endif +diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S +new file mode 100644 +index 00000000..0dcea18d +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STRNCAT ++#define STRCAT __strncat_avx2_rtm ++#include "strcat-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +new file mode 100644 +index 00000000..37d1224b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define STRCMP __strncmp_avx2_rtm ++#define USE_AS_STRNCMP 1 ++#include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c +index 4c15542f..44c85116 100644 +--- a/sysdeps/x86_64/multiarch/strncmp.c ++++ b/sysdeps/x86_64/multiarch/strncmp.c +@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE 
(avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S +new file mode 100644 +index 00000000..79e70832 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STRNCPY ++#define STRCPY __strncpy_avx2_rtm ++#include "strcpy-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S +new file mode 100644 +index 00000000..04f1626a +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define STRLEN __strnlen_avx2_rtm ++#define USE_AS_STRNLEN 1 ++ ++#include "strlen-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S +new file mode 100644 +index 00000000..5def14ec +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRRCHR ++# define STRRCHR __strrchr_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strrchr-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S +index 4381e6ab..9f22a15e 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S +@@ -36,9 +36,13 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRRCHR) + movd %esi, %xmm4 + movl %edi, %ecx +@@ -166,8 +170,8 @@ L(return_value): + # endif + bsrl %eax, %eax + leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(match): +@@ -198,8 +202,7 @@ L(find_nul): + jz L(return_value) + bsrl %eax, %eax + leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(char_and_nul): +@@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec): + jz L(return_null) + bsrl %eax, %eax + leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(return_null): + xorl %eax, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + END (STRRCHR) + #endif +diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S +new file mode 100644 +index 00000000..d49dbbf0 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define STRCHR __wcschr_avx2_rtm ++#define USE_AS_WCSCHR 1 ++#include "strchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S +new file mode 100644 +index 00000000..d6ca2b80 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define STRCMP __wcscmp_avx2_rtm ++#define USE_AS_WCSCMP 1 ++ ++#include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S +new file mode 100644 +index 00000000..35658d73 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define STRLEN __wcslen_avx2_rtm ++#define USE_AS_WCSLEN 1 ++ ++#include "strlen-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S 
+new file mode 100644 +index 00000000..4e88c70c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S +@@ -0,0 +1,5 @@ ++#define STRCMP __wcsncmp_avx2_rtm ++#define USE_AS_STRNCMP 1 ++#define USE_AS_WCSCMP 1 ++ ++#include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S +new file mode 100644 +index 00000000..7437ebee +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S +@@ -0,0 +1,5 @@ ++#define STRLEN __wcsnlen_avx2_rtm ++#define USE_AS_WCSLEN 1 ++#define USE_AS_STRNLEN 1 ++ ++#include "strlen-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c +index 84254b83..20b731ae 100644 +--- a/sysdeps/x86_64/multiarch/wcsnlen.c ++++ b/sysdeps/x86_64/multiarch/wcsnlen.c +@@ -29,6 +29,7 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S +new file mode 100644 +index 00000000..9bf76083 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define STRRCHR __wcsrchr_avx2_rtm ++#define USE_AS_WCSRCHR 1 ++#include "strrchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S +new file mode 100644 +index 00000000..58ed21db +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define MEMCHR __wmemchr_avx2_rtm ++#define USE_AS_WMEMCHR 1 ++ ++#include "memchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S +new file mode 100644 +index 00000000..31104d12 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S +@@ -0,0 +1,4 @@ ++#define MEMCMP __wmemcmp_avx2_movbe_rtm ++#define USE_AS_WMEMCMP 1 ++ ++#include "memcmp-avx2-movbe-rtm.S" +diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h +index 1738d7f9..223f1a59 100644 +--- a/sysdeps/x86_64/sysdep.h ++++ b/sysdeps/x86_64/sysdep.h +@@ -95,6 +95,28 @@ lose: \ + #define R14_LP r14 + #define R15_LP r15 + ++/* Zero upper vector registers and return with xtest. NB: Use VZEROALL ++ to avoid RTM abort triggered by VZEROUPPER inside transactionally. */ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \ ++ xtest; \ ++ jz 1f; \ ++ vzeroall; \ ++ ret; \ ++1: \ ++ vzeroupper; \ ++ ret ++ ++/* Zero upper vector registers and return. */ ++#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN ++# define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ VZEROUPPER; \ ++ ret ++#endif ++ ++#ifndef VZEROUPPER_RETURN ++# define VZEROUPPER_RETURN VZEROUPPER; ret ++#endif ++ + #else /* __ASSEMBLER__ */ + + /* Long and pointer size in bytes. 
*/ +-- +GitLab + diff --git a/glibc-RHEL-15696-18.patch b/glibc-RHEL-15696-18.patch new file mode 100644 index 0000000..2cf0e45 --- /dev/null +++ b/glibc-RHEL-15696-18.patch @@ -0,0 +1,735 @@ +From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Tue, 23 Feb 2021 06:33:10 -0800 +Subject: [PATCH] x86: Add string/memory function tests in RTM region +Content-type: text/plain; charset=UTF-8 + +At function exit, AVX optimized string/memory functions have VZEROUPPER +which triggers RTM abort. When such functions are called inside a +transactionally executing RTM region, RTM abort causes severe performance +degradation. Add tests to verify that string/memory functions won't +cause RTM abort in RTM region. +--- + sysdeps/x86/Makefile | 23 +++++++++++ + sysdeps/x86/tst-memchr-rtm.c | 54 ++++++++++++++++++++++++++ + sysdeps/x86/tst-memcmp-rtm.c | 52 +++++++++++++++++++++++++ + sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++ + sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++ + sysdeps/x86/tst-memset-rtm.c | 45 ++++++++++++++++++++++ + sysdeps/x86/tst-strchr-rtm.c | 54 ++++++++++++++++++++++++++ + sysdeps/x86/tst-strcpy-rtm.c | 53 ++++++++++++++++++++++++++ + sysdeps/x86/tst-string-rtm.h | 72 +++++++++++++++++++++++++++++++++++ + sysdeps/x86/tst-strlen-rtm.c | 53 ++++++++++++++++++++++++++ + sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++ + sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++ + 12 files changed, 618 insertions(+) + create mode 100644 sysdeps/x86/tst-memchr-rtm.c + create mode 100644 sysdeps/x86/tst-memcmp-rtm.c + create mode 100644 sysdeps/x86/tst-memmove-rtm.c + create mode 100644 sysdeps/x86/tst-memrchr-rtm.c + create mode 100644 sysdeps/x86/tst-memset-rtm.c + create mode 100644 sysdeps/x86/tst-strchr-rtm.c + create mode 100644 sysdeps/x86/tst-strcpy-rtm.c + create mode 100644 sysdeps/x86/tst-string-rtm.h + create mode 100644 sysdeps/x86/tst-strlen-rtm.c + create mode 100644 sysdeps/x86/tst-strncmp-rtm.c + create mode 100644 sysdeps/x86/tst-strrchr-rtm.c + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 59e928e9..5be71ada 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -17,6 +17,29 @@ endif + + ifeq ($(subdir),string) + sysdep_routines += cacheinfo ++ ++tests += \ ++ tst-memchr-rtm \ ++ tst-memcmp-rtm \ ++ tst-memmove-rtm \ ++ tst-memrchr-rtm \ ++ tst-memset-rtm \ ++ tst-strchr-rtm \ ++ tst-strcpy-rtm \ ++ tst-strlen-rtm \ ++ tst-strncmp-rtm \ ++ tst-strrchr-rtm ++ ++CFLAGS-tst-memchr-rtm.c += -mrtm ++CFLAGS-tst-memcmp-rtm.c += -mrtm ++CFLAGS-tst-memmove-rtm.c += -mrtm ++CFLAGS-tst-memrchr-rtm.c += -mrtm ++CFLAGS-tst-memset-rtm.c += -mrtm ++CFLAGS-tst-strchr-rtm.c += -mrtm ++CFLAGS-tst-strcpy-rtm.c += -mrtm ++CFLAGS-tst-strlen-rtm.c += -mrtm ++CFLAGS-tst-strncmp-rtm.c += -mrtm ++CFLAGS-tst-strrchr-rtm.c += -mrtm + endif + + ifneq ($(enable-cet),no) +diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c +new file mode 100644 +index 00000000..e4749401 +--- /dev/null ++++ b/sysdeps/x86/tst-memchr-rtm.c +@@ -0,0 +1,54 @@ ++/* Test case for memchr inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ string1[100] = 'c'; ++ string1[STRING_SIZE - 100] = 'c'; ++ char *p = memchr (string1, 'c', STRING_SIZE); ++ if (p == &string1[100]) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ char *p = memchr (string1, 'c', STRING_SIZE); ++ if (p == &string1[100]) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memchr", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c +new file mode 100644 +index 00000000..e4c8a623 +--- /dev/null ++++ b/sysdeps/x86/tst-memcmp-rtm.c +@@ -0,0 +1,52 @@ ++/* Test case for memcmp inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++char string2[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ memset (string2, 'a', STRING_SIZE); ++ if (memcmp (string1, string2, STRING_SIZE) == 0) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ if (memcmp (string1, string2, STRING_SIZE) == 0) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memcmp", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c +new file mode 100644 +index 00000000..4bf97ef1 +--- /dev/null ++++ b/sysdeps/x86/tst-memmove-rtm.c +@@ -0,0 +1,53 @@ ++/* Test case for memmove inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++char string2[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ if (memmove (string2, string1, STRING_SIZE) == string2 ++ && memcmp (string2, string1, STRING_SIZE) == 0) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ if (memmove (string2, string1, STRING_SIZE) == string2 ++ && memcmp (string2, string1, STRING_SIZE) == 0) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memmove", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c +new file mode 100644 +index 00000000..a57a5a8e +--- /dev/null ++++ b/sysdeps/x86/tst-memrchr-rtm.c +@@ -0,0 +1,54 @@ ++/* Test case for memrchr inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ string1[100] = 'c'; ++ string1[STRING_SIZE - 100] = 'c'; ++ char *p = memrchr (string1, 'c', STRING_SIZE); ++ if (p == &string1[STRING_SIZE - 100]) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ char *p = memrchr (string1, 'c', STRING_SIZE); ++ if (p == &string1[STRING_SIZE - 100]) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memrchr", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c +new file mode 100644 +index 00000000..bf343a4d +--- /dev/null ++++ b/sysdeps/x86/tst-memset-rtm.c +@@ -0,0 +1,45 @@ ++/* Test case for memset inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ return EXIT_SUCCESS; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ return 0; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memset", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c +new file mode 100644 +index 00000000..a82e29c0 +--- /dev/null ++++ b/sysdeps/x86/tst-strchr-rtm.c +@@ -0,0 +1,54 @@ ++/* Test case for strchr inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ string1[100] = 'c'; ++ string1[STRING_SIZE - 100] = 'c'; ++ char *p = strchr (string1, 'c'); ++ if (p == &string1[100]) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ char *p = strchr (string1, 'c'); ++ if (p == &string1[100]) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strchr", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c +new file mode 100644 +index 00000000..2b2a583f +--- /dev/null ++++ b/sysdeps/x86/tst-strcpy-rtm.c +@@ -0,0 +1,53 @@ ++/* Test case for strcpy inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++char string2[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ if (strcpy (string2, string1) == string2 ++ && strcmp (string2, string1) == 0) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ if (strcpy (string2, string1) == string2 ++ && strcmp (string2, string1) == 0) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strcpy", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h +new file mode 100644 +index 00000000..d2470afa +--- /dev/null ++++ b/sysdeps/x86/tst-string-rtm.h +@@ -0,0 +1,72 @@ ++/* Test string function in a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++static int ++do_test_1 (const char *name, unsigned int loop, int (*prepare) (void), ++ int (*function) (void)) ++{ ++ if (!CPU_FEATURE_USABLE (RTM)) ++ return EXIT_UNSUPPORTED; ++ ++ int status = prepare (); ++ if (status != EXIT_SUCCESS) ++ return status; ++ ++ unsigned int i; ++ unsigned int naborts = 0; ++ unsigned int failed = 0; ++ for (i = 0; i < loop; i++) ++ { ++ failed |= function (); ++ if (_xbegin() == _XBEGIN_STARTED) ++ { ++ failed |= function (); ++ _xend(); ++ } ++ else ++ { ++ failed |= function (); ++ ++naborts; ++ } ++ } ++ ++ if (failed) ++ FAIL_EXIT1 ("%s() failed", name); ++ ++ if (naborts) ++ { ++ /* NB: Low single digit (<= 5%) noise-level aborts are normal for ++ TSX. */ ++ double rate = 100 * ((double) naborts) / ((double) loop); ++ if (rate > 5) ++ FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)", ++ rate, naborts, loop); ++ } ++ ++ return EXIT_SUCCESS; ++} ++ ++static int do_test (void); ++ ++#include +diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c +new file mode 100644 +index 00000000..0dcf14db +--- /dev/null ++++ b/sysdeps/x86/tst-strlen-rtm.c +@@ -0,0 +1,53 @@ ++/* Test case for strlen inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ string1[STRING_SIZE - 100] = '\0'; ++ size_t len = strlen (string1); ++ if (len == STRING_SIZE - 100) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ size_t len = strlen (string1); ++ if (len == STRING_SIZE - 100) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strlen", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +new file mode 100644 +index 00000000..236ad951 +--- /dev/null ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -0,0 +1,52 @@ ++/* Test case for strncmp inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++char string2[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ memset (string2, 'a', STRING_SIZE - 1); ++ if (strncmp (string1, string2, STRING_SIZE) == 0) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ if (strncmp (string1, string2, STRING_SIZE) == 0) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strncmp", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c +new file mode 100644 +index 00000000..e32bfaf5 +--- /dev/null ++++ b/sysdeps/x86/tst-strrchr-rtm.c +@@ -0,0 +1,53 @@ ++/* Test case for strrchr inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ string1[STRING_SIZE - 100] = 'c'; ++ char *p = strrchr (string1, 'c'); ++ if (p == &string1[STRING_SIZE - 100]) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ char *p = strrchr (string1, 'c'); ++ if (p == &string1[STRING_SIZE - 100]) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strrchr", LOOP, prepare, function); ++} +-- +GitLab + diff --git a/glibc-RHEL-15696-19.patch b/glibc-RHEL-15696-19.patch new file mode 100644 index 0000000..0500875 --- /dev/null +++ b/glibc-RHEL-15696-19.patch @@ -0,0 +1,148 @@ +From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sun, 7 Mar 2021 09:44:18 -0800 +Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized +with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort +with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at +function exit. 
+--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 14 +++++++++----- + sysdeps/x86_64/multiarch/ifunc-memset.h | 13 ++++++++----- + sysdeps/x86_64/multiarch/ifunc-wmemset.h | 12 ++++++------ + .../multiarch/memset-avx512-unaligned-erms.S | 16 ++++++++-------- + 4 files changed, 31 insertions(+), 24 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index c1efeec0..d969a156 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW)), + __memset_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), + __memset_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), + __memset_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX512F), +@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW)), + __memset_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), + __memset_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), + __memset_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + CPU_FEATURE_USABLE (AVX512F), +@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512VL), + __wmemset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __wmemset_avx512_unaligned)) + + #ifdef SHARED +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 6f3375cc..19795938 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h +@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + { +- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) +- return OPTIMIZE (avx512_no_vzeroupper); ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx512_unaligned_erms); + +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE (avx512_unaligned_erms); ++ return OPTIMIZE (avx512_unaligned); ++ } + +- return OPTIMIZE (avx512_unaligned); ++ return OPTIMIZE (avx512_no_vzeroupper); + } + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +index bdc94c6c..98c5d406 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) +- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) +- && !CPU_FEATURES_ARCH_P 
(cpu_features, Prefer_No_VZEROUPPER)) +- return OPTIMIZE (avx512_unaligned); +- + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) +- return OPTIMIZE (evex_unaligned); ++ { ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) ++ return OPTIMIZE (avx512_unaligned); ++ ++ return OPTIMIZE (evex_unaligned); ++ } + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_unaligned_rtm); +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 0783979c..22e7b187 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -1,22 +1,22 @@ + #if IS_IN (libc) + # define VEC_SIZE 64 +-# define VEC(i) zmm##i ++# define XMM0 xmm16 ++# define YMM0 ymm16 ++# define VEC0 zmm16 ++# define VEC(i) VEC##i + # define VMOVU vmovdqu64 + # define VMOVA vmovdqa64 ++# define VZEROUPPER + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- vmovd d, %xmm0; \ + movq r, %rax; \ +- vpbroadcastb %xmm0, %xmm0; \ +- vpbroadcastq %xmm0, %zmm0 ++ vpbroadcastb d, %VEC0 + + # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- vmovd d, %xmm0; \ + movq r, %rax; \ +- vpbroadcastd %xmm0, %xmm0; \ +- vpbroadcastq %xmm0, %zmm0 ++ vpbroadcastd d, %VEC0 + +-# define SECTION(p) p##.avx512 ++# define SECTION(p) p##.evex512 + # define MEMSET_SYMBOL(p,s) p##_avx512_##s + # define WMEMSET_SYMBOL(p,s) p##_avx512_##s + +-- +GitLab + diff --git a/glibc-RHEL-15696-2.patch b/glibc-RHEL-15696-2.patch new file mode 100644 index 0000000..54f3ac3 --- /dev/null +++ b/glibc-RHEL-15696-2.patch @@ -0,0 +1,230 @@ +From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:25:56 -0800 +Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes memcmp/wmemcmp for x32. Tested on x86-64 and x32. On +x86-64, libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for + length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise. + * sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and + tst-size_t-wmemcmp. + * sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file. + * sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise. 
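To make the failure mode concrete before the hunks: on x32 a size_t argument travels in the low 32 bits of a 64-bit register, and the upper 32 bits may be non-zero, so assembly that consumes the full register can see an enormous bogus length. The self-contained demonstration below uses made-up values to show what the added "movl %edx, %edx" (and the %RDX_LP usage) accomplishes, namely truncation to the low 32 bits; it is an illustration, not part of the patch.

    /* Illustration of the x32 length bug: the upper half of the length
       register is not guaranteed to be zero.  The value below is made
       up for demonstration.  */
    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
      uint64_t rdx = 0xdeadbeef00000010ULL;  /* caller passed length 16 */
      uint64_t before_fix = rdx;             /* full register: huge length */
      uint64_t after_fix = (uint32_t) rdx;   /* effect of "movl %edx, %edx" */

      printf ("length seen before the fix: %llu\n",
              (unsigned long long) before_fix);
      printf ("length seen after the fix:  %llu\n",
              (unsigned long long) after_fix);
      return 0;
    }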
+--- + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 7 +- + sysdeps/x86_64/multiarch/memcmp-sse4.S | 9 ++- + sysdeps/x86_64/multiarch/memcmp-ssse3.S | 7 +- + sysdeps/x86_64/x32/Makefile | 4 +- + sysdeps/x86_64/x32/tst-size_t-memcmp.c | 76 ++++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wmemcmp.c | 20 ++++++ + 6 files changed, 114 insertions(+), 9 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index 30f764c3..e3a35b89 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -58,9 +58,12 @@ + .section .text.avx,"ax",@progbits + ENTRY (MEMCMP) + # ifdef USE_AS_WMEMCMP +- shl $2, %rdx ++ shl $2, %RDX_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx + # endif +- cmpq $VEC_SIZE, %rdx ++ cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ +diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S +index 8e164f2c..302900f5 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S ++++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S +@@ -42,13 +42,16 @@ + .section .text.sse4.1,"ax",@progbits + ENTRY (MEMCMP) + # ifdef USE_AS_WMEMCMP +- shl $2, %rdx ++ shl $2, %RDX_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx + # endif + pxor %xmm0, %xmm0 +- cmp $79, %rdx ++ cmp $79, %RDX_LP + ja L(79bytesormore) + # ifndef USE_AS_WMEMCMP +- cmp $1, %rdx ++ cmp $1, %RDX_LP + je L(firstbyte) + # endif + add %rdx, %rsi +diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S +index 6f76c641..69d030fc 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S ++++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S +@@ -33,9 +33,12 @@ + atom_text_section + ENTRY (MEMCMP) + # ifdef USE_AS_WMEMCMP +- shl $2, %rdx +- test %rdx, %rdx ++ shl $2, %RDX_LP ++ test %RDX_LP, %RDX_LP + jz L(equal) ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx + # endif + mov %rdx, %rcx + mov %rdi, %rdx +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 7d528889..ddec7f04 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround + endif + + ifeq ($(subdir),string) +-tests += tst-size_t-memchr ++tests += tst-size_t-memchr tst-size_t-memcmp + endif + + ifeq ($(subdir),wcsmbs) +-tests += tst-size_t-wmemchr ++tests += tst-size_t-wmemchr tst-size_t-wmemcmp + endif +diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c +new file mode 100644 +index 00000000..9bd6fdb4 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c +@@ -0,0 +1,76 @@ ++/* Test memcmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define TEST_MAIN ++#ifdef WIDE ++# define TEST_NAME "wmemcmp" ++#else ++# define TEST_NAME "memcmp" ++#endif ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++# include ++ ++# define MEMCMP wmemcmp ++# define CHAR wchar_t ++#else ++# define MEMCMP memcmp ++# define CHAR char ++#endif ++ ++IMPL (MEMCMP, 1) ++ ++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); ++ ++static int ++__attribute__ ((noinline, noclone)) ++do_memcmp (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ memcpy (buf1, buf2, page_size); ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ int res = do_memcmp (dest, src); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c +new file mode 100644 +index 00000000..e8b5ffd0 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c +@@ -0,0 +1,20 @@ ++/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-memcmp.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-20.patch b/glibc-RHEL-15696-20.patch new file mode 100644 index 0000000..c63b3fb --- /dev/null +++ b/glibc-RHEL-15696-20.patch @@ -0,0 +1,164 @@ +From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sun, 7 Mar 2021 09:45:23 -0800 +Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memmove.h to select the function optimized with AVX512 +instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable +AVX512VL since VZEROUPPER isn't needed at function exit. 
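The RTM motivation shared across this series is that a VZEROUPPER executed inside a hardware transaction can abort it, whereas variants confined to ZMM16-ZMM31 never need VZEROUPPER at function exit. The standalone sketch below illustrates that interaction; it assumes an RTM-capable CPU and compilation with -mrtm, uses nothing from the patch itself, and whether the transaction commits depends on which memmove variant the IFUNC resolves at run time.

    /* Hedged demonstration of calling memmove inside an RTM transaction.
       Requires RTM hardware and -mrtm; transactions may abort for many
       reasons, so treat the output as indicative only.  */
    #include <immintrin.h>
    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      char dst[256], src[256] = "payload";
      unsigned int status = _xbegin ();
      if (status == _XBEGIN_STARTED)
        {
          /* If memmove dispatches to a variant ending in VZEROUPPER, the
             transaction is likely to abort and execution resumes at
             _xbegin with a non-started status.  */
          memmove (dst, src, sizeof src);
          _xend ();
          puts ("transaction committed");
        }
      else
        printf ("transaction aborted (status 0x%x)\n", status);
      return 0;
    }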
+--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 24 +++++++++--------- + sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 +++++---- + .../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++-- + 3 files changed, 42 insertions(+), 19 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index d969a156..fec384f6 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __memmove_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __memmove_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memmove_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memmove_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (AVX), +@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __memmove_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, memmove, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memmove_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memmove_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), + __memmove_ssse3_back) +@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __memcpy_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memcpy_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memcpy_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (AVX), +@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __memcpy_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, memcpy, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memcpy_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memcpy_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, 1, +@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __mempcpy_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (AVX), +@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __mempcpy_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, mempcpy, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + 
__mempcpy_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, + CPU_FEATURE_USABLE (AVX), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h +index fa09b9fb..014e95c7 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h +@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + { +- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) +- return OPTIMIZE (avx512_no_vzeroupper); ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx512_unaligned_erms); + +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE (avx512_unaligned_erms); ++ return OPTIMIZE (avx512_unaligned); ++ } + +- return OPTIMIZE (avx512_unaligned); ++ return OPTIMIZE (avx512_no_vzeroupper); + } + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S +index aac1515c..848848ab 100644 +--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S +@@ -1,11 +1,32 @@ + #if IS_IN (libc) + # define VEC_SIZE 64 +-# define VEC(i) zmm##i ++# define XMM0 xmm16 ++# define XMM1 xmm17 ++# define YMM0 ymm16 ++# define YMM1 ymm17 ++# define VEC0 zmm16 ++# define VEC1 zmm17 ++# define VEC2 zmm18 ++# define VEC3 zmm19 ++# define VEC4 zmm20 ++# define VEC5 zmm21 ++# define VEC6 zmm22 ++# define VEC7 zmm23 ++# define VEC8 zmm24 ++# define VEC9 zmm25 ++# define VEC10 zmm26 ++# define VEC11 zmm27 ++# define VEC12 zmm28 ++# define VEC13 zmm29 ++# define VEC14 zmm30 ++# define VEC15 zmm31 ++# define VEC(i) VEC##i + # define VMOVNT vmovntdq + # define VMOVU vmovdqu64 + # define VMOVA vmovdqa64 ++# define VZEROUPPER + +-# define SECTION(p) p##.avx512 ++# define SECTION(p) p##.evex512 + # define MEMMOVE_SYMBOL(p,s) p##_avx512_##s + + # include "memmove-vec-unaligned-erms.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-21.patch b/glibc-RHEL-15696-21.patch new file mode 100644 index 0000000..319c08d --- /dev/null +++ b/glibc-RHEL-15696-21.patch @@ -0,0 +1,71 @@ +From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001 +From: Sunil K Pandey +Date: Thu, 1 Apr 2021 15:47:04 -0700 +Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S +Content-type: text/plain; charset=UTF-8 + +Fix some indentations of ifdef in file strlen-evex.S which are off by 1 +and confusing to read. +--- + sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S +index cd022509..05838190 100644 +--- a/sysdeps/x86_64/multiarch/strlen-evex.S ++++ b/sysdeps/x86_64/multiarch/strlen-evex.S +@@ -276,10 +276,10 @@ L(last_2x_vec): + .p2align 4 + L(first_vec_x0_check): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +-# endif ++# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) +@@ -293,10 +293,10 @@ L(first_vec_x0_check): + .p2align 4 + L(first_vec_x1_check): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ + sall $2, %eax +-# endif ++# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) +@@ -311,10 +311,10 @@ L(first_vec_x1_check): + .p2align 4 + L(first_vec_x2_check): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +-# endif ++# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) +@@ -329,10 +329,10 @@ L(first_vec_x2_check): + .p2align 4 + L(first_vec_x3_check): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +-# endif ++# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) +-- +GitLab + diff --git a/glibc-RHEL-15696-22.patch b/glibc-RHEL-15696-22.patch new file mode 100644 index 0000000..c20557b --- /dev/null +++ b/glibc-RHEL-15696-22.patch @@ -0,0 +1,51 @@ +From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 19 Apr 2021 07:07:21 -0700 +Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex +Content-type: text/plain; charset=UTF-8 + +Since __strlen_evex and __strnlen_evex added by + +commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77 +Author: H.J. Lu +Date: Fri Mar 5 06:24:52 2021 -0800 + + x86-64: Add ifunc-avx2.h functions with 256-bit EVEX + +use sarx: + +c4 e2 6a f7 c0 sarx %edx,%eax,%eax + +require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c. +ifunc-avx2.h already requires BMI2 for EVEX implementation. +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index fec384f6..cbfc1a5d 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __strlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strlen, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __strlen_evex) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) + +@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __strnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strnlen, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __strnlen_evex) + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) + +-- +GitLab + diff --git a/glibc-RHEL-15696-23.patch b/glibc-RHEL-15696-23.patch new file mode 100644 index 0000000..ffde3d7 --- /dev/null +++ b/glibc-RHEL-15696-23.patch @@ -0,0 +1,584 @@ +From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 3 May 2021 03:01:58 -0400 +Subject: [PATCH] x86: Optimize memchr-avx2.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes memchr-avx2.S. The optimizations include +replacing some branches with cmovcc, avoiding some branches entirely +in the less_4x_vec case, making the page cross logic less strict, +asaving a few instructions the in loop return loop. test-memchr, +test-rawmemchr, and test-wmemchr are all passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++----------- + 1 file changed, 247 insertions(+), 178 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S +index cf893e77..b377f22e 100644 +--- a/sysdeps/x86_64/multiarch/memchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S +@@ -26,8 +26,22 @@ + + # ifdef USE_AS_WMEMCHR + # define VPCMPEQ vpcmpeqd ++# define VPBROADCAST vpbroadcastd ++# define CHAR_SIZE 4 + # else + # define VPCMPEQ vpcmpeqb ++# define VPBROADCAST vpbroadcastb ++# define CHAR_SIZE 1 ++# endif ++ ++# ifdef USE_AS_RAWMEMCHR ++# define ERAW_PTR_REG ecx ++# define RRAW_PTR_REG rcx ++# define ALGN_PTR_REG rdi ++# else ++# define ERAW_PTR_REG edi ++# define RRAW_PTR_REG rdi ++# define ALGN_PTR_REG rcx + # endif + + # ifndef VZEROUPPER +@@ -39,6 +53,7 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits + ENTRY (MEMCHR) +@@ -47,295 +62,349 @@ ENTRY (MEMCHR) + test %RDX_LP, %RDX_LP + jz L(null) + # endif +- movl %edi, %ecx +- /* Broadcast CHAR to YMM0. */ +- vmovd %esi, %xmm0 + # ifdef USE_AS_WMEMCHR + shl $2, %RDX_LP +- vpbroadcastd %xmm0, %ymm0 + # else + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx + # endif +- vpbroadcastb %xmm0, %ymm0 + # endif ++ /* Broadcast CHAR to YMMMATCH. */ ++ vmovd %esi, %xmm0 ++ VPBROADCAST %xmm0, %ymm0 + /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ movl %edi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. */ +- VPCMPEQ (%rdi), %ymm0, %ymm1 ++ VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +- testl %eax, %eax +- + # ifndef USE_AS_RAWMEMCHR +- jnz L(first_vec_x0_check) +- /* Adjust length and check the end of data. */ +- subq $VEC_SIZE, %rdx +- jbe L(zero) +-# else +- jnz L(first_vec_x0) ++ /* If length < CHAR_PER_VEC handle special. */ ++ cmpq $VEC_SIZE, %rdx ++ jbe L(first_vec_x0) + # endif +- +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + + # ifndef USE_AS_RAWMEMCHR +- /* Adjust length. */ +- addq %rcx, %rdx ++ .p2align 5 ++L(first_vec_x0): ++ /* Check if first match was before length. */ ++ tzcntl %eax, %eax ++ xorl %ecx, %ecx ++ cmpl %eax, %edx ++ leaq (%rdi, %rax), %rax ++ cmovle %rcx, %rax ++ VZEROUPPER_RETURN + +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) ++L(null): ++ xorl %eax, %eax ++ ret + # endif +- jmp L(more_4x_vec) +- + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- VPCMPEQ (%rdi), %ymm0, %ymm1 ++L(cross_page_boundary): ++ /* Save pointer before aligning as its original value is necessary ++ for computer return address if byte is found or adjusting length ++ if it is not and this is memchr. */ ++ movq %rdi, %rcx ++ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and ++ rdi for rawmemchr. */ ++ orq $(VEC_SIZE - 1), %ALGN_PTR_REG ++ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Calculate length until end of page (length checked for a ++ match). */ ++ leaq 1(%ALGN_PTR_REG), %rsi ++ subq %RRAW_PTR_REG, %rsi ++# endif + /* Remove the leading bytes. 
*/ +- sarl %cl, %eax +- testl %eax, %eax +- jz L(aligned_more) +- tzcntl %eax, %eax ++ sarxl %ERAW_PTR_REG, %eax, %eax + # ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) ++ cmpq %rsi, %rdx ++ jbe L(first_vec_x0) + # endif +- addq %rdi, %rax +- addq %rcx, %rax ++ testl %eax, %eax ++ jz L(cross_page_continue) ++ tzcntl %eax, %eax ++ addq %RRAW_PTR_REG, %rax + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 +-L(aligned_more): +-# ifndef USE_AS_RAWMEMCHR +- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" +- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition +- overflow. */ +- negq %rcx +- addq $VEC_SIZE, %rcx ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ incq %rdi ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +- /* Check the end of data. */ +- subq %rcx, %rdx +- jbe L(zero) +-# endif ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE + 1), %rdi ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 2 + 1), %rdi ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +- addq $VEC_SIZE, %rdi + +-# ifndef USE_AS_RAWMEMCHR +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif ++ .p2align 4 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 3 + 1), %rdi ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +-L(more_4x_vec): ++ .p2align 4 ++L(aligned_more): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) + +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++# ifndef USE_AS_RAWMEMCHR ++L(cross_page_continue): ++ /* Align data to VEC_SIZE - 1. */ ++ xorl %ecx, %ecx ++ subl %edi, %ecx ++ orq $(VEC_SIZE - 1), %rdi ++ /* esi is for adjusting length to see if near the end. */ ++ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi ++# else ++ orq $(VEC_SIZE - 1), %rdi ++L(cross_page_continue): ++# endif ++ /* Load first VEC regardless. */ ++ VPCMPEQ 1(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. If near end handle specially. */ ++ subq %rsi, %rdx ++ jbe L(last_4x_vec_or_less) ++# endif + testl %eax, %eax + jnz L(first_vec_x1) + +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- addq $(VEC_SIZE * 4), %rdi ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) + + # ifndef USE_AS_RAWMEMCHR ++ /* Check if at last VEC_SIZE * 4 length. */ + subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif +- +- /* Align data to 4 * VEC_SIZE. */ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx +- andq $-(4 * VEC_SIZE), %rdi +- +-# ifndef USE_AS_RAWMEMCHR +- /* Adjust length. */ ++ jbe L(last_4x_vec_or_less_cmpeq) ++ /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust ++ length. */ ++ incq %rdi ++ movl %edi, %ecx ++ orq $(VEC_SIZE * 4 - 1), %rdi ++ andl $(VEC_SIZE * 4 - 1), %ecx + addq %rcx, %rdx ++# else ++ /* Align data to VEC_SIZE * 4 - 1 for loop. */ ++ incq %rdi ++ orq $(VEC_SIZE * 4 - 1), %rdi + # endif + ++ /* Compare 4 * VEC at a time forward. 
*/ + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 +- ++ VPCMPEQ 1(%rdi), %ymm0, %ymm1 ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2 ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3 ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4 + vpor %ymm1, %ymm2, %ymm5 + vpor %ymm3, %ymm4, %ymm6 + vpor %ymm5, %ymm6, %ymm5 + +- vpmovmskb %ymm5, %eax +- testl %eax, %eax +- jnz L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- ++ vpmovmskb %ymm5, %ecx + # ifdef USE_AS_RAWMEMCHR +- jmp L(loop_4x_vec) ++ subq $-(VEC_SIZE * 4), %rdi ++ testl %ecx, %ecx ++ jz L(loop_4x_vec) + # else +- subq $(VEC_SIZE * 4), %rdx +- ja L(loop_4x_vec) ++ testl %ecx, %ecx ++ jnz L(loop_4x_vec_end) + +-L(last_4x_vec_or_less): +- /* Less than 4 * VEC and aligned to VEC_SIZE. */ +- addl $(VEC_SIZE * 2), %edx +- jle L(last_2x_vec) ++ subq $-(VEC_SIZE * 4), %rdi + +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_4x_vec) + +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ /* Fall through into less than 4 remaining vectors of length case. ++ */ ++ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax ++ .p2align 4 ++L(last_4x_vec_or_less): ++ /* Check if first VEC contained match. */ + testl %eax, %eax +- jnz L(first_vec_x1) ++ jnz L(first_vec_x1_check) + +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax ++ /* If remaining length > VEC_SIZE * 2. */ ++ addl $(VEC_SIZE * 2), %edx ++ jg L(last_4x_vec) + +- jnz L(first_vec_x2_check) +- subl $VEC_SIZE, %edx +- jle L(zero) ++L(last_2x_vec): ++ /* If remaining length < VEC_SIZE. */ ++ addl $VEC_SIZE, %edx ++ jle L(zero_end) + +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ /* Check VEC2 and compare any match with remaining length. */ ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +- testl %eax, %eax +- +- jnz L(first_vec_x3_check) +- xorl %eax, %eax ++ tzcntl %eax, %eax ++ cmpl %eax, %edx ++ jbe L(set_zero_end) ++ addq $(VEC_SIZE + 1), %rdi ++ addq %rdi, %rax ++L(zero_end): + VZEROUPPER_RETURN + + .p2align 4 +-L(last_2x_vec): +- addl $(VEC_SIZE * 2), %edx +- VPCMPEQ (%rdi), %ymm0, %ymm1 ++L(loop_4x_vec_end): ++# endif ++ /* rawmemchr will fall through into this if match was found in ++ loop. */ ++ + vpmovmskb %ymm1, %eax + testl %eax, %eax ++ jnz L(last_vec_x1_return) + +- jnz L(first_vec_x0_check) +- subl $VEC_SIZE, %edx +- jle L(zero) +- +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm2, %eax + testl %eax, %eax +- jnz L(first_vec_x1_check) +- xorl %eax, %eax +- VZEROUPPER_RETURN ++ jnz L(last_vec_x2_return) + +- .p2align 4 +-L(first_vec_x0_check): +- tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) ++ vpmovmskb %ymm3, %eax ++ /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */ ++ salq $32, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++# ifdef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 2 - 1), %rdi ++# else ++ subq $-(VEC_SIZE * 2 + 1), %rdi ++# endif + addq %rdi, %rax + VZEROUPPER_RETURN ++# ifndef USE_AS_RAWMEMCHR + + .p2align 4 + L(first_vec_x1_check): + tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $VEC_SIZE, %rax ++ /* Adjust length. */ ++ subl $-(VEC_SIZE * 4), %edx ++ /* Check if match within remaining length. 
*/ ++ cmpl %eax, %edx ++ jbe L(set_zero_end) ++ incq %rdi + addq %rdi, %rax + VZEROUPPER_RETURN ++ .p2align 4 ++L(set_zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN ++# endif + + .p2align 4 +-L(first_vec_x2_check): ++L(last_vec_x1_return): + tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $(VEC_SIZE * 2), %rax ++# ifdef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 4 - 1), %rdi ++# else ++ incq %rdi ++# endif + addq %rdi, %rax + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x3_check): ++L(last_vec_x2_return): + tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $(VEC_SIZE * 3), %rax ++# ifdef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 3 - 1), %rdi ++# else ++ subq $-(VEC_SIZE + 1), %rdi ++# endif + addq %rdi, %rax + VZEROUPPER_RETURN + ++# ifndef USE_AS_RAWMEMCHR + .p2align 4 +-L(zero): +- xorl %eax, %eax +- jmp L(return_vzeroupper) ++L(last_4x_vec_or_less_cmpeq): ++ VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ subq $-(VEC_SIZE * 4), %rdi ++ /* Check first VEC regardless. */ ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) + ++ /* If remaining length <= CHAR_PER_VEC * 2. */ ++ addl $(VEC_SIZE * 2), %edx ++ jle L(last_2x_vec) + .p2align 4 +-L(null): +- xorl %eax, %eax +- ret +-# endif ++L(last_4x_vec): ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2_return) + +- .p2align 4 +-L(first_vec_x0): +- tzcntl %eax, %eax +- addq %rdi, %rax +- VZEROUPPER_RETURN ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + +- .p2align 4 +-L(first_vec_x1): +- tzcntl %eax, %eax +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +- VZEROUPPER_RETURN ++ /* Create mask for possible matches within remaining length. */ ++ movq $-1, %rcx ++ bzhiq %rdx, %rcx, %rcx + +- .p2align 4 +-L(first_vec_x2): ++ /* Test matches in data against length match. */ ++ andl %ecx, %eax ++ jnz L(last_vec_x3) ++ ++ /* if remaining length <= VEC_SIZE * 3 (Note this is after ++ remaining length was found to be > VEC_SIZE * 2. */ ++ subl $VEC_SIZE, %edx ++ jbe L(zero_end2) ++ ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Shift remaining length mask for last VEC. */ ++ shrq $32, %rcx ++ andl %ecx, %eax ++ jz L(zero_end2) + tzcntl %eax, %eax +- addq $(VEC_SIZE * 2), %rax ++ addq $(VEC_SIZE * 3 + 1), %rdi + addq %rdi, %rax ++L(zero_end2): + VZEROUPPER_RETURN + + .p2align 4 +-L(4x_vec_end): +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- vpmovmskb %ymm2, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- vpmovmskb %ymm3, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- vpmovmskb %ymm4, %eax +- testl %eax, %eax +-L(first_vec_x3): ++L(last_vec_x3): + tzcntl %eax, %eax +- addq $(VEC_SIZE * 3), %rax ++ subq $-(VEC_SIZE * 2 + 1), %rdi + addq %rdi, %rax + VZEROUPPER_RETURN ++# endif + + END (MEMCHR) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-24.patch b/glibc-RHEL-15696-24.patch new file mode 100644 index 0000000..c4f24ff --- /dev/null +++ b/glibc-RHEL-15696-24.patch @@ -0,0 +1,388 @@ +From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 9 Jun 2021 16:25:32 -0400 +Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ + #27974] +Content-type: text/plain; charset=UTF-8 + +This commit fixes the bug mentioned in the previous commit. 
+ +The previous implementations of wmemchr in these files relied +on n * sizeof(wchar_t) which was not guranteed by the standard. + +The new overflow tests added in the previous commit now +pass (As well as all the other tests). + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++------- + sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------ + 2 files changed, 98 insertions(+), 37 deletions(-) + +diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S +index cb320257..24f9a0c5 100644 +--- a/sysdeps/x86_64/memchr.S ++++ b/sysdeps/x86_64/memchr.S +@@ -21,9 +21,11 @@ + #ifdef USE_AS_WMEMCHR + # define MEMCHR wmemchr + # define PCMPEQ pcmpeqd ++# define CHAR_PER_VEC 4 + #else + # define MEMCHR memchr + # define PCMPEQ pcmpeqb ++# define CHAR_PER_VEC 16 + #endif + + /* fast SSE2 version with using pmaxub and 64 byte loop */ +@@ -33,15 +35,14 @@ ENTRY(MEMCHR) + movd %esi, %xmm1 + mov %edi, %ecx + ++#ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++#endif + #ifdef USE_AS_WMEMCHR + test %RDX_LP, %RDX_LP + jz L(return_null) +- shl $2, %RDX_LP + #else +-# ifdef __ILP32__ +- /* Clear the upper 32 bits. */ +- movl %edx, %edx +-# endif + punpcklbw %xmm1, %xmm1 + test %RDX_LP, %RDX_LP + jz L(return_null) +@@ -60,13 +61,16 @@ ENTRY(MEMCHR) + test %eax, %eax + + jnz L(matches_1) +- sub $16, %rdx ++ sub $CHAR_PER_VEC, %rdx + jbe L(return_null) + add $16, %rdi + and $15, %ecx + and $-16, %rdi ++#ifdef USE_AS_WMEMCHR ++ shr $2, %ecx ++#endif + add %rcx, %rdx +- sub $64, %rdx ++ sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + jmp L(loop_prolog) + +@@ -77,16 +81,21 @@ L(crosscache): + movdqa (%rdi), %xmm0 + + PCMPEQ %xmm1, %xmm0 +-/* Check if there is a match. */ ++ /* Check if there is a match. */ + pmovmskb %xmm0, %eax +-/* Remove the leading bytes. */ ++ /* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +-/* Check which byte is a match. */ ++ /* Check which byte is a match. */ + bsf %eax, %eax +- ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub %rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + add %rdi, %rax + add %rcx, %rax +@@ -94,15 +103,18 @@ L(crosscache): + + .p2align 4 + L(unaligned_no_match): +- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using ++ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void + possible addition overflow. 
*/ + neg %rcx + add $16, %rcx ++#ifdef USE_AS_WMEMCHR ++ shr $2, %ecx ++#endif + sub %rcx, %rdx + jbe L(return_null) + add $16, %rdi +- sub $64, %rdx ++ sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + + .p2align 4 +@@ -135,7 +147,7 @@ L(loop_prolog): + test $0x3f, %rdi + jz L(align64_loop) + +- sub $64, %rdx ++ sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + + movdqa (%rdi), %xmm0 +@@ -167,11 +179,14 @@ L(loop_prolog): + mov %rdi, %rcx + and $-64, %rdi + and $63, %ecx ++#ifdef USE_AS_WMEMCHR ++ shr $2, %ecx ++#endif + add %rcx, %rdx + + .p2align 4 + L(align64_loop): +- sub $64, %rdx ++ sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 +@@ -218,7 +233,7 @@ L(align64_loop): + + .p2align 4 + L(exit_loop): +- add $32, %edx ++ add $(CHAR_PER_VEC * 2), %edx + jle L(exit_loop_32) + + movdqa (%rdi), %xmm0 +@@ -238,7 +253,7 @@ L(exit_loop): + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32_1) +- sub $16, %edx ++ sub $CHAR_PER_VEC, %edx + jle L(return_null) + + PCMPEQ 48(%rdi), %xmm1 +@@ -250,13 +265,13 @@ L(exit_loop): + + .p2align 4 + L(exit_loop_32): +- add $32, %edx ++ add $(CHAR_PER_VEC * 2), %edx + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) +- sub $16, %edx ++ sub $CHAR_PER_VEC, %edx + jbe L(return_null) + + PCMPEQ 16(%rdi), %xmm1 +@@ -293,7 +308,13 @@ L(matches32): + .p2align 4 + L(matches_1): + bsf %eax, %eax ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub %rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + add %rdi, %rax + ret +@@ -301,7 +322,13 @@ L(matches_1): + .p2align 4 + L(matches16_1): + bsf %eax, %eax ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub %rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + lea 16(%rdi, %rax), %rax + ret +@@ -309,7 +336,13 @@ L(matches16_1): + .p2align 4 + L(matches32_1): + bsf %eax, %eax ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub %rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + lea 32(%rdi, %rax), %rax + ret +@@ -317,7 +350,13 @@ L(matches32_1): + .p2align 4 + L(matches48_1): + bsf %eax, %eax ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub %rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + lea 48(%rdi, %rax), %rax + ret +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S +index b377f22e..16027abb 100644 +--- a/sysdeps/x86_64/multiarch/memchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S +@@ -54,21 +54,19 @@ + + # define VEC_SIZE 32 + # define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text),"ax",@progbits + ENTRY (MEMCHR) + # ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ +- test %RDX_LP, %RDX_LP +- jz L(null) +-# endif +-# ifdef USE_AS_WMEMCHR +- shl $2, %RDX_LP +-# else + # ifdef __ILP32__ +- /* Clear the upper 32 bits. */ +- movl %edx, %edx ++ /* Clear upper bits. */ ++ and %RDX_LP, %RDX_LP ++# else ++ test %RDX_LP, %RDX_LP + # endif ++ jz L(null) + # endif + /* Broadcast CHAR to YMMMATCH. */ + vmovd %esi, %xmm0 +@@ -84,7 +82,7 @@ ENTRY (MEMCHR) + vpmovmskb %ymm1, %eax + # ifndef USE_AS_RAWMEMCHR + /* If length < CHAR_PER_VEC handle special. */ +- cmpq $VEC_SIZE, %rdx ++ cmpq $CHAR_PER_VEC, %rdx + jbe L(first_vec_x0) + # endif + testl %eax, %eax +@@ -98,6 +96,10 @@ ENTRY (MEMCHR) + L(first_vec_x0): + /* Check if first match was before length. 
*/ + tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %edx ++# endif + xorl %ecx, %ecx + cmpl %eax, %edx + leaq (%rdi, %rax), %rax +@@ -110,12 +112,12 @@ L(null): + # endif + .p2align 4 + L(cross_page_boundary): +- /* Save pointer before aligning as its original value is necessary +- for computer return address if byte is found or adjusting length +- if it is not and this is memchr. */ ++ /* Save pointer before aligning as its original value is ++ necessary for computer return address if byte is found or ++ adjusting length if it is not and this is memchr. */ + movq %rdi, %rcx +- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and +- rdi for rawmemchr. */ ++ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr ++ and rdi for rawmemchr. */ + orq $(VEC_SIZE - 1), %ALGN_PTR_REG + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +@@ -124,6 +126,10 @@ L(cross_page_boundary): + match). */ + leaq 1(%ALGN_PTR_REG), %rsi + subq %RRAW_PTR_REG, %rsi ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide bytes by 4 to get wchar_t count. */ ++ shrl $2, %esi ++# endif + # endif + /* Remove the leading bytes. */ + sarxl %ERAW_PTR_REG, %eax, %eax +@@ -181,6 +187,10 @@ L(cross_page_continue): + orq $(VEC_SIZE - 1), %rdi + /* esi is for adjusting length to see if near the end. */ + leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %esi ++# endif + # else + orq $(VEC_SIZE - 1), %rdi + L(cross_page_continue): +@@ -213,7 +223,7 @@ L(cross_page_continue): + + # ifndef USE_AS_RAWMEMCHR + /* Check if at last VEC_SIZE * 4 length. */ +- subq $(VEC_SIZE * 4), %rdx ++ subq $(CHAR_PER_VEC * 4), %rdx + jbe L(last_4x_vec_or_less_cmpeq) + /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust + length. */ +@@ -221,6 +231,10 @@ L(cross_page_continue): + movl %edi, %ecx + orq $(VEC_SIZE * 4 - 1), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx ++# endif + addq %rcx, %rdx + # else + /* Align data to VEC_SIZE * 4 - 1 for loop. */ +@@ -250,15 +264,19 @@ L(loop_4x_vec): + + subq $-(VEC_SIZE * 4), %rdi + +- subq $(VEC_SIZE * 4), %rdx ++ subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop_4x_vec) + +- /* Fall through into less than 4 remaining vectors of length case. +- */ ++ /* Fall through into less than 4 remaining vectors of length ++ case. */ + VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + .p2align 4 + L(last_4x_vec_or_less): ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %edx ++# endif + /* Check if first VEC contained match. */ + testl %eax, %eax + jnz L(first_vec_x1_check) +@@ -355,6 +373,10 @@ L(last_vec_x2_return): + L(last_4x_vec_or_less_cmpeq): + VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %edx ++# endif + subq $-(VEC_SIZE * 4), %rdi + /* Check first VEC regardless. 
*/ + testl %eax, %eax +-- +GitLab + diff --git a/glibc-RHEL-15696-25.patch b/glibc-RHEL-15696-25.patch new file mode 100644 index 0000000..e0ed8ea --- /dev/null +++ b/glibc-RHEL-15696-25.patch @@ -0,0 +1,767 @@ +From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 19 Apr 2021 19:36:07 -0400 +Subject: [PATCH] x86: Optimize strlen-avx2.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes strlen-avx2.S. The optimizations are +mostly small things but they add up to roughly 10-30% performance +improvement for strlen. The results for strnlen are bit more +ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen +are all passing. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- + sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- + 2 files changed, 334 insertions(+), 214 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index cbfc1a5d..f1a6460a 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/strlen.c. */ + IFUNC_IMPL (i, name, strlen, + IFUNC_IMPL_ADD (array, i, strlen, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __strlen_avx2) + IFUNC_IMPL_ADD (array, i, strlen, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __strlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strlen, +@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/strnlen.c. */ + IFUNC_IMPL (i, name, strnlen, + IFUNC_IMPL_ADD (array, i, strnlen, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __strnlen_avx2) + IFUNC_IMPL_ADD (array, i, strnlen, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __strnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strnlen, +@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/wcslen.c. */ + IFUNC_IMPL (i, name, wcslen, + IFUNC_IMPL_ADD (array, i, wcslen, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __wcslen_avx2) + IFUNC_IMPL_ADD (array, i, wcslen, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __wcslen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcslen, +@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. 
*/ + IFUNC_IMPL (i, name, wcsnlen, + IFUNC_IMPL_ADD (array, i, wcsnlen, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __wcsnlen_avx2) + IFUNC_IMPL_ADD (array, i, wcsnlen, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __wcsnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsnlen, +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S +index 82826e10..be8a5db5 100644 +--- a/sysdeps/x86_64/multiarch/strlen-avx2.S ++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S +@@ -27,9 +27,11 @@ + # ifdef USE_AS_WCSLEN + # define VPCMPEQ vpcmpeqd + # define VPMINU vpminud ++# define CHAR_SIZE 4 + # else + # define VPCMPEQ vpcmpeqb + # define VPMINU vpminub ++# define CHAR_SIZE 1 + # endif + + # ifndef VZEROUPPER +@@ -41,349 +43,459 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN +- /* Check for zero length. */ ++ /* Check zero length. */ + test %RSI_LP, %RSI_LP + jz L(zero) ++ /* Store max len in R8_LP before adjusting if using WCSLEN. */ ++ mov %RSI_LP, %R8_LP + # ifdef USE_AS_WCSLEN + shl $2, %RSI_LP + # elif defined __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi + # endif +- mov %RSI_LP, %R8_LP + # endif +- movl %edi, %ecx ++ movl %edi, %eax + movq %rdi, %rdx + vpxor %xmm0, %xmm0, %xmm0 +- ++ /* Clear high bits from edi. Only keeping bits relevant to page ++ cross check. */ ++ andl $(PAGE_SIZE - 1), %eax + /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. */ +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + # ifdef USE_AS_STRNLEN +- jnz L(first_vec_x0_check) +- /* Adjust length and check the end of data. */ +- subq $VEC_SIZE, %rsi +- jbe L(max) +-# else +- jnz L(first_vec_x0) ++ /* If length < VEC_SIZE handle special. */ ++ cmpq $VEC_SIZE, %rsi ++ jbe L(first_vec_x0) + # endif +- +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ /* If empty continue to aligned_more. Otherwise return bit ++ position of first match. */ ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNLEN +- /* Adjust length. */ +- addq %rcx, %rsi ++L(zero): ++ xorl %eax, %eax ++ ret + +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) ++ .p2align 4 ++L(first_vec_x0): ++ /* Set bit for max len so that tzcnt will return min of max len ++ and position of first match. */ ++ btsq %rsi, %rax ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + # endif +- jmp L(more_4x_vec) + + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- /* Remove the leading bytes. */ +- sarl %cl, %eax +- testl %eax, %eax +- jz L(aligned_more) ++L(first_vec_x1): + tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- /* Check the end of data. 
*/ +- cmpq %rax, %rsi +- jbe L(max) ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ subl $(VEC_SIZE * 4 + 1), %ecx ++ addl %ecx, %eax ++# else ++ subl %edx, %edi ++ incl %edi ++ addl %edi, %eax + # endif +- addq %rdi, %rax +- addq %rcx, %rax +- subq %rdx, %rax + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ shrl $2, %eax + # endif +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ VZEROUPPER_RETURN + + .p2align 4 +-L(aligned_more): ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" +- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" +- to void possible addition overflow. */ +- negq %rcx +- addq $VEC_SIZE, %rcx +- +- /* Check the end of data. */ +- subq %rcx, %rsi +- jbe L(max) ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ subl $(VEC_SIZE * 3 + 1), %ecx ++ addl %ecx, %eax ++# else ++ subl %edx, %edi ++ addl $(VEC_SIZE + 1), %edi ++ addl %edi, %eax + # endif ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + +- addq $VEC_SIZE, %rdi ++ .p2align 4 ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ ++# ifdef USE_AS_STRNLEN ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ subl $(VEC_SIZE * 2 + 1), %ecx ++ addl %ecx, %eax ++# else ++ subl %edx, %edi ++ addl $(VEC_SIZE * 2 + 1), %edi ++ addl %edi, %eax ++# endif ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + ++ .p2align 4 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ subl $(VEC_SIZE + 1), %ecx ++ addl %ecx, %eax ++# else ++ subl %edx, %edi ++ addl $(VEC_SIZE * 3 + 1), %edi ++ addl %edi, %eax + # endif ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + +-L(more_4x_vec): ++ .p2align 5 ++L(aligned_more): ++ /* Align data to VEC_SIZE - 1. This is the same number of ++ instructions as using andq with -VEC_SIZE but saves 4 bytes of ++ code on the x4 check. */ ++ orq $(VEC_SIZE - 1), %rdi ++L(cross_page_continue): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++# ifdef USE_AS_STRNLEN ++ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because ++ it simplies the logic in last_4x_vec_or_less. */ ++ leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx ++ subq %rdx, %rcx ++# endif ++ /* Load first VEC regardless. */ ++ VPCMPEQ 1(%rdi), %ymm0, %ymm1 ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. If near end handle specially. 
*/ ++ subq %rcx, %rsi ++ jb L(last_4x_vec_or_less) ++# endif ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- addq $(VEC_SIZE * 4), %rdi +- +-# ifdef USE_AS_STRNLEN +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) +-# endif +- +- /* Align data to 4 * VEC_SIZE. */ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx +- andq $-(4 * VEC_SIZE), %rdi ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) + ++ /* Align data to VEC_SIZE * 4 - 1. */ + # ifdef USE_AS_STRNLEN +- /* Adjust length. */ ++ /* Before adjusting length check if at last VEC_SIZE * 4. */ ++ cmpq $(VEC_SIZE * 4 - 1), %rsi ++ jbe L(last_4x_vec_or_less_load) ++ incq %rdi ++ movl %edi, %ecx ++ orq $(VEC_SIZE * 4 - 1), %rdi ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ /* Readjust length. */ + addq %rcx, %rsi ++# else ++ incq %rdi ++ orq $(VEC_SIZE * 4 - 1), %rdi + # endif +- ++ /* Compare 4 * VEC at a time forward. */ + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- vmovdqa (%rdi), %ymm1 +- vmovdqa VEC_SIZE(%rdi), %ymm2 +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 +- VPMINU %ymm1, %ymm2, %ymm5 +- VPMINU %ymm3, %ymm4, %ymm6 +- VPMINU %ymm5, %ymm6, %ymm5 +- +- VPCMPEQ %ymm5, %ymm0, %ymm5 +- vpmovmskb %ymm5, %eax +- testl %eax, %eax +- jnz L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- +-# ifndef USE_AS_STRNLEN +- jmp L(loop_4x_vec) +-# else ++# ifdef USE_AS_STRNLEN ++ /* Break if at end of length. */ + subq $(VEC_SIZE * 4), %rsi +- ja L(loop_4x_vec) +- +-L(last_4x_vec_or_less): +- /* Less than 4 * VEC and aligned to VEC_SIZE. */ +- addl $(VEC_SIZE * 2), %esi +- jle L(last_2x_vec) ++ jb L(last_4x_vec_or_less_cmpeq) ++# endif ++ /* Save some code size by microfusing VPMINU with the load. Since ++ the matches in ymm2/ymm4 can only be returned if there where no ++ matches in ymm1/ymm3 respectively there is no issue with overlap. ++ */ ++ vmovdqa 1(%rdi), %ymm1 ++ VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 ++ VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 ++ ++ VPMINU %ymm2, %ymm4, %ymm5 ++ VPCMPEQ %ymm5, %ymm0, %ymm5 ++ vpmovmskb %ymm5, %ecx + +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) ++ subq $-(VEC_SIZE * 4), %rdi ++ testl %ecx, %ecx ++ jz L(loop_4x_vec) + +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) + +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ %ymm1, %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ subq %rdx, %rdi + testl %eax, %eax ++ jnz L(last_vec_return_x0) + +- jnz L(first_vec_x2_check) +- subl $VEC_SIZE, %esi +- jle L(max) +- +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ %ymm2, %ymm0, %ymm2 ++ vpmovmskb %ymm2, %eax + testl %eax, %eax +- +- jnz L(first_vec_x3_check) +- movq %r8, %rax +-# ifdef USE_AS_WCSLEN ++ jnz L(last_vec_return_x1) ++ ++ /* Combine last 2 VEC. */ ++ VPCMPEQ %ymm3, %ymm0, %ymm3 ++ vpmovmskb %ymm3, %eax ++ /* rcx has combined result from all 4 VEC. 
It will only be used if ++ the first 3 other VEC all did not contain a match. */ ++ salq $32, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++ subq $(VEC_SIZE * 2 - 1), %rdi ++ addq %rdi, %rax ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + ++ ++# ifdef USE_AS_STRNLEN + .p2align 4 +-L(last_2x_vec): +- addl $(VEC_SIZE * 2), %esi +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax ++L(last_4x_vec_or_less_load): ++ /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ ++ subq $-(VEC_SIZE * 4), %rdi ++L(last_4x_vec_or_less_cmpeq): ++ VPCMPEQ 1(%rdi), %ymm0, %ymm1 ++L(last_4x_vec_or_less): + +- jnz L(first_vec_x0_check) +- subl $VEC_SIZE, %esi +- jle L(max) ++ vpmovmskb %ymm1, %eax ++ /* If remaining length > VEC_SIZE * 2. This works if esi is off by ++ VEC_SIZE * 4. */ ++ testl $(VEC_SIZE * 2), %esi ++ jnz L(last_4x_vec) + +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ /* length may have been negative or positive by an offset of ++ VEC_SIZE * 4 depending on where this was called from. This fixes ++ that. */ ++ andl $(VEC_SIZE * 4 - 1), %esi + testl %eax, %eax +- jnz L(first_vec_x1_check) +- movq %r8, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif +- VZEROUPPER_RETURN ++ jnz L(last_vec_x1_check) + +- .p2align 4 +-L(first_vec_x0_check): ++ subl $VEC_SIZE, %esi ++ jb L(max) ++ ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + tzcntl %eax, %eax + /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) ++ cmpl %eax, %esi ++ jb L(max) ++ subq %rdx, %rdi ++ addl $(VEC_SIZE + 1), %eax + addq %rdi, %rax +- subq %rdx, %rax + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif + VZEROUPPER_RETURN ++# endif + + .p2align 4 +-L(first_vec_x1_check): ++L(last_vec_return_x0): + tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $VEC_SIZE, %rax ++ subq $(VEC_SIZE * 4 - 1), %rdi + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x2_check): ++L(last_vec_return_x1): + tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $(VEC_SIZE * 2), %rax ++ subq $(VEC_SIZE * 3 - 1), %rdi + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + ++# ifdef USE_AS_STRNLEN + .p2align 4 +-L(first_vec_x3_check): ++L(last_vec_x1_check): ++ + tzcntl %eax, %eax + /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $(VEC_SIZE * 3), %rax ++ cmpl %eax, %esi ++ jb L(max) ++ subq %rdx, %rdi ++ incl %eax + addq %rdi, %rax +- subq %rdx, %rax + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif + VZEROUPPER_RETURN + +- .p2align 4 + L(max): + movq %r8, %rax ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(last_4x_vec): ++ /* Test first 2x VEC normally. */ ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ /* Normalize length. */ ++ andl $(VEC_SIZE * 4 - 1), %esi ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ subl $(VEC_SIZE * 3), %esi ++ jb L(max) ++ ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ tzcntl %eax, %eax ++ /* Check the end of data. 
*/ ++ cmpl %eax, %esi ++ jb L(max) ++ subq %rdx, %rdi ++ addl $(VEC_SIZE * 3 + 1), %eax ++ addq %rdi, %rax + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif + VZEROUPPER_RETURN + +- .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret +-# endif + + .p2align 4 +-L(first_vec_x0): ++L(last_vec_x1): ++ /* essentially duplicates of first_vec_x1 but use 64 bit ++ instructions. */ + tzcntl %eax, %eax ++ subq %rdx, %rdi ++ incl %eax + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x1): ++L(last_vec_x2): ++ /* essentially duplicates of first_vec_x1 but use 64 bit ++ instructions. */ + tzcntl %eax, %eax +- addq $VEC_SIZE, %rax ++ subq %rdx, %rdi ++ addl $(VEC_SIZE + 1), %eax + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x2): ++L(last_vec_x3): + tzcntl %eax, %eax +- addq $(VEC_SIZE * 2), %rax ++ subl $(VEC_SIZE * 2), %esi ++ /* Check the end of data. */ ++ cmpl %eax, %esi ++ jb L(max_end) ++ subq %rdx, %rdi ++ addl $(VEC_SIZE * 2 + 1), %eax + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif ++ VZEROUPPER_RETURN ++L(max_end): ++ movq %r8, %rax + VZEROUPPER_RETURN ++# endif + ++ /* Cold case for crossing page with first load. */ + .p2align 4 +-L(4x_vec_end): +- VPCMPEQ %ymm1, %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- VPCMPEQ %ymm2, %ymm0, %ymm2 +- vpmovmskb %ymm2, %eax ++L(cross_page_boundary): ++ /* Align data to VEC_SIZE - 1. */ ++ orq $(VEC_SIZE - 1), %rdi ++ VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT ++ so no need to manually mod rdx. */ ++ sarxl %edx, %eax, %eax ++# ifdef USE_AS_STRNLEN + testl %eax, %eax +- jnz L(first_vec_x1) +- VPCMPEQ %ymm3, %ymm0, %ymm3 +- vpmovmskb %ymm3, %eax ++ jnz L(cross_page_less_vec) ++ leaq 1(%rdi), %rcx ++ subq %rdx, %rcx ++ /* Check length. */ ++ cmpq %rsi, %rcx ++ jb L(cross_page_continue) ++ movq %r8, %rax ++# else + testl %eax, %eax +- jnz L(first_vec_x2) +- VPCMPEQ %ymm4, %ymm0, %ymm4 +- vpmovmskb %ymm4, %eax +-L(first_vec_x3): ++ jz L(cross_page_continue) + tzcntl %eax, %eax +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif + # endif ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++ ++# ifdef USE_AS_STRNLEN ++ .p2align 4 ++L(cross_page_less_vec): ++ tzcntl %eax, %eax ++ cmpq %rax, %rsi ++ cmovb %esi, %eax ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif + VZEROUPPER_RETURN ++# endif + + END (STRLEN) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-26.patch b/glibc-RHEL-15696-26.patch new file mode 100644 index 0000000..d46fe6e --- /dev/null +++ b/glibc-RHEL-15696-26.patch @@ -0,0 +1,701 @@ +From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 3 May 2021 03:03:19 -0400 +Subject: [PATCH] x86: Optimize memchr-evex.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes memchr-evex.S. 
The optimizations include +replacing some branches with cmovcc, avoiding some branches entirely +in the less_4x_vec case, making the page cross logic less strict, +saving some ALU in the alignment process, and most importantly +increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and +test-wmemchr are all passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++---------- + 1 file changed, 322 insertions(+), 225 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S +index 6dd5d67b..81d5cd64 100644 +--- a/sysdeps/x86_64/multiarch/memchr-evex.S ++++ b/sysdeps/x86_64/multiarch/memchr-evex.S +@@ -26,14 +26,28 @@ + + # ifdef USE_AS_WMEMCHR + # define VPBROADCAST vpbroadcastd +-# define VPCMP vpcmpd +-# define SHIFT_REG r8d ++# define VPMINU vpminud ++# define VPCMP vpcmpd ++# define VPCMPEQ vpcmpeqd ++# define CHAR_SIZE 4 + # else + # define VPBROADCAST vpbroadcastb +-# define VPCMP vpcmpb +-# define SHIFT_REG ecx ++# define VPMINU vpminub ++# define VPCMP vpcmpb ++# define VPCMPEQ vpcmpeqb ++# define CHAR_SIZE 1 + # endif + ++# ifdef USE_AS_RAWMEMCHR ++# define RAW_PTR_REG rcx ++# define ALGN_PTR_REG rdi ++# else ++# define RAW_PTR_REG rdi ++# define ALGN_PTR_REG rcx ++# endif ++ ++# define XMMZERO xmm23 ++# define YMMZERO ymm23 + # define XMMMATCH xmm16 + # define YMMMATCH ymm16 + # define YMM1 ymm17 +@@ -44,6 +58,8 @@ + # define YMM6 ymm22 + + # define VEC_SIZE 32 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) ++# define PAGE_SIZE 4096 + + .section .text.evex,"ax",@progbits + ENTRY (MEMCHR) +@@ -51,11 +67,7 @@ ENTRY (MEMCHR) + /* Check for zero length. */ + test %RDX_LP, %RDX_LP + jz L(zero) +-# endif +- movl %edi, %ecx +-# ifdef USE_AS_WMEMCHR +- shl $2, %RDX_LP +-# else ++ + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -64,318 +76,403 @@ ENTRY (MEMCHR) + /* Broadcast CHAR to YMMMATCH. */ + VPBROADCAST %esi, %YMMMATCH + /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ movl %edi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. */ +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- ++ VPCMP $0, (%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax + # ifndef USE_AS_RAWMEMCHR +- jnz L(first_vec_x0_check) +- /* Adjust length and check the end of data. */ +- subq $VEC_SIZE, %rdx +- jbe L(zero) ++ /* If length < CHAR_PER_VEC handle special. */ ++ cmpq $CHAR_PER_VEC, %rdx ++ jbe L(first_vec_x0) ++# endif ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax + # else +- jnz L(first_vec_x0) ++ addq %rdi, %rax + # endif +- +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ ret + + # ifndef USE_AS_RAWMEMCHR +- /* Adjust length. */ +- addq %rcx, %rdx +- +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif +- jmp L(more_4x_vec) ++L(zero): ++ xorl %eax, %eax ++ ret + ++ .p2align 5 ++L(first_vec_x0): ++ /* Check if first match was before length. 
*/ ++ tzcntl %eax, %eax ++ xorl %ecx, %ecx ++ cmpl %eax, %edx ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++ cmovle %rcx, %rax ++ ret ++# else ++ /* NB: first_vec_x0 is 17 bytes which will leave ++ cross_page_boundary (which is relatively cold) close enough ++ to ideal alignment. So only realign L(cross_page_boundary) if ++ rawmemchr. */ + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx ++# endif ++L(cross_page_boundary): ++ /* Save pointer before aligning as its original value is ++ necessary for computer return address if byte is found or ++ adjusting length if it is not and this is memchr. */ ++ movq %rdi, %rcx ++ /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi ++ for rawmemchr. */ ++ andq $-VEC_SIZE, %ALGN_PTR_REG ++ VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0 ++ kmovd %k0, %r8d + # ifdef USE_AS_WMEMCHR +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ /* NB: Divide shift count by 4 since each bit in K0 represent 4 + bytes. */ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++ sarl $2, %eax ++# endif ++# ifndef USE_AS_RAWMEMCHR ++ movl $(PAGE_SIZE / CHAR_SIZE), %esi ++ subl %eax, %esi + # endif +- andq $-VEC_SIZE, %rdi +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- /* Remove the leading bytes. */ +- sarxl %SHIFT_REG, %eax, %eax +- testl %eax, %eax +- jz L(aligned_more) +- tzcntl %eax, %eax + # ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax ++ andl $(CHAR_PER_VEC - 1), %eax + # endif ++ /* Remove the leading bytes. */ ++ sarxl %eax, %r8d, %eax + # ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) ++ cmpq %rsi, %rdx ++ jbe L(first_vec_x0) ++# endif ++ testl %eax, %eax ++ jz L(cross_page_continue) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax ++# else ++ addq %RAW_PTR_REG, %rax + # endif +- addq %rdi, %rax +- addq %rcx, %rax + ret + + .p2align 4 +-L(aligned_more): +-# ifndef USE_AS_RAWMEMCHR +- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" +- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition +- overflow. */ +- negq %rcx +- addq $VEC_SIZE, %rcx ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- /* Check the end of data. */ +- subq %rcx, %rdx +- jbe L(zero) +-# endif ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- addq $VEC_SIZE, %rdi ++ .p2align 4 ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +-# ifndef USE_AS_RAWMEMCHR +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif ++ .p2align 4 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +-L(more_4x_vec): ++ .p2align 5 ++L(aligned_more): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Align data to VEC_SIZE. */ ++L(cross_page_continue): ++ xorl %ecx, %ecx ++ subl %edi, %ecx ++ andq $-VEC_SIZE, %rdi ++ /* esi is for adjusting length to see if near the end. 
*/ ++ leal (VEC_SIZE * 5)(%rdi, %rcx), %esi ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %esi ++# endif ++# else ++ andq $-VEC_SIZE, %rdi ++L(cross_page_continue): ++# endif ++ /* Load first VEC regardless. */ ++ VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. If near end handle specially. */ ++ subq %rsi, %rdx ++ jbe L(last_4x_vec_or_less) ++# endif + testl %eax, %eax + jnz L(first_vec_x1) + +- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- addq $(VEC_SIZE * 4), %rdi ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) ++ + + # ifndef USE_AS_RAWMEMCHR +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif ++ /* Check if at last CHAR_PER_VEC * 4 length. */ ++ subq $(CHAR_PER_VEC * 4), %rdx ++ jbe L(last_4x_vec_or_less_cmpeq) ++ addq $VEC_SIZE, %rdi + +- /* Align data to 4 * VEC_SIZE. */ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx ++ /* Align data to VEC_SIZE * 4 for the loop and readjust length. ++ */ ++# ifdef USE_AS_WMEMCHR ++ movl %edi, %ecx + andq $-(4 * VEC_SIZE), %rdi +- +-# ifndef USE_AS_RAWMEMCHR +- /* Adjust length. */ ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx + addq %rcx, %rdx ++# else ++ addq %rdi, %rdx ++ andq $-(4 * VEC_SIZE), %rdi ++ subq %rdi, %rdx ++# endif ++# else ++ addq $VEC_SIZE, %rdi ++ andq $-(4 * VEC_SIZE), %rdi + # endif + ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ ++ /* Compare 4 * VEC at a time forward. */ + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 +- kord %k1, %k2, %k5 +- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 +- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 +- +- kord %k3, %k4, %k6 +- kortestd %k5, %k6 +- jnz L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- ++ /* It would be possible to save some instructions using 4x VPCMP ++ but bottleneck on port 5 makes it not woth it. */ ++ VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 ++ /* xor will set bytes match esi to zero. */ ++ vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2 ++ vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 ++ VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 ++ /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ ++ VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z} ++ VPCMP $0, %YMM3, %YMMZERO, %k2 + # ifdef USE_AS_RAWMEMCHR +- jmp L(loop_4x_vec) ++ subq $-(VEC_SIZE * 4), %rdi ++ kortestd %k2, %k3 ++ jz L(loop_4x_vec) + # else +- subq $(VEC_SIZE * 4), %rdx ++ kortestd %k2, %k3 ++ jnz L(loop_4x_vec_end) ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ ++ subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop_4x_vec) + ++ /* Fall through into less than 4 remaining vectors of length case. ++ */ ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ addq $(VEC_SIZE * 3), %rdi ++ .p2align 4 + L(last_4x_vec_or_less): +- /* Less than 4 * VEC and aligned to VEC_SIZE. */ +- addl $(VEC_SIZE * 2), %edx +- jle L(last_2x_vec) +- +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++ /* Check if first VEC contained match. 
*/ + testl %eax, %eax +- jnz L(first_vec_x0) ++ jnz L(first_vec_x1_check) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) ++ /* If remaining length > CHAR_PER_VEC * 2. */ ++ addl $(CHAR_PER_VEC * 2), %edx ++ jg L(last_4x_vec) + +- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax ++L(last_2x_vec): ++ /* If remaining length < CHAR_PER_VEC. */ ++ addl $CHAR_PER_VEC, %edx ++ jle L(zero_end) + +- jnz L(first_vec_x2_check) +- subl $VEC_SIZE, %edx +- jle L(zero) ++ /* Check VEC2 and compare any match with remaining length. */ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ tzcntl %eax, %eax ++ cmpl %eax, %edx ++ jbe L(set_zero_end) ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++L(zero_end): ++ ret + +- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax + +- jnz L(first_vec_x3_check) ++ .p2align 4 ++L(first_vec_x1_check): ++ tzcntl %eax, %eax ++ /* Adjust length. */ ++ subl $-(CHAR_PER_VEC * 4), %edx ++ /* Check if match within remaining length. */ ++ cmpl %eax, %edx ++ jbe L(set_zero_end) ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++L(set_zero_end): + xorl %eax, %eax + ret + + .p2align 4 +-L(last_2x_vec): +- addl $(VEC_SIZE * 2), %edx +- VPCMP $0, (%rdi), %YMMMATCH, %k1 ++L(loop_4x_vec_end): ++# endif ++ /* rawmemchr will fall through into this if match was found in ++ loop. */ ++ ++ /* k1 has not of matches with VEC1. */ + kmovd %k1, %eax +- testl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ subl $((1 << CHAR_PER_VEC) - 1), %eax ++# else ++ incl %eax ++# endif ++ jnz L(last_vec_x1_return) + +- jnz L(first_vec_x0_check) +- subl $VEC_SIZE, %edx +- jle L(zero) ++ VPCMP $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2_return) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++ kmovd %k2, %eax + testl %eax, %eax +- jnz L(first_vec_x1_check) +- xorl %eax, %eax +- ret ++ jnz L(last_vec_x3_return) + +- .p2align 4 +-L(first_vec_x0_check): ++ kmovd %k3, %eax + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax ++# ifdef USE_AS_RAWMEMCHR ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++# else ++ leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax + # endif +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq %rdi, %rax + ret + + .p2align 4 +-L(first_vec_x1_check): ++L(last_vec_x1_return): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $VEC_SIZE, %rax ++# ifdef USE_AS_RAWMEMCHR ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else + addq %rdi, %rax +- ret +- +- .p2align 4 +-L(first_vec_x2_check): +- tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax ++# endif ++# else ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + # endif +- /* Check the end of data. 
*/ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax + ret + + .p2align 4 +-L(first_vec_x3_check): ++L(last_vec_x2_return): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax ++# ifdef USE_AS_RAWMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ++# else ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax + # endif +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax + ret + + .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret +-# endif +- +- .p2align 4 +-L(first_vec_x0): ++L(last_vec_x3_return): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (%rdi, %rax, 4), %rax ++# ifdef USE_AS_RAWMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + # else +- addq %rdi, %rax ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax + # endif + ret + ++ ++# ifndef USE_AS_RAWMEMCHR ++L(last_4x_vec_or_less_cmpeq): ++ VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ subq $-(VEC_SIZE * 4), %rdi ++ /* Check first VEC regardless. */ ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) ++ ++ /* If remaining length <= CHAR_PER_VEC * 2. */ ++ addl $(CHAR_PER_VEC * 2), %edx ++ jle L(last_2x_vec) ++ + .p2align 4 +-L(first_vec_x1): ++L(last_4x_vec): ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ /* Create mask for possible matches within remaining length. */ ++# ifdef USE_AS_WMEMCHR ++ movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx ++ bzhil %edx, %ecx, %ecx ++# else ++ movq $-1, %rcx ++ bzhiq %rdx, %rcx, %rcx ++# endif ++ /* Test matches in data against length match. */ ++ andl %ecx, %eax ++ jnz L(last_vec_x3) ++ ++ /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after ++ remaining length was found to be > CHAR_PER_VEC * 2. */ ++ subl $CHAR_PER_VEC, %edx ++ jbe L(zero_end2) ++ ++ ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ /* Shift remaining length mask for last VEC. */ ++# ifdef USE_AS_WMEMCHR ++ shrl $CHAR_PER_VEC, %ecx ++# else ++ shrq $CHAR_PER_VEC, %rcx ++# endif ++ andl %ecx, %eax ++ jz L(zero_end2) + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +-# endif ++ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax ++L(zero_end2): + ret + +- .p2align 4 +-L(first_vec_x2): ++L(last_vec_x2): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax +-# else +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax +-# endif ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +-L(4x_vec_end): +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- kmovd %k2, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- kmovd %k3, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- kmovd %k4, %eax +- testl %eax, %eax +-L(first_vec_x3): ++L(last_vec_x3): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax +-# else +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax +-# endif ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret ++# endif + + END (MEMCHR) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-27.patch b/glibc-RHEL-15696-27.patch new file mode 100644 index 0000000..9dcf16d --- /dev/null +++ b/glibc-RHEL-15696-27.patch @@ -0,0 +1,30 @@ +From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001 +From: Alice Xu +Date: Fri, 7 May 2021 19:03:21 -0700 +Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S +Content-type: text/plain; charset=UTF-8 + +An unknown vector operation occurred in commit 2a76821c308. Fixed it +by using "ymm{k1}{z}" but not "ymm {k1} {z}". + +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memchr-evex.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S +index 81d5cd64..f3fdad4f 100644 +--- a/sysdeps/x86_64/multiarch/memchr-evex.S ++++ b/sysdeps/x86_64/multiarch/memchr-evex.S +@@ -271,7 +271,7 @@ L(loop_4x_vec): + vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 + VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 + /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ +- VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z} ++ VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} + VPCMP $0, %YMM3, %YMMZERO, %k2 + # ifdef USE_AS_RAWMEMCHR + subq $-(VEC_SIZE * 4), %rdi +-- +GitLab + diff --git a/glibc-RHEL-15696-28.patch b/glibc-RHEL-15696-28.patch new file mode 100644 index 0000000..3063d4d --- /dev/null +++ b/glibc-RHEL-15696-28.patch @@ -0,0 +1,566 @@ +From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Tue, 22 Jun 2021 20:42:10 -0700 +Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S +Content-type: text/plain; charset=UTF-8 + +Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1 +version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S +and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants. +This also removes the unused symbols, __GI___strlen_sse2 and +__GI___wcsnlen_sse4_1. 
+--- + sysdeps/x86_64/multiarch/strlen-sse2.S | 2 +- + sysdeps/x86_64/multiarch/strlen-vec.S | 257 ++++++++++++++++++++++ + sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 2 +- + sysdeps/x86_64/strlen.S | 243 +------------------- + 4 files changed, 262 insertions(+), 242 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S + +Conflicts: + sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S + (Copyright dates, URL) + +diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S +index 7bc57b8d..449c8a7f 100644 +--- a/sysdeps/x86_64/multiarch/strlen-sse2.S ++++ b/sysdeps/x86_64/multiarch/strlen-sse2.S +@@ -20,4 +20,4 @@ + # define strlen __strlen_sse2 + #endif + +-#include "../strlen.S" ++#include "strlen-vec.S" +diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S +new file mode 100644 +index 00000000..8f660bb9 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strlen-vec.S +@@ -0,0 +1,257 @@ ++/* SSE2 version of strlen and SSE4.1 version of wcslen. ++ Copyright (C) 2012-2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#ifdef AS_WCSLEN ++# define PMINU pminud ++# define PCMPEQ pcmpeqd ++# define SHIFT_RETURN shrq $2, %rax ++#else ++# define PMINU pminub ++# define PCMPEQ pcmpeqb ++# define SHIFT_RETURN ++#endif ++ ++/* Long lived register in strlen(s), strnlen(s, n) are: ++ ++ %xmm3 - zero ++ %rdi - s ++ %r10 (s+n) & (~(64-1)) ++ %r11 s+n ++*/ ++ ++ ++.text ++ENTRY(strlen) ++ ++/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ ++#define FIND_ZERO \ ++ PCMPEQ (%rax), %xmm0; \ ++ PCMPEQ 16(%rax), %xmm1; \ ++ PCMPEQ 32(%rax), %xmm2; \ ++ PCMPEQ 48(%rax), %xmm3; \ ++ pmovmskb %xmm0, %esi; \ ++ pmovmskb %xmm1, %edx; \ ++ pmovmskb %xmm2, %r8d; \ ++ pmovmskb %xmm3, %ecx; \ ++ salq $16, %rdx; \ ++ salq $16, %rcx; \ ++ orq %rsi, %rdx; \ ++ orq %r8, %rcx; \ ++ salq $32, %rcx; \ ++ orq %rcx, %rdx; ++ ++#ifdef AS_STRNLEN ++/* Do not read anything when n==0. */ ++ test %RSI_LP, %RSI_LP ++ jne L(n_nonzero) ++ xor %rax, %rax ++ ret ++L(n_nonzero): ++# ifdef AS_WCSLEN ++ shl $2, %RSI_LP ++# endif ++ ++/* Initialize long lived registers. */ ++ ++ add %RDI_LP, %RSI_LP ++ mov %RSI_LP, %R10_LP ++ and $-64, %R10_LP ++ mov %RSI_LP, %R11_LP ++#endif ++ ++ pxor %xmm0, %xmm0 ++ pxor %xmm1, %xmm1 ++ pxor %xmm2, %xmm2 ++ pxor %xmm3, %xmm3 ++ movq %rdi, %rax ++ movq %rdi, %rcx ++ andq $4095, %rcx ++/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ ++ cmpq $4047, %rcx ++/* We cannot unify this branching as it would be ~6 cycles slower. */ ++ ja L(cross_page) ++ ++#ifdef AS_STRNLEN ++/* Test if end is among first 64 bytes. 
*/ ++# define STRNLEN_PROLOG \ ++ mov %r11, %rsi; \ ++ subq %rax, %rsi; \ ++ andq $-64, %rax; \ ++ testq $-64, %rsi; \ ++ je L(strnlen_ret) ++#else ++# define STRNLEN_PROLOG andq $-64, %rax; ++#endif ++ ++/* Ignore bits in mask that come before start of string. */ ++#define PROLOG(lab) \ ++ movq %rdi, %rcx; \ ++ xorq %rax, %rcx; \ ++ STRNLEN_PROLOG; \ ++ sarq %cl, %rdx; \ ++ test %rdx, %rdx; \ ++ je L(lab); \ ++ bsfq %rdx, %rax; \ ++ SHIFT_RETURN; \ ++ ret ++ ++#ifdef AS_STRNLEN ++ andq $-16, %rax ++ FIND_ZERO ++#else ++ /* Test first 16 bytes unaligned. */ ++ movdqu (%rax), %xmm4 ++ PCMPEQ %xmm0, %xmm4 ++ pmovmskb %xmm4, %edx ++ test %edx, %edx ++ je L(next48_bytes) ++ bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ ++ SHIFT_RETURN ++ ret ++ ++L(next48_bytes): ++/* Same as FIND_ZERO except we do not check first 16 bytes. */ ++ andq $-16, %rax ++ PCMPEQ 16(%rax), %xmm1 ++ PCMPEQ 32(%rax), %xmm2 ++ PCMPEQ 48(%rax), %xmm3 ++ pmovmskb %xmm1, %edx ++ pmovmskb %xmm2, %r8d ++ pmovmskb %xmm3, %ecx ++ salq $16, %rdx ++ salq $16, %rcx ++ orq %r8, %rcx ++ salq $32, %rcx ++ orq %rcx, %rdx ++#endif ++ ++ /* When no zero byte is found xmm1-3 are zero so we do not have to ++ zero them. */ ++ PROLOG(loop) ++ ++ .p2align 4 ++L(cross_page): ++ andq $-64, %rax ++ FIND_ZERO ++ PROLOG(loop_init) ++ ++#ifdef AS_STRNLEN ++/* We must do this check to correctly handle strnlen (s, -1). */ ++L(strnlen_ret): ++ bts %rsi, %rdx ++ sarq %cl, %rdx ++ test %rdx, %rdx ++ je L(loop_init) ++ bsfq %rdx, %rax ++ SHIFT_RETURN ++ ret ++#endif ++ .p2align 4 ++L(loop_init): ++ pxor %xmm1, %xmm1 ++ pxor %xmm2, %xmm2 ++ pxor %xmm3, %xmm3 ++#ifdef AS_STRNLEN ++ .p2align 4 ++L(loop): ++ ++ addq $64, %rax ++ cmpq %rax, %r10 ++ je L(exit_end) ++ ++ movdqa (%rax), %xmm0 ++ PMINU 16(%rax), %xmm0 ++ PMINU 32(%rax), %xmm0 ++ PMINU 48(%rax), %xmm0 ++ PCMPEQ %xmm3, %xmm0 ++ pmovmskb %xmm0, %edx ++ testl %edx, %edx ++ jne L(exit) ++ jmp L(loop) ++ ++ .p2align 4 ++L(exit_end): ++ cmp %rax, %r11 ++ je L(first) /* Do not read when end is at page boundary. */ ++ pxor %xmm0, %xmm0 ++ FIND_ZERO ++ ++L(first): ++ bts %r11, %rdx ++ bsfq %rdx, %rdx ++ addq %rdx, %rax ++ subq %rdi, %rax ++ SHIFT_RETURN ++ ret ++ ++ .p2align 4 ++L(exit): ++ pxor %xmm0, %xmm0 ++ FIND_ZERO ++ ++ bsfq %rdx, %rdx ++ addq %rdx, %rax ++ subq %rdi, %rax ++ SHIFT_RETURN ++ ret ++ ++#else ++ ++ /* Main loop. Unrolled twice to improve L2 cache performance on core2. 
*/ ++ .p2align 4 ++L(loop): ++ ++ movdqa 64(%rax), %xmm0 ++ PMINU 80(%rax), %xmm0 ++ PMINU 96(%rax), %xmm0 ++ PMINU 112(%rax), %xmm0 ++ PCMPEQ %xmm3, %xmm0 ++ pmovmskb %xmm0, %edx ++ testl %edx, %edx ++ jne L(exit64) ++ ++ subq $-128, %rax ++ ++ movdqa (%rax), %xmm0 ++ PMINU 16(%rax), %xmm0 ++ PMINU 32(%rax), %xmm0 ++ PMINU 48(%rax), %xmm0 ++ PCMPEQ %xmm3, %xmm0 ++ pmovmskb %xmm0, %edx ++ testl %edx, %edx ++ jne L(exit0) ++ jmp L(loop) ++ ++ .p2align 4 ++L(exit64): ++ addq $64, %rax ++L(exit0): ++ pxor %xmm0, %xmm0 ++ FIND_ZERO ++ ++ bsfq %rdx, %rdx ++ addq %rdx, %rax ++ subq %rdi, %rax ++ SHIFT_RETURN ++ ret ++ ++#endif ++ ++END(strlen) +diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S +index a8cab0cb..5fa51fe0 100644 +--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S ++++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S +@@ -2,4 +2,4 @@ + #define AS_STRNLEN + #define strlen __wcsnlen_sse4_1 + +-#include "../strlen.S" ++#include "strlen-vec.S" +diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S +index f845f3d4..ad047d84 100644 +--- a/sysdeps/x86_64/strlen.S ++++ b/sysdeps/x86_64/strlen.S +@@ -1,5 +1,5 @@ +-/* SSE2 version of strlen/wcslen. +- Copyright (C) 2012-2018 Free Software Foundation, Inc. ++/* SSE2 version of strlen. ++ Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or +@@ -16,243 +16,6 @@ + License along with the GNU C Library; if not, see + . */ + +-#include ++#include "multiarch/strlen-vec.S" + +-#ifdef AS_WCSLEN +-# define PMINU pminud +-# define PCMPEQ pcmpeqd +-# define SHIFT_RETURN shrq $2, %rax +-#else +-# define PMINU pminub +-# define PCMPEQ pcmpeqb +-# define SHIFT_RETURN +-#endif +- +-/* Long lived register in strlen(s), strnlen(s, n) are: +- +- %xmm3 - zero +- %rdi - s +- %r10 (s+n) & (~(64-1)) +- %r11 s+n +-*/ +- +- +-.text +-ENTRY(strlen) +- +-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ +-#define FIND_ZERO \ +- PCMPEQ (%rax), %xmm0; \ +- PCMPEQ 16(%rax), %xmm1; \ +- PCMPEQ 32(%rax), %xmm2; \ +- PCMPEQ 48(%rax), %xmm3; \ +- pmovmskb %xmm0, %esi; \ +- pmovmskb %xmm1, %edx; \ +- pmovmskb %xmm2, %r8d; \ +- pmovmskb %xmm3, %ecx; \ +- salq $16, %rdx; \ +- salq $16, %rcx; \ +- orq %rsi, %rdx; \ +- orq %r8, %rcx; \ +- salq $32, %rcx; \ +- orq %rcx, %rdx; +- +-#ifdef AS_STRNLEN +-/* Do not read anything when n==0. */ +- test %RSI_LP, %RSI_LP +- jne L(n_nonzero) +- xor %rax, %rax +- ret +-L(n_nonzero): +-# ifdef AS_WCSLEN +- shl $2, %RSI_LP +-# endif +- +-/* Initialize long lived registers. */ +- +- add %RDI_LP, %RSI_LP +- mov %RSI_LP, %R10_LP +- and $-64, %R10_LP +- mov %RSI_LP, %R11_LP +-#endif +- +- pxor %xmm0, %xmm0 +- pxor %xmm1, %xmm1 +- pxor %xmm2, %xmm2 +- pxor %xmm3, %xmm3 +- movq %rdi, %rax +- movq %rdi, %rcx +- andq $4095, %rcx +-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ +- cmpq $4047, %rcx +-/* We cannot unify this branching as it would be ~6 cycles slower. */ +- ja L(cross_page) +- +-#ifdef AS_STRNLEN +-/* Test if end is among first 64 bytes. */ +-# define STRNLEN_PROLOG \ +- mov %r11, %rsi; \ +- subq %rax, %rsi; \ +- andq $-64, %rax; \ +- testq $-64, %rsi; \ +- je L(strnlen_ret) +-#else +-# define STRNLEN_PROLOG andq $-64, %rax; +-#endif +- +-/* Ignore bits in mask that come before start of string. 
*/ +-#define PROLOG(lab) \ +- movq %rdi, %rcx; \ +- xorq %rax, %rcx; \ +- STRNLEN_PROLOG; \ +- sarq %cl, %rdx; \ +- test %rdx, %rdx; \ +- je L(lab); \ +- bsfq %rdx, %rax; \ +- SHIFT_RETURN; \ +- ret +- +-#ifdef AS_STRNLEN +- andq $-16, %rax +- FIND_ZERO +-#else +- /* Test first 16 bytes unaligned. */ +- movdqu (%rax), %xmm4 +- PCMPEQ %xmm0, %xmm4 +- pmovmskb %xmm4, %edx +- test %edx, %edx +- je L(next48_bytes) +- bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ +- SHIFT_RETURN +- ret +- +-L(next48_bytes): +-/* Same as FIND_ZERO except we do not check first 16 bytes. */ +- andq $-16, %rax +- PCMPEQ 16(%rax), %xmm1 +- PCMPEQ 32(%rax), %xmm2 +- PCMPEQ 48(%rax), %xmm3 +- pmovmskb %xmm1, %edx +- pmovmskb %xmm2, %r8d +- pmovmskb %xmm3, %ecx +- salq $16, %rdx +- salq $16, %rcx +- orq %r8, %rcx +- salq $32, %rcx +- orq %rcx, %rdx +-#endif +- +- /* When no zero byte is found xmm1-3 are zero so we do not have to +- zero them. */ +- PROLOG(loop) +- +- .p2align 4 +-L(cross_page): +- andq $-64, %rax +- FIND_ZERO +- PROLOG(loop_init) +- +-#ifdef AS_STRNLEN +-/* We must do this check to correctly handle strnlen (s, -1). */ +-L(strnlen_ret): +- bts %rsi, %rdx +- sarq %cl, %rdx +- test %rdx, %rdx +- je L(loop_init) +- bsfq %rdx, %rax +- SHIFT_RETURN +- ret +-#endif +- .p2align 4 +-L(loop_init): +- pxor %xmm1, %xmm1 +- pxor %xmm2, %xmm2 +- pxor %xmm3, %xmm3 +-#ifdef AS_STRNLEN +- .p2align 4 +-L(loop): +- +- addq $64, %rax +- cmpq %rax, %r10 +- je L(exit_end) +- +- movdqa (%rax), %xmm0 +- PMINU 16(%rax), %xmm0 +- PMINU 32(%rax), %xmm0 +- PMINU 48(%rax), %xmm0 +- PCMPEQ %xmm3, %xmm0 +- pmovmskb %xmm0, %edx +- testl %edx, %edx +- jne L(exit) +- jmp L(loop) +- +- .p2align 4 +-L(exit_end): +- cmp %rax, %r11 +- je L(first) /* Do not read when end is at page boundary. */ +- pxor %xmm0, %xmm0 +- FIND_ZERO +- +-L(first): +- bts %r11, %rdx +- bsfq %rdx, %rdx +- addq %rdx, %rax +- subq %rdi, %rax +- SHIFT_RETURN +- ret +- +- .p2align 4 +-L(exit): +- pxor %xmm0, %xmm0 +- FIND_ZERO +- +- bsfq %rdx, %rdx +- addq %rdx, %rax +- subq %rdi, %rax +- SHIFT_RETURN +- ret +- +-#else +- +- /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ +- .p2align 4 +-L(loop): +- +- movdqa 64(%rax), %xmm0 +- PMINU 80(%rax), %xmm0 +- PMINU 96(%rax), %xmm0 +- PMINU 112(%rax), %xmm0 +- PCMPEQ %xmm3, %xmm0 +- pmovmskb %xmm0, %edx +- testl %edx, %edx +- jne L(exit64) +- +- subq $-128, %rax +- +- movdqa (%rax), %xmm0 +- PMINU 16(%rax), %xmm0 +- PMINU 32(%rax), %xmm0 +- PMINU 48(%rax), %xmm0 +- PCMPEQ %xmm3, %xmm0 +- pmovmskb %xmm0, %edx +- testl %edx, %edx +- jne L(exit0) +- jmp L(loop) +- +- .p2align 4 +-L(exit64): +- addq $64, %rax +-L(exit0): +- pxor %xmm0, %xmm0 +- FIND_ZERO +- +- bsfq %rdx, %rdx +- addq %rdx, %rax +- subq %rdi, %rax +- SHIFT_RETURN +- ret +- +-#endif +- +-END(strlen) + libc_hidden_builtin_def (strlen) +-- +GitLab + diff --git a/glibc-RHEL-15696-29.patch b/glibc-RHEL-15696-29.patch new file mode 100644 index 0000000..112821a --- /dev/null +++ b/glibc-RHEL-15696-29.patch @@ -0,0 +1,181 @@ +From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Jun 2021 01:19:34 -0400 +Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1 +Content-type: text/plain; charset=UTF-8 + +No bug. This comment adds the ifunc / build infrastructure +necessary for wcslen to prefer the sse4.1 implementation +in strlen-vec.S. test-wcslen.c is passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/Makefile | 4 +- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 3 ++ + sysdeps/x86_64/multiarch/ifunc-wcslen.h | 52 ++++++++++++++++++++++ + sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 4 ++ + sysdeps/x86_64/multiarch/wcslen.c | 2 +- + sysdeps/x86_64/multiarch/wcsnlen.c | 34 +------------- + 6 files changed, 63 insertions(+), 36 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h + create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 491c7698..65fde4eb 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcscpy-ssse3 wcscpy-c \ + wcschr-sse2 wcschr-avx2 \ + wcsrchr-sse2 wcsrchr-avx2 \ +- wcsnlen-sse4_1 wcsnlen-c \ +- wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \ ++ wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \ ++ wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \ + wcschr-avx2-rtm \ + wcscmp-avx2-rtm \ + wcslen-avx2-rtm \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index f1a6460a..580913ca 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcslen_evex) ++ IFUNC_IMPL_ADD (array, i, wcsnlen, ++ CPU_FEATURE_USABLE (SSE4_1), ++ __wcsnlen_sse4_1) + IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ +diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h +new file mode 100644 +index 00000000..39e33473 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h +@@ -0,0 +1,52 @@ ++/* Common definition for ifunc selections for wcslen and wcsnlen ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2017-2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; ++ ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++ const struct cpu_features* cpu_features = __get_cpu_features (); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) ++ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ return OPTIMIZE (evex); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) ++ return OPTIMIZE (sse4_1); ++ ++ return OPTIMIZE (sse2); ++} +diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S +new file mode 100644 +index 00000000..7e62621a +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S +@@ -0,0 +1,4 @@ ++#define AS_WCSLEN ++#define strlen __wcslen_sse4_1 ++ ++#include "strlen-vec.S" +diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c +index 6d06e47c..3b04b75b 100644 +--- a/sysdeps/x86_64/multiarch/wcslen.c ++++ b/sysdeps/x86_64/multiarch/wcslen.c +@@ -24,7 +24,7 @@ + # undef __wcslen + + # define SYMBOL_NAME wcslen +-# include "ifunc-avx2.h" ++# include "ifunc-wcslen.h" + + libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ()); + weak_alias (__wcslen, wcslen); +diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c +index 20b731ae..06736410 100644 +--- a/sysdeps/x86_64/multiarch/wcsnlen.c ++++ b/sysdeps/x86_64/multiarch/wcsnlen.c +@@ -24,39 +24,7 @@ + # undef __wcsnlen + + # define SYMBOL_NAME wcsnlen +-# include +- +-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; +- +-static inline void * +-IFUNC_SELECTOR (void) +-{ +- const struct cpu_features* cpu_features = __get_cpu_features (); +- +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) +- && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- { +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) +- return OPTIMIZE (evex); +- +- if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) +- return OPTIMIZE (avx2_rtm); +- +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) +- return OPTIMIZE (avx2); +- } +- +- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) +- return OPTIMIZE (sse4_1); +- +- return OPTIMIZE (sse2); +-} ++# include "ifunc-wcslen.h" + + libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ()); + weak_alias (__wcsnlen, wcsnlen); +-- +GitLab + diff --git a/glibc-RHEL-15696-3.patch b/glibc-RHEL-15696-3.patch new file mode 100644 index 0000000..8f5093c --- /dev/null +++ b/glibc-RHEL-15696-3.patch @@ -0,0 +1,396 @@ +From 
231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:27:25 -0800 +Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ# + 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes memcpy for x32. Tested on x86-64 and x32. On x86-64, +libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for + length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise. + * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: + Likewise. + * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: + Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy. + tst-size_t-wmemchr. + * sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file. +--- + sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 17 ++++-- + sysdeps/x86_64/multiarch/memcpy-ssse3.S | 17 ++++-- + .../multiarch/memmove-avx512-no-vzeroupper.S | 16 +++-- + .../multiarch/memmove-vec-unaligned-erms.S | 54 +++++++++-------- + sysdeps/x86_64/x32/Makefile | 2 +- + sysdeps/x86_64/x32/tst-size_t-memcpy.c | 58 +++++++++++++++++++ + 6 files changed, 122 insertions(+), 42 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +index 3cd11233..568eebd3 100644 +--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S ++++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +@@ -45,28 +45,33 @@ + .section .text.ssse3,"ax",@progbits + #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE + ENTRY (MEMPCPY_CHK) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMPCPY_CHK) + + ENTRY (MEMPCPY) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start) + END (MEMPCPY) + #endif + + #if !defined USE_AS_BCOPY + ENTRY (MEMCPY_CHK) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMCPY_CHK) + #endif + + ENTRY (MEMCPY) +- mov %rdi, %rax ++ mov %RDI_LP, %RAX_LP + #ifdef USE_AS_MEMPCPY +- add %rdx, %rax ++ add %RDX_LP, %RAX_LP ++#endif ++ ++#ifdef __ILP32__ ++ /* Clear the upper 32 bits. 
*/ ++ mov %edx, %edx + #endif + + #ifdef USE_AS_MEMMOVE +diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S +index 0240bfa3..0bd5ee99 100644 +--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S ++++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S +@@ -45,28 +45,33 @@ + .section .text.ssse3,"ax",@progbits + #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE + ENTRY (MEMPCPY_CHK) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMPCPY_CHK) + + ENTRY (MEMPCPY) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start) + END (MEMPCPY) + #endif + + #if !defined USE_AS_BCOPY + ENTRY (MEMCPY_CHK) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMCPY_CHK) + #endif + + ENTRY (MEMCPY) +- mov %rdi, %rax ++ mov %RDI_LP, %RAX_LP + #ifdef USE_AS_MEMPCPY +- add %rdx, %rax ++ add %RDX_LP, %RAX_LP ++#endif ++ ++#ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx + #endif + + #ifdef USE_AS_MEMMOVE +diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S +index effc3ac2..6ca2bbc9 100644 +--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S ++++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S +@@ -24,27 +24,31 @@ + + .section .text.avx512,"ax",@progbits + ENTRY (__mempcpy_chk_avx512_no_vzeroupper) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__mempcpy_chk_avx512_no_vzeroupper) + + ENTRY (__mempcpy_avx512_no_vzeroupper) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start) + END (__mempcpy_avx512_no_vzeroupper) + + ENTRY (__memmove_chk_avx512_no_vzeroupper) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__memmove_chk_avx512_no_vzeroupper) + + ENTRY (__memmove_avx512_no_vzeroupper) +- mov %rdi, %rax ++ mov %RDI_LP, %RAX_LP + # ifdef USE_AS_MEMPCPY +- add %rdx, %rax ++ add %RDX_LP, %RAX_LP + # endif + L(start): ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx ++# endif + lea (%rsi, %rdx), %rcx + lea (%rdi, %rdx), %r9 + cmp $512, %rdx +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index c952576c..274aa1c7 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -95,20 +95,20 @@ + .section SECTION(.text),"ax",@progbits + #if defined SHARED && IS_IN (libc) + ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) + #endif + + ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start) + END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) + + #if defined SHARED && IS_IN (libc) + ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) + #endif +@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) + ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) + movq %rdi, %rax + L(start): +- cmpq $VEC_SIZE, %rdx ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. 
*/ ++ movl %edx, %edx ++# endif ++ cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) +- cmpq $(VEC_SIZE * 2), %rdx ++ cmp $(VEC_SIZE * 2), %RDX_LP + ja L(more_2x_vec) + #if !defined USE_MULTIARCH || !IS_IN (libc) + L(last_2x_vec): +@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned)) + + # if VEC_SIZE == 16 + ENTRY (__mempcpy_chk_erms) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__mempcpy_chk_erms) + + /* Only used to measure performance of REP MOVSB. */ + ENTRY (__mempcpy_erms) +- movq %rdi, %rax ++ mov %RDI_LP, %RAX_LP + /* Skip zero length. */ +- testq %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz 2f +- addq %rdx, %rax ++ add %RDX_LP, %RAX_LP + jmp L(start_movsb) + END (__mempcpy_erms) + + ENTRY (__memmove_chk_erms) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__memmove_chk_erms) + + ENTRY (__memmove_erms) + movq %rdi, %rax + /* Skip zero length. */ +- testq %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz 2f + L(start_movsb): +- movq %rdx, %rcx +- cmpq %rsi, %rdi ++ mov %RDX_LP, %RCX_LP ++ cmp %RSI_LP, %RDI_LP + jb 1f + /* Source == destination is less common. */ + je 2f +- leaq (%rsi,%rcx), %rdx +- cmpq %rdx, %rdi ++ lea (%rsi,%rcx), %RDX_LP ++ cmp %RDX_LP, %RDI_LP + jb L(movsb_backward) + 1: + rep movsb +@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms) + + # ifdef SHARED + ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) + # endif + + ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start_erms) + END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + + # ifdef SHARED + ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + # endif +@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + movq %rdi, %rax + L(start_erms): +- cmpq $VEC_SIZE, %rdx ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif ++ cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) +- cmpq $(VEC_SIZE * 2), %rdx ++ cmp $(VEC_SIZE * 2), %RDX_LP + ja L(movsb_more_2x_vec) + L(last_2x_vec): + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +@@ -236,7 +244,7 @@ L(movsb): + /* Avoid slow backward REP MOVSB. */ + jb L(more_8x_vec_backward) + 1: +- movq %rdx, %rcx ++ mov %RDX_LP, %RCX_LP + rep movsb + L(nop): + ret +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index ddec7f04..2fe1e5ac 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround + endif + + ifeq ($(subdir),string) +-tests += tst-size_t-memchr tst-size_t-memcmp ++tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy + endif + + ifeq ($(subdir),wcsmbs) +diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c +new file mode 100644 +index 00000000..66b71e17 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c +@@ -0,0 +1,58 @@ ++/* Test memcpy with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define TEST_NAME "memcpy" ++#include "test-size_t.h" ++ ++IMPL (memcpy, 1) ++ ++typedef void *(*proto_t) (void *, const void *, size_t); ++ ++static void * ++__attribute__ ((noinline, noclone)) ++do_memcpy (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ do_memcpy (dest, src); ++ int res = memcmp (dest.p, src.p, dest.len); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +-- +GitLab + diff --git a/glibc-RHEL-15696-30.patch b/glibc-RHEL-15696-30.patch new file mode 100644 index 0000000..0b16f0f --- /dev/null +++ b/glibc-RHEL-15696-30.patch @@ -0,0 +1,497 @@ +From a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Jun 2021 01:56:29 -0400 +Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ + #27974] +Content-type: text/plain; charset=UTF-8 + +This commit fixes the bug mentioned in the previous commit. + +The previous implementations of wmemchr in these files relied +on maxlen * sizeof(wchar_t) which was not guranteed by the standard. + +The new overflow tests added in the previous commit now +pass (As well as all the other tests). + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++------- + sysdeps/x86_64/multiarch/strlen-vec.S | 15 ++- + 2 files changed, 107 insertions(+), 38 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S +index be8a5db5..37688966 100644 +--- a/sysdeps/x86_64/multiarch/strlen-avx2.S ++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S +@@ -44,21 +44,21 @@ + + # define VEC_SIZE 32 + # define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text),"ax",@progbits + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN + /* Check zero length. */ ++# ifdef __ILP32__ ++ /* Clear upper bits. */ ++ and %RSI_LP, %RSI_LP ++# else + test %RSI_LP, %RSI_LP ++# endif + jz L(zero) + /* Store max len in R8_LP before adjusting if using WCSLEN. */ + mov %RSI_LP, %R8_LP +-# ifdef USE_AS_WCSLEN +- shl $2, %RSI_LP +-# elif defined __ILP32__ +- /* Clear the upper 32 bits. */ +- movl %esi, %esi +-# endif + # endif + movl %edi, %eax + movq %rdi, %rdx +@@ -72,10 +72,10 @@ ENTRY (STRLEN) + + /* Check the first VEC_SIZE bytes. */ + VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + # ifdef USE_AS_STRNLEN + /* If length < VEC_SIZE handle special. 
*/ +- cmpq $VEC_SIZE, %rsi ++ cmpq $CHAR_PER_VEC, %rsi + jbe L(first_vec_x0) + # endif + /* If empty continue to aligned_more. Otherwise return bit +@@ -84,6 +84,7 @@ ENTRY (STRLEN) + jz L(aligned_more) + tzcntl %eax, %eax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -97,9 +98,14 @@ L(zero): + L(first_vec_x0): + /* Set bit for max len so that tzcnt will return min of max len + and position of first match. */ ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %esi ++# endif + btsq %rsi, %rax + tzcntl %eax, %eax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -113,14 +119,19 @@ L(first_vec_x1): + # ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ ++# ifdef USE_AS_WCSLEN ++ leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax ++# else + subl $(VEC_SIZE * 4 + 1), %ecx + addl %ecx, %eax ++# endif + # else + subl %edx, %edi + incl %edi + addl %edi, %eax + # endif + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -133,14 +144,19 @@ L(first_vec_x2): + # ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ ++# ifdef USE_AS_WCSLEN ++ leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax ++# else + subl $(VEC_SIZE * 3 + 1), %ecx + addl %ecx, %eax ++# endif + # else + subl %edx, %edi + addl $(VEC_SIZE + 1), %edi + addl %edi, %eax + # endif + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -153,14 +169,19 @@ L(first_vec_x3): + # ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ ++# ifdef USE_AS_WCSLEN ++ leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax ++# else + subl $(VEC_SIZE * 2 + 1), %ecx + addl %ecx, %eax ++# endif + # else + subl %edx, %edi + addl $(VEC_SIZE * 2 + 1), %edi + addl %edi, %eax + # endif + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -173,14 +194,19 @@ L(first_vec_x4): + # ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ ++# ifdef USE_AS_WCSLEN ++ leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax ++# else + subl $(VEC_SIZE + 1), %ecx + addl %ecx, %eax ++# endif + # else + subl %edx, %edi + addl $(VEC_SIZE * 3 + 1), %edi + addl %edi, %eax + # endif + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -195,10 +221,14 @@ L(cross_page_continue): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + # ifdef USE_AS_STRNLEN +- /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because +- it simplies the logic in last_4x_vec_or_less. */ ++ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE ++ because it simplies the logic in last_4x_vec_or_less. */ + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx + subq %rdx, %rcx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx ++# endif + # endif + /* Load first VEC regardless. 
*/ + VPCMPEQ 1(%rdi), %ymm0, %ymm1 +@@ -207,34 +237,38 @@ L(cross_page_continue): + subq %rcx, %rsi + jb L(last_4x_vec_or_less) + # endif +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x4) + + /* Align data to VEC_SIZE * 4 - 1. */ + # ifdef USE_AS_STRNLEN + /* Before adjusting length check if at last VEC_SIZE * 4. */ +- cmpq $(VEC_SIZE * 4 - 1), %rsi ++ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi + jbe L(last_4x_vec_or_less_load) + incq %rdi + movl %edi, %ecx + orq $(VEC_SIZE * 4 - 1), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx ++# endif + /* Readjust length. */ + addq %rcx, %rsi + # else +@@ -246,13 +280,13 @@ L(cross_page_continue): + L(loop_4x_vec): + # ifdef USE_AS_STRNLEN + /* Break if at end of length. */ +- subq $(VEC_SIZE * 4), %rsi ++ subq $(CHAR_PER_VEC * 4), %rsi + jb L(last_4x_vec_or_less_cmpeq) + # endif +- /* Save some code size by microfusing VPMINU with the load. Since +- the matches in ymm2/ymm4 can only be returned if there where no +- matches in ymm1/ymm3 respectively there is no issue with overlap. +- */ ++ /* Save some code size by microfusing VPMINU with the load. ++ Since the matches in ymm2/ymm4 can only be returned if there ++ where no matches in ymm1/ymm3 respectively there is no issue ++ with overlap. */ + vmovdqa 1(%rdi), %ymm1 + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 +@@ -260,7 +294,7 @@ L(loop_4x_vec): + + VPMINU %ymm2, %ymm4, %ymm5 + VPCMPEQ %ymm5, %ymm0, %ymm5 +- vpmovmskb %ymm5, %ecx ++ vpmovmskb %ymm5, %ecx + + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx +@@ -268,27 +302,28 @@ L(loop_4x_vec): + + + VPCMPEQ %ymm1, %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + subq %rdx, %rdi + testl %eax, %eax + jnz L(last_vec_return_x0) + + VPCMPEQ %ymm2, %ymm0, %ymm2 +- vpmovmskb %ymm2, %eax ++ vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_return_x1) + + /* Combine last 2 VEC. */ + VPCMPEQ %ymm3, %ymm0, %ymm3 +- vpmovmskb %ymm3, %eax +- /* rcx has combined result from all 4 VEC. It will only be used if +- the first 3 other VEC all did not contain a match. */ ++ vpmovmskb %ymm3, %eax ++ /* rcx has combined result from all 4 VEC. It will only be used ++ if the first 3 other VEC all did not contain a match. */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax + subq $(VEC_SIZE * 2 - 1), %rdi + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -297,15 +332,19 @@ L(loop_4x_vec): + # ifdef USE_AS_STRNLEN + .p2align 4 + L(last_4x_vec_or_less_load): +- /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ ++ /* Depending on entry adjust rdi / prepare first VEC in ymm1. ++ */ + subq $-(VEC_SIZE * 4), %rdi + L(last_4x_vec_or_less_cmpeq): + VPCMPEQ 1(%rdi), %ymm0, %ymm1 + L(last_4x_vec_or_less): +- +- vpmovmskb %ymm1, %eax +- /* If remaining length > VEC_SIZE * 2. This works if esi is off by +- VEC_SIZE * 4. 
*/ ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %esi ++# endif ++ vpmovmskb %ymm1, %eax ++ /* If remaining length > VEC_SIZE * 2. This works if esi is off ++ by VEC_SIZE * 4. */ + testl $(VEC_SIZE * 2), %esi + jnz L(last_4x_vec) + +@@ -320,7 +359,7 @@ L(last_4x_vec_or_less): + jb L(max) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi +@@ -329,6 +368,7 @@ L(last_4x_vec_or_less): + addl $(VEC_SIZE + 1), %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -340,6 +380,7 @@ L(last_vec_return_x0): + subq $(VEC_SIZE * 4 - 1), %rdi + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -350,6 +391,7 @@ L(last_vec_return_x1): + subq $(VEC_SIZE * 3 - 1), %rdi + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -366,6 +408,7 @@ L(last_vec_x1_check): + incl %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -381,14 +424,14 @@ L(last_4x_vec): + jnz L(last_vec_x1) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + /* Normalize length. */ + andl $(VEC_SIZE * 4 - 1), %esi + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + +@@ -396,7 +439,7 @@ L(last_4x_vec): + jb L(max) + + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi +@@ -405,6 +448,7 @@ L(last_4x_vec): + addl $(VEC_SIZE * 3 + 1), %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -419,6 +463,7 @@ L(last_vec_x1): + incl %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -432,6 +477,7 @@ L(last_vec_x2): + addl $(VEC_SIZE + 1), %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -447,6 +493,7 @@ L(last_vec_x3): + addl $(VEC_SIZE * 2 + 1), %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -455,13 +502,13 @@ L(max_end): + VZEROUPPER_RETURN + # endif + +- /* Cold case for crossing page with first load. */ ++ /* Cold case for crossing page with first load. */ + .p2align 4 + L(cross_page_boundary): + /* Align data to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod rdx. */ + sarxl %edx, %eax, %eax +@@ -470,6 +517,10 @@ L(cross_page_boundary): + jnz L(cross_page_less_vec) + leaq 1(%rdi), %rcx + subq %rdx, %rcx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ ++ shrl $2, %ecx ++# endif + /* Check length. 
*/ + cmpq %rsi, %rcx + jb L(cross_page_continue) +@@ -479,6 +530,7 @@ L(cross_page_boundary): + jz L(cross_page_continue) + tzcntl %eax, %eax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide length by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + # endif +@@ -489,6 +541,10 @@ L(return_vzeroupper): + .p2align 4 + L(cross_page_less_vec): + tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %esi ++# endif + cmpq %rax, %rsi + cmovb %esi, %eax + # ifdef USE_AS_WCSLEN +diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S +index 8f660bb9..439e486a 100644 +--- a/sysdeps/x86_64/multiarch/strlen-vec.S ++++ b/sysdeps/x86_64/multiarch/strlen-vec.S +@@ -65,12 +65,25 @@ ENTRY(strlen) + ret + L(n_nonzero): + # ifdef AS_WCSLEN +- shl $2, %RSI_LP ++/* Check for overflow from maxlen * sizeof(wchar_t). If it would ++ overflow the only way this program doesn't have undefined behavior ++ is if there is a null terminator in valid memory so wcslen will ++ suffice. */ ++ mov %RSI_LP, %R10_LP ++ sar $62, %R10_LP ++ test %R10_LP, %R10_LP ++ jnz __wcslen_sse4_1 ++ sal $2, %RSI_LP + # endif + ++ + /* Initialize long lived registers. */ + + add %RDI_LP, %RSI_LP ++# ifdef AS_WCSLEN ++/* Check for overflow again from s + maxlen * sizeof(wchar_t). */ ++ jbe __wcslen_sse4_1 ++# endif + mov %RSI_LP, %R10_LP + and $-64, %R10_LP + mov %RSI_LP, %R11_LP +-- +GitLab + diff --git a/glibc-RHEL-15696-31.patch b/glibc-RHEL-15696-31.patch new file mode 100644 index 0000000..4ef6911 --- /dev/null +++ b/glibc-RHEL-15696-31.patch @@ -0,0 +1,745 @@ +From 4ba65586847751372520a36757c17f114588794e Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 19 Apr 2021 19:36:06 -0400 +Subject: [PATCH] x86: Optimize strlen-evex.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes strlen-evex.S. The +optimizations are mostly small things but they add up to roughly +10-30% performance improvement for strlen. The results for strnlen are +bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and +test-wcsnlen are all passing. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++----------- + 1 file changed, 317 insertions(+), 264 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S +index 05838190..4bf6874b 100644 +--- a/sysdeps/x86_64/multiarch/strlen-evex.S ++++ b/sysdeps/x86_64/multiarch/strlen-evex.S +@@ -29,11 +29,13 @@ + # ifdef USE_AS_WCSLEN + # define VPCMP vpcmpd + # define VPMINU vpminud +-# define SHIFT_REG r9d ++# define SHIFT_REG ecx ++# define CHAR_SIZE 4 + # else + # define VPCMP vpcmpb + # define VPMINU vpminub +-# define SHIFT_REG ecx ++# define SHIFT_REG edx ++# define CHAR_SIZE 1 + # endif + + # define XMMZERO xmm16 +@@ -46,132 +48,165 @@ + # define YMM6 ymm22 + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section .text.evex,"ax",@progbits + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN +- /* Check for zero length. */ ++ /* Check zero length. */ + test %RSI_LP, %RSI_LP + jz L(zero) +-# ifdef USE_AS_WCSLEN +- shl $2, %RSI_LP +-# elif defined __ILP32__ ++# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi + # endif + mov %RSI_LP, %R8_LP + # endif +- movl %edi, %ecx +- movq %rdi, %rdx ++ movl %edi, %eax + vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- ++ /* Clear high bits from edi. Only keeping bits relevant to page ++ cross check. 
*/ ++ andl $(PAGE_SIZE - 1), %eax + /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a + null byte. */ + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax +- testl %eax, %eax +- + # ifdef USE_AS_STRNLEN +- jnz L(first_vec_x0_check) +- /* Adjust length and check the end of data. */ +- subq $VEC_SIZE, %rsi +- jbe L(max) +-# else +- jnz L(first_vec_x0) ++ /* If length < CHAR_PER_VEC handle special. */ ++ cmpq $CHAR_PER_VEC, %rsi ++ jbe L(first_vec_x0) + # endif +- +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++ ret + # ifdef USE_AS_STRNLEN +- /* Adjust length. */ +- addq %rcx, %rsi ++L(zero): ++ xorl %eax, %eax ++ ret + +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) ++ .p2align 4 ++L(first_vec_x0): ++ /* Set bit for max len so that tzcnt will return min of max len ++ and position of first match. */ ++ btsq %rsi, %rax ++ tzcntl %eax, %eax ++ ret + # endif +- jmp L(more_4x_vec) + + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- +-# ifdef USE_AS_WCSLEN +- /* NB: Divide shift count by 4 since each bit in K0 represent 4 +- bytes. */ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ ++# ifdef USE_AS_STRNLEN ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax ++# else ++ subl %edx, %edi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %edi ++# endif ++ leal CHAR_PER_VEC(%rdi, %rax), %eax + # endif +- VPCMP $0, (%rdi), %YMMZERO, %k0 +- kmovd %k0, %eax ++ ret + +- /* Remove the leading bytes. */ +- sarxl %SHIFT_REG, %eax, %eax +- testl %eax, %eax +- jz L(aligned_more) ++ .p2align 4 ++L(first_vec_x2): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +-# endif +- addq %rdi, %rax +- addq %rcx, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax ++# else ++ subl %edx, %edi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %edi ++# endif ++ leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax + # endif + ret + + .p2align 4 +-L(aligned_more): ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" +- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" +- to void possible addition overflow. */ +- negq %rcx +- addq $VEC_SIZE, %rcx +- +- /* Check the end of data. */ +- subq %rcx, %rsi +- jbe L(max) ++ /* Use ecx which was computed earlier to compute correct value. 
++ */ ++ leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax ++# else ++ subl %edx, %edi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %edi ++# endif ++ leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax + # endif ++ ret + +- addq $VEC_SIZE, %rdi +- ++ .p2align 4 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax ++# else ++ subl %edx, %edi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %edi ++# endif ++ leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax + # endif ++ ret + +-L(more_4x_vec): ++ .p2align 5 ++L(aligned_more): ++ movq %rdi, %rdx ++ /* Align data to VEC_SIZE. */ ++ andq $-(VEC_SIZE), %rdi ++L(cross_page_continue): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +- VPCMP $0, (%rdi), %YMMZERO, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- ++# ifdef USE_AS_STRNLEN ++ /* + CHAR_SIZE because it simplies the logic in ++ last_4x_vec_or_less. */ ++ leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx ++ subq %rdx, %rcx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx ++# endif ++# endif ++ /* Load first VEC regardless. */ + VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. If near end handle specially. */ ++ subq %rcx, %rsi ++ jb L(last_4x_vec_or_less) ++# endif + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax +- testl %eax, %eax ++ test %eax, %eax + jnz L(first_vec_x2) + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 +@@ -179,258 +214,276 @@ L(more_4x_vec): + testl %eax, %eax + jnz L(first_vec_x3) + +- addq $(VEC_SIZE * 4), %rdi +- +-# ifdef USE_AS_STRNLEN +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) +-# endif +- +- /* Align data to 4 * VEC_SIZE. */ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx +- andq $-(4 * VEC_SIZE), %rdi ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) + ++ addq $VEC_SIZE, %rdi + # ifdef USE_AS_STRNLEN +- /* Adjust length. */ ++ /* Check if at last VEC_SIZE * 4 length. */ ++ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi ++ jbe L(last_4x_vec_or_less_load) ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx ++# endif ++ /* Readjust length. */ + addq %rcx, %rsi + # endif ++ /* Align data to VEC_SIZE * 4. */ ++ andq $-(VEC_SIZE * 4), %rdi + ++ /* Compare 4 * VEC at a time forward. */ + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- VMOVA (%rdi), %YMM1 +- VMOVA VEC_SIZE(%rdi), %YMM2 +- VMOVA (VEC_SIZE * 2)(%rdi), %YMM3 +- VMOVA (VEC_SIZE * 3)(%rdi), %YMM4 +- +- VPMINU %YMM1, %YMM2, %YMM5 +- VPMINU %YMM3, %YMM4, %YMM6 ++ /* Load first VEC regardless. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 ++# ifdef USE_AS_STRNLEN ++ /* Break if at end of length. */ ++ subq $(CHAR_PER_VEC * 4), %rsi ++ jb L(last_4x_vec_or_less_cmpeq) ++# endif ++ /* Save some code size by microfusing VPMINU with the load. 
Since ++ the matches in ymm2/ymm4 can only be returned if there where no ++ matches in ymm1/ymm3 respectively there is no issue with overlap. ++ */ ++ VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2 ++ VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 ++ VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4 ++ ++ VPCMP $0, %YMM2, %YMMZERO, %k0 ++ VPCMP $0, %YMM4, %YMMZERO, %k1 ++ subq $-(VEC_SIZE * 4), %rdi ++ kortestd %k0, %k1 ++ jz L(loop_4x_vec) ++ ++ /* Check if end was in first half. */ ++ kmovd %k0, %eax ++ subq %rdx, %rdi ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rdi ++# endif ++ testl %eax, %eax ++ jz L(second_vec_return) + +- VPMINU %YMM5, %YMM6, %YMM5 +- VPCMP $0, %YMM5, %YMMZERO, %k0 +- ktestd %k0, %k0 +- jnz L(4x_vec_end) ++ VPCMP $0, %YMM1, %YMMZERO, %k2 ++ kmovd %k2, %edx ++ /* Combine VEC1 matches (edx) with VEC2 matches (eax). */ ++# ifdef USE_AS_WCSLEN ++ sall $CHAR_PER_VEC, %eax ++ orl %edx, %eax ++ tzcntl %eax, %eax ++# else ++ salq $CHAR_PER_VEC, %rax ++ orq %rdx, %rax ++ tzcntq %rax, %rax ++# endif ++ addq %rdi, %rax ++ ret + +- addq $(VEC_SIZE * 4), %rdi + +-# ifndef USE_AS_STRNLEN +- jmp L(loop_4x_vec) +-# else +- subq $(VEC_SIZE * 4), %rsi +- ja L(loop_4x_vec) ++# ifdef USE_AS_STRNLEN + ++L(last_4x_vec_or_less_load): ++ /* Depending on entry adjust rdi / prepare first VEC in YMM1. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 ++L(last_4x_vec_or_less_cmpeq): ++ VPCMP $0, %YMM1, %YMMZERO, %k0 ++ addq $(VEC_SIZE * 3), %rdi + L(last_4x_vec_or_less): +- /* Less than 4 * VEC and aligned to VEC_SIZE. */ +- addl $(VEC_SIZE * 2), %esi +- jle L(last_2x_vec) +- +- VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax ++ /* If remaining length > VEC_SIZE * 2. This works if esi is off by ++ VEC_SIZE * 4. */ ++ testl $(CHAR_PER_VEC * 2), %esi ++ jnz L(last_4x_vec) ++ ++ /* length may have been negative or positive by an offset of ++ CHAR_PER_VEC * 4 depending on where this was called from. This ++ fixes that. */ ++ andl $(CHAR_PER_VEC * 4 - 1), %esi + testl %eax, %eax +- jnz L(first_vec_x0) ++ jnz L(last_vec_x1_check) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) ++ /* Check the end of data. */ ++ subl $CHAR_PER_VEC, %esi ++ jb L(max) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x2_check) +- subl $VEC_SIZE, %esi +- jle L(max) ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpl %eax, %esi ++ jb L(max) + +- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x3_check) ++ subq %rdx, %rdi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarq $2, %rdi ++# endif ++ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax ++ ret ++L(max): + movq %r8, %rax ++ ret ++# endif ++ ++ /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less) ++ in the 4x VEC loop can use 2 byte encoding. */ ++ .p2align 4 ++L(second_vec_return): ++ VPCMP $0, %YMM3, %YMMZERO, %k0 ++ /* Combine YMM3 matches (k0) with YMM4 matches (k1). */ ++# ifdef USE_AS_WCSLEN ++ kunpckbw %k0, %k1, %k0 ++ kmovd %k0, %eax ++ tzcntl %eax, %eax ++# else ++ kunpckdq %k0, %k1, %k0 ++ kmovq %k0, %rax ++ tzcntq %rax, %rax ++# endif ++ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax ++ ret ++ ++ ++# ifdef USE_AS_STRNLEN ++L(last_vec_x1_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpl %eax, %esi ++ jb L(max) ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. 
*/ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax + ret + + .p2align 4 +-L(last_2x_vec): +- addl $(VEC_SIZE * 2), %esi ++L(last_4x_vec): ++ /* Test first 2x VEC normally. */ ++ testl %eax, %eax ++ jnz L(last_vec_x1) + +- VPCMP $0, (%rdi), %YMMZERO, %k0 ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax +- jnz L(first_vec_x0_check) +- subl $VEC_SIZE, %esi +- jle L(max) ++ jnz L(last_vec_x2) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++ /* Normalize length. */ ++ andl $(CHAR_PER_VEC * 4 - 1), %esi ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax +- jnz L(first_vec_x1_check) +- movq %r8, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif +- ret ++ jnz L(last_vec_x3) + +- .p2align 4 +-L(first_vec_x0_check): ++ /* Check the end of data. */ ++ subl $(CHAR_PER_VEC * 3), %esi ++ jb L(max) ++ ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif + /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq %rdi, %rax +- subq %rdx, %rax ++ cmpl %eax, %esi ++ jb L(max_end) ++ ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax + ret + + .p2align 4 +-L(first_vec_x1_check): ++L(last_vec_x1): + tzcntl %eax, %eax ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax + ret + + .p2align 4 +-L(first_vec_x2_check): ++L(last_vec_x2): + tzcntl %eax, %eax ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax + ret + + .p2align 4 +-L(first_vec_x3_check): ++L(last_vec_x3): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif ++ subl $(CHAR_PER_VEC * 2), %esi + /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax +- subq %rdx, %rax ++ cmpl %eax, %esi ++ jb L(max_end) ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax + ret +- +- .p2align 4 +-L(max): ++L(max_end): + movq %r8, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif +- ret +- +- .p2align 4 +-L(zero): +- xorl %eax, %eax + ret + # endif + ++ /* Cold case for crossing page with first load. */ + .p2align 4 +-L(first_vec_x0): +- tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %eax +-# endif +- addq %rdi, %rax +- subq %rdx, %rax ++L(cross_page_boundary): ++ movq %rdi, %rdx ++ /* Align data to VEC_SIZE. */ ++ andq $-VEC_SIZE, %rdi ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ /* Remove the leading bytes. */ + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide shift count by 4 since each bit in K0 represent 4 ++ bytes. */ ++ movl %edx, %ecx ++ shrl $2, %ecx ++ andl $(CHAR_PER_VEC - 1), %ecx + # endif +- ret +- +- .p2align 4 +-L(first_vec_x1): ++ /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */ ++ sarxl %SHIFT_REG, %eax, %eax ++ testl %eax, %eax ++# ifndef USE_AS_STRNLEN ++ jz L(cross_page_continue) + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif + ret +- +- .p2align 4 +-L(first_vec_x2): +- tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif ++# else ++ jnz L(cross_page_less_vec) ++# ifndef USE_AS_WCSLEN ++ movl %edx, %ecx ++ andl $(CHAR_PER_VEC - 1), %ecx ++# endif ++ movl $CHAR_PER_VEC, %eax ++ subl %ecx, %eax ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ ja L(cross_page_continue) ++ movl %esi, %eax + ret +- +- .p2align 4 +-L(4x_vec_end): +- VPCMP $0, %YMM1, %YMMZERO, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- VPCMP $0, %YMM2, %YMMZERO, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- VPCMP $0, %YMM3, %YMMZERO, %k2 +- kmovd %k2, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- VPCMP $0, %YMM4, %YMMZERO, %k3 +- kmovd %k3, %eax +-L(first_vec_x3): ++L(cross_page_less_vec): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif ++ /* Select min of length and position of first null. */ ++ cmpq %rax, %rsi ++ cmovb %esi, %eax + ret ++# endif + + END (STRLEN) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-32.patch b/glibc-RHEL-15696-32.patch new file mode 100644 index 0000000..8f1a94a --- /dev/null +++ b/glibc-RHEL-15696-32.patch @@ -0,0 +1,158 @@ +From ea8e465a6b8d0f26c72bcbe453a854de3abf68ec Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Wed, 30 Jun 2021 10:47:06 -0700 +Subject: [PATCH] x86: Check RTM_ALWAYS_ABORT for RTM [BZ #28033] +Content-type: text/plain; charset=UTF-8 + +From + +https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html + +* Intel TSX will be disabled by default. +* The processor will force abort all Restricted Transactional Memory (RTM) + transactions by default. +* A new CPUID bit CPUID.07H.0H.EDX[11](RTM_ALWAYS_ABORT) will be enumerated, + which is set to indicate to updated software that the loaded microcode is + forcing RTM abort. +* On processors that enumerate support for RTM, the CPUID enumeration bits + for Intel TSX (CPUID.07H.0H.EBX[11] and CPUID.07H.0H.EBX[4]) continue to + be set by default after microcode update. +* Workloads that were benefited from Intel TSX might experience a change + in performance. 
+* System software may use a new bit in Model-Specific Register (MSR) 0x10F + TSX_FORCE_ABORT[TSX_CPUID_CLEAR] functionality to clear the Hardware Lock + Elision (HLE) and RTM bits to indicate to software that Intel TSX is + disabled. + +1. Add RTM_ALWAYS_ABORT to CPUID features. +2. Set RTM usable only if RTM_ALWAYS_ABORT isn't set. This skips the +string/tst-memchr-rtm etc. testcases on the affected processors, which +always fail after a microcde update. +3. Check RTM feature, instead of usability, against /proc/cpuinfo. + +This fixes BZ #28033. +--- + manual/platform.texi | 3 +++ + sysdeps/x86/cpu-features.c | 5 ++++- + sysdeps/x86/sys/platform/x86.h | 6 +++--- + sysdeps/x86/tst-cpu-features-supports.c | 2 +- + sysdeps/x86/tst-get-cpu-features.c | 2 ++ + 5 files changed, 13 insertions(+), 5 deletions(-) + +Conflicts: + sysdeps/x86/bits/platform/x86.h + (doesn't exist) + sysdeps/x86/bits/platform/x86.h + (account for lack of upstream renames) + +diff --git a/manual/platform.texi b/manual/platform.texi +index 8fec2933..b7e8aef7 100644 +--- a/manual/platform.texi ++++ b/manual/platform.texi +@@ -510,6 +510,9 @@ capability. + @item + @code{RTM} -- RTM instruction extensions. + ++@item ++@code{RTM_ALWAYS_ABORT} -- Transactions always abort, making RTM unusable. ++ + @item + @code{SDBG} -- IA32_DEBUG_INTERFACE MSR for silicon debug. + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 3610ee5c..4889f062 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -74,7 +74,6 @@ update_usable (struct cpu_features *cpu_features) + CPU_FEATURE_SET_USABLE (cpu_features, HLE); + CPU_FEATURE_SET_USABLE (cpu_features, BMI2); + CPU_FEATURE_SET_USABLE (cpu_features, ERMS); +- CPU_FEATURE_SET_USABLE (cpu_features, RTM); + CPU_FEATURE_SET_USABLE (cpu_features, RDSEED); + CPU_FEATURE_SET_USABLE (cpu_features, ADX); + CPU_FEATURE_SET_USABLE (cpu_features, CLFLUSHOPT); +@@ -90,6 +89,7 @@ update_usable (struct cpu_features *cpu_features) + CPU_FEATURE_SET_USABLE (cpu_features, MOVDIRI); + CPU_FEATURE_SET_USABLE (cpu_features, MOVDIR64B); + CPU_FEATURE_SET_USABLE (cpu_features, FSRM); ++ CPU_FEATURE_SET_USABLE (cpu_features, RTM_ALWAYS_ABORT); + CPU_FEATURE_SET_USABLE (cpu_features, SERIALIZE); + CPU_FEATURE_SET_USABLE (cpu_features, TSXLDTRK); + CPU_FEATURE_SET_USABLE (cpu_features, LAHF64_SAHF64); +@@ -779,6 +779,9 @@ no_cpuid: + GLRO(dl_platform) = "i586"; + #endif + ++ if (!CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT)) ++ CPU_FEATURE_SET_USABLE (cpu_features, RTM); ++ + #if CET_ENABLED + # if HAVE_TUNABLES + TUNABLE_GET (x86_ibt, tunable_val_t *, +diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h +index e5cc7c68..7a434926 100644 +--- a/sysdeps/x86/sys/platform/x86.h ++++ b/sysdeps/x86/sys/platform/x86.h +@@ -247,7 +247,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int) + #define bit_cpu_AVX512_VP2INTERSECT (1u << 8) + #define bit_cpu_INDEX_7_EDX_9 (1u << 9) + #define bit_cpu_MD_CLEAR (1u << 10) +-#define bit_cpu_INDEX_7_EDX_11 (1u << 11) ++#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11) + #define bit_cpu_INDEX_7_EDX_12 (1u << 12) + #define bit_cpu_INDEX_7_EDX_13 (1u << 13) + #define bit_cpu_SERIALIZE (1u << 14) +@@ -471,7 +471,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int) + #define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7 + #define index_cpu_INDEX_7_EDX_9 COMMON_CPUID_INDEX_7 + #define index_cpu_MD_CLEAR COMMON_CPUID_INDEX_7 +-#define index_cpu_INDEX_7_EDX_11 
COMMON_CPUID_INDEX_7 ++#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7 + #define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7 + #define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7 + #define index_cpu_SERIALIZE COMMON_CPUID_INDEX_7 +@@ -695,7 +695,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int) + #define reg_AVX512_VP2INTERSECT edx + #define reg_INDEX_7_EDX_9 edx + #define reg_MD_CLEAR edx +-#define reg_INDEX_7_EDX_11 edx ++#define reg_RTM_ALWAYS_ABORT edx + #define reg_INDEX_7_EDX_12 edx + #define reg_INDEX_7_EDX_13 edx + #define reg_SERIALIZE edx +diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c +index 287cf01f..8100a319 100644 +--- a/sysdeps/x86/tst-cpu-features-supports.c ++++ b/sysdeps/x86/tst-cpu-features-supports.c +@@ -152,7 +152,7 @@ do_test (int argc, char **argv) + fails += CHECK_SUPPORTS (rdpid, RDPID); + fails += CHECK_SUPPORTS (rdrnd, RDRAND); + fails += CHECK_SUPPORTS (rdseed, RDSEED); +- fails += CHECK_SUPPORTS (rtm, RTM); ++ fails += CHECK_CPU_SUPPORTS (rtm, RTM); + fails += CHECK_SUPPORTS (serialize, SERIALIZE); + fails += CHECK_SUPPORTS (sha, SHA); + fails += CHECK_CPU_SUPPORTS (shstk, SHSTK); +diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c +index 2763deb6..0717e5d8 100644 +--- a/sysdeps/x86/tst-get-cpu-features.c ++++ b/sysdeps/x86/tst-get-cpu-features.c +@@ -183,6 +183,7 @@ do_test (void) + CHECK_CPU_FEATURE (UINTR); + CHECK_CPU_FEATURE (AVX512_VP2INTERSECT); + CHECK_CPU_FEATURE (MD_CLEAR); ++ CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT); + CHECK_CPU_FEATURE (SERIALIZE); + CHECK_CPU_FEATURE (HYBRID); + CHECK_CPU_FEATURE (TSXLDTRK); +@@ -344,6 +345,7 @@ do_test (void) + CHECK_CPU_FEATURE_USABLE (FSRM); + CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT); + CHECK_CPU_FEATURE_USABLE (MD_CLEAR); ++ CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT); + CHECK_CPU_FEATURE_USABLE (SERIALIZE); + CHECK_CPU_FEATURE_USABLE (HYBRID); + CHECK_CPU_FEATURE_USABLE (TSXLDTRK); +-- +GitLab + diff --git a/glibc-RHEL-15696-33.patch b/glibc-RHEL-15696-33.patch new file mode 100644 index 0000000..1196471 --- /dev/null +++ b/glibc-RHEL-15696-33.patch @@ -0,0 +1,51 @@ +From 0679442defedf7e52a94264975880ab8674736b2 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 8 Jul 2021 16:13:19 -0400 +Subject: [PATCH] x86: Remove wcsnlen-sse4_1 from wcslen ifunc-impl-list [BZ + #28064] +Content-type: text/plain; charset=UTF-8 + +The following commit + +commit 6f573a27b6c8b4236445810a44660612323f5a73 +Author: Noah Goldstein +Date: Wed Jun 23 01:19:34 2021 -0400 + + x86-64: Add wcslen optimize for sse4.1 + +Added wcsnlen-sse4.1 to the wcslen ifunc implementation list and did +not add wcslen-sse4.1 to wcslen ifunc implementation list. This commit +fixes that by removing wcsnlen-sse4.1 from the wcslen ifunc +implementation list and adding wcslen-sse4.1 to the ifunc +implementation list. + +Testing: +test-wcslen.c, test-rsi-wcslen.c, and test-rsi-strlen.c are passing as +well as all other tests in wcsmbs and string. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 580913ca..695cdba6 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -657,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcslen_evex) +- IFUNC_IMPL_ADD (array, i, wcsnlen, ++ IFUNC_IMPL_ADD (array, i, wcslen, + CPU_FEATURE_USABLE (SSE4_1), +- __wcsnlen_sse4_1) ++ __wcslen_sse4_1) + IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ +-- +GitLab + diff --git a/glibc-RHEL-15696-34.patch b/glibc-RHEL-15696-34.patch new file mode 100644 index 0000000..f7c9a56 --- /dev/null +++ b/glibc-RHEL-15696-34.patch @@ -0,0 +1,135 @@ +From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 15 Feb 2022 08:18:15 -0600 +Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ + #28896] +Content-type: text/plain; charset=UTF-8 + +In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would +call strcmp-avx2 and wcscmp-avx2 respectively. This would have +not checks around vzeroupper and would trigger spurious +aborts. This commit fixes that. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on +AVX2 machines with and without RTM. + +Co-authored-by: H.J. Lu +--- + sysdeps/x86/Makefile | 2 +- + sysdeps/x86/tst-strncmp-rtm.c | 17 ++++++++++++++++- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +- + sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S | 1 + + sysdeps/x86_64/multiarch/strncmp-avx2.S | 1 + + sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S | 2 +- + sysdeps/x86_64/multiarch/wcsncmp-avx2.S | 2 +- + 7 files changed, 22 insertions(+), 5 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/strcmp-avx2.S + (split into two patches due to upstream bug differences) + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 5be71ada..2d814915 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -38,7 +38,7 @@ CFLAGS-tst-memset-rtm.c += -mrtm + CFLAGS-tst-strchr-rtm.c += -mrtm + CFLAGS-tst-strcpy-rtm.c += -mrtm + CFLAGS-tst-strlen-rtm.c += -mrtm +-CFLAGS-tst-strncmp-rtm.c += -mrtm ++CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error + CFLAGS-tst-strrchr-rtm.c += -mrtm + endif + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index 236ad951..4d0004b5 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -16,6 +16,7 @@ + License along with the GNU C Library; if not, see + . 
*/ + ++#include + #include + + #define LOOP 3000 +@@ -45,8 +46,22 @@ function (void) + return 1; + } + ++__attribute__ ((noinline, noclone)) ++static int ++function_overflow (void) ++{ ++ if (strncmp (string1, string2, SIZE_MAX) == 0) ++ return 0; ++ else ++ return 1; ++} ++ + static int + do_test (void) + { +- return do_test_1 ("strncmp", LOOP, prepare, function); ++ int status = do_test_1 ("strncmp", LOOP, prepare, function); ++ if (status != EXIT_SUCCESS) ++ return status; ++ status = do_test_1 ("strncmp", LOOP, prepare, function_overflow); ++ return status; + } +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 5d1c9d90..433ae047 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -95,7 +95,7 @@ ENTRY (STRCMP) + length to bound a valid memory region. In these cases just use + 'wcscmp'. */ + shrq $56, %rcx +- jnz __wcscmp_avx2 ++ jnz OVERFLOW_STRCMP + # endif + /* Convert units: from wide to byte char. */ + shl $2, %RDX_LP +diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +index 37d1224b..68bad365 100644 +--- a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S ++++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +@@ -1,3 +1,4 @@ + #define STRCMP __strncmp_avx2_rtm + #define USE_AS_STRNCMP 1 ++#define OVERFLOW_STRCMP __strcmp_avx2_rtm + #include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S +index 1678bcc2..f138e9f1 100644 +--- a/sysdeps/x86_64/multiarch/strncmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S +@@ -1,3 +1,4 @@ + #define STRCMP __strncmp_avx2 + #define USE_AS_STRNCMP 1 ++#define OVERFLOW_STRCMP __strcmp_avx2 + #include "strcmp-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S +index 4e88c70c..f467582c 100644 +--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S ++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S +@@ -1,5 +1,5 @@ + #define STRCMP __wcsncmp_avx2_rtm + #define USE_AS_STRNCMP 1 + #define USE_AS_WCSCMP 1 +- ++#define OVERFLOW_STRCMP __wcscmp_avx2_rtm + #include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S +index 4fa1de4d..e9ede522 100644 +--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S +@@ -1,5 +1,5 @@ + #define STRCMP __wcsncmp_avx2 + #define USE_AS_STRNCMP 1 + #define USE_AS_WCSCMP 1 +- ++#define OVERFLOW_STRCMP __wcscmp_avx2 + #include "strcmp-avx2.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-35.patch b/glibc-RHEL-15696-35.patch new file mode 100644 index 0000000..5e4fbdd --- /dev/null +++ b/glibc-RHEL-15696-35.patch @@ -0,0 +1,51 @@ +From 55c7bcc71b84123d5d4bd2814366a6b05fcf8ebd Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sat, 9 May 2020 12:04:23 -0700 +Subject: [PATCH] x86-64: Use RDX_LP on __x86_shared_non_temporal_threshold [BZ + #25966] +Content-type: text/plain; charset=UTF-8 + +Since __x86_shared_non_temporal_threshold is defined as + +long int __x86_shared_non_temporal_threshold; + +and long int is 4 bytes for x32, use RDX_LP to compare against +__x86_shared_non_temporal_threshold in assembly code. 
+--- + sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 71f5954d..673b73aa 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -245,7 +245,7 @@ L(return): + #endif + + L(movsb): +- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + jae L(more_8x_vec) + cmpq %rsi, %rdi + jb 1f +@@ -397,7 +397,7 @@ L(more_8x_vec): + addq %r8, %rdx + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ +- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + ja L(large_forward) + #endif + L(loop_4x_vec_forward): +@@ -448,7 +448,7 @@ L(more_8x_vec_backward): + subq %r8, %rdx + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ +- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + ja L(large_backward) + #endif + L(loop_4x_vec_backward): +-- +GitLab + diff --git a/glibc-RHEL-15696-36.patch b/glibc-RHEL-15696-36.patch new file mode 100644 index 0000000..e00b96e --- /dev/null +++ b/glibc-RHEL-15696-36.patch @@ -0,0 +1,44 @@ +From a35a59036ebae3efcdf5e8167610e0656fca9770 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 11 Jun 2020 12:41:18 -0700 +Subject: [PATCH] x86_64: Use %xmmN with vpxor to clear a vector register +Content-type: text/plain; charset=UTF-8 + +Since "vpxor %xmmN, %xmmN, %xmmN" clears the whole vector register, use +%xmmN, instead of %ymmN, with vpxor to clear a vector register. +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 4 ++-- + sysdeps/x86_64/multiarch/strrchr-avx2.S | 2 +- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 433ae047..70d8499b 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -105,8 +105,8 @@ ENTRY (STRCMP) + # endif + movl %edi, %eax + xorl %edx, %edx +- /* Make %ymm7 all zeros in this function. */ +- vpxor %ymm7, %ymm7, %ymm7 ++ /* Make %xmm7 (%ymm7) all zeros in this function. */ ++ vpxor %xmm7, %xmm7, %xmm7 + orl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S +index 9f22a15e..c949410b 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S +@@ -48,7 +48,7 @@ ENTRY (STRRCHR) + movl %edi, %ecx + /* Broadcast CHAR to YMM4. */ + VPBROADCAST %xmm4, %ymm4 +- vpxor %ymm0, %ymm0, %ymm0 ++ vpxor %xmm0, %xmm0, %xmm0 + + /* Check if we may cross page boundary with one vector load. */ + andl $(2 * VEC_SIZE - 1), %ecx +-- +GitLab + diff --git a/glibc-RHEL-15696-37.patch b/glibc-RHEL-15696-37.patch new file mode 100644 index 0000000..10b0cc4 --- /dev/null +++ b/glibc-RHEL-15696-37.patch @@ -0,0 +1,359 @@ +From 1f745ecc2109890886b161d4791e1406fdfc29b8 Mon Sep 17 00:00:00 2001 +From: noah +Date: Wed, 3 Feb 2021 00:38:59 -0500 +Subject: [PATCH] x86-64: Refactor and improve performance of strchr-avx2.S +Content-type: text/plain; charset=UTF-8 + +No bug. 
Just seemed the performance could be improved a bit. Observed +and expected behavior are unchanged. Optimized body of main +loop. Updated page cross logic and optimized accordingly. Made a few +minor instruction selection modifications. No regressions in test +suite. Both test-strchrnul and test-strchr passed. +--- + sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++------------- + sysdeps/x86_64/multiarch/strchr.c | 4 +- + 2 files changed, 114 insertions(+), 115 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/strchr.c + (account for missing upstream macros) + +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index da7d2620..919d256c 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -27,10 +27,12 @@ + # ifdef USE_AS_WCSCHR + # define VPBROADCAST vpbroadcastd + # define VPCMPEQ vpcmpeqd ++# define VPMINU vpminud + # define CHAR_REG esi + # else + # define VPBROADCAST vpbroadcastb + # define VPCMPEQ vpcmpeqb ++# define VPMINU vpminub + # define CHAR_REG sil + # endif + +@@ -43,71 +45,54 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits + ENTRY (STRCHR) + movl %edi, %ecx +- /* Broadcast CHAR to YMM0. */ ++# ifndef USE_AS_STRCHRNUL ++ xorl %edx, %edx ++# endif ++ ++ /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + vpxor %xmm9, %xmm9, %xmm9 + VPBROADCAST %xmm0, %ymm0 +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) + +- /* Check the first VEC_SIZE bytes. Search for both CHAR and the +- null byte. */ +- vmovdqu (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) ++ /* Check if we cross page boundary with one vector load. */ ++ andl $(PAGE_SIZE - 1), %ecx ++ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx ++ ja L(cross_page_boundary) + +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- +- jmp L(more_4x_vec) +- +- .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ /* Check the first VEC_SIZE bytes. Search for both CHAR and the ++ null byte. */ + vmovdqu (%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax +- /* Remove the leading bytes. */ +- sarl %cl, %eax + testl %eax, %eax +- jz L(aligned_more) +- /* Found CHAR or the null byte. */ ++ jz L(more_vecs) + tzcntl %eax, %eax +- addq %rcx, %rax +-# ifdef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ + addq %rdi, %rax +-# else +- xorl %edx, %edx +- leaq (%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 ++L(more_vecs): ++ /* Align data for aligned loads in the loop. */ ++ andq $-VEC_SIZE, %rdi + L(aligned_more): +- addq $VEC_SIZE, %rdi + +-L(more_4x_vec): +- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time +- since data is only aligned to VEC_SIZE. */ +- vmovdqa (%rdi), %ymm8 ++ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. 
*/ ++ vmovdqa VEC_SIZE(%rdi), %ymm8 ++ addq $VEC_SIZE, %rdi + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 +@@ -137,61 +122,24 @@ L(more_4x_vec): + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax +- jnz L(first_vec_x3) +- +- addq $(VEC_SIZE * 4), %rdi +- +- /* Align data to 4 * VEC_SIZE. */ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx +- andq $-(4 * VEC_SIZE), %rdi +- +- .p2align 4 +-L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- vmovdqa (%rdi), %ymm5 +- vmovdqa VEC_SIZE(%rdi), %ymm6 +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 +- +- VPCMPEQ %ymm5, %ymm0, %ymm1 +- VPCMPEQ %ymm6, %ymm0, %ymm2 +- VPCMPEQ %ymm7, %ymm0, %ymm3 +- VPCMPEQ %ymm8, %ymm0, %ymm4 +- +- VPCMPEQ %ymm5, %ymm9, %ymm5 +- VPCMPEQ %ymm6, %ymm9, %ymm6 +- VPCMPEQ %ymm7, %ymm9, %ymm7 +- VPCMPEQ %ymm8, %ymm9, %ymm8 +- +- vpor %ymm1, %ymm5, %ymm1 +- vpor %ymm2, %ymm6, %ymm2 +- vpor %ymm3, %ymm7, %ymm3 +- vpor %ymm4, %ymm8, %ymm4 +- +- vpor %ymm1, %ymm2, %ymm5 +- vpor %ymm3, %ymm4, %ymm6 +- +- vpor %ymm5, %ymm6, %ymm5 +- +- vpmovmskb %ymm5, %eax +- testl %eax, %eax +- jnz L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi ++ jz L(prep_loop_4x) + +- jmp L(loop_4x_vec) ++ tzcntl %eax, %eax ++ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret + + .p2align 4 + L(first_vec_x0): +- /* Found CHAR or the null byte. */ + tzcntl %eax, %eax +-# ifdef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ + addq %rdi, %rax +-# else +- xorl %edx, %edx +- leaq (%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + VZEROUPPER_RETURN +@@ -199,13 +147,9 @@ L(first_vec_x0): + .p2align 4 + L(first_vec_x1): + tzcntl %eax, %eax +-# ifdef USE_AS_STRCHRNUL +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +-# else +- xorl %edx, %edx + leaq VEC_SIZE(%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + VZEROUPPER_RETURN +@@ -213,42 +157,97 @@ L(first_vec_x1): + .p2align 4 + L(first_vec_x2): + tzcntl %eax, %eax +-# ifdef USE_AS_STRCHRNUL +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax +-# else +- xorl %edx, %edx ++ /* Found CHAR or the null byte. */ + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + VZEROUPPER_RETURN + ++L(prep_loop_4x): ++ /* Align data to 4 * VEC_SIZE. */ ++ andq $-(VEC_SIZE * 4), %rdi ++ + .p2align 4 +-L(4x_vec_end): ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5 ++ vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7 ++ vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8 ++ ++ /* Leaves only CHARS matching esi as 0. 
*/ ++ vpxor %ymm5, %ymm0, %ymm1 ++ vpxor %ymm6, %ymm0, %ymm2 ++ vpxor %ymm7, %ymm0, %ymm3 ++ vpxor %ymm8, %ymm0, %ymm4 ++ ++ VPMINU %ymm1, %ymm5, %ymm1 ++ VPMINU %ymm2, %ymm6, %ymm2 ++ VPMINU %ymm3, %ymm7, %ymm3 ++ VPMINU %ymm4, %ymm8, %ymm4 ++ ++ VPMINU %ymm1, %ymm2, %ymm5 ++ VPMINU %ymm3, %ymm4, %ymm6 ++ ++ VPMINU %ymm5, %ymm6, %ymm5 ++ ++ VPCMPEQ %ymm5, %ymm9, %ymm5 ++ vpmovmskb %ymm5, %eax ++ ++ addq $(VEC_SIZE * 4), %rdi ++ testl %eax, %eax ++ jz L(loop_4x_vec) ++ ++ VPCMPEQ %ymm1, %ymm9, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x0) ++ ++ VPCMPEQ %ymm2, %ymm9, %ymm2 + vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(first_vec_x1) +- vpmovmskb %ymm3, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) ++ ++ VPCMPEQ %ymm3, %ymm9, %ymm3 ++ VPCMPEQ %ymm4, %ymm9, %ymm4 ++ vpmovmskb %ymm3, %ecx + vpmovmskb %ymm4, %eax ++ salq $32, %rax ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ /* Cold case for crossing page with first load. */ ++ .p2align 4 ++L(cross_page_boundary): ++ andq $-VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ ++ vmovdqa (%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Remove the leading bits. */ ++ sarxl %ecx, %eax, %eax + testl %eax, %eax +-L(first_vec_x3): ++ jz L(aligned_more) + tzcntl %eax, %eax +-# ifdef USE_AS_STRCHRNUL +- addq $(VEC_SIZE * 3), %rax ++ addq %rcx, %rdi + addq %rdi, %rax +-# else +- xorl %edx, %edx +- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + VZEROUPPER_RETURN + + END (STRCHR) +-#endif ++# endif +diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c +index 7e582f02..5225bd4f 100644 +--- a/sysdeps/x86_64/multiarch/strchr.c ++++ b/sysdeps/x86_64/multiarch/strchr.c +@@ -38,11 +38,11 @@ IFUNC_SELECTOR (void) + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) +-- +GitLab + diff --git a/glibc-RHEL-15696-38.patch b/glibc-RHEL-15696-38.patch new file mode 100644 index 0000000..f97ab23 --- /dev/null +++ b/glibc-RHEL-15696-38.patch @@ -0,0 +1,67 @@ +From 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sat, 25 Jan 2020 14:19:40 -0800 +Subject: [PATCH] x86-64: Avoid rep movsb with short distance [BZ #27130] +Content-type: text/plain; charset=UTF-8 + +When copying with "rep movsb", if the distance between source and +destination is N*4GB + [1..63] with N >= 0, performance may be very +slow. 
This patch updates memmove-vec-unaligned-erms.S for AVX and +AVX512 versions with the distance in RCX: + + cmpl $63, %ecx + // Don't use "rep movsb" if ECX <= 63 + jbe L(Don't use rep movsb") + Use "rep movsb" + +Benchtests data with bench-memcpy, bench-memcpy-large, bench-memcpy-random +and bench-memcpy-walk on Skylake, Ice Lake and Tiger Lake show that its +performance impact is within noise range as "rep movsb" is only used for +data size >= 4KB. +--- + .../multiarch/memmove-vec-unaligned-erms.S | 21 +++++++++++++++++++ + 1 file changed, 21 insertions(+) + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 673b73aa..c475fed4 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -64,6 +64,13 @@ + # endif + #endif + ++/* Avoid short distance rep movsb only with non-SSE vector. */ ++#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB ++# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) ++#else ++# define AVOID_SHORT_DISTANCE_REP_MOVSB 0 ++#endif ++ + #ifndef PREFETCH + # define PREFETCH(addr) prefetcht0 addr + #endif +@@ -255,7 +262,21 @@ L(movsb): + cmpq %r9, %rdi + /* Avoid slow backward REP MOVSB. */ + jb L(more_8x_vec_backward) ++# if AVOID_SHORT_DISTANCE_REP_MOVSB ++ movq %rdi, %rcx ++ subq %rsi, %rcx ++ jmp 2f ++# endif + 1: ++# if AVOID_SHORT_DISTANCE_REP_MOVSB ++ movq %rsi, %rcx ++ subq %rdi, %rcx ++2: ++/* Avoid "rep movsb" if RCX, the distance between source and destination, ++ is N*4GB + [1..63] with N >= 0. */ ++ cmpl $63, %ecx ++ jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */ ++# endif + mov %RDX_LP, %RCX_LP + rep movsb + L(nop): +-- +GitLab + diff --git a/glibc-RHEL-15696-39.patch b/glibc-RHEL-15696-39.patch new file mode 100644 index 0000000..8343ba9 --- /dev/null +++ b/glibc-RHEL-15696-39.patch @@ -0,0 +1,449 @@ +From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001 +From: noah +Date: Sat, 3 Apr 2021 04:12:15 -0400 +Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No Bug. This commit updates the large memcpy case (no overlap). The +update is to perform memcpy on either 2 or 4 contiguous pages at +once. This 1) helps to alleviate the affects of false memory aliasing +when destination and source have a close 4k alignment and 2) In most +cases and for most DRAM units is a modestly more efficient access +pattern. These changes are a clear performance improvement for +VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, +test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all +pass. + +Signed-off-by: Noah Goldstein +--- + .../multiarch/memmove-vec-unaligned-erms.S | 338 ++++++++++++++---- + 1 file changed, 265 insertions(+), 73 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S + (different number of sections) + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index c475fed4..3e2dd6bc 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -32,7 +32,16 @@ + overlapping addresses. + 6. If size >= __x86_shared_non_temporal_threshold and there is no + overlap between destination and source, use non-temporal store +- instead of aligned store. */ ++ instead of aligned store copying from either 2 or 4 pages at ++ once. ++ 8. 
For point 7) if size < 16 * __x86_shared_non_temporal_threshold ++ and source and destination do not page alias, copy from 2 pages ++ at once using non-temporal stores. Page aliasing in this case is ++ considered true if destination's page alignment - sources' page ++ alignment is less than 8 * VEC_SIZE. ++ 9. If size >= 16 * __x86_shared_non_temporal_threshold or source ++ and destination do page alias copy from 4 pages at once using ++ non-temporal stores. */ + + #include + +@@ -64,6 +73,34 @@ + # endif + #endif + ++#ifndef PAGE_SIZE ++# define PAGE_SIZE 4096 ++#endif ++ ++#if PAGE_SIZE != 4096 ++# error Unsupported PAGE_SIZE ++#endif ++ ++#ifndef LOG_PAGE_SIZE ++# define LOG_PAGE_SIZE 12 ++#endif ++ ++#if PAGE_SIZE != (1 << LOG_PAGE_SIZE) ++# error Invalid LOG_PAGE_SIZE ++#endif ++ ++/* Byte per page for large_memcpy inner loop. */ ++#if VEC_SIZE == 64 ++# define LARGE_LOAD_SIZE (VEC_SIZE * 2) ++#else ++# define LARGE_LOAD_SIZE (VEC_SIZE * 4) ++#endif ++ ++/* Amount to shift rdx by to compare for memcpy_large_4x. */ ++#ifndef LOG_4X_MEMCPY_THRESH ++# define LOG_4X_MEMCPY_THRESH 4 ++#endif ++ + /* Avoid short distance rep movsb only with non-SSE vector. */ + #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB + # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) +@@ -103,6 +140,28 @@ + # error Unsupported PREFETCH_SIZE! + #endif + ++#if LARGE_LOAD_SIZE == (VEC_SIZE * 2) ++# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ ++ VMOVU (offset)base, vec0; \ ++ VMOVU ((offset) + VEC_SIZE)base, vec1; ++# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ ++ VMOVNT vec0, (offset)base; \ ++ VMOVNT vec1, ((offset) + VEC_SIZE)base; ++#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) ++# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ ++ VMOVU (offset)base, vec0; \ ++ VMOVU ((offset) + VEC_SIZE)base, vec1; \ ++ VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ ++ VMOVU ((offset) + VEC_SIZE * 3)base, vec3; ++# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ ++ VMOVNT vec0, (offset)base; \ ++ VMOVNT vec1, ((offset) + VEC_SIZE)base; \ ++ VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ ++ VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; ++#else ++# error Invalid LARGE_LOAD_SIZE ++#endif ++ + #ifndef SECTION + # error SECTION is not defined! + #endif +@@ -390,6 +449,15 @@ L(last_4x_vec): + VZEROUPPER_RETURN + + L(more_8x_vec): ++ /* Check if non-temporal move candidate. */ ++#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) ++ /* Check non-temporal store threshold. */ ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP ++ ja L(large_memcpy_2x) ++#endif ++ /* Entry if rdx is greater than non-temporal threshold but there ++ is overlap. */ ++L(more_8x_vec_check): + cmpq %rsi, %rdi + ja L(more_8x_vec_backward) + /* Source == destination is less common. */ +@@ -416,24 +484,21 @@ L(more_8x_vec): + subq %r8, %rdi + /* Adjust length. */ + addq %r8, %rdx +-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +- /* Check non-temporal store threshold. */ +- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP +- ja L(large_forward) +-#endif ++ ++ .p2align 4 + L(loop_4x_vec_forward): + /* Copy 4 * VEC a time forward. 
*/ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) +- addq $(VEC_SIZE * 4), %rsi +- subq $(VEC_SIZE * 4), %rdx ++ subq $-(VEC_SIZE * 4), %rsi ++ addq $-(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) +- addq $(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rdi + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_forward) + /* Store the last 4 * VEC. */ +@@ -467,24 +532,21 @@ L(more_8x_vec_backward): + subq %r8, %r9 + /* Adjust length. */ + subq %r8, %rdx +-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +- /* Check non-temporal store threshold. */ +- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP +- ja L(large_backward) +-#endif ++ ++ .p2align 4 + L(loop_4x_vec_backward): + /* Copy 4 * VEC a time backward. */ + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) +- subq $(VEC_SIZE * 4), %rcx +- subq $(VEC_SIZE * 4), %rdx ++ addq $-(VEC_SIZE * 4), %rcx ++ addq $-(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%r9) + VMOVA %VEC(1), -VEC_SIZE(%r9) + VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) +- subq $(VEC_SIZE * 4), %r9 ++ addq $-(VEC_SIZE * 4), %r9 + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_backward) + /* Store the first 4 * VEC. */ +@@ -497,72 +559,202 @@ L(loop_4x_vec_backward): + VZEROUPPER_RETURN + + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +-L(large_forward): ++ .p2align 4 ++L(large_memcpy_2x): ++ /* Compute absolute value of difference between source and ++ destination. */ ++ movq %rdi, %r9 ++ subq %rsi, %r9 ++ movq %r9, %r8 ++ leaq -1(%r9), %rcx ++ sarq $63, %r8 ++ xorq %r8, %r9 ++ subq %r8, %r9 + /* Don't use non-temporal store if there is overlap between +- destination and source since destination may be in cache +- when source is loaded. */ +- leaq (%rdi, %rdx), %r10 +- cmpq %r10, %rsi +- jb L(loop_4x_vec_forward) +-L(loop_large_forward): ++ destination and source since destination may be in cache when ++ source is loaded. */ ++ cmpq %r9, %rdx ++ ja L(more_8x_vec_check) ++ ++ /* Cache align destination. First store the first 64 bytes then ++ adjust alignments. */ ++ VMOVU (%rsi), %VEC(8) ++#if VEC_SIZE < 64 ++ VMOVU VEC_SIZE(%rsi), %VEC(9) ++#if VEC_SIZE < 32 ++ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) ++ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) ++#endif ++#endif ++ VMOVU %VEC(8), (%rdi) ++#if VEC_SIZE < 64 ++ VMOVU %VEC(9), VEC_SIZE(%rdi) ++#if VEC_SIZE < 32 ++ VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) ++ VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) ++#endif ++#endif ++ /* Adjust source, destination, and size. */ ++ movq %rdi, %r8 ++ andq $63, %r8 ++ /* Get the negative of offset for alignment. */ ++ subq $64, %r8 ++ /* Adjust source. */ ++ subq %r8, %rsi ++ /* Adjust destination which should be aligned now. */ ++ subq %r8, %rdi ++ /* Adjust length. */ ++ addq %r8, %rdx ++ ++ /* Test if source and destination addresses will alias. If they do ++ the larger pipeline in large_memcpy_4x alleviated the ++ performance drop. */ ++ testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx ++ jz L(large_memcpy_4x) ++ ++ movq %rdx, %r10 ++ shrq $LOG_4X_MEMCPY_THRESH, %r10 ++ cmp __x86_shared_non_temporal_threshold(%rip), %r10 ++ jae L(large_memcpy_4x) ++ ++ /* edx will store remainder size for copying tail. */ ++ andl $(PAGE_SIZE * 2 - 1), %edx ++ /* r10 stores outer loop counter. 
*/ ++ shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 ++ /* Copy 4x VEC at a time from 2 pages. */ ++ .p2align 4 ++L(loop_large_memcpy_2x_outer): ++ /* ecx stores inner loop counter. */ ++ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx ++L(loop_large_memcpy_2x_inner): ++ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) ++ /* Load vectors from rsi. */ ++ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) ++ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) ++ subq $-LARGE_LOAD_SIZE, %rsi ++ /* Non-temporal store vectors to rdi. */ ++ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) ++ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) ++ subq $-LARGE_LOAD_SIZE, %rdi ++ decl %ecx ++ jnz L(loop_large_memcpy_2x_inner) ++ addq $PAGE_SIZE, %rdi ++ addq $PAGE_SIZE, %rsi ++ decq %r10 ++ jne L(loop_large_memcpy_2x_outer) ++ sfence ++ ++ /* Check if only last 4 loads are needed. */ ++ cmpl $(VEC_SIZE * 4), %edx ++ jbe L(large_memcpy_2x_end) ++ ++ /* Handle the last 2 * PAGE_SIZE bytes. */ ++L(loop_large_memcpy_2x_tail): + /* Copy 4 * VEC a time forward with non-temporal stores. */ +- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) +- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) ++ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) +- addq $PREFETCHED_LOAD_SIZE, %rsi +- subq $PREFETCHED_LOAD_SIZE, %rdx +- VMOVNT %VEC(0), (%rdi) +- VMOVNT %VEC(1), VEC_SIZE(%rdi) +- VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) +- VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) +- addq $PREFETCHED_LOAD_SIZE, %rdi +- cmpq $PREFETCHED_LOAD_SIZE, %rdx +- ja L(loop_large_forward) +- sfence ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $-(VEC_SIZE * 4), %edx ++ VMOVA %VEC(0), (%rdi) ++ VMOVA %VEC(1), VEC_SIZE(%rdi) ++ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) ++ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) ++ subq $-(VEC_SIZE * 4), %rdi ++ cmpl $(VEC_SIZE * 4), %edx ++ ja L(loop_large_memcpy_2x_tail) ++ ++L(large_memcpy_2x_end): + /* Store the last 4 * VEC. */ +- VMOVU %VEC(5), (%rcx) +- VMOVU %VEC(6), -VEC_SIZE(%rcx) +- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) +- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) +- /* Store the first VEC. */ +- VMOVU %VEC(4), (%r11) ++ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) ++ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) ++ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) ++ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) ++ ++ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) ++ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) ++ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) ++ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) + VZEROUPPER_RETURN + +-L(large_backward): +- /* Don't use non-temporal store if there is overlap between +- destination and source since destination may be in cache +- when source is loaded. */ +- leaq (%rcx, %rdx), %r10 +- cmpq %r10, %r9 +- jb L(loop_4x_vec_backward) +-L(loop_large_backward): +- /* Copy 4 * VEC a time backward with non-temporal stores. 
*/ +- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) +- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) +- VMOVU (%rcx), %VEC(0) +- VMOVU -VEC_SIZE(%rcx), %VEC(1) +- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) +- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) +- subq $PREFETCHED_LOAD_SIZE, %rcx +- subq $PREFETCHED_LOAD_SIZE, %rdx +- VMOVNT %VEC(0), (%r9) +- VMOVNT %VEC(1), -VEC_SIZE(%r9) +- VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) +- VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) +- subq $PREFETCHED_LOAD_SIZE, %r9 +- cmpq $PREFETCHED_LOAD_SIZE, %rdx +- ja L(loop_large_backward) ++ .p2align 4 ++L(large_memcpy_4x): ++ movq %rdx, %r10 ++ /* edx will store remainder size for copying tail. */ ++ andl $(PAGE_SIZE * 4 - 1), %edx ++ /* r10 stores outer loop counter. */ ++ shrq $(LOG_PAGE_SIZE + 2), %r10 ++ /* Copy 4x VEC at a time from 4 pages. */ ++ .p2align 4 ++L(loop_large_memcpy_4x_outer): ++ /* ecx stores inner loop counter. */ ++ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx ++L(loop_large_memcpy_4x_inner): ++ /* Only one prefetch set per page as doing 4 pages give more time ++ for prefetcher to keep up. */ ++ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) ++ /* Load vectors from rsi. */ ++ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) ++ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) ++ LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) ++ LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) ++ subq $-LARGE_LOAD_SIZE, %rsi ++ /* Non-temporal store vectors to rdi. */ ++ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) ++ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) ++ STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) ++ STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) ++ subq $-LARGE_LOAD_SIZE, %rdi ++ decl %ecx ++ jnz L(loop_large_memcpy_4x_inner) ++ addq $(PAGE_SIZE * 3), %rdi ++ addq $(PAGE_SIZE * 3), %rsi ++ decq %r10 ++ jne L(loop_large_memcpy_4x_outer) + sfence +- /* Store the first 4 * VEC. */ +- VMOVU %VEC(4), (%rdi) +- VMOVU %VEC(5), VEC_SIZE(%rdi) +- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) +- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) +- /* Store the last VEC. */ +- VMOVU %VEC(8), (%r11) ++ /* Check if only last 4 loads are needed. */ ++ cmpl $(VEC_SIZE * 4), %edx ++ jbe L(large_memcpy_4x_end) ++ ++ /* Handle the last 4 * PAGE_SIZE bytes. */ ++L(loop_large_memcpy_4x_tail): ++ /* Copy 4 * VEC a time forward with non-temporal stores. */ ++ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) ++ VMOVU (%rsi), %VEC(0) ++ VMOVU VEC_SIZE(%rsi), %VEC(1) ++ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) ++ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $-(VEC_SIZE * 4), %edx ++ VMOVA %VEC(0), (%rdi) ++ VMOVA %VEC(1), VEC_SIZE(%rdi) ++ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) ++ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) ++ subq $-(VEC_SIZE * 4), %rdi ++ cmpl $(VEC_SIZE * 4), %edx ++ ja L(loop_large_memcpy_4x_tail) ++ ++L(large_memcpy_4x_end): ++ /* Store the last 4 * VEC. 
*/ ++ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) ++ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) ++ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) ++ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) ++ ++ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) ++ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) ++ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) ++ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) + VZEROUPPER_RETURN + #endif + END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-4.patch b/glibc-RHEL-15696-4.patch new file mode 100644 index 0000000..531c171 --- /dev/null +++ b/glibc-RHEL-15696-4.patch @@ -0,0 +1,151 @@ +From ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:29:58 -0800 +Subject: [PATCH] x86-64 memrchr: Properly handle the length parameter [BZ# + 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes memrchr for x32. Tested on x86-64 and x32. On x86-64, +libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/memrchr.S: Use RDX_LP for length. + * sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr. + * sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file. +--- + sysdeps/x86_64/memrchr.S | 4 +- + sysdeps/x86_64/multiarch/memrchr-avx2.S | 4 +- + sysdeps/x86_64/x32/Makefile | 3 +- + sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++ + 4 files changed, 63 insertions(+), 5 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S +index b8e3fa1d..dc82f8f7 100644 +--- a/sysdeps/x86_64/memrchr.S ++++ b/sysdeps/x86_64/memrchr.S +@@ -24,13 +24,13 @@ + ENTRY (__memrchr) + movd %esi, %xmm1 + +- sub $16, %rdx ++ sub $16, %RDX_LP + jbe L(length_less16) + + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + +- add %rdx, %rdi ++ add %RDX_LP, %RDI_LP + pshufd $0, %xmm1, %xmm1 + + movdqu (%rdi), %xmm0 +diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S +index b41a58bc..ce488dd9 100644 +--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S +@@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2) + vmovd %esi, %xmm0 + vpbroadcastb %xmm0, %ymm0 + +- subq $VEC_SIZE, %rdx ++ sub $VEC_SIZE, %RDX_LP + jbe L(last_vec_or_less) + +- addq %rdx, %rdi ++ add %RDX_LP, %RDI_LP + + /* Check the last VEC_SIZE bytes. 
*/ + vpcmpeqb (%rdi), %ymm0, %ymm1 +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 2fe1e5ac..e99dbd7c 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround + endif + + ifeq ($(subdir),string) +-tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy ++tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ ++ tst-size_t-memrchr + endif + + ifeq ($(subdir),wcsmbs) +diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c +new file mode 100644 +index 00000000..c83699c0 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c +@@ -0,0 +1,57 @@ ++/* Test memrchr with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define TEST_NAME "memrchr" ++#include "test-size_t.h" ++ ++IMPL (memchr, 1) ++ ++typedef void * (*proto_t) (const void *, int, size_t); ++ ++static void * ++__attribute__ ((noinline, noclone)) ++do_memrchr (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, (uintptr_t) b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t src = { { page_size }, buf2 }; ++ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ c.fn = impl->fn; ++ void * res = do_memrchr (src, c); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %p != NULL", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +-- +GitLab + diff --git a/glibc-RHEL-15696-40.patch b/glibc-RHEL-15696-40.patch new file mode 100644 index 0000000..7b7c07b --- /dev/null +++ b/glibc-RHEL-15696-40.patch @@ -0,0 +1,92 @@ +From 83c5b368226c34a2f0a5287df40fc290b2b34359 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 19 Apr 2021 10:45:07 -0700 +Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S +Content-type: text/plain; charset=UTF-8 + +Since strchr-avx2.S updated by + +commit 1f745ecc2109890886b161d4791e1406fdfc29b8 +Author: noah +Date: Wed Feb 3 00:38:59 2021 -0500 + + x86-64: Refactor and improve performance of strchr-avx2.S + +uses sarx: + +c4 e2 72 f7 c0 sarx %ecx,%eax,%eax + +for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and +ifunc-avx2.h. 
+--- + sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- + 2 files changed, 11 insertions(+), 5 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h +index e0f30e61..ef72b73f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h ++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h +@@ -30,11 +30,11 @@ IFUNC_SELECTOR (void) + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 695cdba6..85b8863a 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/strchr.c. */ + IFUNC_IMPL (i, name, strchr, + IFUNC_IMPL_ADD (array, i, strchr, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __strchr_avx2) + IFUNC_IMPL_ADD (array, i, strchr, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __strchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchr, +@@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/strchrnul.c. */ + IFUNC_IMPL (i, name, strchrnul, + IFUNC_IMPL_ADD (array, i, strchrnul, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __strchrnul_avx2) + IFUNC_IMPL_ADD (array, i, strchrnul, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __strchrnul_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchrnul, +@@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/wcschr.c. */ + IFUNC_IMPL (i, name, wcschr, + IFUNC_IMPL_ADD (array, i, wcschr, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __wcschr_avx2) + IFUNC_IMPL_ADD (array, i, wcschr, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __wcschr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcschr, +-- +GitLab + diff --git a/glibc-RHEL-15696-41.patch b/glibc-RHEL-15696-41.patch new file mode 100644 index 0000000..aa8fc69 --- /dev/null +++ b/glibc-RHEL-15696-41.patch @@ -0,0 +1,265 @@ +From f53790272ce7bdc5ecd14b45f65d0464d2a61a3a Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 19 Apr 2021 17:48:10 -0400 +Subject: [PATCH] x86: Optimize less_vec evex and avx512 + memset-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit adds optimized cased for less_vec memset case that +uses the avx512vl/avx512bw mask store avoiding the excessive +branches. test-memset and test-wmemset are passing. 
+ +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 40 ++++++++++----- + sysdeps/x86_64/multiarch/ifunc-memset.h | 6 ++- + .../multiarch/memset-avx512-unaligned-erms.S | 2 +- + .../multiarch/memset-evex-unaligned-erms.S | 2 +- + .../multiarch/memset-vec-unaligned-erms.S | 51 +++++++++++++++---- + 5 files changed, 74 insertions(+), 27 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 85b8863a..d59d65f8 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -204,19 +204,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __memset_chk_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX512F), +@@ -247,19 +251,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __memset_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + CPU_FEATURE_USABLE (AVX512F), +@@ -739,10 +747,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (RTM)), + __wmemset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, wmemset, +- CPU_FEATURE_USABLE (AVX512VL), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __wmemset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, +- CPU_FEATURE_USABLE (AVX512VL), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __wmemset_avx512_unaligned)) + + #ifdef SHARED +@@ -946,10 +958,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX2), + __wmemset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, +- CPU_FEATURE_USABLE (AVX512VL), ++ 
(CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __wmemset_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __wmemset_chk_avx512_unaligned)) + #endif + +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 19795938..100e3707 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h +@@ -54,7 +54,8 @@ IFUNC_SELECTOR (void) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (avx512_unaligned_erms); +@@ -68,7 +69,8 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (evex_unaligned_erms); +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 22e7b187..8ad842fc 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -19,6 +19,6 @@ + # define SECTION(p) p##.evex512 + # define MEMSET_SYMBOL(p,s) p##_avx512_##s + # define WMEMSET_SYMBOL(p,s) p##_avx512_##s +- ++# define USE_LESS_VEC_MASK_STORE 1 + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index ae0a4d6e..640f0929 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -19,6 +19,6 @@ + # define SECTION(p) p##.evex + # define MEMSET_SYMBOL(p,s) p##_evex_##s + # define WMEMSET_SYMBOL(p,s) p##_evex_##s +- ++# define USE_LESS_VEC_MASK_STORE 1 + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index bae5cba4..f877ac9d 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -63,6 +63,8 @@ + # endif + #endif + ++#define PAGE_SIZE 4096 ++ + #ifndef SECTION + # error SECTION is not defined! + #endif +@@ -213,11 +215,38 @@ L(loop): + cmpq %rcx, %rdx + jne L(loop) + VZEROUPPER_SHORT_RETURN ++ ++ .p2align 4 + L(less_vec): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! + # endif ++# ifdef USE_LESS_VEC_MASK_STORE ++ /* Clear high bits from edi. Only keeping bits relevant to page ++ cross check. Note that we are using rax which is set in ++ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. ++ */ ++ andl $(PAGE_SIZE - 1), %edi ++ /* Check if VEC_SIZE store cross page. Mask stores suffer serious ++ performance degradation when it has to fault supress. 
*/ ++ cmpl $(PAGE_SIZE - VEC_SIZE), %edi ++ ja L(cross_page) ++# if VEC_SIZE > 32 ++ movq $-1, %rcx ++ bzhiq %rdx, %rcx, %rcx ++ kmovq %rcx, %k1 ++# else ++ movl $-1, %ecx ++ bzhil %edx, %ecx, %ecx ++ kmovd %ecx, %k1 ++# endif ++ vmovdqu8 %VEC(0), (%rax) {%k1} ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(cross_page): ++# endif + # if VEC_SIZE > 32 + cmpb $32, %dl + jae L(between_32_63) +@@ -234,36 +263,36 @@ L(less_vec): + cmpb $1, %dl + ja L(between_2_3) + jb 1f +- movb %cl, (%rdi) ++ movb %cl, (%rax) + 1: + VZEROUPPER_RETURN + # if VEC_SIZE > 32 + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- VMOVU %YMM0, -32(%rdi,%rdx) +- VMOVU %YMM0, (%rdi) ++ VMOVU %YMM0, -32(%rax,%rdx) ++ VMOVU %YMM0, (%rax) + VZEROUPPER_RETURN + # endif + # if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ + L(between_16_31): +- VMOVU %XMM0, -16(%rdi,%rdx) +- VMOVU %XMM0, (%rdi) ++ VMOVU %XMM0, -16(%rax,%rdx) ++ VMOVU %XMM0, (%rax) + VZEROUPPER_RETURN + # endif + /* From 8 to 15. No branch when size == 8. */ + L(between_8_15): +- movq %rcx, -8(%rdi,%rdx) +- movq %rcx, (%rdi) ++ movq %rcx, -8(%rax,%rdx) ++ movq %rcx, (%rax) + VZEROUPPER_RETURN + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %ecx, -4(%rdi,%rdx) +- movl %ecx, (%rdi) ++ movl %ecx, -4(%rax,%rdx) ++ movl %ecx, (%rax) + VZEROUPPER_RETURN + L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ +- movw %cx, -2(%rdi,%rdx) +- movw %cx, (%rdi) ++ movw %cx, -2(%rax,%rdx) ++ movw %cx, (%rax) + VZEROUPPER_RETURN + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-42.patch b/glibc-RHEL-15696-42.patch new file mode 100644 index 0000000..e2ca245 --- /dev/null +++ b/glibc-RHEL-15696-42.patch @@ -0,0 +1,396 @@ +From ccabe7971f508709d034b63b8672f6f751a3d356 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 23 Apr 2021 15:56:24 -0400 +Subject: [PATCH] x86: Optimize strchr-avx2.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes strchr-avx2.S. The optimizations are all +small things such as save an ALU in the alignment process, saving a +few instructions in the loop return, saving some bytes in the main +loop, and increasing the ILP in the return cases. test-strchr, +test-strchrnul, test-wcschr, and test-wcschrnul are all passing. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strchr-avx2.S | 290 +++++++++++++++---------- + 1 file changed, 170 insertions(+), 120 deletions(-) + +Conflics: + sysdeps/x86_64/multiarch/strchr-avx2.S + (rearranged to account for branch changes) + +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index 919d256c..5884726b 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -49,133 +49,144 @@ + + .section SECTION(.text),"ax",@progbits + ENTRY (STRCHR) +- movl %edi, %ecx +-# ifndef USE_AS_STRCHRNUL +- xorl %edx, %edx +-# endif +- + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 ++ movl %edi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ VPBROADCAST %xmm0, %ymm0 + vpxor %xmm9, %xmm9, %xmm9 +- VPBROADCAST %xmm0, %ymm0 + + /* Check if we cross page boundary with one vector load. */ +- andl $(PAGE_SIZE - 1), %ecx +- cmpl $(PAGE_SIZE - VEC_SIZE), %ecx +- ja L(cross_page_boundary) ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null byte. 
*/ + vmovdqu (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax +- jz L(more_vecs) ++ jz L(aligned_more) + tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) ++# endif + addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ /* .p2align 5 helps keep performance more consistent if ENTRY() ++ alignment % 32 was either 16 or 0. As well this makes the ++ alignment % 32 of the loop_4x_vec fixed which makes tuning it ++ easier. */ ++ .p2align 5 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 3 + 1), %rdi + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) + # endif +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN +- +- .p2align 4 +-L(more_vecs): +- /* Align data for aligned loads in the loop. */ +- andq $-VEC_SIZE, %rdi +-L(aligned_more): +- +- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time +- since data is only aligned to VEC_SIZE. */ +- vmovdqa VEC_SIZE(%rdi), %ymm8 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- +- vmovdqa VEC_SIZE(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jz L(prep_loop_4x) ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +- tzcntl %eax, %eax +- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++L(zero): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + # endif +- VZEROUPPER +- ret ++ + + .p2align 4 +-L(first_vec_x0): ++L(first_vec_x1): + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +- addq %rdi, %rax ++ incq %rdi + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) + # endif ++ addq %rdi, %rax + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x1): ++L(first_vec_x2): + tzcntl %eax, %eax +- leaq VEC_SIZE(%rdi, %rax), %rax ++ addq $(VEC_SIZE + 1), %rdi + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) + # endif ++ addq %rdi, %rax + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x2): ++L(first_vec_x3): + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++ addq $(VEC_SIZE * 2 + 1), %rdi + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) + # endif ++ addq %rdi, %rax + VZEROUPPER_RETURN + +-L(prep_loop_4x): +- /* Align data to 4 * VEC_SIZE. */ +- andq $-(VEC_SIZE * 4), %rdi ++ .p2align 4 ++L(aligned_more): ++ /* Align data to VEC_SIZE - 1. 
This is the same number of ++ instructions as using andq -VEC_SIZE but saves 4 bytes of code ++ on x4 check. */ ++ orq $(VEC_SIZE - 1), %rdi ++L(cross_page_continue): ++ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ vmovdqa 1(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) + ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) ++ /* Align data to VEC_SIZE * 4 - 1. */ ++ addq $(VEC_SIZE * 4 + 1), %rdi ++ andq $-(VEC_SIZE * 4), %rdi + .p2align 4 + L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ +- vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5 +- vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6 +- vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7 +- vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8 ++ vmovdqa (%rdi), %ymm5 ++ vmovdqa (VEC_SIZE)(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 ++ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 + + /* Leaves only CHARS matching esi as 0. */ + vpxor %ymm5, %ymm0, %ymm1 +@@ -191,63 +202,102 @@ L(loop_4x_vec): + VPMINU %ymm1, %ymm2, %ymm5 + VPMINU %ymm3, %ymm4, %ymm6 + +- VPMINU %ymm5, %ymm6, %ymm5 ++ VPMINU %ymm5, %ymm6, %ymm6 + +- VPCMPEQ %ymm5, %ymm9, %ymm5 +- vpmovmskb %ymm5, %eax ++ VPCMPEQ %ymm6, %ymm9, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ subq $-(VEC_SIZE * 4), %rdi ++ testl %ecx, %ecx ++ jz L(loop_4x_vec) + +- addq $(VEC_SIZE * 4), %rdi +- testl %eax, %eax +- jz L(loop_4x_vec) + +- VPCMPEQ %ymm1, %ymm9, %ymm1 ++ VPCMPEQ %ymm1, %ymm9, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax +- jnz L(first_vec_x0) ++ jnz L(last_vec_x0) ++ + +- VPCMPEQ %ymm2, %ymm9, %ymm2 ++ VPCMPEQ %ymm5, %ymm9, %ymm2 + vpmovmskb %ymm2, %eax + testl %eax, %eax +- jnz L(first_vec_x1) ++ jnz L(last_vec_x1) ++ ++ VPCMPEQ %ymm3, %ymm9, %ymm3 ++ vpmovmskb %ymm3, %eax ++ /* rcx has combined result from all 4 VEC. It will only be used ++ if the first 3 other VEC all did not contain a match. */ ++ salq $32, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++ subq $(VEC_SIZE * 2), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero_end) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ ++ .p2align 4 ++L(last_vec_x0): ++ tzcntl %eax, %eax ++ addq $-(VEC_SIZE * 4), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. 
*/ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero_end) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +- VPCMPEQ %ymm3, %ymm9, %ymm3 +- VPCMPEQ %ymm4, %ymm9, %ymm4 +- vpmovmskb %ymm3, %ecx +- vpmovmskb %ymm4, %eax +- salq $32, %rax +- orq %rcx, %rax +- tzcntq %rax, %rax +- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++L(zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + # endif +- VZEROUPPER +- ret ++ ++ .p2align 4 ++L(last_vec_x1): ++ tzcntl %eax, %eax ++ subq $(VEC_SIZE * 3), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero_end) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ + + /* Cold case for crossing page with first load. */ + .p2align 4 + L(cross_page_boundary): +- andq $-VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- +- vmovdqa (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 ++ movq %rdi, %rdx ++ /* Align rdi to VEC_SIZE - 1. */ ++ orq $(VEC_SIZE - 1), %rdi ++ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax +- /* Remove the leading bits. */ +- sarxl %ecx, %eax, %eax ++ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT ++ so no need to manually mod edx. */ ++ sarxl %edx, %eax, %eax + testl %eax, %eax +- jz L(aligned_more) ++ jz L(cross_page_continue) + tzcntl %eax, %eax +- addq %rcx, %rdi +- addq %rdi, %rax + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ xorl %ecx, %ecx ++ /* Found CHAR or the null byte. */ ++ cmp (%rdx, %rax), %CHAR_REG ++ leaq (%rdx, %rax), %rax ++ cmovne %rcx, %rax ++# else ++ addq %rdx, %rax + # endif +- VZEROUPPER_RETURN ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + END (STRCHR) + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-43.patch b/glibc-RHEL-15696-43.patch new file mode 100644 index 0000000..9f76b11 --- /dev/null +++ b/glibc-RHEL-15696-43.patch @@ -0,0 +1,532 @@ +From 7f3e7c262cab4e2401e4331a6ef29c428de02044 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 23 Apr 2021 15:56:25 -0400 +Subject: [PATCH] x86: Optimize strchr-evex.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes strchr-evex.S. The optimizations are +mostly small things such as save an ALU in the alignment process, +saving a few instructions in the loop return. The one significant +change is saving 2 instructions in the 4x loop. test-strchr, +test-strchrnul, test-wcschr, and test-wcschrnul are all passing. 
+ +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++----------- + 1 file changed, 218 insertions(+), 174 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S +index ddc86a70..7f9d4ee4 100644 +--- a/sysdeps/x86_64/multiarch/strchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strchr-evex.S +@@ -32,13 +32,15 @@ + # define VPCMP vpcmpd + # define VPMINU vpminud + # define CHAR_REG esi +-# define SHIFT_REG r8d ++# define SHIFT_REG ecx ++# define CHAR_SIZE 4 + # else + # define VPBROADCAST vpbroadcastb + # define VPCMP vpcmpb + # define VPMINU vpminub + # define CHAR_REG sil +-# define SHIFT_REG ecx ++# define SHIFT_REG edx ++# define CHAR_SIZE 1 + # endif + + # define XMMZERO xmm16 +@@ -56,23 +58,20 @@ + + # define VEC_SIZE 32 + # define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section .text.evex,"ax",@progbits + ENTRY (STRCHR) +- movl %edi, %ecx +-# ifndef USE_AS_STRCHRNUL +- xorl %edx, %edx +-# endif +- + /* Broadcast CHAR to YMM0. */ +- VPBROADCAST %esi, %YMM0 +- ++ VPBROADCAST %esi, %YMM0 ++ movl %edi, %eax ++ andl $(PAGE_SIZE - 1), %eax + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + +- /* Check if we cross page boundary with one vector load. */ +- andl $(PAGE_SIZE - 1), %ecx +- cmpl $(PAGE_SIZE - VEC_SIZE), %ecx +- ja L(cross_page_boundary) ++ /* Check if we cross page boundary with one vector load. ++ Otherwise it is safe to use an unaligned load. */ ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null bytes. */ +@@ -83,251 +82,296 @@ ENTRY (STRCHR) + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 +- ktestd %k0, %k0 +- jz L(more_vecs) + kmovd %k0, %eax ++ testl %eax, %eax ++ jz L(aligned_more) + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ + # ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (%rdi, %rax, 4), %rax ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. ++ */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax + # else + addq %rdi, %rax + # endif + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (%rax), %CHAR_REG ++ jne L(zero) + # endif + ret + +- .p2align 4 +-L(more_vecs): +- /* Align data for aligned loads in the loop. */ +- andq $-VEC_SIZE, %rdi +-L(aligned_more): +- +- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time +- since data is only aligned to VEC_SIZE. */ +- VMOVA VEC_SIZE(%rdi), %YMM1 +- addq $VEC_SIZE, %rdi +- +- /* Leaves only CHARS matching esi as 0. */ +- vpxorq %YMM1, %YMM0, %YMM2 +- VPMINU %YMM2, %YMM1, %YMM2 +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- +- VMOVA VEC_SIZE(%rdi), %YMM1 +- /* Leaves only CHARS matching esi as 0. */ +- vpxorq %YMM1, %YMM0, %YMM2 +- VPMINU %YMM2, %YMM1, %YMM2 +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- +- VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 +- /* Leaves only CHARS matching esi as 0. */ +- vpxorq %YMM1, %YMM0, %YMM2 +- VPMINU %YMM2, %YMM1, %YMM2 +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. 
*/ +- VPCMP $0, %YMMZERO, %YMM2, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- +- VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 +- /* Leaves only CHARS matching esi as 0. */ +- vpxorq %YMM1, %YMM0, %YMM2 +- VPMINU %YMM2, %YMM1, %YMM2 +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 +- ktestd %k0, %k0 +- jz L(prep_loop_4x) +- +- kmovd %k0, %eax ++ /* .p2align 5 helps keep performance more consistent if ENTRY() ++ alignment % 32 was either 16 or 0. As well this makes the ++ alignment % 32 of the loop_4x_vec fixed which makes tuning it ++ easier. */ ++ .p2align 5 ++L(first_vec_x3): + tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax +-# else +- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax ++ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero) + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax +-# endif ++L(zero): ++ xorl %eax, %eax + ret ++# endif + + .p2align 4 +-L(first_vec_x0): ++L(first_vec_x4): ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if first match was CHAR (k0) or null (k1). */ ++ kmovd %k0, %eax + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (%rdi, %rax, 4), %rax ++ kmovd %k1, %ecx ++ /* bzhil will not be 0 if first match was null. */ ++ bzhil %eax, %ecx, %ecx ++ jne L(zero) + # else +- addq %rdi, %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Combine CHAR and null matches. */ ++ kord %k0, %k1, %k0 ++ kmovd %k0, %eax ++ tzcntl %eax, %eax + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 + L(first_vec_x1): + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq VEC_SIZE(%rdi, %rax), %rax +-# endif + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero) ++ + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 + L(first_vec_x2): ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if first match was CHAR (k0) or null (k1). */ ++ kmovd %k0, %eax + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax ++ kmovd %k1, %ecx ++ /* bzhil will not be 0 if first match was null. */ ++ bzhil %eax, %ecx, %ecx ++ jne L(zero) + # else +- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Combine CHAR and null matches. */ ++ kord %k0, %k1, %k0 ++ kmovd %k0, %eax ++ tzcntl %eax, %eax + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. 
*/ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +-L(prep_loop_4x): +- /* Align data to 4 * VEC_SIZE. */ ++ .p2align 4 ++L(aligned_more): ++ /* Align data to VEC_SIZE. */ ++ andq $-VEC_SIZE, %rdi ++L(cross_page_continue): ++ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since ++ data is only aligned to VEC_SIZE. Use two alternating methods ++ for checking VEC to balance latency and port contention. */ ++ ++ /* This method has higher latency but has better port ++ distribution. */ ++ VMOVA (VEC_SIZE)(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ /* This method has higher latency but has better port ++ distribution. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 ++ /* Each bit in K0 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMM1, %YMM0, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMM1, %YMMZERO, %k1 ++ kortestd %k0, %k1 ++ jnz L(first_vec_x2) ++ ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 ++ /* Each bit in K0 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMM1, %YMM0, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMM1, %YMMZERO, %k1 ++ kortestd %k0, %k1 ++ jnz L(first_vec_x4) ++ ++ /* Align data to VEC_SIZE * 4 for the loop. */ ++ addq $VEC_SIZE, %rdi + andq $-(VEC_SIZE * 4), %rdi + + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ ++ /* Check 4x VEC at a time. No penalty to imm32 offset with evex ++ encoding. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 + VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 + VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 + VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 + +- /* Leaves only CHARS matching esi as 0. */ ++ /* For YMM1 and YMM3 use xor to set the CHARs matching esi to ++ zero. */ + vpxorq %YMM1, %YMM0, %YMM5 +- vpxorq %YMM2, %YMM0, %YMM6 ++ /* For YMM2 and YMM4 cmp not equals to CHAR and store result in ++ k register. Its possible to save either 1 or 2 instructions ++ using cmp no equals method for either YMM1 or YMM1 and YMM3 ++ respectively but bottleneck on p5 makes it not worth it. */ ++ VPCMP $4, %YMM0, %YMM2, %k2 + vpxorq %YMM3, %YMM0, %YMM7 +- vpxorq %YMM4, %YMM0, %YMM8 +- +- VPMINU %YMM5, %YMM1, %YMM5 +- VPMINU %YMM6, %YMM2, %YMM6 +- VPMINU %YMM7, %YMM3, %YMM7 +- VPMINU %YMM8, %YMM4, %YMM8 +- +- VPMINU %YMM5, %YMM6, %YMM1 +- VPMINU %YMM7, %YMM8, %YMM2 +- +- VPMINU %YMM1, %YMM2, %YMM1 +- +- /* Each bit in K0 represents a CHAR or a null byte. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- +- addq $(VEC_SIZE * 4), %rdi +- +- ktestd %k0, %k0 ++ VPCMP $4, %YMM0, %YMM4, %k4 ++ ++ /* Use min to select all zeros from either xor or end of string). ++ */ ++ VPMINU %YMM1, %YMM5, %YMM1 ++ VPMINU %YMM3, %YMM7, %YMM3 ++ ++ /* Use min + zeromask to select for zeros. Since k2 and k4 will ++ have 0 as positions that matched with CHAR which will set ++ zero in the corresponding destination bytes in YMM2 / YMM4. 
++ */ ++ VPMINU %YMM1, %YMM2, %YMM2{%k2}{z} ++ VPMINU %YMM3, %YMM4, %YMM4 ++ VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} ++ ++ VPCMP $0, %YMMZERO, %YMM4, %k1 ++ kmovd %k1, %ecx ++ subq $-(VEC_SIZE * 4), %rdi ++ testl %ecx, %ecx + jz L(loop_4x_vec) + +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM5, %k0 ++ VPCMP $0, %YMMZERO, %YMM1, %k0 + kmovd %k0, %eax + testl %eax, %eax +- jnz L(first_vec_x0) ++ jnz L(last_vec_x1) + +- /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ +- VPCMP $0, %YMMZERO, %YMM6, %k1 +- kmovd %k1, %eax ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax + testl %eax, %eax +- jnz L(first_vec_x1) +- +- /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ +- VPCMP $0, %YMMZERO, %YMM7, %k2 +- /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ +- VPCMP $0, %YMMZERO, %YMM8, %k3 ++ jnz L(last_vec_x2) + ++ VPCMP $0, %YMMZERO, %YMM3, %k0 ++ kmovd %k0, %eax ++ /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ + # ifdef USE_AS_WCSCHR +- /* NB: Each bit in K2/K3 represents 4-byte element. */ +- kshiftlw $8, %k3, %k1 ++ sall $8, %ecx ++ orl %ecx, %eax ++ tzcntl %eax, %eax + # else +- kshiftlq $32, %k3, %k1 ++ salq $32, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax + # endif ++# ifndef USE_AS_STRCHRNUL ++ /* Check if match was CHAR or null. */ ++ cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero_end) ++# endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korq %k1, %k2, %k1 +- kmovq %k1, %rax ++# ifndef USE_AS_STRCHRNUL ++L(zero_end): ++ xorl %eax, %eax ++ ret ++# endif + +- tzcntq %rax, %rax +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax +-# else +- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++ .p2align 4 ++L(last_vec_x1): ++ tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Check if match was null. */ ++ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero_end) + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x2): ++ tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Check if match was null. */ ++ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero_end) + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + + /* Cold case for crossing page with first load. */ + .p2align 4 + L(cross_page_boundary): ++ movq %rdi, %rdx ++ /* Align rdi. */ + andq $-VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- + VMOVA (%rdi), %YMM1 +- + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax +- testl %eax, %eax +- ++ /* Remove the leading bits. */ + # ifdef USE_AS_WCSCHR ++ movl %edx, %SHIFT_REG + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++ andl $(CHAR_PER_VEC - 1), %SHIFT_REG + # endif +- +- /* Remove the leading bits. */ + sarxl %SHIFT_REG, %eax, %eax ++ /* If eax is zero continue. 
*/ + testl %eax, %eax +- +- jz L(aligned_more) ++ jz L(cross_page_continue) + tzcntl %eax, %eax +- addq %rcx, %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if match was CHAR or null. */ ++ cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero_end) ++# endif + # ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (%rdi, %rax, 4), %rax ++ /* NB: Multiply wchar_t count by 4 to get the number of ++ bytes. */ ++ leaq (%rdx, %rax, CHAR_SIZE), %rax + # else +- addq %rdi, %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ addq %rdx, %rax + # endif + ret + +-- +GitLab + diff --git a/glibc-RHEL-15696-44.patch b/glibc-RHEL-15696-44.patch new file mode 100644 index 0000000..52fec88 --- /dev/null +++ b/glibc-RHEL-15696-44.patch @@ -0,0 +1,536 @@ +From 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 4 May 2021 19:02:40 -0400 +Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM +Content-type: text/plain; charset=UTF-8 + +No bug. + +This commit adds a new implementation for EVEX memchr that is not safe +for RTM because it uses vzeroupper. The benefit is that by using +ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is +faster than the RTM safe version which cannot use vpcmpeq because +there is no EVEX encoding for the instruction. All parts of the +implementation aside from the 4x loop are the same for the two +versions and the optimization is only relevant for large sizes. + +Tigerlake: +size , algn , Pos , Cur T , New T , Win , Dif +512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16 +512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21 +2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2 +2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06 +2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4 +2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <-- + +Icelake: +size , algn , Pos , Cur T , New T , Win , Dif +512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3 +512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36 +2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1 +2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15 +2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54 +2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <-- + +test-memchr, test-wmemchr, and test-rawmemchr are all passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/Makefile | 7 +- + sysdeps/x86_64/multiarch/ifunc-evex.h | 55 ++++++ + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 ++ + sysdeps/x86_64/multiarch/memchr-evex-rtm.S | 8 + + sysdeps/x86_64/multiarch/memchr-evex.S | 161 ++++++++++++++---- + sysdeps/x86_64/multiarch/memchr.c | 2 +- + sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 3 + + sysdeps/x86_64/multiarch/rawmemchr.c | 2 +- + sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S | 3 + + sysdeps/x86_64/multiarch/wmemchr.c | 2 +- + 10 files changed, 217 insertions(+), 41 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h + create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 65fde4eb..26be4095 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + strncmp-evex \ + strncpy-evex \ + strnlen-evex \ +- strrchr-evex ++ strrchr-evex \ ++ memchr-evex-rtm \ ++ rawmemchr-evex-rtm + CFLAGS-varshift.c += -msse4 + CFLAGS-strcspn-c.c += -msse4 + CFLAGS-strpbrk-c.c += -msse4 +@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcsnlen-evex \ + wcsrchr-evex \ + wmemchr-evex \ +- wmemcmp-evex-movbe ++ wmemcmp-evex-movbe \ ++ wmemchr-evex-rtm + endif + + ifeq ($(subdir),debug) +diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h +new file mode 100644 +index 00000000..fc391edb +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/ifunc-evex.h +@@ -0,0 +1,55 @@ ++/* Common definition for ifunc selection optimized with EVEX. ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2017-2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden; ++ ++ ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++ const struct cpu_features* cpu_features = __get_cpu_features (); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) ++ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (evex_rtm); ++ ++ return OPTIMIZE (evex); ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } ++ ++ return OPTIMIZE (sse2); ++} +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index d59d65f8..ac097e8d 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __memchr_evex) ++ IFUNC_IMPL_ADD (array, i, memchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __memchr_evex_rtm) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/memcmp.c. */ +@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __rawmemchr_evex) ++ IFUNC_IMPL_ADD (array, i, rawmemchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __rawmemchr_evex_rtm) + IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/strlen.c. */ +@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wmemchr_evex) ++ IFUNC_IMPL_ADD (array, i, wmemchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wmemchr_evex_rtm) + IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ +diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S +new file mode 100644 +index 00000000..19871882 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S +@@ -0,0 +1,8 @@ ++#ifndef MEMCHR ++# define MEMCHR __memchr_evex_rtm ++#endif ++ ++#define USE_IN_RTM 1 ++#define SECTION(p) p##.evex.rtm ++ ++#include "memchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S +index f3fdad4f..4d0ed6d1 100644 +--- a/sysdeps/x86_64/multiarch/memchr-evex.S ++++ b/sysdeps/x86_64/multiarch/memchr-evex.S +@@ -38,10 +38,32 @@ + # define CHAR_SIZE 1 + # endif + ++ /* In the 4x loop the RTM and non-RTM versions have data pointer ++ off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater. ++ This is represented by BASE_OFFSET. 
As well because the RTM ++ version uses vpcmp which stores a bit per element compared where ++ the non-RTM version uses vpcmpeq which stores a bit per byte ++ compared RET_SCALE of CHAR_SIZE is only relevant for the RTM ++ version. */ ++# ifdef USE_IN_RTM ++# define VZEROUPPER ++# define BASE_OFFSET (VEC_SIZE * 4) ++# define RET_SCALE CHAR_SIZE ++# else ++# define VZEROUPPER vzeroupper ++# define BASE_OFFSET 0 ++# define RET_SCALE 1 ++# endif ++ ++ /* In the return from 4x loop memchr and rawmemchr versions have ++ data pointers off by VEC_SIZE * 4 with memchr version being ++ VEC_SIZE * 4 greater. */ + # ifdef USE_AS_RAWMEMCHR ++# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4)) + # define RAW_PTR_REG rcx + # define ALGN_PTR_REG rdi + # else ++# define RET_OFFSET BASE_OFFSET + # define RAW_PTR_REG rdi + # define ALGN_PTR_REG rcx + # endif +@@ -57,11 +79,15 @@ + # define YMM5 ymm21 + # define YMM6 ymm22 + ++# ifndef SECTION ++# define SECTION(p) p##.evex ++# endif ++ + # define VEC_SIZE 32 + # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + # define PAGE_SIZE 4096 + +- .section .text.evex,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (MEMCHR) + # ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ +@@ -237,14 +263,15 @@ L(cross_page_continue): + /* Check if at last CHAR_PER_VEC * 4 length. */ + subq $(CHAR_PER_VEC * 4), %rdx + jbe L(last_4x_vec_or_less_cmpeq) +- addq $VEC_SIZE, %rdi ++ /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */ ++ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi + + /* Align data to VEC_SIZE * 4 for the loop and readjust length. + */ + # ifdef USE_AS_WMEMCHR + movl %edi, %ecx + andq $-(4 * VEC_SIZE), %rdi +- andl $(VEC_SIZE * 4 - 1), %ecx ++ subl %edi, %ecx + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx + addq %rcx, %rdx +@@ -254,15 +281,28 @@ L(cross_page_continue): + subq %rdi, %rdx + # endif + # else +- addq $VEC_SIZE, %rdi ++ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi + andq $-(4 * VEC_SIZE), %rdi + # endif +- ++# ifdef USE_IN_RTM + vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++# else ++ /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not ++ encodable with EVEX registers (ymm16-ymm31). */ ++ vmovdqa64 %YMMMATCH, %ymm0 ++# endif + + /* Compare 4 * VEC at a time forward. */ + .p2align 4 + L(loop_4x_vec): ++ /* Two versions of the loop. One that does not require ++ vzeroupper by not using ymm0-ymm15 and another does that require ++ vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15 ++ is used at all is because there is no EVEX encoding vpcmpeq and ++ with vpcmpeq this loop can be performed more efficiently. The ++ non-vzeroupper version is safe for RTM while the vzeroupper ++ version should be prefered if RTM are not supported. */ ++# ifdef USE_IN_RTM + /* It would be possible to save some instructions using 4x VPCMP + but bottleneck on port 5 makes it not woth it. */ + VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 +@@ -273,12 +313,55 @@ L(loop_4x_vec): + /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ + VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} + VPCMP $0, %YMM3, %YMMZERO, %k2 ++# else ++ /* Since vptern can only take 3x vectors fastest to do 1 vec ++ seperately with EVEX vpcmp. */ ++# ifdef USE_AS_WMEMCHR ++ /* vptern can only accept masks for epi32/epi64 so can only save ++ instruction using not equals mask on vptern with wmemchr. 
*/ ++ VPCMP $4, (%rdi), %YMMMATCH, %k1 ++# else ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++# endif ++ /* Compare 3x with vpcmpeq and or them all together with vptern. ++ */ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 ++# ifdef USE_AS_WMEMCHR ++ /* This takes the not of or between ymm2, ymm3, ymm4 as well as ++ combines result from VEC0 with zero mask. */ ++ vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z} ++ vpmovmskb %ymm4, %ecx ++# else ++ /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. */ ++ vpternlogd $254, %ymm2, %ymm3, %ymm4 ++ vpmovmskb %ymm4, %ecx ++ kmovd %k1, %eax ++# endif ++# endif ++ + # ifdef USE_AS_RAWMEMCHR + subq $-(VEC_SIZE * 4), %rdi ++# endif ++# ifdef USE_IN_RTM + kortestd %k2, %k3 ++# else ++# ifdef USE_AS_WMEMCHR ++ /* ecx contains not of matches. All 1s means no matches. incl will ++ overflow and set zeroflag if that is the case. */ ++ incl %ecx ++# else ++ /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding ++ to ecx is not an issue because if eax is non-zero it will be ++ used for returning the match. If it is zero the add does ++ nothing. */ ++ addq %rax, %rcx ++# endif ++# endif ++# ifdef USE_AS_RAWMEMCHR + jz L(loop_4x_vec) + # else +- kortestd %k2, %k3 + jnz L(loop_4x_vec_end) + + subq $-(VEC_SIZE * 4), %rdi +@@ -288,10 +371,11 @@ L(loop_4x_vec): + + /* Fall through into less than 4 remaining vectors of length case. + */ +- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 ++ VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0 ++ addq $(BASE_OFFSET - VEC_SIZE), %rdi + kmovd %k0, %eax +- addq $(VEC_SIZE * 3), %rdi +- .p2align 4 ++ VZEROUPPER ++ + L(last_4x_vec_or_less): + /* Check if first VEC contained match. */ + testl %eax, %eax +@@ -338,73 +422,78 @@ L(loop_4x_vec_end): + /* rawmemchr will fall through into this if match was found in + loop. */ + ++# if defined USE_IN_RTM || defined USE_AS_WMEMCHR + /* k1 has not of matches with VEC1. */ + kmovd %k1, %eax +-# ifdef USE_AS_WMEMCHR ++# ifdef USE_AS_WMEMCHR + subl $((1 << CHAR_PER_VEC) - 1), %eax +-# else ++# else + incl %eax ++# endif ++# else ++ /* eax already has matches for VEC1. */ ++ testl %eax, %eax + # endif + jnz L(last_vec_x1_return) + ++# ifdef USE_IN_RTM + VPCMP $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %eax ++# else ++ vpmovmskb %ymm2, %eax ++# endif + testl %eax, %eax + jnz L(last_vec_x2_return) + ++# ifdef USE_IN_RTM + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x3_return) + + kmovd %k3, %eax + tzcntl %eax, %eax +-# ifdef USE_AS_RAWMEMCHR +- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax + # else +- leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax ++ vpmovmskb %ymm3, %eax ++ /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). */ ++ salq $VEC_SIZE, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax ++ VZEROUPPER + # endif + ret + + .p2align 4 + L(last_vec_x1_return): + tzcntl %eax, %eax +-# ifdef USE_AS_RAWMEMCHR +-# ifdef USE_AS_WMEMCHR ++# if defined USE_AS_WMEMCHR || RET_OFFSET != 0 + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq (%rdi, %rax, CHAR_SIZE), %rax +-# else +- addq %rdi, %rax +-# endif ++ leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax + # else +- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. 
*/ +- leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax ++ addq %rdi, %rax + # endif ++ VZEROUPPER + ret + + .p2align 4 + L(last_vec_x2_return): + tzcntl %eax, %eax +-# ifdef USE_AS_RAWMEMCHR +- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax +-# else +- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax +-# endif ++ /* NB: Multiply bytes by RET_SCALE to get the wchar_t count ++ if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and ++ USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */ ++ leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax ++ VZEROUPPER + ret + ++# ifdef USE_IN_RTM + .p2align 4 + L(last_vec_x3_return): + tzcntl %eax, %eax +-# ifdef USE_AS_RAWMEMCHR +- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax +-# else + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax +-# endif ++ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax + ret +- ++# endif + + # ifndef USE_AS_RAWMEMCHR + L(last_4x_vec_or_less_cmpeq): +diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c +index 016f5784..f28aea77 100644 +--- a/sysdeps/x86_64/multiarch/memchr.c ++++ b/sysdeps/x86_64/multiarch/memchr.c +@@ -24,7 +24,7 @@ + # undef memchr + + # define SYMBOL_NAME memchr +-# include "ifunc-avx2.h" ++# include "ifunc-evex.h" + + libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ()); + strong_alias (memchr, __memchr) +diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S +new file mode 100644 +index 00000000..deda1ca3 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S +@@ -0,0 +1,3 @@ ++#define MEMCHR __rawmemchr_evex_rtm ++#define USE_AS_RAWMEMCHR 1 ++#include "memchr-evex-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c +index 8a0bc313..1f764f35 100644 +--- a/sysdeps/x86_64/multiarch/rawmemchr.c ++++ b/sysdeps/x86_64/multiarch/rawmemchr.c +@@ -26,7 +26,7 @@ + # undef __rawmemchr + + # define SYMBOL_NAME rawmemchr +-# include "ifunc-avx2.h" ++# include "ifunc-evex.h" + + libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr, + IFUNC_SELECTOR ()); +diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S +new file mode 100644 +index 00000000..a346cd35 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S +@@ -0,0 +1,3 @@ ++#define MEMCHR __wmemchr_evex_rtm ++#define USE_AS_WMEMCHR 1 ++#include "memchr-evex-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c +index 6d833702..f9c91915 100644 +--- a/sysdeps/x86_64/multiarch/wmemchr.c ++++ b/sysdeps/x86_64/multiarch/wmemchr.c +@@ -26,7 +26,7 @@ + # undef __wmemchr + + # define SYMBOL_NAME wmemchr +-# include "ifunc-avx2.h" ++# include "ifunc-evex.h" + + libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ()); + weak_alias (__wmemchr, wmemchr) +-- +GitLab + diff --git a/glibc-RHEL-15696-45.patch b/glibc-RHEL-15696-45.patch new file mode 100644 index 0000000..380217e --- /dev/null +++ b/glibc-RHEL-15696-45.patch @@ -0,0 +1,873 @@ +From 16d12015c57701b08d7bbed6ec536641bcafb428 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 17 May 2021 13:56:52 -0400 +Subject: [PATCH] x86: Optimize memcmp-avx2-movbe.S 
+Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes memcmp-avx2.S. The optimizations include +adding a new vec compare path for small sizes, reorganizing the entry +control flow, and removing some unnecissary ALU instructions from the +main loop. test-memcmp and test-wmemcmp are both passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 + + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 1 + + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 676 +++++++++++-------- + 3 files changed, 402 insertions(+), 281 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index ac097e8d..8be0d78a 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, memcmp, + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (RTM)), + __memcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_evex_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), +@@ -732,16 +735,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, wmemcmp, + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (RTM)), + __wmemcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_evex_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 8043c635..690dffe8 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -33,6 +33,7 @@ IFUNC_SELECTOR (void) + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURE_USABLE_P (cpu_features, MOVBE) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index 9d5c9c72..16fc673e 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -19,17 +19,23 @@ + #if IS_IN (libc) + + /* memcmp/wmemcmp is implemented as: +- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap +- to avoid branches. +- 2. Use overlapping compare to avoid branch. +- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 +- bytes for wmemcmp. +- 4. If size is 8 * VEC_SIZE or less, unroll the loop. +- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory ++ 1. 
Use ymm vector compares when possible. The only case where ++ vector compares is not possible for when size < VEC_SIZE ++ and loading from either s1 or s2 would cause a page cross. ++ 2. For size from 2 to 7 bytes on page cross, load as big endian ++ with movbe and bswap to avoid branches. ++ 3. Use xmm vector compare when size >= 4 bytes for memcmp or ++ size >= 8 bytes for wmemcmp. ++ 4. Optimistically compare up to first 4 * VEC_SIZE one at a ++ to check for early mismatches. Only do this if its guranteed the ++ work is not wasted. ++ 5. If size is 8 * VEC_SIZE or less, unroll the loop. ++ 6. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. +- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. +- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. +- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ ++ 7. Use 2 vector compares when size is 2 * VEC_SIZE or less. ++ 8. Use 4 vector compares when size is 4 * VEC_SIZE or less. ++ 9. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ ++ + + # include + +@@ -38,8 +44,10 @@ + # endif + + # ifdef USE_AS_WMEMCMP ++# define CHAR_SIZE 4 + # define VPCMPEQ vpcmpeqd + # else ++# define CHAR_SIZE 1 + # define VPCMPEQ vpcmpeqb + # endif + +@@ -52,7 +60,7 @@ + # endif + + # define VEC_SIZE 32 +-# define VEC_MASK ((1 << VEC_SIZE) - 1) ++# define PAGE_SIZE 4096 + + /* Warning! + wmemcmp has to use SIGNED comparison for elements. +@@ -71,136 +79,359 @@ ENTRY (MEMCMP) + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* NB: eax must be destination register if going to ++ L(return_vec_[0,2]). For L(return_vec_3 destination register ++ must be ecx. */ ++ incl %eax ++ jnz L(return_vec_0) + + cmpq $(VEC_SIZE * 2), %rdx +- jbe L(last_vec) +- +- VPCMPEQ %ymm0, %ymm0, %ymm0 +- /* More than 2 * VEC. */ +- cmpq $(VEC_SIZE * 8), %rdx +- ja L(more_8x_vec) +- cmpq $(VEC_SIZE * 4), %rdx +- jb L(last_4x_vec) +- +- /* From 4 * VEC to 8 * VEC, inclusively. */ +- vmovdqu (%rsi), %ymm1 +- VPCMPEQ (%rdi), %ymm1, %ymm1 ++ jbe L(last_1x_vec) + ++ /* Check second VEC no matter what. */ + vmovdqu VEC_SIZE(%rsi), %ymm2 +- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ /* If all 4 VEC where equal eax will be all 1s so incl will ++ overflow and set zero flag. */ ++ incl %eax ++ jnz L(return_vec_1) + +- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ /* Less than 4 * VEC. */ ++ cmpq $(VEC_SIZE * 4), %rdx ++ jbe L(last_2x_vec) + ++ /* Check third and fourth VEC no matter what. */ ++ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ vpmovmskb %ymm3, %eax ++ incl %eax ++ jnz L(return_vec_2) + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ vpmovmskb %ymm4, %ecx ++ incl %ecx ++ jnz L(return_vec_3) + +- vpand %ymm1, %ymm2, %ymm5 +- vpand %ymm3, %ymm4, %ymm6 +- vpand %ymm5, %ymm6, %ymm5 ++ /* Go to 4x VEC loop. */ ++ cmpq $(VEC_SIZE * 8), %rdx ++ ja L(more_8x_vec) + +- vptest %ymm0, %ymm5 +- jnc L(4x_vec_end) ++ /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any ++ branches. */ + ++ /* Load first two VEC from s2 before adjusting addresses. 
*/ ++ vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1 ++ vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2 + leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi +- vmovdqu (%rsi), %ymm1 +- VPCMPEQ (%rdi), %ymm1, %ymm1 + +- vmovdqu VEC_SIZE(%rsi), %ymm2 +- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 +- vpand %ymm2, %ymm1, %ymm5 ++ /* Wait to load from s1 until addressed adjust due to ++ unlamination of microfusion with complex address mode. */ ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 +- vpand %ymm3, %ymm5, %ymm5 +- ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 +- vpand %ymm4, %ymm5, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + +- vptest %ymm0, %ymm5 +- jnc L(4x_vec_end) +- xorl %eax, %eax ++ /* Reduce VEC0 - VEC4. */ ++ vpand %ymm1, %ymm2, %ymm5 ++ vpand %ymm3, %ymm4, %ymm6 ++ vpand %ymm5, %ymm6, %ymm7 ++ vpmovmskb %ymm7, %ecx ++ incl %ecx ++ jnz L(return_vec_0_1_2_3) ++ /* NB: eax must be zero to reach here. */ ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(return_vec_0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi, %rax), %ecx ++ /* NB: no partial register stall here because xorl zero idiom ++ above. */ ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 +-L(last_2x_vec): +- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++L(return_vec_1): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl VEC_SIZE(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl VEC_SIZE(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl VEC_SIZE(%rsi, %rax), %ecx ++ movzbl VEC_SIZE(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(return_vec_2): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (VEC_SIZE * 2)(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN ++ ++ /* NB: p2align 5 here to ensure 4x loop is 32 byte aligned. */ ++ .p2align 5 ++L(8x_return_vec_0_1_2_3): ++ /* Returning from L(more_8x_vec) requires restoring rsi. */ ++ addq %rdi, %rsi ++L(return_vec_0_1_2_3): ++ vpmovmskb %ymm1, %eax ++ incl %eax ++ jnz L(return_vec_0) + +-L(last_vec): +- /* Use overlapping loads to avoid branches. 
*/ +- leaq -VEC_SIZE(%rdi, %rdx), %rdi +- leaq -VEC_SIZE(%rsi, %rdx), %rsi +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++ incl %eax ++ jnz L(return_vec_1) ++ ++ vpmovmskb %ymm3, %eax ++ incl %eax ++ jnz L(return_vec_2) ++L(return_vec_3): ++ tzcntl %ecx, %ecx ++# ifdef USE_AS_WMEMCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(more_8x_vec): ++ /* Set end of s1 in rdx. */ ++ leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx ++ /* rsi stores s2 - s1. This allows loop to only update one ++ pointer. */ ++ subq %rdi, %rsi ++ /* Align s1 pointer. */ ++ andq $-VEC_SIZE, %rdi ++ /* Adjust because first 4x vec where check already. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ .p2align 4 ++L(loop_4x_vec): ++ /* rsi has s2 - s1 so get correct address by adding s1 (in rdi). ++ */ ++ vmovdqu (%rsi, %rdi), %ymm1 ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ ++ vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2 ++ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ ++ vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ ++ vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ ++ vpand %ymm1, %ymm2, %ymm5 ++ vpand %ymm3, %ymm4, %ymm6 ++ vpand %ymm5, %ymm6, %ymm7 ++ vpmovmskb %ymm7, %ecx ++ incl %ecx ++ jnz L(8x_return_vec_0_1_2_3) ++ subq $-(VEC_SIZE * 4), %rdi ++ /* Check if s1 pointer at end. */ ++ cmpq %rdx, %rdi ++ jb L(loop_4x_vec) ++ ++ subq %rdx, %rdi ++ /* rdi has 4 * VEC_SIZE - remaining length. */ ++ cmpl $(VEC_SIZE * 3), %edi ++ jae L(8x_last_1x_vec) ++ /* Load regardless of branch. */ ++ vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3 ++ cmpl $(VEC_SIZE * 2), %edi ++ jae L(8x_last_2x_vec) ++ ++ /* Check last 4 VEC. */ ++ vmovdqu (%rsi, %rdx), %ymm1 ++ VPCMPEQ (%rdx), %ymm1, %ymm1 ++ ++ vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2 ++ VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2 ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 ++ ++ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 ++ ++ vpand %ymm1, %ymm2, %ymm5 ++ vpand %ymm3, %ymm4, %ymm6 ++ vpand %ymm5, %ymm6, %ymm7 ++ vpmovmskb %ymm7, %ecx ++ /* Restore s1 pointer to rdi. */ ++ movq %rdx, %rdi ++ incl %ecx ++ jnz L(8x_return_vec_0_1_2_3) ++ /* NB: eax must be zero to reach here. */ ++ VZEROUPPER_RETURN ++ ++ /* Only entry is from L(more_8x_vec). */ ++ .p2align 4 ++L(8x_last_2x_vec): ++ /* Check second to last VEC. rdx store end pointer of s1 and ++ ymm3 has already been loaded with second to last VEC from s2. ++ */ ++ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 ++ vpmovmskb %ymm3, %eax ++ incl %eax ++ jnz L(8x_return_vec_2) ++ /* Check last VEC. */ ++ .p2align 4 ++L(8x_last_1x_vec): ++ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 ++ vpmovmskb %ymm4, %eax ++ incl %eax ++ jnz L(8x_return_vec_3) + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec): +- /* A byte or int32 is different within 16 or 32 bytes. */ +- tzcntl %eax, %ecx ++L(last_2x_vec): ++ /* Check second to last VEC. */ ++ vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1 ++ VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1 ++ vpmovmskb %ymm1, %eax ++ incl %eax ++ jnz L(return_vec_1_end) ++ /* Check last VEC. 
*/ ++L(last_1x_vec): ++ vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1 ++ VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1 ++ vpmovmskb %ymm1, %eax ++ incl %eax ++ jnz L(return_vec_0_end) ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(8x_return_vec_2): ++ subq $VEC_SIZE, %rdx ++L(8x_return_vec_3): ++ tzcntl %eax, %eax ++ addq %rdx, %rax + # ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (%rdi, %rcx), %edx +- cmpl (%rsi, %rcx), %edx +-L(wmemcmp_return): +- setl %al +- negl %eax +- orl $1, %eax ++ movl (VEC_SIZE * 3)(%rax), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else +- movzbl (%rdi, %rcx), %eax +- movzbl (%rsi, %rcx), %edx +- sub %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 3)(%rax), %eax ++ subl %ecx, %eax + # endif + VZEROUPPER_RETURN + +-# ifdef USE_AS_WMEMCMP + .p2align 4 +-L(4): +- xorl %eax, %eax +- movl (%rdi), %edx +- cmpl (%rsi), %edx +- jne L(wmemcmp_return) +- ret ++L(return_vec_1_end): ++ tzcntl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else ++ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN ++ + .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- je L(exit) +- sbbl %eax, %eax +- orl $1, %eax +- ret ++L(return_vec_0_end): ++ tzcntl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -VEC_SIZE(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl -VEC_SIZE(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl -VEC_SIZE(%rsi, %rax), %ecx ++ movzbl -VEC_SIZE(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN + + .p2align 4 +-L(exit): +- ret ++L(less_vec): ++ /* Check if one or less CHAR. This is necessary for size = 0 but ++ is also faster for size = CHAR_SIZE. */ ++ cmpl $CHAR_SIZE, %edx ++ jbe L(one_or_less) ++ ++ /* Check if loading one VEC from either s1 or s2 could cause a ++ page cross. This can have false positives but is by far the ++ fastest method. */ ++ movl %edi, %eax ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(page_cross_less_vec) ++ ++ /* No page cross possible. */ ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ incl %eax ++ /* Result will be zero if s1 and s2 match. Otherwise first set ++ bit will be first mismatch. */ ++ bzhil %edx, %eax, %edx ++ jnz L(return_vec_0) ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + + .p2align 4 +-L(between_2_3): ++L(page_cross_less_vec): ++ /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 ++ bytes. */ ++ cmpl $16, %edx ++ jae L(between_16_31) ++# ifndef USE_AS_WMEMCMP ++ cmpl $8, %edx ++ jae L(between_8_15) ++ cmpl $4, %edx ++ jae L(between_4_7) ++ + /* Load as big endian to avoid branches. 
*/ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx +@@ -208,223 +439,106 @@ L(between_2_3): + shll $8, %ecx + bswap %eax + bswap %ecx +- movb -1(%rdi, %rdx), %al +- movb -1(%rsi, %rdx), %cl ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx + /* Subtraction is okay because the upper 8 bits are zero. */ + subl %ecx, %eax ++ /* No ymm register was touched. */ + ret + + .p2align 4 +-L(1): +- movzbl (%rdi), %eax ++L(one_or_less): ++ jb L(zero) + movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + subl %ecx, %eax +- ret +-# endif +- +- .p2align 4 +-L(zero): +- xorl %eax, %eax ++ /* No ymm register was touched. */ + ret + + .p2align 4 +-L(less_vec): +-# ifdef USE_AS_WMEMCMP +- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ +- cmpb $4, %dl +- je L(4) +- jb L(zero) +-# else +- cmpb $1, %dl +- je L(1) +- jb L(zero) +- cmpb $4, %dl +- jb L(between_2_3) +- cmpb $8, %dl +- jb L(between_4_7) ++L(between_8_15): + # endif +- cmpb $16, %dl +- jae L(between_16_31) +- /* It is between 8 and 15 bytes. */ ++ /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 +- VPCMPEQ %xmm1, %xmm2, %xmm2 ++ VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax +- subl $0xffff, %eax +- jnz L(first_vec) ++ subl $0xffff, %eax ++ jnz L(return_vec_0) + /* Use overlapping loads to avoid branches. */ + leaq -8(%rdi, %rdx), %rdi + leaq -8(%rsi, %rdx), %rsi + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 +- VPCMPEQ %xmm1, %xmm2, %xmm2 ++ VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax +- subl $0xffff, %eax +- jnz L(first_vec) ++ subl $0xffff, %eax ++ jnz L(return_vec_0) ++ /* No ymm register was touched. */ ++ ret ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax + ret + + .p2align 4 + L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + vmovdqu (%rsi), %xmm2 +- VPCMPEQ (%rdi), %xmm2, %xmm2 ++ VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax +- subl $0xffff, %eax +- jnz L(first_vec) ++ subl $0xffff, %eax ++ jnz L(return_vec_0) + + /* Use overlapping loads to avoid branches. */ ++ ++ vmovdqu -16(%rsi, %rdx), %xmm2 + leaq -16(%rdi, %rdx), %rdi + leaq -16(%rsi, %rdx), %rsi +- vmovdqu (%rsi), %xmm2 +- VPCMPEQ (%rdi), %xmm2, %xmm2 ++ VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax +- subl $0xffff, %eax +- jnz L(first_vec) ++ subl $0xffff, %eax ++ jnz L(return_vec_0) ++ /* No ymm register was touched. */ + ret + +- .p2align 4 +-L(more_8x_vec): +- /* More than 8 * VEC. Check the first VEC. */ +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- /* Align the first memory area for aligned loads in the loop. +- Compute how much the first memory area is misaligned. */ +- movq %rdi, %rcx +- andl $(VEC_SIZE - 1), %ecx +- /* Get the negative of offset for alignment. */ +- subq $VEC_SIZE, %rcx +- /* Adjust the second memory area. */ +- subq %rcx, %rsi +- /* Adjust the first memory area which should be aligned now. */ +- subq %rcx, %rdi +- /* Adjust length. */ +- addq %rcx, %rdx +- +-L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. 
*/ +- vmovdqu (%rsi), %ymm1 +- VPCMPEQ (%rdi), %ymm1, %ymm1 +- +- vmovdqu VEC_SIZE(%rsi), %ymm2 +- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 +- vpand %ymm2, %ymm1, %ymm5 +- +- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 +- vpand %ymm3, %ymm5, %ymm5 +- +- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 +- vpand %ymm4, %ymm5, %ymm5 +- +- vptest %ymm0, %ymm5 +- jnc L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- addq $(VEC_SIZE * 4), %rsi +- +- subq $(VEC_SIZE * 4), %rdx +- cmpq $(VEC_SIZE * 4), %rdx +- jae L(loop_4x_vec) +- +- /* Less than 4 * VEC. */ +- cmpq $VEC_SIZE, %rdx +- jbe L(last_vec) +- cmpq $(VEC_SIZE * 2), %rdx +- jbe L(last_2x_vec) +- +-L(last_4x_vec): +- /* From 2 * VEC to 4 * VEC. */ +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- addq $VEC_SIZE, %rdi +- addq $VEC_SIZE, %rsi +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- /* Use overlapping loads to avoid branches. */ +- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi +- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- addq $VEC_SIZE, %rdi +- addq $VEC_SIZE, %rsi +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- VZEROUPPER_RETURN +- +- .p2align 4 +-L(4x_vec_end): +- vpmovmskb %ymm1, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec_x1) +- vpmovmskb %ymm3, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec_x2) +- vpmovmskb %ymm4, %eax +- subl $VEC_MASK, %eax +- tzcntl %eax, %ecx + # ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rcx), %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif +- VZEROUPPER_RETURN +- + .p2align 4 +-L(first_vec_x1): +- tzcntl %eax, %ecx +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rcx), %edx +- cmpl VEC_SIZE(%rsi, %rcx), %edx +- jmp L(wmemcmp_return) ++L(one_or_less): ++ jb L(zero) ++ movl (%rdi), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi), %ecx ++ je L(zero) ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++ /* No ymm register was touched. */ ++ ret + # else +- movzbl VEC_SIZE(%rdi, %rcx), %eax +- movzbl VEC_SIZE(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif +- VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x2): +- tzcntl %eax, %ecx +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rcx), %edx +- cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx +- sub %edx, %eax ++L(between_4_7): ++ /* Load as big endian with overlapping movbe to avoid branches. ++ */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ jz L(zero_4_7) ++ sbbl %eax, %eax ++ orl $1, %eax ++L(zero_4_7): ++ /* No ymm register was touched. 
*/ ++ ret + # endif +- VZEROUPPER_RETURN ++ + END (MEMCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-46.patch b/glibc-RHEL-15696-46.patch new file mode 100644 index 0000000..881fe81 --- /dev/null +++ b/glibc-RHEL-15696-46.patch @@ -0,0 +1,851 @@ +From 4ad473e97acdc5f6d811755b67c09f2128a644ce Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 17 May 2021 13:57:24 -0400 +Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes memcmp-evex.S. The optimizations include +adding a new vec compare path for small sizes, reorganizing the entry +control flow, removing some unnecissary ALU instructions from the main +loop, and most importantly replacing the heavy use of vpcmp + kand +logic with vpxor + vptern. test-memcmp and test-wmemcmp are both +passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++-------- + 1 file changed, 408 insertions(+), 302 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 9c093972..654dc7ac 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -19,17 +19,22 @@ + #if IS_IN (libc) + + /* memcmp/wmemcmp is implemented as: +- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap +- to avoid branches. +- 2. Use overlapping compare to avoid branch. +- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 +- bytes for wmemcmp. +- 4. If size is 8 * VEC_SIZE or less, unroll the loop. +- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory ++ 1. Use ymm vector compares when possible. The only case where ++ vector compares is not possible for when size < CHAR_PER_VEC ++ and loading from either s1 or s2 would cause a page cross. ++ 2. For size from 2 to 7 bytes on page cross, load as big endian ++ with movbe and bswap to avoid branches. ++ 3. Use xmm vector compare when size >= 4 bytes for memcmp or ++ size >= 8 bytes for wmemcmp. ++ 4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a ++ to check for early mismatches. Only do this if its guranteed the ++ work is not wasted. ++ 5. If size is 8 * VEC_SIZE or less, unroll the loop. ++ 6. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. +- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. +- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. +- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ ++ 7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less. ++ 8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less. ++ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. 
*/ + + # include + +@@ -40,11 +45,21 @@ + # define VMOVU vmovdqu64 + + # ifdef USE_AS_WMEMCMP +-# define VPCMPEQ vpcmpeqd ++# define CHAR_SIZE 4 ++# define VPCMP vpcmpd + # else +-# define VPCMPEQ vpcmpeqb ++# define CHAR_SIZE 1 ++# define VPCMP vpcmpub + # endif + ++# define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) ++ ++# define XMM0 xmm16 ++# define XMM1 xmm17 ++# define XMM2 xmm18 ++# define YMM0 ymm16 + # define XMM1 xmm17 + # define XMM2 xmm18 + # define YMM1 ymm17 +@@ -54,15 +69,6 @@ + # define YMM5 ymm21 + # define YMM6 ymm22 + +-# define VEC_SIZE 32 +-# ifdef USE_AS_WMEMCMP +-# define VEC_MASK 0xff +-# define XMM_MASK 0xf +-# else +-# define VEC_MASK 0xffffffff +-# define XMM_MASK 0xffff +-# endif +- + /* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +@@ -70,145 +76,370 @@ + + .section .text.evex,"ax",@progbits + ENTRY (MEMCMP) +-# ifdef USE_AS_WMEMCMP +- shl $2, %RDX_LP +-# elif defined __ILP32__ ++# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx + # endif +- cmp $VEC_SIZE, %RDX_LP ++ cmp $CHAR_PER_VEC, %RDX_LP + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k1 ++ VMOVU (%rsi), %YMM1 ++ /* Use compare not equals to directly check for mismatch. */ ++ VPCMP $4, (%rdi), %YMM1, %k1 + kmovd %k1, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- cmpq $(VEC_SIZE * 2), %rdx +- jbe L(last_vec) +- +- /* More than 2 * VEC. */ +- cmpq $(VEC_SIZE * 8), %rdx +- ja L(more_8x_vec) +- cmpq $(VEC_SIZE * 4), %rdx +- jb L(last_4x_vec) ++ /* NB: eax must be destination register if going to ++ L(return_vec_[0,2]). For L(return_vec_3 destination register ++ must be ecx. */ ++ testl %eax, %eax ++ jnz L(return_vec_0) + +- /* From 4 * VEC to 8 * VEC, inclusively. */ +- VMOVU (%rsi), %YMM1 +- VPCMPEQ (%rdi), %YMM1, %k1 ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(last_1x_vec) + ++ /* Check second VEC no matter what. */ + VMOVU VEC_SIZE(%rsi), %YMM2 +- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 ++ VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_1) ++ ++ /* Less than 4 * VEC. */ ++ cmpq $(CHAR_PER_VEC * 4), %rdx ++ jbe L(last_2x_vec) + ++ /* Check third and fourth VEC no matter what. */ + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 ++ VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 ++ VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1 ++ kmovd %k1, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_3) + +- kandd %k1, %k2, %k5 +- kandd %k3, %k4, %k6 +- kandd %k5, %k6, %k6 ++ /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so ++ compare with zero to get a mask is needed. */ ++ vpxorq %XMM0, %XMM0, %XMM0 + +- kmovd %k6, %eax +- cmpl $VEC_MASK, %eax +- jne L(4x_vec_end) ++ /* Go to 4x VEC loop. */ ++ cmpq $(CHAR_PER_VEC * 8), %rdx ++ ja L(more_8x_vec) + +- leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi +- leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi +- VMOVU (%rsi), %YMM1 +- VPCMPEQ (%rdi), %YMM1, %k1 ++ /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any ++ branches. */ + +- VMOVU VEC_SIZE(%rsi), %YMM2 +- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 +- kandd %k1, %k2, %k5 ++ /* Load first two VEC from s2 before adjusting addresses. 
*/ ++ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1 ++ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2 ++ leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi ++ leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi ++ ++ /* Wait to load from s1 until addressed adjust due to ++ unlamination of microfusion with complex address mode. */ ++ ++ /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it ++ will have some 1s. */ ++ vpxorq (%rdi), %YMM1, %YMM1 ++ vpxorq (VEC_SIZE)(%rdi), %YMM2, %YMM2 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 +- kandd %k3, %k5, %k5 ++ vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 ++ /* Or together YMM1, YMM2, and YMM3 into YMM3. */ ++ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 +- kandd %k4, %k5, %k5 ++ /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while ++ oring with YMM3. Result is stored in YMM4. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 ++ /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */ ++ VPCMP $4, %YMM4, %YMM0, %k1 ++ kmovd %k1, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_0_1_2_3) ++ /* NB: eax must be zero to reach here. */ ++ ret + +- kmovd %k5, %eax +- cmpl $VEC_MASK, %eax +- jne L(4x_vec_end) +- xorl %eax, %eax ++ /* NB: aligning 32 here allows for the rest of the jump targets ++ to be tuned for 32 byte alignment. Most important this ensures ++ the L(more_8x_vec) loop is 32 byte aligned. */ ++ .p2align 5 ++L(less_vec): ++ /* Check if one or less CHAR. This is necessary for size = 0 but ++ is also faster for size = CHAR_SIZE. */ ++ cmpl $1, %edx ++ jbe L(one_or_less) ++ ++ /* Check if loading one VEC from either s1 or s2 could cause a ++ page cross. This can have false positives but is by far the ++ fastest method. */ ++ movl %edi, %eax ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(page_cross_less_vec) ++ ++ /* No page cross possible. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMP $4, (%rdi), %YMM2, %k1 ++ kmovd %k1, %eax ++ /* Create mask in ecx for potentially in bound matches. */ ++ bzhil %edx, %eax, %eax ++ jnz L(return_vec_0) + ret + + .p2align 4 +-L(last_2x_vec): +- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++L(return_vec_0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi, %rax, CHAR_SIZE), %ecx ++ /* NB: no partial register stall here because xorl zero idiom ++ above. */ ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret + +-L(last_vec): +- /* Use overlapping loads to avoid branches. */ +- leaq -VEC_SIZE(%rdi, %rdx), %rdi +- leaq -VEC_SIZE(%rsi, %rdx), %rsi +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++ /* NB: No p2align necessary. Alignment % 16 is naturally 1 ++ which is good enough for a target not in a loop. 
*/ ++L(return_vec_1): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl VEC_SIZE(%rsi, %rax), %ecx ++ movzbl VEC_SIZE(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif + ret + +- .p2align 4 +-L(first_vec): +- /* A byte or int32 is different within 16 or 32 bytes. */ +- tzcntl %eax, %ecx ++ /* NB: No p2align necessary. Alignment % 16 is naturally 2 ++ which is good enough for a target not in a loop. */ ++L(return_vec_2): ++ tzcntl %eax, %eax + # ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (%rdi, %rcx, 4), %edx +- cmpl (%rsi, %rcx, 4), %edx +-L(wmemcmp_return): +- setl %al +- negl %eax +- orl $1, %eax ++ movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else +- movzbl (%rdi, %rcx), %eax +- movzbl (%rsi, %rcx), %edx +- sub %edx, %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax ++ subl %ecx, %eax + # endif + ret + ++ .p2align 4 ++L(8x_return_vec_0_1_2_3): ++ /* Returning from L(more_8x_vec) requires restoring rsi. */ ++ addq %rdi, %rsi ++L(return_vec_0_1_2_3): ++ VPCMP $4, %YMM1, %YMM0, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) ++ ++ VPCMP $4, %YMM2, %YMM0, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_1) ++ ++ VPCMP $4, %YMM3, %YMM0, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_2) ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WMEMCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++ ret ++ + .p2align 4 +-L(4): +- xorl %eax, %eax +- movl (%rdi), %edx +- cmpl (%rsi), %edx +- jne L(wmemcmp_return) ++L(more_8x_vec): ++ /* Set end of s1 in rdx. */ ++ leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx ++ /* rsi stores s2 - s1. This allows loop to only update one ++ pointer. */ ++ subq %rdi, %rsi ++ /* Align s1 pointer. */ ++ andq $-VEC_SIZE, %rdi ++ /* Adjust because first 4x vec where check already. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ .p2align 4 ++L(loop_4x_vec): ++ VMOVU (%rsi, %rdi), %YMM1 ++ vpxorq (%rdi), %YMM1, %YMM1 ++ ++ VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 ++ vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2 ++ ++ VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 ++ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 ++ VPCMP $4, %YMM4, %YMM0, %k1 ++ kmovd %k1, %ecx ++ testl %ecx, %ecx ++ jnz L(8x_return_vec_0_1_2_3) ++ subq $-(VEC_SIZE * 4), %rdi ++ cmpq %rdx, %rdi ++ jb L(loop_4x_vec) ++ ++ subq %rdx, %rdi ++ /* rdi has 4 * VEC_SIZE - remaining length. */ ++ cmpl $(VEC_SIZE * 3), %edi ++ jae L(8x_last_1x_vec) ++ /* Load regardless of branch. 
*/ ++ VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3 ++ cmpl $(VEC_SIZE * 2), %edi ++ jae L(8x_last_2x_vec) ++ ++ VMOVU (%rsi, %rdx), %YMM1 ++ vpxorq (%rdx), %YMM1, %YMM1 ++ ++ VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 ++ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 ++ ++ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 ++ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4 ++ VPCMP $4, %YMM4, %YMM0, %k1 ++ kmovd %k1, %ecx ++ /* Restore s1 pointer to rdi. */ ++ movq %rdx, %rdi ++ testl %ecx, %ecx ++ jnz L(8x_return_vec_0_1_2_3) ++ /* NB: eax must be zero to reach here. */ ++ ret ++ ++ /* Only entry is from L(more_8x_vec). */ ++ .p2align 4 ++L(8x_last_2x_vec): ++ VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(8x_return_vec_2) ++ /* Naturally aligned to 16 bytes. */ ++L(8x_last_1x_vec): ++ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1 ++ VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(8x_return_vec_3) ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ /* Check second to last VEC. */ ++ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1 ++ VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_1_end) ++ ++ /* Check last VEC. */ ++ .p2align 4 ++L(last_1x_vec): ++ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1 ++ VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0_end) + ret ++ ++ .p2align 4 ++L(8x_return_vec_2): ++ subq $VEC_SIZE, %rdx ++L(8x_return_vec_3): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ leaq (%rdx, %rax, CHAR_SIZE), %rax ++ movl (VEC_SIZE * 3)(%rax), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else ++ addq %rdx, %rax ++ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 3)(%rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ + .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- je L(exit) +- sbbl %eax, %eax +- orl $1, %eax ++L(return_vec_0_end): ++ tzcntl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl -VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl -VEC_SIZE(%rsi, %rax), %ecx ++ movzbl -VEC_SIZE(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif + ret + + .p2align 4 +-L(exit): ++L(return_vec_1_end): ++ tzcntl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif + ret + ++ + .p2align 4 ++L(page_cross_less_vec): ++ /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 ++ bytes. */ ++ cmpl $(16 / CHAR_SIZE), %edx ++ jae L(between_16_31) ++# ifndef USE_AS_WMEMCMP ++ cmpl $8, %edx ++ jae L(between_8_15) ++ cmpl $4, %edx ++ jae L(between_4_7) + L(between_2_3): + /* Load as big endian to avoid branches. 
*/ + movzwl (%rdi), %eax +@@ -217,224 +448,99 @@ L(between_2_3): + shll $8, %ecx + bswap %eax + bswap %ecx +- movb -1(%rdi, %rdx), %al +- movb -1(%rsi, %rdx), %cl ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx + /* Subtraction is okay because the upper 8 bits are zero. */ + subl %ecx, %eax + ret +- + .p2align 4 +-L(1): +- movzbl (%rdi), %eax ++L(one_or_less): ++ jb L(zero) + movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + subl %ecx, %eax + ret +-# endif +- +- .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret + + .p2align 4 +-L(less_vec): +-# ifdef USE_AS_WMEMCMP +- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ +- cmpb $4, %dl +- je L(4) +- jb L(zero) +-# else +- cmpb $1, %dl +- je L(1) +- jb L(zero) +- cmpb $4, %dl +- jb L(between_2_3) +- cmpb $8, %dl +- jb L(between_4_7) ++L(between_8_15): + # endif +- cmpb $16, %dl +- jae L(between_16_31) +- /* It is between 8 and 15 bytes. */ ++ /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ + vmovq (%rdi), %XMM1 + vmovq (%rsi), %XMM2 +- VPCMPEQ %XMM1, %XMM2, %k2 +- kmovw %k2, %eax +- subl $XMM_MASK, %eax +- jnz L(first_vec) ++ VPCMP $4, %XMM1, %XMM2, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) + /* Use overlapping loads to avoid branches. */ +- leaq -8(%rdi, %rdx), %rdi +- leaq -8(%rsi, %rdx), %rsi ++ leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi ++ leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi + vmovq (%rdi), %XMM1 + vmovq (%rsi), %XMM2 +- VPCMPEQ %XMM1, %XMM2, %k2 +- kmovw %k2, %eax +- subl $XMM_MASK, %eax +- jnz L(first_vec) ++ VPCMP $4, %XMM1, %XMM2, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) + ret + + .p2align 4 +-L(between_16_31): +- /* From 16 to 31 bytes. No branch when size == 16. */ +- VMOVU (%rsi), %XMM2 +- VPCMPEQ (%rdi), %XMM2, %k2 +- kmovw %k2, %eax +- subl $XMM_MASK, %eax +- jnz L(first_vec) +- +- /* Use overlapping loads to avoid branches. */ +- leaq -16(%rdi, %rdx), %rdi +- leaq -16(%rsi, %rdx), %rsi +- VMOVU (%rsi), %XMM2 +- VPCMPEQ (%rdi), %XMM2, %k2 +- kmovw %k2, %eax +- subl $XMM_MASK, %eax +- jnz L(first_vec) ++L(zero): ++ xorl %eax, %eax + ret + + .p2align 4 +-L(more_8x_vec): +- /* More than 8 * VEC. Check the first VEC. */ +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- /* Align the first memory area for aligned loads in the loop. +- Compute how much the first memory area is misaligned. */ +- movq %rdi, %rcx +- andl $(VEC_SIZE - 1), %ecx +- /* Get the negative of offset for alignment. */ +- subq $VEC_SIZE, %rcx +- /* Adjust the second memory area. */ +- subq %rcx, %rsi +- /* Adjust the first memory area which should be aligned now. */ +- subq %rcx, %rdi +- /* Adjust length. */ +- addq %rcx, %rdx +- +-L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- VMOVU (%rsi), %YMM1 +- VPCMPEQ (%rdi), %YMM1, %k1 +- +- VMOVU VEC_SIZE(%rsi), %YMM2 +- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 +- kandd %k2, %k1, %k5 +- +- VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 +- kandd %k3, %k5, %k5 +- +- VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 +- kandd %k4, %k5, %k5 +- +- kmovd %k5, %eax +- cmpl $VEC_MASK, %eax +- jne L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- addq $(VEC_SIZE * 4), %rsi +- +- subq $(VEC_SIZE * 4), %rdx +- cmpq $(VEC_SIZE * 4), %rdx +- jae L(loop_4x_vec) +- +- /* Less than 4 * VEC. 
*/ +- cmpq $VEC_SIZE, %rdx +- jbe L(last_vec) +- cmpq $(VEC_SIZE * 2), %rdx +- jbe L(last_2x_vec) +- +-L(last_4x_vec): +- /* From 2 * VEC to 4 * VEC. */ +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- addq $VEC_SIZE, %rdi +- addq $VEC_SIZE, %rsi +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++L(between_16_31): ++ /* From 16 to 31 bytes. No branch when size == 16. */ ++ VMOVU (%rsi), %XMM2 ++ VPCMP $4, (%rdi), %XMM2, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) + + /* Use overlapping loads to avoid branches. */ +- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi +- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) + +- addq $VEC_SIZE, %rdi +- addq $VEC_SIZE, %rsi +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- ret +- +- .p2align 4 +-L(4x_vec_end): ++ VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2 ++ leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi ++ leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi ++ VPCMP $4, (%rdi), %XMM2, %k1 + kmovd %k1, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec_x1) +- kmovd %k3, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec_x2) +- kmovd %k4, %eax +- subl $VEC_MASK, %eax +- tzcntl %eax, %ecx +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif ++ testl %eax, %eax ++ jnz L(return_vec_0) + ret + +- .p2align 4 +-L(first_vec_x1): +- tzcntl %eax, %ecx + # ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rcx, 4), %edx +- cmpl VEC_SIZE(%rsi, %rcx, 4), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl VEC_SIZE(%rdi, %rcx), %eax +- movzbl VEC_SIZE(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif ++ .p2align 4 ++L(one_or_less): ++ jb L(zero) ++ movl (%rdi), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi), %ecx ++ je L(zero) ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + ret ++# else + + .p2align 4 +-L(first_vec_x2): +- tzcntl %eax, %ecx +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx +- cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif ++L(between_4_7): ++ /* Load as big endian with overlapping movbe to avoid branches. ++ */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ jz L(zero_4_7) ++ sbbl %eax, %eax ++ orl $1, %eax ++L(zero_4_7): + ret ++# endif ++ + END (MEMCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-47.patch b/glibc-RHEL-15696-47.patch new file mode 100644 index 0000000..70c3171 --- /dev/null +++ b/glibc-RHEL-15696-47.patch @@ -0,0 +1,104 @@ +From 6abf27980a947f9b6e514d6b33b83059d39566ae Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 20 May 2021 13:13:51 -0400 +Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No bug. 
This commit makes a few small improvements to +memset-vec-unaligned-erms.S. The changes are 1) only aligning to 64 +instead of 128. Either alignment will perform equally well in a loop +and 128 just increases the odds of having to do an extra iteration +which can be significant overhead for small values. 2) Align some +targets and the loop. 3) Remove an ALU from the alignment process. 4) +Reorder the last 4x VEC so that they are stored after the loop. 5) +Move the condition for leq 8x VEC to before the alignment +process. test-memset and test-wmemset are both passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + .../multiarch/memset-vec-unaligned-erms.S | 50 +++++++++++-------- + 1 file changed, 28 insertions(+), 22 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index f877ac9d..909c33f6 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + VMOVU %VEC(0), (%rdi) + VZEROUPPER_RETURN + ++ .p2align 4 + L(stosb_more_2x_vec): + cmp __x86_rep_stosb_threshold(%rip), %RDX_LP + ja L(stosb) ++#else ++ .p2align 4 + #endif + L(more_2x_vec): +- cmpq $(VEC_SIZE * 4), %rdx +- ja L(loop_start) ++ /* Stores to first 2x VEC before cmp as any path forward will ++ require it. */ + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(0), VEC_SIZE(%rdi) +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) ++ cmpq $(VEC_SIZE * 4), %rdx ++ ja L(loop_start) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) ++ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + L(return): + #if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +@@ -192,28 +197,29 @@ L(return): + #endif + + L(loop_start): +- leaq (VEC_SIZE * 4)(%rdi), %rcx +- VMOVU %VEC(0), (%rdi) +- andq $-(VEC_SIZE * 4), %rcx +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +- VMOVU %VEC(0), VEC_SIZE(%rdi) +- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) +- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi) +- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx) +- addq %rdi, %rdx +- andq $-(VEC_SIZE * 4), %rdx +- cmpq %rdx, %rcx +- je L(return) ++ cmpq $(VEC_SIZE * 8), %rdx ++ jbe L(loop_end) ++ andq $-(VEC_SIZE * 2), %rdi ++ subq $-(VEC_SIZE * 4), %rdi ++ leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx ++ .p2align 4 + L(loop): +- VMOVA %VEC(0), (%rcx) +- VMOVA %VEC(0), VEC_SIZE(%rcx) +- VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx) +- VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx) +- addq $(VEC_SIZE * 4), %rcx +- cmpq %rcx, %rdx +- jne L(loop) ++ VMOVA %VEC(0), (%rdi) ++ VMOVA %VEC(0), VEC_SIZE(%rdi) ++ VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi) ++ VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi) ++ subq $-(VEC_SIZE * 4), %rdi ++ cmpq %rcx, %rdi ++ jb L(loop) ++L(loop_end): ++ /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN. ++ rdx as length is also unchanged. 
*/ ++ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx) ++ VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx) ++ VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx) ++ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) + VZEROUPPER_SHORT_RETURN + + .p2align 4 +-- +GitLab + diff --git a/glibc-RHEL-15696-48.patch b/glibc-RHEL-15696-48.patch new file mode 100644 index 0000000..645536e --- /dev/null +++ b/glibc-RHEL-15696-48.patch @@ -0,0 +1,84 @@ +From 1b992204f68af851e905c16016756fd4421e1934 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sun, 23 May 2021 19:43:24 -0400 +Subject: [PATCH] x86: Improve memmove-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +This patch changes the condition for copy 4x VEC so that if length is +exactly equal to 4 * VEC_SIZE it will use the 4x VEC case instead of +8x VEC case. + +Results For Skylake memcpy-avx2-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 0 , 9.137 , 6.873 , New , 75.22 +128 , 7 , 0 , 12.933 , 7.732 , New , 59.79 +128 , 0 , 7 , 11.852 , 6.76 , New , 57.04 +128 , 7 , 7 , 12.587 , 6.808 , New , 54.09 + +Results For Icelake memcpy-evex-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 0 , 9.963 , 5.416 , New , 54.36 +128 , 7 , 0 , 16.467 , 8.061 , New , 48.95 +128 , 0 , 7 , 14.388 , 7.644 , New , 53.13 +128 , 7 , 7 , 14.546 , 7.642 , New , 52.54 + +Results For Tigerlake memcpy-evex-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 0 , 8.979 , 4.95 , New , 55.13 +128 , 7 , 0 , 14.245 , 7.122 , New , 50.0 +128 , 0 , 7 , 12.668 , 6.675 , New , 52.69 +128 , 7 , 7 , 13.042 , 6.802 , New , 52.15 + +Results For Skylake memmove-avx2-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 32 , 6.181 , 5.691 , New , 92.07 +128 , 32 , 0 , 6.165 , 5.752 , New , 93.3 +128 , 0 , 7 , 13.923 , 9.37 , New , 67.3 +128 , 7 , 0 , 12.049 , 10.182 , New , 84.5 + +Results For Icelake memmove-evex-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 32 , 5.479 , 4.889 , New , 89.23 +128 , 32 , 0 , 5.127 , 4.911 , New , 95.79 +128 , 0 , 7 , 18.885 , 13.547 , New , 71.73 +128 , 7 , 0 , 15.565 , 14.436 , New , 92.75 + +Results For Tigerlake memmove-evex-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 32 , 5.275 , 4.815 , New , 91.28 +128 , 32 , 0 , 5.376 , 4.565 , New , 84.91 +128 , 0 , 7 , 19.426 , 14.273 , New , 73.47 +128 , 7 , 0 , 15.924 , 14.951 , New , 93.89 + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 3e2dd6bc..572cef04 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -417,8 +417,8 @@ L(more_2x_vec): + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx +- jb L(last_4x_vec) +- /* Copy from 4 * VEC to 8 * VEC, inclusively. */ ++ jbe L(last_4x_vec) ++ /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) +@@ -437,7 +437,7 @@ L(more_2x_vec): + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) + VZEROUPPER_RETURN + L(last_4x_vec): +- /* Copy from 2 * VEC to 4 * VEC. */ ++ /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. 
*/ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) +-- +GitLab + diff --git a/glibc-RHEL-15696-49.patch b/glibc-RHEL-15696-49.patch new file mode 100644 index 0000000..b59f582 --- /dev/null +++ b/glibc-RHEL-15696-49.patch @@ -0,0 +1,55 @@ +From 08cbcd4dbc686bb38ec3093aff2f919fbff5ec17 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Jun 2021 19:19:34 -0400 +Subject: [PATCH] x86: Remove unnecessary overflow check from wcsnlen-sse4_1.S +Content-type: text/plain; charset=UTF-8 + +No bug. The way wcsnlen will check if near the end of maxlen +is the following macro: + + mov %r11, %rsi; \ + subq %rax, %rsi; \ + andq $-64, %rax; \ + testq $-64, %rsi; \ + je L(strnlen_ret) + +Which words independently of s + maxlen overflowing. So the +second overflow check is unnecissary for correctness and +just extra overhead in the common no overflow case. + +test-strlen.c, test-wcslen.c, test-strnlen.c and test-wcsnlen.c are +all passing + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strlen-vec.S | 7 ------- + 1 file changed, 7 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S +index 439e486a..b7657282 100644 +--- a/sysdeps/x86_64/multiarch/strlen-vec.S ++++ b/sysdeps/x86_64/multiarch/strlen-vec.S +@@ -71,19 +71,12 @@ L(n_nonzero): + suffice. */ + mov %RSI_LP, %R10_LP + sar $62, %R10_LP +- test %R10_LP, %R10_LP + jnz __wcslen_sse4_1 + sal $2, %RSI_LP + # endif + +- + /* Initialize long lived registers. */ +- + add %RDI_LP, %RSI_LP +-# ifdef AS_WCSLEN +-/* Check for overflow again from s + maxlen * sizeof(wchar_t). */ +- jbe __wcslen_sse4_1 +-# endif + mov %RSI_LP, %R10_LP + and $-64, %R10_LP + mov %RSI_LP, %R11_LP +-- +GitLab + diff --git a/glibc-RHEL-15696-5.patch b/glibc-RHEL-15696-5.patch new file mode 100644 index 0000000..75d3978 --- /dev/null +++ b/glibc-RHEL-15696-5.patch @@ -0,0 +1,290 @@ +From 82d0b4a4d76db554eb6757acb790fcea30b19965 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:32:24 -0800 +Subject: [PATCH] x86-64 memset/wmemset: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes memset/wmemset for x32. Tested on x86-64 and x32. On +x86-64, libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use + RDX_LP for length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset. + * sysdeps/x86_64/x32/tst-size_t-memset.c: New file. + * sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise. 
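The commit message above explains the actual defect: on x32 the caller may leave non-zero garbage in the upper 32 bits of the 64-bit register that carries a size_t, so the assembly routines must either restrict themselves to the 32-bit subregister or zero-extend the length first. The following is a minimal, hypothetical C sketch of that idiom (the names usable_length and dirty are illustrative and not taken from the patch); it assumes only standard C on an x86-64 target.

/* Hypothetical illustration: truncating to 32 bits in C compiles to a
   single zero-extending 32-bit move, the same form as the "mov %edx, %edx"
   the patch inserts ahead of the length checks.  */
#include <stdint.h>
#include <stdio.h>

static uint64_t
usable_length (uint64_t reg)
{
  /* Keep only the lower 32 bits; a 32-bit register write zero-extends
     on x86-64, so no extra masking instruction is needed.  */
  return (uint64_t) (uint32_t) reg;
}

int
main (void)
{
  uint64_t dirty = 0xdeadbeef00000100ULL;   /* upper 32 bits are garbage */
  printf ("raw=%#llx  length=%llu\n",
          (unsigned long long) dirty,
          (unsigned long long) usable_length (dirty));
  return 0;
}

Compiled at -O2 this typically reduces to one 32-bit mov plus the return, which is why the fix costs essentially nothing on the hot path.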
+--- + .../multiarch/memset-avx512-no-vzeroupper.S | 6 +- + .../multiarch/memset-vec-unaligned-erms.S | 34 +++++---- + sysdeps/x86_64/x32/Makefile | 4 +- + sysdeps/x86_64/x32/tst-size_t-memset.c | 73 +++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wmemset.c | 20 +++++ + 5 files changed, 121 insertions(+), 16 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S +index 689cc119..99e25519 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S +@@ -29,12 +29,16 @@ + .section .text.avx512,"ax",@progbits + #if defined PIC + ENTRY (MEMSET_CHK) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMSET_CHK) + #endif + + ENTRY (MEMSET) ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx ++# endif + vpxor %xmm0, %xmm0, %xmm0 + vmovd %esi, %xmm1 + lea (%rdi, %rdx), %rsi +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 270a1d49..9a0fd818 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -65,8 +65,8 @@ + .section SECTION(.text),"ax",@progbits + #if VEC_SIZE == 16 && IS_IN (libc) + ENTRY (__bzero) +- movq %rdi, %rax /* Set return value. */ +- movq %rsi, %rdx /* Set n. */ ++ mov %RDI_LP, %RAX_LP /* Set return value. */ ++ mov %RSI_LP, %RDX_LP /* Set n. */ + pxor %xmm0, %xmm0 + jmp L(entry_from_bzero) + END (__bzero) +@@ -76,13 +76,13 @@ weak_alias (__bzero, bzero) + #if IS_IN (libc) + # if defined SHARED + ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + # endif + + ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) +- shlq $2, %rdx ++ shl $2, %RDX_LP + WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + jmp L(entry_from_bzero) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) +@@ -90,13 +90,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned)) + + #if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + #endif + + ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx ++# endif + L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) +@@ -112,14 +116,14 @@ END (MEMSET_SYMBOL (__memset, unaligned)) + + # if VEC_SIZE == 16 + ENTRY (__memset_chk_erms) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__memset_chk_erms) + + /* Only used to measure performance of REP STOSB. */ + ENTRY (__memset_erms) + /* Skip zero length. */ +- testq %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jnz L(stosb) + movq %rdi, %rax + ret +@@ -131,11 +135,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms)) + L(stosb): + /* Issue vzeroupper before rep stosb. 
*/ + VZEROUPPER +- movq %rdx, %rcx ++ mov %RDX_LP, %RCX_LP + movzbl %sil, %eax +- movq %rdi, %rdx ++ mov %RDI_LP, %RDX_LP + rep stosb +- movq %rdx, %rax ++ mov %RDX_LP, %RAX_LP + ret + # if VEC_SIZE == 16 + END (__memset_erms) +@@ -145,16 +149,20 @@ END (MEMSET_SYMBOL (__memset, erms)) + + # if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + + ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +- cmpq $VEC_SIZE, %rdx ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx ++# endif ++ cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) +- cmpq $(VEC_SIZE * 2), %rdx ++ cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index e99dbd7c..98bd9ae9 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -7,9 +7,9 @@ endif + + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ +- tst-size_t-memrchr ++ tst-size_t-memrchr tst-size_t-memset + endif + + ifeq ($(subdir),wcsmbs) +-tests += tst-size_t-wmemchr tst-size_t-wmemcmp ++tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset + endif +diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c +new file mode 100644 +index 00000000..2c367af6 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memset.c +@@ -0,0 +1,73 @@ ++/* Test memset with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifdef WIDE ++# define TEST_NAME "wmemset" ++#else ++# define TEST_NAME "memset" ++#endif /* WIDE */ ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++# define MEMSET wmemset ++# define CHAR wchar_t ++#else ++# define MEMSET memset ++# define CHAR char ++#endif /* WIDE */ ++ ++IMPL (MEMSET, 1) ++ ++typedef CHAR *(*proto_t) (CHAR *, int, size_t); ++ ++static void * ++__attribute__ ((noinline, noclone)) ++do_memset (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, (uintptr_t) b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ CHAR ch = 0x23; ++ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 }; ++ parameter_t c = { { 0 }, (void *) (uintptr_t) ch }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ c.fn = impl->fn; ++ CHAR *p = (CHAR *) do_memset (src, c); ++ size_t i; ++ for (i = 0; i < src.len; i++) ++ if (p[i] != ch) ++ { ++ error (0, 0, "Wrong result in function %s", impl->name); ++ ret = 1; ++ } ++ } ++ ++ return ret ? 
EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c +new file mode 100644 +index 00000000..955eb488 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c +@@ -0,0 +1,20 @@ ++/* Test wmemset with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-memset.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-50.patch b/glibc-RHEL-15696-50.patch new file mode 100644 index 0000000..e896698 --- /dev/null +++ b/glibc-RHEL-15696-50.patch @@ -0,0 +1,43 @@ +From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001 +Author: Shen-Ta Hsieh 2021-05-23 21:43:10 +Committer: H.J. Lu 2021-06-27 10:56:57 +Parent: 2c16cb88a6e5ace0fb7cedca86860ea7bde522a7 (Linux: Move timer helper routines from librt to libc) +Child: 1683249d17e14827b6579529742eb895027dfa84 (x86_64: roundeven with sse4.1 support) +Branches: master, remotes/origin/master and many more (41) +Follows: glibc-2.33.9000 +Precedes: glibc-2.34 + + math: redirect roundeven function + + This patch redirect roundeven function for futhermore changes. + + Signed-off-by: Shen-Ta Hsieh + Reviewed-by: H.J. Lu + +Conflicts: + * + (rewritten for older branch) + +diff --git a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c +index 7bbbb2dc..8728d0f2 100644 +--- a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c ++++ b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . */ + ++#define NO_MATH_REDIRECT + #include + #include + #include +@@ -67,5 +68,6 @@ __roundeven (double x) + INSERT_WORDS64 (x, ix); + return x; + } +-hidden_def (__roundeven) ++#ifndef __roundeven + libm_alias_double (__roundeven, roundeven) ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-51.patch b/glibc-RHEL-15696-51.patch new file mode 100644 index 0000000..105843d --- /dev/null +++ b/glibc-RHEL-15696-51.patch @@ -0,0 +1,118 @@ +From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001 +From: Shen-Ta Hsieh +Date: Mon, 24 May 2021 09:43:10 +0800 +Subject: [PATCH] math: redirect roundeven function +Content-type: text/plain; charset=UTF-8 + +This patch redirect roundeven function for futhermore changes. + +Signed-off-by: Shen-Ta Hsieh +Reviewed-by: H.J. 
Lu +--- + include/math.h | 3 ++- + sysdeps/ieee754/dbl-64/s_roundeven.c | 4 +++- + sysdeps/ieee754/float128/s_roundevenf128.c | 1 + + sysdeps/ieee754/flt-32/s_roundevenf.c | 3 +++ + sysdeps/ieee754/ldbl-128/s_roundevenl.c | 1 + + sysdeps/ieee754/ldbl-96/s_roundevenl.c | 1 + + 6 files changed, 11 insertions(+), 2 deletions(-) + +Conflicts: + include/math.h + (missing MATH_REDIRECT macros) + +diff --git a/include/math.h b/include/math.h +index e21d34b8..1f9f9a54 100644 +--- a/include/math.h ++++ b/include/math.h +@@ -38,7 +38,6 @@ libm_hidden_proto (__issignaling) + libm_hidden_proto (__issignalingf) + libm_hidden_proto (__exp) + libm_hidden_proto (__expf) +-libm_hidden_proto (__roundeven) + + # ifndef __NO_LONG_DOUBLE_MATH + libm_hidden_proto (__fpclassifyl) +@@ -56,6 +55,8 @@ libm_hidden_proto (__expm1f128) + + # if !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0) + # ifndef NO_MATH_REDIRECT ++float (roundevenf) (float) asm ("__roundevenf"); ++double (roundeven) (double) asm ("__roundeven"); + /* Declare sqrt for use within GLIBC. Compilers typically inline sqrt as a + single instruction. Use an asm to avoid use of PLTs if it doesn't. */ + float (sqrtf) (float) asm ("__ieee754_sqrtf"); +diff --git a/sysdeps/ieee754/dbl-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/s_roundeven.c +index 1438e81d..61962184 100644 +--- a/sysdeps/ieee754/dbl-64/s_roundeven.c ++++ b/sysdeps/ieee754/dbl-64/s_roundeven.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . */ + ++#define NO_MATH_REDIRECT + #include + #include + #include +@@ -101,5 +102,6 @@ __roundeven (double x) + INSERT_WORDS (x, hx, lx); + return x; + } +-hidden_def (__roundeven) ++#ifndef __roundeven + libm_alias_double (__roundeven, roundeven) ++#endif +diff --git a/sysdeps/ieee754/float128/s_roundevenf128.c b/sysdeps/ieee754/float128/s_roundevenf128.c +index 5a9b3f39..e0faf727 100644 +--- a/sysdeps/ieee754/float128/s_roundevenf128.c ++++ b/sysdeps/ieee754/float128/s_roundevenf128.c +@@ -1,2 +1,3 @@ ++#define NO_MATH_REDIRECT + #include + #include "../ldbl-128/s_roundevenl.c" +diff --git a/sysdeps/ieee754/flt-32/s_roundevenf.c b/sysdeps/ieee754/flt-32/s_roundevenf.c +index 90f991d5..a661875e 100644 +--- a/sysdeps/ieee754/flt-32/s_roundevenf.c ++++ b/sysdeps/ieee754/flt-32/s_roundevenf.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . */ + ++#define NO_MATH_REDIRECT + #include + #include + #include +@@ -67,4 +68,6 @@ __roundevenf (float x) + SET_FLOAT_WORD (x, ix); + return x; + } ++#ifndef __roundevenf + libm_alias_float (__roundeven, roundeven) ++#endif +diff --git a/sysdeps/ieee754/ldbl-128/s_roundevenl.c b/sysdeps/ieee754/ldbl-128/s_roundevenl.c +index 5fc59af4..b9375b6c 100644 +--- a/sysdeps/ieee754/ldbl-128/s_roundevenl.c ++++ b/sysdeps/ieee754/ldbl-128/s_roundevenl.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . */ + ++#define NO_MATH_REDIRECT + #include + #include + #include +diff --git a/sysdeps/ieee754/ldbl-96/s_roundevenl.c b/sysdeps/ieee754/ldbl-96/s_roundevenl.c +index be2e4fa4..65031ab7 100644 +--- a/sysdeps/ieee754/ldbl-96/s_roundevenl.c ++++ b/sysdeps/ieee754/ldbl-96/s_roundevenl.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . 
*/ + ++#define NO_MATH_REDIRECT + #include + #include + #include +-- +GitLab + diff --git a/glibc-RHEL-15696-52.patch b/glibc-RHEL-15696-52.patch new file mode 100644 index 0000000..4602f51 --- /dev/null +++ b/glibc-RHEL-15696-52.patch @@ -0,0 +1,242 @@ +From 1683249d17e14827b6579529742eb895027dfa84 Mon Sep 17 00:00:00 2001 +From: Shen-Ta Hsieh +Date: Mon, 24 May 2021 09:43:11 +0800 +Subject: [PATCH] x86_64: roundeven with sse4.1 support +Content-type: text/plain; charset=UTF-8 + +This patch adds support for the sse4.1 hardware floating point +roundeven. + +Here is some benchmark results on my systems: + +=AMD Ryzen 9 3900X 12-Core Processor= + +* benchmark result before this commit +| | roundeven | roundevenf | +|------------|--------------|--------------| +| duration | 3.75587e+09 | 3.75114e+09 | +| iterations | 3.93053e+08 | 4.35402e+08 | +| max | 52.592 | 58.71 | +| min | 7.98 | 7.22 | +| mean | 9.55563 | 8.61535 | + +* benchmark result after this commit +| | roundeven | roundevenf | +|------------|---------------|--------------| +| duration | 3.73815e+09 | 3.73738e+09 | +| iterations | 5.82692e+08 | 5.91498e+08 | +| max | 56.468 | 51.642 | +| min | 6.27 | 6.156 | +| mean | 6.41532 | 6.3185 | + +=Intel(R) Pentium(R) CPU D1508 @ 2.20GHz= + +* benchmark result before this commit +| | roundeven | roundevenf | +|------------|--------------|--------------| +| duration | 2.18208e+09 | 2.18258e+09 | +| iterations | 2.39932e+08 | 2.46924e+08 | +| max | 96.378 | 98.035 | +| min | 6.776 | 5.94 | +| mean | 9.09456 | 8.83907 | + +* benchmark result after this commit +| | roundeven | roundevenf | +|------------|--------------|--------------| +| duration | 2.17415e+09 | 2.17005e+09 | +| iterations | 3.56193e+08 | 4.09824e+08 | +| max | 51.693 | 97.192 | +| min | 5.926 | 5.093 | +| mean | 6.10385 | 5.29507 | + +Signed-off-by: Shen-Ta Hsieh +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/fpu/multiarch/Makefile | 5 +-- + sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c | 2 ++ + .../x86_64/fpu/multiarch/s_roundeven-sse4_1.S | 24 ++++++++++++++ + sysdeps/x86_64/fpu/multiarch/s_roundeven.c | 31 +++++++++++++++++++ + sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c | 3 ++ + .../fpu/multiarch/s_roundevenf-sse4_1.S | 24 ++++++++++++++ + sysdeps/x86_64/fpu/multiarch/s_roundevenf.c | 31 +++++++++++++++++++ + 7 files changed, 118 insertions(+), 2 deletions(-) + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf.c + +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index 9f387248..6ddd1c01 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -1,11 +1,12 @@ + ifeq ($(subdir),math) + libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \ + s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \ +- s_trunc-c s_truncf-c ++ s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c + + libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \ + s_floorf-sse4_1 s_nearbyint-sse4_1 \ +- s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \ ++ s_nearbyintf-sse4_1 s_roundeven-sse4_1 \ ++ s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \ + s_trunc-sse4_1 s_truncf-sse4_1 + + libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \ +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c +new file mode 100644 +index 00000000..c7be43cb +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c +@@ -0,0 +1,2 @@ ++#define __roundeven __roundeven_c ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S +new file mode 100644 +index 00000000..6ae8f6b1 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S +@@ -0,0 +1,24 @@ ++/* Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++ .section .text.sse4.1,"ax",@progbits ++ENTRY(__roundeven_sse41) ++ roundsd $8, %xmm0, %xmm0 ++ ret ++END(__roundeven_sse41) +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c +new file mode 100644 +index 00000000..d92eda65 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c +@@ -0,0 +1,31 @@ ++/* Multiple versions of __roundeven. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define roundeven __redirect_roundeven ++#define __roundeven __redirect___roundeven ++#include ++#undef roundeven ++#undef __roundeven ++ ++#define SYMBOL_NAME roundeven ++#include "ifunc-sse4_1.h" ++ ++libc_ifunc_redirected (__redirect_roundeven, __roundeven, IFUNC_SELECTOR ()); ++libm_alias_double (__roundeven, roundeven) +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c +new file mode 100644 +index 00000000..72a6e7d1 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c +@@ -0,0 +1,3 @@ ++#undef __roundevenf ++#define __roundevenf __roundevenf_c ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S +new file mode 100644 +index 00000000..a76e1080 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S +@@ -0,0 +1,24 @@ ++/* Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++ .section .text.sse4.1,"ax",@progbits ++ENTRY(__roundevenf_sse41) ++ roundss $8, %xmm0, %xmm0 ++ ret ++END(__roundevenf_sse41) +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c +new file mode 100644 +index 00000000..2ee196e6 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c +@@ -0,0 +1,31 @@ ++/* Multiple versions of __roundevenf. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++#define roundevenf __redirect_roundevenf ++#define __roundevenf __redirect___roundevenf ++#include ++#undef roundevenf ++#undef __roundevenf ++ ++#define SYMBOL_NAME roundevenf ++#include "ifunc-sse4_1.h" ++ ++libc_ifunc_redirected (__redirect_roundevenf, __roundevenf, IFUNC_SELECTOR ()); ++libm_alias_float (__roundeven, roundeven) +-- +GitLab + diff --git a/glibc-RHEL-15696-53.patch b/glibc-RHEL-15696-53.patch new file mode 100644 index 0000000..7221d38 --- /dev/null +++ b/glibc-RHEL-15696-53.patch @@ -0,0 +1,41 @@ +From 7e08db3359c86c94918feb33a1182cd0ff3bb10b Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sun, 9 Jan 2022 16:02:28 -0600 +Subject: [PATCH] x86: Fix __wcsncmp_evex in strcmp-evex.S [BZ# 28755] +Content-type: text/plain; charset=UTF-8 + +Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to +__wcscmp_evex. For x86_64 this covers the entire address range so any +length larger could not possibly be used to bound `s1` or `s2`. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 459eeed0..d5aa6daa 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -97,6 +97,16 @@ ENTRY (STRCMP) + je L(char0) + jb L(zero) + # ifdef USE_AS_WCSCMP ++# ifndef __ILP32__ ++ movq %rdx, %rcx ++ /* Check if length could overflow when multiplied by ++ sizeof(wchar_t). Checking top 8 bits will cover all potential ++ overflow cases as well as redirect cases where its impossible to ++ length to bound a valid memory region. In these cases just use ++ 'wcscmp'. */ ++ shrq $56, %rcx ++ jnz __wcscmp_evex ++# endif + /* Convert units: from wide to byte char. */ + shl $2, %RDX_LP + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-54.patch b/glibc-RHEL-15696-54.patch new file mode 100644 index 0000000..b2aaaa1 --- /dev/null +++ b/glibc-RHEL-15696-54.patch @@ -0,0 +1,268 @@ +From 78c9ec9000f873abe7a15a91b87080a2e4308260 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 20 Aug 2021 06:42:24 -0700 +Subject: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ + #28252] +Content-type: text/plain; charset=UTF-8 + +Optimize loads of all bits set into ZMM register in AVX512 SVML codes +by replacing + + vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX + +and + + vmovups .L_2il0floatpacket.13(%rip), %zmmX + +with + vpternlogd $0xff, %zmmX, %zmmX, %zmmX + +This fixes BZ #28252. 
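As a side illustration of the idiom this commit introduces (not taken from the patch; the sketch below assumes a compiler and CPU with AVX512F, built with -mavx512f), the same "all bits set" vector can be produced from intrinsics with a ternary-logic immediate of 0xff, avoiding the .rodata constant and its PC-relative load.

/* Hypothetical sketch of the vpternlogd $0xff idiom: a truth table of 0xff
   evaluates to 1 for every input combination, so the instruction yields
   all-ones regardless of what the source register held.  Here the source
   is explicitly zeroed to keep the C well defined.  */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  __m512i x = _mm512_setzero_si512 ();
  __m512i ones = _mm512_ternarylogic_epi32 (x, x, x, 0xff);  /* vpternlogd */

  uint32_t lanes[16];
  _mm512_storeu_si512 (lanes, ones);
  printf ("lane 0 = %#x\n", lanes[0]);   /* prints 0xffffffff */
  return 0;
}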
+--- + .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------ + .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++---------- + .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------ + 10 files changed, 11 insertions(+), 64 deletions(-) + +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S +index 24e3b363..07dfed85 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S +@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos + vmovaps %zmm0, %zmm8 + + /* Check for large arguments path */ +- vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 ++ vpternlogd $0xff, %zmm2, %zmm2, %zmm2 + + /* + ARGUMENT RANGE REDUCTION: +@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_cos_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.16: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.16,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S +index ae8af8d8..ddb60e5b 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S +@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log + + /* preserve mantissa, set input exponent to 2^(-10) */ + vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 +- vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 ++ vpternlogd $0xff, %zmm1, %zmm1, %zmm1 + vpsrlq $32, %zmm4, %zmm6 + + /* reciprocal approximation good to at least 11 bits */ +@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_log_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.12: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.12,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S +index 2d4b14fd..529c454a 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S +@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin + andq $-64, %rsp + subq $1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax +- vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 ++ vpternlogd $0xff, %zmm1, %zmm1, %zmm14 + vmovups __dAbsMask(%rax), %zmm7 + vmovups __dInvPI(%rax), %zmm2 + vmovups __dRShifter(%rax), %zmm1 +@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_sin_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.14: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.14,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S +index 2df626c0..e501a53a 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S +@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos + + /* SinPoly = 
SinR*SinPoly */ + vfmadd213pd %zmm5, %zmm5, %zmm4 +- vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 ++ vpternlogd $0xff, %zmm3, %zmm3, %zmm3 + + /* Update Cos result's sign */ + vxorpd %zmm2, %zmm1, %zmm1 +@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl) + ENTRY (_ZGVeN8vvv_sincos_skx) + WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx + END (_ZGVeN8vvv_sincos_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.15: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.15,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S +index 6ea1137b..377af394 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S +@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf + X = X - Y*PI1 - Y*PI2 - Y*PI3 + */ + vmovaps %zmm0, %zmm6 +- vmovups .L_2il0floatpacket.13(%rip), %zmm12 ++ vpternlogd $0xff, %zmm12, %zmm12, %zmm12 + vmovups __sRShifter(%rax), %zmm3 + vmovups __sPI1_FMA(%rax), %zmm5 + vmovups __sA9_FMA(%rax), %zmm9 +@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf + jmp .LBL_2_7 + #endif + END (_ZGVeN16v_cosf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S +index 89ba0df2..46f33d46 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S +@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf + vmovaps %zmm0, %zmm7 + + /* compare against threshold */ +- vmovups .L_2il0floatpacket.13(%rip), %zmm3 ++ vpternlogd $0xff, %zmm3, %zmm3, %zmm3 + vmovups __sInvLn2(%rax), %zmm4 + vmovups __sShifter(%rax), %zmm1 + vmovups __sLn2hi(%rax), %zmm6 +@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf + + #endif + END (_ZGVeN16v_expf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S +index 4cf0a96f..9e254956 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S +@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf + andq $-64, %rsp + subq $1280, %rsp + movq __svml_slog_data@GOTPCREL(%rip), %rax +- vmovups .L_2il0floatpacket.7(%rip), %zmm6 ++ vpternlogd $0xff, %zmm6, %zmm6, %zmm6 + vmovups _iBrkValue(%rax), %zmm4 + vmovups _sPoly_7(%rax), %zmm8 + +@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf + + #endif + END (_ZGVeN16v_logf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.7: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.7,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S +index bdcd50af..e8331ba1 100644 +--- 
a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S +@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf + vpsrlq $32, %zmm3, %zmm2 + vpmovqd %zmm2, %ymm11 + vcvtps2pd %ymm14, %zmm13 +- vmovups .L_2il0floatpacket.23(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + vmovaps %zmm14, %zmm26 + vpandd _ABSMASK(%rax), %zmm1, %zmm8 + vpcmpd $1, _INF(%rax), %zmm8, %k2 +@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf + vpmovqd %zmm11, %ymm5 + vpxord %zmm10, %zmm10, %zmm10 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} +- vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 ++ vpternlogd $0xff, %zmm4, %zmm4, %zmm4 + vpxord %zmm11, %zmm11, %zmm11 + vcvtdq2pd %ymm7, %zmm7 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} +@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf + jmp .LBL_2_7 + #endif + END (_ZGVeN16vv_powf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.23: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.23,@object +-.L_2il0floatpacket.24: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.24,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S +index 5fa4bc41..1f46f334 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S +@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf + + /* Result sign calculations */ + vpternlogd $150, %zmm0, %zmm14, %zmm1 +- vmovups .L_2il0floatpacket.13(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + + /* Add correction term 0.5 for cos() part */ + vaddps %zmm8, %zmm5, %zmm15 +@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl) + ENTRY (_ZGVeN16vvv_sincosf_skx) + WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx + END (_ZGVeN16vvv_sincosf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S +index 141f747e..1fc9308a 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S +@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + + /* Check for large and special values */ +- vmovups .L_2il0floatpacket.11(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + vmovups __sAbsMask(%rax), %zmm5 + vmovups __sInvPI(%rax), %zmm1 + vmovups __sRShifter(%rax), %zmm2 +@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf + jmp .LBL_2_7 + #endif + END (_ZGVeN16v_sinf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.11: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.11,@object +-- +GitLab + diff --git a/glibc-RHEL-15696-55.patch b/glibc-RHEL-15696-55.patch new file mode 100644 index 0000000..d44eef1 --- /dev/null +++ b/glibc-RHEL-15696-55.patch @@ 
-0,0 +1,48 @@ +From fc5bd179ef3a953dff8d1655bd530d0e230ffe71 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 21 Sep 2021 18:31:49 -0500 +Subject: [PATCH] x86: Modify ENTRY in sysdep.h so that p2align can be + specified +Content-type: text/plain; charset=UTF-8 + +No bug. + +This change adds a new macro ENTRY_P2ALIGN which takes a second +argument, log2 of the desired function alignment. + +The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this +doesn't affect any existing functionality. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86/sysdep.h | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index 01bac0f6..a70bb3a2 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -78,15 +78,18 @@ enum cf_protection_level + #define ASM_SIZE_DIRECTIVE(name) .size name,.-name; + + /* Define an entry point visible from C. */ +-#define ENTRY(name) \ ++#define ENTRY_P2ALIGN(name, alignment) \ + .globl C_SYMBOL_NAME(name); \ + .type C_SYMBOL_NAME(name),@function; \ +- .align ALIGNARG(4); \ ++ .align ALIGNARG(alignment); \ + C_LABEL(name) \ + cfi_startproc; \ + _CET_ENDBR; \ + CALL_MCOUNT + ++/* Common entry 16 byte aligns. */ ++#define ENTRY(name) ENTRY_P2ALIGN (name, 4) ++ + #undef END + #define END(name) \ + cfi_endproc; \ +-- +GitLab + diff --git a/glibc-RHEL-15696-56.patch b/glibc-RHEL-15696-56.patch new file mode 100644 index 0000000..45b9975 --- /dev/null +++ b/glibc-RHEL-15696-56.patch @@ -0,0 +1,658 @@ +From 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 21 Sep 2021 18:45:03 -0500 +Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S for frontend behavior and + size +Content-type: text/plain; charset=UTF-8 + +No bug. + +The frontend optimizations are to: +1. Reorganize logically connected basic blocks so they are either in + the same cache line or adjacent cache lines. +2. Avoid cases when basic blocks unnecissarily cross cache lines. +3. Try and 32 byte align any basic blocks possible without sacrificing + code size. Smaller / Less hot basic blocks are used for this. + +Overall code size shrunk by 168 bytes. This should make up for any +extra costs due to aligning to 64 bytes. + +In general performance before deviated a great deal dependending on +whether entry alignment % 64 was 0, 16, 32, or 48. These changes +essentially make it so that the current implementation is at least +equal to the best alignment of the original for any arguments. + +The only additional optimization is in the page cross case. Branch on +equals case was removed from the size == [4, 7] case. As well the [4, +7] and [2, 3] case where swapped as [4, 7] is likely a more hot +argument size. + +test-memcmp and test-wmemcmp are both passing. +--- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 434 +++++++++++-------- + 1 file changed, 242 insertions(+), 192 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 654dc7ac..2761b54f 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -34,7 +34,24 @@ + area. + 7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less. + 8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less. +- 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. */ ++ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. 
++ ++When possible the implementation tries to optimize for frontend in the ++following ways: ++Throughput: ++ 1. All code sections that fit are able to run optimally out of the ++ LSD. ++ 2. All code sections that fit are able to run optimally out of the ++ DSB ++ 3. Basic blocks are contained in minimum number of fetch blocks ++ necessary. ++ ++Latency: ++ 1. Logically connected basic blocks are put in the same ++ cache-line. ++ 2. Logically connected basic blocks that do not fit in the same ++ cache-line are put in adjacent lines. This can get beneficial ++ L2 spatial prefetching and L1 next-line prefetching. */ + + # include + +@@ -47,9 +64,11 @@ + # ifdef USE_AS_WMEMCMP + # define CHAR_SIZE 4 + # define VPCMP vpcmpd ++# define VPTEST vptestmd + # else + # define CHAR_SIZE 1 + # define VPCMP vpcmpub ++# define VPTEST vptestmb + # endif + + # define VEC_SIZE 32 +@@ -75,7 +94,9 @@ + */ + + .section .text.evex,"ax",@progbits +-ENTRY (MEMCMP) ++/* Cache align memcmp entry. This allows for much more thorough ++ frontend optimization. */ ++ENTRY_P2ALIGN (MEMCMP, 6) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -89,7 +110,7 @@ ENTRY (MEMCMP) + VPCMP $4, (%rdi), %YMM1, %k1 + kmovd %k1, %eax + /* NB: eax must be destination register if going to +- L(return_vec_[0,2]). For L(return_vec_3 destination register ++ L(return_vec_[0,2]). For L(return_vec_3) destination register + must be ecx. */ + testl %eax, %eax + jnz L(return_vec_0) +@@ -121,10 +142,6 @@ ENTRY (MEMCMP) + testl %ecx, %ecx + jnz L(return_vec_3) + +- /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so +- compare with zero to get a mask is needed. */ +- vpxorq %XMM0, %XMM0, %XMM0 +- + /* Go to 4x VEC loop. */ + cmpq $(CHAR_PER_VEC * 8), %rdx + ja L(more_8x_vec) +@@ -148,47 +165,61 @@ ENTRY (MEMCMP) + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 +- /* Or together YMM1, YMM2, and YMM3 into YMM3. */ +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while +- oring with YMM3. Result is stored in YMM4. */ +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 +- /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */ +- VPCMP $4, %YMM4, %YMM0, %k1 ++ oring with YMM1. Result is stored in YMM4. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ ++ /* Or together YMM2, YMM3, and YMM4 into YMM4. */ ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ ++ /* Test YMM4 against itself. Store any CHAR mismatches in k1. ++ */ ++ VPTEST %YMM4, %YMM4, %k1 ++ /* k1 must go to ecx for L(return_vec_0_1_2_3). */ + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(return_vec_0_1_2_3) + /* NB: eax must be zero to reach here. */ + ret + +- /* NB: aligning 32 here allows for the rest of the jump targets +- to be tuned for 32 byte alignment. Most important this ensures +- the L(more_8x_vec) loop is 32 byte aligned. */ +- .p2align 5 +-L(less_vec): +- /* Check if one or less CHAR. This is necessary for size = 0 but +- is also faster for size = CHAR_SIZE. */ +- cmpl $1, %edx +- jbe L(one_or_less) ++ .p2align 4 ++L(8x_end_return_vec_0_1_2_3): ++ movq %rdx, %rdi ++L(8x_return_vec_0_1_2_3): ++ addq %rdi, %rsi ++L(return_vec_0_1_2_3): ++ VPTEST %YMM1, %YMM1, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) + +- /* Check if loading one VEC from either s1 or s2 could cause a +- page cross. This can have false positives but is by far the +- fastest method. 
*/ +- movl %edi, %eax +- orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(page_cross_less_vec) ++ VPTEST %YMM2, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_1) + +- /* No page cross possible. */ +- VMOVU (%rsi), %YMM2 +- VPCMP $4, (%rdi), %YMM2, %k1 +- kmovd %k1, %eax +- /* Create mask in ecx for potentially in bound matches. */ +- bzhil %edx, %eax, %eax +- jnz L(return_vec_0) ++ VPTEST %YMM3, %YMM3, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_2) ++L(return_vec_3): ++ /* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one ++ fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache ++ line. */ ++ bsfl %ecx, %ecx ++# ifdef USE_AS_WMEMCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif + ret + + .p2align 4 +@@ -209,10 +240,11 @@ L(return_vec_0): + # endif + ret + +- /* NB: No p2align necessary. Alignment % 16 is naturally 1 +- which is good enough for a target not in a loop. */ ++ .p2align 4 + L(return_vec_1): +- tzcntl %eax, %eax ++ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one ++ fetch block. */ ++ bsfl %eax, %eax + # ifdef USE_AS_WMEMCMP + movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +@@ -226,10 +258,11 @@ L(return_vec_1): + # endif + ret + +- /* NB: No p2align necessary. Alignment % 16 is naturally 2 +- which is good enough for a target not in a loop. */ ++ .p2align 4,, 10 + L(return_vec_2): +- tzcntl %eax, %eax ++ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one ++ fetch block. */ ++ bsfl %eax, %eax + # ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +@@ -243,40 +276,6 @@ L(return_vec_2): + # endif + ret + +- .p2align 4 +-L(8x_return_vec_0_1_2_3): +- /* Returning from L(more_8x_vec) requires restoring rsi. */ +- addq %rdi, %rsi +-L(return_vec_0_1_2_3): +- VPCMP $4, %YMM1, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_0) +- +- VPCMP $4, %YMM2, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_1) +- +- VPCMP $4, %YMM3, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_2) +-L(return_vec_3): +- tzcntl %ecx, %ecx +-# ifdef USE_AS_WMEMCMP +- movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax +- xorl %edx, %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx +- subl %ecx, %eax +-# endif +- ret +- + .p2align 4 + L(more_8x_vec): + /* Set end of s1 in rdx. */ +@@ -288,21 +287,19 @@ L(more_8x_vec): + andq $-VEC_SIZE, %rdi + /* Adjust because first 4x vec where check already. 
*/ + subq $-(VEC_SIZE * 4), %rdi ++ + .p2align 4 + L(loop_4x_vec): + VMOVU (%rsi, %rdi), %YMM1 + vpxorq (%rdi), %YMM1, %YMM1 +- + VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 + vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2 +- + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 +- + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 +- VPCMP $4, %YMM4, %YMM0, %k1 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(8x_return_vec_0_1_2_3) +@@ -319,28 +316,25 @@ L(loop_4x_vec): + cmpl $(VEC_SIZE * 2), %edi + jae L(8x_last_2x_vec) + ++ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 ++ + VMOVU (%rsi, %rdx), %YMM1 + vpxorq (%rdx), %YMM1, %YMM1 + + VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 + vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 +- +- vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 +- + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4 +- VPCMP $4, %YMM4, %YMM0, %k1 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +- /* Restore s1 pointer to rdi. */ +- movq %rdx, %rdi + testl %ecx, %ecx +- jnz L(8x_return_vec_0_1_2_3) ++ jnz L(8x_end_return_vec_0_1_2_3) + /* NB: eax must be zero to reach here. */ + ret + + /* Only entry is from L(more_8x_vec). */ +- .p2align 4 ++ .p2align 4,, 10 + L(8x_last_2x_vec): + VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 + kmovd %k1, %eax +@@ -355,7 +349,31 @@ L(8x_last_1x_vec): + jnz L(8x_return_vec_3) + ret + +- .p2align 4 ++ /* Not ideally aligned (at offset +9 bytes in fetch block) but ++ not aligning keeps it in the same cache line as ++ L(8x_last_1x/2x_vec) so likely worth it. As well, saves code ++ size. */ ++ .p2align 4,, 4 ++L(8x_return_vec_2): ++ subq $VEC_SIZE, %rdx ++L(8x_return_vec_3): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ leaq (%rdx, %rax, CHAR_SIZE), %rax ++ movl (VEC_SIZE * 3)(%rax), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ addq %rdx, %rax ++ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 3)(%rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4,, 10 + L(last_2x_vec): + /* Check second to last VEC. */ + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1 +@@ -374,26 +392,49 @@ L(last_1x_vec): + jnz L(return_vec_0_end) + ret + +- .p2align 4 +-L(8x_return_vec_2): +- subq $VEC_SIZE, %rdx +-L(8x_return_vec_3): +- tzcntl %eax, %eax ++ .p2align 4,, 10 ++L(return_vec_1_end): ++ /* Use bsf to save code size. This is necessary to have ++ L(one_or_less) fit in aligning bytes between. */ ++ bsfl %eax, %eax ++ addl %edx, %eax + # ifdef USE_AS_WMEMCMP +- leaq (%rdx, %rax, CHAR_SIZE), %rax +- movl (VEC_SIZE * 3)(%rax), %ecx ++ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax + # else +- addq %rdx, %rax +- movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx +- movzbl (VEC_SIZE * 3)(%rax), %eax ++ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax + subl %ecx, %eax + # endif + ret + ++ /* NB: L(one_or_less) fits in alignment padding between ++ L(return_vec_1_end) and L(return_vec_0_end). 
*/ ++# ifdef USE_AS_WMEMCMP ++L(one_or_less): ++ jb L(zero) ++ movl (%rdi), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi), %ecx ++ je L(zero) ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++ ret ++# else ++L(one_or_less): ++ jb L(zero) ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax ++ subl %ecx, %eax ++ ret ++# endif ++L(zero): ++ xorl %eax, %eax ++ ret ++ + .p2align 4 + L(return_vec_0_end): + tzcntl %eax, %eax +@@ -412,23 +453,56 @@ L(return_vec_0_end): + ret + + .p2align 4 +-L(return_vec_1_end): ++L(less_vec): ++ /* Check if one or less CHAR. This is necessary for size == 0 ++ but is also faster for size == CHAR_SIZE. */ ++ cmpl $1, %edx ++ jbe L(one_or_less) ++ ++ /* Check if loading one VEC from either s1 or s2 could cause a ++ page cross. This can have false positives but is by far the ++ fastest method. */ ++ movl %edi, %eax ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(page_cross_less_vec) ++ ++ /* No page cross possible. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMP $4, (%rdi), %YMM2, %k1 ++ kmovd %k1, %eax ++ /* Check if any matches where in bounds. Intentionally not ++ storing result in eax to limit dependency chain if it goes to ++ L(return_vec_0_lv). */ ++ bzhil %edx, %eax, %edx ++ jnz L(return_vec_0_lv) ++ xorl %eax, %eax ++ ret ++ ++ /* Essentially duplicate of L(return_vec_0). Ends up not costing ++ any code as shrinks L(less_vec) by allowing 2-byte encoding of ++ the jump and ends up fitting in aligning bytes. As well fits on ++ same cache line as L(less_vec) so also saves a line from having ++ to be fetched on cold calls to memcmp. */ ++ .p2align 4,, 4 ++L(return_vec_0_lv): + tzcntl %eax, %eax +- addl %edx, %eax + # ifdef USE_AS_WMEMCMP +- movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx ++ movl (%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +- cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx ++ cmpl (%rsi, %rax, CHAR_SIZE), %ecx ++ /* NB: no partial register stall here because xorl zero idiom ++ above. */ + setg %dl + leal -1(%rdx, %rdx), %eax + # else +- movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx +- movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax + subl %ecx, %eax + # endif + ret + +- + .p2align 4 + L(page_cross_less_vec): + /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 +@@ -439,108 +513,84 @@ L(page_cross_less_vec): + cmpl $8, %edx + jae L(between_8_15) + cmpl $4, %edx +- jae L(between_4_7) +-L(between_2_3): +- /* Load as big endian to avoid branches. */ +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- /* Subtraction is okay because the upper 8 bits are zero. */ +- subl %ecx, %eax +- ret +- .p2align 4 +-L(one_or_less): +- jb L(zero) +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax ++ jb L(between_2_3) ++ ++ /* Load as big endian with overlapping movbe to avoid branches. ++ */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ /* edx is guranteed to be positive int32 in range [4, 7]. */ ++ cmovne %edx, %eax ++ /* ecx is -1 if rcx > rax. Otherwise 0. */ ++ sbbl %ecx, %ecx ++ /* If rcx > rax, then ecx is 0 and eax is positive. If rcx == ++ rax then eax and ecx are zero. If rax < rax then ecx is -1 so ++ eax doesn't matter. 
*/ ++ orl %ecx, %eax + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(between_8_15): + # endif + /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ +- vmovq (%rdi), %XMM1 +- vmovq (%rsi), %XMM2 +- VPCMP $4, %XMM1, %XMM2, %k1 ++ vmovq (%rdi), %xmm1 ++ vmovq (%rsi), %xmm2 ++ VPCMP $4, %xmm1, %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) ++ jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. */ +- leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi +- leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi +- vmovq (%rdi), %XMM1 +- vmovq (%rsi), %XMM2 +- VPCMP $4, %XMM1, %XMM2, %k1 ++ vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1 ++ vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ VPCMP $4, %xmm1, %xmm2, %k1 ++ addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ret +- +- .p2align 4 +-L(zero): +- xorl %eax, %eax ++ jnz L(return_vec_0_end) + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ +- VMOVU (%rsi), %XMM2 +- VPCMP $4, (%rdi), %XMM2, %k1 ++ ++ /* Use movups to save code size. */ ++ movups (%rsi), %xmm2 ++ VPCMP $4, (%rdi), %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ++ jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. */ +- +- VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2 +- leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi +- leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi +- VPCMP $4, (%rdi), %XMM2, %k1 ++ movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 ++ addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ret +- +-# ifdef USE_AS_WMEMCMP +- .p2align 4 +-L(one_or_less): +- jb L(zero) +- movl (%rdi), %ecx +- xorl %edx, %edx +- cmpl (%rsi), %ecx +- je L(zero) +- setg %dl +- leal -1(%rdx, %rdx), %eax ++ jnz L(return_vec_0_end) + ret +-# else + +- .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- jz L(zero_4_7) +- sbbl %eax, %eax +- orl $1, %eax +-L(zero_4_7): ++# ifndef USE_AS_WMEMCMP ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ /* Subtraction is okay because the upper 8 bits are zero. */ ++ subl %ecx, %eax + ret + # endif +- + END (MEMCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-57.patch b/glibc-RHEL-15696-57.patch new file mode 100644 index 0000000..51d5dd0 --- /dev/null +++ b/glibc-RHEL-15696-57.patch @@ -0,0 +1,510 @@ +From e59ced238482fd71f3e493717f14f6507346741e Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 20 Sep 2021 16:20:15 -0500 +Subject: [PATCH] x86: Optimize memset-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No bug. + +Optimization are + +1. change control flow for L(more_2x_vec) to fall through to loop and + jump for L(less_4x_vec) and L(less_8x_vec). This uses less code + size and saves jumps for length > 4x VEC_SIZE. + +2. For EVEX/AVX512 move L(less_vec) closer to entry. + +3. Avoid complex address mode for length > 2x VEC_SIZE + +4. Slightly better aligning code for the loop from the perspective of + code size and uops. 
+ +5. Align targets so they make full use of their fetch block and if + possible cache line. + +6. Try and reduce total number of icache lines that will need to be + pulled in for a given length. + +7. Include "local" version of stosb target. For AVX2/EVEX/AVX512 + jumping to the stosb target in the sse2 code section will almost + certainly be to a new page. The new version does increase code size + marginally by duplicating the target but should get better iTLB + behavior as a result. + +test-memset, test-wmemset, and test-bzero are all passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/memset.S | 10 +- + .../multiarch/memset-avx2-unaligned-erms.S | 10 +- + .../multiarch/memset-avx512-unaligned-erms.S | 11 +- + .../multiarch/memset-evex-unaligned-erms.S | 11 +- + .../multiarch/memset-vec-unaligned-erms.S | 285 ++++++++++++------ + 5 files changed, 232 insertions(+), 95 deletions(-) + +Conflicts: + sysdeps/x86_64/memset.S + (GNU URL) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index b3426795..8672b030 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -18,13 +18,15 @@ + . */ + + #include ++#define USE_WITH_SSE2 1 + + #define VEC_SIZE 16 ++#define MOV_SIZE 3 ++#define RET_SIZE 1 ++ + #define VEC(i) xmm##i +-/* Don't use movups and movaps since it will get larger nop paddings for +- alignment. */ +-#define VMOVU movdqu +-#define VMOVA movdqa ++#define VMOVU movups ++#define VMOVA movaps + + #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index ae0860f3..1af668af 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -1,8 +1,14 @@ + #if IS_IN (libc) ++# define USE_WITH_AVX2 1 ++ + # define VEC_SIZE 32 ++# define MOV_SIZE 4 ++# define RET_SIZE 4 ++ + # define VEC(i) ymm##i +-# define VMOVU vmovdqu +-# define VMOVA vmovdqa ++ ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 8ad842fc..f14d6f84 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -1,11 +1,18 @@ + #if IS_IN (libc) ++# define USE_WITH_AVX512 1 ++ + # define VEC_SIZE 64 ++# define MOV_SIZE 6 ++# define RET_SIZE 1 ++ + # define XMM0 xmm16 + # define YMM0 ymm16 + # define VEC0 zmm16 + # define VEC(i) VEC##i +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ + # define VZEROUPPER + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 640f0929..64b09e77 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -1,11 +1,18 @@ + #if IS_IN (libc) ++# define USE_WITH_EVEX 1 ++ + # define VEC_SIZE 32 ++# define MOV_SIZE 6 ++# define RET_SIZE 1 ++ + # define XMM0 xmm16 + # define YMM0 ymm16 + # define VEC0 ymm16 + # define VEC(i) VEC##i +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ + # define VZEROUPPER + + # define 
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 909c33f6..f08b7323 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -63,8 +63,27 @@ + # endif + #endif + ++#if VEC_SIZE == 64 ++# define LOOP_4X_OFFSET (VEC_SIZE * 4) ++#else ++# define LOOP_4X_OFFSET (0) ++#endif ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++# define END_REG rcx ++# define LOOP_REG rdi ++#else ++# define END_REG rdi ++# define LOOP_REG rdx ++#endif ++ + #define PAGE_SIZE 4096 + ++/* Macro to calculate size of small memset block for aligning ++ purposes. */ ++#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz) (2 * (mov_sz) + (ret_sz) + 1) ++ ++ + #ifndef SECTION + # error SECTION is not defined! + #endif +@@ -74,6 +93,7 @@ + ENTRY (__bzero) + mov %RDI_LP, %RAX_LP /* Set return value. */ + mov %RSI_LP, %RDX_LP /* Set n. */ ++ xorl %esi, %esi + pxor %XMM0, %XMM0 + jmp L(entry_from_bzero) + END (__bzero) +@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + +-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) ++ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + jb L(less_vec) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +- VMOVU %VEC(0), (%rdi) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. ++ */ ++ VMOVU %VEC(0), (%rax) ++ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) + VZEROUPPER_RETURN +- +- .p2align 4 +-L(stosb_more_2x_vec): +- cmp __x86_rep_stosb_threshold(%rip), %RDX_LP +- ja L(stosb) +-#else +- .p2align 4 + #endif +-L(more_2x_vec): +- /* Stores to first 2x VEC before cmp as any path forward will +- require it. */ +- VMOVU %VEC(0), (%rdi) +- VMOVU %VEC(0), VEC_SIZE(%rdi) +- cmpq $(VEC_SIZE * 4), %rdx +- ja L(loop_start) +- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +-L(return): +-#if VEC_SIZE > 16 +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ ++ .p2align 4,, 10 ++L(last_2x_vec): ++#ifdef USE_LESS_VEC_MASK_STORE ++ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) ++ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) + #else +- ret ++ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) + #endif ++ VZEROUPPER_RETURN + +-L(loop_start): +- VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) +- VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi) +- cmpq $(VEC_SIZE * 8), %rdx +- jbe L(loop_end) +- andq $-(VEC_SIZE * 2), %rdi +- subq $-(VEC_SIZE * 4), %rdi +- leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx +- .p2align 4 +-L(loop): +- VMOVA %VEC(0), (%rdi) +- VMOVA %VEC(0), VEC_SIZE(%rdi) +- VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi) +- VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi) +- subq $-(VEC_SIZE * 4), %rdi +- cmpq %rcx, %rdi +- jb L(loop) +-L(loop_end): +- /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN. +- rdx as length is also unchanged. 
*/ +- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx) +- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx) +- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx) +- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) +- VZEROUPPER_SHORT_RETURN +- +- .p2align 4 ++ /* If have AVX512 mask instructions put L(less_vec) close to ++ entry as it doesn't take much space and is likely a hot target. ++ */ ++#ifdef USE_LESS_VEC_MASK_STORE ++ .p2align 4,, 10 + L(less_vec): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! + # endif +-# ifdef USE_LESS_VEC_MASK_STORE + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. Note that we are using rax which is set in +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. +- */ ++ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ + andl $(PAGE_SIZE - 1), %edi +- /* Check if VEC_SIZE store cross page. Mask stores suffer serious +- performance degradation when it has to fault supress. */ ++ /* Check if VEC_SIZE store cross page. Mask stores suffer ++ serious performance degradation when it has to fault supress. ++ */ + cmpl $(PAGE_SIZE - VEC_SIZE), %edi ++ /* This is generally considered a cold target. */ + ja L(cross_page) + # if VEC_SIZE > 32 + movq $-1, %rcx +@@ -247,58 +235,185 @@ L(less_vec): + bzhil %edx, %ecx, %ecx + kmovd %ecx, %k1 + # endif +- vmovdqu8 %VEC(0), (%rax) {%k1} ++ vmovdqu8 %VEC(0), (%rax){%k1} + VZEROUPPER_RETURN + ++# if defined USE_MULTIARCH && IS_IN (libc) ++ /* Include L(stosb_local) here if including L(less_vec) between ++ L(stosb_more_2x_vec) and ENTRY. This is to cache align the ++ L(stosb_more_2x_vec) target. */ ++ .p2align 4,, 10 ++L(stosb_local): ++ movzbl %sil, %eax ++ mov %RDX_LP, %RCX_LP ++ mov %RDI_LP, %RDX_LP ++ rep stosb ++ mov %RDX_LP, %RAX_LP ++ VZEROUPPER_RETURN ++# endif ++#endif ++ ++#if defined USE_MULTIARCH && IS_IN (libc) + .p2align 4 +-L(cross_page): ++L(stosb_more_2x_vec): ++ cmp __x86_rep_stosb_threshold(%rip), %RDX_LP ++ ja L(stosb_local) ++#endif ++ /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] ++ and (4x, 8x] jump to target. */ ++L(more_2x_vec): ++ ++ /* Two different methods of setting up pointers / compare. The ++ two methods are based on the fact that EVEX/AVX512 mov ++ instructions take more bytes then AVX2/SSE2 mov instructions. As ++ well that EVEX/AVX512 machines also have fast LEA_BID. Both ++ setup and END_REG to avoid complex address mode. For EVEX/AVX512 ++ this saves code size and keeps a few targets in one fetch block. ++ For AVX2/SSE2 this helps prevent AGU bottlenecks. */ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + ++ LOOP_4X_OFFSET) with LEA_BID. */ ++ ++ /* END_REG is rcx for EVEX/AVX512. */ ++ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG ++#endif ++ ++ /* Stores to first 2x VEC before cmp as any path forward will ++ require it. */ ++ VMOVU %VEC(0), (%rax) ++ VMOVU %VEC(0), VEC_SIZE(%rax) ++ ++ ++#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) ++ /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ ++ addq %rdx, %END_REG ++#endif ++ ++ cmpq $(VEC_SIZE * 4), %rdx ++ jbe L(last_2x_vec) ++ ++ /* Store next 2x vec regardless. */ ++ VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) ++ VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) ++ ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add ++ extra offset to addresses in loop. 
Used for AVX512 to save space ++ as no way to get (VEC_SIZE * 4) in imm8. */ ++# if LOOP_4X_OFFSET == 0 ++ subq $-(VEC_SIZE * 4), %LOOP_REG + # endif +-# if VEC_SIZE > 32 +- cmpb $32, %dl +- jae L(between_32_63) ++ /* Avoid imm32 compare here to save code size. */ ++ cmpq %rdi, %rcx ++#else ++ addq $-(VEC_SIZE * 4), %END_REG ++ cmpq $(VEC_SIZE * 8), %rdx ++#endif ++ jbe L(last_4x_vec) ++#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) ++ /* Set LOOP_REG (rdx). */ ++ leaq (VEC_SIZE * 4)(%rax), %LOOP_REG ++#endif ++ /* Align dst for loop. */ ++ andq $(VEC_SIZE * -2), %LOOP_REG ++ .p2align 4 ++L(loop): ++ VMOVA %VEC(0), LOOP_4X_OFFSET(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG) ++ subq $-(VEC_SIZE * 4), %LOOP_REG ++ cmpq %END_REG, %LOOP_REG ++ jb L(loop) ++ .p2align 4,, MOV_SIZE ++L(last_4x_vec): ++ VMOVU %VEC(0), LOOP_4X_OFFSET(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG) ++L(return): ++#if VEC_SIZE > 16 ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++#else ++ ret ++#endif ++ ++ .p2align 4,, 10 ++#ifndef USE_LESS_VEC_MASK_STORE ++# if defined USE_MULTIARCH && IS_IN (libc) ++ /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in ++ range for 2-byte jump encoding. */ ++L(stosb_local): ++ movzbl %sil, %eax ++ mov %RDX_LP, %RCX_LP ++ mov %RDI_LP, %RDX_LP ++ rep stosb ++ mov %RDX_LP, %RAX_LP ++ VZEROUPPER_RETURN + # endif +-# if VEC_SIZE > 16 +- cmpb $16, %dl ++ /* Define L(less_vec) only if not otherwise defined. */ ++ .p2align 4 ++L(less_vec): ++#endif ++L(cross_page): ++#if VEC_SIZE > 32 ++ cmpl $32, %edx ++ jae L(between_32_63) ++#endif ++#if VEC_SIZE > 16 ++ cmpl $16, %edx + jae L(between_16_31) +-# endif +- MOVQ %XMM0, %rcx +- cmpb $8, %dl ++#endif ++ MOVQ %XMM0, %rdi ++ cmpl $8, %edx + jae L(between_8_15) +- cmpb $4, %dl ++ cmpl $4, %edx + jae L(between_4_7) +- cmpb $1, %dl ++ cmpl $1, %edx + ja L(between_2_3) +- jb 1f +- movb %cl, (%rax) +-1: ++ jb L(return) ++ movb %sil, (%rax) + VZEROUPPER_RETURN +-# if VEC_SIZE > 32 ++ ++ /* Align small targets only if not doing so would cross a fetch ++ line. */ ++#if VEC_SIZE > 32 ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- VMOVU %YMM0, -32(%rax,%rdx) + VMOVU %YMM0, (%rax) ++ VMOVU %YMM0, -32(%rax, %rdx) + VZEROUPPER_RETURN +-# endif +-# if VEC_SIZE > 16 +- /* From 16 to 31. No branch when size == 16. */ ++#endif ++ ++#if VEC_SIZE >= 32 ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + L(between_16_31): +- VMOVU %XMM0, -16(%rax,%rdx) ++ /* From 16 to 31. No branch when size == 16. */ + VMOVU %XMM0, (%rax) ++ VMOVU %XMM0, -16(%rax, %rdx) + VZEROUPPER_RETURN +-# endif +- /* From 8 to 15. No branch when size == 8. */ ++#endif ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) + L(between_8_15): +- movq %rcx, -8(%rax,%rdx) +- movq %rcx, (%rax) ++ /* From 8 to 15. No branch when size == 8. */ ++ movq %rdi, (%rax) ++ movq %rdi, -8(%rax, %rdx) + VZEROUPPER_RETURN ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %ecx, -4(%rax,%rdx) +- movl %ecx, (%rax) ++ movl %edi, (%rax) ++ movl %edi, -4(%rax, %rdx) + VZEROUPPER_RETURN ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) + L(between_2_3): + /* From 2 to 3. 
No branch when size == 2. */ +- movw %cx, -2(%rax,%rdx) +- movw %cx, (%rax) ++ movw %di, (%rax) ++ movb %dil, -1(%rax, %rdx) + VZEROUPPER_RETURN + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-58.patch b/glibc-RHEL-15696-58.patch new file mode 100644 index 0000000..cec0788 --- /dev/null +++ b/glibc-RHEL-15696-58.patch @@ -0,0 +1,45 @@ +From bad852b61b79503fcb3c5fc379c70f768df3e1fb Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sat, 23 Oct 2021 01:26:47 -0400 +Subject: [PATCH] x86: Replace sse2 instructions with avx in + memcmp-evex-movbe.S +Content-type: text/plain; charset=UTF-8 + +This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'. + +it could potentially be dangerous to use SSE2 if this function is ever +called without using 'vzeroupper' beforehand. While compilers appear +to use 'vzeroupper' before function calls if AVX2 has been used, using +SSE2 here is more brittle. Since it is not absolutely necessary it +should be avoided. + +It costs 2-extra bytes but the extra bytes should only eat into +alignment padding. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 2761b54f..640f6757 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -561,13 +561,13 @@ L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + + /* Use movups to save code size. */ +- movups (%rsi), %xmm2 ++ vmovdqu (%rsi), %xmm2 + VPCMP $4, (%rdi), %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. */ +- movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2 + VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 + addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx + kmovd %k1, %eax +-- +GitLab + diff --git a/glibc-RHEL-15696-59.patch b/glibc-RHEL-15696-59.patch new file mode 100644 index 0000000..efc618c --- /dev/null +++ b/glibc-RHEL-15696-59.patch @@ -0,0 +1,695 @@ +From c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 29 Oct 2021 12:40:20 -0700 +Subject: [PATCH] x86-64: Improve EVEX strcmp with masked load +Content-type: text/plain; charset=UTF-8 + +In strcmp-evex.S, to compare 2 32-byte strings, replace + + VMOVU (%rdi, %rdx), %YMM0 + VMOVU (%rsi, %rdx), %YMM1 + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ + VPCMP $4, %YMM0, %YMM1, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM1, %k2 + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jne L(last_vector) + +with + + VMOVU (%rdi, %rdx), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi, %rdx). */ + VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} + kmovd %k1, %ecx + incl %ecx + jne L(last_vector) + +It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake +and Ice Lake. 
+ +Co-Authored-By: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 461 +++++++++++++------------ + 1 file changed, 243 insertions(+), 218 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index d5aa6daa..82f12ac8 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -41,6 +41,8 @@ + # ifdef USE_AS_WCSCMP + /* Compare packed dwords. */ + # define VPCMP vpcmpd ++# define VPMINU vpminud ++# define VPTESTM vptestmd + # define SHIFT_REG32 r8d + # define SHIFT_REG64 r8 + /* 1 dword char == 4 bytes. */ +@@ -48,6 +50,8 @@ + # else + /* Compare packed bytes. */ + # define VPCMP vpcmpb ++# define VPMINU vpminub ++# define VPTESTM vptestmb + # define SHIFT_REG32 ecx + # define SHIFT_REG64 rcx + /* 1 byte char == 1 byte. */ +@@ -67,6 +71,9 @@ + # define YMM5 ymm22 + # define YMM6 ymm23 + # define YMM7 ymm24 ++# define YMM8 ymm25 ++# define YMM9 ymm26 ++# define YMM10 ymm27 + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -76,7 +83,7 @@ + /* The main idea of the string comparison (byte or dword) using 256-bit + EVEX instructions consists of comparing (VPCMP) two ymm vectors. The + latter can be on either packed bytes or dwords depending on +- USE_AS_WCSCMP. In order to check the null char, algorithm keeps the ++ USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the + matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2 + KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes) + are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd +@@ -123,27 +130,21 @@ ENTRY (STRCMP) + jg L(cross_page) + /* Start comparing 4 vectors. */ + VMOVU (%rdi), %YMM0 +- VMOVU (%rsi), %YMM1 + +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 + +- /* Check for NULL in YMM0. */ +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- /* Check for NULL in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (%rsi). */ ++ VPCMP $0, (%rsi), %YMM0, %k1{%k2} + +- /* Each bit in K1 represents: +- 1. A mismatch in YMM0 and YMM1. Or +- 2. A NULL in YMM0 or YMM1. +- */ +- kord %k0, %k1, %k1 +- +- ktestd %k1, %k1 +- je L(next_3_vectors) + kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ je L(next_3_vectors) + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -172,9 +173,7 @@ L(return): + # endif + ret + +- .p2align 4 + L(return_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -210,9 +209,7 @@ L(return_vec_size): + # endif + ret + +- .p2align 4 + L(return_2_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -248,9 +245,7 @@ L(return_2_vec_size): + # endif + ret + +- .p2align 4 + L(return_3_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +@@ -289,43 +284,45 @@ L(return_3_vec_size): + .p2align 4 + L(next_3_vectors): + VMOVU VEC_SIZE(%rdi), %YMM0 +- VMOVU VEC_SIZE(%rsi), %YMM1 +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at VEC_SIZE(%rsi). */ ++ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_vec_size) + +- VMOVU (VEC_SIZE * 2)(%rdi), %YMM2 +- VMOVU (VEC_SIZE * 3)(%rdi), %YMM3 +- VMOVU (VEC_SIZE * 2)(%rsi), %YMM4 +- VMOVU (VEC_SIZE * 3)(%rsi), %YMM5 +- +- /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */ +- VPCMP $4, %YMM2, %YMM4, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM4, %k2 +- /* Each bit in K1 represents a NULL in YMM2 or YMM4. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ ++ VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_2_vec_size) + +- /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */ +- VPCMP $4, %YMM3, %YMM5, %k0 +- VPCMP $0, %YMMZERO, %YMM3, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k2 +- /* Each bit in K1 represents a NULL in YMM3 or YMM5. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ ++ VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_3_vec_size) + L(main_loop_header): + leaq (VEC_SIZE * 4)(%rdi), %rdx +@@ -375,56 +372,51 @@ L(back_to_loop): + VMOVA VEC_SIZE(%rax), %YMM2 + VMOVA (VEC_SIZE * 2)(%rax), %YMM4 + VMOVA (VEC_SIZE * 3)(%rax), %YMM6 +- VMOVU (%rdx), %YMM1 +- VMOVU VEC_SIZE(%rdx), %YMM3 +- VMOVU (VEC_SIZE * 2)(%rdx), %YMM5 +- VMOVU (VEC_SIZE * 3)(%rdx), %YMM7 +- +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K4 represents a NULL or a mismatch in YMM0 and +- YMM1. */ +- kord %k0, %k1, %k4 +- +- VPCMP $4, %YMM2, %YMM3, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM3, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K5 represents a NULL or a mismatch in YMM2 and +- YMM3. */ +- kord %k0, %k1, %k5 +- +- VPCMP $4, %YMM4, %YMM5, %k0 +- VPCMP $0, %YMMZERO, %YMM4, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K6 represents a NULL or a mismatch in YMM4 and +- YMM5. 
*/ +- kord %k0, %k1, %k6 +- +- VPCMP $4, %YMM6, %YMM7, %k0 +- VPCMP $0, %YMMZERO, %YMM6, %k1 +- VPCMP $0, %YMMZERO, %YMM7, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K7 represents a NULL or a mismatch in YMM6 and +- YMM7. */ +- kord %k0, %k1, %k7 +- +- kord %k4, %k5, %k0 +- kord %k6, %k7, %k1 +- +- /* Test each mask (32 bits) individually because for VEC_SIZE +- == 32 is not possible to OR the four masks and keep all bits +- in a 64-bit integer register, differing from SSE2 strcmp +- where ORing is possible. */ +- kortestd %k0, %k1 +- je L(loop) +- ktestd %k4, %k4 ++ ++ VPMINU %YMM0, %YMM2, %YMM8 ++ VPMINU %YMM4, %YMM6, %YMM9 ++ ++ /* A zero CHAR in YMM8 means that there is a null CHAR. */ ++ VPMINU %YMM8, %YMM9, %YMM8 ++ ++ /* Each bit set in K1 represents a non-null CHAR in YMM8. */ ++ VPTESTM %YMM8, %YMM8, %k1 ++ ++ /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. */ ++ vpxorq (%rdx), %YMM0, %YMM1 ++ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5 ++ vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7 ++ ++ vporq %YMM1, %YMM3, %YMM9 ++ vporq %YMM5, %YMM7, %YMM10 ++ ++ /* A non-zero CHAR in YMM9 represents a mismatch. */ ++ vporq %YMM9, %YMM10, %YMM9 ++ ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR. */ ++ VPCMP $0, %YMMZERO, %YMM9, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ je L(loop) ++ ++ /* Each bit set in K1 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM0 and (%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_vec) +- kmovd %k4, %edi +- tzcntl %edi, %ecx ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +@@ -466,9 +458,18 @@ L(test_vec): + cmpq $VEC_SIZE, %r11 + jbe L(zero) + # endif +- ktestd %k5, %k5 ++ /* Each bit set in K1 represents a non-null CHAR in YMM2. */ ++ VPTESTM %YMM2, %YMM2, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM2 and VEC_SIZE(%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_2_vec) +- kmovd %k5, %ecx + tzcntl %ecx, %edi + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -512,9 +513,18 @@ L(test_2_vec): + cmpq $(VEC_SIZE * 2), %r11 + jbe L(zero) + # endif +- ktestd %k6, %k6 ++ /* Each bit set in K1 represents a non-null CHAR in YMM4. */ ++ VPTESTM %YMM4, %YMM4, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM4 and (VEC_SIZE * 2)(%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_3_vec) +- kmovd %k6, %ecx + tzcntl %ecx, %edi + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -558,8 +568,18 @@ L(test_3_vec): + cmpq $(VEC_SIZE * 3), %r11 + jbe L(zero) + # endif +- kmovd %k7, %esi +- tzcntl %esi, %ecx ++ /* Each bit set in K1 represents a non-null CHAR in YMM6. */ ++ VPTESTM %YMM6, %YMM6, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM6 and (VEC_SIZE * 3)(%rdx). 
*/ ++ VPCMP $0, %YMMZERO, %YMM7, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +@@ -615,39 +635,51 @@ L(loop_cross_page): + + VMOVU (%rax, %r10), %YMM2 + VMOVU VEC_SIZE(%rax, %r10), %YMM3 +- VMOVU (%rdx, %r10), %YMM4 +- VMOVU VEC_SIZE(%rdx, %r10), %YMM5 +- +- VPCMP $4, %YMM4, %YMM2, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM4, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch in YMM2 and +- YMM4. */ +- kord %k0, %k1, %k1 +- +- VPCMP $4, %YMM5, %YMM3, %k3 +- VPCMP $0, %YMMZERO, %YMM3, %k4 +- VPCMP $0, %YMMZERO, %YMM5, %k5 +- kord %k4, %k5, %k4 +- /* Each bit in K3 represents a NULL or a mismatch in YMM3 and +- YMM5. */ +- kord %k3, %k4, %k3 ++ ++ /* Each bit set in K2 represents a non-null CHAR in YMM2. */ ++ VPTESTM %YMM2, %YMM2, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM2 and 32 bytes at (%rdx, %r10). */ ++ VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2} ++ kmovd %k1, %r9d ++ /* Don't use subl since it is the lower 16/32 bits of RDI ++ below. */ ++ notl %r9d ++# ifdef USE_AS_WCSCMP ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %r9d ++# endif ++ ++ /* Each bit set in K4 represents a non-null CHAR in YMM3. */ ++ VPTESTM %YMM3, %YMM3, %k4 ++ /* Each bit cleared in K3 represents a mismatch or a null CHAR ++ in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ ++ VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} ++ kmovd %k3, %edi ++# ifdef USE_AS_WCSCMP ++ /* Don't use subl since it is the upper 8 bits of EDI below. */ ++ notl %edi ++ andl $0xff, %edi ++# else ++ incl %edi ++# endif + + # ifdef USE_AS_WCSCMP +- /* NB: Each bit in K1/K3 represents 4-byte element. */ +- kshiftlw $8, %k3, %k2 ++ /* NB: Each bit in EDI/R9D represents 4-byte element. */ ++ sall $8, %edi + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG32 + sarl $2, %SHIFT_REG32 ++ ++ /* Each bit in EDI represents a null CHAR or a mismatch. */ ++ orl %r9d, %edi + # else +- kshiftlq $32, %k3, %k2 +-# endif ++ salq $32, %rdi + +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korq %k1, %k2, %k1 +- kmovq %k1, %rdi ++ /* Each bit in RDI represents a null CHAR or a mismatch. */ ++ orq %r9, %rdi ++# endif + + /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ + shrxq %SHIFT_REG64, %rdi, %rdi +@@ -692,35 +724,45 @@ L(loop_cross_page_2_vec): + /* The first VEC_SIZE * 2 bytes match or are ignored. */ + VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 + VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 +- VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2 +- VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3 +- +- VPCMP $4, %YMM0, %YMM2, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM2, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch in YMM0 and +- YMM2. */ +- kord %k0, %k1, %k1 +- +- VPCMP $4, %YMM1, %YMM3, %k3 +- VPCMP $0, %YMMZERO, %YMM1, %k4 +- VPCMP $0, %YMMZERO, %YMM3, %k5 +- kord %k4, %k5, %k4 +- /* Each bit in K3 represents a NULL or a mismatch in YMM1 and +- YMM3. */ +- kord %k3, %k4, %k3 + ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */ ++ VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2} ++ kmovd %k1, %r9d ++ /* Don't use subl since it is the lower 16/32 bits of RDI ++ below. 
*/ ++ notl %r9d + # ifdef USE_AS_WCSCMP +- /* NB: Each bit in K1/K3 represents 4-byte element. */ +- kshiftlw $8, %k3, %k2 ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %r9d ++# endif ++ ++ VPTESTM %YMM1, %YMM1, %k4 ++ /* Each bit cleared in K3 represents a mismatch or a null CHAR ++ in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ ++ VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} ++ kmovd %k3, %edi ++# ifdef USE_AS_WCSCMP ++ /* Don't use subl since it is the upper 8 bits of EDI below. */ ++ notl %edi ++ andl $0xff, %edi + # else +- kshiftlq $32, %k3, %k2 ++ incl %edi + # endif + +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korq %k1, %k2, %k1 +- kmovq %k1, %rdi ++# ifdef USE_AS_WCSCMP ++ /* NB: Each bit in EDI/R9D represents 4-byte element. */ ++ sall $8, %edi ++ ++ /* Each bit in EDI represents a null CHAR or a mismatch. */ ++ orl %r9d, %edi ++# else ++ salq $32, %rdi ++ ++ /* Each bit in RDI represents a null CHAR or a mismatch. */ ++ orq %r9, %rdi ++# endif + + xorl %r8d, %r8d + /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +@@ -729,12 +771,15 @@ L(loop_cross_page_2_vec): + /* R8 has number of bytes skipped. */ + movl %ecx, %r8d + # ifdef USE_AS_WCSCMP +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ /* NB: Divide shift count by 4 since each bit in RDI represent 4 + bytes. */ + sarl $2, %ecx +-# endif ++ /* Skip ECX bytes. */ ++ shrl %cl, %edi ++# else + /* Skip ECX bytes. */ + shrq %cl, %rdi ++# endif + 1: + /* Before jumping back to the loop, set ESI to the number of + VEC_SIZE * 4 blocks before page crossing. */ +@@ -818,7 +863,7 @@ L(cross_page_loop): + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx + # endif +- /* Check null char. */ ++ /* Check null CHAR. */ + testl %eax, %eax + jne L(cross_page_loop) + /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +@@ -901,18 +946,17 @@ L(cross_page): + jg L(cross_page_1_vector) + L(loop_1_vector): + VMOVU (%rdi, %rdx), %YMM0 +- VMOVU (%rsi, %rdx), %YMM1 +- +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 ++ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (%rsi, %rdx). */ ++ VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} + kmovd %k1, %ecx +- testl %ecx, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(last_vector) + + addl $VEC_SIZE, %edx +@@ -931,18 +975,17 @@ L(cross_page_1_vector): + cmpl $(PAGE_SIZE - 16), %eax + jg L(cross_page_1_xmm) + VMOVU (%rdi, %rdx), %XMM0 +- VMOVU (%rsi, %rdx), %XMM1 +- +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- korw %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korw %k0, %k1, %k1 +- kmovw %k1, %ecx +- testl %ecx, %ecx ++ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and 16 bytes at (%rsi, %rdx). 
*/ ++ VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xf, %ecx ++# else ++ subl $0xffff, %ecx ++# endif + jne L(last_vector) + + addl $16, %edx +@@ -965,25 +1008,16 @@ L(cross_page_1_xmm): + vmovq (%rdi, %rdx), %XMM0 + vmovq (%rsi, %rdx), %XMM1 + +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- kmovd %k1, %ecx +- ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and XMM1. */ ++ VPCMP $0, %XMM1, %XMM0, %k1{%k2} ++ kmovb %k1, %ecx + # ifdef USE_AS_WCSCMP +- /* Only last 2 bits are valid. */ +- andl $0x3, %ecx ++ subl $0x3, %ecx + # else +- /* Only last 8 bits are valid. */ +- andl $0xff, %ecx ++ subl $0xff, %ecx + # endif +- +- testl %ecx, %ecx + jne L(last_vector) + + addl $8, %edx +@@ -1002,25 +1036,16 @@ L(cross_page_8bytes): + vmovd (%rdi, %rdx), %XMM0 + vmovd (%rsi, %rdx), %XMM1 + +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and XMM1. */ ++ VPCMP $0, %XMM1, %XMM0, %k1{%k2} + kmovd %k1, %ecx +- + # ifdef USE_AS_WCSCMP +- /* Only the last bit is valid. */ +- andl $0x1, %ecx ++ subl $0x1, %ecx + # else +- /* Only last 4 bits are valid. */ +- andl $0xf, %ecx ++ subl $0xf, %ecx + # endif +- +- testl %ecx, %ecx + jne L(last_vector) + + addl $4, %edx +-- +GitLab + diff --git a/glibc-RHEL-15696-6.patch b/glibc-RHEL-15696-6.patch new file mode 100644 index 0000000..f6725a6 --- /dev/null +++ b/glibc-RHEL-15696-6.patch @@ -0,0 +1,300 @@ +From ee915088a0231cd421054dbd8abab7aadf331153 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:33:52 -0800 +Subject: [PATCH] x86-64 strncmp family: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes the strncmp family for x32. Tested on x86-64 and x32. +On x86-64, libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strcmp-avx2.S: Use RDX_LP for length. + * sysdeps/x86_64/multiarch/strcmp-sse42.S: Likewise. + * sysdeps/x86_64/strcmp.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp, + tst-size_t-strncmp and tst-size_t-wcsncmp. + * sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file. + * sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise. + * sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise. 
+--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 6 +- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 6 +- + sysdeps/x86_64/strcmp.S | 6 +- + sysdeps/x86_64/x32/Makefile | 6 +- + sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-strncmp.c | 78 +++++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wcsncmp.c | 20 ++++++ + 7 files changed, 170 insertions(+), 11 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 327e3d87..156c1949 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -79,15 +79,15 @@ + ENTRY (STRCMP) + # ifdef USE_AS_STRNCMP + /* Check for simple cases (0 or 1) in offset. */ +- cmp $1, %rdx ++ cmp $1, %RDX_LP + je L(char0) + jb L(zero) + # ifdef USE_AS_WCSCMP + /* Convert units: from wide to byte char. */ +- shl $2, %rdx ++ shl $2, %RDX_LP + # endif + /* Register %r11 tracks the maximum offset. */ +- movq %rdx, %r11 ++ mov %RDX_LP, %R11_LP + # endif + movl %edi, %eax + xorl %edx, %edx +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index d3c07bd2..a1ebea46 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -156,11 +156,11 @@ STRCMP_SSE42: + #endif + + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L +- test %rdx, %rdx ++ test %RDX_LP, %RDX_LP + je LABEL(strcmp_exitz) +- cmp $1, %rdx ++ cmp $1, %RDX_LP + je LABEL(Byte0) +- mov %rdx, %r11 ++ mov %RDX_LP, %R11_LP + #endif + mov %esi, %ecx + mov %edi, %eax +diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S +index e16945b9..f47c8ad4 100644 +--- a/sysdeps/x86_64/strcmp.S ++++ b/sysdeps/x86_64/strcmp.S +@@ -135,11 +135,11 @@ ENTRY (STRCMP) + * This implementation uses SSE to compare up to 16 bytes at a time. + */ + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L +- test %rdx, %rdx ++ test %RDX_LP, %RDX_LP + je LABEL(strcmp_exitz) +- cmp $1, %rdx ++ cmp $1, %RDX_LP + je LABEL(Byte0) +- mov %rdx, %r11 ++ mov %RDX_LP, %R11_LP + #endif + mov %esi, %ecx + mov %edi, %eax +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 98bd9ae9..db302839 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -7,9 +7,11 @@ endif + + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ +- tst-size_t-memrchr tst-size_t-memset ++ tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ ++ tst-size_t-strncmp + endif + + ifeq ($(subdir),wcsmbs) +-tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset ++tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \ ++ tst-size_t-wcsncmp + endif +diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c +new file mode 100644 +index 00000000..86233593 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c +@@ -0,0 +1,59 @@ ++/* Test strncaecmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define TEST_NAME "strncasecmp" ++#include "test-size_t.h" ++ ++IMPL (strncasecmp, 1) ++ ++typedef int (*proto_t) (const char *, const char *, size_t); ++ ++static int ++__attribute__ ((noinline, noclone)) ++do_strncasecmp (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ strncpy ((char *) buf1, (const char *) buf2, page_size); ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ int res = do_strncasecmp (dest, src); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c +new file mode 100644 +index 00000000..54e6bd83 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c +@@ -0,0 +1,78 @@ ++/* Test strncmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifdef WIDE ++# define TEST_NAME "wcsncmp" ++#else ++# define TEST_NAME "strncmp" ++#endif ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++ ++# define STRNCMP wcsncmp ++# define STRNCPY wcsncpy ++# define CHAR wchar_t ++#else ++# define STRNCMP strncmp ++# define STRNCPY strncpy ++# define CHAR char ++#endif ++ ++IMPL (STRNCMP, 1) ++ ++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); ++ ++ ++static int ++__attribute__ ((noinline, noclone)) ++do_strncmp (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ size_t size = page_size / sizeof (CHAR); ++ parameter_t dest = { { size }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size); ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ int res = do_strncmp (dest, src); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c +new file mode 100644 +index 00000000..4829647c +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c +@@ -0,0 +1,20 @@ ++/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-strncmp.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-60.patch b/glibc-RHEL-15696-60.patch new file mode 100644 index 0000000..a3739eb --- /dev/null +++ b/glibc-RHEL-15696-60.patch @@ -0,0 +1,54 @@ +From 6720d36b6623c5e48c070d86acf61198b33e144e Mon Sep 17 00:00:00 2001 +From: Fangrui Song +Date: Tue, 2 Nov 2021 20:59:52 -0700 +Subject: [PATCH] x86-64: Replace movzx with movzbl +Content-type: text/plain; charset=UTF-8 + +Clang cannot assemble movzx in the AT&T dialect mode. + +../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction + movzx (%rsi), %ecx + ^~~~ + +Change movzx to movzbl, which follows the AT&T dialect and is used +elsewhere in the file. + +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 4 ++-- + sysdeps/x86_64/strcmp.S | 4 ++-- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index a1ebea46..d8fdeb3a 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz): + .p2align 4 + // XXX Same as code above + LABEL(Byte0): +- movzx (%rsi), %ecx +- movzx (%rdi), %eax ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx +diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S +index f47c8ad4..aa6df898 100644 +--- a/sysdeps/x86_64/strcmp.S ++++ b/sysdeps/x86_64/strcmp.S +@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz): + + .p2align 4 + LABEL(Byte0): +- movzx (%rsi), %ecx +- movzx (%rdi), %eax ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx +-- +GitLab + diff --git a/glibc-RHEL-15696-61.patch b/glibc-RHEL-15696-61.patch new file mode 100644 index 0000000..d6dbe81 --- /dev/null +++ b/glibc-RHEL-15696-61.patch @@ -0,0 +1,56 @@ +From cf2c57526ba4b57e6863ad4db8a868e2678adce8 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 30 Apr 2021 05:58:59 -0700 +Subject: [PATCH] x86: Set rep_movsb_threshold to 2112 on processors with FSRM +Content-type: text/plain; charset=UTF-8 + +The glibc memcpy benchmark on Intel Core i7-1065G7 (Ice Lake) showed +that REP MOVSB became faster after 2112 bytes: + + Vector Move REP MOVSB +length=2112, align1=0, align2=0: 24.20 24.40 +length=2112, align1=1, align2=0: 26.07 23.13 +length=2112, align1=0, align2=1: 27.18 28.13 +length=2112, align1=1, align2=1: 26.23 25.16 +length=2176, align1=0, align2=0: 23.18 22.52 +length=2176, align1=2, align2=0: 25.45 22.52 +length=2176, align1=0, align2=2: 27.14 27.82 +length=2176, align1=2, align2=2: 22.73 25.56 +length=2240, align1=0, align2=0: 24.62 24.25 +length=2240, align1=3, align2=0: 29.77 27.15 +length=2240, align1=0, align2=3: 35.55 29.93 +length=2240, align1=3, align2=3: 34.49 25.15 +length=2304, align1=0, align2=0: 34.75 26.64 +length=2304, align1=4, align2=0: 32.09 22.63 +length=2304, align1=0, align2=4: 28.43 31.24 + +Use REP MOVSB for data size > 2112 bytes in memcpy on processors with +fast short REP MOVSB (FSRM). + + * sysdeps/x86/dl-cacheinfo.h (dl_init_cacheinfo): Set + rep_movsb_threshold to 2112 on processors with fast short REP + MOVSB (FSRM). +--- + sysdeps/x86/cacheinfo.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index f72f634a..cc3941d3 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -430,6 +430,12 @@ init_cacheinfo (void) + rep_movsb_threshold = 2048 * (16 / 16); + minimum_rep_movsb_threshold = 16 * 8; + } ++ ++ /* NB: The default REP MOVSB threshold is 2112 on processors with fast ++ short REP MOVSB (FSRM). 
*/ ++ if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) ++ rep_movsb_threshold = 2112; ++ + if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold) + __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; + else +-- +GitLab + diff --git a/glibc-RHEL-15696-62.patch b/glibc-RHEL-15696-62.patch new file mode 100644 index 0000000..a7a9286 --- /dev/null +++ b/glibc-RHEL-15696-62.patch @@ -0,0 +1,136 @@ +From 475b63702ef38b69558fc3d31a0b66776a70f1d3 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 1 Nov 2021 00:49:52 -0500 +Subject: [PATCH] x86: Double size of ERMS rep_movsb_threshold in + dl-cacheinfo.h +Content-type: text/plain; charset=UTF-8 + +No bug. + +This patch doubles the rep_movsb_threshold when using ERMS. Based on +benchmarks the vector copy loop, especially now that it handles 4k +aliasing, is better for these medium ranged. + +On Skylake with ERMS: + +Size, Align1, Align2, dst>src,(rep movsb) / (vec copy) +4096, 0, 0, 0, 0.975 +4096, 0, 0, 1, 0.953 +4096, 12, 0, 0, 0.969 +4096, 12, 0, 1, 0.872 +4096, 44, 0, 0, 0.979 +4096, 44, 0, 1, 0.83 +4096, 0, 12, 0, 1.006 +4096, 0, 12, 1, 0.989 +4096, 0, 44, 0, 0.739 +4096, 0, 44, 1, 0.942 +4096, 12, 12, 0, 1.009 +4096, 12, 12, 1, 0.973 +4096, 44, 44, 0, 0.791 +4096, 44, 44, 1, 0.961 +4096, 2048, 0, 0, 0.978 +4096, 2048, 0, 1, 0.951 +4096, 2060, 0, 0, 0.986 +4096, 2060, 0, 1, 0.963 +4096, 2048, 12, 0, 0.971 +4096, 2048, 12, 1, 0.941 +4096, 2060, 12, 0, 0.977 +4096, 2060, 12, 1, 0.949 +8192, 0, 0, 0, 0.85 +8192, 0, 0, 1, 0.845 +8192, 13, 0, 0, 0.937 +8192, 13, 0, 1, 0.939 +8192, 45, 0, 0, 0.932 +8192, 45, 0, 1, 0.927 +8192, 0, 13, 0, 0.621 +8192, 0, 13, 1, 0.62 +8192, 0, 45, 0, 0.53 +8192, 0, 45, 1, 0.516 +8192, 13, 13, 0, 0.664 +8192, 13, 13, 1, 0.659 +8192, 45, 45, 0, 0.593 +8192, 45, 45, 1, 0.575 +8192, 2048, 0, 0, 0.854 +8192, 2048, 0, 1, 0.834 +8192, 2061, 0, 0, 0.863 +8192, 2061, 0, 1, 0.857 +8192, 2048, 13, 0, 0.63 +8192, 2048, 13, 1, 0.629 +8192, 2061, 13, 0, 0.627 +8192, 2061, 13, 1, 0.62 + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86/cacheinfo.h | 8 +++++--- + sysdeps/x86/dl-tunables.list | 26 +++++++++++++++----------- + 2 files changed, 20 insertions(+), 14 deletions(-) + +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index cc3941d3..ac025e08 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -411,18 +411,20 @@ init_cacheinfo (void) + + /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */ + unsigned int minimum_rep_movsb_threshold; +- /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */ ++ /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for ++ VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB ++ threshold is 2048 * (VEC_SIZE / 16). 
*/ + unsigned int rep_movsb_threshold; + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) + && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512)) + { +- rep_movsb_threshold = 2048 * (64 / 16); ++ rep_movsb_threshold = 4096 * (64 / 16); + minimum_rep_movsb_threshold = 64 * 8; + } + else if (CPU_FEATURE_PREFERRED_P (cpu_features, + AVX_Fast_Unaligned_Load)) + { +- rep_movsb_threshold = 2048 * (32 / 16); ++ rep_movsb_threshold = 4096 * (32 / 16); + minimum_rep_movsb_threshold = 32 * 8; + } + else +diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list +index 89bf2966..56c6834a 100644 +--- a/sysdeps/x86/dl-tunables.list ++++ b/sysdeps/x86/dl-tunables.list +@@ -32,17 +32,21 @@ glibc { + } + x86_rep_movsb_threshold { + type: SIZE_T +- # Since there is overhead to set up REP MOVSB operation, REP MOVSB +- # isn't faster on short data. The memcpy micro benchmark in glibc +- # shows that 2KB is the approximate value above which REP MOVSB +- # becomes faster than SSE2 optimization on processors with Enhanced +- # REP MOVSB. Since larger register size can move more data with a +- # single load and store, the threshold is higher with larger register +- # size. Note: Since the REP MOVSB threshold must be greater than 8 +- # times of vector size and the default value is 2048 * (vector size +- # / 16), the default value and the minimum value must be updated at +- # run-time. NB: Don't set the default value since we can't tell if +- # the tunable value is set by user or not [BZ #27069]. ++ # Since there is overhead to set up REP MOVSB operation, REP ++ # MOVSB isn't faster on short data. The memcpy micro benchmark ++ # in glibc shows that 2KB is the approximate value above which ++ # REP MOVSB becomes faster than SSE2 optimization on processors ++ # with Enhanced REP MOVSB. Since larger register size can move ++ # more data with a single load and store, the threshold is ++ # higher with larger register size. Micro benchmarks show AVX ++ # REP MOVSB becomes faster apprximately at 8KB. The AVX512 ++ # threshold is extrapolated to 16KB. For machines with FSRM the ++ # threshold is universally set at 2112 bytes. Note: Since the ++ # REP MOVSB threshold must be greater than 8 times of vector ++ # size and the default value is 4096 * (vector size / 16), the ++ # default value and the minimum value must be updated at ++ # run-time. NB: Don't set the default value since we can't tell ++ # if the tunable value is set by user or not [BZ #27069]. + minval: 1 + } + x86_rep_stosb_threshold { +-- +GitLab + diff --git a/glibc-RHEL-15696-63.patch b/glibc-RHEL-15696-63.patch new file mode 100644 index 0000000..c14e8b3 --- /dev/null +++ b/glibc-RHEL-15696-63.patch @@ -0,0 +1,2428 @@ +From 2f9062d7171850451e6044ef78d91ff8c017b9c0 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 10 Nov 2021 16:18:56 -0600 +Subject: [PATCH] x86: Shrink memcmp-sse4.S code size +Content-type: text/plain; charset=UTF-8 + +No bug. + +This implementation refactors memcmp-sse4.S primarily with minimizing +code size in mind. It does this by removing the lookup table logic and +removing the unrolled check from (256, 512] bytes. + +memcmp-sse4 code size reduction : -3487 bytes +wmemcmp-sse4 code size reduction: -1472 bytes + +The current memcmp-sse4.S implementation has a large code size +cost. This has serious adverse affects on the ICache / ITLB. 
While +in micro-benchmarks the implementations appears fast, traces of +real-world code have shown that the speed in micro benchmarks does not +translate when the ICache/ITLB are not primed, and that the cost +of the code size has measurable negative affects on overall +application performance. + +See https://research.google/pubs/pub48320/ for more details. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memcmp-sse4.S | 2267 +++++++----------------- + 1 file changed, 646 insertions(+), 1621 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S +index 302900f5..50060006 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S ++++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S +@@ -25,14 +25,14 @@ + # define MEMCMP __memcmp_sse4_1 + # endif + +-# define JMPTBL(I, B) (I - B) ++#ifdef USE_AS_WMEMCMP ++# define CMPEQ pcmpeqd ++# define CHAR_SIZE 4 ++#else ++# define CMPEQ pcmpeqb ++# define CHAR_SIZE 1 ++#endif + +-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +- lea TABLE(%rip), %r11; \ +- movslq (%r11, INDEX, SCALE), %rcx; \ +- add %r11, %rcx; \ +- _CET_NOTRACK jmp *%rcx; \ +- ud2 + + /* Warning! + wmemcmp has to use SIGNED comparison for elements. +@@ -47,33 +47,253 @@ ENTRY (MEMCMP) + /* Clear the upper 32 bits. */ + mov %edx, %edx + # endif +- pxor %xmm0, %xmm0 + cmp $79, %RDX_LP + ja L(79bytesormore) ++ ++ cmp $CHAR_SIZE, %RDX_LP ++ jbe L(firstbyte) ++ ++ /* N in (CHAR_SIZE, 79) bytes. */ ++ cmpl $32, %edx ++ ja L(more_32_bytes) ++ ++ cmpl $16, %edx ++ jae L(16_to_32_bytes) ++ + # ifndef USE_AS_WMEMCMP +- cmp $1, %RDX_LP +- je L(firstbyte) ++ cmpl $8, %edx ++ jae L(8_to_16_bytes) ++ ++ cmpl $4, %edx ++ jb L(2_to_3_bytes) ++ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ ++ bswap %eax ++ bswap %ecx ++ ++ shlq $32, %rax ++ shlq $32, %rcx ++ ++ movl -4(%rdi, %rdx), %edi ++ movl -4(%rsi, %rdx), %esi ++ ++ bswap %edi ++ bswap %esi ++ ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ cmovne %edx, %eax ++ sbbl %ecx, %ecx ++ orl %ecx, %eax ++ ret ++ ++ .p2align 4,, 8 ++L(2_to_3_bytes): ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ subl %ecx, %eax ++ ret ++ ++ .p2align 4,, 8 ++L(8_to_16_bytes): ++ movq (%rdi), %rax ++ movq (%rsi), %rcx ++ ++ bswap %rax ++ bswap %rcx ++ ++ subq %rcx, %rax ++ jne L(8_to_16_bytes_done) ++ ++ movq -8(%rdi, %rdx), %rax ++ movq -8(%rsi, %rdx), %rcx ++ ++ bswap %rax ++ bswap %rcx ++ ++ subq %rcx, %rax ++ ++L(8_to_16_bytes_done): ++ cmovne %edx, %eax ++ sbbl %ecx, %ecx ++ orl %ecx, %eax ++ ret ++# else ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ jne L(8_to_16_bytes_done) ++ movl 4(%rdi), %ecx ++ cmpl 4(%rsi), %ecx ++ jne L(8_to_16_bytes_done) ++ movl -4(%rdi, %rdx), %ecx ++ cmpl -4(%rsi, %rdx), %ecx ++ jne L(8_to_16_bytes_done) ++ ret + # endif +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +-# ifndef USE_AS_WMEMCMP +- .p2align 4 ++ .p2align 4,, 3 ++L(ret_zero): ++ xorl %eax, %eax ++L(zero): ++ ret ++ ++ .p2align 4,, 8 + L(firstbyte): ++ jb L(ret_zero) ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ je L(zero) ++L(8_to_16_bytes_done): ++ setg %al ++ leal -1(%rax, %rax), %eax ++# else + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + sub %ecx, %eax ++# endif + ret ++ ++ .p2align 4 ++L(vec_return_begin_48): ++ addq 
$16, %rdi ++ addq $16, %rsi ++L(vec_return_begin_32): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl 32(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl 32(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl 32(%rsi, %rax), %ecx ++ movzbl 32(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(vec_return_begin_16): ++ addq $16, %rdi ++ addq $16, %rsi ++L(vec_return_begin): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(vec_return_end_16): ++ subl $16, %edx ++L(vec_return_end): ++ bsfl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -16(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl -16(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl -16(%rsi, %rax), %ecx ++ movzbl -16(%rdi, %rax), %eax ++ subl %ecx, %eax + # endif ++ ret ++ ++ .p2align 4,, 8 ++L(more_32_bytes): ++ movdqu (%rdi), %xmm0 ++ movdqu (%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ cmpl $64, %edx ++ jbe L(32_to_64_bytes) ++ movdqu 32(%rdi), %xmm0 ++ movdqu 32(%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ .p2align 4,, 6 ++L(32_to_64_bytes): ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret ++ ++ .p2align 4 ++L(16_to_32_bytes): ++ movdqu (%rdi), %xmm0 ++ movdqu (%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret ++ + + .p2align 4 + L(79bytesormore): ++ movdqu (%rdi), %xmm0 + movdqu (%rsi), %xmm1 +- movdqu (%rdi), %xmm2 +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ + mov %rsi, %rcx + and $-16, %rsi + add $16, %rsi +@@ -86,1694 +306,499 @@ L(79bytesormore): + + cmp $128, %rdx + ja L(128bytesormore) +-L(less128bytes): +- sub $64, %rdx +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) + +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- cmp $32, %rdx +- jb L(less32bytesin64) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin64): +- add $64, %rdi +- add $64, %rsi +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ .p2align 4,, 6 ++L(less128bytes): ++ movdqu 
(%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ cmp $96, %rdx ++ jb L(32_to_64_bytes) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ subq $64, %rdx ++ ++ .p2align 4,, 6 ++L(last_64_bytes): ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + ++ .p2align 4 + L(128bytesormore): +- cmp $512, %rdx +- ja L(512bytesormore) + cmp $256, %rdx +- ja L(less512bytes) ++ ja L(unaligned_loop) + L(less256bytes): +- sub $128, %rdx +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqu 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqu 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- add $128, %rsi +- add $128, %rdi +- +- cmp $64, %rdx +- jae L(less128bytes) +- +- cmp $32, %rdx +- jb L(less32bytesin128) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin128): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +- +-L(less512bytes): +- sub $256, %rdx +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqu 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqu 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- movdqu 128(%rdi), %xmm2 +- pxor 128(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(144bytesin256) +- +- movdqu 144(%rdi), %xmm2 +- pxor 
144(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(160bytesin256) +- +- movdqu 160(%rdi), %xmm2 +- pxor 160(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(176bytesin256) +- +- movdqu 176(%rdi), %xmm2 +- pxor 176(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(192bytesin256) +- +- movdqu 192(%rdi), %xmm2 +- pxor 192(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(208bytesin256) +- +- movdqu 208(%rdi), %xmm2 +- pxor 208(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(224bytesin256) +- +- movdqu 224(%rdi), %xmm2 +- pxor 224(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(240bytesin256) +- +- movdqu 240(%rdi), %xmm2 +- pxor 240(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(256bytesin256) +- +- add $256, %rsi +- add $256, %rdi +- +- cmp $128, %rdx +- jae L(less256bytes) ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $-128, %rdx ++ subq $-64, %rsi ++ subq $-64, %rdi + + cmp $64, %rdx +- jae L(less128bytes) ++ ja L(less128bytes) + + cmp $32, %rdx +- jb L(less32bytesin256) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin256): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ ja L(last_64_bytes) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 +-L(512bytesormore): ++L(unaligned_loop): + # ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP + # else + mov __x86_data_cache_size_half(%rip), %R8_LP + # endif +- mov %r8, %r9 +- shr $1, %r8 +- add %r9, %r8 +- cmp %r8, %rdx +- ja L(L2_L3_cache_unaglined) ++ movq %r8, %r9 ++ addq %r8, %r8 ++ addq %r9, %r8 ++ cmpq %r8, %rdx ++ ja L(L2_L3_cache_unaligned) + sub $64, %rdx + .p2align 4 + L(64bytesormore_loop): +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 ++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 + +- movdqu 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqu 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- movdqu 48(%rdi), %xmm5 +- pxor 48(%rsi), 
%xmm5 +- por %xmm5, %xmm1 ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(64bytesormore_loop) ++ ja L(64bytesormore_loop) + +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ .p2align 4,, 6 ++L(loop_tail): ++ addq %rdx, %rdi ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 ++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 ++ ++ addq %rdx, %rsi ++ movdqu (%rsi), %xmm4 ++ movdqu 16(%rsi), %xmm5 ++ movdqu 32(%rsi), %xmm6 ++ movdqu 48(%rsi), %xmm7 ++ ++ CMPEQ %xmm4, %xmm0 ++ CMPEQ %xmm5, %xmm1 ++ CMPEQ %xmm6, %xmm2 ++ CMPEQ %xmm7, %xmm3 ++ ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 ++ ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) ++ ret + +-L(L2_L3_cache_unaglined): +- sub $64, %rdx ++L(L2_L3_cache_unaligned): ++ subq $64, %rdx + .p2align 4 + L(L2_L3_unaligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 + +- movdqu 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 ++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 ++ ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqu 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- movdqu 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(L2_L3_unaligned_128bytes_loop) ++ ja L(L2_L3_unaligned_128bytes_loop) ++ jmp L(loop_tail) + +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +-/* +- * This case is for machines which are sensitive for unaligned instructions. +- */ ++ /* This case is for machines which are sensitive for unaligned ++ * instructions. 
*/ + .p2align 4 + L(2aligned): + cmp $128, %rdx + ja L(128bytesormorein2aligned) + L(less128bytesin2aligned): +- sub $64, %rdx +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- cmp $32, %rdx +- jb L(less32bytesin64in2alinged) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin64in2alinged): +- add $64, %rdi +- add $64, %rsi +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ cmp $96, %rdx ++ jb L(32_to_64_bytes) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ subq $64, %rdx ++ ++ .p2align 4,, 6 ++L(aligned_last_64_bytes): ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 + L(128bytesormorein2aligned): +- cmp $512, %rdx +- ja L(512bytesormorein2aligned) + cmp $256, %rdx +- ja L(256bytesormorein2aligned) ++ ja L(aligned_loop) + L(less256bytesin2alinged): +- sub $128, %rdx +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqa 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqa 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- add $128, %rsi +- add $128, %rdi ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 
++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $-128, %rdx ++ subq $-64, %rsi ++ subq $-64, %rdi + + cmp $64, %rdx +- jae L(less128bytesin2aligned) ++ ja L(less128bytesin2aligned) + + cmp $32, %rdx +- jb L(less32bytesin128in2aligned) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin128in2aligned): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +- +- .p2align 4 +-L(256bytesormorein2aligned): +- +- sub $256, %rdx +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqa 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqa 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- movdqa 128(%rdi), %xmm2 +- pxor 128(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(144bytesin256) +- +- movdqa 144(%rdi), %xmm2 +- pxor 144(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(160bytesin256) +- +- movdqa 160(%rdi), %xmm2 +- pxor 160(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(176bytesin256) +- +- movdqa 176(%rdi), %xmm2 +- pxor 176(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(192bytesin256) +- +- movdqa 192(%rdi), %xmm2 +- pxor 192(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(208bytesin256) +- +- movdqa 208(%rdi), %xmm2 +- pxor 208(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(224bytesin256) +- +- movdqa 224(%rdi), %xmm2 +- pxor 224(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(240bytesin256) +- +- movdqa 240(%rdi), %xmm2 +- pxor 240(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(256bytesin256) +- +- add $256, %rsi +- add $256, %rdi +- +- cmp $128, %rdx +- jae L(less256bytesin2alinged) +- +- cmp $64, %rdx +- jae L(less128bytesin2aligned) +- +- cmp $32, %rdx +- jb L(less32bytesin256in2alinged) +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin256in2alinged): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ ja L(aligned_last_64_bytes) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ 
jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 +-L(512bytesormorein2aligned): ++L(aligned_loop): + # ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP + # else + mov __x86_data_cache_size_half(%rip), %R8_LP + # endif +- mov %r8, %r9 +- shr $1, %r8 +- add %r9, %r8 +- cmp %r8, %rdx +- ja L(L2_L3_cache_aglined) ++ movq %r8, %r9 ++ addq %r8, %r8 ++ addq %r9, %r8 ++ cmpq %r8, %rdx ++ ja L(L2_L3_cache_aligned) + + sub $64, %rdx + .p2align 4 + L(64bytesormore_loopin2aligned): +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 +- +- movdqa 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqa (%rdi), %xmm0 ++ movdqa 16(%rdi), %xmm1 ++ movdqa 32(%rdi), %xmm2 ++ movdqa 48(%rdi), %xmm3 + +- movdqa 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqa 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(64bytesormore_loopin2aligned) +- +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +-L(L2_L3_cache_aglined): +- sub $64, %rdx ++ ja L(64bytesormore_loopin2aligned) ++ jmp L(loop_tail) + ++L(L2_L3_cache_aligned): ++ subq $64, %rdx + .p2align 4 + L(L2_L3_aligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 +- +- movdqa 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqa (%rdi), %xmm0 ++ movdqa 16(%rdi), %xmm1 ++ movdqa 32(%rdi), %xmm2 ++ movdqa 48(%rdi), %xmm3 + +- movdqa 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqa 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- jae L(L2_L3_aligned_128bytes_loop) +- +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + ++ addq $64, %rsi ++ addq $64, %rdi ++ subq $64, %rdx ++ ja L(L2_L3_aligned_128bytes_loop) ++ jmp L(loop_tail) + + .p2align 4 + L(64bytesormore_loop_end): +- add $16, %rdi +- add $16, %rsi +- ptest %xmm2, %xmm0 +- jnc L(16bytes) +- +- add $16, %rdi +- add $16, %rsi +- ptest %xmm3, %xmm0 +- jnc L(16bytes) +- +- add $16, %rdi +- add $16, %rsi +- ptest %xmm4, %xmm0 +- jnc L(16bytes) +- +- add $16, %rdi +- add $16, %rsi +- jmp L(16bytes) +- +-L(256bytesin256): +- add $256, %rdi +- add $256, %rsi +- jmp L(16bytes) +-L(240bytesin256): +- add $240, %rdi +- add $240, %rsi +- jmp L(16bytes) +-L(224bytesin256): +- add $224, %rdi +- add $224, %rsi +- jmp L(16bytes) +-L(208bytesin256): +- add $208, %rdi +- add $208, %rsi +- jmp L(16bytes) +-L(192bytesin256): +- add $192, %rdi +- add $192, %rsi +- jmp L(16bytes) +-L(176bytesin256): +- add $176, %rdi +- add $176, %rsi +- jmp L(16bytes) +-L(160bytesin256): +- add $160, %rdi 
+- add $160, %rsi +- jmp L(16bytes) +-L(144bytesin256): +- add $144, %rdi +- add $144, %rsi +- jmp L(16bytes) +-L(128bytesin256): +- add $128, %rdi +- add $128, %rsi +- jmp L(16bytes) +-L(112bytesin256): +- add $112, %rdi +- add $112, %rsi +- jmp L(16bytes) +-L(96bytesin256): +- add $96, %rdi +- add $96, %rsi +- jmp L(16bytes) +-L(80bytesin256): +- add $80, %rdi +- add $80, %rsi +- jmp L(16bytes) +-L(64bytesin256): +- add $64, %rdi +- add $64, %rsi +- jmp L(16bytes) +-L(48bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(32bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(16bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(16bytes): +- mov -16(%rdi), %rax +- mov -16(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(8bytes): +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(12bytes): +- mov -12(%rdi), %rax +- mov -12(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(4bytes): +- mov -4(%rsi), %ecx +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +-L(0bytes): +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal case for wmemcmp */ +- .p2align 4 +-L(65bytes): +- movdqu -65(%rdi), %xmm1 +- movdqu -65(%rsi), %xmm2 +- mov $-65, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(49bytes): +- movdqu -49(%rdi), %xmm1 +- movdqu -49(%rsi), %xmm2 +- mov $-49, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(33bytes): +- movdqu -33(%rdi), %xmm1 +- movdqu -33(%rsi), %xmm2 +- mov $-33, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(17bytes): +- mov -17(%rdi), %rax +- mov -17(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(9bytes): +- mov -9(%rdi), %rax +- mov -9(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %edx +- sub %edx, %eax +- ret +- +- .p2align 4 +-L(13bytes): +- mov -13(%rdi), %rax +- mov -13(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(5bytes): +- mov -5(%rdi), %eax +- mov -5(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %edx +- sub %edx, %eax +- ret +- +- .p2align 4 +-L(66bytes): +- movdqu -66(%rdi), %xmm1 +- movdqu -66(%rsi), %xmm2 +- mov $-66, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(50bytes): +- movdqu -50(%rdi), %xmm1 +- movdqu -50(%rsi), %xmm2 +- mov $-50, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(34bytes): +- movdqu -34(%rdi), %xmm1 +- movdqu -34(%rsi), %xmm2 +- mov $-34, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(18bytes): +- mov -18(%rdi), %rax +- mov -18(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(10bytes): +- mov -10(%rdi), %rax +- mov -10(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzwl -2(%rdi), %eax +- movzwl -2(%rsi), %ecx +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(14bytes): +- mov -14(%rdi), %rax +- mov -14(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(6bytes): +- mov -6(%rdi), %eax +- mov -6(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) 
+-L(2bytes): +- movzwl -2(%rsi), %ecx +- movzwl -2(%rdi), %eax +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(67bytes): +- movdqu -67(%rdi), %xmm2 +- movdqu -67(%rsi), %xmm1 +- mov $-67, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(51bytes): +- movdqu -51(%rdi), %xmm2 +- movdqu -51(%rsi), %xmm1 +- mov $-51, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(35bytes): +- movdqu -35(%rsi), %xmm1 +- movdqu -35(%rdi), %xmm2 +- mov $-35, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(19bytes): +- mov -19(%rdi), %rax +- mov -19(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(11bytes): +- mov -11(%rdi), %rax +- mov -11(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(15bytes): +- mov -15(%rdi), %rax +- mov -15(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(7bytes): +- mov -7(%rdi), %eax +- mov -7(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- mov -4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(3bytes): +- movzwl -3(%rdi), %eax +- movzwl -3(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin2bytes) +-L(1bytes): +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %ecx +- sub %ecx, %eax +- ret +-# endif +- +- .p2align 4 +-L(68bytes): +- movdqu -68(%rdi), %xmm2 +- movdqu -68(%rsi), %xmm1 +- mov $-68, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(52bytes): +- movdqu -52(%rdi), %xmm2 +- movdqu -52(%rsi), %xmm1 +- mov $-52, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(36bytes): +- movdqu -36(%rdi), %xmm2 +- movdqu -36(%rsi), %xmm1 +- mov $-36, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(20bytes): +- movdqu -20(%rdi), %xmm2 +- movdqu -20(%rsi), %xmm1 +- mov $-20, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -4(%rsi), %ecx +- +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(69bytes): +- movdqu -69(%rsi), %xmm1 +- movdqu -69(%rdi), %xmm2 +- mov $-69, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(53bytes): +- movdqu -53(%rsi), %xmm1 +- movdqu -53(%rdi), %xmm2 +- mov $-53, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(37bytes): +- movdqu -37(%rsi), %xmm1 +- movdqu -37(%rdi), %xmm2 +- mov $-37, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(21bytes): +- movdqu -21(%rsi), %xmm1 +- movdqu -21(%rdi), %xmm2 +- mov $-21, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(70bytes): +- movdqu -70(%rsi), %xmm1 +- movdqu -70(%rdi), %xmm2 +- mov $-70, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(54bytes): +- movdqu -54(%rsi), %xmm1 +- movdqu -54(%rdi), %xmm2 +- mov $-54, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(38bytes): +- movdqu -38(%rsi), %xmm1 +- 
movdqu -38(%rdi), %xmm2 +- mov $-38, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(22bytes): +- movdqu -22(%rsi), %xmm1 +- movdqu -22(%rdi), %xmm2 +- mov $-22, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(71bytes): +- movdqu -71(%rsi), %xmm1 +- movdqu -71(%rdi), %xmm2 +- mov $-71, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(55bytes): +- movdqu -55(%rdi), %xmm2 +- movdqu -55(%rsi), %xmm1 +- mov $-55, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(39bytes): +- movdqu -39(%rdi), %xmm2 +- movdqu -39(%rsi), %xmm1 +- mov $-39, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(23bytes): +- movdqu -23(%rdi), %xmm2 +- movdqu -23(%rsi), %xmm1 +- mov $-23, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +-# endif +- +- .p2align 4 +-L(72bytes): +- movdqu -72(%rsi), %xmm1 +- movdqu -72(%rdi), %xmm2 +- mov $-72, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(56bytes): +- movdqu -56(%rdi), %xmm2 +- movdqu -56(%rsi), %xmm1 +- mov $-56, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(40bytes): +- movdqu -40(%rdi), %xmm2 +- movdqu -40(%rsi), %xmm1 +- mov $-40, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(24bytes): +- movdqu -24(%rdi), %xmm2 +- movdqu -24(%rsi), %xmm1 +- mov $-24, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -8(%rsi), %rcx +- mov -8(%rdi), %rax +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(73bytes): +- movdqu -73(%rsi), %xmm1 +- movdqu -73(%rdi), %xmm2 +- mov $-73, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(57bytes): +- movdqu -57(%rdi), %xmm2 +- movdqu -57(%rsi), %xmm1 +- mov $-57, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(41bytes): +- movdqu -41(%rdi), %xmm2 +- movdqu -41(%rsi), %xmm1 +- mov $-41, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(25bytes): +- movdqu -25(%rdi), %xmm2 +- movdqu -25(%rsi), %xmm1 +- mov $-25, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -9(%rdi), %rax +- mov -9(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(74bytes): +- movdqu -74(%rsi), %xmm1 +- movdqu -74(%rdi), %xmm2 +- mov $-74, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(58bytes): +- movdqu -58(%rdi), %xmm2 +- movdqu -58(%rsi), %xmm1 +- mov $-58, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(42bytes): +- movdqu -42(%rdi), %xmm2 +- movdqu -42(%rsi), %xmm1 +- mov $-42, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(26bytes): +- movdqu -26(%rdi), %xmm2 +- movdqu -26(%rsi), %xmm1 +- mov $-26, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -10(%rdi), %rax +- mov -10(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzwl -2(%rdi), %eax +- movzwl -2(%rsi), %ecx +- jmp L(diffin2bytes) +- +- .p2align 4 +-L(75bytes): +- movdqu -75(%rsi), %xmm1 +- movdqu -75(%rdi), %xmm2 +- mov $-75, %dl +- 
pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(59bytes): +- movdqu -59(%rdi), %xmm2 +- movdqu -59(%rsi), %xmm1 +- mov $-59, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(43bytes): +- movdqu -43(%rdi), %xmm2 +- movdqu -43(%rsi), %xmm1 +- mov $-43, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(27bytes): +- movdqu -27(%rdi), %xmm2 +- movdqu -27(%rsi), %xmm1 +- mov $-27, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -11(%rdi), %rax +- mov -11(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +-# endif +- .p2align 4 +-L(76bytes): +- movdqu -76(%rsi), %xmm1 +- movdqu -76(%rdi), %xmm2 +- mov $-76, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(60bytes): +- movdqu -60(%rdi), %xmm2 +- movdqu -60(%rsi), %xmm1 +- mov $-60, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(44bytes): +- movdqu -44(%rdi), %xmm2 +- movdqu -44(%rsi), %xmm1 +- mov $-44, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(28bytes): +- movdqu -28(%rdi), %xmm2 +- movdqu -28(%rsi), %xmm1 +- mov $-28, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -12(%rdi), %rax +- mov -12(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -4(%rsi), %ecx +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(77bytes): +- movdqu -77(%rsi), %xmm1 +- movdqu -77(%rdi), %xmm2 +- mov $-77, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(61bytes): +- movdqu -61(%rdi), %xmm2 +- movdqu -61(%rsi), %xmm1 +- mov $-61, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(45bytes): +- movdqu -45(%rdi), %xmm2 +- movdqu -45(%rsi), %xmm1 +- mov $-45, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(29bytes): +- movdqu -29(%rdi), %xmm2 +- movdqu -29(%rsi), %xmm1 +- mov $-29, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -13(%rdi), %rax +- mov -13(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(78bytes): +- movdqu -78(%rsi), %xmm1 +- movdqu -78(%rdi), %xmm2 +- mov $-78, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(62bytes): +- movdqu -62(%rdi), %xmm2 +- movdqu -62(%rsi), %xmm1 +- mov $-62, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(46bytes): +- movdqu -46(%rdi), %xmm2 +- movdqu -46(%rsi), %xmm1 +- mov $-46, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(30bytes): +- movdqu -30(%rdi), %xmm2 +- movdqu -30(%rsi), %xmm1 +- mov $-30, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -14(%rdi), %rax +- mov -14(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(79bytes): +- movdqu -79(%rsi), %xmm1 +- movdqu -79(%rdi), %xmm2 +- mov $-79, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(63bytes): +- movdqu -63(%rdi), %xmm2 +- movdqu -63(%rsi), %xmm1 +- mov $-63, %dl +- pxor 
%xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(47bytes): +- movdqu -47(%rdi), %xmm2 +- movdqu -47(%rsi), %xmm1 +- mov $-47, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(31bytes): +- movdqu -31(%rdi), %xmm2 +- movdqu -31(%rsi), %xmm1 +- mov $-31, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -15(%rdi), %rax +- mov -15(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +-# endif +- .p2align 4 +-L(64bytes): +- movdqu -64(%rdi), %xmm2 +- movdqu -64(%rsi), %xmm1 +- mov $-64, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(48bytes): +- movdqu -48(%rdi), %xmm2 +- movdqu -48(%rsi), %xmm1 +- mov $-48, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(32bytes): +- movdqu -32(%rdi), %xmm2 +- movdqu -32(%rsi), %xmm1 +- mov $-32, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -16(%rdi), %rax +- mov -16(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +-/* +- * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. +- */ +- .p2align 3 +-L(less16bytes): +- movsbq %dl, %rdx +- mov (%rsi, %rdx), %rcx +- mov (%rdi, %rdx), %rax +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov 8(%rsi, %rdx), %rcx +- mov 8(%rdi, %rdx), %rax +-L(diffin8bytes): +- cmp %eax, %ecx +- jne L(diffin4bytes) +- shr $32, %rcx +- shr $32, %rax +- ++ pmovmskb %xmm0, %ecx ++ incw %cx ++ jnz L(loop_end_ret) ++ ++ pmovmskb %xmm1, %ecx ++ notw %cx ++ sall $16, %ecx ++ jnz L(loop_end_ret) ++ ++ pmovmskb %xmm2, %ecx ++ notw %cx ++ shlq $32, %rcx ++ jnz L(loop_end_ret) ++ ++ addq $48, %rdi ++ addq $48, %rsi ++ movq %rax, %rcx ++ ++ .p2align 4,, 6 ++L(loop_end_ret): ++ bsfq %rcx, %rcx + # ifdef USE_AS_WMEMCMP +-/* for wmemcmp */ +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +-# endif +- +-L(diffin4bytes): +-# ifndef USE_AS_WMEMCMP +- cmp %cx, %ax +- jne L(diffin2bytes) +- shr $16, %ecx +- shr $16, %eax +-L(diffin2bytes): +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(end): +- and $0xff, %eax +- and $0xff, %ecx +- sub %ecx, %eax +- ret ++ movl (%rdi, %rcx), %eax ++ xorl %edx, %edx ++ cmpl (%rsi, %rcx), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else +- +-/* for wmemcmp */ +- mov $1, %eax +- jl L(nequal_bigger) +- neg %eax +- ret +- +- .p2align 4 +-L(nequal_bigger): +- ret +- +-L(unreal_case): +- xor %eax, %eax +- ret ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +- ++ ret + END (MEMCMP) +- +- .section .rodata.sse4.1,"a",@progbits +- .p2align 3 +-# ifndef USE_AS_WMEMCMP +-L(table_64bytes): +- .int JMPTBL (L(0bytes), L(table_64bytes)) +- .int JMPTBL (L(1bytes), L(table_64bytes)) +- .int JMPTBL (L(2bytes), L(table_64bytes)) +- .int JMPTBL (L(3bytes), L(table_64bytes)) +- .int JMPTBL (L(4bytes), L(table_64bytes)) +- .int JMPTBL (L(5bytes), L(table_64bytes)) +- .int JMPTBL (L(6bytes), L(table_64bytes)) +- .int JMPTBL (L(7bytes), L(table_64bytes)) +- .int JMPTBL (L(8bytes), L(table_64bytes)) +- .int JMPTBL (L(9bytes), L(table_64bytes)) +- .int JMPTBL (L(10bytes), L(table_64bytes)) +- .int JMPTBL (L(11bytes), L(table_64bytes)) +- .int JMPTBL (L(12bytes), L(table_64bytes)) +- .int JMPTBL (L(13bytes), L(table_64bytes)) +- .int 
JMPTBL (L(14bytes), L(table_64bytes)) +- .int JMPTBL (L(15bytes), L(table_64bytes)) +- .int JMPTBL (L(16bytes), L(table_64bytes)) +- .int JMPTBL (L(17bytes), L(table_64bytes)) +- .int JMPTBL (L(18bytes), L(table_64bytes)) +- .int JMPTBL (L(19bytes), L(table_64bytes)) +- .int JMPTBL (L(20bytes), L(table_64bytes)) +- .int JMPTBL (L(21bytes), L(table_64bytes)) +- .int JMPTBL (L(22bytes), L(table_64bytes)) +- .int JMPTBL (L(23bytes), L(table_64bytes)) +- .int JMPTBL (L(24bytes), L(table_64bytes)) +- .int JMPTBL (L(25bytes), L(table_64bytes)) +- .int JMPTBL (L(26bytes), L(table_64bytes)) +- .int JMPTBL (L(27bytes), L(table_64bytes)) +- .int JMPTBL (L(28bytes), L(table_64bytes)) +- .int JMPTBL (L(29bytes), L(table_64bytes)) +- .int JMPTBL (L(30bytes), L(table_64bytes)) +- .int JMPTBL (L(31bytes), L(table_64bytes)) +- .int JMPTBL (L(32bytes), L(table_64bytes)) +- .int JMPTBL (L(33bytes), L(table_64bytes)) +- .int JMPTBL (L(34bytes), L(table_64bytes)) +- .int JMPTBL (L(35bytes), L(table_64bytes)) +- .int JMPTBL (L(36bytes), L(table_64bytes)) +- .int JMPTBL (L(37bytes), L(table_64bytes)) +- .int JMPTBL (L(38bytes), L(table_64bytes)) +- .int JMPTBL (L(39bytes), L(table_64bytes)) +- .int JMPTBL (L(40bytes), L(table_64bytes)) +- .int JMPTBL (L(41bytes), L(table_64bytes)) +- .int JMPTBL (L(42bytes), L(table_64bytes)) +- .int JMPTBL (L(43bytes), L(table_64bytes)) +- .int JMPTBL (L(44bytes), L(table_64bytes)) +- .int JMPTBL (L(45bytes), L(table_64bytes)) +- .int JMPTBL (L(46bytes), L(table_64bytes)) +- .int JMPTBL (L(47bytes), L(table_64bytes)) +- .int JMPTBL (L(48bytes), L(table_64bytes)) +- .int JMPTBL (L(49bytes), L(table_64bytes)) +- .int JMPTBL (L(50bytes), L(table_64bytes)) +- .int JMPTBL (L(51bytes), L(table_64bytes)) +- .int JMPTBL (L(52bytes), L(table_64bytes)) +- .int JMPTBL (L(53bytes), L(table_64bytes)) +- .int JMPTBL (L(54bytes), L(table_64bytes)) +- .int JMPTBL (L(55bytes), L(table_64bytes)) +- .int JMPTBL (L(56bytes), L(table_64bytes)) +- .int JMPTBL (L(57bytes), L(table_64bytes)) +- .int JMPTBL (L(58bytes), L(table_64bytes)) +- .int JMPTBL (L(59bytes), L(table_64bytes)) +- .int JMPTBL (L(60bytes), L(table_64bytes)) +- .int JMPTBL (L(61bytes), L(table_64bytes)) +- .int JMPTBL (L(62bytes), L(table_64bytes)) +- .int JMPTBL (L(63bytes), L(table_64bytes)) +- .int JMPTBL (L(64bytes), L(table_64bytes)) +- .int JMPTBL (L(65bytes), L(table_64bytes)) +- .int JMPTBL (L(66bytes), L(table_64bytes)) +- .int JMPTBL (L(67bytes), L(table_64bytes)) +- .int JMPTBL (L(68bytes), L(table_64bytes)) +- .int JMPTBL (L(69bytes), L(table_64bytes)) +- .int JMPTBL (L(70bytes), L(table_64bytes)) +- .int JMPTBL (L(71bytes), L(table_64bytes)) +- .int JMPTBL (L(72bytes), L(table_64bytes)) +- .int JMPTBL (L(73bytes), L(table_64bytes)) +- .int JMPTBL (L(74bytes), L(table_64bytes)) +- .int JMPTBL (L(75bytes), L(table_64bytes)) +- .int JMPTBL (L(76bytes), L(table_64bytes)) +- .int JMPTBL (L(77bytes), L(table_64bytes)) +- .int JMPTBL (L(78bytes), L(table_64bytes)) +- .int JMPTBL (L(79bytes), L(table_64bytes)) +-# else +-L(table_64bytes): +- .int JMPTBL (L(0bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(4bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(8bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int 
JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(12bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(16bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(20bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(24bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(28bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(32bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(36bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(40bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(44bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(48bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(52bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(56bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(60bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(64bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(68bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(72bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(76bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +-# endif + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-64.patch b/glibc-RHEL-15696-64.patch new file mode 100644 index 
0000000..ba7f14a --- /dev/null +++ b/glibc-RHEL-15696-64.patch @@ -0,0 +1,39 @@ +From 0b82747dc48d5bf0871bdc6da8cb6eec1256355f Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 11 Nov 2021 06:31:51 -0800 +Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_lock_full [BZ + #28537] +Content-type: text/plain; charset=UTF-8 + +Replace boolean CAS with value CAS to avoid the extra load. + +Reviewed-by: Szabolcs Nagy +--- + nptl/pthread_mutex_lock.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index 29cc143e..60ada70d 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -292,12 +292,12 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex) + meantime. */ + if ((oldval & FUTEX_WAITERS) == 0) + { +- if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock, +- oldval | FUTEX_WAITERS, +- oldval) +- != 0) ++ int val; ++ if ((val = atomic_compare_and_exchange_val_acq ++ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, ++ oldval)) != oldval) + { +- oldval = mutex->__data.__lock; ++ oldval = val; + continue; + } + oldval |= FUTEX_WAITERS; +-- +GitLab + diff --git a/glibc-RHEL-15696-65.patch b/glibc-RHEL-15696-65.patch new file mode 100644 index 0000000..296d4a9 --- /dev/null +++ b/glibc-RHEL-15696-65.patch @@ -0,0 +1,39 @@ +From 49302b8fdf9103b6fc0a398678668a22fa19574c Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 11 Nov 2021 06:54:01 -0800 +Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_clocklock_common + [BZ #28537] +Content-type: text/plain; charset=UTF-8 + +Replace boolean CAS with value CAS to avoid the extra load. + +Reviewed-by: Szabolcs Nagy +--- + nptl/pthread_mutex_timedlock.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c +index 888c12fe..c4627ef6 100644 +--- a/nptl/pthread_mutex_timedlock.c ++++ b/nptl/pthread_mutex_timedlock.c +@@ -269,12 +269,12 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex, + meantime. */ + if ((oldval & FUTEX_WAITERS) == 0) + { +- if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock, +- oldval | FUTEX_WAITERS, +- oldval) +- != 0) ++ int val; ++ if ((val = atomic_compare_and_exchange_val_acq ++ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, ++ oldval)) != oldval) + { +- oldval = mutex->__data.__lock; ++ oldval = val; + continue; + } + oldval |= FUTEX_WAITERS; +-- +GitLab + diff --git a/glibc-RHEL-15696-66.patch b/glibc-RHEL-15696-66.patch new file mode 100644 index 0000000..4579636 --- /dev/null +++ b/glibc-RHEL-15696-66.patch @@ -0,0 +1,51 @@ +From d672a98a1af106bd68deb15576710cd61363f7a6 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Tue, 2 Nov 2021 18:33:07 -0700 +Subject: [PATCH] Add LLL_MUTEX_READ_LOCK [BZ #28537] +Content-type: text/plain; charset=UTF-8 + +CAS instruction is expensive. From the x86 CPU's point of view, getting +a cache line for writing is more expensive than reading. See Appendix +A.2 Spinlock in: + +https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf + +The full compare and swap will grab the cache line exclusive and cause +excessive cache line bouncing. + +Add LLL_MUTEX_READ_LOCK to do an atomic load and skip CAS in spinlock +loop if compare may fail to reduce cache line bouncing on contended locks. 
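The three mutex patches in this group (glibc-RHEL-15696-64/65/66) share one goal: keep the expensive CAS off the contended path. Below is a minimal C11 sketch of that pattern; it is not glibc's internal LLL_MUTEX_* machinery, and the toy_lock type with 0 = free / 1 = held is an assumption made purely for illustration. A value-returning compare-exchange hands back the value it observed on failure, so the lock word does not have to be reloaded, and the retry loop spins on a plain read so the cache line stays shared instead of bouncing between cores.

#include <stdatomic.h>

struct toy_lock { atomic_int word; };      /* hypothetical lock word: 0 = free, 1 = held */

static void
toy_lock_acquire (struct toy_lock *m)
{
  int expected = 0;
  /* On failure, C11 compare_exchange writes the value it saw back into
     'expected' -- the analogue of the value CAS above, avoiding an extra
     load of the lock word.  */
  while (!atomic_compare_exchange_strong_explicit (&m->word, &expected, 1,
                                                   memory_order_acquire,
                                                   memory_order_relaxed))
    {
      /* Spin on a read-only load first (the LLL_MUTEX_READ_LOCK idea):
         repeated CAS attempts would keep pulling the line exclusive.  */
      while (atomic_load_explicit (&m->word, memory_order_relaxed) != 0)
        ;
      expected = 0;
    }
}

The acquire/relaxed orderings are the conventional choice for a lock acquire; the orderings glibc's lowlevellock actually uses vary per architecture.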
+ +Reviewed-by: Szabolcs Nagy +--- + nptl/pthread_mutex_lock.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index 60ada70d..eb4d8baa 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -56,6 +56,11 @@ + #define FORCE_ELISION(m, s) + #endif + ++#ifndef LLL_MUTEX_READ_LOCK ++# define LLL_MUTEX_READ_LOCK(mutex) \ ++ atomic_load_relaxed (&(mutex)->__data.__lock) ++#endif ++ + static int __pthread_mutex_lock_full (pthread_mutex_t *mutex) + __attribute_noinline__; + +@@ -136,6 +141,8 @@ __pthread_mutex_lock (pthread_mutex_t *mutex) + break; + } + atomic_spin_nop (); ++ if (LLL_MUTEX_READ_LOCK (mutex) != 0) ++ continue; + } + while (LLL_MUTEX_TRYLOCK (mutex) != 0); + +-- +GitLab + diff --git a/glibc-RHEL-15696-67.patch b/glibc-RHEL-15696-67.patch new file mode 100644 index 0000000..73c8306 --- /dev/null +++ b/glibc-RHEL-15696-67.patch @@ -0,0 +1,71 @@ +From 120ac6d238825452e8024e2f627da33b2508dfd3 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 12 Nov 2021 11:47:42 -0800 +Subject: [PATCH] Move assignment out of the CAS condition +Content-type: text/plain; charset=UTF-8 + +Update + +commit 49302b8fdf9103b6fc0a398678668a22fa19574c +Author: H.J. Lu +Date: Thu Nov 11 06:54:01 2021 -0800 + + Avoid extra load with CAS in __pthread_mutex_clocklock_common [BZ #28537] + + Replace boolean CAS with value CAS to avoid the extra load. + +and + +commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f +Author: H.J. Lu +Date: Thu Nov 11 06:31:51 2021 -0800 + + Avoid extra load with CAS in __pthread_mutex_lock_full [BZ #28537] + + Replace boolean CAS with value CAS to avoid the extra load. + +by moving assignment out of the CAS condition. +--- + nptl/pthread_mutex_lock.c | 7 +++---- + nptl/pthread_mutex_timedlock.c | 7 +++---- + 2 files changed, 6 insertions(+), 8 deletions(-) + +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index eb4d8baa..a633d95e 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -299,10 +299,9 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex) + meantime. */ + if ((oldval & FUTEX_WAITERS) == 0) + { +- int val; +- if ((val = atomic_compare_and_exchange_val_acq +- (&mutex->__data.__lock, oldval | FUTEX_WAITERS, +- oldval)) != oldval) ++ int val = atomic_compare_and_exchange_val_acq ++ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval); ++ if (val != oldval) + { + oldval = val; + continue; +diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c +index c4627ef6..a76c30b7 100644 +--- a/nptl/pthread_mutex_timedlock.c ++++ b/nptl/pthread_mutex_timedlock.c +@@ -269,10 +269,9 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex, + meantime. 
*/ + if ((oldval & FUTEX_WAITERS) == 0) + { +- int val; +- if ((val = atomic_compare_and_exchange_val_acq +- (&mutex->__data.__lock, oldval | FUTEX_WAITERS, +- oldval)) != oldval) ++ int val = atomic_compare_and_exchange_val_acq ++ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval); ++ if (val != oldval) + { + oldval = val; + continue; +-- +GitLab + diff --git a/glibc-RHEL-15696-68.patch b/glibc-RHEL-15696-68.patch new file mode 100644 index 0000000..df35b31 --- /dev/null +++ b/glibc-RHEL-15696-68.patch @@ -0,0 +1,60 @@ +From 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 3 Dec 2021 15:29:25 -0800 +Subject: [PATCH] x86-64: Use notl in EVEX strcmp [BZ #28646] +Content-type: text/plain; charset=UTF-8 + +Must use notl %edi here as lower bits are for CHAR comparisons +potentially out of range thus can be 0 without indicating mismatch. +This fixes BZ #28646. + +Co-Authored-By: H.J. Lu +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 14 ++++++++------ + 1 file changed, 8 insertions(+), 6 deletions(-) + +Conflicts: + string/test-strcmp.c + (new check omitted) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 82f12ac8..6f5c4bf9 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -656,12 +656,13 @@ L(loop_cross_page): + in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ + VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} + kmovd %k3, %edi ++ /* Must use notl %edi here as lower bits are for CHAR ++ comparisons potentially out of range thus can be 0 without ++ indicating mismatch. */ ++ notl %edi + # ifdef USE_AS_WCSCMP + /* Don't use subl since it is the upper 8 bits of EDI below. */ +- notl %edi + andl $0xff, %edi +-# else +- incl %edi + # endif + + # ifdef USE_AS_WCSCMP +@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec): + in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ + VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} + kmovd %k3, %edi ++ /* Must use notl %edi here as lower bits are for CHAR ++ comparisons potentially out of range thus can be 0 without ++ indicating mismatch. */ ++ notl %edi + # ifdef USE_AS_WCSCMP + /* Don't use subl since it is the upper 8 bits of EDI below. */ +- notl %edi + andl $0xff, %edi +-# else +- incl %edi + # endif + + # ifdef USE_AS_WCSCMP +-- +GitLab + diff --git a/glibc-RHEL-15696-69.patch b/glibc-RHEL-15696-69.patch new file mode 100644 index 0000000..9f859f2 --- /dev/null +++ b/glibc-RHEL-15696-69.patch @@ -0,0 +1,35 @@ +From ceeffe968c01b1202e482f4855cb6baf5c6cb713 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 6 Dec 2021 07:14:12 -0800 +Subject: [PATCH] x86: Don't set Prefer_No_AVX512 for processors with AVX512 + and AVX-VNNI +Content-type: text/plain; charset=UTF-8 + +Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since +they won't lower CPU frequency when ZMM load and store instructions are +used. 
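Going back to the strcmp-evex fix above (the notl %edi change), the mask reasoning is easier to see in scalar form. The sketch below is illustrative only; cmp_mask and in_range are made-up names, not glibc identifiers. cmp_mask has one bit per lane that compared equal. When every lane is known to be valid, "all equal" can be tested cheaply as cmp_mask + 1 == 0 (the incl idiom the patch removes), but when some lanes lie past the characters actually being compared their bits can legitimately be 0, so the mask must be inverted first and then restricted to the in-range lanes.

#include <stdint.h>

/* Return non-zero iff some in-range lane mismatched.  */
static inline uint32_t
mismatch_lanes (uint32_t cmp_mask, uint32_t in_range)
{
  /* notl + mask: out-of-range lanes are discarded, so a stray 0 bit
     beyond the compared characters can no longer be mistaken for a
     mismatch.  */
  return ~cmp_mask & in_range;
}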
+--- + sysdeps/x86/cpu-features.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 956bfb4f..5ff2baa0 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -525,8 +525,11 @@ init_cpu_features (struct cpu_features *cpu_features) + |= bit_arch_Prefer_No_VZEROUPPER; + else + { +- cpu_features->preferred[index_arch_Prefer_No_AVX512] +- |= bit_arch_Prefer_No_AVX512; ++ /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency ++ when ZMM load and store instructions are used. */ ++ if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI)) ++ cpu_features->preferred[index_arch_Prefer_No_AVX512] ++ |= bit_arch_Prefer_No_AVX512; + + /* Avoid RTM abort triggered by VZEROUPPER inside a + transactionally executing RTM region. */ +-- +GitLab + diff --git a/glibc-RHEL-15696-7.patch b/glibc-RHEL-15696-7.patch new file mode 100644 index 0000000..8ef468c --- /dev/null +++ b/glibc-RHEL-15696-7.patch @@ -0,0 +1,153 @@ +From c7c54f65b080affb87a1513dee449c8ad6143c8b Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:35:18 -0800 +Subject: [PATCH] x86-64 strncpy: Properly handle the length parameter [BZ# + 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes strncpy for x32. Tested on x86-64 and x32. On x86-64, +libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strcpy-avx2.S: Use RDX_LP for length. + * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise. + * sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy. + * sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file. 
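The x32 note above is the whole point of the %RDX_LP / %R8_LP changes that follow: the length arrives in a 64-bit register whose upper half is not guaranteed to be zero. A one-line C analogue, where length_from_register and raw_reg are illustrative stand-ins rather than a real glibc interface:

#include <stddef.h>
#include <stdint.h>

/* On x32, size_t is 32 bits wide, so only the low half of the incoming
   register is the length; the upper 32 bits may hold junk and must be
   ignored, which is what using the 32-bit register alias (e.g.
   'mov %edx, %r8d') achieves in the assembly.  */
static size_t
length_from_register (uint64_t raw_reg)
{
  return (uint32_t) raw_reg;
}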
+--- + .../x86_64/multiarch/strcpy-sse2-unaligned.S | 4 +- + sysdeps/x86_64/multiarch/strcpy-ssse3.S | 6 +- + sysdeps/x86_64/x32/Makefile | 2 +- + sysdeps/x86_64/x32/tst-size_t-strncpy.c | 58 +++++++++++++++++++ + 4 files changed, 64 insertions(+), 6 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c + +Conflicts: + ChangeLog + (removed) + sysdeps/x86_64/multiarch/strcpy-avx2.S + (skipped, only needed for x32 arch) + +diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S +index 72bf7e85..50aca22d 100644 +--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S ++++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S +@@ -40,8 +40,8 @@ + .text + ENTRY (STRCPY) + # ifdef USE_AS_STRNCPY +- mov %rdx, %r8 +- test %r8, %r8 ++ mov %RDX_LP, %R8_LP ++ test %R8_LP, %R8_LP + jz L(ExitZero) + # endif + mov %rsi, %rcx +diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S +index 9858d0c4..0a62814a 100644 +--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S ++++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S +@@ -31,13 +31,13 @@ ENTRY (STRCPY) + + mov %rsi, %rcx + # ifdef USE_AS_STRNCPY +- mov %rdx, %r8 ++ mov %RDX_LP, %R8_LP + # endif + mov %rdi, %rdx + # ifdef USE_AS_STRNCPY +- test %r8, %r8 ++ test %R8_LP, %R8_LP + jz L(Exit0) +- cmp $8, %r8 ++ cmp $8, %R8_LP + jbe L(StrncpyExit8Bytes) + # endif + cmpb $0, (%rcx) +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index db302839..2a9e20a9 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -8,7 +8,7 @@ endif + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ + tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ +- tst-size_t-strncmp ++ tst-size_t-strncmp tst-size_t-strncpy + endif + + ifeq ($(subdir),wcsmbs) +diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c +new file mode 100644 +index 00000000..4dec71e6 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c +@@ -0,0 +1,58 @@ ++/* Test strncpy with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#define TEST_NAME "strncpy" ++#include "test-size_t.h" ++ ++IMPL (strncpy, 1) ++ ++typedef char *(*proto_t) (char *, const char*, size_t); ++ ++static void * ++__attribute__ ((noinline, noclone)) ++do_strncpy (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ do_strncpy (dest, src); ++ int res = strncmp (dest.p, src.p, dest.len); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +-- +GitLab + diff --git a/glibc-RHEL-15696-70.patch b/glibc-RHEL-15696-70.patch new file mode 100644 index 0000000..8935ac5 --- /dev/null +++ b/glibc-RHEL-15696-70.patch @@ -0,0 +1,389 @@ +From abddd61de090ae84e380aff68a98bd94ef704667 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 24 Dec 2021 18:54:41 -0600 +Subject: [PATCH] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S +Content-type: text/plain; charset=UTF-8 + +No bug. +Optimizations are twofold. + +1) Replace page cross and 0/1 checks with masked load instructions in + L(less_vec). In applications this reduces branch-misses in the + hot [0, 32] case. +2) Change controlflow so that L(less_vec) case gets the fall through. + +Change 2) helps copies in the [0, 32] size range but comes at the cost +of copies in the [33, 64] size range. From profiles of GCC and +Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this +appears to the the right tradeoff. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 249 +++++-------------- + 1 file changed, 56 insertions(+), 193 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 640f6757..d2899e7c 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -62,15 +62,18 @@ Latency: + # define VMOVU vmovdqu64 + + # ifdef USE_AS_WMEMCMP ++# define VMOVU_MASK vmovdqu32 + # define CHAR_SIZE 4 + # define VPCMP vpcmpd + # define VPTEST vptestmd + # else ++# define VMOVU_MASK vmovdqu8 + # define CHAR_SIZE 1 + # define VPCMP vpcmpub + # define VPTEST vptestmb + # endif + ++ + # define VEC_SIZE 32 + # define PAGE_SIZE 4096 + # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) +@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6) + movl %edx, %edx + # endif + cmp $CHAR_PER_VEC, %RDX_LP +- jb L(less_vec) ++ /* Fall through for [0, VEC_SIZE] as its the hottest. */ ++ ja L(more_1x_vec) ++ ++ /* Create mask for CHAR's we want to compare. This allows us to ++ avoid having to include page cross logic. */ ++ movl $-1, %ecx ++ bzhil %edx, %ecx, %ecx ++ kmovd %ecx, %k2 ++ ++ /* Safe to load full ymm with mask. */ ++ VMOVU_MASK (%rsi), %YMM2{%k2} ++ VPCMP $4,(%rdi), %YMM2, %k1{%k2} ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) ++ ret + ++ .p2align 4 ++L(return_vec_0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi, %rax, CHAR_SIZE), %ecx ++ /* NB: no partial register stall here because xorl zero idiom ++ above. 
*/ ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ ++ .p2align 4 ++L(more_1x_vec): + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %YMM1 + /* Use compare not equals to directly check for mismatch. */ +- VPCMP $4, (%rdi), %YMM1, %k1 ++ VPCMP $4,(%rdi), %YMM1, %k1 + kmovd %k1, %eax + /* NB: eax must be destination register if going to + L(return_vec_[0,2]). For L(return_vec_3) destination register +@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6) + + /* Check third and fourth VEC no matter what. */ + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 +- VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1 ++ VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 +- VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1 ++ VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(return_vec_3) +@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6) + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while + oring with YMM1. Result is stored in YMM4. */ +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 + + /* Or together YMM2, YMM3, and YMM4 into YMM4. */ + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 +@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6) + /* NB: eax must be zero to reach here. */ + ret + +- .p2align 4 ++ ++ .p2align 4,, 8 + L(8x_end_return_vec_0_1_2_3): + movq %rdx, %rdi + L(8x_return_vec_0_1_2_3): +@@ -222,23 +262,6 @@ L(return_vec_3): + # endif + ret + +- .p2align 4 +-L(return_vec_0): +- tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax, CHAR_SIZE), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax, CHAR_SIZE), %ecx +- /* NB: no partial register stall here because xorl zero idiom +- above. */ +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret + + .p2align 4 + L(return_vec_1): +@@ -297,7 +320,7 @@ L(loop_4x_vec): + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +@@ -324,7 +347,7 @@ L(loop_4x_vec): + VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 + vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +@@ -336,14 +359,14 @@ L(loop_4x_vec): + /* Only entry is from L(more_8x_vec). */ + .p2align 4,, 10 + L(8x_last_2x_vec): +- VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 ++ VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(8x_return_vec_2) + /* Naturally aligned to 16 bytes. */ + L(8x_last_1x_vec): + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1 +- VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1 ++ VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(8x_return_vec_3) +@@ -392,7 +415,9 @@ L(last_1x_vec): + jnz L(return_vec_0_end) + ret + +- .p2align 4,, 10 ++ ++ /* Don't align. Takes 2-fetch blocks either way and aligning ++ will cause code to spill into another cacheline. 
*/ + L(return_vec_1_end): + /* Use bsf to save code size. This is necessary to have + L(one_or_less) fit in aligning bytes between. */ +@@ -411,31 +436,8 @@ L(return_vec_1_end): + # endif + ret + +- /* NB: L(one_or_less) fits in alignment padding between +- L(return_vec_1_end) and L(return_vec_0_end). */ +-# ifdef USE_AS_WMEMCMP +-L(one_or_less): +- jb L(zero) +- movl (%rdi), %ecx +- xorl %edx, %edx +- cmpl (%rsi), %ecx +- je L(zero) +- setg %dl +- leal -1(%rdx, %rdx), %eax +- ret +-# else +-L(one_or_less): +- jb L(zero) +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +- ret +-# endif +-L(zero): +- xorl %eax, %eax +- ret +- +- .p2align 4 ++ /* Don't align. Takes 2-fetch blocks either way and aligning ++ will cause code to spill into another cacheline. */ + L(return_vec_0_end): + tzcntl %eax, %eax + addl %edx, %eax +@@ -451,146 +453,7 @@ L(return_vec_0_end): + subl %ecx, %eax + # endif + ret ++ /* 1-byte until next cache line. */ + +- .p2align 4 +-L(less_vec): +- /* Check if one or less CHAR. This is necessary for size == 0 +- but is also faster for size == CHAR_SIZE. */ +- cmpl $1, %edx +- jbe L(one_or_less) +- +- /* Check if loading one VEC from either s1 or s2 could cause a +- page cross. This can have false positives but is by far the +- fastest method. */ +- movl %edi, %eax +- orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(page_cross_less_vec) +- +- /* No page cross possible. */ +- VMOVU (%rsi), %YMM2 +- VPCMP $4, (%rdi), %YMM2, %k1 +- kmovd %k1, %eax +- /* Check if any matches where in bounds. Intentionally not +- storing result in eax to limit dependency chain if it goes to +- L(return_vec_0_lv). */ +- bzhil %edx, %eax, %edx +- jnz L(return_vec_0_lv) +- xorl %eax, %eax +- ret +- +- /* Essentially duplicate of L(return_vec_0). Ends up not costing +- any code as shrinks L(less_vec) by allowing 2-byte encoding of +- the jump and ends up fitting in aligning bytes. As well fits on +- same cache line as L(less_vec) so also saves a line from having +- to be fetched on cold calls to memcmp. */ +- .p2align 4,, 4 +-L(return_vec_0_lv): +- tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax, CHAR_SIZE), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax, CHAR_SIZE), %ecx +- /* NB: no partial register stall here because xorl zero idiom +- above. */ +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(page_cross_less_vec): +- /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 +- bytes. */ +- cmpl $(16 / CHAR_SIZE), %edx +- jae L(between_16_31) +-# ifndef USE_AS_WMEMCMP +- cmpl $8, %edx +- jae L(between_8_15) +- cmpl $4, %edx +- jb L(between_2_3) +- +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- /* edx is guranteed to be positive int32 in range [4, 7]. */ +- cmovne %edx, %eax +- /* ecx is -1 if rcx > rax. Otherwise 0. */ +- sbbl %ecx, %ecx +- /* If rcx > rax, then ecx is 0 and eax is positive. If rcx == +- rax then eax and ecx are zero. If rax < rax then ecx is -1 so +- eax doesn't matter. */ +- orl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(between_8_15): +-# endif +- /* If USE_AS_WMEMCMP fall through into 8-15 byte case. 
*/ +- vmovq (%rdi), %xmm1 +- vmovq (%rsi), %xmm2 +- VPCMP $4, %xmm1, %xmm2, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(return_vec_0_lv) +- /* Use overlapping loads to avoid branches. */ +- vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1 +- vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2 +- VPCMP $4, %xmm1, %xmm2, %k1 +- addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(return_vec_0_end) +- ret +- +- .p2align 4,, 8 +-L(between_16_31): +- /* From 16 to 31 bytes. No branch when size == 16. */ +- +- /* Use movups to save code size. */ +- vmovdqu (%rsi), %xmm2 +- VPCMP $4, (%rdi), %xmm2, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(return_vec_0_lv) +- /* Use overlapping loads to avoid branches. */ +- vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2 +- VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 +- addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(return_vec_0_end) +- ret +- +-# ifndef USE_AS_WMEMCMP +-L(between_2_3): +- /* Load as big endian to avoid branches. */ +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- /* Subtraction is okay because the upper 8 bits are zero. */ +- subl %ecx, %eax +- ret +-# endif + END (MEMCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-71.patch b/glibc-RHEL-15696-71.patch new file mode 100644 index 0000000..2d018d0 --- /dev/null +++ b/glibc-RHEL-15696-71.patch @@ -0,0 +1,43 @@ +From 6b8dbbd03ac88f169b65b5c7d7278576a11d2e44 Mon Sep 17 00:00:00 2001 +From: Jangwoong Kim <6812skiii@gmail.com> +Date: Tue, 14 Dec 2021 21:30:51 +0900 +Subject: [PATCH] nptl: Effectively skip CAS in spinlock loop +Content-type: text/plain; charset=UTF-8 + +The commit: +"Add LLL_MUTEX_READ_LOCK [BZ #28537]" +SHA1: d672a98a1af106bd68deb15576710cd61363f7a6 + +introduced LLL_MUTEX_READ_LOCK, to skip CAS in spinlock loop +if atomic load fails. But, "continue" inside of do-while loop +does not skip the evaluation of escape expression, thus CAS +is not skipped. + +Replace do-while with while and skip LLL_MUTEX_TRYLOCK if +LLL_MUTEX_READ_LOCK fails. + +Reviewed-by: H.J. Lu +--- + nptl/pthread_mutex_lock.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index a633d95e..d96a9933 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -141,10 +141,9 @@ __pthread_mutex_lock (pthread_mutex_t *mutex) + break; + } + atomic_spin_nop (); +- if (LLL_MUTEX_READ_LOCK (mutex) != 0) +- continue; + } +- while (LLL_MUTEX_TRYLOCK (mutex) != 0); ++ while (LLL_MUTEX_READ_LOCK (mutex) != 0 ++ || LLL_MUTEX_TRYLOCK (mutex) != 0); + + mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8; + } +-- +GitLab + diff --git a/glibc-RHEL-15696-72.patch b/glibc-RHEL-15696-72.patch new file mode 100644 index 0000000..34f2a61 --- /dev/null +++ b/glibc-RHEL-15696-72.patch @@ -0,0 +1,146 @@ +From 7835d611af0854e69a0c71e3806f8fe379282d6f Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 18 Feb 2022 14:19:15 -0600 +Subject: [PATCH] x86: Test wcscmp RTM in the wcsncmp overflow case [BZ #28896] +Content-type: text/plain; charset=UTF-8 + +In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would +call strcmp-avx2 and wcscmp-avx2 respectively. This would have +not checks around vzeroupper and would trigger spurious +aborts. This commit fixes that. 
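The "overflow fallback" this test exercises is the dispatch taken for lengths too large to ever bound a real buffer: the bounded routine then simply behaves like the unbounded one, which is why that fallback must also be RTM-safe. A rough C model of the idea follows; toy_strncmp is illustrative, and the 2^63 threshold is taken from the comments in the strcmp-avx2 assembly later in this series, not from a public interface.

#include <stdint.h>
#include <string.h>

static int
toy_strncmp (const char *a, const char *b, size_t n)
{
  /* A length of 2^63 or more cannot bound any valid buffer, so treat the
     call as an ordinary strcmp -- the path the RTM test forces by passing
     SIZE_MAX.  */
  if (n > (SIZE_MAX >> 1))
    return strcmp (a, b);
  return strncmp (a, b, n);
}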
+ +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on +AVX2 machines with and without RTM. +Reviewed-by: H.J. Lu +--- + sysdeps/x86/Makefile | 5 ++++- + sysdeps/x86/tst-strncmp-rtm.c | 32 +++++++++++++++++++++++--------- + sysdeps/x86/tst-wcsncmp-rtm.c | 21 +++++++++++++++++++++ + 3 files changed, 48 insertions(+), 10 deletions(-) + create mode 100644 sysdeps/x86/tst-wcsncmp-rtm.c + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 2d814915..c2111f49 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -28,7 +28,9 @@ tests += \ + tst-strcpy-rtm \ + tst-strlen-rtm \ + tst-strncmp-rtm \ +- tst-strrchr-rtm ++ tst-strrchr-rtm \ ++ tst-wcsncmp-rtm \ ++# tests + + CFLAGS-tst-memchr-rtm.c += -mrtm + CFLAGS-tst-memcmp-rtm.c += -mrtm +@@ -40,6 +42,7 @@ CFLAGS-tst-strcpy-rtm.c += -mrtm + CFLAGS-tst-strlen-rtm.c += -mrtm + CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error + CFLAGS-tst-strrchr-rtm.c += -mrtm ++CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error + endif + + ifneq ($(enable-cet),no) +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index 4d0004b5..4e9f094f 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -19,18 +19,32 @@ + #include + #include + ++#ifdef WIDE ++# define CHAR wchar_t ++# define MEMSET wmemset ++# define STRNCMP wcsncmp ++# define TEST_NAME wcsncmp ++#else /* !WIDE */ ++# define CHAR char ++# define MEMSET memset ++# define STRNCMP strncmp ++# define TEST_NAME strncmp ++#endif /* !WIDE */ ++ ++ ++ + #define LOOP 3000 + #define STRING_SIZE 1024 +-char string1[STRING_SIZE]; +-char string2[STRING_SIZE]; ++CHAR string1[STRING_SIZE]; ++CHAR string2[STRING_SIZE]; + + __attribute__ ((noinline, noclone)) + static int + prepare (void) + { +- memset (string1, 'a', STRING_SIZE - 1); +- memset (string2, 'a', STRING_SIZE - 1); +- if (strncmp (string1, string2, STRING_SIZE) == 0) ++ MEMSET (string1, 'a', STRING_SIZE - 1); ++ MEMSET (string2, 'a', STRING_SIZE - 1); ++ if (STRNCMP (string1, string2, STRING_SIZE) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +@@ -40,7 +54,7 @@ __attribute__ ((noinline, noclone)) + static int + function (void) + { +- if (strncmp (string1, string2, STRING_SIZE) == 0) ++ if (STRNCMP (string1, string2, STRING_SIZE) == 0) + return 0; + else + return 1; +@@ -50,7 +64,7 @@ __attribute__ ((noinline, noclone)) + static int + function_overflow (void) + { +- if (strncmp (string1, string2, SIZE_MAX) == 0) ++ if (STRNCMP (string1, string2, SIZE_MAX) == 0) + return 0; + else + return 1; +@@ -59,9 +73,9 @@ function_overflow (void) + static int + do_test (void) + { +- int status = do_test_1 ("strncmp", LOOP, prepare, function); ++ int status = do_test_1 (TEST_NAME, LOOP, prepare, function); + if (status != EXIT_SUCCESS) + return status; +- status = do_test_1 ("strncmp", LOOP, prepare, function_overflow); ++ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); + return status; + } +diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c +new file mode 100644 +index 00000000..bad3b863 +--- /dev/null ++++ b/sysdeps/x86/tst-wcsncmp-rtm.c +@@ -0,0 +1,21 @@ ++/* Test case for wcsncmp inside a transactionally executing RTM region. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include ++#include "tst-strncmp-rtm.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-73.patch b/glibc-RHEL-15696-73.patch new file mode 100644 index 0000000..e8cc3a2 --- /dev/null +++ b/glibc-RHEL-15696-73.patch @@ -0,0 +1,37 @@ +From b98d0bbf747f39770e0caba7e984ce9f8f900330 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 18 Feb 2022 17:00:25 -0600 +Subject: [PATCH] x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c +Content-type: text/plain; charset=UTF-8 + +Previously TEST_NAME was passing a function pointer. This didn't fail +because of the -Wno-error flag (to allow for overflow sizes passed +to strncmp/wcsncmp) + +Reviewed-by: H.J. Lu +--- + sysdeps/x86/tst-strncmp-rtm.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index 4e9f094f..aef9866c 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -23,12 +23,12 @@ + # define CHAR wchar_t + # define MEMSET wmemset + # define STRNCMP wcsncmp +-# define TEST_NAME wcsncmp ++# define TEST_NAME "wcsncmp" + #else /* !WIDE */ + # define CHAR char + # define MEMSET memset + # define STRNCMP strncmp +-# define TEST_NAME strncmp ++# define TEST_NAME "strncmp" + #endif /* !WIDE */ + + +-- +GitLab + diff --git a/glibc-RHEL-15696-74.patch b/glibc-RHEL-15696-74.patch new file mode 100644 index 0000000..e5e6842 --- /dev/null +++ b/glibc-RHEL-15696-74.patch @@ -0,0 +1,1798 @@ +From b77b06e0e296f1a2276c27a67e1d44f2cfa38d45 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 10 Jan 2022 15:35:38 -0600 +Subject: [PATCH] x86: Optimize strcmp-avx2.S +Content-type: text/plain; charset=UTF-8 + +Optimization are primarily to the loop logic and how the page cross +logic interacts with the loop. + +The page cross logic is at times more expensive for short strings near +the end of a page but not crossing the page. This is done to retest +the page cross conditions with a non-faulty check and to improve the +logic for entering the loop afterwards. This is only particular cases, +however, and is general made up for by more than 10x improvements on +the transition from the page cross -> loop case. + +The non-page cross cases are improved most for smaller sizes [0, 128] +and go about even for (128, 4096]. The loop page cross logic is +improved so some more significant speedup is seen there as well. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. 
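Much of the rewrite below revolves around whether the 4-vector loop can run without faulting, and its entry check can be restated in scalar C. This is a sketch only: may_cross_page and the PAGE_SIZE/VEC_SIZE enums are illustrative, while the real code folds the same test into 'orl; sall $20; cmpl' on the low address bits. If either string sits within 4 * VEC_SIZE bytes of the end of its page, an unaligned 4-vector read could touch the next page, so the slow page-cross path is taken; ORing the two page offsets is a cheap over-approximation that can give false positives but never misses a real cross.

#include <stdbool.h>
#include <stdint.h>

enum { PAGE_SIZE = 4096, VEC_SIZE = 32 };

static bool
may_cross_page (const char *s1, const char *s2)
{
  /* Combined page offset of both pointers; always >= the larger offset.  */
  uintptr_t off = ((uintptr_t) s1 | (uintptr_t) s2) & (PAGE_SIZE - 1);
  return off > PAGE_SIZE - (VEC_SIZE * 4);
}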
+ +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 1592 ++++++++++++++---------- + 1 file changed, 940 insertions(+), 652 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/strcmp-avx2.S + (account for sw28896 patches) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 70d8499b..554ffe4c 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -26,35 +26,57 @@ + + # define PAGE_SIZE 4096 + +-/* VEC_SIZE = Number of bytes in a ymm register */ ++ /* VEC_SIZE = Number of bytes in a ymm register. */ + # define VEC_SIZE 32 + +-/* Shift for dividing by (VEC_SIZE * 4). */ +-# define DIVIDE_BY_VEC_4_SHIFT 7 +-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# endif ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa + + # ifdef USE_AS_WCSCMP +-/* Compare packed dwords. */ ++ /* Compare packed dwords. */ + # define VPCMPEQ vpcmpeqd +-/* Compare packed dwords and store minimum. */ ++ /* Compare packed dwords and store minimum. */ + # define VPMINU vpminud +-/* 1 dword char == 4 bytes. */ ++ /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else +-/* Compare packed bytes. */ ++ /* Compare packed bytes. */ + # define VPCMPEQ vpcmpeqb +-/* Compare packed bytes and store minimum. */ ++ /* Compare packed bytes and store minimum. */ + # define VPMINU vpminub +-/* 1 byte char == 1 byte. */ ++ /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif + ++# ifdef USE_AS_STRNCMP ++# define LOOP_REG r9d ++# define LOOP_REG64 r9 ++ ++# define OFFSET_REG8 r9b ++# define OFFSET_REG r9d ++# define OFFSET_REG64 r9 ++# else ++# define LOOP_REG edx ++# define LOOP_REG64 rdx ++ ++# define OFFSET_REG8 dl ++# define OFFSET_REG edx ++# define OFFSET_REG64 rdx ++# endif ++ + # ifndef VZEROUPPER + # define VZEROUPPER vzeroupper + # endif + ++# if defined USE_AS_STRNCMP ++# define VEC_OFFSET 0 ++# else ++# define VEC_OFFSET (-VEC_SIZE) ++# endif ++ ++# define xmmZERO xmm15 ++# define ymmZERO ymm15 ++ + # ifndef SECTION + # define SECTION(p) p##.avx + # endif +@@ -79,783 +101,1049 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section SECTION(.text),"ax",@progbits +-ENTRY (STRCMP) ++ .section SECTION(.text), "ax", @progbits ++ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP +- /* Check for simple cases (0 or 1) in offset. */ ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %rdx ++# endif + cmp $1, %RDX_LP +- je L(char0) +- jb L(zero) ++ /* Signed comparison intentional. We use this branch to also ++ test cases where length >= 2^63. These very large sizes can be ++ handled with strcmp as there is no way for that length to ++ actually bound the buffer. */ ++ jle L(one_or_less) + # ifdef USE_AS_WCSCMP +-# ifndef __ILP32__ + movq %rdx, %rcx +- /* Check if length could overflow when multiplied by +- sizeof(wchar_t). Checking top 8 bits will cover all potential +- overflow cases as well as redirect cases where its impossible to +- length to bound a valid memory region. In these cases just use +- 'wcscmp'. */ ++ ++ /* Multiplying length by sizeof(wchar_t) can result in overflow. ++ Check if that is possible. All cases where overflow are possible ++ are cases where length is large enough that it can never be a ++ bound on valid memory so just use wcscmp. */ + shrq $56, %rcx +- jnz OVERFLOW_STRCMP +-# endif +- /* Convert units: from wide to byte char. 
*/ +- shl $2, %RDX_LP ++ jnz __wcscmp_avx2 ++ ++ leaq (, %rdx, 4), %rdx + # endif +- /* Register %r11 tracks the maximum offset. */ +- mov %RDX_LP, %R11_LP + # endif ++ vpxor %xmmZERO, %xmmZERO, %xmmZERO + movl %edi, %eax +- xorl %edx, %edx +- /* Make %xmm7 (%ymm7) all zeros in this function. */ +- vpxor %xmm7, %xmm7, %xmm7 + orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +- jg L(cross_page) +- /* Start comparing 4 vectors. */ +- vmovdqu (%rdi), %ymm1 +- VPCMPEQ (%rsi), %ymm1, %ymm0 +- VPMINU %ymm1, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- je L(next_3_vectors) +- tzcntl %ecx, %edx ++ sall $20, %eax ++ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ ++ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax ++ ja L(page_cross) ++ ++L(no_page_cross): ++ /* Safe to compare 4x vectors. */ ++ VMOVU (%rdi), %ymm0 ++ /* 1s where s1 and s2 equal. */ ++ VPCMPEQ (%rsi), %ymm0, %ymm1 ++ /* 1s at null CHAR. */ ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ /* 1s where s1 and s2 equal AND not null CHAR. */ ++ vpandn %ymm1, %ymm2, %ymm1 ++ ++ /* All 1s -> keep going, any 0s -> return. */ ++ vpmovmskb %ymm1, %ecx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx) is after the maximum +- offset (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $VEC_SIZE, %rdx ++ jbe L(vec_0_test_len) + # endif ++ ++ /* All 1s represents all equals. incl will overflow to zero in ++ all equals case. Otherwise 1s will carry until position of first ++ mismatch. */ ++ incl %ecx ++ jz L(more_3x_vec) ++ ++ .p2align 4,, 4 ++L(return_vec_0): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- je L(return) +-L(wcscmp_return): ++ cmpl (%rsi, %rcx), %edx ++ je L(ret0) + setl %al + negl %eax + orl $1, %eax +-L(return): + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret0): + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + +- .p2align 4 +-L(return_vec_size): +- tzcntl %ecx, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after +- the maximum offset (%r11). */ +- addq $VEC_SIZE, %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ .p2align 4,, 8 ++L(vec_0_test_len): ++ notl %ecx ++ bzhil %edx, %ecx, %eax ++ jnz L(return_vec_0) ++ /* Align if will cross fetch block. */ ++ .p2align 4,, 2 ++L(ret_zero): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif +-# else ++ VZEROUPPER_RETURN ++ ++ .p2align 4,, 5 ++L(one_or_less): ++ jb L(ret_zero) + # ifdef USE_AS_WCSCMP ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __wcscmp_avx2 ++ movl (%rdi), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rdx), %ecx +- cmpl VEC_SIZE(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (%rsi), %edx ++ je L(ret1) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl VEC_SIZE(%rdi, %rdx), %eax +- movzbl VEC_SIZE(%rsi, %rdx), %edx +- subl %edx, %eax ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). 
*/ ++ ++ jnbe __strcmp_avx2 ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax + # endif ++L(ret1): ++ ret + # endif +- VZEROUPPER_RETURN + +- .p2align 4 +-L(return_2_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 ++L(return_vec_1): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 2), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ /* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of ++ overflow. */ ++ addq $-VEC_SIZE, %rdx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_SIZE(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ je L(ret2) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret2): + VZEROUPPER_RETURN + +- .p2align 4 +-L(return_3_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 3), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++L(return_vec_3): ++ salq $32, %rcx ++# endif ++ ++L(return_vec_2): ++# ifndef USE_AS_STRNCMP ++ tzcntl %ecx, %ecx ++# else ++ tzcntq %rcx, %rcx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ je L(ret3) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++L(ret3): ++ VZEROUPPER_RETURN ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx ++ je L(ret4) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret4): + VZEROUPPER_RETURN ++# endif ++ ++ .p2align 4,, 10 ++L(more_3x_vec): ++ /* Safe to compare 4x vectors. 
*/ ++ VMOVU VEC_SIZE(%rdi), %ymm0 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_1) ++ ++# ifdef USE_AS_STRNCMP ++ subq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_2) ++ ++ VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_3) + +- .p2align 4 +-L(next_3_vectors): +- vmovdqu VEC_SIZE(%rdi), %ymm6 +- VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 +- VPMINU %ymm6, %ymm3, %ymm3 +- VPCMPEQ %ymm7, %ymm3, %ymm3 +- vpmovmskb %ymm3, %ecx +- testl %ecx, %ecx +- jne L(return_vec_size) +- vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 +- vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 +- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 +- VPMINU %ymm5, %ymm2, %ymm2 +- VPCMPEQ %ymm4, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm2, %ymm2 +- vpmovmskb %ymm2, %ecx +- testl %ecx, %ecx +- jne L(return_2_vec_size) +- VPMINU %ymm4, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- jne L(return_3_vec_size) +-L(main_loop_header): +- leaq (VEC_SIZE * 4)(%rdi), %rdx +- movl $PAGE_SIZE, %ecx +- /* Align load via RAX. */ +- andq $-(VEC_SIZE * 4), %rdx +- subq %rdi, %rdx +- leaq (%rdi, %rdx), %rax + # ifdef USE_AS_STRNCMP +- /* Starting from this point, the maximum offset, or simply the +- 'offset', DECREASES by the same amount when base pointers are +- moved forward. Return 0 when: +- 1) On match: offset <= the matched vector index. +- 2) On mistmach, offset is before the mistmatched index. ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ /* any non-zero positive value that doesn't inference with 0x1. + */ +- subq %rdx, %r11 +- jbe L(zero) +-# endif +- addq %rsi, %rdx +- movq %rdx, %rsi +- andl $(PAGE_SIZE - 1), %esi +- /* Number of bytes before page crossing. */ +- subq %rsi, %rcx +- /* Number of VEC_SIZE * 4 blocks before page crossing. */ +- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx +- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ +- movl %ecx, %esi +- jmp L(loop_start) ++ movl $2, %r8d + ++# else ++ xorl %r8d, %r8d ++# endif ++ ++ /* The prepare labels are various entry points from the page ++ cross logic. */ ++L(prepare_loop): ++ ++# ifdef USE_AS_STRNCMP ++ /* Store N + (VEC_SIZE * 4) and place check at the begining of ++ the loop. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx ++# endif ++L(prepare_loop_no_len): ++ ++ /* Align s1 and adjust s2 accordingly. */ ++ subq %rdi, %rsi ++ andq $-(VEC_SIZE * 4), %rdi ++ addq %rdi, %rsi ++ ++# ifdef USE_AS_STRNCMP ++ subq %rdi, %rdx ++# endif ++ ++L(prepare_loop_aligned): ++ /* eax stores distance from rsi to next page cross. These cases ++ need to be handled specially as the 4x loop could potentially ++ read memory past the length of s1 or s2 and across a page ++ boundary. */ ++ movl $-(VEC_SIZE * 4), %eax ++ subl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ ++ /* Loop 4x comparisons at a time. */ + .p2align 4 + L(loop): ++ ++ /* End condition for strncmp. */ + # ifdef USE_AS_STRNCMP +- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease +- the maximum offset (%r11) by the same amount. 
*/ +- subq $(VEC_SIZE * 4), %r11 +- jbe L(zero) +-# endif +- addq $(VEC_SIZE * 4), %rax +- addq $(VEC_SIZE * 4), %rdx +-L(loop_start): +- testl %esi, %esi +- leal -1(%esi), %esi +- je L(loop_cross_page) +-L(back_to_loop): +- /* Main loop, comparing 4 vectors are a time. */ +- vmovdqa (%rax), %ymm0 +- vmovdqa VEC_SIZE(%rax), %ymm3 +- VPCMPEQ (%rdx), %ymm0, %ymm4 +- VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 +- VPMINU %ymm0, %ymm4, %ymm4 +- VPMINU %ymm3, %ymm1, %ymm1 +- vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 +- VPMINU %ymm1, %ymm4, %ymm0 +- vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 +- VPMINU %ymm2, %ymm5, %ymm5 +- VPMINU %ymm3, %ymm6, %ymm6 +- VPMINU %ymm5, %ymm0, %ymm0 +- VPMINU %ymm6, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- +- /* Test each mask (32 bits) individually because for VEC_SIZE +- == 32 is not possible to OR the four masks and keep all bits +- in a 64-bit integer register, differing from SSE2 strcmp +- where ORing is possible. */ +- vpmovmskb %ymm0, %ecx ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ ++ /* Check if rsi loads will cross a page boundary. */ ++ addl $-(VEC_SIZE * 4), %eax ++ jnb L(page_cross_during_loop) ++ ++ /* Loop entry after handling page cross during loop. */ ++L(loop_skip_page_cross_check): ++ VMOVA (VEC_SIZE * 0)(%rdi), %ymm0 ++ VMOVA (VEC_SIZE * 1)(%rdi), %ymm2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 ++ ++ /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ ++ VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1 ++ ++ VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ ++ ++ /* If any mismatches or null CHAR then 0 CHAR, otherwise non- ++ zero. */ ++ vpand %ymm0, %ymm1, %ymm1 ++ ++ ++ vpand %ymm2, %ymm3, %ymm3 ++ vpand %ymm4, %ymm5, %ymm5 ++ vpand %ymm6, %ymm7, %ymm7 ++ ++ VPMINU %ymm1, %ymm3, %ymm3 ++ VPMINU %ymm5, %ymm7, %ymm7 ++ ++ /* Reduce all 0 CHARs for the 4x VEC into ymm7. */ ++ VPMINU %ymm3, %ymm7, %ymm7 ++ ++ /* If any 0 CHAR then done. */ ++ VPCMPEQ %ymm7, %ymmZERO, %ymm7 ++ vpmovmskb %ymm7, %LOOP_REG ++ testl %LOOP_REG, %LOOP_REG ++ jz L(loop) ++ ++ /* Find which VEC has the mismatch of end of string. */ ++ VPCMPEQ %ymm1, %ymmZERO, %ymm1 ++ vpmovmskb %ymm1, %ecx + testl %ecx, %ecx +- je L(loop) +- VPCMPEQ %ymm7, %ymm4, %ymm0 +- vpmovmskb %ymm0, %edi +- testl %edi, %edi +- je L(test_vec) +- tzcntl %edi, %ecx ++ jnz L(return_vec_0_end) ++ ++ ++ VPCMPEQ %ymm3, %ymmZERO, %ymm3 ++ vpmovmskb %ymm3, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_1_end) ++ ++L(return_vec_2_3_end): + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ subq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++ VPCMPEQ %ymm5, %ymmZERO, %ymm5 ++ vpmovmskb %ymm5, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_2_end) ++ ++ /* LOOP_REG contains matches for null/mismatch from the loop. If ++ VEC 0,1,and 2 all have no null and no mismatches then mismatch ++ must entirely be from VEC 3 which is fully represented by ++ LOOP_REG. 
*/ ++ tzcntl %LOOP_REG, %LOOP_REG ++ ++# ifdef USE_AS_STRNCMP ++ subl $-(VEC_SIZE), %LOOP_REG ++ cmpq %LOOP_REG64, %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ je L(ret5) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax ++ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret5): + VZEROUPPER_RETURN + +- .p2align 4 +-L(test_vec): + # ifdef USE_AS_STRNCMP +- /* The first vector matched. Return 0 if the maximum offset +- (%r11) <= VEC_SIZE. */ +- cmpq $VEC_SIZE, %r11 +- jbe L(zero) ++ .p2align 4,, 2 ++L(ret_zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + # endif +- VPCMPEQ %ymm7, %ymm1, %ymm1 +- vpmovmskb %ymm1, %ecx +- testl %ecx, %ecx +- je L(test_2_vec) +- tzcntl %ecx, %edi ++ ++ ++ /* The L(return_vec_N_end) differ from L(return_vec_N) in that ++ they use the value of `r8` to negate the return value. This is ++ because the page cross logic can swap `rdi` and `rsi`. */ ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- addq $VEC_SIZE, %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++L(return_vec_1_end): ++ salq $32, %rcx ++# endif ++L(return_vec_0_end): ++# ifndef USE_AS_STRNCMP ++ tzcntl %ecx, %ecx ++# else ++ tzcntq %rcx, %rcx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ cmpl (%rsi, %rcx), %edx ++ je L(ret6) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax ++# endif ++L(ret6): ++ VZEROUPPER_RETURN ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_1_end): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_SIZE(%rdi, %rcx), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rsi, %rdi), %ecx +- cmpl VEC_SIZE(%rdx, %rdi), %ecx +- jne L(wcscmp_return) ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ je L(ret7) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +- movzbl VEC_SIZE(%rax, %rdi), %eax +- movzbl VEC_SIZE(%rdx, %rdi), %edx +- subl %edx, %eax ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +-# endif ++L(ret7): + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(test_2_vec): ++ .p2align 4,, 10 ++L(return_vec_2_end): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +- /* The first 2 vectors matched. Return 0 if the maximum offset +- (%r11) <= 2 * VEC_SIZE. 
*/ +- cmpq $(VEC_SIZE * 2), %r11 +- jbe L(zero) ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_page_cross) + # endif +- VPCMPEQ %ymm7, %ymm5, %ymm5 +- vpmovmskb %ymm5, %ecx +- testl %ecx, %ecx +- je L(test_3_vec) +- tzcntl %ecx, %edi +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ je L(ret11) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx +- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret11): + VZEROUPPER_RETURN + +- .p2align 4 +-L(test_3_vec): ++ ++ /* Page cross in rsi in next 4x VEC. */ ++ ++ /* TODO: Improve logic here. */ ++ .p2align 4,, 10 ++L(page_cross_during_loop): ++ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ ++ ++ /* Optimistically rsi and rdi and both aligned inwhich case we ++ don't need any logic here. */ ++ cmpl $-(VEC_SIZE * 4), %eax ++ /* Don't adjust eax before jumping back to loop and we will ++ never hit page cross case again. */ ++ je L(loop_skip_page_cross_check) ++ ++ /* Check if we can safely load a VEC. */ ++ cmpl $-(VEC_SIZE * 3), %eax ++ jle L(less_1x_vec_till_page_cross) ++ ++ VMOVA (%rdi), %ymm0 ++ VPCMPEQ (%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_0_end) ++ ++ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ ++ cmpl $-(VEC_SIZE * 2), %eax ++ jg L(more_2x_vec_till_page_cross) ++ ++ .p2align 4,, 4 ++L(less_1x_vec_till_page_cross): ++ subl $-(VEC_SIZE * 4), %eax ++ /* Guranteed safe to read from rdi - VEC_SIZE here. The only ++ concerning case is first iteration if incoming s1 was near start ++ of a page and s2 near end. If s1 was near the start of the page ++ we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe ++ to read back -VEC_SIZE. If rdi is truly at the start of a page ++ here, it means the previous page (rdi - VEC_SIZE) has already ++ been loaded earlier so must be valid. */ ++ VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 ++ VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ ++ /* Mask of potentially valid bits. The lower bits can be out of ++ range comparisons (but safe regarding page crosses). */ ++ movl $-1, %r10d ++ shlxl %esi, %r10d, %r10d ++ notl %ecx ++ + # ifdef USE_AS_STRNCMP +- /* The first 3 vectors matched. Return 0 if the maximum offset +- (%r11) <= 3 * VEC_SIZE. 
*/ +- cmpq $(VEC_SIZE * 3), %r11 +- jbe L(zero) +-# endif +- VPCMPEQ %ymm7, %ymm6, %ymm6 +- vpmovmskb %ymm6, %esi +- tzcntl %esi, %ecx ++ cmpq %rax, %rdx ++ jbe L(return_page_cross_end_check) ++# endif ++ movl %eax, %OFFSET_REG ++ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ ++ andl %r10d, %ecx ++ jz L(loop_skip_page_cross_check) ++ ++ .p2align 4,, 3 ++L(return_page_cross_end): ++ tzcntl %ecx, %ecx ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 3), %rcx +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %esi +- cmpl (%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx ++L(return_page_cross_cmp_mem): + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ addl %OFFSET_REG, %ecx ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi +- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret8) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret8): + VZEROUPPER_RETURN + +- .p2align 4 +-L(loop_cross_page): +- xorl %r10d, %r10d +- movq %rdx, %rcx +- /* Align load via RDX. We load the extra ECX bytes which should +- be ignored. */ +- andl $((VEC_SIZE * 4) - 1), %ecx +- /* R10 is -RCX. */ +- subq %rcx, %r10 +- +- /* This works only if VEC_SIZE * 2 == 64. */ +-# if (VEC_SIZE * 2) != 64 +-# error (VEC_SIZE * 2) != 64 +-# endif +- +- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ +- cmpl $(VEC_SIZE * 2), %ecx +- jge L(loop_cross_page_2_vec) +- +- vmovdqu (%rax, %r10), %ymm2 +- vmovdqu VEC_SIZE(%rax, %r10), %ymm3 +- VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 +- VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 +- VPMINU %ymm2, %ymm0, %ymm0 +- VPMINU %ymm3, %ymm1, %ymm1 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm1, %ymm1 +- +- vpmovmskb %ymm0, %edi +- vpmovmskb %ymm1, %esi +- +- salq $32, %rsi +- xorq %rsi, %rdi +- +- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ +- shrq %cl, %rdi +- +- testq %rdi, %rdi +- je L(loop_cross_page_2_vec) +- tzcntq %rdi, %rcx + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ .p2align 4,, 10 ++L(return_page_cross_end_check): ++ tzcntl %ecx, %ecx ++ leal -VEC_SIZE(%rax, %rcx), %ecx ++ cmpl %ecx, %edx ++ ja L(return_page_cross_cmp_mem) + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# endif + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(loop_cross_page_2_vec): +- /* The first VEC_SIZE * 2 bytes match or are ignored. 
*/ +- vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 +- vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 +- VPMINU %ymm2, %ymm5, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 +- VPCMPEQ %ymm7, %ymm5, %ymm5 +- VPMINU %ymm3, %ymm6, %ymm6 +- VPCMPEQ %ymm7, %ymm6, %ymm6 +- +- vpmovmskb %ymm5, %edi +- vpmovmskb %ymm6, %esi +- +- salq $32, %rsi +- xorq %rsi, %rdi + +- xorl %r8d, %r8d +- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +- subl $(VEC_SIZE * 2), %ecx +- jle 1f +- /* Skip ECX bytes. */ +- shrq %cl, %rdi +- /* R8 has number of bytes skipped. */ +- movl %ecx, %r8d +-1: +- /* Before jumping back to the loop, set ESI to the number of +- VEC_SIZE * 4 blocks before page crossing. */ +- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi +- +- testq %rdi, %rdi ++ .p2align 4,, 10 ++L(more_2x_vec_till_page_cross): ++ /* If more 2x vec till cross we will complete a full loop ++ iteration here. */ ++ ++ VMOVU VEC_SIZE(%rdi), %ymm0 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_1_end) ++ + # ifdef USE_AS_STRNCMP +- /* At this point, if %rdi value is 0, it already tested +- VEC_SIZE*4+%r10 byte starting from %rax. This label +- checks whether strncmp maximum offset reached or not. */ +- je L(string_nbyte_offset_check) +-# else +- je L(back_to_loop) ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero_in_loop_page_cross) + # endif +- tzcntq %rdi, %rcx +- addq %r10, %rcx +- /* Adjust for number of bytes skipped. */ +- addq %r8, %rcx ++ ++ subl $-(VEC_SIZE * 4), %eax ++ ++ /* Safe to include comparisons from lower bytes. */ ++ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 ++ VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_page_cross_0) ++ ++ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 ++ VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_page_cross_1) ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rcx +- subq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ /* Must check length here as length might proclude reading next ++ page. */ ++ cmpq %rax, %rdx ++ jbe L(ret_zero_in_loop_page_cross) ++# endif ++ ++ /* Finish the loop. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ vpand %ymm4, %ymm5, %ymm5 ++ vpand %ymm6, %ymm7, %ymm7 ++ VPMINU %ymm5, %ymm7, %ymm7 ++ VPCMPEQ %ymm7, %ymmZERO, %ymm7 ++ vpmovmskb %ymm7, %LOOP_REG ++ testl %LOOP_REG, %LOOP_REG ++ jnz L(return_vec_2_3_end) ++ ++ /* Best for code size to include ucond-jmp here. Would be faster ++ if this case is hot to duplicate the L(return_vec_2_3_end) code ++ as fall-through and have jump back to loop on mismatch ++ comparison. 
*/ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax ++# ifdef USE_AS_STRNCMP ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_skip_page_cross_check) ++L(ret_zero_in_loop_page_cross): + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ VZEROUPPER_RETURN + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi +- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ jmp L(loop_skip_page_cross_check) + # endif +- VZEROUPPER_RETURN + ++ ++ .p2align 4,, 10 ++L(return_vec_page_cross_0): ++ addl $-VEC_SIZE, %eax ++L(return_vec_page_cross_1): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +-L(string_nbyte_offset_check): +- leaq (VEC_SIZE * 4)(%r10), %r10 +- cmpq %r10, %r11 +- jbe L(zero) +- jmp L(back_to_loop) ++ leal -VEC_SIZE(%rax, %rcx), %ecx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_in_loop_page_cross) ++# else ++ addl %eax, %ecx + # endif + +- .p2align 4 +-L(cross_page_loop): +- /* Check one byte/dword at a time. */ + # ifdef USE_AS_WCSCMP +- cmpl %ecx, %eax ++ movl VEC_OFFSET(%rdi, %rcx), %edx ++ xorl %eax, %eax ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret9) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx + subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +- jne L(different) +- addl $SIZE_OF_CHAR, %edx +- cmpl $(VEC_SIZE * 4), %edx +- je L(main_loop_header) +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++L(ret9): ++ VZEROUPPER_RETURN ++ ++ ++ .p2align 4,, 10 ++L(page_cross): ++# ifndef USE_AS_STRNCMP ++ /* If both are VEC aligned we don't need any special logic here. ++ Only valid for strcmp where stop condition is guranteed to be ++ reachable by just reading memory. */ ++ testl $((VEC_SIZE - 1) << 20), %eax ++ jz L(no_page_cross) + # endif ++ ++ movl %edi, %eax ++ movl %esi, %ecx ++ andl $(PAGE_SIZE - 1), %eax ++ andl $(PAGE_SIZE - 1), %ecx ++ ++ xorl %OFFSET_REG, %OFFSET_REG ++ ++ /* Check which is closer to page cross, s1 or s2. */ ++ cmpl %eax, %ecx ++ jg L(page_cross_s2) ++ ++ /* The previous page cross check has false positives. Check for ++ true positive as page cross logic is very expensive. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ jbe L(no_page_cross) ++ ++ /* Set r8 to not interfere with normal return value (rdi and rsi ++ did not swap). */ + # ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx ++ xorl %r8d, %r8d + # endif +- /* Check null char. */ +- testl %eax, %eax +- jne L(cross_page_loop) +- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +- comparisons. */ +- subl %ecx, %eax +-# ifndef USE_AS_WCSCMP +-L(different): ++ ++ /* Check if less than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jg L(less_1x_vec_till_page) ++ ++ /* If more than 1x VEC till page cross, loop throuh safely ++ loadable memory until within 1x VEC of page cross. 
*/ ++ ++ .p2align 4,, 10 ++L(page_cross_loop): ++ ++ VMOVU (%rdi, %OFFSET_REG64), %ymm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ ++ jnz L(check_ret_vec_page_cross) ++ addl $VEC_SIZE, %OFFSET_REG ++# ifdef USE_AS_STRNCMP ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross) + # endif +- VZEROUPPER_RETURN ++ addl $VEC_SIZE, %eax ++ jl L(page_cross_loop) ++ ++ subl %eax, %OFFSET_REG ++ /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed ++ to not cross page so is safe to load. Since we have already ++ loaded at least 1 VEC from rsi it is also guranteed to be safe. ++ */ ++ ++ VMOVU (%rdi, %OFFSET_REG64), %ymm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ ++# ifdef USE_AS_STRNCMP ++ leal VEC_SIZE(%OFFSET_REG64), %eax ++ cmpq %rax, %rdx ++ jbe L(check_ret_vec_page_cross2) ++ addq %rdi, %rdx ++# endif ++ incl %ecx ++ jz L(prepare_loop_no_len) + ++ .p2align 4,, 4 ++L(ret_vec_page_cross): ++# ifndef USE_AS_STRNCMP ++L(check_ret_vec_page_cross): ++# endif ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++L(ret_vec_page_cross_cont): + # ifdef USE_AS_WCSCMP +- .p2align 4 +-L(different): +- /* Use movl to avoid modifying EFLAGS. */ +- movl $0, %eax ++ movl (%rdi, %rcx), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx), %edx ++ je L(ret12) + setl %al + negl %eax +- orl $1, %eax +- VZEROUPPER_RETURN ++ xorl %r8d, %eax ++# else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret12): ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNCMP +- .p2align 4 +-L(zero): ++ .p2align 4,, 10 ++L(check_ret_vec_page_cross2): ++ incl %ecx ++L(check_ret_vec_page_cross): ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++ cmpq %rcx, %rdx ++ ja L(ret_vec_page_cross_cont) ++ .p2align 4,, 2 ++L(ret_zero_page_cross): + xorl %eax, %eax + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(char0): +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +-# endif +- VZEROUPPER_RETURN ++ .p2align 4,, 4 ++L(page_cross_s2): ++ /* Ensure this is a true page cross. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx ++ jbe L(no_page_cross) ++ ++ ++ movl %ecx, %eax ++ movq %rdi, %rcx ++ movq %rsi, %rdi ++ movq %rcx, %rsi ++ ++ /* set r8 to negate return value as rdi and rsi swapped. */ ++# ifdef USE_AS_WCSCMP ++ movl $-4, %r8d ++# else ++ movl $-1, %r8d + # endif ++ xorl %OFFSET_REG, %OFFSET_REG + +- .p2align 4 +-L(last_vector): +- addq %rdx, %rdi +- addq %rdx, %rsi ++ /* Check if more than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jle L(page_cross_loop) ++ ++ .p2align 4,, 6 ++L(less_1x_vec_till_page): ++ /* Find largest load size we can use. */ ++ cmpl $16, %eax ++ ja L(less_16_till_page) ++ ++ VMOVU (%rdi), %xmm0 ++ VPCMPEQ (%rsi), %xmm0, %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incw %cx ++ jnz L(check_ret_vec_page_cross) ++ movl $16, %OFFSET_REG + # ifdef USE_AS_STRNCMP +- subq %rdx, %r11 ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subl %eax, %OFFSET_REG ++# else ++ /* Explicit check for 16 byte alignment. 
*/ ++ subl %eax, %OFFSET_REG ++ jz L(prepare_loop) + # endif +- tzcntl %ecx, %edx ++ ++ VMOVU (%rdi, %OFFSET_REG64), %xmm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incw %cx ++ jnz L(check_ret_vec_page_cross) ++ + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ addl $16, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(VEC_SIZE * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi + # endif +-# ifdef USE_AS_WCSCMP ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case0): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ ret + # endif +- VZEROUPPER_RETURN + +- /* Comparing on page boundary region requires special treatment: +- It must done one vector at the time, starting with the wider +- ymm vector if possible, if not, with xmm. If fetching 16 bytes +- (xmm) still passes the boundary, byte comparison must be done. +- */ +- .p2align 4 +-L(cross_page): +- /* Try one ymm vector at a time. */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(cross_page_1_vector) +-L(loop_1_vector): +- vmovdqu (%rdi, %rdx), %ymm1 +- VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 +- VPMINU %ymm1, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- jne L(last_vector) + +- addl $VEC_SIZE, %edx ++ .p2align 4,, 10 ++L(less_16_till_page): ++ /* Find largest load size we can use. */ ++ cmpl $24, %eax ++ ja L(less_8_till_page) + +- addl $VEC_SIZE, %eax +-# ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) +-# endif +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jle L(loop_1_vector) +-L(cross_page_1_vector): +- /* Less than 32 bytes to check, try one xmm vector. */ +- cmpl $(PAGE_SIZE - 16), %eax +- jg L(cross_page_1_xmm) +- vmovdqu (%rdi, %rdx), %xmm1 +- VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ vmovq (%rdi), %xmm0 ++ vmovq (%rsi), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incb %cl ++ jnz L(check_ret_vec_page_cross) + +- addl $16, %edx +-# ifndef USE_AS_WCSCMP +- addl $16, %eax ++ ++# ifdef USE_AS_STRNCMP ++ cmpq $8, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) + # endif ++ movl $24, %OFFSET_REG ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ ++ ++ ++ vmovq (%rdi, %OFFSET_REG64), %xmm0 ++ vmovq (%rsi, %OFFSET_REG64), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incb %cl ++ jnz L(check_ret_vec_page_cross) ++ + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) +-# endif +- +-L(cross_page_1_xmm): +-# ifndef USE_AS_WCSCMP +- /* Less than 16 bytes to check, try 8 byte vector. NB: No need +- for wcscmp nor wcsncmp since wide char is 4 bytes. 
*/ +- cmpl $(PAGE_SIZE - 8), %eax +- jg L(cross_page_8bytes) +- vmovq (%rdi, %rdx), %xmm1 +- vmovq (%rsi, %rdx), %xmm0 +- VPCMPEQ %xmm0, %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- /* Only last 8 bits are valid. */ +- andl $0xff, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ addl $8, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(VEC_SIZE * 4), %rdx + +- addl $8, %edx +- addl $8, %eax ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++ ++ .p2align 4,, 10 ++L(less_8_till_page): ++# ifdef USE_AS_WCSCMP ++ /* If using wchar then this is the only check before we reach ++ the page boundary. */ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ cmpl %ecx, %eax ++ jnz L(ret_less_8_wcs) + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ addq %rdi, %rdx ++ /* We already checked for len <= 1 so cannot hit that case here. ++ */ + # endif ++ testl %eax, %eax ++ jnz L(prepare_loop_no_len) ++ ret + +-L(cross_page_8bytes): +- /* Less than 8 bytes to check, try 4 byte vector. */ +- cmpl $(PAGE_SIZE - 4), %eax +- jg L(cross_page_4bytes) +- vmovd (%rdi, %rdx), %xmm1 +- vmovd (%rsi, %rdx), %xmm0 +- VPCMPEQ %xmm0, %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- /* Only last 4 bits are valid. */ +- andl $0xf, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ .p2align 4,, 8 ++L(ret_less_8_wcs): ++ setl %OFFSET_REG8 ++ negl %OFFSET_REG ++ movl %OFFSET_REG, %eax ++ xorl %r8d, %eax ++ ret ++ ++# else ++ ++ /* Find largest load size we can use. */ ++ cmpl $28, %eax ++ ja L(less_4_till_page) ++ ++ vmovd (%rdi), %xmm0 ++ vmovd (%rsi), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) + +- addl $4, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $4, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) + # endif ++ movl $28, %OFFSET_REG ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG + +-L(cross_page_4bytes): +-# endif +- /* Less than 4 bytes to check, try one byte/dword at a time. 
*/ +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) +-# endif +-# ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx +-# endif +- testl %eax, %eax +- jne L(cross_page_loop) ++ ++ ++ vmovd (%rdi, %OFFSET_REG64), %xmm0 ++ vmovd (%rsi, %OFFSET_REG64), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) ++ ++# ifdef USE_AS_STRNCMP ++ addl $4, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) ++ subq $-(VEC_SIZE * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case1): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4,, 10 ++L(less_4_till_page): ++ subq %rdi, %rsi ++ /* Extremely slow byte comparison loop. */ ++L(less_4_loop): ++ movzbl (%rdi), %eax ++ movzbl (%rsi, %rdi), %ecx + subl %ecx, %eax +- VZEROUPPER_RETURN +-END (STRCMP) ++ jnz L(ret_less_4_loop) ++ testl %ecx, %ecx ++ jz L(ret_zero_4_loop) ++# ifdef USE_AS_STRNCMP ++ decq %rdx ++ jz L(ret_zero_4_loop) ++# endif ++ incq %rdi ++ /* end condition is reach page boundary (rdi is aligned). */ ++ testl $31, %edi ++ jnz L(less_4_loop) ++ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi ++ addq $-(VEC_SIZE * 4), %rdi ++# ifdef USE_AS_STRNCMP ++ subq $-(VEC_SIZE * 4), %rdx ++# endif ++ jmp L(prepare_loop_aligned) ++ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++L(ret_less_4_loop): ++ xorl %r8d, %eax ++ subl %r8d, %eax ++ ret ++# endif ++END(STRCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-75.patch b/glibc-RHEL-15696-75.patch new file mode 100644 index 0000000..4bd0cd4 --- /dev/null +++ b/glibc-RHEL-15696-75.patch @@ -0,0 +1,1992 @@ +From 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 10 Jan 2022 15:35:39 -0600 +Subject: [PATCH] x86: Optimize strcmp-evex.S +Content-type: text/plain; charset=UTF-8 + +Optimization are primarily to the loop logic and how the page cross +logic interacts with the loop. + +The page cross logic is at times more expensive for short strings near +the end of a page but not crossing the page. This is done to retest +the page cross conditions with a non-faulty check and to improve the +logic for entering the loop afterwards. This is only particular cases, +however, and is general made up for by more than 10x improvements on +the transition from the page cross -> loop case. + +The non-page cross cases as well are nearly universally improved. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 1712 +++++++++++++----------- + 1 file changed, 919 insertions(+), 793 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 6f5c4bf9..99d8409a 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -26,54 +26,69 @@ + + # define PAGE_SIZE 4096 + +-/* VEC_SIZE = Number of bytes in a ymm register */ ++ /* VEC_SIZE = Number of bytes in a ymm register. 
*/ + # define VEC_SIZE 32 ++# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR) + +-/* Shift for dividing by (VEC_SIZE * 4). */ +-# define DIVIDE_BY_VEC_4_SHIFT 7 +-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# endif +- +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSCMP +-/* Compare packed dwords. */ +-# define VPCMP vpcmpd ++# define TESTEQ subl $0xff, ++ /* Compare packed dwords. */ ++# define VPCMP vpcmpd + # define VPMINU vpminud + # define VPTESTM vptestmd +-# define SHIFT_REG32 r8d +-# define SHIFT_REG64 r8 +-/* 1 dword char == 4 bytes. */ ++ /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else +-/* Compare packed bytes. */ +-# define VPCMP vpcmpb ++# define TESTEQ incl ++ /* Compare packed bytes. */ ++# define VPCMP vpcmpb + # define VPMINU vpminub + # define VPTESTM vptestmb +-# define SHIFT_REG32 ecx +-# define SHIFT_REG64 rcx +-/* 1 byte char == 1 byte. */ ++ /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif + ++# ifdef USE_AS_STRNCMP ++# define LOOP_REG r9d ++# define LOOP_REG64 r9 ++ ++# define OFFSET_REG8 r9b ++# define OFFSET_REG r9d ++# define OFFSET_REG64 r9 ++# else ++# define LOOP_REG edx ++# define LOOP_REG64 rdx ++ ++# define OFFSET_REG8 dl ++# define OFFSET_REG edx ++# define OFFSET_REG64 rdx ++# endif ++ ++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP ++# define VEC_OFFSET 0 ++# else ++# define VEC_OFFSET (-VEC_SIZE) ++# endif ++ + # define XMMZERO xmm16 +-# define XMM0 xmm17 +-# define XMM1 xmm18 ++# define XMM0 xmm17 ++# define XMM1 xmm18 + + # define YMMZERO ymm16 +-# define YMM0 ymm17 +-# define YMM1 ymm18 +-# define YMM2 ymm19 +-# define YMM3 ymm20 +-# define YMM4 ymm21 +-# define YMM5 ymm22 +-# define YMM6 ymm23 +-# define YMM7 ymm24 +-# define YMM8 ymm25 +-# define YMM9 ymm26 +-# define YMM10 ymm27 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++# define YMM2 ymm19 ++# define YMM3 ymm20 ++# define YMM4 ymm21 ++# define YMM5 ymm22 ++# define YMM6 ymm23 ++# define YMM7 ymm24 ++# define YMM8 ymm25 ++# define YMM9 ymm26 ++# define YMM10 ymm27 + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -96,985 +111,1096 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section .text.evex,"ax",@progbits +-ENTRY (STRCMP) ++ .section .text.evex, "ax", @progbits ++ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP +- /* Check for simple cases (0 or 1) in offset. */ +- cmp $1, %RDX_LP +- je L(char0) +- jb L(zero) +-# ifdef USE_AS_WCSCMP +-# ifndef __ILP32__ +- movq %rdx, %rcx +- /* Check if length could overflow when multiplied by +- sizeof(wchar_t). Checking top 8 bits will cover all potential +- overflow cases as well as redirect cases where its impossible to +- length to bound a valid memory region. In these cases just use +- 'wcscmp'. */ +- shrq $56, %rcx +- jnz __wcscmp_evex +-# endif +- /* Convert units: from wide to byte char. */ +- shl $2, %RDX_LP ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %rdx + # endif +- /* Register %r11 tracks the maximum offset. */ +- mov %RDX_LP, %R11_LP ++ cmp $1, %RDX_LP ++ /* Signed comparison intentional. We use this branch to also ++ test cases where length >= 2^63. These very large sizes can be ++ handled with strcmp as there is no way for that length to ++ actually bound the buffer. 
*/ ++ jle L(one_or_less) + # endif + movl %edi, %eax +- xorl %edx, %edx +- /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO + orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +- jg L(cross_page) +- /* Start comparing 4 vectors. */ ++ /* Shift out the bits irrelivant to page boundary ([63:12]). */ ++ sall $20, %eax ++ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ ++ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax ++ ja L(page_cross) ++ ++L(no_page_cross): ++ /* Safe to compare 4x vectors. */ + VMOVU (%rdi), %YMM0 +- +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi). */ + VPCMP $0, (%rsi), %YMM0, %k1{%k2} +- + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(next_3_vectors) +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx) is after the maximum +- offset (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $CHAR_PER_VEC, %rdx ++ jbe L(vec_0_test_len) + # endif ++ ++ /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for ++ wcscmp/wcsncmp. */ ++ ++ /* All 1s represents all equals. TESTEQ will overflow to zero in ++ all equals case. Otherwise 1s will carry until position of first ++ mismatch. */ ++ TESTEQ %ecx ++ jz L(more_3x_vec) ++ ++ .p2align 4,, 4 ++L(return_vec_0): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- je L(return) +-L(wcscmp_return): ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret0) + setl %al + negl %eax + orl $1, %eax +-L(return): + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret0): + ret + +-L(return_vec_size): +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after +- the maximum offset (%r11). */ +- addq $VEC_SIZE, %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ .p2align 4,, 4 ++L(vec_0_test_len): ++ notl %ecx ++ bzhil %edx, %ecx, %eax ++ jnz L(return_vec_0) ++ /* Align if will cross fetch block. */ ++ .p2align 4,, 2 ++L(ret_zero): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif +-# else ++ ret ++ ++ .p2align 4,, 5 ++L(one_or_less): ++ jb L(ret_zero) + # ifdef USE_AS_WCSCMP ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __wcscmp_evex ++ movl (%rdi), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rdx), %ecx +- cmpl VEC_SIZE(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (%rsi), %edx ++ je L(ret1) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl VEC_SIZE(%rdi, %rdx), %eax +- movzbl VEC_SIZE(%rsi, %rdx), %edx +- subl %edx, %eax ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). 
*/ ++ jnbe __strcmp_evex ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret1): + ret ++# endif + +-L(return_2_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 ++L(return_vec_1): ++ tzcntl %ecx, %ecx ++# ifdef USE_AS_STRNCMP ++ /* rdx must be > CHAR_PER_VEC so its safe to subtract without ++ worrying about underflow. */ ++ addq $-CHAR_PER_VEC, %rdx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx ++ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret2) ++ setl %al ++ negl %eax ++ orl $1, %eax ++# else ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret2): ++ ret ++ ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 2), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++L(return_vec_3): ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %ecx + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ salq $CHAR_PER_VEC, %rcx + # endif ++# endif ++L(return_vec_2): ++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) ++ tzcntl %ecx, %ecx + # else +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ tzcntq %rcx, %rcx + # endif +- ret + +-L(return_3_vec_size): +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is +- after the maximum offset (%r11). 
*/ +- addq $(VEC_SIZE * 3), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret3) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++L(ret3): ++ ret ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret4) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret4): + ret ++# endif + +- .p2align 4 +-L(next_3_vectors): +- VMOVU VEC_SIZE(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ /* 32 byte align here ensures the main loop is ideally aligned ++ for DSB. */ ++ .p2align 5 ++L(more_3x_vec): ++ /* Safe to compare 4x vectors. */ ++ VMOVU (VEC_SIZE)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at VEC_SIZE(%rsi). */ +- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_1) ++ ++# ifdef USE_AS_STRNCMP ++ subq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero) + # endif +- jne L(return_vec_size) + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ + VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- jne L(return_2_vec_size) ++ TESTEQ %ecx ++ jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ + VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_3) ++ ++# ifdef USE_AS_STRNCMP ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d ++ + # else +- incl %ecx ++ xorl %r8d, %r8d + # endif +- jne L(return_3_vec_size) +-L(main_loop_header): +- leaq (VEC_SIZE * 4)(%rdi), %rdx +- movl $PAGE_SIZE, %ecx +- /* Align load via RAX. */ +- andq $-(VEC_SIZE * 4), %rdx +- subq %rdi, %rdx +- leaq (%rdi, %rdx), %rax ++ ++ /* The prepare labels are various entry points from the page ++ cross logic. 
*/ ++L(prepare_loop): ++ + # ifdef USE_AS_STRNCMP +- /* Starting from this point, the maximum offset, or simply the +- 'offset', DECREASES by the same amount when base pointers are +- moved forward. Return 0 when: +- 1) On match: offset <= the matched vector index. +- 2) On mistmach, offset is before the mistmatched index. +- */ +- subq %rdx, %r11 +- jbe L(zero) ++# ifdef USE_AS_WCSCMP ++L(prepare_loop_no_len): ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ shrl $2, %ecx ++ leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx ++# else ++ /* Store N + (VEC_SIZE * 4) and place check at the begining of ++ the loop. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx ++L(prepare_loop_no_len): ++# endif ++# else ++L(prepare_loop_no_len): + # endif +- addq %rsi, %rdx +- movq %rdx, %rsi +- andl $(PAGE_SIZE - 1), %esi +- /* Number of bytes before page crossing. */ +- subq %rsi, %rcx +- /* Number of VEC_SIZE * 4 blocks before page crossing. */ +- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx +- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ +- movl %ecx, %esi +- jmp L(loop_start) + ++ /* Align s1 and adjust s2 accordingly. */ ++ subq %rdi, %rsi ++ andq $-(VEC_SIZE * 4), %rdi ++L(prepare_loop_readj): ++ addq %rdi, %rsi ++# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP) ++ subq %rdi, %rdx ++# endif ++ ++L(prepare_loop_aligned): ++ /* eax stores distance from rsi to next page cross. These cases ++ need to be handled specially as the 4x loop could potentially ++ read memory past the length of s1 or s2 and across a page ++ boundary. */ ++ movl $-(VEC_SIZE * 4), %eax ++ subl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ ++ vpxorq %YMMZERO, %YMMZERO, %YMMZERO ++ ++ /* Loop 4x comparisons at a time. */ + .p2align 4 + L(loop): ++ ++ /* End condition for strncmp. */ + # ifdef USE_AS_STRNCMP +- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease +- the maximum offset (%r11) by the same amount. */ +- subq $(VEC_SIZE * 4), %r11 +- jbe L(zero) ++ subq $(CHAR_PER_VEC * 4), %rdx ++ jbe L(ret_zero) + # endif +- addq $(VEC_SIZE * 4), %rax +- addq $(VEC_SIZE * 4), %rdx +-L(loop_start): +- testl %esi, %esi +- leal -1(%esi), %esi +- je L(loop_cross_page) +-L(back_to_loop): +- /* Main loop, comparing 4 vectors are a time. */ +- VMOVA (%rax), %YMM0 +- VMOVA VEC_SIZE(%rax), %YMM2 +- VMOVA (VEC_SIZE * 2)(%rax), %YMM4 +- VMOVA (VEC_SIZE * 3)(%rax), %YMM6 ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ ++ /* Check if rsi loads will cross a page boundary. */ ++ addl $-(VEC_SIZE * 4), %eax ++ jnb L(page_cross_during_loop) ++ ++ /* Loop entry after handling page cross during loop. */ ++L(loop_skip_page_cross_check): ++ VMOVA (VEC_SIZE * 0)(%rdi), %YMM0 ++ VMOVA (VEC_SIZE * 1)(%rdi), %YMM2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + + VPMINU %YMM0, %YMM2, %YMM8 + VPMINU %YMM4, %YMM6, %YMM9 + +- /* A zero CHAR in YMM8 means that there is a null CHAR. */ +- VPMINU %YMM8, %YMM9, %YMM8 ++ /* A zero CHAR in YMM9 means that there is a null CHAR. */ ++ VPMINU %YMM8, %YMM9, %YMM9 + + /* Each bit set in K1 represents a non-null CHAR in YMM8. */ +- VPTESTM %YMM8, %YMM8, %k1 ++ VPTESTM %YMM9, %YMM9, %k1 + +- /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. 
*/ +- vpxorq (%rdx), %YMM0, %YMM1 +- vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3 +- vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5 +- vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7 ++ vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 ++ vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 ++ /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while ++ oring with YMM1. Result is stored in YMM6. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 + +- vporq %YMM1, %YMM3, %YMM9 +- vporq %YMM5, %YMM7, %YMM10 ++ /* Or together YMM3, YMM5, and YMM6. */ ++ vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 + +- /* A non-zero CHAR in YMM9 represents a mismatch. */ +- vporq %YMM9, %YMM10, %YMM9 + +- /* Each bit cleared in K0 represents a mismatch or a null CHAR. */ +- VPCMP $0, %YMMZERO, %YMM9, %k0{%k1} +- kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(loop) ++ /* A non-zero CHAR in YMM6 represents a mismatch. */ ++ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ kmovd %k0, %LOOP_REG + +- /* Each bit set in K1 represents a non-null CHAR in YMM0. */ ++ TESTEQ %LOOP_REG ++ jz L(loop) ++ ++ ++ /* Find which VEC has the mismatch of end of string. */ + VPTESTM %YMM0, %YMM0, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM0 and (%rdx). */ + VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(test_vec) +- tzcntl %ecx, %ecx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +-# ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# endif +- ret ++ TESTEQ %ecx ++ jnz L(return_vec_0_end) + +- .p2align 4 +-L(test_vec): +-# ifdef USE_AS_STRNCMP +- /* The first vector matched. Return 0 if the maximum offset +- (%r11) <= VEC_SIZE. */ +- cmpq $VEC_SIZE, %r11 +- jbe L(zero) +-# endif +- /* Each bit set in K1 represents a non-null CHAR in YMM2. */ + VPTESTM %YMM2, %YMM2, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM2 and VEC_SIZE(%rdx). */ + VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(test_2_vec) +- tzcntl %ecx, %edi +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %edi +-# endif +-# ifdef USE_AS_STRNCMP +- addq $VEC_SIZE, %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl VEC_SIZE(%rsi, %rdi), %ecx +- cmpl VEC_SIZE(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl VEC_SIZE(%rax, %rdi), %eax +- movzbl VEC_SIZE(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif +-# endif +- ret ++ TESTEQ %ecx ++ jnz L(return_vec_1_end) + +- .p2align 4 +-L(test_2_vec): ++ ++ /* Handle VEC 2 and 3 without branches. */ ++L(return_vec_2_3_end): + # ifdef USE_AS_STRNCMP +- /* The first 2 vectors matched. Return 0 if the maximum offset +- (%r11) <= 2 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 2), %r11 +- jbe L(zero) ++ subq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero_end) + # endif +- /* Each bit set in K1 represents a non-null CHAR in YMM4. */ ++ + VPTESTM %YMM4, %YMM4, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM4 and (VEC_SIZE * 2)(%rdx). */ + VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ TESTEQ %ecx ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %LOOP_REG ++ orl %ecx, %LOOP_REG + # else +- incl %ecx ++ salq $CHAR_PER_VEC, %LOOP_REG64 ++ orq %rcx, %LOOP_REG64 ++# endif ++L(return_vec_3_end): ++ /* LOOP_REG contains matches for null/mismatch from the loop. If ++ VEC 0,1,and 2 all have no null and no mismatches then mismatch ++ must entirely be from VEC 3 which is fully represented by ++ LOOP_REG. */ ++# if CHAR_PER_VEC <= 16 ++ tzcntl %LOOP_REG, %LOOP_REG ++# else ++ tzcntq %LOOP_REG64, %LOOP_REG64 ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %LOOP_REG64, %rdx ++ jbe L(ret_zero_end) + # endif +- je L(test_3_vec) +- tzcntl %ecx, %edi ++ + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edi ++ movl (VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx ++ xorl %eax, %eax ++ cmpl (VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx ++ je L(ret5) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret5): ++ ret ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ .p2align 4,, 2 ++L(ret_zero_end): + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) ++ ret ++# endif ++ ++ ++ /* The L(return_vec_N_end) differ from L(return_vec_N) in that ++ they use the value of `r8` to negate the return value. This is ++ because the page cross logic can swap `rdi` and `rsi`. 
*/ ++ .p2align 4,, 10 ++# ifdef USE_AS_STRNCMP ++L(return_vec_1_end): ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %ecx + # else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax ++ salq $CHAR_PER_VEC, %rcx + # endif ++# endif ++L(return_vec_0_end): ++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) ++ tzcntl %ecx, %ecx + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx +- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ tzcntq %rcx, %rcx + # endif +- ret + +- .p2align 4 +-L(test_3_vec): + # ifdef USE_AS_STRNCMP +- /* The first 3 vectors matched. Return 0 if the maximum offset +- (%r11) <= 3 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 3), %r11 +- jbe L(zero) ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_end) + # endif +- /* Each bit set in K1 represents a non-null CHAR in YMM6. */ +- VPTESTM %YMM6, %YMM6, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM6 and (VEC_SIZE * 3)(%rdx). */ +- VPCMP $0, %YMMZERO, %YMM7, %k0{%k1} +- kmovd %k0, %ecx ++ + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret6) ++ setl %al ++ negl %eax ++ /* This is the non-zero case for `eax` so just xorl with `r8d` ++ flip is `rdi` and `rsi` where swapped. */ ++ xorl %r8d, %eax + # else +- incl %ecx ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ /* Flip `eax` if `rdi` and `rsi` where swapped in page cross ++ logic. Subtract `r8d` after xor for zero case. */ ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret6): ++ ret ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_1_end): + tzcntl %ecx, %ecx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 3), %rcx +- cmpq %rcx, %r11 +- jbe L(zero) + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %esi +- cmpl (%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi +- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi +- jne L(wcscmp_return) ++ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret7) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx +- subl %edx, %eax ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +-# endif ++L(ret7): + ret +- +- .p2align 4 +-L(loop_cross_page): +- xorl %r10d, %r10d +- movq %rdx, %rcx +- /* Align load via RDX. We load the extra ECX bytes which should +- be ignored. */ +- andl $((VEC_SIZE * 4) - 1), %ecx +- /* R10 is -RCX. */ +- subq %rcx, %r10 +- +- /* This works only if VEC_SIZE * 2 == 64. */ +-# if (VEC_SIZE * 2) != 64 +-# error (VEC_SIZE * 2) != 64 + # endif + +- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ +- cmpl $(VEC_SIZE * 2), %ecx +- jge L(loop_cross_page_2_vec) + +- VMOVU (%rax, %r10), %YMM2 +- VMOVU VEC_SIZE(%rax, %r10), %YMM3 ++ /* Page cross in rsi in next 4x VEC. 
*/ + +- /* Each bit set in K2 represents a non-null CHAR in YMM2. */ +- VPTESTM %YMM2, %YMM2, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM2 and 32 bytes at (%rdx, %r10). */ +- VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2} +- kmovd %k1, %r9d +- /* Don't use subl since it is the lower 16/32 bits of RDI +- below. */ +- notl %r9d +-# ifdef USE_AS_WCSCMP +- /* Only last 8 bits are valid. */ +- andl $0xff, %r9d +-# endif ++ /* TODO: Improve logic here. */ ++ .p2align 4,, 10 ++L(page_cross_during_loop): ++ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ + +- /* Each bit set in K4 represents a non-null CHAR in YMM3. */ +- VPTESTM %YMM3, %YMM3, %k4 +- /* Each bit cleared in K3 represents a mismatch or a null CHAR +- in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ +- VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} +- kmovd %k3, %edi +- /* Must use notl %edi here as lower bits are for CHAR +- comparisons potentially out of range thus can be 0 without +- indicating mismatch. */ +- notl %edi +-# ifdef USE_AS_WCSCMP +- /* Don't use subl since it is the upper 8 bits of EDI below. */ +- andl $0xff, %edi +-# endif ++ /* Optimistically rsi and rdi and both aligned in which case we ++ don't need any logic here. */ ++ cmpl $-(VEC_SIZE * 4), %eax ++ /* Don't adjust eax before jumping back to loop and we will ++ never hit page cross case again. */ ++ je L(loop_skip_page_cross_check) + +-# ifdef USE_AS_WCSCMP +- /* NB: Each bit in EDI/R9D represents 4-byte element. */ +- sall $8, %edi +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 +- bytes. */ +- movl %ecx, %SHIFT_REG32 +- sarl $2, %SHIFT_REG32 +- +- /* Each bit in EDI represents a null CHAR or a mismatch. */ +- orl %r9d, %edi +-# else +- salq $32, %rdi ++ /* Check if we can safely load a VEC. */ ++ cmpl $-(VEC_SIZE * 3), %eax ++ jle L(less_1x_vec_till_page_cross) + +- /* Each bit in RDI represents a null CHAR or a mismatch. */ +- orq %r9, %rdi +-# endif ++ VMOVA (%rdi), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_0_end) ++ ++ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ ++ cmpl $-(VEC_SIZE * 2), %eax ++ jg L(more_2x_vec_till_page_cross) ++ ++ .p2align 4,, 4 ++L(less_1x_vec_till_page_cross): ++ subl $-(VEC_SIZE * 4), %eax ++ /* Guranteed safe to read from rdi - VEC_SIZE here. The only ++ concerning case is first iteration if incoming s1 was near start ++ of a page and s2 near end. If s1 was near the start of the page ++ we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe ++ to read back -VEC_SIZE. If rdi is truly at the start of a page ++ here, it means the previous page (rdi - VEC_SIZE) has already ++ been loaded earlier so must be valid. */ ++ VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2} ++ ++ /* Mask of potentially valid bits. The lower bits can be out of ++ range comparisons (but safe regarding page crosses). */ + +- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ +- shrxq %SHIFT_REG64, %rdi, %rdi +- testq %rdi, %rdi +- je L(loop_cross_page_2_vec) +- tzcntq %rdi, %rcx + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %ecx ++ movl $-1, %r10d ++ movl %esi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ shrl $2, %ecx ++ shlxl %ecx, %r10d, %ecx ++ movzbl %cl, %r10d ++# else ++ movl $-1, %ecx ++ shlxl %esi, %ecx, %r10d + # endif ++ ++ kmovd %k1, %ecx ++ notl %ecx ++ ++ + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) ++ movl %eax, %r11d ++ shrl $2, %r11d ++ cmpq %r11, %rdx + # else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax ++ cmpq %rax, %rdx + # endif ++ jbe L(return_page_cross_end_check) ++# endif ++ movl %eax, %OFFSET_REG ++ ++ /* Readjust eax before potentially returning to the loop. */ ++ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ ++ andl %r10d, %ecx ++ jz L(loop_skip_page_cross_check) ++ ++ .p2align 4,, 3 ++L(return_page_cross_end): ++ tzcntl %ecx, %ecx ++ ++# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP) ++ leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx ++L(return_page_cross_cmp_mem): + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ addl %OFFSET_REG, %ecx ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret8) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret8): + ret + +- .p2align 4 +-L(loop_cross_page_2_vec): +- /* The first VEC_SIZE * 2 bytes match or are ignored. */ +- VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 +- VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_page_cross_end_check): ++ tzcntl %ecx, %ecx ++ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx ++# ifdef USE_AS_WCSCMP ++ sall $2, %edx ++# endif ++ cmpl %ecx, %edx ++ ja L(return_page_cross_cmp_mem) ++ xorl %eax, %eax ++ ret ++# endif ++ + ++ .p2align 4,, 10 ++L(more_2x_vec_till_page_cross): ++ /* If more 2x vec till cross we will complete a full loop ++ iteration here. */ ++ ++ VMOVA VEC_SIZE(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */ +- VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2} +- kmovd %k1, %r9d +- /* Don't use subl since it is the lower 16/32 bits of RDI +- below. */ +- notl %r9d +-# ifdef USE_AS_WCSCMP +- /* Only last 8 bits are valid. */ +- andl $0xff, %r9d +-# endif ++ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_1_end) + +- VPTESTM %YMM1, %YMM1, %k4 +- /* Each bit cleared in K3 represents a mismatch or a null CHAR +- in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ +- VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} +- kmovd %k3, %edi +- /* Must use notl %edi here as lower bits are for CHAR +- comparisons potentially out of range thus can be 0 without +- indicating mismatch. */ +- notl %edi +-# ifdef USE_AS_WCSCMP +- /* Don't use subl since it is the upper 8 bits of EDI below. 
*/ +- andl $0xff, %edi ++# ifdef USE_AS_STRNCMP ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero_in_loop_page_cross) + # endif + +-# ifdef USE_AS_WCSCMP +- /* NB: Each bit in EDI/R9D represents 4-byte element. */ +- sall $8, %edi ++ subl $-(VEC_SIZE * 4), %eax + +- /* Each bit in EDI represents a null CHAR or a mismatch. */ +- orl %r9d, %edi +-# else +- salq $32, %rdi ++ /* Safe to include comparisons from lower bytes. */ ++ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_page_cross_0) ++ ++ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_page_cross_1) + +- /* Each bit in RDI represents a null CHAR or a mismatch. */ +- orq %r9, %rdi ++# ifdef USE_AS_STRNCMP ++ /* Must check length here as length might proclude reading next ++ page. */ ++# ifdef USE_AS_WCSCMP ++ movl %eax, %r11d ++ shrl $2, %r11d ++ cmpq %r11, %rdx ++# else ++ cmpq %rax, %rdx ++# endif ++ jbe L(ret_zero_in_loop_page_cross) + # endif + +- xorl %r8d, %r8d +- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +- subl $(VEC_SIZE * 2), %ecx +- jle 1f +- /* R8 has number of bytes skipped. */ +- movl %ecx, %r8d +-# ifdef USE_AS_WCSCMP +- /* NB: Divide shift count by 4 since each bit in RDI represent 4 +- bytes. */ +- sarl $2, %ecx +- /* Skip ECX bytes. */ +- shrl %cl, %edi ++ /* Finish the loop. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 ++ VPMINU %YMM4, %YMM6, %YMM9 ++ VPTESTM %YMM9, %YMM9, %k1 ++ ++ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 ++ /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 ++ ++ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ kmovd %k0, %LOOP_REG ++ TESTEQ %LOOP_REG ++ jnz L(return_vec_2_3_end) ++ ++ /* Best for code size to include ucond-jmp here. Would be faster ++ if this case is hot to duplicate the L(return_vec_2_3_end) code ++ as fall-through and have jump back to loop on mismatch ++ comparison. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax ++# ifdef USE_AS_STRNCMP ++ subq $(CHAR_PER_VEC * 4), %rdx ++ ja L(loop_skip_page_cross_check) ++L(ret_zero_in_loop_page_cross): ++ xorl %eax, %eax ++ ret + # else +- /* Skip ECX bytes. */ +- shrq %cl, %rdi ++ jmp L(loop_skip_page_cross_check) + # endif +-1: +- /* Before jumping back to the loop, set ESI to the number of +- VEC_SIZE * 4 blocks before page crossing. */ +- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi + +- testq %rdi, %rdi +-# ifdef USE_AS_STRNCMP +- /* At this point, if %rdi value is 0, it already tested +- VEC_SIZE*4+%r10 byte starting from %rax. This label +- checks whether strncmp maximum offset reached or not. */ +- je L(string_nbyte_offset_check) ++ ++ .p2align 4,, 10 ++L(return_vec_page_cross_0): ++ addl $-VEC_SIZE, %eax ++L(return_vec_page_cross_1): ++ tzcntl %ecx, %ecx ++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP ++ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx ++# ifdef USE_AS_STRNCMP ++# ifdef USE_AS_WCSCMP ++ /* Must divide ecx instead of multiply rdx due to overflow. 
*/ ++ movl %ecx, %eax ++ shrl $2, %eax ++ cmpq %rax, %rdx ++# else ++ cmpq %rcx, %rdx ++# endif ++ jbe L(ret_zero_in_loop_page_cross) ++# endif + # else +- je L(back_to_loop) ++ addl %eax, %ecx + # endif +- tzcntq %rdi, %rcx ++ + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +- addq %r10, %rcx +- /* Adjust for number of bytes skipped. */ +- addq %r8, %rcx +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rcx +- subq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret9) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi +- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret9): + ret + +-# ifdef USE_AS_STRNCMP +-L(string_nbyte_offset_check): +- leaq (VEC_SIZE * 4)(%r10), %r10 +- cmpq %r10, %r11 +- jbe L(zero) +- jmp L(back_to_loop) ++ ++ .p2align 4,, 10 ++L(page_cross): ++# ifndef USE_AS_STRNCMP ++ /* If both are VEC aligned we don't need any special logic here. ++ Only valid for strcmp where stop condition is guranteed to be ++ reachable by just reading memory. */ ++ testl $((VEC_SIZE - 1) << 20), %eax ++ jz L(no_page_cross) + # endif + +- .p2align 4 +-L(cross_page_loop): +- /* Check one byte/dword at a time. */ ++ movl %edi, %eax ++ movl %esi, %ecx ++ andl $(PAGE_SIZE - 1), %eax ++ andl $(PAGE_SIZE - 1), %ecx ++ ++ xorl %OFFSET_REG, %OFFSET_REG ++ ++ /* Check which is closer to page cross, s1 or s2. */ ++ cmpl %eax, %ecx ++ jg L(page_cross_s2) ++ ++ /* The previous page cross check has false positives. Check for ++ true positive as page cross logic is very expensive. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ jbe L(no_page_cross) ++ ++ ++ /* Set r8 to not interfere with normal return value (rdi and rsi ++ did not swap). */ + # ifdef USE_AS_WCSCMP +- cmpl %ecx, %eax ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d + # else +- subl %ecx, %eax ++ xorl %r8d, %r8d + # endif +- jne L(different) +- addl $SIZE_OF_CHAR, %edx +- cmpl $(VEC_SIZE * 4), %edx +- je L(main_loop_header) ++ ++ /* Check if less than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jg L(less_1x_vec_till_page) ++ ++ ++ /* If more than 1x VEC till page cross, loop throuh safely ++ loadable memory until within 1x VEC of page cross. 
*/ ++ .p2align 4,, 8 ++L(page_cross_loop): ++ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(check_ret_vec_page_cross) ++ addl $CHAR_PER_VEC, %OFFSET_REG + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross) + # endif ++ addl $VEC_SIZE, %eax ++ jl L(page_cross_loop) ++ + # ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx ++ shrl $2, %eax + # endif +- /* Check null CHAR. */ +- testl %eax, %eax +- jne L(cross_page_loop) +- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +- comparisons. */ +- subl %ecx, %eax +-# ifndef USE_AS_WCSCMP +-L(different): ++ ++ ++ subl %eax, %OFFSET_REG ++ /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed ++ to not cross page so is safe to load. Since we have already ++ loaded at least 1 VEC from rsi it is also guranteed to be safe. ++ */ ++ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ ++ kmovd %k1, %ecx ++# ifdef USE_AS_STRNCMP ++ leal CHAR_PER_VEC(%OFFSET_REG64), %eax ++ cmpq %rax, %rdx ++ jbe L(check_ret_vec_page_cross2) ++# ifdef USE_AS_WCSCMP ++ addq $-(CHAR_PER_VEC * 2), %rdx ++# else ++ addq %rdi, %rdx ++# endif + # endif +- ret ++ TESTEQ %ecx ++ jz L(prepare_loop_no_len) + ++ .p2align 4,, 4 ++L(ret_vec_page_cross): ++# ifndef USE_AS_STRNCMP ++L(check_ret_vec_page_cross): ++# endif ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++L(ret_vec_page_cross_cont): + # ifdef USE_AS_WCSCMP +- .p2align 4 +-L(different): +- /* Use movl to avoid modifying EFLAGS. */ +- movl $0, %eax ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret12) + setl %al + negl %eax +- orl $1, %eax +- ret ++ xorl %r8d, %eax ++# else ++ movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax ++ movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret12): ++ ret ++ + + # ifdef USE_AS_STRNCMP +- .p2align 4 +-L(zero): ++ .p2align 4,, 10 ++L(check_ret_vec_page_cross2): ++ TESTEQ %ecx ++L(check_ret_vec_page_cross): ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++ cmpq %rcx, %rdx ++ ja L(ret_vec_page_cross_cont) ++ .p2align 4,, 2 ++L(ret_zero_page_cross): + xorl %eax, %eax + ret ++# endif + +- .p2align 4 +-L(char0): +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +-# endif +- ret ++ .p2align 4,, 4 ++L(page_cross_s2): ++ /* Ensure this is a true page cross. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx ++ jbe L(no_page_cross) ++ ++ ++ movl %ecx, %eax ++ movq %rdi, %rcx ++ movq %rsi, %rdi ++ movq %rcx, %rsi ++ ++ /* set r8 to negate return value as rdi and rsi swapped. */ ++# ifdef USE_AS_WCSCMP ++ movl $-4, %r8d ++# else ++ movl $-1, %r8d + # endif ++ xorl %OFFSET_REG, %OFFSET_REG + +- .p2align 4 +-L(last_vector): +- addq %rdx, %rdi +- addq %rdx, %rsi +-# ifdef USE_AS_STRNCMP +- subq %rdx, %r11 ++ /* Check if more than 1x VEC till page cross. 
*/ ++ subl $(VEC_SIZE * 3), %eax ++ jle L(page_cross_loop) ++ ++ .p2align 4,, 6 ++L(less_1x_vec_till_page): ++# ifdef USE_AS_WCSCMP ++ shrl $2, %eax + # endif +- tzcntl %ecx, %edx ++ /* Find largest load size we can use. */ ++ cmpl $(16 / SIZE_OF_CHAR), %eax ++ ja L(less_16_till_page) ++ ++ /* Use 16 byte comparison. */ ++ vmovdqu (%rdi), %xmm0 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, (%rsi), %xmm0, %k1{%k2} ++ kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx ++ subl $0xf, %ecx ++# else ++ incw %cx + # endif ++ jnz L(check_ret_vec_page_cross) ++ movl $(16 / SIZE_OF_CHAR), %OFFSET_REG + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subl %eax, %OFFSET_REG ++# else ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ jz L(prepare_loop) + # endif ++ vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2} ++ kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ subl $0xf, %ecx + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ incw %cx + # endif ++ jnz L(check_ret_vec_page_cross) ++# ifdef USE_AS_STRNCMP ++ addl $(16 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case0): ++ xorl %eax, %eax + ret ++# endif + +- /* Comparing on page boundary region requires special treatment: +- It must done one vector at the time, starting with the wider +- ymm vector if possible, if not, with xmm. If fetching 16 bytes +- (xmm) still passes the boundary, byte comparison must be done. +- */ +- .p2align 4 +-L(cross_page): +- /* Try one ymm vector at a time. */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(cross_page_1_vector) +-L(loop_1_vector): +- VMOVU (%rdi, %rdx), %YMM0 + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (%rsi, %rdx). */ +- VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} ++ .p2align 4,, 10 ++L(less_16_till_page): ++ cmpl $(24 / SIZE_OF_CHAR), %eax ++ ja L(less_8_till_page) ++ ++ /* Use 8 byte comparison. */ ++ vmovq (%rdi), %xmm0 ++ vmovq (%rsi), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ subl $0x3, %ecx + # else +- incl %ecx ++ incb %cl + # endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) + +- addl $VEC_SIZE, %edx + +- addl $VEC_SIZE, %eax + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $(8 / SIZE_OF_CHAR), %rdx ++ jbe L(ret_zero_page_cross_slow_case0) + # endif +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jle L(loop_1_vector) +-L(cross_page_1_vector): +- /* Less than 32 bytes to check, try one xmm vector. 
*/ +- cmpl $(PAGE_SIZE - 16), %eax +- jg L(cross_page_1_xmm) +- VMOVU (%rdi, %rdx), %XMM0 ++ movl $(24 / SIZE_OF_CHAR), %OFFSET_REG ++ subl %eax, %OFFSET_REG + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and 16 bytes at (%rsi, %rdx). */ +- VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2} ++ vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- subl $0xf, %ecx ++ subl $0x3, %ecx + # else +- subl $0xffff, %ecx ++ incb %cl + # endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) ++ + +- addl $16, %edx +-# ifndef USE_AS_WCSCMP +- addl $16, %eax +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ addl $(8 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi + # endif ++ jmp L(prepare_loop_aligned) + +-L(cross_page_1_xmm): +-# ifndef USE_AS_WCSCMP +- /* Less than 16 bytes to check, try 8 byte vector. NB: No need +- for wcscmp nor wcsncmp since wide char is 4 bytes. */ +- cmpl $(PAGE_SIZE - 8), %eax +- jg L(cross_page_8bytes) +- vmovq (%rdi, %rdx), %XMM0 +- vmovq (%rsi, %rdx), %XMM1 + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and XMM1. */ +- VPCMP $0, %XMM1, %XMM0, %k1{%k2} +- kmovb %k1, %ecx ++ ++ ++ .p2align 4,, 10 ++L(less_8_till_page): + # ifdef USE_AS_WCSCMP +- subl $0x3, %ecx ++ /* If using wchar then this is the only check before we reach ++ the page boundary. */ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ cmpl %ecx, %eax ++ jnz L(ret_less_8_wcs) ++# ifdef USE_AS_STRNCMP ++ addq $-(CHAR_PER_VEC * 2), %rdx ++ /* We already checked for len <= 1 so cannot hit that case here. ++ */ ++# endif ++ testl %eax, %eax ++ jnz L(prepare_loop) ++ ret ++ ++ .p2align 4,, 8 ++L(ret_less_8_wcs): ++ setl %OFFSET_REG8 ++ negl %OFFSET_REG ++ movl %OFFSET_REG, %eax ++ xorl %r8d, %eax ++ ret ++ + # else +- subl $0xff, %ecx +-# endif +- jne L(last_vector) ++ cmpl $28, %eax ++ ja L(less_4_till_page) ++ ++ vmovd (%rdi), %xmm0 ++ vmovd (%rsi), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ kmovd %k1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) + +- addl $8, %edx +- addl $8, %eax + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $4, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) + # endif ++ movl $(28 / SIZE_OF_CHAR), %OFFSET_REG ++ subl %eax, %OFFSET_REG + +-L(cross_page_8bytes): +- /* Less than 8 bytes to check, try 4 byte vector. */ +- cmpl $(PAGE_SIZE - 4), %eax +- jg L(cross_page_4bytes) +- vmovd (%rdi, %rdx), %XMM0 +- vmovd (%rsi, %rdx), %XMM1 +- +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and XMM1. 
*/ +- VPCMP $0, %XMM1, %XMM0, %k1{%k2} ++ vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0x1, %ecx +-# else + subl $0xf, %ecx +-# endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) ++# ifdef USE_AS_STRNCMP ++ addl $(4 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ + +- addl $4, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case1): ++ xorl %eax, %eax ++ ret + # endif + +-L(cross_page_4bytes): +-# endif +- /* Less than 4 bytes to check, try one byte/dword at a time. */ +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) +-# endif +-# ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx +-# endif +- testl %eax, %eax +- jne L(cross_page_loop) ++ .p2align 4,, 10 ++L(less_4_till_page): ++ subq %rdi, %rsi ++ /* Extremely slow byte comparison loop. */ ++L(less_4_loop): ++ movzbl (%rdi), %eax ++ movzbl (%rsi, %rdi), %ecx + subl %ecx, %eax ++ jnz L(ret_less_4_loop) ++ testl %ecx, %ecx ++ jz L(ret_zero_4_loop) ++# ifdef USE_AS_STRNCMP ++ decq %rdx ++ jz L(ret_zero_4_loop) ++# endif ++ incq %rdi ++ /* end condition is reach page boundary (rdi is aligned). */ ++ testl $31, %edi ++ jnz L(less_4_loop) ++ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi ++ addq $-(VEC_SIZE * 4), %rdi ++# ifdef USE_AS_STRNCMP ++ subq $-(CHAR_PER_VEC * 4), %rdx ++# endif ++ jmp L(prepare_loop_aligned) ++ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++L(ret_less_4_loop): ++ xorl %r8d, %eax ++ subl %r8d, %eax + ret +-END (STRCMP) ++# endif ++END(STRCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-76.patch b/glibc-RHEL-15696-76.patch new file mode 100644 index 0000000..84d9a6f --- /dev/null +++ b/glibc-RHEL-15696-76.patch @@ -0,0 +1,33 @@ +From c15efd011cea3d8f0494269eb539583215a1feed Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 4 Feb 2022 11:09:10 -0800 +Subject: [PATCH] x86-64: Fix strcmp-avx2.S +Content-type: text/plain; charset=UTF-8 + +Change "movl %edx, %rdx" to "movl %edx, %edx" in: + +commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45 +Author: Noah Goldstein +Date: Mon Jan 10 15:35:38 2022 -0600 + + x86: Optimize strcmp-avx2.S +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 554ffe4c..04675aa4 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -106,7 +106,7 @@ ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +- movl %edx, %rdx ++ movl %edx, %edx + # endif + cmp $1, %RDX_LP + /* Signed comparison intentional. 
We use this branch to also +-- +GitLab + diff --git a/glibc-RHEL-15696-77.patch b/glibc-RHEL-15696-77.patch new file mode 100644 index 0000000..1a1cdae --- /dev/null +++ b/glibc-RHEL-15696-77.patch @@ -0,0 +1,33 @@ +From 0e0199a9e02ebe42e2b36958964d63f03573c382 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 4 Feb 2022 11:11:08 -0800 +Subject: [PATCH] x86-64: Fix strcmp-evex.S +Content-type: text/plain; charset=UTF-8 + +Change "movl %edx, %rdx" to "movl %edx, %edx" in: + +commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9 +Author: Noah Goldstein +Date: Mon Jan 10 15:35:39 2022 -0600 + + x86: Optimize strcmp-evex.S +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 99d8409a..ed56af8e 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -116,7 +116,7 @@ ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +- movl %edx, %rdx ++ movl %edx, %edx + # endif + cmp $1, %RDX_LP + /* Signed comparison intentional. We use this branch to also +-- +GitLab + diff --git a/glibc-RHEL-15696-78.patch b/glibc-RHEL-15696-78.patch new file mode 100644 index 0000000..885b715 --- /dev/null +++ b/glibc-RHEL-15696-78.patch @@ -0,0 +1,459 @@ +From b62ace2740a106222e124cc86956448fa07abf4d Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sun, 6 Feb 2022 00:54:18 -0600 +Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No bug. + +Split vec generation into multiple steps. This allows the +broadcast in AVX2 to use 'xmm' registers for the L(less_vec) +case. This saves an expensive lane-cross instruction and removes +the need for 'vzeroupper'. + +For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for +byte broadcast. + +Results for memset-avx2 small (geomean of N = 20 benchset runs). + +size, New Time, Old Time, New / Old + 0, 4.100, 3.831, 0.934 + 1, 5.074, 4.399, 0.867 + 2, 4.433, 4.411, 0.995 + 4, 4.487, 4.415, 0.984 + 8, 4.454, 4.396, 0.987 + 16, 4.502, 4.443, 0.987 + +All relevant string/wcsmbs tests are passing. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/memset.S | 21 ++- + .../multiarch/memset-avx2-unaligned-erms.S | 18 +- + .../multiarch/memset-avx512-unaligned-erms.S | 18 +- + .../multiarch/memset-evex-unaligned-erms.S | 18 +- + .../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++------- + 5 files changed, 152 insertions(+), 87 deletions(-) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 8672b030..27debd2b 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -28,17 +28,22 @@ + #define VMOVU movups + #define VMOVA movaps + +-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- movq r, %rax; \ +- punpcklbw %xmm0, %xmm0; \ +- punpcklwd %xmm0, %xmm0; \ +- pshufd $0, %xmm0, %xmm0 ++ pxor %xmm1, %xmm1; \ ++ pshufb %xmm1, %xmm0; \ ++ movq r, %rax + +-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- movq r, %rax; \ +- pshufd $0, %xmm0, %xmm0 ++ pshufd $0, %xmm0, %xmm0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + #define SECTION(p) p + +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index 1af668af..c0bf2875 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -10,15 +10,18 @@ + # define VMOVU vmovdqu + # define VMOVA vmovdqa + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ +- movq r, %rax; \ +- vpbroadcastb %xmm0, %ymm0 ++ movq r, %rax; + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- vmovd d, %xmm0; \ +- movq r, %rax; \ +- vpbroadcastd %xmm0, %ymm0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ MEMSET_SET_VEC0_AND_SET_RETURN(d, r) ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0 ++# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0 ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0 ++# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0 + + # ifndef SECTION + # define SECTION(p) p##.avx +@@ -30,5 +33,6 @@ + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif + ++# define USE_XMM_LESS_VEC + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index f14d6f84..5241216a 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -15,13 +15,19 @@ + + # define VZEROUPPER + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastb d, %VEC0 ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastb d, %VEC0; \ ++ movq r, %rax + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastd d, %VEC0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastd d, %VEC0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + # define SECTION(p) p##.evex512 + # define MEMSET_SYMBOL(p,s) p##_avx512_##s +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S 
b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 64b09e77..63700215 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -15,13 +15,19 @@ + + # define VZEROUPPER + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastb d, %VEC0 ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastb d, %VEC0; \ ++ movq r, %rax + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastd d, %VEC0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastd d, %VEC0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + # define SECTION(p) p##.evex + # define MEMSET_SYMBOL(p,s) p##_evex_##s +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index f08b7323..a67f9833 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -58,8 +58,10 @@ + #ifndef MOVQ + # if VEC_SIZE > 16 + # define MOVQ vmovq ++# define MOVD vmovd + # else + # define MOVQ movq ++# define MOVD movd + # endif + #endif + +@@ -72,9 +74,17 @@ + #if defined USE_WITH_EVEX || defined USE_WITH_AVX512 + # define END_REG rcx + # define LOOP_REG rdi ++# define LESS_VEC_REG rax + #else + # define END_REG rdi + # define LOOP_REG rdx ++# define LESS_VEC_REG rdi ++#endif ++ ++#ifdef USE_XMM_LESS_VEC ++# define XMM_SMALL 1 ++#else ++# define XMM_SMALL 0 + #endif + + #define PAGE_SIZE 4096 +@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + + ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + shl $2, %RDX_LP +- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +- jmp L(entry_from_bzero) ++ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) ++ WMEMSET_VDUP_TO_VEC0_LOW() ++ cmpq $VEC_SIZE, %rdx ++ jb L(less_vec_no_vdup) ++ WMEMSET_VDUP_TO_VEC0_HIGH() ++ jmp L(entry_from_wmemset) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + +@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + #endif + + ENTRY (MEMSET_SYMBOL (__memset, unaligned)) +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) ++ MEMSET_VDUP_TO_VEC0_HIGH() ++L(entry_from_wmemset): + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + + ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx + # endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) ++ MEMSET_VDUP_TO_VEC0_HIGH () + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. +- */ +- VMOVU %VEC(0), (%rax) +- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
*/ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + VZEROUPPER_RETURN + #endif + +- .p2align 4,, 10 ++ .p2align 4,, 4 + L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE +- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) +- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) ++ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + #else + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) +@@ -212,6 +228,7 @@ L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE + .p2align 4,, 10 + L(less_vec): ++L(less_vec_no_vdup): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! +@@ -262,28 +279,18 @@ L(stosb_more_2x_vec): + /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] + and (4x, 8x] jump to target. */ + L(more_2x_vec): +- +- /* Two different methods of setting up pointers / compare. The +- two methods are based on the fact that EVEX/AVX512 mov +- instructions take more bytes then AVX2/SSE2 mov instructions. As +- well that EVEX/AVX512 machines also have fast LEA_BID. Both +- setup and END_REG to avoid complex address mode. For EVEX/AVX512 +- this saves code size and keeps a few targets in one fetch block. +- For AVX2/SSE2 this helps prevent AGU bottlenecks. */ +-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 +- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + +- LOOP_4X_OFFSET) with LEA_BID. */ +- +- /* END_REG is rcx for EVEX/AVX512. */ +- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG +-#endif +- +- /* Stores to first 2x VEC before cmp as any path forward will +- require it. */ +- VMOVU %VEC(0), (%rax) +- VMOVU %VEC(0), VEC_SIZE(%rax) ++ /* Store next 2x vec regardless. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi) + + ++ /* Two different methods of setting up pointers / compare. The two ++ methods are based on the fact that EVEX/AVX512 mov instructions take ++ more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512 ++ machines also have fast LEA_BID. Both setup and END_REG to avoid complex ++ address mode. For EVEX/AVX512 this saves code size and keeps a few ++ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU ++ bottlenecks. */ + #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) + /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ + addq %rdx, %END_REG +@@ -292,6 +299,15 @@ L(more_2x_vec): + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_2x_vec) + ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with ++ LEA_BID. */ ++ ++ /* END_REG is rcx for EVEX/AVX512. */ ++ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG ++#endif ++ + /* Store next 2x vec regardless. */ + VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) +@@ -355,65 +371,93 @@ L(stosb_local): + /* Define L(less_vec) only if not otherwise defined. */ + .p2align 4 + L(less_vec): ++ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to ++ xmm). This is only does anything for AVX2. 
*/ ++ MEMSET_VDUP_TO_VEC0_LOW () ++L(less_vec_no_vdup): + #endif + L(cross_page): + #if VEC_SIZE > 32 + cmpl $32, %edx +- jae L(between_32_63) ++ jge L(between_32_63) + #endif + #if VEC_SIZE > 16 + cmpl $16, %edx +- jae L(between_16_31) ++ jge L(between_16_31) ++#endif ++#ifndef USE_XMM_LESS_VEC ++ MOVQ %XMM0, %rcx + #endif +- MOVQ %XMM0, %rdi + cmpl $8, %edx +- jae L(between_8_15) ++ jge L(between_8_15) + cmpl $4, %edx +- jae L(between_4_7) ++ jge L(between_4_7) + cmpl $1, %edx +- ja L(between_2_3) +- jb L(return) +- movb %sil, (%rax) +- VZEROUPPER_RETURN ++ jg L(between_2_3) ++ jl L(between_0_0) ++ movb %sil, (%LESS_VEC_REG) ++L(between_0_0): ++ ret + +- /* Align small targets only if not doing so would cross a fetch +- line. */ ++ /* Align small targets only if not doing so would cross a fetch line. ++ */ + #if VEC_SIZE > 32 + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- VMOVU %YMM0, (%rax) +- VMOVU %YMM0, -32(%rax, %rdx) ++ VMOVU %YMM0, (%LESS_VEC_REG) ++ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx) + VZEROUPPER_RETURN + #endif + + #if VEC_SIZE >= 32 +- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1) + L(between_16_31): + /* From 16 to 31. No branch when size == 16. */ +- VMOVU %XMM0, (%rax) +- VMOVU %XMM0, -16(%rax, %rdx) +- VZEROUPPER_RETURN ++ VMOVU %XMM0, (%LESS_VEC_REG) ++ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx) ++ ret + #endif + +- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) ++ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. ++ */ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1) + L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ +- movq %rdi, (%rax) +- movq %rdi, -8(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ MOVQ %XMM0, (%rdi) ++ MOVQ %XMM0, -8(%rdi, %rdx) ++#else ++ movq %rcx, (%LESS_VEC_REG) ++ movq %rcx, -8(%LESS_VEC_REG, %rdx) ++#endif ++ ret + +- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) ++ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. ++ */ ++ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1) + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %edi, (%rax) +- movl %edi, -4(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ MOVD %XMM0, (%rdi) ++ MOVD %XMM0, -4(%rdi, %rdx) ++#else ++ movl %ecx, (%LESS_VEC_REG) ++ movl %ecx, -4(%LESS_VEC_REG, %rdx) ++#endif ++ ret + +- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) ++ /* 4 * XMM_SMALL for the third mov for AVX2. */ ++ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1) + L(between_2_3): + /* From 2 to 3. No branch when size == 2. 
*/ +- movw %di, (%rax) +- movb %dil, -1(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ movb %sil, (%rdi) ++ movb %sil, 1(%rdi) ++ movb %sil, -1(%rdi, %rdx) ++#else ++ movw %cx, (%LESS_VEC_REG) ++ movb %sil, -1(%LESS_VEC_REG, %rdx) ++#endif ++ ret + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-79.patch b/glibc-RHEL-15696-79.patch new file mode 100644 index 0000000..91e850f --- /dev/null +++ b/glibc-RHEL-15696-79.patch @@ -0,0 +1,40 @@ +From 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 7 Feb 2022 00:32:23 -0600 +Subject: [PATCH] x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 + Only) +Content-type: text/plain; charset=UTF-8 + +commit b62ace2740a106222e124cc86956448fa07abf4d +Author: Noah Goldstein +Date: Sun Feb 6 00:54:18 2022 -0600 + + x86: Improve vec generation in memset-vec-unaligned-erms.S + +Revert usage of 'pshufb' in broadcast logic as it is an SSSE3 +instruction and memset.S is restricted to only SSE2 instructions. +--- + sysdeps/x86_64/memset.S | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 27debd2b..4cb4aa71 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -30,9 +30,10 @@ + + # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- pxor %xmm1, %xmm1; \ +- pshufb %xmm1, %xmm0; \ +- movq r, %rax ++ movq r, %rax; \ ++ punpcklbw %xmm0, %xmm0; \ ++ punpcklwd %xmm0, %xmm0; \ ++ pshufd $0, %xmm0, %xmm0 + + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +-- +GitLab + diff --git a/glibc-RHEL-15696-8.patch b/glibc-RHEL-15696-8.patch new file mode 100644 index 0000000..5cf7633 --- /dev/null +++ b/glibc-RHEL-15696-8.patch @@ -0,0 +1,218 @@ +From 5165de69c0908e28a380cbd4bb054e55ea4abc95 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:36:36 -0800 +Subject: [PATCH] x86-64 strnlen/wcsnlen: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes strnlen/wcsnlen for x32. Tested on x86-64 and x32. On +x86-64, libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length. + Clear the upper 32 bits of RSI register. + * sysdeps/x86_64/strlen.S: Use RSI_LP for length. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen + and tst-size_t-wcsnlen. + * sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file. + * sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise. 
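
The contract the new tst-size_t-strnlen.c locks down is simply that strnlen stops at the length limit even when no terminator is present; a plain C harness can only exercise that visible behaviour, since planting non-zero bits in the upper half of the length register needs the x32 calling-convention setup in test-size_t.h. A minimal sketch of that contract check (the buffer size is illustrative, not taken from the patch):

#include <assert.h>
#include <string.h>

int
main (void)
{
  enum { LEN = 4096 };                /* illustrative size, not from the patch */
  static char buf[LEN];

  memset (buf, 'a', LEN);             /* no NUL inside the first LEN bytes */
  assert (strnlen (buf, LEN) == (size_t) LEN);   /* must stop at the limit */

  buf[10] = '\0';                     /* terminator inside the window */
  assert (strnlen (buf, LEN) == 10);
  return 0;
}
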
+--- + sysdeps/x86_64/multiarch/strlen-avx2.S | 9 ++-- + sysdeps/x86_64/strlen.S | 12 ++--- + sysdeps/x86_64/x32/Makefile | 4 +- + sysdeps/x86_64/x32/tst-size_t-strnlen.c | 72 +++++++++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wcsnlen.c | 20 +++++++ + 5 files changed, 106 insertions(+), 11 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-strnlen.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsnlen.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S +index fb2418cd..645e0446 100644 +--- a/sysdeps/x86_64/multiarch/strlen-avx2.S ++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S +@@ -42,12 +42,15 @@ + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN + /* Check for zero length. */ +- testq %rsi, %rsi ++ test %RSI_LP, %RSI_LP + jz L(zero) + # ifdef USE_AS_WCSLEN +- shl $2, %rsi ++ shl $2, %RSI_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %esi, %esi + # endif +- movq %rsi, %r8 ++ mov %RSI_LP, %R8_LP + # endif + movl %edi, %ecx + movq %rdi, %rdx +diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S +index 01cb5fa8..f845f3d4 100644 +--- a/sysdeps/x86_64/strlen.S ++++ b/sysdeps/x86_64/strlen.S +@@ -59,21 +59,21 @@ ENTRY(strlen) + + #ifdef AS_STRNLEN + /* Do not read anything when n==0. */ +- test %rsi, %rsi ++ test %RSI_LP, %RSI_LP + jne L(n_nonzero) + xor %rax, %rax + ret + L(n_nonzero): + # ifdef AS_WCSLEN +- shlq $2, %rsi ++ shl $2, %RSI_LP + # endif + + /* Initialize long lived registers. */ + +- add %rdi, %rsi +- mov %rsi, %r10 +- and $-64, %r10 +- mov %rsi, %r11 ++ add %RDI_LP, %RSI_LP ++ mov %RSI_LP, %R10_LP ++ and $-64, %R10_LP ++ mov %RSI_LP, %R11_LP + #endif + + pxor %xmm0, %xmm0 +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 2a9e20a9..1557724b 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -8,10 +8,10 @@ endif + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ + tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ +- tst-size_t-strncmp tst-size_t-strncpy ++ tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen + endif + + ifeq ($(subdir),wcsmbs) + tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \ +- tst-size_t-wcsncmp ++ tst-size_t-wcsncmp tst-size_t-wcsnlen + endif +diff --git a/sysdeps/x86_64/x32/tst-size_t-strnlen.c b/sysdeps/x86_64/x32/tst-size_t-strnlen.c +new file mode 100644 +index 00000000..690a4a8a +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-strnlen.c +@@ -0,0 +1,72 @@ ++/* Test strnlen with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifdef WIDE ++# define TEST_NAME "wcsnlen" ++#else ++# define TEST_NAME "strnlen" ++#endif /* WIDE */ ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++# define STRNLEN wcsnlen ++# define CHAR wchar_t ++#else ++# define STRNLEN strnlen ++# define CHAR char ++#endif /* WIDE */ ++ ++IMPL (STRNLEN, 1) ++ ++typedef size_t (*proto_t) (const CHAR *, size_t); ++ ++static size_t ++__attribute__ ((noinline, noclone)) ++do_strnlen (parameter_t a, parameter_t b) ++{ ++ return CALL (&a, a.p, b.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ size_t size = page_size / sizeof (CHAR); ++ parameter_t src = { { 0 }, buf2 }; ++ parameter_t c = { { size }, (void *) (uintptr_t) 'a' }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ size_t res = do_strnlen (src, c); ++ if (res != size) ++ { ++ error (0, 0, "Wrong result in function %s: 0x%x != 0x%x", ++ impl->name, res, size); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c +new file mode 100644 +index 00000000..093b4bbe +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c +@@ -0,0 +1,20 @@ ++/* Test wcsnlen with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-strnlen.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-80.patch b/glibc-RHEL-15696-80.patch new file mode 100644 index 0000000..53a3e7e --- /dev/null +++ b/glibc-RHEL-15696-80.patch @@ -0,0 +1,753 @@ +From 3d9f171bfb5325bd5f427e9fc386453358c6e840 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 7 Feb 2022 05:55:15 -0800 +Subject: [PATCH] x86-64: Optimize bzero +Content-type: text/plain; charset=UTF-8 + +memset with zero as the value to set is by far the majority value (99%+ +for Python3 and GCC). + +bzero can be slightly more optimized for this case by using a zero-idiom +xor for broadcasting the set value to a register (vector or GPR). 
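
The user-visible contract here is just that bzero (p, n) matches memset (p, 0, n); the saving is entirely in how the all-zero fill value is materialised (a single xor zero idiom instead of the movd-plus-broadcast sequence an arbitrary fill byte needs). A small equivalence check, assuming only ISO C plus <strings.h> (the compiler may legitimately expand bzero to a memset call before the new __bzero IFUNC is ever reached):

#include <assert.h>
#include <string.h>
#include <strings.h>        /* bzero */

int
main (void)
{
  char a[64], b[64];

  memset (a, 0xff, sizeof a);
  memset (b, 0xff, sizeof b);

  bzero (a, sizeof a);       /* __bzero path on a glibc carrying this patch */
  memset (b, 0, sizeof b);   /* ordinary memset with an explicit zero */

  assert (memcmp (a, b, sizeof a) == 0);
  return 0;
}
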
+ +Co-developed-by: Noah Goldstein +--- + sysdeps/generic/ifunc-init.h | 5 +- + sysdeps/x86_64/memset.S | 8 + + sysdeps/x86_64/multiarch/Makefile | 205 +++++++++++------- + sysdeps/x86_64/multiarch/bzero.c | 106 +++++++++ + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 42 ++++ + .../memset-avx2-unaligned-erms-rtm.S | 1 + + .../multiarch/memset-avx2-unaligned-erms.S | 6 + + .../multiarch/memset-avx512-unaligned-erms.S | 3 + + .../multiarch/memset-evex-unaligned-erms.S | 3 + + .../multiarch/memset-sse2-unaligned-erms.S | 1 + + .../multiarch/memset-vec-unaligned-erms.S | 110 +++++++--- + 11 files changed, 384 insertions(+), 106 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/bzero.c + +Conflicts: + sysdeps/generic/ifunc-init.h + (needs macros from cf4fd28ea453d1a9cec93939bc88b58ccef5437a (memcmpeq)) + sysdeps/x86_64/multiarch/Makefile + (file ordering) + +diff --git a/sysdeps/generic/ifunc-init.h b/sysdeps/generic/ifunc-init.h +index 241e4161..f7a72375 100644 +--- a/sysdeps/generic/ifunc-init.h ++++ b/sysdeps/generic/ifunc-init.h +@@ -50,5 +50,8 @@ + '___' as the optimized implementation and + '_ifunc_selector' as the IFUNC selector. */ + #define REDIRECT_NAME EVALUATOR1 (__redirect, SYMBOL_NAME) +-#define OPTIMIZE(name) EVALUATOR2 (SYMBOL_NAME, name) ++#define OPTIMIZE1(name) EVALUATOR1 (SYMBOL_NAME, name) ++#define OPTIMIZE2(name) EVALUATOR2 (SYMBOL_NAME, name) ++/* Default is to use OPTIMIZE2. */ ++#define OPTIMIZE(name) OPTIMIZE2(name) + #define IFUNC_SELECTOR EVALUATOR1 (SYMBOL_NAME, ifunc_selector) +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 4cb4aa71..a1353f89 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -35,6 +35,9 @@ + punpcklwd %xmm0, %xmm0; \ + pshufd $0, %xmm0, %xmm0 + ++# define BZERO_ZERO_VEC0() \ ++ pxor %xmm0, %xmm0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + pshufd $0, %xmm0, %xmm0; \ +@@ -53,6 +56,10 @@ + # define MEMSET_SYMBOL(p,s) memset + #endif + ++#ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) __bzero ++#endif ++ + #ifndef WMEMSET_SYMBOL + # define WMEMSET_CHK_SYMBOL(p,s) p + # define WMEMSET_SYMBOL(p,s) __wmemset +@@ -63,6 +70,7 @@ + libc_hidden_builtin_def (memset) + + #if IS_IN (libc) ++weak_alias (__bzero, bzero) + libc_hidden_def (__wmemset) + weak_alias (__wmemset, wmemset) + libc_hidden_weak (wmemset) +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 26be4095..37d8d6f0 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -1,85 +1,130 @@ + ifeq ($(subdir),string) + +-sysdep_routines += strncat-c stpncpy-c strncpy-c \ +- strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3 \ +- strcmp-sse4_2 strcmp-avx2 \ +- strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \ +- memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \ +- memrchr-sse2 memrchr-avx2 \ +- memcmp-sse2 \ +- memcmp-avx2-movbe \ +- memcmp-sse4 memcpy-ssse3 \ +- memmove-ssse3 \ +- memcpy-ssse3-back \ +- memmove-ssse3-back \ +- memmove-avx512-no-vzeroupper \ +- strcasecmp_l-sse2 strcasecmp_l-ssse3 \ +- strcasecmp_l-sse4_2 strcasecmp_l-avx \ +- strncase_l-sse2 strncase_l-ssse3 \ +- strncase_l-sse4_2 strncase_l-avx \ +- strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \ +- strrchr-sse2 strrchr-avx2 \ +- strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \ +- strcat-avx2 strncat-avx2 \ +- strcat-ssse3 strncat-ssse3\ +- strcpy-avx2 strncpy-avx2 \ +- strcpy-sse2 stpcpy-sse2 \ +- strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ 
+- strcpy-sse2-unaligned strncpy-sse2-unaligned \ +- stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ +- stpcpy-avx2 stpncpy-avx2 \ +- strcat-sse2 \ +- strcat-sse2-unaligned strncat-sse2-unaligned \ +- strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ +- strcspn-sse2 strpbrk-sse2 strspn-sse2 \ +- strcspn-c strpbrk-c strspn-c varshift \ +- memset-avx512-no-vzeroupper \ +- memmove-sse2-unaligned-erms \ +- memmove-avx-unaligned-erms \ +- memmove-avx512-unaligned-erms \ +- memset-sse2-unaligned-erms \ +- memset-avx2-unaligned-erms \ +- memset-avx512-unaligned-erms \ +- memchr-avx2-rtm \ +- memcmp-avx2-movbe-rtm \ +- memmove-avx-unaligned-erms-rtm \ +- memrchr-avx2-rtm \ +- memset-avx2-unaligned-erms-rtm \ +- rawmemchr-avx2-rtm \ +- strchr-avx2-rtm \ +- strcmp-avx2-rtm \ +- strchrnul-avx2-rtm \ +- stpcpy-avx2-rtm \ +- stpncpy-avx2-rtm \ +- strcat-avx2-rtm \ +- strcpy-avx2-rtm \ +- strlen-avx2-rtm \ +- strncat-avx2-rtm \ +- strncmp-avx2-rtm \ +- strncpy-avx2-rtm \ +- strnlen-avx2-rtm \ +- strrchr-avx2-rtm \ +- memchr-evex \ +- memcmp-evex-movbe \ +- memmove-evex-unaligned-erms \ +- memrchr-evex \ +- memset-evex-unaligned-erms \ +- rawmemchr-evex \ +- stpcpy-evex \ +- stpncpy-evex \ +- strcat-evex \ +- strchr-evex \ +- strchrnul-evex \ +- strcmp-evex \ +- strcpy-evex \ +- strlen-evex \ +- strncat-evex \ +- strncmp-evex \ +- strncpy-evex \ +- strnlen-evex \ +- strrchr-evex \ +- memchr-evex-rtm \ +- rawmemchr-evex-rtm ++sysdep_routines += \ ++ bzero \ ++ memchr-avx2 \ ++ memchr-avx2-rtm \ ++ memchr-evex \ ++ memchr-evex-rtm \ ++ memchr-sse2 \ ++ memcmp-avx2-movbe \ ++ memcmp-avx2-movbe-rtm \ ++ memcmp-evex-movbe \ ++ memcmp-sse2 \ ++ memcmp-sse4 \ ++ memcmp-ssse3 \ ++ memcpy-ssse3 \ ++ memcpy-ssse3-back \ ++ memmove-avx-unaligned-erms \ ++ memmove-avx-unaligned-erms-rtm \ ++ memmove-avx512-no-vzeroupper \ ++ memmove-avx512-unaligned-erms \ ++ memmove-evex-unaligned-erms \ ++ memmove-sse2-unaligned-erms \ ++ memmove-ssse3 \ ++ memmove-ssse3-back \ ++ memrchr-avx2 \ ++ memrchr-avx2-rtm \ ++ memrchr-evex \ ++ memrchr-sse2 \ ++ memset-avx2-unaligned-erms \ ++ memset-avx2-unaligned-erms-rtm \ ++ memset-avx512-no-vzeroupper \ ++ memset-avx512-unaligned-erms \ ++ memset-evex-unaligned-erms \ ++ memset-sse2-unaligned-erms \ ++ rawmemchr-avx2 \ ++ rawmemchr-avx2-rtm \ ++ rawmemchr-evex \ ++ rawmemchr-evex-rtm \ ++ rawmemchr-sse2 \ ++ stpcpy-avx2 \ ++ stpcpy-avx2-rtm \ ++ stpcpy-evex \ ++ stpcpy-sse2 \ ++ stpcpy-sse2-unaligned \ ++ stpcpy-ssse3 \ ++ stpncpy-avx2 \ ++ stpncpy-avx2-rtm \ ++ stpncpy-c \ ++ stpncpy-evex \ ++ stpncpy-sse2-unaligned \ ++ stpncpy-ssse3 \ ++ strcasecmp_l-avx \ ++ strcasecmp_l-sse2 \ ++ strcasecmp_l-sse4_2 \ ++ strcasecmp_l-ssse3 \ ++ strcat-avx2 \ ++ strcat-avx2-rtm \ ++ strcat-evex \ ++ strcat-sse2 \ ++ strcat-sse2-unaligned \ ++ strcat-ssse3 \ ++ strchr-avx2 \ ++ strchr-avx2-rtm \ ++ strchr-evex \ ++ strchr-sse2 \ ++ strchr-sse2-no-bsf \ ++ strchrnul-avx2 \ ++ strchrnul-avx2-rtm \ ++ strchrnul-evex \ ++ strchrnul-sse2 \ ++ strcmp-avx2 \ ++ strcmp-avx2-rtm \ ++ strcmp-evex \ ++ strcmp-sse2 \ ++ strcmp-sse2-unaligned \ ++ strcmp-sse4_2 \ ++ strcmp-ssse3 \ ++ strcpy-avx2 \ ++ strcpy-avx2-rtm \ ++ strcpy-evex \ ++ strcpy-sse2 \ ++ strcpy-sse2-unaligned \ ++ strcpy-ssse3 \ ++ strcspn-c \ ++ strcspn-sse2 \ ++ strlen-avx2 \ ++ strlen-avx2-rtm \ ++ strlen-evex \ ++ strlen-sse2 \ ++ strncase_l-avx \ ++ strncase_l-sse2 \ ++ strncase_l-sse4_2 \ ++ strncase_l-ssse3 \ ++ strncat-avx2 \ ++ strncat-avx2-rtm \ ++ strncat-c \ ++ strncat-evex \ ++ strncat-sse2-unaligned \ ++ strncat-ssse3 \ ++ 
strncmp-avx2 \ ++ strncmp-avx2-rtm \ ++ strncmp-evex \ ++ strncmp-sse2 \ ++ strncmp-sse4_2 \ ++ strncmp-ssse3 \ ++ strncpy-avx2 \ ++ strncpy-avx2-rtm \ ++ strncpy-c \ ++ strncpy-evex \ ++ strncpy-sse2-unaligned \ ++ strncpy-ssse3 \ ++ strnlen-avx2 \ ++ strnlen-avx2-rtm \ ++ strnlen-evex \ ++ strnlen-sse2 \ ++ strpbrk-c \ ++ strpbrk-sse2 \ ++ strrchr-avx2 \ ++ strrchr-avx2-rtm \ ++ strrchr-evex \ ++ strrchr-sse2 \ ++ strspn-c \ ++ strspn-sse2 \ ++ strstr-sse2-unaligned \ ++ varshift \ ++# sysdep_routines + CFLAGS-varshift.c += -msse4 + CFLAGS-strcspn-c.c += -msse4 + CFLAGS-strpbrk-c.c += -msse4 +diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c +new file mode 100644 +index 00000000..58a14b2c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/bzero.c +@@ -0,0 +1,106 @@ ++/* Multiple versions of bzero. ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* Define multiple versions only for the definition in libc. */ ++#if IS_IN (libc) ++# define __bzero __redirect___bzero ++# include ++# undef __bzero ++ ++# define SYMBOL_NAME __bzero ++# include ++ ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms) ++ attribute_hidden; ++ ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++ const struct cpu_features* cpu_features = __get_cpu_features (); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx512_unaligned_erms); ++ ++ return OPTIMIZE1 (avx512_unaligned); ++ } ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 
(evex_unaligned_erms); ++ ++ return OPTIMIZE1 (evex_unaligned); ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx2_unaligned_erms_rtm); ++ ++ return OPTIMIZE1 (avx2_unaligned_rtm); ++ } ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx2_unaligned_erms); ++ ++ return OPTIMIZE1 (avx2_unaligned); ++ } ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (sse2_unaligned_erms); ++ ++ return OPTIMIZE1 (sse2_unaligned); ++} ++ ++libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ()); ++ ++weak_alias (__bzero, bzero) ++#endif +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 8be0d78a..c963d391 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -282,6 +282,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __memset_avx512_no_vzeroupper) + ) + ++ /* Support sysdeps/x86_64/multiarch/bzero.c. */ ++ IFUNC_IMPL (i, name, bzero, ++ IFUNC_IMPL_ADD (array, i, bzero, 1, ++ __bzero_sse2_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, 1, ++ __bzero_sse2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ CPU_FEATURE_USABLE (AVX2), ++ __bzero_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ CPU_FEATURE_USABLE (AVX2), ++ __bzero_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __bzero_avx2_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __bzero_avx2_unaligned_erms_rtm) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_evex_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_avx512_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_avx512_unaligned) ++ ) ++ + /* Support sysdeps/x86_64/multiarch/rawmemchr.c. 
*/ + IFUNC_IMPL (i, name, rawmemchr, + IFUNC_IMPL_ADD (array, i, rawmemchr, +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +index 8ac3e479..5a5ee6f6 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +@@ -5,6 +5,7 @@ + + #define SECTION(p) p##.avx.rtm + #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm ++#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm + #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm + + #include "memset-avx2-unaligned-erms.S" +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index c0bf2875..a093a283 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -14,6 +14,9 @@ + vmovd d, %xmm0; \ + movq r, %rax; + ++# define BZERO_ZERO_VEC0() \ ++ vpxor %xmm0, %xmm0, %xmm0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + MEMSET_SET_VEC0_AND_SET_RETURN(d, r) + +@@ -29,6 +32,9 @@ + # ifndef MEMSET_SYMBOL + # define MEMSET_SYMBOL(p,s) p##_avx2_##s + # endif ++# ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) p##_avx2_##s ++# endif + # ifndef WMEMSET_SYMBOL + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 5241216a..727c9213 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -19,6 +19,9 @@ + vpbroadcastb d, %VEC0; \ + movq r, %rax + ++# define BZERO_ZERO_VEC0() \ ++ vpxorq %XMM0, %XMM0, %XMM0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 63700215..5d8fa78f 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -19,6 +19,9 @@ + vpbroadcastb d, %VEC0; \ + movq r, %rax + ++# define BZERO_ZERO_VEC0() \ ++ vpxorq %XMM0, %XMM0, %XMM0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +index 56b81f5c..8f579ad6 100644 +--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +@@ -22,6 +22,7 @@ + + #if IS_IN (libc) + # define MEMSET_SYMBOL(p,s) p##_sse2_##s ++# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) + # define WMEMSET_SYMBOL(p,s) p##_sse2_##s + + # ifdef SHARED +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index a67f9833..06f5f5d7 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -26,6 +26,10 @@ + + #include + ++#ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) ++#endif ++ + #ifndef MEMSET_CHK_SYMBOL + # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) + #endif +@@ -87,6 +91,18 @@ + # define XMM_SMALL 0 + #endif + ++#ifdef USE_LESS_VEC_MASK_STORE ++# define SET_REG64 rcx ++# define SET_REG32 ecx ++# define SET_REG16 cx ++# define SET_REG8 cl ++#else ++# define SET_REG64 rsi ++# define 
SET_REG32 esi ++# define SET_REG16 si ++# define SET_REG8 sil ++#endif ++ + #define PAGE_SIZE 4096 + + /* Macro to calculate size of small memset block for aligning +@@ -96,18 +112,6 @@ + + #ifndef SECTION + # error SECTION is not defined! +-#endif +- +- .section SECTION(.text),"ax",@progbits +-#if VEC_SIZE == 16 && IS_IN (libc) +-ENTRY (__bzero) +- mov %RDI_LP, %RAX_LP /* Set return value. */ +- mov %RSI_LP, %RDX_LP /* Set n. */ +- xorl %esi, %esi +- pxor %XMM0, %XMM0 +- jmp L(entry_from_bzero) +-END (__bzero) +-weak_alias (__bzero, bzero) + #endif + + #if IS_IN (libc) +@@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + WMEMSET_VDUP_TO_VEC0_LOW() + cmpq $VEC_SIZE, %rdx +- jb L(less_vec_no_vdup) ++ jb L(less_vec_from_wmemset) + WMEMSET_VDUP_TO_VEC0_HIGH() + jmp L(entry_from_wmemset) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + ++ENTRY (BZERO_SYMBOL(__bzero, unaligned)) ++#if VEC_SIZE > 16 ++ BZERO_ZERO_VEC0 () ++#endif ++ mov %RDI_LP, %RAX_LP ++ mov %RSI_LP, %RDX_LP ++#ifndef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++#endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec_no_vdup) ++#ifdef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++#endif ++#if VEC_SIZE <= 16 ++ BZERO_ZERO_VEC0 () ++#endif ++ cmp $(VEC_SIZE * 2), %RDX_LP ++ ja L(more_2x_vec) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++END (BZERO_SYMBOL(__bzero, unaligned)) ++ + #if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + cmp %RDX_LP, %RCX_LP +@@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + /* Clear the upper 32 bits. */ + mov %edx, %edx + # endif +-L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + MEMSET_VDUP_TO_VEC0_HIGH() +@@ -187,6 +215,31 @@ END (__memset_erms) + END (MEMSET_SYMBOL (__memset, erms)) + # endif + ++ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6) ++# if VEC_SIZE > 16 ++ BZERO_ZERO_VEC0 () ++# endif ++ mov %RDI_LP, %RAX_LP ++ mov %RSI_LP, %RDX_LP ++# ifndef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++# endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec_no_vdup) ++# ifdef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++# endif ++# if VEC_SIZE <= 16 ++ BZERO_ZERO_VEC0 () ++# endif ++ cmp $(VEC_SIZE * 2), %RDX_LP ++ ja L(stosb_more_2x_vec) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++END (BZERO_SYMBOL(__bzero, unaligned_erms)) ++ + # if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + cmp %RDX_LP, %RCX_LP +@@ -229,6 +282,7 @@ L(last_2x_vec): + .p2align 4,, 10 + L(less_vec): + L(less_vec_no_vdup): ++L(less_vec_from_wmemset): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! +@@ -374,8 +428,11 @@ L(less_vec): + /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to + xmm). This is only does anything for AVX2. 
*/ + MEMSET_VDUP_TO_VEC0_LOW () ++L(less_vec_from_wmemset): ++#if VEC_SIZE > 16 + L(less_vec_no_vdup): + #endif ++#endif + L(cross_page): + #if VEC_SIZE > 32 + cmpl $32, %edx +@@ -386,7 +443,10 @@ L(cross_page): + jge L(between_16_31) + #endif + #ifndef USE_XMM_LESS_VEC +- MOVQ %XMM0, %rcx ++ MOVQ %XMM0, %SET_REG64 ++#endif ++#if VEC_SIZE <= 16 ++L(less_vec_no_vdup): + #endif + cmpl $8, %edx + jge L(between_8_15) +@@ -395,7 +455,7 @@ L(cross_page): + cmpl $1, %edx + jg L(between_2_3) + jl L(between_0_0) +- movb %sil, (%LESS_VEC_REG) ++ movb %SET_REG8, (%LESS_VEC_REG) + L(between_0_0): + ret + +@@ -428,8 +488,8 @@ L(between_8_15): + MOVQ %XMM0, (%rdi) + MOVQ %XMM0, -8(%rdi, %rdx) + #else +- movq %rcx, (%LESS_VEC_REG) +- movq %rcx, -8(%LESS_VEC_REG, %rdx) ++ movq %SET_REG64, (%LESS_VEC_REG) ++ movq %SET_REG64, -8(%LESS_VEC_REG, %rdx) + #endif + ret + +@@ -442,8 +502,8 @@ L(between_4_7): + MOVD %XMM0, (%rdi) + MOVD %XMM0, -4(%rdi, %rdx) + #else +- movl %ecx, (%LESS_VEC_REG) +- movl %ecx, -4(%LESS_VEC_REG, %rdx) ++ movl %SET_REG32, (%LESS_VEC_REG) ++ movl %SET_REG32, -4(%LESS_VEC_REG, %rdx) + #endif + ret + +@@ -452,12 +512,12 @@ L(between_4_7): + L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + #ifdef USE_XMM_LESS_VEC +- movb %sil, (%rdi) +- movb %sil, 1(%rdi) +- movb %sil, -1(%rdi, %rdx) ++ movb %SET_REG8, (%rdi) ++ movb %SET_REG8, 1(%rdi) ++ movb %SET_REG8, -1(%rdi, %rdx) + #else +- movw %cx, (%LESS_VEC_REG) +- movb %sil, -1(%LESS_VEC_REG, %rdx) ++ movw %SET_REG16, (%LESS_VEC_REG) ++ movb %SET_REG8, -1(%LESS_VEC_REG, %rdx) + #endif + ret + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-81.patch b/glibc-RHEL-15696-81.patch new file mode 100644 index 0000000..960a4cc --- /dev/null +++ b/glibc-RHEL-15696-81.patch @@ -0,0 +1,33 @@ +From 7912236f4a597deb092650ca79f33504ddb4af28 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sat, 12 Feb 2022 00:45:00 -0600 +Subject: [PATCH] x86: Set .text section in memset-vec-unaligned-erms +Content-type: text/plain; charset=UTF-8 + +commit 3d9f171bfb5325bd5f427e9fc386453358c6e840 +Author: H.J. Lu +Date: Mon Feb 7 05:55:15 2022 -0800 + + x86-64: Optimize bzero + +Remove setting the .text section for the code. This commit +adds that back. +--- + sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 06f5f5d7..4fb475c0 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -114,6 +114,7 @@ + # error SECTION is not defined! + #endif + ++ .section SECTION(.text), "ax", @progbits + #if IS_IN (libc) + # if defined SHARED + ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) +-- +GitLab + diff --git a/glibc-RHEL-15696-82.patch b/glibc-RHEL-15696-82.patch new file mode 100644 index 0000000..23ee46e --- /dev/null +++ b/glibc-RHEL-15696-82.patch @@ -0,0 +1,90 @@ +From e108c02a5e23c8c88ce66d8705d4a24bb6b9a8bf Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 15 Feb 2022 20:27:21 -0600 +Subject: [PATCH] x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] +Content-type: text/plain; charset=UTF-8 + +Logic can read before the start of `s1` / `s2` if both `s1` and `s2` +are near the start of a page. To avoid having the result contimated by +these comparisons the `strcmp` variants would mask off these +comparisons. 
This was missing in the `strncmp` variants causing +the bug. This commit adds the masking to `strncmp` so that out of +range comparisons don't affect the result. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass as +well a full xcheck on x86_64 linux. +Reviewed-by: H.J. Lu +--- + string/test-strncmp.c | 23 +++++++++++++++++++++++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 1 + + sysdeps/x86_64/multiarch/strcmp-evex.S | 1 + + 3 files changed, 25 insertions(+) + +diff --git a/string/test-strncmp.c b/string/test-strncmp.c +index 927a6daa..e61fffd9 100644 +--- a/string/test-strncmp.c ++++ b/string/test-strncmp.c +@@ -403,6 +403,28 @@ check2 (void) + free (s2); + } + ++static void ++check4 (void) ++{ ++ /* To trigger bug 28895; We need 1) both s1 and s2 to be within 32 bytes of ++ the end of the page. 2) For there to be no mismatch/null byte before the ++ first page cross. 3) For length (`n`) to be large enough for one string to ++ cross the page. And 4) for there to be either mismatch/null bytes before ++ the start of the strings. */ ++ ++ size_t size = 10; ++ size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1); ++ CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa)); ++ CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed)); ++ int exp_result; ++ ++ STRCPY (s1, L ("tst-tlsmod%")); ++ STRCPY (s2, L ("tst-tls-manydynamic73mod")); ++ exp_result = SIMPLE_STRNCMP (s1, s2, size); ++ FOR_EACH_IMPL (impl, 0) ++ check_result (impl, s1, s2, size, exp_result); ++} ++ + static void + check3 (void) + { +@@ -445,6 +467,7 @@ test_main (void) + check1 (); + check2 (); + check3 (); ++ check4 (); + + printf ("%23s", ""); + FOR_EACH_IMPL (impl, 0) +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 04675aa4..179cc0e3 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -661,6 +661,7 @@ L(ret8): + # ifdef USE_AS_STRNCMP + .p2align 4,, 10 + L(return_page_cross_end_check): ++ andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx), %ecx + cmpl %ecx, %edx +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index ed56af8e..0dfa62bd 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -689,6 +689,7 @@ L(ret8): + # ifdef USE_AS_STRNCMP + .p2align 4,, 10 + L(return_page_cross_end_check): ++ andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx + # ifdef USE_AS_WCSCMP +-- +GitLab + diff --git a/glibc-RHEL-15696-83.patch b/glibc-RHEL-15696-83.patch new file mode 100644 index 0000000..e7475a8 --- /dev/null +++ b/glibc-RHEL-15696-83.patch @@ -0,0 +1,77 @@ +From 9fef7039a7d04947bc89296ee0d187bc8d89b772 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 15:50:33 -0500 +Subject: [PATCH] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ + #28896] +Content-type: text/plain; charset=UTF-8 + +Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not +__wcscmp_avx2. + +commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87 +Author: Noah Goldstein +Date: Sun Jan 9 16:02:21 2022 -0600 + + x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755] + +Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set +to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which +can cause spurious aborts. + +This change will need to be backported. + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86/tst-strncmp-rtm.c | 15 +++++++++++++++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +- + 2 files changed, 16 insertions(+), 1 deletion(-) + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index aef9866c..ba6543be 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -70,6 +70,16 @@ function_overflow (void) + return 1; + } + ++__attribute__ ((noinline, noclone)) ++static int ++function_overflow2 (void) ++{ ++ if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0) ++ return 0; ++ else ++ return 1; ++} ++ + static int + do_test (void) + { +@@ -77,5 +87,10 @@ do_test (void) + if (status != EXIT_SUCCESS) + return status; + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); ++ if (status != EXIT_SUCCESS) ++ return status; ++ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2); ++ if (status != EXIT_SUCCESS) ++ return status; + return status; + } +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 179cc0e3..782f9472 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -122,7 +122,7 @@ ENTRY(STRCMP) + are cases where length is large enough that it can never be a + bound on valid memory so just use wcscmp. */ + shrq $56, %rcx +- jnz __wcscmp_avx2 ++ jnz OVERFLOW_STRCMP + + leaq (, %rdx, 4), %rdx + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-84.patch b/glibc-RHEL-15696-84.patch new file mode 100644 index 0000000..e998eff --- /dev/null +++ b/glibc-RHEL-15696-84.patch @@ -0,0 +1,27 @@ +From 1283948f236f209b7d3f44b69a42b96806fa6da0 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sat, 5 Feb 2022 11:06:01 -0800 +Subject: [PATCH] x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ)) +Content-type: text/plain; charset=UTF-8 + +--- + sysdeps/x86/sysdep.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index a70bb3a2..49b0efe2 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -111,7 +111,8 @@ enum cf_protection_level + /* Local label name for asm code. */ + #ifndef L + /* ELF-like local names start with `.L'. */ +-# define L(name) .L##name ++# define LOCAL_LABEL(name) .L##name ++# define L(name) LOCAL_LABEL(name) + #endif + + #define atom_text_section .section ".text.atom", "ax" +-- +GitLab + diff --git a/glibc-RHEL-15696-85.patch b/glibc-RHEL-15696-85.patch new file mode 100644 index 0000000..18f8a47 --- /dev/null +++ b/glibc-RHEL-15696-85.patch @@ -0,0 +1,108 @@ +From c328d0152d4b14cca58407ec68143894c8863004 Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Sat, 5 Feb 2022 11:52:33 -0800 +Subject: [PATCH] x86_64/multiarch: Sort sysdep_routines and put one entry per + line +Content-type: text/plain; charset=UTF-8 + +Conflicts: + sysdeps/x86_64/multiarch/Makefile + (test order changed) + +--- + sysdeps/x86_64/multiarch/Makefile | 78 +++++++++++++++++++------------ + 1 file changed, 48 insertions(+), 30 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 37d8d6f0..8c9e7812 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -132,37 +132,55 @@ CFLAGS-strspn-c.c += -msse4 + endif + + ifeq ($(subdir),wcsmbs) +-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ +- wmemcmp-avx2-movbe \ +- wmemchr-sse2 wmemchr-avx2 \ +- wcscmp-sse2 wcscmp-avx2 \ +- wcsncmp-sse2 wcsncmp-avx2 \ +- wcscpy-ssse3 wcscpy-c \ +- wcschr-sse2 wcschr-avx2 \ +- wcsrchr-sse2 wcsrchr-avx2 \ +- wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \ +- wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \ +- wcschr-avx2-rtm \ +- wcscmp-avx2-rtm \ +- wcslen-avx2-rtm \ +- wcsncmp-avx2-rtm \ +- wcsnlen-avx2-rtm \ +- wcsrchr-avx2-rtm \ +- wmemchr-avx2-rtm \ +- wmemcmp-avx2-movbe-rtm \ +- wcschr-evex \ +- wcscmp-evex \ +- wcslen-evex \ +- wcsncmp-evex \ +- wcsnlen-evex \ +- wcsrchr-evex \ +- wmemchr-evex \ +- wmemcmp-evex-movbe \ +- wmemchr-evex-rtm ++sysdep_routines += \ ++ wcschr-avx2 \ ++ wcschr-avx2-rtm \ ++ wcschr-evex \ ++ wcschr-sse2 \ ++ wcscmp-avx2 \ ++ wcscmp-avx2-rtm \ ++ wcscmp-evex \ ++ wcscmp-sse2 \ ++ wcscpy-c \ ++ wcscpy-ssse3 \ ++ wcslen-avx2 \ ++ wcslen-avx2-rtm \ ++ wcslen-evex \ ++ wcslen-sse2 \ ++ wcslen-sse4_1 \ ++ wcsncmp-avx2 \ ++ wcsncmp-avx2-rtm \ ++ wcsncmp-evex \ ++ wcsncmp-sse2 \ ++ wcsnlen-avx2 \ ++ wcsnlen-avx2-rtm \ ++ wcsnlen-c \ ++ wcsnlen-evex \ ++ wcsnlen-sse4_1 \ ++ wcsrchr-avx2 \ ++ wcsrchr-avx2-rtm \ ++ wcsrchr-evex \ ++ wcsrchr-sse2 \ ++ wmemchr-avx2 \ ++ wmemchr-avx2-rtm \ ++ wmemchr-evex \ ++ wmemchr-evex-rtm \ ++ wmemchr-sse2 \ ++ wmemcmp-avx2-movbe \ ++ wmemcmp-avx2-movbe-rtm \ ++ wmemcmp-c \ ++ wmemcmp-evex-movbe \ ++ wmemcmp-sse4 \ ++ wmemcmp-ssse3 \ ++# sysdep_routines + endif + + ifeq ($(subdir),debug) +-sysdep_routines += memcpy_chk-nonshared mempcpy_chk-nonshared \ +- memmove_chk-nonshared memset_chk-nonshared \ +- wmemset_chk-nonshared ++sysdep_routines += \ ++ memcpy_chk-nonshared \ ++ memmove_chk-nonshared \ ++ mempcpy_chk-nonshared \ ++ memset_chk-nonshared \ ++ wmemset_chk-nonshared \ ++# sysdep_routines + endif +-- +GitLab + diff --git a/glibc-RHEL-15696-86.patch b/glibc-RHEL-15696-86.patch new file mode 100644 index 0000000..d4fb42f --- /dev/null +++ b/glibc-RHEL-15696-86.patch @@ -0,0 +1,36 @@ +From 0fb8800029d230b3711bf722b2a47db92d0e273f Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 10 Feb 2022 11:52:50 -0800 +Subject: [PATCH] x86-64: Remove bzero weak alias in SS2 memset +Content-type: text/plain; charset=UTF-8 + +commit 3d9f171bfb5325bd5f427e9fc386453358c6e840 +Author: H.J. Lu +Date: Mon Feb 7 05:55:15 2022 -0800 + + x86-64: Optimize bzero + +added the optimized bzero. Remove bzero weak alias in SS2 memset to +avoid undefined __bzero in memset-sse2-unaligned-erms. 
+--- + sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +index 8f579ad6..af51362b 100644 +--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +@@ -31,9 +31,7 @@ + # endif + + # undef weak_alias +-# define weak_alias(original, alias) \ +- .weak bzero; bzero = __bzero +- ++# define weak_alias(original, alias) + # undef strong_alias + # define strong_alias(ignored1, ignored2) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-87.patch b/glibc-RHEL-15696-87.patch new file mode 100644 index 0000000..4882613 --- /dev/null +++ b/glibc-RHEL-15696-87.patch @@ -0,0 +1,29 @@ +From bf92893a14ebc161b08b28acc24fa06ae6be19cb Mon Sep 17 00:00:00 2001 +From: Adhemerval Zanella +Date: Thu, 10 Feb 2022 11:23:24 -0300 +Subject: [PATCH] x86_64: Remove bcopy optimizations +Content-type: text/plain; charset=UTF-8 + +The symbols is not present in current POSIX specification and compiler +already generates memmove call. +--- + sysdeps/x86_64/multiarch/bcopy.S | 7 ------- + 1 file changed, 7 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/bcopy.S + +diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S +deleted file mode 100644 +index 639f02bd..00000000 +--- a/sysdeps/x86_64/multiarch/bcopy.S ++++ /dev/null +@@ -1,7 +0,0 @@ +-#include +- +- .text +-ENTRY(bcopy) +- xchg %rdi, %rsi +- jmp __libc_memmove /* Branch to IFUNC memmove. */ +-END(bcopy) +-- +GitLab + diff --git a/glibc-RHEL-15696-88.patch b/glibc-RHEL-15696-88.patch new file mode 100644 index 0000000..d075f80 --- /dev/null +++ b/glibc-RHEL-15696-88.patch @@ -0,0 +1,372 @@ +From a6fbf4d51e9ba8063c4f8331564892ead9c67344 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:16 -0500 +Subject: [PATCH] x86: Code cleanup in strchr-avx2 and comment justifying + branch +Content-type: text/plain; charset=UTF-8 + +Small code cleanup for size: -53 bytes. + +Add comment justifying using a branch to do NULL/non-null return. + +All string/memory tests pass and no regressions in benchtests. + +geometric_mean(N=20) of all benchmarks Original / New: 1.00 +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------ + 1 file changed, 107 insertions(+), 97 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index 5884726b..89dd2bf7 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -48,13 +48,13 @@ + # define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits +-ENTRY (STRCHR) ++ENTRY_P2ALIGN (STRCHR, 5) + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + VPBROADCAST %xmm0, %ymm0 +- vpxor %xmm9, %xmm9, %xmm9 ++ vpxor %xmm1, %xmm1, %xmm1 + + /* Check if we cross page boundary with one vector load. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax +@@ -62,37 +62,29 @@ ENTRY (STRCHR) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null byte. 
*/ +- vmovdqu (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqu (%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (%rdi, %rax), %CHAR_REG +- jne L(zero) +-# endif +- addq %rdi, %rax +- VZEROUPPER_RETURN +- +- /* .p2align 5 helps keep performance more consistent if ENTRY() +- alignment % 32 was either 16 or 0. As well this makes the +- alignment % 32 of the loop_4x_vec fixed which makes tuning it +- easier. */ +- .p2align 5 +-L(first_vec_x4): +- tzcntl %eax, %eax +- addq $(VEC_SIZE * 3 + 1), %rdi +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ ++ /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG ++ /* NB: Use a branch instead of cmovcc here. The expectation is ++ that with strchr the user will branch based on input being ++ null. Since this branch will be 100% predictive of the user ++ branch a branch miss here should save what otherwise would ++ be branch miss in the user code. Otherwise using a branch 1) ++ saves code size and 2) is faster in highly predictable ++ environments. */ + jne L(zero) + # endif + addq %rdi, %rax +- VZEROUPPER_RETURN ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + # ifndef USE_AS_STRCHRNUL + L(zero): +@@ -103,7 +95,8 @@ L(zero): + + .p2align 4 + L(first_vec_x1): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + incq %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -113,9 +106,10 @@ L(first_vec_x1): + addq %rdi, %rax + VZEROUPPER_RETURN + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x2): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + addq $(VEC_SIZE + 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -125,9 +119,10 @@ L(first_vec_x2): + addq %rdi, %rax + VZEROUPPER_RETURN + +- .p2align 4 ++ .p2align 4,, 8 + L(first_vec_x3): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + addq $(VEC_SIZE * 2 + 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -137,6 +132,21 @@ L(first_vec_x3): + addq %rdi, %rax + VZEROUPPER_RETURN + ++ .p2align 4,, 10 ++L(first_vec_x4): ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax ++ addq $(VEC_SIZE * 3 + 1), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ ++ + .p2align 4 + L(aligned_more): + /* Align data to VEC_SIZE - 1. This is the same number of +@@ -146,90 +156,92 @@ L(aligned_more): + L(cross_page_continue): + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. 
*/ +- vmovdqa 1(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa 1(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x1) + +- vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x4) +- /* Align data to VEC_SIZE * 4 - 1. */ +- addq $(VEC_SIZE * 4 + 1), %rdi +- andq $-(VEC_SIZE * 4), %rdi ++ /* Align data to VEC_SIZE * 4 - 1. */ ++ incq %rdi ++ orq $(VEC_SIZE * 4 - 1), %rdi + .p2align 4 + L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ +- vmovdqa (%rdi), %ymm5 +- vmovdqa (VEC_SIZE)(%rdi), %ymm6 +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 ++ vmovdqa 1(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7 + + /* Leaves only CHARS matching esi as 0. */ +- vpxor %ymm5, %ymm0, %ymm1 + vpxor %ymm6, %ymm0, %ymm2 + vpxor %ymm7, %ymm0, %ymm3 +- vpxor %ymm8, %ymm0, %ymm4 + +- VPMINU %ymm1, %ymm5, %ymm1 + VPMINU %ymm2, %ymm6, %ymm2 + VPMINU %ymm3, %ymm7, %ymm3 +- VPMINU %ymm4, %ymm8, %ymm4 + +- VPMINU %ymm1, %ymm2, %ymm5 +- VPMINU %ymm3, %ymm4, %ymm6 ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7 ++ ++ vpxor %ymm6, %ymm0, %ymm4 ++ vpxor %ymm7, %ymm0, %ymm5 ++ ++ VPMINU %ymm4, %ymm6, %ymm4 ++ VPMINU %ymm5, %ymm7, %ymm5 + +- VPMINU %ymm5, %ymm6, %ymm6 ++ VPMINU %ymm2, %ymm3, %ymm6 ++ VPMINU %ymm4, %ymm5, %ymm7 + +- VPCMPEQ %ymm6, %ymm9, %ymm6 +- vpmovmskb %ymm6, %ecx ++ VPMINU %ymm6, %ymm7, %ymm7 ++ ++ VPCMPEQ %ymm7, %ymm1, %ymm7 ++ vpmovmskb %ymm7, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) + +- +- VPCMPEQ %ymm1, %ymm9, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + +- VPCMPEQ %ymm5, %ymm9, %ymm2 +- vpmovmskb %ymm2, %eax ++ VPCMPEQ %ymm3, %ymm1, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(last_vec_x1) + +- VPCMPEQ %ymm3, %ymm9, %ymm3 +- vpmovmskb %ymm3, %eax ++ VPCMPEQ %ymm4, %ymm1, %ymm4 ++ vpmovmskb %ymm4, %eax + /* rcx has combined result from all 4 VEC. It will only be used + if the first 3 other VEC all did not contain a match. */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax +- subq $(VEC_SIZE * 2), %rdi ++ subq $(VEC_SIZE * 2 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. 
*/ + cmp (%rdi, %rax), %CHAR_REG +@@ -239,10 +251,11 @@ L(loop_4x_vec): + VZEROUPPER_RETURN + + +- .p2align 4 ++ .p2align 4,, 10 + L(last_vec_x0): +- tzcntl %eax, %eax +- addq $-(VEC_SIZE * 4), %rdi ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax ++ addq $-(VEC_SIZE * 4 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -251,16 +264,11 @@ L(last_vec_x0): + addq %rdi, %rax + VZEROUPPER_RETURN + +-# ifndef USE_AS_STRCHRNUL +-L(zero_end): +- xorl %eax, %eax +- VZEROUPPER_RETURN +-# endif + +- .p2align 4 ++ .p2align 4,, 10 + L(last_vec_x1): + tzcntl %eax, %eax +- subq $(VEC_SIZE * 3), %rdi ++ subq $(VEC_SIZE * 3 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -269,18 +277,23 @@ L(last_vec_x1): + addq %rdi, %rax + VZEROUPPER_RETURN + ++# ifndef USE_AS_STRCHRNUL ++L(zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN ++# endif + + /* Cold case for crossing page with first load. */ +- .p2align 4 ++ .p2align 4,, 8 + L(cross_page_boundary): + movq %rdi, %rdx + /* Align rdi to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi +- vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod edx. */ + sarxl %edx, %eax, %eax +@@ -291,13 +304,10 @@ L(cross_page_boundary): + xorl %ecx, %ecx + /* Found CHAR or the null byte. */ + cmp (%rdx, %rax), %CHAR_REG +- leaq (%rdx, %rax), %rax +- cmovne %rcx, %rax +-# else +- addq %rdx, %rax ++ jne L(zero_end) + # endif +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ addq %rdx, %rax ++ VZEROUPPER_RETURN + + END (STRCHR) +-# endif ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-89.patch b/glibc-RHEL-15696-89.patch new file mode 100644 index 0000000..45ee946 --- /dev/null +++ b/glibc-RHEL-15696-89.patch @@ -0,0 +1,343 @@ +From ec285ea90415458225623ddc0492ae3f705af043 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:18 -0500 +Subject: [PATCH] x86: Code cleanup in strchr-evex and comment justifying + branch +Content-type: text/plain; charset=UTF-8 + +Small code cleanup for size: -81 bytes. + +Add comment justifying using a branch to do NULL/non-null return. + +All string/memory tests pass and no regressions in benchtests. + +geometric_mean(N=20) of all benchmarks New / Original: .985 +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++----------- + 1 file changed, 80 insertions(+), 66 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S +index 7f9d4ee4..0b49e0ac 100644 +--- a/sysdeps/x86_64/multiarch/strchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strchr-evex.S +@@ -30,6 +30,7 @@ + # ifdef USE_AS_WCSCHR + # define VPBROADCAST vpbroadcastd + # define VPCMP vpcmpd ++# define VPTESTN vptestnmd + # define VPMINU vpminud + # define CHAR_REG esi + # define SHIFT_REG ecx +@@ -37,6 +38,7 @@ + # else + # define VPBROADCAST vpbroadcastb + # define VPCMP vpcmpb ++# define VPTESTN vptestnmb + # define VPMINU vpminub + # define CHAR_REG sil + # define SHIFT_REG edx +@@ -61,13 +63,11 @@ + # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section .text.evex,"ax",@progbits +-ENTRY (STRCHR) ++ENTRY_P2ALIGN (STRCHR, 5) + /* Broadcast CHAR to YMM0. */ + VPBROADCAST %esi, %YMM0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- + /* Check if we cross page boundary with one vector load. + Otherwise it is safe to use an unaligned load. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax +@@ -81,49 +81,35 @@ ENTRY (STRCHR) + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ /* NB: Use a branch instead of cmovcc here. The expectation is ++ that with strchr the user will branch based on input being ++ null. Since this branch will be 100% predictive of the user ++ branch a branch miss here should save what otherwise would ++ be branch miss in the user code. Otherwise using a branch 1) ++ saves code size and 2) is faster in highly predictable ++ environments. */ ++ jne L(zero) ++# endif + # ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. + */ + leaq (%rdi, %rax, CHAR_SIZE), %rax + # else + addq %rdi, %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (%rax), %CHAR_REG +- jne L(zero) + # endif + ret + +- /* .p2align 5 helps keep performance more consistent if ENTRY() +- alignment % 32 was either 16 or 0. As well this makes the +- alignment % 32 of the loop_4x_vec fixed which makes tuning it +- easier. */ +- .p2align 5 +-L(first_vec_x3): +- tzcntl %eax, %eax +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +- jne L(zero) +-# endif +- /* NB: Multiply sizeof char type (1 or 4) to get the number of +- bytes. */ +- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax +- ret + +-# ifndef USE_AS_STRCHRNUL +-L(zero): +- xorl %eax, %eax +- ret +-# endif + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x4): + # ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ +@@ -144,9 +130,18 @@ L(first_vec_x4): + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + ret + ++# ifndef USE_AS_STRCHRNUL ++L(zero): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ + .p2align 4 + L(first_vec_x1): +- tzcntl %eax, %eax ++ /* Use bsf here to save 1-byte keeping keeping the block in 1x ++ fetch block. eax guranteed non-zero. */ ++ bsfl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. 
*/ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +@@ -158,7 +153,7 @@ L(first_vec_x1): + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x2): + # ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ +@@ -179,6 +174,21 @@ L(first_vec_x2): + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + ++ .p2align 4,, 10 ++L(first_vec_x3): ++ /* Use bsf here to save 1-byte keeping keeping the block in 1x ++ fetch block. eax guranteed non-zero. */ ++ bsfl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero) ++# endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ + .p2align 4 + L(aligned_more): + /* Align data to VEC_SIZE. */ +@@ -195,7 +205,7 @@ L(cross_page_continue): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) +@@ -206,7 +216,7 @@ L(cross_page_continue): + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMM1, %YMMZERO, %k1 ++ VPTESTN %YMM1, %YMM1, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x2) + +@@ -215,7 +225,7 @@ L(cross_page_continue): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) +@@ -224,7 +234,7 @@ L(cross_page_continue): + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMM1, %YMMZERO, %k1 ++ VPTESTN %YMM1, %YMM1, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x4) + +@@ -265,33 +275,33 @@ L(loop_4x_vec): + VPMINU %YMM3, %YMM4, %YMM4 + VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} + +- VPCMP $0, %YMMZERO, %YMM4, %k1 ++ VPTESTN %YMM4, %YMM4, %k1 + kmovd %k1, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) + +- VPCMP $0, %YMMZERO, %YMM1, %k0 ++ VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x1) + +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x2) + +- VPCMP $0, %YMMZERO, %YMM3, %k0 ++ VPTESTN %YMM3, %YMM3, %k0 + kmovd %k0, %eax + /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ + # ifdef USE_AS_WCSCHR + sall $8, %ecx + orl %ecx, %eax +- tzcntl %eax, %eax ++ bsfl %eax, %eax + # else + salq $32, %rcx + orq %rcx, %rax +- tzcntq %rax, %rax ++ bsfq %rax, %rax + # endif + # ifndef USE_AS_STRCHRNUL + /* Check if match was CHAR or null. */ +@@ -303,28 +313,28 @@ L(loop_4x_vec): + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +-# ifndef USE_AS_STRCHRNUL +-L(zero_end): +- xorl %eax, %eax +- ret ++ .p2align 4,, 8 ++L(last_vec_x1): ++ bsfl %eax, %eax ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. ++ */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rdi, %rax + # endif + +- .p2align 4 +-L(last_vec_x1): +- tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Check if match was null. 
*/ +- cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ cmp (%rax), %CHAR_REG + jne L(zero_end) + # endif +- /* NB: Multiply sizeof char type (1 or 4) to get the number of +- bytes. */ +- leaq (%rdi, %rax, CHAR_SIZE), %rax ++ + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(last_vec_x2): +- tzcntl %eax, %eax ++ bsfl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Check if match was null. */ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +@@ -336,7 +346,7 @@ L(last_vec_x2): + ret + + /* Cold case for crossing page with first load. */ +- .p2align 4 ++ .p2align 4,, 8 + L(cross_page_boundary): + movq %rdi, %rdx + /* Align rdi. */ +@@ -346,9 +356,9 @@ L(cross_page_boundary): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax +- /* Remove the leading bits. */ ++ /* Remove the leading bits. */ + # ifdef USE_AS_WCSCHR + movl %edx, %SHIFT_REG + /* NB: Divide shift count by 4 since each bit in K1 represent 4 +@@ -360,20 +370,24 @@ L(cross_page_boundary): + /* If eax is zero continue. */ + testl %eax, %eax + jz L(cross_page_continue) +- tzcntl %eax, %eax +-# ifndef USE_AS_STRCHRNUL +- /* Check to see if match was CHAR or null. */ +- cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG +- jne L(zero_end) +-# endif ++ bsfl %eax, %eax ++ + # ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of + bytes. */ + leaq (%rdx, %rax, CHAR_SIZE), %rax + # else + addq %rdx, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if match was CHAR or null. */ ++ cmp (%rax), %CHAR_REG ++ je L(cross_page_ret) ++L(zero_end): ++ xorl %eax, %eax ++L(cross_page_ret): + # endif + ret + + END (STRCHR) +-# endif ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-9.patch b/glibc-RHEL-15696-9.patch new file mode 100644 index 0000000..5aa3e7b --- /dev/null +++ b/glibc-RHEL-15696-9.patch @@ -0,0 +1,206 @@ +From 3f635fb43389b54f682fc9ed2acc0b2aaf4a923d Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 4 Feb 2019 06:31:01 -0800 +Subject: [PATCH] x86-64 memcmp: Use unsigned Jcc instructions on size [BZ + #24155] +Content-type: text/plain; charset=UTF-8 + +Since the size argument is unsigned. we should use unsigned Jcc +instructions, instead of signed, to check size. + +Tested on x86-64 and x32, with and without --disable-multi-arch. + + [BZ #24155] + CVE-2019-7309 + * NEWS: Updated for CVE-2019-7309. + * sysdeps/x86_64/memcmp.S: Use RDX_LP for size. Clear the + upper 32 bits of RDX register for x32. Use unsigned Jcc + instructions, instead of signed. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp-2. + * sysdeps/x86_64/x32/tst-size_t-memcmp-2.c: New test. +--- + sysdeps/x86_64/memcmp.S | 20 +++--- + sysdeps/x86_64/x32/Makefile | 3 +- + sysdeps/x86_64/x32/tst-size_t-memcmp-2.c | 79 ++++++++++++++++++++++++ + 3 files changed, 93 insertions(+), 9 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp-2.c + +Conflics: + ChangeLog + (removed) + NEWS + (removed) + +diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S +index bcb4a2e8..45918d37 100644 +--- a/sysdeps/x86_64/memcmp.S ++++ b/sysdeps/x86_64/memcmp.S +@@ -21,14 +21,18 @@ + + .text + ENTRY (memcmp) +- test %rdx, %rdx ++#ifdef __ILP32__ ++ /* Clear the upper 32 bits. 
*/ ++ movl %edx, %edx ++#endif ++ test %RDX_LP, %RDX_LP + jz L(finz) + cmpq $1, %rdx +- jle L(finr1b) ++ jbe L(finr1b) + subq %rdi, %rsi + movq %rdx, %r10 + cmpq $32, %r10 +- jge L(gt32) ++ jae L(gt32) + /* Handle small chunks and last block of less than 32 bytes. */ + L(small): + testq $1, %r10 +@@ -156,7 +160,7 @@ L(A32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi +- jge L(mt16) ++ jae L(mt16) + /* Pre-unroll to be ready for unrolled 64B loop. */ + testq $32, %rdi + jz L(A64) +@@ -178,7 +182,7 @@ L(A64): + movq %r11, %r10 + andq $-64, %r10 + cmpq %r10, %rdi +- jge L(mt32) ++ jae L(mt32) + + L(A64main): + movdqu (%rdi,%rsi), %xmm0 +@@ -216,7 +220,7 @@ L(mt32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi +- jge L(mt16) ++ jae L(mt16) + + L(A32main): + movdqu (%rdi,%rsi), %xmm0 +@@ -254,7 +258,7 @@ L(ATR): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi +- jge L(mt16) ++ jae L(mt16) + testq $16, %rdi + jz L(ATR32) + +@@ -325,7 +329,7 @@ L(ATR64main): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi +- jge L(mt16) ++ jae L(mt16) + + L(ATR32res): + movdqa (%rdi,%rsi), %xmm0 +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 1557724b..87489565 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -8,7 +8,8 @@ endif + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ + tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ +- tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen ++ tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen \ ++ tst-size_t-memcmp-2 + endif + + ifeq ($(subdir),wcsmbs) +diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c +new file mode 100644 +index 00000000..d8ae1a08 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c +@@ -0,0 +1,79 @@ ++/* Test memcmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#define TEST_MAIN ++#ifdef WIDE ++# define TEST_NAME "wmemcmp" ++#else ++# define TEST_NAME "memcmp" ++#endif ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++# include ++ ++# define MEMCMP wmemcmp ++# define CHAR wchar_t ++#else ++# define MEMCMP memcmp ++# define CHAR char ++#endif ++ ++IMPL (MEMCMP, 1) ++ ++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); ++ ++static int ++__attribute__ ((noinline, noclone)) ++do_memcmp (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ memcpy (buf1, buf2, page_size); ++ ++ CHAR *p = (CHAR *) buf1; ++ p[page_size / sizeof (CHAR) - 1] = (CHAR) 1; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ int res = do_memcmp (dest, src); ++ if (res >= 0) ++ { ++ error (0, 0, "Wrong result in function %s: %i >= 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +-- +GitLab + diff --git a/glibc-RHEL-15696-90.patch b/glibc-RHEL-15696-90.patch new file mode 100644 index 0000000..11835aa --- /dev/null +++ b/glibc-RHEL-15696-90.patch @@ -0,0 +1,147 @@ +From 30d627d477d7255345a4b713cf352ac32d644d61 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:22 -0500 +Subject: [PATCH] x86: Optimize strcspn and strpbrk in strcspn-c.c +Content-type: text/plain; charset=UTF-8 + +Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of +_mm_cmpistri. Also change offset to unsigned to avoid unnecessary +sign extensions. + +geometric_mean(N=20) of all benchmarks that dont fallback on +sse2/strlen; New / Original: .928 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++--------------- + 1 file changed, 37 insertions(+), 46 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c +index 857af104..6cce4296 100644 +--- a/sysdeps/x86_64/multiarch/strcspn-c.c ++++ b/sysdeps/x86_64/multiarch/strcspn-c.c +@@ -85,83 +85,74 @@ STRCSPN_SSE42 (const char *s, const char *a) + RETURN (NULL, strlen (s)); + + const char *aligned; +- __m128i mask; +- int offset = (int) ((size_t) a & 15); ++ __m128i mask, maskz, zero; ++ unsigned int maskz_bits; ++ unsigned int offset = (unsigned int) ((size_t) a & 15); ++ zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); +- +- mask = __m128i_shift_right (mask0, offset); ++ maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16 - offset) +- { +- /* There is no NULL terminator. */ +- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); +- int index = _mm_cmpistri (mask1, mask1, 0x3a); +- length += index; +- +- /* Don't use SSE4.2 if the length of A > 16. */ +- if (length > 16) +- return STRCSPN_SSE2 (s, a); +- +- if (index != 0) +- { +- /* Combine mask0 and mask1. We could play games with +- palignr, but frankly this data should be in L1 now +- so do the merge via an unaligned load. 
*/ +- mask = _mm_loadu_si128 ((__m128i *) a); +- } +- } ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) ++ { ++ mask = __m128i_shift_right (mask0, offset); ++ offset = (unsigned int) ((size_t) s & 15); ++ if (offset) ++ goto start_unaligned; ++ ++ aligned = s; ++ goto start_loop; ++ } + } +- else +- { +- /* A is aligned. */ +- mask = _mm_load_si128 ((__m128i *) a); + +- /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16) +- { +- /* There is no NULL terminator. Don't use SSE4.2 if the length +- of A > 16. */ +- if (a[16] != 0) +- return STRCSPN_SSE2 (s, a); +- } ++ /* A is aligned. */ ++ mask = _mm_loadu_si128 ((__m128i *) a); ++ /* Find where the NULL terminator is. */ ++ maskz = _mm_cmpeq_epi8 (mask, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz); ++ if (maskz_bits == 0) ++ { ++ /* There is no NULL terminator. Don't use SSE4.2 if the length ++ of A > 16. */ ++ if (a[16] != 0) ++ return STRCSPN_SSE2 (s, a); + } + +- offset = (int) ((size_t) s & 15); ++ aligned = s; ++ offset = (unsigned int) ((size_t) s & 15); + if (offset != 0) + { ++ start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + value = __m128i_shift_right (value, offset); + +- int length = _mm_cmpistri (mask, value, 0x2); ++ unsigned int length = _mm_cmpistri (mask, value, 0x2); + /* No need to check ZFlag since ZFlag is always 1. */ +- int cflag = _mm_cmpistrc (mask, value, 0x2); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); + if (cflag) + RETURN ((char *) (s + length), length); + /* Find where the NULL terminator is. */ +- int index = _mm_cmpistri (value, value, 0x3a); ++ unsigned int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + RETURN (NULL, index); + aligned += 16; + } +- else +- aligned = s; + ++start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); +- int index = _mm_cmpistri (mask, value, 0x2); +- int cflag = _mm_cmpistrc (mask, value, 0x2); +- int zflag = _mm_cmpistrz (mask, value, 0x2); ++ unsigned int index = _mm_cmpistri (mask, value, 0x2); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); ++ unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) +-- +GitLab + diff --git a/glibc-RHEL-15696-91.patch b/glibc-RHEL-15696-91.patch new file mode 100644 index 0000000..de3c8ec --- /dev/null +++ b/glibc-RHEL-15696-91.patch @@ -0,0 +1,147 @@ +From 412d10343168b05b8cf6c3683457cf9711d28046 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:24 -0500 +Subject: [PATCH] x86: Optimize strspn in strspn-c.c +Content-type: text/plain; charset=UTF-8 + +Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of +_mm_cmpistri. Also change offset to unsigned to avoid unnecessary +sign extensions. + +geometric_mean(N=20) of all benchmarks that dont fallback on +sse2; New / Original: .901 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++---------------- + 1 file changed, 39 insertions(+), 47 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c +index 4554cff0..87c5e4bf 100644 +--- a/sysdeps/x86_64/multiarch/strspn-c.c ++++ b/sysdeps/x86_64/multiarch/strspn-c.c +@@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a) + return 0; + + const char *aligned; +- __m128i mask; +- int offset = (int) ((size_t) a & 15); ++ __m128i mask, maskz, zero; ++ unsigned int maskz_bits; ++ unsigned int offset = (int) ((size_t) a & 15); ++ zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); +- +- mask = __m128i_shift_right (mask0, offset); ++ maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16 - offset) +- { +- /* There is no NULL terminator. */ +- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); +- int index = _mm_cmpistri (mask1, mask1, 0x3a); +- length += index; +- +- /* Don't use SSE4.2 if the length of A > 16. */ +- if (length > 16) +- return __strspn_sse2 (s, a); +- +- if (index != 0) +- { +- /* Combine mask0 and mask1. We could play games with +- palignr, but frankly this data should be in L1 now +- so do the merge via an unaligned load. */ +- mask = _mm_loadu_si128 ((__m128i *) a); +- } +- } ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) ++ { ++ mask = __m128i_shift_right (mask0, offset); ++ offset = (unsigned int) ((size_t) s & 15); ++ if (offset) ++ goto start_unaligned; ++ ++ aligned = s; ++ goto start_loop; ++ } + } +- else +- { +- /* A is aligned. */ +- mask = _mm_load_si128 ((__m128i *) a); + +- /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16) +- { +- /* There is no NULL terminator. Don't use SSE4.2 if the length +- of A > 16. */ +- if (a[16] != 0) +- return __strspn_sse2 (s, a); +- } ++ /* A is aligned. */ ++ mask = _mm_loadu_si128 ((__m128i *) a); ++ ++ /* Find where the NULL terminator is. */ ++ maskz = _mm_cmpeq_epi8 (mask, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz); ++ if (maskz_bits == 0) ++ { ++ /* There is no NULL terminator. Don't use SSE4.2 if the length ++ of A > 16. */ ++ if (a[16] != 0) ++ return __strspn_sse2 (s, a); + } ++ aligned = s; ++ offset = (unsigned int) ((size_t) s & 15); + +- offset = (int) ((size_t) s & 15); + if (offset != 0) + { ++ start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); ++ __m128i adj_value = __m128i_shift_right (value, offset); + +- value = __m128i_shift_right (value, offset); +- +- int length = _mm_cmpistri (mask, value, 0x12); ++ unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. 
*/ +- int index = _mm_cmpistri (value, value, 0x3a); +- if (index < 16 - offset) ++ maskz = _mm_cmpeq_epi8 (value, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) + return length; + aligned += 16; + } +- else +- aligned = s; + ++start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); +- int index = _mm_cmpistri (mask, value, 0x12); +- int cflag = _mm_cmpistrc (mask, value, 0x12); ++ unsigned int index = _mm_cmpistri (mask, value, 0x12); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; +-- +GitLab + diff --git a/glibc-RHEL-15696-92.patch b/glibc-RHEL-15696-92.patch new file mode 100644 index 0000000..f19914e --- /dev/null +++ b/glibc-RHEL-15696-92.patch @@ -0,0 +1,175 @@ +From fe28e7d9d9535ebab4081d195c553b4fbf39d9ae Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:26 -0500 +Subject: [PATCH] x86: Remove strcspn-sse2.S and use the generic implementation +Content-type: text/plain; charset=UTF-8 + +The generic implementation is faster. + +geometric_mean(N=20) of all benchmarks New / Original: .678 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + .../{strcspn-sse2.S => strcspn-sse2.c} | 6 +- + sysdeps/x86_64/strcspn.S | 122 ------------------ + 2 files changed, 3 insertions(+), 125 deletions(-) + rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strcspn-sse2.c} (89%) + delete mode 100644 sysdeps/x86_64/strcspn.S + +Conflicts: + sysdeps/x86_64/multiarch/strcspn-sse2.S + (copyright header) + +diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c +similarity index 89% +rename from sysdeps/x86_64/multiarch/strcspn-sse2.S +rename to sysdeps/x86_64/multiarch/strcspn-sse2.c +index 8a0c69d7..32debee4 100644 +--- a/sysdeps/x86_64/multiarch/strcspn-sse2.S ++++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c +@@ -19,10 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strcspn __strcspn_sse2 ++# define STRCSPN __strcspn_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strcspn) ++# define libc_hidden_builtin_def(STRCSPN) + #endif + +-#include ++#include +diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S +deleted file mode 100644 +index 7f9202d6..00000000 +--- a/sysdeps/x86_64/strcspn.S ++++ /dev/null +@@ -1,122 +0,0 @@ +-/* strcspn (str, ss) -- Return the length of the initial segment of STR +- which contains no characters from SS. +- For AMD x86-64. +- Copyright (C) 1994-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- Contributed by Ulrich Drepper . +- Bug fixes by Alan Modra . +- Adopted for x86-64 by Andreas Jaeger . +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +-#include "asm-syntax.h" +- +- .text +-ENTRY (strcspn) +- +- movq %rdi, %rdx /* Save SRC. 
*/ +- +- /* First we create a table with flags for all possible characters. +- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are +- supported by the C string functions we have 256 characters. +- Before inserting marks for the stop characters we clear the whole +- table. */ +- movq %rdi, %r8 /* Save value. */ +- subq $256, %rsp /* Make space for 256 bytes. */ +- cfi_adjust_cfa_offset(256) +- movl $32, %ecx /* 32*8 bytes = 256 bytes. */ +- movq %rsp, %rdi +- xorl %eax, %eax /* We store 0s. */ +- cld +- rep +- stosq +- +- movq %rsi, %rax /* Setup skipset. */ +- +-/* For understanding the following code remember that %rcx == 0 now. +- Although all the following instruction only modify %cl we always +- have a correct zero-extended 64-bit value in %rcx. */ +- +- .p2align 4 +-L(2): movb (%rax), %cl /* get byte from skipset */ +- testb %cl, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 1(%rax), %cl /* get byte from skipset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 2(%rax), %cl /* get byte from skipset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 3(%rax), %cl /* get byte from skipset */ +- addq $4, %rax /* increment skipset pointer */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- testb $0xff, %cl /* is NUL char? */ +- jnz L(2) /* no => process next dword from skipset */ +- +-L(1): leaq -4(%rdx), %rax /* prepare loop */ +- +- /* We use a neat trick for the following loop. Normally we would +- have to test for two termination conditions +- 1. a character in the skipset was found +- and +- 2. the end of the string was found +- But as a sign that the character is in the skipset we store its +- value in the table. But the value of NUL is NUL so the loop +- terminates for NUL in every case. */ +- +- .p2align 4 +-L(3): addq $4, %rax /* adjust pointer for full loop round */ +- +- movb (%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- je L(4) /* yes => return */ +- +- movb 1(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- je L(5) /* yes => return */ +- +- movb 2(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(6) /* yes => return */ +- +- movb 3(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jne L(3) /* no => start loop again */ +- +- incq %rax /* adjust pointer */ +-L(6): incq %rax +-L(5): incq %rax +- +-L(4): addq $256, %rsp /* remove skipset */ +- cfi_adjust_cfa_offset(-256) +-#ifdef USE_AS_STRPBRK +- xorl %edx,%edx +- orb %cl, %cl /* was last character NUL? 
*/ +- cmovzq %rdx, %rax /* Yes: return NULL */ +-#else +- subq %rdx, %rax /* we have to return the number of valid +- characters, so compute distance to first +- non-valid character */ +-#endif +- ret +-END (strcspn) +-libc_hidden_builtin_def (strcspn) +-- +GitLab + diff --git a/glibc-RHEL-15696-93.patch b/glibc-RHEL-15696-93.patch new file mode 100644 index 0000000..45c8527 --- /dev/null +++ b/glibc-RHEL-15696-93.patch @@ -0,0 +1,55 @@ +From 653358535280a599382cb6c77538a187dac6a87f Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:27 -0500 +Subject: [PATCH] x86: Remove strpbrk-sse2.S and use the generic implementation +Content-type: text/plain; charset=UTF-8 + +The generic implementation is faster (see strcspn commit). + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + .../x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} | 7 +++---- + sysdeps/x86_64/strpbrk.S | 3 --- + 2 files changed, 3 insertions(+), 7 deletions(-) + rename sysdeps/x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} (87%) + delete mode 100644 sysdeps/x86_64/strpbrk.S + +Conflicts: + sysdeps/x86_64/multiarch/strpbrk-sse2.S + (copyright header) + +diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c +similarity index 87% +rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S +rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c +index 3c6a74db..ec0b6fda 100644 +--- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S ++++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c +@@ -19,11 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strcspn __strpbrk_sse2 ++# define STRPBRK __strpbrk_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strpbrk) ++# define libc_hidden_builtin_def(STRPBRK) + #endif + +-#define USE_AS_STRPBRK +-#include ++#include +diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S +deleted file mode 100644 +index 21888a5b..00000000 +--- a/sysdeps/x86_64/strpbrk.S ++++ /dev/null +@@ -1,3 +0,0 @@ +-#define strcspn strpbrk +-#define USE_AS_STRPBRK +-#include +-- +GitLab + diff --git a/glibc-RHEL-15696-94.patch b/glibc-RHEL-15696-94.patch new file mode 100644 index 0000000..2fa86da --- /dev/null +++ b/glibc-RHEL-15696-94.patch @@ -0,0 +1,168 @@ +From 9c8a6ad620b49a27120ecdd7049c26bf05900397 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:29 -0500 +Subject: [PATCH] x86: Remove strspn-sse2.S and use the generic implementation +Content-type: text/plain; charset=UTF-8 + +The generic implementation is faster. + +geometric_mean(N=20) of all benchmarks New / Original: .710 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu +--- + .../{strspn-sse2.S => strspn-sse2.c} | 6 +- + sysdeps/x86_64/strspn.S | 115 ------------------ + 2 files changed, 3 insertions(+), 118 deletions(-) + rename sysdeps/x86_64/multiarch/{strspn-sse2.S => strspn-sse2.c} (89%) + delete mode 100644 sysdeps/x86_64/strspn.S + +Conflicts: + sysdeps/x86_64/multiarch/strspn-sse2.c + (copyright header) + +diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c +similarity index 89% +rename from sysdeps/x86_64/multiarch/strspn-sse2.S +rename to sysdeps/x86_64/multiarch/strspn-sse2.c +index 4686cdd5..ab0dae40 100644 +--- a/sysdeps/x86_64/multiarch/strspn-sse2.S ++++ b/sysdeps/x86_64/multiarch/strspn-sse2.c +@@ -19,10 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strspn __strspn_sse2 ++# define STRSPN __strspn_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strspn) ++# define libc_hidden_builtin_def(STRSPN) + #endif + +-#include ++#include +diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S +deleted file mode 100644 +index 635f1bc6..00000000 +--- a/sysdeps/x86_64/strspn.S ++++ /dev/null +@@ -1,115 +0,0 @@ +-/* strspn (str, ss) -- Return the length of the initial segment of STR +- which contains only characters from SS. +- For AMD x86-64. +- Copyright (C) 1994-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- Contributed by Ulrich Drepper . +- Bug fixes by Alan Modra . +- Adopted for x86-64 by Andreas Jaeger . +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +- +- .text +-ENTRY (strspn) +- +- movq %rdi, %rdx /* Save SRC. */ +- +- /* First we create a table with flags for all possible characters. +- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are +- supported by the C string functions we have 256 characters. +- Before inserting marks for the stop characters we clear the whole +- table. */ +- movq %rdi, %r8 /* Save value. */ +- subq $256, %rsp /* Make space for 256 bytes. */ +- cfi_adjust_cfa_offset(256) +- movl $32, %ecx /* 32*8 bytes = 256 bytes. */ +- movq %rsp, %rdi +- xorl %eax, %eax /* We store 0s. */ +- cld +- rep +- stosq +- +- movq %rsi, %rax /* Setup stopset. */ +- +-/* For understanding the following code remember that %rcx == 0 now. +- Although all the following instruction only modify %cl we always +- have a correct zero-extended 64-bit value in %rcx. */ +- +- .p2align 4 +-L(2): movb (%rax), %cl /* get byte from stopset */ +- testb %cl, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 1(%rax), %cl /* get byte from stopset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 2(%rax), %cl /* get byte from stopset */ +- testb $0xff, %cl /* is NUL char? 
*/ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 3(%rax), %cl /* get byte from stopset */ +- addq $4, %rax /* increment stopset pointer */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- testb $0xff, %cl /* is NUL char? */ +- jnz L(2) /* no => process next dword from stopset */ +- +-L(1): leaq -4(%rdx), %rax /* prepare loop */ +- +- /* We use a neat trick for the following loop. Normally we would +- have to test for two termination conditions +- 1. a character in the stopset was found +- and +- 2. the end of the string was found +- But as a sign that the character is in the stopset we store its +- value in the table. But the value of NUL is NUL so the loop +- terminates for NUL in every case. */ +- +- .p2align 4 +-L(3): addq $4, %rax /* adjust pointer for full loop round */ +- +- movb (%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(4) /* no => return */ +- +- movb 1(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(5) /* no => return */ +- +- movb 2(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(6) /* no => return */ +- +- movb 3(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jnz L(3) /* yes => start loop again */ +- +- incq %rax /* adjust pointer */ +-L(6): incq %rax +-L(5): incq %rax +- +-L(4): addq $256, %rsp /* remove stopset */ +- cfi_adjust_cfa_offset(-256) +- subq %rdx, %rax /* we have to return the number of valid +- characters, so compute distance to first +- non-valid character */ +- ret +-END (strspn) +-libc_hidden_builtin_def (strspn) +-- +GitLab + diff --git a/glibc-RHEL-15696-95.patch b/glibc-RHEL-15696-95.patch new file mode 100644 index 0000000..cf21b96 --- /dev/null +++ b/glibc-RHEL-15696-95.patch @@ -0,0 +1,122 @@ +From 670b54bc585ea4a94f3b2e9272ba44aa6b730b73 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:36 -0500 +Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S +Content-type: text/plain; charset=UTF-8 + +Slightly faster method of doing TOLOWER that saves an +instruction. + +Also replace the hard coded 5-byte no with .p2align 4. On builds with +CET enabled this misaligned entry to strcasecmp. + +geometric_mean(N=40) of all benchmarks New / Original: .894 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++---------------------- + 1 file changed, 29 insertions(+), 35 deletions(-) + +diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S +index aa6df898..f454ce5b 100644 +--- a/sysdeps/x86_64/strcmp.S ++++ b/sysdeps/x86_64/strcmp.S +@@ -78,9 +78,8 @@ ENTRY2 (__strcasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END2 (__strcasecmp) + # ifndef NO_NOLOCALE_ALIAS + weak_alias (__strcasecmp, strcasecmp) +@@ -97,9 +96,8 @@ ENTRY2 (__strncasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). 
*/ ++ .p2align 4 + END2 (__strncasecmp) + # ifndef NO_NOLOCALE_ALIAS + weak_alias (__strncasecmp, strncasecmp) +@@ -149,22 +147,22 @@ ENTRY (STRCMP) + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +-.Lbelowupper: +- .quad 0x4040404040404040 +- .quad 0x4040404040404040 +-.Ltopupper: +- .quad 0x5b5b5b5b5b5b5b5b +- .quad 0x5b5b5b5b5b5b5b5b +-.Ltouppermask: ++.Llcase_min: ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++.Llcase_max: ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++.Lcase_add: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous +- movdqa .Lbelowupper(%rip), %xmm5 +-# define UCLOW_reg %xmm5 +- movdqa .Ltopupper(%rip), %xmm6 +-# define UCHIGH_reg %xmm6 +- movdqa .Ltouppermask(%rip), %xmm7 +-# define LCQWORD_reg %xmm7 ++ movdqa .Llcase_min(%rip), %xmm5 ++# define LCASE_MIN_reg %xmm5 ++ movdqa .Llcase_max(%rip), %xmm6 ++# define LCASE_MAX_reg %xmm6 ++ movdqa .Lcase_add(%rip), %xmm7 ++# define CASE_ADD_reg %xmm7 + #endif + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ +@@ -175,22 +173,18 @@ ENTRY (STRCMP) + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +-# define TOLOWER(reg1, reg2) \ +- movdqa reg1, %xmm8; \ +- movdqa UCHIGH_reg, %xmm9; \ +- movdqa reg2, %xmm10; \ +- movdqa UCHIGH_reg, %xmm11; \ +- pcmpgtb UCLOW_reg, %xmm8; \ +- pcmpgtb reg1, %xmm9; \ +- pcmpgtb UCLOW_reg, %xmm10; \ +- pcmpgtb reg2, %xmm11; \ +- pand %xmm9, %xmm8; \ +- pand %xmm11, %xmm10; \ +- pand LCQWORD_reg, %xmm8; \ +- pand LCQWORD_reg, %xmm10; \ +- por %xmm8, reg1; \ +- por %xmm10, reg2 +- TOLOWER (%xmm1, %xmm2) ++# define TOLOWER(reg1, reg2) \ ++ movdqa LCASE_MIN_reg, %xmm8; \ ++ movdqa LCASE_MIN_reg, %xmm9; \ ++ paddb reg1, %xmm8; \ ++ paddb reg2, %xmm9; \ ++ pcmpgtb LCASE_MAX_reg, %xmm8; \ ++ pcmpgtb LCASE_MAX_reg, %xmm9; \ ++ pandn CASE_ADD_reg, %xmm8; \ ++ pandn CASE_ADD_reg, %xmm9; \ ++ paddb %xmm8, reg1; \ ++ paddb %xmm9, reg2 ++ TOLOWER (%xmm1, %xmm2) + #else + # define TOLOWER(reg1, reg2) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-96.patch b/glibc-RHEL-15696-96.patch new file mode 100644 index 0000000..2d3b891 --- /dev/null +++ b/glibc-RHEL-15696-96.patch @@ -0,0 +1,143 @@ +From d154758e618ec9324f5d339c46db0aa27e8b1226 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:38 -0500 +Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S +Content-type: text/plain; charset=UTF-8 + +Slightly faster method of doing TOLOWER that saves an +instruction. + +Also replace the hard coded 5-byte no with .p2align 4. On builds with +CET enabled this misaligned entry to strcasecmp. + +geometric_mean(N=40) of all benchmarks New / Original: .920 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++-------------- + 1 file changed, 35 insertions(+), 48 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index d8fdeb3a..59e8ddfc 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -89,9 +89,8 @@ ENTRY (GLABEL(__strcasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). 
*/ ++ .p2align 4 + END (GLABEL(__strcasecmp)) + /* FALLTHROUGH to strcasecmp_l. */ + #endif +@@ -100,9 +99,8 @@ ENTRY (GLABEL(__strncasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END (GLABEL(__strncasecmp)) + /* FALLTHROUGH to strncasecmp_l. */ + #endif +@@ -170,27 +168,22 @@ STRCMP_SSE42: + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +-LABEL(belowupper): +- .quad 0x4040404040404040 +- .quad 0x4040404040404040 +-LABEL(topupper): +-# ifdef USE_AVX +- .quad 0x5a5a5a5a5a5a5a5a +- .quad 0x5a5a5a5a5a5a5a5a +-# else +- .quad 0x5b5b5b5b5b5b5b5b +- .quad 0x5b5b5b5b5b5b5b5b +-# endif +-LABEL(touppermask): ++LABEL(lcase_min): ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++LABEL(lcase_max): ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++LABEL(case_add): + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous +- movdqa LABEL(belowupper)(%rip), %xmm4 +-# define UCLOW_reg %xmm4 +- movdqa LABEL(topupper)(%rip), %xmm5 +-# define UCHIGH_reg %xmm5 +- movdqa LABEL(touppermask)(%rip), %xmm6 +-# define LCQWORD_reg %xmm6 ++ movdqa LABEL(lcase_min)(%rip), %xmm4 ++# define LCASE_MIN_reg %xmm4 ++ movdqa LABEL(lcase_max)(%rip), %xmm5 ++# define LCASE_MAX_reg %xmm5 ++ movdqa LABEL(case_add)(%rip), %xmm6 ++# define CASE_ADD_reg %xmm6 + #endif + cmp $0x30, %ecx + ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ +@@ -201,32 +194,26 @@ LABEL(touppermask): + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + # ifdef USE_AVX + # define TOLOWER(reg1, reg2) \ +- vpcmpgtb UCLOW_reg, reg1, %xmm7; \ +- vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ +- vpcmpgtb UCLOW_reg, reg2, %xmm9; \ +- vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ +- vpandn %xmm7, %xmm8, %xmm8; \ +- vpandn %xmm9, %xmm10, %xmm10; \ +- vpand LCQWORD_reg, %xmm8, %xmm8; \ +- vpand LCQWORD_reg, %xmm10, %xmm10; \ +- vpor reg1, %xmm8, reg1; \ +- vpor reg2, %xmm10, reg2 ++ vpaddb LCASE_MIN_reg, reg1, %xmm7; \ ++ vpaddb LCASE_MIN_reg, reg2, %xmm8; \ ++ vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ ++ vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ ++ vpandn CASE_ADD_reg, %xmm7, %xmm7; \ ++ vpandn CASE_ADD_reg, %xmm8, %xmm8; \ ++ vpaddb %xmm7, reg1, reg1; \ ++ vpaddb %xmm8, reg2, reg2 + # else + # define TOLOWER(reg1, reg2) \ +- movdqa reg1, %xmm7; \ +- movdqa UCHIGH_reg, %xmm8; \ +- movdqa reg2, %xmm9; \ +- movdqa UCHIGH_reg, %xmm10; \ +- pcmpgtb UCLOW_reg, %xmm7; \ +- pcmpgtb reg1, %xmm8; \ +- pcmpgtb UCLOW_reg, %xmm9; \ +- pcmpgtb reg2, %xmm10; \ +- pand %xmm8, %xmm7; \ +- pand %xmm10, %xmm9; \ +- pand LCQWORD_reg, %xmm7; \ +- pand LCQWORD_reg, %xmm9; \ +- por %xmm7, reg1; \ +- por %xmm9, reg2 ++ movdqa LCASE_MIN_reg, %xmm7; \ ++ movdqa LCASE_MIN_reg, %xmm8; \ ++ paddb reg1, %xmm7; \ ++ paddb reg2, %xmm8; \ ++ pcmpgtb LCASE_MAX_reg, %xmm7; \ ++ pcmpgtb LCASE_MAX_reg, %xmm8; \ ++ pandn CASE_ADD_reg, %xmm7; \ ++ pandn CASE_ADD_reg, %xmm8; \ ++ paddb %xmm7, reg1; \ ++ paddb %xmm8, reg2 + # endif + TOLOWER (%xmm1, %xmm2) + #else +-- +GitLab + diff --git a/glibc-RHEL-15696-97.patch b/glibc-RHEL-15696-97.patch new file mode 100644 index 0000000..9592795 --- /dev/null +++ b/glibc-RHEL-15696-97.patch @@ -0,0 +1,759 @@ +From bbf81222343fed5cd704001a2ae0d86c71544151 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 18:56:12 -0500 +Subject: 
[PATCH] x86: Add AVX2 optimized str{n}casecmp +Content-type: text/plain; charset=UTF-8 + +geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/Makefile | 4 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++ + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 + + .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++ + sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 237 +++++++++++++++--- + .../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++ + sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++ + 8 files changed, 331 insertions(+), 31 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 8c9e7812..711ecf2e 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -51,6 +51,8 @@ sysdep_routines += \ + stpncpy-sse2-unaligned \ + stpncpy-ssse3 \ + strcasecmp_l-avx \ ++ strcasecmp_l-avx2 \ ++ strcasecmp_l-avx2-rtm \ + strcasecmp_l-sse2 \ + strcasecmp_l-sse4_2 \ + strcasecmp_l-ssse3 \ +@@ -89,6 +91,8 @@ sysdep_routines += \ + strlen-evex \ + strlen-sse2 \ + strncase_l-avx \ ++ strncase_l-avx2 \ ++ strncase_l-avx2-rtm \ + strncase_l-sse2 \ + strncase_l-sse4_2 \ + strncase_l-ssse3 \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index c963d391..d873e1be 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -418,6 +418,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strcasecmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcasecmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX), + __strcasecmp_avx) +@@ -431,6 +438,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strcasecmp_l_avx2) ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcasecmp_l_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + CPU_FEATURE_USABLE (AVX), + __strcasecmp_l_avx) +@@ -558,6 +572,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strncasecmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncasecmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX), + __strncasecmp_avx) +@@ -572,6 +593,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. 
*/ + IFUNC_IMPL (i, name, strncasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strncasecmp_l_avx2) ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncasecmp_l_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + CPU_FEATURE_USABLE (AVX), + __strncasecmp_l_avx) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 6a4bb078..926508c4 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } ++ + if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) + return OPTIMIZE (avx); + +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S +new file mode 100644 +index 00000000..09957fc3 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S +@@ -0,0 +1,15 @@ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_avx2_rtm ++#endif ++ ++#define _GLABEL(x) x ## _rtm ++#define GLABEL(x) _GLABEL(x) ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcasecmp_l-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S +new file mode 100644 +index 00000000..e2762f2a +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S +@@ -0,0 +1,23 @@ ++/* strcasecmp_l optimized with AVX2. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_avx2 ++#endif ++#define USE_AS_STRCASECMP_L ++#include "strcmp-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 782f9472..28cc98b6 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -20,6 +20,10 @@ + + # include + ++# if defined USE_AS_STRCASECMP_L ++# include "locale-defines.h" ++# endif ++ + # ifndef STRCMP + # define STRCMP __strcmp_avx2 + # endif +@@ -74,13 +78,88 @@ + # define VEC_OFFSET (-VEC_SIZE) + # endif + ++# ifdef USE_AS_STRCASECMP_L ++# define BYTE_LOOP_REG OFFSET_REG ++# else ++# define BYTE_LOOP_REG ecx ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++# ifdef USE_AS_STRNCMP ++# define STRCASECMP __strncasecmp_avx2 ++# define LOCALE_REG rcx ++# define LOCALE_REG_LP RCX_LP ++# define STRCASECMP_NONASCII __strncasecmp_l_nonascii ++# else ++# define STRCASECMP __strcasecmp_avx2 ++# define LOCALE_REG rdx ++# define LOCALE_REG_LP RDX_LP ++# define STRCASECMP_NONASCII __strcasecmp_l_nonascii ++# endif ++# endif ++ + # define xmmZERO xmm15 + # define ymmZERO ymm15 + ++# define LCASE_MIN_ymm %ymm10 ++# define LCASE_MAX_ymm %ymm11 ++# define CASE_ADD_ymm %ymm12 ++ ++# define LCASE_MIN_xmm %xmm10 ++# define LCASE_MAX_xmm %xmm11 ++# define CASE_ADD_xmm %xmm12 ++ ++ /* r11 is never use elsewhere so this is safe to maintain. */ ++# define TOLOWER_BASE %r11 ++ + # ifndef SECTION + # define SECTION(p) p##.avx + # endif + ++# ifdef USE_AS_STRCASECMP_L ++# define REG(x, y) x ## y ++# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \ ++ vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \ ++ vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \ ++ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \ ++ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \ ++ vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \ ++ vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \ ++ vpaddb REG(%ext, 8), reg1_in, reg1_out; \ ++ vpaddb REG(%ext, 9), reg2_in, reg2_out ++ ++# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst ++# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm) ++# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm) ++ ++# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \ ++ TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \ ++ VPCMPEQ scratch_reg, s2_reg, reg_out ++ ++# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \ ++ VMOVU s2_mem, reg_out; \ ++ CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext) ++ ++# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm) ++# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm) ++ ++# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm) ++# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm) ++ ++# else ++# define TOLOWER_gpr(...) ++# define TOLOWER_ymm(...) ++# define TOLOWER_xmm(...) ++ ++# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \ ++ VPCMPEQ s2_reg, s1_reg, reg_out ++ ++# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__) ++ ++# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__) ++# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__) ++# endif ++ + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. + strcmp/strncmp have to use UNSIGNED comparison for elements. +@@ -102,8 +181,49 @@ + returned. 
*/ + + .section SECTION(.text), "ax", @progbits +-ENTRY(STRCMP) ++ .align 16 ++ .type STRCMP, @function ++ .globl STRCMP ++ .hidden STRCMP ++ ++# ifndef GLABEL ++# define GLABEL(...) __VA_ARGS__ ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++ENTRY (GLABEL(STRCASECMP)) ++ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax ++ mov %fs:(%rax), %LOCALE_REG_LP ++ ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 ++END (GLABEL(STRCASECMP)) ++ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ ++# endif ++ ++ .p2align 4 ++STRCMP: ++ cfi_startproc ++ _CET_ENDBR ++ CALL_MCOUNT ++ ++# if defined USE_AS_STRCASECMP_L ++ /* We have to fall back on the C implementation for locales with ++ encodings not matching ASCII for single bytes. */ ++# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 ++ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP ++# else ++ mov (%LOCALE_REG), %RAX_LP ++# endif ++ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) ++ jne STRCASECMP_NONASCII ++ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE ++# endif ++ + # ifdef USE_AS_STRNCMP ++ /* Don't overwrite LOCALE_REG (rcx) until we have pass ++ L(one_or_less). Otherwise we might use the wrong locale in ++ the OVERFLOW_STRCMP (strcasecmp_l). */ + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -128,6 +248,30 @@ ENTRY(STRCMP) + # endif + # endif + vpxor %xmmZERO, %xmmZERO, %xmmZERO ++# if defined USE_AS_STRCASECMP_L ++ .section .rodata.cst32, "aM", @progbits, 32 ++ .align 32 ++L(lcase_min): ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++L(lcase_max): ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++L(case_add): ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .previous ++ ++ vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm ++ vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm ++ vmovdqa L(case_add)(%rip), CASE_ADD_ymm ++# endif + movl %edi, %eax + orl %esi, %eax + sall $20, %eax +@@ -138,8 +282,10 @@ ENTRY(STRCMP) + L(no_page_cross): + /* Safe to compare 4x vectors. */ + VMOVU (%rdi), %ymm0 +- /* 1s where s1 and s2 equal. */ +- VPCMPEQ (%rsi), %ymm0, %ymm1 ++ /* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp. ++ Otherwise converts ymm0 and load from rsi to lower. ymm2 is ++ scratch and ymm1 is the return. */ ++ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) + /* 1s at null CHAR. */ + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + /* 1s where s1 and s2 equal AND not null CHAR. */ +@@ -172,6 +318,8 @@ L(return_vec_0): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret0): +@@ -192,6 +340,10 @@ L(ret_zero): + + .p2align 4,, 5 + L(one_or_less): ++# ifdef USE_AS_STRCASECMP_L ++ /* Set locale argument for strcasecmp. 
*/ ++ movq %LOCALE_REG, %rdx ++# endif + jb L(ret_zero) + # ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large +@@ -211,6 +363,8 @@ L(one_or_less): + jnbe __strcmp_avx2 + movzbl (%rdi), %eax + movzbl (%rsi), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret1): +@@ -238,6 +392,8 @@ L(return_vec_1): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret2): +@@ -269,6 +425,8 @@ L(return_vec_2): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret3): +@@ -289,6 +447,8 @@ L(return_vec_3): + # else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret4): +@@ -299,7 +459,7 @@ L(ret4): + L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU VEC_SIZE(%rdi), %ymm0 +- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -312,7 +472,7 @@ L(more_3x_vec): + # endif + + VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -320,7 +480,7 @@ L(more_3x_vec): + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -395,12 +555,10 @@ L(loop_skip_page_cross_check): + VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 + + /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ +- VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1 +- +- VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 +- ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1) ++ CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3) ++ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) ++ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) + + /* If any mismatches or null CHAR then 0 CHAR, otherwise non- + zero. 
*/ +@@ -469,6 +627,8 @@ L(return_vec_2_3_end): + # else + movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax + movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -512,6 +672,8 @@ L(return_vec_0_end): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -534,6 +696,8 @@ L(return_vec_1_end): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -560,6 +724,8 @@ L(return_vec_2_end): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -587,7 +753,7 @@ L(page_cross_during_loop): + jle L(less_1x_vec_till_page_cross) + + VMOVA (%rdi), %ymm0 +- VPCMPEQ (%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -609,7 +775,7 @@ L(less_1x_vec_till_page_cross): + here, it means the previous page (rdi - VEC_SIZE) has already + been loaded earlier so must be valid. */ + VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 +- VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -651,6 +817,8 @@ L(return_page_cross_cmp_mem): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -677,7 +845,7 @@ L(more_2x_vec_till_page_cross): + iteration here. */ + + VMOVU VEC_SIZE(%rdi), %ymm0 +- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -693,7 +861,7 @@ L(more_2x_vec_till_page_cross): + + /* Safe to include comparisons from lower bytes. 
*/ + VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 +- VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -701,7 +869,7 @@ L(more_2x_vec_till_page_cross): + jnz L(return_vec_page_cross_0) + + VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 +- VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -719,8 +887,8 @@ L(more_2x_vec_till_page_cross): + VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 + VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 + +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) ++ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) + vpand %ymm4, %ymm5, %ymm5 + vpand %ymm6, %ymm7, %ymm7 + VPMINU %ymm5, %ymm7, %ymm7 +@@ -771,6 +939,8 @@ L(return_vec_page_cross_1): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -826,7 +996,7 @@ L(page_cross): + L(page_cross_loop): + + VMOVU (%rdi, %OFFSET_REG64), %ymm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -844,11 +1014,11 @@ L(page_cross_loop): + subl %eax, %OFFSET_REG + /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed + to not cross page so is safe to load. Since we have already +- loaded at least 1 VEC from rsi it is also guranteed to be safe. +- */ ++ loaded at least 1 VEC from rsi it is also guranteed to be ++ safe. 
*/ + + VMOVU (%rdi, %OFFSET_REG64), %ymm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -881,6 +1051,8 @@ L(ret_vec_page_cross_cont): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -934,7 +1106,7 @@ L(less_1x_vec_till_page): + ja L(less_16_till_page) + + VMOVU (%rdi), %xmm0 +- VPCMPEQ (%rsi), %xmm0, %xmm1 ++ CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1) + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx +@@ -952,7 +1124,7 @@ L(less_1x_vec_till_page): + # endif + + VMOVU (%rdi, %OFFSET_REG64), %xmm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1 ++ CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1) + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx +@@ -990,7 +1162,7 @@ L(less_16_till_page): + vmovq (%rdi), %xmm0 + vmovq (%rsi), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incb %cl +@@ -1010,7 +1182,7 @@ L(less_16_till_page): + vmovq (%rdi, %OFFSET_REG64), %xmm0 + vmovq (%rsi, %OFFSET_REG64), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incb %cl +@@ -1066,7 +1238,7 @@ L(ret_less_8_wcs): + vmovd (%rdi), %xmm0 + vmovd (%rsi), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + subl $0xf, %ecx +@@ -1085,7 +1257,7 @@ L(ret_less_8_wcs): + vmovd (%rdi, %OFFSET_REG64), %xmm0 + vmovd (%rsi, %OFFSET_REG64), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + subl $0xf, %ecx +@@ -1119,7 +1291,9 @@ L(less_4_till_page): + L(less_4_loop): + movzbl (%rdi), %eax + movzbl (%rsi, %rdi), %ecx +- subl %ecx, %eax ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) ++ subl %BYTE_LOOP_REG, %eax + jnz L(ret_less_4_loop) + testl %ecx, %ecx + jz L(ret_zero_4_loop) +@@ -1146,5 +1320,6 @@ L(ret_less_4_loop): + subl %r8d, %eax + ret + # endif +-END(STRCMP) ++ cfi_endproc ++ .size STRCMP, .-STRCMP + #endif +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S +new file mode 100644 +index 00000000..58c05dcf +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S +@@ -0,0 +1,16 @@ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_avx2_rtm ++#endif ++ ++#define _GLABEL(x) x ## _rtm ++#define GLABEL(x) _GLABEL(x) ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++#define OVERFLOW_STRCMP __strcasecmp_l_avx2_rtm ++ ++#include "strncase_l-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S +new file mode 100644 +index 00000000..48c0aa21 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S +@@ -0,0 +1,27 @@ ++/* strncasecmp_l optimized with AVX2. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. 
++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_avx2 ++#endif ++#define USE_AS_STRCASECMP_L ++#define USE_AS_STRNCMP ++#ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __strcasecmp_l_avx2 ++#endif ++#include "strcmp-avx2.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-98.patch b/glibc-RHEL-15696-98.patch new file mode 100644 index 0000000..9941bcc --- /dev/null +++ b/glibc-RHEL-15696-98.patch @@ -0,0 +1,814 @@ +From 84e7c46df4086873eae28a1fb87d2cf5388b1e16 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 18:56:13 -0500 +Subject: [PATCH] x86: Add EVEX optimized str{n}casecmp +Content-type: text/plain; charset=UTF-8 + +geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/Makefile | 2 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 + + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 + + sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++ + sysdeps/x86_64/multiarch/strcmp-evex.S | 290 ++++++++++++++++--- + sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++ + 6 files changed, 321 insertions(+), 40 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 711ecf2e..359712c1 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -53,6 +53,7 @@ sysdep_routines += \ + strcasecmp_l-avx \ + strcasecmp_l-avx2 \ + strcasecmp_l-avx2-rtm \ ++ strcasecmp_l-evex \ + strcasecmp_l-sse2 \ + strcasecmp_l-sse4_2 \ + strcasecmp_l-ssse3 \ +@@ -93,6 +94,7 @@ sysdep_routines += \ + strncase_l-avx \ + strncase_l-avx2 \ + strncase_l-avx2-rtm \ ++ strncase_l-evex \ + strncase_l-sse2 \ + strncase_l-sse4_2 \ + strncase_l-ssse3 \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index d873e1be..1dedc637 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -418,6 +418,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcasecmp_evex) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX2), + __strcasecmp_avx2) +@@ -438,6 +442,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. 
*/ + IFUNC_IMPL (i, name, strcasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcasecmp_l_evex) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX2), + __strcasecmp_l_avx2) +@@ -572,6 +580,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncasecmp_evex) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX2), + __strncasecmp_avx2) +@@ -593,6 +605,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncasecmp_l_evex) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX2), + __strncasecmp_l_avx2) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 926508c4..6dd49a21 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) +@@ -34,6 +35,10 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ return OPTIMIZE (evex); ++ + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S +new file mode 100644 +index 00000000..58642db7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S +@@ -0,0 +1,23 @@ ++/* strcasecmp_l optimized with EVEX. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_evex ++#endif ++#define USE_AS_STRCASECMP_L ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 0dfa62bd..b81b5775 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -19,6 +19,9 @@ + #if IS_IN (libc) + + # include ++# if defined USE_AS_STRCASECMP_L ++# include "locale-defines.h" ++# endif + + # ifndef STRCMP + # define STRCMP __strcmp_evex +@@ -34,19 +37,29 @@ + # define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSCMP +-# define TESTEQ subl $0xff, ++# ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __wcscmp_evex ++# endif ++ ++# define TESTEQ subl $0xff, + /* Compare packed dwords. */ + # define VPCMP vpcmpd + # define VPMINU vpminud + # define VPTESTM vptestmd ++# define VPTESTNM vptestnmd + /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else ++# ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __strcmp_evex ++# endif ++ + # define TESTEQ incl + /* Compare packed bytes. */ + # define VPCMP vpcmpb + # define VPMINU vpminub + # define VPTESTM vptestmb ++# define VPTESTNM vptestnmb + /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif +@@ -73,11 +86,16 @@ + # define VEC_OFFSET (-VEC_SIZE) + # endif + +-# define XMMZERO xmm16 + # define XMM0 xmm17 + # define XMM1 xmm18 + +-# define YMMZERO ymm16 ++# define XMM10 xmm27 ++# define XMM11 xmm28 ++# define XMM12 xmm29 ++# define XMM13 xmm30 ++# define XMM14 xmm31 ++ ++ + # define YMM0 ymm17 + # define YMM1 ymm18 + # define YMM2 ymm19 +@@ -89,6 +107,87 @@ + # define YMM8 ymm25 + # define YMM9 ymm26 + # define YMM10 ymm27 ++# define YMM11 ymm28 ++# define YMM12 ymm29 ++# define YMM13 ymm30 ++# define YMM14 ymm31 ++ ++# ifdef USE_AS_STRCASECMP_L ++# define BYTE_LOOP_REG OFFSET_REG ++# else ++# define BYTE_LOOP_REG ecx ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++# ifdef USE_AS_STRNCMP ++# define STRCASECMP __strncasecmp_evex ++# define LOCALE_REG rcx ++# define LOCALE_REG_LP RCX_LP ++# define STRCASECMP_NONASCII __strncasecmp_l_nonascii ++# else ++# define STRCASECMP __strcasecmp_evex ++# define LOCALE_REG rdx ++# define LOCALE_REG_LP RDX_LP ++# define STRCASECMP_NONASCII __strcasecmp_l_nonascii ++# endif ++# endif ++ ++# define LCASE_MIN_YMM %YMM12 ++# define LCASE_MAX_YMM %YMM13 ++# define CASE_ADD_YMM %YMM14 ++ ++# define LCASE_MIN_XMM %XMM12 ++# define LCASE_MAX_XMM %XMM13 ++# define CASE_ADD_XMM %XMM14 ++ ++ /* NB: wcsncmp uses r11 but strcasecmp is never used in ++ conjunction with wcscmp. */ ++# define TOLOWER_BASE %r11 ++ ++# ifdef USE_AS_STRCASECMP_L ++# define _REG(x, y) x ## y ++# define REG(x, y) _REG(x, y) ++# define TOLOWER(reg1, reg2, ext) \ ++ vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \ ++ vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \ ++ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \ ++ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \ ++ vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \ ++ vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6} ++ ++# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst ++# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM) ++# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM) ++ ++# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \ ++ TOLOWER (s1_reg, s2_reg, ext); \ ++ VPCMP $0, s1_reg, s2_reg, reg_out ++ ++# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \ ++ VMOVU s2_mem, s2_reg; \ ++ CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) ++ ++# define CMP_R1_R2_YMM(...) 
CMP_R1_R2(__VA_ARGS__, YMM) ++# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM) ++ ++# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM) ++# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM) ++ ++# else ++# define TOLOWER_gpr(...) ++# define TOLOWER_YMM(...) ++# define TOLOWER_XMM(...) ++ ++# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \ ++ VPCMP $0, s2_reg, s1_reg, reg_out ++ ++# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__) ++ ++# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \ ++ VPCMP $0, s2_mem, s1_reg, reg_out ++ ++# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__) ++# endif + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -112,8 +211,45 @@ + returned. */ + + .section .text.evex, "ax", @progbits +-ENTRY(STRCMP) ++ .align 16 ++ .type STRCMP, @function ++ .globl STRCMP ++ .hidden STRCMP ++ ++# ifdef USE_AS_STRCASECMP_L ++ENTRY (STRCASECMP) ++ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax ++ mov %fs:(%rax), %LOCALE_REG_LP ++ ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 ++END (STRCASECMP) ++ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ ++# endif ++ ++ .p2align 4 ++STRCMP: ++ cfi_startproc ++ _CET_ENDBR ++ CALL_MCOUNT ++ ++# if defined USE_AS_STRCASECMP_L ++ /* We have to fall back on the C implementation for locales with ++ encodings not matching ASCII for single bytes. */ ++# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 ++ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP ++# else ++ mov (%LOCALE_REG), %RAX_LP ++# endif ++ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) ++ jne STRCASECMP_NONASCII ++ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE ++# endif ++ + # ifdef USE_AS_STRNCMP ++ /* Don't overwrite LOCALE_REG (rcx) until we have pass ++ L(one_or_less). Otherwise we might use the wrong locale in ++ the OVERFLOW_STRCMP (strcasecmp_l). */ + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -125,6 +261,32 @@ ENTRY(STRCMP) + actually bound the buffer. */ + jle L(one_or_less) + # endif ++ ++# if defined USE_AS_STRCASECMP_L ++ .section .rodata.cst32, "aM", @progbits, 32 ++ .align 32 ++L(lcase_min): ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++L(lcase_max): ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++L(case_add): ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .previous ++ ++ vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM ++ vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM ++ vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM ++# endif ++ + movl %edi, %eax + orl %esi, %eax + /* Shift out the bits irrelivant to page boundary ([63:12]). */ +@@ -139,7 +301,7 @@ L(no_page_cross): + VPTESTM %YMM0, %YMM0, %k2 + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi). */ +- VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_STRNCMP + cmpq $CHAR_PER_VEC, %rdx +@@ -169,6 +331,8 @@ L(return_vec_0): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret0): +@@ -188,11 +352,15 @@ L(ret_zero): + + .p2align 4,, 5 + L(one_or_less): ++# ifdef USE_AS_STRCASECMP_L ++ /* Set locale argument for strcasecmp. 
*/ ++ movq %LOCALE_REG, %rdx ++# endif + jb L(ret_zero) +-# ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large + unsigned). */ +- jnbe __wcscmp_evex ++ jnbe OVERFLOW_STRCMP ++# ifdef USE_AS_WCSCMP + movl (%rdi), %edx + xorl %eax, %eax + cmpl (%rsi), %edx +@@ -201,11 +369,10 @@ L(one_or_less): + negl %eax + orl $1, %eax + # else +- /* 'nbe' covers the case where length is negative (large +- unsigned). */ +- jnbe __strcmp_evex + movzbl (%rdi), %eax + movzbl (%rsi), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret1): +@@ -233,6 +400,8 @@ L(return_vec_1): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret2): +@@ -270,6 +439,8 @@ L(return_vec_2): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret3): +@@ -290,6 +461,8 @@ L(return_vec_3): + # else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret4): +@@ -303,7 +476,7 @@ L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU (VEC_SIZE)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_1) +@@ -315,14 +488,14 @@ L(more_3x_vec): + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_3) +@@ -381,7 +554,6 @@ L(prepare_loop_aligned): + subl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + +- vpxorq %YMMZERO, %YMMZERO, %YMMZERO + + /* Loop 4x comparisons at a time. */ + .p2align 4 +@@ -413,22 +585,35 @@ L(loop_skip_page_cross_check): + /* A zero CHAR in YMM9 means that there is a null CHAR. */ + VPMINU %YMM8, %YMM9, %YMM9 + +- /* Each bit set in K1 represents a non-null CHAR in YMM8. */ ++ /* Each bit set in K1 represents a non-null CHAR in YMM9. */ + VPTESTM %YMM9, %YMM9, %k1 +- ++# ifndef USE_AS_STRCASECMP_L + vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 + vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 + vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while + oring with YMM1. Result is stored in YMM6. */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 +- ++# else ++ VMOVU (VEC_SIZE * 0)(%rsi), %YMM1 ++ TOLOWER_YMM (%YMM0, %YMM1) ++ VMOVU (VEC_SIZE * 1)(%rsi), %YMM3 ++ TOLOWER_YMM (%YMM2, %YMM3) ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 ++ TOLOWER_YMM (%YMM4, %YMM5) ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 ++ TOLOWER_YMM (%YMM6, %YMM7) ++ vpxorq %YMM0, %YMM1, %YMM1 ++ vpxorq %YMM2, %YMM3, %YMM3 ++ vpxorq %YMM4, %YMM5, %YMM5 ++ vpternlogd $0xde, %YMM7, %YMM1, %YMM6 ++# endif + /* Or together YMM3, YMM5, and YMM6. */ + vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 + + + /* A non-zero CHAR in YMM6 represents a mismatch. 
*/ +- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ VPTESTNM %YMM6, %YMM6, %k0{%k1} + kmovd %k0, %LOOP_REG + + TESTEQ %LOOP_REG +@@ -437,13 +622,13 @@ L(loop_skip_page_cross_check): + + /* Find which VEC has the mismatch of end of string. */ + VPTESTM %YMM0, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} ++ VPTESTNM %YMM1, %YMM1, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + jnz L(return_vec_0_end) + + VPTESTM %YMM2, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} ++ VPTESTNM %YMM3, %YMM3, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + jnz L(return_vec_1_end) +@@ -457,7 +642,7 @@ L(return_vec_2_3_end): + # endif + + VPTESTM %YMM4, %YMM4, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} ++ VPTESTNM %YMM5, %YMM5, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + # if CHAR_PER_VEC <= 16 +@@ -493,6 +678,8 @@ L(return_vec_3_end): + # else + movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax + movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -545,6 +732,8 @@ L(return_vec_0_end): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + /* Flip `eax` if `rdi` and `rsi` where swapped in page cross + logic. Subtract `r8d` after xor for zero case. */ +@@ -569,6 +758,8 @@ L(return_vec_1_end): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -598,7 +789,7 @@ L(page_cross_during_loop): + + VMOVA (%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_0_end) +@@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross): + been loaded earlier so must be valid. */ + VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2} +- ++ CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2} + /* Mask of potentially valid bits. The lower bits can be out of + range comparisons (but safe regarding page crosses). */ + +@@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross): + + # ifdef USE_AS_STRNCMP + # ifdef USE_AS_WCSCMP ++ /* NB: strcasecmp not used with WCSCMP so this access to r11 is ++ safe. */ + movl %eax, %r11d + shrl $2, %r11d + cmpq %r11, %rdx +@@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross): + + VMOVA VEC_SIZE(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_1_end) +@@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross): + /* Safe to include comparisons from lower bytes. 
*/ + VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_page_cross_0) + + VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_page_cross_1) +@@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross): + /* Must check length here as length might proclude reading next + page. */ + # ifdef USE_AS_WCSCMP ++ /* NB: strcasecmp not used with WCSCMP so this access to r11 is ++ safe. */ + movl %eax, %r11d + shrl $2, %r11d + cmpq %r11, %rdx +@@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross): + VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + VPMINU %YMM4, %YMM6, %YMM9 + VPTESTM %YMM9, %YMM9, %k1 +- ++# ifndef USE_AS_STRCASECMP_L + vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 +- +- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++# else ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 ++ TOLOWER_YMM (%YMM4, %YMM5) ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 ++ TOLOWER_YMM (%YMM6, %YMM7) ++ vpxorq %YMM4, %YMM5, %YMM5 ++ vpternlogd $0xde, %YMM7, %YMM5, %YMM6 ++# endif ++ VPTESTNM %YMM6, %YMM6, %k0{%k1} + kmovd %k0, %LOOP_REG + TESTEQ %LOOP_REG + jnz L(return_vec_2_3_end) +@@ -815,6 +1018,8 @@ L(return_vec_page_cross_1): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -871,7 +1076,7 @@ L(page_cross): + L(page_cross_loop): + VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(check_ret_vec_page_cross) +@@ -895,7 +1100,7 @@ L(page_cross_loop): + */ + VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} + + kmovd %k1, %ecx + # ifdef USE_AS_STRNCMP +@@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont): + # else + movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax + movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -989,7 +1196,7 @@ L(less_1x_vec_till_page): + /* Use 16 byte comparison. 
*/ + vmovdqu (%rdi), %xmm0 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, (%rsi), %xmm0, %k1{%k2} ++ CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0xf, %ecx +@@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page): + # endif + vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2} ++ CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0xf, %ecx +@@ -1048,7 +1255,7 @@ L(less_16_till_page): + vmovq (%rdi), %xmm0 + vmovq (%rsi), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0x3, %ecx +@@ -1068,7 +1275,7 @@ L(less_16_till_page): + vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0x3, %ecx +@@ -1128,7 +1335,7 @@ L(ret_less_8_wcs): + vmovd (%rdi), %xmm0 + vmovd (%rsi), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) +@@ -1143,7 +1350,7 @@ L(ret_less_8_wcs): + vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) +@@ -1176,7 +1383,9 @@ L(less_4_till_page): + L(less_4_loop): + movzbl (%rdi), %eax + movzbl (%rsi, %rdi), %ecx +- subl %ecx, %eax ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) ++ subl %BYTE_LOOP_REG, %eax + jnz L(ret_less_4_loop) + testl %ecx, %ecx + jz L(ret_zero_4_loop) +@@ -1203,5 +1412,6 @@ L(ret_less_4_loop): + subl %r8d, %eax + ret + # endif +-END(STRCMP) ++ cfi_endproc ++ .size STRCMP, .-STRCMP + #endif +diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S +new file mode 100644 +index 00000000..8a5af369 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S +@@ -0,0 +1,25 @@ ++/* strncasecmp_l optimized with EVEX. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_evex ++#endif ++#define OVERFLOW_STRCMP __strcasecmp_l_evex ++#define USE_AS_STRCASECMP_L ++#define USE_AS_STRNCMP ++#include "strcmp-evex.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-99.patch b/glibc-RHEL-15696-99.patch new file mode 100644 index 0000000..06d5d53 --- /dev/null +++ b/glibc-RHEL-15696-99.patch @@ -0,0 +1,913 @@ +From 305769b2a15c2e96f9e1b5195d3c4e0d6f0f4b68 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:46 -0500 +Subject: [PATCH] x86: Remove AVX str{n}casecmp +Content-type: text/plain; charset=UTF-8 + +The rational is: + +1. SSE42 has nearly identical logic so any benefit is minimal (3.4% + regression on Tigerlake using SSE42 versus AVX across the + benchtest suite). +2. AVX2 version covers the majority of targets that previously + prefered it. +3. The targets where AVX would still be best (SnB and IVB) are + becoming outdated. + +All in all the saving the code size is worth it. + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/Makefile | 2 - + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 - + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 - + sysdeps/x86_64/multiarch/strcasecmp_l-avx.S | 22 -- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 240 +++++++++----------- + sysdeps/x86_64/multiarch/strncase_l-avx.S | 22 -- + 6 files changed, 105 insertions(+), 197 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx.S + delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 359712c1..bca82e38 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -50,7 +50,6 @@ sysdep_routines += \ + stpncpy-evex \ + stpncpy-sse2-unaligned \ + stpncpy-ssse3 \ +- strcasecmp_l-avx \ + strcasecmp_l-avx2 \ + strcasecmp_l-avx2-rtm \ + strcasecmp_l-evex \ +@@ -91,7 +90,6 @@ sysdep_routines += \ + strlen-avx2-rtm \ + strlen-evex \ + strlen-sse2 \ +- strncase_l-avx \ + strncase_l-avx2 \ + strncase_l-avx2-rtm \ + strncase_l-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 1dedc637..14314367 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -429,9 +429,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcasecmp_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strcasecmp, +- CPU_FEATURE_USABLE (AVX), +- __strcasecmp_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (SSE4_2), + __strcasecmp_sse42) +@@ -453,9 +450,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcasecmp_l_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strcasecmp_l, +- CPU_FEATURE_USABLE (AVX), +- __strcasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + CPU_FEATURE_USABLE (SSE4_2), + __strcasecmp_l_sse42) +@@ -591,9 +585,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncasecmp_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strncasecmp, +- CPU_FEATURE_USABLE (AVX), +- __strncasecmp_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (SSE4_2), + __strncasecmp_sse42) +@@ -616,9 +607,6 @@ __libc_ifunc_impl_list (const char *name, struct 
libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncasecmp_l_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strncasecmp_l, +- CPU_FEATURE_USABLE (AVX), +- __strncasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + CPU_FEATURE_USABLE (SSE4_2), + __strncasecmp_l_sse42) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 6dd49a21..34cfbb8f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -22,7 +22,6 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; +@@ -46,9 +45,6 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (avx2); + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) +- return OPTIMIZE (avx); +- + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2) + && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) + return OPTIMIZE (sse42); +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S +deleted file mode 100644 +index 56a03547..00000000 +--- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S ++++ /dev/null +@@ -1,22 +0,0 @@ +-/* strcasecmp_l optimized with AVX. +- Copyright (C) 2017-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . 
*/ +- +-#define STRCMP_SSE42 __strcasecmp_l_avx +-#define USE_AVX 1 +-#define USE_AS_STRCASECMP_L +-#include "strcmp-sse42.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index 59e8ddfc..0a42b7a4 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -42,13 +42,8 @@ + # define UPDATE_STRNCMP_COUNTER + #endif + +-#ifdef USE_AVX +-# define SECTION avx +-# define GLABEL(l) l##_avx +-#else +-# define SECTION sse4.2 +-# define GLABEL(l) l##_sse42 +-#endif ++#define SECTION sse4.2 ++#define GLABEL(l) l##_sse42 + + #define LABEL(l) .L##l + +@@ -106,21 +101,7 @@ END (GLABEL(__strncasecmp)) + #endif + + +-#ifdef USE_AVX +-# define movdqa vmovdqa +-# define movdqu vmovdqu +-# define pmovmskb vpmovmskb +-# define pcmpistri vpcmpistri +-# define psubb vpsubb +-# define pcmpeqb vpcmpeqb +-# define psrldq vpsrldq +-# define pslldq vpslldq +-# define palignr vpalignr +-# define pxor vpxor +-# define D(arg) arg, arg +-#else +-# define D(arg) arg +-#endif ++#define arg arg + + STRCMP_SSE42: + cfi_startproc +@@ -192,18 +173,7 @@ LABEL(case_add): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +-# ifdef USE_AVX +-# define TOLOWER(reg1, reg2) \ +- vpaddb LCASE_MIN_reg, reg1, %xmm7; \ +- vpaddb LCASE_MIN_reg, reg2, %xmm8; \ +- vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ +- vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ +- vpandn CASE_ADD_reg, %xmm7, %xmm7; \ +- vpandn CASE_ADD_reg, %xmm8, %xmm8; \ +- vpaddb %xmm7, reg1, reg1; \ +- vpaddb %xmm8, reg2, reg2 +-# else +-# define TOLOWER(reg1, reg2) \ ++# define TOLOWER(reg1, reg2) \ + movdqa LCASE_MIN_reg, %xmm7; \ + movdqa LCASE_MIN_reg, %xmm8; \ + paddb reg1, %xmm7; \ +@@ -214,15 +184,15 @@ LABEL(case_add): + pandn CASE_ADD_reg, %xmm8; \ + paddb %xmm7, reg1; \ + paddb %xmm8, reg2 +-# endif ++ + TOLOWER (%xmm1, %xmm2) + #else + # define TOLOWER(reg1, reg2) + #endif +- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ +- pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ +- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ ++ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ ++ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ ++ psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes)/* If not, find different value or null char */ +@@ -246,7 +216,7 @@ LABEL(crosscache): + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ +- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ ++ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) +@@ -260,7 +230,7 @@ LABEL(bigger): + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + lea (%r10, %r9), %r10 + _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ + +@@ -273,15 +243,15 @@ LABEL(bigger): + LABEL(ashr_0): + + movdqa (%rsi), %xmm1 +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L +- pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ ++ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ + #else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ ++ pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ + #endif +- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ ++ psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ +@@ -361,10 +331,10 @@ LABEL(ashr_0_exit_use): + */ + .p2align 4 + LABEL(ashr_1): +- pslldq $15, D(%xmm2) /* shift first string to align with second */ ++ pslldq $15, %xmm2 /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ +- psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ ++ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ ++ psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ +@@ -392,7 +362,7 @@ LABEL(loop_ashr_1_use): + + LABEL(nibble_ashr_1_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $1, -16(%rdi, %rdx), D(%xmm0) ++ palignr $1, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -411,7 +381,7 @@ LABEL(nibble_ashr_1_restart_use): + jg LABEL(nibble_ashr_1_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $1, -16(%rdi, %rdx), D(%xmm0) ++ palignr $1, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -431,7 +401,7 @@ LABEL(nibble_ashr_1_restart_use): + LABEL(nibble_ashr_1_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $1, D(%xmm0) ++ psrldq $1, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -449,10 +419,10 @@ LABEL(nibble_ashr_1_use): + */ + .p2align 4 + LABEL(ashr_2): +- pslldq $14, D(%xmm2) ++ pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -480,7 +450,7 @@ LABEL(loop_ashr_2_use): + + LABEL(nibble_ashr_2_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $2, -16(%rdi, %rdx), D(%xmm0) ++ palignr $2, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -499,7 +469,7 @@ LABEL(nibble_ashr_2_restart_use): + jg LABEL(nibble_ashr_2_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $2, -16(%rdi, %rdx), D(%xmm0) ++ palignr $2, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -519,7 +489,7 @@ LABEL(nibble_ashr_2_restart_use): + LABEL(nibble_ashr_2_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $2, D(%xmm0) ++ psrldq $2, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -537,10 +507,10 @@ LABEL(nibble_ashr_2_use): + */ + .p2align 4 + LABEL(ashr_3): +- pslldq $13, D(%xmm2) ++ pslldq $13, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb 
%xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -568,7 +538,7 @@ LABEL(loop_ashr_3_use): + + LABEL(nibble_ashr_3_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $3, -16(%rdi, %rdx), D(%xmm0) ++ palignr $3, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -587,7 +557,7 @@ LABEL(nibble_ashr_3_restart_use): + jg LABEL(nibble_ashr_3_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $3, -16(%rdi, %rdx), D(%xmm0) ++ palignr $3, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -607,7 +577,7 @@ LABEL(nibble_ashr_3_restart_use): + LABEL(nibble_ashr_3_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $3, D(%xmm0) ++ psrldq $3, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -625,10 +595,10 @@ LABEL(nibble_ashr_3_use): + */ + .p2align 4 + LABEL(ashr_4): +- pslldq $12, D(%xmm2) ++ pslldq $12, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -657,7 +627,7 @@ LABEL(loop_ashr_4_use): + + LABEL(nibble_ashr_4_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $4, -16(%rdi, %rdx), D(%xmm0) ++ palignr $4, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -676,7 +646,7 @@ LABEL(nibble_ashr_4_restart_use): + jg LABEL(nibble_ashr_4_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $4, -16(%rdi, %rdx), D(%xmm0) ++ palignr $4, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -696,7 +666,7 @@ LABEL(nibble_ashr_4_restart_use): + LABEL(nibble_ashr_4_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $4, D(%xmm0) ++ psrldq $4, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -714,10 +684,10 @@ LABEL(nibble_ashr_4_use): + */ + .p2align 4 + LABEL(ashr_5): +- pslldq $11, D(%xmm2) ++ pslldq $11, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -746,7 +716,7 @@ LABEL(loop_ashr_5_use): + + LABEL(nibble_ashr_5_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $5, -16(%rdi, %rdx), D(%xmm0) ++ palignr $5, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -766,7 +736,7 @@ LABEL(nibble_ashr_5_restart_use): + + movdqa (%rdi, %rdx), %xmm0 + +- palignr $5, -16(%rdi, %rdx), D(%xmm0) ++ palignr $5, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -786,7 +756,7 @@ LABEL(nibble_ashr_5_restart_use): + LABEL(nibble_ashr_5_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $5, D(%xmm0) ++ psrldq $5, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -804,10 +774,10 @@ LABEL(nibble_ashr_5_use): + */ + .p2align 4 + LABEL(ashr_6): +- pslldq $10, D(%xmm2) ++ pslldq $10, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) 
+- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -836,7 +806,7 @@ LABEL(loop_ashr_6_use): + + LABEL(nibble_ashr_6_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $6, -16(%rdi, %rdx), D(%xmm0) ++ palignr $6, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -855,7 +825,7 @@ LABEL(nibble_ashr_6_restart_use): + jg LABEL(nibble_ashr_6_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $6, -16(%rdi, %rdx), D(%xmm0) ++ palignr $6, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -875,7 +845,7 @@ LABEL(nibble_ashr_6_restart_use): + LABEL(nibble_ashr_6_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $6, D(%xmm0) ++ psrldq $6, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -893,10 +863,10 @@ LABEL(nibble_ashr_6_use): + */ + .p2align 4 + LABEL(ashr_7): +- pslldq $9, D(%xmm2) ++ pslldq $9, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -925,7 +895,7 @@ LABEL(loop_ashr_7_use): + + LABEL(nibble_ashr_7_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $7, -16(%rdi, %rdx), D(%xmm0) ++ palignr $7, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -944,7 +914,7 @@ LABEL(nibble_ashr_7_restart_use): + jg LABEL(nibble_ashr_7_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $7, -16(%rdi, %rdx), D(%xmm0) ++ palignr $7, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -964,7 +934,7 @@ LABEL(nibble_ashr_7_restart_use): + LABEL(nibble_ashr_7_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $7, D(%xmm0) ++ psrldq $7, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -982,10 +952,10 @@ LABEL(nibble_ashr_7_use): + */ + .p2align 4 + LABEL(ashr_8): +- pslldq $8, D(%xmm2) ++ pslldq $8, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1014,7 +984,7 @@ LABEL(loop_ashr_8_use): + + LABEL(nibble_ashr_8_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $8, -16(%rdi, %rdx), D(%xmm0) ++ palignr $8, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1033,7 +1003,7 @@ LABEL(nibble_ashr_8_restart_use): + jg LABEL(nibble_ashr_8_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $8, -16(%rdi, %rdx), D(%xmm0) ++ palignr $8, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1053,7 +1023,7 @@ LABEL(nibble_ashr_8_restart_use): + LABEL(nibble_ashr_8_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $8, D(%xmm0) ++ psrldq $8, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1071,10 +1041,10 @@ LABEL(nibble_ashr_8_use): + */ + .p2align 4 + LABEL(ashr_9): +- pslldq $7, D(%xmm2) ++ 
pslldq $7, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1104,7 +1074,7 @@ LABEL(loop_ashr_9_use): + LABEL(nibble_ashr_9_restart_use): + movdqa (%rdi, %rdx), %xmm0 + +- palignr $9, -16(%rdi, %rdx), D(%xmm0) ++ palignr $9, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1123,7 +1093,7 @@ LABEL(nibble_ashr_9_restart_use): + jg LABEL(nibble_ashr_9_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $9, -16(%rdi, %rdx), D(%xmm0) ++ palignr $9, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1143,7 +1113,7 @@ LABEL(nibble_ashr_9_restart_use): + LABEL(nibble_ashr_9_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $9, D(%xmm0) ++ psrldq $9, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1161,10 +1131,10 @@ LABEL(nibble_ashr_9_use): + */ + .p2align 4 + LABEL(ashr_10): +- pslldq $6, D(%xmm2) ++ pslldq $6, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1193,7 +1163,7 @@ LABEL(loop_ashr_10_use): + + LABEL(nibble_ashr_10_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $10, -16(%rdi, %rdx), D(%xmm0) ++ palignr $10, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1212,7 +1182,7 @@ LABEL(nibble_ashr_10_restart_use): + jg LABEL(nibble_ashr_10_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $10, -16(%rdi, %rdx), D(%xmm0) ++ palignr $10, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1232,7 +1202,7 @@ LABEL(nibble_ashr_10_restart_use): + LABEL(nibble_ashr_10_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $10, D(%xmm0) ++ psrldq $10, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1250,10 +1220,10 @@ LABEL(nibble_ashr_10_use): + */ + .p2align 4 + LABEL(ashr_11): +- pslldq $5, D(%xmm2) ++ pslldq $5, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1282,7 +1252,7 @@ LABEL(loop_ashr_11_use): + + LABEL(nibble_ashr_11_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $11, -16(%rdi, %rdx), D(%xmm0) ++ palignr $11, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1301,7 +1271,7 @@ LABEL(nibble_ashr_11_restart_use): + jg LABEL(nibble_ashr_11_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $11, -16(%rdi, %rdx), D(%xmm0) ++ palignr $11, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1321,7 +1291,7 @@ LABEL(nibble_ashr_11_restart_use): + LABEL(nibble_ashr_11_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $11, D(%xmm0) ++ psrldq $11, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, 
%rcx +@@ -1339,10 +1309,10 @@ LABEL(nibble_ashr_11_use): + */ + .p2align 4 + LABEL(ashr_12): +- pslldq $4, D(%xmm2) ++ pslldq $4, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1371,7 +1341,7 @@ LABEL(loop_ashr_12_use): + + LABEL(nibble_ashr_12_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $12, -16(%rdi, %rdx), D(%xmm0) ++ palignr $12, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1390,7 +1360,7 @@ LABEL(nibble_ashr_12_restart_use): + jg LABEL(nibble_ashr_12_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $12, -16(%rdi, %rdx), D(%xmm0) ++ palignr $12, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1410,7 +1380,7 @@ LABEL(nibble_ashr_12_restart_use): + LABEL(nibble_ashr_12_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $12, D(%xmm0) ++ psrldq $12, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1428,10 +1398,10 @@ LABEL(nibble_ashr_12_use): + */ + .p2align 4 + LABEL(ashr_13): +- pslldq $3, D(%xmm2) ++ pslldq $3, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1461,7 +1431,7 @@ LABEL(loop_ashr_13_use): + + LABEL(nibble_ashr_13_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $13, -16(%rdi, %rdx), D(%xmm0) ++ palignr $13, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1480,7 +1450,7 @@ LABEL(nibble_ashr_13_restart_use): + jg LABEL(nibble_ashr_13_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $13, -16(%rdi, %rdx), D(%xmm0) ++ palignr $13, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1500,7 +1470,7 @@ LABEL(nibble_ashr_13_restart_use): + LABEL(nibble_ashr_13_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $13, D(%xmm0) ++ psrldq $13, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1518,10 +1488,10 @@ LABEL(nibble_ashr_13_use): + */ + .p2align 4 + LABEL(ashr_14): +- pslldq $2, D(%xmm2) ++ pslldq $2, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1551,7 +1521,7 @@ LABEL(loop_ashr_14_use): + + LABEL(nibble_ashr_14_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $14, -16(%rdi, %rdx), D(%xmm0) ++ palignr $14, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1570,7 +1540,7 @@ LABEL(nibble_ashr_14_restart_use): + jg LABEL(nibble_ashr_14_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $14, -16(%rdi, %rdx), D(%xmm0) ++ palignr $14, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1590,7 +1560,7 @@ LABEL(nibble_ashr_14_restart_use): + LABEL(nibble_ashr_14_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $14, 
D(%xmm0) ++ psrldq $14, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1608,10 +1578,10 @@ LABEL(nibble_ashr_14_use): + */ + .p2align 4 + LABEL(ashr_15): +- pslldq $1, D(%xmm2) ++ pslldq $1, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1643,7 +1613,7 @@ LABEL(loop_ashr_15_use): + + LABEL(nibble_ashr_15_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $15, -16(%rdi, %rdx), D(%xmm0) ++ palignr $15, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1662,7 +1632,7 @@ LABEL(nibble_ashr_15_restart_use): + jg LABEL(nibble_ashr_15_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $15, -16(%rdi, %rdx), D(%xmm0) ++ palignr $15, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1682,7 +1652,7 @@ LABEL(nibble_ashr_15_restart_use): + LABEL(nibble_ashr_15_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $15, D(%xmm0) ++ psrldq $15, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S +deleted file mode 100644 +index 0c4e525b..00000000 +--- a/sysdeps/x86_64/multiarch/strncase_l-avx.S ++++ /dev/null +@@ -1,22 +0,0 @@ +-/* strncasecmp_l optimized with AVX. +- Copyright (C) 2017-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#define STRCMP_SSE42 __strncasecmp_l_avx +-#define USE_AVX 1 +-#define USE_AS_STRNCASECMP_L +-#include "strcmp-sse42.S" +-- +GitLab + diff --git a/glibc-RHEL-15867.patch b/glibc-RHEL-15867.patch new file mode 100644 index 0000000..7df2fb8 --- /dev/null +++ b/glibc-RHEL-15867.patch @@ -0,0 +1,47 @@ +commit 2337e04e21ba6040926ec871e403533f77043c40 +Author: Siddhesh Poyarekar +Date: Thu Feb 2 07:49:02 2023 -0500 + + cdefs: Limit definition of fortification macros + + Define the __glibc_fortify and other macros only when __FORTIFY_LEVEL > + 0. This has the effect of not defining these macros on older C90 + compilers that do not have support for variable length argument lists. + + Also trim off the trailing backslashes from the definition of + __glibc_fortify and __glibc_fortify_n macros. 
+ + Signed-off-by: Siddhesh Poyarekar + Reviewed-by: Florian Weimer + +diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h +index f3d7efdd2a9320f7..46ec4ef71e14c569 100644 +--- a/misc/sys/cdefs.h ++++ b/misc/sys/cdefs.h +@@ -133,6 +133,7 @@ + # define __glibc_objsize(__o) __bos (__o) + #endif + ++#if __USE_FORTIFY_LEVEL > 0 + /* Compile time conditions to choose between the regular, _chk and _chk_warn + variants. These conditions should get evaluated to constant and optimized + away. */ +@@ -168,7 +169,7 @@ + ? __ ## f ## _alias (__VA_ARGS__) \ + : (__glibc_unsafe_len (__l, __s, __osz) \ + ? __ ## f ## _chk_warn (__VA_ARGS__, __osz) \ +- : __ ## f ## _chk (__VA_ARGS__, __osz))) \ ++ : __ ## f ## _chk (__VA_ARGS__, __osz))) + + /* Fortify function f, where object size argument passed to f is the number of + elements and not total size. */ +@@ -178,7 +179,8 @@ + ? __ ## f ## _alias (__VA_ARGS__) \ + : (__glibc_unsafe_len (__l, __s, __osz) \ + ? __ ## f ## _chk_warn (__VA_ARGS__, (__osz) / (__s)) \ +- : __ ## f ## _chk (__VA_ARGS__, (__osz) / (__s)))) \ ++ : __ ## f ## _chk (__VA_ARGS__, (__osz) / (__s)))) ++#endif + + #if __GNUC_PREREQ (4,3) + # define __warndecl(name, msg) \ diff --git a/glibc-RHEL-21522-1.patch b/glibc-RHEL-16825-1.patch similarity index 95% rename from glibc-RHEL-21522-1.patch rename to glibc-RHEL-16825-1.patch index 9cc6c79..c13cfb3 100644 --- a/glibc-RHEL-21522-1.patch +++ b/glibc-RHEL-16825-1.patch @@ -10,7 +10,7 @@ Date: Mon Nov 27 11:28:07 2023 +0100 Reviewed-by: Carlos O'Donell diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c -index 7a84b1fa8c3a7fdd..a80a54fb013adab5 100644 +index 7d8b2bd2336eecb6..66c9266d7f9d65af 100644 --- a/elf/dl-reloc.c +++ b/elf/dl-reloc.c @@ -165,6 +165,9 @@ void diff --git a/glibc-RHEL-21522-2.patch b/glibc-RHEL-16825-2.patch similarity index 98% rename from glibc-RHEL-21522-2.patch rename to glibc-RHEL-16825-2.patch index f42374b..37d2bf9 100644 --- a/glibc-RHEL-21522-2.patch +++ b/glibc-RHEL-16825-2.patch @@ -9,7 +9,7 @@ Date: Mon Nov 27 11:28:10 2023 +0100 Reviewed-by: Carlos O'Donell diff --git a/elf/dl-open.c b/elf/dl-open.c -index e82e53ff8b38fa11..1505fdb73088dcdb 100644 +index 7dfb6b680c108c0b..160451790bb88447 100644 --- a/elf/dl-open.c +++ b/elf/dl-open.c @@ -466,6 +466,50 @@ activate_nodelete (struct link_map *new) diff --git a/glibc-RHEL-21522-3.patch b/glibc-RHEL-16825-3.patch similarity index 95% rename from glibc-RHEL-21522-3.patch rename to glibc-RHEL-16825-3.patch index 1471f87..758e156 100644 --- a/glibc-RHEL-21522-3.patch +++ b/glibc-RHEL-16825-3.patch @@ -15,11 +15,12 @@ Conflicts: elf/rtld.c (removal of prelink support upstream) + diff --git a/elf/Makefile b/elf/Makefile -index 634c3113227d64a6..6f0f36cdfe3961e8 100644 +index 42dc878209b11d29..ebf46a297d241d8f 100644 --- a/elf/Makefile +++ b/elf/Makefile -@@ -386,6 +386,8 @@ tests += \ +@@ -387,6 +387,8 @@ tests += \ tst-nodelete2 \ tst-nodelete-dlclose \ tst-nodelete-opened \ @@ -28,7 +29,7 @@ index 634c3113227d64a6..6f0f36cdfe3961e8 100644 tst-noload \ tst-null-argv \ tst-relsort1 \ -@@ -740,6 +742,8 @@ modules-names = \ +@@ -743,6 +745,8 @@ modules-names = \ tst-nodelete-dlclose-dso \ tst-nodelete-dlclose-plugin \ tst-nodelete-opened-lib \ @@ -37,7 +38,7 @@ index 634c3113227d64a6..6f0f36cdfe3961e8 100644 tst-null-argv-lib \ tst-relsort1mod1 \ tst-relsort1mod2 \ -@@ -886,8 +890,13 @@ modules-execstack-yes = tst-execstack-mod +@@ -889,8 +893,13 @@ modules-execstack-yes = tst-execstack-mod extra-test-objs += $(addsuffix .os,$(strip $(modules-names))) # filtmod1.so has a 
special rule @@ -53,11 +54,10 @@ index 634c3113227d64a6..6f0f36cdfe3961e8 100644 tests += $(tests-static) -@@ -2697,3 +2706,19 @@ $(objpfx)tst-dlmopen-twice: $(libdl) - $(objpfx)tst-dlmopen-twice.out: \ - $(objpfx)tst-dlmopen-twice-mod1.so \ - $(objpfx)tst-dlmopen-twice-mod2.so -+ +@@ -2707,3 +2716,18 @@ $(objpfx)tst-dlclose-lazy: $(libdl) + $(objpfx)tst-dlclose-lazy.out: \ + $(objpfx)tst-dlclose-lazy-mod1.so $(objpfx)tst-dlclose-lazy-mod2.so + +# The object tst-nodeps1-mod.so has no explicit dependencies on libc.so. +$(objpfx)tst-nodeps1-mod.so: $(objpfx)tst-nodeps1-mod.os + $(LINK.o) -nostartfiles -nostdlib -shared -o $@ $^ @@ -74,7 +74,7 @@ index 634c3113227d64a6..6f0f36cdfe3961e8 100644 +$(objpfx)tst-nodeps2.out: \ + $(objpfx)tst-nodeps1-mod.so $(objpfx)tst-nodeps2-mod.so diff --git a/elf/dl-open.c b/elf/dl-open.c -index 1505fdb73088dcdb..6508b0ea545440b8 100644 +index 160451790bb88447..f32e2fd4ee39db93 100644 --- a/elf/dl-open.c +++ b/elf/dl-open.c @@ -692,6 +692,17 @@ dl_open_worker_begin (void *a) diff --git a/glibc-RHEL-21522-4.patch b/glibc-RHEL-16825-4.patch similarity index 100% rename from glibc-RHEL-21522-4.patch rename to glibc-RHEL-16825-4.patch diff --git a/glibc-RHEL-17468-1.patch b/glibc-RHEL-17468-1.patch new file mode 100644 index 0000000..7d7209a --- /dev/null +++ b/glibc-RHEL-17468-1.patch @@ -0,0 +1,47 @@ +commit 3921c5b40f293c57cb326f58713c924b0662ef59 +Author: Hector Martin +Date: Tue Nov 28 15:23:07 2023 +0900 + + elf: Fix TLS modid reuse generation assignment (BZ 29039) + + _dl_assign_tls_modid() assigns a slotinfo entry for a new module, but + does *not* do anything to the generation counter. The first time this + happens, the generation is zero and map_generation() returns the current + generation to be used during relocation processing. However, if + a slotinfo entry is later reused, it will already have a generation + assigned. If this generation has fallen behind the current global max + generation, then this causes an obsolete generation to be assigned + during relocation processing, as map_generation() returns this + generation if nonzero. _dl_add_to_slotinfo() eventually resets the + generation, but by then it is too late. This causes DTV updates to be + skipped, leading to NULL or broken TLS slot pointers and segfaults. + + Fix this by resetting the generation to zero in _dl_assign_tls_modid(), + so it behaves the same as the first time a slot is assigned. + _dl_add_to_slotinfo() will still assign the correct static generation + later during module load, but relocation processing will no longer use + an obsolete generation. + + Note that slotinfo entry (aka modid) reuse typically happens after a + dlclose and only TLS access via dynamic tlsdesc is affected. Because + tlsdesc is optimized to use the optional part of static TLS, dynamic + tlsdesc can be avoided by increasing the glibc.rtld.optional_static_tls + tunable to a large enough value, or by LD_PRELOAD-ing the affected + modules. + + Fixes bug 29039. + + Reviewed-by: Szabolcs Nagy + +diff --git a/elf/dl-tls.c b/elf/dl-tls.c +index a21276732968d88b..c8104078b2aa0aa2 100644 +--- a/elf/dl-tls.c ++++ b/elf/dl-tls.c +@@ -156,6 +156,7 @@ _dl_assign_tls_modid (struct link_map *l) + { + /* Mark the entry as used, so any dependency see it. 
*/ + atomic_store_relaxed (&runp->slotinfo[result - disp].map, l); ++ atomic_store_relaxed (&runp->slotinfo[result - disp].gen, 0); + break; + } + diff --git a/glibc-RHEL-17468-2.patch b/glibc-RHEL-17468-2.patch new file mode 100644 index 0000000..3722477 --- /dev/null +++ b/glibc-RHEL-17468-2.patch @@ -0,0 +1,198 @@ +commit 980450f12685326729d63ff72e93a996113bf073 +Author: Szabolcs Nagy +Date: Wed Nov 29 11:31:37 2023 +0000 + + elf: Add TLS modid reuse test for bug 29039 + + This is a minimal regression test for bug 29039 which only affects + targets with TLSDESC and a reproducer requires that + + 1) Have modid gaps (closed modules) with old generation. + 2) Update a DTV to a newer generation (needs a newer dlopen). + 3) But do not update the closed gap entry in that DTV. + 4) Reuse the modid gap for a new module (another dlopen). + 5) Use dynamic TLSDESC in that new module with old generation (bug). + 6) Access TLS via this TLSDESC and the now outdated DTV. + + However step (3) in practice rarely happens: during DTV update the + entries for closed modids are initialized to "unallocated" and then + dynamic TLSDESC calls __tls_get_addr independently of its generation. + The only exception to this is DTV setup at thread creation (gaps are + initialized to NULL instead of unallocated) or DTV resize where the + gap entries are outside the previous DTV array (again NULL instead + of unallocated, and this requires loading > DTV_SURPLUS modules). + + So the bug can only cause NULL (+ offset) dereference, not use after + free. And the easiest way to get (3) is via thread creation. + + Note that step (5) requires that the newly loaded module has larger + TLS than the remaining optional static TLS. And for (6) there cannot + be other TLS access or dlopen in the thread that updates the DTV. + + Tested on aarch64-linux-gnu. + + Reviewed-by: Adhemerval Zanella + +Conflicts: + elf/Makefile + (Add $(libdl), Resolve test case ordering conflict.) + +diff --git a/elf/Makefile b/elf/Makefile +index ebf46a297d241d8f..b8fdee7c0d37137e 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -416,6 +416,7 @@ tests += \ + tst-tls-ie \ + tst-tls-ie-dlmopen \ + tst-tls-manydynamic \ ++ tst-tlsgap \ + tst-unique1 \ + tst-unique2 \ + unload3 \ +@@ -759,6 +760,9 @@ modules-names = \ + tst-tls20mod-bad \ + tst-tls21mod \ + tst-tlsalign-lib \ ++ tst-tlsgap-mod0 \ ++ tst-tlsgap-mod1 \ ++ tst-tlsgap-mod2 \ + tst-tls-ie-mod0 \ + tst-tls-ie-mod1 \ + tst-tls-ie-mod2 \ +@@ -2731,3 +2735,14 @@ $(objpfx)tst-nodeps2-mod.so: $(common-objpfx)libc.so \ + $(objpfx)tst-nodeps2: $(libdl) + $(objpfx)tst-nodeps2.out: \ + $(objpfx)tst-nodeps1-mod.so $(objpfx)tst-nodeps2-mod.so ++ ++$(objpfx)tst-tlsgap: $(libdl) $(shared-thread-library) ++$(objpfx)tst-tlsgap.out: \ ++ $(objpfx)tst-tlsgap-mod0.so \ ++ $(objpfx)tst-tlsgap-mod1.so \ ++ $(objpfx)tst-tlsgap-mod2.so ++ifeq (yes,$(have-mtls-dialect-gnu2)) ++CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2 ++CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2 ++CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2 ++endif +diff --git a/elf/tst-tlsgap-mod0.c b/elf/tst-tlsgap-mod0.c +new file mode 100644 +index 0000000000000000..1478b0beac5faf98 +--- /dev/null ++++ b/elf/tst-tlsgap-mod0.c +@@ -0,0 +1,2 @@ ++int __thread tls0; ++int *f0(void) { return &tls0; } +diff --git a/elf/tst-tlsgap-mod1.c b/elf/tst-tlsgap-mod1.c +new file mode 100644 +index 0000000000000000..b10fc3702c43e478 +--- /dev/null ++++ b/elf/tst-tlsgap-mod1.c +@@ -0,0 +1,2 @@ ++int __thread tls1[100]; /* Size > glibc.rtld.optional_static_tls / 2. 
*/ ++int *f1(void) { return tls1; } +diff --git a/elf/tst-tlsgap-mod2.c b/elf/tst-tlsgap-mod2.c +new file mode 100644 +index 0000000000000000..166c27d7f3fac252 +--- /dev/null ++++ b/elf/tst-tlsgap-mod2.c +@@ -0,0 +1,2 @@ ++int __thread tls2; ++int *f2(void) { return &tls2; } +diff --git a/elf/tst-tlsgap.c b/elf/tst-tlsgap.c +new file mode 100644 +index 0000000000000000..49328850769c5609 +--- /dev/null ++++ b/elf/tst-tlsgap.c +@@ -0,0 +1,92 @@ ++/* TLS modid gap reuse regression test for bug 29039. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void *mod[3]; ++#define MOD(i) "tst-tlsgap-mod" #i ".so" ++static const char *modname[3] = { MOD(0), MOD(1), MOD(2) }; ++#undef MOD ++ ++static void ++open_mod (int i) ++{ ++ mod[i] = xdlopen (modname[i], RTLD_LAZY); ++ printf ("open %s\n", modname[i]); ++} ++ ++static void ++close_mod (int i) ++{ ++ xdlclose (mod[i]); ++ mod[i] = NULL; ++ printf ("close %s\n", modname[i]); ++} ++ ++static void ++access_mod (int i, const char *sym) ++{ ++ int *(*f) (void) = xdlsym (mod[i], sym); ++ int *p = f (); ++ printf ("access %s: %s() = %p\n", modname[i], sym, p); ++ TEST_VERIFY_EXIT (p != NULL); ++ ++*p; ++} ++ ++static void * ++start (void *arg) ++{ ++ /* The DTV generation is at the last dlopen of mod0 and the ++ entry for mod1 is NULL. */ ++ ++ open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS. */ ++ ++ /* DTV is unchanged: dlopen only updates the DTV to the latest ++ generation if static TLS is allocated for a loaded module. ++ ++ With bug 29039, the TLSDESC relocation in mod1 uses the old ++ dlclose generation of mod1 instead of the new dlopen one so ++ DTV is not updated on TLS access. */ ++ ++ access_mod (1, "f1"); ++ ++ return arg; ++} ++ ++static int ++do_test (void) ++{ ++ open_mod (0); ++ open_mod (1); ++ open_mod (2); ++ close_mod (0); ++ close_mod (1); /* Create modid gap at mod1. */ ++ open_mod (0); /* Reuse modid of mod0, bump generation count. */ ++ ++ /* Create a thread where DTV of mod1 is NULL. */ ++ pthread_t t = xpthread_create (NULL, start, NULL); ++ xpthread_join (t); ++ return 0; ++} ++ ++#include diff --git a/glibc-RHEL-19445.patch b/glibc-RHEL-19445.patch new file mode 100644 index 0000000..b3769d3 --- /dev/null +++ b/glibc-RHEL-19445.patch @@ -0,0 +1,31 @@ +Based on the following commit, adjusted for glibc-2.28 in RHEL-8: + +commit 5eabdb6a6ac1599d23dd5966a37417215950245f +Author: Andreas Schwab +Date: Wed Dec 6 14:48:22 2023 +0100 + + getaddrinfo: translate ENOMEM to EAI_MEMORY (bug 31163) + + When __resolv_context_get returns NULL due to out of memory, translate it + to a return value of EAI_MEMORY. 
+ +diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c +index 46046504a6858f2e..d0708f3e84e20025 100644 +--- a/sysdeps/posix/getaddrinfo.c ++++ b/sysdeps/posix/getaddrinfo.c +@@ -777,7 +777,14 @@ gaih_inet (const char *name, const struct gaih_service *service, + res_ctx = __resolv_context_get (); + res_enable_inet6 = __resolv_context_disable_inet6 (res_ctx); + if (res_ctx == NULL) +- no_more = 1; ++ { ++ if (errno == ENOMEM) ++ { ++ result = -EAI_MEMORY; ++ goto free_and_return; ++ } ++ no_more = 1; ++ } + + while (!no_more) + { diff --git a/glibc-RHEL-22847.patch b/glibc-RHEL-19824.patch similarity index 100% rename from glibc-RHEL-22847.patch rename to glibc-RHEL-19824.patch diff --git a/glibc-RHEL-2122.patch b/glibc-RHEL-2122.patch new file mode 100644 index 0000000..69a294f --- /dev/null +++ b/glibc-RHEL-2122.patch @@ -0,0 +1,312 @@ +From d2123d68275acc0f061e73d5f86ca504e0d5a344 Mon Sep 17 00:00:00 2001 +From: Szabolcs Nagy +Date: Tue, 16 Feb 2021 12:55:13 +0000 +Subject: elf: Fix slow tls access after dlopen [BZ #19924] + +In short: __tls_get_addr checks the global generation counter and if +the current dtv is older then _dl_update_slotinfo updates dtv up to the +generation of the accessed module. So if the global generation is newer +than generation of the module then __tls_get_addr keeps hitting the +slow dtv update path. The dtv update path includes a number of checks +to see if any update is needed and this already causes measurable tls +access slow down after dlopen. + +It may be possible to detect up-to-date dtv faster. But if there are +many modules loaded (> TLS_SLOTINFO_SURPLUS) then this requires at +least walking the slotinfo list. + +This patch tries to update the dtv to the global generation instead, so +after a dlopen the tls access slow path is only hit once. The modules +with larger generation than the accessed one were not necessarily +synchronized before, so additional synchronization is needed. + +This patch uses acquire/release synchronization when accessing the +generation counter. + +Note: in the x86_64 version of dl-tls.c the generation is only loaded +once, since relaxed mo is not faster than acquire mo load. + +I have not benchmarked this. Tested by Adhemerval Zanella on aarch64, +powerpc, sparc, x86 who reported that it fixes the performance issue +of bug 19924. + +Reviewed-by: Adhemerval Zanella + +[rebased to c8s by DJ] + +diff -rup a/elf/dl-close.c b/elf/dl-close.c +--- a/elf/dl-close.c 2023-10-13 16:24:27.068217519 -0400 ++++ b/elf/dl-close.c 2023-10-13 16:28:59.936019397 -0400 +@@ -739,7 +739,7 @@ _dl_close_worker (struct link_map *map, + if (__glibc_unlikely (newgen == 0)) + _dl_fatal_printf ("TLS generation counter wrapped! Please report as described in "REPORT_BUGS_TO".\n"); + /* Can be read concurrently. */ +- atomic_store_relaxed (&GL(dl_tls_generation), newgen); ++ atomic_store_release (&GL(dl_tls_generation), newgen); + + if (tls_free_end == GL(dl_tls_static_used)) + GL(dl_tls_static_used) = tls_free_start; +diff -rup a/elf/dl-open.c b/elf/dl-open.c +--- a/elf/dl-open.c 2023-10-13 16:24:26.930212160 -0400 ++++ b/elf/dl-open.c 2023-10-13 16:28:59.936019397 -0400 +@@ -403,7 +403,7 @@ update_tls_slotinfo (struct link_map *ne + _dl_fatal_printf (N_("\ + TLS generation counter wrapped! Please report this.")); + /* Can be read concurrently. 
*/ +- atomic_store_relaxed (&GL(dl_tls_generation), newgen); ++ atomic_store_release (&GL(dl_tls_generation), newgen); + + /* We need a second pass for static tls data, because + _dl_update_slotinfo must not be run while calls to +@@ -420,8 +420,8 @@ TLS generation counter wrapped! Please + now, but we can delay updating the DTV. */ + imap->l_need_tls_init = 0; + #ifdef SHARED +- /* Update the slot information data for at least the +- generation of the DSO we are allocating data for. */ ++ /* Update the slot information data for the current ++ generation. */ + + /* FIXME: This can terminate the process on memory + allocation failure. It is not possible to raise +@@ -429,7 +429,7 @@ TLS generation counter wrapped! Please + _dl_update_slotinfo would have to be split into two + operations, similar to resize_scopes and update_scopes + above. This is related to bug 16134. */ +- _dl_update_slotinfo (imap->l_tls_modid); ++ _dl_update_slotinfo (imap->l_tls_modid, newgen); + #endif + + GL(dl_init_static_tls) (imap); +diff -rup a/elf/dl-reloc.c b/elf/dl-reloc.c +--- a/elf/dl-reloc.c 2023-10-13 16:24:26.390191189 -0400 ++++ b/elf/dl-reloc.c 2023-10-13 16:28:59.937019438 -0400 +@@ -111,11 +111,11 @@ _dl_try_allocate_static_tls (struct link + if (map->l_real->l_relocated) + { + #ifdef SHARED ++ /* Update the DTV of the current thread. Note: GL(dl_load_tls_lock) ++ is held here so normal load of the generation counter is valid. */ + if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation), + 0)) +- /* Update the slot information data for at least the generation of +- the DSO we are allocating data for. */ +- (void) _dl_update_slotinfo (map->l_tls_modid); ++ (void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation)); + #endif + + GL(dl_init_static_tls) (map); +diff -rup a/elf/dl-tls.c b/elf/dl-tls.c +--- a/elf/dl-tls.c 2023-10-13 16:24:26.564197946 -0400 ++++ b/elf/dl-tls.c 2023-10-13 16:28:59.937019438 -0400 +@@ -716,57 +716,57 @@ allocate_and_init (struct link_map *map) + + + struct link_map * +-_dl_update_slotinfo (unsigned long int req_modid) ++_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen) + { + struct link_map *the_map = NULL; + dtv_t *dtv = THREAD_DTV (); + +- /* The global dl_tls_dtv_slotinfo array contains for each module +- index the generation counter current when the entry was created. ++ /* CONCURRENCY NOTES: ++ ++ The global dl_tls_dtv_slotinfo_list array contains for each module ++ index the generation counter current when that entry was updated. + This array never shrinks so that all module indices which were +- valid at some time can be used to access it. Before the first +- use of a new module index in this function the array was extended +- appropriately. Access also does not have to be guarded against +- modifications of the array. It is assumed that pointer-size +- values can be read atomically even in SMP environments. It is +- possible that other threads at the same time dynamically load +- code and therefore add to the slotinfo list. This is a problem +- since we must not pick up any information about incomplete work. +- The solution to this is to ignore all dtv slots which were +- created after the one we are currently interested. We know that +- dynamic loading for this module is completed and this is the last +- load operation we know finished. */ +- unsigned long int idx = req_modid; ++ valid at some time can be used to access it. Concurrent loading ++ and unloading of modules can update slotinfo entries or extend ++ the array. 
The updates happen under the GL(dl_load_tls_lock) and ++ finish with the release store of the generation counter to ++ GL(dl_tls_generation) which is synchronized with the load of ++ new_gen in the caller. So updates up to new_gen are synchronized ++ but updates for later generations may not be. ++ ++ Here we update the thread dtv from old_gen (== dtv[0].counter) to ++ new_gen generation. For this, each dtv[i] entry is either set to ++ an unallocated state (set), or left unmodified (nop). Where (set) ++ may resize the dtv first if modid i >= dtv[-1].counter. The rules ++ for the decision between (set) and (nop) are ++ ++ (1) If slotinfo entry i is concurrently updated then either (set) ++ or (nop) is valid: TLS access cannot use dtv[i] unless it is ++ synchronized with a generation > new_gen. ++ ++ Otherwise, if the generation of slotinfo entry i is gen and the ++ loaded module for this entry is map then ++ ++ (2) If gen <= old_gen then do (nop). ++ ++ (3) If old_gen < gen <= new_gen then ++ (3.1) if map != 0 then (set) ++ (3.2) if map == 0 then either (set) or (nop). ++ ++ Note that (1) cannot be reliably detected, but since both actions ++ are valid it does not have to be. Only (2) and (3.1) cases need ++ to be distinguished for which relaxed mo access of gen and map is ++ enough: their value is synchronized when it matters. ++ ++ Note that a relaxed mo load may give an out-of-thin-air value since ++ it is used in decisions that can affect concurrent stores. But this ++ should only happen if the OOTA value causes UB that justifies the ++ concurrent store of the value. This is not expected to be an issue ++ in practice. */ + struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list); + +- while (idx >= listp->len) ++ if (dtv[0].counter < new_gen) + { +- idx -= listp->len; +- listp = listp->next; +- } +- +- if (dtv[0].counter < listp->slotinfo[idx].gen) +- { +- /* CONCURRENCY NOTES: +- +- Here the dtv needs to be updated to new_gen generation count. +- +- This code may be called during TLS access when GL(dl_load_tls_lock) +- is not held. In that case the user code has to synchronize with +- dlopen and dlclose calls of relevant modules. A module m is +- relevant if the generation of m <= new_gen and dlclose of m is +- synchronized: a memory access here happens after the dlopen and +- before the dlclose of relevant modules. The dtv entries for +- relevant modules need to be updated, other entries can be +- arbitrary. +- +- This e.g. means that the first part of the slotinfo list can be +- accessed race free, but the tail may be concurrently extended. +- Similarly relevant slotinfo entries can be read race free, but +- other entries are racy. However updating a non-relevant dtv +- entry does not affect correctness. For a relevant module m, +- max_modid >= modid of m. */ +- size_t new_gen = listp->slotinfo[idx].gen; + size_t total = 0; + size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx)); + assert (max_modid >= req_modid); +@@ -779,31 +779,33 @@ _dl_update_slotinfo (unsigned long int r + { + size_t modid = total + cnt; + +- /* Later entries are not relevant. */ ++ /* Case (1) for all later modids. */ + if (modid > max_modid) + break; + + size_t gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen); + ++ /* Case (1). */ + if (gen > new_gen) +- /* Not relevant. */ + continue; + +- /* If the entry is older than the current dtv layout we +- know we don't have to handle it. */ ++ /* Case (2) or (1). */ + if (gen <= dtv[0].counter) + continue; + ++ /* Case (3) or (1). 
*/ ++ + /* If there is no map this means the entry is empty. */ + struct link_map *map + = atomic_load_relaxed (&listp->slotinfo[cnt].map); + /* Check whether the current dtv array is large enough. */ + if (dtv[-1].counter < modid) + { ++ /* Case (3.2) or (1). */ + if (map == NULL) + continue; + +- /* Resize the dtv. */ ++ /* Resizing the dtv aborts on failure: bug 16134. */ + dtv = _dl_resize_dtv (dtv, max_modid); + + assert (modid <= dtv[-1].counter); +@@ -814,7 +816,7 @@ _dl_update_slotinfo (unsigned long int r + } + + /* If there is currently memory allocate for this +- dtv entry free it. */ ++ dtv entry free it. Note: this is not AS-safe. */ + /* XXX Ideally we will at some point create a memory + pool. */ + free (dtv[modid].pointer.to_free); +@@ -909,9 +911,9 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t + + static struct link_map * + __attribute_noinline__ +-update_get_addr (GET_ADDR_ARGS) ++update_get_addr (GET_ADDR_ARGS, size_t gen) + { +- struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE); ++ struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen); + dtv_t *dtv = THREAD_DTV (); + + void *p = dtv[GET_ADDR_MODULE].pointer.val; +@@ -941,12 +943,17 @@ __tls_get_addr (GET_ADDR_ARGS) + dtv_t *dtv = THREAD_DTV (); + + /* Update is needed if dtv[0].counter < the generation of the accessed +- module. The global generation counter is used here as it is easier +- to check. Synchronization for the relaxed MO access is guaranteed +- by user code, see CONCURRENCY NOTES in _dl_update_slotinfo. */ ++ module, but the global generation counter is easier to check (which ++ must be synchronized up to the generation of the accessed module by ++ user code doing the TLS access so relaxed mo read is enough). */ + size_t gen = atomic_load_relaxed (&GL(dl_tls_generation)); + if (__glibc_unlikely (dtv[0].counter != gen)) +- return update_get_addr (GET_ADDR_PARAM); ++ { ++ /* Update DTV up to the global generation, see CONCURRENCY NOTES ++ in _dl_update_slotinfo. */ ++ gen = atomic_load_acquire (&GL(dl_tls_generation)); ++ return update_get_addr (GET_ADDR_PARAM, gen); ++ } + + void *p = dtv[GET_ADDR_MODULE].pointer.val; + +diff -rup a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h +--- a/sysdeps/generic/ldsodefs.h 2023-10-13 16:24:27.136220160 -0400 ++++ b/sysdeps/generic/ldsodefs.h 2023-10-13 16:28:59.937019438 -0400 +@@ -1231,7 +1231,8 @@ extern void _dl_add_to_slotinfo (struct + + /* Update slot information data for at least the generation of the + module with the given index. 
*/ +-extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid) ++extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid, ++ size_t gen) + attribute_hidden; + + /* Look up the module's TLS block as for __tls_get_addr, +diff -rup a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c +--- a/sysdeps/x86_64/dl-tls.c 2023-10-13 16:24:24.948135189 -0400 ++++ b/sysdeps/x86_64/dl-tls.c 2023-10-13 16:28:59.938019479 -0400 +@@ -40,9 +40,9 @@ __tls_get_addr_slow (GET_ADDR_ARGS) + { + dtv_t *dtv = THREAD_DTV (); + +- size_t gen = atomic_load_relaxed (&GL(dl_tls_generation)); ++ size_t gen = atomic_load_acquire (&GL(dl_tls_generation)); + if (__glibc_unlikely (dtv[0].counter != gen)) +- return update_get_addr (GET_ADDR_PARAM); ++ return update_get_addr (GET_ADDR_PARAM, gen); + + return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL); + } diff --git a/glibc-RHEL-21997.patch b/glibc-RHEL-21997.patch new file mode 100644 index 0000000..865c508 --- /dev/null +++ b/glibc-RHEL-21997.patch @@ -0,0 +1,112 @@ +This downstream-only patch compensates for the missing backport of +commit 2d651eb9265d1366d7b9e881bfddd46db9c1ecc4 ("x86: Move +x86 processor cache info to cpu_features"). Without it, +ld.so --list-diagnostics prints values that have not been properly +initalized from CPUID data. + +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index 10ebadd819d9efff..d8421fab83ab08ac 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -19,31 +19,42 @@ + #include + #include + ++/* When building ld.so, do not export any of the variables. They are ++ only used for diagnostics and are not initialized during regular ++ operation. */ ++#if IS_IN (rtld) ++# define CACHEINFO_VARIABLE(name, initializer) \ ++ static long int name = initializer ++#else ++# define CACHEINFO_VARIABLE(name, initializer) \ ++ long int name attribute_hidden = initializer ++#endif ++ + /* Data cache size for use in memory and string routines, typically + L1 size, rounded to multiple of 256 bytes. */ +-long int __x86_data_cache_size_half attribute_hidden = 32 * 1024 / 2; +-long int __x86_data_cache_size attribute_hidden = 32 * 1024; ++CACHEINFO_VARIABLE (__x86_data_cache_size_half, 32 * 1024 / 2); ++CACHEINFO_VARIABLE (__x86_data_cache_size, 32 * 1024); + /* Similar to __x86_data_cache_size_half, but not rounded. */ +-long int __x86_raw_data_cache_size_half attribute_hidden = 32 * 1024 / 2; ++CACHEINFO_VARIABLE (__x86_raw_data_cache_size_half, 32 * 1024 / 2); + /* Similar to __x86_data_cache_size, but not rounded. */ +-long int __x86_raw_data_cache_size attribute_hidden = 32 * 1024; ++CACHEINFO_VARIABLE (__x86_raw_data_cache_size, 32 * 1024); + /* Shared cache size for use in memory and string routines, typically + L2 or L3 size, rounded to multiple of 256 bytes. */ +-long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; +-long int __x86_shared_cache_size attribute_hidden = 1024 * 1024; ++CACHEINFO_VARIABLE (__x86_shared_cache_size_half, 1024 * 1024 / 2); ++CACHEINFO_VARIABLE (__x86_shared_cache_size, 1024 * 1024); + /* Similar to __x86_shared_cache_size_half, but not rounded. */ +-long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; ++CACHEINFO_VARIABLE (__x86_raw_shared_cache_size_half, 1024 * 1024 / 2); + /* Similar to __x86_shared_cache_size, but not rounded. */ +-long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024; ++CACHEINFO_VARIABLE (__x86_raw_shared_cache_size, 1024 * 1024); + + /* Threshold to use non temporal store. 
*/ +-long int __x86_shared_non_temporal_threshold attribute_hidden; ++CACHEINFO_VARIABLE (__x86_shared_non_temporal_threshold, 0); + + /* Threshold to use Enhanced REP MOVSB. */ +-long int __x86_rep_movsb_threshold attribute_hidden = 2048; ++CACHEINFO_VARIABLE (__x86_rep_movsb_threshold, 2048); + + /* Threshold to use Enhanced REP STOSB. */ +-long int __x86_rep_stosb_threshold attribute_hidden = 2048; ++CACHEINFO_VARIABLE (__x86_rep_stosb_threshold, 2048); + + static void + get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr, +diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c +index 0ba286a828b69937..9215604ecf22344c 100644 +--- a/sysdeps/x86/dl-diagnostics-cpu.c ++++ b/sysdeps/x86/dl-diagnostics-cpu.c +@@ -19,6 +19,13 @@ + #include + #include + ++#include ++#include ++#include ++#include ++#include ++#include ++ + static void + print_cpu_features_value (const char *label, uint64_t value) + { +@@ -81,19 +88,21 @@ _dl_diagnostics_cpu (void) + #include "cpu-features-preferred_feature_index_1.def" + #undef BIT + ++ /* The cache information variables are only used for diagnostics and ++ are not initialized during startup. The values used at run time ++ are only in libc.so.6. */ ++ init_cacheinfo (); ++ + print_cpu_features_value ("xsave_state_size", + cpu_features->xsave_state_size); + print_cpu_features_value ("xsave_state_full_size", + cpu_features->xsave_state_full_size); +- print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size); +- print_cpu_features_value ("shared_cache_size", +- cpu_features->shared_cache_size); ++ print_cpu_features_value ("data_cache_size", __x86_data_cache_size); ++ print_cpu_features_value ("shared_cache_size", __x86_shared_cache_size); + print_cpu_features_value ("non_temporal_threshold", +- cpu_features->non_temporal_threshold); +- print_cpu_features_value ("rep_movsb_threshold", +- cpu_features->rep_movsb_threshold); +- print_cpu_features_value ("rep_stosb_threshold", +- cpu_features->rep_stosb_threshold); ++ __x86_shared_non_temporal_threshold); ++ print_cpu_features_value ("rep_movsb_threshold", __x86_rep_movsb_threshold); ++ print_cpu_features_value ("rep_stosb_threshold", __x86_rep_stosb_threshold); + _Static_assert (offsetof (struct cpu_features, rep_stosb_threshold) + + sizeof (cpu_features->rep_stosb_threshold) + == sizeof (*cpu_features), diff --git a/glibc-RHEL-3010-1.patch b/glibc-RHEL-3010-1.patch new file mode 100644 index 0000000..494ebfd --- /dev/null +++ b/glibc-RHEL-3010-1.patch @@ -0,0 +1,247 @@ +commit 103a469dc7755fd9e8ccf362f3dd4c55dc761908 +Author: Sajan Karumanchi +Date: Wed Jan 18 18:29:04 2023 +0100 + + x86: Cache computation for AMD architecture. + + All AMD architectures cache details will be computed based on + __cpuid__ `0x8000_001D` and the reference to __cpuid__ `0x8000_0006` will be + zeroed out for future architectures. + + Reviewed-by: Premachandra Mallappa + +Conflicts: + sysdeps/x86/dl-cacheinfo.h + (missing backport of commit 2d651eb9265d1366d7b9e881bfddd4 + ("x86: Move x86 processor cache info to cpu_features")) + +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index 572f753474ee0610..b6f111e6668cc212 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -266,10 +266,6 @@ static void + init_cacheinfo (void) + { + /* Find out what brand of processor. 
*/ +- unsigned int ebx; +- unsigned int ecx; +- unsigned int edx; +- int max_cpuid_ex; + long int data = -1; + long int shared = -1; + long int shared_per_thread = -1; +@@ -303,62 +299,14 @@ init_cacheinfo (void) + } + else if (cpu_features->basic.kind == arch_kind_amd) + { +- data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); +- long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE); +- shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); ++ data = handle_amd (_SC_LEVEL1_DCACHE_SIZE, cpu_features); ++ long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE, cpu_features); ++ shared = handle_amd (_SC_LEVEL3_CACHE_SIZE, cpu_features); + shared_per_thread = shared; + +- /* Get maximum extended function. */ +- __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx); +- + if (shared <= 0) + /* No shared L3 cache. All we have is the L2 cache. */ + shared = core; +- else +- { +- /* Figure out the number of logical threads that share L3. */ +- if (max_cpuid_ex >= 0x80000008) +- { +- /* Get width of APIC ID. */ +- __cpuid (0x80000008, max_cpuid_ex, ebx, ecx, edx); +- threads = 1 << ((ecx >> 12) & 0x0f); +- } +- +- if (threads == 0 || cpu_features->basic.family >= 0x17) +- { +- /* If APIC ID width is not available, use logical +- processor count. */ +- __cpuid (0x00000001, max_cpuid_ex, ebx, ecx, edx); +- +- if ((edx & (1 << 28)) != 0) +- threads = (ebx >> 16) & 0xff; +- } +- +- /* Cap usage of highest cache level to the number of +- supported threads. */ +- if (threads > 0) +- shared /= threads; +- +- /* Get shared cache per ccx for Zen architectures. */ +- if (cpu_features->basic.family >= 0x17) +- { +- unsigned int eax; +- +- /* Get number of threads share the L3 cache in CCX. */ +- __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx); +- +- unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1; +- shared *= threads_per_ccx; +- } +- else +- { +- /* Account for exclusive L2 and L3 caches. */ +- shared += core; +- } +- } +- +- if (shared_per_thread <= 0) +- shared_per_thread = shared; + } + + if (cpu_features->data_cache_size != 0) +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index b2b90074b0e98a60..294a7d8bfc564aef 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -311,117 +311,47 @@ handle_intel (int name, const struct cpu_features *cpu_features) + + + static long int __attribute__ ((noinline)) +-handle_amd (int name) ++handle_amd (int name, const struct cpu_features *cpu_features) + { + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; +- __cpuid (0x80000000, eax, ebx, ecx, edx); ++ unsigned int count = 0x1; + + /* No level 4 cache (yet). */ + if (name > _SC_LEVEL3_CACHE_LINESIZE) + return 0; + +- unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE); +- if (eax < fn) +- return 0; ++ if (name >= _SC_LEVEL3_CACHE_SIZE) ++ count = 0x3; ++ else if (name >= _SC_LEVEL2_CACHE_SIZE) ++ count = 0x2; ++ else if (name >= _SC_LEVEL1_DCACHE_SIZE) ++ count = 0x0; + +- __cpuid (fn, eax, ebx, ecx, edx); +- +- if (name < _SC_LEVEL1_DCACHE_SIZE) +- { +- name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE; +- ecx = edx; +- } ++ __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx); + + switch (name) + { +- case _SC_LEVEL1_DCACHE_SIZE: +- return (ecx >> 14) & 0x3fc00; +- +- case _SC_LEVEL1_DCACHE_ASSOC: +- ecx >>= 16; +- if ((ecx & 0xff) == 0xff) +- /* Fully associative. 
*/ +- return (ecx << 2) & 0x3fc00; +- return ecx & 0xff; +- +- case _SC_LEVEL1_DCACHE_LINESIZE: +- return ecx & 0xff; +- +- case _SC_LEVEL2_CACHE_SIZE: +- return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00; +- +- case _SC_LEVEL2_CACHE_ASSOC: +- switch ((ecx >> 12) & 0xf) +- { +- case 0: +- case 1: +- case 2: +- case 4: +- return (ecx >> 12) & 0xf; +- case 6: +- return 8; +- case 8: +- return 16; +- case 10: +- return 32; +- case 11: +- return 48; +- case 12: +- return 64; +- case 13: +- return 96; +- case 14: +- return 128; +- case 15: +- return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff); +- default: +- return 0; +- } +- /* NOTREACHED */ +- +- case _SC_LEVEL2_CACHE_LINESIZE: +- return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff; +- +- case _SC_LEVEL3_CACHE_SIZE: +- return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1; +- +- case _SC_LEVEL3_CACHE_ASSOC: +- switch ((edx >> 12) & 0xf) +- { +- case 0: +- case 1: +- case 2: +- case 4: +- return (edx >> 12) & 0xf; +- case 6: +- return 8; +- case 8: +- return 16; +- case 10: +- return 32; +- case 11: +- return 48; +- case 12: +- return 64; +- case 13: +- return 96; +- case 14: +- return 128; +- case 15: +- return ((edx & 0x3ffc0000) << 1) / (edx & 0xff); +- default: +- return 0; +- } +- /* NOTREACHED */ +- +- case _SC_LEVEL3_CACHE_LINESIZE: +- return (edx & 0xf000) == 0 ? 0 : edx & 0xff; +- +- default: +- assert (! "cannot happen"); ++ case _SC_LEVEL1_ICACHE_ASSOC: ++ case _SC_LEVEL1_DCACHE_ASSOC: ++ case _SC_LEVEL2_CACHE_ASSOC: ++ case _SC_LEVEL3_CACHE_ASSOC: ++ return ecx?((ebx >> 22) & 0x3ff) + 1 : 0; ++ case _SC_LEVEL1_ICACHE_LINESIZE: ++ case _SC_LEVEL1_DCACHE_LINESIZE: ++ case _SC_LEVEL2_CACHE_LINESIZE: ++ case _SC_LEVEL3_CACHE_LINESIZE: ++ return ecx?(ebx & 0xfff) + 1 : 0; ++ case _SC_LEVEL1_ICACHE_SIZE: ++ case _SC_LEVEL1_DCACHE_SIZE: ++ case _SC_LEVEL2_CACHE_SIZE: ++ case _SC_LEVEL3_CACHE_SIZE: ++ return ecx?(((ebx >> 22) & 0x3ff) + 1)*((ebx & 0xfff) + 1)\ ++ *(ecx + 1):0; ++ default: ++ assert (! "cannot happen"); + } + return -1; + } diff --git a/glibc-RHEL-3010-2.patch b/glibc-RHEL-3010-2.patch new file mode 100644 index 0000000..26a42d9 --- /dev/null +++ b/glibc-RHEL-3010-2.patch @@ -0,0 +1,85 @@ +commit 856bab7717ef6d1033fd7cbf7cfb2ddefbfffb07 +Author: Andreas Schwab +Date: Thu Feb 9 14:56:21 2023 +0100 + + x86/dl-cacheinfo: remove unsused parameter from handle_amd + + Also replace an unreachable assert with __builtin_unreachable. 
+ +Conflicts: + sysdeps/x86/dl-cacheinfo.h + (missing backport of commit 2d651eb9265d1366d7b9e881bfddd4 + ("x86: Move x86 processor cache info to cpu_features")) + +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index b6f111e6668cc212..85e5731281c62503 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -299,9 +299,9 @@ init_cacheinfo (void) + } + else if (cpu_features->basic.kind == arch_kind_amd) + { +- data = handle_amd (_SC_LEVEL1_DCACHE_SIZE, cpu_features); +- long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE, cpu_features); +- shared = handle_amd (_SC_LEVEL3_CACHE_SIZE, cpu_features); ++ data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); ++ long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE); ++ shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); + shared_per_thread = shared; + + if (shared <= 0) +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 294a7d8bfc564aef..74cd5072a9d10756 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -311,7 +311,7 @@ handle_intel (int name, const struct cpu_features *cpu_features) + + + static long int __attribute__ ((noinline)) +-handle_amd (int name, const struct cpu_features *cpu_features) ++handle_amd (int name) + { + unsigned int eax; + unsigned int ebx; +@@ -334,24 +334,23 @@ handle_amd (int name, const struct cpu_features *cpu_features) + + switch (name) + { +- case _SC_LEVEL1_ICACHE_ASSOC: +- case _SC_LEVEL1_DCACHE_ASSOC: +- case _SC_LEVEL2_CACHE_ASSOC: +- case _SC_LEVEL3_CACHE_ASSOC: +- return ecx?((ebx >> 22) & 0x3ff) + 1 : 0; +- case _SC_LEVEL1_ICACHE_LINESIZE: +- case _SC_LEVEL1_DCACHE_LINESIZE: +- case _SC_LEVEL2_CACHE_LINESIZE: +- case _SC_LEVEL3_CACHE_LINESIZE: +- return ecx?(ebx & 0xfff) + 1 : 0; +- case _SC_LEVEL1_ICACHE_SIZE: +- case _SC_LEVEL1_DCACHE_SIZE: +- case _SC_LEVEL2_CACHE_SIZE: +- case _SC_LEVEL3_CACHE_SIZE: +- return ecx?(((ebx >> 22) & 0x3ff) + 1)*((ebx & 0xfff) + 1)\ +- *(ecx + 1):0; +- default: +- assert (! "cannot happen"); ++ case _SC_LEVEL1_ICACHE_ASSOC: ++ case _SC_LEVEL1_DCACHE_ASSOC: ++ case _SC_LEVEL2_CACHE_ASSOC: ++ case _SC_LEVEL3_CACHE_ASSOC: ++ return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0; ++ case _SC_LEVEL1_ICACHE_LINESIZE: ++ case _SC_LEVEL1_DCACHE_LINESIZE: ++ case _SC_LEVEL2_CACHE_LINESIZE: ++ case _SC_LEVEL3_CACHE_LINESIZE: ++ return ecx ? (ebx & 0xfff) + 1 : 0; ++ case _SC_LEVEL1_ICACHE_SIZE: ++ case _SC_LEVEL1_DCACHE_SIZE: ++ case _SC_LEVEL2_CACHE_SIZE: ++ case _SC_LEVEL3_CACHE_SIZE: ++ return ecx ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1): 0; ++ default: ++ __builtin_unreachable (); + } + return -1; + } diff --git a/glibc-RHEL-3010-3.patch b/glibc-RHEL-3010-3.patch new file mode 100644 index 0000000..05022a2 --- /dev/null +++ b/glibc-RHEL-3010-3.patch @@ -0,0 +1,280 @@ +commit dcad5c8578130dec7f35fd5b0885304b59f9f543 +Author: Sajan Karumanchi +Date: Tue Aug 1 15:20:55 2023 +0000 + + x86: Fix for cache computation on AMD legacy cpus. + + Some legacy AMD CPUs and hypervisors have the _cpuid_ '0x8000_001D' + set to Zero, thus resulting in zeroed-out computed cache values. + This patch reintroduces the old way of cache computation as a + fail-safe option to handle these exceptions. + Fixed 'level4_cache_size' value through handle_amd(). 
+ + Reviewed-by: Premachandra Mallappa + Tested-by: Florian Weimer + +Conflicts: + sysdeps/x86/dl-cacheinfo.h + (missing backport of commit 2d651eb9265d1366d7b9e881bfddd4 + ("x86: Move x86 processor cache info to cpu_features")) + +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index 85e5731281c62503..10ebadd819d9efff 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -302,11 +302,19 @@ init_cacheinfo (void) + data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); + long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE); + shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); +- shared_per_thread = shared; + + if (shared <= 0) +- /* No shared L3 cache. All we have is the L2 cache. */ +- shared = core; ++ { ++ /* No shared L3 cache. All we have is the L2 cache. */ ++ shared = core; ++ } ++ else if (cpu_features->basic.family < 0x17) ++ { ++ /* Account for exclusive L2 and L3 caches. */ ++ shared += core; ++ } ++ ++ shared_per_thread = shared; + } + + if (cpu_features->data_cache_size != 0) +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 74cd5072a9d10756..75a6b1dfde199dd7 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -315,40 +315,206 @@ handle_amd (int name) + { + unsigned int eax; + unsigned int ebx; +- unsigned int ecx; ++ unsigned int ecx = 0; + unsigned int edx; +- unsigned int count = 0x1; ++ unsigned int max_cpuid = 0; ++ unsigned int fn = 0; + + /* No level 4 cache (yet). */ + if (name > _SC_LEVEL3_CACHE_LINESIZE) + return 0; + +- if (name >= _SC_LEVEL3_CACHE_SIZE) +- count = 0x3; +- else if (name >= _SC_LEVEL2_CACHE_SIZE) +- count = 0x2; +- else if (name >= _SC_LEVEL1_DCACHE_SIZE) +- count = 0x0; ++ __cpuid (0x80000000, max_cpuid, ebx, ecx, edx); ++ ++ if (max_cpuid >= 0x8000001D) ++ /* Use __cpuid__ '0x8000_001D' to compute cache details. */ ++ { ++ unsigned int count = 0x1; ++ ++ if (name >= _SC_LEVEL3_CACHE_SIZE) ++ count = 0x3; ++ else if (name >= _SC_LEVEL2_CACHE_SIZE) ++ count = 0x2; ++ else if (name >= _SC_LEVEL1_DCACHE_SIZE) ++ count = 0x0; ++ ++ __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx); ++ ++ if (ecx != 0) ++ { ++ switch (name) ++ { ++ case _SC_LEVEL1_ICACHE_ASSOC: ++ case _SC_LEVEL1_DCACHE_ASSOC: ++ case _SC_LEVEL2_CACHE_ASSOC: ++ case _SC_LEVEL3_CACHE_ASSOC: ++ return ((ebx >> 22) & 0x3ff) + 1; ++ case _SC_LEVEL1_ICACHE_LINESIZE: ++ case _SC_LEVEL1_DCACHE_LINESIZE: ++ case _SC_LEVEL2_CACHE_LINESIZE: ++ case _SC_LEVEL3_CACHE_LINESIZE: ++ return (ebx & 0xfff) + 1; ++ case _SC_LEVEL1_ICACHE_SIZE: ++ case _SC_LEVEL1_DCACHE_SIZE: ++ case _SC_LEVEL2_CACHE_SIZE: ++ case _SC_LEVEL3_CACHE_SIZE: ++ return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1); ++ default: ++ __builtin_unreachable (); ++ } ++ return -1; ++ } ++ } + +- __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx); ++ /* Legacy cache computation for CPUs prior to Bulldozer family. ++ This is also a fail-safe mechanism for some hypervisors that ++ accidentally configure __cpuid__ '0x8000_001D' to Zero. 
*/ ++ ++ fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE); ++ ++ if (max_cpuid < fn) ++ return 0; ++ ++ __cpuid (fn, eax, ebx, ecx, edx); ++ ++ if (name < _SC_LEVEL1_DCACHE_SIZE) ++ { ++ name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE; ++ ecx = edx; ++ } + + switch (name) + { +- case _SC_LEVEL1_ICACHE_ASSOC: +- case _SC_LEVEL1_DCACHE_ASSOC: +- case _SC_LEVEL2_CACHE_ASSOC: ++ case _SC_LEVEL1_DCACHE_SIZE: ++ return (ecx >> 14) & 0x3fc00; ++ ++ case _SC_LEVEL1_DCACHE_ASSOC: ++ ecx >>= 16; ++ if ((ecx & 0xff) == 0xff) ++ { ++ /* Fully associative. */ ++ return (ecx << 2) & 0x3fc00; ++ } ++ return ecx & 0xff; ++ ++ case _SC_LEVEL1_DCACHE_LINESIZE: ++ return ecx & 0xff; ++ ++ case _SC_LEVEL2_CACHE_SIZE: ++ return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00; ++ ++ case _SC_LEVEL2_CACHE_ASSOC: ++ switch ((ecx >> 12) & 0xf) ++ { ++ case 0: ++ case 1: ++ case 2: ++ case 4: ++ return (ecx >> 12) & 0xf; ++ case 6: ++ return 8; ++ case 8: ++ return 16; ++ case 10: ++ return 32; ++ case 11: ++ return 48; ++ case 12: ++ return 64; ++ case 13: ++ return 96; ++ case 14: ++ return 128; ++ case 15: ++ return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff); ++ default: ++ return 0; ++ } ++ ++ case _SC_LEVEL2_CACHE_LINESIZE: ++ return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff; ++ ++ case _SC_LEVEL3_CACHE_SIZE: ++ { ++ long int total_l3_cache = 0, l3_cache_per_thread = 0; ++ unsigned int threads = 0; ++ const struct cpu_features *cpu_features; ++ ++ if ((edx & 0xf000) == 0) ++ return 0; ++ ++ total_l3_cache = (edx & 0x3ffc0000) << 1; ++ cpu_features = __get_cpu_features (); ++ ++ /* Figure out the number of logical threads that share L3. */ ++ if (max_cpuid >= 0x80000008) ++ { ++ /* Get width of APIC ID. */ ++ __cpuid (0x80000008, eax, ebx, ecx, edx); ++ threads = (ecx & 0xff) + 1; ++ } ++ ++ if (threads == 0) ++ { ++ /* If APIC ID width is not available, use logical ++ processor count. */ ++ __cpuid (0x00000001, eax, ebx, ecx, edx); ++ if ((edx & (1 << 28)) != 0) ++ threads = (ebx >> 16) & 0xff; ++ } ++ ++ /* Cap usage of highest cache level to the number of ++ supported threads. */ ++ if (threads > 0) ++ l3_cache_per_thread = total_l3_cache/threads; ++ ++ /* Get shared cache per ccx for Zen architectures. */ ++ if (cpu_features->basic.family >= 0x17) ++ { ++ long int l3_cache_per_ccx = 0; ++ /* Get number of threads share the L3 cache in CCX. */ ++ __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx); ++ unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1; ++ l3_cache_per_ccx = l3_cache_per_thread * threads_per_ccx; ++ return l3_cache_per_ccx; ++ } ++ else ++ { ++ return l3_cache_per_thread; ++ } ++ } ++ + case _SC_LEVEL3_CACHE_ASSOC: +- return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0; +- case _SC_LEVEL1_ICACHE_LINESIZE: +- case _SC_LEVEL1_DCACHE_LINESIZE: +- case _SC_LEVEL2_CACHE_LINESIZE: ++ switch ((edx >> 12) & 0xf) ++ { ++ case 0: ++ case 1: ++ case 2: ++ case 4: ++ return (edx >> 12) & 0xf; ++ case 6: ++ return 8; ++ case 8: ++ return 16; ++ case 10: ++ return 32; ++ case 11: ++ return 48; ++ case 12: ++ return 64; ++ case 13: ++ return 96; ++ case 14: ++ return 128; ++ case 15: ++ return ((edx & 0x3ffc0000) << 1) / (edx & 0xff); ++ default: ++ return 0; ++ } ++ + case _SC_LEVEL3_CACHE_LINESIZE: +- return ecx ? (ebx & 0xfff) + 1 : 0; +- case _SC_LEVEL1_ICACHE_SIZE: +- case _SC_LEVEL1_DCACHE_SIZE: +- case _SC_LEVEL2_CACHE_SIZE: +- case _SC_LEVEL3_CACHE_SIZE: +- return ecx ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1): 0; ++ return (edx & 0xf000) == 0 ? 
0 : edx & 0xff; ++ + default: + __builtin_unreachable (); + } diff --git a/glibc-RHEL-31804.patch b/glibc-RHEL-31804.patch new file mode 100644 index 0000000..21f8672 --- /dev/null +++ b/glibc-RHEL-31804.patch @@ -0,0 +1,203 @@ +Author: Charles Fol +Date: Thu Mar 28 12:25:38 2024 -0300 + + iconv: ISO-2022-CN-EXT: fix out-of-bound writes when writing escape sequence (CVE-2024-2961) + + ISO-2022-CN-EXT uses escape sequences to indicate character set changes + (as specified by RFC 1922). While the SOdesignation has the expected + bounds checks, neither SS2designation nor SS3designation have its; + allowing a write overflow of 1, 2, or 3 bytes with fixed values: + '$+I', '$+J', '$+K', '$+L', '$+M', or '$*H'. + + Checked on aarch64-linux-gnu. + + Co-authored-by: Adhemerval Zanella + Reviewed-by: Carlos O'Donell + Tested-by: Carlos O'Donell + +diff --git a/iconvdata/Makefile b/iconvdata/Makefile +index 646e2ccd11478646..c959758a90ed954f 100644 +--- a/iconvdata/Makefile ++++ b/iconvdata/Makefile +@@ -75,7 +75,7 @@ ifeq (yes,$(build-shared)) + tests = bug-iconv1 bug-iconv2 tst-loading tst-e2big tst-iconv4 bug-iconv4 \ + tst-iconv6 bug-iconv5 bug-iconv6 tst-iconv7 bug-iconv8 bug-iconv9 \ + bug-iconv10 bug-iconv11 bug-iconv12 bug-iconv13 bug-iconv14 \ +- bug-iconv15 ++ bug-iconv15 tst-iconv-iso-2022-cn-ext + ifeq ($(have-thread-library),yes) + tests += bug-iconv3 + endif +@@ -325,6 +325,8 @@ $(objpfx)bug-iconv14.out: $(addprefix $(objpfx), $(gconv-modules)) \ + $(addprefix $(objpfx),$(modules.so)) + $(objpfx)bug-iconv15.out: $(addprefix $(objpfx), $(gconv-modules)) \ + $(addprefix $(objpfx),$(modules.so)) ++$(objpfx)tst-iconv-iso-2022-cn-ext.out: $(addprefix $(objpfx), $(gconv-modules)) \ ++ $(addprefix $(objpfx),$(modules.so)) + + $(objpfx)iconv-test.out: run-iconv-test.sh \ + $(addprefix $(objpfx), $(gconv-modules)) \ +diff --git a/iconvdata/iso-2022-cn-ext.c b/iconvdata/iso-2022-cn-ext.c +index c21a7187b4d7808e..bd9493c12d95070b 100644 +--- a/iconvdata/iso-2022-cn-ext.c ++++ b/iconvdata/iso-2022-cn-ext.c +@@ -575,6 +575,12 @@ DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized"); + { \ + const char *escseq; \ + \ ++ if (outptr + 4 > outend) \ ++ { \ ++ result = __GCONV_FULL_OUTPUT; \ ++ break; \ ++ } \ ++ \ + assert (used == CNS11643_2_set); /* XXX */ \ + escseq = "*H"; \ + *outptr++ = ESC; \ +@@ -588,6 +594,12 @@ DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized"); + { \ + const char *escseq; \ + \ ++ if (outptr + 4 > outend) \ ++ { \ ++ result = __GCONV_FULL_OUTPUT; \ ++ break; \ ++ } \ ++ \ + assert ((used >> 5) >= 3 && (used >> 5) <= 7); \ + escseq = "+I+J+K+L+M" + ((used >> 5) - 3) * 2; \ + *outptr++ = ESC; \ +diff --git a/iconvdata/tst-iconv-iso-2022-cn-ext.c b/iconvdata/tst-iconv-iso-2022-cn-ext.c +new file mode 100644 +index 0000000000000000..96a8765fd5369681 +--- /dev/null ++++ b/iconvdata/tst-iconv-iso-2022-cn-ext.c +@@ -0,0 +1,128 @@ ++/* Verify ISO-2022-CN-EXT does not write out of the bounds. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++/* The test sets up a two memory page buffer with the second page marked ++ PROT_NONE to trigger a fault if the conversion writes beyond the exact ++ expected amount. Then we carry out various conversions and precisely ++ place the start of the output buffer in order to trigger a SIGSEGV if the ++ process writes anywhere between 1 and page sized bytes more (only one ++ PROT_NONE page is setup as a canary) than expected. These tests exercise ++ all three of the cases in ISO-2022-CN-EXT where the converter must switch ++ character sets and may run out of buffer space while doing the ++ operation. */ ++ ++static int ++do_test (void) ++{ ++ iconv_t cd = iconv_open ("ISO-2022-CN-EXT", "UTF-8"); ++ TEST_VERIFY_EXIT (cd != (iconv_t) -1); ++ ++ char *ntf; ++ size_t ntfsize; ++ char *outbufbase; ++ { ++ int pgz = getpagesize (); ++ TEST_VERIFY_EXIT (pgz > 0); ++ ntfsize = 2 * pgz; ++ ++ ntf = xmmap (NULL, ntfsize, PROT_READ | PROT_WRITE, MAP_PRIVATE ++ | MAP_ANONYMOUS, -1); ++ xmprotect (ntf + pgz, pgz, PROT_NONE); ++ ++ outbufbase = ntf + pgz; ++ } ++ ++ /* Check if SOdesignation escape sequence does not trigger an OOB write. */ ++ { ++ char inbuf[] = "\xe4\xba\xa4\xe6\x8d\xa2"; ++ ++ for (int i = 0; i < 9; i++) ++ { ++ char *inp = inbuf; ++ size_t inleft = sizeof (inbuf) - 1; ++ ++ char *outp = outbufbase - i; ++ size_t outleft = i; ++ ++ TEST_VERIFY_EXIT (iconv (cd, &inp, &inleft, &outp, &outleft) ++ == (size_t) -1); ++ TEST_COMPARE (errno, E2BIG); ++ ++ TEST_VERIFY_EXIT (iconv (cd, NULL, NULL, NULL, NULL) == 0); ++ } ++ } ++ ++ /* Same as before for SS2designation. */ ++ { ++ char inbuf[] = "㴽 \xe3\xb4\xbd"; ++ ++ for (int i = 0; i < 14; i++) ++ { ++ char *inp = inbuf; ++ size_t inleft = sizeof (inbuf) - 1; ++ ++ char *outp = outbufbase - i; ++ size_t outleft = i; ++ ++ TEST_VERIFY_EXIT (iconv (cd, &inp, &inleft, &outp, &outleft) ++ == (size_t) -1); ++ TEST_COMPARE (errno, E2BIG); ++ ++ TEST_VERIFY_EXIT (iconv (cd, NULL, NULL, NULL, NULL) == 0); ++ } ++ } ++ ++ /* Same as before for SS3designation. */ ++ { ++ char inbuf[] = "劄 \xe5\x8a\x84"; ++ ++ for (int i = 0; i < 14; i++) ++ { ++ char *inp = inbuf; ++ size_t inleft = sizeof (inbuf) - 1; ++ ++ char *outp = outbufbase - i; ++ size_t outleft = i; ++ ++ TEST_VERIFY_EXIT (iconv (cd, &inp, &inleft, &outp, &outleft) ++ == (size_t) -1); ++ TEST_COMPARE (errno, E2BIG); ++ ++ TEST_VERIFY_EXIT (iconv (cd, NULL, NULL, NULL, NULL) == 0); ++ } ++ } ++ ++ TEST_VERIFY_EXIT (iconv_close (cd) != -1); ++ ++ xmunmap (ntf, ntfsize); ++ ++ return 0; ++} ++ ++#include diff --git a/glibc-RHEL-34264.patch b/glibc-RHEL-34264.patch new file mode 100644 index 0000000..550ae38 --- /dev/null +++ b/glibc-RHEL-34264.patch @@ -0,0 +1,31 @@ +commit 87801a8fd06db1d654eea3e4f7626ff476a9bdaa +Author: Florian Weimer +Date: Thu Apr 25 15:00:45 2024 +0200 + + CVE-2024-33599: nscd: Stack-based buffer overflow in netgroup cache (bug 31677) + + Using alloca matches what other caches do. The request length is + bounded by MAXKEYLEN. 
+ + Reviewed-by: Carlos O'Donell + +diff --git a/nscd/netgroupcache.c b/nscd/netgroupcache.c +index 5ee4413ef9384ec9..60c8225639a33b6b 100644 +--- a/nscd/netgroupcache.c ++++ b/nscd/netgroupcache.c +@@ -503,12 +503,13 @@ addinnetgrX (struct database_dyn *db, int fd, request_header *req, + = (struct indataset *) mempool_alloc (db, + sizeof (*dataset) + req->key_len, + 1); +- struct indataset dataset_mem; + bool cacheable = true; + if (__glibc_unlikely (dataset == NULL)) + { + cacheable = false; +- dataset = &dataset_mem; ++ /* The alloca is safe because nscd_run_worker verfies that ++ key_len is not larger than MAXKEYLEN. */ ++ dataset = alloca (sizeof (*dataset) + req->key_len); + } + + datahead_init_pos (&dataset->head, sizeof (*dataset) + req->key_len, diff --git a/glibc-RHEL-34267-1.patch b/glibc-RHEL-34267-1.patch new file mode 100644 index 0000000..8cd4d05 --- /dev/null +++ b/glibc-RHEL-34267-1.patch @@ -0,0 +1,52 @@ +commit 7835b00dbce53c3c87bbbb1754a95fb5e58187aa +Author: Florian Weimer +Date: Thu Apr 25 15:01:07 2024 +0200 + + CVE-2024-33600: nscd: Do not send missing not-found response in addgetnetgrentX (bug 31678) + + If we failed to add a not-found response to the cache, the dataset + point can be null, resulting in a null pointer dereference. + + Reviewed-by: Siddhesh Poyarekar + +diff --git a/nscd/netgroupcache.c b/nscd/netgroupcache.c +index 60c8225639a33b6b..a3e04b4c43e6acae 100644 +--- a/nscd/netgroupcache.c ++++ b/nscd/netgroupcache.c +@@ -148,7 +148,7 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + /* No such service. */ + cacheable = do_notfound (db, fd, req, key, &dataset, &total, &timeout, + &key_copy); +- goto writeout; ++ goto maybe_cache_add; + } + + memset (&data, '\0', sizeof (data)); +@@ -349,7 +349,7 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + { + cacheable = do_notfound (db, fd, req, key, &dataset, &total, &timeout, + &key_copy); +- goto writeout; ++ goto maybe_cache_add; + } + + total = buffilled; +@@ -411,14 +411,12 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + } + + if (he == NULL && fd != -1) +- { +- /* We write the dataset before inserting it to the database +- since while inserting this thread might block and so would +- unnecessarily let the receiver wait. */ +- writeout: ++ /* We write the dataset before inserting it to the database since ++ while inserting this thread might block and so would ++ unnecessarily let the receiver wait. */ + writeall (fd, &dataset->resp, dataset->head.recsize); +- } + ++ maybe_cache_add: + if (cacheable) + { + /* If necessary, we also propagate the data to disk. */ diff --git a/glibc-RHEL-34267-2.patch b/glibc-RHEL-34267-2.patch new file mode 100644 index 0000000..6f1b7a1 --- /dev/null +++ b/glibc-RHEL-34267-2.patch @@ -0,0 +1,53 @@ +commit b048a482f088e53144d26a61c390bed0210f49f2 +Author: Florian Weimer +Date: Thu Apr 25 15:01:07 2024 +0200 + + CVE-2024-33600: nscd: Avoid null pointer crashes after notfound response (bug 31678) + + The addgetnetgrentX call in addinnetgrX may have failed to produce + a result, so the result variable in addinnetgrX can be NULL. + Use db->negtimeout as the fallback value if there is no result data; + the timeout is also overwritten below. + + Also avoid sending a second not-found response. (The client + disconnects after receiving the first response, so the data stream did + not go out of sync even without this fix.) 
It is still beneficial to + add the negative response to the mapping, so that the client can get + it from there in the future, instead of going through the socket. + + Reviewed-by: Siddhesh Poyarekar + +diff --git a/nscd/netgroupcache.c b/nscd/netgroupcache.c +index a3e04b4c43e6acae..f656872ae8c3b888 100644 +--- a/nscd/netgroupcache.c ++++ b/nscd/netgroupcache.c +@@ -512,14 +512,15 @@ addinnetgrX (struct database_dyn *db, int fd, request_header *req, + + datahead_init_pos (&dataset->head, sizeof (*dataset) + req->key_len, + sizeof (innetgroup_response_header), +- he == NULL ? 0 : dh->nreloads + 1, result->head.ttl); ++ he == NULL ? 0 : dh->nreloads + 1, ++ result == NULL ? db->negtimeout : result->head.ttl); + /* Set the notfound status and timeout based on the result from + getnetgrent. */ +- dataset->head.notfound = result->head.notfound; ++ dataset->head.notfound = result == NULL || result->head.notfound; + dataset->head.timeout = timeout; + + dataset->resp.version = NSCD_VERSION; +- dataset->resp.found = result->resp.found; ++ dataset->resp.found = result != NULL && result->resp.found; + /* Until we find a matching entry the result is 0. */ + dataset->resp.result = 0; + +@@ -567,7 +568,9 @@ addinnetgrX (struct database_dyn *db, int fd, request_header *req, + goto out; + } + +- if (he == NULL) ++ /* addgetnetgrentX may have already sent a notfound response. Do ++ not send another one. */ ++ if (he == NULL && dataset->resp.found) + { + /* We write the dataset before inserting it to the database + since while inserting this thread might block and so would diff --git a/glibc-RHEL-34273.patch b/glibc-RHEL-34273.patch new file mode 100644 index 0000000..f855206 --- /dev/null +++ b/glibc-RHEL-34273.patch @@ -0,0 +1,383 @@ +commit c04a21e050d64a1193a6daab872bca2528bda44b +Author: Florian Weimer +Date: Thu Apr 25 15:01:07 2024 +0200 + + CVE-2024-33601, CVE-2024-33602: nscd: netgroup: Use two buffers in addgetnetgrentX (bug 31680) + + This avoids potential memory corruption when the underlying NSS + callback function does not use the buffer space to store all strings + (e.g., for constant strings). + + Instead of custom buffer management, two scratch buffers are used. + This increases stack usage somewhat. + + Scratch buffer allocation failure is handled by return -1 + (an invalid timeout value) instead of terminating the process. + This fixes bug 31679. + + Reviewed-by: Siddhesh Poyarekar + +diff --git a/nscd/netgroupcache.c b/nscd/netgroupcache.c +index f656872ae8c3b888..dd180f8083e7c9f9 100644 +--- a/nscd/netgroupcache.c ++++ b/nscd/netgroupcache.c +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + + #include "../inet/netgroup.h" + #include "nscd.h" +@@ -66,6 +67,16 @@ struct dataset + char strdata[0]; + }; + ++/* Send a notfound response to FD. Always returns -1 to indicate an ++ ephemeral error. */ ++static time_t ++send_notfound (int fd) ++{ ++ if (fd != -1) ++ TEMP_FAILURE_RETRY (send (fd, ¬found, sizeof (notfound), MSG_NOSIGNAL)); ++ return -1; ++} ++ + /* Sends a notfound message and prepares a notfound dataset to write to the + cache. 
Returns true if there was enough memory to allocate the dataset and + returns the dataset in DATASETP, total bytes to write in TOTALP and the +@@ -84,8 +95,7 @@ do_notfound (struct database_dyn *db, int fd, request_header *req, + total = sizeof (notfound); + timeout = time (NULL) + db->negtimeout; + +- if (fd != -1) +- TEMP_FAILURE_RETRY (send (fd, ¬found, total, MSG_NOSIGNAL)); ++ send_notfound (fd); + + dataset = mempool_alloc (db, sizeof (struct dataset) + req->key_len, 1); + /* If we cannot permanently store the result, so be it. */ +@@ -110,11 +120,78 @@ do_notfound (struct database_dyn *db, int fd, request_header *req, + return cacheable; + } + ++struct addgetnetgrentX_scratch ++{ ++ /* This is the result that the caller should use. It can be NULL, ++ point into buffer, or it can be in the cache. */ ++ struct dataset *dataset; ++ ++ struct scratch_buffer buffer; ++ ++ /* Used internally in addgetnetgrentX as a staging area. */ ++ struct scratch_buffer tmp; ++ ++ /* Number of bytes in buffer that are actually used. */ ++ size_t buffer_used; ++}; ++ ++static void ++addgetnetgrentX_scratch_init (struct addgetnetgrentX_scratch *scratch) ++{ ++ scratch->dataset = NULL; ++ scratch_buffer_init (&scratch->buffer); ++ scratch_buffer_init (&scratch->tmp); ++ ++ /* Reserve space for the header. */ ++ scratch->buffer_used = sizeof (struct dataset); ++ static_assert (sizeof (struct dataset) < sizeof (scratch->tmp.__space), ++ "initial buffer space"); ++ memset (scratch->tmp.data, 0, sizeof (struct dataset)); ++} ++ ++static void ++addgetnetgrentX_scratch_free (struct addgetnetgrentX_scratch *scratch) ++{ ++ scratch_buffer_free (&scratch->buffer); ++ scratch_buffer_free (&scratch->tmp); ++} ++ ++/* Copy LENGTH bytes from S into SCRATCH. Returns NULL if SCRATCH ++ could not be resized, otherwise a pointer to the copy. */ ++static char * ++addgetnetgrentX_append_n (struct addgetnetgrentX_scratch *scratch, ++ const char *s, size_t length) ++{ ++ while (true) ++ { ++ size_t remaining = scratch->buffer.length - scratch->buffer_used; ++ if (remaining >= length) ++ break; ++ if (!scratch_buffer_grow_preserve (&scratch->buffer)) ++ return NULL; ++ } ++ char *copy = scratch->buffer.data + scratch->buffer_used; ++ memcpy (copy, s, length); ++ scratch->buffer_used += length; ++ return copy; ++} ++ ++/* Copy S into SCRATCH, including its null terminator. Returns false ++ if SCRATCH could not be resized. */ ++static bool ++addgetnetgrentX_append (struct addgetnetgrentX_scratch *scratch, const char *s) ++{ ++ if (s == NULL) ++ s = ""; ++ return addgetnetgrentX_append_n (scratch, s, strlen (s) + 1) != NULL; ++} ++ ++/* Caller must initialize and free *SCRATCH. If the return value is ++ negative, this function has sent a notfound response. 
*/ + static time_t + addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + const char *key, uid_t uid, struct hashentry *he, +- struct datahead *dh, struct dataset **resultp, +- void **tofreep) ++ struct datahead *dh, struct addgetnetgrentX_scratch *scratch) + { + if (__glibc_unlikely (debug_level > 0)) + { +@@ -133,14 +210,10 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + + char *key_copy = NULL; + struct __netgrent data; +- size_t buflen = MAX (1024, sizeof (*dataset) + req->key_len); +- size_t buffilled = sizeof (*dataset); +- char *buffer = NULL; + size_t nentries = 0; + size_t group_len = strlen (key) + 1; + struct name_list *first_needed + = alloca (sizeof (struct name_list) + group_len); +- *tofreep = NULL; + + if (netgroup_database == NULL + && __nss_database_lookup2 ("netgroup", NULL, NULL, &netgroup_database)) +@@ -152,8 +225,6 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + } + + memset (&data, '\0', sizeof (data)); +- buffer = xmalloc (buflen); +- *tofreep = buffer; + first_needed->next = first_needed; + memcpy (first_needed->name, key, group_len); + data.needed_groups = first_needed; +@@ -196,8 +267,8 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + while (1) + { + int e; +- status = getfct.f (&data, buffer + buffilled, +- buflen - buffilled - req->key_len, &e); ++ status = getfct.f (&data, scratch->tmp.data, ++ scratch->tmp.length, &e); + if (status == NSS_STATUS_SUCCESS) + { + if (data.type == triple_val) +@@ -205,68 +276,10 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + const char *nhost = data.val.triple.host; + const char *nuser = data.val.triple.user; + const char *ndomain = data.val.triple.domain; +- +- size_t hostlen = strlen (nhost ?: "") + 1; +- size_t userlen = strlen (nuser ?: "") + 1; +- size_t domainlen = strlen (ndomain ?: "") + 1; +- +- if (nhost == NULL || nuser == NULL || ndomain == NULL +- || nhost > nuser || nuser > ndomain) +- { +- const char *last = nhost; +- if (last == NULL +- || (nuser != NULL && nuser > last)) +- last = nuser; +- if (last == NULL +- || (ndomain != NULL && ndomain > last)) +- last = ndomain; +- +- size_t bufused +- = (last == NULL +- ? buffilled +- : last + strlen (last) + 1 - buffer); +- +- /* We have to make temporary copies. */ +- size_t needed = hostlen + userlen + domainlen; +- +- if (buflen - req->key_len - bufused < needed) +- { +- buflen += MAX (buflen, 2 * needed); +- /* Save offset in the old buffer. We don't +- bother with the NULL check here since +- we'll do that later anyway. */ +- size_t nhostdiff = nhost - buffer; +- size_t nuserdiff = nuser - buffer; +- size_t ndomaindiff = ndomain - buffer; +- +- char *newbuf = xrealloc (buffer, buflen); +- /* Fix up the triplet pointers into the new +- buffer. */ +- nhost = (nhost ? newbuf + nhostdiff +- : NULL); +- nuser = (nuser ? newbuf + nuserdiff +- : NULL); +- ndomain = (ndomain ? 
newbuf + ndomaindiff +- : NULL); +- *tofreep = buffer = newbuf; +- } +- +- nhost = memcpy (buffer + bufused, +- nhost ?: "", hostlen); +- nuser = memcpy ((char *) nhost + hostlen, +- nuser ?: "", userlen); +- ndomain = memcpy ((char *) nuser + userlen, +- ndomain ?: "", domainlen); +- } +- +- char *wp = buffer + buffilled; +- wp = memmove (wp, nhost ?: "", hostlen); +- wp += hostlen; +- wp = memmove (wp, nuser ?: "", userlen); +- wp += userlen; +- wp = memmove (wp, ndomain ?: "", domainlen); +- wp += domainlen; +- buffilled = wp - buffer; ++ if (!(addgetnetgrentX_append (scratch, nhost) ++ && addgetnetgrentX_append (scratch, nuser) ++ && addgetnetgrentX_append (scratch, ndomain))) ++ return send_notfound (fd); + ++nentries; + } + else +@@ -318,8 +331,8 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + } + else if (status == NSS_STATUS_TRYAGAIN && e == ERANGE) + { +- buflen *= 2; +- *tofreep = buffer = xrealloc (buffer, buflen); ++ if (!scratch_buffer_grow (&scratch->tmp)) ++ return send_notfound (fd); + } + else if (status == NSS_STATUS_RETURN + || status == NSS_STATUS_NOTFOUND +@@ -352,10 +365,17 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + goto maybe_cache_add; + } + +- total = buffilled; ++ /* Capture the result size without the key appended. */ ++ total = scratch->buffer_used; ++ ++ /* Make a copy of the key. The scratch buffer must not move after ++ this point. */ ++ key_copy = addgetnetgrentX_append_n (scratch, key, req->key_len); ++ if (key_copy == NULL) ++ return send_notfound (fd); + + /* Fill in the dataset. */ +- dataset = (struct dataset *) buffer; ++ dataset = scratch->buffer.data; + timeout = datahead_init_pos (&dataset->head, total + req->key_len, + total - offsetof (struct dataset, resp), + he == NULL ? 0 : dh->nreloads + 1, +@@ -364,11 +384,7 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + dataset->resp.version = NSCD_VERSION; + dataset->resp.found = 1; + dataset->resp.nresults = nentries; +- dataset->resp.result_len = buffilled - sizeof (*dataset); +- +- assert (buflen - buffilled >= req->key_len); +- key_copy = memcpy (buffer + buffilled, key, req->key_len); +- buffilled += req->key_len; ++ dataset->resp.result_len = total - sizeof (*dataset); + + /* Now we can determine whether on refill we have to create a new + record or not. */ +@@ -399,7 +415,7 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + if (__glibc_likely (newp != NULL)) + { + /* Adjust pointer into the memory block. */ +- key_copy = (char *) newp + (key_copy - buffer); ++ key_copy = (char *) newp + (key_copy - (char *) dataset); + + dataset = memcpy (newp, dataset, total + req->key_len); + cacheable = true; +@@ -440,7 +456,7 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, + } + + out: +- *resultp = dataset; ++ scratch->dataset = dataset; + + return timeout; + } +@@ -461,6 +477,9 @@ addinnetgrX (struct database_dyn *db, int fd, request_header *req, + if (user != NULL) + key = (char *) rawmemchr (key, '\0') + 1; + const char *domain = *key++ ? 
key : NULL; ++ struct addgetnetgrentX_scratch scratch; ++ ++ addgetnetgrentX_scratch_init (&scratch); + + if (__glibc_unlikely (debug_level > 0)) + { +@@ -476,12 +495,8 @@ addinnetgrX (struct database_dyn *db, int fd, request_header *req, + group, group_len, + db, uid); + time_t timeout; +- void *tofree; + if (result != NULL) +- { +- timeout = result->head.timeout; +- tofree = NULL; +- } ++ timeout = result->head.timeout; + else + { + request_header req_get = +@@ -490,7 +505,10 @@ addinnetgrX (struct database_dyn *db, int fd, request_header *req, + .key_len = group_len + }; + timeout = addgetnetgrentX (db, -1, &req_get, group, uid, NULL, NULL, +- &result, &tofree); ++ &scratch); ++ result = scratch.dataset; ++ if (timeout < 0) ++ goto out; + } + + struct indataset +@@ -604,7 +622,7 @@ addinnetgrX (struct database_dyn *db, int fd, request_header *req, + } + + out: +- free (tofree); ++ addgetnetgrentX_scratch_free (&scratch); + return timeout; + } + +@@ -614,11 +632,12 @@ addgetnetgrentX_ignore (struct database_dyn *db, int fd, request_header *req, + const char *key, uid_t uid, struct hashentry *he, + struct datahead *dh) + { +- struct dataset *ignore; +- void *tofree; +- time_t timeout = addgetnetgrentX (db, fd, req, key, uid, he, dh, +- &ignore, &tofree); +- free (tofree); ++ struct addgetnetgrentX_scratch scratch; ++ addgetnetgrentX_scratch_init (&scratch); ++ time_t timeout = addgetnetgrentX (db, fd, req, key, uid, he, dh, &scratch); ++ addgetnetgrentX_scratch_free (&scratch); ++ if (timeout < 0) ++ timeout = 0; + return timeout; + } + +@@ -662,5 +681,9 @@ readdinnetgr (struct database_dyn *db, struct hashentry *he, + .key_len = he->len + }; + +- return addinnetgrX (db, -1, &req, db->data + he->key, he->owner, he, dh); ++ int timeout = addinnetgrX (db, -1, &req, db->data + he->key, he->owner, ++ he, dh); ++ if (timeout < 0) ++ timeout = 0; ++ return timeout; + } diff --git a/glibc-RHEL-22846.patch b/glibc-RHEL-3639.patch similarity index 96% rename from glibc-RHEL-22846.patch rename to glibc-RHEL-3639.patch index 4179f74..e51ab13 100644 --- a/glibc-RHEL-22846.patch +++ b/glibc-RHEL-3639.patch @@ -20,7 +20,7 @@ Conflicts: (usual test differences, link test with -ldl) diff --git a/elf/Makefile b/elf/Makefile -index 6f0f36cdfe3961e8..ebf46a297d241d8f 100644 +index 634c3113227d64a6..42dc878209b11d29 100644 --- a/elf/Makefile +++ b/elf/Makefile @@ -362,6 +362,7 @@ tests += \ @@ -31,7 +31,7 @@ index 6f0f36cdfe3961e8..ebf46a297d241d8f 100644 tst-dlmodcount \ tst-dlmopen1 \ tst-dlmopen3 \ -@@ -711,6 +712,8 @@ modules-names = \ +@@ -709,6 +710,8 @@ modules-names = \ tst-deep1mod2 \ tst-deep1mod3 \ tst-dlmopen1mod \ @@ -40,19 +40,17 @@ index 6f0f36cdfe3961e8..ebf46a297d241d8f 100644 tst-dlmopen-dlerror-mod \ tst-dlmopen-gethostbyname-mod \ tst-dlmopen-twice-mod1 \ -@@ -2707,6 +2710,12 @@ $(objpfx)tst-dlmopen-twice.out: \ +@@ -2697,3 +2700,10 @@ $(objpfx)tst-dlmopen-twice: $(libdl) + $(objpfx)tst-dlmopen-twice.out: \ $(objpfx)tst-dlmopen-twice-mod1.so \ $(objpfx)tst-dlmopen-twice-mod2.so - ++ +LDFLAGS-tst-dlclose-lazy-mod1.so = -Wl,-z,lazy,--no-as-needed +$(objpfx)tst-dlclose-lazy-mod1.so: $(objpfx)tst-dlclose-lazy-mod2.so +$(objpfx)tst-dlclose-lazy: $(libdl) +$(objpfx)tst-dlclose-lazy.out: \ + $(objpfx)tst-dlclose-lazy-mod1.so $(objpfx)tst-dlclose-lazy-mod2.so + - # The object tst-nodeps1-mod.so has no explicit dependencies on libc.so. 
- $(objpfx)tst-nodeps1-mod.so: $(objpfx)tst-nodeps1-mod.os - $(LINK.o) -nostartfiles -nostdlib -shared -o $@ $^ diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c index 47acd134600b44b5..9e8f14b8483f5eba 100644 --- a/elf/dl-lookup.c diff --git a/glibc-2.28-Add-run-one-test-convenience-target-and-m.patch b/glibc-RHEL-3757.patch similarity index 65% rename from glibc-2.28-Add-run-one-test-convenience-target-and-m.patch rename to glibc-RHEL-3757.patch index 7cd6393..8902823 100644 --- a/glibc-2.28-Add-run-one-test-convenience-target-and-m.patch +++ b/glibc-RHEL-3757.patch @@ -1,33 +1,17 @@ -From bbc404e8f6e59aa808642c2a40e24a81744967e3 Mon Sep 17 00:00:00 2001 -From: caiyinyu -Date: Mon, 15 May 2023 12:00:50 +0800 -Subject: [PATCH 04/14] glibc-2.28: Add run-one-test convenience target and - makefile help text - -Reference: - - commit 2ac579f9c25388a7734948d77b03e4dd10f35334 - Author: DJ Delorie - Date: Mon Sep 30 16:04:52 2019 -0400 +commit 2ac579f9c25388a7734948d77b03e4dd10f35334 +Author: DJ Delorie +Date: Mon Sep 30 16:04:52 2019 -0400 Add run-one-test convenience target and makefile help text - + Adds "make test" for re-running just one test. Also adds "make help" for help with our Makefile targets, and adds a mini-help when you just run "make". - + Reviewed-by: Carlos O'Donell -Change-Id: I8c7ccf9a5ec4dc4afd4901d2f8f693677d0d94ea -Signed-off-by: ticat_fp ---- - Makefile | 22 ++++++++++++++++++++-- - Makefile.help | 42 ++++++++++++++++++++++++++++++++++++++++++ - 2 files changed, 62 insertions(+), 2 deletions(-) - create mode 100644 Makefile.help - diff --git a/Makefile b/Makefile -index 6d73241b..6518f62e 100644 +index 6d73241bbc811c13..6518f62ee0676b0d 100644 --- a/Makefile +++ b/Makefile @@ -26,8 +26,17 @@ include Makeconfig @@ -65,7 +49,7 @@ index 6d73241b..6518f62e 100644 + @cat $(objpfx)$t.out diff --git a/Makefile.help b/Makefile.help new file mode 100644 -index 00000000..319fdaa1 +index 0000000000000000..3b043bce013cc2b4 --- /dev/null +++ b/Makefile.help @@ -0,0 +1,42 @@ @@ -92,25 +76,22 @@ index 00000000..319fdaa1 +help-starts-here + +all -+ The usual default; builds everything but doesn't run the -+ tests. ++ The usual default; builds everything but doesn't run the ++ tests. + +check (or tests) -+ Runs the standard set of tests. ++ Runs the standard set of tests. + +test -+ Runs one test. Use like this: -+ make test t=wcsmbs/test-wcsnlen -+ Note that this will rebuild the test if needed, but will not -+ rebuild what "make all" would have rebuilt. ++ Runs one test. Use like this: ++ make test t=wcsmbs/test-wcsnlen ++ Note that this will rebuild the test if needed, but will not ++ rebuild what "make all" would have rebuilt. + +-- +Other useful hints: + +builddir$ rm testroot.pristine/install.stamp -+ Forces the testroot to be reinstalled the next time you run -+ the testsuite (or just rm -rf testroot.pristine) ++ Forces the testroot to be reinstalled the next time you run ++ the testsuite (or just rm -rf testroot.pristine) + --- -2.33.0 - diff --git a/glibc-Support-target-specific-ALIGN-for-variable-alignment-4.patch b/glibc-Support-target-specific-ALIGN-for-variable-alignment-4.patch deleted file mode 100644 index d7552e1..0000000 --- a/glibc-Support-target-specific-ALIGN-for-variable-alignment-4.patch +++ /dev/null @@ -1,171 +0,0 @@ -From 2e86602d21fcaa8353c529f2f6768125396da39f Mon Sep 17 00:00:00 2001 -From: "H.J. 
Lu" -Date: Wed, 19 Jul 2023 23:12:30 +0800 -Subject: [PATCH 5/6] Support target specific ALIGN for variable alignment test - [BZ #28676] - -Add to support target specific ALIGN for variable -alignment test: - -1. Alpha: Use 0x10000. -2. MicroBlaze and Nios II: Use 0x8000. -3. All others: Use 0x200000. - -Backport from master commit: 4435c29 - -Reviewed-by: Adhemerval Zanella -Signed-off-by: Rongwei Wang ---- - elf/tst-align3.c | 4 +--- - elf/tst-alignmod3.c | 4 +--- - sysdeps/alpha/tst-file-align.h | 20 ++++++++++++++++++++ - sysdeps/generic/tst-file-align.h | 20 ++++++++++++++++++++ - sysdeps/microblaze/tst-file-align.h | 20 ++++++++++++++++++++ - sysdeps/nios2/tst-file-align.h | 20 ++++++++++++++++++++ - 6 files changed, 82 insertions(+), 6 deletions(-) - create mode 100644 sysdeps/alpha/tst-file-align.h - create mode 100644 sysdeps/generic/tst-file-align.h - create mode 100644 sysdeps/microblaze/tst-file-align.h - create mode 100644 sysdeps/nios2/tst-file-align.h - -diff --git a/elf/tst-align3.c b/elf/tst-align3.c -index ac86d623..87a8ff81 100644 ---- a/elf/tst-align3.c -+++ b/elf/tst-align3.c -@@ -17,11 +17,9 @@ - . */ - - #include -+#include - #include - --/* This should cover all possible page sizes we currently support. */ --#define ALIGN 0x200000 -- - int bar __attribute__ ((aligned (ALIGN))) = 1; - - extern int do_load_test (void); -diff --git a/elf/tst-alignmod3.c b/elf/tst-alignmod3.c -index 0d33f237..9520c352 100644 ---- a/elf/tst-alignmod3.c -+++ b/elf/tst-alignmod3.c -@@ -17,11 +17,9 @@ - . */ - - #include -+#include - #include - --/* This should cover all possible page sizes we currently support. */ --#define ALIGN 0x200000 -- - int foo __attribute__ ((aligned (ALIGN))) = 1; - - void -diff --git a/sysdeps/alpha/tst-file-align.h b/sysdeps/alpha/tst-file-align.h -new file mode 100644 -index 00000000..8fc3c940 ---- /dev/null -+++ b/sysdeps/alpha/tst-file-align.h -@@ -0,0 +1,20 @@ -+/* Check file alignment. Alpha version. -+ Copyright (C) 2021 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* This should cover all possible alignments we currently support. */ -+#define ALIGN 0x10000 -diff --git a/sysdeps/generic/tst-file-align.h b/sysdeps/generic/tst-file-align.h -new file mode 100644 -index 00000000..6ee6783a ---- /dev/null -+++ b/sysdeps/generic/tst-file-align.h -@@ -0,0 +1,20 @@ -+/* Check file alignment. Generic version. -+ Copyright (C) 2021 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. 
-+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* This should cover all possible page sizes we currently support. */ -+#define ALIGN 0x200000 -diff --git a/sysdeps/microblaze/tst-file-align.h b/sysdeps/microblaze/tst-file-align.h -new file mode 100644 -index 00000000..43c58b29 ---- /dev/null -+++ b/sysdeps/microblaze/tst-file-align.h -@@ -0,0 +1,20 @@ -+/* Check file alignment. MicroBlaze version. -+ Copyright (C) 2021 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* This should cover all possible alignments we currently support. */ -+#define ALIGN 0x8000 -diff --git a/sysdeps/nios2/tst-file-align.h b/sysdeps/nios2/tst-file-align.h -new file mode 100644 -index 00000000..589a2d5a ---- /dev/null -+++ b/sysdeps/nios2/tst-file-align.h -@@ -0,0 +1,20 @@ -+/* Check file alignment. Nios II version. -+ Copyright (C) 2021 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* This should cover all possible alignments we currently support. 
*/ -+#define ALIGN 0x8000 --- -2.27.0 - diff --git a/glibc-Sync-to-lnd-35-for-LoongArch.patch b/glibc-Sync-to-lnd-35-for-LoongArch.patch deleted file mode 100644 index e1e1615..0000000 --- a/glibc-Sync-to-lnd-35-for-LoongArch.patch +++ /dev/null @@ -1,26506 +0,0 @@ -From 4bcb0bf4f727666ba875302baf52d60f65bd7cb1 Mon Sep 17 00:00:00 2001 -From: Lixing -Date: Wed, 19 Jul 2023 11:59:19 +0800 -Subject: [PATCH] glibc Sync to vec.35 for LoongArch - -dl-machine.h: scope used - #define PLTREL ElfW(Rela) -dl-tunables.list added -ld.abilist changed -localplt.data changed ---- - elf/dl-reloc.c | 13 +- - elf/elf.h | 85 +- - scripts/config.guess | 3 + - scripts/config.sub | 7 +- - sysdeps/loongarch/Implies | 5 + - sysdeps/loongarch/Makefile | 36 + - sysdeps/loongarch/Versions | 5 + - sysdeps/loongarch/__longjmp.S | 50 + - sysdeps/loongarch/abort-instr.h | 2 + - sysdeps/loongarch/at_quick_exit.c | 1 + - sysdeps/loongarch/atexit.c | 1 + - sysdeps/loongarch/bits/endian.h | 9 + - sysdeps/loongarch/bits/fenv.h | 93 + - sysdeps/loongarch/bits/link.h | 56 + - sysdeps/loongarch/bits/setjmp.h | 39 + - sysdeps/loongarch/bits/wordsize.h | 22 + - sysdeps/loongarch/bsd-_setjmp.c | 1 + - sysdeps/loongarch/bsd-setjmp.c | 1 + - sysdeps/loongarch/configure | 4 + - sysdeps/loongarch/configure.ac | 6 + - sysdeps/loongarch/cpu-tunables.c | 94 + - sysdeps/loongarch/dl-get-cpu-features.c | 25 + - sysdeps/loongarch/dl-irel.h | 51 + - sysdeps/loongarch/dl-machine.h | 410 +++ - sysdeps/loongarch/dl-tls.h | 49 + - sysdeps/loongarch/dl-trampoline.S | 31 + - sysdeps/loongarch/dl-trampoline.h | 153 ++ - sysdeps/loongarch/dl-tunables.list | 25 + - sysdeps/loongarch/e_sqrtl.c | 39 + - sysdeps/loongarch/elf-init.c | 1 + - sysdeps/loongarch/fenv_private.h | 328 +++ - sysdeps/loongarch/fpu/e_ilogb.c | 39 + - sysdeps/loongarch/fpu/e_ilogbf.c | 39 + - sysdeps/loongarch/fpu/e_sqrt.c | 29 + - sysdeps/loongarch/fpu/e_sqrtf.c | 28 + - sysdeps/loongarch/fpu/fclrexcpt.c | 47 + - sysdeps/loongarch/fpu/fedisblxcpt.c | 40 + - sysdeps/loongarch/fpu/feenablxcpt.c | 40 + - sysdeps/loongarch/fpu/fegetenv.c | 33 + - sysdeps/loongarch/fpu/fegetexcept.c | 33 + - sysdeps/loongarch/fpu/fegetmode.c | 27 + - sysdeps/loongarch/fpu/fegetround.c | 35 + - sysdeps/loongarch/fpu/feholdexcpt.c | 41 + - sysdeps/loongarch/fpu/fenv_libc.h | 31 + - sysdeps/loongarch/fpu/fesetenv.c | 44 + - sysdeps/loongarch/fpu/fesetexcept.c | 32 + - sysdeps/loongarch/fpu/fesetmode.c | 38 + - sysdeps/loongarch/fpu/fesetround.c | 46 + - sysdeps/loongarch/fpu/feupdateenv.c | 45 + - sysdeps/loongarch/fpu/fgetexcptflg.c | 39 + - sysdeps/loongarch/fpu/fraiseexcpt.c | 84 + - sysdeps/loongarch/fpu/fsetexcptflg.c | 42 + - sysdeps/loongarch/fpu/ftestexcept.c | 33 + - sysdeps/loongarch/fpu/s_copysign.c | 30 + - sysdeps/loongarch/fpu/s_copysignf.c | 30 + - sysdeps/loongarch/fpu/s_finite.c | 30 + - sysdeps/loongarch/fpu/s_finitef.c | 30 + - sysdeps/loongarch/fpu/s_fmax.c | 30 + - sysdeps/loongarch/fpu/s_fmaxf.c | 30 + - sysdeps/loongarch/fpu/s_fmaxmag.c | 29 + - sysdeps/loongarch/fpu/s_fmaxmagf.c | 29 + - sysdeps/loongarch/fpu/s_fmin.c | 30 + - sysdeps/loongarch/fpu/s_fminf.c | 30 + - sysdeps/loongarch/fpu/s_fminmag.c | 29 + - sysdeps/loongarch/fpu/s_fminmagf.c | 29 + - sysdeps/loongarch/fpu/s_fpclassify.c | 38 + - sysdeps/loongarch/fpu/s_fpclassifyf.c | 38 + - sysdeps/loongarch/fpu/s_isinf.c | 30 + - sysdeps/loongarch/fpu/s_isinff.c | 30 + - sysdeps/loongarch/fpu/s_isnan.c | 31 + - sysdeps/loongarch/fpu/s_isnanf.c | 31 + - sysdeps/loongarch/fpu/s_issignaling.c | 29 + - 
sysdeps/loongarch/fpu/s_issignalingf.c | 29 + - sysdeps/loongarch/fpu/s_llrint.c | 31 + - sysdeps/loongarch/fpu/s_llrintf.c | 31 + - sysdeps/loongarch/fpu/s_logb.c | 30 + - sysdeps/loongarch/fpu/s_logbf.c | 30 + - sysdeps/loongarch/fpu/s_lrint.c | 31 + - sysdeps/loongarch/fpu/s_lrintf.c | 31 + - sysdeps/loongarch/fpu/s_rint.c | 29 + - sysdeps/loongarch/fpu/s_rintf.c | 29 + - sysdeps/loongarch/fpu/s_scalbn.c | 29 + - sysdeps/loongarch/fpu/s_scalbnf.c | 29 + - sysdeps/loongarch/fpu_control.h | 128 + - sysdeps/loongarch/fstat.c | 1 + - sysdeps/loongarch/fstat64.c | 1 + - sysdeps/loongarch/fstatat.c | 1 + - sysdeps/loongarch/fstatat64.c | 1 + - sysdeps/loongarch/gccframe.h | 21 + - sysdeps/loongarch/hp-timing.h | 40 + - sysdeps/loongarch/init-arch.h | 24 + - sysdeps/loongarch/jmpbuf-offsets.h | 23 + - sysdeps/loongarch/jmpbuf-unwind.h | 46 + - sysdeps/loongarch/ldsodefs.h | 48 + - sysdeps/loongarch/libc-start.h | 25 + - sysdeps/loongarch/libc-tls.c | 32 + - sysdeps/loongarch/linkmap.h | 4 + - sysdeps/loongarch/lp64/Implies-after | 1 + - sysdeps/loongarch/lp64/libm-test-ulps | 2206 +++++++++++++++++ - sysdeps/loongarch/lp64/libm-test-ulps-name | 1 + - sysdeps/loongarch/lp64/memchr.S | 99 + - sysdeps/loongarch/lp64/memcmp.S | 281 +++ - sysdeps/loongarch/lp64/memcpy.S | 818 ++++++ - sysdeps/loongarch/lp64/memmove.S | 2 + - sysdeps/loongarch/lp64/memset.S | 173 ++ - sysdeps/loongarch/lp64/multiarch/Makefile | 18 + - .../lp64/multiarch/ifunc-impl-list.c | 142 ++ - sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h | 40 + - sysdeps/loongarch/lp64/multiarch/ifunc-lsx.h | 37 + - .../loongarch/lp64/multiarch/ifunc-memchr.h | 37 + - .../loongarch/lp64/multiarch/ifunc-memrchr.h | 37 + - .../loongarch/lp64/multiarch/ifunc-stpcpy.h | 34 + - .../loongarch/lp64/multiarch/memchr-aligned.S | 7 + - .../loongarch/lp64/multiarch/memchr-lasx.S | 108 + - sysdeps/loongarch/lp64/multiarch/memchr-lsx.S | 93 + - sysdeps/loongarch/lp64/multiarch/memchr.c | 39 + - .../loongarch/lp64/multiarch/memcmp-aligned.S | 11 + - .../loongarch/lp64/multiarch/memcmp-lasx.S | 199 ++ - sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S | 255 ++ - sysdeps/loongarch/lp64/multiarch/memcmp.c | 41 + - .../loongarch/lp64/multiarch/memcpy-aligned.S | 11 + - .../loongarch/lp64/multiarch/memcpy-lasx.S | 1 + - sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S | 1 + - .../lp64/multiarch/memcpy-unaligned.S | 259 ++ - sysdeps/loongarch/lp64/multiarch/memcpy.c | 39 + - .../lp64/multiarch/memmove-aligned.S | 1 + - .../loongarch/lp64/multiarch/memmove-lasx.S | 279 +++ - .../loongarch/lp64/multiarch/memmove-lsx.S | 524 ++++ - .../lp64/multiarch/memmove-unaligned.S | 478 ++++ - sysdeps/loongarch/lp64/multiarch/memmove.c | 39 + - .../lp64/multiarch/memrchr-generic.c | 9 + - .../loongarch/lp64/multiarch/memrchr-lasx.S | 114 + - .../loongarch/lp64/multiarch/memrchr-lsx.S | 96 + - sysdeps/loongarch/lp64/multiarch/memrchr.c | 39 + - .../loongarch/lp64/multiarch/memset-aligned.S | 9 + - .../loongarch/lp64/multiarch/memset-lasx.S | 132 + - sysdeps/loongarch/lp64/multiarch/memset-lsx.S | 125 + - .../lp64/multiarch/memset-unaligned.S | 177 ++ - sysdeps/loongarch/lp64/multiarch/memset.c | 39 + - .../lp64/multiarch/rawmemchr-aligned.S | 7 + - .../loongarch/lp64/multiarch/rawmemchr-lasx.S | 51 + - .../loongarch/lp64/multiarch/rawmemchr-lsx.S | 56 + - sysdeps/loongarch/lp64/multiarch/rawmemchr.c | 37 + - .../loongarch/lp64/multiarch/stpcpy-aligned.S | 8 + - sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S | 178 ++ - sysdeps/loongarch/lp64/multiarch/stpcpy.c | 43 + - 
.../loongarch/lp64/multiarch/strchr-aligned.S | 10 + - .../loongarch/lp64/multiarch/strchr-lasx.S | 81 + - sysdeps/loongarch/lp64/multiarch/strchr-lsx.S | 61 + - .../lp64/multiarch/strchr-unaligned.S | 132 + - sysdeps/loongarch/lp64/multiarch/strchr.c | 39 + - .../lp64/multiarch/strchrnul-aligned.S | 8 + - .../loongarch/lp64/multiarch/strchrnul-lasx.S | 4 + - .../loongarch/lp64/multiarch/strchrnul-lsx.S | 3 + - .../lp64/multiarch/strchrnul-unaligned.S | 146 ++ - sysdeps/loongarch/lp64/multiarch/strchrnul.c | 34 + - .../loongarch/lp64/multiarch/strcmp-aligned.S | 8 + - sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 147 ++ - .../lp64/multiarch/strcmp-unaligned.S | 191 ++ - sysdeps/loongarch/lp64/multiarch/strcmp.c | 35 + - .../loongarch/lp64/multiarch/strcpy-aligned.S | 8 + - sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 174 ++ - .../lp64/multiarch/strcpy-unaligned.S | 199 ++ - sysdeps/loongarch/lp64/multiarch/strcpy.c | 36 + - .../loongarch/lp64/multiarch/strlen-aligned.S | 8 + - .../loongarch/lp64/multiarch/strlen-lasx.S | 55 + - sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 63 + - .../lp64/multiarch/strlen-unaligned.S | 116 + - sysdeps/loongarch/lp64/multiarch/strlen.c | 39 + - .../lp64/multiarch/strncmp-aligned.S | 8 + - .../loongarch/lp64/multiarch/strncmp-lsx.S | 197 ++ - .../lp64/multiarch/strncmp-unaligned.S | 257 ++ - sysdeps/loongarch/lp64/multiarch/strncmp.c | 35 + - .../lp64/multiarch/strnlen-aligned.S | 8 + - .../loongarch/lp64/multiarch/strnlen-lasx.S | 92 + - .../loongarch/lp64/multiarch/strnlen-lsx.S | 81 + - .../lp64/multiarch/strnlen-unaligned.S | 145 ++ - sysdeps/loongarch/lp64/multiarch/strnlen.c | 40 + - .../lp64/multiarch/strrchr-aligned.S | 12 + - .../loongarch/lp64/multiarch/strrchr-lasx.S | 113 + - .../loongarch/lp64/multiarch/strrchr-lsx.S | 93 + - sysdeps/loongarch/lp64/multiarch/strrchr.c | 39 + - sysdeps/loongarch/lp64/rawmemchr.S | 114 + - sysdeps/loongarch/lp64/s_cosf.S | 409 +++ - sysdeps/loongarch/lp64/s_sinf.S | 392 +++ - sysdeps/loongarch/lp64/stpcpy.S | 180 ++ - sysdeps/loongarch/lp64/strchr.S | 90 + - sysdeps/loongarch/lp64/strchrnul.S | 95 + - sysdeps/loongarch/lp64/strcmp.S | 228 ++ - sysdeps/loongarch/lp64/strcpy.S | 174 ++ - sysdeps/loongarch/lp64/strlen.S | 86 + - sysdeps/loongarch/lp64/strncmp.S | 257 ++ - sysdeps/loongarch/lp64/strnlen.S | 83 + - sysdeps/loongarch/lp64/strrchr.S | 106 + - sysdeps/loongarch/lstat.c | 1 + - sysdeps/loongarch/lstat64.c | 1 + - sysdeps/loongarch/machine-gmon.h | 37 + - sysdeps/loongarch/math_private.h | 245 ++ - sysdeps/loongarch/memusage.h | 21 + - sysdeps/loongarch/mknod.c | 1 + - sysdeps/loongarch/mknodat.c | 1 + - sysdeps/loongarch/nptl/Makefile | 26 + - .../loongarch/nptl/bits/pthreadtypes-arch.h | 68 + - sysdeps/loongarch/nptl/bits/semaphore.h | 33 + - sysdeps/loongarch/nptl/libc-lowlevellock.c | 8 + - sysdeps/loongarch/nptl/nptl-sysdep.S | 2 + - sysdeps/loongarch/nptl/pthread-offsets.h | 23 + - sysdeps/loongarch/nptl/pthreaddef.h | 32 + - sysdeps/loongarch/nptl/tcb-offsets.sym | 6 + - sysdeps/loongarch/nptl/tls.h | 147 ++ - sysdeps/loongarch/preconfigure | 9 + - sysdeps/loongarch/pthread_atfork.c | 1 + - sysdeps/loongarch/setjmp.S | 62 + - sysdeps/loongarch/sfp-machine.h | 79 + - sysdeps/loongarch/sotruss-lib.c | 51 + - sysdeps/loongarch/stack_chk_fail_local.c | 1 + - sysdeps/loongarch/stackinfo.h | 33 + - sysdeps/loongarch/start.S | 51 + - sysdeps/loongarch/stat.c | 1 + - sysdeps/loongarch/stat64.c | 1 + - sysdeps/loongarch/sys/asm.h | 58 + - sysdeps/loongarch/sys/regdef.h | 83 + - 
sysdeps/loongarch/tininess.h | 1 + - sysdeps/loongarch/tls-macros.h | 46 + - sysdeps/loongarch/tst-audit.h | 23 + - sysdeps/loongarch/warning-nop.c | 1 + - sysdeps/unix/sysv/linux/loongarch/Implies | 1 + - sysdeps/unix/sysv/linux/loongarch/Makefile | 17 + - sysdeps/unix/sysv/linux/loongarch/Versions | 44 + - .../sysv/linux/loongarch/atomic-machine.h | 188 ++ - .../unix/sysv/linux/loongarch/bits/fcntl.h | 62 + - .../unix/sysv/linux/loongarch/bits/hwcap.h | 37 + - .../sysv/linux/loongarch/bits/local_lim.h | 99 + - sysdeps/unix/sysv/linux/loongarch/bits/mman.h | 41 + - sysdeps/unix/sysv/linux/loongarch/bits/shm.h | 112 + - .../sysv/linux/loongarch/bits/sigcontext.h | 47 + - .../unix/sysv/linux/loongarch/bits/signum.h | 58 + - sysdeps/unix/sysv/linux/loongarch/clone.S | 98 + - sysdeps/unix/sysv/linux/loongarch/configure | 199 ++ - .../unix/sysv/linux/loongarch/configure.ac | 27 + - .../unix/sysv/linux/loongarch/cpu-features.c | 32 + - .../unix/sysv/linux/loongarch/cpu-features.h | 53 + - .../unix/sysv/linux/loongarch/dl-procinfo.c | 60 + - sysdeps/unix/sysv/linux/loongarch/dl-static.c | 84 + - sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c | 21 + - .../sysv/linux/loongarch/dl-tunables.list | 27 + - .../unix/sysv/linux/loongarch/getcontext.S | 72 + - sysdeps/unix/sysv/linux/loongarch/getpid.c | 54 + - .../unix/sysv/linux/loongarch/gettimeofday.c | 58 + - sysdeps/unix/sysv/linux/loongarch/getuid.c | 60 + - .../unix/sysv/linux/loongarch/init-first.c | 57 + - sysdeps/unix/sysv/linux/loongarch/ipc_priv.h | 21 + - .../sysv/linux/loongarch/kernel-features.h | 24 + - .../unix/sysv/linux/loongarch/ldd-rewrite.sed | 1 + - sysdeps/unix/sysv/linux/loongarch/ldsodefs.h | 32 + - .../unix/sysv/linux/loongarch/libc-start.c | 28 + - sysdeps/unix/sysv/linux/loongarch/libc-vdso.h | 37 + - .../unix/sysv/linux/loongarch/localplt.data | 13 + - .../unix/sysv/linux/loongarch/lp64/Implies | 3 + - .../sysv/linux/loongarch/lp64/c++-types.data | 67 + - .../linux/loongarch/lp64/jmp_buf-macros.h | 41 + - .../unix/sysv/linux/loongarch/lp64/ld.abilist | 5 + - .../loongarch/lp64/libBrokenLocale.abilist | 1 + - .../sysv/linux/loongarch/lp64/libanl.abilist | 4 + - .../sysv/linux/loongarch/lp64/libc.abilist | 2101 ++++++++++++++++ - .../linux/loongarch/lp64/libcrypt.abilist | 7 + - .../sysv/linux/loongarch/lp64/libdl.abilist | 9 + - .../sysv/linux/loongarch/lp64/libm.abilist | 1021 ++++++++ - .../sysv/linux/loongarch/lp64/libnsl.abilist | 120 + - .../linux/loongarch/lp64/libpthread.abilist | 264 ++ - .../linux/loongarch/lp64/libresolv.abilist | 79 + - .../sysv/linux/loongarch/lp64/librt.abilist | 35 + - .../linux/loongarch/lp64/libthread_db.abilist | 40 + - .../sysv/linux/loongarch/lp64/libutil.abilist | 6 + - .../unix/sysv/linux/loongarch/makecontext.c | 78 + - .../sysv/linux/loongarch/profil-counter.h | 31 + - sysdeps/unix/sysv/linux/loongarch/pt-vfork.S | 1 + - .../unix/sysv/linux/loongarch/register-dump.h | 63 + - .../unix/sysv/linux/loongarch/setcontext.S | 111 + - .../unix/sysv/linux/loongarch/shlib-versions | 2 + - .../sysv/linux/loongarch/sigcontextinfo.h | 22 + - .../unix/sysv/linux/loongarch/swapcontext.S | 120 + - .../unix/sysv/linux/loongarch/sys/procfs.h | 122 + - .../unix/sysv/linux/loongarch/sys/ucontext.h | 81 + - sysdeps/unix/sysv/linux/loongarch/sys/user.h | 31 + - sysdeps/unix/sysv/linux/loongarch/syscall.c | 36 + - sysdeps/unix/sysv/linux/loongarch/sysdep.S | 52 + - sysdeps/unix/sysv/linux/loongarch/sysdep.h | 333 +++ - .../sysv/linux/loongarch/ucontext-macros.h | 44 + - 
.../unix/sysv/linux/loongarch/ucontext_i.sym | 33 + - sysdeps/unix/sysv/linux/loongarch/vfork.S | 49 + - 291 files changed, 24100 insertions(+), 8 deletions(-) - create mode 100644 sysdeps/loongarch/Implies - create mode 100644 sysdeps/loongarch/Makefile - create mode 100644 sysdeps/loongarch/Versions - create mode 100644 sysdeps/loongarch/__longjmp.S - create mode 100644 sysdeps/loongarch/abort-instr.h - create mode 100644 sysdeps/loongarch/at_quick_exit.c - create mode 100644 sysdeps/loongarch/atexit.c - create mode 100644 sysdeps/loongarch/bits/endian.h - create mode 100644 sysdeps/loongarch/bits/fenv.h - create mode 100644 sysdeps/loongarch/bits/link.h - create mode 100644 sysdeps/loongarch/bits/setjmp.h - create mode 100644 sysdeps/loongarch/bits/wordsize.h - create mode 100644 sysdeps/loongarch/bsd-_setjmp.c - create mode 100644 sysdeps/loongarch/bsd-setjmp.c - create mode 100755 sysdeps/loongarch/configure - create mode 100644 sysdeps/loongarch/configure.ac - create mode 100644 sysdeps/loongarch/cpu-tunables.c - create mode 100644 sysdeps/loongarch/dl-get-cpu-features.c - create mode 100644 sysdeps/loongarch/dl-irel.h - create mode 100644 sysdeps/loongarch/dl-machine.h - create mode 100644 sysdeps/loongarch/dl-tls.h - create mode 100644 sysdeps/loongarch/dl-trampoline.S - create mode 100644 sysdeps/loongarch/dl-trampoline.h - create mode 100644 sysdeps/loongarch/dl-tunables.list - create mode 100644 sysdeps/loongarch/e_sqrtl.c - create mode 100644 sysdeps/loongarch/elf-init.c - create mode 100644 sysdeps/loongarch/fenv_private.h - create mode 100644 sysdeps/loongarch/fpu/e_ilogb.c - create mode 100644 sysdeps/loongarch/fpu/e_ilogbf.c - create mode 100644 sysdeps/loongarch/fpu/e_sqrt.c - create mode 100644 sysdeps/loongarch/fpu/e_sqrtf.c - create mode 100644 sysdeps/loongarch/fpu/fclrexcpt.c - create mode 100644 sysdeps/loongarch/fpu/fedisblxcpt.c - create mode 100644 sysdeps/loongarch/fpu/feenablxcpt.c - create mode 100644 sysdeps/loongarch/fpu/fegetenv.c - create mode 100644 sysdeps/loongarch/fpu/fegetexcept.c - create mode 100644 sysdeps/loongarch/fpu/fegetmode.c - create mode 100644 sysdeps/loongarch/fpu/fegetround.c - create mode 100644 sysdeps/loongarch/fpu/feholdexcpt.c - create mode 100644 sysdeps/loongarch/fpu/fenv_libc.h - create mode 100644 sysdeps/loongarch/fpu/fesetenv.c - create mode 100644 sysdeps/loongarch/fpu/fesetexcept.c - create mode 100644 sysdeps/loongarch/fpu/fesetmode.c - create mode 100644 sysdeps/loongarch/fpu/fesetround.c - create mode 100644 sysdeps/loongarch/fpu/feupdateenv.c - create mode 100644 sysdeps/loongarch/fpu/fgetexcptflg.c - create mode 100644 sysdeps/loongarch/fpu/fraiseexcpt.c - create mode 100644 sysdeps/loongarch/fpu/fsetexcptflg.c - create mode 100644 sysdeps/loongarch/fpu/ftestexcept.c - create mode 100644 sysdeps/loongarch/fpu/s_copysign.c - create mode 100644 sysdeps/loongarch/fpu/s_copysignf.c - create mode 100644 sysdeps/loongarch/fpu/s_finite.c - create mode 100644 sysdeps/loongarch/fpu/s_finitef.c - create mode 100644 sysdeps/loongarch/fpu/s_fmax.c - create mode 100644 sysdeps/loongarch/fpu/s_fmaxf.c - create mode 100644 sysdeps/loongarch/fpu/s_fmaxmag.c - create mode 100644 sysdeps/loongarch/fpu/s_fmaxmagf.c - create mode 100644 sysdeps/loongarch/fpu/s_fmin.c - create mode 100644 sysdeps/loongarch/fpu/s_fminf.c - create mode 100644 sysdeps/loongarch/fpu/s_fminmag.c - create mode 100644 sysdeps/loongarch/fpu/s_fminmagf.c - create mode 100644 sysdeps/loongarch/fpu/s_fpclassify.c - create mode 100644 
sysdeps/loongarch/fpu/s_fpclassifyf.c - create mode 100644 sysdeps/loongarch/fpu/s_isinf.c - create mode 100644 sysdeps/loongarch/fpu/s_isinff.c - create mode 100644 sysdeps/loongarch/fpu/s_isnan.c - create mode 100644 sysdeps/loongarch/fpu/s_isnanf.c - create mode 100644 sysdeps/loongarch/fpu/s_issignaling.c - create mode 100644 sysdeps/loongarch/fpu/s_issignalingf.c - create mode 100644 sysdeps/loongarch/fpu/s_llrint.c - create mode 100644 sysdeps/loongarch/fpu/s_llrintf.c - create mode 100644 sysdeps/loongarch/fpu/s_logb.c - create mode 100644 sysdeps/loongarch/fpu/s_logbf.c - create mode 100644 sysdeps/loongarch/fpu/s_lrint.c - create mode 100644 sysdeps/loongarch/fpu/s_lrintf.c - create mode 100644 sysdeps/loongarch/fpu/s_rint.c - create mode 100644 sysdeps/loongarch/fpu/s_rintf.c - create mode 100644 sysdeps/loongarch/fpu/s_scalbn.c - create mode 100644 sysdeps/loongarch/fpu/s_scalbnf.c - create mode 100644 sysdeps/loongarch/fpu_control.h - create mode 100644 sysdeps/loongarch/fstat.c - create mode 100644 sysdeps/loongarch/fstat64.c - create mode 100644 sysdeps/loongarch/fstatat.c - create mode 100644 sysdeps/loongarch/fstatat64.c - create mode 100644 sysdeps/loongarch/gccframe.h - create mode 100644 sysdeps/loongarch/hp-timing.h - create mode 100644 sysdeps/loongarch/init-arch.h - create mode 100644 sysdeps/loongarch/jmpbuf-offsets.h - create mode 100644 sysdeps/loongarch/jmpbuf-unwind.h - create mode 100644 sysdeps/loongarch/ldsodefs.h - create mode 100644 sysdeps/loongarch/libc-start.h - create mode 100644 sysdeps/loongarch/libc-tls.c - create mode 100644 sysdeps/loongarch/linkmap.h - create mode 100644 sysdeps/loongarch/lp64/Implies-after - create mode 100644 sysdeps/loongarch/lp64/libm-test-ulps - create mode 100644 sysdeps/loongarch/lp64/libm-test-ulps-name - create mode 100644 sysdeps/loongarch/lp64/memchr.S - create mode 100644 sysdeps/loongarch/lp64/memcmp.S - create mode 100644 sysdeps/loongarch/lp64/memcpy.S - create mode 100644 sysdeps/loongarch/lp64/memmove.S - create mode 100644 sysdeps/loongarch/lp64/memset.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/Makefile - create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h - create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-lsx.h - create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h - create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h - create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-stpcpy.h - create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/memmove-aligned.S - create mode 100644 
sysdeps/loongarch/lp64/multiarch/memmove-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memmove-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memmove.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr-generic.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/memset.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S - create mode 100644 
sysdeps/loongarch/lp64/multiarch/strnlen.c - create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S - create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr.c - create mode 100644 sysdeps/loongarch/lp64/rawmemchr.S - create mode 100644 sysdeps/loongarch/lp64/s_cosf.S - create mode 100644 sysdeps/loongarch/lp64/s_sinf.S - create mode 100644 sysdeps/loongarch/lp64/stpcpy.S - create mode 100644 sysdeps/loongarch/lp64/strchr.S - create mode 100644 sysdeps/loongarch/lp64/strchrnul.S - create mode 100644 sysdeps/loongarch/lp64/strcmp.S - create mode 100644 sysdeps/loongarch/lp64/strcpy.S - create mode 100644 sysdeps/loongarch/lp64/strlen.S - create mode 100644 sysdeps/loongarch/lp64/strncmp.S - create mode 100644 sysdeps/loongarch/lp64/strnlen.S - create mode 100644 sysdeps/loongarch/lp64/strrchr.S - create mode 100644 sysdeps/loongarch/lstat.c - create mode 100644 sysdeps/loongarch/lstat64.c - create mode 100644 sysdeps/loongarch/machine-gmon.h - create mode 100644 sysdeps/loongarch/math_private.h - create mode 100644 sysdeps/loongarch/memusage.h - create mode 100644 sysdeps/loongarch/mknod.c - create mode 100644 sysdeps/loongarch/mknodat.c - create mode 100644 sysdeps/loongarch/nptl/Makefile - create mode 100644 sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h - create mode 100644 sysdeps/loongarch/nptl/bits/semaphore.h - create mode 100644 sysdeps/loongarch/nptl/libc-lowlevellock.c - create mode 100644 sysdeps/loongarch/nptl/nptl-sysdep.S - create mode 100644 sysdeps/loongarch/nptl/pthread-offsets.h - create mode 100644 sysdeps/loongarch/nptl/pthreaddef.h - create mode 100644 sysdeps/loongarch/nptl/tcb-offsets.sym - create mode 100644 sysdeps/loongarch/nptl/tls.h - create mode 100644 sysdeps/loongarch/preconfigure - create mode 100644 sysdeps/loongarch/pthread_atfork.c - create mode 100644 sysdeps/loongarch/setjmp.S - create mode 100644 sysdeps/loongarch/sfp-machine.h - create mode 100644 sysdeps/loongarch/sotruss-lib.c - create mode 100644 sysdeps/loongarch/stack_chk_fail_local.c - create mode 100644 sysdeps/loongarch/stackinfo.h - create mode 100644 sysdeps/loongarch/start.S - create mode 100644 sysdeps/loongarch/stat.c - create mode 100644 sysdeps/loongarch/stat64.c - create mode 100644 sysdeps/loongarch/sys/asm.h - create mode 100644 sysdeps/loongarch/sys/regdef.h - create mode 100644 sysdeps/loongarch/tininess.h - create mode 100644 sysdeps/loongarch/tls-macros.h - create mode 100644 sysdeps/loongarch/tst-audit.h - create mode 100644 sysdeps/loongarch/warning-nop.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/Implies - create mode 100644 sysdeps/unix/sysv/linux/loongarch/Makefile - create mode 100644 sysdeps/unix/sysv/linux/loongarch/Versions - create mode 100644 sysdeps/unix/sysv/linux/loongarch/atomic-machine.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/bits/fcntl.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/bits/local_lim.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/bits/mman.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/bits/shm.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/bits/sigcontext.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/bits/signum.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/clone.S - create mode 100644 sysdeps/unix/sysv/linux/loongarch/configure - create 
mode 100644 sysdeps/unix/sysv/linux/loongarch/configure.ac - create mode 100644 sysdeps/unix/sysv/linux/loongarch/cpu-features.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/cpu-features.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-static.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-tunables.list - create mode 100644 sysdeps/unix/sysv/linux/loongarch/getcontext.S - create mode 100644 sysdeps/unix/sysv/linux/loongarch/getpid.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/gettimeofday.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/getuid.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/init-first.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/ipc_priv.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/kernel-features.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/ldd-rewrite.sed - create mode 100644 sysdeps/unix/sysv/linux/loongarch/ldsodefs.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/libc-start.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/libc-vdso.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/localplt.data - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/Implies - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/c++-types.data - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/jmp_buf-macros.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/ld.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libBrokenLocale.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libanl.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libc.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libcrypt.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libdl.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libm.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libnsl.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libpthread.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libresolv.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/librt.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libthread_db.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/lp64/libutil.abilist - create mode 100644 sysdeps/unix/sysv/linux/loongarch/makecontext.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/profil-counter.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/pt-vfork.S - create mode 100644 sysdeps/unix/sysv/linux/loongarch/register-dump.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/setcontext.S - create mode 100644 sysdeps/unix/sysv/linux/loongarch/shlib-versions - create mode 100644 sysdeps/unix/sysv/linux/loongarch/sigcontextinfo.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/swapcontext.S - create mode 100644 sysdeps/unix/sysv/linux/loongarch/sys/procfs.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/sys/ucontext.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/sys/user.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/syscall.c - create mode 100644 sysdeps/unix/sysv/linux/loongarch/sysdep.S - create mode 100644 sysdeps/unix/sysv/linux/loongarch/sysdep.h - create mode 100644 sysdeps/unix/sysv/linux/loongarch/ucontext-macros.h - create mode 100644 
sysdeps/unix/sysv/linux/loongarch/ucontext_i.sym - create mode 100644 sysdeps/unix/sysv/linux/loongarch/vfork.S - -diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c -index 7a84b1fa..47342c76 100644 ---- a/elf/dl-reloc.c -+++ b/elf/dl-reloc.c -@@ -235,12 +235,6 @@ _dl_relocate_object (struct link_map *l, struct r_scope_elem *scope[], - newp->start = PTR_ALIGN_DOWN (ph->p_vaddr, GLRO(dl_pagesize)) - + (caddr_t) l->l_addr; - -- if (__mprotect (newp->start, newp->len, PROT_READ|PROT_WRITE) < 0) -- { -- errstring = N_("cannot make segment writable for relocation"); -- call_error: -- _dl_signal_error (errno, l->l_name, NULL, errstring); -- } - - #if (PF_R | PF_W | PF_X) == 7 && (PROT_READ | PROT_WRITE | PROT_EXEC) == 7 - newp->prot = (PF_TO_PROT -@@ -254,6 +248,13 @@ _dl_relocate_object (struct link_map *l, struct r_scope_elem *scope[], - if (ph->p_flags & PF_X) - newp->prot |= PROT_EXEC; - #endif -+ if (__mprotect (newp->start, newp->len, PROT_READ|PROT_WRITE) < 0) -+ { -+ errstring = N_("cannot make segment writable for relocation"); -+ call_error: -+ _dl_signal_error (errno, l->l_name, NULL, errstring); -+ } -+ - newp->next = textrels; - textrels = newp; - } -diff --git a/elf/elf.h b/elf/elf.h -index ec09040b..65d1fb46 100644 ---- a/elf/elf.h -+++ b/elf/elf.h -@@ -360,8 +360,9 @@ typedef struct - #define EM_RISCV 243 /* RISC-V */ - - #define EM_BPF 247 /* Linux BPF -- in-kernel virtual machine */ -+#define EM_LOONGARCH 258 /* Loongson Loongarch */ - --#define EM_NUM 248 -+#define EM_NUM 259 - - /* Old spellings/synonyms. */ - -@@ -3932,6 +3933,88 @@ enum - #define R_NDS32_TLS_TPOFF 102 - #define R_NDS32_TLS_DESC 119 - -+/* LoongISA ELF Flags */ -+#define EF_LARCH_ABI 0x0003 -+#define EF_LARCH_ABI_LP64 0x0003 -+#define EF_LARCH_ABI_LPX32 0x0002 -+#define EF_LARCH_ABI_LP32 0x0001 -+ -+/* Loongarch specific dynamic relocations. */ -+#define R_LARCH_NONE 0 -+#define R_LARCH_32 1 -+#define R_LARCH_64 2 -+#define R_LARCH_RELATIVE 3 -+#define R_LARCH_COPY 4 -+#define R_LARCH_JUMP_SLOT 5 -+#define R_LARCH_TLS_DTPMOD32 6 -+#define R_LARCH_TLS_DTPMOD64 7 -+#define R_LARCH_TLS_DTPREL32 8 -+#define R_LARCH_TLS_DTPREL64 9 -+#define R_LARCH_TLS_TPREL32 10 -+#define R_LARCH_TLS_TPREL64 11 -+#define R_LARCH_IRELATIVE 12 -+ -+/* Reserved for future relocs that the dynamic linker must understand. 
*/
-+
-+/* used by the static linker for relocating .text */
-+#define R_LARCH_MARK_LA 20
-+#define R_LARCH_MARK_PCREL 21
-+
-+/* This relocation type pushes the PC-relative offset of the symbol from the
-+   relocation site. It is against a symbol: if it were a constant the result
-+   could be computed in the no-pic case, but the offset of the relocation site
-+   from that constant is almost certainly too large to fit, and in the pic case
-+   the offset cannot be determined at static link time. So we stipulate that
-+   this relocation is never against a constant. */
-+#define R_LARCH_SOP_PUSH_PCREL 22
-+
-+/* This relocation is against a symbol or a constant. It pushes the run-time
-+   absolute address of the symbol, or the constant, so it is an error in the
-+   pic case. I am not sure how constants relate to the ABS section. */
-+#define R_LARCH_SOP_PUSH_ABSOLUTE 23
-+#define R_LARCH_SOP_PUSH_DUP 24
-+#define R_LARCH_SOP_PUSH_GPREL 25
-+#define R_LARCH_SOP_PUSH_TLS_TPREL 26
-+#define R_LARCH_SOP_PUSH_TLS_GOT 27
-+#define R_LARCH_SOP_PUSH_TLS_GD 28
-+#define R_LARCH_SOP_PUSH_PLT_PCREL 29
-+
-+#define R_LARCH_SOP_ASSERT 30
-+#define R_LARCH_SOP_NOT 31
-+#define R_LARCH_SOP_SUB 32
-+#define R_LARCH_SOP_SL 33
-+#define R_LARCH_SOP_SR 34
-+#define R_LARCH_SOP_ADD 35
-+#define R_LARCH_SOP_AND 36
-+#define R_LARCH_SOP_IF_ELSE 37
-+#define R_LARCH_SOP_POP_32_S_10_5 38
-+#define R_LARCH_SOP_POP_32_U_10_12 39
-+#define R_LARCH_SOP_POP_32_S_10_12 40
-+#define R_LARCH_SOP_POP_32_S_10_16 41
-+#define R_LARCH_SOP_POP_32_S_10_16_S2 42
-+#define R_LARCH_SOP_POP_32_S_5_20 43
-+#define R_LARCH_SOP_POP_32_S_0_5_10_16_S2 44
-+#define R_LARCH_SOP_POP_32_S_0_10_10_16_S2 45
-+#define R_LARCH_SOP_POP_32_U 46
-+
-+/* used by the static linker for relocating non .text */
-+/* These relocation types exist to support difference expressions such as
-+   ".dword sym1 - sym2". They operate on link-time addresses and normally
-+   appear in pairs. For a plain negation such as ".dword - sym1",
-+   R_LARCH_SUBxx appears on its own; note that the value filled in at that
-+   position is the link-time address. */
-+#define R_LARCH_ADD8 47
-+#define R_LARCH_ADD16 48
-+#define R_LARCH_ADD24 49
-+#define R_LARCH_ADD32 50
-+#define R_LARCH_ADD64 51
-+#define R_LARCH_SUB8 52
-+#define R_LARCH_SUB16 53
-+#define R_LARCH_SUB24 54
-+#define R_LARCH_SUB32 55
-+#define R_LARCH_SUB64 56
-+
-+ /* I don't know what it is. Existing in almost all other arch */
-+#define R_LARCH_GNU_VTINHERIT 57
-+#define R_LARCH_GNU_VTENTRY 58
-+
-+
- __END_DECLS
-
- #endif /* elf.h */
-diff --git a/scripts/config.guess b/scripts/config.guess
-index 588fe82a..a1d1cb2a 100755
---- a/scripts/config.guess
-+++ b/scripts/config.guess
-@@ -957,6 +957,9 @@ EOF
- k1om:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
- exit ;;
-+ loongarch32:Linux:*:* | loongarch64:Linux:*:*)
-+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-+ exit ;;
- m32r*:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
- exit ;;
-diff --git a/scripts/config.sub b/scripts/config.sub
-index f2632cd8..429ec408 100755
---- a/scripts/config.sub
-+++ b/scripts/config.sub
-@@ -142,7 +142,7 @@ case $os in
- -sun*os*)
- # Prevent following clause from handling this invalid input.
- ;; -- -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ -+ -dec* | -mips* | -loongarch* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ - -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ - -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ - -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ -@@ -265,6 +265,7 @@ case $basic_machine in - | k1om \ - | le32 | le64 \ - | lm32 \ -+ | loongarch32 | loongarch64 \ - | m32c | m32r | m32rle | m68000 | m68k | m88k \ - | maxq | mb | microblaze | microblazeel | mcore | mep | metag \ - | mips | mipsbe | mipseb | mipsel | mipsle \ -@@ -390,6 +391,7 @@ case $basic_machine in - | k1om-* \ - | le32-* | le64-* \ - | lm32-* \ -+ | loongarch32-* | loongarch64-* \ - | m32c-* | m32r-* | m32rle-* \ - | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ - | m88110-* | m88k-* | maxq-* | mcore-* | metag-* \ -@@ -1339,6 +1341,9 @@ case $basic_machine in - pmac | pmac-mpw) - basic_machine=powerpc-apple - ;; -+ loongarch) -+ basic_machine=loongarch-loongson -+ ;; - *-unknown) - # Make sure to match an already-canonicalized machine name. - ;; -diff --git a/sysdeps/loongarch/Implies b/sysdeps/loongarch/Implies -new file mode 100644 -index 00000000..c88325b8 ---- /dev/null -+++ b/sysdeps/loongarch/Implies -@@ -0,0 +1,5 @@ -+init_array -+ -+ieee754/ldbl-128 -+ieee754/dbl-64 -+ieee754/flt-32 -diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile -new file mode 100644 -index 00000000..286cff67 ---- /dev/null -+++ b/sysdeps/loongarch/Makefile -@@ -0,0 +1,36 @@ -+ifeq ($(subdir),misc) -+sysdep_headers += sys/asm.h -+endif -+ -+ifeq ($(subdir),elf) -+ sysdep-dl-routines += dl-get-cpu-features -+endif -+ -+# LoongArch's assembler also needs to know about PIC as it changes the definition -+# of some assembler macros. -+ASFLAGS-.os += $(pic-ccflag) -+CFLAGS-elf-init.oS += -mcmodel=large -+CFLAGS-atexit.oS += -mcmodel=large -+CFLAGS-at_quick_exit.oS += -mcmodel=large -+CFLAGS-stat.oS += -mcmodel=large -+CFLAGS-fstat.oS += -mcmodel=large -+CFLAGS-lstat.oS += -mcmodel=large -+CFLAGS-stat64.oS += -mcmodel=large -+CFLAGS-fstat64.oS += -mcmodel=large -+CFLAGS-lstat64.oS += -mcmodel=large -+CFLAGS-fstatat.oS += -mcmodel=large -+CFLAGS-fstatat64.oS += -mcmodel=large -+CFLAGS-mknod.oS += -mcmodel=large -+CFLAGS-mknodat.oS += -mcmodel=large -+CFLAGS-pthread_atfork.oS += -mcmodel=large -+CFLAGS-warning-nop.oS += -mcmodel=large -+CFLAGS-stack_chk_fail_local.oS += -mcmodel=large -+ -+abi-variants := lp32 lp64 -+ -+ifeq (,$(filter $(default-abi),$(abi-variants))) -+$(error Unknown ABI $(default-abi), must be one of $(abi-variants)) -+endif -+ -+abi-lp64-condition := defined _ABILP64 -+abi-lp32-condition := defined _ABILP32 -diff --git a/sysdeps/loongarch/Versions b/sysdeps/loongarch/Versions -new file mode 100644 -index 00000000..33ae2cc0 ---- /dev/null -+++ b/sysdeps/loongarch/Versions -@@ -0,0 +1,5 @@ -+ld { -+ GLIBC_PRIVATE { -+ _dl_larch_get_cpu_features; -+ } -+} -diff --git a/sysdeps/loongarch/__longjmp.S b/sysdeps/loongarch/__longjmp.S -new file mode 100644 -index 00000000..68f67639 ---- /dev/null -+++ b/sysdeps/loongarch/__longjmp.S -@@ -0,0 +1,50 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+ENTRY (__longjmp) -+ REG_L ra, a0, 0*SZREG -+ REG_L sp, a0, 1*SZREG -+ REG_L x, a0, 2*SZREG -+ REG_L fp, a0, 3*SZREG -+ REG_L s0, a0, 4*SZREG -+ REG_L s1, a0, 5*SZREG -+ REG_L s2, a0, 6*SZREG -+ REG_L s3, a0, 7*SZREG -+ REG_L s4, a0, 8*SZREG -+ REG_L s5, a0, 9*SZREG -+ REG_L s6, a0, 10*SZREG -+ REG_L s7, a0, 11*SZREG -+ REG_L s8, a0, 12*SZREG -+ -+ FREG_L $f24, a0, 13*SZREG + 0*SZFREG -+ FREG_L $f25, a0, 13*SZREG + 1*SZFREG -+ FREG_L $f26, a0, 13*SZREG + 2*SZFREG -+ FREG_L $f27, a0, 13*SZREG + 3*SZFREG -+ FREG_L $f28, a0, 13*SZREG + 4*SZFREG -+ FREG_L $f29, a0, 13*SZREG + 5*SZFREG -+ FREG_L $f30, a0, 13*SZREG + 6*SZFREG -+ FREG_L $f31, a0, 13*SZREG + 7*SZFREG -+ -+ sltui a0,a1,1 -+ add.d a0, a0, a1 # a0 = (a1 == 0) ? 1 : a1 -+ jirl zero,ra,0 -+ -+END (__longjmp) -diff --git a/sysdeps/loongarch/abort-instr.h b/sysdeps/loongarch/abort-instr.h -new file mode 100644 -index 00000000..46d3ad08 ---- /dev/null -+++ b/sysdeps/loongarch/abort-instr.h -@@ -0,0 +1,2 @@ -+/* An instruction which should crash any program is a breakpoint. */ -+#define ABORT_INSTRUCTION asm ("break 0") -diff --git a/sysdeps/loongarch/at_quick_exit.c b/sysdeps/loongarch/at_quick_exit.c -new file mode 100644 -index 00000000..8d4b44a7 ---- /dev/null -+++ b/sysdeps/loongarch/at_quick_exit.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/atexit.c b/sysdeps/loongarch/atexit.c -new file mode 100644 -index 00000000..fc055a48 ---- /dev/null -+++ b/sysdeps/loongarch/atexit.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/bits/endian.h b/sysdeps/loongarch/bits/endian.h -new file mode 100644 -index 00000000..dc9a3f2e ---- /dev/null -+++ b/sysdeps/loongarch/bits/endian.h -@@ -0,0 +1,9 @@ -+/* The MIPS architecture has selectable endianness. -+ It exists in both little and big endian flavours and we -+ want to be able to share the installed header files between -+ both, so we define __BYTE_ORDER based on GCC's predefines. */ -+ -+#ifndef _ENDIAN_H -+# error "Never use directly; include instead." -+#endif -+# define __BYTE_ORDER __LITTLE_ENDIAN -diff --git a/sysdeps/loongarch/bits/fenv.h b/sysdeps/loongarch/bits/fenv.h -new file mode 100644 -index 00000000..42767412 ---- /dev/null -+++ b/sysdeps/loongarch/bits/fenv.h -@@ -0,0 +1,93 @@ -+/* Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _FENV_H -+# error "Never use directly; include instead." -+#endif -+ -+ -+/* Define bits representing the exception. We use the bit positions -+ of the appropriate bits in the FPU control word. */ -+enum -+ { -+ FE_INEXACT = -+#define FE_INEXACT 0x010000 -+ FE_INEXACT, -+ FE_UNDERFLOW = -+#define FE_UNDERFLOW 0x020000 -+ FE_UNDERFLOW, -+ FE_OVERFLOW = -+#define FE_OVERFLOW 0x040000 -+ FE_OVERFLOW, -+ FE_DIVBYZERO = -+#define FE_DIVBYZERO 0x080000 -+ FE_DIVBYZERO, -+ FE_INVALID = -+#define FE_INVALID 0x100000 -+ FE_INVALID, -+ }; -+ -+#define FE_ALL_EXCEPT \ -+ (FE_INEXACT | FE_DIVBYZERO | FE_UNDERFLOW | FE_OVERFLOW | FE_INVALID) -+ -+/* The MIPS FPU supports all of the four defined rounding modes. We -+ use again the bit positions in the FPU control word as the values -+ for the appropriate macros. */ -+enum -+ { -+ FE_TONEAREST = -+#define FE_TONEAREST 0x000 -+ FE_TONEAREST, -+ FE_TOWARDZERO = -+#define FE_TOWARDZERO 0x100 -+ FE_TOWARDZERO, -+ FE_UPWARD = -+#define FE_UPWARD 0x200 -+ FE_UPWARD, -+ FE_DOWNWARD = -+#define FE_DOWNWARD 0x300 -+ FE_DOWNWARD -+ }; -+ -+ -+/* Type representing exception flags. */ -+typedef unsigned int fexcept_t; -+ -+ -+/* Type representing floating-point environment. This function corresponds -+ to the layout of the block written by the `fstenv'. */ -+typedef struct -+ { -+ unsigned int __fp_control_register; -+ } -+fenv_t; -+ -+/* If the default argument is used we use this value. */ -+#define FE_DFL_ENV ((const fenv_t *) -1) -+ -+#ifdef __USE_GNU -+/* Floating-point environment where none of the exception is masked. */ -+# define FE_NOMASK_ENV ((const fenv_t *) -257) -+#endif -+ -+#if __GLIBC_USE (IEC_60559_BFP_EXT) -+/* Type representing floating-point control modes. */ -+typedef unsigned int femode_t; -+ -+/* Default floating-point control modes. */ -+# define FE_DFL_MODE ((const femode_t *) -1L) -+#endif -diff --git a/sysdeps/loongarch/bits/link.h b/sysdeps/loongarch/bits/link.h -new file mode 100644 -index 00000000..554dfdc0 ---- /dev/null -+++ b/sysdeps/loongarch/bits/link.h -@@ -0,0 +1,56 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LINK_H -+# error "Never include directly; use instead." -+#endif -+ -+typedef struct La_loongarch_regs -+{ -+ unsigned long int lr_reg[8]; /* a0 - a7 */ -+ double lr_fpreg[8]; /* fa0 - fa7 */ -+ unsigned long int lr_ra; -+ unsigned long int lr_sp; -+} La_loongarch_regs; -+ -+/* Return values for calls from PLT on LoongArch. 
*/ -+typedef struct La_loongarch_retval -+{ -+ unsigned long int lrv_a0; -+ unsigned long int lrv_a1; -+ double lrv_fa0; -+ double lrv_fa1; -+} La_loongarch_retval; -+ -+__BEGIN_DECLS -+ -+extern ElfW(Addr) la_loongarch_gnu_pltenter (ElfW(Sym) *__sym, unsigned int __ndx, -+ uintptr_t *__refcook, -+ uintptr_t *__defcook, -+ La_loongarch_regs *__regs, -+ unsigned int *__flags, -+ const char *__symname, -+ long int *__framesizep); -+extern unsigned int la_loongarch_gnu_pltexit (ElfW(Sym) *__sym, unsigned int __ndx, -+ uintptr_t *__refcook, -+ uintptr_t *__defcook, -+ const La_loongarch_regs *__inregs, -+ La_loongarch_retval *__outregs, -+ const char *__symname); -+ -+__END_DECLS -diff --git a/sysdeps/loongarch/bits/setjmp.h b/sysdeps/loongarch/bits/setjmp.h -new file mode 100644 -index 00000000..cc9b6bfd ---- /dev/null -+++ b/sysdeps/loongarch/bits/setjmp.h -@@ -0,0 +1,39 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LOONGARCH_BITS_SETJMP_H -+#define _LOONGARCH_BITS_SETJMP_H -+ -+typedef struct __jmp_buf_internal_tag -+ { -+ /* Program counter. */ -+ long int __pc; -+ /* Stack pointer. */ -+ long int __sp; -+ /* Reserved */ -+ long int __x; -+ /* Frame pointer. */ -+ long int __fp; -+ /* Callee-saved registers. */ -+ long int __regs[9]; -+ -+ /* Callee-saved floating point registers. */ -+ double __fpregs[8]; -+ } __jmp_buf[1]; -+ -+#endif /* _LOONGARCH_BITS_SETJMP_H */ -diff --git a/sysdeps/loongarch/bits/wordsize.h b/sysdeps/loongarch/bits/wordsize.h -new file mode 100644 -index 00000000..8dbaa00d ---- /dev/null -+++ b/sysdeps/loongarch/bits/wordsize.h -@@ -0,0 +1,22 @@ -+/* Copyright (C) 1999-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+#define __loongarch_xlen 64 -+ -+#define __WORDSIZE __loongarch_xlen -+#define __WORDSIZE_TIME64_COMPAT32 0 -+ -diff --git a/sysdeps/loongarch/bsd-_setjmp.c b/sysdeps/loongarch/bsd-_setjmp.c -new file mode 100644 -index 00000000..0d413101 ---- /dev/null -+++ b/sysdeps/loongarch/bsd-_setjmp.c -@@ -0,0 +1 @@ -+/* _setjmp is implemented in setjmp.S */ -diff --git a/sysdeps/loongarch/bsd-setjmp.c b/sysdeps/loongarch/bsd-setjmp.c -new file mode 100644 -index 00000000..ee7c5e34 ---- /dev/null -+++ b/sysdeps/loongarch/bsd-setjmp.c -@@ -0,0 +1 @@ -+/* setjmp is implemented in setjmp.S */ -diff --git a/sysdeps/loongarch/configure b/sysdeps/loongarch/configure -new file mode 100755 -index 00000000..1e5abf81 ---- /dev/null -+++ b/sysdeps/loongarch/configure -@@ -0,0 +1,4 @@ -+# This file is generated from configure.ac by Autoconf. DO NOT EDIT! -+ # Local configure fragment for sysdeps/loongarch/elf. -+ -+#AC_DEFINE(PI_STATIC_AND_HIDDEN) -diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac -new file mode 100644 -index 00000000..67b46ce0 ---- /dev/null -+++ b/sysdeps/loongarch/configure.ac -@@ -0,0 +1,6 @@ -+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. -+# Local configure fragment for sysdeps/loongarch/elf. -+ -+dnl It is always possible to access static and hidden symbols in an -+dnl position independent way. -+#AC_DEFINE(PI_STATIC_AND_HIDDEN) -diff --git a/sysdeps/loongarch/cpu-tunables.c b/sysdeps/loongarch/cpu-tunables.c -new file mode 100644 -index 00000000..840c1b8c ---- /dev/null -+++ b/sysdeps/loongarch/cpu-tunables.c -@@ -0,0 +1,94 @@ -+/* LoongArch CPU feature tuning. -+ This file is part of the GNU C Library. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#if HAVE_TUNABLES -+# define TUNABLE_NAMESPACE cpu -+# include -+# include -+# include /* Get STDOUT_FILENO for _dl_printf. 
*/ -+# include -+# include -+# include -+# include -+# include -+ -+# define HWCAP_LOONGARCH_IFUNC \ -+ (HWCAP_LOONGARCH_UAL | HWCAP_LOONGARCH_LSX | HWCAP_LOONGARCH_LASX) -+ -+# define CHECK_GLIBC_IFUNC_CPU_OFF(f, name, len) \ -+ _Static_assert (sizeof (#name) - 1 == len, #name " != " #len); \ -+ if (!memcmp (f, #name, len) && \ -+ (GLRO (dl_hwcap) & HWCAP_LOONGARCH_##name)) \ -+ { \ -+ hwcap |= (HWCAP_LOONGARCH_##name | (~HWCAP_LOONGARCH_IFUNC)); \ -+ break; \ -+ } \ -+ -+ -+attribute_hidden -+void -+TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) -+{ -+ const char *p = valp->strval; -+ size_t len; -+ unsigned long hwcap = 0; -+ const char *c; -+ -+ do { -+ for (c = p; *c != ','; c++) -+ if (*c == '\0') -+ break; -+ -+ len = c - p; -+ -+ switch(len) -+ { -+ default: -+ _dl_fatal_printf ( -+ "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n" -+ ); -+ break; -+ case 3: -+ { -+ CHECK_GLIBC_IFUNC_CPU_OFF (p, LSX, 3); -+ CHECK_GLIBC_IFUNC_CPU_OFF (p, UAL, 3); -+ _dl_fatal_printf ( -+ "Some features are invalid or not supported on this machine!!\n" -+ "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n" -+ ); -+ } -+ break; -+ case 4: -+ { -+ CHECK_GLIBC_IFUNC_CPU_OFF (p, LASX, 4); -+ _dl_fatal_printf ( -+ "Some features are invalid or not supported on this machine!!\n" -+ "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n" -+ ); -+ } -+ break; -+ } -+ -+ p += len + 1; -+ } -+ while (*c != '\0'); -+ -+ GLRO (dl_hwcap) &= hwcap; -+} -+ -+#endif -diff --git a/sysdeps/loongarch/dl-get-cpu-features.c b/sysdeps/loongarch/dl-get-cpu-features.c -new file mode 100644 -index 00000000..ed71abe0 ---- /dev/null -+++ b/sysdeps/loongarch/dl-get-cpu-features.c -@@ -0,0 +1,25 @@ -+/* Define _dl_larch_get_cpu_features. -+ Copyright (C) 2015-2022 Free Software Foundation, Inc. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+ -+#include -+ -+const struct cpu_features * -+_dl_larch_get_cpu_features (void) -+{ -+ return &GLRO(dl_larch_cpu_features); -+} -diff --git a/sysdeps/loongarch/dl-irel.h b/sysdeps/loongarch/dl-irel.h -new file mode 100644 -index 00000000..4216fec2 ---- /dev/null -+++ b/sysdeps/loongarch/dl-irel.h -@@ -0,0 +1,51 @@ -+/* Machine-dependent ELF indirect relocation inline functions. -+ x86-64 version. -+ Copyright (C) 2009-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef _DL_IREL_H -+#define _DL_IREL_H -+ -+#include -+#include -+ -+#define ELF_MACHINE_IRELA 1 -+ -+static inline ElfW(Addr) -+__attribute ((always_inline)) -+elf_ifunc_invoke (ElfW(Addr) addr) -+{ -+ return ((ElfW(Addr) (*) (void)) (addr)) (); -+} -+ -+static inline void -+__attribute ((always_inline)) -+elf_irela (const ElfW(Rela) *reloc) -+{ -+ ElfW(Addr) *const reloc_addr = (void *) reloc->r_offset; -+ const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info); -+ -+ if (__glibc_likely (r_type == R_LARCH_IRELATIVE)) -+ { -+ ElfW(Addr) value = elf_ifunc_invoke(reloc->r_addend); -+ *reloc_addr = value; -+ } -+ else -+ __libc_fatal ("Unexpected reloc type in static binary.\n"); -+} -+ -+#endif /* dl-irel.h */ -diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h -new file mode 100644 -index 00000000..2d527241 ---- /dev/null -+++ b/sysdeps/loongarch/dl-machine.h -@@ -0,0 +1,410 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef dl_machine_h -+#define dl_machine_h -+ -+#define ELF_MACHINE_NAME "LoongArch" -+ -+#if HAVE_TUNABLES -+#define TUNABLE_NAMESPACE cpu -+#include -+extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *) attribute_hidden; -+#endif -+ -+#include -+#include -+#include -+#include -+#include -+ -+ -+#ifndef _RTLD_PROLOGUE -+# define _RTLD_PROLOGUE(entry) \ -+ ".globl\t" __STRING (entry) "\n\t" \ -+ ".type\t" __STRING (entry) ", @function\n\t" \ -+ CFI_STARTPROC "\n" \ -+ __STRING (entry) ":\n" -+#endif -+ -+#ifndef _RTLD_EPILOGUE -+# define _RTLD_EPILOGUE(entry) \ -+ CFI_ENDPROC "\n\t" \ -+ ".size\t" __STRING (entry) ", . - " __STRING (entry) "\n" -+#endif -+ -+#define ELF_MACHINE_JMP_SLOT R_LARCH_JUMP_SLOT -+#define ELF_MACHINE_IRELATIVE R_LARCH_IRELATIVE -+ -+#define elf_machine_type_class(type) \ -+ ((ELF_RTYPE_CLASS_PLT * ((type) == ELF_MACHINE_JMP_SLOT \ -+ || (__WORDSIZE == 32 && (type) == R_LARCH_TLS_DTPREL32) \ -+ || (__WORDSIZE == 32 && (type) == R_LARCH_TLS_DTPMOD32) \ -+ || (__WORDSIZE == 32 && (type) == R_LARCH_TLS_TPREL32) \ -+ || (__WORDSIZE == 64 && (type) == R_LARCH_TLS_DTPREL64) \ -+ || (__WORDSIZE == 64 && (type) == R_LARCH_TLS_DTPMOD64) \ -+ || (__WORDSIZE == 64 && (type) == R_LARCH_TLS_TPREL64))) \ -+ | (ELF_RTYPE_CLASS_COPY * ((type) == R_LARCH_COPY))) -+ -+#define ELF_MACHINE_NO_REL 1 -+#define ELF_MACHINE_NO_RELA 0 -+#define PLTREL ElfW(Rela) -+ -+#define DL_PLATFORM_INIT dl_platform_init () -+ -+static inline void __attribute__ ((unused)) -+dl_platform_init (void) -+{ -+ if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0') -+ /* Avoid an empty string which would disturb us. 
*/ -+ GLRO(dl_platform) = NULL; -+ -+#ifdef SHARED -+ -+#if HAVE_TUNABLES -+ TUNABLE_GET (hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps)); -+#endif -+ /* init_cpu_features has been called early from __libc_start_main in -+ static executable. */ -+ init_cpu_features (&GLRO(dl_larch_cpu_features)); -+#endif -+} -+ -+ -+/* Return nonzero iff ELF header is compatible with the running host. */ -+static inline int __attribute_used__ -+elf_machine_matches_host (const ElfW(Ehdr) *ehdr) -+{ -+ /* We can only run LoongArch binaries. */ -+ if (ehdr->e_machine != EM_LOONGARCH) -+ return 0; -+ -+#ifdef _ABILP64 -+ if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LP64) -+#elif defined _ABILPX32 -+ if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LPX32) -+#elif defined _ABILP32 -+ if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LP32) -+#else -+# error "Unknown ABI" -+#endif -+ return 0; -+ -+ return 1; -+} -+ -+/* Runtime address of .got */ -+#define _GLOBAL_OFFSET_TABLE_ ({ \ -+ ElfW(Addr) *r; \ -+ asm ("la.pcrel %0, _GLOBAL_OFFSET_TABLE_":"=r" (r)); \ -+ r; \ -+}) -+ -+/* Return the link-time address of _DYNAMIC. */ -+static inline ElfW(Addr) -+elf_machine_dynamic (void) -+{ -+ return _GLOBAL_OFFSET_TABLE_[0]; -+} -+ -+#define STRINGXP(X) __STRING (X) -+#define STRINGXV(X) STRINGV_ (X) -+#define STRINGV_(...) # __VA_ARGS__ -+ -+/* Return the run-time load address of the shared object. */ -+static inline ElfW(Addr) -+elf_machine_load_address (void) -+{ -+ ElfW(Addr) got_linktime_addr; -+ asm ( -+ "la.got %0, _GLOBAL_OFFSET_TABLE_" -+ /* Link-time address in GOT entry before runtime relocation */ -+ : "=r" (got_linktime_addr) -+ ); -+ return (ElfW(Addr))_GLOBAL_OFFSET_TABLE_ - got_linktime_addr; -+} -+ -+/* Initial entry point code for the dynamic linker. -+ The C function `_dl_start' is the real entry point; -+ its return value is the user program's entry point. */ -+ -+#define RTLD_START asm (\ -+ ".text\n\ -+ " _RTLD_PROLOGUE (ENTRY_POINT) "\ -+ .cfi_label .Ldummy\n\ -+ " CFI_UNDEFINED (1) "\n\ -+ or $a0, $sp, $zero\n\ -+ bl _dl_start\n\ -+ # Stash user entry point in s0.\n\ -+ or $s0, $v0, $zero\n\ -+ # See if we were run as a command with the executable file\n\ -+ # name as an extra leading argument.\n\ -+ la $a0, _dl_skip_args\n\ -+ ld.w $a0, $a0, 0\n\ -+ # Load the original argument count.\n\ -+ ld.d $a1, $sp, 0\n\ -+ # Subtract _dl_skip_args from it.\n\ -+ sub.d $a1, $a1, $a0\n\ -+ # Adjust the stack pointer to skip _dl_skip_args words.\n\ -+ slli.d $a0, $a0, 3\n\ -+ add.d $sp, $sp, $a0\n\ -+ # Save back the modified argument count.\n\ -+ st.d $a1, $sp, 0\n\ -+ # Call _dl_init (struct link_map *main_map, int argc, char **argv, char **env) \n\ -+ la $a0, _rtld_local\n\ -+ ld.d $a0, $a0, 0\n\ -+ addi.d $a2, $sp, 8\n\ -+ slli.d $a3, $a1, 3\n\ -+ add.d $a3, $a3, $a2\n\ -+ addi.d $a3, $a3, 8\n\ -+ # Adjust $sp for 16-aligned\n\ -+ srli.d $t0, $sp, 4\n\ -+ slli.d $t0, $t0, 4\n\ -+ ori $t1, $sp, 0\n\ -+ addi.d $sp, $t0, -32\n\ -+ st.d $t1, $sp, 24\n\ -+ # Call the function to run the initializers.\n\ -+ bl _dl_init\n\ -+ # Pass our finalizer function to _start.\n\ -+ ld.d $sp, $sp, 24\n\ -+ la $a0, _dl_fini\n\ -+ # Jump to the user entry point.\n\ -+ jirl $zero, $s0, 0\n\ -+ " _RTLD_EPILOGUE (ENTRY_POINT) "\ -+ .previous" \ -+); -+ -+/* Names of the architecture-specific auditing callback functions. */ -+#define ARCH_LA_PLTENTER loongarch_gnu_pltenter -+#define ARCH_LA_PLTEXIT loongarch_gnu_pltexit -+ -+/* Bias .got.plt entry by the offset requested by the PLT header. 
*/ -+#define elf_machine_plt_value(map, reloc, value) (value) -+ -+static inline ElfW(Addr) -+elf_machine_fixup_plt (struct link_map *map, lookup_t t, -+ const ElfW(Sym) *refsym, const ElfW(Sym) *sym, -+ const ElfW(Rela) *reloc, -+ ElfW(Addr) *reloc_addr, ElfW(Addr) value) -+{ -+ return *reloc_addr = value; -+} -+ -+#endif /* !dl_machine_h */ -+ -+#ifdef RESOLVE_MAP -+ -+/* Perform a relocation described by R_INFO at the location pointed to -+ by RELOC_ADDR. SYM is the relocation symbol specified by R_INFO and -+ MAP is the object containing the reloc. */ -+ -+auto inline void -+__attribute__ ((always_inline)) -+elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[], -+ const ElfW(Rela) *reloc, const ElfW(Sym) *sym, -+ const struct r_found_version *version, -+ void *const reloc_addr, int skip_ifunc) -+{ -+ ElfW(Addr) r_info = reloc->r_info; -+ const unsigned long int r_type = ELFW (R_TYPE) (r_info); -+ ElfW(Addr) *addr_field = (ElfW(Addr) *) reloc_addr; -+ const ElfW(Sym) *const __attribute__ ((unused)) refsym = sym; -+ struct link_map *sym_map = RESOLVE_MAP (map, scope, &sym, version, r_type); -+ ElfW(Addr) value = 0; -+ if (sym_map != NULL) -+ value = SYMBOL_ADDRESS (sym_map, sym, true) + reloc->r_addend; -+ -+ if (sym != NULL -+ && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0) -+ && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1) -+ && __builtin_expect (!skip_ifunc, 1)) -+ value = ((ElfW(Addr) (*) (int)) value) (GLRO(dl_hwcap)); -+ -+ switch (r_type) -+ { -+#ifndef RTLD_BOOTSTRAP -+ case __WORDSIZE == 64 ? R_LARCH_TLS_DTPMOD64 : R_LARCH_TLS_DTPMOD32: -+ if (sym_map) -+ *addr_field = sym_map->l_tls_modid; -+ break; -+ -+ case __WORDSIZE == 64 ? R_LARCH_TLS_DTPREL64 : R_LARCH_TLS_DTPREL32: -+ if (sym != NULL) -+ *addr_field = TLS_DTPREL_VALUE (sym) + reloc->r_addend; -+ break; -+ -+ case __WORDSIZE == 64 ? R_LARCH_TLS_TPREL64 : R_LARCH_TLS_TPREL32: -+ if (sym != NULL) -+ { -+ CHECK_STATIC_TLS (map, sym_map); -+ *addr_field = TLS_TPREL_VALUE (sym_map, sym) + reloc->r_addend; -+ } -+ break; -+ -+ case R_LARCH_COPY: -+ { -+ if (__glibc_unlikely (sym == NULL)) -+ /* This can happen in trace mode if an object could not be -+ found. */ -+ break; -+ -+ /* Handle TLS copy relocations. */ -+ if (__glibc_unlikely (ELFW (ST_TYPE) (sym->st_info) == STT_TLS)) -+ { -+ /* There's nothing to do if the symbol is in .tbss. */ -+ if (__glibc_likely (sym->st_value >= sym_map->l_tls_initimage_size)) -+ break; -+ value += (ElfW(Addr)) sym_map->l_tls_initimage - sym_map->l_addr; -+ } -+ -+ size_t size = sym->st_size; -+ if (__glibc_unlikely (sym->st_size != refsym->st_size)) -+ { -+ const char *strtab = (const void *) D_PTR (map, l_info[DT_STRTAB]); -+ if (sym->st_size > refsym->st_size) -+ size = refsym->st_size; -+ if (sym->st_size > refsym->st_size || GLRO(dl_verbose)) -+ _dl_error_printf ("\ -+ %s: Symbol `%s' has different size in shared object, consider re-linking\n", -+ rtld_progname ?: "", -+ strtab + refsym->st_name); -+ } -+ -+ memcpy (reloc_addr, (void *)value, size); -+ break; -+ } -+#endif -+ -+#if !defined RTLD_BOOTSTRAP || !defined HAVE_Z_COMBRELOC -+ case R_LARCH_RELATIVE: -+ { -+# if !defined RTLD_BOOTSTRAP && !defined HAVE_Z_COMBRELOC -+ /* This is defined in rtld.c, but nowhere in the static libc.a; -+ make the reference weak so static programs can still link. -+ This declaration cannot be done when compiling rtld.c -+ (i.e. 
#ifdef RTLD_BOOTSTRAP) because rtld.c contains the -+ common defn for _dl_rtld_map, which is incompatible with a -+ weak decl in the same file. */ -+# ifndef SHARED -+ weak_extern (GL(dl_rtld_map)); -+# endif -+ if (map != &GL(dl_rtld_map)) /* Already done in rtld itself. */ -+# endif -+ *addr_field = map->l_addr + reloc->r_addend; -+ break; -+ } -+#endif -+ -+ case R_LARCH_JUMP_SLOT: -+ case __WORDSIZE == 64 ? R_LARCH_64 : R_LARCH_32: -+ *addr_field = value; -+ break; -+ -+ case R_LARCH_IRELATIVE: -+ value = map->l_addr + reloc->r_addend; -+ if (__glibc_likely (!skip_ifunc)) -+ value = ((ElfW(Addr) (*) (void)) value) (); -+ *addr_field = value; -+ break; -+ -+ case R_LARCH_NONE: -+ break; -+ -+ default: -+ _dl_reloc_bad_type (map, r_type, 0); -+ break; -+ } -+} -+ -+auto inline void -+__attribute__ ((always_inline)) -+elf_machine_rela_relative (ElfW(Addr) l_addr, const ElfW(Rela) *reloc, -+ void *const reloc_addr) -+{ -+ *(ElfW(Addr) *) reloc_addr = l_addr + reloc->r_addend; -+} -+ -+auto inline void -+__attribute__ ((always_inline)) -+elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[], -+ ElfW(Addr) l_addr, -+ const ElfW(Rela) *reloc, -+ int skip_ifunc) -+{ -+ ElfW(Addr) *const reloc_addr = (void *) (l_addr + reloc->r_offset); -+ const unsigned int r_type = ELFW (R_TYPE) (reloc->r_info); -+ -+ /* Check for unexpected PLT reloc type. */ -+ if (__glibc_likely (r_type == R_LARCH_JUMP_SLOT)) -+ { -+ if (__glibc_unlikely (map->l_mach.plt == 0)) -+ { -+ if (l_addr) -+ *reloc_addr += l_addr; -+ } -+ else -+ *reloc_addr = map->l_mach.plt; -+ } -+ else if (__glibc_unlikely (r_type == R_LARCH_IRELATIVE)) -+ { -+ ElfW(Addr) *value = (void *) (l_addr + reloc->r_addend); -+ if (__glibc_likely (!skip_ifunc)) -+ value = (ElfW(Addr) *)((ElfW(Addr) (*) (void)) value) (); -+ *reloc_addr = (ElfW(Addr))value; -+ } -+ else -+ _dl_reloc_bad_type (map, r_type, 1); -+} -+ -+/* Set up the loaded object described by L so its stub function -+ will jump to the on-demand fixup code __dl_runtime_resolve. */ -+ -+auto inline int -+__attribute__ ((always_inline)) -+elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], -+ int lazy, int profile) -+{ -+#ifndef RTLD_BOOTSTRAP -+ /* If using PLTs, fill in the first two entries of .got.plt. */ -+ if (l->l_info[DT_JMPREL]) -+ { -+ extern void _dl_runtime_resolve (void) __attribute__ ((visibility ("hidden"))); -+ extern void _dl_runtime_resolve_lasx (void) __attribute__ ((visibility ("hidden"))); -+ extern void _dl_runtime_resolve_lsx (void) __attribute__ ((visibility ("hidden"))); -+ ElfW(Addr) *gotplt = (ElfW(Addr) *) D_PTR (l, l_info[DT_PLTGOT]); -+ /* If a library is prelinked but we have to relocate anyway, -+ we have to be able to undo the prelinking of .got.plt. -+ The prelinker saved the address of .plt for us here. */ -+ if (gotplt[1]) -+ l->l_mach.plt = gotplt[1] + l->l_addr; -+ -+ if (SUPPORT_LASX) -+ gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx; -+ else if (SUPPORT_LSX) -+ gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx; -+ else -+ gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve; -+ -+ gotplt[1] = (ElfW(Addr)) l; -+ } -+#endif -+ -+ return lazy; -+} -+ -+#endif /* RESOLVE_MAP */ -diff --git a/sysdeps/loongarch/dl-tls.h b/sysdeps/loongarch/dl-tls.h -new file mode 100644 -index 00000000..70110c50 ---- /dev/null -+++ b/sysdeps/loongarch/dl-tls.h -@@ -0,0 +1,49 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+ -+/* Type used for the representation of TLS information in the GOT. */ -+typedef struct -+{ -+ unsigned long int ti_module; -+ unsigned long int ti_offset; -+} tls_index; -+ -+/* The thread pointer points to the first static TLS block. */ -+#define TLS_TP_OFFSET 0 -+ -+/* Dynamic thread vector pointers point 0x800 past the start of each -+ TLS block. */ -+//#define TLS_DTV_OFFSET 0x800 -+#define TLS_DTV_OFFSET 0 -+ -+/* Compute the value for a GOTTPREL reloc. */ -+#define TLS_TPREL_VALUE(sym_map, sym) \ -+ ((sym_map)->l_tls_offset + (sym)->st_value - TLS_TP_OFFSET) -+ -+/* Compute the value for a DTPREL reloc. */ -+#define TLS_DTPREL_VALUE(sym) \ -+ ((sym)->st_value - TLS_DTV_OFFSET) -+ -+extern void *__tls_get_addr (tls_index *ti); -+ -+#define GET_ADDR_OFFSET (ti->ti_offset + TLS_DTV_OFFSET) -+#define __TLS_GET_ADDR(__ti) (__tls_get_addr (__ti) - TLS_DTV_OFFSET) -+ -+/* Value used for dtv entries for which the allocation is delayed. */ -+#define TLS_DTV_UNALLOCATED ((void *) -1l) -diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S -new file mode 100644 -index 00000000..5f627a63 ---- /dev/null -+++ b/sysdeps/loongarch/dl-trampoline.S -@@ -0,0 +1,31 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#define USE_LASX -+#define _dl_runtime_resolve _dl_runtime_resolve_lasx -+#include "dl-trampoline.h" -+#undef USE_LASX -+#undef _dl_runtime_resolve -+ -+#define USE_LSX -+#define _dl_runtime_resolve _dl_runtime_resolve_lsx -+#include "dl-trampoline.h" -+#undef USE_LSX -+#undef _dl_runtime_resolve -+ -+#include "dl-trampoline.h" -diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h -new file mode 100644 -index 00000000..95639111 ---- /dev/null -+++ b/sysdeps/loongarch/dl-trampoline.h -@@ -0,0 +1,153 @@ -+/* LoongArch PLT trampoline -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+/* Assembler veneer called from the PLT header code for lazy loading. -+ The PLT header passes its own args in t0-t2. */ -+ -+#ifdef __loongarch_soft_float -+# define FRAME_SIZE (-((-10 * SZREG) & ALMASK)) -+#else -+# define FRAME_SIZE (-((-10 * SZREG - 8 * 256) & ALMASK)) -+#endif -+ -+ENTRY (_dl_runtime_resolve) -+ # Save arguments to stack. -+ -+#ifdef __loongarch64 -+ li.d t3, -FRAME_SIZE -+ add.d sp, sp, t3 -+#elif defined __loongarch32 -+ li.w t3, -FRAME_SIZE -+ add.w sp, sp, t3 -+#endif -+ -+ -+ REG_S ra, sp, 9*SZREG -+ REG_S a0, sp, 1*SZREG -+ REG_S a1, sp, 2*SZREG -+ REG_S a2, sp, 3*SZREG -+ REG_S a3, sp, 4*SZREG -+ REG_S a4, sp, 5*SZREG -+ REG_S a5, sp, 6*SZREG -+ REG_S a6, sp, 7*SZREG -+ REG_S a7, sp, 8*SZREG -+ -+#ifndef __loongarch_soft_float -+ FREG_S fa0, sp, 10*SZREG + 0*SZFREG -+ FREG_S fa1, sp, 10*SZREG + 1*SZFREG -+ FREG_S fa2, sp, 10*SZREG + 2*SZFREG -+ FREG_S fa3, sp, 10*SZREG + 3*SZFREG -+ FREG_S fa4, sp, 10*SZREG + 4*SZFREG -+ FREG_S fa5, sp, 10*SZREG + 5*SZFREG -+ FREG_S fa6, sp, 10*SZREG + 6*SZFREG -+ FREG_S fa7, sp, 10*SZREG + 7*SZFREG -+#ifdef USE_LASX -+ xvst $xr0, sp, 10*SZREG + 0*256 -+ xvst $xr1, sp, 10*SZREG + 1*256 -+ xvst $xr2, sp, 10*SZREG + 2*256 -+ xvst $xr3, sp, 10*SZREG + 3*256 -+ xvst $xr4, sp, 10*SZREG + 4*256 -+ xvst $xr5, sp, 10*SZREG + 5*256 -+ xvst $xr6, sp, 10*SZREG + 6*256 -+ xvst $xr7, sp, 10*SZREG + 7*256 -+#elif defined USE_LSX -+ vst $vr0, sp, 10*SZREG + 0*128 -+ vst $vr1, sp, 10*SZREG + 1*128 -+ vst $vr2, sp, 10*SZREG + 2*128 -+ vst $vr3, sp, 10*SZREG + 3*128 -+ vst $vr4, sp, 10*SZREG + 4*128 -+ vst $vr5, sp, 10*SZREG + 5*128 -+ vst $vr6, sp, 10*SZREG + 6*128 -+ vst $vr7, sp, 10*SZREG + 7*128 -+#endif -+#endif -+ -+ # Update .got.plt and obtain runtime address of callee. -+#ifdef __loongarch64 -+ slli.d a1, t1, 1 -+ or a0, t0, zero -+ add.d a1, a1, t1 -+ la a2, _dl_fixup -+ jirl ra, a2, 0 -+ or t1, v0, zero -+#elif defined __loongarch32 -+ slli.w a1, t1, 1 -+ or a0, t0, zero -+ add.w a1, a1, t1 -+ la a2, _dl_fixup -+ jirl ra, a2, 0 -+ or t1, v0, zero -+#endif -+ -+ # Restore arguments from stack. 
-+ REG_L ra, sp, 9*SZREG -+ REG_L a0, sp, 1*SZREG -+ REG_L a1, sp, 2*SZREG -+ REG_L a2, sp, 3*SZREG -+ REG_L a3, sp, 4*SZREG -+ REG_L a4, sp, 5*SZREG -+ REG_L a5, sp, 6*SZREG -+ REG_L a6, sp, 7*SZREG -+ REG_L a7, sp, 8*SZREG -+ -+#ifndef __loongarch_soft_float -+ FREG_L fa0, sp, 10*SZREG + 0*SZFREG -+ FREG_L fa1, sp, 10*SZREG + 1*SZFREG -+ FREG_L fa2, sp, 10*SZREG + 2*SZFREG -+ FREG_L fa3, sp, 10*SZREG + 3*SZFREG -+ FREG_L fa4, sp, 10*SZREG + 4*SZFREG -+ FREG_L fa5, sp, 10*SZREG + 5*SZFREG -+ FREG_L fa6, sp, 10*SZREG + 6*SZFREG -+ FREG_L fa7, sp, 10*SZREG + 7*SZFREG -+#ifdef USE_LASX -+ xvld $xr0, sp, 10*SZREG + 0*256 -+ xvld $xr1, sp, 10*SZREG + 1*256 -+ xvld $xr2, sp, 10*SZREG + 2*256 -+ xvld $xr3, sp, 10*SZREG + 3*256 -+ xvld $xr4, sp, 10*SZREG + 4*256 -+ xvld $xr5, sp, 10*SZREG + 5*256 -+ xvld $xr6, sp, 10*SZREG + 6*256 -+ xvld $xr7, sp, 10*SZREG + 7*256 -+#elif defined USE_LSX -+ vld $vr0, sp, 10*SZREG + 0*128 -+ vld $vr1, sp, 10*SZREG + 1*128 -+ vld $vr2, sp, 10*SZREG + 2*128 -+ vld $vr3, sp, 10*SZREG + 3*128 -+ vld $vr4, sp, 10*SZREG + 4*128 -+ vld $vr5, sp, 10*SZREG + 5*128 -+ vld $vr6, sp, 10*SZREG + 6*128 -+ vld $vr7, sp, 10*SZREG + 7*128 -+#endif -+#endif -+ -+#ifdef __loongarch64 -+ li.d t3, FRAME_SIZE -+ add.d sp, sp, t3 -+#elif defined __loongarch32 -+ li.w t3, FRAME_SIZE -+ addi.w sp, sp, FRAME_SIZE -+#endif -+ -+ -+ # Invoke the callee. -+ jirl zero, t1, 0 -+END (_dl_runtime_resolve) -diff --git a/sysdeps/loongarch/dl-tunables.list b/sysdeps/loongarch/dl-tunables.list -new file mode 100644 -index 00000000..22c43611 ---- /dev/null -+++ b/sysdeps/loongarch/dl-tunables.list -@@ -0,0 +1,25 @@ -+# LoongArch specific tunables. -+# Copyright (C) 2017-2018 Free Software Foundation, Inc. -+# This file is part of the GNU C Library. -+ -+# The GNU C Library is free software; you can redistribute it and/or -+# modify it under the terms of the GNU Lesser General Public -+# License as published by the Free Software Foundation; either -+# version 2.1 of the License, or (at your option) any later version. -+ -+# The GNU C Library is distributed in the hope that it will be useful, -+# but WITHOUT ANY WARRANTY; without even the implied warranty of -+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+# Lesser General Public License for more details. -+ -+# You should have received a copy of the GNU Lesser General Public -+# License along with the GNU C Library; if not, see -+# . -+ -+glibc { -+ cpu { -+ hwcaps { -+ type: STRING -+ } -+ } -+} -diff --git a/sysdeps/loongarch/e_sqrtl.c b/sysdeps/loongarch/e_sqrtl.c -new file mode 100644 -index 00000000..65ae7ad8 ---- /dev/null -+++ b/sysdeps/loongarch/e_sqrtl.c -@@ -0,0 +1,39 @@ -+/* long double square root in software floating-point emulation. -+ Copyright (C) 1997-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Richard Henderson (rth@cygnus.com) and -+ Jakub Jelinek (jj@ultra.linux.cz). -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include -+ -+long double -+__ieee754_sqrtl (const long double a) -+{ -+ FP_DECL_EX; -+ FP_DECL_Q(A); FP_DECL_Q(C); -+ long double c; -+ -+ FP_INIT_ROUNDMODE; -+ FP_UNPACK_Q(A, a); -+ FP_SQRT_Q(C, A); -+ FP_PACK_Q(c, C); -+ FP_HANDLE_EXCEPTIONS; -+ return c; -+} -+strong_alias (__ieee754_sqrtl, __sqrtl_finite) -diff --git a/sysdeps/loongarch/elf-init.c b/sysdeps/loongarch/elf-init.c -new file mode 100644 -index 00000000..5f261a9d ---- /dev/null -+++ b/sysdeps/loongarch/elf-init.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/fenv_private.h b/sysdeps/loongarch/fenv_private.h -new file mode 100644 -index 00000000..416377f6 ---- /dev/null -+++ b/sysdeps/loongarch/fenv_private.h -@@ -0,0 +1,328 @@ -+/* Optimized inline fenv.h functions for libm. Generic version. -+ Copyright (C) 2011-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef _FENV_PRIVATE_H -+#define _FENV_PRIVATE_H 1 -+ -+#include -+#include -+ -+/* The standards only specify one variant of the fenv.h interfaces. -+ But at least for some architectures we can be more efficient if we -+ know what operations are going to be performed. Therefore we -+ define additional interfaces. By default they refer to the normal -+ interfaces. 
*/ -+ -+static __always_inline void -+default_libc_feholdexcept (fenv_t *e) -+{ -+ (void) __feholdexcept (e); -+} -+ -+#ifndef libc_feholdexcept -+# define libc_feholdexcept default_libc_feholdexcept -+#endif -+#ifndef libc_feholdexceptf -+# define libc_feholdexceptf default_libc_feholdexcept -+#endif -+#ifndef libc_feholdexceptl -+# define libc_feholdexceptl default_libc_feholdexcept -+#endif -+ -+static __always_inline void -+default_libc_fesetround (int r) -+{ -+ (void) __fesetround (r); -+} -+ -+#ifndef libc_fesetround -+# define libc_fesetround default_libc_fesetround -+#endif -+#ifndef libc_fesetroundf -+# define libc_fesetroundf default_libc_fesetround -+#endif -+#ifndef libc_fesetroundl -+# define libc_fesetroundl default_libc_fesetround -+#endif -+ -+static __always_inline void -+default_libc_feholdexcept_setround (fenv_t *e, int r) -+{ -+ __feholdexcept (e); -+ __fesetround (r); -+} -+ -+#ifndef libc_feholdexcept_setround -+# define libc_feholdexcept_setround default_libc_feholdexcept_setround -+#endif -+#ifndef libc_feholdexcept_setroundf -+# define libc_feholdexcept_setroundf default_libc_feholdexcept_setround -+#endif -+#ifndef libc_feholdexcept_setroundl -+# define libc_feholdexcept_setroundl default_libc_feholdexcept_setround -+#endif -+ -+#ifndef libc_feholdsetround_53bit -+# define libc_feholdsetround_53bit libc_feholdsetround -+#endif -+ -+#ifndef libc_fetestexcept -+# define libc_fetestexcept fetestexcept -+#endif -+#ifndef libc_fetestexceptf -+# define libc_fetestexceptf fetestexcept -+#endif -+#ifndef libc_fetestexceptl -+# define libc_fetestexceptl fetestexcept -+#endif -+ -+static __always_inline void -+default_libc_fesetenv (fenv_t *e) -+{ -+ (void) __fesetenv (e); -+} -+ -+#ifndef libc_fesetenv -+# define libc_fesetenv default_libc_fesetenv -+#endif -+#ifndef libc_fesetenvf -+# define libc_fesetenvf default_libc_fesetenv -+#endif -+#ifndef libc_fesetenvl -+# define libc_fesetenvl default_libc_fesetenv -+#endif -+ -+static __always_inline void -+default_libc_feupdateenv (fenv_t *e) -+{ -+ (void) __feupdateenv (e); -+} -+ -+#ifndef libc_feupdateenv -+# define libc_feupdateenv default_libc_feupdateenv -+#endif -+#ifndef libc_feupdateenvf -+# define libc_feupdateenvf default_libc_feupdateenv -+#endif -+#ifndef libc_feupdateenvl -+# define libc_feupdateenvl default_libc_feupdateenv -+#endif -+ -+#ifndef libc_feresetround_53bit -+# define libc_feresetround_53bit libc_feresetround -+#endif -+ -+static __always_inline int -+default_libc_feupdateenv_test (fenv_t *e, int ex) -+{ -+ int ret = fetestexcept (ex); -+ __feupdateenv (e); -+ return ret; -+} -+ -+#ifndef libc_feupdateenv_test -+# define libc_feupdateenv_test default_libc_feupdateenv_test -+#endif -+#ifndef libc_feupdateenv_testf -+# define libc_feupdateenv_testf default_libc_feupdateenv_test -+#endif -+#ifndef libc_feupdateenv_testl -+# define libc_feupdateenv_testl default_libc_feupdateenv_test -+#endif -+ -+/* Save and set the rounding mode. The use of fenv_t to store the old mode -+ allows a target-specific version of this function to avoid converting the -+ rounding mode from the fpu format. By default we have no choice but to -+ manipulate the entire env. */ -+ -+#ifndef libc_feholdsetround -+# define libc_feholdsetround libc_feholdexcept_setround -+#endif -+#ifndef libc_feholdsetroundf -+# define libc_feholdsetroundf libc_feholdexcept_setroundf -+#endif -+#ifndef libc_feholdsetroundl -+# define libc_feholdsetroundl libc_feholdexcept_setroundl -+#endif -+ -+/* ... and the reverse. 
*/ -+ -+#ifndef libc_feresetround -+# define libc_feresetround libc_feupdateenv -+#endif -+#ifndef libc_feresetroundf -+# define libc_feresetroundf libc_feupdateenvf -+#endif -+#ifndef libc_feresetroundl -+# define libc_feresetroundl libc_feupdateenvl -+#endif -+ -+/* ... and a version that also discards exceptions. */ -+ -+#ifndef libc_feresetround_noex -+# define libc_feresetround_noex libc_fesetenv -+#endif -+#ifndef libc_feresetround_noexf -+# define libc_feresetround_noexf libc_fesetenvf -+#endif -+#ifndef libc_feresetround_noexl -+# define libc_feresetround_noexl libc_fesetenvl -+#endif -+ -+#ifndef HAVE_RM_CTX -+# define HAVE_RM_CTX 0 -+#endif -+ -+ -+/* Default implementation using standard fenv functions. -+ Avoid unnecessary rounding mode changes by first checking the -+ current rounding mode. Note the use of __glibc_unlikely is -+ important for performance. */ -+ -+static __always_inline void -+default_libc_feholdsetround_ctx (struct rm_ctx *ctx, int round) -+{ -+ ctx->updated_status = false; -+ -+ /* Update rounding mode only if different. */ -+ if (__glibc_unlikely (round != get_rounding_mode ())) -+ { -+ ctx->updated_status = true; -+ __fegetenv (&ctx->env); -+ __fesetround (round); -+ } -+} -+ -+static __always_inline void -+default_libc_feresetround_ctx (struct rm_ctx *ctx) -+{ -+ /* Restore the rounding mode if updated. */ -+ if (__glibc_unlikely (ctx->updated_status)) -+ __feupdateenv (&ctx->env); -+} -+ -+static __always_inline void -+default_libc_feholdsetround_noex_ctx (struct rm_ctx *ctx, int round) -+{ -+ /* Save exception flags and rounding mode, and disable exception -+ traps. */ -+ __feholdexcept (&ctx->env); -+ -+ /* Update rounding mode only if different. */ -+ if (__glibc_unlikely (round != get_rounding_mode ())) -+ __fesetround (round); -+} -+ -+static __always_inline void -+default_libc_feresetround_noex_ctx (struct rm_ctx *ctx) -+{ -+ /* Restore exception flags and rounding mode. */ -+ __fesetenv (&ctx->env); -+} -+ -+#if HAVE_RM_CTX -+/* Set/Restore Rounding Modes only when necessary. If defined, these functions -+ set/restore floating point state only if the state needed within the lexical -+ block is different from the current state. This saves a lot of time when -+ the floating point unit is much slower than the fixed point units. 
*/ -+ -+# ifndef libc_feholdsetround_noex_ctx -+# define libc_feholdsetround_noex_ctx libc_feholdsetround_ctx -+# endif -+# ifndef libc_feholdsetround_noexf_ctx -+# define libc_feholdsetround_noexf_ctx libc_feholdsetroundf_ctx -+# endif -+# ifndef libc_feholdsetround_noexl_ctx -+# define libc_feholdsetround_noexl_ctx libc_feholdsetroundl_ctx -+# endif -+ -+# ifndef libc_feresetround_noex_ctx -+# define libc_feresetround_noex_ctx libc_fesetenv_ctx -+# endif -+# ifndef libc_feresetround_noexf_ctx -+# define libc_feresetround_noexf_ctx libc_fesetenvf_ctx -+# endif -+# ifndef libc_feresetround_noexl_ctx -+# define libc_feresetround_noexl_ctx libc_fesetenvl_ctx -+# endif -+ -+#else -+ -+# define libc_feholdsetround_ctx default_libc_feholdsetround_ctx -+# define libc_feresetround_ctx default_libc_feresetround_ctx -+# define libc_feholdsetround_noex_ctx default_libc_feholdsetround_noex_ctx -+# define libc_feresetround_noex_ctx default_libc_feresetround_noex_ctx -+ -+# define libc_feholdsetroundf_ctx libc_feholdsetround_ctx -+# define libc_feholdsetroundl_ctx libc_feholdsetround_ctx -+# define libc_feresetroundf_ctx libc_feresetround_ctx -+# define libc_feresetroundl_ctx libc_feresetround_ctx -+ -+# define libc_feholdsetround_noexf_ctx libc_feholdsetround_noex_ctx -+# define libc_feholdsetround_noexl_ctx libc_feholdsetround_noex_ctx -+# define libc_feresetround_noexf_ctx libc_feresetround_noex_ctx -+# define libc_feresetround_noexl_ctx libc_feresetround_noex_ctx -+ -+#endif -+ -+#ifndef libc_feholdsetround_53bit_ctx -+# define libc_feholdsetround_53bit_ctx libc_feholdsetround_ctx -+#endif -+#ifndef libc_feresetround_53bit_ctx -+# define libc_feresetround_53bit_ctx libc_feresetround_ctx -+#endif -+ -+#define SET_RESTORE_ROUND_GENERIC(RM,ROUNDFUNC,CLEANUPFUNC) \ -+ struct rm_ctx ctx __attribute__((cleanup (CLEANUPFUNC ## _ctx))); \ -+ ROUNDFUNC ## _ctx (&ctx, (RM)) -+ -+/* Set the rounding mode within a lexical block. Restore the rounding mode to -+ the value at the start of the block. The exception mode must be preserved. -+ Exceptions raised within the block must be set in the exception flags. -+ Non-stop mode may be enabled inside the block. */ -+ -+#define SET_RESTORE_ROUND(RM) \ -+ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround) -+#define SET_RESTORE_ROUNDF(RM) \ -+ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundf, libc_feresetroundf) -+#define SET_RESTORE_ROUNDL(RM) \ -+ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetroundl) -+ -+/* Set the rounding mode within a lexical block. Restore the rounding mode to -+ the value at the start of the block. The exception mode must be preserved. -+ Exceptions raised within the block must be discarded, and exception flags -+ are restored to the value at the start of the block. -+ Non-stop mode must be enabled inside the block. */ -+ -+#define SET_RESTORE_ROUND_NOEX(RM) \ -+ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_noex, \ -+ libc_feresetround_noex) -+#define SET_RESTORE_ROUND_NOEXF(RM) \ -+ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_noexf, \ -+ libc_feresetround_noexf) -+#define SET_RESTORE_ROUND_NOEXL(RM) \ -+ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_noexl, \ -+ libc_feresetround_noexl) -+ -+/* Like SET_RESTORE_ROUND, but also set rounding precision to 53 bits. */ -+#define SET_RESTORE_ROUND_53BIT(RM) \ -+ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_53bit, \ -+ libc_feresetround_53bit) -+ -+#endif /* fenv_private.h. 
*/ -+ -diff --git a/sysdeps/loongarch/fpu/e_ilogb.c b/sysdeps/loongarch/fpu/e_ilogb.c -new file mode 100644 -index 00000000..f9ada692 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/e_ilogb.c -@@ -0,0 +1,39 @@ -+/* __ieee754_ilogb(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+int -+__ieee754_ilogb (double x) -+{ -+ int x_cond; -+ asm volatile ("fclass.d \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ -+ if (__glibc_unlikely (x_cond & _FCLASS_ZERO)) -+ return FP_ILOGB0; -+ else if (__glibc_unlikely (x_cond & ( _FCLASS_NAN | _FCLASS_INF))) -+ return FP_ILOGBNAN; -+ else -+ { -+ asm volatile ("fabs.d \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("flogb.d \t%0, %1" : "=f" (x) : "f" (x)); -+ return x; -+ } -+} -diff --git a/sysdeps/loongarch/fpu/e_ilogbf.c b/sysdeps/loongarch/fpu/e_ilogbf.c -new file mode 100644 -index 00000000..e1da48ec ---- /dev/null -+++ b/sysdeps/loongarch/fpu/e_ilogbf.c -@@ -0,0 +1,39 @@ -+/* __ieee754_ilogbf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+int -+__ieee754_ilogbf (float x) -+{ -+ int x_cond; -+ asm volatile ("fclass.s \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ -+ if (__glibc_unlikely (x_cond & _FCLASS_ZERO)) -+ return FP_ILOGB0; -+ else if (__glibc_unlikely (x_cond & ( _FCLASS_NAN | _FCLASS_INF))) -+ return FP_ILOGBNAN; -+ else -+ { -+ asm volatile ("fabs.s \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("flogb.s \t%0, %1" : "=f" (x) : "f" (x)); -+ return x; -+ } -+} -diff --git a/sysdeps/loongarch/fpu/e_sqrt.c b/sysdeps/loongarch/fpu/e_sqrt.c -new file mode 100644 -index 00000000..dac8696a ---- /dev/null -+++ b/sysdeps/loongarch/fpu/e_sqrt.c -@@ -0,0 +1,29 @@ -+/* Copyright (C) 2002-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Hartvig Ekner , 2002. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+ -+ -+double -+__ieee754_sqrt (double x) -+{ -+ double z; -+ __asm__ ("fsqrt.d %0,%1" : "=f" (z) : "f" (x)); -+ return z; -+} -+strong_alias (__ieee754_sqrt, __sqrt_finite) -+ -diff --git a/sysdeps/loongarch/fpu/e_sqrtf.c b/sysdeps/loongarch/fpu/e_sqrtf.c -new file mode 100644 -index 00000000..706c0494 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/e_sqrtf.c -@@ -0,0 +1,28 @@ -+/* Copyright (C) 2002-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Hartvig Ekner , 2002. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+ -+ -+float -+__ieee754_sqrtf (float x) -+{ -+ float z; -+ __asm__ ("fsqrt.s %0,%1" : "=f" (z) : "f" (x)); -+ return z; -+} -+strong_alias (__ieee754_sqrtf, __sqrtf_finite) -diff --git a/sysdeps/loongarch/fpu/fclrexcpt.c b/sysdeps/loongarch/fpu/fclrexcpt.c -new file mode 100644 -index 00000000..51310d93 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fclrexcpt.c -@@ -0,0 +1,47 @@ -+/* Clear given exceptions in current floating-point environment. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include -+ -+int -+feclearexcept (int excepts) -+{ -+ int cw; -+ -+ /* Mask out unsupported bits/exceptions. */ -+ excepts &= FE_ALL_EXCEPT; -+ -+ /* Read the complete control word. */ -+ _FPU_GETCW (cw); -+ -+ /* Clear exception flag bits and cause bits. If the cause bit is not -+ cleared, the next CTC instruction (just below) will re-generate the -+ exception. 
*/ -+ -+ cw &= ~(excepts | (excepts << CAUSE_SHIFT)); -+ -+ /* Put the new data in effect. */ -+ _FPU_SETCW (cw); -+ -+ /* Success. */ -+ return 0; -+} -+libm_hidden_def (feclearexcept) -diff --git a/sysdeps/loongarch/fpu/fedisblxcpt.c b/sysdeps/loongarch/fpu/fedisblxcpt.c -new file mode 100644 -index 00000000..004b0ecb ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fedisblxcpt.c -@@ -0,0 +1,40 @@ -+/* Disable floating-point exceptions. -+ Copyright (C) 2000-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 2000. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include -+ -+int -+fedisableexcept (int excepts) -+{ -+ unsigned int new_exc, old_exc; -+ -+ /* Get the current control word. */ -+ _FPU_GETCW (new_exc); -+ -+ old_exc = (new_exc & ENABLE_MASK) << ENABLE_SHIFT; -+ -+ excepts &= FE_ALL_EXCEPT; -+ -+ new_exc &= ~(excepts >> ENABLE_SHIFT); -+ _FPU_SETCW (new_exc); -+ -+ return old_exc; -+} -diff --git a/sysdeps/loongarch/fpu/feenablxcpt.c b/sysdeps/loongarch/fpu/feenablxcpt.c -new file mode 100644 -index 00000000..b8f56625 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/feenablxcpt.c -@@ -0,0 +1,40 @@ -+/* Enable floating-point exceptions. -+ Copyright (C) 2000-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 2000. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include -+ -+int -+feenableexcept (int excepts) -+{ -+ unsigned int new_exc, old_exc; -+ -+ /* Get the current control word. */ -+ _FPU_GETCW (new_exc); -+ -+ old_exc = (new_exc & ENABLE_MASK) << ENABLE_SHIFT; -+ -+ excepts &= FE_ALL_EXCEPT; -+ -+ new_exc |= excepts >> ENABLE_SHIFT; -+ _FPU_SETCW (new_exc); -+ -+ return old_exc; -+} -diff --git a/sysdeps/loongarch/fpu/fegetenv.c b/sysdeps/loongarch/fpu/fegetenv.c -new file mode 100644 -index 00000000..8e8fa2c5 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fegetenv.c -@@ -0,0 +1,33 @@ -+/* Store current floating-point environment. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__fegetenv (fenv_t *envp) -+{ -+ _FPU_GETCW (*envp); -+ -+ /* Success. */ -+ return 0; -+} -+libm_hidden_def (__fegetenv) -+weak_alias (__fegetenv, fegetenv) -+libm_hidden_weak (fegetenv) -diff --git a/sysdeps/loongarch/fpu/fegetexcept.c b/sysdeps/loongarch/fpu/fegetexcept.c -new file mode 100644 -index 00000000..2c0a1208 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fegetexcept.c -@@ -0,0 +1,33 @@ -+/* Get enabled floating-point exceptions. -+ Copyright (C) 2000-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 2000. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include -+ -+int -+fegetexcept (void) -+{ -+ unsigned int exc; -+ -+ /* Get the current control word. */ -+ _FPU_GETCW (exc); -+ -+ return (exc & ENABLE_MASK) << ENABLE_SHIFT; -+} -diff --git a/sysdeps/loongarch/fpu/fegetmode.c b/sysdeps/loongarch/fpu/fegetmode.c -new file mode 100644 -index 00000000..e0a5180f ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fegetmode.c -@@ -0,0 +1,27 @@ -+/* Store current floating-point control modes. MIPS version. -+ Copyright (C) 2016-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+int -+fegetmode (femode_t *modep) -+{ -+ _FPU_GETCW (*modep); -+ return 0; -+} -diff --git a/sysdeps/loongarch/fpu/fegetround.c b/sysdeps/loongarch/fpu/fegetround.c -new file mode 100644 -index 00000000..a7ac444a ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fegetround.c -@@ -0,0 +1,35 @@ -+/* Return current rounding direction. 
-+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__fegetround (void) -+{ -+ int cw; -+ -+ /* Get control word. */ -+ _FPU_GETCW (cw); -+ -+ return cw & _FPU_RC_MASK; -+} -+libm_hidden_def (__fegetround) -+weak_alias (__fegetround, fegetround) -+libm_hidden_weak (fegetround) -diff --git a/sysdeps/loongarch/fpu/feholdexcpt.c b/sysdeps/loongarch/fpu/feholdexcpt.c -new file mode 100644 -index 00000000..eb9d4764 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/feholdexcpt.c -@@ -0,0 +1,41 @@ -+/* Store current floating-point environment and clear exceptions. -+ Copyright (C) 2000-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 2000. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__feholdexcept (fenv_t *envp) -+{ -+ fpu_control_t cw; -+ -+ /* Save the current state. */ -+ _FPU_GETCW (cw); -+ envp->__fp_control_register = cw; -+ -+ /* Clear all exception enable bits and flags. */ -+ cw &= ~(_FPU_MASK_V|_FPU_MASK_Z|_FPU_MASK_O|_FPU_MASK_U|_FPU_MASK_I|FE_ALL_EXCEPT); -+ _FPU_SETCW (cw); -+ -+ return 0; -+} -+ -+libm_hidden_def (__feholdexcept) -+weak_alias (__feholdexcept, feholdexcept) -+libm_hidden_weak (feholdexcept) -diff --git a/sysdeps/loongarch/fpu/fenv_libc.h b/sysdeps/loongarch/fpu/fenv_libc.h -new file mode 100644 -index 00000000..f5dd1678 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fenv_libc.h -@@ -0,0 +1,31 @@ -+/* Copyright (C) 2000-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger . -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _FENV_LIBC_H -+#define _FENV_LIBC_H 1 -+ -+/* Mask for enabling exceptions and for the CAUSE bits. */ -+#define ENABLE_MASK 0x0000001FU -+#define CAUSE_MASK 0x1F000000U -+ -+/* Shift for FE_* flags to get up to the ENABLE bits and the CAUSE bits. */ -+#define ENABLE_SHIFT 16 -+#define CAUSE_SHIFT 8 -+ -+ -+#endif /* _FENV_LIBC_H */ -diff --git a/sysdeps/loongarch/fpu/fesetenv.c b/sysdeps/loongarch/fpu/fesetenv.c -new file mode 100644 -index 00000000..8dee8782 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fesetenv.c -@@ -0,0 +1,44 @@ -+/* Install given floating-point environment. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__fesetenv (const fenv_t *envp) -+{ -+ fpu_control_t cw; -+ -+ /* Read first current state to flush fpu pipeline. */ -+ _FPU_GETCW (cw); -+ -+ if (envp == FE_DFL_ENV) -+ _FPU_SETCW (_FPU_DEFAULT); -+ else if (envp == FE_NOMASK_ENV) -+ _FPU_SETCW (_FPU_IEEE); -+ else -+ _FPU_SETCW (envp->__fp_control_register); -+ -+ /* Success. */ -+ return 0; -+} -+ -+libm_hidden_def (__fesetenv) -+weak_alias (__fesetenv, fesetenv) -+libm_hidden_weak (fesetenv) -diff --git a/sysdeps/loongarch/fpu/fesetexcept.c b/sysdeps/loongarch/fpu/fesetexcept.c -new file mode 100644 -index 00000000..d14febca ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fesetexcept.c -@@ -0,0 +1,32 @@ -+/* Set given exception flags. MIPS version. -+ Copyright (C) 2016-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+int -+fesetexcept (int excepts) -+{ -+ fpu_control_t temp; -+ -+ _FPU_GETCW (temp); -+ temp |= excepts & FE_ALL_EXCEPT; -+ _FPU_SETCW (temp); -+ -+ return 0; -+} -diff --git a/sysdeps/loongarch/fpu/fesetmode.c b/sysdeps/loongarch/fpu/fesetmode.c -new file mode 100644 -index 00000000..8cc5d0b1 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fesetmode.c -@@ -0,0 +1,38 @@ -+/* Install given floating-point control modes. MIPS version. 
-+ Copyright (C) 2016-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+#define FCSR_STATUS 0x1f1f0000 -+ -+int -+fesetmode (const femode_t *modep) -+{ -+ fpu_control_t cw; -+ -+ _FPU_GETCW (cw); -+ cw &= FCSR_STATUS; -+ if (modep == FE_DFL_MODE) -+ cw |= _FPU_DEFAULT; -+ else -+ cw |= *modep & ~FCSR_STATUS; -+ _FPU_SETCW (cw); -+ -+ return 0; -+} -diff --git a/sysdeps/loongarch/fpu/fesetround.c b/sysdeps/loongarch/fpu/fesetround.c -new file mode 100644 -index 00000000..31fdeab3 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fesetround.c -@@ -0,0 +1,46 @@ -+/* Set current rounding direction. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__fesetround (int round) -+{ -+ fpu_control_t cw; -+ -+ if ((round & ~_FPU_RC_MASK) != 0) -+ /* ROUND is no valid rounding mode. */ -+ return 1; -+ -+ /* Get current state. */ -+ _FPU_GETCW (cw); -+ -+ /* Set rounding bits. */ -+ cw &= ~_FPU_RC_MASK; -+ cw |= round; -+ /* Set new state. */ -+ _FPU_SETCW (cw); -+ -+ return 0; -+} -+ -+libm_hidden_def (__fesetround) -+weak_alias (__fesetround, fesetround) -+libm_hidden_weak (fesetround) -diff --git a/sysdeps/loongarch/fpu/feupdateenv.c b/sysdeps/loongarch/fpu/feupdateenv.c -new file mode 100644 -index 00000000..669bfc3c ---- /dev/null -+++ b/sysdeps/loongarch/fpu/feupdateenv.c -@@ -0,0 +1,45 @@ -+/* Install given floating-point environment and raise exceptions. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
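The files above wire the ISO C <fenv.h> rounding and environment entry points (fegetround, fesetround, feholdexcept, fesetenv) to the raw FCSR via _FPU_GETCW/_FPU_SETCW. A minimal usage sketch, not part of the patch, assuming nothing beyond the standard <fenv.h> interface (link with -lm):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  fenv_t env;

  /* Save the environment and disable traps, as __feholdexcept above
     does by clearing the enable bits and flags in the control word.  */
  feholdexcept (&env);

  /* Switch the rounding direction; __fesetround above returns non-zero
     for values that do not fit in _FPU_RC_MASK.  */
  if (fesetround (FE_TOWARDZERO) != 0)
    puts ("FE_TOWARDZERO not supported");
  printf ("rounding mode is now %d\n", fegetround ());

  /* Back to the defaults: the FE_DFL_ENV branch of __fesetenv above.  */
  fesetenv (FE_DFL_ENV);
  return 0;
}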
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__feupdateenv (const fenv_t *envp) -+{ -+ int temp; -+ -+ /* Save current exceptions. */ -+ _FPU_GETCW (temp); -+ temp &= FE_ALL_EXCEPT; -+ -+ /* Install new environment. */ -+ __fesetenv (envp); -+ -+ /* Raise the safed exception. Incidently for us the implementation -+ defined format of the values in objects of type fexcept_t is the -+ same as the ones specified using the FE_* constants. */ -+ __feraiseexcept (temp); -+ -+ /* Success. */ -+ return 0; -+} -+libm_hidden_def (__feupdateenv) -+weak_alias (__feupdateenv, feupdateenv) -+libm_hidden_weak (feupdateenv) -diff --git a/sysdeps/loongarch/fpu/fgetexcptflg.c b/sysdeps/loongarch/fpu/fgetexcptflg.c -new file mode 100644 -index 00000000..1e594e14 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fgetexcptflg.c -@@ -0,0 +1,39 @@ -+/* Store current representation for exceptions. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+fegetexceptflag (fexcept_t *flagp, int excepts) -+{ -+ fpu_control_t temp; -+ -+ /* Get the current exceptions. */ -+ _FPU_GETCW (temp); -+ -+ /* We only save the relevant bits here. In particular, care has to be -+ taken with the CAUSE bits, as an inadvertent restore later on could -+ generate unexpected exceptions. */ -+ -+ *flagp = temp & excepts & FE_ALL_EXCEPT; -+ -+ /* Success. */ -+ return 0; -+} -diff --git a/sysdeps/loongarch/fpu/fraiseexcpt.c b/sysdeps/loongarch/fpu/fraiseexcpt.c -new file mode 100644 -index 00000000..2eec053a ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fraiseexcpt.c -@@ -0,0 +1,84 @@ -+/* Raise given exceptions. -+ Copyright (C) 2000-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 2000. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
*/ -+ -+#include -+#include -+#include -+ -+int -+__feraiseexcept (int excepts) -+{ -+ -+ const float fp_zero = 0.0, fp_one = 1.0, fp_max = FLT_MAX, -+ fp_min = FLT_MIN, fp_1e32 = 1.0e32f, fp_two = 2.0, -+ fp_three = 3.0; -+ -+ /* Raise exceptions represented by EXPECTS. But we must raise only -+ one signal at a time. It is important that if the overflow/underflow -+ exception and the inexact exception are given at the same time, -+ the overflow/underflow exception follows the inexact exception.*/ -+ -+ /* First: invalid exception. */ -+ if (FE_INVALID & excepts) -+ __asm__ __volatile__ ( -+ "fdiv.s $f0,%0,%0\n\t" -+ : -+ : "f" (fp_zero) -+ :"$f0"); -+ -+ /* Next: division by zero. */ -+ if (FE_DIVBYZERO & excepts) -+ __asm__ __volatile__ ( -+ "fdiv.s $f0,%0,%1\n\t" -+ : -+ : "f" (fp_one), "f" (fp_zero) -+ :"$f0"); -+ -+ /* Next: overflow. */ -+ if (FE_OVERFLOW & excepts) -+ /* There's no way to raise overflow without also raising inexact. */ -+ __asm__ __volatile__ ( -+ "fadd.s $f0,%0,%1\n\t" -+ : -+ : "f" (fp_max), "f" (fp_1e32) -+ : "$f0"); -+ -+ /* Next: underflow. */ -+ if (FE_UNDERFLOW & excepts) -+ __asm__ __volatile__ ( -+ "fdiv.s $f0,%0,%1\n\t" -+ : -+ : "f" (fp_min), "f" (fp_three) -+ : "$f0"); -+ -+ /* Last: inexact. */ -+ if (FE_INEXACT & excepts) -+ __asm__ __volatile__ ( -+ "fdiv.s $f0, %0, %1\n\t" -+ : -+ : "f" (fp_two), "f" (fp_three) -+ : "$f0"); -+ -+ /* Success. */ -+ return 0; -+} -+ -+libm_hidden_def (__feraiseexcept) -+weak_alias (__feraiseexcept, feraiseexcept) -+libm_hidden_weak (feraiseexcept) -diff --git a/sysdeps/loongarch/fpu/fsetexcptflg.c b/sysdeps/loongarch/fpu/fsetexcptflg.c -new file mode 100644 -index 00000000..dc447a77 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/fsetexcptflg.c -@@ -0,0 +1,42 @@ -+/* Set floating-point environment exception handling. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Hartvig Ekner , 2002. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+fesetexceptflag (const fexcept_t *flagp, int excepts) -+{ -+ fpu_control_t temp; -+ -+ /* Get the current exceptions. */ -+ _FPU_GETCW (temp); -+ -+ /* Make sure the flags we want restored are legal. */ -+ excepts &= FE_ALL_EXCEPT; -+ -+ /* Now clear the bits called for, and copy them in from flagp. Note that -+ we ignore all non-flag bits from *flagp, so they don't matter. */ -+ temp = (temp & ~excepts) | (*flagp & excepts); -+ -+ _FPU_SETCW (temp); -+ -+ /* Success. */ -+ return 0; -+} -diff --git a/sysdeps/loongarch/fpu/ftestexcept.c b/sysdeps/loongarch/fpu/ftestexcept.c -new file mode 100644 -index 00000000..fa645b26 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/ftestexcept.c -@@ -0,0 +1,33 @@ -+/* Test exception in current environment. -+ Copyright (C) 1998-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Andreas Jaeger , 1998. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+fetestexcept (int excepts) -+{ -+ int cw; -+ -+ /* Get current control word. */ -+ _FPU_GETCW (cw); -+ -+ return cw & excepts & FE_ALL_EXCEPT; -+} -+libm_hidden_def (fetestexcept) -diff --git a/sysdeps/loongarch/fpu/s_copysign.c b/sysdeps/loongarch/fpu/s_copysign.c -new file mode 100644 -index 00000000..861c4610 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_copysign.c -@@ -0,0 +1,30 @@ -+/* copysign(). LoongArch version. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+double -+__copysign (double x, double y) -+{ -+ asm ("fcopysign.d %0, %1, %2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_double (__copysign, copysign) -diff --git a/sysdeps/loongarch/fpu/s_copysignf.c b/sysdeps/loongarch/fpu/s_copysignf.c -new file mode 100644 -index 00000000..c680b1fd ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_copysignf.c -@@ -0,0 +1,30 @@ -+/* copysignf(). LoongArch version. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
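For the exception-flag functions added above (__feraiseexcept, fegetexceptflag, fesetexceptflag, fetestexcept), here is a short sketch of the save/clear/restore round trip they support; only standard <fenv.h> calls are assumed, nothing LoongArch-specific:

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  fexcept_t saved;

  /* Raise divide-by-zero the way __feraiseexcept above does with a
     real fdiv.s, then capture the flag.  */
  feraiseexcept (FE_DIVBYZERO);
  fegetexceptflag (&saved, FE_ALL_EXCEPT);

  /* Clear everything and confirm the flag is gone...  */
  feclearexcept (FE_ALL_EXCEPT);
  printf ("after clear: %d\n", fetestexcept (FE_DIVBYZERO) != 0);

  /* ...then restore it from the saved representation.  */
  fesetexceptflag (&saved, FE_ALL_EXCEPT);
  printf ("after restore: %d\n", fetestexcept (FE_DIVBYZERO) != 0);
  return 0;
}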
*/ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+float -+__copysignf (float x, float y) -+{ -+ asm ("fcopysign.s %0, %1, %2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_float (__copysign, copysign) -diff --git a/sysdeps/loongarch/fpu/s_finite.c b/sysdeps/loongarch/fpu/s_finite.c -new file mode 100644 -index 00000000..a2e98f0b ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_finite.c -@@ -0,0 +1,30 @@ -+/* finite(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__finite (double x) -+{ -+ int x_cond; -+ asm volatile ("fclass.d \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ return x_cond & ~(_FCLASS_INF | _FCLASS_NAN); -+} -+hidden_def (__finite) -+weak_alias (__finite, finite) -diff --git a/sysdeps/loongarch/fpu/s_finitef.c b/sysdeps/loongarch/fpu/s_finitef.c -new file mode 100644 -index 00000000..9ffab38a ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_finitef.c -@@ -0,0 +1,30 @@ -+/* finitef(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__finitef (float x) -+{ -+ int x_cond; -+ asm volatile ("fclass.s \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ return x_cond & ~(_FCLASS_INF | _FCLASS_NAN); -+} -+hidden_def (__finitef) -+weak_alias (__finitef, finitef) -diff --git a/sysdeps/loongarch/fpu/s_fmax.c b/sysdeps/loongarch/fpu/s_fmax.c -new file mode 100644 -index 00000000..fe7265af ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fmax.c -@@ -0,0 +1,30 @@ -+/* fmax(). LoongArch version. -+ Copyright (C) 2021-2022 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
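The copysign and finite wrappers above each boil down to a single fcopysign.{s,d} or fclass.{s,d} instruction. A quick portable check of the semantics they implement, illustration only:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* fcopysign.d: magnitude of the first operand, sign of the second.  */
  printf ("%g %g\n", copysign (3.5, -0.0), copysign (-2.0, 1.0));  /* -3.5 2 */

  /* fclass.d with the INF and NAN bits masked out, i.e. finite().  */
  printf ("finite? %d %d %d\n", isfinite (1.0) != 0,
          isfinite (INFINITY) != 0, isfinite (NAN) != 0);  /* 1 0 0 */
  return 0;
}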
See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+double -+__fmax (double x, double y) -+{ -+ asm volatile("fmax.d\t%0,%1,%2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_double (__fmax, fmax) -diff --git a/sysdeps/loongarch/fpu/s_fmaxf.c b/sysdeps/loongarch/fpu/s_fmaxf.c -new file mode 100644 -index 00000000..3defa7de ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fmaxf.c -@@ -0,0 +1,30 @@ -+/* fmaxf(). LoongArch version. -+ Copyright (C) 2021-2022 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+float -+__fmaxf (float x, float y) -+{ -+ asm volatile("fmax.s\t%0,%1,%2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_float (__fmax, fmax) -diff --git a/sysdeps/loongarch/fpu/s_fmaxmag.c b/sysdeps/loongarch/fpu/s_fmaxmag.c -new file mode 100644 -index 00000000..8570a3ba ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fmaxmag.c -@@ -0,0 +1,29 @@ -+/* fmaxmag(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+double -+__fmaxmag (double x, double y) -+{ -+ asm volatile ("fmaxa.d \t%0, %1, %2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_double (__fmaxmag, fmaxmag) -diff --git a/sysdeps/loongarch/fpu/s_fmaxmagf.c b/sysdeps/loongarch/fpu/s_fmaxmagf.c -new file mode 100644 -index 00000000..413e7683 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fmaxmagf.c -@@ -0,0 +1,29 @@ -+/* fmaxmagf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. 
-+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+float -+__fmaxmagf (float x, float y) -+{ -+ asm volatile ("fmaxa.s \t%0, %1, %2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_float (__fmaxmag, fmaxmag) -diff --git a/sysdeps/loongarch/fpu/s_fmin.c b/sysdeps/loongarch/fpu/s_fmin.c -new file mode 100644 -index 00000000..cc9d0cd1 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fmin.c -@@ -0,0 +1,30 @@ -+/* fmin(). LoongArch version. -+ Copyright (C) 2021-2022 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+double -+__fmin (double x, double y) -+{ -+ asm volatile("fmin.d\t%0,%1,%2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_double (__fmin, fmin) -diff --git a/sysdeps/loongarch/fpu/s_fminf.c b/sysdeps/loongarch/fpu/s_fminf.c -new file mode 100644 -index 00000000..40efbd71 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fminf.c -@@ -0,0 +1,30 @@ -+/* fminf(). LoongArch version. -+ Copyright (C) 2021-2022 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+float -+__fminf (float x, float y) -+{ -+ asm volatile("fmin.s\t%0,%1,%2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_float (__fmin, fmin) -diff --git a/sysdeps/loongarch/fpu/s_fminmag.c b/sysdeps/loongarch/fpu/s_fminmag.c -new file mode 100644 -index 00000000..2badf3d3 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fminmag.c -@@ -0,0 +1,29 @@ -+/* fminmag(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+double -+__fminmag (double x, double y) -+{ -+ asm volatile ("fmina.d \t%0, %1, %2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_double (__fminmag, fminmag) -diff --git a/sysdeps/loongarch/fpu/s_fminmagf.c b/sysdeps/loongarch/fpu/s_fminmagf.c -new file mode 100644 -index 00000000..4d625312 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fminmagf.c -@@ -0,0 +1,29 @@ -+/* fminmagf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+float -+__fminmagf (float x, float y) -+{ -+ asm volatile ("fmina.s \t%0, %1, %2" : "=f" (x) : "f" (x), "f" (y)); -+ return x; -+} -+libm_alias_float (__fminmag, fminmag) -diff --git a/sysdeps/loongarch/fpu/s_fpclassify.c b/sysdeps/loongarch/fpu/s_fpclassify.c -new file mode 100644 -index 00000000..3f4d95da ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fpclassify.c -@@ -0,0 +1,38 @@ -+/* fpclassify(). LoongArch version. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
*/ -+ -+#include -+#include -+ -+int -+__fpclassify (double x) -+{ -+ int cls; -+ asm volatile ("fclass.d \t%0, %1" : "=f" (cls) : "f" (x)); -+ -+ if (__glibc_likely (!!(cls & _FCLASS_NORM))) -+ return FP_NORMAL; -+ if (__glibc_likely (!!(cls & _FCLASS_ZERO))) -+ return FP_ZERO; -+ if (__glibc_likely (!!(cls & _FCLASS_SUBNORM))) -+ return FP_SUBNORMAL; -+ if (__glibc_likely (!!(cls & _FCLASS_INF))) -+ return FP_INFINITE; -+ return FP_NAN; -+} -+libm_hidden_def (__fpclassify) -diff --git a/sysdeps/loongarch/fpu/s_fpclassifyf.c b/sysdeps/loongarch/fpu/s_fpclassifyf.c -new file mode 100644 -index 00000000..b7c8b253 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_fpclassifyf.c -@@ -0,0 +1,38 @@ -+/* Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__fpclassifyf (float x) -+{ -+ int cls; -+ asm volatile ("fclass.s \t%0, %1" : "=f" (cls) : "f" (x)); -+ -+ if (__glibc_likely (!!(cls & _FCLASS_NORM))) -+ return FP_NORMAL; -+ if (__glibc_likely (!!(cls & _FCLASS_ZERO))) -+ return FP_ZERO; -+ if (__glibc_likely (!!(cls & _FCLASS_SUBNORM))) -+ return FP_SUBNORMAL; -+ if (__glibc_likely (!!(cls & _FCLASS_INF))) -+ return FP_INFINITE; -+ return FP_NAN; -+} -+libm_hidden_def (__fpclassifyf) -diff --git a/sysdeps/loongarch/fpu/s_isinf.c b/sysdeps/loongarch/fpu/s_isinf.c -new file mode 100644 -index 00000000..c7a67841 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_isinf.c -@@ -0,0 +1,30 @@ -+/* isinf(). LoongArch version. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__isinf (double x) -+{ -+ int x_cond; -+ asm volatile ("fclass.d \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ return -((x_cond & _FCLASS_MINF) ? 1 : 0) | ((x_cond & _FCLASS_PINF) ? 1 : 0); -+} -+hidden_def (__isinf) -+weak_alias (__isinf, isinf) -diff --git a/sysdeps/loongarch/fpu/s_isinff.c b/sysdeps/loongarch/fpu/s_isinff.c -new file mode 100644 -index 00000000..dcb4e04e ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_isinff.c -@@ -0,0 +1,30 @@ -+/* isinff(). LoongArch version. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
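__fpclassify and __fpclassifyf above decode the fclass.{d,s} bit mask (_FCLASS_NORM, _FCLASS_ZERO, ...) into the standard FP_* classes. A compact sketch of the observable behaviour, using only <math.h>:

#include <float.h>
#include <math.h>
#include <stdio.h>

static const char *
class_name (double x)
{
  switch (fpclassify (x))
    {
    case FP_NAN:       return "nan";
    case FP_INFINITE:  return "inf";
    case FP_ZERO:      return "zero";
    case FP_SUBNORMAL: return "subnormal";
    default:           return "normal";
    }
}

int
main (void)
{
  /* Expected output: zero inf subnormal normal  */
  printf ("%s %s %s %s\n", class_name (0.0), class_name (INFINITY),
          class_name (DBL_MIN / 2), class_name (1.0));
  return 0;
}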
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__isinff (float x) -+{ -+ int x_cond; -+ asm volatile ("fclass.s \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ return -((x_cond & _FCLASS_MINF) ? 1 : 0) | ((x_cond & _FCLASS_PINF) ? 1 : 0); -+} -+hidden_def (__isinff) -+weak_alias (__isinff, isinff) -diff --git a/sysdeps/loongarch/fpu/s_isnan.c b/sysdeps/loongarch/fpu/s_isnan.c -new file mode 100644 -index 00000000..62bb2e2f ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_isnan.c -@@ -0,0 +1,31 @@ -+/* isnan(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__isnan (double x) -+{ -+ int x_cond; -+ asm volatile ("fclass.d \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ -+ return (x_cond & _FCLASS_NAN) != 0; -+} -+hidden_def (__isnan) -+weak_alias (__isnan, isnan) -diff --git a/sysdeps/loongarch/fpu/s_isnanf.c b/sysdeps/loongarch/fpu/s_isnanf.c -new file mode 100644 -index 00000000..bbdedb84 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_isnanf.c -@@ -0,0 +1,31 @@ -+/* isnanf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
*/ -+ -+#include -+#include -+ -+int -+__isnanf (float x) -+{ -+ int x_cond; -+ asm volatile ("fclass.s \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ -+ return (x_cond & _FCLASS_NAN) != 0; -+} -+hidden_def (__isnanf) -+weak_alias (__isnanf, isnanf) -diff --git a/sysdeps/loongarch/fpu/s_issignaling.c b/sysdeps/loongarch/fpu/s_issignaling.c -new file mode 100644 -index 00000000..4fe0e2b7 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_issignaling.c -@@ -0,0 +1,29 @@ -+/* issignaling(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__issignaling (double x) -+{ -+ int x_cond; -+ asm volatile ("fclass.d \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ return (x_cond & _FCLASS_SNAN) != 0; -+} -+libm_hidden_def (__issignaling) -diff --git a/sysdeps/loongarch/fpu/s_issignalingf.c b/sysdeps/loongarch/fpu/s_issignalingf.c -new file mode 100644 -index 00000000..d82abb0e ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_issignalingf.c -@@ -0,0 +1,29 @@ -+/* issignalingf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+int -+__issignalingf (float x) -+{ -+ int x_cond; -+ asm volatile ("fclass.s \t%0, %1" : "=f" (x_cond) : "f" (x)); -+ return (x_cond & _FCLASS_SNAN) != 0; -+} -+libm_hidden_def (__issignalingf) -diff --git a/sysdeps/loongarch/fpu/s_llrint.c b/sysdeps/loongarch/fpu/s_llrint.c -new file mode 100644 -index 00000000..4a8e46ec ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_llrint.c -@@ -0,0 +1,31 @@ -+/* llrint(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+long long int -+__llrint (double x) -+{ -+ long long int result; -+ asm volatile ("ftint.l.d \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("movfr2gr.d \t%0, %1" : "=r" (result) : "f" (x)); -+ return result; -+} -+libm_alias_double (__llrint, llrint) -diff --git a/sysdeps/loongarch/fpu/s_llrintf.c b/sysdeps/loongarch/fpu/s_llrintf.c -new file mode 100644 -index 00000000..f3a874a0 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_llrintf.c -@@ -0,0 +1,31 @@ -+/* llrintf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+long long int -+__llrintf (float x) -+{ -+ long long int result; -+ asm volatile ("ftint.l.s \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("movfr2gr.d \t%0, %1" : "=r" (result) : "f" (x)); -+ return result; -+} -+libm_alias_float (__llrint, llrint) -diff --git a/sysdeps/loongarch/fpu/s_logb.c b/sysdeps/loongarch/fpu/s_logb.c -new file mode 100644 -index 00000000..31bb3be5 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_logb.c -@@ -0,0 +1,30 @@ -+/* logb(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+double -+__logb (double x) -+{ -+ asm volatile ("fabs.d \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("flogb.d \t%0, %1" : "=f" (x) : "f" (x)); -+ return x; -+} -+libm_alias_double (__logb, logb) -diff --git a/sysdeps/loongarch/fpu/s_logbf.c b/sysdeps/loongarch/fpu/s_logbf.c -new file mode 100644 -index 00000000..f5166bca ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_logbf.c -@@ -0,0 +1,30 @@ -+/* logbf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+float -+__logbf (float x) -+{ -+ asm volatile ("fabs.s \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("flogb.s \t%0, %1" : "=f" (x) : "f" (x)); -+ return x; -+} -+libm_alias_float (__logb, logb) -diff --git a/sysdeps/loongarch/fpu/s_lrint.c b/sysdeps/loongarch/fpu/s_lrint.c -new file mode 100644 -index 00000000..db446111 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_lrint.c -@@ -0,0 +1,31 @@ -+/* lrint(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+long int -+__lrint (double x) -+{ -+ long int result; -+ asm volatile ("ftint.l.d \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("movfr2gr.d \t%0, %1" : "=r" (result) : "f" (x)); -+ return result; -+} -+libm_alias_double (__lrint, lrint) -diff --git a/sysdeps/loongarch/fpu/s_lrintf.c b/sysdeps/loongarch/fpu/s_lrintf.c -new file mode 100644 -index 00000000..cde60b88 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_lrintf.c -@@ -0,0 +1,31 @@ -+/* lrintf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+long int -+__lrintf (float x) -+{ -+ long int result; -+ asm volatile ("ftint.l.s \t%0, %1" : "=f" (x) : "f" (x)); -+ asm volatile ("movfr2gr.d \t%0, %1" : "=r" (result) : "f" (x)); -+ return result; -+} -+libm_alias_float (__lrint, lrint) -diff --git a/sysdeps/loongarch/fpu/s_rint.c b/sysdeps/loongarch/fpu/s_rint.c -new file mode 100644 -index 00000000..429d5d11 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_rint.c -@@ -0,0 +1,29 @@ -+/* rint(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+double -+__rint (double x) -+{ -+ asm volatile ("frint.d \t%0, %1" : "=f" (x) : "f" (x)); -+ return x; -+} -+libm_alias_double (__rint, rint) -diff --git a/sysdeps/loongarch/fpu/s_rintf.c b/sysdeps/loongarch/fpu/s_rintf.c -new file mode 100644 -index 00000000..b3faba20 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_rintf.c -@@ -0,0 +1,29 @@ -+/* rintf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+#include -+ -+float -+__rintf (float x) -+{ -+ asm volatile ("frint.s \t%0, %1" : "=f" (x) : "f" (x)); -+ return x; -+} -+libm_alias_float (__rint, rint) -diff --git a/sysdeps/loongarch/fpu/s_scalbn.c b/sysdeps/loongarch/fpu/s_scalbn.c -new file mode 100644 -index 00000000..c03e81a3 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_scalbn.c -@@ -0,0 +1,29 @@ -+/* scalbn(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
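The rint/lrint/llrint family above uses frint.{s,d} and ftint.l.{s,d}, so the results follow the current dynamic rounding mode, as ISO C specifies for these functions. A small demonstration of that dependency (illustrative, not from the patch):

#include <fenv.h>
#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* Round toward minus infinity: 2.5 -> 2, -2.5 -> -3.  */
  fesetround (FE_DOWNWARD);
  printf ("%ld %ld\n", lrint (2.5), lrint (-2.5));

  /* Round to nearest, ties to even: 2.5 -> 2, -2.5 -> -2.  */
  fesetround (FE_TONEAREST);
  printf ("%ld %ld\n", lrint (2.5), lrint (-2.5));
  return 0;
}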
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+ -+double -+__scalbn (double x, int fn) -+{ -+ double tmp; -+ asm volatile ("movgr2fr.d \t%0, %1" : "=f" (tmp) : "r" (fn)); -+ asm volatile ("fscaleb.d \t%0, %1, %2" : "=f" (x) : "f" (x), "f" (tmp)); -+ return x; -+} -diff --git a/sysdeps/loongarch/fpu/s_scalbnf.c b/sysdeps/loongarch/fpu/s_scalbnf.c -new file mode 100644 -index 00000000..15e64280 ---- /dev/null -+++ b/sysdeps/loongarch/fpu/s_scalbnf.c -@@ -0,0 +1,29 @@ -+/* scalbnf(). LoongArch version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#define NO_MATH_REDIRECT -+#include -+ -+float -+__scalbnf (float x, int fn) -+{ -+ float tmp; -+ asm volatile ("movgr2fr.w \t%0, %1" : "=f" (tmp) : "r" (fn)); -+ asm volatile ("fscaleb.s \t%0, %1, %2" : "=f" (x) : "f" (x), "f" (tmp)); -+ return x; -+} -diff --git a/sysdeps/loongarch/fpu_control.h b/sysdeps/loongarch/fpu_control.h -new file mode 100644 -index 00000000..8f688592 ---- /dev/null -+++ b/sysdeps/loongarch/fpu_control.h -@@ -0,0 +1,128 @@ -+/* FPU control word bits. Mips version. -+ Copyright (C) 1996-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Olaf Flebbe and Ralf Baechle. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _FPU_CONTROL_H -+#define _FPU_CONTROL_H -+ -+/* MIPS FPU floating point control register bits. -+ * -+ * 31-25 -> floating point conditions code bits 7-1. These bits are only -+ * available in MIPS IV. -+ * 24 -> flush denormalized results to zero instead of -+ * causing unimplemented operation exception. This bit is only -+ * available for MIPS III and newer. 
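__scalbn/__scalbnf above move the integer exponent into an FPR (movgr2fr.{d,w}) and scale with fscaleb.{d,s}; the C-level contract is simply x * 2^n (FLT_RADIX is 2 here). A minimal check in portable C:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* 0.75 * 2^5 = 24, 48 * 2^-4 = 3.  */
  printf ("%g %g\n", scalbn (0.75, 5), scalbn (48.0, -4));

  /* 2^140 exceeds FLT_MAX, so the float variant overflows to inf.  */
  printf ("%g\n", scalbnf (1.0f, 140));
  return 0;
}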
-+ * 23 -> Condition bit -+ * 22-21 -> reserved for architecture implementers -+ * 20 -> reserved (read as 0, write with 0) -+ * 19 -> IEEE 754-2008 non-arithmetic ABS.fmt and NEG.fmt enable -+ * 18 -> IEEE 754-2008 recommended NaN encoding enable -+ * 17 -> cause bit for unimplemented operation -+ * 28 -> cause bit for invalid exception -+ * 27 -> cause bit for division by zero exception -+ * 26 -> cause bit for overflow exception -+ * 25 -> cause bit for underflow exception -+ * 24 -> cause bit for inexact exception -+ * 4 -> enable exception for invalid exception -+ * 3 -> enable exception for division by zero exception -+ * 2 -> enable exception for overflow exception -+ * 1 -> enable exception for underflow exception -+ * 0 -> enable exception for inexact exception -+ * 20 -> flag invalid exception -+ * 19 -> flag division by zero exception -+ * 18 -> flag overflow exception -+ * 17 -> flag underflow exception -+ * 16 -> flag inexact exception -+ * 9-8 -> rounding control -+ * -+ * -+ * Rounding Control: -+ * 00 - rounding to nearest (RN) -+ * 01 - rounding toward zero (RZ) -+ * 10 - rounding (up) toward plus infinity (RP) -+ * 11 - rounding (down)toward minus infinity (RM) -+ */ -+ -+#include -+ -+#ifdef __loongarch_soft_float -+ -+#define _FPU_RESERVED 0xffffffff -+#define _FPU_DEFAULT 0x00000000 -+typedef unsigned int fpu_control_t; -+#define _FPU_GETCW(cw) (cw) = 0 -+#define _FPU_SETCW(cw) (void) (cw) -+extern fpu_control_t __fpu_control; -+ -+#else /* __loongarch_soft_float */ -+ -+/* Masks for interrupts. */ -+#define _FPU_MASK_V 0x10 /* Invalid operation */ -+#define _FPU_MASK_Z 0x08 /* Division by zero */ -+#define _FPU_MASK_O 0x04 /* Overflow */ -+#define _FPU_MASK_U 0x02 /* Underflow */ -+#define _FPU_MASK_I 0x01 /* Inexact operation */ -+ -+/* Flush denormalized numbers to zero. */ -+#define _FPU_FLUSH_TZ 0x1000000 -+ -+/* Rounding control. */ -+#define _FPU_RC_NEAREST 0x000 /* RECOMMENDED */ -+#define _FPU_RC_ZERO 0x100 -+#define _FPU_RC_UP 0x200 -+#define _FPU_RC_DOWN 0x300 -+/* Mask for rounding control. */ -+#define _FPU_RC_MASK 0x300 -+ -+#define _FPU_RESERVED 0x0 -+ -+#define _FPU_DEFAULT 0x0 -+#define _FPU_IEEE 0x1F -+ -+/* Type of the control word. */ -+typedef unsigned int fpu_control_t __attribute__ ((__mode__ (__SI__))); -+ -+/* Macros for accessing the hardware control word. */ -+extern fpu_control_t __mips_fpu_getcw (void) __THROW; -+extern void __mips_fpu_setcw (fpu_control_t) __THROW; -+#define _FPU_GETCW(cw) __asm__ volatile ("movfcsr2gr %0,$r0" : "=r" (cw)) -+#define _FPU_SETCW(cw) __asm__ volatile ("movgr2fcsr $r0,%0" : : "r" (cw)) -+ -+/* Default control word set at startup. 
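The fpu_control.h hunk above exposes the FCSR through _FPU_GETCW/_FPU_SETCW (movfcsr2gr/movgr2fcsr) plus the _FPU_RC_* and _FPU_MASK_* constants that the fe* functions build on. A LoongArch-only sketch relating the two layers; it assumes the <fpu_control.h> installed by this patch:

#include <fenv.h>
#include <fpu_control.h>
#include <stdio.h>

int
main (void)
{
  fpu_control_t cw;

  _FPU_GETCW (cw);   /* expands to movfcsr2gr on this port */
  printf ("FCSR = %#x\n", (unsigned int) cw);

  /* __fegetround in the patch is literally cw & _FPU_RC_MASK, so the
     two values below should match.  */
  printf ("RC field %#x vs fegetround() %#x\n",
          (unsigned int) (cw & _FPU_RC_MASK), (unsigned int) fegetround ());
  return 0;
}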
*/ -+extern fpu_control_t __fpu_control; -+ -+# define _FCLASS_SNAN (1 << 0) -+# define _FCLASS_QNAN (1 << 1) -+# define _FCLASS_MINF (1 << 2) -+# define _FCLASS_MNORM (1 << 3) -+# define _FCLASS_MSUBNORM (1 << 4) -+# define _FCLASS_MZERO (1 << 5) -+# define _FCLASS_PINF (1 << 6) -+# define _FCLASS_PNORM (1 << 7) -+# define _FCLASS_PSUBNORM (1 << 8) -+# define _FCLASS_PZERO (1 << 9) -+ -+# define _FCLASS_ZERO (_FCLASS_MZERO | _FCLASS_PZERO) -+# define _FCLASS_SUBNORM (_FCLASS_MSUBNORM | _FCLASS_PSUBNORM) -+# define _FCLASS_NORM (_FCLASS_MNORM | _FCLASS_PNORM) -+# define _FCLASS_INF (_FCLASS_MINF | _FCLASS_PINF) -+# define _FCLASS_NAN (_FCLASS_SNAN | _FCLASS_QNAN) -+ -+#endif /* __loongarch_soft_float */ -+ -+#endif /* fpu_control.h */ -diff --git a/sysdeps/loongarch/fstat.c b/sysdeps/loongarch/fstat.c -new file mode 100644 -index 00000000..c4504eeb ---- /dev/null -+++ b/sysdeps/loongarch/fstat.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/fstat64.c b/sysdeps/loongarch/fstat64.c -new file mode 100644 -index 00000000..143ca2b0 ---- /dev/null -+++ b/sysdeps/loongarch/fstat64.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/fstatat.c b/sysdeps/loongarch/fstatat.c -new file mode 100644 -index 00000000..0b0a3342 ---- /dev/null -+++ b/sysdeps/loongarch/fstatat.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/fstatat64.c b/sysdeps/loongarch/fstatat64.c -new file mode 100644 -index 00000000..e82b9274 ---- /dev/null -+++ b/sysdeps/loongarch/fstatat64.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/gccframe.h b/sysdeps/loongarch/gccframe.h -new file mode 100644 -index 00000000..5c799c64 ---- /dev/null -+++ b/sysdeps/loongarch/gccframe.h -@@ -0,0 +1,21 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#define FIRST_PSEUDO_REGISTER 74 -+ -+#include -diff --git a/sysdeps/loongarch/hp-timing.h b/sysdeps/loongarch/hp-timing.h -new file mode 100644 -index 00000000..2d006540 ---- /dev/null -+++ b/sysdeps/loongarch/hp-timing.h -@@ -0,0 +1,40 @@ -+/* High precision, low overhead timing functions. x86-64 version. -+ Copyright (C) 2002-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef _HP_TIMING_H -+#define _HP_TIMING_H 1 -+ -+/* We always assume having the timestamp register. */ -+#define HP_TIMING_AVAIL (1) -+#define HP_SMALL_TIMING_AVAIL (1) -+ -+/* We indeed have inlined functions. */ -+#define HP_TIMING_INLINE (1) -+ -+/* We use 64bit values for the times. */ -+typedef unsigned long long int hp_timing_t; -+ -+/* Read the cp0 count, this maybe inaccurate. */ -+#define HP_TIMING_NOW(Var) \ -+ ({ unsigned long long int _count; \ -+ asm volatile ("rdtime.d\t%0,$r0" : "=r" (_count)); \ -+ (Var) = _count; }) -+ -+#include -+ -+#endif /* hp-timing.h */ -diff --git a/sysdeps/loongarch/init-arch.h b/sysdeps/loongarch/init-arch.h -new file mode 100644 -index 00000000..7db7b7b3 ---- /dev/null -+++ b/sysdeps/loongarch/init-arch.h -@@ -0,0 +1,24 @@ -+/* This file is part of the GNU C Library. -+ Copyright (C) 2008-2022 Free Software Foundation, Inc. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+#define INIT_ARCH() \ -+ uint64_t __attribute__((unused)) prid = \ -+ GLRO(dl_larch_cpu_features).cpucfg_prid; \ -+ -diff --git a/sysdeps/loongarch/jmpbuf-offsets.h b/sysdeps/loongarch/jmpbuf-offsets.h -new file mode 100644 -index 00000000..bc4c1523 ---- /dev/null -+++ b/sysdeps/loongarch/jmpbuf-offsets.h -@@ -0,0 +1,23 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public License as -+ published by the Free Software Foundation; either version 2.1 of the -+ License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+ -+/* Helper for generic ____longjmp_chk(). */ -+#define JB_FRAME_ADDRESS(buf) \ -+ ((void *) _jmpbuf_sp (buf)) -diff --git a/sysdeps/loongarch/jmpbuf-unwind.h b/sysdeps/loongarch/jmpbuf-unwind.h -new file mode 100644 -index 00000000..c866d910 ---- /dev/null -+++ b/sysdeps/loongarch/jmpbuf-unwind.h -@@ -0,0 +1,46 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. 
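HP_TIMING_NOW above is internal to glibc, but the counter read it wraps is a single rdtime.d instruction. A standalone LoongArch-only sketch of the same read; the helper name below is the editor's, not something defined by the patch:

#include <stdio.h>

/* Mirrors the inline asm used by HP_TIMING_NOW in the hunk above.  */
static inline unsigned long long
read_stable_counter (void)
{
  unsigned long long count;
  asm volatile ("rdtime.d\t%0,$r0" : "=r" (count));
  return count;
}

int
main (void)
{
  unsigned long long a = read_stable_counter ();
  unsigned long long b = read_stable_counter ();
  printf ("stable counter advanced by %llu ticks\n", b - a);
  return 0;
}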
-+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include -+#include -+ -+/* Test if longjmp to JMPBUF would unwind the frame -+ containing a local variable at ADDRESS. */ -+#define _JMPBUF_UNWINDS(jmpbuf, address, demangle) \ -+ ((void *) (address) < (void *) demangle ((jmpbuf)[0].__sp)) -+ -+#define _JMPBUF_CFA_UNWINDS_ADJ(_jmpbuf, _context, _adj) \ -+ _JMPBUF_UNWINDS_ADJ (_jmpbuf, (void *) _Unwind_GetCFA (_context), _adj) -+ -+static inline uintptr_t __attribute__ ((unused)) -+_jmpbuf_sp (__jmp_buf regs) -+{ -+ uintptr_t sp = regs[0].__sp; -+#ifdef PTR_DEMANGLE -+ PTR_DEMANGLE (sp); -+#endif -+ return sp; -+} -+ -+#define _JMPBUF_UNWINDS_ADJ(_jmpbuf, _address, _adj) \ -+ ((uintptr_t) (_address) - (_adj) < _jmpbuf_sp (_jmpbuf) - (_adj)) -+ -+/* We use the normal longjmp for unwinding. */ -+#define __libc_unwind_longjmp(buf, val) __libc_longjmp (buf, val) -diff --git a/sysdeps/loongarch/ldsodefs.h b/sysdeps/loongarch/ldsodefs.h -new file mode 100644 -index 00000000..f3c07709 ---- /dev/null -+++ b/sysdeps/loongarch/ldsodefs.h -@@ -0,0 +1,48 @@ -+/* Run-time dynamic linker data structures for loaded ELF shared objects. -+ Copyright (C) 2011-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LOONGARCH_LDSODEFS_H -+#define _LOONGARCH_LDSODEFS_H 1 -+ -+#include -+#include -+ -+struct La_loongarch_regs; -+struct La_loongarch_retval; -+ -+#define ARCH_PLTENTER_MEMBERS \ -+ ElfW(Addr) (*loongarch_gnu_pltenter) (ElfW(Sym) *, unsigned int, \ -+ uintptr_t *, uintptr_t *, \ -+ const struct La_loongarch_regs *, \ -+ unsigned int *, const char *name, \ -+ long int *framesizep); -+ -+#define ARCH_PLTEXIT_MEMBERS \ -+ unsigned int (*loongarch_gnu_pltexit) (ElfW(Sym) *, unsigned int, \ -+ uintptr_t *, uintptr_t *, \ -+ const struct La_loongarch_regs *, \ -+ struct La_loongarch_retval *, \ -+ const char *); -+ -+/* The LoongArch ABI specifies that the dynamic section has to be read-only. */ -+ -+#define DL_RO_DYN_SECTION 1 -+ -+#include_next -+ -+#endif -diff --git a/sysdeps/loongarch/libc-start.h b/sysdeps/loongarch/libc-start.h -new file mode 100644 -index 00000000..7bbc658f ---- /dev/null -+++ b/sysdeps/loongarch/libc-start.h -@@ -0,0 +1,25 @@ -+/* LoongArch definitions for libc main startup. -+ Copyright (C) 2023 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef SHARED -+# define ARCH_SETUP_IREL() apply_irel () -+# define ARCH_APPLY_IREL() -+# ifndef ARCH_SETUP_TLS -+# define ARCH_SETUP_TLS() __libc_setup_tls () -+# endif -+#endif /* !SHARED */ -diff --git a/sysdeps/loongarch/libc-tls.c b/sysdeps/loongarch/libc-tls.c -new file mode 100644 -index 00000000..0b0590d1 ---- /dev/null -+++ b/sysdeps/loongarch/libc-tls.c -@@ -0,0 +1,32 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+ -+/* On LoongArch, linker optimizations are not required, so __tls_get_addr -+ can be called even in statically linked binaries. In this case module -+ must be always 1 and PT_TLS segment exist in the binary, otherwise it -+ would not link. */ -+ -+void * -+__tls_get_addr (tls_index *ti) -+{ -+ dtv_t *dtv = THREAD_DTV (); -+ return (char *) dtv[1].pointer.val + GET_ADDR_OFFSET; -+} -diff --git a/sysdeps/loongarch/linkmap.h b/sysdeps/loongarch/linkmap.h -new file mode 100644 -index 00000000..ac170bb3 ---- /dev/null -+++ b/sysdeps/loongarch/linkmap.h -@@ -0,0 +1,4 @@ -+struct link_map_machine -+ { -+ ElfW(Addr) plt; /* Address of .plt. 
*/ -+ }; -diff --git a/sysdeps/loongarch/lp64/Implies-after b/sysdeps/loongarch/lp64/Implies-after -new file mode 100644 -index 00000000..a8cae95f ---- /dev/null -+++ b/sysdeps/loongarch/lp64/Implies-after -@@ -0,0 +1 @@ -+wordsize-64 -diff --git a/sysdeps/loongarch/lp64/libm-test-ulps b/sysdeps/loongarch/lp64/libm-test-ulps -new file mode 100644 -index 00000000..61be2df6 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/libm-test-ulps -@@ -0,0 +1,2206 @@ -+# Begin of automatic generation -+ -+# Maximal error of functions: -+Function: "acos": -+float: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "acos_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "acos_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "acos_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "acosh": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "acosh_downward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "acosh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "acosh_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "asin": -+float: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "asin_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "asin_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "asin_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "asinh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "asinh_downward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "asinh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "asinh_upward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "atan": -+float: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "atan2": -+float: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "atan2_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "atan2_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "atan2_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "atan_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "atan_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "atan_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "atanh": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "atanh_downward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "atanh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "atanh_upward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "cabs": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cabs_downward": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: 
"cabs_towardzero": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cabs_upward": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "cacos": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "cacos": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "cacos_downward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "cacos_downward": -+double: 5 -+float: 3 -+idouble: 5 -+ifloat: 3 -+ildouble: 6 -+ldouble: 6 -+ -+Function: Real part of "cacos_towardzero": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "cacos_towardzero": -+double: 4 -+float: 2 -+idouble: 4 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Real part of "cacos_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "cacos_upward": -+double: 5 -+float: 5 -+idouble: 5 -+ifloat: 5 -+ildouble: 7 -+ldouble: 7 -+ -+Function: Real part of "cacosh": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "cacosh": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "cacosh_downward": -+double: 4 -+float: 2 -+idouble: 4 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Imaginary part of "cacosh_downward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Real part of "cacosh_towardzero": -+double: 4 -+float: 2 -+idouble: 4 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Imaginary part of "cacosh_towardzero": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "cacosh_upward": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 6 -+ldouble: 6 -+ -+Function: Imaginary part of "cacosh_upward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "carg": -+float: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "carg_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "carg_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "carg_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "casin": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "casin": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "casin_downward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "casin_downward": -+double: 5 -+float: 3 -+idouble: 5 -+ifloat: 3 -+ildouble: 6 -+ldouble: 6 -+ -+Function: Real part of "casin_towardzero": -+double: 3 -+float: 1 -+idouble: 3 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "casin_towardzero": -+double: 4 -+float: 2 -+idouble: 4 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Real part of "casin_upward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "casin_upward": -+double: 5 -+float: 5 -+idouble: 5 -+ifloat: 5 -+ildouble: 7 -+ldouble: 7 -+ -+Function: Real part of "casinh": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 
-+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "casinh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "casinh_downward": -+double: 5 -+float: 3 -+idouble: 5 -+ifloat: 3 -+ildouble: 6 -+ldouble: 6 -+ -+Function: Imaginary part of "casinh_downward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "casinh_towardzero": -+double: 4 -+float: 2 -+idouble: 4 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Imaginary part of "casinh_towardzero": -+double: 3 -+float: 1 -+idouble: 3 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "casinh_upward": -+double: 5 -+float: 5 -+idouble: 5 -+ifloat: 5 -+ildouble: 7 -+ldouble: 7 -+ -+Function: Imaginary part of "casinh_upward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "catan": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Imaginary part of "catan": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "catan_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "catan_downward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "catan_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "catan_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "catan_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "catan_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "catanh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Imaginary part of "catanh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "catanh_downward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "catanh_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "catanh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "catanh_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "catanh_upward": -+double: 4 -+float: 4 -+idouble: 4 -+ifloat: 4 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "catanh_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "cbrt": -+double: 3 -+float: 1 -+idouble: 3 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cbrt_downward": -+double: 4 -+float: 1 -+idouble: 4 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cbrt_towardzero": -+double: 3 -+float: 1 -+idouble: 3 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cbrt_upward": -+double: 5 -+float: 1 -+idouble: 5 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "ccos": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Imaginary part of "ccos": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "ccos_downward": -+double: 1 -+float: 1 
-+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "ccos_downward": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "ccos_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "ccos_towardzero": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "ccos_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "ccos_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "ccosh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Imaginary part of "ccosh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "ccosh_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "ccosh_downward": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "ccosh_towardzero": -+double: 1 -+float: 3 -+idouble: 1 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "ccosh_towardzero": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "ccosh_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "ccosh_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "cexp": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Imaginary part of "cexp": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "cexp_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "cexp_downward": -+double: 1 -+float: 3 -+idouble: 1 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "cexp_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "cexp_towardzero": -+double: 1 -+float: 3 -+idouble: 1 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "cexp_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "cexp_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "clog": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "clog": -+float: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "clog10": -+double: 3 -+float: 4 -+idouble: 3 -+ifloat: 4 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "clog10": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "clog10_downward": -+double: 5 -+float: 5 -+idouble: 5 -+ifloat: 5 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "clog10_downward": -+double: 2 -+float: 4 -+idouble: 2 -+ifloat: 4 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "clog10_towardzero": -+double: 5 -+float: 5 -+idouble: 5 -+ifloat: 5 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "clog10_towardzero": -+double: 2 -+float: 4 -+idouble: 2 -+ifloat: 4 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real 
part of "clog10_upward": -+double: 6 -+float: 5 -+idouble: 6 -+ifloat: 5 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "clog10_upward": -+double: 2 -+float: 4 -+idouble: 2 -+ifloat: 4 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "clog_downward": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "clog_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "clog_towardzero": -+double: 4 -+float: 4 -+idouble: 4 -+ifloat: 4 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "clog_towardzero": -+double: 1 -+float: 3 -+idouble: 1 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "clog_upward": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "clog_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "cos": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cos_downward": -+double: 1 -+idouble: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "cos_towardzero": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cos_upward": -+double: 1 -+idouble: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "cosh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "cosh_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 2 -+ -+Function: "cosh_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 2 -+ -+Function: "cosh_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 1 -+ldouble: 3 -+ -+Function: Real part of "cpow": -+double: 2 -+float: 5 -+idouble: 2 -+ifloat: 5 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "cpow": -+float: 2 -+ifloat: 2 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "cpow_downward": -+double: 4 -+float: 8 -+idouble: 4 -+ifloat: 8 -+ildouble: 6 -+ldouble: 6 -+ -+Function: Imaginary part of "cpow_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "cpow_towardzero": -+double: 4 -+float: 8 -+idouble: 4 -+ifloat: 8 -+ildouble: 6 -+ldouble: 6 -+ -+Function: Imaginary part of "cpow_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "cpow_upward": -+double: 4 -+float: 1 -+idouble: 4 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "cpow_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "csin": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Imaginary part of "csin": -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "csin_downward": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "csin_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "csin_towardzero": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "csin_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "csin_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "csin_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 
-+ -+Function: Real part of "csinh": -+float: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Imaginary part of "csinh": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: Real part of "csinh_downward": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "csinh_downward": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "csinh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "csinh_towardzero": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "csinh_upward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "csinh_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "csqrt": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Imaginary part of "csqrt": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: Real part of "csqrt_downward": -+double: 5 -+float: 4 -+idouble: 5 -+ifloat: 4 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "csqrt_downward": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "csqrt_towardzero": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "csqrt_towardzero": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "csqrt_upward": -+double: 5 -+float: 4 -+idouble: 5 -+ifloat: 4 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "csqrt_upward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "ctan": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "ctan": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "ctan_downward": -+double: 6 -+float: 5 -+idouble: 6 -+ifloat: 5 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "ctan_downward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Real part of "ctan_towardzero": -+double: 5 -+float: 2 -+idouble: 5 -+ifloat: 2 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Imaginary part of "ctan_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Real part of "ctan_upward": -+double: 2 -+float: 4 -+idouble: 2 -+ifloat: 4 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Imaginary part of "ctan_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Real part of "ctanh": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Imaginary part of "ctanh": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "ctanh_downward": -+double: 4 -+float: 2 -+idouble: 4 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Imaginary part of "ctanh_downward": -+double: 6 -+float: 5 -+idouble: 6 -+ifloat: 5 -+ildouble: 4 -+ldouble: 4 -+ -+Function: Real part of "ctanh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Imaginary part of "ctanh_towardzero": -+double: 5 -+float: 2 
-+idouble: 5 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: Real part of "ctanh_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: Imaginary part of "ctanh_upward": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "erf": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "erf_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "erf_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "erf_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "erfc": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "erfc_downward": -+double: 3 -+float: 4 -+idouble: 3 -+ifloat: 4 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "erfc_towardzero": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "erfc_upward": -+double: 3 -+float: 4 -+idouble: 3 -+ifloat: 4 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "exp": -+ildouble: 1 -+ldouble: 1 -+ -+Function: "exp10": -+double: 2 -+idouble: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "exp10_downward": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "exp10_towardzero": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "exp10_upward": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "exp2": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "exp2_downward": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "exp2_towardzero": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "exp2_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "exp_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ -+Function: "exp_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ -+Function: "exp_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ -+Function: "expm1": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "expm1_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "expm1_towardzero": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "expm1_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "gamma": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "gamma_downward": -+double: 4 -+float: 4 -+idouble: 4 -+ifloat: 4 -+ildouble: 8 -+ldouble: 8 -+ -+Function: "gamma_towardzero": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "gamma_upward": -+double: 4 -+float: 5 -+idouble: 4 -+ifloat: 5 -+ildouble: 8 -+ldouble: 8 -+ -+Function: "hypot": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "hypot_downward": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "hypot_towardzero": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "hypot_upward": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "j0": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "j0_downward": -+double: 2 -+float: 4 -+idouble: 2 -+ifloat: 4 -+ildouble: 4 -+ldouble: 4 -+ 
-+Function: "j0_towardzero": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "j0_upward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "j1": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "j1_downward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "j1_towardzero": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "j1_upward": -+double: 3 -+float: 4 -+idouble: 3 -+ifloat: 4 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "jn": -+double: 4 -+float: 4 -+idouble: 4 -+ifloat: 4 -+ildouble: 7 -+ldouble: 7 -+ -+Function: "jn_downward": -+double: 4 -+float: 5 -+idouble: 4 -+ifloat: 5 -+ildouble: 8 -+ldouble: 8 -+ -+Function: "jn_towardzero": -+double: 4 -+float: 5 -+idouble: 4 -+ifloat: 5 -+ildouble: 8 -+ldouble: 8 -+ -+Function: "jn_upward": -+double: 5 -+float: 4 -+idouble: 5 -+ifloat: 4 -+ildouble: 7 -+ldouble: 7 -+ -+Function: "lgamma": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "lgamma_downward": -+double: 4 -+float: 4 -+idouble: 4 -+ifloat: 4 -+ildouble: 8 -+ldouble: 8 -+ -+Function: "lgamma_towardzero": -+double: 4 -+float: 3 -+idouble: 4 -+ifloat: 3 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "lgamma_upward": -+double: 4 -+float: 5 -+idouble: 4 -+ifloat: 5 -+ildouble: 8 -+ldouble: 8 -+ -+Function: "log": -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log10": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log10_downward": -+double: 2 -+float: 3 -+idouble: 2 -+ifloat: 3 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log10_towardzero": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log10_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log1p": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "log1p_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "log1p_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "log1p_upward": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "log2": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "log2_downward": -+double: 3 -+idouble: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "log2_towardzero": -+double: 2 -+idouble: 2 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log2_upward": -+double: 3 -+idouble: 3 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log_downward": -+ildouble: 1 -+ldouble: 1 -+ -+Function: "log_towardzero": -+ildouble: 2 -+ldouble: 2 -+ -+Function: "log_upward": -+double: 1 -+idouble: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "pow": -+double: 1 -+idouble: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "pow_downward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "pow_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "pow_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "sin": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "sin_downward": -+double: 1 -+idouble: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "sin_towardzero": -+double: 1 -+idouble: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: 
"sin_upward": -+double: 1 -+idouble: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "sincos": -+double: 1 -+idouble: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "sincos_downward": -+double: 1 -+idouble: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "sincos_towardzero": -+double: 1 -+idouble: 1 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "sincos_upward": -+double: 1 -+idouble: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "sinh": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "sinh_downward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "sinh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "sinh_upward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "tan": -+float: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "tan_downward": -+double: 1 -+float: 2 -+idouble: 1 -+ifloat: 2 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "tan_towardzero": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "tan_upward": -+double: 1 -+float: 1 -+idouble: 1 -+ifloat: 1 -+ildouble: 1 -+ldouble: 1 -+ -+Function: "tanh": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "tanh_downward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "tanh_towardzero": -+double: 2 -+float: 2 -+idouble: 2 -+ifloat: 2 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "tanh_upward": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "tgamma": -+double: 5 -+float: 4 -+idouble: 5 -+ifloat: 4 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "tgamma_downward": -+double: 5 -+float: 5 -+idouble: 5 -+ifloat: 5 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "tgamma_towardzero": -+double: 5 -+float: 4 -+idouble: 5 -+ifloat: 4 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "tgamma_upward": -+double: 4 -+float: 4 -+idouble: 4 -+ifloat: 4 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "y0": -+double: 2 -+float: 1 -+idouble: 2 -+ifloat: 1 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "y0_downward": -+double: 3 -+float: 4 -+idouble: 3 -+ifloat: 4 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "y0_towardzero": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "y0_upward": -+double: 2 -+float: 5 -+idouble: 2 -+ifloat: 5 -+ildouble: 3 -+ldouble: 3 -+ -+Function: "y1": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "y1_downward": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 4 -+ldouble: 4 -+ -+Function: "y1_towardzero": -+double: 3 -+float: 2 -+idouble: 3 -+ifloat: 2 -+ildouble: 2 -+ldouble: 2 -+ -+Function: "y1_upward": -+double: 5 -+float: 2 -+idouble: 5 -+ifloat: 2 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "yn": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "yn_downward": -+double: 3 -+float: 4 -+idouble: 3 -+ifloat: 4 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "yn_towardzero": -+double: 3 -+float: 3 -+idouble: 3 -+ifloat: 3 -+ildouble: 5 -+ldouble: 5 -+ -+Function: "yn_upward": -+double: 4 -+float: 5 -+idouble: 4 -+ifloat: 5 -+ildouble: 5 -+ldouble: 5 -+ -+# end of automatic generation -diff --git a/sysdeps/loongarch/lp64/libm-test-ulps-name b/sysdeps/loongarch/lp64/libm-test-ulps-name -new file mode 100644 -index 00000000..ce02281e ---- /dev/null -+++ b/sysdeps/loongarch/lp64/libm-test-ulps-name -@@ -0,0 +1 @@ 
-+LoongArch 64-bit -diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S -new file mode 100644 -index 00000000..ec34b1af ---- /dev/null -+++ b/sysdeps/loongarch/lp64/memchr.S -@@ -0,0 +1,99 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef MEMCHR_NAME -+#define MEMCHR_NAME memchr -+#endif -+ -+#ifdef ANDROID_CHANGES -+LEAF(MEMCHR_NAME, 0) -+#else -+LEAF(MEMCHR_NAME) -+#endif -+ .align 6 -+ beqz a2, L(out) -+ andi t1, a0, 0x7 -+ lu12i.w a3, 0x01010 -+ sub.d a5, a0, t1 -+ -+ bstrins.d a1, a1, 15, 8 -+ ld.d t0, a5, 0 -+ slli.d t2, t1, 3 -+ ori a3, a3, 0x101 -+ -+ bstrins.d a1, a1, 31, 16 -+ li.w t7, -1 -+ li.w t8, 9 -+ bstrins.d a3, a3, 63, 32 -+ -+ srl.d t3, t7, t2 -+ bstrins.d a1, a1, 63, 32 -+ sub.d t4, t8, t1 -+ orn t3, a1, t3 -+ -+ srl.d t0, t0, t2 -+ slli.d a4, a3, 7 # 0x8080808080808080 -+ sltu t4, a2, t4 -+ xor t2, t0, t3 -+ -+ sub.d a6, t2, a3 -+ andn a7, a4, t2 -+ and t2, a6, a7 -+ or t3, t2, t4 -+ -+ bnez t3, L(count_pos) -+ addi.d a2, a2, -8 -+ addi.d a0, a5, 8 -+ add.d a2, a2, t1 -+ -+L(loop): -+ ld.d t0, a0, 0 -+ sltui t4, a2, 9 -+ xor t2, t0, a1 -+ sub.d a6, t2, a3 -+ -+ andn a7, a4, t2 -+ and t2, a6, a7 -+ or t3, t2, t4 -+ bnez t3, L(count_pos) -+ -+ ld.d t1, a0, 8 -+ addi.d a0, a0, 16 -+ sltui t4, a2, 17 -+ xor t2, t1, a1 -+ -+ sub.d a6, t2, a3 -+ andn a7, a4, t2 -+ and t2, a6, a7 -+ addi.d a2, a2, -16 -+ -+ or t3, t2, t4 -+ beqz t3, L(loop) -+ addi.d a0, a0, -8 -+ addi.d a2, a2, 8 -+ -+L(count_pos): -+ ctz.d t0, t2 -+ srli.d t0, t0, 3 -+ sltu t1, t0, a2 -+ add.d a0, a0, t0 -+ -+ maskeqz a0, a0, t1 -+ jr ra -+ -+L(out): -+ move a0, zero -+ jr ra -+END(MEMCHR_NAME) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCHR_NAME) -+#endif -+#endif -diff --git a/sysdeps/loongarch/lp64/memcmp.S b/sysdeps/loongarch/lp64/memcmp.S -new file mode 100644 -index 00000000..9e57a924 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/memcmp.S -@@ -0,0 +1,281 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef MEMCMP_NAME -+#define MEMCMP_NAME memcmp -+#endif -+ -+LEAF(MEMCMP_NAME) -+ .align 6 -+ beqz a2, L(ret) -+ andi a4, a1, 0x7 -+ andi a3, a0, 0x7 -+ sltu a5, a4, a3 -+ -+ xor t0, a0, a1 -+ li.w t8, 8 -+ maskeqz t0, t0, a5 -+ li.w t7, -1 -+ -+ xor a0, a0, t0 // a0 hold smaller one -+ xor a1, a1, t0 // a1 hold larger one -+ andi a3, a0, 0x7 // a3 hold small offset -+ andi a4, a1, 0x7 // a4 hold larger offset -+ -+ xor a0, a0, a3 -+ xor a1, a1, a4 -+ ld.d t2, a0, 0 // t2 = "fedcbaXX" -+ ld.d t1, a1, 0 // t1 = "54321YYY" -+ -+ slli.d t3, a3, 3 -+ slli.d t4, a4, 3 -+ sub.d a6, t3, t4 // a6 = 0xfffffffffffffff8 -+ srl.d t1, t1, t4 // t1 = "00054321" -+ -+ srl.d t0, t2, t3 // t0 = "00fedcba" -+ srl.d t5, t7, t4 // t5 = 0x000000FFFFFFFFFF -+ sub.d t6, t0, t1 // t6 hold diff -+ and t6, t6, t5 // t6 = "000xxxxx" -+ -+ sub.d t5, t8, a4 // t5 hold margin 8 - 3 = 5 -+ bnez t6, L(first_out) -+ bgeu t5, a2, L(ret) -+ sub.d a2, a2, t5 -+ -+ bnez a6, L(unaligned) -+ blt a2, t8, L(al_less_8bytes) -+ andi t1, a2, 31 -+ beq t1, a2, L(al_less_32bytes) -+ -+ sub.d t2, a2, t1 -+ add.d a4, a0, t2 -+ move a2, t1 -+ -+L(al_loop): -+ ld.d t0, a0, 8 -+ -+ ld.d t1, a1, 8 -+ ld.d t2, a0, 16 -+ ld.d t3, a1, 16 -+ ld.d t4, a0, 24 -+ -+ ld.d t5, a1, 24 -+ ld.d t6, a0, 32 -+ ld.d t7, a1, 32 -+ addi.d a0, a0, 32 -+ -+ addi.d a1, a1, 32 -+ bne t0, t1, L(out1) -+ bne t2, t3, L(out2) -+ bne t4, t5, L(out3) -+ -+ bne t6, t7, L(out4) -+ bne a0, a4, L(al_loop) -+ -+L(al_less_32bytes): 
-+ srai.d a4, a2, 4 -+ beqz a4, L(al_less_16bytes) -+ -+ ld.d t0, a0, 8 -+ ld.d t1, a1, 8 -+ ld.d t2, a0, 16 -+ ld.d t3, a1, 16 -+ -+ addi.d a0, a0, 16 -+ addi.d a1, a1, 16 -+ addi.d a2, a2, -16 -+ bne t0, t1, L(out1) -+ -+ bne t2, t3, L(out2) -+ -+L(al_less_16bytes): -+ srai.d a4, a2, 3 -+ beqz a4, L(al_less_8bytes) -+ ld.d t0, a0, 8 -+ -+ ld.d t1, a1, 8 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ bne t0, t1, L(out1) -+ -+L(al_less_8bytes): -+ beqz a2, L(ret) -+ ld.d t0, a0, 8 -+ ld.d t1, a1, 8 -+ -+ li.d t7, -1 -+ slli.d t2, a2, 3 -+ sll.d t2, t7, t2 -+ sub.d t3, t0, t1 -+ -+ andn t6, t3, t2 -+ bnez t6, L(count_diff) -+ -+L(ret): -+ move a0, zero -+ jr ra -+ -+L(out4): -+ move t0, t6 -+ move t1, t7 -+ sub.d t6, t6, t7 -+ b L(count_diff) -+ -+L(out3): -+ move t0, t4 -+ move t1, t5 -+ sub.d t6, t4, t5 -+ b L(count_diff) -+ -+L(out2): -+ move t0, t2 -+ move t1, t3 -+L(out1): -+ sub.d t6, t0, t1 -+ b L(count_diff) -+ -+L(first_out): -+ slli.d t4, a2, 3 -+ slt t3, a2, t5 -+ sll.d t4, t7, t4 -+ maskeqz t4, t4, t3 -+ -+ andn t6, t6, t4 -+ -+L(count_diff): -+ ctz.d t2, t6 -+ bstrins.d t2, zero, 2, 0 -+ srl.d t0, t0, t2 -+ -+ srl.d t1, t1, t2 -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ sub.d t2, t0, t1 -+ -+ sub.d t3, t1, t0 -+ masknez t2, t2, a5 -+ maskeqz t3, t3, a5 -+ or a0, t2, t3 -+ -+ jr ra -+ -+L(unaligned): -+ sub.d a7, zero, a6 -+ srl.d t0, t2, a6 -+ blt a2, t8, L(un_less_8bytes) -+ -+ andi t1, a2, 31 -+ beq t1, a2, L(un_less_32bytes) -+ sub.d t2, a2, t1 -+ add.d a4, a0, t2 -+ -+ move a2, t1 -+ -+L(un_loop): -+ ld.d t2, a0, 8 -+ ld.d t1, a1, 8 -+ ld.d t4, a0, 16 -+ -+ ld.d t3, a1, 16 -+ ld.d t6, a0, 24 -+ ld.d t5, a1, 24 -+ ld.d t8, a0, 32 -+ -+ ld.d t7, a1, 32 -+ addi.d a0, a0, 32 -+ addi.d a1, a1, 32 -+ sll.d a3, t2, a7 -+ -+ or t0, a3, t0 -+ bne t0, t1, L(out1) -+ srl.d t0, t2, a6 -+ sll.d a3, t4, a7 -+ -+ or t2, a3, t0 -+ bne t2, t3, L(out2) -+ srl.d t0, t4, a6 -+ sll.d a3, t6, a7 -+ -+ or t4, a3, t0 -+ bne t4, t5, L(out3) -+ srl.d t0, t6, a6 -+ sll.d a3, t8, a7 -+ -+ or t6, t0, a3 -+ bne t6, t7, L(out4) -+ srl.d t0, t8, a6 -+ bne a0, a4, L(un_loop) -+ -+L(un_less_32bytes): -+ srai.d a4, a2, 4 -+ beqz a4, L(un_less_16bytes) -+ ld.d t2, a0, 8 -+ ld.d t1, a1, 8 -+ -+ ld.d t4, a0, 16 -+ ld.d t3, a1, 16 -+ addi.d a0, a0, 16 -+ addi.d a1, a1, 16 -+ -+ addi.d a2, a2, -16 -+ sll.d a3, t2, a7 -+ or t0, a3, t0 -+ bne t0, t1, L(out1) -+ -+ srl.d t0, t2, a6 -+ sll.d a3, t4, a7 -+ or t2, a3, t0 -+ bne t2, t3, L(out2) -+ -+ srl.d t0, t4, a6 -+ -+L(un_less_16bytes): -+ srai.d a4, a2, 3 -+ beqz a4, L(un_less_8bytes) -+ ld.d t2, a0, 8 -+ -+ ld.d t1, a1, 8 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ sll.d a3, t2, a7 -+ or t0, a3, t0 -+ bne t0, t1, L(out1) -+ srl.d t0, t2, a6 -+ -+L(un_less_8bytes): -+ beqz a2, L(ret) -+ andi a7, a7, 63 -+ slli.d a4, a2, 3 -+ bgeu a7, a4, L(last_cmp) -+ -+ ld.d t2, a0, 8 -+ sll.d a3, t2, a7 -+ or t0, a3, t0 -+ -+L(last_cmp): -+ ld.d t1, a1, 8 -+ -+ li.d t7, -1 -+ sll.d t2, t7, a4 -+ sub.d t3, t0, t1 -+ andn t6, t3, t2 -+ -+ bnez t6, L(count_diff) -+ move a0, zero -+ jr ra -+ -+END(MEMCMP_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCMP_NAME) -+#endif -diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S -new file mode 100644 -index 00000000..1076e678 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/memcpy.S -@@ -0,0 +1,818 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef MEMCPY_NAME -+#define MEMCPY_NAME memcpy -+#endif -+ -+#ifndef 
MEMMOVE_NAME -+#define MEMMOVE_NAME memmove -+#endif -+ -+#define LD_64(reg, n) \ -+ ld.d t0, reg, n; \ -+ ld.d t1, reg, n+8; \ -+ ld.d t2, reg, n+16; \ -+ ld.d t3, reg, n+24; \ -+ ld.d t4, reg, n+32; \ -+ ld.d t5, reg, n+40; \ -+ ld.d t6, reg, n+48; \ -+ ld.d t7, reg, n+56; -+ -+#define ST_64(reg, n) \ -+ st.d t0, reg, n; \ -+ st.d t1, reg, n+8; \ -+ st.d t2, reg, n+16; \ -+ st.d t3, reg, n+24; \ -+ st.d t4, reg, n+32; \ -+ st.d t5, reg, n+40; \ -+ st.d t6, reg, n+48; \ -+ st.d t7, reg, n+56; -+ -+#ifdef ANDROID_CHANGES -+LEAF(MEMMOVE_NAME, 0) -+#else -+LEAF(MEMMOVE_NAME) -+#endif -+ -+ .align 6 -+ sub.d t0, a0, a1 -+ bltu t0, a2, L(copy_back) -+ -+END(MEMMOVE_NAME) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMMOVE_NAME) -+#endif -+#endif -+ -+#ifdef ANDROID_CHANGES -+LEAF(MEMCPY_NAME, 0) -+#else -+LEAF(MEMCPY_NAME) -+#endif -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(short_data) # less than 16 bytes -+ -+ move a4, a0 -+ andi a5, a0, 0x7 -+ andi a6, a1, 0x7 -+ li.d t8, 8 -+ beqz a5, L(check_align) -+ -+ # make dest aligned 8 bytes -+ sub.d t2, t8, a5 -+ sub.d a2, a2, t2 -+ -+ pcaddi t1, 20 -+ slli.d t3, t2, 3 -+ add.d a1, a1, t2 -+ sub.d t1, t1, t3 -+ add.d a4, a4, t2 -+ jr t1 -+ -+L(al7): -+ ld.b t0, a1, -7 -+ st.b t0, a4, -7 -+L(al6): -+ ld.b t0, a1, -6 -+ st.b t0, a4, -6 -+L(al5): -+ ld.b t0, a1, -5 -+ st.b t0, a4, -5 -+L(al4): -+ ld.b t0, a1, -4 -+ st.b t0, a4, -4 -+L(al3): -+ ld.b t0, a1, -3 -+ st.b t0, a4, -3 -+L(al2): -+ ld.b t0, a1, -2 -+ st.b t0, a4, -2 -+L(al1): -+ ld.b t0, a1, -1 -+ st.b t0, a4, -1 -+ -+L(check_align): -+ bne a5, a6, L(unalign) -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(al_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(al_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ add.d a5, a1, t0 -+ -+L(loop_64bytes): -+ LD_64(a1, 0) -+ addi.d a1, a1, 64 -+ ST_64(a4, 0) -+ -+ addi.d a4, a4, 64 -+ bne a1, a5, L(loop_64bytes) -+ -+L(al_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(al_less_32bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ -+ addi.d a1, a1, 32 -+ addi.d a2, a2, -32 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ st.d t2, a4, 16 -+ st.d t3, a4, 24 -+ -+ addi.d a4, a4, 32 -+ -+L(al_less_32bytes): -+ srai.d a3, a2, 4 -+ beqz a3, L(al_less_16bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ addi.d a1, a1, 16 -+ addi.d a2, a2, -16 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ addi.d a4, a4, 16 -+ -+L(al_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(al_less_8bytes) -+ -+ ld.d t0, a1, 0 -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ st.d t0, a4, 0 -+ addi.d a4, a4, 8 -+ -+L(al_less_8bytes): -+ srai.d a3, a2, 2 -+ beqz a3, L(al_less_4bytes) -+ -+ ld.w t0, a1, 0 -+ addi.d a1, a1, 4 -+ addi.d a2, a2, -4 -+ -+ st.w t0, a4, 0 -+ addi.d a4, a4, 4 -+ -+L(al_less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(al_less_2bytes) -+ -+ ld.h t0, a1, 0 -+ addi.d a1, a1, 2 -+ addi.d a2, a2, -2 -+ -+ st.h t0, a4, 0 -+ addi.d a4, a4, 2 -+ -+L(al_less_2bytes): -+ beqz a2, L(al_less_1byte) -+ -+ ld.b t0, a1, 0 -+ st.b t0, a4, 0 -+ -+L(al_less_1byte): -+ jr ra -+ -+L(unalign): -+ andi a5, a1, 0x7 -+ bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned -+ -+ sub.d t8, t8, a5 # use t8 to save count of bytes for aligning -+ slli.d a5, a5, 3 -+ -+ ld.d t0, a1, 0 -+ addi.d a1, a1, 8 -+ -+ slli.d a6, t8, 3 -+ srl.d a7, t0, a5 -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(un_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(un_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ add.d a3, a1, t0 -+ -+# a5 shift right num -+# a6 shift 
left num -+# a7 remaining part -+L(un_long_bytes): -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ -+ srl.d t4, t0, a5 -+ sll.d t0, t0, a6 -+ -+ srl.d t5, t1, a5 -+ sll.d t1, t1, a6 -+ -+ srl.d t6, t2, a5 -+ sll.d t2, t2, a6 -+ -+ srl.d t7, t3, a5 -+ sll.d t3, t3, a6 -+ -+ or t0, a7, t0 -+ or t1, t4, t1 -+ or t2, t5, t2 -+ or t3, t6, t3 -+ -+ ld.d t4, a1, 32 -+ ld.d t5, a1, 40 -+ ld.d t6, a1, 48 -+ ld.d a7, a1, 56 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ st.d t2, a4, 16 -+ st.d t3, a4, 24 -+ -+ addi.d a1, a1, 64 -+ -+ srl.d t0, t4, a5 -+ sll.d t4, t4, a6 -+ -+ srl.d t1, t5, a5 -+ sll.d t5, t5, a6 -+ -+ srl.d t2, t6, a5 -+ sll.d t6, t6, a6 -+ -+ sll.d t3, a7, a6 -+ srl.d a7, a7, a5 -+ -+ or t4, t7, t4 -+ or t5, t0, t5 -+ or t6, t1, t6 -+ or t3, t2, t3 -+ -+ st.d t4, a4, 32 -+ st.d t5, a4, 40 -+ st.d t6, a4, 48 -+ st.d t3, a4, 56 -+ -+ addi.d a4, a4, 64 -+ bne a3, a1, L(un_long_bytes) -+ -+L(un_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(un_less_32bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ -+ addi.d a1, a1, 32 -+ addi.d a2, a2, -32 -+ -+ srl.d t4, t0, a5 -+ sll.d t0, t0, a6 -+ -+ srl.d t5, t1, a5 -+ sll.d t1, t1, a6 -+ -+ srl.d t6, t2, a5 -+ sll.d t2, t2, a6 -+ -+ or t0, a7, t0 -+ -+ srl.d a7, t3, a5 -+ sll.d t3, t3, a6 -+ -+ or t1, t4, t1 -+ or t2, t5, t2 -+ or t3, t6, t3 -+ -+ st.d t0, a4, 0 -+ st.d t1, a4, 8 -+ st.d t2, a4, 16 -+ st.d t3, a4, 24 -+ -+ addi.d a4, a4, 32 -+ -+L(un_less_32bytes): -+ srai.d a3, a2, 4 -+ beqz a3, L(un_less_16bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ -+ addi.d a1, a1, 16 -+ addi.d a2, a2, -16 -+ -+ srl.d t2, t0, a5 -+ sll.d t3, t0, a6 -+ -+ sll.d t4, t1, a6 -+ or t3, a7, t3 -+ or t4, t2, t4 -+ srl.d a7, t1, a5 -+ -+ st.d t3, a4, 0 -+ st.d t4, a4, 8 -+ -+ addi.d a4, a4, 16 -+ -+L(un_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(un_less_8bytes) -+ -+ ld.d t0, a1, 0 -+ -+ addi.d a1, a1, 8 -+ addi.d a2, a2, -8 -+ -+ sll.d t1, t0, a6 -+ or t2, a7, t1 -+ srl.d a7, t0, a5 -+ -+ st.d t2, a4, 0 -+ addi.d a4, a4, 8 -+ -+L(un_less_8bytes): -+ beqz a2, L(un_less_1byte) -+ bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 -+ -+ # combine data in memory and a7(remaining part) -+ ld.d t0, a1, 0 -+ sll.d t0, t0, a6 -+ or a7, a7, t0 -+ -+1: -+ srai.d a3, a2, 2 -+ beqz a3, L(un_less_4bytes) -+ -+ addi.d a2, a2, -4 -+ st.w a7, a4, 0 -+ addi.d a4, a4, 4 -+ srai.d a7, a7, 32 -+ -+L(un_less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(un_less_2bytes) -+ -+ addi.d a2, a2, -2 -+ st.h a7, a4, 0 -+ addi.d a4, a4, 2 -+ srai.d a7, a7, 16 -+ -+L(un_less_2bytes): -+ beqz a2, L(un_less_1byte) -+ st.b a7, a4, 0 -+ -+L(un_less_1byte): -+ jr ra -+ -+# Bytes copying for data less than 16 bytes -+L(short_data): -+ pcaddi t1, 36 -+ slli.d t2, a2, 3 -+ add.d a4, a0, a2 -+ sub.d t1, t1, t2 -+ add.d a1, a1, a2 -+ jr t1 -+ -+L(short_15_bytes): -+ ld.b t0, a1, -15 -+ st.b t0, a4, -15 -+L(short_14_bytes): -+ ld.b t0, a1, -14 -+ st.b t0, a4, -14 -+L(short_13_bytes): -+ ld.b t0, a1, -13 -+ st.b t0, a4, -13 -+L(short_12_bytes): -+ ld.b t0, a1, -12 -+ st.b t0, a4, -12 -+L(short_11_bytes): -+ ld.b t0, a1, -11 -+ st.b t0, a4, -11 -+L(short_10_bytes): -+ ld.b t0, a1, -10 -+ st.b t0, a4, -10 -+L(short_9_bytes): -+ ld.b t0, a1, -9 -+ st.b t0, a4, -9 -+L(short_8_bytes): -+ ld.b t0, a1, -8 -+ st.b t0, a4, -8 -+L(short_7_bytes): -+ ld.b t0, a1, -7 -+ st.b t0, a4, -7 -+L(short_6_bytes): -+ ld.b t0, a1, -6 -+ st.b t0, a4, -6 -+L(short_5_bytes): -+ ld.b t0, a1, -5 -+ st.b t0, a4, -5 -+L(short_4_bytes): -+ 
ld.b t0, a1, -4 -+ st.b t0, a4, -4 -+L(short_3_bytes): -+ ld.b t0, a1, -3 -+ st.b t0, a4, -3 -+L(short_2_bytes): -+ ld.b t0, a1, -2 -+ st.b t0, a4, -2 -+L(short_1_bytes): -+ ld.b t0, a1, -1 -+ st.b t0, a4, -1 -+ jr ra -+ -+L(copy_back): -+ srai.d a3, a2, 4 -+ beqz a3, L(back_short_data) # less than 16 bytes -+ -+ add.d a4, a0, a2 # store the tail of dest -+ add.d a1, a1, a2 # store the tail of src -+ -+ andi a5, a4, 0x7 -+ andi a6, a1, 0x7 -+ beqz a5, L(back_check_align) -+ -+ # make dest aligned 8 bytes -+ sub.d a2, a2, a5 -+ sub.d a1, a1, a5 -+ sub.d a4, a4, a5 -+ -+ pcaddi t1, 18 -+ slli.d t3, a5, 3 -+ sub.d t1, t1, t3 -+ jr t1 -+ -+ ld.b t0, a1, 6 -+ st.b t0, a4, 6 -+ ld.b t0, a1, 5 -+ st.b t0, a4, 5 -+ ld.b t0, a1, 4 -+ st.b t0, a4, 4 -+ ld.b t0, a1, 3 -+ st.b t0, a4, 3 -+ ld.b t0, a1, 2 -+ st.b t0, a4, 2 -+ ld.b t0, a1, 1 -+ st.b t0, a4, 1 -+ ld.b t0, a1, 0 -+ st.b t0, a4, 0 -+ -+L(back_check_align): -+ bne a5, a6, L(back_unalign) -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(back_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(back_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ sub.d a5, a1, t0 -+ -+L(back_loop_64bytes): -+ LD_64(a1, -64) -+ addi.d a1, a1, -64 -+ ST_64(a4, -64) -+ -+ addi.d a4, a4, -64 -+ bne a1, a5, L(back_loop_64bytes) -+ -+L(back_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(back_less_32bytes) -+ -+ ld.d t0, a1, -32 -+ ld.d t1, a1, -24 -+ ld.d t2, a1, -16 -+ ld.d t3, a1, -8 -+ -+ addi.d a1, a1, -32 -+ addi.d a2, a2, -32 -+ -+ st.d t0, a4, -32 -+ st.d t1, a4, -24 -+ st.d t2, a4, -16 -+ st.d t3, a4, -8 -+ -+ addi.d a4, a4, -32 -+ -+L(back_less_32bytes): -+ srai.d a3, a2, 4 -+ beqz a3, L(back_less_16bytes) -+ -+ ld.d t0, a1, -16 -+ ld.d t1, a1, -8 -+ -+ addi.d a2, a2, -16 -+ addi.d a1, a1, -16 -+ -+ st.d t0, a4, -16 -+ st.d t1, a4, -8 -+ addi.d a4, a4, -16 -+ -+L(back_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(back_less_8bytes) -+ -+ ld.d t0, a1, -8 -+ addi.d a2, a2, -8 -+ addi.d a1, a1, -8 -+ -+ st.d t0, a4, -8 -+ addi.d a4, a4, -8 -+ -+L(back_less_8bytes): -+ srai.d a3, a2, 2 -+ beqz a3, L(back_less_4bytes) -+ -+ ld.w t0, a1, -4 -+ addi.d a2, a2, -4 -+ addi.d a1, a1, -4 -+ -+ st.w t0, a4, -4 -+ addi.d a4, a4, -4 -+ -+L(back_less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(back_less_2bytes) -+ -+ ld.h t0, a1, -2 -+ addi.d a2, a2, -2 -+ addi.d a1, a1, -2 -+ -+ st.h t0, a4, -2 -+ addi.d a4, a4, -2 -+ -+L(back_less_2bytes): -+ beqz a2, L(back_less_1byte) -+ -+ ld.b t0, a1, -1 -+ st.b t0, a4, -1 -+ -+L(back_less_1byte): -+ jr ra -+ -+L(back_unalign): -+ andi t8, a1, 0x7 -+ bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned -+ -+ sub.d a6, zero, t8 -+ -+ ld.d t0, a1, 0 -+ slli.d a6, a6, 3 -+ slli.d a5, t8, 3 -+ sll.d a7, t0, a6 -+ -+ srai.d a3, a2, 4 -+ beqz a3, L(back_un_less_16bytes) -+ -+ andi a3, a2, 0x3f -+ beq a3, a2, L(back_un_less_64bytes) -+ -+ sub.d t0, a2, a3 -+ move a2, a3 -+ sub.d a3, a1, t0 -+ -+L(back_un_long_bytes): -+ ld.d t0, a1, -8 -+ ld.d t1, a1, -16 -+ ld.d t2, a1, -24 -+ ld.d t3, a1, -32 -+ -+ sll.d t4, t0, a6 -+ srl.d t0, t0, a5 -+ -+ sll.d t5, t1, a6 -+ srl.d t1, t1, a5 -+ -+ sll.d t6, t2, a6 -+ srl.d t2, t2, a5 -+ -+ sll.d t7, t3, a6 -+ srl.d t3, t3, a5 -+ -+ or t0, t0, a7 -+ or t1, t1, t4 -+ or t2, t2, t5 -+ or t3, t3, t6 -+ -+ ld.d t4, a1, -40 -+ ld.d t5, a1, -48 -+ ld.d t6, a1, -56 -+ ld.d a7, a1, -64 -+ st.d t0, a4, -8 -+ st.d t1, a4, -16 -+ st.d t2, a4, -24 -+ st.d t3, a4, -32 -+ -+ addi.d a1, a1, -64 -+ -+ sll.d t0, t4, a6 -+ srl.d t4, t4, a5 -+ -+ sll.d t1, t5, a6 -+ srl.d t5, t5, a5 -+ -+ sll.d t2, t6, a6 -+ srl.d t6, t6, 
a5 -+ -+ srl.d t3, a7, a5 -+ sll.d a7, a7, a6 -+ -+ or t4, t7, t4 -+ or t5, t0, t5 -+ or t6, t1, t6 -+ or t3, t2, t3 -+ -+ st.d t4, a4, -40 -+ st.d t5, a4, -48 -+ st.d t6, a4, -56 -+ st.d t3, a4, -64 -+ -+ addi.d a4, a4, -64 -+ bne a3, a1, L(back_un_long_bytes) -+ -+L(back_un_less_64bytes): -+ srai.d a3, a2, 5 -+ beqz a3, L(back_un_less_32bytes) -+ -+ ld.d t0, a1, -8 -+ ld.d t1, a1, -16 -+ ld.d t2, a1, -24 -+ ld.d t3, a1, -32 -+ -+ addi.d a1, a1, -32 -+ addi.d a2, a2, -32 -+ -+ sll.d t4, t0, a6 -+ srl.d t0, t0, a5 -+ -+ sll.d t5, t1, a6 -+ srl.d t1, t1, a5 -+ -+ sll.d t6, t2, a6 -+ srl.d t2, t2, a5 -+ -+ or t0, a7, t0 -+ -+ sll.d a7, t3, a6 -+ srl.d t3, t3, a5 -+ -+ or t1, t4, t1 -+ or t2, t5, t2 -+ or t3, t6, t3 -+ -+ st.d t0, a4, -8 -+ st.d t1, a4, -16 -+ st.d t2, a4, -24 -+ st.d t3, a4, -32 -+ -+ addi.d a4, a4, -32 -+ -+L(back_un_less_32bytes): -+ srai.d a3, a2, 4 -+ beqz a3, L(back_un_less_16bytes) -+ -+ ld.d t0, a1, -8 -+ ld.d t1, a1, -16 -+ -+ addi.d a1, a1, -16 -+ addi.d a2, a2, -16 -+ -+ sll.d t2, t0, a6 -+ srl.d t3, t0, a5 -+ -+ srl.d t4, t1, a5 -+ or t3, a7, t3 -+ or t4, t2, t4 -+ sll.d a7, t1, a6 -+ -+ st.d t3, a4, -8 -+ st.d t4, a4, -16 -+ -+ addi.d a4, a4, -16 -+ -+L(back_un_less_16bytes): -+ srai.d a3, a2, 3 -+ beqz a3, L(back_un_less_8bytes) -+ -+ ld.d t0, a1, -8 -+ -+ addi.d a1, a1, -8 -+ addi.d a2, a2, -8 -+ -+ srl.d t1, t0, a5 -+ or t2, a7, t1 -+ sll.d a7, t0, a6 -+ -+ st.d t2, a4, -8 -+ addi.d a4, a4, -8 -+ -+L(back_un_less_8bytes): -+ beqz a2, L(back_end) -+ bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 -+ -+ # combine data in memory and a7(remaining part) -+ ld.d t0, a1, -8 -+ srl.d t0, t0, a5 -+ or a7, a7, t0 -+ -+1: -+ srai.d a3, a2, 2 -+ beqz a3, L(back_un_less_4bytes) -+ -+ srai.d t0, a7, 32 -+ addi.d a2, a2, -4 -+ st.w t0, a4, -4 -+ addi.d a4, a4, -4 -+ slli.d a7, a7, 32 -+ -+L(back_un_less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(back_un_less_2bytes) -+ srai.d t0, a7, 48 -+ addi.d a2, a2, -2 -+ st.h t0, a4, -2 -+ addi.d a4, a4, -2 -+ slli.d a7, a7, 16 -+L(back_un_less_2bytes): -+ beqz a2, L(back_un_less_1byte) -+ srai.d t0, a7, 56 -+ st.b t0, a4, -1 -+L(back_un_less_1byte): -+ jr ra -+ -+L(back_short_data): -+ pcaddi t1, 34 -+ slli.d t2, a2, 3 -+ sub.d t1, t1, t2 -+ jr t1 -+ -+ ld.b t0, a1, 14 -+ st.b t0, a0, 14 -+ ld.b t0, a1, 13 -+ st.b t0, a0, 13 -+ ld.b t0, a1, 12 -+ st.b t0, a0, 12 -+ ld.b t0, a1, 11 -+ st.b t0, a0, 11 -+ ld.b t0, a1, 10 -+ st.b t0, a0, 10 -+ ld.b t0, a1, 9 -+ st.b t0, a0, 9 -+ ld.b t0, a1, 8 -+ st.b t0, a0, 8 -+ ld.b t0, a1, 7 -+ st.b t0, a0, 7 -+ ld.b t0, a1, 6 -+ st.b t0, a0, 6 -+ ld.b t0, a1, 5 -+ st.b t0, a0, 5 -+ ld.b t0, a1, 4 -+ st.b t0, a0, 4 -+ ld.b t0, a1, 3 -+ st.b t0, a0, 3 -+ ld.b t0, a1, 2 -+ st.b t0, a0, 2 -+ ld.b t0, a1, 1 -+ st.b t0, a0, 1 -+ ld.b t0, a1, 0 -+ st.b t0, a0, 0 -+L(back_end): -+ jr ra -+ -+END(MEMCPY_NAME) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCPY_NAME) -+#endif -+#endif -diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S -new file mode 100644 -index 00000000..6d1922c4 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/memmove.S -@@ -0,0 +1,2 @@ -+/* DONT DELETE THIS FILE, OTHERWIES MEMCPY.C WILL BE COMPILED. */ -+/* There are too many common code in memcpy and memmove. 
See memcpy.S */ -diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S -new file mode 100644 -index 00000000..9fe42b24 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/memset.S -@@ -0,0 +1,173 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef MEMSET_NAME -+#define MEMSET_NAME memset -+#endif -+ -+#define ST_64(n) \ -+ st.d a1, a0, n; \ -+ st.d a1, a0, n+8; \ -+ st.d a1, a0, n+16; \ -+ st.d a1, a0, n+24; \ -+ st.d a1, a0, n+32; \ -+ st.d a1, a0, n+40; \ -+ st.d a1, a0, n+48; \ -+ st.d a1, a0, n+56; -+ -+#ifdef ANDROID_CHANGES -+LEAF(MEMSET_NAME, 0) -+#else -+LEAF(MEMSET_NAME) -+#endif -+ .align 6 -+ move t0, a0 -+ andi a3, a0, 0x7 -+ li.w t6, 16 -+ beqz a3, L(align) -+ blt a2, t6, L(short_data) -+ -+L(make_align): -+ li.w t8, 8 -+ sub.d t2, t8, a3 -+ pcaddi t1, 11 -+ slli.d t3, t2, 2 -+ sub.d t1, t1, t3 -+ jirl zero, t1, 0 -+ -+L(al7): -+ st.b a1, t0, 6 -+L(al6): -+ st.b a1, t0, 5 -+L(al5): -+ st.b a1, t0, 4 -+L(al4): -+ st.b a1, t0, 3 -+L(al3): -+ st.b a1, t0, 2 -+L(al2): -+ st.b a1, t0, 1 -+L(al1): -+ st.b a1, t0, 0 -+L(al0): -+ add.d t0, t0, t2 -+ sub.d a2, a2, t2 -+ -+L(align): -+ bstrins.d a1, a1, 15, 8 -+ bstrins.d a1, a1, 31, 16 -+ bstrins.d a1, a1, 63, 32 -+ -+ blt a2, t6, L(less_16bytes) -+ -+ andi a4, a2, 0x3f -+ beq a4, a2, L(less_64bytes) -+ -+ sub.d t1, a2, a4 -+ move a2, a4 -+ add.d a5, t0, t1 -+ -+L(loop_64bytes): -+ addi.d t0, t0, 64 -+ st.d a1, t0, -64 -+ st.d a1, t0, -56 -+ st.d a1, t0, -48 -+ st.d a1, t0, -40 -+ st.d a1, t0, -32 -+ st.d a1, t0, -24 -+ st.d a1, t0, -16 -+ st.d a1, t0, -8 -+ bne t0, a5, L(loop_64bytes) -+ -+L(less_64bytes): -+ srai.d a4, a2, 5 -+ beqz a4, L(less_32bytes) -+ addi.d a2, a2, -32 -+ st.d a1, t0, 0 -+ st.d a1, t0, 8 -+ st.d a1, t0, 16 -+ st.d a1, t0, 24 -+ addi.d t0, t0, 32 -+L(less_32bytes): -+ blt a2, t6, L(less_16bytes) -+ addi.d a2, a2, -16 -+ st.d a1, t0, 0 -+ st.d a1, t0, 8 -+ addi.d t0, t0, 16 -+L(less_16bytes): -+ srai.d a4, a2, 3 -+ beqz a4, L(less_8bytes) -+ addi.d a2, a2, -8 -+ st.d a1, t0, 0 -+ addi.d t0, t0, 8 -+L(less_8bytes): -+ beqz a2, L(less_1byte) -+ srai.d a4, a2, 2 -+ beqz a4, L(less_4bytes) -+ addi.d a2, a2, -4 -+ st.w a1, t0, 0 -+ addi.d t0, t0, 4 -+L(less_4bytes): -+ srai.d a3, a2, 1 -+ beqz a3, L(less_2bytes) -+ addi.d a2, a2, -2 -+ st.h a1, t0, 0 -+ addi.d t0, t0, 2 -+L(less_2bytes): -+ beqz a2, L(less_1byte) -+ st.b a1, t0, 0 -+L(less_1byte): -+ jr ra -+ -+L(short_data): -+ pcaddi t1, 19 -+ slli.d t3, a2, 2 -+ sub.d t1, t1, t3 -+ jirl zero, t1, 0 -+L(short_15): -+ st.b a1, a0, 14 -+ -+L(short_14): -+ st.b a1, a0, 13 -+L(short_13): -+ st.b a1, a0, 12 -+L(short_12): -+ st.b a1, a0, 11 -+L(short_11): -+ st.b a1, a0, 10 -+L(short_10): -+ st.b a1, a0, 9 -+L(short_9): -+ st.b a1, a0, 8 -+L(short_8): -+ st.b a1, a0, 7 -+L(short_7): -+ st.b a1, a0, 6 -+L(short_6): -+ st.b a1, a0, 5 -+L(short_5): -+ st.b a1, a0, 4 -+L(short_4): -+ st.b a1, a0, 3 -+L(short_3): -+ st.b a1, a0, 2 -+L(short_2): -+ st.b a1, a0, 1 -+L(short_1): -+ st.b a1, a0, 0 -+L(short_0): -+ jr ra -+ -+END(MEMSET_NAME) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMSET_NAME) -+#endif -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile -new file mode 100644 -index 00000000..6bd48f0e ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/Makefile -@@ -0,0 +1,18 @@ -+ifeq ($(subdir),string) -+sysdep_routines += memcpy-aligned memcpy-unaligned memcpy-lasx \ -+ memset-aligned memset-unaligned memset-lsx 
memset-lasx \ -+ memmove-unaligned memmove-lsx memmove-lasx \ -+ memchr-aligned memchr-lsx memchr-lasx \ -+ memrchr-generic memrchr-lsx memrchr-lasx \ -+ memcmp-aligned memcmp-lsx memcmp-lasx \ -+ rawmemchr-aligned rawmemchr-lsx rawmemchr-lasx \ -+ strchr-aligned strchr-unaligned strchr-lsx strchr-lasx \ -+ strrchr-aligned strrchr-lsx strrchr-lasx \ -+ strlen-aligned strlen-unaligned strlen-lsx strlen-lasx \ -+ strnlen-aligned strnlen-unaligned strnlen-lsx strnlen-lasx \ -+ strchrnul-aligned strchrnul-unaligned strchrnul-lsx strchrnul-lasx \ -+ strncmp-aligned strncmp-unaligned strncmp-lsx \ -+ strcpy-aligned strcpy-unaligned strcpy-lsx \ -+ stpcpy-aligned stpcpy-lsx \ -+ strcmp-aligned strcmp-unaligned strcmp-lsx -+endif -diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c -new file mode 100644 -index 00000000..c2b6bbf7 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c -@@ -0,0 +1,142 @@ -+/* Enumerate available IFUNC implementations of a function. LoongArch64 version. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* Maximum number of IFUNC implementations. 
*/ -+#define MAX_IFUNC 4 -+ -+size_t -+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, -+ size_t max) -+{ -+ assert (max >= MAX_IFUNC); -+ -+ size_t i = 0; -+ -+ IFUNC_IMPL (i, name, memcpy, -+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_lasx) -+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_aligned) -+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, memmove, -+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_lasx) -+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_lsx) -+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_aligned) -+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, memset, -+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_lasx) -+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_lsx) -+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned) -+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, memchr, -+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_lasx) -+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_lsx) -+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_aligned) -+ ) -+ -+ IFUNC_IMPL (i, name, memrchr, -+ IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_lasx) -+ IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_lsx) -+ IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_generic) -+ ) -+ -+ IFUNC_IMPL (i, name, memcmp, -+ IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_lasx) -+ IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_lsx) -+ IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_aligned) -+ ) -+ -+ IFUNC_IMPL (i, name, rawmemchr, -+ IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_lasx) -+ IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_lsx) -+ IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_aligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strchr, -+ IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_lasx) -+ IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_lsx) -+ IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_aligned) -+ IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strrchr, -+ IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_lasx) -+ IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_lsx) -+ IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_aligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strlen, -+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_lasx) -+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_lsx) -+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned) -+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strnlen, -+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_lasx) -+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_lsx) -+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_aligned) -+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strchrnul, -+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_lasx) -+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_lsx) -+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_aligned) -+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strncmp, -+ IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_lsx) -+ IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_aligned) -+ IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strcpy, -+ IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_aligned) -+ IFUNC_IMPL_ADD 
(array, i, strcpy, 1, __strcpy_unaligned) -+ ) -+ -+ IFUNC_IMPL (i, name, stpcpy, -+ IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_lsx) -+ IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_aligned) -+ ) -+ -+ IFUNC_IMPL (i, name, strcmp, -+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_lsx) -+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_aligned) -+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_unaligned) -+ ) -+ -+ return i; -+} -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h b/sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h -new file mode 100644 -index 00000000..61c00978 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h -@@ -0,0 +1,40 @@ -+/* Common definition for memcpy, and memset implementation. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+ -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (unaligned) attribute_hidden; -+ -+static inline void * -+IFUNC_SELECTOR (void) -+{ -+ INIT_ARCH(); -+ -+ if (SUPPORT_LASX) -+ return OPTIMIZE (lasx); -+ else if (SUPPORT_LSX) -+ return OPTIMIZE (lsx); -+ else if (SUPPORT_UAL) -+ return OPTIMIZE (unaligned); -+ else -+ return OPTIMIZE (aligned); -+} -diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-lsx.h b/sysdeps/loongarch/lp64/multiarch/ifunc-lsx.h -new file mode 100644 -index 00000000..771312f6 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-lsx.h -@@ -0,0 +1,37 @@ -+/* Common definition for strchr implementation. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+#include -+ -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (unaligned) attribute_hidden; -+ -+static inline void * -+IFUNC_SELECTOR (void) -+{ -+ INIT_ARCH(); -+ -+ if (SUPPORT_LSX) -+ return OPTIMIZE (lsx); -+ if (SUPPORT_UAL) -+ return OPTIMIZE (unaligned); -+ else -+ return OPTIMIZE (aligned); -+} -diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h -new file mode 100644 -index 00000000..5c01e1af ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h -@@ -0,0 +1,37 @@ -+/* Common definition for memchr implementation. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+ -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden; -+ -+static inline void * -+IFUNC_SELECTOR (void) -+{ -+ INIT_ARCH(); -+ -+ if (SUPPORT_LASX) -+ return OPTIMIZE (lasx); -+ else if (SUPPORT_LSX) -+ return OPTIMIZE (lsx); -+ else -+ return OPTIMIZE (aligned); -+} -diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h -new file mode 100644 -index 00000000..d264944c ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h -@@ -0,0 +1,37 @@ -+/* Common definition for memrchr implementation. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+#include -+ -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; -+ -+static inline void * -+IFUNC_SELECTOR (void) -+{ -+ INIT_ARCH(); -+ -+ if (SUPPORT_LASX) -+ return OPTIMIZE (lasx); -+ else if (SUPPORT_LSX) -+ return OPTIMIZE (lsx); -+ else -+ return OPTIMIZE (generic); -+} -diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-stpcpy.h b/sysdeps/loongarch/lp64/multiarch/ifunc-stpcpy.h -new file mode 100644 -index 00000000..9093f08c ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-stpcpy.h -@@ -0,0 +1,34 @@ -+/* Common definition for memchr implementation. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+ -+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; -+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden; -+ -+static inline void * -+IFUNC_SELECTOR (void) -+{ -+ INIT_ARCH(); -+ -+ if (SUPPORT_LSX) -+ return OPTIMIZE (lsx); -+ else -+ return OPTIMIZE (aligned); -+} -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S -new file mode 100644 -index 00000000..4677c912 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S -@@ -0,0 +1,7 @@ -+ -+#if IS_IN (libc) -+#define MEMCHR_NAME __memchr_aligned -+#endif -+ -+#include "../memchr.S" -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -new file mode 100644 -index 00000000..e63e34ae ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S -@@ -0,0 +1,108 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMCHR __memchr_lasx -+ -+LEAF(MEMCHR) -+ .align 6 -+ beqz a2, L(ret0) -+ add.d a3, a0, a2 -+ andi t0, a0, 0x3f -+ bstrins.d a0, zero, 5, 0 -+ -+ xvld $xr0, a0, 0 -+ xvld $xr1, a0, 32 -+ li.d t1, -1 -+ li.d t2, 64 -+ -+ xvreplgr2vr.b $xr2, a1 -+ sll.d t3, t1, t0 -+ sub.d t2, t2, t0 -+ xvseq.b $xr0, $xr0, $xr2 -+ -+ xvseq.b $xr1, $xr1, $xr2 -+ xvmsknz.b $xr0, $xr0 -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr3, $xr0, 4 -+ -+ -+ xvpickve.w $xr4, $xr1, 4 -+ vilvl.h $vr0, $vr3, $vr0 -+ vilvl.h $vr1, $vr4, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ -+ movfr2gr.d t0, $f0 -+ and t0, t0, t3 -+ bgeu t2, a2, L(end) -+ bnez t0, L(found) -+ -+ addi.d a4, a3, -1 -+ bstrins.d a4, zero, 5, 0 -+L(loop): -+ xvld $xr0, a0, 64 -+ xvld $xr1, a0, 96 -+ -+ addi.d a0, a0, 64 -+ xvseq.b $xr0, $xr0, $xr2 -+ xvseq.b $xr1, $xr1, $xr2 -+ beq a0, a4, L(out) -+ -+ -+ xvmax.bu $xr3, $xr0, $xr1 -+ xvseteqz.v $fcc0, $xr3 -+ bcnez $fcc0, L(loop) -+ xvmsknz.b $xr0, $xr0 -+ -+ xvmsknz.b $xr1, 
$xr1 -+ xvpickve.w $xr3, $xr0, 4 -+ xvpickve.w $xr4, $xr1, 4 -+ vilvl.h $vr0, $vr3, $vr0 -+ -+ vilvl.h $vr1, $vr4, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+L(found): -+ ctz.d t1, t0 -+ -+ add.d a0, a0, t1 -+ jr ra -+L(ret0): -+ move a0, zero -+ jr ra -+ -+ -+L(out): -+ xvmsknz.b $xr0, $xr0 -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr3, $xr0, 4 -+ xvpickve.w $xr4, $xr1, 4 -+ -+ vilvl.h $vr0, $vr3, $vr0 -+ vilvl.h $vr1, $vr4, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+ -+L(end): -+ sub.d t2, zero, a3 -+ srl.d t1, t1, t2 -+ and t0, t0, t1 -+ ctz.d t1, t0 -+ -+ add.d a0, a0, t1 -+ maskeqz a0, a0, t0 -+ jr ra -+END(MEMCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -new file mode 100644 -index 00000000..441db534 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S -@@ -0,0 +1,93 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMCHR __memchr_lsx -+ -+LEAF(MEMCHR) -+ .align 6 -+ beqz a2, L(ret0) -+ add.d a3, a0, a2 -+ andi t0, a0, 0x1f -+ bstrins.d a0, zero, 4, 0 -+ -+ vld $vr0, a0, 0 -+ vld $vr1, a0, 16 -+ li.d t1, -1 -+ li.d t2, 32 -+ -+ vreplgr2vr.b $vr2, a1 -+ sll.d t3, t1, t0 -+ sub.d t2, t2, t0 -+ vseq.b $vr0, $vr0, $vr2 -+ -+ vseq.b $vr1, $vr1, $vr2 -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ -+ -+ movfr2gr.s t0, $f0 -+ and t0, t0, t3 -+ bgeu t2, a2, L(end) -+ bnez t0, L(found) -+ -+ addi.d a4, a3, -1 -+ bstrins.d a4, zero, 4, 0 -+L(loop): -+ vld $vr0, a0, 32 -+ vld $vr1, a0, 48 -+ -+ addi.d a0, a0, 32 -+ vseq.b $vr0, $vr0, $vr2 -+ vseq.b $vr1, $vr1, $vr2 -+ beq a0, a4, L(out) -+ -+ vmax.bu $vr3, $vr0, $vr1 -+ vseteqz.v $fcc0, $vr3 -+ bcnez $fcc0, L(loop) -+ vmsknz.b $vr0, $vr0 -+ -+ -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+L(found): -+ ctz.w t0, t0 -+ -+ add.d a0, a0, t0 -+ jr ra -+L(ret0): -+ move a0, zero -+ jr ra -+ -+L(out): -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+ -+L(end): -+ sub.d t2, zero, a3 -+ srl.w t1, t1, t2 -+ and t0, t0, t1 -+ ctz.w t1, t0 -+ -+ -+ add.d a0, a0, t1 -+ maskeqz a0, a0, t0 -+ jr ra -+END(MEMCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memchr.c b/sysdeps/loongarch/lp64/multiarch/memchr.c -new file mode 100644 -index 00000000..18b0e2ef ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memchr.c -@@ -0,0 +1,39 @@ -+/* Multiple versions of memchr. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define memchr __redirect_memchr -+# include -+# undef memchr -+ -+# define SYMBOL_NAME memchr -+# include "ifunc-memchr.h" -+ -+libc_ifunc_redirected (__redirect_memchr, __new_memchr, -+ IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (__new_memchr, __GI_memchr, __redirect_memchr) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_memchr, memchr, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S -new file mode 100644 -index 00000000..512eabca ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S -@@ -0,0 +1,11 @@ -+ -+#if IS_IN (libc) -+ -+#define MEMCMP_NAME __memcmp_aligned -+ -+#endif -+ -+#include "../memcmp.S" -+# undef bcmp -+weak_alias (MEMCMP_NAME, bcmp) -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -new file mode 100644 -index 00000000..30e2dbe6 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S -@@ -0,0 +1,199 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMCMP __memcmp_lasx -+ -+LEAF(MEMCMP) -+ .align 6 -+ li.d t2, 32 -+ add.d a3, a0, a2 -+ add.d a4, a1, a2 -+ bgeu t2, a2, L(less32) # a2 <= 32 -+ -+ li.d t1, 160 -+ bgeu a2, t1, L(make_aligned) # a2 >= 160 -+L(loop32): -+ xvld $xr0, a0, 0 -+ xvld $xr1, a1, 0 -+ -+ addi.d a0, a0, 32 -+ addi.d a1, a1, 32 -+ addi.d a2, a2, -32 -+ xvseq.b $xr2, $xr0, $xr1 -+ -+ xvsetanyeqz.b $fcc0, $xr2 -+ bcnez $fcc0, L(end) -+L(last_bytes): -+ bltu t2, a2, L(loop32) -+ xvld $xr0, a3, -32 -+ -+ -+ xvld $xr1, a4, -32 -+ xvseq.b $xr2, $xr0, $xr1 -+L(end): -+ xvmsknz.b $xr2, $xr2 -+ xvpermi.q $xr4, $xr0, 1 -+ -+ xvpickve.w $xr3, $xr2, 4 -+ xvpermi.q $xr5, $xr1, 1 -+ vilvl.h $vr2, $vr3, $vr2 -+ movfr2gr.s t0, $f2 -+ -+ cto.w t0, t0 -+ vreplgr2vr.b $vr2, t0 -+ vshuf.b $vr0, $vr4, $vr0, $vr2 -+ vshuf.b $vr1, $vr5, $vr1, $vr2 -+ -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, $vr1, 0 -+ sub.d a0, t0, t1 -+ jr ra -+ -+ -+L(less32): -+ srli.d t0, a2, 4 -+ beqz t0, L(less16) -+ vld $vr0, a0, 0 -+ vld $vr1, a1, 0 -+ -+ vld $vr2, a3, -16 -+ vld $vr3, a4, -16 -+L(short_ret): -+ vseq.b $vr4, $vr0, $vr1 -+ vseq.b $vr5, $vr2, $vr3 -+ -+ vmsknz.b $vr4, $vr4 -+ vmsknz.b $vr5, $vr5 -+ vilvl.h $vr4, $vr5, $vr4 -+ movfr2gr.s t0, $f4 -+ -+ cto.w t0, t0 -+ vreplgr2vr.b $vr4, t0 -+ vshuf.b $vr0, $vr2, $vr0, $vr4 -+ vshuf.b $vr1, $vr3, $vr1, $vr4 -+ -+ -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, $vr1, 0 -+ sub.d a0, t0, t1 -+ jr ra -+ -+L(less16): -+ srli.d t0, a2, 3 -+ beqz t0, L(less8) -+ vldrepl.d $vr0, a0, 0 -+ vldrepl.d $vr1, a1, 0 -+ -+ vldrepl.d $vr2, a3, -8 -+ vldrepl.d $vr3, a4, -8 -+ b L(short_ret) -+L(less8): -+ srli.d t0, a2, 2 -+ -+ beqz t0, L(less4) -+ vldrepl.w $vr0, a0, 0 -+ vldrepl.w $vr1, a1, 0 -+ vldrepl.w $vr2, a3, -4 -+ -+ -+ vldrepl.w $vr3, a4, -4 -+ b L(short_ret) -+L(less4): -+ srli.d t0, a2, 1 -+ beqz t0, L(less2) -+ -+ vldrepl.h $vr0, a0, 0 -+ vldrepl.h $vr1, a1, 0 -+ vldrepl.h $vr2, a3, -2 -+ vldrepl.h $vr3, a4, -2 -+ -+ b L(short_ret) -+L(less2): -+ beqz a2, L(ret0) -+ ld.bu t0, a0, 0 -+ ld.bu t1, a1, 0 -+ -+ sub.d a0, t0, t1 -+ jr ra -+L(ret0): -+ move a0, zero -+ jr ra -+ -+ -+ nop -+ nop -+ nop -+/* make src1 aligned, and adjust scr2 and length. 
*/ -+L(make_aligned): -+ xvld $xr0, a0, 0 -+ -+ xvld $xr1, a1, 0 -+ xvseq.b $xr2, $xr0, $xr1 -+ xvsetanyeqz.b $fcc0, $xr2 -+ bcnez $fcc0, L(end) -+ -+ andi t0, a0, 0x1f -+ sub.d t0, t2, t0 -+ sub.d t1, a2, t0 -+ add.d a0, a0, t0 -+ -+ add.d a1, a1, t0 -+ andi a2, t1, 0x3f -+ sub.d t0, t1, a2 -+ add.d a5, a0, t0 -+ -+ -+L(loop_align): -+ xvld $xr0, a0, 0 -+ xvld $xr1, a1, 0 -+ xvld $xr2, a0, 32 -+ xvld $xr3, a1, 32 -+ -+ xvseq.b $xr0, $xr0, $xr1 -+ xvseq.b $xr1, $xr2, $xr3 -+ xvmin.bu $xr2, $xr1, $xr0 -+ xvsetanyeqz.b $fcc0, $xr2 -+ -+ bcnez $fcc0, L(pair_end) -+ addi.d a0, a0, 64 -+ addi.d a1, a1, 64 -+ bne a0, a5, L(loop_align) -+ -+ bnez a2, L(last_bytes) -+ move a0, zero -+ jr ra -+ nop -+ -+ -+L(pair_end): -+ xvmsknz.b $xr0, $xr0 -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr2, $xr0, 4 -+ xvpickve.w $xr3, $xr1, 4 -+ -+ vilvl.h $vr0, $vr2, $vr0 -+ vilvl.h $vr1, $vr3, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+ -+ cto.d t0, t0 -+ ldx.bu t1, a0, t0 -+ ldx.bu t2, a1, t0 -+ sub.d a0, t1, t2 -+ -+ jr ra -+END(MEMCMP) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCMP) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -new file mode 100644 -index 00000000..7fd349b6 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S -@@ -0,0 +1,255 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMCMP __memcmp_lsx -+ -+L(magic_num): -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+ nop -+ nop -+ENTRY_NO_ALIGN(MEMCMP) -+ beqz a2, L(out) -+ pcaddi t0, -7 -+ -+ andi a3, a0, 0xf -+ vld $vr5, t0, 0 -+ andi a4, a1, 0xf -+ bne a3, a4, L(unaligned) -+ -+ bstrins.d a0, zero, 3, 0 -+ xor a1, a1, a4 -+ vld $vr0, a0, 0 -+ vld $vr1, a1, 0 -+ -+ -+ li.d t0, 16 -+ vreplgr2vr.b $vr3, a3 -+ sub.d t1, t0, a3 -+ vadd.b $vr3, $vr3, $vr5 -+ -+ vshuf.b $vr0, $vr3, $vr0, $vr3 -+ vshuf.b $vr1, $vr3, $vr1, $vr3 -+ vseq.b $vr4, $vr0, $vr1 -+ bgeu t1, a2, L(al_end) -+ -+ vsetanyeqz.b $fcc0, $vr4 -+ bcnez $fcc0, L(al_found) -+ sub.d a2, a2, t1 -+ andi t1, a2, 31 -+ -+ beq a2, t1, L(al_less_32bytes) -+ sub.d t2, a2, t1 -+ move a2, t1 -+ add.d a4, a0, t2 -+ -+ -+L(al_loop): -+ vld $vr0, a0, 16 -+ vld $vr1, a1, 16 -+ vld $vr2, a0, 32 -+ vld $vr3, a1, 32 -+ -+ addi.d a0, a0, 32 -+ addi.d a1, a1, 32 -+ vseq.b $vr4, $vr0, $vr1 -+ vseq.b $vr6, $vr2, $vr3 -+ -+ vand.v $vr6, $vr4, $vr6 -+ vsetanyeqz.b $fcc0, $vr6 -+ bcnez $fcc0, L(al_pair_end) -+ bne a0, a4, L(al_loop) -+ -+L(al_less_32bytes): -+ bgeu t0, a2, L(al_less_16bytes) -+ vld $vr0, a0, 16 -+ vld $vr1, a1, 16 -+ vld $vr2, a0, 32 -+ -+ -+ vld $vr3, a1, 32 -+ addi.d a2, a2, -16 -+ vreplgr2vr.b $vr6, a2 -+ vslt.b $vr5, $vr5, $vr6 -+ -+ vseq.b $vr4, $vr0, $vr1 -+ vseq.b $vr6, $vr2, $vr3 -+ vorn.v $vr6, $vr6, $vr5 -+L(al_pair_end): -+ vsetanyeqz.b $fcc0, $vr4 -+ -+ bcnez $fcc0, L(al_found) -+ vnori.b $vr4, $vr6, 0 -+ vfrstpi.b $vr4, $vr4, 0 -+ vshuf.b $vr0, $vr2, $vr2, $vr4 -+ -+ vshuf.b $vr1, $vr3, $vr3, $vr4 -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, $vr1, 0 -+ sub.d a0, t0, t1 -+ -+ -+ jr ra -+L(al_less_16bytes): -+ beqz a2, L(out) -+ vld $vr0, a0, 16 -+ vld $vr1, a1, 16 -+ -+ vseq.b $vr4, $vr0, $vr1 -+L(al_end): -+ vreplgr2vr.b $vr6, a2 -+ vslt.b $vr5, $vr5, $vr6 -+ vorn.v $vr4, $vr4, $vr5 -+ -+L(al_found): -+ vnori.b $vr4, $vr4, 0 -+ vfrstpi.b $vr4, $vr4, 0 -+ vshuf.b $vr0, $vr0, $vr0, $vr4 -+ vshuf.b $vr1, $vr1, $vr1, $vr4 -+ -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, 
$vr1, 0 -+ sub.d a0, t0, t1 -+ jr ra -+ -+ -+L(unaligned): -+ xor t2, a0, a1 -+ sltu a5, a3, a4 -+ masknez t2, t2, a5 -+ xor a0, a0, t2 # a0 point to string with smaller offset 2 -+ -+ xor a1, a1, t2 # a1 point to string with larger 4 -+ andi a3, a0, 0xf # a3 = 2 -+ andi a4, a1, 0xf # a4 = 4 -+ bstrins.d a0, zero, 3, 0 -+ -+ xor a1, a1, a4 -+ vld $vr4, a0, 0 -+ vld $vr1, a1, 0 -+ li.d t0, 16 -+ -+ vreplgr2vr.b $vr2, a4 -+ sub.d a6, a4, a3 # a6 hold the diff -+ sub.d t1, t0, a4 -+ sub.d t2, t0, a6 -+ -+ -+ vadd.b $vr2, $vr2, $vr5 # [4, 5, 6, ...] -+ vreplgr2vr.b $vr6, t2 -+ vadd.b $vr6, $vr6, $vr5 # [14, 15, 16, ... ] -+ vshuf.b $vr0, $vr4, $vr4, $vr6 # make data be in the same position -+ -+ vshuf.b $vr1, $vr2, $vr1, $vr2 -+ vshuf.b $vr0, $vr2, $vr0, $vr2 -+ vseq.b $vr7, $vr0, $vr1 -+ bgeu t1, a2, L(un_end) -+ -+ vsetanyeqz.b $fcc0, $vr7 -+ bcnez $fcc0, L(un_found) -+ sub.d a2, a2, t1 -+ andi t1, a2, 31 -+ -+ beq a2, t1, L(un_less_32bytes) -+ sub.d t2, a2, t1 -+ move a2, t1 -+ add.d a4, a1, t2 -+ -+ -+L(un_loop): -+ vld $vr2, a0, 16 -+ vld $vr1, a1, 16 -+ vld $vr3, a1, 32 -+ addi.d a1, a1, 32 -+ -+ addi.d a0, a0, 32 -+ vshuf.b $vr0, $vr2, $vr4, $vr6 -+ vld $vr4, a0, 0 -+ vseq.b $vr7, $vr0, $vr1 -+ -+ vshuf.b $vr2, $vr4, $vr2, $vr6 -+ vseq.b $vr8, $vr2, $vr3 -+ vand.v $vr8, $vr7, $vr8 -+ vsetanyeqz.b $fcc0, $vr8 -+ -+ bcnez $fcc0, L(un_pair_end) -+ bne a1, a4, L(un_loop) -+L(un_less_32bytes): -+ bltu a2, t0, L(un_less_16bytes) -+ vld $vr2, a0, 16 -+ -+ -+ vld $vr1, a1, 16 -+ addi.d a0, a0, 16 -+ addi.d a1, a1, 16 -+ addi.d a2, a2, -16 -+ -+ vshuf.b $vr0, $vr2, $vr4, $vr6 -+ vor.v $vr4, $vr2, $vr2 -+ vseq.b $vr7, $vr0, $vr1 -+ vsetanyeqz.b $fcc0, $vr7 -+ -+ bcnez $fcc0, L(un_found) -+L(un_less_16bytes): -+ beqz a2, L(out) -+ vld $vr1, a1, 16 -+ bgeu a6, a2, 1f -+ -+ vld $vr2, a0, 16 -+1: -+ vshuf.b $vr0, $vr2, $vr4, $vr6 -+ vseq.b $vr7, $vr0, $vr1 -+L(un_end): -+ vreplgr2vr.b $vr3, a2 -+ -+ -+ vslt.b $vr3, $vr5, $vr3 -+ vorn.v $vr7, $vr7, $vr3 -+L(un_found): -+ vnori.b $vr7, $vr7, 0 -+ vfrstpi.b $vr7, $vr7, 0 -+ -+ vshuf.b $vr0, $vr0, $vr0, $vr7 -+ vshuf.b $vr1, $vr1, $vr1, $vr7 -+L(calc_result): -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, $vr1, 0 -+ -+ sub.d t2, t0, t1 -+ sub.d t3, t1, t0 -+ masknez t0, t3, a5 -+ maskeqz t1, t2, a5 -+ -+ or a0, t0, t1 -+ jr ra -+L(un_pair_end): -+ vsetanyeqz.b $fcc0, $vr7 -+ bcnez $fcc0, L(un_found) -+ -+ -+ vnori.b $vr7, $vr8, 0 -+ vfrstpi.b $vr7, $vr7, 0 -+ vshuf.b $vr0, $vr2, $vr2, $vr7 -+ vshuf.b $vr1, $vr3, $vr3, $vr7 -+ -+ b L(calc_result) -+L(out): -+ move a0, zero -+ jr ra -+ -+END(MEMCMP) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCMP) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp.c b/sysdeps/loongarch/lp64/multiarch/memcmp.c -new file mode 100644 -index 00000000..a956761e ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcmp.c -@@ -0,0 +1,41 @@ -+/* Multiple versions of memcmp. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define memcmp __redirect_memcmp -+# include -+# undef memcmp -+ -+# define SYMBOL_NAME memcmp -+# include "ifunc-memchr.h" -+ -+libc_ifunc_redirected (__redirect_memcmp, __new_memcmp, -+ IFUNC_SELECTOR ()); -+# undef bcmp -+weak_alias (__new_memcmp, bcmp) -+ -+# ifdef SHARED -+__hidden_ver1 (__new_memcmp, __GI_memcmp, __redirect_memcmp) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_memcmp, memcmp, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S -new file mode 100644 -index 00000000..5ff8b4e6 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S -@@ -0,0 +1,11 @@ -+ -+ -+#if IS_IN (libc) -+ -+#define MEMCPY_NAME __memcpy_aligned -+#define MEMMOVE_NAME __memmove_aligned -+ -+#endif -+ -+#include "../memcpy.S" -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S -new file mode 100644 -index 00000000..99d2cc71 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S -@@ -0,0 +1 @@ -+/* memcpy is part of memmove.S */ -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S -new file mode 100644 -index 00000000..99d2cc71 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S -@@ -0,0 +1 @@ -+/* memcpy is part of memmove.S */ -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -new file mode 100644 -index 00000000..5e38df0d ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S -@@ -0,0 +1,259 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMCPY_NAME __memcpy_unaligned -+ -+#define LD_64(reg, n) \ -+ ld.d t0, reg, n; \ -+ ld.d t1, reg, n+8; \ -+ ld.d t2, reg, n+16; \ -+ ld.d t3, reg, n+24; \ -+ ld.d t4, reg, n+32; \ -+ ld.d t5, reg, n+40; \ -+ ld.d t6, reg, n+48; \ -+ ld.d t7, reg, n+56; -+ -+#define ST_64(reg, n) \ -+ st.d t0, reg, n; \ -+ st.d t1, reg, n+8; \ -+ st.d t2, reg, n+16; \ -+ st.d t3, reg, n+24; \ -+ st.d t4, reg, n+32; \ -+ st.d t5, reg, n+40; \ -+ st.d t6, reg, n+48; \ -+ st.d t7, reg, n+56; -+ -+#ifdef ANDROID_CHANGES -+LEAF(MEMCPY_NAME, 0) -+#else -+LEAF(MEMCPY_NAME) -+#endif -+ -+//1st var: dst ptr: void *a1 $r4 a0 -+//2nd var: src ptr: void *a2 $r5 a1 -+//3rd var: size_t len $r6 a2 -+//t0~t9 registers as temp -+ -+ add.d a4, a1, a2 -+ add.d a3, a0, a2 -+ li.w a6, 16 -+ bge a6, a2, less_16bytes -+ li.w a6, 128 -+ blt a6, a2, long_bytes -+ li.w a6, 64 -+ blt a6, a2, more_64bytes -+ li.w a6, 32 -+ blt a6, a2, more_32bytes -+ -+ /* 17...32 */ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a4, -16 -+ ld.d t3, a4, -8 -+ st.d t0, a0, 0 -+ st.d t1, a0, 8 -+ st.d t2, a3, -16 -+ st.d t3, a3, -8 -+ jr ra -+ -+more_64bytes: -+ srli.d t8, a0, 3 -+ slli.d t8, t8, 3 -+ addi.d t8, t8, 0x8 -+ sub.d a7, a0, t8 -+ ld.d t0, a1, 0 -+ sub.d a1, a1, a7 -+ st.d t0, a0, 0 -+ -+ add.d a7, a7, a2 -+ addi.d a7, a7, -0x20 -+loop_32: -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ st.d t0, t8, 0 -+ st.d t1, t8, 8 -+ st.d t2, t8, 16 -+ st.d 
t3, t8, 24 -+ -+ addi.d t8, t8, 0x20 -+ addi.d a1, a1, 0x20 -+ addi.d a7, a7, -0x20 -+ blt zero, a7, loop_32 -+ -+ ld.d t4, a4, -32 -+ ld.d t5, a4, -24 -+ ld.d t6, a4, -16 -+ ld.d t7, a4, -8 -+ st.d t4, a3, -32 -+ st.d t5, a3, -24 -+ st.d t6, a3, -16 -+ st.d t7, a3, -8 -+ -+ jr ra -+ -+more_32bytes: -+ /* 33...64 */ -+ ld.d t0, a1, 0 -+ ld.d t1, a1, 8 -+ ld.d t2, a1, 16 -+ ld.d t3, a1, 24 -+ ld.d t4, a4, -32 -+ ld.d t5, a4, -24 -+ ld.d t6, a4, -16 -+ ld.d t7, a4, -8 -+ st.d t0, a0, 0 -+ st.d t1, a0, 8 -+ st.d t2, a0, 16 -+ st.d t3, a0, 24 -+ st.d t4, a3, -32 -+ st.d t5, a3, -24 -+ st.d t6, a3, -16 -+ st.d t7, a3, -8 -+ jr ra -+ -+less_16bytes: -+ srai.d a6, a2, 3 -+ beqz a6, less_8bytes -+ -+ /* 8...16 */ -+ ld.d t0, a1, 0 -+ ld.d t1, a4, -8 -+ st.d t0, a0, 0 -+ st.d t1, a3, -8 -+ -+ jr ra -+ -+less_8bytes: -+ srai.d a6, a2, 2 -+ beqz a6, less_4bytes -+ -+ /* 4...7 */ -+ ld.w t0, a1, 0 -+ ld.w t1, a4, -4 -+ st.w t0, a0, 0 -+ st.w t1, a3, -4 -+ jr ra -+ -+less_4bytes: -+ srai.d a6, a2, 1 -+ beqz a6, less_2bytes -+ -+ /* 2...3 */ -+ ld.h t0, a1, 0 -+ ld.h t1, a4, -2 -+ st.h t0, a0, 0 -+ st.h t1, a3, -2 -+ jr ra -+ -+less_2bytes: -+ beqz a2, less_1bytes -+ -+ ld.b t0, a1, 0 -+ st.b t0, a0, 0 -+ jr ra -+ -+less_1bytes: -+ jr ra -+ -+long_bytes: -+ srli.d t8, a0, 3 -+ slli.d t8, t8, 3 -+ beq a0, t8, start -+ -+ ld.d t0, a1, 0 -+ addi.d t8, t8, 0x8 -+ st.d t0, a0, 0 -+ sub.d a7, a0, t8 -+ sub.d a1, a1, a7 -+ -+start: -+ addi.d a5, a3, -0x80 -+ blt a5, t8, align_end_proc -+ -+loop_128: -+ LD_64(a1, 0) -+ ST_64(t8, 0) -+ LD_64(a1, 64) -+ addi.d a1, a1, 0x80 -+ ST_64(t8, 64) -+ addi.d t8, t8, 0x80 -+ bge a5, t8, loop_128 -+ -+align_end_proc: -+ sub.d a2, a3, t8 -+ -+ pcaddi t1, 34 -+ andi t2, a2, 0x78 -+ sub.d t1, t1, t2 -+ jirl zero, t1, 0 -+ -+end_120_128_unalign: -+ ld.d t0, a1, 112 -+ st.d t0, t8, 112 -+end_112_120_unalign: -+ ld.d t0, a1, 104 -+ st.d t0, t8, 104 -+end_104_112_unalign: -+ ld.d t0, a1, 96 -+ st.d t0, t8, 96 -+end_96_104_unalign: -+ ld.d t0, a1, 88 -+ st.d t0, t8, 88 -+end_88_96_unalign: -+ ld.d t0, a1, 80 -+ st.d t0, t8, 80 -+end_80_88_unalign: -+ ld.d t0, a1, 72 -+ st.d t0, t8, 72 -+end_72_80_unalign: -+ ld.d t0, a1, 64 -+ st.d t0, t8, 64 -+end_64_72_unalign: -+ ld.d t0, a1, 56 -+ st.d t0, t8, 56 -+end_56_64_unalign: -+ ld.d t0, a1, 48 -+ st.d t0, t8, 48 -+end_48_56_unalign: -+ ld.d t0, a1, 40 -+ st.d t0, t8, 40 -+end_40_48_unalign: -+ ld.d t0, a1, 32 -+ st.d t0, t8, 32 -+end_32_40_unalign: -+ ld.d t0, a1, 24 -+ st.d t0, t8, 24 -+end_24_32_unalign: -+ ld.d t0, a1, 16 -+ st.d t0, t8, 16 -+end_16_24_unalign: -+ ld.d t0, a1, 8 -+ st.d t0, t8, 8 -+end_8_16_unalign: -+ ld.d t0, a1, 0 -+ st.d t0, t8, 0 -+end_0_8_unalign: -+ ld.d t0, a4, -8 -+ st.d t0, a3, -8 -+ -+ jr ra -+ -+END(MEMCPY_NAME) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCPY_NAME) -+#endif -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy.c b/sysdeps/loongarch/lp64/multiarch/memcpy.c -new file mode 100644 -index 00000000..0ba8254a ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memcpy.c -@@ -0,0 +1,39 @@ -+/* Multiple versions of memcpy. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. 
-+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define memcpy __redirect_memcpy -+# include -+# undef memcpy -+ -+# define SYMBOL_NAME memcpy -+# include "ifunc-lasx.h" -+ -+libc_ifunc_redirected (__redirect_memcpy, __new_memcpy, -+ IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (__new_memcpy, __GI_memcpy, __redirect_memcpy) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S -new file mode 100644 -index 00000000..bcd37a0e ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S -@@ -0,0 +1 @@ -+/* memmove_aligned is part of memcpy_aligned, see memcpy-aligned.S. */ -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -new file mode 100644 -index 00000000..9537a35a ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S -@@ -0,0 +1,279 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#ifndef MEMCPY_NAME -+#define MEMCPY_NAME __memcpy_lasx -+#endif -+ -+#ifndef MEMMOVE_NAME -+#define MEMMOVE_NAME __memmove_lasx -+#endif -+ -+LEAF(MEMCPY_NAME) -+ .align 6 -+ -+ li.d t0, 32 -+ add.d a3, a0, a2 -+ add.d a4, a1, a2 -+ bgeu t0, a2, L(less_32bytes) # a2 <= 32 -+ -+ li.d t1, 64 -+ bltu t1, a2, L(copy_long) # a2 > 64 -+ xvld $xr0, a1, 0 -+ xvld $xr1, a4, -32 -+ -+ xvst $xr0, a0, 0 -+ xvst $xr1, a3, -32 -+ jr ra -+L(less_32bytes): -+ srli.d t0, a2, 4 -+ -+ beqz t0, L(less_16bytes) -+ vld $vr0, a1, 0 -+ vld $vr1, a4, -16 -+ vst $vr0, a0, 0 -+ -+ -+ vst $vr1, a3, -16 -+ jr ra -+L(less_16bytes): -+ srli.d t0, a2, 3 -+ beqz t0, L(less_8bytes) -+ -+ ld.d t0, a1, 0 -+ ld.d t1, a4, -8 -+ st.d t0, a0, 0 -+ st.d t1, a3, -8 -+ -+ jr ra -+L(less_8bytes): -+ srli.d t0, a2, 2 -+ beqz t0, L(less_4bytes) -+ ld.w t0, a1, 0 -+ -+ ld.w t1, a4, -4 -+ st.w t0, a0, 0 -+ st.w t1, a3, -4 -+ jr ra -+ -+ -+L(less_4bytes): -+ srli.d t0, a2, 1 -+ beqz t0, L(less_2bytes) -+ ld.h t0, a1, 0 -+ ld.h t1, a4, -2 -+ -+ st.h t0, a0, 0 -+ st.h t1, a3, -2 -+ jr ra -+L(less_2bytes): -+ beqz a2, L(less_1bytes) -+ -+ ld.b t0, a1, 0 -+ st.b t0, a0, 0 -+L(less_1bytes): -+ jr ra -+END(MEMCPY_NAME) -+ -+LEAF(MEMMOVE_NAME) -+ .align 6 -+ -+ li.d t0, 32 -+ add.d a3, a0, a2 -+ add.d a4, a1, a2 -+ bgeu t0, a2, L(less_32bytes) # a2 <= 32 -+ -+ li.d t1, 64 -+ bltu t1, a2, L(move_long) # a2 > 64 -+ xvld $xr0, a1, 0 -+ xvld $xr1, a4, -32 -+ -+ xvst $xr0, a0, 0 -+ xvst $xr1, a3, -32 -+ jr ra -+L(move_long): -+ sub.d t2, a0, a1 -+ -+ bltu t2, a2, L(copy_back) -+L(copy_long): -+ andi t2, a0, 0x1f -+ addi.d a2, a2, -1 -+ sub.d t2, t0, t2 -+ -+ -+ xvld $xr8, a1, 0 -+ xvld $xr9, a4, -32 -+ sub.d t3, a2, t2 -+ add.d a5, a0, t2 -+ -+ andi a2, t3, 0xff -+ add.d a1, a1, t2 -+ beq a2, t3, L(lt256) -+ sub.d a6, a4, a2 -+ -+ addi.d a6, a6, -1 -+L(loop_256): -+ xvld $xr0, a1, 0 -+ xvld $xr1, a1, 32 -+ xvld $xr2, a1, 64 -+ -+ xvld $xr3, a1, 96 -+ xvld $xr4, a1, 128 -+ xvld 
$xr5, a1, 160 -+ xvld $xr6, a1, 192 -+ -+ -+ xvld $xr7, a1, 224 -+ addi.d a1, a1, 256 -+ xvst $xr0, a5, 0 -+ xvst $xr1, a5, 32 -+ -+ xvst $xr2, a5, 64 -+ xvst $xr3, a5, 96 -+ xvst $xr4, a5, 128 -+ xvst $xr5, a5, 160 -+ -+ xvst $xr6, a5, 192 -+ xvst $xr7, a5, 224 -+ addi.d a5, a5, 256 -+ bne a1, a6, L(loop_256) -+ -+L(lt256): -+ srli.d t2, a2, 7 -+ beqz t2, L(lt128) -+ xvld $xr0, a1, 0 -+ xvld $xr1, a1, 32 -+ -+ -+ xvld $xr2, a1, 64 -+ xvld $xr3, a1, 96 -+ addi.d a1, a1, 128 -+ addi.d a2, a2, -128 -+ -+ xvst $xr0, a5, 0 -+ xvst $xr1, a5, 32 -+ xvst $xr2, a5, 64 -+ xvst $xr3, a5, 96 -+ -+ addi.d a5, a5, 128 -+L(lt128): -+ bltu a2, t1, L(lt64) -+ xvld $xr0, a1, 0 -+ xvld $xr1, a1, 32 -+ -+ addi.d a1, a1, 64 -+ addi.d a2, a2, -64 -+ xvst $xr0, a5, 0 -+ xvst $xr1, a5, 32 -+ -+ -+ addi.d a5, a5, 64 -+L(lt64): -+ bltu a2, t0, L(lt32) -+ xvld $xr0, a1, 0 -+ xvst $xr0, a5, 0 -+ -+L(lt32): -+ xvst $xr8, a0, 0 -+ xvst $xr9, a3, -32 -+ jr ra -+ nop -+ -+L(copy_back): -+ addi.d a3, a3, -1 -+ addi.d a2, a2, -2 -+ andi t2, a3, 0x1f -+ xvld $xr8, a1, 0 -+ -+ xvld $xr9, a4, -32 -+ sub.d t3, a2, t2 -+ sub.d a5, a3, t2 -+ sub.d a4, a4, t2 -+ -+ -+ andi a2, t3, 0xff -+ beq a2, t3, L(back_lt256) -+ add.d a6, a1, a2 -+ addi.d a6, a6, 2 -+ -+L(back_loop_256): -+ xvld $xr0, a4, -33 -+ xvld $xr1, a4, -65 -+ xvld $xr2, a4, -97 -+ xvld $xr3, a4, -129 -+ -+ xvld $xr4, a4, -161 -+ xvld $xr5, a4, -193 -+ xvld $xr6, a4, -225 -+ xvld $xr7, a4, -257 -+ -+ addi.d a4, a4, -256 -+ xvst $xr0, a5, -32 -+ xvst $xr1, a5, -64 -+ xvst $xr2, a5, -96 -+ -+ -+ xvst $xr3, a5, -128 -+ xvst $xr4, a5, -160 -+ xvst $xr5, a5, -192 -+ xvst $xr6, a5, -224 -+ -+ xvst $xr7, a5, -256 -+ addi.d a5, a5, -256 -+ bne a4, a6, L(back_loop_256) -+L(back_lt256): -+ srli.d t2, a2, 7 -+ -+ beqz t2, L(back_lt128) -+ xvld $xr0, a4, -33 -+ xvld $xr1, a4, -65 -+ xvld $xr2, a4, -97 -+ -+ xvld $xr3, a4, -129 -+ addi.d a2, a2, -128 -+ addi.d a4, a4, -128 -+ xvst $xr0, a5, -32 -+ -+ -+ xvst $xr1, a5, -64 -+ xvst $xr2, a5, -96 -+ xvst $xr3, a5, -128 -+ addi.d a5, a5, -128 -+ -+L(back_lt128): -+ blt a2, t1, L(back_lt64) -+ xvld $xr0, a4, -33 -+ xvld $xr1, a4, -65 -+ addi.d a2, a2, -64 -+ -+ addi.d a4, a4, -64 -+ xvst $xr0, a5, -32 -+ xvst $xr1, a5, -64 -+ addi.d a5, a5, -64 -+ -+L(back_lt64): -+ bltu a2, t0, L(back_lt32) -+ xvld $xr0, a4, -33 -+ xvst $xr0, a5, -32 -+L(back_lt32): -+ xvst $xr8, a0, 0 -+ -+ -+ xvst $xr9, a3, -31 -+ jr ra -+END(MEMMOVE_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCPY_NAME) -+libc_hidden_builtin_def (MEMMOVE_NAME) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -new file mode 100644 -index 00000000..26babad4 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S -@@ -0,0 +1,524 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMCPY_NAME __memcpy_lsx -+#define MEMMOVE_NAME __memmove_lsx -+ -+LEAF(MEMCPY_NAME) -+ .align 6 -+ li.d t6, 16 -+ add.d a3, a0, a2 -+ add.d a4, a1, a2 -+ bgeu t6, a2, L(less_16bytes) # a2 <= 16 -+ -+ li.d t8, 64 -+ li.d t7, 32 -+ bltu t8, a2, L(copy_long) # a2 > 64 -+ bltu t7, a2, L(more_32bytes) # a2 > 32 -+ -+ vld $vr0, a1, 0 -+ vld $vr1, a4, -16 -+ vst $vr0, a0, 0 -+ vst $vr1, a3, -16 -+ -+ jr ra -+L(more_32bytes): -+ vld $vr0, a1, 0 -+ vld $vr1, a1, 16 -+ vld $vr2, a4, -32 -+ -+ -+ vld $vr3, a4, -16 -+ vst $vr0, a0, 0 -+ vst $vr1, a0, 16 -+ vst $vr2, a3, -32 -+ -+ vst $vr3, a3, -16 -+ jr ra -+L(less_16bytes): -+ srli.d t0, a2, 3 
-+ beqz t0, L(less_8bytes) -+ -+ vldrepl.d $vr0, a1, 0 -+ vldrepl.d $vr1, a4, -8 -+ vstelm.d $vr0, a0, 0, 0 -+ vstelm.d $vr1, a3, -8, 0 -+ -+ jr ra -+L(less_8bytes): -+ srli.d t0, a2, 2 -+ beqz t0, L(less_4bytes) -+ vldrepl.w $vr0, a1, 0 -+ -+ -+ vldrepl.w $vr1, a4, -4 -+ vstelm.w $vr0, a0, 0, 0 -+ vstelm.w $vr1, a3, -4, 0 -+ jr ra -+ -+L(less_4bytes): -+ srli.d t0, a2, 1 -+ beqz t0, L(less_2bytes) -+ vldrepl.h $vr0, a1, 0 -+ vldrepl.h $vr1, a4, -2 -+ -+ vstelm.h $vr0, a0, 0, 0 -+ vstelm.h $vr1, a3, -2, 0 -+ jr ra -+L(less_2bytes): -+ beqz a2, L(less_1bytes) -+ -+ ld.b t0, a1, 0 -+ st.b t0, a0, 0 -+L(less_1bytes): -+ jr ra -+ nop -+END(MEMCPY_NAME) -+ -+LEAF(MEMMOVE_NAME) -+ li.d t6, 16 -+ add.d a3, a0, a2 -+ add.d a4, a1, a2 -+ bgeu t6, a2, L(less_16bytes) # a2 <= 16 -+ -+ li.d t8, 64 -+ li.d t7, 32 -+ bltu t8, a2, L(move_long) # a2 > 64 -+ bltu t7, a2, L(more_32bytes) # a2 > 32 -+ -+ vld $vr0, a1, 0 -+ vld $vr1, a4, -16 -+ vst $vr0, a0, 0 -+ vst $vr1, a3, -16 -+ -+ jr ra -+ nop -+L(move_long): -+ sub.d t0, a0, a1 -+ bltu t0, a2, L(copy_back) -+ -+ -+L(copy_long): -+ vld $vr2, a1, 0 -+ andi t0, a0, 0xf -+ sub.d t0, t6, t0 -+ add.d a1, a1, t0 -+ -+ sub.d a2, a2, t0 -+ andi t1, a1, 0xf -+ bnez t1, L(unaligned) -+ vld $vr0, a1, 0 -+ -+ addi.d a2, a2, -16 -+ vst $vr2, a0, 0 -+ andi t2, a2, 0x7f -+ add.d a5, a0, t0 -+ -+ beq a2, t2, L(al_less_128) -+ sub.d t3, a2, t2 -+ move a2, t2 -+ add.d a6, a1, t3 -+ -+ -+L(al_loop): -+ vld $vr1, a1, 16 -+ vld $vr2, a1, 32 -+ vld $vr3, a1, 48 -+ vld $vr4, a1, 64 -+ -+ vld $vr5, a1, 80 -+ vld $vr6, a1, 96 -+ vld $vr7, a1, 112 -+ vst $vr0, a5, 0 -+ -+ vld $vr0, a1, 128 -+ addi.d a1, a1, 128 -+ vst $vr1, a5, 16 -+ vst $vr2, a5, 32 -+ -+ vst $vr3, a5, 48 -+ vst $vr4, a5, 64 -+ vst $vr5, a5, 80 -+ vst $vr6, a5, 96 -+ -+ -+ vst $vr7, a5, 112 -+ addi.d a5, a5, 128 -+ bne a1, a6, L(al_loop) -+L(al_less_128): -+ blt a2, t8, L(al_less_64) -+ -+ vld $vr1, a1, 16 -+ vld $vr2, a1, 32 -+ vld $vr3, a1, 48 -+ addi.d a2, a2, -64 -+ -+ vst $vr0, a5, 0 -+ vld $vr0, a1, 64 -+ addi.d a1, a1, 64 -+ vst $vr1, a5, 16 -+ -+ vst $vr2, a5, 32 -+ vst $vr3, a5, 48 -+ addi.d a5, a5, 64 -+L(al_less_64): -+ blt a2, t7, L(al_less_32) -+ -+ -+ vld $vr1, a1, 16 -+ addi.d a2, a2, -32 -+ vst $vr0, a5, 0 -+ vld $vr0, a1, 32 -+ -+ addi.d a1, a1, 32 -+ vst $vr1, a5, 16 -+ addi.d a5, a5, 32 -+L(al_less_32): -+ blt a2, t6, L(al_less_16) -+ -+ vst $vr0, a5, 0 -+ vld $vr0, a1, 16 -+ addi.d a5, a5, 16 -+L(al_less_16): -+ vld $vr1, a4, -16 -+ -+ vst $vr0, a5, 0 -+ vst $vr1, a3, -16 -+ jr ra -+ nop -+ -+ -+L(magic_num): -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+L(unaligned): -+ pcaddi t2, -4 -+ bstrins.d a1, zero, 3, 0 -+ vld $vr8, t2, 0 -+ vld $vr0, a1, 0 -+ -+ vld $vr1, a1, 16 -+ addi.d a2, a2, -16 -+ vst $vr2, a0, 0 -+ add.d a5, a0, t0 -+ -+ vreplgr2vr.b $vr9, t1 -+ andi t2, a2, 0x7f -+ vadd.b $vr9, $vr9, $vr8 -+ addi.d a1, a1, 32 -+ -+ -+ beq t2, a2, L(un_less_128) -+ sub.d t3, a2, t2 -+ move a2, t2 -+ add.d a6, a1, t3 -+ -+L(un_loop): -+ vld $vr2, a1, 0 -+ vld $vr3, a1, 16 -+ vld $vr4, a1, 32 -+ vld $vr5, a1, 48 -+ -+ vld $vr6, a1, 64 -+ vld $vr7, a1, 80 -+ vshuf.b $vr8, $vr1, $vr0, $vr9 -+ vld $vr0, a1, 96 -+ -+ vst $vr8, a5, 0 -+ vshuf.b $vr8, $vr2, $vr1, $vr9 -+ vld $vr1, a1, 112 -+ vst $vr8, a5, 16 -+ -+ -+ addi.d a1, a1, 128 -+ vshuf.b $vr2, $vr3, $vr2, $vr9 -+ vshuf.b $vr3, $vr4, $vr3, $vr9 -+ vst $vr2, a5, 32 -+ -+ vshuf.b $vr4, $vr5, $vr4, $vr9 -+ vst $vr3, a5, 48 -+ vshuf.b $vr5, $vr6, $vr5, $vr9 -+ vst $vr4, a5, 64 -+ -+ vshuf.b $vr6, $vr7, $vr6, $vr9 -+ vst $vr5, a5, 
80 -+ vshuf.b $vr7, $vr0, $vr7, $vr9 -+ vst $vr6, a5, 96 -+ -+ vst $vr7, a5, 112 -+ addi.d a5, a5, 128 -+ bne a1, a6, L(un_loop) -+L(un_less_128): -+ blt a2, t8, L(un_less_64) -+ -+ -+ vld $vr2, a1, 0 -+ vld $vr3, a1, 16 -+ vshuf.b $vr4, $vr1, $vr0, $vr9 -+ vld $vr0, a1, 32 -+ -+ vst $vr4, a5, 0 -+ addi.d a2, a2, -64 -+ vshuf.b $vr4, $vr2, $vr1, $vr9 -+ vld $vr1, a1, 48 -+ -+ addi.d a1, a1, 64 -+ vst $vr4, a5, 16 -+ vshuf.b $vr2, $vr3, $vr2, $vr9 -+ vshuf.b $vr3, $vr0, $vr3, $vr9 -+ -+ vst $vr2, a5, 32 -+ vst $vr3, a5, 48 -+ addi.d a5, a5, 64 -+L(un_less_64): -+ blt a2, t7, L(un_less_32) -+ -+ -+ vshuf.b $vr3, $vr1, $vr0, $vr9 -+ vld $vr0, a1, 0 -+ vst $vr3, a5, 0 -+ addi.d a2, a2, -32 -+ -+ vshuf.b $vr3, $vr0, $vr1, $vr9 -+ vld $vr1, a1, 16 -+ addi.d a1, a1, 32 -+ vst $vr3, a5, 16 -+ -+ addi.d a5, a5, 32 -+L(un_less_32): -+ blt a2, t6, L(un_less_16) -+ vshuf.b $vr2, $vr1, $vr0, $vr9 -+ vor.v $vr0, $vr1, $vr1 -+ -+ vld $vr1, a1, 0 -+ vst $vr2, a5, 0 -+ addi.d a5, a5, 16 -+L(un_less_16): -+ vld $vr2, a4, -16 -+ -+ -+ vshuf.b $vr0, $vr1, $vr0, $vr9 -+ vst $vr0, a5, 0 -+ vst $vr2, a3, -16 -+ jr ra -+ -+L(copy_back): -+ addi.d t0, a3, -1 -+ vld $vr2, a4, -16 -+ andi t0, t0, 0xf -+ addi.d t0, t0, 1 # in case a3 is already aligned, load 16bytes and store 16bytes -+ -+ sub.d a4, a4, t0 -+ sub.d a2, a2, t0 -+ andi t1, a4, 0xf -+ bnez t1, L(back_unaligned) -+ -+ vld $vr0, a4, -16 -+ addi.d a2, a2, -16 -+ vst $vr2, a3, -16 -+ andi t2, a2, 0x7f -+ -+ -+ sub.d a3, a3, t0 -+ beq t2, a2, L(back_al_less_128) -+ sub.d t3, a2, t2 -+ move a2, t2 -+ -+ sub.d a6, a4, t3 -+L(back_al_loop): -+ vld $vr1, a4, -32 -+ vld $vr2, a4, -48 -+ vld $vr3, a4, -64 -+ -+ vld $vr4, a4, -80 -+ vld $vr5, a4, -96 -+ vld $vr6, a4, -112 -+ vld $vr7, a4, -128 -+ -+ vst $vr0, a3, -16 -+ vld $vr0, a4, -144 -+ addi.d a4, a4, -128 -+ vst $vr1, a3, -32 -+ -+ -+ vst $vr2, a3, -48 -+ vst $vr3, a3, -64 -+ vst $vr4, a3, -80 -+ vst $vr5, a3, -96 -+ -+ vst $vr6, a3, -112 -+ vst $vr7, a3, -128 -+ addi.d a3, a3, -128 -+ bne a4, a6, L(back_al_loop) -+ -+L(back_al_less_128): -+ blt a2, t8, L(back_al_less_64) -+ vld $vr1, a4, -32 -+ vld $vr2, a4, -48 -+ vld $vr3, a4, -64 -+ -+ addi.d a2, a2, -64 -+ vst $vr0, a3, -16 -+ vld $vr0, a4, -80 -+ addi.d a4, a4, -64 -+ -+ -+ vst $vr1, a3, -32 -+ vst $vr2, a3, -48 -+ vst $vr3, a3, -64 -+ addi.d a3, a3, -64 -+ -+L(back_al_less_64): -+ blt a2, t7, L(back_al_less_32) -+ vld $vr1, a4, -32 -+ addi.d a2, a2, -32 -+ vst $vr0, a3, -16 -+ -+ vld $vr0, a4, -48 -+ vst $vr1, a3, -32 -+ addi.d a3, a3, -32 -+ addi.d a4, a4, -32 -+ -+L(back_al_less_32): -+ blt a2, t6, L(back_al_less_16) -+ vst $vr0, a3, -16 -+ vld $vr0, a4, -32 -+ addi.d a3, a3, -16 -+ -+ -+L(back_al_less_16): -+ vld $vr1, a1, 0 -+ vst $vr0, a3, -16 -+ vst $vr1, a0, 0 -+ jr ra -+ -+L(magic_num_2): -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+L(back_unaligned): -+ pcaddi t2, -4 -+ bstrins.d a4, zero, 3, 0 -+ vld $vr8, t2, 0 -+ vld $vr0, a4, 0 -+ -+ vld $vr1, a4, -16 -+ addi.d a2, a2, -16 -+ vst $vr2, a3, -16 -+ sub.d a3, a3, t0 -+ -+ -+ vreplgr2vr.b $vr9, t1 -+ andi t2, a2, 0x7f -+ vadd.b $vr9, $vr9, $vr8 -+ addi.d a4, a4, -16 -+ -+ beq t2, a2, L(back_un_less_128) -+ sub.d t3, a2, t2 -+ move a2, t2 -+ sub.d a6, a4, t3 -+ -+L(back_un_loop): -+ vld $vr2, a4, -16 -+ vld $vr3, a4, -32 -+ vld $vr4, a4, -48 -+ -+ vld $vr5, a4, -64 -+ vld $vr6, a4, -80 -+ vld $vr7, a4, -96 -+ vshuf.b $vr8, $vr0, $vr1, $vr9 -+ -+ -+ vld $vr0, a4, -112 -+ vst $vr8, a3, -16 -+ vshuf.b $vr8, $vr1, $vr2, $vr9 -+ vld $vr1, a4, -128 -+ -+ vst $vr8, a3, -32 -+ addi.d a4, 
a4, -128 -+ vshuf.b $vr2, $vr2, $vr3, $vr9 -+ vshuf.b $vr3, $vr3, $vr4, $vr9 -+ -+ vst $vr2, a3, -48 -+ vshuf.b $vr4, $vr4, $vr5, $vr9 -+ vst $vr3, a3, -64 -+ vshuf.b $vr5, $vr5, $vr6, $vr9 -+ -+ vst $vr4, a3, -80 -+ vshuf.b $vr6, $vr6, $vr7, $vr9 -+ vst $vr5, a3, -96 -+ vshuf.b $vr7, $vr7, $vr0, $vr9 -+ -+ -+ vst $vr6, a3, -112 -+ vst $vr7, a3, -128 -+ addi.d a3, a3, -128 -+ bne a4, a6, L(back_un_loop) -+ -+L(back_un_less_128): -+ blt a2, t8, L(back_un_less_64) -+ vld $vr2, a4, -16 -+ vld $vr3, a4, -32 -+ vshuf.b $vr4, $vr0, $vr1, $vr9 -+ -+ vld $vr0, a4, -48 -+ vst $vr4, a3, -16 -+ addi.d a2, a2, -64 -+ vshuf.b $vr4, $vr1, $vr2, $vr9 -+ -+ vld $vr1, a4, -64 -+ addi.d a4, a4, -64 -+ vst $vr4, a3, -32 -+ vshuf.b $vr2, $vr2, $vr3, $vr9 -+ -+ -+ vshuf.b $vr3, $vr3, $vr0, $vr9 -+ vst $vr2, a3, -48 -+ vst $vr3, a3, -64 -+ addi.d a3, a3, -64 -+ -+L(back_un_less_64): -+ blt a2, t7, L(back_un_less_32) -+ vshuf.b $vr3, $vr0, $vr1, $vr9 -+ vld $vr0, a4, -16 -+ vst $vr3, a3, -16 -+ -+ addi.d a2, a2, -32 -+ vshuf.b $vr3, $vr1, $vr0, $vr9 -+ vld $vr1, a4, -32 -+ addi.d a4, a4, -32 -+ -+ vst $vr3, a3, -32 -+ addi.d a3, a3, -32 -+L(back_un_less_32): -+ blt a2, t6, L(back_un_less_16) -+ vshuf.b $vr2, $vr0, $vr1, $vr9 -+ -+ -+ vor.v $vr0, $vr1, $vr1 -+ vld $vr1, a4, -16 -+ vst $vr2, a3, -16 -+ addi.d a3, a3, -16 -+ -+L(back_un_less_16): -+ vld $vr2, a1, 0 -+ vshuf.b $vr0, $vr0, $vr1, $vr9 -+ vst $vr0, a3, -16 -+ vst $vr2, a0, 0 -+ -+ jr ra -+END(MEMMOVE_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMCPY_NAME) -+libc_hidden_builtin_def (MEMMOVE_NAME) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -new file mode 100644 -index 00000000..27ed0c9c ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S -@@ -0,0 +1,478 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMMOVE_NAME __memmove_unaligned -+ -+#define LD_64(reg, n) \ -+ ld.d t0, reg, n; \ -+ ld.d t1, reg, n+8; \ -+ ld.d t2, reg, n+16; \ -+ ld.d t3, reg, n+24; \ -+ ld.d t4, reg, n+32; \ -+ ld.d t5, reg, n+40; \ -+ ld.d t6, reg, n+48; \ -+ ld.d t7, reg, n+56; -+ -+ -+#define ST_64(reg, n) \ -+ st.d t0, reg, n; \ -+ st.d t1, reg, n+8; \ -+ st.d t2, reg, n+16; \ -+ st.d t3, reg, n+24; \ -+ st.d t4, reg, n+32; \ -+ st.d t5, reg, n+40; \ -+ st.d t6, reg, n+48; \ -+ st.d t7, reg, n+56; -+ -+#define LDST_1024 \ -+ LD_64(a1, 0); \ -+ ST_64(a0, 0); \ -+ LD_64(a1, 64); \ -+ ST_64(a0, 64); \ -+ LD_64(a1, 128); \ -+ ST_64(a0, 128); \ -+ LD_64(a1, 192); \ -+ ST_64(a0, 192); \ -+ LD_64(a1, 256); \ -+ ST_64(a0, 256); \ -+ LD_64(a1, 320); \ -+ ST_64(a0, 320); \ -+ LD_64(a1, 384); \ -+ ST_64(a0, 384); \ -+ LD_64(a1, 448); \ -+ ST_64(a0, 448); \ -+ LD_64(a1, 512); \ -+ ST_64(a0, 512); \ -+ LD_64(a1, 576); \ -+ ST_64(a0, 576); \ -+ LD_64(a1, 640); \ -+ ST_64(a0, 640); \ -+ LD_64(a1, 704); \ -+ ST_64(a0, 704); \ -+ LD_64(a1, 768); \ -+ ST_64(a0, 768); \ -+ LD_64(a1, 832); \ -+ ST_64(a0, 832); \ -+ LD_64(a1, 896); \ -+ ST_64(a0, 896); \ -+ LD_64(a1, 960); \ -+ ST_64(a0, 960); -+ -+#define LDST_1024_BACK \ -+ LD_64(a4, -64); \ -+ ST_64(a3, -64); \ -+ LD_64(a4, -128); \ -+ ST_64(a3, -128); \ -+ LD_64(a4, -192); \ -+ ST_64(a3, -192); \ -+ LD_64(a4, -256); \ -+ ST_64(a3, -256); \ -+ LD_64(a4, -320); \ -+ ST_64(a3, -320); \ -+ LD_64(a4, -384); \ -+ ST_64(a3, -384); \ -+ LD_64(a4, -448); \ -+ ST_64(a3, -448); \ -+ LD_64(a4, -512); \ -+ ST_64(a3, -512); \ -+ LD_64(a4, -576); \ -+ 
ST_64(a3, -576); \ -+ LD_64(a4, -640); \ -+ ST_64(a3, -640); \ -+ LD_64(a4, -704); \ -+ ST_64(a3, -704); \ -+ LD_64(a4, -768); \ -+ ST_64(a3, -768); \ -+ LD_64(a4, -832); \ -+ ST_64(a3, -832); \ -+ LD_64(a4, -896); \ -+ ST_64(a3, -896); \ -+ LD_64(a4, -960); \ -+ ST_64(a3, -960); \ -+ LD_64(a4, -1024); \ -+ ST_64(a3, -1024); -+ -+#ifdef ANDROID_CHANGES -+LEAF(MEMMOVE_NAME, 0) -+#else -+LEAF(MEMMOVE_NAME) -+#endif -+ -+//1st var: dest ptr: void *str1 $r4 a0 -+//2nd var: src ptr: void *str2 $r5 a1 -+//3rd var: size_t num -+//t0~t9 registers as temp -+ -+ add.d a4, a1, a2 -+ add.d a3, a0, a2 -+ beq a1, a0, less_1bytes -+ move t8, a0 -+ srai.d a6, a2, 4 #num/16 -+ beqz a6, less_16bytes #num<16 -+ srai.d a6, a2, 6 #num/64 -+ bnez a6, more_64bytes #num>64 -+ srai.d a6, a2, 5 -+ beqz a6, less_32bytes #num<32 -+ -+ ld.d t0, a1, 0 #32. */ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define memmove __redirect_memmove -+# include -+# undef memmove -+ -+# define SYMBOL_NAME memmove -+# include "ifunc-lasx.h" -+ -+libc_ifunc_redirected (__redirect_memmove, __new_memmove, -+ IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (__new_memmove, __GI_memmove, __redirect_memmove) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_memmove, memmove, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-generic.c b/sysdeps/loongarch/lp64/multiarch/memrchr-generic.c -new file mode 100644 -index 00000000..ee7ab39c ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-generic.c -@@ -0,0 +1,9 @@ -+ -+#if IS_IN (libc) -+ -+#define MEMRCHR __memrchr_generic -+ -+#endif -+ -+#include -+weak_alias (__memrchr_generic, __memrchr) -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -new file mode 100644 -index 00000000..57e1035f ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S -@@ -0,0 +1,114 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#ifndef MEMRCHR -+#define MEMRCHR __memrchr_lasx -+#endif -+ -+LEAF(MEMRCHR) -+ .align 6 -+ beqz a2, L(ret0) -+ addi.d a2, a2, -1 -+ add.d a3, a0, a2 -+ andi t1, a3, 0x3f -+ -+ bstrins.d a3, zero, 5, 0 -+ addi.d t1, t1, 1 # len for unaligned address -+ xvld $xr0, a3, 0 -+ xvld $xr1, a3, 32 -+ -+ sub.d t2, zero, t1 -+ li.d t3, -1 -+ xvreplgr2vr.b $xr2, a1 -+ andi t4, a0, 0x3f -+ -+ srl.d t2, t3, t2 -+ xvseq.b $xr0, $xr0, $xr2 -+ xvseq.b $xr1, $xr1, $xr2 -+ xvmsknz.b $xr0, $xr0 -+ -+ -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr3, $xr0, 4 -+ xvpickve.w $xr4, $xr1, 4 -+ vilvl.h $vr0, $vr3, $vr0 -+ -+ vilvl.h $vr1, $vr4, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+ and t0, t0, t2 -+ -+ bltu a2, t1, L(end) -+ bnez t0, L(found) -+ bstrins.d a0, zero, 5, 0 -+L(loop): -+ xvld $xr0, a3, -64 -+ -+ xvld $xr1, a3, -32 -+ addi.d a3, a3, -64 -+ xvseq.b $xr0, $xr0, $xr2 -+ xvseq.b $xr1, $xr1, $xr2 -+ -+ -+ beq a0, a3, L(out) -+ xvmax.bu $xr3, $xr0, $xr1 -+ xvseteqz.v $fcc0, $xr3 -+ bcnez $fcc0, L(loop) -+ -+ xvmsknz.b $xr0, $xr0 -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr3, $xr0, 4 -+ xvpickve.w $xr4, $xr1, 4 -+ -+ vilvl.h $vr0, $vr3, $vr0 -+ vilvl.h $vr1, $vr4, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+ -+L(found): -+ addi.d a0, a3, 63 -+ clz.d t1, t0 -+ sub.d a0, a0, t1 -+ jr ra -+ -+ -+L(out): -+ xvmsknz.b $xr0, $xr0 -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr3, $xr0, 4 -+ xvpickve.w $xr4, $xr1, 4 -+ 
-+ vilvl.h $vr0, $vr3, $vr0 -+ vilvl.h $vr1, $vr4, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+ -+L(end): -+ sll.d t2, t3, t4 -+ and t0, t0, t2 -+ addi.d a0, a3, 63 -+ clz.d t1, t0 -+ -+ sub.d a0, a0, t1 -+ maskeqz a0, a0, t0 -+ jr ra -+L(ret0): -+ move a0, zero -+ -+ -+ jr ra -+END(MEMRCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMRCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -new file mode 100644 -index 00000000..eac2059a ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S -@@ -0,0 +1,96 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMRCHR __memrchr_lsx -+ -+LEAF(MEMRCHR) -+ .align 6 -+ beqz a2, L(ret0) -+ addi.d a2, a2, -1 -+ add.d a3, a0, a2 -+ andi t1, a3, 0x1f -+ -+ bstrins.d a3, zero, 4, 0 -+ addi.d t1, t1, 1 # len for unaligned address -+ vld $vr0, a3, 0 -+ vld $vr1, a3, 16 -+ -+ sub.d t2, zero, t1 -+ li.d t3, -1 -+ vreplgr2vr.b $vr2, a1 -+ andi t4, a0, 0x1f -+ -+ srl.d t2, t3, t2 -+ vseq.b $vr0, $vr0, $vr2 -+ vseq.b $vr1, $vr1, $vr2 -+ vmsknz.b $vr0, $vr0 -+ -+ -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+ and t0, t0, t2 -+ -+ bltu a2, t1, L(end) -+ bnez t0, L(found) -+ bstrins.d a0, zero, 4, 0 -+L(loop): -+ vld $vr0, a3, -32 -+ -+ vld $vr1, a3, -16 -+ addi.d a3, a3, -32 -+ vseq.b $vr0, $vr0, $vr2 -+ vseq.b $vr1, $vr1, $vr2 -+ -+ beq a0, a3, L(out) -+ vmax.bu $vr3, $vr0, $vr1 -+ vseteqz.v $fcc0, $vr3 -+ bcnez $fcc0, L(loop) -+ -+ -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+ -+L(found): -+ addi.d a0, a3, 31 -+ clz.w t1, t0 -+ sub.d a0, a0, t1 -+ jr ra -+ -+L(out): -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+ -+L(end): -+ sll.d t2, t3, t4 -+ and t0, t0, t2 -+ addi.d a0, a3, 31 -+ clz.w t1, t0 -+ -+ -+ sub.d a0, a0, t1 -+ maskeqz a0, a0, t0 -+ jr ra -+L(ret0): -+ move a0, zero -+ -+ jr ra -+END(MEMRCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMRCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr.c b/sysdeps/loongarch/lp64/multiarch/memrchr.c -new file mode 100644 -index 00000000..675c3115 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memrchr.c -@@ -0,0 +1,39 @@ -+/* Multiple versions of memrchr. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. 
*/ -+#if IS_IN (libc) -+# define memrchr __redirect_memrchr -+# include -+# undef memrchr -+ -+# define SYMBOL_NAME memrchr -+# include "ifunc-memrchr.h" -+ -+libc_ifunc_redirected (__redirect_memrchr, __new_memrchr, -+ IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (__new_memrchr, __GI_memrchr, __redirect_memrchr) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_memrchr, memrchr, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S -new file mode 100644 -index 00000000..da2f5ada ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S -@@ -0,0 +1,9 @@ -+ -+#if IS_IN (libc) -+ -+#define MEMSET_NAME __memset_aligned -+ -+#endif -+ -+#include "../memset.S" -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -new file mode 100644 -index 00000000..1bd2dda9 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S -@@ -0,0 +1,132 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMSET __memset_lasx -+ -+LEAF(MEMSET) -+ .align 6 -+ li.d t1, 32 -+ move a3, a0 -+ xvreplgr2vr.b $xr0, a1 -+ add.d a4, a0, a2 -+ -+ bgeu t1, a2, L(less_32bytes) # len <= 32 -+ li.d t3, 128 -+ li.d t2, 64 -+ blt t3, a2, L(long_bytes) # len > 128 -+ -+L(less_128bytes): -+ bgeu t2, a2, L(less_64bytes) # len <= 64 -+ xvst $xr0, a3, 0 -+ xvst $xr0, a3, 32 -+ xvst $xr0, a4, -32 -+ -+ xvst $xr0, a4, -64 -+ jr ra -+L(less_64bytes): -+ xvst $xr0, a3, 0 -+ xvst $xr0, a4, -32 -+ -+ -+ jr ra -+L(less_32bytes): -+ srli.d t0, a2, 4 -+ beqz t0, L(less_16bytes) -+ vst $vr0, a3, 0 -+ -+ vst $vr0, a4, -16 -+ jr ra -+L(less_16bytes): -+ srli.d t0, a2, 3 -+ beqz t0, L(less_8bytes) -+ -+ vstelm.d $vr0, a3, 0, 0 -+ vstelm.d $vr0, a4, -8, 0 -+ jr ra -+L(less_8bytes): -+ srli.d t0, a2, 2 -+ -+ beqz t0, L(less_4bytes) -+ vstelm.w $vr0, a3, 0, 0 -+ vstelm.w $vr0, a4, -4, 0 -+ jr ra -+ -+ -+L(less_4bytes): -+ srli.d t0, a2, 1 -+ beqz t0, L(less_2bytes) -+ vstelm.h $vr0, a3, 0, 0 -+ vstelm.h $vr0, a4, -2, 0 -+ -+ jr ra -+L(less_2bytes): -+ beqz a2, L(less_1bytes) -+ st.b a1, a3, 0 -+L(less_1bytes): -+ jr ra -+ -+L(long_bytes): -+ xvst $xr0, a3, 0 -+ bstrins.d a3, zero, 4, 0 -+ addi.d a3, a3, 32 -+ sub.d a2, a4, a3 -+ -+ andi t0, a2, 0xff -+ beq t0, a2, L(long_end) -+ move a2, t0 -+ sub.d t0, a4, t0 -+ -+ -+L(loop_256): -+ xvst $xr0, a3, 0 -+ xvst $xr0, a3, 32 -+ xvst $xr0, a3, 64 -+ xvst $xr0, a3, 96 -+ -+ xvst $xr0, a3, 128 -+ xvst $xr0, a3, 160 -+ xvst $xr0, a3, 192 -+ xvst $xr0, a3, 224 -+ -+ addi.d a3, a3, 256 -+ bne a3, t0, L(loop_256) -+L(long_end): -+ bltu a2, t3, L(end_less_128) -+ addi.d a2, a2, -128 -+ -+ xvst $xr0, a3, 0 -+ xvst $xr0, a3, 32 -+ xvst $xr0, a3, 64 -+ xvst $xr0, a3, 96 -+ -+ -+ addi.d a3, a3, 128 -+L(end_less_128): -+ bltu a2, t2, L(end_less_64) -+ addi.d a2, a2, -64 -+ xvst $xr0, a3, 0 -+ -+ xvst $xr0, a3, 32 -+ addi.d a3, a3, 64 -+L(end_less_64): -+ bltu a2, t1, L(end_less_32) -+ xvst $xr0, a3, 0 -+ -+L(end_less_32): -+ xvst $xr0, a4, -32 -+ jr ra -+END(MEMSET) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMSET) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -new file mode 100644 -index 00000000..a3bbadb7 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S -@@ -0,0 +1,125 @@ -+#ifdef _LIBC -+#include -+#include 
-+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMSET __memset_lsx -+ -+LEAF(MEMSET) -+ .align 6 -+ li.d t1, 16 -+ move a3, a0 -+ vreplgr2vr.b $vr0, a1 -+ add.d a4, a0, a2 -+ -+ bgeu t1, a2, L(less_16bytes) # len <= 16 -+ li.d t3, 64 -+ li.d t2, 32 -+ bgeu a2, t3, L(long_bytes) # len > 64 -+ -+L(less_64bytes): -+ bgeu t2, a2, L(less_32bytes) # len <= 32 -+ vst $vr0, a3, 0 -+ vst $vr0, a3, 16 -+ vst $vr0, a4, -32 -+ -+ vst $vr0, a4, -16 -+ jr ra -+L(less_32bytes): -+ vst $vr0, a3, 0 -+ vst $vr0, a4, -16 -+ -+ -+ jr ra -+L(less_16bytes): -+ srli.d t0, a2, 3 -+ beqz t0, L(less_8bytes) -+ vstelm.d $vr0, a3, 0, 0 -+ -+ vstelm.d $vr0, a4, -8, 0 -+ jr ra -+L(less_8bytes): -+ srli.d t0, a2, 2 -+ beqz t0, L(less_4bytes) -+ -+ vstelm.w $vr0, a3, 0, 0 -+ vstelm.w $vr0, a4, -4, 0 -+ jr ra -+L(less_4bytes): -+ srli.d t0, a2, 1 -+ -+ beqz t0, L(less_2bytes) -+ vstelm.h $vr0, a3, 0, 0 -+ vstelm.h $vr0, a4, -2, 0 -+ jr ra -+ -+ -+L(less_2bytes): -+ beqz a2, L(less_1bytes) -+ vstelm.b $vr0, a3, 0, 0 -+L(less_1bytes): -+ jr ra -+L(long_bytes): -+ vst $vr0, a3, 0 -+ -+ bstrins.d a3, zero, 3, 0 -+ addi.d a3, a3, 16 -+ sub.d a2, a4, a3 -+ andi t0, a2, 0x7f -+ -+ beq t0, a2, L(long_end) -+ move a2, t0 -+ sub.d t0, a4, t0 -+ -+L(loop_128): -+ vst $vr0, a3, 0 -+ -+ vst $vr0, a3, 16 -+ vst $vr0, a3, 32 -+ vst $vr0, a3, 48 -+ vst $vr0, a3, 64 -+ -+ -+ vst $vr0, a3, 80 -+ vst $vr0, a3, 96 -+ vst $vr0, a3, 112 -+ addi.d a3, a3, 128 -+ -+ bne a3, t0, L(loop_128) -+L(long_end): -+ bltu a2, t3, L(end_less_64) -+ addi.d a2, a2, -64 -+ vst $vr0, a3, 0 -+ -+ vst $vr0, a3, 16 -+ vst $vr0, a3, 32 -+ vst $vr0, a3, 48 -+ addi.d a3, a3, 64 -+ -+L(end_less_64): -+ bltu a2, t2, L(end_less_32) -+ addi.d a2, a2, -32 -+ vst $vr0, a3, 0 -+ vst $vr0, a3, 16 -+ -+ addi.d a3, a3, 32 -+L(end_less_32): -+ bltu a2, t1, L(end_less_16) -+ vst $vr0, a3, 0 -+ -+L(end_less_16): -+ vst $vr0, a4, -16 -+ jr ra -+END(MEMSET) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (MEMSET) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -new file mode 100644 -index 00000000..16ff2ef7 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S -@@ -0,0 +1,177 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define MEMSET_NAME __memset_unaligned -+ -+#define ST_128(n) \ -+ st.d a1, a0, n; \ -+ st.d a1, a0, n+8 ; \ -+ st.d a1, a0, n+16 ; \ -+ st.d a1, a0, n+24 ; \ -+ st.d a1, a0, n+32 ; \ -+ st.d a1, a0, n+40 ; \ -+ st.d a1, a0, n+48 ; \ -+ st.d a1, a0, n+56 ; \ -+ st.d a1, a0, n+64 ; \ -+ st.d a1, a0, n+72 ; \ -+ st.d a1, a0, n+80 ; \ -+ st.d a1, a0, n+88 ; \ -+ st.d a1, a0, n+96 ; \ -+ st.d a1, a0, n+104; \ -+ st.d a1, a0, n+112; \ -+ st.d a1, a0, n+120; \ -+ -+//1st var: void *str $4 a0 -+//2nd var: int val $5 a1 -+//3rd var: size_t num $6 a2 -+ -+#ifdef ANDROID_CHANGES -+LEAF(MEMSET_NAME, 0) -+#else -+LEAF(MEMSET_NAME) -+#endif -+ -+ .align 6 -+ bstrins.d a1, a1, 15, 8 -+ add.d t7, a0, a2 -+ bstrins.d a1, a1, 31, 16 -+ move t0, a0 -+ bstrins.d a1, a1, 63, 32 -+ srai.d t8, a2, 4 #num/16 -+ beqz t8, less_16bytes #num<16 -+ srai.d t8, a2, 6 #num/64 -+ bnez t8, more_64bytes #num>64 -+ srai.d t8, a2, 5 #num/32 -+ beqz t8, less_32bytes #num<32 -+ st.d a1, a0, 0 #32. */ -+ -+/* Define multiple versions only for the definition in libc. 
*/ -+#if IS_IN (libc) -+# define memset __redirect_memset -+# include -+# undef memset -+ -+# define SYMBOL_NAME memset -+# include "ifunc-lasx.h" -+ -+libc_ifunc_redirected (__redirect_memset, __new_memset, -+ IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (__new_memset, __GI_memset, __redirect_memset) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_memset, memset, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S -new file mode 100644 -index 00000000..0b46b4ca ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S -@@ -0,0 +1,7 @@ -+ -+#if IS_IN (libc) -+#define RAWMEMCHR_NAME __rawmemchr_aligned -+#endif -+ -+#include "../rawmemchr.S" -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -new file mode 100644 -index 00000000..bff92969 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S -@@ -0,0 +1,51 @@ -+#include -+#include -+ -+#if IS_IN (libc) -+ -+# define RAWMEMCHR __rawmemchr_lasx -+ -+LEAF(RAWMEMCHR) -+ .align 6 -+ move a2, a0 -+ bstrins.d a0, zero, 4, 0 -+ xvld $xr0, a0, 0 -+ xvreplgr2vr.b $xr1, a1 -+ -+ xvseq.b $xr0, $xr0, $xr1 -+ xvmsknz.b $xr0, $xr0 -+ xvpickve.w $xr2, $xr0, 4 -+ vilvl.h $vr0, $vr2, $vr0 -+ -+ movfr2gr.s t0, $f0 -+ sra.w t0, t0, a2 -+ beqz t0, L(loop) -+ ctz.w t0, t0 -+ -+ add.d a0, a2, t0 -+ jr ra -+ nop -+ nop -+ -+L(loop): -+ xvld $xr0, a0, 32 -+ addi.d a0, a0, 32 -+ xvseq.b $xr0, $xr0, $xr1 -+ xvseteqz.v $fcc0, $xr0 -+ -+ bcnez $fcc0, L(loop) -+ xvmsknz.b $xr0, $xr0 -+ xvpickve.w $xr1, $xr0, 4 -+ vilvl.h $vr0, $vr1, $vr0 -+ -+ movfr2gr.s t0, $f0 -+ ctz.w t0, t0 -+ add.d a0, a0, t0 -+ jr ra -+END(RAWMEMCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (RAWMEMCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -new file mode 100644 -index 00000000..11a19c1d ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S -@@ -0,0 +1,56 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+# define RAWMEMCHR __rawmemchr_lsx -+ -+LEAF(RAWMEMCHR) -+ .align 6 -+ move a2, a0 -+ bstrins.d a0, zero, 4, 0 -+ vld $vr0, a0, 0 -+ vld $vr1, a0, 16 -+ -+ vreplgr2vr.b $vr2, a1 -+ vseq.b $vr0, $vr0, $vr2 -+ vseq.b $vr1, $vr1, $vr2 -+ vmsknz.b $vr0, $vr0 -+ -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+ sra.w t0, t0, a2 -+ -+ beqz t0, L(loop) -+ ctz.w t0, t0 -+ add.d a0, a2, t0 -+ jr ra -+ -+ -+L(loop): -+ vld $vr0, a0, 32 -+ addi.d a0, a0, 16 -+ vseq.b $vr0, $vr0, $vr2 -+ vseteqz.v $fcc0, $vr0 -+ -+ bcnez $fcc0, L(loop) -+ addi.d a0, a0, 16 -+ vfrstpi.b $vr0, $vr0, 0 -+ vpickve2gr.bu t0, $vr0, 0 -+ -+ add.d a0, a0, t0 -+ jr ra -+END(RAWMEMCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (RAWMEMCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr.c b/sysdeps/loongarch/lp64/multiarch/rawmemchr.c -new file mode 100644 -index 00000000..1e514139 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr.c -@@ -0,0 +1,37 @@ -+/* Multiple versions of rawmemchr. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#if IS_IN (libc) -+# define rawmemchr __redirect_rawmemchr -+# define __rawmemchr __redirect___rawmemchr -+# include -+# undef rawmemchr -+# undef __rawmemchr -+ -+# define SYMBOL_NAME rawmemchr -+# include "ifunc-memchr.h" -+ -+libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr, -+ IFUNC_SELECTOR ()); -+weak_alias (__rawmemchr, rawmemchr) -+# ifdef SHARED -+__hidden_ver1 (__rawmemchr, __GI___rawmemchr, __redirect___rawmemchr) -+ __attribute__((visibility ("hidden"))); -+# endif -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S -new file mode 100644 -index 00000000..3d134e3f ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S -@@ -0,0 +1,8 @@ -+ -+#if IS_IN (libc) -+ -+#define STPCPY_NAME __stpcpy_aligned -+ -+#endif -+ -+#include "../stpcpy.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -new file mode 100644 -index 00000000..bf0eed43 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S -@@ -0,0 +1,178 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STPCPY __stpcpy_lsx -+ -+L(magic_num): -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+ENTRY_NO_ALIGN(STPCPY) -+ pcaddi t0, -4 -+ andi a4, a1, 0xf -+ vld $vr1, t0, 0 -+ beqz a4, L(load_start) -+ -+ xor t0, a1, a4 -+ vld $vr0, t0, 0 -+ vreplgr2vr.b $vr2, a4 -+ vadd.b $vr2, $vr2, $vr1 -+ -+ vshuf.b $vr0, $vr2, $vr0, $vr2 -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(end) -+L(load_start): -+ vld $vr0, a1, 0 -+ -+ -+ li.d t1, 16 -+ andi a3, a0, 0xf -+ vsetanyeqz.b $fcc0, $vr0 -+ sub.d t0, t1, a3 -+ -+ bcnez $fcc0, L(end) -+ add.d a1, a1, t0 -+ vst $vr0, a0, 0 -+ add.d a0, a0, t0 -+ -+ bne a3, a4, L(unaligned) -+ vld $vr0, a1, 0 -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(end) -+ -+L(loop): -+ vst $vr0, a0, 0 -+ vld $vr0, a1, 16 -+ addi.d a0, a0, 16 -+ addi.d a1, a1, 16 -+ -+ -+ vsetanyeqz.b $fcc0, $vr0 -+ bceqz $fcc0, L(loop) -+ vmsknz.b $vr1, $vr0 -+ movfr2gr.s t0, $f1 -+ -+ cto.w t0, t0 -+ add.d a1, a1, t0 -+ vld $vr0, a1, -15 -+ add.d a0, a0, t0 -+ -+ vst $vr0, a0, -15 -+ jr ra -+L(end): -+ vseqi.b $vr1, $vr0, 0 -+ vfrstpi.b $vr1, $vr1, 0 -+ -+ vpickve2gr.bu t0, $vr1, 0 -+ addi.d t0, t0, 1 -+L(end_16): -+ andi t1, t0, 16 -+ beqz t1, L(end_8) -+ -+ -+ vst $vr0, a0, 0 -+ addi.d a0, a0, 15 -+ jr ra -+L(end_8): -+ andi t2, t0, 8 -+ -+ andi t3, t0, 4 -+ andi t4, t0, 2 -+ andi t5, t0, 1 -+ beqz t2, L(end_4) -+ -+ vstelm.d $vr0, a0, 0, 0 -+ addi.d a0, a0, 8 -+ vbsrl.v $vr0, $vr0, 8 -+L(end_4): -+ beqz t3, L(end_2) -+ -+ vstelm.w $vr0, a0, 0, 0 -+ addi.d a0, a0, 4 -+ vbsrl.v $vr0, $vr0, 4 -+L(end_2): -+ beqz t4, L(end_1) -+ -+ -+ vstelm.h $vr0, a0, 0, 0 -+ addi.d a0, a0, 2 -+ vbsrl.v $vr0, $vr0, 2 -+L(end_1): -+ beqz t5, 
L(out) -+ -+ vstelm.b $vr0, a0, 0, 0 -+ addi.d a0, a0, 1 -+L(out): -+ addi.d a0, a0, -1 -+ jr ra -+ -+ nop -+ nop -+L(unaligned): -+ andi a3, a1, 0xf -+ bstrins.d a1, zero, 3, 0 -+ -+ vld $vr2, a1, 0 -+ vreplgr2vr.b $vr3, a3 -+ vslt.b $vr4, $vr1, $vr3 -+ vor.v $vr0, $vr2, $vr4 -+ -+ -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(un_first_end) -+ vld $vr0, a1, 16 -+ vadd.b $vr3, $vr3, $vr1 -+ -+ addi.d a1, a1, 16 -+ vshuf.b $vr4, $vr0, $vr2, $vr3 -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(un_end) -+ -+L(un_loop): -+ vor.v $vr2, $vr0, $vr0 -+ vld $vr0, a1, 16 -+ vst $vr4, a0, 0 -+ addi.d a1, a1, 16 -+ -+ addi.d a0, a0, 16 -+ vshuf.b $vr4, $vr0, $vr2, $vr3 -+ vsetanyeqz.b $fcc0, $vr0 -+ bceqz $fcc0, L(un_loop) -+ -+ -+L(un_end): -+ vsetanyeqz.b $fcc0, $vr4 -+ bcnez $fcc0, 1f -+ vst $vr4, a0, 0 -+1: -+ vmsknz.b $vr1, $vr0 -+ -+ movfr2gr.s t0, $f1 -+ cto.w t0, t0 -+ add.d a1, a1, t0 -+ vld $vr0, a1, -15 -+ -+ add.d a0, a0, t0 -+ sub.d a0, a0, a3 -+ vst $vr0, a0, 1 -+ addi.d a0, a0, 16 -+ -+ jr ra -+L(un_first_end): -+ addi.d a0, a0, -16 -+ b 1b -+END(STPCPY) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STPCPY) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy.c b/sysdeps/loongarch/lp64/multiarch/stpcpy.c -new file mode 100644 -index 00000000..531a3ed6 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy.c -@@ -0,0 +1,43 @@ -+/* Multiple versions of stpcpy. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2023 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. 
*/ -+#if IS_IN (libc) -+# define stpcpy __redirect_stpcpy -+# define __stpcpy __redirect___stpcpy -+# define NO_MEMPCPY_STPCPY_REDIRECT -+# define __NO_STRING_INLINES -+# include -+# undef stpcpy -+# undef __stpcpy -+ -+# define SYMBOL_NAME stpcpy -+# include "ifunc-stpcpy.h" -+ -+libc_ifunc_redirected (__redirect_stpcpy, __stpcpy, IFUNC_SELECTOR ()); -+ -+weak_alias (__stpcpy, stpcpy) -+# ifdef SHARED -+__hidden_ver1 (__stpcpy, __GI___stpcpy, __redirect___stpcpy) -+ __attribute__ ((visibility ("hidden"))); -+__hidden_ver1 (stpcpy, __GI_stpcpy, __redirect_stpcpy) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+#endif -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S -new file mode 100644 -index 00000000..92365658 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S -@@ -0,0 +1,10 @@ -+ -+#if IS_IN (libc) -+ -+#define STRCHR_NAME __strchr_aligned -+ -+#endif -+ -+#include "../strchr.S" -+ -+weak_alias (STRCHR_NAME, index) -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -new file mode 100644 -index 00000000..ea7eb9d2 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S -@@ -0,0 +1,81 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#ifndef AS_STRCHRNUL -+#define STRCHR __strchr_lasx -+#endif -+ -+LEAF(STRCHR) -+ .align 6 -+ andi t1, a0, 0x1f -+ bstrins.d a0, zero, 4, 0 -+ xvld $xr0, a0, 0 -+ li.d t2, -1 -+ -+ xvreplgr2vr.b $xr1, a1 -+ sll.d t1, t2, t1 -+ xvxor.v $xr2, $xr0, $xr1 -+ xvmin.bu $xr0, $xr0, $xr2 -+ -+ xvmsknz.b $xr0, $xr0 -+ xvpickve.w $xr3, $xr0, 4 -+ vilvl.h $vr0, $vr3, $vr0 -+ movfr2gr.s t0, $f0 -+ -+ orn t0, t0, t1 -+ bne t0, t2, L(end) -+ addi.d a0, a0, 32 -+ nop -+ -+ -+L(loop): -+ xvld $xr0, a0, 0 -+ xvxor.v $xr2, $xr0, $xr1 -+ xvmin.bu $xr0, $xr0, $xr2 -+ xvsetanyeqz.b $fcc0, $xr0 -+ -+ bcnez $fcc0, L(loop_end) -+ xvld $xr0, a0, 32 -+ addi.d a0, a0, 64 -+ xvxor.v $xr2, $xr0, $xr1 -+ -+ xvmin.bu $xr0, $xr0, $xr2 -+ xvsetanyeqz.b $fcc0, $xr0 -+ bceqz $fcc0, L(loop) -+ addi.d a0, a0, -32 -+ -+L(loop_end): -+ xvmsknz.b $xr0, $xr0 -+ xvpickve.w $xr1, $xr0, 4 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+ -+ -+L(end): -+ cto.w t0, t0 -+ add.d a0, a0, t0 -+#ifndef AS_STRCHRNUL -+ vreplgr2vr.b $vr0, t0 -+ xvpermi.q $xr3, $xr2, 1 -+ -+ vshuf.b $vr0, $vr3, $vr2, $vr0 -+ vpickve2gr.bu t0, $vr0, 0 -+ masknez a0, a0, t0 -+#endif -+ jr ra -+ -+END(STRCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def(STRCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -new file mode 100644 -index 00000000..64ead00b ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S -@@ -0,0 +1,61 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#ifndef AS_STRCHRNUL -+#define STRCHR __strchr_lsx -+#endif -+ -+LEAF(STRCHR) -+ .align 6 -+ andi t1, a0, 0xf -+ bstrins.d a0, zero, 3, 0 -+ vld $vr0, a0, 0 -+ li.d t2, -1 -+ -+ vreplgr2vr.b $vr1, a1 -+ sll.d t3, t2, t1 -+ vxor.v $vr2, $vr0, $vr1 -+ vmin.bu $vr0, $vr0, $vr2 -+ -+ vmsknz.b $vr0, $vr0 -+ movfr2gr.s t0, $f0 -+ ext.w.h t0, t0 -+ orn t0, t0, t3 -+ -+ beq t0, t2, L(loop) -+L(found): -+ cto.w t0, t0 -+ add.d a0, a0, t0 -+#ifndef AS_STRCHRNUL -+ vreplve.b $vr2, $vr2, t0 -+ vpickve2gr.bu t1, $vr2, 0 -+ masknez a0, a0, t1 -+#endif -+ jr ra -+ -+ -+L(loop): -+ vld 
$vr0, a0, 16 -+ addi.d a0, a0, 16 -+ vxor.v $vr2, $vr0, $vr1 -+ vmin.bu $vr0, $vr0, $vr2 -+ -+ vsetanyeqz.b $fcc0, $vr0 -+ bceqz $fcc0, L(loop) -+ vmsknz.b $vr0, $vr0 -+ movfr2gr.s t0, $f0 -+ -+ b L(found) -+END(STRCHR) -+ -+libc_hidden_builtin_def (STRCHR) -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -new file mode 100644 -index 00000000..1d5e56c5 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S -@@ -0,0 +1,132 @@ -+/* Copyright 2016 Loongson Technology Corporation Limited */ -+ -+/* Author: songyuekun songyuekun@loongson.cn */ -+ -+/* basic algorithm : -+ +. use ld.d and mask for the first 8 bytes or less; -+ +. build a1 with 8c with dins; -+ +. use xor from a1 and v0 to check if is found; -+ +. if (v0 - 0x0101010101010101) & (~(v0 | 0x7f7f7f7f7f7f7f7f)!= 0, v0 has -+ one byte is \0, else has no \0 -+*/ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+ -+#if IS_IN (libc) -+ -+#define L_ADDIU addi.d -+#define L_ADDU add.d -+#define L_SUBU sub.d -+ -+#define MOVN(rd,rs,rt) \ -+ maskeqz t6, rs, rt;\ -+ masknez rd, rd, rt;\ -+ or rd, rd, t6 -+ -+#define MOVN2(rd,rt) \ -+ masknez rd, rd, rt;\ -+ or rd, rd, rt -+ -+#define STRCHR_NAME __strchr_unaligned -+ -+/* char * strchr (const char *s1, int c); */ -+LEAF(STRCHR_NAME) -+ .align 6 -+ -+ li.w t4, 0x7 -+ lu12i.w a2, 0x01010 -+ bstrins.d a1, a1, 15, 8 -+ andi t0, a0, 0x7 -+ -+ ori a2, a2, 0x101 -+ andn t4, a0, t4 -+ slli.w t1, t0, 3 -+ -+ ld.d t4, t4, 0 -+ -+ -+ nor t8, zero, zero -+ bstrins.d a1, a1, 31, 16 -+ srl.d t4, t4, t1 -+ -+ bstrins.d a1, a1, 63, 32 -+ bstrins.d a2, a2, 63, 32 -+ srl.d a7, t8, t1 -+ -+ li.w t1, 8 -+ nor t8, a7, zero -+ slli.d a3, a2, 7 -+ or t5, t8, t4 -+ and t3, a7, a1 -+ -+ sub.w t1, t1, t0 -+ nor a3, a3, zero -+ xor t2, t5, t3 -+ sub.d a7, t5, a2 -+ nor a6, t5, a3 -+ -+ sub.d a5, t2, a2 -+ nor a4, t2, a3 -+ -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ bnez a7, L(_mc8_a) -+ -+ L_ADDU a0, a0, t1 -+L(_aloop): -+ ld.d t4, a0, 0 -+ -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ nor a6, t4, a3 -+ sub.d a5, t2, a2 -+ -+ nor a4, t2, a3 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ bnez a7, L(_mc8_a) -+ -+ ld.d t4, a0, 8 -+ L_ADDIU a0, a0, 16 -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ nor a6, t4, a3 -+ sub.d a5, t2, a2 -+ -+ nor a4, t2, a3 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ beqz a7, L(_aloop) -+ -+ L_ADDIU a0, a0, -8 -+L(_mc8_a): -+ -+ ctz.d t0, a5 -+ ctz.d t2, a6 -+ -+ srli.w t0, t0, 3 -+ srli.w t2, t2, 3 -+ sltu t1, t2, t0 -+ L_ADDU v0, a0, t0 -+ masknez v0, v0, t1 -+ jr ra -+END(STRCHR_NAME) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCHR_NAME) -+#endif -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchr.c b/sysdeps/loongarch/lp64/multiarch/strchr.c -new file mode 100644 -index 00000000..c6b069ed ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchr.c -@@ -0,0 +1,39 @@ -+/* Multiple versions of strchr. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. 
-+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define strchr __redirect_strchr -+# include -+# undef strchr -+ -+# define SYMBOL_NAME strchr -+# include "ifunc-lasx.h" -+ -+libc_ifunc_redirected (__redirect_strchr, __new_strchr, -+ IFUNC_SELECTOR ()); -+weak_alias(__new_strchr, index) -+# ifdef SHARED -+__hidden_ver1 (__new_strchr, __GI_strchr, __redirect_strchr) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_strchr, strchr, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S -new file mode 100644 -index 00000000..4fa63ecc ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S -@@ -0,0 +1,8 @@ -+ -+#if IS_IN (libc) -+ -+#define STRCHRNUL_NAME __strchrnul_aligned -+ -+#endif -+ -+#include "../strchrnul.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S -new file mode 100644 -index 00000000..f8765413 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S -@@ -0,0 +1,4 @@ -+#define STRCHR __strchrnul_lasx -+#define AS_STRCHRNUL -+#include "strchr-lasx.S" -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S -new file mode 100644 -index 00000000..d363f11f ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S -@@ -0,0 +1,3 @@ -+#define STRCHR __strchrnul_lsx -+#define AS_STRCHRNUL -+#include "strchr-lsx.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -new file mode 100644 -index 00000000..6338d005 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S -@@ -0,0 +1,146 @@ -+/* Copyright 2016 Loongson Technology Corporation Limited. */ -+ -+/* Author: Songyuekun songyuekun@loongson.cn -+ * ISA: MIPS64R2 -+ * ABI: N64 -+ * basic algorithm : -+ +. use ld.d and mask for the first 8 bytes or less; -+ +. build a1 with 8c with dins; -+ +. use xor from a1 and v0 to check if is found; -+ +. 
if (v0 - 0x0101010101010101) & (~(v0 | 0x7f7f7f7f7f7f7f7f)!= 0, v0 has -+ one byte is \0, else has no \0 -+*/ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define L_ADDIU addi.d -+#define L_ADDU add.d -+#define L_SUBU sub.d -+ -+#define STRCHRNUL_NAME __strchrnul_unaligned -+ -+#define MOVN(rd,rs,rt) \ -+ maskeqz t6, rs, rt;\ -+ masknez rd, rd, rt;\ -+ or rd, rd, t6 -+ -+#define MOVZ(rd,rs,rt) \ -+ masknez t6, rs, rt;\ -+ maskeqz rd, rd, rt;\ -+ or rd, rd, t6 -+ -+ -+#define MOVN2(rd,rt) \ -+ masknez rd, rd, rt;\ -+ or rd, rd, rt -+ -+ -+/* char * strchrnul (const char *s1, int c); */ -+ -+LEAF(STRCHRNUL_NAME) -+ .align 6 -+ li.w t4, 0x7 -+ lu12i.w a2, 0x01010 -+ bstrins.d a1, a1, 15, 8 -+ andi t0, a0, 0x7 -+ -+ ori a2, a2, 0x101 -+ andn t4, a0, t4 -+ slli.w t1, t0, 3 -+ ld.d t4, t4, 0 -+ -+ -+ nor t8, zero, zero -+ bstrins.d a1, a1, 31, 16 -+ srl.d t4, t4, t1 -+ -+ preld 0, a0, 32 -+ bstrins.d a1, a1, 63, 32 -+ bstrins.d a2, a2, 63, 32 -+ srl.d a7, t8, t1 -+ -+ nor t8, a7, zero -+ slli.d a3, a2, 7 -+ or t5, t8, t4 -+ and t3, a7, a1 -+ -+ nor a3, a3, zero -+ xor t2, t5, t3 -+ sub.d a7, t5, a2 -+ nor a6, t5, a3 -+ -+ li.w t1, 8 -+ sub.d a5, t2, a2 -+ nor a4, t2, a3 -+ -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ bnez a7, L(_mc8_a) -+ -+ -+ sub.w t1, t1, t0 -+ L_ADDU a0, a0, t1 -+L(_aloop): -+ ld.d t4, a0, 0 -+ -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ nor a6, t4, a3 -+ sub.d a5, t2, a2 -+ -+ nor a4, t2, a3 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ -+ or a7, a6, a5 -+ bnez a7, L(_mc8_a) -+ -+ ld.d t4, a0, 8 -+ L_ADDIU a0, a0, 16 -+ -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ nor a6, t4, a3 -+ sub.d a5, t2, a2 -+ -+ nor a4, t2, a3 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ -+ or a7, a6, a5 -+ beqz a7, L(_aloop) -+ -+ L_ADDIU a0, a0, -8 -+L(_mc8_a): -+ -+ ctz.d t0, a5 -+ ctz.d t2, a6 -+ -+ srli.w t0, t0, 3 -+ srli.w t2, t2, 3 -+ slt t1, t0, t2 -+ -+ MOVZ(t0,t2,t1) -+ -+ L_ADDU v0, a0, t0 -+ jr ra -+END(STRCHRNUL_NAME) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+weak_alias(STRCHRNUL_NAME, strchrnul) -+libc_hidden_builtin_def (STRCHRNUL_NAME) -+#endif -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul.c b/sysdeps/loongarch/lp64/multiarch/strchrnul.c -new file mode 100644 -index 00000000..53a7273a ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul.c -@@ -0,0 +1,34 @@ -+/* Multiple versions of strchrnul. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. 
*/ -+#if IS_IN (libc) -+# define strchrnul __redirect_strchrnul -+# define __strchrnul __redirect___strchrnul -+# include -+# undef __strchrnul -+# undef strchrnul -+ -+# define SYMBOL_NAME strchrnul -+# include "ifunc-lasx.h" -+ -+libc_ifunc_redirected (__redirect_strchrnul, __strchrnul, -+ IFUNC_SELECTOR ()); -+weak_alias (__strchrnul, strchrnul) -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S -new file mode 100644 -index 00000000..f84f52b8 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S -@@ -0,0 +1,8 @@ -+ -+#if IS_IN (libc) -+ -+#define STRCMP_NAME __strcmp_aligned -+ -+#endif -+ -+#include "../strcmp.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -new file mode 100644 -index 00000000..226b1d63 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S -@@ -0,0 +1,147 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRCMP __strcmp_lsx -+ -+/* int strcmp (const char *s1, const char *s2); */ -+L(magic_num): -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+ -+ENTRY_NO_ALIGN(STRCMP) -+ pcaddi t0, -4 -+ andi a2, a0, 0xf -+ vld $vr2, t0, 0 -+ andi a3, a1, 0xf -+ -+ bne a2, a3, L(unaligned) -+ bstrins.d a0, zero, 3, 0 -+ bstrins.d a1, zero, 3, 0 -+ vld $vr0, a0, 0 -+ -+ vld $vr1, a1, 0 -+ vreplgr2vr.b $vr3, a2 -+ vslt.b $vr2, $vr2, $vr3 -+ vseq.b $vr3, $vr0, $vr1 -+ -+ -+ vmin.bu $vr3, $vr0, $vr3 -+ vor.v $vr3, $vr3, $vr2 -+ vsetanyeqz.b $fcc0, $vr3 -+ bcnez $fcc0, L(al_out) -+ -+L(al_loop): -+ vld $vr0, a0, 16 -+ vld $vr1, a1, 16 -+ addi.d a0, a0, 16 -+ addi.d a1, a1, 16 -+ -+ vseq.b $vr3, $vr0, $vr1 -+ vmin.bu $vr3, $vr0, $vr3 -+ vsetanyeqz.b $fcc0, $vr3 -+ bceqz $fcc0, L(al_loop) -+ -+L(al_out): -+ vseqi.b $vr3, $vr3, 0 -+ vfrstpi.b $vr3, $vr3, 0 -+ vshuf.b $vr0, $vr0, $vr0, $vr3 -+ vshuf.b $vr1, $vr1, $vr1, $vr3 -+ -+ -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, $vr1, 0 -+ sub.d a0, t0, t1 -+ jr ra -+ -+ nop -+ nop -+ nop -+L(unaligned): -+ slt a4, a2, a3 -+ -+ xor t0, a0, a1 -+ maskeqz t0, t0, a4 -+ xor a0, a0, t0 # a0 hold the larger one -+ xor a1, a1, t0 # a1 hold the small one -+ -+ andi a2, a0, 0xf -+ andi a3, a1, 0xf -+ bstrins.d a0, zero, 3, 0 -+ bstrins.d a1, zero, 3, 0 -+ -+ -+ vld $vr0, a0, 0 -+ vld $vr3, a1, 0 -+ vreplgr2vr.b $vr4, a2 -+ vreplgr2vr.b $vr5, a3 -+ -+ vslt.b $vr7, $vr2, $vr4 -+ vsub.b $vr4, $vr4, $vr5 -+ vaddi.bu $vr6, $vr2, 16 -+ vsub.b $vr6, $vr6, $vr4 -+ -+ vshuf.b $vr1, $vr3, $vr3, $vr6 -+ vseq.b $vr4, $vr0, $vr1 -+ vmin.bu $vr4, $vr0, $vr4 -+ vor.v $vr4, $vr4, $vr7 -+ -+ vsetanyeqz.b $fcc0, $vr4 -+ bcnez $fcc0, L(un_end) -+ vslt.b $vr5, $vr2, $vr5 -+ vor.v $vr3, $vr3, $vr5 -+ -+ -+L(un_loop): -+ vld $vr0, a0, 16 -+ vsetanyeqz.b $fcc0, $vr3 -+ bcnez $fcc0, L(remaining_end) -+ vor.v $vr1, $vr3, $vr3 -+ -+ vld $vr3, a1, 16 -+ addi.d a0, a0, 16 -+ addi.d a1, a1, 16 -+ vshuf.b $vr1, $vr3, $vr1, $vr6 -+ -+ vseq.b $vr4, $vr0, $vr1 -+ vmin.bu $vr4, $vr0, $vr4 -+ vsetanyeqz.b $fcc0, $vr4 -+ bceqz $fcc0, L(un_loop) -+ -+L(un_end): -+ vseqi.b $vr4, $vr4, 0 -+ vfrstpi.b $vr4, $vr4, 0 -+ vshuf.b $vr0, $vr0, $vr0, $vr4 -+ vshuf.b $vr1, $vr1, $vr1, $vr4 -+ -+ -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, $vr1, 0 -+ sub.d t3, t0, t1 -+ sub.d t4, t1, t0 -+ -+ masknez t0, t3, a4 -+ maskeqz t1, t4, a4 -+ or a0, t0, t1 -+ jr ra -+ -+L(remaining_end): -+ vshuf.b $vr1, $vr3, $vr3, $vr6 -+ vseq.b $vr4, 
$vr0, $vr1 -+ vmin.bu $vr4, $vr4, $vr0 -+ b L(un_end) -+END(STRCMP) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCMP) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S -new file mode 100644 -index 00000000..e29d872f ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S -@@ -0,0 +1,191 @@ -+/* Copyright 2016 Loongson Technology Corporation Limited */ -+ -+/* Author: songyuekun songyuekun@loongson.cn */ -+ -+/* -+ * ISA: MIPS64R2 -+ * ABI: N64 -+ */ -+ -+/* basic algorithm : -+ +. let t0, t1 point to a0, a1, if a0 has smaller low 3 bit of a0 and a1, -+ set a4 to 1 and let t0 point to the larger of lower 3bit of a0 and a1 -+ +. if low 3 bit of a0 equal low 3 bit of a0, use a ldr one time and more ld other times; -+ +. if not, load partial t2 and t3, check if t2 has \0; -+ +. then use use ld for t0, ldr for t1, -+ +. if partial 8 byte from t1 has \0, compare partial 8 byte from t1 with 8 -+ byte from t0 with a mask in a7 -+ +. if not, ldl other part of t1, compare 8 byte from t1 with 8 byte from t0 -+ +. if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has -+ one byte is \0, else has no \0 -+ +. for partial 8 byte from ldr t3, 0(a0), preload t3 with 0xffffffffffffffff -+*/ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+ -+#if IS_IN (libc) -+ -+ -+#define STRCMP_NAME __strcmp_unaligned -+ -+#define REP8_01 0x0101010101010101 -+#define REP8_7f 0x7f7f7f7f7f7f7f7f -+#define REP8_80 0x8080808080808080 -+ -+/* Parameters and Results */ -+#define src1 a0 -+#define src2 a1 -+#define result v0 -+// Note: v0 = a0 in N64 ABI -+ -+ -+/* Internal variable */ -+#define data1 t0 -+#define data2 t1 -+#define has_nul t2 -+#define diff t3 -+#define syndrome t4 -+#define zeroones t5 -+#define sevenf t6 -+#define pos t7 -+#define exchange t8 -+#define tmp1 a4 -+#define tmp2 a5 -+#define tmp3 a6 -+#define src1_off a2 -+#define src2_off a3 -+#define tmp4 a7 -+ -+/* rd <- if rc then ra else rb -+ will destroy tmp3. 
*/ -+#define CONDITIONSEL(rd,rc,ra,rb)\ -+ masknez tmp3, rb, rc;\ -+ maskeqz rd, ra, rc;\ -+ or rd, rd, tmp3 -+ -+/* int strcmp (const char *s1, const char *s2); */ -+ -+LEAF(STRCMP_NAME) -+ .align 4 -+ -+ xor tmp1, src1, src2 -+ lu12i.w zeroones, 0x01010 -+ lu12i.w sevenf, 0x7f7f7 -+ andi src1_off, src1, 0x7 -+ ori zeroones, zeroones, 0x101 -+ ori sevenf, sevenf, 0xf7f -+ andi tmp1, tmp1, 0x7 -+ bstrins.d zeroones, zeroones, 63, 32 -+ bstrins.d sevenf, sevenf, 63, 32 -+ bnez tmp1, strcmp_misaligned8 -+ bnez src1_off, strcmp_mutual_align -+strcmp_loop_aligned: -+ ld.d data1, src1, 0 -+ addi.d src1, src1, 8 -+ ld.d data2, src2, 0 -+ addi.d src2, src2, 8 -+strcmp_start_realigned: -+ sub.d tmp1, data1, zeroones -+ or tmp2, data1, sevenf -+ xor diff, data1, data2 -+ andn has_nul, tmp1, tmp2 -+ or syndrome, diff, has_nul -+ beqz syndrome, strcmp_loop_aligned -+ -+strcmp_end: -+ ctz.d pos, syndrome -+ bstrins.d pos, zero, 2, 0 -+ srl.d data1, data1, pos -+ srl.d data2, data2, pos -+ andi data1, data1, 0xff -+ andi data2, data2, 0xff -+ sub.d result, data1, data2 -+ jr ra -+strcmp_mutual_align: -+ bstrins.d src1, zero, 2, 0 -+ bstrins.d src2, zero, 2, 0 -+ slli.d tmp1, src1_off, 0x3 -+ ld.d data1, src1, 0 -+ sub.d tmp1, zero, tmp1 -+ ld.d data2, src2, 0 -+ addi.d src1, src1, 8 -+ addi.d src2, src2, 8 -+ nor tmp2, zero, zero -+ srl.d tmp2, tmp2, tmp1 -+ or data1, data1, tmp2 -+ or data2, data2, tmp2 -+ b strcmp_start_realigned -+ -+strcmp_misaligned8: -+ -+/* check if ((src1 != 0) && ((src2 == 0 ) || (src1 < src2))) -+ then exchange(src1,src2). */ -+ andi src2_off, src2, 0x7 -+ slt tmp2, src1_off, src2_off -+ CONDITIONSEL(tmp2,src2_off,tmp2,tmp1) -+ maskeqz exchange, tmp2, src1_off -+ xor tmp3, src1, src2 -+ maskeqz tmp3, tmp3, exchange -+ xor src1, src1, tmp3 -+ xor src2, src2, tmp3 -+ -+ andi src1_off, src1, 0x7 -+ beqz src1_off, strcmp_loop_misaligned -+strcmp_do_misaligned: -+ ld.bu data1, src1, 0 -+ ld.bu data2, src2, 0 -+ xor tmp3, data1, data2 -+ addi.d src1, src1, 1 -+ masknez tmp3, data1, tmp3 -+ addi.d src2, src2, 1 -+ beqz tmp3, strcmp_done -+ andi src1_off, src1, 0x7 -+ bnez src1_off, strcmp_do_misaligned -+ -+strcmp_loop_misaligned: -+ andi tmp1, src2, 0xff8 -+ xori tmp1, tmp1, 0xff8 -+ beqz tmp1, strcmp_do_misaligned -+ ld.d data1, src1, 0 -+ ld.d data2, src2, 0 -+ addi.d src1, src1, 8 -+ addi.d src2, src2, 8 -+ -+ sub.d tmp1, data1, zeroones -+ or tmp2, data1, sevenf -+ xor diff, data1, data2 -+ andn has_nul, tmp1, tmp2 -+ or syndrome, diff, has_nul -+ beqz syndrome, strcmp_loop_misaligned -+strcmp_misalign_end: -+ ctz.d pos, syndrome -+ bstrins.d pos, zero, 2, 0 -+ srl.d data1, data1, pos -+ srl.d data2, data2, pos -+ andi data1, data1, 0xff -+ andi data2, data2, 0xff -+ sub.d tmp1, data1, data2 -+ sub.d tmp2, data2, data1 -+ CONDITIONSEL(result,exchange,tmp2,tmp1) -+ jr ra -+ -+strcmp_done: -+ sub.d tmp1, data1, data2 -+ sub.d tmp2, data2, data1 -+ CONDITIONSEL(result,exchange,tmp2,tmp1) -+ jr ra -+END(STRCMP_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCMP_NAME) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp.c b/sysdeps/loongarch/lp64/multiarch/strcmp.c -new file mode 100644 -index 00000000..0b20e6f0 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcmp.c -@@ -0,0 +1,35 @@ -+/* Multiple versions of strcmp. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define strcmp __redirect_strcmp -+# include -+# undef strcmp -+ -+# define SYMBOL_NAME strcmp -+#include -+ -+libc_ifunc_redirected (__redirect_strcmp, strcmp, IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (strcmp, __GI_strcmp, __redirect_strcmp) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S -new file mode 100644 -index 00000000..4860398b ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S -@@ -0,0 +1,8 @@ -+ -+#if IS_IN (libc) -+ -+#define STRCPY __strcpy_aligned -+ -+#endif -+ -+#include "../strcpy.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -new file mode 100644 -index 00000000..76db561a ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S -@@ -0,0 +1,174 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRCPY __strcpy_lsx -+ -+/* int strcpy (const char *s1, const char *s2); */ -+ -+L(magic_num): -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+ENTRY_NO_ALIGN(STRCPY) -+ pcaddi t0, -4 -+ andi a4, a1, 0xf -+ vld $vr1, t0, 0 -+ move a2, a0 -+ -+ beqz a4, L(load_start) -+ xor t0, a1, a4 -+ vld $vr0, t0, 0 -+ vreplgr2vr.b $vr2, a4 -+ -+ vadd.b $vr2, $vr2, $vr1 -+ vshuf.b $vr0, $vr2, $vr0, $vr2 -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(end) -+ -+ -+L(load_start): -+ vld $vr0, a1, 0 -+ li.d t1, 16 -+ andi a3, a2, 0xf -+ vsetanyeqz.b $fcc0, $vr0 -+ -+ sub.d t0, t1, a3 -+ bcnez $fcc0, L(end) -+ add.d a1, a1, t0 -+ vst $vr0, a2, 0 -+ -+ andi a3, a1, 0xf -+ add.d a2, a2, t0 -+ bnez a3, L(unaligned) -+ vld $vr0, a1, 0 -+ -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(end) -+L(loop): -+ vst $vr0, a2, 0 -+ vld $vr0, a1, 16 -+ -+ -+ addi.d a2, a2, 16 -+ addi.d a1, a1, 16 -+ vsetanyeqz.b $fcc0, $vr0 -+ bceqz $fcc0, L(loop) -+ -+ vmsknz.b $vr1, $vr0 -+ movfr2gr.s t0, $f1 -+ cto.w t0, t0 -+ add.d a1, a1, t0 -+ -+ vld $vr0, a1, -15 -+ add.d a2, a2, t0 -+ vst $vr0, a2, -15 -+ jr ra -+ -+L(end): -+ vmsknz.b $vr1, $vr0 -+ movfr2gr.s t0, $f1 -+ cto.w t0, t0 -+ addi.d t0, t0, 1 -+ -+ -+L(end_16): -+ andi t1, t0, 16 -+ beqz t1, L(end_8) -+ vst $vr0, a2, 0 -+ jr ra -+ -+L(end_8): -+ andi t2, t0, 8 -+ andi t3, t0, 4 -+ andi t4, t0, 2 -+ andi t5, t0, 1 -+ -+ beqz t2, L(end_4) -+ vstelm.d $vr0, a2, 0, 0 -+ addi.d a2, a2, 8 -+ vbsrl.v $vr0, $vr0, 8 -+ -+L(end_4): -+ beqz t3, L(end_2) -+ vstelm.w $vr0, a2, 0, 0 -+ addi.d a2, a2, 4 -+ vbsrl.v $vr0, $vr0, 4 -+ -+ -+L(end_2): -+ beqz t4, L(end_1) -+ vstelm.h $vr0, a2, 0, 0 -+ addi.d a2, a2, 2 -+ vbsrl.v $vr0, $vr0, 2 -+ -+L(end_1): -+ beqz t5, L(out) -+ vstelm.b $vr0, a2, 0, 0 -+L(out): -+ 
jr ra -+L(unaligned): -+ bstrins.d a1, zero, 3, 0 -+ -+ vld $vr2, a1, 0 -+ vreplgr2vr.b $vr3, a3 -+ vslt.b $vr4, $vr1, $vr3 -+ vor.v $vr0, $vr2, $vr4 -+ -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(un_first_end) -+ vld $vr0, a1, 16 -+ vadd.b $vr3, $vr3, $vr1 -+ -+ -+ addi.d a1, a1, 16 -+ vshuf.b $vr4, $vr0, $vr2, $vr3 -+ vsetanyeqz.b $fcc0, $vr0 -+ bcnez $fcc0, L(un_end) -+ -+L(un_loop): -+ vor.v $vr2, $vr0, $vr0 -+ vld $vr0, a1, 16 -+ vst $vr4, a2, 0 -+ addi.d a1, a1, 16 -+ -+ addi.d a2, a2, 16 -+ vshuf.b $vr4, $vr0, $vr2, $vr3 -+ vsetanyeqz.b $fcc0, $vr0 -+ bceqz $fcc0, L(un_loop) -+ -+L(un_end): -+ vsetanyeqz.b $fcc0, $vr4 -+ bcnez $fcc0, 1f -+ vst $vr4, a2, 0 -+1: -+ vmsknz.b $vr1, $vr0 -+ -+ -+ movfr2gr.s t0, $f1 -+ cto.w t0, t0 -+ add.d a1, a1, t0 -+ vld $vr0, a1, -15 -+ -+ add.d a2, a2, t0 -+ sub.d a2, a2, a3 -+ vst $vr0, a2, 1 -+ jr ra -+ -+L(un_first_end): -+ addi.d a2, a2, -16 -+ b 1b -+END(STRCPY) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCPY) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -new file mode 100644 -index 00000000..449733cb ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S -@@ -0,0 +1,199 @@ -+/* Copyright 2016 Loongson Technology Corporation Limited */ -+ -+/* Author: Huang Pei huangpei@loongson.cn. -+ * ISA: MIPS64R2 -+ * ABI: N64 -+ * basic algorithm : -+ +. if src aligned. just do the copy loop. if not, do the cross page check and copy one double word. -+ Then move src to aligned. -+ +. if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has -+ one byte is \0, else has no \0 -+*/ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRCPY __strcpy_unaligned -+ -+#define REP8_01 0x0101010101010101 -+#define REP8_7f 0x7f7f7f7f7f7f7f7f -+#define REP8_80 0x8080808080808080 -+ -+/* Parameters and Results */ -+#define dest a0 -+#define src a1 -+#define result v0 -+// Note: v0 = a0 in N64 ABI -+ -+ -+/* Internal variable */ -+#define data t0 -+#define data1 t1 -+#define has_nul t2 -+#define diff t3 -+#define syndrome t4 -+#define zeroones t5 -+#define sevenf t6 -+#define pos t7 -+#define dest_backup t8 -+#define tmp1 a4 -+#define tmp2 a5 -+#define tmp3 a6 -+#define dest_off a2 -+#define src_off a3 -+#define tmp4 a7 -+ -+/* rd <- if rc then ra else rb -+ will destroy tmp3 -+*/ -+#define CONDITIONSEL(rd,rc,ra,rb)\ -+ masknez tmp3, rb, rc;\ -+ maskeqz rd, ra, rc;\ -+ or rd, rd, tmp3 -+ -+/* int strcpy (const char *s1, const char *s2); */ -+ -+LEAF(STRCPY) -+ .align 4 -+ move dest_backup, dest -+ lu12i.w zeroones, 0x01010 -+ lu12i.w sevenf, 0x7f7f7 -+ ori zeroones, zeroones, 0x101 -+ ori sevenf, sevenf, 0xf7f -+ bstrins.d zeroones, zeroones, 63, 32 -+ bstrins.d sevenf, sevenf, 63, 32 -+ andi src_off, src, 0x7 -+ beqz src_off, strcpy_loop_aligned_1 -+ b strcpy_mutual_align -+strcpy_loop_aligned: -+ st.d data, dest, 0 -+ addi.d dest, dest, 8 -+strcpy_loop_aligned_1: -+ ld.d data, src, 0 -+ addi.d src, src, 8 -+strcpy_start_realigned: -+ sub.d tmp1, data, zeroones -+ or tmp2, data, sevenf -+ andn has_nul, tmp1, tmp2 -+ beqz has_nul, strcpy_loop_aligned -+ -+strcpy_end: -+ ctz.d pos, has_nul -+ srli.d pos, pos, 3 -+ addi.d pos, pos, 1 -+/* Do 8/4/2/1 strcpy based on pos value. -+ pos value is the number of bytes to be copied -+ the bytes include the final \0 so the max length is 8 and the min length is 1. 
-+ */ -+ -+strcpy_end_8: -+ andi tmp1, pos, 0x8 -+ beqz tmp1, strcpy_end_4 -+ st.d data, dest, 0 -+ move dest, dest_backup -+ jr ra -+strcpy_end_4: -+ andi tmp1, pos, 0x4 -+ beqz tmp1, strcpy_end_2 -+ st.w data, dest, 0 -+ srli.d data, data, 32 -+ addi.d dest, dest, 4 -+strcpy_end_2: -+ andi tmp1, pos, 0x2 -+ beqz tmp1, strcpy_end_1 -+ st.h data, dest, 0 -+ srli.d data, data, 16 -+ addi.d dest, dest, 2 -+strcpy_end_1: -+ andi tmp1, pos, 0x1 -+ beqz tmp1, strcpy_end_ret -+ st.b data, dest, 0 -+strcpy_end_ret: -+ move result, dest_backup -+ jr ra -+ -+ -+strcpy_mutual_align: -+/* Check if around src page bound. -+ if not go to page cross ok. -+ if it is, do further check. -+ use tmp2 to accelerate. */ -+ -+ li.w tmp2, 0xff8 -+ andi tmp1, src, 0xff8 -+ beq tmp1, tmp2, strcpy_page_cross -+ -+strcpy_page_cross_ok: -+/* -+ Load a misaligned double word and check if has \0 -+ If no, do a misaligned double word paste. -+ If yes, calculate the number of avaliable bytes, -+ then jump to 4/2/1 end. -+*/ -+ ld.d data, src, 0 -+ sub.d tmp1, data, zeroones -+ or tmp2, data, sevenf -+ andn has_nul, tmp1, tmp2 -+ bnez has_nul, strcpy_end -+strcpy_mutual_align_finish: -+/* -+ Before jump back to align loop, make dest/src aligned. -+ This will cause a duplicated paste for several bytes between -+ the first double word and the second double word, -+ but should not bring a problem. -+*/ -+ li.w tmp1, 8 -+ st.d data, dest, 0 -+ sub.d tmp1, tmp1, src_off -+ add.d src, src, tmp1 -+ add.d dest, dest, tmp1 -+ -+ b strcpy_loop_aligned_1 -+ -+strcpy_page_cross: -+/* -+ ld.d from aligned address(src & ~0x7). -+ check if high bytes have \0. -+ it not, go back to page cross ok, -+ since the string is supposed to cross the page bound in such situation. -+ if it is, do a srl for data to make it seems like a direct double word from src, -+ then go to 4/2/1 strcpy end. -+ -+ tmp4 is 0xffff...ffff mask -+ tmp2 demonstrate the bytes to be masked -+ tmp2 = src_off << 3 -+ data = data >> (src_off * 8) | -1 << (64 - src_off * 8) -+ and -+ -1 << (64 - src_off * 8) -> ~(-1 >> (src_off * 8)) -+*/ -+ -+ li.w tmp1, 0x7 -+ andn tmp3, src, tmp1 -+ ld.d data, tmp3, 0 -+ li.w tmp4, -1 -+ slli.d tmp2, src_off, 3 -+ srl.d tmp4, tmp4, tmp2 -+ srl.d data, data, tmp2 -+ nor tmp4, tmp4, zero -+ or data, data, tmp4 -+ sub.d tmp1, data, zeroones -+ or tmp2, data, sevenf -+ andn has_nul, tmp1, tmp2 -+ beqz has_nul, strcpy_page_cross_ok -+ b strcpy_end -+END(STRCPY) -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCPY) -+#endif -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy.c b/sysdeps/loongarch/lp64/multiarch/strcpy.c -new file mode 100644 -index 00000000..48fecf66 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strcpy.c -@@ -0,0 +1,36 @@ -+/* Multiple versions of strcpy. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2023 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define strcpy __redirect_strcpy -+# include -+# undef strcpy -+ -+# define SYMBOL_NAME strcpy -+# include "ifunc-lsx.h" -+ -+libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (strcpy, __GI_strcpy, __redirect_strcpy) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+#endif -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S -new file mode 100644 -index 00000000..d31875fd ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S -@@ -0,0 +1,8 @@ -+ -+#if IS_IN (libc) -+ -+#define STRLEN __strlen_aligned -+ -+#endif -+ -+#include "../strlen.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -new file mode 100644 -index 00000000..cb276aa0 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S -@@ -0,0 +1,55 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRLEN __strlen_lasx -+ -+/* size_t strlen(const char *s1); */ -+ -+LEAF(STRLEN) -+ .align 6 -+ move a1, a0 -+ bstrins.d a0, zero, 4, 0 -+ li.d t1, -1 -+ xvld $xr0, a0, 0 -+ -+ xvmsknz.b $xr0, $xr0 -+ xvpickve.w $xr1, $xr0, 4 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 # sign extend -+ -+ sra.w t0, t0, a1 -+ beq t0, t1, L(loop) -+ cto.w a0, t0 -+ jr ra -+ -+L(loop): -+ xvld $xr0, a0, 32 -+ addi.d a0, a0, 32 -+ xvsetanyeqz.b $fcc0, $xr0 -+ bceqz $fcc0, L(loop) -+ -+ -+ xvmsknz.b $xr0, $xr0 -+ sub.d a0, a0, a1 -+ xvpickve.w $xr1, $xr0, 4 -+ vilvl.h $vr0, $vr1, $vr0 -+ -+ movfr2gr.s t0, $f0 -+ cto.w t0, t0 -+ add.d a0, a0, t0 -+ jr ra -+END(STRLEN) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRLEN) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -new file mode 100644 -index 00000000..6edcac8c ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S -@@ -0,0 +1,63 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRLEN __strlen_lsx -+ -+/* size_t strlen(const char *s1); */ -+ -+LEAF(STRLEN) -+ .align 6 -+ move a1, a0 -+ bstrins.d a0, zero, 4, 0 -+ vld $vr0, a0, 0 -+ vld $vr1, a0, 16 -+ -+ li.d t1, -1 -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ -+ movfr2gr.s t0, $f0 -+ sra.w t0, t0, a1 -+ beq t0, t1, L(loop) -+ cto.w a0, t0 -+ -+ jr ra -+ nop -+ nop -+ nop -+ -+ -+L(loop): -+ vld $vr0, a0, 32 -+ vld $vr1, a0, 48 -+ addi.d a0, a0, 32 -+ vmin.bu $vr2, $vr0, $vr1 -+ -+ vsetanyeqz.b $fcc0, $vr2 -+ bceqz $fcc0, L(loop) -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ -+ vilvl.h $vr0, $vr1, $vr0 -+ sub.d a0, a0, a1 -+ movfr2gr.s t0, $f0 -+ cto.w t0, t0 -+ -+ add.d a0, a0, t0 -+ jr ra -+END(STRLEN) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRLEN) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -new file mode 100644 -index 00000000..e9b7cf67 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S -@@ -0,0 +1,116 @@ -+/* Copyright 2016 Loongson Technology Corporation Limited. */ -+ -+/* Author: Songyuekun songyuekun@loongson.cn. 
*/ -+ -+/* algorithm: -+ #. use ld/ldr to access word/partial word in the string -+ #. use (x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) != 0 to -+ judge if x has zero byte -+ #. use dctz((x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) >> 3 -+ to get the index of first rightmost zero byte in dword x; -+ #. use dctz(x) = 64 - dclz(~x & (x-1)); -+ #. use pointer to the last non zero byte minus pointer to the start -+ of the string to get the length of string. */ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define L_ADDIU addi.d -+#define L_ADDU add.d -+#define L_SUBU sub.d -+ -+#define STRLEN __strlen_unaligned -+ -+/* size_t strlen (const char *s1); */ -+ -+LEAF(STRLEN) -+ .align 5 -+ nor t4, zero, zero -+ lu12i.w a2, 0x01010 -+ andi t5, a0, 0x7 -+ -+ li.w t7, 0x7 -+ slli.d t6, t5, 0x3 -+ andn t7, a0, t7 -+ ld.d a1, t7, 0 -+ sub.d t7, zero, t6 -+ sll.d t4, t4, t7 -+ maskeqz t4, t4, t6 -+ srl.d a1, a1, t6 -+ or a1, a1, t4 -+ -+ ori a2, a2, 0x101 -+ nor t1, a1, zero -+ li.w a4, 8 -+ -+ bstrins.d a2, a2, 63, 32 -+ sub.d a5, a4, t5 -+ move t5, a0 -+ -+ sub.d t0, a1, a2 -+ slli.d t4, a2, 7 -+ nor a3, zero, t4 -+ nor t1, a1, a3 -+ -+ and t0, t0, t1 -+ bnez t0, strlen_count1 /* instead of use bnel with daddu a0, a0, a5 in branch slot */ -+ L_ADDU a0, a0, a5 -+strlen_loop: -+ ld.d a1, a0, 0 -+ sub.d t0, a1, a2 -+ and t1, t0, t4 -+ bnez t1, strlen_count_pre -+ ld.d a1, a0, 8 -+ sub.d t0, a1, a2 -+ and t1, t0, t4 -+ L_ADDIU a0, a0, 16 -+ beqz t1, strlen_loop -+strlen_count: -+ addi.d a0, a0, -8 -+strlen_count_pre: -+ nor t1, a1, a3 -+ and t0, t0, t1 -+ beqz t0, strlen_noascii_start -+strlen_count1: -+ ctz.d t1, t0 -+ L_SUBU v0, a0, t5 -+ srli.w t1, t1, 3 -+ L_ADDU v0, v0, t1 -+ jr ra -+strlen_noascii_start: -+ addi.d a0, a0, 8 -+strlen_loop_noascii: -+ ld.d a1, a0, 0 -+ sub.d t0, a1, a2 -+ nor t1, a1, a3 -+ and t0, t0, t1 -+ bnez t0, strlen_count1 -+ ld.d a1, a0, 8 -+ sub.d t0, a1, a2 -+ nor t1, a1, a3 -+ and t0, t0, t1 -+ L_ADDIU a0, a0, 16 -+ beqz t0, strlen_loop_noascii -+ addi.d a0, a0, -8 -+ ctz.d t1, t0 -+ L_SUBU v0, a0, t5 -+ srli.w t1, t1, 3 -+ L_ADDU v0, v0, t1 -+ jr ra -+END(STRLEN) -+ -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (STRLEN) -+#endif -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strlen.c b/sysdeps/loongarch/lp64/multiarch/strlen.c -new file mode 100644 -index 00000000..e8454404 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strlen.c -@@ -0,0 +1,39 @@ -+/* Multiple versions of strlen. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. 
*/ -+#if IS_IN (libc) -+# define strlen __redirect_strlen -+# include -+# undef strlen -+ -+# define SYMBOL_NAME strlen -+# include "ifunc-lasx.h" -+ -+libc_ifunc_redirected (__redirect_strlen, __new_strlen, -+ IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (__new_strlen, __GI_strlen, __redirect_strlen) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_strlen, strlen, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S -new file mode 100644 -index 00000000..f371b19e ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S -@@ -0,0 +1,8 @@ -+ -+#if IS_IN (libc) -+ -+#define STRNCMP __strncmp_aligned -+ -+#endif -+ -+#include "../strncmp.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -new file mode 100644 -index 00000000..3399bf77 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S -@@ -0,0 +1,197 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRNCMP __strncmp_lsx -+ -+/* int strncmp (const char *s1, const char *s2); */ -+ -+L(magic_num): -+ .align 6 -+ .dword 0x0706050403020100 -+ .dword 0x0f0e0d0c0b0a0908 -+ENTRY_NO_ALIGN(STRNCMP) -+ beqz a2, L(ret0) -+ pcaddi t0, -5 -+ andi a3, a0, 0xf -+ vld $vr2, t0, 0 -+ -+ andi a4, a1, 0xf -+ li.d t2, 16 -+ bne a3, a4, L(unaligned) -+ xor t0, a0, a3 -+ -+ xor t1, a1, a4 -+ vld $vr0, t0, 0 -+ vld $vr1, t1, 0 -+ vreplgr2vr.b $vr3, a3 -+ -+ -+ sub.d t2, t2, a3 -+ vadd.b $vr3, $vr3, $vr2 -+ vshuf.b $vr0, $vr3, $vr0, $vr3 -+ vshuf.b $vr1, $vr3, $vr1, $vr3 -+ -+ vseq.b $vr3, $vr0, $vr1 -+ vmin.bu $vr3, $vr0, $vr3 -+ bgeu t2, a2, L(al_early_end) -+ vsetanyeqz.b $fcc0, $vr3 -+ -+ bcnez $fcc0, L(al_end) -+ add.d a3, a0, a2 -+ addi.d a4, a3, -1 -+ bstrins.d a4, zero, 3, 0 -+ -+ sub.d a2, a3, a4 -+L(al_loop): -+ vld $vr0, t0, 16 -+ vld $vr1, t1, 16 -+ addi.d t0, t0, 16 -+ -+ -+ addi.d t1, t1, 16 -+ vseq.b $vr3, $vr0, $vr1 -+ vmin.bu $vr3, $vr0, $vr3 -+ beq t0, a4, L(al_early_end) -+ -+ vsetanyeqz.b $fcc0, $vr3 -+ bceqz $fcc0, L(al_loop) -+L(al_end): -+ vseqi.b $vr3, $vr3, 0 -+ vfrstpi.b $vr3, $vr3, 0 -+ -+ vshuf.b $vr0, $vr0, $vr0, $vr3 -+ vshuf.b $vr1, $vr1, $vr1, $vr3 -+ vpickve2gr.bu t0, $vr0, 0 -+ vpickve2gr.bu t1, $vr1, 0 -+ -+ sub.d a0, t0, t1 -+ jr ra -+L(al_early_end): -+ vreplgr2vr.b $vr4, a2 -+ vslt.b $vr4, $vr2, $vr4 -+ -+ -+ vorn.v $vr3, $vr3, $vr4 -+ b L(al_end) -+L(unaligned): -+ slt a5, a3, a4 -+ xor t0, a0, a1 -+ -+ maskeqz t0, t0, a5 -+ xor a0, a0, t0 # a0 hold the larger one -+ xor a1, a1, t0 # a1 hold the small one -+ andi a3, a0, 0xf -+ -+ andi a4, a1, 0xf -+ xor t0, a0, a3 -+ xor t1, a1, a4 -+ vld $vr0, t0, 0 -+ -+ vld $vr3, t1, 0 -+ sub.d t2, t2, a3 -+ vreplgr2vr.b $vr4, a3 -+ vreplgr2vr.b $vr5, a4 -+ -+ -+ vaddi.bu $vr6, $vr2, 16 -+ vsub.b $vr7, $vr4, $vr5 -+ vsub.b $vr6, $vr6, $vr7 -+ vadd.b $vr4, $vr2, $vr4 -+ -+ vshuf.b $vr1, $vr3, $vr3, $vr6 -+ vshuf.b $vr0, $vr7, $vr0, $vr4 -+ vshuf.b $vr1, $vr7, $vr1, $vr4 -+ vseq.b $vr4, $vr0, $vr1 -+ -+ vmin.bu $vr4, $vr0, $vr4 -+ bgeu t2, a2, L(un_early_end) -+ vsetanyeqz.b $fcc0, $vr4 -+ bcnez $fcc0, L(un_end) -+ -+ add.d a6, a0, a2 -+ vslt.b $vr5, $vr2, $vr5 -+ addi.d a7, a6, -1 -+ vor.v $vr3, $vr3, $vr5 -+ -+ -+ bstrins.d a7, zero, 3, 0 -+ sub.d a2, a6, a7 -+L(un_loop): -+ vld $vr0, t0, 16 -+ addi.d t0, t0, 16 -+ -+ vsetanyeqz.b $fcc0, $vr3 -+ bcnez $fcc0, L(has_zero) -+ beq 
t0, a7, L(end_with_len) -+ vor.v $vr1, $vr3, $vr3 -+ -+ vld $vr3, t1, 16 -+ addi.d t1, t1, 16 -+ vshuf.b $vr1, $vr3, $vr1, $vr6 -+ vseq.b $vr4, $vr0, $vr1 -+ -+ vmin.bu $vr4, $vr0, $vr4 -+ vsetanyeqz.b $fcc0, $vr4 -+ bceqz $fcc0, L(un_loop) -+L(un_end): -+ vseqi.b $vr4, $vr4, 0 -+ -+ -+ vfrstpi.b $vr4, $vr4, 0 -+ vshuf.b $vr0, $vr0, $vr0, $vr4 -+ vshuf.b $vr1, $vr1, $vr1, $vr4 -+ vpickve2gr.bu t0, $vr0, 0 -+ -+ vpickve2gr.bu t1, $vr1, 0 -+ sub.d t2, t0, t1 -+ sub.d t3, t1, t0 -+ masknez t0, t2, a5 -+ -+ maskeqz t1, t3, a5 -+ or a0, t0, t1 -+ jr ra -+L(has_zero): -+ vshuf.b $vr1, $vr3, $vr3, $vr6 -+ -+ vseq.b $vr4, $vr0, $vr1 -+ vmin.bu $vr4, $vr0, $vr4 -+ bne t0, a7, L(un_end) -+L(un_early_end): -+ vreplgr2vr.b $vr5, a2 -+ -+ vslt.b $vr5, $vr2, $vr5 -+ vorn.v $vr4, $vr4, $vr5 -+ b L(un_end) -+L(end_with_len): -+ sub.d a6, a3, a4 -+ -+ bgeu a6, a2, 1f -+ vld $vr4, t1, 16 -+1: -+ vshuf.b $vr1, $vr4, $vr3, $vr6 -+ vseq.b $vr4, $vr0, $vr1 -+ -+ vmin.bu $vr4, $vr0, $vr4 -+ vreplgr2vr.b $vr5, a2 -+ vslt.b $vr5, $vr2, $vr5 -+ vorn.v $vr4, $vr4, $vr5 -+ -+ b L(un_end) -+L(ret0): -+ move a0, zero -+ jr ra -+END(STRNCMP) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRNCMP) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -new file mode 100644 -index 00000000..558df29b ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S -@@ -0,0 +1,257 @@ -+/* Copyright 2016 Loongson Technology Corporation Limited. */ -+ -+/* Author: songyuekun songyuekun@loongson.cn. -+ * ISA: MIPS64R2 -+ * ABI: N64 -+ * basic algorithm : -+ +. let t0, t1 point to a0, a1, if a0 has smaller low 3 bit of a0 and a1, -+ set a4 to 1 and let t0 point to the larger of lower 3bit of a0 and a1 -+ +. if low 3 bit of a0 equal low 3 bit of a0, use a ldr one time and more ld other times; -+ +. if not, load partial t2 and t3, check if t2 has \0; -+ +. then use use ld for t0, ldr for t1, -+ +. if partial 8 byte from t1 has \0, compare partial 8 byte from t1 with 8 -+ byte from t0 with a mask in a7 -+ +. if not, ldl other part of t1, compare 8 byte from t1 with 8 byte from t0 -+ +. if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has -+ one byte is \0, else has no \0 -+ +. 
for partial 8 byte from ldr t3, 0(a0), preload t3 with 0xffffffffffffffff -+*/ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRNCMP __strncmp_unaligned -+ -+#define REP8_01 0x0101010101010101 -+#define REP8_7f 0x7f7f7f7f7f7f7f7f -+#define REP8_80 0x8080808080808080 -+ -+/* Parameters and Results */ -+#define src1 a0 -+#define src2 a1 -+#define limit a2 -+#define result v0 -+// Note: v0 = a0 in N64 ABI -+ -+ -+/* Internal variable */ -+#define data1 t0 -+#define data2 t1 -+#define has_nul t2 -+#define diff t3 -+#define syndrome t4 -+#define zeroones t5 -+#define sevenf t6 -+#define pos t7 -+#define exchange t8 -+#define tmp1 a5 -+#define tmp2 a6 -+#define tmp3 a7 -+#define src1_off a3 -+#define limit_wd a4 -+ -+/* int strncmp (const char *s1, const char *s2); */ -+ -+LEAF(STRNCMP) -+ .align 4 -+ beqz limit, strncmp_ret0 -+ -+ xor tmp1, src1, src2 -+ lu12i.w zeroones, 0x01010 -+ lu12i.w sevenf, 0x7f7f7 -+ andi src1_off, src1, 0x7 -+ ori zeroones, zeroones, 0x101 -+ andi tmp1, tmp1, 0x7 -+ ori sevenf, sevenf, 0xf7f -+ bstrins.d zeroones, zeroones, 63, 32 -+ bstrins.d sevenf, sevenf, 63, 32 -+ bnez tmp1, strncmp_misaligned8 -+ bnez src1_off, strncmp_mutual_align -+ -+ addi.d limit_wd, limit, -1 -+ srli.d limit_wd, limit_wd, 3 -+ -+strncmp_loop_aligned: -+ ld.d data1, src1, 0 -+ addi.d src1, src1, 8 -+ ld.d data2, src2, 0 -+ addi.d src2, src2, 8 -+ -+strncmp_start_realigned: -+ addi.d limit_wd, limit_wd, -1 -+ sub.d tmp1, data1, zeroones -+ or tmp2, data1, sevenf -+ xor diff, data1, data2 -+ andn has_nul, tmp1, tmp2 -+ srli.d tmp1, limit_wd, 63 -+ or syndrome, diff, has_nul -+ or tmp2, syndrome, tmp1 -+ beqz tmp2, strncmp_loop_aligned -+ -+ /* if not reach limit. */ -+ bge limit_wd, zero, strncmp_not_limit -+ -+ /* if reach limit. 
*/ -+ andi limit, limit, 0x7 -+ li.w tmp1, 0x8 -+ sub.d limit, tmp1, limit -+ slli.d limit, limit, 0x3 -+ li.d tmp1, -1 -+ srl.d tmp1, tmp1, limit -+ and data1, data1, tmp1 -+ and data2, data2, tmp1 -+ orn syndrome, syndrome, tmp1 -+ -+ -+strncmp_not_limit: -+ ctz.d pos, syndrome -+ bstrins.d pos, zero, 2, 0 -+ srl.d data1, data1, pos -+ srl.d data2, data2, pos -+ andi data1, data1, 0xff -+ andi data2, data2, 0xff -+ sub.d result, data1, data2 -+ jr ra -+ -+strncmp_mutual_align: -+ bstrins.d src1, zero, 2, 0 -+ bstrins.d src2, zero, 2, 0 -+ slli.d tmp1, src1_off, 0x3 -+ ld.d data1, src1, 0 -+ ld.d data2, src2, 0 -+ addi.d src2, src2, 8 -+ addi.d src1, src1, 8 -+ -+ addi.d limit_wd, limit, -1 -+ andi tmp3, limit_wd, 0x7 -+ srli.d limit_wd, limit_wd, 3 -+ add.d limit, limit, src1_off -+ add.d tmp3, tmp3, src1_off -+ srli.d tmp3, tmp3, 3 -+ add.d limit_wd, limit_wd, tmp3 -+ -+ sub.d tmp1, zero, tmp1 -+ nor tmp2, zero, zero -+ srl.d tmp2, tmp2, tmp1 -+ or data1, data1, tmp2 -+ or data2, data2, tmp2 -+ b strncmp_start_realigned -+ -+strncmp_misaligned8: -+ -+ li.w tmp1, 0x10 -+ bge limit, tmp1, strncmp_try_words -+strncmp_byte_loop: -+ ld.bu data1, src1, 0 -+ ld.bu data2, src2, 0 -+ addi.d limit, limit, -1 -+ xor tmp1, data1, data2 -+ masknez tmp1, data1, tmp1 -+ maskeqz tmp1, limit, tmp1 -+ beqz tmp1, strncmp_done -+ -+ ld.bu data1, src1, 1 -+ ld.bu data2, src2, 1 -+ addi.d src1, src1, 2 -+ addi.d src2, src2, 2 -+ addi.d limit, limit, -1 -+ xor tmp1, data1, data2 -+ masknez tmp1, data1, tmp1 -+ maskeqz tmp1, limit, tmp1 -+ bnez tmp1, strncmp_byte_loop -+ -+ -+strncmp_done: -+ sub.d result, data1, data2 -+ jr ra -+ -+strncmp_try_words: -+ srli.d limit_wd, limit, 3 -+ beqz src1_off, strncmp_do_misaligned -+ -+ sub.d src1_off, zero, src1_off -+ andi src1_off, src1_off, 0x7 -+ sub.d limit, limit, src1_off -+ srli.d limit_wd, limit, 0x3 -+ -+ -+strncmp_page_end_loop: -+ ld.bu data1, src1, 0 -+ ld.bu data2, src2, 0 -+ addi.d src1, src1, 1 -+ addi.d src2, src2, 1 -+ xor tmp1, data1, data2 -+ masknez tmp1, data1, tmp1 -+ beqz tmp1, strncmp_done -+ andi tmp1, src1, 0x7 -+ bnez tmp1, strncmp_page_end_loop -+strncmp_do_misaligned: -+ li.w src1_off, 0x8 -+ addi.d limit_wd, limit_wd, -1 -+ blt limit_wd, zero, strncmp_done_loop -+ -+strncmp_loop_misaligned: -+ andi tmp2, src2, 0xff8 -+ xori tmp2, tmp2, 0xff8 -+ beqz tmp2, strncmp_page_end_loop -+ -+ ld.d data1, src1, 0 -+ ld.d data2, src2, 0 -+ addi.d src1, src1, 8 -+ addi.d src2, src2, 8 -+ sub.d tmp1, data1, zeroones -+ or tmp2, data1, sevenf -+ xor diff, data1, data2 -+ andn has_nul, tmp1, tmp2 -+ or syndrome, diff, has_nul -+ bnez syndrome, strncmp_not_limit -+ addi.d limit_wd, limit_wd, -1 -+ bge limit_wd, zero, strncmp_loop_misaligned -+ -+strncmp_done_loop: -+ andi limit, limit, 0x7 -+ beqz limit, strncmp_not_limit -+ -+ /* Read the last double word */ -+ /* check if the final part is about to exceed the page */ -+ andi tmp1, src2, 0x7 -+ andi tmp2, src2, 0xff8 -+ add.d tmp1, tmp1, limit -+ xori tmp2, tmp2, 0xff8 -+ andi tmp1, tmp1, 0x8 -+ masknez tmp1, tmp1, tmp2 -+ bnez tmp1, strncmp_byte_loop -+ addi.d src1, src1, -8 -+ addi.d src2, src2, -8 -+ ldx.d data1, src1, limit -+ ldx.d data2, src2, limit -+ sub.d tmp1, data1, zeroones -+ or tmp2, data1, sevenf -+ xor diff, data1, data2 -+ andn has_nul, tmp1, tmp2 -+ or syndrome, diff, has_nul -+ bnez syndrome, strncmp_not_limit -+ -+strncmp_ret0: -+ move result, zero -+ jr ra -+ -+/* check if ((src1 != 0) && ((src2 == 0 ) || (src1 < src2))) -+ then exchange(src1,src2). 
*/ -+ -+END(STRNCMP) -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (STRNCMP) -+#endif -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp.c b/sysdeps/loongarch/lp64/multiarch/strncmp.c -new file mode 100644 -index 00000000..80ab8c8c ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strncmp.c -@@ -0,0 +1,35 @@ -+/* Multiple versions of strncmp. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define strncmp __redirect_strncmp -+# include -+# undef strncmp -+ -+# define SYMBOL_NAME strncmp -+# include "ifunc-lsx.h" -+ -+libc_ifunc_redirected (__redirect_strncmp, strncmp, IFUNC_SELECTOR ()); -+ -+# ifdef SHARED -+__hidden_ver1 (strncmp, __GI_strncmp, __redirect_strncmp) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S -new file mode 100644 -index 00000000..503442b3 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S -@@ -0,0 +1,8 @@ -+ -+#if IS_IN (libc) -+ -+#define STRNLEN __strnlen_aligned -+ -+#endif -+ -+#include "../strnlen.S" -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -new file mode 100644 -index 00000000..8c30f10c ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S -@@ -0,0 +1,92 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRNLEN __strnlen_lasx -+ -+/* size_t strnlen (const char *s1, size_t maxlen); */ -+ -+LEAF(STRNLEN) -+ .align 6 -+ beqz a1, L(ret0) -+ andi t1, a0, 0x3f -+ li.d t3, 65 -+ sub.d a2, a0, t1 -+ -+ xvld $xr0, a2, 0 -+ xvld $xr1, a2, 32 -+ sub.d t1, t3, t1 -+ move a3, a0 -+ -+ sltu t1, a1, t1 -+ xvmsknz.b $xr0, $xr0 -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr2, $xr0, 4 -+ -+ xvpickve.w $xr3, $xr1, 4 -+ vilvl.h $vr0, $vr2, $vr0 -+ vilvl.h $vr1, $vr3, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ -+ -+ movfr2gr.d t0, $f0 -+ sra.d t0, t0, a0 -+ orn t1, t1, t0 -+ bnez t1, L(end) -+ -+ add.d a4, a0, a1 -+ move a0, a2 -+ addi.d a4, a4, -1 -+ bstrins.d a4, zero, 5, 0 -+ -+L(loop): -+ xvld $xr0, a0, 64 -+ xvld $xr1, a0, 96 -+ addi.d a0, a0, 64 -+ beq a0, a4, L(out) -+ -+ xvmin.bu $xr2, $xr0, $xr1 -+ xvsetanyeqz.b $fcc0, $xr2 -+ bceqz $fcc0, L(loop) -+L(out): -+ xvmsknz.b $xr0, $xr0 -+ -+ -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr2, $xr0, 4 -+ xvpickve.w $xr3, $xr1, 4 -+ vilvl.h $vr0, $vr2, $vr0 -+ -+ vilvl.h $vr1, $vr3, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+L(end): -+ sub.d a0, a0, a3 -+ -+ cto.d t0, t0 -+ add.d a0, a0, t0 -+ sltu t1, a0, a1 -+ masknez t0, a1, t1 -+ 
-+ maskeqz t1, a0, t1 -+ or a0, t0, t1 -+ jr ra -+L(ret0): -+ move a0, zero -+ -+ -+ jr ra -+END(STRNLEN) -+ -+#ifdef _LIBC -+libc_hidden_def (STRNLEN) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -new file mode 100644 -index 00000000..388c239a ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S -@@ -0,0 +1,81 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRNLEN __strnlen_lsx -+ -+/* size_t strnlen (const char *s1, size_t maxlen); */ -+ -+LEAF(STRNLEN) -+ .align 6 -+ beqz a1, L(ret0) -+ andi t1, a0, 0x1f -+ li.d t3, 33 -+ sub.d a2, a0, t1 -+ -+ vld $vr0, a2, 0 -+ vld $vr1, a2, 16 -+ sub.d t1, t3, t1 -+ move a3, a0 -+ -+ sltu t1, a1, t1 -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ -+ movfr2gr.s t0, $f0 -+ sra.w t0, t0, a0 -+ orn t1, t1, t0 -+ bnez t1, L(end) -+ -+ -+ add.d a4, a0, a1 -+ move a0, a2 -+ addi.d a4, a4, -1 -+ bstrins.d a4, zero, 4, 0 -+ -+L(loop): -+ vld $vr0, a0, 32 -+ vld $vr1, a0, 48 -+ addi.d a0, a0, 32 -+ beq a0, a4, L(out) -+ -+ vmin.bu $vr2, $vr0, $vr1 -+ vsetanyeqz.b $fcc0, $vr2 -+ bceqz $fcc0, L(loop) -+L(out): -+ vmsknz.b $vr0, $vr0 -+ -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+L(end): -+ sub.d a0, a0, a3 -+ -+ -+ cto.w t0, t0 -+ add.d a0, a0, t0 -+ sltu t1, a0, a1 -+ masknez t0, a1, t1 -+ -+ maskeqz t1, a0, t1 -+ or a0, t0, t1 -+ jr ra -+L(ret0): -+ move a0, zero -+ -+ jr ra -+END(STRNLEN) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRNLEN) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -new file mode 100644 -index 00000000..60eccf00 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S -@@ -0,0 +1,145 @@ -+/* Copyright 2016 Loongson Technology Corporation Limited. */ -+ -+/* Author: Songyuekun songyuekun@loongson.cn -+ * ISA: MIPS64R2 -+ * ABI: N64. -+ * algorithm: -+ #. use ld/ldr to access word/partial word in the string -+ #. use (x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) != 0 to -+ judge if x has zero byte -+ #. use dctz((x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) >> 3 -+ to get the index of first rightmost zero byte in dword x; -+ #. use dctz(x) = 64 - dclz(~x & (x-1)); -+ #. use pointer to the last non zero byte minus pointer to the start -+ of the string to get the length of string. */ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define L_ADDIU addi.d -+#define L_ADDU add.d -+#define L_SUBU sub.d -+ -+#define STRNLEN __strnlen_unaligned -+ -+/* rd <- if rc then ra else rb -+ will destroy t6. 
*/ -+ -+#define CONDITIONSEL(rd,ra,rb,rc)\ -+ masknez a5, rb, rc;\ -+ maskeqz rd, ra, rc;\ -+ or rd, rd, a5 -+ -+/* Parameters and Results */ -+#define srcin a0 -+#define limit a1 -+#define len v0 -+ -+/* Internal variable */ -+#define data1 t0 -+#define data2 t1 -+#define has_nul1 t2 -+#define has_nul2 t3 -+#define src t4 -+#define zeroones t5 -+#define sevenf t6 -+#define data2a t7 -+#define tmp6 t7 -+#define pos t8 -+#define tmp1 a2 -+#define tmp2 a3 -+#define tmp3 a4 -+#define tmp4 a5 -+#define tmp5 a6 -+#define limit_wd a7 -+ -+/* size_t strnlen (const char *s1,size_t maxlen); */ -+ -+LEAF(STRNLEN) -+ -+ .align 4 -+ beqz limit, L(_hit_limit) -+ lu12i.w zeroones, 0x01010 -+ lu12i.w sevenf, 0x7f7f7 -+ ori zeroones, zeroones, 0x101 -+ ori sevenf, sevenf, 0xf7f -+ bstrins.d zeroones, zeroones, 63, 32 -+ bstrins.d sevenf, sevenf, 63, 32 -+ andi tmp1, srcin, 15 -+ sub.d src, srcin, tmp1 -+ bnez tmp1, L(misaligned) -+ addi.d limit_wd, limit, -1 -+ srli.d limit_wd, limit_wd, 4 -+L(_loop): -+ ld.d data1, src, 0 -+ ld.d data2, src, 8 -+ addi.d src, src, 16 -+L(_realigned): -+ sub.d tmp1, data1, zeroones -+ or tmp2, data1, sevenf -+ sub.d tmp3, data2, zeroones -+ or tmp4, data2, sevenf -+ andn has_nul1, tmp1, tmp2 -+ andn has_nul2, tmp3, tmp4 -+ addi.d limit_wd, limit_wd, -1 -+ srli.d tmp1, limit_wd, 63 -+ or tmp2, has_nul1, has_nul2 -+ or tmp3, tmp1, tmp2 -+ beqz tmp3, L(_loop) -+ beqz tmp2, L(_hit_limit) -+ sub.d len, src, srcin -+ beqz has_nul1, L(_nul_in_data2) -+ move has_nul2, has_nul1 -+ addi.d len, len, -8 -+L(_nul_in_data2): -+ ctz.d pos, has_nul2 -+ srli.d pos, pos, 3 -+ addi.d len, len, -8 -+ add.d len, len, pos -+ sltu tmp1, len, limit -+ CONDITIONSEL(len,len,limit,tmp1) -+ jr ra -+ -+ -+L(misaligned): -+ addi.d limit_wd, limit, -1 -+ sub.d tmp4, zero, tmp1 -+ andi tmp3, limit_wd, 15 -+ srli.d limit_wd, limit_wd, 4 -+ li.d tmp5, -1 -+ ld.d data1, src, 0 -+ ld.d data2, src, 8 -+ addi.d src, src, 16 -+ slli.d tmp4, tmp4, 3 -+ add.d tmp3, tmp3, tmp1 -+ srl.d tmp2, tmp5, tmp4 -+ srli.d tmp3, tmp3, 4 -+ add.d limit_wd, limit_wd, tmp3 -+ or data1, data1, tmp2 -+ or data2a, data2, tmp2 -+ li.w tmp3, 9 -+ sltu tmp1, tmp1, tmp3 -+ CONDITIONSEL(data1,data1,tmp5,tmp1) -+ CONDITIONSEL(data2,data2,data2a,tmp1) -+ b L(_realigned) -+ -+ -+L(_hit_limit): -+ move len, limit -+ jr ra -+END(STRNLEN) -+#ifndef ANDROID_CHANGES -+#ifdef _LIBC -+libc_hidden_builtin_def (STRNLEN) -+#endif -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen.c b/sysdeps/loongarch/lp64/multiarch/strnlen.c -new file mode 100644 -index 00000000..6fc406d2 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strnlen.c -@@ -0,0 +1,40 @@ -+/* Multiple versions of strnlen. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+/* Define multiple versions only for the definition in libc. */ -+#if IS_IN (libc) -+# define strnlen __redirect_strnlen -+# define __strnlen __redirect___strnlen -+# include -+# undef __strnlen -+# undef strnlen -+ -+# define SYMBOL_NAME strnlen -+# include "ifunc-lasx.h" -+ -+libc_ifunc_redirected (__redirect_strnlen, __strnlen, IFUNC_SELECTOR ()); -+weak_alias (__strnlen, strnlen); -+# ifdef SHARED -+__hidden_ver1 (__strnlen, __GI___strnlen, __redirect___strnlen) -+ __attribute__((visibility ("hidden"))); -+__hidden_ver1 (strnlen, __GI_strnlen, __redirect_strnlen) -+ __attribute__((weak, visibility ("hidden"))); -+# endif -+#endif -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S -new file mode 100644 -index 00000000..a58ddde8 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S -@@ -0,0 +1,12 @@ -+ -+#if IS_IN (libc) -+ -+#define STRRCHR_NAME __strrchr_aligned -+ -+#endif -+ -+#include "../strrchr.S" -+ -+#undef rindex -+weak_alias(STRRCHR_NAME, rindex) -+ -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -new file mode 100644 -index 00000000..6f7a5618 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S -@@ -0,0 +1,113 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRRCHR __strrchr_lasx -+ -+LEAF(STRRCHR) -+ .align 6 -+ andi t1, a0, 0x3f -+ bstrins.d a0, zero, 5, 0 -+ xvld $xr0, a0, 0 -+ xvld $xr1, a0, 32 -+ -+ li.d t2, -1 -+ xvreplgr2vr.b $xr4, a1 -+ move a2, zero -+ sll.d t3, t2, t1 -+ -+ addi.d a0, a0, 63 -+ xvseq.b $xr2, $xr0, $xr4 -+ xvseq.b $xr3, $xr1, $xr4 -+ xvmsknz.b $xr0, $xr0 -+ -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr5, $xr0, 4 -+ xvpickve.w $xr6, $xr1, 4 -+ vilvl.h $vr0, $vr5, $vr0 -+ -+ -+ vilvl.h $vr1, $vr6, $vr1 -+ xvmsknz.b $xr2, $xr2 -+ xvmsknz.b $xr3, $xr3 -+ xvpickve.w $xr5, $xr2, 4 -+ -+ xvpickve.w $xr6, $xr3, 4 -+ vilvl.h $vr2, $vr5, $vr2 -+ vilvl.h $vr3, $vr6, $vr3 -+ vilvl.w $vr0, $vr1, $vr0 -+ -+ vilvl.w $vr1, $vr3, $vr2 -+ movfr2gr.d t0, $f0 -+ movfr2gr.d t1, $f1 -+ orn t0, t0, t3 -+ -+ and t1, t1, t3 -+ bne t0, t2, L(end) -+L(loop): -+ xvld $xr0, a0, 1 -+ xvld $xr1, a0, 33 -+ -+ -+ clz.d t0, t1 -+ sub.d t0, a0, t0 -+ addi.d a0, a0, 64 -+ maskeqz t0, t0, t1 -+ -+ masknez t1, a2, t1 -+ or a2, t0, t1 -+ xvseq.b $xr2, $xr0, $xr4 -+ xvseq.b $xr3, $xr1, $xr4 -+ -+ xvmsknz.b $xr2, $xr2 -+ xvmsknz.b $xr3, $xr3 -+ xvpickve.w $xr5, $xr2, 4 -+ xvpickve.w $xr6, $xr3, 4 -+ -+ vilvl.h $vr2, $vr5, $vr2 -+ vilvl.h $vr3, $vr6, $vr3 -+ xvmin.bu $xr5, $xr0, $xr1 -+ vilvl.w $vr2, $vr3, $vr2 -+ -+ -+ xvsetanyeqz.b $fcc0, $xr5 -+ movfr2gr.d t1, $f2 -+ bceqz $fcc0, L(loop) -+ xvmsknz.b $xr0, $xr0 -+ -+ xvmsknz.b $xr1, $xr1 -+ xvpickve.w $xr5, $xr0, 4 -+ xvpickve.w $xr6, $xr1, 4 -+ vilvl.h $vr0, $vr5, $vr0 -+ -+ vilvl.h $vr1, $vr6, $vr1 -+ vilvl.w $vr0, $vr1, $vr0 -+ movfr2gr.d t0, $f0 -+L(end): -+ slli.d t3, t2, 1 # shift one more for the last '\0' -+ -+ cto.d t0, t0 -+ sll.d t3, t3, t0 -+ andn t1, t1, t3 -+ clz.d t0, t1 -+ -+ sub.d a0, a0, t0 -+ maskeqz t0, a0, t1 -+ masknez t1, a2, t1 -+ or a0, t0, t1 -+ -+ jr ra -+END(STRRCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def(STRRCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S -new file mode 100644 -index 00000000..e9228a2e ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S 
-@@ -0,0 +1,93 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#if IS_IN (libc) -+ -+#define STRRCHR __strrchr_lsx -+ -+LEAF(STRRCHR) -+ .align 6 -+ andi t1, a0, 0x1f -+ bstrins.d a0, zero, 4, 0 -+ vld $vr0, a0, 0 -+ vld $vr1, a0, 16 -+ -+ vreplgr2vr.b $vr4, a1 -+ li.d t2, -1 -+ move a2, zero -+ addi.d a0, a0, 31 -+ -+ vseq.b $vr2, $vr0, $vr4 -+ vseq.b $vr3, $vr1, $vr4 -+ vmsknz.b $vr0, $vr0 -+ vmsknz.b $vr1, $vr1 -+ -+ vmsknz.b $vr2, $vr2 -+ vmsknz.b $vr3, $vr3 -+ vilvl.h $vr0, $vr1, $vr0 -+ vilvl.h $vr1, $vr3, $vr2 -+ -+ -+ movfr2gr.s t0, $f0 -+ sll.d t3, t2, t1 -+ movfr2gr.s t1, $f1 -+ orn t0, t0, t3 -+ -+ and t1, t1, t3 -+ bne t0, t2, L(end) -+L(loop): -+ vld $vr0, a0, 1 -+ vld $vr1, a0, 17 -+ -+ clz.w t0, t1 -+ sub.d t0, a0, t0 -+ addi.d a0, a0, 32 -+ maskeqz t0, t0, t1 -+ -+ masknez t1, a2, t1 -+ or a2, t0, t1 -+ vseq.b $vr2, $vr0, $vr4 -+ vseq.b $vr3, $vr1, $vr4 -+ -+ -+ vmsknz.b $vr2, $vr2 -+ vmsknz.b $vr3, $vr3 -+ vmin.bu $vr5, $vr0, $vr1 -+ vilvl.h $vr2, $vr3, $vr2 -+ -+ vsetanyeqz.b $fcc0, $vr5 -+ movfr2gr.s t1, $f2 -+ bceqz $fcc0, L(loop) -+ vmsknz.b $vr0, $vr0 -+ -+ vmsknz.b $vr1, $vr1 -+ vilvl.h $vr0, $vr1, $vr0 -+ movfr2gr.s t0, $f0 -+L(end): -+ slli.d t3, t2, 1 # shift one more for the last '\0' -+ -+ cto.w t0, t0 -+ sll.d t3, t3, t0 -+ andn t1, t1, t3 -+ clz.w t0, t1 -+ -+ -+ sub.d a0, a0, t0 -+ maskeqz t0, a0, t1 -+ masknez t1, a2, t1 -+ or a0, t0, t1 -+ -+ jr ra -+END(STRRCHR) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def(STRRCHR) -+#endif -+ -+#endif -diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr.c b/sysdeps/loongarch/lp64/multiarch/strrchr.c -new file mode 100644 -index 00000000..32eb6ea6 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/multiarch/strrchr.c -@@ -0,0 +1,39 @@ -+/* Multiple versions of strrchr. -+ All versions must be listed in ifunc-impl-list.c. -+ Copyright (C) 2017-2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Define multiple versions only for the definition in libc. 
*/ -+#if IS_IN (libc) -+# define strrchr __redirect_strrchr -+# include -+# undef strrchr -+ -+# define SYMBOL_NAME strrchr -+# include "ifunc-memchr.h" -+ -+libc_ifunc_redirected (__redirect_strrchr, __new_strrchr, -+ IFUNC_SELECTOR ()); -+weak_alias(__new_strrchr, rindex) -+# ifdef SHARED -+__hidden_ver1 (__new_strrchr, __GI_strrchr, __redirect_strrchr) -+ __attribute__ ((visibility ("hidden"))); -+# endif -+ -+# include -+versioned_symbol (libc, __new_strrchr, strrchr, GLIBC_2_27); -+#endif -diff --git a/sysdeps/loongarch/lp64/rawmemchr.S b/sysdeps/loongarch/lp64/rawmemchr.S -new file mode 100644 -index 00000000..94b70f2d ---- /dev/null -+++ b/sysdeps/loongarch/lp64/rawmemchr.S -@@ -0,0 +1,114 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef RAWMEMCHR_NAME -+# define RAWMEMCHR_NAME __rawmemchr -+#endif -+ -+ -+LEAF(RAWMEMCHR_NAME) -+ .align 6 -+ andi t1, a0, 0x7 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ bstrins.d a1, a1, 15, 8 -+ -+ ld.d t0, a0, 0 -+ slli.d t1, t1, 3 -+ ori a2, a2, 0x101 -+ bstrins.d a1, a1, 31, 16 -+ -+ li.w t8, -1 -+ bstrins.d a1, a1, 63, 32 -+ bstrins.d a2, a2, 63, 32 -+ sll.d t2, t8, t1 -+ -+ sll.d t3, a1, t1 -+ orn t0, t0, t2 -+ slli.d a3, a2, 7 -+ beqz a1, L(find_zero) -+ -+ xor t0, t0, t3 -+ sub.d t1, t0, a2 -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ -+ bnez t3, L(count_pos) -+ addi.d a0, a0, 8 -+ -+L(loop): -+ ld.d t0, a0, 0 -+ xor t0, t0, a1 -+ -+ sub.d t1, t0, a2 -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ bnez t3, L(count_pos) -+ -+ ld.d t0, a0, 8 -+ addi.d a0, a0, 16 -+ xor t0, t0, a1 -+ sub.d t1, t0, a2 -+ -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ beqz t3, L(loop) -+ addi.d a0, a0, -8 -+L(count_pos): -+ ctz.d t0, t3 -+ srli.d t0, t0, 3 -+ add.d a0, a0, t0 -+ jr ra -+ -+L(loop_7bit): -+ ld.d t0, a0, 0 -+L(find_zero): -+ sub.d t1, t0, a2 -+ and t2, t1, a3 -+ bnez t2, L(more_check) -+ -+ ld.d t0, a0, 8 -+ addi.d a0, a0, 16 -+ sub.d t1, t0, a2 -+ and t2, t1, a3 -+ -+ beqz t2, L(loop_7bit) -+ addi.d a0, a0, -8 -+ -+L(more_check): -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ bnez t3, L(count_pos) -+ addi.d a0, a0, 8 -+ -+L(loop_8bit): -+ ld.d t0, a0, 0 -+ -+ sub.d t1, t0, a2 -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ bnez t3, L(count_pos) -+ -+ ld.d t0, a0, 8 -+ addi.d a0, a0, 16 -+ sub.d t1, t0, a2 -+ -+ andn t2, a3, t0 -+ and t3, t1, t2 -+ beqz t3, L(loop_8bit) -+ -+ addi.d a0, a0, -8 -+ b L(count_pos) -+ -+END(RAWMEMCHR_NAME) -+ -+#ifdef _LIBC -+weak_alias (__rawmemchr, rawmemchr) -+libc_hidden_builtin_def (__rawmemchr) -+#endif -diff --git a/sysdeps/loongarch/lp64/s_cosf.S b/sysdeps/loongarch/lp64/s_cosf.S -new file mode 100644 -index 00000000..5bfabefb ---- /dev/null -+++ b/sysdeps/loongarch/lp64/s_cosf.S -@@ -0,0 +1,409 @@ -+#include -+#include -+#include -+ -+/* Short algorithm description: -+ * -+ * 1) if |x|==0: sin(x)=x, -+ * cos(x)=1. -+ * 2) if |x|<2^-27: sin(x)=x-x*DP_SMALL, raising underflow only when needed, -+ * cos(x)=1-|x|. -+ * 3) if |x|<2^-5 : sin(x)=x+x*x^2*DP_SIN2_0+x^5*DP_SIN2_1, -+ * cos(x)=1+1*x^2*DP_COS2_0+x^5*DP_COS2_1 -+ * 4) if |x|< Pi/4: sin(x)=x+x*x^2*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))), -+ * cos(x)=1+1*x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). -+ * 5) if |x| < 9*Pi/4: -+ * 5.1) Range reduction: -+ * k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, t=|x|-j*Pi/4. 
-+ * 5.2) Reconstruction: -+ * sign_sin = sign(x) * (-1.0)^(( n >>2)&1) -+ * sign_cos = (-1.0)^(((n+2)>>2)&1) -+ * poly_sin = ((((S4*t^2 + S3)*t^2 + S2)*t^2 + S1)*t^2 + S0)*t^2*t+t -+ * poly_cos = ((((C4*t^2 + C3)*t^2 + C2)*t^2 + C1)*t^2 + C0)*t^2*s+s -+ * if(n&2 != 0) { -+ * using cos(t) and sin(t) polynomials for |t|= 2^23, very large args: -+ * 7.1) Range reduction: -+ * k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4. -+ * 7.2) Reconstruction same as (5.2). -+ * 8) if x is Inf, return x-x, and set errno=EDOM. -+ * 9) if x is NaN, return x-x. -+ * -+ * Special cases: -+ * sin/cos(+-0) = +-0/1 not raising inexact/underflow, -+ * sin/cos(subnormal) raises inexact/underflow, -+ * sin/cos(min_normalized) raises inexact/underflow, -+ * sin/cos(normalized) raises inexact, -+ * sin/cos(Inf) = NaN, raises invalid, sets errno to EDOM, -+ * sin/cos(NaN) = NaN. -+ */ -+ -+#define COSF __cosf -+ -+#define LOADFD(rd, rs, label) \ -+ la.local rs, label;\ -+ fld.d rd, rs, 0 -+ -+#define LOADFS(rd, rs, label) \ -+ la.local rs, label;\ -+ fld.s rd, rs, 0 -+ -+#define FTOL(rd, rs, tmp) \ -+ ftintrz.l.d tmp, rs;\ -+ movfr2gr.d rd, tmp -+ -+#define FTOW(rd, rs, tmp) \ -+ ftintrz.w.d tmp, rs;\ -+ movfr2gr.s rd, tmp -+ -+#define WTOF(rd, rs, tmp) \ -+ movgr2fr.w tmp, rs;\ -+ ffint.d.w rd, tmp -+ -+#define LTOF(rd, rs, tmp) \ -+ movgr2fr.d tmp, rs;\ -+ ffint.d.l rd, tmp -+ -+LEAF(COSF) -+ .align 2 -+ .align 3 -+ /* fa0 is SP x; fa1 is DP x */ -+ movfr2gr.s t0, fa0 /* Bits of x */ -+ fcvt.d.s fa1, fa0 /* DP x */ -+ li.w t1, 0x7fffffff -+ and t0, t0, t1 /* |x| */ -+ li.w t1, 0x3f490fdb /* const Pi/4 */ -+ bltu t0, t1, L(arg_less_pio4) /* |x| < Pi/4 branch */ -+ li.w t1, 0x40e231d6 /* 9*Pi/4 */ -+ la.local t4, L(DP_) /*DP_ base addr*/ -+ bgeu t0, t1, L(greater_or_equal_9pio4) /* |x| >= 9*Pi/4 branch */ -+/* L(median_args): */ -+ /* Here if Pi/4<=|x|<9*Pi/4 */ -+ fabs.d fa0, fa1 /* DP |x| */ -+ fld.d fa1, t4, 56 /* 4/Pi */ -+ fmul.d fa1, fa1, fa0 /* DP |x|/(Pi/4) */ -+ FTOW( t0, fa1, fa1 ) /* k=trunc(|x|/(Pi/4)) */ -+ la.local t1, L(PIO2J) /* base addr of PIO2J table */ -+ addi.w t0, t0, 1 /* k+1 */ -+ bstrpick.d t2, t0, 3, 1 /* j=n/2 */ -+ alsl.d t1, t2, t1, 3 -+ fld.d fa1, t1, 0 /* j*Pi/2 */ -+ addi.w t0, t0, 2 /* n = k+3 */ -+ fsub.d fa0, fa0, fa1 /* t = |x| - j * Pi/2 */ -+/* Input: t0=n fa0=t*/ -+L(reduced): -+ /* Here if cos(x) calculated using cos(t) polynomial for |t|>2)&1) -+ * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))) -+ -+ * Here if cos(x) calculated using sin(t) polynomial for |t|>2)&1) -+ * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))) -+ */ -+ /* TODO: what is the best order ??? 
*/ -+ /* load-to-use latency, hardware module usage, integer pipeline & float pipeline */ -+ /* cancel branch */ -+ slli.w t0, t0, 1 /* (n << 1) */ -+ andi t1, t0, 4 /* (n << 1) & 4 */ -+ alsl.d t2, t1, t4, 4 /* adjust to DP_C or DP_S */ -+ fld.d fa3, t2, 32 /* C4 */ -+ andi t0, t0, 8 /* =====> (n << 1) & 8 */ -+ fmul.d fa1, fa0, fa0 /* y=x^2 */ -+ fld.d fa4, t2, 16 /* C2 */ -+ fmul.d fa2, fa1, fa1 /* z=x^4 */ -+ fld.d fa5, t2, 24 /* C3 */ -+ la.local t3, L(DP_ONES) /* =====> DP_ONES */ -+ fld.d fa6, t2, 8 /* C1 */ -+ fmadd.d fa4, fa2, fa3, fa4 /* cx = C2+z*C4 */ -+ fld.d fa3, t2, 0 /* C0 */ -+ fmadd.d fa5, fa2, fa5, fa6 /* cy = C1+z*C3 */ -+ fld.d fa6, t3, 0 /* one */ -+ fmadd.d fa4, fa2, fa4, fa3 /* cx = C0+z*cx */ -+ add.d t0, t0, t3 /* =====> addr */ -+ fmadd.d fa4, fa1, fa5, fa4 /* cx = cx+y*cy */ -+ fld.d fa2, t0, 0 /* sign */ -+ fmadd.d fa4, fa4, fa1, fa6 /* 1.0+y*cx */ -+ fmul.d fa1, fa2, fa4 /* sign * cx */ -+ bnez t1, L_return -+ fmul.d fa1, fa1, fa0 /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */ -+L_return: -+ fcvt.s.d fa0, fa1 /* SP result */ -+ jr ra -+ -+L(greater_or_equal_9pio4): -+ /* Here if |x|>=9*Pi/4 */ -+ li.w t1, 0x7f800000 /* x is Inf or NaN? */ -+ bgeu t0, t1, L(inf_or_nan) /* |x| >= Inf branch */ -+ /* Here if finite |x|>=9*Pi/4 */ -+ li.w t1, 0x4b000000 /* 2^23 */ -+ bgeu t0, t1, L(greater_or_equal_2p23) /* |x| >= 2^23 branch */ -+ /* Here if 9*Pi/4<=|x|<2^23 */ -+ fabs.d fa0, fa1 /* DP |x| */ -+ fld.d fa1, t4, 56 -+ fmul.d fa1, fa1, fa0 /* |x|/(Pi/4) */ -+ FTOW( t0, fa1, fa1 ) /* k=trunc(|x|/(Pi/4)) */ -+ addi.w t0, t0, 1 /* k+1 */ -+ srli.w t1, t0, 1 /* x=n/2 */ -+ WTOF( fa1, t1, fa1 ) /* DP x */ -+ fld.d fa2, t4, 104 /* -PIO2HI = high part of -Pi/2 */ -+ fld.d fa3, t4, 112 /* -PIO2LO = low part of -Pi/2 */ -+ fmadd.d fa0, fa2, fa1, fa0 /* |x| - x*PIO2HI */ -+ addi.w t0, t0, 2 /* n = k+3 */ -+ fmadd.d fa0, fa3, fa1, fa0 /* |x| - x*PIO2HI - x*PIO2LO */ -+ b L(reduced) -+ -+L(greater_or_equal_2p23): -+ /* Here if finite |x|>=2^23 */ -+ fabs.s fa5, fa0 /* SP |x| */ -+ /* bitpos = (ix>>23) - BIAS_32; */ -+ srli.w t0, t0, 23 /*TODO???srai.w eb = biased exponent of x */ -+ /* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */ -+ addi.w t0, t0, -124 /* t0 = bitpos */ -+ /* t3= j = bitpos/28 */ -+ /* x/28 = (x * ((0x100000000 / 28) + 1)) >> 32 */ -+ li.w t1, 0x924924a -+ mulh.wu t0, t1, t0 -+ fcvt.d.s fa5, fa5 /* Convert to double */ -+ /* TODO: what is the best order ??? */ -+ la.local t1, L(invpio4_table) /* t2 */ -+ alsl.d t1, t0, t1, 3 -+ fld.d fa0, t1, 0 /* invpio4_table[j] */ -+ fld.d fa1, t1, 8 /* invpio4_table[j+1] */ -+ fmul.d fa0, fa0, fa5 /* a = invpio4_table[j]*|x| */ -+ fld.d fa2, t1, 16 /* invpio4_table[j+2] */ -+ fmul.d fa1, fa1, fa5 /* b = invpio4_table[j+1]*|x| */ -+ fld.d fa3, t1, 24 /* invpio4_table[j+3] */ -+ fmul.d fa2, fa2, fa5 /* c = invpio4_table[j+2]*|x| */ -+ fmul.d fa3, fa3, fa5 /* d = invpio4_table[j+3]*|x| */ -+/*TODO: overflow check*/ -+ FTOL( t0, fa0, fa4 ) /*uint64_t l = a; TODO: change the order*/ -+ li.w t1, -8 /* 0xfffffffffffffff8 */ -+ and t0, t0, t1 /* l &= ~0x7; */ -+ LTOF( fa4, t0, fa4 ) /* DP l*/ -+ fsub.d fa0, fa0, fa4 /* a -= l; */ -+ fadd.d fa4, fa0, fa1 /* fa4 double e = a + b; */ -+/*TODO: overflow check*/ -+ FTOL( t0, fa4, fa4 ) /*uint64_t l = e;*/ -+ andi t2, t0, 1 /* l & 1 TODO: change the order*/ -+ LOADFD( fa5, t1, L(DP_ONES) ) /* fa5 = 1.0 */ -+ LTOF( fa4, t0, fa4 ) /* fa4 DP l*/ -+/* critical!!!! 
the order */ -+ fsub.d fa0, fa0, fa4 -+ fld.d fa4, t4, 120 /* PI_4 */ -+ beqz t2, L_even_integer -+/*L_odd_integer:*/ -+ fsub.d fa0, fa0, fa5 -+ fadd.d fa0, fa0, fa1 -+ fadd.d fa2, fa2, fa3 -+ fadd.d fa0, fa0, fa2 -+ addi.d t0, t0, 3 -+ fmul.d fa0, fa0, fa4 -+ b L(reduced) -+L_even_integer: -+ fadd.d fa0, fa0, fa1 -+ fadd.d fa2, fa2, fa3 -+ fadd.d fa0, fa0, fa2 -+ fcmp.sle.d $fcc0, fa0, fa5 -+ addi.d t0, t0, 3 -+ bcnez $fcc0, L_leq_one -+/*L_gt_one:*/ -+ fld.d fa2, t1, 16 /* 2.0 */ -+ addi.d t0, t0, 1 -+ fsub.d fa0, fa0, fa2 -+L_leq_one: -+ fmul.d fa0, fa0, fa4 -+ b L(reduced) -+ -+L(arg_less_pio4): -+ /* Here if |x| -+#include -+#include -+ -+/* Short algorithm description: -+ * -+ * 1) if |x|==0: sin(x)=x, -+ * cos(x)=1. -+ * 2) if |x|<2^-27: sin(x)=x-x*DP_SMALL, raising underflow only when needed, -+ * cos(x)=1-|x|. -+ * 3) if |x|<2^-5 : sin(x)=x+x*x^2*DP_SIN2_0+x^5*DP_SIN2_1, -+ * cos(x)=1+1*x^2*DP_COS2_0+x^5*DP_COS2_1 -+ * 4) if |x|< Pi/4: sin(x)=x+x*x^2*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))), -+ * cos(x)=1+1*x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). -+ * 5) if |x| < 9*Pi/4: -+ * 5.1) Range reduction: -+ * k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, t=|x|-j*Pi/4. -+ * 5.2) Reconstruction: -+ * sign_sin = sign(x) * (-1.0)^(( n >>2)&1) -+ * sign_cos = (-1.0)^(((n+2)>>2)&1) -+ * poly_sin = ((((S4*t^2 + S3)*t^2 + S2)*t^2 + S1)*t^2 + S0)*t^2*t+t -+ * poly_cos = ((((C4*t^2 + C3)*t^2 + C2)*t^2 + C1)*t^2 + C0)*t^2*s+s -+ * if(n&2 != 0) { -+ * using cos(t) and sin(t) polynomials for |t|= 2^23, very large args: -+ * 7.1) Range reduction: -+ * k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4. -+ * 7.2) Reconstruction same as (5.2). -+ * 8) if x is Inf, return x-x, and set errno=EDOM. -+ * 9) if x is NaN, return x-x. -+ * -+ * Special cases: -+ * sin/cos(+-0) = +-0/1 not raising inexact/underflow, -+ * sin/cos(subnormal) raises inexact/underflow, -+ * sin/cos(min_normalized) raises inexact/underflow, -+ * sin/cos(normalized) raises inexact, -+ * sin/cos(Inf) = NaN, raises invalid, sets errno to EDOM, -+ * sin/cos(NaN) = NaN. 
-+ */ -+ -+#define SINF __sinf -+ -+#define LOADFD(rd, rs, label) \ -+ la.local rs, label;\ -+ fld.d rd, rs, 0 -+ -+#define LOADFS(rd, rs, label) \ -+ la.local rs, label;\ -+ fld.s rd, rs, 0 -+ -+#define FTOL(rd, rs, tmp) \ -+ ftintrz.l.d tmp, rs;\ -+ movfr2gr.d rd, tmp -+ -+#define FTOW(rd, rs, tmp) \ -+ ftintrz.w.d tmp, rs;\ -+ movfr2gr.s rd, tmp -+ -+#define WTOF(rd, rs, tmp) \ -+ movgr2fr.w tmp, rs;\ -+ ffint.d.w rd, tmp -+ -+#define LTOF(rd, rs, tmp) \ -+ movgr2fr.d tmp, rs;\ -+ ffint.d.l rd, tmp -+ -+LEAF(SINF) -+ .align 2 -+ .align 3 -+ /* fa0 is SP x; fa1 is DP x */ -+ movfr2gr.s t2, fa0 /* Bits of x */ -+ fcvt.d.s fa1, fa0 /* DP x */ -+ li.w t1, 0x7fffffff -+ and t0, t2, t1 /* |x| */ -+ li.w t1, 0x3f490fdb /* const Pi/4 */ -+ bltu t0, t1, L(arg_less_pio4) /* |x| < Pi/4 branch */ -+ li.w t1, 0x40e231d6 /* 9*Pi/4 */ -+ la.local t4, L(DP_) /*DP_ base addr*/ -+ bstrpick.d t5, t2, 31, 31 /* sign of x */ -+ slli.w t5, t5, 3 -+ bgeu t0, t1, L(greater_or_equal_9pio4) /* |x| >= 9*Pi/4 branch */ -+/* L(median_args): */ -+ /* Here if Pi/4<=|x|<9*Pi/4 */ -+ fabs.d fa0, fa1 /* DP |x| */ -+ fld.d fa1, t4, 56 /* 4/Pi */ -+ fmul.d fa1, fa1, fa0 /* DP |x|/(Pi/4) */ -+ FTOW( t0, fa1, fa1 ) /* k=trunc(|x|/(Pi/4)) */ -+ la.local t1, L(PIO2J) /* base addr of PIO2J table */ -+ addi.w t0, t0, 1 /* k+1 */ -+ bstrpick.d t2, t0, 3, 1 /* j=n/2 */ -+ alsl.d t1, t2, t1, 3 -+ fld.d fa1, t1, 0 /* j*Pi/2 */ -+ fsub.d fa0, fa0, fa1 /* t = |x| - j * Pi/2 */ -+/* Input: t0=n fa0=t*/ -+/* Input: t0=n fa0=t, t5=sign(x) */ -+L(reduced): -+ /* Here if cos(x) calculated using cos(t) polynomial for |t|>2)&1) -+ * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))) -+ -+ * Here if cos(x) calculated using sin(t) polynomial for |t|>2)&1) -+ * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))) -+ */ -+ /* TODO: what is the best order ??? */ -+ /* load-to-use latency, hardware module usage, integer pipeline & float pipeline */ -+ /* cancel branch */ -+ slli.w t0, t0, 1 /* (n << 1) */ -+ andi t1, t0, 4 /* (n << 1) & 4 */ -+ alsl.d t2, t1, t4, 4 /* adjust to DP_C or DP_S */ -+ fld.d fa3, t2, 32 /* C4 */ -+ andi t0, t0, 8 /* =====> (n << 1) & 8 */ -+ fmul.d fa1, fa0, fa0 /* y=x^2 */ -+ xor t0, t0, t5 /* (-1.0)^((n>>2)&1) XOR sign(x) */ -+ fld.d fa4, t2, 16 /* C2 */ -+ fmul.d fa2, fa1, fa1 /* z=x^4 */ -+ fld.d fa5, t2, 24 /* C3 */ -+ la.local t3, L(DP_ONES) /* =====> DP_ONES */ -+ fld.d fa6, t2, 8 /* C1 */ -+ fmadd.d fa4, fa2, fa3, fa4 /* cx = C2+z*C4 */ -+ fld.d fa3, t2, 0 /* C0 */ -+ fmadd.d fa5, fa2, fa5, fa6 /* cy = C1+z*C3 */ -+ fld.d fa6, t3, 0 /* 1.0 */ -+ fmadd.d fa4, fa2, fa4, fa3 /* cx = C0+z*cx */ -+ add.d t0, t0, t3 /* =====> addr */ -+ fmadd.d fa4, fa1, fa5, fa4 /* cx = cx+y*cy */ -+ fld.d fa2, t0, 0 /* sign */ -+ fmadd.d fa4, fa4, fa1, fa6 /* 1.0+y*cx */ -+ fmul.d fa1, fa2, fa4 /* sign * cx */ -+ bnez t1, L_return -+ fmul.d fa1, fa1, fa0 /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */ -+L_return: -+ fcvt.s.d fa0, fa1 /* SP result */ -+ jr ra -+ -+L(greater_or_equal_9pio4): -+ /* Here if |x|>=9*Pi/4 */ -+ li.w t1, 0x7f800000 /* x is Inf or NaN? 
*/ -+ bgeu t0, t1, L(inf_or_nan) /* |x| >= Inf branch */ -+ /* Here if finite |x|>=9*Pi/4 */ -+ li.w t1, 0x4b000000 /* 2^23 */ -+ bgeu t0, t1, L(greater_or_equal_2p23) /* |x| >= 2^23 branch */ -+ /* Here if 9*Pi/4<=|x|<2^23 */ -+ fabs.d fa0, fa1 /* DP |x| */ -+ fld.d fa1, t4, 56 -+ fmul.d fa1, fa1, fa0 /* |x|/(Pi/4) */ -+ FTOW( t0, fa1, fa1 ) /* k=trunc(|x|/(Pi/4)) */ -+ addi.w t0, t0, 1 /* k+1 */ -+ srli.w t1, t0, 1 /* x=n/2 */ -+ WTOF( fa1, t1, fa1 ) /* DP x */ -+ fld.d fa2, t4, 104 /* -PIO2HI = high part of -Pi/2 */ -+ fld.d fa3, t4, 112 /* -PIO2LO = low part of -Pi/2 */ -+ fmadd.d fa0, fa2, fa1, fa0 /* |x| - x*PIO2HI */ -+ fmadd.d fa0, fa3, fa1, fa0 /* |x| - x*PIO2HI - x*PIO2LO */ -+ b L(reduced) -+ -+L(greater_or_equal_2p23): -+ /* Here if finite |x|>=2^23 */ -+ fabs.s fa5, fa0 /* SP |x| */ -+ /* bitpos = (ix>>23) - BIAS_32; */ -+ srli.w t0, t0, 23 /*TODO???srai.w eb = biased exponent of x */ -+ /* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */ -+ addi.w t0, t0, -124 /* t0 = bitpos */ -+ /* t3= j = bitpos/28 */ -+ /* x/28 = (x * ((0x100000000 / 28) + 1)) >> 32 */ -+ li.w t1, 0x924924a -+ mulh.wu t0, t1, t0 -+ fcvt.d.s fa5, fa5 /* Convert to double */ -+ /* TODO: what is the best order ??? */ -+ la.local t1, L(invpio4_table) /* t2 */ -+ alsl.d t1, t0, t1, 3 -+ fld.d fa0, t1, 0 /* invpio4_table[j] */ -+ fld.d fa1, t1, 8 /* invpio4_table[j+1] */ -+ fmul.d fa0, fa0, fa5 /* a = invpio4_table[j]*|x| */ -+ fld.d fa2, t1, 16 /* invpio4_table[j+2] */ -+ fmul.d fa1, fa1, fa5 /* b = invpio4_table[j+1]*|x| */ -+ fld.d fa3, t1, 24 /* invpio4_table[j+3] */ -+ fmul.d fa2, fa2, fa5 /* c = invpio4_table[j+2]*|x| */ -+ fmul.d fa3, fa3, fa5 /* d = invpio4_table[j+3]*|x| */ -+/*TODO: overflow check*/ -+ FTOL( t0, fa0, fa4 ) /*uint64_t l = a; TODO: change the order*/ -+ li.w t1, -8 /* 0xfffffffffffffff8 */ -+ and t0, t0, t1 /* l &= ~0x7; */ -+ LTOF( fa4, t0, fa4 ) /* DP l*/ -+ fsub.d fa0, fa0, fa4 /* a -= l; */ -+ fadd.d fa4, fa0, fa1 /* fa4 double e = a + b; */ -+/*TODO: overflow check*/ -+ FTOL( t0, fa4, fa4 ) /*uint64_t l = e;*/ -+ andi t2, t0, 1 /* l & 1 TODO: change the order*/ -+ LOADFD( fa5, t1, L(DP_ONES) ) /* fa5 = 1.0 */ -+ LTOF( fa4, t0, fa4 ) /* fa4 DP l*/ -+/* critical!!!! 
the order */ -+ fsub.d fa0, fa0, fa4 -+ fld.d fa4, t4, 120 /* PI_4 */ -+ beqz t2, L_even_integer -+/*L_odd_integer:*/ -+ fsub.d fa0, fa0, fa5 -+ fadd.d fa0, fa0, fa1 -+ fadd.d fa2, fa2, fa3 -+ fadd.d fa0, fa0, fa2 -+ addi.d t0, t0, 1 -+ fmul.d fa0, fa0, fa4 -+ b L(reduced) -+L_even_integer: -+ fadd.d fa0, fa0, fa1 -+ fadd.d fa2, fa2, fa3 -+ fadd.d fa0, fa0, fa2 -+ fcmp.sle.d $fcc0, fa0, fa5 -+ addi.d t0, t0, 1 -+ bcnez $fcc0, L_leq_one -+/*L_gt_one:*/ -+ fld.d fa2, t1, 16 /* 2.0 */ -+ addi.d t0, t0, 1 -+ fsub.d fa0, fa0, fa2 -+L_leq_one: -+ fmul.d fa0, fa0, fa4 -+ b L(reduced) -+ -+L(arg_less_pio4): -+ /* Here if |x| -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STPCPY_NAME -+#define STPCPY_NAME __stpcpy -+#endif -+ -+LEAF(STPCPY_NAME) -+ .align 6 -+ andi a3, a0, 0x7 -+ beqz a3, L(dest_align) -+ sub.d a5, a1, a3 -+ addi.d a5, a5, 8 -+ -+L(make_dest_align): -+ ld.b t0, a1, 0 -+ addi.d a1, a1, 1 -+ st.b t0, a0, 0 -+ addi.d a0, a0, 1 -+ -+ beqz t0, L(al_out) -+ bne a1, a5, L(make_dest_align) -+ -+L(dest_align): -+ andi a4, a1, 7 -+ bstrins.d a1, zero, 2, 0 -+ -+ lu12i.w t5, 0x1010 -+ ld.d t0, a1, 0 -+ ori t5, t5, 0x101 -+ bstrins.d t5, t5, 63, 32 -+ -+ slli.d t6, t5, 0x7 -+ bnez a4, L(unalign) -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ bnez t3, L(al_end) -+ -+L(al_loop): -+ st.d t0, a0, 0 -+ ld.d t0, a1, 8 -+ -+ addi.d a1, a1, 8 -+ addi.d a0, a0, 8 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ beqz t3, L(al_loop) -+ -+L(al_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest -+ -+ andi a3, t1, 8 -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+ -+L(al_end_8): -+ beqz a3, L(al_end_4) -+ st.d t0, a0, 0 -+ addi.d a0, a0, 7 -+ jr ra -+L(al_end_4): -+ beqz a4, L(al_end_2) -+ st.w t0, a0, 0 -+ addi.d a0, a0, 4 -+ srli.d t0, t0, 32 -+L(al_end_2): -+ beqz a5, L(al_end_1) -+ st.h t0, a0, 0 -+ addi.d a0, a0, 2 -+ srli.d t0, t0, 16 -+L(al_end_1): -+ beqz a6, L(al_out) -+ st.b t0, a0, 0 -+ addi.d a0, a0, 1 -+L(al_out): -+ addi.d a0, a0, -1 -+ jr ra -+ -+L(unalign): -+ slli.d a5, a4, 3 -+ li.d t1, -1 -+ sub.d a6, zero, a5 -+ -+ srl.d a7, t0, a5 -+ sll.d t7, t1, a6 -+ -+ or t0, a7, t7 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ and t3, t1, t2 -+ -+ bnez t3, L(un_end) -+ -+ ld.d t4, a1, 8 -+ addi.d a1, a1, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ bnez t3, L(un_end_with_remaining) -+ -+L(un_loop): -+ srl.d a7, t4, a5 -+ -+ ld.d t4, a1, 8 -+ addi.d a1, a1, 8 -+ -+ st.d t0, a0, 0 -+ addi.d a0, a0, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ beqz t3, L(un_loop) -+ -+L(un_end_with_remaining): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ sub.d t1, t1, a4 -+ -+ blt t1, zero, L(un_end_less_8) -+ st.d t0, a0, 0 -+ addi.d a0, a0, 8 -+ beqz t1, L(un_out) -+ srl.d t0, t4, a5 # get the remaining part -+ b L(un_end_less_8) -+ -+L(un_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ -+L(un_end_less_8): -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+L(un_end_4): -+ beqz a4, L(un_end_2) -+ st.w t0, a0, 0 -+ addi.d a0, a0, 4 -+ srli.d t0, t0, 32 -+L(un_end_2): -+ beqz a5, L(un_end_1) -+ st.h t0, a0, 0 -+ addi.d a0, a0, 2 -+ srli.d t0, t0, 16 -+L(un_end_1): -+ beqz a6, L(un_out) -+ st.b t0, a0, 0 -+ addi.d a0, a0, 1 -+L(un_out): -+ addi.d a0, a0, -1 -+ jr ra -+ -+END(STPCPY_NAME) -+ -+#ifdef _LIBC -+weak_alias (STPCPY_NAME, stpcpy) 
-+libc_hidden_builtin_def (STPCPY_NAME) -+#endif -diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S -new file mode 100644 -index 00000000..63454c17 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/strchr.S -@@ -0,0 +1,90 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRCHR_NAME -+#define STRCHR_NAME strchr -+#endif -+ -+/* char * strchr (const char *s1, int c); */ -+ -+LEAF(STRCHR_NAME) -+ .align 6 -+ slli.d t1, a0, 3 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ ld.d t2, a0, 0 -+ -+ ori a2, a2, 0x101 -+ andi a1, a1, 0xff -+ bstrins.d a2, a2, 63, 32 -+ li.w t0, -1 -+ -+ mul.d a1, a1, a2 # "cccccccc" -+ sll.d t0, t0, t1 -+ slli.d a3, a2, 7 # 0x8080808080808080 -+ orn t2, t2, t0 -+ -+ sll.d t3, a1, t1 -+ xor t4, t2, t3 -+ sub.d a7, t2, a2 -+ andn a6, a3, t2 -+ -+ -+ sub.d a5, t4, a2 -+ andn a4, a3, t4 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ -+ or t0, a6, a5 -+ bnez t0, L(_mc8_a) -+ addi.d a0, a0, 8 -+L(_aloop): -+ ld.d t4, a0, 0 -+ -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ -+ andn a4, a3, t2 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ -+ -+ bnez a7, L(_mc8_a) -+ ld.d t4, a0, 8 -+ addi.d a0, a0, 16 -+ xor t2, t4, a1 -+ -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ andn a4, a3, t2 -+ -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ beqz a7, L(_aloop) -+ -+ addi.d a0, a0, -8 -+ -+L(_mc8_a): -+ ctz.d t0, a5 -+ ctz.d t2, a6 -+ srli.w t0, t0, 3 -+ -+ -+ srli.w t2, t2, 3 -+ sltu t1, t2, t0 -+ add.d a0, a0, t0 -+ masknez a0, a0, t1 -+ -+ jr ra -+END(STRCHR_NAME) -diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S -new file mode 100644 -index 00000000..c4532e11 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/strchrnul.S -@@ -0,0 +1,95 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRCHRNUL_NAME -+#define STRCHRNUL_NAME __strchrnul -+#endif -+ -+/* char * strchrnul (const char *s1, int c); */ -+ -+LEAF(STRCHRNUL_NAME) -+ .align 6 -+ slli.d t1, a0, 3 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ ld.d t2, a0, 0 -+ -+ ori a2, a2, 0x101 -+ andi a1, a1, 0xff -+ bstrins.d a2, a2, 63, 32 -+ li.w t0, -1 -+ -+ mul.d a1, a1, a2 # "cccccccc" -+ sll.d t0, t0, t1 -+ slli.d a3, a2, 7 # 0x8080808080808080 -+ orn t2, t2, t0 -+ -+ sll.d t3, a1, t1 -+ xor t4, t2, t3 -+ sub.d a7, t2, a2 -+ andn a6, a3, t2 -+ -+ -+ sub.d a5, t4, a2 -+ andn a4, a3, t4 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ -+ or t0, a6, a5 -+ bnez t0, L(_mc8_a) -+ addi.d a0, a0, 8 -+L(_aloop): -+ ld.d t4, a0, 0 -+ -+ xor t2, t4, a1 -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ -+ andn a4, a3, t2 -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ -+ -+ bnez a7, L(_mc8_a) -+ ld.d t4, a0, 8 -+ addi.d a0, a0, 16 -+ xor t2, t4, a1 -+ -+ sub.d a7, t4, a2 -+ andn a6, a3, t4 -+ sub.d a5, t2, a2 -+ andn a4, a3, t2 -+ -+ and a6, a7, a6 -+ and a5, a5, a4 -+ or a7, a6, a5 -+ beqz a7, L(_aloop) -+ -+ addi.d a0, a0, -8 -+L(_mc8_a): -+ ctz.d t0, a5 -+ ctz.d t2, a6 -+ srli.w t0, t0, 3 -+ -+ srli.w t2, t2, 3 -+ slt t1, t0, t2 -+ masknez t3, t2, t1 -+ maskeqz t4, t0, t1 -+ -+ or t0, t3, t4 -+ add.d a0, a0, t0 -+ jr ra -+END(STRCHRNUL_NAME) -+ -+#ifdef _LIBC -+weak_alias(STRCHRNUL_NAME, strchrnul) -+libc_hidden_builtin_def (STRCHRNUL_NAME) -+#endif -diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S -new file mode 100644 -index 00000000..22c261a3 ---- /dev/null -+++ 
b/sysdeps/loongarch/lp64/strcmp.S -@@ -0,0 +1,228 @@ -+/* 2022\06\15 loongarch64 author: chenxiaolong. */ -+ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRCMP_NAME -+#define STRCMP_NAME strcmp -+#endif -+ -+/* int strcmp (const char *s1, const char *s2); */ -+ -+/* Parameters and Results */ -+#define src1 a0 -+#define src2 a1 -+#define result v0 -+LEAF(STRCMP_NAME) -+ .align 6 -+ xor a4, src1, src2 -+ lu12i.w t5, 0x01010 -+ lu12i.w t6, 0x7f7f7 -+ andi a2, src1, 0x7 -+ -+ ori t5, t5, 0x101 -+ andi a4, a4, 0x7 -+ ori t6, t6, 0xf7f -+ bstrins.d t5, t5, 63, 32 -+ bstrins.d t6, t6, 63, 32 -+ -+ bnez a4, 3f // unaligned -+ beqz a2, 1f // loop aligned -+ -+// mutual aligned -+ bstrins.d src1, zero, 2, 0 -+ bstrins.d src2, zero, 2, 0 -+ slli.d a4, a2, 0x3 -+ ld.d t0, src1, 0 -+ -+ sub.d a4, zero, a4 -+ ld.d t1, src2, 0 -+ addi.d src1, src1, 8 -+ addi.d src2, src2, 8 -+ -+ nor a5, zero, zero -+ srl.d a5, a5, a4 -+ or t0, t0, a5 -+ -+ or t1, t1, a5 -+ b 2f //start realigned -+ -+// loop aligned -+1: -+ ld.d t0, src1, 0 -+ addi.d src1, src1, 8 -+ ld.d t1, src2, 0 -+ addi.d src2, src2, 8 -+ -+// start realigned: -+2: -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ beqz t2, 1b -+ -+ ctz.d t7, t2 -+ bstrins.d t7, zero, 2, 0 -+ srl.d t0, t0, t7 -+ srl.d t1, t1, t7 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ sub.d v0, t0, t1 -+ jr ra -+ -+// unaligned -+3: -+ andi a3, src2, 0x7 -+ slt a5, a2, a3 -+ masknez t8, a2, a5 -+ xor a6, src1, src2 -+ maskeqz a6, a6, t8 -+ xor src1, src1, a6 -+ xor src2, src2, a6 -+ -+ andi a2, src1, 0x7 -+ beqz a2, 4f // src1 is aligned -+ -+//strcmp_unaligned: -+ andi a3, src2, 0x7 -+ bstrins.d src1, zero, 2, 0 -+ bstrins.d src2, zero, 2, 0 -+ nor t3, zero, zero -+ -+ ld.d t0, src1, 0 -+ ld.d t1, src2, 0 -+ sub.d a2, a3, a2 -+ addi.d t2, zero, 8 -+ -+ sub.d a5, t2, a2 -+ sub.d a6, t2, a3 -+ slli.d a5, a5, 0x3 -+ slli.d a6, a6, 0x3 -+ -+ srl.d t4, t3, a6 -+ srl.d a4, t3, a5 -+ rotr.d a7, t0, a5 -+ -+ addi.d src2, src2, 8 -+ addi.d src1, src1, 8 -+ or t1, t1, t4 -+ or t0, a7, t4 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ bnez t2, 7f -+ -+ and a7, a7, a4 -+ slli.d a6, a2, 0x3 -+ nor a4, zero, a4 -+ b 5f -+ -+// src1 is aligned -+4: -+ andi a3, src2, 0x7 -+ ld.d t0, src1, 0 -+ -+ bstrins.d src2, zero, 2, 0 -+ nor t2, zero, zero -+ ld.d t1, src2, 0 -+ -+ addi.d t3, zero, 0x8 -+ sub.d a5, t3, a3 -+ slli.d a5, a5, 0x3 -+ srl.d a4, t2, a5 -+ rotr.d t4, t0, a5 -+ -+ addi.d src2, src2, 8 -+ addi.d src1, src1, 8 -+ or t1, t1, a4 -+ or t0, t4, a4 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ -+ bnez t2, 7f -+ -+ and a7, t4, a4 -+ slli.d a6, a3, 0x3 -+ nor a4, zero, a4 -+ -+// unaligned loop -+// a7: remaining number -+// a6: shift left number -+// a5: shift right number -+// a4: mask for checking remaining number -+5: -+ or t0, a7, a4 -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ bnez t2, 6f -+ -+ ld.d t0, src1, 0 -+ addi.d src1, src1, 8 -+ ld.d t1, src2, 0 -+ addi.d src2, src2, 8 -+ -+ srl.d t7, t0, a5 -+ sll.d t0, t0, a6 -+ or t0, a7, t0 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ bnez t2, 7f -+ -+ or a7, t7, zero -+ b 5b -+ -+6: -+ ld.bu t1, src2, 0 -+ andi t0, a7, 0xff -+ xor t2, t0, t1 -+ srli.d a7, a7, 0x8 -+ masknez t2, t0, t2 -+ addi.d src2, src2, 1 -+ beqz t2, 8f -+ b 6b -+ -+7: -+ ctz.d t7, t2 -+ bstrins.d 
t7, zero, 2, 0 -+ srl.d t0, t0, t7 -+ srl.d t1, t1, t7 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ -+8: -+ sub.d a4, t0, t1 -+ sub.d a5, t1, t0 -+ maskeqz a6, a5, t8 -+ masknez result, a4, t8 -+ or result, result, a6 -+ jr ra -+ -+END(STRCMP_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCMP_NAME) -+#endif -+ -diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S -new file mode 100644 -index 00000000..c6fe74cb ---- /dev/null -+++ b/sysdeps/loongarch/lp64/strcpy.S -@@ -0,0 +1,174 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRCPY -+#define STRCPY strcpy -+#endif -+ -+LEAF(STRCPY) -+ .align 6 -+ andi a3, a0, 0x7 -+ move a2, a0 -+ beqz a3, L(dest_align) -+ sub.d a5, a1, a3 -+ addi.d a5, a5, 8 -+ -+L(make_dest_align): -+ ld.b t0, a1, 0 -+ addi.d a1, a1, 1 -+ st.b t0, a2, 0 -+ beqz t0, L(al_out) -+ -+ addi.d a2, a2, 1 -+ bne a1, a5, L(make_dest_align) -+ -+L(dest_align): -+ andi a4, a1, 7 -+ bstrins.d a1, zero, 2, 0 -+ -+ lu12i.w t5, 0x1010 -+ ld.d t0, a1, 0 -+ ori t5, t5, 0x101 -+ bstrins.d t5, t5, 63, 32 -+ -+ slli.d t6, t5, 0x7 -+ bnez a4, L(unalign) -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ bnez t3, L(al_end) -+ -+L(al_loop): -+ st.d t0, a2, 0 -+ ld.d t0, a1, 8 -+ -+ addi.d a1, a1, 8 -+ addi.d a2, a2, 8 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ -+ and t3, t1, t2 -+ beqz t3, L(al_loop) -+ -+L(al_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest -+ -+ andi a3, t1, 8 -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+ -+L(al_end_8): -+ beqz a3, L(al_end_4) -+ st.d t0, a2, 0 -+ jr ra -+L(al_end_4): -+ beqz a4, L(al_end_2) -+ st.w t0, a2, 0 -+ addi.d a2, a2, 4 -+ srli.d t0, t0, 32 -+L(al_end_2): -+ beqz a5, L(al_end_1) -+ st.h t0, a2, 0 -+ addi.d a2, a2, 2 -+ srli.d t0, t0, 16 -+L(al_end_1): -+ beqz a6, L(al_out) -+ st.b t0, a2, 0 -+L(al_out): -+ jr ra -+ -+L(unalign): -+ slli.d a5, a4, 3 -+ li.d t1, -1 -+ sub.d a6, zero, a5 -+ -+ srl.d a7, t0, a5 -+ sll.d t7, t1, a6 -+ -+ or t0, a7, t7 -+ sub.d t1, t0, t5 -+ andn t2, t6, t0 -+ and t3, t1, t2 -+ -+ bnez t3, L(un_end) -+ -+ ld.d t4, a1, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ bnez t3, L(un_end_with_remaining) -+ -+L(un_loop): -+ srl.d a7, t4, a5 -+ -+ ld.d t4, a1, 16 -+ addi.d a1, a1, 8 -+ -+ st.d t0, a2, 0 -+ addi.d a2, a2, 8 -+ -+ sub.d t1, t4, t5 -+ andn t2, t6, t4 -+ sll.d t0, t4, a6 -+ and t3, t1, t2 -+ -+ or t0, t0, a7 -+ beqz t3, L(un_loop) -+ -+L(un_end_with_remaining): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ sub.d t1, t1, a4 -+ -+ blt t1, zero, L(un_end_less_8) -+ st.d t0, a2, 0 -+ addi.d a2, a2, 8 -+ beqz t1, L(un_out) -+ srl.d t0, t4, a5 # get the remaining part -+ b L(un_end_less_8) -+ -+L(un_end): -+ ctz.d t1, t3 -+ srli.d t1, t1, 3 -+ addi.d t1, t1, 1 -+ -+L(un_end_less_8): -+ andi a4, t1, 4 -+ andi a5, t1, 2 -+ andi a6, t1, 1 -+L(un_end_4): -+ beqz a4, L(un_end_2) -+ st.w t0, a2, 0 -+ addi.d a2, a2, 4 -+ srli.d t0, t0, 32 -+L(un_end_2): -+ beqz a5, L(un_end_1) -+ st.h t0, a2, 0 -+ addi.d a2, a2, 2 -+ srli.d t0, t0, 16 -+L(un_end_1): -+ beqz a6, L(un_out) -+ st.b t0, a2, 0 -+L(un_out): -+ jr ra -+ -+END(STRCPY) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRCPY) -+#endif -diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S -new file mode 100644 -index 00000000..dd5a8da3 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/strlen.S -@@ -0,0 +1,86 @@ 
-+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRLEN -+#define STRLEN strlen -+#endif -+ -+LEAF(STRLEN) -+ .align 6 -+ move a1, a0 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ li.w t0, -1 -+ -+ ld.d t2, a0, 0 -+ andi t1, a1, 0x7 -+ ori a2, a2, 0x101 -+ slli.d t1, t1, 3 -+ -+ bstrins.d a2, a2, 63, 32 -+ sll.d t1, t0, t1 -+ slli.d t3, a2, 7 -+ nor a3, zero, t3 -+ -+ orn t2, t2, t1 -+ sub.d t0, t2, a2 -+ nor t1, t2, a3 -+ and t0, t0, t1 -+ -+ -+ bnez t0, L(count_pos) -+ addi.d a0, a0, 8 -+L(loop_16_7bit): -+ ld.d t2, a0, 0 -+ sub.d t1, t2, a2 -+ -+ and t0, t1, t3 -+ bnez t0, L(more_check) -+ ld.d t2, a0, 8 -+ addi.d a0, a0, 16 -+ -+ sub.d t1, t2, a2 -+ and t0, t1, t3 -+ beqz t0, L(loop_16_7bit) -+ addi.d a0, a0, -8 -+L(more_check): -+ nor t0, t2, a3 -+ -+ and t0, t1, t0 -+ bnez t0, L(count_pos) -+ addi.d a0, a0, 8 -+L(loop_16_8bit): -+ ld.d t2, a0, 0 -+ -+ sub.d t1, t2, a2 -+ nor t0, t2, a3 -+ and t0, t0, t1 -+ bnez t0, L(count_pos) -+ -+ ld.d t2, a0, 8 -+ addi.d a0, a0, 16 -+ sub.d t1, t2, a2 -+ nor t0, t2, a3 -+ -+ and t0, t0, t1 -+ beqz t0, L(loop_16_8bit) -+ addi.d a0, a0, -8 -+L(count_pos): -+ ctz.d t1, t0 -+ sub.d a0, a0, a1 -+ -+ srli.d t1, t1, 3 -+ add.d a0, a0, t1 -+ jr ra -+ -+END(STRLEN) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def (STRLEN) -+#endif -diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S -new file mode 100644 -index 00000000..dcb15350 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/strncmp.S -@@ -0,0 +1,257 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRNCMP -+#define STRNCMP strncmp -+#endif -+ -+/* int strncmp (const char *s1, const char *s2); */ -+ -+LEAF(STRNCMP) -+ .align 6 -+ beqz a2, L(ret0) -+ xor a4, a0, a1 -+ lu12i.w t5, 0x01010 -+ lu12i.w t6, 0x7f7f7 -+ -+ andi a3, a0, 0x7 -+ ori t5, t5, 0x101 -+ andi a4, a4, 0x7 -+ ori t6, t6, 0xf7f -+ -+ bstrins.d t5, t5, 63, 32 -+ bstrins.d t6, t6, 63, 32 -+ -+ bnez a4, L(unalign) -+ bnez a3, L(mutual_align) -+ -+L(a_loop): -+ ld.d t0, a0, 0 -+ ld.d t1, a1, 0 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ -+ -+ sltui t7, a2, 9 -+ -+L(start_realign): -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ xor t4, t0, t1 -+ -+ and t2, t2, t3 -+ addi.d a2, a2, -8 -+ -+ or t2, t2, t4 -+ or t3, t2, t7 -+ beqz t3, L(a_loop) -+ -+L(end): -+ bge zero, t7, L(out) -+ andi t4, a2, 7 -+ li.d t3, -1 -+ addi.d t4, t4, -1 -+ slli.d t4, t4, 3 -+ sll.d t3, t3, t4 -+ or t2, t2, t3 -+ -+ -+L(out): -+ ctz.d t3, t2 -+ bstrins.d t3, zero, 2, 0 -+ srl.d t0, t0, t3 -+ srl.d t1, t1, t3 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ sub.d a0, t0, t1 -+ jr ra -+ -+L(mutual_align): -+ bstrins.d a0, zero, 2, 0 -+ bstrins.d a1, zero, 2, 0 -+ slli.d a5, a3, 0x3 -+ li.d t2, -1 -+ -+ ld.d t0, a0, 0 -+ ld.d t1, a1, 0 -+ -+ li.d t3, 9 -+ sll.d t2, t2, a5 -+ -+ sub.d t3, t3, a3 -+ addi.d a0, a0, 8 -+ -+ sltu t7, a2, t3 -+ addi.d a1, a1, 8 -+ -+ add.d a2, a2, a3 -+ orn t0, t0, t2 -+ orn t1, t1, t2 -+ b L(start_realign) -+ -+L(ret0): -+ move a0, zero -+ jr ra -+ -+L(unalign): -+ li.d t8, 8 -+ blt a2, t8, L(short_cmp) -+ -+ # swap a0 and a1 in case a3 > a4 -+ andi a4, a1, 0x7 -+ sltu t8, a4, a3 -+ xor a6, a0, a1 -+ maskeqz a6, a6, t8 -+ xor a0, a0, a6 -+ xor a1, a1, a6 -+ -+ andi a3, a0, 0x7 -+ andi a4, a1, 0x7 -+ -+ bstrins.d a0, zero, 2, 0 -+ bstrins.d a1, zero, 2, 0 -+ -+ li.d t2, -1 -+ li.d t3, 9 -+ -+ ld.d t0, a0, 0 -+ ld.d t1, a1, 0 -+ -+ sub.d t3, t3, a4 -+ sub.d a3, a4, a3 -+ -+ slli.d t4, a4, 3 -+ slli.d a6, a3, 3 -+ -+ 
sub.d a5, zero, a6 -+ sltu t7, a2, t3 -+ -+ rotr.d a7, t0, a5 -+ sll.d t4, t2, t4 # mask for first num -+ -+ add.d a2, a2, a4 -+ sll.d a4, t2, a6 # mask for a7 -+ -+ orn t0, a7, t4 -+ orn t1, t1, t4 -+ -+ sub.d t2, t0, t5 -+ nor t4, t0, t6 -+ and t2, t2, t4 -+ -+ xor t3, t0, t1 -+ or t2, t2, t3 -+ -+ or t3, t2, t7 -+ bnez t3, L(un_end) -+ -+ andn a7, a7, a4 -+ addi.d a3, a3, 1 -+ -+L(un_loop): -+ addi.d a2, a2, -8 -+ # in case remaining part has '\0', no more load instructions should be executed on a0 address -+ or t0, a7, a4 -+ sltu t7, a2, a3 -+ -+ sub.d t2, t0, t5 -+ nor t3, t0, t6 -+ and t2, t2, t3 -+ -+ or t3, t2, t7 -+ bnez t3, L(check_remaining) -+ -+ ld.d t7, a0, 8 -+ ld.d t1, a1, 8 -+ addi.d a0, a0, 8 -+ addi.d a1, a1, 8 -+ -+ sll.d t4, t7, a6 -+ sub.d t2, t1, t5 -+ nor t3, t1, t6 -+ -+ or t0, t4, a7 -+ srl.d a7, t7, a5 -+ -+ and t2, t2, t3 -+ xor t3, t0, t1 -+ -+ sltui t7, a2, 9 -+ or t2, t2, t3 -+ -+ or t3, t2, t7 -+ beqz t3, L(un_loop) -+ b L(un_end) -+ -+L(check_remaining): -+ ld.d t1, a1, 8 -+ xor t3, t1, a7 -+ or t2, t2, t3 -+ -+L(un_end): -+ bge zero, t7, L(un_out) -+ andi t4, a2, 7 -+ li.d t3, -1 -+ -+ addi.d t4, t4, -1 -+ slli.d t4, t4, 3 -+ sll.d t3, t3, t4 -+ or t2, t2, t3 -+ -+L(un_out): -+ ctz.d t3, t2 -+ bstrins.d t3, zero, 2, 0 -+ srl.d t0, t0, t3 -+ srl.d t1, t1, t3 -+ -+ andi t0, t0, 0xff -+ andi t1, t1, 0xff -+ -+ sub.d a4, t0, t1 -+ sub.d a5, t1, t0 -+ -+ maskeqz a6, a5, t8 -+ masknez a0, a4, t8 -+ -+ or a0, a0, a6 -+ jr ra -+ -+L(short_cmp): -+ ld.bu t0, a0, 0 -+ ld.bu t1, a1, 0 -+ addi.d a2, a2, -1 -+ -+ xor t2, t0, t1 -+ masknez t2, t0, t2 -+ maskeqz t2, a2, t2 -+ -+ beqz t2, L(short_out) -+ -+ ld.bu t0, a0, 1 -+ ld.bu t1, a1, 1 -+ -+ addi.d a2, a2, -1 -+ addi.d a0, a0, 2 -+ -+ addi.d a1, a1, 2 -+ xor t2, t0, t1 -+ masknez t2, t0, t2 -+ maskeqz t2, a2, t2 -+ -+ bnez t2, L(short_cmp) -+ -+L(short_out): -+ sub.d a0, t0, t1 -+ jr ra -+ -+END(STRNCMP) -+#ifdef _LIBC -+libc_hidden_builtin_def (STRNCMP) -+#endif -diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S -new file mode 100644 -index 00000000..0517e206 ---- /dev/null -+++ b/sysdeps/loongarch/lp64/strnlen.S -@@ -0,0 +1,83 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRNLEN -+#define STRNLEN __strnlen -+#endif -+ -+#. before every load, a1(t5) must > 0; -+#. first load with t1 != 0, need to adjust t5; -+#. 
return the less one of both strlen(s) and a1; -+ -+LEAF(STRNLEN) -+ .align 6 -+ beqz a1, L(out) -+ lu12i.w a2, 0x01010 -+ andi t1, a0, 0x7 -+ move t4, a0 -+ -+ bstrins.d a0, zero, 2, 0 -+ ori a2, a2, 0x101 -+ li.w t0, -1 -+ ld.d t2, a0, 0 -+ -+ slli.d t3, t1, 3 -+ bstrins.d a2, a2, 63, 32 -+ li.w t5, 8 -+ slli.d a3, a2, 7 -+ -+ sub.w t1, t5, t1 -+ sll.d t0, t0, t3 -+ nor a3, zero, a3 -+ orn t2, t2, t0 -+ -+ -+ sub.d t0, t2, a2 -+ nor t3, t2, a3 -+ and t0, t0, t3 -+ bnez t0, L(count_pos) -+ -+ sub.d t5, a1, t1 -+ bgeu t1, a1, L(out) -+L(loop_8bytes): -+ ld.d t2, a0, 8 -+ addi.d a0, a0, 8 -+ -+ sub.d t0, t2, a2 -+ nor t1, t2, a3 -+ sltui t6, t5, 9 -+ and t0, t0, t1 -+ -+ addi.d t5, t5, -8 -+ or t7, t0, t6 -+ beqz t7, L(loop_8bytes) -+L(count_pos): -+ ctz.d t1, t0 -+ -+ -+ sub.d a0, a0, t4 -+ srli.d t1, t1, 3 -+ add.d a0, t1, a0 -+ sltu t0, a0, a1 -+ -+ masknez t1, a1, t0 -+ maskeqz a0, a0, t0 -+ or a0, a0, t1 -+ jr ra -+ -+L(out): -+ move a0, a1 -+ jr ra -+ -+END(STRNLEN) -+ -+#ifdef _LIBC -+weak_alias (STRNLEN, strnlen) -+libc_hidden_builtin_def (STRNLEN) -+#endif -diff --git a/sysdeps/loongarch/lp64/strrchr.S b/sysdeps/loongarch/lp64/strrchr.S -new file mode 100644 -index 00000000..3bf92ecd ---- /dev/null -+++ b/sysdeps/loongarch/lp64/strrchr.S -@@ -0,0 +1,106 @@ -+#ifdef _LIBC -+#include -+#include -+#include -+#else -+#include -+#include -+#endif -+ -+#ifndef STRRCHR_NAME -+#define STRRCHR_NAME strrchr -+#endif -+ -+LEAF(STRRCHR_NAME) -+ .align 6 -+ slli.d t1, a0, 3 -+ bstrins.d a0, zero, 2, 0 -+ lu12i.w a2, 0x01010 -+ ld.d t2, a0, 0 // t2 = "5ZZ21abc" -+ -+ ori a2, a2, 0x101 -+ andi a1, a1, 0xff // a1 = "0000000Z" -+ li.d a5, -1 -+ bstrins.d a2, a2, 63, 32 // a2 = 0x0101010101010101 -+ -+ sll.d t1, a5, t1 // t1 = 0xffffffffff000000 -+ mul.d a1, a1, a2 // a1 = "ZZZZZZZZ" -+ orn t2, t2, t1 // t2 = "5ZZ21YYY" -+ slli.d a3, a2, 7 // a3 = 0x8080808080808080 -+ -+ sub.d a4, t2, a2 -+ andn t0, a3, t2 -+ move t3, zero -+ and t0, a4, t0 -+ -+ -+ xor a4, t2, a1 -+ move t5, zero -+ orn a4, a4, t1 -+ bnez t0, L(found_end) -+ -+ sub.d t1, a4, a2 -+ andn t0, a3, a4 -+ and t1, t1, t0 -+ -+L(loop_8bytes): -+ masknez t4, t3, t1 -+ -+ maskeqz t3, t2, t1 -+ ld.d t2, a0, 8 -+ masknez t0, t5, t1 -+ maskeqz t5, a0, t1 -+ -+ or t3, t3, t4 -+ or t5, t0, t5 -+ sub.d t0, t2, a2 -+ andn t1, a3, t2 -+ -+ -+ xor a4, t2, a1 -+ and t0, t0, t1 //t0 hold diff pattern for '\0' -+ sub.d t1, a4, a2 -+ andn t4, a3, a4 -+ -+ and t1, t1, t4 //t1 hold diff pattern for 'a1' -+ addi.d a0, a0, 8 -+ beqz t0, L(loop_8bytes) //ok, neither \0 nor found -+L(found_end): -+ ctz.d t1, t0 -+ -+ xor t3, t3, a1 -+ orn t1, zero, t1 -+ revb.d t3, t3 -+ srl.d t1, a5, t1 // mask for '\0' -+ -+ sub.d t4, t3, a2 -+ orn a4, a4, t1 -+ andn t3, a3, t3 -+ revb.d t2, a4 -+ -+ sub.d t0, t2, a2 -+ andn t1, a3, t2 -+ and t3, t3, t4 -+ and t1, t0, t1 -+ -+ li.d t7, 7 -+ masknez t4, t3, t1 -+ maskeqz t3, t1, t1 -+ masknez t5, t5, t1 -+ -+ or t3, t3, t4 -+ maskeqz t6, a0, t1 -+ ctz.d t0, t3 -+ or t5, t6, t5 -+ -+ srli.d t0, t0, 3 -+ sub.d t0, t7, t0 -+ add.d a0, t5, t0 -+ maskeqz a0, a0, t3 -+ -+ jr ra -+END(STRRCHR_NAME) -+ -+#ifdef _LIBC -+libc_hidden_builtin_def(STRRCHR_NAME) -+#endif -diff --git a/sysdeps/loongarch/lstat.c b/sysdeps/loongarch/lstat.c -new file mode 100644 -index 00000000..f47a56af ---- /dev/null -+++ b/sysdeps/loongarch/lstat.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/lstat64.c b/sysdeps/loongarch/lstat64.c -new file mode 100644 -index 00000000..d6811656 ---- /dev/null -+++ b/sysdeps/loongarch/lstat64.c -@@ -0,0 +1 @@ 
-+#include -diff --git a/sysdeps/loongarch/machine-gmon.h b/sysdeps/loongarch/machine-gmon.h -new file mode 100644 -index 00000000..0b49082d ---- /dev/null -+++ b/sysdeps/loongarch/machine-gmon.h -@@ -0,0 +1,37 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* Accept 'frompc' address as argument from the function that calls -+ _mcount for profiling. Use __builtin_return_address (0) -+ for the 'selfpc' address. */ -+ -+#include -+ -+static void mcount_internal (unsigned long int frompc, -+ unsigned long int selfpc); -+ -+#define _MCOUNT_DECL(frompc, selfpc) \ -+static inline void mcount_internal (unsigned long int frompc, \ -+unsigned long int selfpc) -+ -+#define MCOUNT \ -+void _mcount (void *frompc) \ -+{ \ -+ mcount_internal ((unsigned long int) frompc, \ -+ (unsigned long int) RETURN_ADDRESS (0)); \ -+} -diff --git a/sysdeps/loongarch/math_private.h b/sysdeps/loongarch/math_private.h -new file mode 100644 -index 00000000..140eef07 ---- /dev/null -+++ b/sysdeps/loongarch/math_private.h -@@ -0,0 +1,245 @@ -+/* Internal math stuff. LOONGARCH version. -+ Copyright (C) 2013-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef LOONGARCH_MATH_PRIVATE_H -+#define LOONGARCH_MATH_PRIVATE_H 1 -+ -+/* Inline functions to speed up the math library implementation. The -+ default versions of these routines are in generic/math_private.h -+ and call fesetround, feholdexcept, etc. These routines use inlined -+ code instead. */ -+ -+#ifdef __loongarch_hard_float -+ -+# include -+# include -+# include -+ -+# define _FPU_MASK_ALL (_FPU_MASK_V | _FPU_MASK_Z | _FPU_MASK_O \ -+ |_FPU_MASK_U | _FPU_MASK_I | FE_ALL_EXCEPT) -+ -+static __always_inline void -+libc_feholdexcept_loongarch (fenv_t *envp) -+{ -+ fpu_control_t cw; -+ -+ /* Save the current state. */ -+ _FPU_GETCW (cw); -+ envp->__fp_control_register = cw; -+ -+ /* Clear all exception enable bits and flags. 
*/ -+ cw &= ~(_FPU_MASK_ALL); -+ _FPU_SETCW (cw); -+} -+# define libc_feholdexcept libc_feholdexcept_loongarch -+# define libc_feholdexceptf libc_feholdexcept_loongarch -+# define libc_feholdexceptl libc_feholdexcept_loongarch -+ -+static __always_inline void -+libc_fesetround_loongarch (int round) -+{ -+ fpu_control_t cw; -+ -+ /* Get current state. */ -+ _FPU_GETCW (cw); -+ -+ /* Set rounding bits. */ -+ cw &= ~_FPU_RC_MASK; -+ cw |= round; -+ -+ /* Set new state. */ -+ _FPU_SETCW (cw); -+} -+# define libc_fesetround libc_fesetround_loongarch -+# define libc_fesetroundf libc_fesetround_loongarch -+# define libc_fesetroundl libc_fesetround_loongarch -+ -+static __always_inline void -+libc_feholdexcept_setround_loongarch (fenv_t *envp, int round) -+{ -+ fpu_control_t cw; -+ -+ /* Save the current state. */ -+ _FPU_GETCW (cw); -+ envp->__fp_control_register = cw; -+ -+ /* Clear all exception enable bits and flags. */ -+ cw &= ~(_FPU_MASK_ALL); -+ -+ /* Set rounding bits. */ -+ cw &= ~_FPU_RC_MASK; -+ cw |= round; -+ -+ /* Set new state. */ -+ _FPU_SETCW (cw); -+} -+# define libc_feholdexcept_setround libc_feholdexcept_setround_loongarch -+# define libc_feholdexcept_setroundf libc_feholdexcept_setround_loongarch -+# define libc_feholdexcept_setroundl libc_feholdexcept_setround_loongarch -+ -+# define libc_feholdsetround libc_feholdexcept_setround_loongarch -+# define libc_feholdsetroundf libc_feholdexcept_setround_loongarch -+# define libc_feholdsetroundl libc_feholdexcept_setround_loongarch -+ -+static __always_inline void -+libc_fesetenv_loongarch (fenv_t *envp) -+{ -+ fpu_control_t cw __attribute__ ((unused)); -+ -+ /* Read current state to flush fpu pipeline. */ -+ _FPU_GETCW (cw); -+ -+ _FPU_SETCW (envp->__fp_control_register); -+} -+# define libc_fesetenv libc_fesetenv_loongarch -+# define libc_fesetenvf libc_fesetenv_loongarch -+# define libc_fesetenvl libc_fesetenv_loongarch -+ -+static __always_inline int -+libc_feupdateenv_test_loongarch (fenv_t *envp, int excepts) -+{ -+ /* int ret = fetestexcept (excepts); feupdateenv (envp); return ret; */ -+ int cw, temp; -+ -+ /* Get current control word. */ -+ _FPU_GETCW (cw); -+ -+ /* Set flag bits (which are accumulative), and *also* set the -+ cause bits. The setting of the cause bits is what actually causes -+ the hardware to generate the exception, if the corresponding enable -+ bit is set as well. */ -+ temp = cw & FE_ALL_EXCEPT; -+ temp |= envp->__fp_control_register | (temp << CAUSE_SHIFT); -+ -+ /* Set new state. */ -+ _FPU_SETCW (temp); -+ -+ return cw & excepts & FE_ALL_EXCEPT; -+} -+# define libc_feupdateenv_test libc_feupdateenv_test_loongarch -+# define libc_feupdateenv_testf libc_feupdateenv_test_loongarch -+# define libc_feupdateenv_testl libc_feupdateenv_test_loongarch -+ -+static __always_inline void -+libc_feupdateenv_loongarch (fenv_t *envp) -+{ -+ libc_feupdateenv_test_loongarch (envp, 0); -+} -+# define libc_feupdateenv libc_feupdateenv_loongarch -+# define libc_feupdateenvf libc_feupdateenv_loongarch -+# define libc_feupdateenvl libc_feupdateenv_loongarch -+ -+# define libc_feresetround libc_feupdateenv_loongarch -+# define libc_feresetroundf libc_feupdateenv_loongarch -+# define libc_feresetroundl libc_feupdateenv_loongarch -+ -+static __always_inline int -+libc_fetestexcept_loongarch (int excepts) -+{ -+ int cw; -+ -+ /* Get current control word. 
*/ -+ _FPU_GETCW (cw); -+ -+ return cw & excepts & FE_ALL_EXCEPT; -+} -+# define libc_fetestexcept libc_fetestexcept_loongarch -+# define libc_fetestexceptf libc_fetestexcept_loongarch -+# define libc_fetestexceptl libc_fetestexcept_loongarch -+ -+/* Enable support for rounding mode context. */ -+# define HAVE_RM_CTX 1 -+ -+static __always_inline void -+libc_feholdexcept_setround_loongarch_ctx (struct rm_ctx *ctx, int round) -+{ -+ fpu_control_t old, new; -+ -+ /* Save the current state. */ -+ _FPU_GETCW (old); -+ ctx->env.__fp_control_register = old; -+ -+ /* Clear all exception enable bits and flags. */ -+ new = old & ~(_FPU_MASK_ALL); -+ -+ /* Set rounding bits. */ -+ new = (new & ~_FPU_RC_MASK) | round; -+ -+ if (__glibc_unlikely (new != old)) -+ { -+ _FPU_SETCW (new); -+ ctx->updated_status = true; -+ } -+ else -+ ctx->updated_status = false; -+} -+# define libc_feholdexcept_setround_ctx libc_feholdexcept_setround_loongarch_ctx -+# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_loongarch_ctx -+# define libc_feholdexcept_setroundl_ctx libc_feholdexcept_setround_loongarch_ctx -+ -+static __always_inline void -+libc_fesetenv_loongarch_ctx (struct rm_ctx *ctx) -+{ -+ libc_fesetenv_loongarch (&ctx->env); -+} -+# define libc_fesetenv_ctx libc_fesetenv_loongarch_ctx -+# define libc_fesetenvf_ctx libc_fesetenv_loongarch_ctx -+# define libc_fesetenvl_ctx libc_fesetenv_loongarch_ctx -+ -+static __always_inline void -+libc_feupdateenv_loongarch_ctx (struct rm_ctx *ctx) -+{ -+ if (__glibc_unlikely (ctx->updated_status)) -+ libc_feupdateenv_test_loongarch (&ctx->env, 0); -+} -+# define libc_feupdateenv_ctx libc_feupdateenv_loongarch_ctx -+# define libc_feupdateenvf_ctx libc_feupdateenv_loongarch_ctx -+# define libc_feupdateenvl_ctx libc_feupdateenv_loongarch_ctx -+# define libc_feresetround_ctx libc_feupdateenv_loongarch_ctx -+# define libc_feresetroundf_ctx libc_feupdateenv_loongarch_ctx -+# define libc_feresetroundl_ctx libc_feupdateenv_loongarch_ctx -+ -+static __always_inline void -+libc_feholdsetround_loongarch_ctx (struct rm_ctx *ctx, int round) -+{ -+ fpu_control_t old, new; -+ -+ /* Save the current state. */ -+ _FPU_GETCW (old); -+ ctx->env.__fp_control_register = old; -+ -+ /* Set rounding bits. */ -+ new = (old & ~_FPU_RC_MASK) | round; -+ -+ if (__glibc_unlikely (new != old)) -+ { -+ _FPU_SETCW (new); -+ ctx->updated_status = true; -+ } -+ else -+ ctx->updated_status = false; -+} -+# define libc_feholdsetround_ctx libc_feholdsetround_loongarch_ctx -+# define libc_feholdsetroundf_ctx libc_feholdsetround_loongarch_ctx -+# define libc_feholdsetroundl_ctx libc_feholdsetround_loongarch_ctx -+ -+#endif -+ -+#include_next -+ -+#endif -diff --git a/sysdeps/loongarch/memusage.h b/sysdeps/loongarch/memusage.h -new file mode 100644 -index 00000000..bdf99f8a ---- /dev/null -+++ b/sysdeps/loongarch/memusage.h -@@ -0,0 +1,21 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#define GETSP() ({ register uintptr_t stack_ptr asm ("$sp"); stack_ptr; }) -+ -+#include -diff --git a/sysdeps/loongarch/mknod.c b/sysdeps/loongarch/mknod.c -new file mode 100644 -index 00000000..1ed3681f ---- /dev/null -+++ b/sysdeps/loongarch/mknod.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/mknodat.c b/sysdeps/loongarch/mknodat.c -new file mode 100644 -index 00000000..82bc6ee6 ---- /dev/null -+++ b/sysdeps/loongarch/mknodat.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/nptl/Makefile b/sysdeps/loongarch/nptl/Makefile -new file mode 100644 -index 00000000..a1d5768a ---- /dev/null -+++ b/sysdeps/loongarch/nptl/Makefile -@@ -0,0 +1,26 @@ -+# Makefile for sysdeps/loongarch/nptl. -+# Copyright (C) 2005-2018 Free Software Foundation, Inc. -+# This file is part of the GNU C Library. -+# -+# The GNU C Library is free software; you can redistribute it and/or -+# modify it under the terms of the GNU Lesser General Public -+# License as published by the Free Software Foundation; either -+# version 2.1 of the License, or (at your option) any later version. -+# -+# The GNU C Library is distributed in the hope that it will be useful, -+# but WITHOUT ANY WARRANTY; without even the implied warranty of -+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+# Lesser General Public License for more details. -+# -+# You should have received a copy of the GNU Lesser General Public -+# License along with the GNU C Library; if not, see -+# . -+ -+ifeq ($(subdir),csu) -+gen-as-const-headers += tcb-offsets.sym -+endif -+ -+ifeq ($(subdir),nptl) -+libpthread-sysdep_routines += nptl-sysdep -+libpthread-shared-only-routines += nptl-sysdep -+endif -diff --git a/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h b/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h -new file mode 100644 -index 00000000..5a761355 ---- /dev/null -+++ b/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h -@@ -0,0 +1,68 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
*/ -+ -+#ifndef _BITS_PTHREADTYPES_ARCH_H -+#define _BITS_PTHREADTYPES_ARCH_H 1 -+ -+#include -+ -+#if __loongarch_xlen == 64 -+# define __SIZEOF_PTHREAD_ATTR_T 56 -+# define __SIZEOF_PTHREAD_MUTEX_T 40 -+# define __SIZEOF_PTHREAD_MUTEXATTR_T 4 -+# define __SIZEOF_PTHREAD_COND_T 48 -+# define __SIZEOF_PTHREAD_CONDATTR_T 4 -+# define __SIZEOF_PTHREAD_RWLOCK_T 56 -+# define __SIZEOF_PTHREAD_RWLOCKATTR_T 8 -+# define __SIZEOF_PTHREAD_BARRIER_T 32 -+# define __SIZEOF_PTHREAD_BARRIERATTR_T 4 -+#else -+# error "rv32i-based systems are not supported" -+#endif -+ -+#define __PTHREAD_COMPAT_PADDING_MID -+#define __PTHREAD_COMPAT_PADDING_END -+#define __PTHREAD_MUTEX_LOCK_ELISION 0 -+#define __PTHREAD_MUTEX_USE_UNION 0 -+#define __PTHREAD_MUTEX_NUSERS_AFTER_KIND 0 -+ -+#define __LOCK_ALIGNMENT -+#define __ONCE_ALIGNMENT -+ -+/* There is a lot of padding in this structure. While it's not strictly -+ necessary on LoongArch, we're going to leave it in to be on the safe side in -+ case it's needed in the future. Most other architectures have the padding, -+ so this gives us the same extensibility as everyone else has. */ -+struct __pthread_rwlock_arch_t -+{ -+ unsigned int __readers; -+ unsigned int __writers; -+ unsigned int __wrphase_futex; -+ unsigned int __writers_futex; -+ unsigned int __pad3; -+ unsigned int __pad4; -+ int __cur_writer; -+ int __shared; -+ unsigned long int __pad1; -+ unsigned long int __pad2; -+ unsigned int __flags; -+}; -+ -+#define __PTHREAD_RWLOCK_ELISION_EXTRA 0 -+ -+#endif /* bits/pthreadtypes.h */ -diff --git a/sysdeps/loongarch/nptl/bits/semaphore.h b/sysdeps/loongarch/nptl/bits/semaphore.h -new file mode 100644 -index 00000000..a9ddefb2 ---- /dev/null -+++ b/sysdeps/loongarch/nptl/bits/semaphore.h -@@ -0,0 +1,33 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _SEMAPHORE_H -+# error "Never use directly; include instead." -+#endif -+ -+#define __SIZEOF_SEM_T (4 * __SIZEOF_POINTER__) -+ -+/* Value returned if `sem_open' failed. */ -+#define SEM_FAILED ((sem_t *) 0) -+ -+ -+typedef union -+{ -+ char __size[__SIZEOF_SEM_T]; -+ long int __align; -+} sem_t; -diff --git a/sysdeps/loongarch/nptl/libc-lowlevellock.c b/sysdeps/loongarch/nptl/libc-lowlevellock.c -new file mode 100644 -index 00000000..9523fb46 ---- /dev/null -+++ b/sysdeps/loongarch/nptl/libc-lowlevellock.c -@@ -0,0 +1,8 @@ -+/* This kludge works around a libpthread static linking problem: -+ https://sourceware.org/bugzilla/show_bug.cgi?id=15648. 
*/ -+ -+#ifndef SHARED -+# define __lll_lock_wait_private weak_function __lll_lock_wait_private -+#endif -+ -+#include -diff --git a/sysdeps/loongarch/nptl/nptl-sysdep.S b/sysdeps/loongarch/nptl/nptl-sysdep.S -new file mode 100644 -index 00000000..3f5c2a36 ---- /dev/null -+++ b/sysdeps/loongarch/nptl/nptl-sysdep.S -@@ -0,0 +1,2 @@ -+/* Pull in __syscall_error. */ -+#include -diff --git a/sysdeps/loongarch/nptl/pthread-offsets.h b/sysdeps/loongarch/nptl/pthread-offsets.h -new file mode 100644 -index 00000000..04130879 ---- /dev/null -+++ b/sysdeps/loongarch/nptl/pthread-offsets.h -@@ -0,0 +1,23 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#define __PTHREAD_MUTEX_NUSERS_OFFSET 12 -+#define __PTHREAD_MUTEX_KIND_OFFSET 16 -+#define __PTHREAD_MUTEX_SPINS_OFFSET 20 -+#define __PTHREAD_MUTEX_ELISION_OFFSET 22 -+#define __PTHREAD_MUTEX_LIST_OFFSET 24 -diff --git a/sysdeps/loongarch/nptl/pthreaddef.h b/sysdeps/loongarch/nptl/pthreaddef.h -new file mode 100644 -index 00000000..87c407bc ---- /dev/null -+++ b/sysdeps/loongarch/nptl/pthreaddef.h -@@ -0,0 +1,32 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+/* Default stack size. */ -+#define ARCH_STACK_DEFAULT_SIZE (2 * 1024 * 1024) -+ -+/* Required stack pointer alignment at beginning. */ -+#define STACK_ALIGN 16 -+ -+/* Minimal stack size after allocating thread descriptor and guard size. */ -+#define MINIMAL_REST_STACK 2048 -+ -+/* Alignment requirement for TCB. */ -+#define TCB_ALIGNMENT 16 -+ -+/* Location of current stack frame. 
*/ -+#define CURRENT_STACK_FRAME __builtin_frame_address (0) -diff --git a/sysdeps/loongarch/nptl/tcb-offsets.sym b/sysdeps/loongarch/nptl/tcb-offsets.sym -new file mode 100644 -index 00000000..ab4981f2 ---- /dev/null -+++ b/sysdeps/loongarch/nptl/tcb-offsets.sym -@@ -0,0 +1,6 @@ -+#include -+#include -+ -+#define thread_offsetof(mem) (long)(offsetof (struct pthread, mem) - TLS_TCB_OFFSET - TLS_PRE_TCB_SIZE) -+ -+MULTIPLE_THREADS_OFFSET thread_offsetof (header.multiple_threads) -diff --git a/sysdeps/loongarch/nptl/tls.h b/sysdeps/loongarch/nptl/tls.h -new file mode 100644 -index 00000000..8d2d4ca2 ---- /dev/null -+++ b/sysdeps/loongarch/nptl/tls.h -@@ -0,0 +1,147 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LOONGARCH_TLS_H -+#define _LOONGARCH_TLS_H 1 -+ -+#include -+ -+#ifndef __ASSEMBLER__ -+# include -+# include -+# include -+# include -+ -+register void *__thread_self asm ("$tp"); /* FIXME */ -+# define READ_THREAD_POINTER() ({ __thread_self; }) -+ -+/* Get system call information. */ -+# include -+ -+/* The TP points to the start of the thread blocks. */ -+# define TLS_DTV_AT_TP 1 -+# define TLS_TCB_AT_TP 0 -+ -+/* Get the thread descriptor definition. */ -+# include -+ -+typedef struct -+{ -+ dtv_t *dtv; -+ void *private; -+} tcbhead_t; -+ -+/* This is the size of the initial TCB. Because our TCB is before the thread -+ pointer, we don't need this. */ -+# define TLS_INIT_TCB_SIZE 0 -+ -+/* Alignment requirements for the initial TCB. */ -+# define TLS_INIT_TCB_ALIGN __alignof__ (struct pthread) -+ -+/* This is the size of the TCB. Because our TCB is before the thread -+ pointer, we don't need this. */ -+# define TLS_TCB_SIZE 0 -+ -+/* Alignment requirements for the TCB. */ -+# define TLS_TCB_ALIGN __alignof__ (struct pthread) -+ -+/* This is the size we need before TCB - actually, it includes the TCB. */ -+# define TLS_PRE_TCB_SIZE \ -+ (sizeof (struct pthread) \ -+ + ((sizeof (tcbhead_t) + TLS_TCB_ALIGN - 1) & ~(TLS_TCB_ALIGN - 1))) -+ -+/* The thread pointer tp points to the end of the TCB. -+ The pthread_descr structure is immediately in front of the TCB. */ -+# define TLS_TCB_OFFSET 0 -+ -+/* Install the dtv pointer. The pointer passed is to the element with -+ index -1 which contain the length. */ -+# define INSTALL_DTV(tcbp, dtvp) \ -+ (((tcbhead_t *) (tcbp))[-1].dtv = (dtvp) + 1) -+ -+/* Install new dtv for current thread. */ -+# define INSTALL_NEW_DTV(dtv) \ -+ (THREAD_DTV() = (dtv)) -+ -+/* Return dtv of given thread descriptor. */ -+# define GET_DTV(tcbp) \ -+ (((tcbhead_t *) (tcbp))[-1].dtv) -+ -+/* Code to initially initialize the thread pointer. */ -+# define TLS_INIT_TP(tcbp) \ -+ ({ __thread_self = (char*)tcbp + TLS_TCB_OFFSET; NULL; }) -+ -+/* Return the address of the dtv for the current thread. 
*/ -+# define THREAD_DTV() \ -+ (((tcbhead_t *) (READ_THREAD_POINTER () - TLS_TCB_OFFSET))[-1].dtv) -+ -+/* Return the thread descriptor for the current thread. */ -+# define THREAD_SELF \ -+ ((struct pthread *) (READ_THREAD_POINTER () \ -+ - TLS_TCB_OFFSET - TLS_PRE_TCB_SIZE)) -+ -+/* Value passed to 'clone' for initialization of the thread register. */ -+# define TLS_DEFINE_INIT_TP(tp, pd) \ -+ void *tp = (void *) (pd) + TLS_TCB_OFFSET + TLS_PRE_TCB_SIZE -+ -+/* Informs libthread_db that the thread pointer is register 2, which is used -+ * to know how to do THREAD_SELF. */ -+# define DB_THREAD_SELF \ -+ REGISTER (64, 64, 2 * 8, - TLS_TCB_OFFSET - TLS_PRE_TCB_SIZE) -+ -+/* Access to data in the thread descriptor is easy. */ -+# define THREAD_GETMEM(descr, member) \ -+ descr->member -+# define THREAD_GETMEM_NC(descr, member, idx) \ -+ descr->member[idx] -+# define THREAD_SETMEM(descr, member, value) \ -+ descr->member = (value) -+# define THREAD_SETMEM_NC(descr, member, idx, value) \ -+ descr->member[idx] = (value) -+ -+/* l_tls_offset == 0 is perfectly valid, so we have to use some different -+ value to mean unset l_tls_offset. */ -+# define NO_TLS_OFFSET -1 -+ -+/* Get and set the global scope generation counter in struct pthread. */ -+# define THREAD_GSCOPE_IN_TCB 1 -+# define THREAD_GSCOPE_FLAG_UNUSED 0 -+# define THREAD_GSCOPE_FLAG_USED 1 -+# define THREAD_GSCOPE_FLAG_WAIT 2 -+# define THREAD_GSCOPE_RESET_FLAG() \ -+ do \ -+ { int __res \ -+ = atomic_exchange_rel (&THREAD_SELF->header.gscope_flag, \ -+ THREAD_GSCOPE_FLAG_UNUSED); \ -+ if (__res == THREAD_GSCOPE_FLAG_WAIT) \ -+ lll_futex_wake (&THREAD_SELF->header.gscope_flag, 1, LLL_PRIVATE); \ -+ } \ -+ while (0) -+# define THREAD_GSCOPE_SET_FLAG() \ -+ do \ -+ { \ -+ THREAD_SELF->header.gscope_flag = THREAD_GSCOPE_FLAG_USED; \ -+ atomic_write_barrier (); \ -+ } \ -+ while (0) -+# define THREAD_GSCOPE_WAIT() \ -+ GL(dl_wait_lookup_done) () -+ -+#endif /* __ASSEMBLER__ */ -+ -+#endif /* tls.h */ -diff --git a/sysdeps/loongarch/preconfigure b/sysdeps/loongarch/preconfigure -new file mode 100644 -index 00000000..26ffe884 ---- /dev/null -+++ b/sysdeps/loongarch/preconfigure -@@ -0,0 +1,9 @@ -+case "$machine" in -+loongarch*) -+ base_machine=loongarch -+ machine=loongarch/lp64 -+ ;; -+esac -+ -+#TODO: this file is useless now. -+#Maybe we can make use of it to get arch info from GCC to set env -diff --git a/sysdeps/loongarch/pthread_atfork.c b/sysdeps/loongarch/pthread_atfork.c -new file mode 100644 -index 00000000..0f01d805 ---- /dev/null -+++ b/sysdeps/loongarch/pthread_atfork.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/setjmp.S b/sysdeps/loongarch/setjmp.S -new file mode 100644 -index 00000000..da09a93c ---- /dev/null -+++ b/sysdeps/loongarch/setjmp.S -@@ -0,0 +1,62 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
*/ -+ -+#include -+#include -+ -+ENTRY (_setjmp) -+ li.w a1,0 -+ b __sigsetjmp -+END (_setjmp) -+ENTRY (setjmp) -+ li.w a1,1 -+END (setjmp) -+ENTRY (__sigsetjmp) -+ REG_S ra, a0, 0*SZREG -+ REG_S sp, a0, 1*SZREG -+ REG_S x, a0, 2*SZREG -+ REG_S fp, a0, 3*SZREG -+ REG_S s0, a0, 4*SZREG -+ REG_S s1, a0, 5*SZREG -+ REG_S s2, a0, 6*SZREG -+ REG_S s3, a0, 7*SZREG -+ REG_S s4, a0, 8*SZREG -+ REG_S s5, a0, 9*SZREG -+ REG_S s6, a0, 10*SZREG -+ REG_S s7, a0, 11*SZREG -+ REG_S s8, a0, 12*SZREG -+ -+ FREG_S $f24, a0, 13*SZREG + 0*SZFREG -+ FREG_S $f25, a0, 13*SZREG + 1*SZFREG -+ FREG_S $f26, a0, 13*SZREG + 2*SZFREG -+ FREG_S $f27, a0, 13*SZREG + 3*SZFREG -+ FREG_S $f28, a0, 13*SZREG + 4*SZFREG -+ FREG_S $f29, a0, 13*SZREG + 5*SZFREG -+ FREG_S $f30, a0, 13*SZREG + 6*SZFREG -+ FREG_S $f31, a0, 13*SZREG + 7*SZFREG -+ -+#if !IS_IN (libc) && IS_IN(rtld) -+ li.w v0, 0 -+ jirl zero,ra,0 -+#else -+ b __sigjmp_save -+#endif -+END (__sigsetjmp) -+ -+hidden_def (__sigsetjmp) -+weak_alias (_setjmp, __GI__setjmp) -diff --git a/sysdeps/loongarch/sfp-machine.h b/sysdeps/loongarch/sfp-machine.h -new file mode 100644 -index 00000000..b5c79bc0 ---- /dev/null -+++ b/sysdeps/loongarch/sfp-machine.h -@@ -0,0 +1,79 @@ -+#include -+#include -+ -+#define _FP_W_TYPE_SIZE 64 -+#define _FP_W_TYPE unsigned long long -+#define _FP_WS_TYPE signed long long -+#define _FP_I_TYPE long long -+ -+#define _FP_MUL_MEAT_S(R,X,Y) \ -+ _FP_MUL_MEAT_1_imm(_FP_WFRACBITS_S,R,X,Y) -+#define _FP_MUL_MEAT_D(R,X,Y) \ -+ _FP_MUL_MEAT_1_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm) -+#define _FP_MUL_MEAT_Q(R,X,Y) \ -+ _FP_MUL_MEAT_2_wide_3mul(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm) -+ -+#define _FP_MUL_MEAT_DW_S(R,X,Y) \ -+ _FP_MUL_MEAT_DW_1_imm(_FP_WFRACBITS_S,R,X,Y) -+#define _FP_MUL_MEAT_DW_D(R,X,Y) \ -+ _FP_MUL_MEAT_DW_1_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm) -+#define _FP_MUL_MEAT_DW_Q(R,X,Y) \ -+ _FP_MUL_MEAT_DW_2_wide_3mul(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm) -+ -+#define _FP_DIV_MEAT_S(R,X,Y) _FP_DIV_MEAT_1_imm(S,R,X,Y,_FP_DIV_HELP_imm) -+#define _FP_DIV_MEAT_D(R,X,Y) _FP_DIV_MEAT_1_udiv_norm(D,R,X,Y) -+#define _FP_DIV_MEAT_Q(R,X,Y) _FP_DIV_MEAT_2_udiv(Q,R,X,Y) -+ -+# define _FP_NANFRAC_S _FP_QNANBIT_S -+# define _FP_NANFRAC_D _FP_QNANBIT_D -+# define _FP_NANFRAC_Q _FP_QNANBIT_Q, 0 -+ -+#define _FP_NANSIGN_S 0 -+#define _FP_NANSIGN_D 0 -+#define _FP_NANSIGN_Q 0 -+ -+#define _FP_KEEPNANFRACP 1 -+#define _FP_QNANNEGATEDP 0 -+ -+/* NaN payloads should be preserved for NAN2008. 
*/ -+# define _FP_CHOOSENAN(fs, wc, R, X, Y, OP) \ -+ do \ -+ { \ -+ R##_s = X##_s; \ -+ _FP_FRAC_COPY_##wc (R, X); \ -+ R##_c = FP_CLS_NAN; \ -+ } \ -+ while (0) -+ -+#define _FP_DECL_EX fpu_control_t _fcw -+ -+#define FP_ROUNDMODE (_fcw & 0x300) -+ -+#define FP_RND_NEAREST FE_TONEAREST -+#define FP_RND_ZERO FE_TOWARDZERO -+#define FP_RND_PINF FE_UPWARD -+#define FP_RND_MINF FE_DOWNWARD -+ -+#define FP_EX_INVALID FE_INVALID -+#define FP_EX_OVERFLOW FE_OVERFLOW -+#define FP_EX_UNDERFLOW FE_UNDERFLOW -+#define FP_EX_DIVZERO FE_DIVBYZERO -+#define FP_EX_INEXACT FE_INEXACT -+ -+#define _FP_TININESS_AFTER_ROUNDING 1 -+ -+#ifdef __loongarch_hard_float -+#define FP_INIT_ROUNDMODE \ -+do { \ -+ _FPU_GETCW (_fcw); \ -+} while (0) -+ -+#define FP_HANDLE_EXCEPTIONS \ -+do { \ -+ if (__builtin_expect (_fex, 0)) \ -+ _FPU_SETCW (_fcw | _fex | (_fex << 8)); \ -+} while (0) -+#define FP_TRAPPING_EXCEPTIONS ((_fcw << 16) & 0x1f0000) -+#else -+#define FP_INIT_ROUNDMODE _fcw = FP_RND_NEAREST -+#endif -diff --git a/sysdeps/loongarch/sotruss-lib.c b/sysdeps/loongarch/sotruss-lib.c -new file mode 100644 -index 00000000..124db440 ---- /dev/null -+++ b/sysdeps/loongarch/sotruss-lib.c -@@ -0,0 +1,51 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#define HAVE_ARCH_PLTENTER -+#define HAVE_ARCH_PLTEXIT -+ -+#include -+ -+ElfW(Addr) -+la_loongarch_gnu_pltenter (ElfW(Sym) *sym __attribute__ ((unused)), -+ unsigned int ndx __attribute__ ((unused)), -+ uintptr_t *refcook, uintptr_t *defcook, -+ La_loongarch_regs *regs, unsigned int *flags, -+ const char *symname, long int *framesizep) -+{ -+ print_enter (refcook, defcook, symname, -+ regs->lr_reg[0], regs->lr_reg[1], regs->lr_reg[2], -+ *flags); -+ -+ /* No need to copy anything, we will not need the parameters in any case. */ -+ *framesizep = 0; -+ -+ return sym->st_value; -+} -+ -+unsigned int -+la_loongarch_gnu_pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, -+ uintptr_t *defcook, -+ const struct La_loongarch_regs *inregs, -+ struct La_loongarch_retval *outregs, -+ const char *symname) -+{ -+ print_exit (refcook, defcook, symname, outregs->lrv_a0); -+ -+ return 0; -+} -diff --git a/sysdeps/loongarch/stack_chk_fail_local.c b/sysdeps/loongarch/stack_chk_fail_local.c -new file mode 100644 -index 00000000..305871fb ---- /dev/null -+++ b/sysdeps/loongarch/stack_chk_fail_local.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/stackinfo.h b/sysdeps/loongarch/stackinfo.h -new file mode 100644 -index 00000000..5f5e6ad1 ---- /dev/null -+++ b/sysdeps/loongarch/stackinfo.h -@@ -0,0 +1,33 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+/* This file contains a bit of information about the stack allocation -+ of the processor. */ -+ -+#ifndef _STACKINFO_H -+#define _STACKINFO_H 1 -+ -+#include -+ -+/* On LoongArch the stack grows down. */ -+#define _STACK_GROWS_DOWN 1 -+ -+/* Default to a non-executable stack. */ -+#define DEFAULT_STACK_PERMS (PF_R | PF_W) -+ -+#endif /* stackinfo.h */ -diff --git a/sysdeps/loongarch/start.S b/sysdeps/loongarch/start.S -new file mode 100644 -index 00000000..cf0a14b5 ---- /dev/null -+++ b/sysdeps/loongarch/start.S -@@ -0,0 +1,51 @@ -+#define __ASSEMBLY__ 1 -+#include -+#include -+ -+/* The entry point's job is to call __libc_start_main. Per the ABI, -+ a0 contains the address of a function to be passed to atexit. -+ __libc_start_main wants this in a5. */ -+ -+/* -+int -+__libc_start_main (int (*main) (int, char **, char **), -+ int argc, -+ char **argv, -+ __typeof (main) init, -+ void (*fini) (void), -+ void (*rtld_fini) (void), -+ void *stack_end); -+*/ -+ -+ENTRY (ENTRY_POINT) -+ /* Terminate call stack by noting ra is undefined. Use a dummy -+ .cfi_label to force starting the FDE. */ -+ .cfi_label .Ldummy -+ cfi_undefined (1) -+ or a5, a0, zero /* rtld_fini */ -+ -+ /* 这个main必须要走GOT表拿到。因为main不一定是local的。 -+ 比如googletest就把main定义在动态库里了。 */ -+ la.got a0, t0, main -+#ifdef __loongarch64 -+ ld.d a1, sp, 0 -+ addi.d a2, sp, SZREG -+#elif defined __loongarch32 -+ ld.w a1, sp, 0 -+ addi.w a2, sp, SZREG -+#endif -+ /* Adjust $sp for 16-aligned */ -+ srli.d sp, sp, 4 -+ slli.d sp, sp, 4 -+ -+ la.got a3, t0, __libc_csu_init -+ la.got a4, t0, __libc_csu_fini -+ or a6, sp, zero /* stack_end. */ -+ -+ la.got ra, t0, __libc_start_main -+ jirl ra, ra, 0 -+ -+ la.got ra, t0, abort -+ jirl ra, ra, 0 -+END (ENTRY_POINT) -+ -diff --git a/sysdeps/loongarch/stat.c b/sysdeps/loongarch/stat.c -new file mode 100644 -index 00000000..36461b87 ---- /dev/null -+++ b/sysdeps/loongarch/stat.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/stat64.c b/sysdeps/loongarch/stat64.c -new file mode 100644 -index 00000000..0897282e ---- /dev/null -+++ b/sysdeps/loongarch/stat64.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h -new file mode 100644 -index 00000000..f64bfb2b ---- /dev/null -+++ b/sysdeps/loongarch/sys/asm.h -@@ -0,0 +1,58 @@ -+#ifndef _SYS_ASM_H -+#define _SYS_ASM_H -+ -+#include -+#include -+ -+/* Macros to handle different pointer/register sizes for 32/64-bit code. 
*/ -+#ifdef __loongarch64 -+# define PTRLOG 3 -+# define SZREG 8 -+# define SZFREG 8 -+# define REG_L ld.d -+# define REG_S st.d -+# define FREG_L fld.d -+# define FREG_S fst.d -+#elif defined __loongarch32 -+# define PTRLOG 2 -+# define SZREG 4 -+# define SZFREG 4 -+# define REG_L ld.w -+# define REG_S st.w -+# define FREG_L fld.w -+# define FREG_S fst.w -+#else -+# error __loongarch_xlen must equal 32 or 64 -+#endif -+ -+ -+/* Declare leaf routine. */ -+#define LEAF(symbol) \ -+ .text; \ -+ .globl symbol; \ -+ .align 3; \ -+ .type symbol, @function; \ -+symbol: \ -+ cfi_startproc; \ -+ -+# define ENTRY(symbol) LEAF(symbol) -+ -+#define LEAF_NO_ALIGN(symbol) \ -+ .text; \ -+ .globl symbol; \ -+ .type symbol, @function; \ -+symbol: \ -+ cfi_startproc; -+ -+# define ENTRY_NO_ALIGN(symbol) LEAF_NO_ALIGN(symbol) -+ -+/* Mark end of function. */ -+#undef END -+#define END(function) \ -+ cfi_endproc ; \ -+ .size function,.-function; -+ -+/* Stack alignment. */ -+#define ALMASK ~15 -+ -+#endif /* sys/asm.h */ -diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h -new file mode 100644 -index 00000000..769784b8 ---- /dev/null -+++ b/sysdeps/loongarch/sys/regdef.h -@@ -0,0 +1,83 @@ -+#ifndef _SYS_REGDEF_H -+#define _SYS_REGDEF_H -+ -+#if _LOONGARCH_SIM == _ABILP64 -+# define zero $r0 -+# define ra $r1 -+# define tp $r2 -+# define sp $r3 -+# define a0 $r4 -+# define a1 $r5 -+# define a2 $r6 -+# define a3 $r7 -+# define a4 $r8 -+# define a5 $r9 -+# define a6 $r10 -+# define a7 $r11 -+# define v0 $r4 -+# define v1 $r5 -+# define t0 $r12 -+# define t1 $r13 -+# define t2 $r14 -+# define t3 $r15 -+# define t4 $r16 -+# define t5 $r17 -+# define t6 $r18 -+# define t7 $r19 -+# define t8 $r20 -+# define x $r21 -+# define fp $r22 -+# define s0 $r23 -+# define s1 $r24 -+# define s2 $r25 -+# define s3 $r26 -+# define s4 $r27 -+# define s5 $r28 -+# define s6 $r29 -+# define s7 $r30 -+# define s8 $r31 -+ -+# define fa0 $f0 -+# define fa1 $f1 -+# define fa2 $f2 -+# define fa3 $f3 -+# define fa4 $f4 -+# define fa5 $f5 -+# define fa6 $f6 -+# define fa7 $f7 -+# define fv0 $f0 -+# define fv1 $f1 -+# define ft0 $f8 -+# define ft1 $f9 -+# define ft2 $f10 -+# define ft3 $f11 -+# define ft4 $f12 -+# define ft5 $f13 -+# define ft6 $f14 -+# define ft7 $f15 -+# define ft8 $f16 -+# define ft9 $f17 -+# define ft10 $f18 -+# define ft11 $f19 -+# define ft12 $f20 -+# define ft13 $f21 -+# define ft14 $f22 -+# define ft15 $f23 -+# define fs0 $f24 -+# define fs1 $f25 -+# define fs2 $f26 -+# define fs3 $f27 -+# define fs4 $f28 -+# define fs5 $f29 -+# define fs6 $f30 -+# define fs7 $f31 -+ -+#elif _LOONGARCH_SIM == _ABILPX32 -+# error ABILPX32 -+#elif _LOONGARCH_SIM == _ABILP32 -+# error ABILP32 -+#else -+# error noABI -+#endif -+ -+#endif /* _SYS_REGDEF_H */ -diff --git a/sysdeps/loongarch/tininess.h b/sysdeps/loongarch/tininess.h -new file mode 100644 -index 00000000..1db37790 ---- /dev/null -+++ b/sysdeps/loongarch/tininess.h -@@ -0,0 +1 @@ -+#define TININESS_AFTER_ROUNDING 1 -diff --git a/sysdeps/loongarch/tls-macros.h b/sysdeps/loongarch/tls-macros.h -new file mode 100644 -index 00000000..f0ad55ac ---- /dev/null -+++ b/sysdeps/loongarch/tls-macros.h -@@ -0,0 +1,46 @@ -+/* Macros to support TLS testing in times of missing compiler support. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+ -+#include -+#include -+#include -+#include "dl-tls.h" -+ -+#define TLS_GD(x) \ -+ ({ void *__result; \ -+ asm ("la.tls.gd %0, " #x "\n\t" \ -+ : "=r" (__result)); \ -+ __tls_get_addr (__result); }) -+ -+#define TLS_LD(x) TLS_GD(x) -+ -+#define TLS_IE(x) \ -+ ({ void *__result; \ -+ asm ("la.tls.ie %0, " #x "\n\t" \ -+ "add.d %0, %0, $tp\n\t" \ -+ : "=r" (__result)); \ -+ __result; }) -+ -+#define TLS_LE(x) \ -+ ({ void *__result; \ -+ asm ("la.tls.le %0, " #x "\n\t" \ -+ "add.d %0, %0, $tp\n\t" \ -+ : "=r" (__result)); \ -+ __result; }) -diff --git a/sysdeps/loongarch/tst-audit.h b/sysdeps/loongarch/tst-audit.h -new file mode 100644 -index 00000000..d8d260eb ---- /dev/null -+++ b/sysdeps/loongarch/tst-audit.h -@@ -0,0 +1,23 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#define pltenter la_loongarch_gnu_pltenter -+#define pltexit la_loongarch_gnu_pltexit -+#define La_regs La_loongarch_regs -+#define La_retval La_loongarch_retval -+#define int_retval lrv_a0 -diff --git a/sysdeps/loongarch/warning-nop.c b/sysdeps/loongarch/warning-nop.c -new file mode 100644 -index 00000000..b76aae79 ---- /dev/null -+++ b/sysdeps/loongarch/warning-nop.c -@@ -0,0 +1 @@ -+#include -diff --git a/sysdeps/unix/sysv/linux/loongarch/Implies b/sysdeps/unix/sysv/linux/loongarch/Implies -new file mode 100644 -index 00000000..e52b1ac3 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/Implies -@@ -0,0 +1 @@ -+loongarch/nptl -diff --git a/sysdeps/unix/sysv/linux/loongarch/Makefile b/sysdeps/unix/sysv/linux/loongarch/Makefile -new file mode 100644 -index 00000000..6f049aa9 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/Makefile -@@ -0,0 +1,17 @@ -+ifeq ($(subdir),elf) -+sysdep_routines += dl-vdso -+ifeq ($(build-shared),yes) -+# This is needed for DSO loading from static binaries. 
-+sysdep-dl-routines += dl-static -+endif -+endif -+ -+#ifeq ($(subdir),misc) -+#sysdep_headers += sys/cachectl.h -+#sysdep_routines += flush-icache -+#endif -+ -+ifeq ($(subdir),stdlib) -+gen-as-const-headers += ucontext_i.sym -+endif -+ -diff --git a/sysdeps/unix/sysv/linux/loongarch/Versions b/sysdeps/unix/sysv/linux/loongarch/Versions -new file mode 100644 -index 00000000..453f276a ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/Versions -@@ -0,0 +1,44 @@ -+ld { -+ GLIBC_PRIVATE { -+ # used for loading by static libraries -+ _dl_var_init; -+ } -+} -+libc { -+ # The comment lines with "#errlist-compat" are magic; see errlist-compat.awk. -+ # When you get an error from errlist-compat.awk, you need to add a new -+ # version here. Don't do this blindly, since this means changing the ABI -+ # for all GNU/Linux configurations. -+ -+ GLIBC_2.0 { -+ #errlist-compat 123 -+ _sys_errlist; sys_errlist; _sys_nerr; sys_nerr; -+ -+ # Exception handling support functions from libgcc -+ __register_frame; __register_frame_table; __deregister_frame; -+ __frame_state_for; __register_frame_info_table; -+ -+ # Needed by gcc: -+ _flush_cache; -+ -+ # c* -+ cachectl; cacheflush; -+ -+ # s* -+ sysmips; -+ } -+ GLIBC_2.2 { -+ #errlist-compat 1134 -+ _sys_errlist; sys_errlist; _sys_nerr; sys_nerr; -+ -+ # _* -+ _test_and_set; -+ } -+ GLIBC_2.11 { -+ fallocate64; -+ } -+ GLIBC_PRIVATE { -+ # nptl/pthread_cond_timedwait.c uses INTERNAL_VSYSCALL(clock_gettime). -+ __vdso_clock_gettime; -+ } -+} -diff --git a/sysdeps/unix/sysv/linux/loongarch/atomic-machine.h b/sysdeps/unix/sysv/linux/loongarch/atomic-machine.h -new file mode 100644 -index 00000000..ac1948ea ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/atomic-machine.h -@@ -0,0 +1,188 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LINUX_LOONGARCH_BITS_ATOMIC_H -+#define _LINUX_LOONGARCH_BITS_ATOMIC_H 1 -+ -+#include -+ -+typedef int32_t atomic32_t; -+typedef uint32_t uatomic32_t; -+ -+typedef int64_t atomic64_t; -+typedef uint64_t uatomic64_t; -+ -+typedef intptr_t atomicptr_t; -+typedef uintptr_t uatomicptr_t; -+typedef intmax_t atomic_max_t; -+typedef uintmax_t uatomic_max_t; -+ -+#define atomic_full_barrier() __sync_synchronize () -+ -+# define __HAVE_64B_ATOMICS (__loongarch_xlen >= 64) -+# define USE_ATOMIC_COMPILER_BUILTINS 1 -+# define ATOMIC_EXCHANGE_USES_CAS 0 -+ -+/* Compare and exchange. -+ For all "bool" routines, we return FALSE if exchange succesful. 
*/ -+ -+# define __arch_compare_and_exchange_bool_8_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ !__atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ }) -+ -+# define __arch_compare_and_exchange_bool_16_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ !__atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ }) -+ -+# define __arch_compare_and_exchange_bool_32_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ !__atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ }) -+ -+# define __arch_compare_and_exchange_bool_64_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ !__atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ }) -+ -+# define __arch_compare_and_exchange_val_8_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ __atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ __oldval; \ -+ }) -+ -+# define __arch_compare_and_exchange_val_16_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ __atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ __oldval; \ -+ }) -+ -+# define __arch_compare_and_exchange_val_32_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ __atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ __oldval; \ -+ }) -+ -+# define __arch_compare_and_exchange_val_64_int(mem, newval, oldval, model) \ -+ ({ \ -+ typeof (*mem) __oldval = (oldval); \ -+ __atomic_compare_exchange_n (mem, (void *) &__oldval, newval, 0, \ -+ model, __ATOMIC_RELAXED); \ -+ __oldval; \ -+ }) -+ -+/* Atomic compare and exchange. */ -+ -+# define atomic_compare_and_exchange_bool_acq(mem, new, old) \ -+ __atomic_bool_bysize (__arch_compare_and_exchange_bool, int, \ -+ mem, new, old, __ATOMIC_ACQUIRE) -+ -+# define atomic_compare_and_exchange_val_acq(mem, new, old) \ -+ __atomic_val_bysize (__arch_compare_and_exchange_val, int, \ -+ mem, new, old, __ATOMIC_ACQUIRE) -+ -+# define atomic_compare_and_exchange_val_rel(mem, new, old) \ -+ __atomic_val_bysize (__arch_compare_and_exchange_val, int, \ -+ mem, new, old, __ATOMIC_RELEASE) -+ -+/* Atomic exchange (without compare). */ -+ -+# define __arch_exchange_8_int(mem, newval, model) \ -+ __atomic_exchange_n (mem, newval, model) -+ -+# define __arch_exchange_16_int(mem, newval, model) \ -+ __atomic_exchange_n (mem, newval, model) -+ -+# define __arch_exchange_32_int(mem, newval, model) \ -+ __atomic_exchange_n (mem, newval, model) -+ -+# define __arch_exchange_64_int(mem, newval, model) \ -+ __atomic_exchange_n (mem, newval, model) -+ -+# define atomic_exchange_acq(mem, value) \ -+ __atomic_val_bysize (__arch_exchange, int, mem, value, __ATOMIC_ACQUIRE) -+ -+# define atomic_exchange_rel(mem, value) \ -+ __atomic_val_bysize (__arch_exchange, int, mem, value, __ATOMIC_RELEASE) -+ -+/* Atomically add value and return the previous (unincremented) value. 
*/ -+ -+# define __arch_exchange_and_add_8_int(mem, value, model) \ -+ __atomic_fetch_add (mem, value, model) -+ -+# define __arch_exchange_and_add_16_int(mem, value, model) \ -+ __atomic_fetch_add (mem, value, model) -+ -+# define __arch_exchange_and_add_32_int(mem, value, model) \ -+ __atomic_fetch_add (mem, value, model) -+ -+# define __arch_exchange_and_add_64_int(mem, value, model) \ -+ __atomic_fetch_add (mem, value, model) -+ -+# define atomic_exchange_and_add_acq(mem, value) \ -+ __atomic_val_bysize (__arch_exchange_and_add, int, mem, value, \ -+ __ATOMIC_ACQUIRE) -+ -+# define atomic_exchange_and_add_rel(mem, value) \ -+ __atomic_val_bysize (__arch_exchange_and_add, int, mem, value, \ -+ __ATOMIC_RELEASE) -+ -+/* Miscellaneous. */ -+ -+# define asm_amo(which, mem, value) ({ \ -+ __atomic_check_size (mem); \ -+ typeof (*mem) __tmp; \ -+ if (sizeof (__tmp) == 4) \ -+ asm volatile (which ".w""\t%0, %z2, %1" \ -+ : "=&r" (__tmp), "+ZB" (* (mem)) \ -+ : "rJ" (value)); \ -+ else if (sizeof (__tmp) == 8) \ -+ asm volatile (which ".d""\t%0, %z2, %1" \ -+ : "=&r" (__tmp), "+ZB" (* (mem)) \ -+ : "rJ" (value)); \ -+ else \ -+ abort (); \ -+ __tmp; }) -+ -+# define atomic_max(mem, value) asm_amo ("ammax_db", mem, value) -+# define atomic_min(mem, value) asm_amo ("ammin_db", mem, value) -+ -+# define atomic_bit_test_set(mem, bit) \ -+ ({ typeof (*mem) __mask = (typeof (*mem))1 << (bit); \ -+ asm_amo("amor_db", mem, __mask) & __mask; }) -+ -+# define catomic_exchange_and_add(mem, value) \ -+ atomic_exchange_and_add (mem, value) -+# define catomic_max(mem, value) atomic_max (mem, value) -+ -+#endif /* bits/atomic.h */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/fcntl.h b/sysdeps/unix/sysv/linux/loongarch/bits/fcntl.h -new file mode 100644 -index 00000000..5ee2e976 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/bits/fcntl.h -@@ -0,0 +1,62 @@ -+/* O_*, F_*, FD_* bit values for the generic Linux ABI. -+ Copyright (C) 2011-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Chris Metcalf , 2011. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _FCNTL_H -+# error "Never use directly; include instead." -+#endif -+ -+#include -+ -+/* In 64-bit ISA files are always with 64bit off_t and F_*LK64 are the same as -+ non-64-bit versions. It will need to be revised for 128-bit. */ -+#if __WORDSIZE == 64 -+# define __O_LARGEFILE 0 -+ -+# define F_GETLK64 5 /* Get record locking info. */ -+# define F_SETLK64 6 /* Set record locking info (non-blocking). */ -+# define F_SETLKW64 7 /* Set record locking info (blocking). */ -+#endif -+ -+struct flock -+ { -+ short int l_type; /* Type of lock: F_RDLCK, F_WRLCK, or F_UNLCK. */ -+ short int l_whence; /* Where `l_start' is relative to (like `lseek'). */ -+#ifndef __USE_FILE_OFFSET64 -+ __off_t l_start; /* Offset where the lock begins. 
*/ -+ __off_t l_len; /* Size of the locked area; zero means until EOF. */ -+#else -+ __off64_t l_start; /* Offset where the lock begins. */ -+ __off64_t l_len; /* Size of the locked area; zero means until EOF. */ -+#endif -+ __pid_t l_pid; /* Process holding the lock. */ -+ }; -+ -+#ifdef __USE_LARGEFILE64 -+struct flock64 -+ { -+ short int l_type; /* Type of lock: F_RDLCK, F_WRLCK, or F_UNLCK. */ -+ short int l_whence; /* Where `l_start' is relative to (like `lseek'). */ -+ __off64_t l_start; /* Offset where the lock begins. */ -+ __off64_t l_len; /* Size of the locked area; zero means until EOF. */ -+ __pid_t l_pid; /* Process holding the lock. */ -+ }; -+#endif -+ -+/* Include generic Linux declarations. */ -+#include -diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h b/sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h -new file mode 100644 -index 00000000..5104b69c ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h -@@ -0,0 +1,37 @@ -+/* Defines for bits in AT_HWCAP. LoongArch64 Linux version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#if !defined (_SYS_AUXV_H) -+# error "Never include directly; use instead." -+#endif -+ -+/* The following must match the kernel's . */ -+/* HWCAP flags */ -+#define HWCAP_LOONGARCH_CPUCFG (1 << 0) -+#define HWCAP_LOONGARCH_LAM (1 << 1) -+#define HWCAP_LOONGARCH_UAL (1 << 2) -+#define HWCAP_LOONGARCH_FPU (1 << 3) -+#define HWCAP_LOONGARCH_LSX (1 << 4) -+#define HWCAP_LOONGARCH_LASX (1 << 5) -+#define HWCAP_LOONGARCH_CRC32 (1 << 6) -+#define HWCAP_LOONGARCH_COMPLEX (1 << 7) -+#define HWCAP_LOONGARCH_CRYPTO (1 << 8) -+#define HWCAP_LOONGARCH_LVZ (1 << 9) -+#define HWCAP_LOONGARCH_LBT_X86 (1 << 10) -+#define HWCAP_LOONGARCH_LBT_ARM (1 << 11) -+#define HWCAP_LOONGARCH_LBT_MIPS (1 << 12) -diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/local_lim.h b/sysdeps/unix/sysv/linux/loongarch/bits/local_lim.h -new file mode 100644 -index 00000000..a8cd6df8 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/bits/local_lim.h -@@ -0,0 +1,99 @@ -+/* Minimum guaranteed maximum values for system limits. Linux version. -+ Copyright (C) 1993-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public License as -+ published by the Free Software Foundation; either version 2.1 of the -+ License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; see the file COPYING.LIB. If -+ not, see . */ -+ -+/* The kernel header pollutes the namespace with the NR_OPEN symbol -+ and defines LINK_MAX although filesystems have different maxima. A -+ similar thing is true for OPEN_MAX: the limit can be changed at -+ runtime and therefore the macro must not be defined. Remove this -+ after including the header if necessary. */ -+#ifndef NR_OPEN -+# define __undef_NR_OPEN -+#endif -+#ifndef LINK_MAX -+# define __undef_LINK_MAX -+#endif -+#ifndef OPEN_MAX -+# define __undef_OPEN_MAX -+#endif -+#ifndef ARG_MAX -+# define __undef_ARG_MAX -+#endif -+ -+/* The kernel sources contain a file with all the needed information. */ -+#include -+ -+/* Have to remove NR_OPEN? */ -+#ifdef __undef_NR_OPEN -+# undef NR_OPEN -+# undef __undef_NR_OPEN -+#endif -+/* Have to remove LINK_MAX? */ -+#ifdef __undef_LINK_MAX -+# undef LINK_MAX -+# undef __undef_LINK_MAX -+#endif -+/* Have to remove OPEN_MAX? */ -+#ifdef __undef_OPEN_MAX -+# undef OPEN_MAX -+# undef __undef_OPEN_MAX -+#endif -+/* Have to remove ARG_MAX? */ -+#ifdef __undef_ARG_MAX -+# undef ARG_MAX -+# undef __undef_ARG_MAX -+#endif -+ -+/* The number of data keys per process. */ -+#define _POSIX_THREAD_KEYS_MAX 128 -+/* This is the value this implementation supports. */ -+#define PTHREAD_KEYS_MAX 1024 -+ -+/* Controlling the iterations of destructors for thread-specific data. */ -+#define _POSIX_THREAD_DESTRUCTOR_ITERATIONS 4 -+/* Number of iterations this implementation does. */ -+#define PTHREAD_DESTRUCTOR_ITERATIONS _POSIX_THREAD_DESTRUCTOR_ITERATIONS -+ -+/* The number of threads per process. */ -+#define _POSIX_THREAD_THREADS_MAX 64 -+/* We have no predefined limit on the number of threads. */ -+#undef PTHREAD_THREADS_MAX -+ -+/* Maximum amount by which a process can descrease its asynchronous I/O -+ priority level. */ -+#define AIO_PRIO_DELTA_MAX 20 -+ -+/* Minimum size for a thread. At least two pages with 64k pages. */ -+#define PTHREAD_STACK_MIN 131072 -+ -+/* Maximum number of timer expiration overruns. */ -+#define DELAYTIMER_MAX 2147483647 -+ -+/* Maximum tty name length. */ -+#define TTY_NAME_MAX 32 -+ -+/* Maximum login name length. This is arbitrary. */ -+#define LOGIN_NAME_MAX 256 -+ -+/* Maximum host name length. */ -+#define HOST_NAME_MAX 64 -+ -+/* Maximum message queue priority level. */ -+#define MQ_PRIO_MAX 32768 -+ -+/* Maximum value the semaphore can have. */ -+#define SEM_VALUE_MAX (2147483647) -diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/mman.h b/sysdeps/unix/sysv/linux/loongarch/bits/mman.h -new file mode 100644 -index 00000000..5a16f8ac ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/bits/mman.h -@@ -0,0 +1,41 @@ -+/* Definitions for POSIX memory map interface. Linux/MIPS version. -+ Copyright (C) 1997-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+ -+#ifndef _SYS_MMAN_H -+# error "Never use directly; include instead." -+#endif -+ -+#ifdef __USE_MISC -+# define MAP_GROWSDOWN 0x00100 /* Stack-like segment. */ -+# define MAP_DENYWRITE 0x00800 /* ETXTBSY. */ -+# define MAP_EXECUTABLE 0x01000 /* Mark it as an executable. */ -+# define MAP_LOCKED 0x02000 /* Lock the mapping. */ -+# define MAP_NORESERVE 0x04000 /* Don't check for reservations. */ -+# define MAP_POPULATE 0x08000 /* Populate (prefault) pagetables. */ -+# define MAP_NONBLOCK 0x10000 /* Do not block on IO. */ -+# define MAP_STACK 0x20000 /* Allocation is for a stack. */ -+# define MAP_HUGETLB 0x40000 /* Create huge page mapping. */ -+# define MAP_SYNC 0x80000 /* Perform synchronous page -+ faults for the mapping. */ -+# define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED but do not unmap -+ underlying mapping. */ -+#endif -+ -+/* Include generic Linux declarations. */ -+#include -diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/shm.h b/sysdeps/unix/sysv/linux/loongarch/bits/shm.h -new file mode 100644 -index 00000000..9e23092d ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/bits/shm.h -@@ -0,0 +1,112 @@ -+/* Copyright (C) 2011-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ Contributed by Chris Metcalf , 2011. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _SYS_SHM_H -+# error "Never include directly; use instead." -+#endif -+ -+#include -+#include -+ -+/* Permission flag for shmget. */ -+#define SHM_R 0400 /* or S_IRUGO from */ -+#define SHM_W 0200 /* or S_IWUGO from */ -+ -+/* Flags for `shmat'. */ -+#define SHM_RDONLY 010000 /* attach read-only else read-write */ -+#define SHM_RND 020000 /* round attach address to SHMLBA */ -+#define SHM_REMAP 040000 /* take-over region on attach */ -+#define SHM_EXEC 0100000 /* execution access */ -+ -+/* Commands for `shmctl'. */ -+#define SHM_LOCK 11 /* lock segment (root only) */ -+#define SHM_UNLOCK 12 /* unlock segment (root only) */ -+ -+__BEGIN_DECLS -+ -+/* Segment low boundary address multiple. */ -+#define SHMLBA (__getpagesize () << 2) -+extern int __getpagesize (void) __THROW __attribute__ ((__const__)); -+ -+ -+/* Type to count number of attaches. */ -+typedef unsigned long int shmatt_t; -+ -+/* Data structure describing a shared memory segment. 
*/ -+struct shmid_ds -+ { -+ struct ipc_perm shm_perm; /* operation permission struct */ -+ size_t shm_segsz; /* size of segment in bytes */ -+ __time_t shm_atime; /* time of last shmat() */ -+#if __WORDSIZE == 32 -+ unsigned long int __glibc_reserved1; -+#endif -+ __time_t shm_dtime; /* time of last shmdt() */ -+#if __WORDSIZE == 32 -+ unsigned long int __glibc_reserved2; -+#endif -+ __time_t shm_ctime; /* time of last change by shmctl() */ -+#if __WORDSIZE == 32 -+ unsigned long int __glibc_reserved3; -+#endif -+ __pid_t shm_cpid; /* pid of creator */ -+ __pid_t shm_lpid; /* pid of last shmop */ -+ shmatt_t shm_nattch; /* number of current attaches */ -+ unsigned long int __glibc_reserved4; -+ unsigned long int __glibc_reserved5; -+ }; -+ -+#ifdef __USE_MISC -+ -+/* ipcs ctl commands */ -+# define SHM_STAT 13 -+# define SHM_INFO 14 -+# define SHM_STAT_ANY 15 -+ -+/* shm_mode upper byte flags */ -+# define SHM_DEST 01000 /* segment will be destroyed on last detach */ -+# define SHM_LOCKED 02000 /* segment will not be swapped */ -+# define SHM_HUGETLB 04000 /* segment is mapped via hugetlb */ -+# define SHM_NORESERVE 010000 /* don't check for reservations */ -+ -+struct shminfo -+ { -+ unsigned long int shmmax; -+ unsigned long int shmmin; -+ unsigned long int shmmni; -+ unsigned long int shmseg; -+ unsigned long int shmall; -+ unsigned long int __glibc_reserved1; -+ unsigned long int __glibc_reserved2; -+ unsigned long int __glibc_reserved3; -+ unsigned long int __glibc_reserved4; -+ }; -+ -+struct shm_info -+ { -+ int used_ids; -+ unsigned long int shm_tot; /* total allocated shm */ -+ unsigned long int shm_rss; /* total resident shm */ -+ unsigned long int shm_swp; /* total swapped shm */ -+ unsigned long int swap_attempts; -+ unsigned long int swap_successes; -+ }; -+ -+#endif /* __USE_MISC */ -+ -+__END_DECLS -diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/sigcontext.h b/sysdeps/unix/sysv/linux/loongarch/bits/sigcontext.h -new file mode 100644 -index 00000000..0f925b4c ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/bits/sigcontext.h -@@ -0,0 +1,47 @@ -+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -+/* -+ * This file is subject to the terms and conditions of the GNU General Public -+ * License. See the file "COPYING" in the main directory of this archive -+ * for more details. -+ * -+ * Copyright (C) 1996, 1997, 1999 by Ralf Baechle -+ * Copyright (C) 1999 Silicon Graphics, Inc. -+ */ -+#ifndef _BITS_SIGCONTEXT_H -+#define _BITS_SIGCONTEXT_H -+ -+/* -+ * Keep this struct definition in sync with the sigcontext fragment -+ * in arch/mips/kernel/asm-offsets.c -+ * -+ * Warning: this structure illdefined with sc_badvaddr being just an unsigned -+ * int so it was changed to unsigned long in 2.6.0-test1. This may break -+ * binary compatibility - no prisoners. -+ * DSP ASE in 2.6.12-rc4. Turn sc_mdhi and sc_mdlo into an array of four -+ * entries, add sc_dsp and sc_reserved for padding. No prisoners. 
-+ */ -+ -+#define FPU_REG_WIDTH 256 -+#define FPU_ALIGN __attribute__((aligned(32))) -+ -+struct sigcontext { -+ unsigned long long sc_pc; -+ unsigned long long sc_regs[32]; -+ unsigned int sc_flags; -+ -+ unsigned int sc_fcsr; -+ unsigned int sc_vcsr; -+ unsigned long long sc_fcc; -+ -+ unsigned long long sc_scr[4]; -+ -+ union { -+ unsigned int val32[FPU_REG_WIDTH / 32]; -+ unsigned long long val64[FPU_REG_WIDTH / 64]; -+ } sc_fpregs[32] FPU_ALIGN; -+ unsigned char sc_reserved[4096] __attribute__((__aligned__(16))); -+ -+}; -+ -+ -+#endif /* _BITS_SIGCONTEXT_H */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/signum.h b/sysdeps/unix/sysv/linux/loongarch/bits/signum.h -new file mode 100644 -index 00000000..3cad0b19 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/bits/signum.h -@@ -0,0 +1,58 @@ -+/* Signal number definitions. Linux version. -+ Copyright (C) 1995-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef _BITS_SIGNUM_H -+#define _BITS_SIGNUM_H 1 -+ -+#ifndef _SIGNAL_H -+#error "Never include directly; use instead." -+#endif -+ -+#include -+ -+/* Adjustments and additions to the signal number constants for -+ most Linux systems. */ -+ -+#define SIGSTKFLT 16 /* Stack fault (obsolete). */ -+#define SIGPWR 30 /* Power failure imminent. */ -+ -+#undef SIGBUS -+#define SIGBUS 7 -+#undef SIGUSR1 -+#define SIGUSR1 10 -+#undef SIGUSR2 -+#define SIGUSR2 12 -+#undef SIGCHLD -+#define SIGCHLD 17 -+#undef SIGCONT -+#define SIGCONT 18 -+#undef SIGSTOP -+#define SIGSTOP 19 -+#undef SIGTSTP -+#define SIGTSTP 20 -+#undef SIGURG -+#define SIGURG 23 -+#undef SIGPOLL -+#define SIGPOLL 29 -+#undef SIGSYS -+#define SIGSYS 31 -+ -+#undef __SIGRTMAX -+#define __SIGRTMAX 127 -+ -+#endif /* included. */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/clone.S b/sysdeps/unix/sysv/linux/loongarch/clone.S -new file mode 100644 -index 00000000..f0fc566e ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/clone.S -@@ -0,0 +1,98 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
*/ -+ -+/* clone() is even more special than fork() as it mucks with stacks -+ and invokes a function in the right context after its all over. */ -+ -+#include -+#include -+#define _ERRNO_H 1 -+#include -+#include -+#include "tcb-offsets.h" -+ -+/* int clone(int (*fn)(void *arg), void *child_stack, int flags, void *arg, -+ void *parent_tidptr, void *tls, void *child_tidptr) */ -+ -+ENTRY (__clone) -+ -+ /* Align stack to 16 or 8 bytes per the ABI. */ -+#if _LOONGARCH_SIM == _ABILP64 -+ addi.d t0, zero, -16 -+#elif _LOONGARCH_SIM == _ABILP32 -+ addi.w t0, zero, -8 -+#endif -+ and a1, a1, t0 -+ -+ /* Sanity check arguments. */ -+ beqz a0, L (invalid) /* No NULL function pointers. */ -+ beqz a1, L (invalid) /* No NULL stack pointers. */ -+ -+ addi.d a1, a1, -16 /* Reserve argument save space. */ -+ st.d a0, a1, 0 /* Save function pointer. */ -+ st.d a3, a1, SZREG /* Save argument pointer. */ -+ -+ /* The syscall expects the args to be in different slots. */ -+ or a0, a2, zero -+ or a2, a4, zero -+ or a3, a6, zero -+ or a4, a5, zero -+ -+ /* Do the system call. */ -+ li.d a7,__NR_clone -+ syscall 0 -+ -+ blt a0, zero ,L (error) -+ beqz a0,L (thread_start) -+ -+ /* Successful return from the parent. */ -+ ret -+ -+L (invalid): -+ li.d a0, -EINVAL -+ /* Something bad happened -- no child created. */ -+L (error): -+ b __syscall_error -+ END (__clone) -+ -+/* Load up the arguments to the function. Put this block of code in -+ its own function so that we can terminate the stack trace with our -+ debug info. */ -+ -+ENTRY (__thread_start) -+L (thread_start): -+ /* Terminate call stack by noting ra is undefined. Use a dummy -+ .cfi_label to force starting the FDE. */ -+ .cfi_label .Ldummy -+ cfi_undefined (1) -+ -+ /* Restore the arg for user's function. */ -+ ld.d a1, sp, 0 /* Function pointer. */ -+ ld.d a0, sp, SZREG /* Argument pointer. */ -+ -+ /* Call the user's function. */ -+ jirl ra, a1, 0 -+ -+ /* Call exit with the function's return value. */ -+ li.d a7, __NR_exit -+ syscall 0 -+ -+ END (__thread_start) -+ -+libc_hidden_def (__clone) -+weak_alias (__clone, clone) -diff --git a/sysdeps/unix/sysv/linux/loongarch/configure b/sysdeps/unix/sysv/linux/loongarch/configure -new file mode 100644 -index 00000000..a402323a ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/configure -@@ -0,0 +1,199 @@ -+# This file is generated from configure.ac by Autoconf. DO NOT EDIT! -+ # Local configure fragment for sysdeps/unix/sysv/linux/loongarch. -+ -+arch_minimum_kernel=4.15.0 -+ -+libc_cv_loongarch_int_abi=no -+ -+ -+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5 -+$as_echo_n "checking for grep that handles long lines and -e... " >&6; } -+if ${ac_cv_path_GREP+:} false; then : -+ $as_echo_n "(cached) " >&6 -+else -+ if test -z "$GREP"; then -+ ac_path_GREP_found=false -+ # Loop through the user's path and test for each of PROGNAME-LIST -+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin -+do -+ IFS=$as_save_IFS -+ test -z "$as_dir" && as_dir=. -+ for ac_prog in grep ggrep; do -+ for ac_exec_ext in '' $ac_executable_extensions; do -+ ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext" -+ as_fn_executable_p "$ac_path_GREP" || continue -+# Check for GNU ac_path_GREP and select it if it is found. 
-+ # Check for GNU $ac_path_GREP -+case `"$ac_path_GREP" --version 2>&1` in -+*GNU*) -+ ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;; -+*) -+ ac_count=0 -+ $as_echo_n 0123456789 >"conftest.in" -+ while : -+ do -+ cat "conftest.in" "conftest.in" >"conftest.tmp" -+ mv "conftest.tmp" "conftest.in" -+ cp "conftest.in" "conftest.nl" -+ $as_echo 'GREP' >> "conftest.nl" -+ "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break -+ diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break -+ as_fn_arith $ac_count + 1 && ac_count=$as_val -+ if test $ac_count -gt ${ac_path_GREP_max-0}; then -+ # Best one so far, save it but keep looking for a better one -+ ac_cv_path_GREP="$ac_path_GREP" -+ ac_path_GREP_max=$ac_count -+ fi -+ # 10*(2^10) chars as input seems more than enough -+ test $ac_count -gt 10 && break -+ done -+ rm -f conftest.in conftest.tmp conftest.nl conftest.out;; -+esac -+ -+ $ac_path_GREP_found && break 3 -+ done -+ done -+ done -+IFS=$as_save_IFS -+ if test -z "$ac_cv_path_GREP"; then -+ as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 -+ fi -+else -+ ac_cv_path_GREP=$GREP -+fi -+ -+fi -+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5 -+$as_echo "$ac_cv_path_GREP" >&6; } -+ GREP="$ac_cv_path_GREP" -+ -+ -+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5 -+$as_echo_n "checking for egrep... " >&6; } -+if ${ac_cv_path_EGREP+:} false; then : -+ $as_echo_n "(cached) " >&6 -+else -+ if echo a | $GREP -E '(a|b)' >/dev/null 2>&1 -+ then ac_cv_path_EGREP="$GREP -E" -+ else -+ if test -z "$EGREP"; then -+ ac_path_EGREP_found=false -+ # Loop through the user's path and test for each of PROGNAME-LIST -+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin -+do -+ IFS=$as_save_IFS -+ test -z "$as_dir" && as_dir=. -+ for ac_prog in egrep; do -+ for ac_exec_ext in '' $ac_executable_extensions; do -+ ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext" -+ as_fn_executable_p "$ac_path_EGREP" || continue -+# Check for GNU ac_path_EGREP and select it if it is found. -+ # Check for GNU $ac_path_EGREP -+case `"$ac_path_EGREP" --version 2>&1` in -+*GNU*) -+ ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;; -+*) -+ ac_count=0 -+ $as_echo_n 0123456789 >"conftest.in" -+ while : -+ do -+ cat "conftest.in" "conftest.in" >"conftest.tmp" -+ mv "conftest.tmp" "conftest.in" -+ cp "conftest.in" "conftest.nl" -+ $as_echo 'EGREP' >> "conftest.nl" -+ "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break -+ diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break -+ as_fn_arith $ac_count + 1 && ac_count=$as_val -+ if test $ac_count -gt ${ac_path_EGREP_max-0}; then -+ # Best one so far, save it but keep looking for a better one -+ ac_cv_path_EGREP="$ac_path_EGREP" -+ ac_path_EGREP_max=$ac_count -+ fi -+ # 10*(2^10) chars as input seems more than enough -+ test $ac_count -gt 10 && break -+ done -+ rm -f conftest.in conftest.tmp conftest.nl conftest.out;; -+esac -+ -+ $ac_path_EGREP_found && break 3 -+ done -+ done -+ done -+IFS=$as_save_IFS -+ if test -z "$ac_cv_path_EGREP"; then -+ as_fn_error $? 
"no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 -+ fi -+else -+ ac_cv_path_EGREP=$EGREP -+fi -+ -+ fi -+fi -+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5 -+$as_echo "$ac_cv_path_EGREP" >&6; } -+ EGREP="$ac_cv_path_EGREP" -+ -+ -+cat confdefs.h - <<_ACEOF >conftest.$ac_ext -+/* end confdefs.h. */ -+__SIZEOF_INT__ __SIZEOF_LONG__ __SIZEOF_POINTER__ -+ -+_ACEOF -+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | -+ $EGREP "4 4 4" >/dev/null 2>&1; then : -+ libc_cv_loongarch_int_abi=lp32 -+fi -+rm -f conftest* -+ -+cat confdefs.h - <<_ACEOF >conftest.$ac_ext -+/* end confdefs.h. */ -+__SIZEOF_INT__ __SIZEOF_LONG__ __SIZEOF_POINTER__ -+ -+_ACEOF -+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | -+ $EGREP "4 8 8" >/dev/null 2>&1; then : -+ libc_cv_loongarch_int_abi=lp64 -+fi -+rm -f conftest* -+ -+if test $libc_cv_loongarch_int_abi = no; then -+ as_fn_error $? "Unable to determine integer ABI" "$LINENO" 5 -+fi -+ -+config_vars="$config_vars -+default-abi = $libc_cv_loongarch_int_abi" -+ -+case $libc_cv_loongarch_int_abi in -+lp32) -+ test -n "$libc_cv_slibdir" || -+case "$prefix" in -+/usr | /usr/) -+ libc_cv_slibdir='/lib32' -+ libc_cv_rtlddir='/lib32' -+ if test "$libdir" = '${exec_prefix}/lib'; then -+ libdir='${exec_prefix}/lib32'; -+ # Locale data can be shared between 32-bit and 64-bit libraries. -+ libc_cv_complocaledir='${exec_prefix}/lib/locale' -+ fi -+ ;; -+esac -+ ;; -+lp64) -+ test -n "$libc_cv_slibdir" || -+case "$prefix" in -+/usr | /usr/) -+ libc_cv_slibdir='/lib64' -+ libc_cv_rtlddir='/lib64' -+ if test "$libdir" = '${exec_prefix}/lib'; then -+ libdir='${exec_prefix}/lib64'; -+ # Locale data can be shared between 32-bit and 64-bit libraries. -+ libc_cv_complocaledir='${exec_prefix}/lib/locale' -+ fi -+ ;; -+esac -+ ;; -+esac -+ -+ldd_rewrite_script=sysdeps/unix/sysv/linux/loongarch/ldd-rewrite.sed -diff --git a/sysdeps/unix/sysv/linux/loongarch/configure.ac b/sysdeps/unix/sysv/linux/loongarch/configure.ac -new file mode 100644 -index 00000000..fef4f4d2 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/configure.ac -@@ -0,0 +1,27 @@ -+sinclude(./aclocal.m4)dnl Autoconf lossage -+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. -+# Local configure fragment for sysdeps/unix/sysv/linux/loongarch. -+ -+arch_minimum_kernel=4.15.0 -+ -+libc_cv_loongarch_int_abi=no -+AC_EGREP_CPP(4 4 4, [__SIZEOF_INT__ __SIZEOF_LONG__ __SIZEOF_POINTER__ -+ ], libc_cv_loongarch_int_abi=lp32) -+AC_EGREP_CPP(4 8 8, [__SIZEOF_INT__ __SIZEOF_LONG__ __SIZEOF_POINTER__ -+ ], libc_cv_loongarch_int_abi=lp64) -+if test $libc_cv_loongarch_int_abi = no; then -+ AC_MSG_ERROR([Unable to determine integer ABI]) -+fi -+ -+LIBC_CONFIG_VAR([default-abi], [$libc_cv_loongarch_int_abi]) -+ -+case $libc_cv_loongarch_int_abi in -+lp32) -+ LIBC_SLIBDIR_RTLDDIR([lib32], [lib32]) -+ ;; -+lp64) -+ LIBC_SLIBDIR_RTLDDIR([lib64], [lib]) -+ ;; -+esac -+ -+ldd_rewrite_script=sysdeps/unix/sysv/linux/loongarch/ldd-rewrite.sed -diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c -new file mode 100644 -index 00000000..80870f3c ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c -@@ -0,0 +1,32 @@ -+/* Initialize CPU feature data. LoongArch64 version. -+ This file is part of the GNU C Library. -+ Copyright (C) 2022 Free Software Foundation, Inc. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+static inline void -+init_cpu_features (struct cpu_features *cpu_features) -+{ -+ register uint64_t cpucfg_word = UINT64_MAX; -+ -+ __cpucfg(cpucfg_word, 0); -+ cpu_features->cpucfg_prid = cpucfg_word; -+ -+ __cpucfg(cpucfg_word, 2); -+ cpu_features->cpucfg_word_idx2 = cpucfg_word; -+} -diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -new file mode 100644 -index 00000000..b46a8489 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h -@@ -0,0 +1,53 @@ -+/* Initialize CPU feature data. LoongArch64 version. -+ This file is part of the GNU C Library. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef _CPU_FEATURES_LOONGARCH64_H -+#define _CPU_FEATURES_LOONGARCH64_H -+ -+#include -+#include -+ -+#define LA264 0x14a000 -+#define LA364 0x14b000 -+#define LA464 0x14c011 -+ -+struct cpu_features -+{ -+ uint64_t cpucfg_prid; -+ uint64_t cpucfg_word_idx2; -+}; -+ -+/* Get a pointer to the CPU features structure. */ -+extern const struct cpu_features *_dl_larch_get_cpu_features (void) -+ __attribute__ ((pure)); -+ -+#define __cpucfg(ret, index) \ -+ asm volatile ("or %1, %0, $zero\n" \ -+ "cpucfg %0, %0\n" \ -+ :"=r"(ret) \ -+ :"r"(index)); -+ -+#define IS_LA264(prid) (prid == LA264) -+#define IS_LA364(prid) (prid == LA364) -+#define IS_LA464(prid) (prid == LA464) -+#define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL) -+#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) -+#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) -+ -+#endif /* _CPU_FEATURES_LOONGARCH64_H */ -+ -diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c b/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c -new file mode 100644 -index 00000000..31e92898 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c -@@ -0,0 +1,60 @@ -+/* Data for LoongArch64 version of processor capability information. -+ Linux version. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* If anything should be added here check whether the size of each string -+ is still ok with the given array size. -+ -+ All the #ifdefs in the definitions are quite irritating but -+ necessary if we want to avoid duplicating the information. There -+ are three different modes: -+ -+ - PROCINFO_DECL is defined. This means we are only interested in -+ declarations. -+ -+ - PROCINFO_DECL is not defined: -+ -+ + if SHARED is defined the file is included in an array -+ initializer. The .element = { ... } syntax is needed. -+ -+ + if SHARED is not defined a normal array initialization is -+ needed. -+ */ -+ -+#ifndef PROCINFO_CLASS -+# define PROCINFO_CLASS -+#endif -+ -+#if !IS_IN (ldconfig) -+# if !defined PROCINFO_DECL && defined SHARED -+ ._dl_larch_cpu_features -+# else -+PROCINFO_CLASS struct cpu_features _dl_larch_cpu_features -+# endif -+# ifndef PROCINFO_DECL -+= { } -+# endif -+# if !defined SHARED || defined PROCINFO_DECL -+; -+# else -+, -+# endif -+#endif -+ -+#undef PROCINFO_DECL -+#undef PROCINFO_CLASS -diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-static.c b/sysdeps/unix/sysv/linux/loongarch/dl-static.c -new file mode 100644 -index 00000000..12b030f0 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/dl-static.c -@@ -0,0 +1,84 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+ -+#ifdef SHARED -+ -+void -+_dl_var_init (void *array[]) -+{ -+ /* It has to match "variables" below. 
*/ -+ enum -+ { -+ DL_PAGESIZE = 0 -+ }; -+ -+ GLRO(dl_pagesize) = *((size_t *) array[DL_PAGESIZE]); -+} -+ -+#else -+ -+static void *variables[] = -+{ -+ &GLRO(dl_pagesize) -+}; -+ -+static void -+_dl_unprotect_relro (struct link_map *l) -+{ -+ ElfW(Addr) start = ((l->l_addr + l->l_relro_addr) -+ & ~(GLRO(dl_pagesize) - 1)); -+ ElfW(Addr) end = ((l->l_addr + l->l_relro_addr + l->l_relro_size) -+ & ~(GLRO(dl_pagesize) - 1)); -+ -+ if (start != end) -+ __mprotect ((void *) start, end - start, PROT_READ | PROT_WRITE); -+} -+ -+void -+_dl_static_init (struct link_map *l) -+{ -+ struct link_map *rtld_map = l; -+ struct r_scope_elem **scope; -+ const ElfW(Sym) *ref = NULL; -+ lookup_t loadbase; -+ void (*f) (void *[]); -+ size_t i; -+ -+ loadbase = _dl_lookup_symbol_x ("_dl_var_init", l, &ref, l->l_local_scope, -+ NULL, 0, 1, NULL); -+ -+ for (scope = l->l_local_scope; *scope != NULL; scope++) -+ for (i = 0; i < (*scope)->r_nlist; i++) -+ if ((*scope)->r_list[i] == loadbase) -+ { -+ rtld_map = (*scope)->r_list[i]; -+ break; -+ } -+ -+ if (ref != NULL) -+ { -+ f = (void (*) (void *[])) DL_SYMBOL_ADDRESS (loadbase, ref); -+ _dl_unprotect_relro (rtld_map); -+ f (variables); -+ _dl_protect_relro (rtld_map); -+ } -+} -+ -+#endif -diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c b/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c -new file mode 100644 -index 00000000..1fe72410 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c -@@ -0,0 +1,21 @@ -+/* Operating system support for run-time dynamic linker. LoongArch version. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+#include -diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-tunables.list b/sysdeps/unix/sysv/linux/loongarch/dl-tunables.list -new file mode 100644 -index 00000000..c8f9793e ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/dl-tunables.list -@@ -0,0 +1,29 @@ -+# Order of tunables in RHEL 8.7.0. 
-+@order glibc.rtld.nns -+@order glibc.elision.skip_lock_after_retries -+@order glibc.malloc.trim_threshold -+@order glibc.malloc.perturb -+@order glibc.elision.tries -+@order glibc.elision.enable -+@order glibc.malloc.mxfast -+@order glibc.elision.skip_lock_busy -+@order glibc.malloc.top_pad -+@order glibc.cpu.hwcaps -+@order glibc.cpu.hwcap_mask -+@order glibc.malloc.mmap_max -+@order glibc.elision.skip_trylock_internal_abort -+@order glibc.malloc.tcache_unsorted_limit -+@order glibc.elision.skip_lock_internal_abort -+@order glibc.malloc.arena_max -+@order glibc.malloc.mmap_threshold -+@order glibc.malloc.tcache_count -+@order glibc.malloc.arena_test -+@order glibc.rtld.optional_static_tls -+@order glibc.malloc.tcache_max -+@order glibc.malloc.check -+ -+# Tunables added in RHEL 8.8.0 -+@order glibc.rtld.dynamic_sort -+ -+@order glibc.gmon.minarcs -+@order glibc.gmon.maxarcs -diff --git a/sysdeps/unix/sysv/linux/loongarch/getcontext.S b/sysdeps/unix/sysv/linux/loongarch/getcontext.S -new file mode 100644 -index 00000000..9c28d958 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/getcontext.S -@@ -0,0 +1,72 @@ -+/* Save current context. -+ Copyright (C) 2009-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include "ucontext-macros.h" -+ -+/* int getcontext (ucontext_t *ucp) */ -+ -+ .text -+LEAF (__getcontext) -+ SAVE_INT_REG (ra, 1, a0) -+ SAVE_INT_REG (sp, 3, a0) -+ SAVE_INT_REG (zero, 4, a0) /* return 0 by overwriting a0. */ -+ SAVE_INT_REG (x, 21, a0) -+ SAVE_INT_REG (fp, 22, a0) -+ SAVE_INT_REG (s0, 23, a0) -+ SAVE_INT_REG (s1, 24, a0) -+ SAVE_INT_REG (s2, 25, a0) -+ SAVE_INT_REG (s3, 26, a0) -+ SAVE_INT_REG (s4, 27, a0) -+ SAVE_INT_REG (s5, 28, a0) -+ SAVE_INT_REG (s6, 29, a0) -+ SAVE_INT_REG (s7, 30, a0) -+ SAVE_INT_REG (s8, 31, a0) -+ st.d ra, a0, MCONTEXT_PC -+ -+#ifndef __loongarch_soft_float -+ movfcsr2gr a1, $r0 -+ -+ SAVE_FP_REG (fs0, 24, a0) -+ SAVE_FP_REG (fs1, 25, a0) -+ SAVE_FP_REG (fs2, 26, a0) -+ SAVE_FP_REG (fs3, 27, a0) -+ SAVE_FP_REG (fs4, 28, a0) -+ SAVE_FP_REG (fs5, 29, a0) -+ SAVE_FP_REG (fs6, 30, a0) -+ SAVE_FP_REG (fs7, 31, a0) -+ -+ st.w a1, a0, MCONTEXT_FCSR -+#endif /* __loongarch_soft_float */ -+ -+/* rt_sigprocmask (SIG_BLOCK, NULL, &ucp->uc_sigmask, _NSIG8) */ -+ li.d a3, _NSIG8 -+ addi.d a2, a0, UCONTEXT_SIGMASK -+ ori a1, zero,0 -+ li.d a0, SIG_BLOCK -+ -+ li.d a7, SYS_ify (rt_sigprocmask) -+ syscall 0 -+ blt a0, zero, 99f -+ -+ jirl $r0, $r1, 0 -+ -+99: b __syscall_error -+ -+PSEUDO_END (__getcontext) -+ -+weak_alias (__getcontext, getcontext) -diff --git a/sysdeps/unix/sysv/linux/loongarch/getpid.c b/sysdeps/unix/sysv/linux/loongarch/getpid.c -new file mode 100644 -index 00000000..5b4edb2b ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/getpid.c -@@ -0,0 +1,54 @@ -+/* getpid - get the pid. Linux/Loongarch version. 
-+ Copyright (C) 2015-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+ -+#ifdef SHARED -+# include -+# include -+ -+static pid_t -+__getpid_syscall (void) -+{ -+ return INLINE_SYSCALL (getpid, 0); -+} -+ -+# ifndef __getpid_type -+# define __getpid_type __getpid -+# endif -+ -+# undef INIT_ARCH -+# define INIT_ARCH() PREPARE_VERSION_KNOWN (linux26, LINUX_2_6) -+libc_ifunc_hidden (__getpid_type, __getpid, (_dl_vdso_vsym ("__vdso_getpid", &linux26) ?: &__getpid_syscall)) -+libc_hidden_def (__getpid) -+ -+#else -+ -+# include -+# include -+ -+pid_t -+__getpid (void) -+{ -+ return INLINE_SYSCALL (getpid, 0); -+} -+libc_hidden_def (__getpid); -+ -+#endif -+weak_alias (__getpid, getpid) -+libc_hidden_weak (getpid) -diff --git a/sysdeps/unix/sysv/linux/loongarch/gettimeofday.c b/sysdeps/unix/sysv/linux/loongarch/gettimeofday.c -new file mode 100644 -index 00000000..902b1a5d ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/gettimeofday.c -@@ -0,0 +1,58 @@ -+/* gettimeofday - get the time. Linux/LoongArch version. -+ Copyright (C) 2015-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+ -+#ifdef SHARED -+ -+# include -+# include -+ -+static int -+__gettimeofday_syscall (struct timeval *tv, struct timezone *tz) -+{ -+ return INLINE_SYSCALL (gettimeofday, 2, tv, tz); -+} -+ -+# ifndef __gettimeofday_type -+# define __gettimeofday_type __gettimeofday -+# endif -+ -+# undef INIT_ARCH -+# define INIT_ARCH() PREPARE_VERSION_KNOWN (linux26, LINUX_2_6) -+/* If the vDSO is not available we fall back to syscall. 
*/ -+libc_ifunc_hidden (__gettimeofday_type, __gettimeofday, -+ (_dl_vdso_vsym ("__vdso_gettimeofday", &linux26) -+ ?: &__gettimeofday_syscall)) -+libc_hidden_def (__gettimeofday) -+ -+#else -+ -+# include -+# include -+ -+int -+__gettimeofday (struct timeval *tv, struct timezone *tz) -+{ -+ return INLINE_SYSCALL (gettimeofday, 2, tv, tz); -+} -+libc_hidden_def (__gettimeofday) -+ -+#endif -+weak_alias (__gettimeofday, gettimeofday) -+libc_hidden_weak (gettimeofday) -diff --git a/sysdeps/unix/sysv/linux/loongarch/getuid.c b/sysdeps/unix/sysv/linux/loongarch/getuid.c -new file mode 100644 -index 00000000..4b3f95eb ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/getuid.c -@@ -0,0 +1,60 @@ -+/* getuid - get the uid. Linux/Loongarch version. -+ Copyright (C) 2015-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+ -+#ifdef SHARED -+ -+# include -+# include -+ -+libc_hidden_proto (getuid) -+ -+extern __uid_t __getuid (void); -+libc_hidden_proto (__getuid) -+ -+static uid_t -+__getuid_syscall(void) -+{ -+ return INLINE_SYSCALL (getuid, 0); -+} -+ -+# ifndef __getuid_type -+# define __getuid_type __getuid -+# endif -+ -+# undef INIT_ARCH -+# define INIT_ARCH() PREPARE_VERSION_KNOWN (linux26, LINUX_2_6) -+libc_ifunc_hidden (__getuid_type, __getuid, (_dl_vdso_vsym ("__vdso_getuid", &linux26) ?: &__getuid_syscall)) -+libc_hidden_def (__getuid) -+ -+#else -+ -+# include -+# include -+ -+uid_t -+__getuid(void) -+{ -+ return INLINE_SYSCALL (getuid, 0); -+} -+libc_hidden_def (__getuid) -+ -+#endif -+weak_alias (__getuid, getuid) -+libc_hidden_weak (getuid) -diff --git a/sysdeps/unix/sysv/linux/loongarch/init-first.c b/sysdeps/unix/sysv/linux/loongarch/init-first.c -new file mode 100644 -index 00000000..5185a831 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/init-first.c -@@ -0,0 +1,57 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public License as -+ published by the Free Software Foundation; either version 2.1 of the -+ License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+#ifdef SHARED -+# include -+# include -+ -+long int (*VDSO_SYMBOL (getcpu)) (unsigned int *, unsigned int *, void *) -+ attribute_hidden; -+long int (*VDSO_SYMBOL (gettimeofday)) (struct timeval *, void *) -+ attribute_hidden; -+long int (*VDSO_SYMBOL (clock_gettime)) (clockid_t, struct timespec *) -+ attribute_hidden; -+long int (*VDSO_SYMBOL (clock_getres)) (clockid_t, struct timespec *) -+ attribute_hidden; -+ -+static inline void -+_libc_vdso_platform_setup (void) -+{ -+ PREPARE_VERSION_KNOWN (linux_version, LINUX_2_6); -+ -+ void *p = _dl_vdso_vsym ("__vdso_getcpu", &linux_version); -+ PTR_MANGLE (p); -+ VDSO_SYMBOL (getcpu) = p; -+ -+ p = _dl_vdso_vsym ("__vdso_gettimeofday", &linux_version); -+ PTR_MANGLE (p); -+ VDSO_SYMBOL (gettimeofday) = p; -+ -+ p = _dl_vdso_vsym ("__vdso_clock_gettime", &linux_version); -+ PTR_MANGLE (p); -+ VDSO_SYMBOL (clock_gettime) = p; -+ -+ p = _dl_vdso_vsym ("__vdso_clock_getres", &linux_version); -+ PTR_MANGLE (p); -+ VDSO_SYMBOL (clock_getres) = p; -+} -+ -+# define VDSO_SETUP _libc_vdso_platform_setup -+#endif -+ -+#include -diff --git a/sysdeps/unix/sysv/linux/loongarch/ipc_priv.h b/sysdeps/unix/sysv/linux/loongarch/ipc_priv.h -new file mode 100644 -index 00000000..51583429 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/ipc_priv.h -@@ -0,0 +1,21 @@ -+/* Old SysV permission definition for Linux. LoongArch version. -+ Copyright (C) 2020 Loongson Technology, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include /* For __key_t */ -+ -+#define __IPC_64 0x0 -diff --git a/sysdeps/unix/sysv/linux/loongarch/kernel-features.h b/sysdeps/unix/sysv/linux/loongarch/kernel-features.h -new file mode 100644 -index 00000000..c87c7967 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/kernel-features.h -@@ -0,0 +1,24 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ * -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include_next -+ -+/* No support for PI mutexes or robust futexes before 4.20. 
*/ -+#if __LINUX_KERNEL_VERSION < 0x041400 -+# undef __ASSUME_SET_ROBUST_LIST -+#endif -diff --git a/sysdeps/unix/sysv/linux/loongarch/ldd-rewrite.sed b/sysdeps/unix/sysv/linux/loongarch/ldd-rewrite.sed -new file mode 100644 -index 00000000..131c5f14 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/ldd-rewrite.sed -@@ -0,0 +1 @@ -+s_^\(RTLDLIST=\)\(.*lib/\)\(ld-linux\)-\(loongarch64\)-\(lp64\)\(d*\)\(\.so\.[0-9.]*\)_\1"\2\3-\4-\5\7 \2\3-\4-\5d\7"_ -diff --git a/sysdeps/unix/sysv/linux/loongarch/ldsodefs.h b/sysdeps/unix/sysv/linux/loongarch/ldsodefs.h -new file mode 100644 -index 00000000..c0fc7046 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/ldsodefs.h -@@ -0,0 +1,32 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LDSODEFS_H -+ -+/* Get the real definitions. */ -+#include_next -+ -+/* Now define our stuff. */ -+ -+/* We need special support to initialize DSO loaded for statically linked -+ binaries. */ -+extern void _dl_static_init (struct link_map *map); -+#undef DL_STATIC_INIT -+#define DL_STATIC_INIT(map) _dl_static_init (map) -+ -+#endif /* ldsodefs.h */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/libc-start.c b/sysdeps/unix/sysv/linux/loongarch/libc-start.c -new file mode 100644 -index 00000000..047ad751 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/libc-start.c -@@ -0,0 +1,28 @@ -+/* Override csu/libc-start.c on LoongArch64. -+ Copyright (C) 2022 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef SHARED -+# include -+# include -+ -+extern struct cpu_features _dl_larch_cpu_features; -+ -+# define ARCH_INIT_CPU_FEATURES() init_cpu_features (&_dl_larch_cpu_features) -+ -+#endif -+#include -diff --git a/sysdeps/unix/sysv/linux/loongarch/libc-vdso.h b/sysdeps/unix/sysv/linux/loongarch/libc-vdso.h -new file mode 100644 -index 00000000..658c27a5 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/libc-vdso.h -@@ -0,0 +1,37 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LIBC_VDSO_H -+#define _LIBC_VDSO_H -+ -+#ifdef SHARED -+ -+# include -+ -+extern long int (*VDSO_SYMBOL (getcpu)) (unsigned int *, unsigned int *, void *) -+ attribute_hidden; -+extern long int (*VDSO_SYMBOL (gettimeofday)) (struct timeval *, void *) -+ attribute_hidden; -+extern long int (*VDSO_SYMBOL (clock_gettime)) (clockid_t, struct timespec *) -+ attribute_hidden; -+extern long int (*VDSO_SYMBOL (clock_getres)) (clockid_t, struct timespec *) -+ attribute_hidden; -+ -+#endif -+ -+#endif /* _LIBC_VDSO_H */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/localplt.data b/sysdeps/unix/sysv/linux/loongarch/localplt.data -new file mode 100644 -index 00000000..0ed8650b ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/localplt.data -@@ -0,0 +1,13 @@ -+# See scripts/check-localplt.awk for how this file is processed. -+# PLT use is required for the malloc family and for matherr because -+# users can define their own functions and have library internals call them. -+libc.so: calloc -+libc.so: free -+libc.so: malloc -+libc.so: memalign -+libc.so: realloc -+# The TLS-enabled version of these functions is interposed from libc.so. 
-+ld.so: _dl_signal_error -+ld.so: _dl_catch_error -+ld.so: _dl_signal_exception -+ld.so: _dl_catch_exception -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/Implies b/sysdeps/unix/sysv/linux/loongarch/lp64/Implies -new file mode 100644 -index 00000000..117c2b8e ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/Implies -@@ -0,0 +1,3 @@ -+unix/sysv/linux/loongarch -+unix/sysv/linux/generic -+unix/sysv/linux/wordsize-64 -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/c++-types.data b/sysdeps/unix/sysv/linux/loongarch/lp64/c++-types.data -new file mode 100644 -index 00000000..ac925ccb ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/c++-types.data -@@ -0,0 +1,67 @@ -+blkcnt64_t:l -+blkcnt_t:l -+blksize_t:i -+caddr_t:Pc -+clockid_t:i -+clock_t:l -+daddr_t:i -+dev_t:m -+fd_mask:l -+fsblkcnt64_t:m -+fsblkcnt_t:m -+fsfilcnt64_t:m -+fsfilcnt_t:m -+fsid_t:8__fsid_t -+gid_t:j -+id_t:j -+ino64_t:m -+ino_t:m -+int16_t:s -+int32_t:i -+int64_t:l -+int8_t:a -+intptr_t:l -+key_t:i -+loff_t:l -+mode_t:j -+nlink_t:j -+off64_t:l -+off_t:l -+pid_t:i -+pthread_attr_t:14pthread_attr_t -+pthread_barrier_t:17pthread_barrier_t -+pthread_barrierattr_t:21pthread_barrierattr_t -+pthread_cond_t:14pthread_cond_t -+pthread_condattr_t:18pthread_condattr_t -+pthread_key_t:j -+pthread_mutex_t:15pthread_mutex_t -+pthread_mutexattr_t:19pthread_mutexattr_t -+pthread_once_t:i -+pthread_rwlock_t:16pthread_rwlock_t -+pthread_rwlockattr_t:20pthread_rwlockattr_t -+pthread_spinlock_t:i -+pthread_t:m -+quad_t:l -+register_t:l -+rlim64_t:m -+rlim_t:m -+sigset_t:10__sigset_t -+size_t:m -+socklen_t:j -+ssize_t:l -+suseconds_t:l -+time_t:l -+u_char:h -+uid_t:j -+uint:j -+u_int:j -+u_int16_t:t -+u_int32_t:j -+u_int64_t:m -+u_int8_t:h -+ulong:m -+u_long:m -+u_quad_t:m -+useconds_t:j -+ushort:t -+u_short:t -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/jmp_buf-macros.h b/sysdeps/unix/sysv/linux/loongarch/lp64/jmp_buf-macros.h -new file mode 100644 -index 00000000..e1c96e67 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/jmp_buf-macros.h -@@ -0,0 +1,41 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . 
*/ -+ -+/* Produced by this program: -+ -+ #include -+ #include -+ #include -+ #include -+ -+ int main (int argc, char **argv) -+ { -+ printf ("#define JMP_BUF_SIZE %d\n", sizeof (jmp_buf)); -+ printf ("#define JMP_BUF_ALIGN %d\n", __alignof__ (jmp_buf)); -+ printf ("#define SIGJMP_BUF_SIZE %d\n", sizeof (sigjmp_buf)); -+ printf ("#define SIGJMP_BUF_ALIGN %d\n", __alignof__ (sigjmp_buf)); -+ printf ("#define MASK_WAS_SAVED_OFFSET %d\n", offsetof (struct __jmp_buf_tag, __mask_was_saved)); -+ printf ("#define SAVED_MASK_OFFSET %d\n", offsetof (struct __jmp_buf_tag, __saved_mask)); -+ } */ -+ -+# define JMP_BUF_SIZE 304 -+# define JMP_BUF_ALIGN 8 -+# define SIGJMP_BUF_SIZE 304 -+# define SIGJMP_BUF_ALIGN 8 -+# define MASK_WAS_SAVED_OFFSET 168 -+# define SAVED_MASK_OFFSET 176 -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/ld.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/ld.abilist -new file mode 100644 -index 00000000..845f356c ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/ld.abilist -@@ -0,0 +1,5 @@ -+GLIBC_2.27 __libc_stack_end D 0x8 -+GLIBC_2.27 __stack_chk_guard D 0x8 -+GLIBC_2.27 __tls_get_addr F -+GLIBC_2.27 _dl_mcount F -+GLIBC_2.27 _r_debug D 0x28 -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libBrokenLocale.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libBrokenLocale.abilist -new file mode 100644 -index 00000000..18968d3c ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libBrokenLocale.abilist -@@ -0,0 +1 @@ -+GLIBC_2.27 __ctype_get_mb_cur_max F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libanl.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libanl.abilist -new file mode 100644 -index 00000000..711fc87c ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libanl.abilist -@@ -0,0 +1,4 @@ -+GLIBC_2.27 gai_cancel F -+GLIBC_2.27 gai_error F -+GLIBC_2.27 gai_suspend F -+GLIBC_2.27 getaddrinfo_a F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libc.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libc.abilist -new file mode 100644 -index 00000000..4d8733f2 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libc.abilist -@@ -0,0 +1,2101 @@ -+GLIBC_2.27 _Exit F -+GLIBC_2.27 _IO_2_1_stderr_ D 0xe0 -+GLIBC_2.27 _IO_2_1_stdin_ D 0xe0 -+GLIBC_2.27 _IO_2_1_stdout_ D 0xe0 -+GLIBC_2.27 _IO_adjust_column F -+GLIBC_2.27 _IO_adjust_wcolumn F -+GLIBC_2.27 _IO_default_doallocate F -+GLIBC_2.27 _IO_default_finish F -+GLIBC_2.27 _IO_default_pbackfail F -+GLIBC_2.27 _IO_default_uflow F -+GLIBC_2.27 _IO_default_xsgetn F -+GLIBC_2.27 _IO_default_xsputn F -+GLIBC_2.27 _IO_do_write F -+GLIBC_2.27 _IO_doallocbuf F -+GLIBC_2.27 _IO_fclose F -+GLIBC_2.27 _IO_fdopen F -+GLIBC_2.27 _IO_feof F -+GLIBC_2.27 _IO_ferror F -+GLIBC_2.27 _IO_fflush F -+GLIBC_2.27 _IO_fgetpos F -+GLIBC_2.27 _IO_fgetpos64 F -+GLIBC_2.27 _IO_fgets F -+GLIBC_2.27 _IO_file_attach F -+GLIBC_2.27 _IO_file_close F -+GLIBC_2.27 _IO_file_close_it F -+GLIBC_2.27 _IO_file_doallocate F -+GLIBC_2.27 _IO_file_finish F -+GLIBC_2.27 _IO_file_fopen F -+GLIBC_2.27 _IO_file_init F -+GLIBC_2.27 _IO_file_jumps D 0xa8 -+GLIBC_2.27 _IO_file_open F -+GLIBC_2.27 _IO_file_overflow F -+GLIBC_2.27 _IO_file_read F -+GLIBC_2.27 _IO_file_seek F -+GLIBC_2.27 _IO_file_seekoff F -+GLIBC_2.27 _IO_file_setbuf F -+GLIBC_2.27 _IO_file_stat F -+GLIBC_2.27 _IO_file_sync F -+GLIBC_2.27 _IO_file_underflow F -+GLIBC_2.27 _IO_file_write F -+GLIBC_2.27 _IO_file_xsputn F -+GLIBC_2.27 _IO_flockfile F -+GLIBC_2.27 _IO_flush_all F -+GLIBC_2.27 _IO_flush_all_linebuffered F -+GLIBC_2.27 _IO_fopen F 
-+GLIBC_2.27 _IO_fprintf F -+GLIBC_2.27 _IO_fputs F -+GLIBC_2.27 _IO_fread F -+GLIBC_2.27 _IO_free_backup_area F -+GLIBC_2.27 _IO_free_wbackup_area F -+GLIBC_2.27 _IO_fsetpos F -+GLIBC_2.27 _IO_fsetpos64 F -+GLIBC_2.27 _IO_ftell F -+GLIBC_2.27 _IO_ftrylockfile F -+GLIBC_2.27 _IO_funlockfile F -+GLIBC_2.27 _IO_fwrite F -+GLIBC_2.27 _IO_getc F -+GLIBC_2.27 _IO_getline F -+GLIBC_2.27 _IO_getline_info F -+GLIBC_2.27 _IO_gets F -+GLIBC_2.27 _IO_init F -+GLIBC_2.27 _IO_init_marker F -+GLIBC_2.27 _IO_init_wmarker F -+GLIBC_2.27 _IO_iter_begin F -+GLIBC_2.27 _IO_iter_end F -+GLIBC_2.27 _IO_iter_file F -+GLIBC_2.27 _IO_iter_next F -+GLIBC_2.27 _IO_least_wmarker F -+GLIBC_2.27 _IO_link_in F -+GLIBC_2.27 _IO_list_all D 0x8 -+GLIBC_2.27 _IO_list_lock F -+GLIBC_2.27 _IO_list_resetlock F -+GLIBC_2.27 _IO_list_unlock F -+GLIBC_2.27 _IO_marker_delta F -+GLIBC_2.27 _IO_marker_difference F -+GLIBC_2.27 _IO_padn F -+GLIBC_2.27 _IO_peekc_locked F -+GLIBC_2.27 _IO_popen F -+GLIBC_2.27 _IO_printf F -+GLIBC_2.27 _IO_proc_close F -+GLIBC_2.27 _IO_proc_open F -+GLIBC_2.27 _IO_putc F -+GLIBC_2.27 _IO_puts F -+GLIBC_2.27 _IO_remove_marker F -+GLIBC_2.27 _IO_seekmark F -+GLIBC_2.27 _IO_seekoff F -+GLIBC_2.27 _IO_seekpos F -+GLIBC_2.27 _IO_seekwmark F -+GLIBC_2.27 _IO_setb F -+GLIBC_2.27 _IO_setbuffer F -+GLIBC_2.27 _IO_setvbuf F -+GLIBC_2.27 _IO_sgetn F -+GLIBC_2.27 _IO_sprintf F -+GLIBC_2.27 _IO_sputbackc F -+GLIBC_2.27 _IO_sputbackwc F -+GLIBC_2.27 _IO_sscanf F -+GLIBC_2.27 _IO_str_init_readonly F -+GLIBC_2.27 _IO_str_init_static F -+GLIBC_2.27 _IO_str_overflow F -+GLIBC_2.27 _IO_str_pbackfail F -+GLIBC_2.27 _IO_str_seekoff F -+GLIBC_2.27 _IO_str_underflow F -+GLIBC_2.27 _IO_sungetc F -+GLIBC_2.27 _IO_sungetwc F -+GLIBC_2.27 _IO_switch_to_get_mode F -+GLIBC_2.27 _IO_switch_to_main_wget_area F -+GLIBC_2.27 _IO_switch_to_wbackup_area F -+GLIBC_2.27 _IO_switch_to_wget_mode F -+GLIBC_2.27 _IO_un_link F -+GLIBC_2.27 _IO_ungetc F -+GLIBC_2.27 _IO_unsave_markers F -+GLIBC_2.27 _IO_unsave_wmarkers F -+GLIBC_2.27 _IO_vfprintf F -+GLIBC_2.27 _IO_vfscanf F -+GLIBC_2.27 _IO_vsprintf F -+GLIBC_2.27 _IO_wdefault_doallocate F -+GLIBC_2.27 _IO_wdefault_finish F -+GLIBC_2.27 _IO_wdefault_pbackfail F -+GLIBC_2.27 _IO_wdefault_uflow F -+GLIBC_2.27 _IO_wdefault_xsgetn F -+GLIBC_2.27 _IO_wdefault_xsputn F -+GLIBC_2.27 _IO_wdo_write F -+GLIBC_2.27 _IO_wdoallocbuf F -+GLIBC_2.27 _IO_wfile_jumps D 0xa8 -+GLIBC_2.27 _IO_wfile_overflow F -+GLIBC_2.27 _IO_wfile_seekoff F -+GLIBC_2.27 _IO_wfile_sync F -+GLIBC_2.27 _IO_wfile_underflow F -+GLIBC_2.27 _IO_wfile_xsputn F -+GLIBC_2.27 _IO_wmarker_delta F -+GLIBC_2.27 _IO_wsetb F -+GLIBC_2.27 ___brk_addr D 0x8 -+GLIBC_2.27 __adjtimex F -+GLIBC_2.27 __after_morecore_hook D 0x8 -+GLIBC_2.27 __argz_count F -+GLIBC_2.27 __argz_next F -+GLIBC_2.27 __argz_stringify F -+GLIBC_2.27 __asprintf F -+GLIBC_2.27 __asprintf_chk F -+GLIBC_2.27 __assert F -+GLIBC_2.27 __assert_fail F -+GLIBC_2.27 __assert_perror_fail F -+GLIBC_2.27 __backtrace F -+GLIBC_2.27 __backtrace_symbols F -+GLIBC_2.27 __backtrace_symbols_fd F -+GLIBC_2.27 __bsd_getpgrp F -+GLIBC_2.27 __bzero F -+GLIBC_2.27 __check_rhosts_file D 0x4 -+GLIBC_2.27 __chk_fail F -+GLIBC_2.27 __clone F -+GLIBC_2.27 __close F -+GLIBC_2.27 __cmsg_nxthdr F -+GLIBC_2.27 __confstr_chk F -+GLIBC_2.27 __connect F -+GLIBC_2.27 __ctype_b_loc F -+GLIBC_2.27 __ctype_get_mb_cur_max F -+GLIBC_2.27 __ctype_tolower_loc F -+GLIBC_2.27 __ctype_toupper_loc F -+GLIBC_2.27 __curbrk D 0x8 -+GLIBC_2.27 __cxa_at_quick_exit F -+GLIBC_2.27 __cxa_atexit F -+GLIBC_2.27 __cxa_finalize F 
-+GLIBC_2.27 __cxa_thread_atexit_impl F -+GLIBC_2.27 __cyg_profile_func_enter F -+GLIBC_2.27 __cyg_profile_func_exit F -+GLIBC_2.27 __daylight D 0x4 -+GLIBC_2.27 __dcgettext F -+GLIBC_2.27 __default_morecore F -+GLIBC_2.27 __dgettext F -+GLIBC_2.27 __dprintf_chk F -+GLIBC_2.27 __dup2 F -+GLIBC_2.27 __duplocale F -+GLIBC_2.27 __endmntent F -+GLIBC_2.27 __environ D 0x8 -+GLIBC_2.27 __errno_location F -+GLIBC_2.27 __explicit_bzero_chk F -+GLIBC_2.27 __fbufsize F -+GLIBC_2.27 __fcntl F -+GLIBC_2.27 __fdelt_chk F -+GLIBC_2.27 __fdelt_warn F -+GLIBC_2.27 __ffs F -+GLIBC_2.27 __fgets_chk F -+GLIBC_2.27 __fgets_unlocked_chk F -+GLIBC_2.27 __fgetws_chk F -+GLIBC_2.27 __fgetws_unlocked_chk F -+GLIBC_2.27 __finite F -+GLIBC_2.27 __finitef F -+GLIBC_2.27 __finitel F -+GLIBC_2.27 __flbf F -+GLIBC_2.27 __fork F -+GLIBC_2.27 __fpending F -+GLIBC_2.27 __fprintf_chk F -+GLIBC_2.27 __fpu_control D 0x4 -+GLIBC_2.27 __fpurge F -+GLIBC_2.27 __fread_chk F -+GLIBC_2.27 __fread_unlocked_chk F -+GLIBC_2.27 __freadable F -+GLIBC_2.27 __freading F -+GLIBC_2.27 __free_hook D 0x8 -+GLIBC_2.27 __freelocale F -+GLIBC_2.27 __fsetlocking F -+GLIBC_2.27 __fwprintf_chk F -+GLIBC_2.27 __fwritable F -+GLIBC_2.27 __fwriting F -+GLIBC_2.27 __fxstat F -+GLIBC_2.27 __fxstat64 F -+GLIBC_2.27 __fxstatat F -+GLIBC_2.27 __fxstatat64 F -+GLIBC_2.27 __getauxval F -+GLIBC_2.27 __getcwd_chk F -+GLIBC_2.27 __getdelim F -+GLIBC_2.27 __getdomainname_chk F -+GLIBC_2.27 __getgroups_chk F -+GLIBC_2.27 __gethostname_chk F -+GLIBC_2.27 __getlogin_r_chk F -+GLIBC_2.27 __getmntent_r F -+GLIBC_2.27 __getpagesize F -+GLIBC_2.27 __getpgid F -+GLIBC_2.27 __getpid F -+GLIBC_2.27 __gets_chk F -+GLIBC_2.27 __gettimeofday F -+GLIBC_2.27 __getwd_chk F -+GLIBC_2.27 __gmtime_r F -+GLIBC_2.27 __h_errno_location F -+GLIBC_2.27 __isalnum_l F -+GLIBC_2.27 __isalpha_l F -+GLIBC_2.27 __isascii_l F -+GLIBC_2.27 __isblank_l F -+GLIBC_2.27 __iscntrl_l F -+GLIBC_2.27 __isctype F -+GLIBC_2.27 __isdigit_l F -+GLIBC_2.27 __isgraph_l F -+GLIBC_2.27 __isinf F -+GLIBC_2.27 __isinff F -+GLIBC_2.27 __isinfl F -+GLIBC_2.27 __islower_l F -+GLIBC_2.27 __isnan F -+GLIBC_2.27 __isnanf F -+GLIBC_2.27 __isnanl F -+GLIBC_2.27 __isoc99_fscanf F -+GLIBC_2.27 __isoc99_fwscanf F -+GLIBC_2.27 __isoc99_scanf F -+GLIBC_2.27 __isoc99_sscanf F -+GLIBC_2.27 __isoc99_swscanf F -+GLIBC_2.27 __isoc99_vfscanf F -+GLIBC_2.27 __isoc99_vfwscanf F -+GLIBC_2.27 __isoc99_vscanf F -+GLIBC_2.27 __isoc99_vsscanf F -+GLIBC_2.27 __isoc99_vswscanf F -+GLIBC_2.27 __isoc99_vwscanf F -+GLIBC_2.27 __isoc99_wscanf F -+GLIBC_2.27 __isprint_l F -+GLIBC_2.27 __ispunct_l F -+GLIBC_2.27 __isspace_l F -+GLIBC_2.27 __isupper_l F -+GLIBC_2.27 __iswalnum_l F -+GLIBC_2.27 __iswalpha_l F -+GLIBC_2.27 __iswblank_l F -+GLIBC_2.27 __iswcntrl_l F -+GLIBC_2.27 __iswctype F -+GLIBC_2.27 __iswctype_l F -+GLIBC_2.27 __iswdigit_l F -+GLIBC_2.27 __iswgraph_l F -+GLIBC_2.27 __iswlower_l F -+GLIBC_2.27 __iswprint_l F -+GLIBC_2.27 __iswpunct_l F -+GLIBC_2.27 __iswspace_l F -+GLIBC_2.27 __iswupper_l F -+GLIBC_2.27 __iswxdigit_l F -+GLIBC_2.27 __isxdigit_l F -+GLIBC_2.27 __ivaliduser F -+GLIBC_2.27 __key_decryptsession_pk_LOCAL D 0x8 -+GLIBC_2.27 __key_encryptsession_pk_LOCAL D 0x8 -+GLIBC_2.27 __key_gendes_LOCAL D 0x8 -+GLIBC_2.27 __libc_allocate_rtsig F -+GLIBC_2.27 __libc_calloc F -+GLIBC_2.27 __libc_current_sigrtmax F -+GLIBC_2.27 __libc_current_sigrtmin F -+GLIBC_2.27 __libc_free F -+GLIBC_2.27 __libc_freeres F -+GLIBC_2.27 __libc_init_first F -+GLIBC_2.27 __libc_mallinfo F -+GLIBC_2.27 __libc_malloc F -+GLIBC_2.27 __libc_mallopt F 
-+GLIBC_2.27 __libc_memalign F -+GLIBC_2.27 __libc_pvalloc F -+GLIBC_2.27 __libc_realloc F -+GLIBC_2.27 __libc_sa_len F -+GLIBC_2.27 __libc_start_main F -+GLIBC_2.27 __libc_valloc F -+GLIBC_2.27 __longjmp_chk F -+GLIBC_2.27 __lseek F -+GLIBC_2.27 __lxstat F -+GLIBC_2.27 __lxstat64 F -+GLIBC_2.27 __malloc_hook D 0x8 -+GLIBC_2.27 __mbrlen F -+GLIBC_2.27 __mbrtowc F -+GLIBC_2.27 __mbsnrtowcs_chk F -+GLIBC_2.27 __mbsrtowcs_chk F -+GLIBC_2.27 __mbstowcs_chk F -+GLIBC_2.27 __memalign_hook D 0x8 -+GLIBC_2.27 __memcpy_chk F -+GLIBC_2.27 __memmove_chk F -+GLIBC_2.27 __mempcpy F -+GLIBC_2.27 __mempcpy_chk F -+GLIBC_2.27 __memset_chk F -+GLIBC_2.27 __monstartup F -+GLIBC_2.27 __morecore D 0x8 -+GLIBC_2.27 __nanosleep F -+GLIBC_2.27 __newlocale F -+GLIBC_2.27 __nl_langinfo_l F -+GLIBC_2.27 __nss_configure_lookup F -+GLIBC_2.27 __nss_database_lookup F -+GLIBC_2.27 __nss_hostname_digits_dots F -+GLIBC_2.27 __nss_next F -+GLIBC_2.27 __obstack_printf_chk F -+GLIBC_2.27 __obstack_vprintf_chk F -+GLIBC_2.27 __open F -+GLIBC_2.27 __open64 F -+GLIBC_2.27 __open64_2 F -+GLIBC_2.27 __open_2 F -+GLIBC_2.27 __openat64_2 F -+GLIBC_2.27 __openat_2 F -+GLIBC_2.27 __overflow F -+GLIBC_2.27 __pipe F -+GLIBC_2.27 __poll F -+GLIBC_2.27 __poll_chk F -+GLIBC_2.27 __posix_getopt F -+GLIBC_2.27 __ppoll_chk F -+GLIBC_2.27 __pread64 F -+GLIBC_2.27 __pread64_chk F -+GLIBC_2.27 __pread_chk F -+GLIBC_2.27 __printf_chk F -+GLIBC_2.27 __printf_fp F -+GLIBC_2.27 __profile_frequency F -+GLIBC_2.27 __progname D 0x8 -+GLIBC_2.27 __progname_full D 0x8 -+GLIBC_2.27 __ptsname_r_chk F -+GLIBC_2.27 __pwrite64 F -+GLIBC_2.27 __rawmemchr F -+GLIBC_2.27 __rcmd_errstr D 0x8 -+GLIBC_2.27 __read F -+GLIBC_2.27 __read_chk F -+GLIBC_2.27 __readlink_chk F -+GLIBC_2.27 __readlinkat_chk F -+GLIBC_2.27 __realloc_hook D 0x8 -+GLIBC_2.27 __realpath_chk F -+GLIBC_2.27 __recv_chk F -+GLIBC_2.27 __recvfrom_chk F -+GLIBC_2.27 __register_atfork F -+GLIBC_2.27 __res_init F -+GLIBC_2.27 __res_nclose F -+GLIBC_2.27 __res_ninit F -+GLIBC_2.27 __res_randomid F -+GLIBC_2.27 __res_state F -+GLIBC_2.27 __rpc_thread_createerr F -+GLIBC_2.27 __rpc_thread_svc_fdset F -+GLIBC_2.27 __rpc_thread_svc_max_pollfd F -+GLIBC_2.27 __rpc_thread_svc_pollfd F -+GLIBC_2.27 __sbrk F -+GLIBC_2.27 __sched_cpualloc F -+GLIBC_2.27 __sched_cpucount F -+GLIBC_2.27 __sched_cpufree F -+GLIBC_2.27 __sched_get_priority_max F -+GLIBC_2.27 __sched_get_priority_min F -+GLIBC_2.27 __sched_getparam F -+GLIBC_2.27 __sched_getscheduler F -+GLIBC_2.27 __sched_setscheduler F -+GLIBC_2.27 __sched_yield F -+GLIBC_2.27 __select F -+GLIBC_2.27 __send F -+GLIBC_2.27 __setmntent F -+GLIBC_2.27 __setpgid F -+GLIBC_2.27 __sigaction F -+GLIBC_2.27 __signbit F -+GLIBC_2.27 __signbitf F -+GLIBC_2.27 __signbitl F -+GLIBC_2.27 __sigpause F -+GLIBC_2.27 __sigsetjmp F -+GLIBC_2.27 __sigsuspend F -+GLIBC_2.27 __snprintf_chk F -+GLIBC_2.27 __sprintf_chk F -+GLIBC_2.27 __stack_chk_fail F -+GLIBC_2.27 __statfs F -+GLIBC_2.27 __stpcpy F -+GLIBC_2.27 __stpcpy_chk F -+GLIBC_2.27 __stpncpy F -+GLIBC_2.27 __stpncpy_chk F -+GLIBC_2.27 __strcasecmp F -+GLIBC_2.27 __strcasecmp_l F -+GLIBC_2.27 __strcasestr F -+GLIBC_2.27 __strcat_chk F -+GLIBC_2.27 __strcoll_l F -+GLIBC_2.27 __strcpy_chk F -+GLIBC_2.27 __strdup F -+GLIBC_2.27 __strerror_r F -+GLIBC_2.27 __strfmon_l F -+GLIBC_2.27 __strftime_l F -+GLIBC_2.27 __strncasecmp_l F -+GLIBC_2.27 __strncat_chk F -+GLIBC_2.27 __strncpy_chk F -+GLIBC_2.27 __strndup F -+GLIBC_2.27 __strsep_g F -+GLIBC_2.27 __strtod_internal F -+GLIBC_2.27 __strtod_l F -+GLIBC_2.27 __strtof_internal F 
-+GLIBC_2.27 __strtof_l F -+GLIBC_2.27 __strtok_r F -+GLIBC_2.27 __strtol_internal F -+GLIBC_2.27 __strtol_l F -+GLIBC_2.27 __strtold_internal F -+GLIBC_2.27 __strtold_l F -+GLIBC_2.27 __strtoll_internal F -+GLIBC_2.27 __strtoll_l F -+GLIBC_2.27 __strtoul_internal F -+GLIBC_2.27 __strtoul_l F -+GLIBC_2.27 __strtoull_internal F -+GLIBC_2.27 __strtoull_l F -+GLIBC_2.27 __strverscmp F -+GLIBC_2.27 __strxfrm_l F -+GLIBC_2.27 __swprintf_chk F -+GLIBC_2.27 __sysconf F -+GLIBC_2.27 __syslog_chk F -+GLIBC_2.27 __sysv_signal F -+GLIBC_2.27 __timezone D 0x8 -+GLIBC_2.27 __toascii_l F -+GLIBC_2.27 __tolower_l F -+GLIBC_2.27 __toupper_l F -+GLIBC_2.27 __towctrans F -+GLIBC_2.27 __towctrans_l F -+GLIBC_2.27 __towlower_l F -+GLIBC_2.27 __towupper_l F -+GLIBC_2.27 __ttyname_r_chk F -+GLIBC_2.27 __tzname D 0x10 -+GLIBC_2.27 __uflow F -+GLIBC_2.27 __underflow F -+GLIBC_2.27 __uselocale F -+GLIBC_2.27 __vasprintf_chk F -+GLIBC_2.27 __vdprintf_chk F -+GLIBC_2.27 __vfork F -+GLIBC_2.27 __vfprintf_chk F -+GLIBC_2.27 __vfscanf F -+GLIBC_2.27 __vfwprintf_chk F -+GLIBC_2.27 __vprintf_chk F -+GLIBC_2.27 __vsnprintf F -+GLIBC_2.27 __vsnprintf_chk F -+GLIBC_2.27 __vsprintf_chk F -+GLIBC_2.27 __vsscanf F -+GLIBC_2.27 __vswprintf_chk F -+GLIBC_2.27 __vsyslog_chk F -+GLIBC_2.27 __vwprintf_chk F -+GLIBC_2.27 __wait F -+GLIBC_2.27 __waitpid F -+GLIBC_2.27 __wcpcpy_chk F -+GLIBC_2.27 __wcpncpy_chk F -+GLIBC_2.27 __wcrtomb_chk F -+GLIBC_2.27 __wcscasecmp_l F -+GLIBC_2.27 __wcscat_chk F -+GLIBC_2.27 __wcscoll_l F -+GLIBC_2.27 __wcscpy_chk F -+GLIBC_2.27 __wcsftime_l F -+GLIBC_2.27 __wcsncasecmp_l F -+GLIBC_2.27 __wcsncat_chk F -+GLIBC_2.27 __wcsncpy_chk F -+GLIBC_2.27 __wcsnrtombs_chk F -+GLIBC_2.27 __wcsrtombs_chk F -+GLIBC_2.27 __wcstod_internal F -+GLIBC_2.27 __wcstod_l F -+GLIBC_2.27 __wcstof_internal F -+GLIBC_2.27 __wcstof_l F -+GLIBC_2.27 __wcstol_internal F -+GLIBC_2.27 __wcstol_l F -+GLIBC_2.27 __wcstold_internal F -+GLIBC_2.27 __wcstold_l F -+GLIBC_2.27 __wcstoll_internal F -+GLIBC_2.27 __wcstoll_l F -+GLIBC_2.27 __wcstombs_chk F -+GLIBC_2.27 __wcstoul_internal F -+GLIBC_2.27 __wcstoul_l F -+GLIBC_2.27 __wcstoull_internal F -+GLIBC_2.27 __wcstoull_l F -+GLIBC_2.27 __wcsxfrm_l F -+GLIBC_2.27 __wctomb_chk F -+GLIBC_2.27 __wctrans_l F -+GLIBC_2.27 __wctype_l F -+GLIBC_2.27 __wmemcpy_chk F -+GLIBC_2.27 __wmemmove_chk F -+GLIBC_2.27 __wmempcpy_chk F -+GLIBC_2.27 __wmemset_chk F -+GLIBC_2.27 __woverflow F -+GLIBC_2.27 __wprintf_chk F -+GLIBC_2.27 __write F -+GLIBC_2.27 __wuflow F -+GLIBC_2.27 __wunderflow F -+GLIBC_2.27 __xmknod F -+GLIBC_2.27 __xmknodat F -+GLIBC_2.27 __xpg_basename F -+GLIBC_2.27 __xpg_sigpause F -+GLIBC_2.27 __xpg_strerror_r F -+GLIBC_2.27 __xstat F -+GLIBC_2.27 __xstat64 F -+GLIBC_2.27 _authenticate F -+GLIBC_2.27 _dl_mcount_wrapper F -+GLIBC_2.27 _dl_mcount_wrapper_check F -+GLIBC_2.27 _environ D 0x8 -+GLIBC_2.27 _exit F -+GLIBC_2.27 _flushlbf F -+GLIBC_2.27 _libc_intl_domainname D 0x5 -+GLIBC_2.27 _longjmp F -+GLIBC_2.27 _mcleanup F -+GLIBC_2.27 _mcount F -+GLIBC_2.27 _nl_default_dirname D 0x12 -+GLIBC_2.27 _nl_domain_bindings D 0x8 -+GLIBC_2.27 _nl_msg_cat_cntr D 0x4 -+GLIBC_2.27 _null_auth D 0x18 -+GLIBC_2.27 _obstack_allocated_p F -+GLIBC_2.27 _obstack_begin F -+GLIBC_2.27 _obstack_begin_1 F -+GLIBC_2.27 _obstack_free F -+GLIBC_2.27 _obstack_memory_used F -+GLIBC_2.27 _obstack_newchunk F -+GLIBC_2.27 _res D 0x238 -+GLIBC_2.27 _res_hconf D 0x48 -+GLIBC_2.27 _rpc_dtablesize F -+GLIBC_2.27 _seterr_reply F -+GLIBC_2.27 _setjmp F -+GLIBC_2.27 _sys_errlist D 0x2370 -+GLIBC_2.27 _sys_nerr D 0x4 
-+GLIBC_2.27 _sys_siglist D 0x400 -+GLIBC_2.27 _tolower F -+GLIBC_2.27 _toupper F -+GLIBC_2.27 a64l F -+GLIBC_2.27 abort F -+GLIBC_2.27 abs F -+GLIBC_2.27 accept F -+GLIBC_2.27 accept4 F -+GLIBC_2.27 access F -+GLIBC_2.27 acct F -+GLIBC_2.27 addmntent F -+GLIBC_2.27 addseverity F -+GLIBC_2.27 adjtime F -+GLIBC_2.27 adjtimex F -+GLIBC_2.27 alarm F -+GLIBC_2.27 aligned_alloc F -+GLIBC_2.27 alphasort F -+GLIBC_2.27 alphasort64 F -+GLIBC_2.27 argp_err_exit_status D 0x4 -+GLIBC_2.27 argp_error F -+GLIBC_2.27 argp_failure F -+GLIBC_2.27 argp_help F -+GLIBC_2.27 argp_parse F -+GLIBC_2.27 argp_program_bug_address D 0x8 -+GLIBC_2.27 argp_program_version D 0x8 -+GLIBC_2.27 argp_program_version_hook D 0x8 -+GLIBC_2.27 argp_state_help F -+GLIBC_2.27 argp_usage F -+GLIBC_2.27 argz_add F -+GLIBC_2.27 argz_add_sep F -+GLIBC_2.27 argz_append F -+GLIBC_2.27 argz_count F -+GLIBC_2.27 argz_create F -+GLIBC_2.27 argz_create_sep F -+GLIBC_2.27 argz_delete F -+GLIBC_2.27 argz_extract F -+GLIBC_2.27 argz_insert F -+GLIBC_2.27 argz_next F -+GLIBC_2.27 argz_replace F -+GLIBC_2.27 argz_stringify F -+GLIBC_2.27 asctime F -+GLIBC_2.27 asctime_r F -+GLIBC_2.27 asprintf F -+GLIBC_2.27 atof F -+GLIBC_2.27 atoi F -+GLIBC_2.27 atol F -+GLIBC_2.27 atoll F -+GLIBC_2.27 authdes_create F -+GLIBC_2.27 authdes_getucred F -+GLIBC_2.27 authdes_pk_create F -+GLIBC_2.27 authnone_create F -+GLIBC_2.27 authunix_create F -+GLIBC_2.27 authunix_create_default F -+GLIBC_2.27 backtrace F -+GLIBC_2.27 backtrace_symbols F -+GLIBC_2.27 backtrace_symbols_fd F -+GLIBC_2.27 basename F -+GLIBC_2.27 bcmp F -+GLIBC_2.27 bcopy F -+GLIBC_2.27 bind F -+GLIBC_2.27 bind_textdomain_codeset F -+GLIBC_2.27 bindresvport F -+GLIBC_2.27 bindtextdomain F -+GLIBC_2.27 brk F -+GLIBC_2.27 bsd_signal F -+GLIBC_2.27 bsearch F -+GLIBC_2.27 btowc F -+GLIBC_2.27 bzero F -+GLIBC_2.27 c16rtomb F -+GLIBC_2.27 c32rtomb F -+GLIBC_2.27 calloc F -+GLIBC_2.27 callrpc F -+GLIBC_2.27 canonicalize_file_name F -+GLIBC_2.27 capget F -+GLIBC_2.27 capset F -+GLIBC_2.27 catclose F -+GLIBC_2.27 catgets F -+GLIBC_2.27 catopen F -+GLIBC_2.27 cbc_crypt F -+GLIBC_2.27 cfgetispeed F -+GLIBC_2.27 cfgetospeed F -+GLIBC_2.27 cfmakeraw F -+GLIBC_2.27 cfsetispeed F -+GLIBC_2.27 cfsetospeed F -+GLIBC_2.27 cfsetspeed F -+GLIBC_2.27 chdir F -+GLIBC_2.27 chflags F -+GLIBC_2.27 chmod F -+GLIBC_2.27 chown F -+GLIBC_2.27 chroot F -+GLIBC_2.27 clearenv F -+GLIBC_2.27 clearerr F -+GLIBC_2.27 clearerr_unlocked F -+GLIBC_2.27 clnt_broadcast F -+GLIBC_2.27 clnt_create F -+GLIBC_2.27 clnt_pcreateerror F -+GLIBC_2.27 clnt_perrno F -+GLIBC_2.27 clnt_perror F -+GLIBC_2.27 clnt_spcreateerror F -+GLIBC_2.27 clnt_sperrno F -+GLIBC_2.27 clnt_sperror F -+GLIBC_2.27 clntraw_create F -+GLIBC_2.27 clnttcp_create F -+GLIBC_2.27 clntudp_bufcreate F -+GLIBC_2.27 clntudp_create F -+GLIBC_2.27 clntunix_create F -+GLIBC_2.27 clock F -+GLIBC_2.27 clock_adjtime F -+GLIBC_2.27 clock_getcpuclockid F -+GLIBC_2.27 clock_getres F -+GLIBC_2.27 clock_gettime F -+GLIBC_2.27 clock_nanosleep F -+GLIBC_2.27 clock_settime F -+GLIBC_2.27 clone F -+GLIBC_2.27 close F -+GLIBC_2.27 closedir F -+GLIBC_2.27 closelog F -+GLIBC_2.27 confstr F -+GLIBC_2.27 connect F -+GLIBC_2.27 copy_file_range F -+GLIBC_2.27 copysign F -+GLIBC_2.27 copysignf F -+GLIBC_2.27 copysignl F -+GLIBC_2.27 creat F -+GLIBC_2.27 creat64 F -+GLIBC_2.27 ctermid F -+GLIBC_2.27 ctime F -+GLIBC_2.27 ctime_r F -+GLIBC_2.27 cuserid F -+GLIBC_2.27 daemon F -+GLIBC_2.27 daylight D 0x4 -+GLIBC_2.27 dcgettext F -+GLIBC_2.27 dcngettext F -+GLIBC_2.27 delete_module F -+GLIBC_2.27 
des_setparity F -+GLIBC_2.27 dgettext F -+GLIBC_2.27 difftime F -+GLIBC_2.27 dirfd F -+GLIBC_2.27 dirname F -+GLIBC_2.27 div F -+GLIBC_2.27 dl_iterate_phdr F -+GLIBC_2.27 dngettext F -+GLIBC_2.27 dprintf F -+GLIBC_2.27 drand48 F -+GLIBC_2.27 drand48_r F -+GLIBC_2.27 dup F -+GLIBC_2.27 dup2 F -+GLIBC_2.27 dup3 F -+GLIBC_2.27 duplocale F -+GLIBC_2.27 dysize F -+GLIBC_2.27 eaccess F -+GLIBC_2.27 ecb_crypt F -+GLIBC_2.27 ecvt F -+GLIBC_2.27 ecvt_r F -+GLIBC_2.27 endaliasent F -+GLIBC_2.27 endfsent F -+GLIBC_2.27 endgrent F -+GLIBC_2.27 endhostent F -+GLIBC_2.27 endmntent F -+GLIBC_2.27 endnetent F -+GLIBC_2.27 endnetgrent F -+GLIBC_2.27 endprotoent F -+GLIBC_2.27 endpwent F -+GLIBC_2.27 endrpcent F -+GLIBC_2.27 endservent F -+GLIBC_2.27 endsgent F -+GLIBC_2.27 endspent F -+GLIBC_2.27 endttyent F -+GLIBC_2.27 endusershell F -+GLIBC_2.27 endutent F -+GLIBC_2.27 endutxent F -+GLIBC_2.27 environ D 0x8 -+GLIBC_2.27 envz_add F -+GLIBC_2.27 envz_entry F -+GLIBC_2.27 envz_get F -+GLIBC_2.27 envz_merge F -+GLIBC_2.27 envz_remove F -+GLIBC_2.27 envz_strip F -+GLIBC_2.27 epoll_create F -+GLIBC_2.27 epoll_create1 F -+GLIBC_2.27 epoll_ctl F -+GLIBC_2.27 epoll_pwait F -+GLIBC_2.27 epoll_wait F -+GLIBC_2.27 erand48 F -+GLIBC_2.27 erand48_r F -+GLIBC_2.27 err F -+GLIBC_2.27 error F -+GLIBC_2.27 error_at_line F -+GLIBC_2.27 error_message_count D 0x4 -+GLIBC_2.27 error_one_per_line D 0x4 -+GLIBC_2.27 error_print_progname D 0x8 -+GLIBC_2.27 errx F -+GLIBC_2.27 ether_aton F -+GLIBC_2.27 ether_aton_r F -+GLIBC_2.27 ether_hostton F -+GLIBC_2.27 ether_line F -+GLIBC_2.27 ether_ntoa F -+GLIBC_2.27 ether_ntoa_r F -+GLIBC_2.27 ether_ntohost F -+GLIBC_2.27 euidaccess F -+GLIBC_2.27 eventfd F -+GLIBC_2.27 eventfd_read F -+GLIBC_2.27 eventfd_write F -+GLIBC_2.27 execl F -+GLIBC_2.27 execle F -+GLIBC_2.27 execlp F -+GLIBC_2.27 execv F -+GLIBC_2.27 execve F -+GLIBC_2.27 execvp F -+GLIBC_2.27 execvpe F -+GLIBC_2.27 exit F -+GLIBC_2.27 explicit_bzero F -+GLIBC_2.27 faccessat F -+GLIBC_2.27 fallocate F -+GLIBC_2.27 fallocate64 F -+GLIBC_2.27 fanotify_init F -+GLIBC_2.27 fanotify_mark F -+GLIBC_2.27 fattach F -+GLIBC_2.27 fchdir F -+GLIBC_2.27 fchflags F -+GLIBC_2.27 fchmod F -+GLIBC_2.27 fchmodat F -+GLIBC_2.27 fchown F -+GLIBC_2.27 fchownat F -+GLIBC_2.27 fclose F -+GLIBC_2.27 fcloseall F -+GLIBC_2.27 fcntl F -+GLIBC_2.27 fcvt F -+GLIBC_2.27 fcvt_r F -+GLIBC_2.27 fdatasync F -+GLIBC_2.27 fdetach F -+GLIBC_2.27 fdopen F -+GLIBC_2.27 fdopendir F -+GLIBC_2.27 feof F -+GLIBC_2.27 feof_unlocked F -+GLIBC_2.27 ferror F -+GLIBC_2.27 ferror_unlocked F -+GLIBC_2.27 fexecve F -+GLIBC_2.27 fflush F -+GLIBC_2.27 fflush_unlocked F -+GLIBC_2.27 ffs F -+GLIBC_2.27 ffsl F -+GLIBC_2.27 ffsll F -+GLIBC_2.27 fgetc F -+GLIBC_2.27 fgetc_unlocked F -+GLIBC_2.27 fgetgrent F -+GLIBC_2.27 fgetgrent_r F -+GLIBC_2.27 fgetpos F -+GLIBC_2.27 fgetpos64 F -+GLIBC_2.27 fgetpwent F -+GLIBC_2.27 fgetpwent_r F -+GLIBC_2.27 fgets F -+GLIBC_2.27 fgets_unlocked F -+GLIBC_2.27 fgetsgent F -+GLIBC_2.27 fgetsgent_r F -+GLIBC_2.27 fgetspent F -+GLIBC_2.27 fgetspent_r F -+GLIBC_2.27 fgetwc F -+GLIBC_2.27 fgetwc_unlocked F -+GLIBC_2.27 fgetws F -+GLIBC_2.27 fgetws_unlocked F -+GLIBC_2.27 fgetxattr F -+GLIBC_2.27 fileno F -+GLIBC_2.27 fileno_unlocked F -+GLIBC_2.27 finite F -+GLIBC_2.27 finitef F -+GLIBC_2.27 finitel F -+GLIBC_2.27 flistxattr F -+GLIBC_2.27 flock F -+GLIBC_2.27 flockfile F -+GLIBC_2.27 fmemopen F -+GLIBC_2.27 fmtmsg F -+GLIBC_2.27 fnmatch F -+GLIBC_2.27 fopen F -+GLIBC_2.27 fopen64 F -+GLIBC_2.27 fopencookie F -+GLIBC_2.27 fork F -+GLIBC_2.27 fpathconf 
F -+GLIBC_2.27 fprintf F -+GLIBC_2.27 fputc F -+GLIBC_2.27 fputc_unlocked F -+GLIBC_2.27 fputs F -+GLIBC_2.27 fputs_unlocked F -+GLIBC_2.27 fputwc F -+GLIBC_2.27 fputwc_unlocked F -+GLIBC_2.27 fputws F -+GLIBC_2.27 fputws_unlocked F -+GLIBC_2.27 fread F -+GLIBC_2.27 fread_unlocked F -+GLIBC_2.27 free F -+GLIBC_2.27 freeaddrinfo F -+GLIBC_2.27 freeifaddrs F -+GLIBC_2.27 freelocale F -+GLIBC_2.27 fremovexattr F -+GLIBC_2.27 freopen F -+GLIBC_2.27 freopen64 F -+GLIBC_2.27 frexp F -+GLIBC_2.27 frexpf F -+GLIBC_2.27 frexpl F -+GLIBC_2.27 fscanf F -+GLIBC_2.27 fseek F -+GLIBC_2.27 fseeko F -+GLIBC_2.27 fseeko64 F -+GLIBC_2.27 fsetpos F -+GLIBC_2.27 fsetpos64 F -+GLIBC_2.27 fsetxattr F -+GLIBC_2.27 fstatfs F -+GLIBC_2.27 fstatfs64 F -+GLIBC_2.27 fstatvfs F -+GLIBC_2.27 fstatvfs64 F -+GLIBC_2.27 fsync F -+GLIBC_2.27 ftell F -+GLIBC_2.27 ftello F -+GLIBC_2.27 ftello64 F -+GLIBC_2.27 ftime F -+GLIBC_2.27 ftok F -+GLIBC_2.27 ftruncate F -+GLIBC_2.27 ftruncate64 F -+GLIBC_2.27 ftrylockfile F -+GLIBC_2.27 fts64_children F -+GLIBC_2.27 fts64_close F -+GLIBC_2.27 fts64_open F -+GLIBC_2.27 fts64_read F -+GLIBC_2.27 fts64_set F -+GLIBC_2.27 fts_children F -+GLIBC_2.27 fts_close F -+GLIBC_2.27 fts_open F -+GLIBC_2.27 fts_read F -+GLIBC_2.27 fts_set F -+GLIBC_2.27 ftw F -+GLIBC_2.27 ftw64 F -+GLIBC_2.27 funlockfile F -+GLIBC_2.27 futimens F -+GLIBC_2.27 futimes F -+GLIBC_2.27 futimesat F -+GLIBC_2.27 fwide F -+GLIBC_2.27 fwprintf F -+GLIBC_2.27 fwrite F -+GLIBC_2.27 fwrite_unlocked F -+GLIBC_2.27 fwscanf F -+GLIBC_2.27 gai_strerror F -+GLIBC_2.27 gcvt F -+GLIBC_2.27 get_avphys_pages F -+GLIBC_2.27 get_current_dir_name F -+GLIBC_2.27 get_myaddress F -+GLIBC_2.27 get_nprocs F -+GLIBC_2.27 get_nprocs_conf F -+GLIBC_2.27 get_phys_pages F -+GLIBC_2.27 getaddrinfo F -+GLIBC_2.27 getaliasbyname F -+GLIBC_2.27 getaliasbyname_r F -+GLIBC_2.27 getaliasent F -+GLIBC_2.27 getaliasent_r F -+GLIBC_2.27 getauxval F -+GLIBC_2.27 getc F -+GLIBC_2.27 getc_unlocked F -+GLIBC_2.27 getchar F -+GLIBC_2.27 getchar_unlocked F -+GLIBC_2.27 getcontext F -+GLIBC_2.27 getcwd F -+GLIBC_2.27 getdate F -+GLIBC_2.27 getdate_err D 0x4 -+GLIBC_2.27 getdate_r F -+GLIBC_2.27 getdelim F -+GLIBC_2.27 getdirentries F -+GLIBC_2.27 getdirentries64 F -+GLIBC_2.27 getdomainname F -+GLIBC_2.27 getdtablesize F -+GLIBC_2.27 getegid F -+GLIBC_2.27 getentropy F -+GLIBC_2.27 getenv F -+GLIBC_2.27 geteuid F -+GLIBC_2.27 getfsent F -+GLIBC_2.27 getfsfile F -+GLIBC_2.27 getfsspec F -+GLIBC_2.27 getgid F -+GLIBC_2.27 getgrent F -+GLIBC_2.27 getgrent_r F -+GLIBC_2.27 getgrgid F -+GLIBC_2.27 getgrgid_r F -+GLIBC_2.27 getgrnam F -+GLIBC_2.27 getgrnam_r F -+GLIBC_2.27 getgrouplist F -+GLIBC_2.27 getgroups F -+GLIBC_2.27 gethostbyaddr F -+GLIBC_2.27 gethostbyaddr_r F -+GLIBC_2.27 gethostbyname F -+GLIBC_2.27 gethostbyname2 F -+GLIBC_2.27 gethostbyname2_r F -+GLIBC_2.27 gethostbyname_r F -+GLIBC_2.27 gethostent F -+GLIBC_2.27 gethostent_r F -+GLIBC_2.27 gethostid F -+GLIBC_2.27 gethostname F -+GLIBC_2.27 getifaddrs F -+GLIBC_2.27 getipv4sourcefilter F -+GLIBC_2.27 getitimer F -+GLIBC_2.27 getline F -+GLIBC_2.27 getloadavg F -+GLIBC_2.27 getlogin F -+GLIBC_2.27 getlogin_r F -+GLIBC_2.27 getmntent F -+GLIBC_2.27 getmntent_r F -+GLIBC_2.27 getmsg F -+GLIBC_2.27 getnameinfo F -+GLIBC_2.27 getnetbyaddr F -+GLIBC_2.27 getnetbyaddr_r F -+GLIBC_2.27 getnetbyname F -+GLIBC_2.27 getnetbyname_r F -+GLIBC_2.27 getnetent F -+GLIBC_2.27 getnetent_r F -+GLIBC_2.27 getnetgrent F -+GLIBC_2.27 getnetgrent_r F -+GLIBC_2.27 getnetname F -+GLIBC_2.27 getopt F -+GLIBC_2.27 getopt_long F 
-+GLIBC_2.27 getopt_long_only F -+GLIBC_2.27 getpagesize F -+GLIBC_2.27 getpass F -+GLIBC_2.27 getpeername F -+GLIBC_2.27 getpgid F -+GLIBC_2.27 getpgrp F -+GLIBC_2.27 getpid F -+GLIBC_2.27 getpmsg F -+GLIBC_2.27 getppid F -+GLIBC_2.27 getpriority F -+GLIBC_2.27 getprotobyname F -+GLIBC_2.27 getprotobyname_r F -+GLIBC_2.27 getprotobynumber F -+GLIBC_2.27 getprotobynumber_r F -+GLIBC_2.27 getprotoent F -+GLIBC_2.27 getprotoent_r F -+GLIBC_2.27 getpt F -+GLIBC_2.27 getpublickey F -+GLIBC_2.27 getpw F -+GLIBC_2.27 getpwent F -+GLIBC_2.27 getpwent_r F -+GLIBC_2.27 getpwnam F -+GLIBC_2.27 getpwnam_r F -+GLIBC_2.27 getpwuid F -+GLIBC_2.27 getpwuid_r F -+GLIBC_2.27 getrandom F -+GLIBC_2.27 getresgid F -+GLIBC_2.27 getresuid F -+GLIBC_2.27 getrlimit F -+GLIBC_2.27 getrlimit64 F -+GLIBC_2.27 getrpcbyname F -+GLIBC_2.27 getrpcbyname_r F -+GLIBC_2.27 getrpcbynumber F -+GLIBC_2.27 getrpcbynumber_r F -+GLIBC_2.27 getrpcent F -+GLIBC_2.27 getrpcent_r F -+GLIBC_2.27 getrpcport F -+GLIBC_2.27 getrusage F -+GLIBC_2.27 gets F -+GLIBC_2.27 getsecretkey F -+GLIBC_2.27 getservbyname F -+GLIBC_2.27 getservbyname_r F -+GLIBC_2.27 getservbyport F -+GLIBC_2.27 getservbyport_r F -+GLIBC_2.27 getservent F -+GLIBC_2.27 getservent_r F -+GLIBC_2.27 getsgent F -+GLIBC_2.27 getsgent_r F -+GLIBC_2.27 getsgnam F -+GLIBC_2.27 getsgnam_r F -+GLIBC_2.27 getsid F -+GLIBC_2.27 getsockname F -+GLIBC_2.27 getsockopt F -+GLIBC_2.27 getsourcefilter F -+GLIBC_2.27 getspent F -+GLIBC_2.27 getspent_r F -+GLIBC_2.27 getspnam F -+GLIBC_2.27 getspnam_r F -+GLIBC_2.27 getsubopt F -+GLIBC_2.27 gettext F -+GLIBC_2.27 gettimeofday F -+GLIBC_2.27 getttyent F -+GLIBC_2.27 getttynam F -+GLIBC_2.27 getuid F -+GLIBC_2.27 getusershell F -+GLIBC_2.27 getutent F -+GLIBC_2.27 getutent_r F -+GLIBC_2.27 getutid F -+GLIBC_2.27 getutid_r F -+GLIBC_2.27 getutline F -+GLIBC_2.27 getutline_r F -+GLIBC_2.27 getutmp F -+GLIBC_2.27 getutmpx F -+GLIBC_2.27 getutxent F -+GLIBC_2.27 getutxid F -+GLIBC_2.27 getutxline F -+GLIBC_2.27 getw F -+GLIBC_2.27 getwc F -+GLIBC_2.27 getwc_unlocked F -+GLIBC_2.27 getwchar F -+GLIBC_2.27 getwchar_unlocked F -+GLIBC_2.27 getwd F -+GLIBC_2.27 getxattr F -+GLIBC_2.27 glob F -+GLIBC_2.27 glob64 F -+GLIBC_2.27 glob_pattern_p F -+GLIBC_2.27 globfree F -+GLIBC_2.27 globfree64 F -+GLIBC_2.27 gmtime F -+GLIBC_2.27 gmtime_r F -+GLIBC_2.27 gnu_dev_major F -+GLIBC_2.27 gnu_dev_makedev F -+GLIBC_2.27 gnu_dev_minor F -+GLIBC_2.27 gnu_get_libc_release F -+GLIBC_2.27 gnu_get_libc_version F -+GLIBC_2.27 grantpt F -+GLIBC_2.27 group_member F -+GLIBC_2.27 gsignal F -+GLIBC_2.27 gtty F -+GLIBC_2.27 h_errlist D 0x28 -+GLIBC_2.27 h_nerr D 0x4 -+GLIBC_2.27 hasmntopt F -+GLIBC_2.27 hcreate F -+GLIBC_2.27 hcreate_r F -+GLIBC_2.27 hdestroy F -+GLIBC_2.27 hdestroy_r F -+GLIBC_2.27 herror F -+GLIBC_2.27 host2netname F -+GLIBC_2.27 hsearch F -+GLIBC_2.27 hsearch_r F -+GLIBC_2.27 hstrerror F -+GLIBC_2.27 htonl F -+GLIBC_2.27 htons F -+GLIBC_2.27 iconv F -+GLIBC_2.27 iconv_close F -+GLIBC_2.27 iconv_open F -+GLIBC_2.27 if_freenameindex F -+GLIBC_2.27 if_indextoname F -+GLIBC_2.27 if_nameindex F -+GLIBC_2.27 if_nametoindex F -+GLIBC_2.27 imaxabs F -+GLIBC_2.27 imaxdiv F -+GLIBC_2.27 in6addr_any D 0x10 -+GLIBC_2.27 in6addr_loopback D 0x10 -+GLIBC_2.27 index F -+GLIBC_2.27 inet6_opt_append F -+GLIBC_2.27 inet6_opt_find F -+GLIBC_2.27 inet6_opt_finish F -+GLIBC_2.27 inet6_opt_get_val F -+GLIBC_2.27 inet6_opt_init F -+GLIBC_2.27 inet6_opt_next F -+GLIBC_2.27 inet6_opt_set_val F -+GLIBC_2.27 inet6_option_alloc F -+GLIBC_2.27 inet6_option_append F -+GLIBC_2.27 
inet6_option_find F -+GLIBC_2.27 inet6_option_init F -+GLIBC_2.27 inet6_option_next F -+GLIBC_2.27 inet6_option_space F -+GLIBC_2.27 inet6_rth_add F -+GLIBC_2.27 inet6_rth_getaddr F -+GLIBC_2.27 inet6_rth_init F -+GLIBC_2.27 inet6_rth_reverse F -+GLIBC_2.27 inet6_rth_segments F -+GLIBC_2.27 inet6_rth_space F -+GLIBC_2.27 inet_addr F -+GLIBC_2.27 inet_aton F -+GLIBC_2.27 inet_lnaof F -+GLIBC_2.27 inet_makeaddr F -+GLIBC_2.27 inet_netof F -+GLIBC_2.27 inet_network F -+GLIBC_2.27 inet_nsap_addr F -+GLIBC_2.27 inet_nsap_ntoa F -+GLIBC_2.27 inet_ntoa F -+GLIBC_2.27 inet_ntop F -+GLIBC_2.27 inet_pton F -+GLIBC_2.27 init_module F -+GLIBC_2.27 initgroups F -+GLIBC_2.27 initstate F -+GLIBC_2.27 initstate_r F -+GLIBC_2.27 innetgr F -+GLIBC_2.27 inotify_add_watch F -+GLIBC_2.27 inotify_init F -+GLIBC_2.27 inotify_init1 F -+GLIBC_2.27 inotify_rm_watch F -+GLIBC_2.27 insque F -+GLIBC_2.27 ioctl F -+GLIBC_2.27 iruserok F -+GLIBC_2.27 iruserok_af F -+GLIBC_2.27 isalnum F -+GLIBC_2.27 isalnum_l F -+GLIBC_2.27 isalpha F -+GLIBC_2.27 isalpha_l F -+GLIBC_2.27 isascii F -+GLIBC_2.27 isastream F -+GLIBC_2.27 isatty F -+GLIBC_2.27 isblank F -+GLIBC_2.27 isblank_l F -+GLIBC_2.27 iscntrl F -+GLIBC_2.27 iscntrl_l F -+GLIBC_2.27 isctype F -+GLIBC_2.27 isdigit F -+GLIBC_2.27 isdigit_l F -+GLIBC_2.27 isfdtype F -+GLIBC_2.27 isgraph F -+GLIBC_2.27 isgraph_l F -+GLIBC_2.27 isinf F -+GLIBC_2.27 isinff F -+GLIBC_2.27 isinfl F -+GLIBC_2.27 islower F -+GLIBC_2.27 islower_l F -+GLIBC_2.27 isnan F -+GLIBC_2.27 isnanf F -+GLIBC_2.27 isnanl F -+GLIBC_2.27 isprint F -+GLIBC_2.27 isprint_l F -+GLIBC_2.27 ispunct F -+GLIBC_2.27 ispunct_l F -+GLIBC_2.27 isspace F -+GLIBC_2.27 isspace_l F -+GLIBC_2.27 isupper F -+GLIBC_2.27 isupper_l F -+GLIBC_2.27 iswalnum F -+GLIBC_2.27 iswalnum_l F -+GLIBC_2.27 iswalpha F -+GLIBC_2.27 iswalpha_l F -+GLIBC_2.27 iswblank F -+GLIBC_2.27 iswblank_l F -+GLIBC_2.27 iswcntrl F -+GLIBC_2.27 iswcntrl_l F -+GLIBC_2.27 iswctype F -+GLIBC_2.27 iswctype_l F -+GLIBC_2.27 iswdigit F -+GLIBC_2.27 iswdigit_l F -+GLIBC_2.27 iswgraph F -+GLIBC_2.27 iswgraph_l F -+GLIBC_2.27 iswlower F -+GLIBC_2.27 iswlower_l F -+GLIBC_2.27 iswprint F -+GLIBC_2.27 iswprint_l F -+GLIBC_2.27 iswpunct F -+GLIBC_2.27 iswpunct_l F -+GLIBC_2.27 iswspace F -+GLIBC_2.27 iswspace_l F -+GLIBC_2.27 iswupper F -+GLIBC_2.27 iswupper_l F -+GLIBC_2.27 iswxdigit F -+GLIBC_2.27 iswxdigit_l F -+GLIBC_2.27 isxdigit F -+GLIBC_2.27 isxdigit_l F -+GLIBC_2.27 jrand48 F -+GLIBC_2.27 jrand48_r F -+GLIBC_2.27 key_decryptsession F -+GLIBC_2.27 key_decryptsession_pk F -+GLIBC_2.27 key_encryptsession F -+GLIBC_2.27 key_encryptsession_pk F -+GLIBC_2.27 key_gendes F -+GLIBC_2.27 key_get_conv F -+GLIBC_2.27 key_secretkey_is_set F -+GLIBC_2.27 key_setnet F -+GLIBC_2.27 key_setsecret F -+GLIBC_2.27 kill F -+GLIBC_2.27 killpg F -+GLIBC_2.27 klogctl F -+GLIBC_2.27 l64a F -+GLIBC_2.27 labs F -+GLIBC_2.27 lchmod F -+GLIBC_2.27 lchown F -+GLIBC_2.27 lckpwdf F -+GLIBC_2.27 lcong48 F -+GLIBC_2.27 lcong48_r F -+GLIBC_2.27 ldexp F -+GLIBC_2.27 ldexpf F -+GLIBC_2.27 ldexpl F -+GLIBC_2.27 ldiv F -+GLIBC_2.27 lfind F -+GLIBC_2.27 lgetxattr F -+GLIBC_2.27 link F -+GLIBC_2.27 linkat F -+GLIBC_2.27 listen F -+GLIBC_2.27 listxattr F -+GLIBC_2.27 llabs F -+GLIBC_2.27 lldiv F -+GLIBC_2.27 llistxattr F -+GLIBC_2.27 llseek F -+GLIBC_2.27 localeconv F -+GLIBC_2.27 localtime F -+GLIBC_2.27 localtime_r F -+GLIBC_2.27 lockf F -+GLIBC_2.27 lockf64 F -+GLIBC_2.27 longjmp F -+GLIBC_2.27 lrand48 F -+GLIBC_2.27 lrand48_r F -+GLIBC_2.27 lremovexattr F -+GLIBC_2.27 lsearch F -+GLIBC_2.27 lseek F 
-+GLIBC_2.27 lseek64 F -+GLIBC_2.27 lsetxattr F -+GLIBC_2.27 lutimes F -+GLIBC_2.27 madvise F -+GLIBC_2.27 makecontext F -+GLIBC_2.27 mallinfo F -+GLIBC_2.27 malloc F -+GLIBC_2.27 malloc_info F -+GLIBC_2.27 malloc_stats F -+GLIBC_2.27 malloc_trim F -+GLIBC_2.27 malloc_usable_size F -+GLIBC_2.27 mallopt F -+GLIBC_2.27 mallwatch D 0x8 -+GLIBC_2.27 mblen F -+GLIBC_2.27 mbrlen F -+GLIBC_2.27 mbrtoc16 F -+GLIBC_2.27 mbrtoc32 F -+GLIBC_2.27 mbrtowc F -+GLIBC_2.27 mbsinit F -+GLIBC_2.27 mbsnrtowcs F -+GLIBC_2.27 mbsrtowcs F -+GLIBC_2.27 mbstowcs F -+GLIBC_2.27 mbtowc F -+GLIBC_2.27 mcheck F -+GLIBC_2.27 mcheck_check_all F -+GLIBC_2.27 mcheck_pedantic F -+GLIBC_2.27 memalign F -+GLIBC_2.27 memccpy F -+GLIBC_2.27 memchr F -+GLIBC_2.27 memcmp F -+GLIBC_2.27 memcpy F -+GLIBC_2.27 memfd_create F -+GLIBC_2.27 memfrob F -+GLIBC_2.27 memmem F -+GLIBC_2.27 memmove F -+GLIBC_2.27 mempcpy F -+GLIBC_2.27 memrchr F -+GLIBC_2.27 memset F -+GLIBC_2.27 mincore F -+GLIBC_2.27 mkdir F -+GLIBC_2.27 mkdirat F -+GLIBC_2.27 mkdtemp F -+GLIBC_2.27 mkfifo F -+GLIBC_2.27 mkfifoat F -+GLIBC_2.27 mkostemp F -+GLIBC_2.27 mkostemp64 F -+GLIBC_2.27 mkostemps F -+GLIBC_2.27 mkostemps64 F -+GLIBC_2.27 mkstemp F -+GLIBC_2.27 mkstemp64 F -+GLIBC_2.27 mkstemps F -+GLIBC_2.27 mkstemps64 F -+GLIBC_2.27 mktemp F -+GLIBC_2.27 mktime F -+GLIBC_2.27 mlock F -+GLIBC_2.27 mlock2 F -+GLIBC_2.27 mlockall F -+GLIBC_2.27 mmap F -+GLIBC_2.27 mmap64 F -+GLIBC_2.27 modf F -+GLIBC_2.27 modff F -+GLIBC_2.27 modfl F -+GLIBC_2.27 moncontrol F -+GLIBC_2.27 monstartup F -+GLIBC_2.27 mount F -+GLIBC_2.27 mprobe F -+GLIBC_2.27 mprotect F -+GLIBC_2.27 mrand48 F -+GLIBC_2.27 mrand48_r F -+GLIBC_2.27 mremap F -+GLIBC_2.27 msgctl F -+GLIBC_2.27 msgget F -+GLIBC_2.27 msgrcv F -+GLIBC_2.27 msgsnd F -+GLIBC_2.27 msync F -+GLIBC_2.27 mtrace F -+GLIBC_2.27 munlock F -+GLIBC_2.27 munlockall F -+GLIBC_2.27 munmap F -+GLIBC_2.27 muntrace F -+GLIBC_2.27 name_to_handle_at F -+GLIBC_2.27 nanosleep F -+GLIBC_2.27 netname2host F -+GLIBC_2.27 netname2user F -+GLIBC_2.27 newlocale F -+GLIBC_2.27 nfsservctl F -+GLIBC_2.27 nftw F -+GLIBC_2.27 nftw64 F -+GLIBC_2.27 ngettext F -+GLIBC_2.27 nice F -+GLIBC_2.27 nl_langinfo F -+GLIBC_2.27 nl_langinfo_l F -+GLIBC_2.27 nrand48 F -+GLIBC_2.27 nrand48_r F -+GLIBC_2.27 ntohl F -+GLIBC_2.27 ntohs F -+GLIBC_2.27 ntp_adjtime F -+GLIBC_2.27 ntp_gettime F -+GLIBC_2.27 ntp_gettimex F -+GLIBC_2.27 obstack_alloc_failed_handler D 0x8 -+GLIBC_2.27 obstack_exit_failure D 0x4 -+GLIBC_2.27 obstack_free F -+GLIBC_2.27 obstack_printf F -+GLIBC_2.27 obstack_vprintf F -+GLIBC_2.27 on_exit F -+GLIBC_2.27 open F -+GLIBC_2.27 open64 F -+GLIBC_2.27 open_by_handle_at F -+GLIBC_2.27 open_memstream F -+GLIBC_2.27 open_wmemstream F -+GLIBC_2.27 openat F -+GLIBC_2.27 openat64 F -+GLIBC_2.27 opendir F -+GLIBC_2.27 openlog F -+GLIBC_2.27 optarg D 0x8 -+GLIBC_2.27 opterr D 0x4 -+GLIBC_2.27 optind D 0x4 -+GLIBC_2.27 optopt D 0x4 -+GLIBC_2.27 parse_printf_format F -+GLIBC_2.27 passwd2des F -+GLIBC_2.27 pathconf F -+GLIBC_2.27 pause F -+GLIBC_2.27 pclose F -+GLIBC_2.27 perror F -+GLIBC_2.27 personality F -+GLIBC_2.27 pipe F -+GLIBC_2.27 pipe2 F -+GLIBC_2.27 pivot_root F -+GLIBC_2.27 pkey_alloc F -+GLIBC_2.27 pkey_free F -+GLIBC_2.27 pkey_get F -+GLIBC_2.27 pkey_mprotect F -+GLIBC_2.27 pkey_set F -+GLIBC_2.27 pmap_getmaps F -+GLIBC_2.27 pmap_getport F -+GLIBC_2.27 pmap_rmtcall F -+GLIBC_2.27 pmap_set F -+GLIBC_2.27 pmap_unset F -+GLIBC_2.27 poll F -+GLIBC_2.27 popen F -+GLIBC_2.27 posix_fadvise F -+GLIBC_2.27 posix_fadvise64 F -+GLIBC_2.27 posix_fallocate F 
-+GLIBC_2.27 posix_fallocate64 F -+GLIBC_2.27 posix_madvise F -+GLIBC_2.27 posix_memalign F -+GLIBC_2.27 posix_openpt F -+GLIBC_2.27 posix_spawn F -+GLIBC_2.27 posix_spawn_file_actions_addclose F -+GLIBC_2.27 posix_spawn_file_actions_adddup2 F -+GLIBC_2.27 posix_spawn_file_actions_addopen F -+GLIBC_2.27 posix_spawn_file_actions_destroy F -+GLIBC_2.27 posix_spawn_file_actions_init F -+GLIBC_2.27 posix_spawnattr_destroy F -+GLIBC_2.27 posix_spawnattr_getflags F -+GLIBC_2.27 posix_spawnattr_getpgroup F -+GLIBC_2.27 posix_spawnattr_getschedparam F -+GLIBC_2.27 posix_spawnattr_getschedpolicy F -+GLIBC_2.27 posix_spawnattr_getsigdefault F -+GLIBC_2.27 posix_spawnattr_getsigmask F -+GLIBC_2.27 posix_spawnattr_init F -+GLIBC_2.27 posix_spawnattr_setflags F -+GLIBC_2.27 posix_spawnattr_setpgroup F -+GLIBC_2.27 posix_spawnattr_setschedparam F -+GLIBC_2.27 posix_spawnattr_setschedpolicy F -+GLIBC_2.27 posix_spawnattr_setsigdefault F -+GLIBC_2.27 posix_spawnattr_setsigmask F -+GLIBC_2.27 posix_spawnp F -+GLIBC_2.27 ppoll F -+GLIBC_2.27 prctl F -+GLIBC_2.27 pread F -+GLIBC_2.27 pread64 F -+GLIBC_2.27 preadv F -+GLIBC_2.27 preadv2 F -+GLIBC_2.27 preadv64 F -+GLIBC_2.27 preadv64v2 F -+GLIBC_2.27 printf F -+GLIBC_2.27 printf_size F -+GLIBC_2.27 printf_size_info F -+GLIBC_2.27 prlimit F -+GLIBC_2.27 prlimit64 F -+GLIBC_2.27 process_vm_readv F -+GLIBC_2.27 process_vm_writev F -+GLIBC_2.27 profil F -+GLIBC_2.27 program_invocation_name D 0x8 -+GLIBC_2.27 program_invocation_short_name D 0x8 -+GLIBC_2.27 pselect F -+GLIBC_2.27 psiginfo F -+GLIBC_2.27 psignal F -+GLIBC_2.27 pthread_attr_destroy F -+GLIBC_2.27 pthread_attr_getdetachstate F -+GLIBC_2.27 pthread_attr_getinheritsched F -+GLIBC_2.27 pthread_attr_getschedparam F -+GLIBC_2.27 pthread_attr_getschedpolicy F -+GLIBC_2.27 pthread_attr_getscope F -+GLIBC_2.27 pthread_attr_init F -+GLIBC_2.27 pthread_attr_setdetachstate F -+GLIBC_2.27 pthread_attr_setinheritsched F -+GLIBC_2.27 pthread_attr_setschedparam F -+GLIBC_2.27 pthread_attr_setschedpolicy F -+GLIBC_2.27 pthread_attr_setscope F -+GLIBC_2.27 pthread_cond_broadcast F -+GLIBC_2.27 pthread_cond_destroy F -+GLIBC_2.27 pthread_cond_init F -+GLIBC_2.27 pthread_cond_signal F -+GLIBC_2.27 pthread_cond_timedwait F -+GLIBC_2.27 pthread_cond_wait F -+GLIBC_2.27 pthread_condattr_destroy F -+GLIBC_2.27 pthread_condattr_init F -+GLIBC_2.27 pthread_equal F -+GLIBC_2.27 pthread_exit F -+GLIBC_2.27 pthread_getschedparam F -+GLIBC_2.27 pthread_mutex_destroy F -+GLIBC_2.27 pthread_mutex_init F -+GLIBC_2.27 pthread_mutex_lock F -+GLIBC_2.27 pthread_mutex_unlock F -+GLIBC_2.27 pthread_self F -+GLIBC_2.27 pthread_setcancelstate F -+GLIBC_2.27 pthread_setcanceltype F -+GLIBC_2.27 pthread_setschedparam F -+GLIBC_2.27 ptrace F -+GLIBC_2.27 ptsname F -+GLIBC_2.27 ptsname_r F -+GLIBC_2.27 putc F -+GLIBC_2.27 putc_unlocked F -+GLIBC_2.27 putchar F -+GLIBC_2.27 putchar_unlocked F -+GLIBC_2.27 putenv F -+GLIBC_2.27 putgrent F -+GLIBC_2.27 putmsg F -+GLIBC_2.27 putpmsg F -+GLIBC_2.27 putpwent F -+GLIBC_2.27 puts F -+GLIBC_2.27 putsgent F -+GLIBC_2.27 putspent F -+GLIBC_2.27 pututline F -+GLIBC_2.27 pututxline F -+GLIBC_2.27 putw F -+GLIBC_2.27 putwc F -+GLIBC_2.27 putwc_unlocked F -+GLIBC_2.27 putwchar F -+GLIBC_2.27 putwchar_unlocked F -+GLIBC_2.27 pvalloc F -+GLIBC_2.27 pwrite F -+GLIBC_2.27 pwrite64 F -+GLIBC_2.27 pwritev F -+GLIBC_2.27 pwritev2 F -+GLIBC_2.27 pwritev64 F -+GLIBC_2.27 pwritev64v2 F -+GLIBC_2.27 qecvt F -+GLIBC_2.27 qecvt_r F -+GLIBC_2.27 qfcvt F -+GLIBC_2.27 qfcvt_r F -+GLIBC_2.27 qgcvt F -+GLIBC_2.27 qsort F 
-+GLIBC_2.27 qsort_r F -+GLIBC_2.27 quick_exit F -+GLIBC_2.27 quotactl F -+GLIBC_2.27 raise F -+GLIBC_2.27 rand F -+GLIBC_2.27 rand_r F -+GLIBC_2.27 random F -+GLIBC_2.27 random_r F -+GLIBC_2.27 rawmemchr F -+GLIBC_2.27 rcmd F -+GLIBC_2.27 rcmd_af F -+GLIBC_2.27 re_comp F -+GLIBC_2.27 re_compile_fastmap F -+GLIBC_2.27 re_compile_pattern F -+GLIBC_2.27 re_exec F -+GLIBC_2.27 re_match F -+GLIBC_2.27 re_match_2 F -+GLIBC_2.27 re_search F -+GLIBC_2.27 re_search_2 F -+GLIBC_2.27 re_set_registers F -+GLIBC_2.27 re_set_syntax F -+GLIBC_2.27 re_syntax_options D 0x8 -+GLIBC_2.27 read F -+GLIBC_2.27 readahead F -+GLIBC_2.27 readdir F -+GLIBC_2.27 readdir64 F -+GLIBC_2.27 readdir64_r F -+GLIBC_2.27 readdir_r F -+GLIBC_2.27 readlink F -+GLIBC_2.27 readlinkat F -+GLIBC_2.27 readv F -+GLIBC_2.27 realloc F -+GLIBC_2.27 reallocarray F -+GLIBC_2.27 realpath F -+GLIBC_2.27 reboot F -+GLIBC_2.27 recv F -+GLIBC_2.27 recvfrom F -+GLIBC_2.27 recvmmsg F -+GLIBC_2.27 recvmsg F -+GLIBC_2.27 regcomp F -+GLIBC_2.27 regerror F -+GLIBC_2.27 regexec F -+GLIBC_2.27 regfree F -+GLIBC_2.27 register_printf_function F -+GLIBC_2.27 register_printf_modifier F -+GLIBC_2.27 register_printf_specifier F -+GLIBC_2.27 register_printf_type F -+GLIBC_2.27 registerrpc F -+GLIBC_2.27 remap_file_pages F -+GLIBC_2.27 remove F -+GLIBC_2.27 removexattr F -+GLIBC_2.27 remque F -+GLIBC_2.27 rename F -+GLIBC_2.27 renameat F -+GLIBC_2.27 revoke F -+GLIBC_2.27 rewind F -+GLIBC_2.27 rewinddir F -+GLIBC_2.27 rexec F -+GLIBC_2.27 rexec_af F -+GLIBC_2.27 rexecoptions D 0x4 -+GLIBC_2.27 rindex F -+GLIBC_2.27 rmdir F -+GLIBC_2.27 rpc_createerr D 0x20 -+GLIBC_2.27 rpmatch F -+GLIBC_2.27 rresvport F -+GLIBC_2.27 rresvport_af F -+GLIBC_2.27 rtime F -+GLIBC_2.27 ruserok F -+GLIBC_2.27 ruserok_af F -+GLIBC_2.27 ruserpass F -+GLIBC_2.27 sbrk F -+GLIBC_2.27 scalbn F -+GLIBC_2.27 scalbnf F -+GLIBC_2.27 scalbnl F -+GLIBC_2.27 scandir F -+GLIBC_2.27 scandir64 F -+GLIBC_2.27 scandirat F -+GLIBC_2.27 scandirat64 F -+GLIBC_2.27 scanf F -+GLIBC_2.27 sched_get_priority_max F -+GLIBC_2.27 sched_get_priority_min F -+GLIBC_2.27 sched_getaffinity F -+GLIBC_2.27 sched_getcpu F -+GLIBC_2.27 sched_getparam F -+GLIBC_2.27 sched_getscheduler F -+GLIBC_2.27 sched_rr_get_interval F -+GLIBC_2.27 sched_setaffinity F -+GLIBC_2.27 sched_setparam F -+GLIBC_2.27 sched_setscheduler F -+GLIBC_2.27 sched_yield F -+GLIBC_2.27 secure_getenv F -+GLIBC_2.27 seed48 F -+GLIBC_2.27 seed48_r F -+GLIBC_2.27 seekdir F -+GLIBC_2.27 select F -+GLIBC_2.27 semctl F -+GLIBC_2.27 semget F -+GLIBC_2.27 semop F -+GLIBC_2.27 semtimedop F -+GLIBC_2.27 send F -+GLIBC_2.27 sendfile F -+GLIBC_2.27 sendfile64 F -+GLIBC_2.27 sendmmsg F -+GLIBC_2.27 sendmsg F -+GLIBC_2.27 sendto F -+GLIBC_2.27 setaliasent F -+GLIBC_2.27 setbuf F -+GLIBC_2.27 setbuffer F -+GLIBC_2.27 setcontext F -+GLIBC_2.27 setdomainname F -+GLIBC_2.27 setegid F -+GLIBC_2.27 setenv F -+GLIBC_2.27 seteuid F -+GLIBC_2.27 setfsent F -+GLIBC_2.27 setfsgid F -+GLIBC_2.27 setfsuid F -+GLIBC_2.27 setgid F -+GLIBC_2.27 setgrent F -+GLIBC_2.27 setgroups F -+GLIBC_2.27 sethostent F -+GLIBC_2.27 sethostid F -+GLIBC_2.27 sethostname F -+GLIBC_2.27 setipv4sourcefilter F -+GLIBC_2.27 setitimer F -+GLIBC_2.27 setjmp F -+GLIBC_2.27 setlinebuf F -+GLIBC_2.27 setlocale F -+GLIBC_2.27 setlogin F -+GLIBC_2.27 setlogmask F -+GLIBC_2.27 setmntent F -+GLIBC_2.27 setnetent F -+GLIBC_2.27 setnetgrent F -+GLIBC_2.27 setns F -+GLIBC_2.27 setpgid F -+GLIBC_2.27 setpgrp F -+GLIBC_2.27 setpriority F -+GLIBC_2.27 setprotoent F -+GLIBC_2.27 setpwent F -+GLIBC_2.27 setregid 
F -+GLIBC_2.27 setresgid F -+GLIBC_2.27 setresuid F -+GLIBC_2.27 setreuid F -+GLIBC_2.27 setrlimit F -+GLIBC_2.27 setrlimit64 F -+GLIBC_2.27 setrpcent F -+GLIBC_2.27 setservent F -+GLIBC_2.27 setsgent F -+GLIBC_2.27 setsid F -+GLIBC_2.27 setsockopt F -+GLIBC_2.27 setsourcefilter F -+GLIBC_2.27 setspent F -+GLIBC_2.27 setstate F -+GLIBC_2.27 setstate_r F -+GLIBC_2.27 settimeofday F -+GLIBC_2.27 setttyent F -+GLIBC_2.27 setuid F -+GLIBC_2.27 setusershell F -+GLIBC_2.27 setutent F -+GLIBC_2.27 setutxent F -+GLIBC_2.27 setvbuf F -+GLIBC_2.27 setxattr F -+GLIBC_2.27 sgetsgent F -+GLIBC_2.27 sgetsgent_r F -+GLIBC_2.27 sgetspent F -+GLIBC_2.27 sgetspent_r F -+GLIBC_2.27 shmat F -+GLIBC_2.27 shmctl F -+GLIBC_2.27 shmdt F -+GLIBC_2.27 shmget F -+GLIBC_2.27 shutdown F -+GLIBC_2.27 sigaction F -+GLIBC_2.27 sigaddset F -+GLIBC_2.27 sigaltstack F -+GLIBC_2.27 sigandset F -+GLIBC_2.27 sigblock F -+GLIBC_2.27 sigdelset F -+GLIBC_2.27 sigemptyset F -+GLIBC_2.27 sigfillset F -+GLIBC_2.27 siggetmask F -+GLIBC_2.27 sighold F -+GLIBC_2.27 sigignore F -+GLIBC_2.27 siginterrupt F -+GLIBC_2.27 sigisemptyset F -+GLIBC_2.27 sigismember F -+GLIBC_2.27 siglongjmp F -+GLIBC_2.27 signal F -+GLIBC_2.27 signalfd F -+GLIBC_2.27 sigorset F -+GLIBC_2.27 sigpause F -+GLIBC_2.27 sigpending F -+GLIBC_2.27 sigprocmask F -+GLIBC_2.27 sigqueue F -+GLIBC_2.27 sigrelse F -+GLIBC_2.27 sigreturn F -+GLIBC_2.27 sigset F -+GLIBC_2.27 sigsetmask F -+GLIBC_2.27 sigstack F -+GLIBC_2.27 sigsuspend F -+GLIBC_2.27 sigtimedwait F -+GLIBC_2.27 sigwait F -+GLIBC_2.27 sigwaitinfo F -+GLIBC_2.27 sleep F -+GLIBC_2.27 snprintf F -+GLIBC_2.27 sockatmark F -+GLIBC_2.27 socket F -+GLIBC_2.27 socketpair F -+GLIBC_2.27 splice F -+GLIBC_2.27 sprintf F -+GLIBC_2.27 sprofil F -+GLIBC_2.27 srand F -+GLIBC_2.27 srand48 F -+GLIBC_2.27 srand48_r F -+GLIBC_2.27 srandom F -+GLIBC_2.27 srandom_r F -+GLIBC_2.27 sscanf F -+GLIBC_2.27 ssignal F -+GLIBC_2.27 sstk F -+GLIBC_2.27 statfs F -+GLIBC_2.27 statfs64 F -+GLIBC_2.27 statvfs F -+GLIBC_2.27 statvfs64 F -+GLIBC_2.27 stderr D 0x8 -+GLIBC_2.27 stdin D 0x8 -+GLIBC_2.27 stdout D 0x8 -+GLIBC_2.27 stime F -+GLIBC_2.27 stpcpy F -+GLIBC_2.27 stpncpy F -+GLIBC_2.27 strcasecmp F -+GLIBC_2.27 strcasecmp_l F -+GLIBC_2.27 strcasestr F -+GLIBC_2.27 strcat F -+GLIBC_2.27 strchr F -+GLIBC_2.27 strchrnul F -+GLIBC_2.27 strcmp F -+GLIBC_2.27 strcoll F -+GLIBC_2.27 strcoll_l F -+GLIBC_2.27 strcpy F -+GLIBC_2.27 strcspn F -+GLIBC_2.27 strdup F -+GLIBC_2.27 strerror F -+GLIBC_2.27 strerror_l F -+GLIBC_2.27 strerror_r F -+GLIBC_2.27 strfmon F -+GLIBC_2.27 strfmon_l F -+GLIBC_2.27 strfromd F -+GLIBC_2.27 strfromf F -+GLIBC_2.27 strfromf128 F -+GLIBC_2.27 strfromf32 F -+GLIBC_2.27 strfromf32x F -+GLIBC_2.27 strfromf64 F -+GLIBC_2.27 strfromf64x F -+GLIBC_2.27 strfroml F -+GLIBC_2.27 strfry F -+GLIBC_2.27 strftime F -+GLIBC_2.27 strftime_l F -+GLIBC_2.27 strlen F -+GLIBC_2.27 strncasecmp F -+GLIBC_2.27 strncasecmp_l F -+GLIBC_2.27 strncat F -+GLIBC_2.27 strncmp F -+GLIBC_2.27 strncpy F -+GLIBC_2.27 strndup F -+GLIBC_2.27 strnlen F -+GLIBC_2.27 strpbrk F -+GLIBC_2.27 strptime F -+GLIBC_2.27 strptime_l F -+GLIBC_2.27 strrchr F -+GLIBC_2.27 strsep F -+GLIBC_2.27 strsignal F -+GLIBC_2.27 strspn F -+GLIBC_2.27 strstr F -+GLIBC_2.27 strtod F -+GLIBC_2.27 strtod_l F -+GLIBC_2.27 strtof F -+GLIBC_2.27 strtof128 F -+GLIBC_2.27 strtof128_l F -+GLIBC_2.27 strtof32 F -+GLIBC_2.27 strtof32_l F -+GLIBC_2.27 strtof32x F -+GLIBC_2.27 strtof32x_l F -+GLIBC_2.27 strtof64 F -+GLIBC_2.27 strtof64_l F -+GLIBC_2.27 strtof64x F -+GLIBC_2.27 strtof64x_l F 
-+GLIBC_2.27 strtof_l F -+GLIBC_2.27 strtoimax F -+GLIBC_2.27 strtok F -+GLIBC_2.27 strtok_r F -+GLIBC_2.27 strtol F -+GLIBC_2.27 strtol_l F -+GLIBC_2.27 strtold F -+GLIBC_2.27 strtold_l F -+GLIBC_2.27 strtoll F -+GLIBC_2.27 strtoll_l F -+GLIBC_2.27 strtoq F -+GLIBC_2.27 strtoul F -+GLIBC_2.27 strtoul_l F -+GLIBC_2.27 strtoull F -+GLIBC_2.27 strtoull_l F -+GLIBC_2.27 strtoumax F -+GLIBC_2.27 strtouq F -+GLIBC_2.27 strverscmp F -+GLIBC_2.27 strxfrm F -+GLIBC_2.27 strxfrm_l F -+GLIBC_2.27 stty F -+GLIBC_2.27 svc_exit F -+GLIBC_2.27 svc_fdset D 0x80 -+GLIBC_2.27 svc_getreq F -+GLIBC_2.27 svc_getreq_common F -+GLIBC_2.27 svc_getreq_poll F -+GLIBC_2.27 svc_getreqset F -+GLIBC_2.27 svc_max_pollfd D 0x4 -+GLIBC_2.27 svc_pollfd D 0x8 -+GLIBC_2.27 svc_register F -+GLIBC_2.27 svc_run F -+GLIBC_2.27 svc_sendreply F -+GLIBC_2.27 svc_unregister F -+GLIBC_2.27 svcauthdes_stats D 0x18 -+GLIBC_2.27 svcerr_auth F -+GLIBC_2.27 svcerr_decode F -+GLIBC_2.27 svcerr_noproc F -+GLIBC_2.27 svcerr_noprog F -+GLIBC_2.27 svcerr_progvers F -+GLIBC_2.27 svcerr_systemerr F -+GLIBC_2.27 svcerr_weakauth F -+GLIBC_2.27 svcfd_create F -+GLIBC_2.27 svcraw_create F -+GLIBC_2.27 svctcp_create F -+GLIBC_2.27 svcudp_bufcreate F -+GLIBC_2.27 svcudp_create F -+GLIBC_2.27 svcudp_enablecache F -+GLIBC_2.27 svcunix_create F -+GLIBC_2.27 svcunixfd_create F -+GLIBC_2.27 swab F -+GLIBC_2.27 swapcontext F -+GLIBC_2.27 swapoff F -+GLIBC_2.27 swapon F -+GLIBC_2.27 swprintf F -+GLIBC_2.27 swscanf F -+GLIBC_2.27 symlink F -+GLIBC_2.27 symlinkat F -+GLIBC_2.27 sync F -+GLIBC_2.27 sync_file_range F -+GLIBC_2.27 syncfs F -+GLIBC_2.27 sys_errlist D 0x2370 -+GLIBC_2.27 sys_nerr D 0x4 -+GLIBC_2.27 sys_sigabbrev D 0x400 -+GLIBC_2.27 sys_siglist D 0x400 -+GLIBC_2.27 syscall F -+GLIBC_2.27 sysconf F -+GLIBC_2.27 sysctl F -+GLIBC_2.27 sysinfo F -+GLIBC_2.27 syslog F -+GLIBC_2.27 system F -+GLIBC_2.27 sysv_signal F -+GLIBC_2.27 tcdrain F -+GLIBC_2.27 tcflow F -+GLIBC_2.27 tcflush F -+GLIBC_2.27 tcgetattr F -+GLIBC_2.27 tcgetpgrp F -+GLIBC_2.27 tcgetsid F -+GLIBC_2.27 tcsendbreak F -+GLIBC_2.27 tcsetattr F -+GLIBC_2.27 tcsetpgrp F -+GLIBC_2.27 tdelete F -+GLIBC_2.27 tdestroy F -+GLIBC_2.27 tee F -+GLIBC_2.27 telldir F -+GLIBC_2.27 tempnam F -+GLIBC_2.27 textdomain F -+GLIBC_2.27 tfind F -+GLIBC_2.27 time F -+GLIBC_2.27 timegm F -+GLIBC_2.27 timelocal F -+GLIBC_2.27 timerfd_create F -+GLIBC_2.27 timerfd_gettime F -+GLIBC_2.27 timerfd_settime F -+GLIBC_2.27 times F -+GLIBC_2.27 timespec_get F -+GLIBC_2.27 timezone D 0x8 -+GLIBC_2.27 tmpfile F -+GLIBC_2.27 tmpfile64 F -+GLIBC_2.27 tmpnam F -+GLIBC_2.27 tmpnam_r F -+GLIBC_2.27 toascii F -+GLIBC_2.27 tolower F -+GLIBC_2.27 tolower_l F -+GLIBC_2.27 toupper F -+GLIBC_2.27 toupper_l F -+GLIBC_2.27 towctrans F -+GLIBC_2.27 towctrans_l F -+GLIBC_2.27 towlower F -+GLIBC_2.27 towlower_l F -+GLIBC_2.27 towupper F -+GLIBC_2.27 towupper_l F -+GLIBC_2.27 tr_break F -+GLIBC_2.27 truncate F -+GLIBC_2.27 truncate64 F -+GLIBC_2.27 tsearch F -+GLIBC_2.27 ttyname F -+GLIBC_2.27 ttyname_r F -+GLIBC_2.27 ttyslot F -+GLIBC_2.27 twalk F -+GLIBC_2.27 tzname D 0x10 -+GLIBC_2.27 tzset F -+GLIBC_2.27 ualarm F -+GLIBC_2.27 ulckpwdf F -+GLIBC_2.27 ulimit F -+GLIBC_2.27 umask F -+GLIBC_2.27 umount F -+GLIBC_2.27 umount2 F -+GLIBC_2.27 uname F -+GLIBC_2.27 ungetc F -+GLIBC_2.27 ungetwc F -+GLIBC_2.27 unlink F -+GLIBC_2.27 unlinkat F -+GLIBC_2.27 unlockpt F -+GLIBC_2.27 unsetenv F -+GLIBC_2.27 unshare F -+GLIBC_2.27 updwtmp F -+GLIBC_2.27 updwtmpx F -+GLIBC_2.27 uselocale F -+GLIBC_2.27 user2netname F -+GLIBC_2.27 usleep F -+GLIBC_2.27 
ustat F -+GLIBC_2.27 utime F -+GLIBC_2.27 utimensat F -+GLIBC_2.27 utimes F -+GLIBC_2.27 utmpname F -+GLIBC_2.27 utmpxname F -+GLIBC_2.27 valloc F -+GLIBC_2.27 vasprintf F -+GLIBC_2.27 vdprintf F -+GLIBC_2.27 verr F -+GLIBC_2.27 verrx F -+GLIBC_2.27 versionsort F -+GLIBC_2.27 versionsort64 F -+GLIBC_2.27 vfork F -+GLIBC_2.27 vfprintf F -+GLIBC_2.27 vfscanf F -+GLIBC_2.27 vfwprintf F -+GLIBC_2.27 vfwscanf F -+GLIBC_2.27 vhangup F -+GLIBC_2.27 vlimit F -+GLIBC_2.27 vmsplice F -+GLIBC_2.27 vprintf F -+GLIBC_2.27 vscanf F -+GLIBC_2.27 vsnprintf F -+GLIBC_2.27 vsprintf F -+GLIBC_2.27 vsscanf F -+GLIBC_2.27 vswprintf F -+GLIBC_2.27 vswscanf F -+GLIBC_2.27 vsyslog F -+GLIBC_2.27 vtimes F -+GLIBC_2.27 vwarn F -+GLIBC_2.27 vwarnx F -+GLIBC_2.27 vwprintf F -+GLIBC_2.27 vwscanf F -+GLIBC_2.27 wait F -+GLIBC_2.27 wait3 F -+GLIBC_2.27 wait4 F -+GLIBC_2.27 waitid F -+GLIBC_2.27 waitpid F -+GLIBC_2.27 warn F -+GLIBC_2.27 warnx F -+GLIBC_2.27 wcpcpy F -+GLIBC_2.27 wcpncpy F -+GLIBC_2.27 wcrtomb F -+GLIBC_2.27 wcscasecmp F -+GLIBC_2.27 wcscasecmp_l F -+GLIBC_2.27 wcscat F -+GLIBC_2.27 wcschr F -+GLIBC_2.27 wcschrnul F -+GLIBC_2.27 wcscmp F -+GLIBC_2.27 wcscoll F -+GLIBC_2.27 wcscoll_l F -+GLIBC_2.27 wcscpy F -+GLIBC_2.27 wcscspn F -+GLIBC_2.27 wcsdup F -+GLIBC_2.27 wcsftime F -+GLIBC_2.27 wcsftime_l F -+GLIBC_2.27 wcslen F -+GLIBC_2.27 wcsncasecmp F -+GLIBC_2.27 wcsncasecmp_l F -+GLIBC_2.27 wcsncat F -+GLIBC_2.27 wcsncmp F -+GLIBC_2.27 wcsncpy F -+GLIBC_2.27 wcsnlen F -+GLIBC_2.27 wcsnrtombs F -+GLIBC_2.27 wcspbrk F -+GLIBC_2.27 wcsrchr F -+GLIBC_2.27 wcsrtombs F -+GLIBC_2.27 wcsspn F -+GLIBC_2.27 wcsstr F -+GLIBC_2.27 wcstod F -+GLIBC_2.27 wcstod_l F -+GLIBC_2.27 wcstof F -+GLIBC_2.27 wcstof128 F -+GLIBC_2.27 wcstof128_l F -+GLIBC_2.27 wcstof32 F -+GLIBC_2.27 wcstof32_l F -+GLIBC_2.27 wcstof32x F -+GLIBC_2.27 wcstof32x_l F -+GLIBC_2.27 wcstof64 F -+GLIBC_2.27 wcstof64_l F -+GLIBC_2.27 wcstof64x F -+GLIBC_2.27 wcstof64x_l F -+GLIBC_2.27 wcstof_l F -+GLIBC_2.27 wcstoimax F -+GLIBC_2.27 wcstok F -+GLIBC_2.27 wcstol F -+GLIBC_2.27 wcstol_l F -+GLIBC_2.27 wcstold F -+GLIBC_2.27 wcstold_l F -+GLIBC_2.27 wcstoll F -+GLIBC_2.27 wcstoll_l F -+GLIBC_2.27 wcstombs F -+GLIBC_2.27 wcstoq F -+GLIBC_2.27 wcstoul F -+GLIBC_2.27 wcstoul_l F -+GLIBC_2.27 wcstoull F -+GLIBC_2.27 wcstoull_l F -+GLIBC_2.27 wcstoumax F -+GLIBC_2.27 wcstouq F -+GLIBC_2.27 wcswcs F -+GLIBC_2.27 wcswidth F -+GLIBC_2.27 wcsxfrm F -+GLIBC_2.27 wcsxfrm_l F -+GLIBC_2.27 wctob F -+GLIBC_2.27 wctomb F -+GLIBC_2.27 wctrans F -+GLIBC_2.27 wctrans_l F -+GLIBC_2.27 wctype F -+GLIBC_2.27 wctype_l F -+GLIBC_2.27 wcwidth F -+GLIBC_2.27 wmemchr F -+GLIBC_2.27 wmemcmp F -+GLIBC_2.27 wmemcpy F -+GLIBC_2.27 wmemmove F -+GLIBC_2.27 wmempcpy F -+GLIBC_2.27 wmemset F -+GLIBC_2.27 wordexp F -+GLIBC_2.27 wordfree F -+GLIBC_2.27 wprintf F -+GLIBC_2.27 write F -+GLIBC_2.27 writev F -+GLIBC_2.27 wscanf F -+GLIBC_2.27 xdecrypt F -+GLIBC_2.27 xdr_accepted_reply F -+GLIBC_2.27 xdr_array F -+GLIBC_2.27 xdr_authdes_cred F -+GLIBC_2.27 xdr_authdes_verf F -+GLIBC_2.27 xdr_authunix_parms F -+GLIBC_2.27 xdr_bool F -+GLIBC_2.27 xdr_bytes F -+GLIBC_2.27 xdr_callhdr F -+GLIBC_2.27 xdr_callmsg F -+GLIBC_2.27 xdr_char F -+GLIBC_2.27 xdr_cryptkeyarg F -+GLIBC_2.27 xdr_cryptkeyarg2 F -+GLIBC_2.27 xdr_cryptkeyres F -+GLIBC_2.27 xdr_des_block F -+GLIBC_2.27 xdr_double F -+GLIBC_2.27 xdr_enum F -+GLIBC_2.27 xdr_float F -+GLIBC_2.27 xdr_free F -+GLIBC_2.27 xdr_getcredres F -+GLIBC_2.27 xdr_hyper F -+GLIBC_2.27 xdr_int F -+GLIBC_2.27 xdr_int16_t F -+GLIBC_2.27 xdr_int32_t F -+GLIBC_2.27 
xdr_int64_t F -+GLIBC_2.27 xdr_int8_t F -+GLIBC_2.27 xdr_key_netstarg F -+GLIBC_2.27 xdr_key_netstres F -+GLIBC_2.27 xdr_keybuf F -+GLIBC_2.27 xdr_keystatus F -+GLIBC_2.27 xdr_long F -+GLIBC_2.27 xdr_longlong_t F -+GLIBC_2.27 xdr_netnamestr F -+GLIBC_2.27 xdr_netobj F -+GLIBC_2.27 xdr_opaque F -+GLIBC_2.27 xdr_opaque_auth F -+GLIBC_2.27 xdr_pmap F -+GLIBC_2.27 xdr_pmaplist F -+GLIBC_2.27 xdr_pointer F -+GLIBC_2.27 xdr_quad_t F -+GLIBC_2.27 xdr_reference F -+GLIBC_2.27 xdr_rejected_reply F -+GLIBC_2.27 xdr_replymsg F -+GLIBC_2.27 xdr_rmtcall_args F -+GLIBC_2.27 xdr_rmtcallres F -+GLIBC_2.27 xdr_short F -+GLIBC_2.27 xdr_sizeof F -+GLIBC_2.27 xdr_string F -+GLIBC_2.27 xdr_u_char F -+GLIBC_2.27 xdr_u_hyper F -+GLIBC_2.27 xdr_u_int F -+GLIBC_2.27 xdr_u_long F -+GLIBC_2.27 xdr_u_longlong_t F -+GLIBC_2.27 xdr_u_quad_t F -+GLIBC_2.27 xdr_u_short F -+GLIBC_2.27 xdr_uint16_t F -+GLIBC_2.27 xdr_uint32_t F -+GLIBC_2.27 xdr_uint64_t F -+GLIBC_2.27 xdr_uint8_t F -+GLIBC_2.27 xdr_union F -+GLIBC_2.27 xdr_unixcred F -+GLIBC_2.27 xdr_vector F -+GLIBC_2.27 xdr_void F -+GLIBC_2.27 xdr_wrapstring F -+GLIBC_2.27 xdrmem_create F -+GLIBC_2.27 xdrrec_create F -+GLIBC_2.27 xdrrec_endofrecord F -+GLIBC_2.27 xdrrec_eof F -+GLIBC_2.27 xdrrec_skiprecord F -+GLIBC_2.27 xdrstdio_create F -+GLIBC_2.27 xencrypt F -+GLIBC_2.27 xprt_register F -+GLIBC_2.27 xprt_unregister F -+GLIBC_2.28 fcntl64 F -+GLIBC_2.28 renameat2 F -+GLIBC_2.28 statx F -+GLIBC_2.28 thrd_current F -+GLIBC_2.28 thrd_equal F -+GLIBC_2.28 thrd_sleep F -+GLIBC_2.28 thrd_yield F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libcrypt.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libcrypt.abilist -new file mode 100644 -index 00000000..9484dca7 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libcrypt.abilist -@@ -0,0 +1,7 @@ -+GLIBC_2.27 crypt F -+GLIBC_2.27 crypt_r F -+GLIBC_2.27 encrypt F -+GLIBC_2.27 encrypt_r F -+GLIBC_2.27 fcrypt F -+GLIBC_2.27 setkey F -+GLIBC_2.27 setkey_r F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libdl.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libdl.abilist -new file mode 100644 -index 00000000..16adcae5 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libdl.abilist -@@ -0,0 +1,9 @@ -+GLIBC_2.27 dladdr F -+GLIBC_2.27 dladdr1 F -+GLIBC_2.27 dlclose F -+GLIBC_2.27 dlerror F -+GLIBC_2.27 dlinfo F -+GLIBC_2.27 dlmopen F -+GLIBC_2.27 dlopen F -+GLIBC_2.27 dlsym F -+GLIBC_2.27 dlvsym F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libm.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libm.abilist -new file mode 100644 -index 00000000..361fce20 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libm.abilist -@@ -0,0 +1,1021 @@ -+GLIBC_2.27 __acos_finite F -+GLIBC_2.27 __acosf_finite F -+GLIBC_2.27 __acosh_finite F -+GLIBC_2.27 __acoshf_finite F -+GLIBC_2.27 __acoshl_finite F -+GLIBC_2.27 __acosl_finite F -+GLIBC_2.27 __asin_finite F -+GLIBC_2.27 __asinf_finite F -+GLIBC_2.27 __asinl_finite F -+GLIBC_2.27 __atan2_finite F -+GLIBC_2.27 __atan2f_finite F -+GLIBC_2.27 __atan2l_finite F -+GLIBC_2.27 __atanh_finite F -+GLIBC_2.27 __atanhf_finite F -+GLIBC_2.27 __atanhl_finite F -+GLIBC_2.27 __clog10 F -+GLIBC_2.27 __clog10f F -+GLIBC_2.27 __clog10l F -+GLIBC_2.27 __cosh_finite F -+GLIBC_2.27 __coshf_finite F -+GLIBC_2.27 __coshl_finite F -+GLIBC_2.27 __exp10_finite F -+GLIBC_2.27 __exp10f_finite F -+GLIBC_2.27 __exp10l_finite F -+GLIBC_2.27 __exp2_finite F -+GLIBC_2.27 __exp2f_finite F -+GLIBC_2.27 __exp2l_finite F -+GLIBC_2.27 __exp_finite F -+GLIBC_2.27 __expf_finite F 
-+GLIBC_2.27 __expl_finite F -+GLIBC_2.27 __finite F -+GLIBC_2.27 __finitef F -+GLIBC_2.27 __finitel F -+GLIBC_2.27 __fmod_finite F -+GLIBC_2.27 __fmodf_finite F -+GLIBC_2.27 __fmodl_finite F -+GLIBC_2.27 __fpclassify F -+GLIBC_2.27 __fpclassifyf F -+GLIBC_2.27 __fpclassifyl F -+GLIBC_2.27 __gamma_r_finite F -+GLIBC_2.27 __gammaf_r_finite F -+GLIBC_2.27 __gammal_r_finite F -+GLIBC_2.27 __hypot_finite F -+GLIBC_2.27 __hypotf_finite F -+GLIBC_2.27 __hypotl_finite F -+GLIBC_2.27 __iseqsig F -+GLIBC_2.27 __iseqsigf F -+GLIBC_2.27 __iseqsigl F -+GLIBC_2.27 __issignaling F -+GLIBC_2.27 __issignalingf F -+GLIBC_2.27 __issignalingl F -+GLIBC_2.27 __j0_finite F -+GLIBC_2.27 __j0f_finite F -+GLIBC_2.27 __j0l_finite F -+GLIBC_2.27 __j1_finite F -+GLIBC_2.27 __j1f_finite F -+GLIBC_2.27 __j1l_finite F -+GLIBC_2.27 __jn_finite F -+GLIBC_2.27 __jnf_finite F -+GLIBC_2.27 __jnl_finite F -+GLIBC_2.27 __lgamma_r_finite F -+GLIBC_2.27 __lgammaf_r_finite F -+GLIBC_2.27 __lgammal_r_finite F -+GLIBC_2.27 __log10_finite F -+GLIBC_2.27 __log10f_finite F -+GLIBC_2.27 __log10l_finite F -+GLIBC_2.27 __log2_finite F -+GLIBC_2.27 __log2f_finite F -+GLIBC_2.27 __log2l_finite F -+GLIBC_2.27 __log_finite F -+GLIBC_2.27 __logf_finite F -+GLIBC_2.27 __logl_finite F -+GLIBC_2.27 __pow_finite F -+GLIBC_2.27 __powf_finite F -+GLIBC_2.27 __powl_finite F -+GLIBC_2.27 __remainder_finite F -+GLIBC_2.27 __remainderf_finite F -+GLIBC_2.27 __remainderl_finite F -+GLIBC_2.27 __scalb_finite F -+GLIBC_2.27 __scalbf_finite F -+GLIBC_2.27 __scalbl_finite F -+GLIBC_2.27 __signbit F -+GLIBC_2.27 __signbitf F -+GLIBC_2.27 __signbitl F -+GLIBC_2.27 __signgam D 0x4 -+GLIBC_2.27 __sinh_finite F -+GLIBC_2.27 __sinhf_finite F -+GLIBC_2.27 __sinhl_finite F -+GLIBC_2.27 __sqrt_finite F -+GLIBC_2.27 __sqrtf_finite F -+GLIBC_2.27 __sqrtl_finite F -+GLIBC_2.27 __y0_finite F -+GLIBC_2.27 __y0f_finite F -+GLIBC_2.27 __y0l_finite F -+GLIBC_2.27 __y1_finite F -+GLIBC_2.27 __y1f_finite F -+GLIBC_2.27 __y1l_finite F -+GLIBC_2.27 __yn_finite F -+GLIBC_2.27 __ynf_finite F -+GLIBC_2.27 __ynl_finite F -+GLIBC_2.27 acos F -+GLIBC_2.27 acosf F -+GLIBC_2.27 acosf128 F -+GLIBC_2.27 acosf32 F -+GLIBC_2.27 acosf32x F -+GLIBC_2.27 acosf64 F -+GLIBC_2.27 acosf64x F -+GLIBC_2.27 acosh F -+GLIBC_2.27 acoshf F -+GLIBC_2.27 acoshf128 F -+GLIBC_2.27 acoshf32 F -+GLIBC_2.27 acoshf32x F -+GLIBC_2.27 acoshf64 F -+GLIBC_2.27 acoshf64x F -+GLIBC_2.27 acoshl F -+GLIBC_2.27 acosl F -+GLIBC_2.27 asin F -+GLIBC_2.27 asinf F -+GLIBC_2.27 asinf128 F -+GLIBC_2.27 asinf32 F -+GLIBC_2.27 asinf32x F -+GLIBC_2.27 asinf64 F -+GLIBC_2.27 asinf64x F -+GLIBC_2.27 asinh F -+GLIBC_2.27 asinhf F -+GLIBC_2.27 asinhf128 F -+GLIBC_2.27 asinhf32 F -+GLIBC_2.27 asinhf32x F -+GLIBC_2.27 asinhf64 F -+GLIBC_2.27 asinhf64x F -+GLIBC_2.27 asinhl F -+GLIBC_2.27 asinl F -+GLIBC_2.27 atan F -+GLIBC_2.27 atan2 F -+GLIBC_2.27 atan2f F -+GLIBC_2.27 atan2f128 F -+GLIBC_2.27 atan2f32 F -+GLIBC_2.27 atan2f32x F -+GLIBC_2.27 atan2f64 F -+GLIBC_2.27 atan2f64x F -+GLIBC_2.27 atan2l F -+GLIBC_2.27 atanf F -+GLIBC_2.27 atanf128 F -+GLIBC_2.27 atanf32 F -+GLIBC_2.27 atanf32x F -+GLIBC_2.27 atanf64 F -+GLIBC_2.27 atanf64x F -+GLIBC_2.27 atanh F -+GLIBC_2.27 atanhf F -+GLIBC_2.27 atanhf128 F -+GLIBC_2.27 atanhf32 F -+GLIBC_2.27 atanhf32x F -+GLIBC_2.27 atanhf64 F -+GLIBC_2.27 atanhf64x F -+GLIBC_2.27 atanhl F -+GLIBC_2.27 atanl F -+GLIBC_2.27 cabs F -+GLIBC_2.27 cabsf F -+GLIBC_2.27 cabsf128 F -+GLIBC_2.27 cabsf32 F -+GLIBC_2.27 cabsf32x F -+GLIBC_2.27 cabsf64 F -+GLIBC_2.27 cabsf64x F -+GLIBC_2.27 cabsl F -+GLIBC_2.27 
cacos F -+GLIBC_2.27 cacosf F -+GLIBC_2.27 cacosf128 F -+GLIBC_2.27 cacosf32 F -+GLIBC_2.27 cacosf32x F -+GLIBC_2.27 cacosf64 F -+GLIBC_2.27 cacosf64x F -+GLIBC_2.27 cacosh F -+GLIBC_2.27 cacoshf F -+GLIBC_2.27 cacoshf128 F -+GLIBC_2.27 cacoshf32 F -+GLIBC_2.27 cacoshf32x F -+GLIBC_2.27 cacoshf64 F -+GLIBC_2.27 cacoshf64x F -+GLIBC_2.27 cacoshl F -+GLIBC_2.27 cacosl F -+GLIBC_2.27 canonicalize F -+GLIBC_2.27 canonicalizef F -+GLIBC_2.27 canonicalizef128 F -+GLIBC_2.27 canonicalizef32 F -+GLIBC_2.27 canonicalizef32x F -+GLIBC_2.27 canonicalizef64 F -+GLIBC_2.27 canonicalizef64x F -+GLIBC_2.27 canonicalizel F -+GLIBC_2.27 carg F -+GLIBC_2.27 cargf F -+GLIBC_2.27 cargf128 F -+GLIBC_2.27 cargf32 F -+GLIBC_2.27 cargf32x F -+GLIBC_2.27 cargf64 F -+GLIBC_2.27 cargf64x F -+GLIBC_2.27 cargl F -+GLIBC_2.27 casin F -+GLIBC_2.27 casinf F -+GLIBC_2.27 casinf128 F -+GLIBC_2.27 casinf32 F -+GLIBC_2.27 casinf32x F -+GLIBC_2.27 casinf64 F -+GLIBC_2.27 casinf64x F -+GLIBC_2.27 casinh F -+GLIBC_2.27 casinhf F -+GLIBC_2.27 casinhf128 F -+GLIBC_2.27 casinhf32 F -+GLIBC_2.27 casinhf32x F -+GLIBC_2.27 casinhf64 F -+GLIBC_2.27 casinhf64x F -+GLIBC_2.27 casinhl F -+GLIBC_2.27 casinl F -+GLIBC_2.27 catan F -+GLIBC_2.27 catanf F -+GLIBC_2.27 catanf128 F -+GLIBC_2.27 catanf32 F -+GLIBC_2.27 catanf32x F -+GLIBC_2.27 catanf64 F -+GLIBC_2.27 catanf64x F -+GLIBC_2.27 catanh F -+GLIBC_2.27 catanhf F -+GLIBC_2.27 catanhf128 F -+GLIBC_2.27 catanhf32 F -+GLIBC_2.27 catanhf32x F -+GLIBC_2.27 catanhf64 F -+GLIBC_2.27 catanhf64x F -+GLIBC_2.27 catanhl F -+GLIBC_2.27 catanl F -+GLIBC_2.27 cbrt F -+GLIBC_2.27 cbrtf F -+GLIBC_2.27 cbrtf128 F -+GLIBC_2.27 cbrtf32 F -+GLIBC_2.27 cbrtf32x F -+GLIBC_2.27 cbrtf64 F -+GLIBC_2.27 cbrtf64x F -+GLIBC_2.27 cbrtl F -+GLIBC_2.27 ccos F -+GLIBC_2.27 ccosf F -+GLIBC_2.27 ccosf128 F -+GLIBC_2.27 ccosf32 F -+GLIBC_2.27 ccosf32x F -+GLIBC_2.27 ccosf64 F -+GLIBC_2.27 ccosf64x F -+GLIBC_2.27 ccosh F -+GLIBC_2.27 ccoshf F -+GLIBC_2.27 ccoshf128 F -+GLIBC_2.27 ccoshf32 F -+GLIBC_2.27 ccoshf32x F -+GLIBC_2.27 ccoshf64 F -+GLIBC_2.27 ccoshf64x F -+GLIBC_2.27 ccoshl F -+GLIBC_2.27 ccosl F -+GLIBC_2.27 ceil F -+GLIBC_2.27 ceilf F -+GLIBC_2.27 ceilf128 F -+GLIBC_2.27 ceilf32 F -+GLIBC_2.27 ceilf32x F -+GLIBC_2.27 ceilf64 F -+GLIBC_2.27 ceilf64x F -+GLIBC_2.27 ceill F -+GLIBC_2.27 cexp F -+GLIBC_2.27 cexpf F -+GLIBC_2.27 cexpf128 F -+GLIBC_2.27 cexpf32 F -+GLIBC_2.27 cexpf32x F -+GLIBC_2.27 cexpf64 F -+GLIBC_2.27 cexpf64x F -+GLIBC_2.27 cexpl F -+GLIBC_2.27 cimag F -+GLIBC_2.27 cimagf F -+GLIBC_2.27 cimagf128 F -+GLIBC_2.27 cimagf32 F -+GLIBC_2.27 cimagf32x F -+GLIBC_2.27 cimagf64 F -+GLIBC_2.27 cimagf64x F -+GLIBC_2.27 cimagl F -+GLIBC_2.27 clog F -+GLIBC_2.27 clog10 F -+GLIBC_2.27 clog10f F -+GLIBC_2.27 clog10f128 F -+GLIBC_2.27 clog10f32 F -+GLIBC_2.27 clog10f32x F -+GLIBC_2.27 clog10f64 F -+GLIBC_2.27 clog10f64x F -+GLIBC_2.27 clog10l F -+GLIBC_2.27 clogf F -+GLIBC_2.27 clogf128 F -+GLIBC_2.27 clogf32 F -+GLIBC_2.27 clogf32x F -+GLIBC_2.27 clogf64 F -+GLIBC_2.27 clogf64x F -+GLIBC_2.27 clogl F -+GLIBC_2.27 conj F -+GLIBC_2.27 conjf F -+GLIBC_2.27 conjf128 F -+GLIBC_2.27 conjf32 F -+GLIBC_2.27 conjf32x F -+GLIBC_2.27 conjf64 F -+GLIBC_2.27 conjf64x F -+GLIBC_2.27 conjl F -+GLIBC_2.27 copysign F -+GLIBC_2.27 copysignf F -+GLIBC_2.27 copysignf128 F -+GLIBC_2.27 copysignf32 F -+GLIBC_2.27 copysignf32x F -+GLIBC_2.27 copysignf64 F -+GLIBC_2.27 copysignf64x F -+GLIBC_2.27 copysignl F -+GLIBC_2.27 cos F -+GLIBC_2.27 cosf F -+GLIBC_2.27 cosf128 F -+GLIBC_2.27 cosf32 F -+GLIBC_2.27 cosf32x F -+GLIBC_2.27 cosf64 
F -+GLIBC_2.27 cosf64x F -+GLIBC_2.27 cosh F -+GLIBC_2.27 coshf F -+GLIBC_2.27 coshf128 F -+GLIBC_2.27 coshf32 F -+GLIBC_2.27 coshf32x F -+GLIBC_2.27 coshf64 F -+GLIBC_2.27 coshf64x F -+GLIBC_2.27 coshl F -+GLIBC_2.27 cosl F -+GLIBC_2.27 cpow F -+GLIBC_2.27 cpowf F -+GLIBC_2.27 cpowf128 F -+GLIBC_2.27 cpowf32 F -+GLIBC_2.27 cpowf32x F -+GLIBC_2.27 cpowf64 F -+GLIBC_2.27 cpowf64x F -+GLIBC_2.27 cpowl F -+GLIBC_2.27 cproj F -+GLIBC_2.27 cprojf F -+GLIBC_2.27 cprojf128 F -+GLIBC_2.27 cprojf32 F -+GLIBC_2.27 cprojf32x F -+GLIBC_2.27 cprojf64 F -+GLIBC_2.27 cprojf64x F -+GLIBC_2.27 cprojl F -+GLIBC_2.27 creal F -+GLIBC_2.27 crealf F -+GLIBC_2.27 crealf128 F -+GLIBC_2.27 crealf32 F -+GLIBC_2.27 crealf32x F -+GLIBC_2.27 crealf64 F -+GLIBC_2.27 crealf64x F -+GLIBC_2.27 creall F -+GLIBC_2.27 csin F -+GLIBC_2.27 csinf F -+GLIBC_2.27 csinf128 F -+GLIBC_2.27 csinf32 F -+GLIBC_2.27 csinf32x F -+GLIBC_2.27 csinf64 F -+GLIBC_2.27 csinf64x F -+GLIBC_2.27 csinh F -+GLIBC_2.27 csinhf F -+GLIBC_2.27 csinhf128 F -+GLIBC_2.27 csinhf32 F -+GLIBC_2.27 csinhf32x F -+GLIBC_2.27 csinhf64 F -+GLIBC_2.27 csinhf64x F -+GLIBC_2.27 csinhl F -+GLIBC_2.27 csinl F -+GLIBC_2.27 csqrt F -+GLIBC_2.27 csqrtf F -+GLIBC_2.27 csqrtf128 F -+GLIBC_2.27 csqrtf32 F -+GLIBC_2.27 csqrtf32x F -+GLIBC_2.27 csqrtf64 F -+GLIBC_2.27 csqrtf64x F -+GLIBC_2.27 csqrtl F -+GLIBC_2.27 ctan F -+GLIBC_2.27 ctanf F -+GLIBC_2.27 ctanf128 F -+GLIBC_2.27 ctanf32 F -+GLIBC_2.27 ctanf32x F -+GLIBC_2.27 ctanf64 F -+GLIBC_2.27 ctanf64x F -+GLIBC_2.27 ctanh F -+GLIBC_2.27 ctanhf F -+GLIBC_2.27 ctanhf128 F -+GLIBC_2.27 ctanhf32 F -+GLIBC_2.27 ctanhf32x F -+GLIBC_2.27 ctanhf64 F -+GLIBC_2.27 ctanhf64x F -+GLIBC_2.27 ctanhl F -+GLIBC_2.27 ctanl F -+GLIBC_2.27 drem F -+GLIBC_2.27 dremf F -+GLIBC_2.27 dreml F -+GLIBC_2.27 erf F -+GLIBC_2.27 erfc F -+GLIBC_2.27 erfcf F -+GLIBC_2.27 erfcf128 F -+GLIBC_2.27 erfcf32 F -+GLIBC_2.27 erfcf32x F -+GLIBC_2.27 erfcf64 F -+GLIBC_2.27 erfcf64x F -+GLIBC_2.27 erfcl F -+GLIBC_2.27 erff F -+GLIBC_2.27 erff128 F -+GLIBC_2.27 erff32 F -+GLIBC_2.27 erff32x F -+GLIBC_2.27 erff64 F -+GLIBC_2.27 erff64x F -+GLIBC_2.27 erfl F -+GLIBC_2.27 exp F -+GLIBC_2.27 exp10 F -+GLIBC_2.27 exp10f F -+GLIBC_2.27 exp10f128 F -+GLIBC_2.27 exp10f32 F -+GLIBC_2.27 exp10f32x F -+GLIBC_2.27 exp10f64 F -+GLIBC_2.27 exp10f64x F -+GLIBC_2.27 exp10l F -+GLIBC_2.27 exp2 F -+GLIBC_2.27 exp2f F -+GLIBC_2.27 exp2f128 F -+GLIBC_2.27 exp2f32 F -+GLIBC_2.27 exp2f32x F -+GLIBC_2.27 exp2f64 F -+GLIBC_2.27 exp2f64x F -+GLIBC_2.27 exp2l F -+GLIBC_2.27 expf F -+GLIBC_2.27 expf128 F -+GLIBC_2.27 expf32 F -+GLIBC_2.27 expf32x F -+GLIBC_2.27 expf64 F -+GLIBC_2.27 expf64x F -+GLIBC_2.27 expl F -+GLIBC_2.27 expm1 F -+GLIBC_2.27 expm1f F -+GLIBC_2.27 expm1f128 F -+GLIBC_2.27 expm1f32 F -+GLIBC_2.27 expm1f32x F -+GLIBC_2.27 expm1f64 F -+GLIBC_2.27 expm1f64x F -+GLIBC_2.27 expm1l F -+GLIBC_2.27 fabs F -+GLIBC_2.27 fabsf F -+GLIBC_2.27 fabsf128 F -+GLIBC_2.27 fabsf32 F -+GLIBC_2.27 fabsf32x F -+GLIBC_2.27 fabsf64 F -+GLIBC_2.27 fabsf64x F -+GLIBC_2.27 fabsl F -+GLIBC_2.27 fdim F -+GLIBC_2.27 fdimf F -+GLIBC_2.27 fdimf128 F -+GLIBC_2.27 fdimf32 F -+GLIBC_2.27 fdimf32x F -+GLIBC_2.27 fdimf64 F -+GLIBC_2.27 fdimf64x F -+GLIBC_2.27 fdiml F -+GLIBC_2.27 feclearexcept F -+GLIBC_2.27 fedisableexcept F -+GLIBC_2.27 feenableexcept F -+GLIBC_2.27 fegetenv F -+GLIBC_2.27 fegetexcept F -+GLIBC_2.27 fegetexceptflag F -+GLIBC_2.27 fegetmode F -+GLIBC_2.27 fegetround F -+GLIBC_2.27 feholdexcept F -+GLIBC_2.27 feraiseexcept F -+GLIBC_2.27 fesetenv F -+GLIBC_2.27 fesetexcept F -+GLIBC_2.27 
fesetexceptflag F -+GLIBC_2.27 fesetmode F -+GLIBC_2.27 fesetround F -+GLIBC_2.27 fetestexcept F -+GLIBC_2.27 fetestexceptflag F -+GLIBC_2.27 feupdateenv F -+GLIBC_2.27 finite F -+GLIBC_2.27 finitef F -+GLIBC_2.27 finitel F -+GLIBC_2.27 floor F -+GLIBC_2.27 floorf F -+GLIBC_2.27 floorf128 F -+GLIBC_2.27 floorf32 F -+GLIBC_2.27 floorf32x F -+GLIBC_2.27 floorf64 F -+GLIBC_2.27 floorf64x F -+GLIBC_2.27 floorl F -+GLIBC_2.27 fma F -+GLIBC_2.27 fmaf F -+GLIBC_2.27 fmaf128 F -+GLIBC_2.27 fmaf32 F -+GLIBC_2.27 fmaf32x F -+GLIBC_2.27 fmaf64 F -+GLIBC_2.27 fmaf64x F -+GLIBC_2.27 fmal F -+GLIBC_2.27 fmax F -+GLIBC_2.27 fmaxf F -+GLIBC_2.27 fmaxf128 F -+GLIBC_2.27 fmaxf32 F -+GLIBC_2.27 fmaxf32x F -+GLIBC_2.27 fmaxf64 F -+GLIBC_2.27 fmaxf64x F -+GLIBC_2.27 fmaxl F -+GLIBC_2.27 fmaxmag F -+GLIBC_2.27 fmaxmagf F -+GLIBC_2.27 fmaxmagf128 F -+GLIBC_2.27 fmaxmagf32 F -+GLIBC_2.27 fmaxmagf32x F -+GLIBC_2.27 fmaxmagf64 F -+GLIBC_2.27 fmaxmagf64x F -+GLIBC_2.27 fmaxmagl F -+GLIBC_2.27 fmin F -+GLIBC_2.27 fminf F -+GLIBC_2.27 fminf128 F -+GLIBC_2.27 fminf32 F -+GLIBC_2.27 fminf32x F -+GLIBC_2.27 fminf64 F -+GLIBC_2.27 fminf64x F -+GLIBC_2.27 fminl F -+GLIBC_2.27 fminmag F -+GLIBC_2.27 fminmagf F -+GLIBC_2.27 fminmagf128 F -+GLIBC_2.27 fminmagf32 F -+GLIBC_2.27 fminmagf32x F -+GLIBC_2.27 fminmagf64 F -+GLIBC_2.27 fminmagf64x F -+GLIBC_2.27 fminmagl F -+GLIBC_2.27 fmod F -+GLIBC_2.27 fmodf F -+GLIBC_2.27 fmodf128 F -+GLIBC_2.27 fmodf32 F -+GLIBC_2.27 fmodf32x F -+GLIBC_2.27 fmodf64 F -+GLIBC_2.27 fmodf64x F -+GLIBC_2.27 fmodl F -+GLIBC_2.27 frexp F -+GLIBC_2.27 frexpf F -+GLIBC_2.27 frexpf128 F -+GLIBC_2.27 frexpf32 F -+GLIBC_2.27 frexpf32x F -+GLIBC_2.27 frexpf64 F -+GLIBC_2.27 frexpf64x F -+GLIBC_2.27 frexpl F -+GLIBC_2.27 fromfp F -+GLIBC_2.27 fromfpf F -+GLIBC_2.27 fromfpf128 F -+GLIBC_2.27 fromfpf32 F -+GLIBC_2.27 fromfpf32x F -+GLIBC_2.27 fromfpf64 F -+GLIBC_2.27 fromfpf64x F -+GLIBC_2.27 fromfpl F -+GLIBC_2.27 fromfpx F -+GLIBC_2.27 fromfpxf F -+GLIBC_2.27 fromfpxf128 F -+GLIBC_2.27 fromfpxf32 F -+GLIBC_2.27 fromfpxf32x F -+GLIBC_2.27 fromfpxf64 F -+GLIBC_2.27 fromfpxf64x F -+GLIBC_2.27 fromfpxl F -+GLIBC_2.27 gamma F -+GLIBC_2.27 gammaf F -+GLIBC_2.27 gammal F -+GLIBC_2.27 getpayload F -+GLIBC_2.27 getpayloadf F -+GLIBC_2.27 getpayloadf128 F -+GLIBC_2.27 getpayloadf32 F -+GLIBC_2.27 getpayloadf32x F -+GLIBC_2.27 getpayloadf64 F -+GLIBC_2.27 getpayloadf64x F -+GLIBC_2.27 getpayloadl F -+GLIBC_2.27 hypot F -+GLIBC_2.27 hypotf F -+GLIBC_2.27 hypotf128 F -+GLIBC_2.27 hypotf32 F -+GLIBC_2.27 hypotf32x F -+GLIBC_2.27 hypotf64 F -+GLIBC_2.27 hypotf64x F -+GLIBC_2.27 hypotl F -+GLIBC_2.27 ilogb F -+GLIBC_2.27 ilogbf F -+GLIBC_2.27 ilogbf128 F -+GLIBC_2.27 ilogbf32 F -+GLIBC_2.27 ilogbf32x F -+GLIBC_2.27 ilogbf64 F -+GLIBC_2.27 ilogbf64x F -+GLIBC_2.27 ilogbl F -+GLIBC_2.27 j0 F -+GLIBC_2.27 j0f F -+GLIBC_2.27 j0f128 F -+GLIBC_2.27 j0f32 F -+GLIBC_2.27 j0f32x F -+GLIBC_2.27 j0f64 F -+GLIBC_2.27 j0f64x F -+GLIBC_2.27 j0l F -+GLIBC_2.27 j1 F -+GLIBC_2.27 j1f F -+GLIBC_2.27 j1f128 F -+GLIBC_2.27 j1f32 F -+GLIBC_2.27 j1f32x F -+GLIBC_2.27 j1f64 F -+GLIBC_2.27 j1f64x F -+GLIBC_2.27 j1l F -+GLIBC_2.27 jn F -+GLIBC_2.27 jnf F -+GLIBC_2.27 jnf128 F -+GLIBC_2.27 jnf32 F -+GLIBC_2.27 jnf32x F -+GLIBC_2.27 jnf64 F -+GLIBC_2.27 jnf64x F -+GLIBC_2.27 jnl F -+GLIBC_2.27 ldexp F -+GLIBC_2.27 ldexpf F -+GLIBC_2.27 ldexpf128 F -+GLIBC_2.27 ldexpf32 F -+GLIBC_2.27 ldexpf32x F -+GLIBC_2.27 ldexpf64 F -+GLIBC_2.27 ldexpf64x F -+GLIBC_2.27 ldexpl F -+GLIBC_2.27 lgamma F -+GLIBC_2.27 lgamma_r F -+GLIBC_2.27 lgammaf F -+GLIBC_2.27 
lgammaf128 F -+GLIBC_2.27 lgammaf128_r F -+GLIBC_2.27 lgammaf32 F -+GLIBC_2.27 lgammaf32_r F -+GLIBC_2.27 lgammaf32x F -+GLIBC_2.27 lgammaf32x_r F -+GLIBC_2.27 lgammaf64 F -+GLIBC_2.27 lgammaf64_r F -+GLIBC_2.27 lgammaf64x F -+GLIBC_2.27 lgammaf64x_r F -+GLIBC_2.27 lgammaf_r F -+GLIBC_2.27 lgammal F -+GLIBC_2.27 lgammal_r F -+GLIBC_2.27 llogb F -+GLIBC_2.27 llogbf F -+GLIBC_2.27 llogbf128 F -+GLIBC_2.27 llogbf32 F -+GLIBC_2.27 llogbf32x F -+GLIBC_2.27 llogbf64 F -+GLIBC_2.27 llogbf64x F -+GLIBC_2.27 llogbl F -+GLIBC_2.27 llrint F -+GLIBC_2.27 llrintf F -+GLIBC_2.27 llrintf128 F -+GLIBC_2.27 llrintf32 F -+GLIBC_2.27 llrintf32x F -+GLIBC_2.27 llrintf64 F -+GLIBC_2.27 llrintf64x F -+GLIBC_2.27 llrintl F -+GLIBC_2.27 llround F -+GLIBC_2.27 llroundf F -+GLIBC_2.27 llroundf128 F -+GLIBC_2.27 llroundf32 F -+GLIBC_2.27 llroundf32x F -+GLIBC_2.27 llroundf64 F -+GLIBC_2.27 llroundf64x F -+GLIBC_2.27 llroundl F -+GLIBC_2.27 log F -+GLIBC_2.27 log10 F -+GLIBC_2.27 log10f F -+GLIBC_2.27 log10f128 F -+GLIBC_2.27 log10f32 F -+GLIBC_2.27 log10f32x F -+GLIBC_2.27 log10f64 F -+GLIBC_2.27 log10f64x F -+GLIBC_2.27 log10l F -+GLIBC_2.27 log1p F -+GLIBC_2.27 log1pf F -+GLIBC_2.27 log1pf128 F -+GLIBC_2.27 log1pf32 F -+GLIBC_2.27 log1pf32x F -+GLIBC_2.27 log1pf64 F -+GLIBC_2.27 log1pf64x F -+GLIBC_2.27 log1pl F -+GLIBC_2.27 log2 F -+GLIBC_2.27 log2f F -+GLIBC_2.27 log2f128 F -+GLIBC_2.27 log2f32 F -+GLIBC_2.27 log2f32x F -+GLIBC_2.27 log2f64 F -+GLIBC_2.27 log2f64x F -+GLIBC_2.27 log2l F -+GLIBC_2.27 logb F -+GLIBC_2.27 logbf F -+GLIBC_2.27 logbf128 F -+GLIBC_2.27 logbf32 F -+GLIBC_2.27 logbf32x F -+GLIBC_2.27 logbf64 F -+GLIBC_2.27 logbf64x F -+GLIBC_2.27 logbl F -+GLIBC_2.27 logf F -+GLIBC_2.27 logf128 F -+GLIBC_2.27 logf32 F -+GLIBC_2.27 logf32x F -+GLIBC_2.27 logf64 F -+GLIBC_2.27 logf64x F -+GLIBC_2.27 logl F -+GLIBC_2.27 lrint F -+GLIBC_2.27 lrintf F -+GLIBC_2.27 lrintf128 F -+GLIBC_2.27 lrintf32 F -+GLIBC_2.27 lrintf32x F -+GLIBC_2.27 lrintf64 F -+GLIBC_2.27 lrintf64x F -+GLIBC_2.27 lrintl F -+GLIBC_2.27 lround F -+GLIBC_2.27 lroundf F -+GLIBC_2.27 lroundf128 F -+GLIBC_2.27 lroundf32 F -+GLIBC_2.27 lroundf32x F -+GLIBC_2.27 lroundf64 F -+GLIBC_2.27 lroundf64x F -+GLIBC_2.27 lroundl F -+GLIBC_2.27 modf F -+GLIBC_2.27 modff F -+GLIBC_2.27 modff128 F -+GLIBC_2.27 modff32 F -+GLIBC_2.27 modff32x F -+GLIBC_2.27 modff64 F -+GLIBC_2.27 modff64x F -+GLIBC_2.27 modfl F -+GLIBC_2.27 nan F -+GLIBC_2.27 nanf F -+GLIBC_2.27 nanf128 F -+GLIBC_2.27 nanf32 F -+GLIBC_2.27 nanf32x F -+GLIBC_2.27 nanf64 F -+GLIBC_2.27 nanf64x F -+GLIBC_2.27 nanl F -+GLIBC_2.27 nearbyint F -+GLIBC_2.27 nearbyintf F -+GLIBC_2.27 nearbyintf128 F -+GLIBC_2.27 nearbyintf32 F -+GLIBC_2.27 nearbyintf32x F -+GLIBC_2.27 nearbyintf64 F -+GLIBC_2.27 nearbyintf64x F -+GLIBC_2.27 nearbyintl F -+GLIBC_2.27 nextafter F -+GLIBC_2.27 nextafterf F -+GLIBC_2.27 nextafterf128 F -+GLIBC_2.27 nextafterf32 F -+GLIBC_2.27 nextafterf32x F -+GLIBC_2.27 nextafterf64 F -+GLIBC_2.27 nextafterf64x F -+GLIBC_2.27 nextafterl F -+GLIBC_2.27 nextdown F -+GLIBC_2.27 nextdownf F -+GLIBC_2.27 nextdownf128 F -+GLIBC_2.27 nextdownf32 F -+GLIBC_2.27 nextdownf32x F -+GLIBC_2.27 nextdownf64 F -+GLIBC_2.27 nextdownf64x F -+GLIBC_2.27 nextdownl F -+GLIBC_2.27 nexttoward F -+GLIBC_2.27 nexttowardf F -+GLIBC_2.27 nexttowardl F -+GLIBC_2.27 nextup F -+GLIBC_2.27 nextupf F -+GLIBC_2.27 nextupf128 F -+GLIBC_2.27 nextupf32 F -+GLIBC_2.27 nextupf32x F -+GLIBC_2.27 nextupf64 F -+GLIBC_2.27 nextupf64x F -+GLIBC_2.27 nextupl F -+GLIBC_2.27 pow F -+GLIBC_2.27 powf F -+GLIBC_2.27 powf128 F 
-+GLIBC_2.27 powf32 F -+GLIBC_2.27 powf32x F -+GLIBC_2.27 powf64 F -+GLIBC_2.27 powf64x F -+GLIBC_2.27 powl F -+GLIBC_2.27 remainder F -+GLIBC_2.27 remainderf F -+GLIBC_2.27 remainderf128 F -+GLIBC_2.27 remainderf32 F -+GLIBC_2.27 remainderf32x F -+GLIBC_2.27 remainderf64 F -+GLIBC_2.27 remainderf64x F -+GLIBC_2.27 remainderl F -+GLIBC_2.27 remquo F -+GLIBC_2.27 remquof F -+GLIBC_2.27 remquof128 F -+GLIBC_2.27 remquof32 F -+GLIBC_2.27 remquof32x F -+GLIBC_2.27 remquof64 F -+GLIBC_2.27 remquof64x F -+GLIBC_2.27 remquol F -+GLIBC_2.27 rint F -+GLIBC_2.27 rintf F -+GLIBC_2.27 rintf128 F -+GLIBC_2.27 rintf32 F -+GLIBC_2.27 rintf32x F -+GLIBC_2.27 rintf64 F -+GLIBC_2.27 rintf64x F -+GLIBC_2.27 rintl F -+GLIBC_2.27 round F -+GLIBC_2.27 roundeven F -+GLIBC_2.27 roundevenf F -+GLIBC_2.27 roundevenf128 F -+GLIBC_2.27 roundevenf32 F -+GLIBC_2.27 roundevenf32x F -+GLIBC_2.27 roundevenf64 F -+GLIBC_2.27 roundevenf64x F -+GLIBC_2.27 roundevenl F -+GLIBC_2.27 roundf F -+GLIBC_2.27 roundf128 F -+GLIBC_2.27 roundf32 F -+GLIBC_2.27 roundf32x F -+GLIBC_2.27 roundf64 F -+GLIBC_2.27 roundf64x F -+GLIBC_2.27 roundl F -+GLIBC_2.27 scalb F -+GLIBC_2.27 scalbf F -+GLIBC_2.27 scalbl F -+GLIBC_2.27 scalbln F -+GLIBC_2.27 scalblnf F -+GLIBC_2.27 scalblnf128 F -+GLIBC_2.27 scalblnf32 F -+GLIBC_2.27 scalblnf32x F -+GLIBC_2.27 scalblnf64 F -+GLIBC_2.27 scalblnf64x F -+GLIBC_2.27 scalblnl F -+GLIBC_2.27 scalbn F -+GLIBC_2.27 scalbnf F -+GLIBC_2.27 scalbnf128 F -+GLIBC_2.27 scalbnf32 F -+GLIBC_2.27 scalbnf32x F -+GLIBC_2.27 scalbnf64 F -+GLIBC_2.27 scalbnf64x F -+GLIBC_2.27 scalbnl F -+GLIBC_2.27 setpayload F -+GLIBC_2.27 setpayloadf F -+GLIBC_2.27 setpayloadf128 F -+GLIBC_2.27 setpayloadf32 F -+GLIBC_2.27 setpayloadf32x F -+GLIBC_2.27 setpayloadf64 F -+GLIBC_2.27 setpayloadf64x F -+GLIBC_2.27 setpayloadl F -+GLIBC_2.27 setpayloadsig F -+GLIBC_2.27 setpayloadsigf F -+GLIBC_2.27 setpayloadsigf128 F -+GLIBC_2.27 setpayloadsigf32 F -+GLIBC_2.27 setpayloadsigf32x F -+GLIBC_2.27 setpayloadsigf64 F -+GLIBC_2.27 setpayloadsigf64x F -+GLIBC_2.27 setpayloadsigl F -+GLIBC_2.27 signgam D 0x4 -+GLIBC_2.27 significand F -+GLIBC_2.27 significandf F -+GLIBC_2.27 significandl F -+GLIBC_2.27 sin F -+GLIBC_2.27 sincos F -+GLIBC_2.27 sincosf F -+GLIBC_2.27 sincosf128 F -+GLIBC_2.27 sincosf32 F -+GLIBC_2.27 sincosf32x F -+GLIBC_2.27 sincosf64 F -+GLIBC_2.27 sincosf64x F -+GLIBC_2.27 sincosl F -+GLIBC_2.27 sinf F -+GLIBC_2.27 sinf128 F -+GLIBC_2.27 sinf32 F -+GLIBC_2.27 sinf32x F -+GLIBC_2.27 sinf64 F -+GLIBC_2.27 sinf64x F -+GLIBC_2.27 sinh F -+GLIBC_2.27 sinhf F -+GLIBC_2.27 sinhf128 F -+GLIBC_2.27 sinhf32 F -+GLIBC_2.27 sinhf32x F -+GLIBC_2.27 sinhf64 F -+GLIBC_2.27 sinhf64x F -+GLIBC_2.27 sinhl F -+GLIBC_2.27 sinl F -+GLIBC_2.27 sqrt F -+GLIBC_2.27 sqrtf F -+GLIBC_2.27 sqrtf128 F -+GLIBC_2.27 sqrtf32 F -+GLIBC_2.27 sqrtf32x F -+GLIBC_2.27 sqrtf64 F -+GLIBC_2.27 sqrtf64x F -+GLIBC_2.27 sqrtl F -+GLIBC_2.27 tan F -+GLIBC_2.27 tanf F -+GLIBC_2.27 tanf128 F -+GLIBC_2.27 tanf32 F -+GLIBC_2.27 tanf32x F -+GLIBC_2.27 tanf64 F -+GLIBC_2.27 tanf64x F -+GLIBC_2.27 tanh F -+GLIBC_2.27 tanhf F -+GLIBC_2.27 tanhf128 F -+GLIBC_2.27 tanhf32 F -+GLIBC_2.27 tanhf32x F -+GLIBC_2.27 tanhf64 F -+GLIBC_2.27 tanhf64x F -+GLIBC_2.27 tanhl F -+GLIBC_2.27 tanl F -+GLIBC_2.27 tgamma F -+GLIBC_2.27 tgammaf F -+GLIBC_2.27 tgammaf128 F -+GLIBC_2.27 tgammaf32 F -+GLIBC_2.27 tgammaf32x F -+GLIBC_2.27 tgammaf64 F -+GLIBC_2.27 tgammaf64x F -+GLIBC_2.27 tgammal F -+GLIBC_2.27 totalorder F -+GLIBC_2.27 totalorderf F -+GLIBC_2.27 totalorderf128 F -+GLIBC_2.27 totalorderf32 
F -+GLIBC_2.27 totalorderf32x F -+GLIBC_2.27 totalorderf64 F -+GLIBC_2.27 totalorderf64x F -+GLIBC_2.27 totalorderl F -+GLIBC_2.27 totalordermag F -+GLIBC_2.27 totalordermagf F -+GLIBC_2.27 totalordermagf128 F -+GLIBC_2.27 totalordermagf32 F -+GLIBC_2.27 totalordermagf32x F -+GLIBC_2.27 totalordermagf64 F -+GLIBC_2.27 totalordermagf64x F -+GLIBC_2.27 totalordermagl F -+GLIBC_2.27 trunc F -+GLIBC_2.27 truncf F -+GLIBC_2.27 truncf128 F -+GLIBC_2.27 truncf32 F -+GLIBC_2.27 truncf32x F -+GLIBC_2.27 truncf64 F -+GLIBC_2.27 truncf64x F -+GLIBC_2.27 truncl F -+GLIBC_2.27 ufromfp F -+GLIBC_2.27 ufromfpf F -+GLIBC_2.27 ufromfpf128 F -+GLIBC_2.27 ufromfpf32 F -+GLIBC_2.27 ufromfpf32x F -+GLIBC_2.27 ufromfpf64 F -+GLIBC_2.27 ufromfpf64x F -+GLIBC_2.27 ufromfpl F -+GLIBC_2.27 ufromfpx F -+GLIBC_2.27 ufromfpxf F -+GLIBC_2.27 ufromfpxf128 F -+GLIBC_2.27 ufromfpxf32 F -+GLIBC_2.27 ufromfpxf32x F -+GLIBC_2.27 ufromfpxf64 F -+GLIBC_2.27 ufromfpxf64x F -+GLIBC_2.27 ufromfpxl F -+GLIBC_2.27 y0 F -+GLIBC_2.27 y0f F -+GLIBC_2.27 y0f128 F -+GLIBC_2.27 y0f32 F -+GLIBC_2.27 y0f32x F -+GLIBC_2.27 y0f64 F -+GLIBC_2.27 y0f64x F -+GLIBC_2.27 y0l F -+GLIBC_2.27 y1 F -+GLIBC_2.27 y1f F -+GLIBC_2.27 y1f128 F -+GLIBC_2.27 y1f32 F -+GLIBC_2.27 y1f32x F -+GLIBC_2.27 y1f64 F -+GLIBC_2.27 y1f64x F -+GLIBC_2.27 y1l F -+GLIBC_2.27 yn F -+GLIBC_2.27 ynf F -+GLIBC_2.27 ynf128 F -+GLIBC_2.27 ynf32 F -+GLIBC_2.27 ynf32x F -+GLIBC_2.27 ynf64 F -+GLIBC_2.27 ynf64x F -+GLIBC_2.27 ynl F -+GLIBC_2.28 daddl F -+GLIBC_2.28 ddivl F -+GLIBC_2.28 dmull F -+GLIBC_2.28 dsubl F -+GLIBC_2.28 f32addf128 F -+GLIBC_2.28 f32addf32x F -+GLIBC_2.28 f32addf64 F -+GLIBC_2.28 f32addf64x F -+GLIBC_2.28 f32divf128 F -+GLIBC_2.28 f32divf32x F -+GLIBC_2.28 f32divf64 F -+GLIBC_2.28 f32divf64x F -+GLIBC_2.28 f32mulf128 F -+GLIBC_2.28 f32mulf32x F -+GLIBC_2.28 f32mulf64 F -+GLIBC_2.28 f32mulf64x F -+GLIBC_2.28 f32subf128 F -+GLIBC_2.28 f32subf32x F -+GLIBC_2.28 f32subf64 F -+GLIBC_2.28 f32subf64x F -+GLIBC_2.28 f32xaddf128 F -+GLIBC_2.28 f32xaddf64 F -+GLIBC_2.28 f32xaddf64x F -+GLIBC_2.28 f32xdivf128 F -+GLIBC_2.28 f32xdivf64 F -+GLIBC_2.28 f32xdivf64x F -+GLIBC_2.28 f32xmulf128 F -+GLIBC_2.28 f32xmulf64 F -+GLIBC_2.28 f32xmulf64x F -+GLIBC_2.28 f32xsubf128 F -+GLIBC_2.28 f32xsubf64 F -+GLIBC_2.28 f32xsubf64x F -+GLIBC_2.28 f64addf128 F -+GLIBC_2.28 f64addf64x F -+GLIBC_2.28 f64divf128 F -+GLIBC_2.28 f64divf64x F -+GLIBC_2.28 f64mulf128 F -+GLIBC_2.28 f64mulf64x F -+GLIBC_2.28 f64subf128 F -+GLIBC_2.28 f64subf64x F -+GLIBC_2.28 f64xaddf128 F -+GLIBC_2.28 f64xdivf128 F -+GLIBC_2.28 f64xmulf128 F -+GLIBC_2.28 f64xsubf128 F -+GLIBC_2.28 fadd F -+GLIBC_2.28 faddl F -+GLIBC_2.28 fdiv F -+GLIBC_2.28 fdivl F -+GLIBC_2.28 fmul F -+GLIBC_2.28 fmull F -+GLIBC_2.28 fsub F -+GLIBC_2.28 fsubl F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libnsl.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libnsl.abilist -new file mode 100644 -index 00000000..0767472d ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libnsl.abilist -@@ -0,0 +1,120 @@ -+GLIBC_2.27 __free_fdresult F -+GLIBC_2.27 __nis_default_access F -+GLIBC_2.27 __nis_default_group F -+GLIBC_2.27 __nis_default_owner F -+GLIBC_2.27 __nis_default_ttl F -+GLIBC_2.27 __nis_finddirectory F -+GLIBC_2.27 __nisbind_connect F -+GLIBC_2.27 __nisbind_create F -+GLIBC_2.27 __nisbind_destroy F -+GLIBC_2.27 __nisbind_next F -+GLIBC_2.27 __yp_check F -+GLIBC_2.27 nis_add F -+GLIBC_2.27 nis_add_entry F -+GLIBC_2.27 nis_addmember F -+GLIBC_2.27 nis_checkpoint F -+GLIBC_2.27 nis_clone_directory F -+GLIBC_2.27 
nis_clone_object F -+GLIBC_2.27 nis_clone_result F -+GLIBC_2.27 nis_creategroup F -+GLIBC_2.27 nis_destroy_object F -+GLIBC_2.27 nis_destroygroup F -+GLIBC_2.27 nis_dir_cmp F -+GLIBC_2.27 nis_domain_of F -+GLIBC_2.27 nis_domain_of_r F -+GLIBC_2.27 nis_first_entry F -+GLIBC_2.27 nis_free_directory F -+GLIBC_2.27 nis_free_object F -+GLIBC_2.27 nis_free_request F -+GLIBC_2.27 nis_freenames F -+GLIBC_2.27 nis_freeresult F -+GLIBC_2.27 nis_freeservlist F -+GLIBC_2.27 nis_freetags F -+GLIBC_2.27 nis_getnames F -+GLIBC_2.27 nis_getservlist F -+GLIBC_2.27 nis_ismember F -+GLIBC_2.27 nis_leaf_of F -+GLIBC_2.27 nis_leaf_of_r F -+GLIBC_2.27 nis_lerror F -+GLIBC_2.27 nis_list F -+GLIBC_2.27 nis_local_directory F -+GLIBC_2.27 nis_local_group F -+GLIBC_2.27 nis_local_host F -+GLIBC_2.27 nis_local_principal F -+GLIBC_2.27 nis_lookup F -+GLIBC_2.27 nis_mkdir F -+GLIBC_2.27 nis_modify F -+GLIBC_2.27 nis_modify_entry F -+GLIBC_2.27 nis_name_of F -+GLIBC_2.27 nis_name_of_r F -+GLIBC_2.27 nis_next_entry F -+GLIBC_2.27 nis_perror F -+GLIBC_2.27 nis_ping F -+GLIBC_2.27 nis_print_directory F -+GLIBC_2.27 nis_print_entry F -+GLIBC_2.27 nis_print_group F -+GLIBC_2.27 nis_print_group_entry F -+GLIBC_2.27 nis_print_link F -+GLIBC_2.27 nis_print_object F -+GLIBC_2.27 nis_print_result F -+GLIBC_2.27 nis_print_rights F -+GLIBC_2.27 nis_print_table F -+GLIBC_2.27 nis_read_obj F -+GLIBC_2.27 nis_remove F -+GLIBC_2.27 nis_remove_entry F -+GLIBC_2.27 nis_removemember F -+GLIBC_2.27 nis_rmdir F -+GLIBC_2.27 nis_servstate F -+GLIBC_2.27 nis_sperrno F -+GLIBC_2.27 nis_sperror F -+GLIBC_2.27 nis_sperror_r F -+GLIBC_2.27 nis_stats F -+GLIBC_2.27 nis_verifygroup F -+GLIBC_2.27 nis_write_obj F -+GLIBC_2.27 readColdStartFile F -+GLIBC_2.27 writeColdStartFile F -+GLIBC_2.27 xdr_cback_data F -+GLIBC_2.27 xdr_domainname F -+GLIBC_2.27 xdr_keydat F -+GLIBC_2.27 xdr_mapname F -+GLIBC_2.27 xdr_obj_p F -+GLIBC_2.27 xdr_peername F -+GLIBC_2.27 xdr_valdat F -+GLIBC_2.27 xdr_yp_buf F -+GLIBC_2.27 xdr_ypall F -+GLIBC_2.27 xdr_ypbind_binding F -+GLIBC_2.27 xdr_ypbind_resp F -+GLIBC_2.27 xdr_ypbind_resptype F -+GLIBC_2.27 xdr_ypbind_setdom F -+GLIBC_2.27 xdr_ypdelete_args F -+GLIBC_2.27 xdr_ypmap_parms F -+GLIBC_2.27 xdr_ypmaplist F -+GLIBC_2.27 xdr_yppush_status F -+GLIBC_2.27 xdr_yppushresp_xfr F -+GLIBC_2.27 xdr_ypreq_key F -+GLIBC_2.27 xdr_ypreq_nokey F -+GLIBC_2.27 xdr_ypreq_xfr F -+GLIBC_2.27 xdr_ypresp_all F -+GLIBC_2.27 xdr_ypresp_key_val F -+GLIBC_2.27 xdr_ypresp_maplist F -+GLIBC_2.27 xdr_ypresp_master F -+GLIBC_2.27 xdr_ypresp_order F -+GLIBC_2.27 xdr_ypresp_val F -+GLIBC_2.27 xdr_ypresp_xfr F -+GLIBC_2.27 xdr_ypstat F -+GLIBC_2.27 xdr_ypupdate_args F -+GLIBC_2.27 xdr_ypxfrstat F -+GLIBC_2.27 yp_all F -+GLIBC_2.27 yp_bind F -+GLIBC_2.27 yp_first F -+GLIBC_2.27 yp_get_default_domain F -+GLIBC_2.27 yp_maplist F -+GLIBC_2.27 yp_master F -+GLIBC_2.27 yp_match F -+GLIBC_2.27 yp_next F -+GLIBC_2.27 yp_order F -+GLIBC_2.27 yp_unbind F -+GLIBC_2.27 yp_update F -+GLIBC_2.27 ypbinderr_string F -+GLIBC_2.27 yperr_string F -+GLIBC_2.27 ypprot_err F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libpthread.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libpthread.abilist -new file mode 100644 -index 00000000..f60b22ef ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libpthread.abilist -@@ -0,0 +1,264 @@ -+GLIBC_2.0 _IO_flockfile F -+GLIBC_2.0 _IO_ftrylockfile F -+GLIBC_2.0 _IO_funlockfile F -+GLIBC_2.0 __close F -+GLIBC_2.0 __connect F -+GLIBC_2.0 __errno_location F -+GLIBC_2.0 __fcntl F -+GLIBC_2.0 __fork F -+GLIBC_2.0 
__h_errno_location F -+GLIBC_2.0 __lseek F -+GLIBC_2.0 __open F -+GLIBC_2.0 __pthread_getspecific F -+GLIBC_2.0 __pthread_key_create F -+GLIBC_2.0 __pthread_mutex_destroy F -+GLIBC_2.0 __pthread_mutex_init F -+GLIBC_2.0 __pthread_mutex_lock F -+GLIBC_2.0 __pthread_mutex_trylock F -+GLIBC_2.0 __pthread_mutex_unlock F -+GLIBC_2.0 __pthread_mutexattr_destroy F -+GLIBC_2.0 __pthread_mutexattr_init F -+GLIBC_2.0 __pthread_mutexattr_settype F -+GLIBC_2.0 __pthread_once F -+GLIBC_2.0 __pthread_setspecific F -+GLIBC_2.0 __read F -+GLIBC_2.0 __send F -+GLIBC_2.0 __sigaction F -+GLIBC_2.0 __wait F -+GLIBC_2.0 __write F -+GLIBC_2.0 _pthread_cleanup_pop F -+GLIBC_2.0 _pthread_cleanup_pop_restore F -+GLIBC_2.0 _pthread_cleanup_push F -+GLIBC_2.0 _pthread_cleanup_push_defer F -+GLIBC_2.0 accept F -+GLIBC_2.0 close F -+GLIBC_2.0 connect F -+GLIBC_2.0 fcntl F -+GLIBC_2.0 flockfile F -+GLIBC_2.0 fork F -+GLIBC_2.0 fsync F -+GLIBC_2.0 ftrylockfile F -+GLIBC_2.0 funlockfile F -+GLIBC_2.0 longjmp F -+GLIBC_2.0 lseek F -+GLIBC_2.0 msync F -+GLIBC_2.0 nanosleep F -+GLIBC_2.0 open F -+GLIBC_2.0 pause F -+GLIBC_2.0 pthread_atfork F -+GLIBC_2.0 pthread_attr_destroy F -+GLIBC_2.0 pthread_attr_getdetachstate F -+GLIBC_2.0 pthread_attr_getinheritsched F -+GLIBC_2.0 pthread_attr_getschedparam F -+GLIBC_2.0 pthread_attr_getschedpolicy F -+GLIBC_2.0 pthread_attr_getscope F -+GLIBC_2.0 pthread_attr_init F -+GLIBC_2.0 pthread_attr_setdetachstate F -+GLIBC_2.0 pthread_attr_setinheritsched F -+GLIBC_2.0 pthread_attr_setschedparam F -+GLIBC_2.0 pthread_attr_setschedpolicy F -+GLIBC_2.0 pthread_attr_setscope F -+GLIBC_2.0 pthread_cancel F -+GLIBC_2.0 pthread_cond_broadcast F -+GLIBC_2.0 pthread_cond_destroy F -+GLIBC_2.0 pthread_cond_init F -+GLIBC_2.0 pthread_cond_signal F -+GLIBC_2.0 pthread_cond_timedwait F -+GLIBC_2.0 pthread_cond_wait F -+GLIBC_2.0 pthread_condattr_destroy F -+GLIBC_2.0 pthread_condattr_init F -+GLIBC_2.0 pthread_create F -+GLIBC_2.0 pthread_detach F -+GLIBC_2.0 pthread_equal F -+GLIBC_2.0 pthread_exit F -+GLIBC_2.0 pthread_getschedparam F -+GLIBC_2.0 pthread_getspecific F -+GLIBC_2.0 pthread_join F -+GLIBC_2.0 pthread_key_create F -+GLIBC_2.0 pthread_key_delete F -+GLIBC_2.0 pthread_kill F -+GLIBC_2.0 pthread_kill_other_threads_np F -+GLIBC_2.0 pthread_mutex_destroy F -+GLIBC_2.0 pthread_mutex_init F -+GLIBC_2.0 pthread_mutex_lock F -+GLIBC_2.0 pthread_mutex_trylock F -+GLIBC_2.0 pthread_mutex_unlock F -+GLIBC_2.0 pthread_mutexattr_destroy F -+GLIBC_2.0 pthread_mutexattr_getkind_np F -+GLIBC_2.0 pthread_mutexattr_init F -+GLIBC_2.0 pthread_mutexattr_setkind_np F -+GLIBC_2.0 pthread_once F -+GLIBC_2.0 pthread_self F -+GLIBC_2.0 pthread_setcancelstate F -+GLIBC_2.0 pthread_setcanceltype F -+GLIBC_2.0 pthread_setschedparam F -+GLIBC_2.0 pthread_setspecific F -+GLIBC_2.0 pthread_sigmask F -+GLIBC_2.0 pthread_testcancel F -+GLIBC_2.0 raise F -+GLIBC_2.0 read F -+GLIBC_2.0 recv F -+GLIBC_2.0 recvfrom F -+GLIBC_2.0 recvmsg F -+GLIBC_2.0 sem_destroy F -+GLIBC_2.0 sem_getvalue F -+GLIBC_2.0 sem_init F -+GLIBC_2.0 sem_post F -+GLIBC_2.0 sem_trywait F -+GLIBC_2.0 sem_wait F -+GLIBC_2.0 send F -+GLIBC_2.0 sendmsg F -+GLIBC_2.0 sendto F -+GLIBC_2.0 sigaction F -+GLIBC_2.0 siglongjmp F -+GLIBC_2.0 sigwait F -+GLIBC_2.0 system F -+GLIBC_2.0 tcdrain F -+GLIBC_2.0 wait F -+GLIBC_2.0 waitpid F -+GLIBC_2.0 write F -+GLIBC_2.11 pthread_sigqueue F -+GLIBC_2.12 pthread_getname_np F -+GLIBC_2.12 pthread_mutex_consistent F -+GLIBC_2.12 pthread_mutexattr_getrobust F -+GLIBC_2.12 pthread_mutexattr_setrobust F -+GLIBC_2.12 
pthread_setname_np F -+GLIBC_2.18 pthread_getattr_default_np F -+GLIBC_2.18 pthread_setattr_default_np F -+GLIBC_2.2 __libc_allocate_rtsig F -+GLIBC_2.2 __libc_current_sigrtmax F -+GLIBC_2.2 __libc_current_sigrtmin F -+GLIBC_2.2 __open64 F -+GLIBC_2.2 __pread64 F -+GLIBC_2.2 __pthread_rwlock_destroy F -+GLIBC_2.2 __pthread_rwlock_init F -+GLIBC_2.2 __pthread_rwlock_rdlock F -+GLIBC_2.2 __pthread_rwlock_tryrdlock F -+GLIBC_2.2 __pthread_rwlock_trywrlock F -+GLIBC_2.2 __pthread_rwlock_unlock F -+GLIBC_2.2 __pthread_rwlock_wrlock F -+GLIBC_2.2 __pwrite64 F -+GLIBC_2.2 __res_state F -+GLIBC_2.2 lseek64 F -+GLIBC_2.2 open64 F -+GLIBC_2.2 pread F -+GLIBC_2.2 pread64 F -+GLIBC_2.2 pthread_attr_getguardsize F -+GLIBC_2.2 pthread_attr_getstack F -+GLIBC_2.2 pthread_attr_getstackaddr F -+GLIBC_2.2 pthread_attr_getstacksize F -+GLIBC_2.2 pthread_attr_init F -+GLIBC_2.2 pthread_attr_setguardsize F -+GLIBC_2.2 pthread_attr_setstack F -+GLIBC_2.2 pthread_attr_setstackaddr F -+GLIBC_2.2 pthread_attr_setstacksize F -+GLIBC_2.2 pthread_barrier_destroy F -+GLIBC_2.2 pthread_barrier_init F -+GLIBC_2.2 pthread_barrier_wait F -+GLIBC_2.2 pthread_barrierattr_destroy F -+GLIBC_2.2 pthread_barrierattr_init F -+GLIBC_2.2 pthread_barrierattr_setpshared F -+GLIBC_2.2 pthread_condattr_getpshared F -+GLIBC_2.2 pthread_condattr_setpshared F -+GLIBC_2.2 pthread_create F -+GLIBC_2.2 pthread_getconcurrency F -+GLIBC_2.2 pthread_getcpuclockid F -+GLIBC_2.2 pthread_mutex_timedlock F -+GLIBC_2.2 pthread_mutexattr_getpshared F -+GLIBC_2.2 pthread_mutexattr_gettype F -+GLIBC_2.2 pthread_mutexattr_setpshared F -+GLIBC_2.2 pthread_mutexattr_settype F -+GLIBC_2.2 pthread_rwlock_destroy F -+GLIBC_2.2 pthread_rwlock_init F -+GLIBC_2.2 pthread_rwlock_rdlock F -+GLIBC_2.2 pthread_rwlock_timedrdlock F -+GLIBC_2.2 pthread_rwlock_timedwrlock F -+GLIBC_2.2 pthread_rwlock_tryrdlock F -+GLIBC_2.2 pthread_rwlock_trywrlock F -+GLIBC_2.2 pthread_rwlock_unlock F -+GLIBC_2.2 pthread_rwlock_wrlock F -+GLIBC_2.2 pthread_rwlockattr_destroy F -+GLIBC_2.2 pthread_rwlockattr_getkind_np F -+GLIBC_2.2 pthread_rwlockattr_getpshared F -+GLIBC_2.2 pthread_rwlockattr_init F -+GLIBC_2.2 pthread_rwlockattr_setkind_np F -+GLIBC_2.2 pthread_rwlockattr_setpshared F -+GLIBC_2.2 pthread_setconcurrency F -+GLIBC_2.2 pthread_spin_destroy F -+GLIBC_2.2 pthread_spin_init F -+GLIBC_2.2 pthread_spin_lock F -+GLIBC_2.2 pthread_spin_trylock F -+GLIBC_2.2 pthread_spin_unlock F -+GLIBC_2.2 pthread_yield F -+GLIBC_2.2 pwrite F -+GLIBC_2.2 pwrite64 F -+GLIBC_2.2 sem_close F -+GLIBC_2.2 sem_destroy F -+GLIBC_2.2 sem_getvalue F -+GLIBC_2.2 sem_init F -+GLIBC_2.2 sem_open F -+GLIBC_2.2 sem_post F -+GLIBC_2.2 sem_timedwait F -+GLIBC_2.2 sem_trywait F -+GLIBC_2.2 sem_unlink F -+GLIBC_2.2 sem_wait F -+GLIBC_2.2.3 pthread_getattr_np F -+GLIBC_2.2.6 __nanosleep F -+GLIBC_2.28 call_once F -+GLIBC_2.28 cnd_broadcast F -+GLIBC_2.28 cnd_destroy F -+GLIBC_2.28 cnd_init F -+GLIBC_2.28 cnd_signal F -+GLIBC_2.28 cnd_timedwait F -+GLIBC_2.28 cnd_wait F -+GLIBC_2.28 mtx_destroy F -+GLIBC_2.28 mtx_init F -+GLIBC_2.28 mtx_lock F -+GLIBC_2.28 mtx_timedlock F -+GLIBC_2.28 mtx_trylock F -+GLIBC_2.28 mtx_unlock F -+GLIBC_2.28 thrd_create F -+GLIBC_2.28 thrd_detach F -+GLIBC_2.28 thrd_exit F -+GLIBC_2.28 thrd_join F -+GLIBC_2.28 tss_create F -+GLIBC_2.28 tss_delete F -+GLIBC_2.28 tss_get F -+GLIBC_2.28 tss_set F -+GLIBC_2.3.2 pthread_cond_broadcast F -+GLIBC_2.3.2 pthread_cond_destroy F -+GLIBC_2.3.2 pthread_cond_init F -+GLIBC_2.3.2 pthread_cond_signal F -+GLIBC_2.3.2 pthread_cond_timedwait F 
-+GLIBC_2.3.2 pthread_cond_wait F -+GLIBC_2.3.3 __pthread_cleanup_routine F -+GLIBC_2.3.3 __pthread_register_cancel F -+GLIBC_2.3.3 __pthread_register_cancel_defer F -+GLIBC_2.3.3 __pthread_unregister_cancel F -+GLIBC_2.3.3 __pthread_unregister_cancel_restore F -+GLIBC_2.3.3 __pthread_unwind_next F -+GLIBC_2.3.3 pthread_attr_getaffinity_np F -+GLIBC_2.3.3 pthread_attr_setaffinity_np F -+GLIBC_2.3.3 pthread_attr_setstack F -+GLIBC_2.3.3 pthread_attr_setstacksize F -+GLIBC_2.3.3 pthread_barrierattr_getpshared F -+GLIBC_2.3.3 pthread_condattr_getclock F -+GLIBC_2.3.3 pthread_condattr_setclock F -+GLIBC_2.3.3 pthread_getaffinity_np F -+GLIBC_2.3.3 pthread_setaffinity_np F -+GLIBC_2.3.3 pthread_timedjoin_np F -+GLIBC_2.3.3 pthread_tryjoin_np F -+GLIBC_2.3.4 pthread_attr_getaffinity_np F -+GLIBC_2.3.4 pthread_attr_setaffinity_np F -+GLIBC_2.3.4 pthread_getaffinity_np F -+GLIBC_2.3.4 pthread_setaffinity_np F -+GLIBC_2.3.4 pthread_setschedprio F -+GLIBC_2.4 pthread_mutex_consistent_np F -+GLIBC_2.4 pthread_mutex_getprioceiling F -+GLIBC_2.4 pthread_mutex_setprioceiling F -+GLIBC_2.4 pthread_mutexattr_getprioceiling F -+GLIBC_2.4 pthread_mutexattr_getprotocol F -+GLIBC_2.4 pthread_mutexattr_getrobust_np F -+GLIBC_2.4 pthread_mutexattr_setprioceiling F -+GLIBC_2.4 pthread_mutexattr_setprotocol F -+GLIBC_2.4 pthread_mutexattr_setrobust_np F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libresolv.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libresolv.abilist -new file mode 100644 -index 00000000..eb9c1cb7 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libresolv.abilist -@@ -0,0 +1,79 @@ -+GLIBC_2.27 __b64_ntop F -+GLIBC_2.27 __b64_pton F -+GLIBC_2.27 __dn_comp F -+GLIBC_2.27 __dn_count_labels F -+GLIBC_2.27 __dn_expand F -+GLIBC_2.27 __dn_skipname F -+GLIBC_2.27 __fp_nquery F -+GLIBC_2.27 __fp_query F -+GLIBC_2.27 __fp_resstat F -+GLIBC_2.27 __hostalias F -+GLIBC_2.27 __loc_aton F -+GLIBC_2.27 __loc_ntoa F -+GLIBC_2.27 __p_cdname F -+GLIBC_2.27 __p_cdnname F -+GLIBC_2.27 __p_class F -+GLIBC_2.27 __p_class_syms D 0xa8 -+GLIBC_2.27 __p_fqname F -+GLIBC_2.27 __p_fqnname F -+GLIBC_2.27 __p_option F -+GLIBC_2.27 __p_query F -+GLIBC_2.27 __p_rcode F -+GLIBC_2.27 __p_time F -+GLIBC_2.27 __p_type F -+GLIBC_2.27 __p_type_syms D 0x450 -+GLIBC_2.27 __putlong F -+GLIBC_2.27 __putshort F -+GLIBC_2.27 __res_close F -+GLIBC_2.27 __res_dnok F -+GLIBC_2.27 __res_hnok F -+GLIBC_2.27 __res_hostalias F -+GLIBC_2.27 __res_isourserver F -+GLIBC_2.27 __res_mailok F -+GLIBC_2.27 __res_mkquery F -+GLIBC_2.27 __res_nameinquery F -+GLIBC_2.27 __res_nmkquery F -+GLIBC_2.27 __res_nquery F -+GLIBC_2.27 __res_nquerydomain F -+GLIBC_2.27 __res_nsearch F -+GLIBC_2.27 __res_nsend F -+GLIBC_2.27 __res_ownok F -+GLIBC_2.27 __res_queriesmatch F -+GLIBC_2.27 __res_query F -+GLIBC_2.27 __res_querydomain F -+GLIBC_2.27 __res_search F -+GLIBC_2.27 __res_send F -+GLIBC_2.27 __sym_ntop F -+GLIBC_2.27 __sym_ntos F -+GLIBC_2.27 __sym_ston F -+GLIBC_2.27 _getlong F -+GLIBC_2.27 _getshort F -+GLIBC_2.27 inet_net_ntop F -+GLIBC_2.27 inet_net_pton F -+GLIBC_2.27 inet_neta F -+GLIBC_2.27 ns_datetosecs F -+GLIBC_2.27 ns_format_ttl F -+GLIBC_2.27 ns_get16 F -+GLIBC_2.27 ns_get32 F -+GLIBC_2.27 ns_initparse F -+GLIBC_2.27 ns_makecanon F -+GLIBC_2.27 ns_msg_getflag F -+GLIBC_2.27 ns_name_compress F -+GLIBC_2.27 ns_name_ntol F -+GLIBC_2.27 ns_name_ntop F -+GLIBC_2.27 ns_name_pack F -+GLIBC_2.27 ns_name_pton F -+GLIBC_2.27 ns_name_rollback F -+GLIBC_2.27 ns_name_skip F -+GLIBC_2.27 ns_name_uncompress F -+GLIBC_2.27 ns_name_unpack F 
-+GLIBC_2.27 ns_parse_ttl F -+GLIBC_2.27 ns_parserr F -+GLIBC_2.27 ns_put16 F -+GLIBC_2.27 ns_put32 F -+GLIBC_2.27 ns_samedomain F -+GLIBC_2.27 ns_samename F -+GLIBC_2.27 ns_skiprr F -+GLIBC_2.27 ns_sprintrr F -+GLIBC_2.27 ns_sprintrrf F -+GLIBC_2.27 ns_subdomain F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/librt.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/librt.abilist -new file mode 100644 -index 00000000..bfd262ec ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/librt.abilist -@@ -0,0 +1,35 @@ -+GLIBC_2.27 __mq_open_2 F -+GLIBC_2.27 aio_cancel F -+GLIBC_2.27 aio_cancel64 F -+GLIBC_2.27 aio_error F -+GLIBC_2.27 aio_error64 F -+GLIBC_2.27 aio_fsync F -+GLIBC_2.27 aio_fsync64 F -+GLIBC_2.27 aio_init F -+GLIBC_2.27 aio_read F -+GLIBC_2.27 aio_read64 F -+GLIBC_2.27 aio_return F -+GLIBC_2.27 aio_return64 F -+GLIBC_2.27 aio_suspend F -+GLIBC_2.27 aio_suspend64 F -+GLIBC_2.27 aio_write F -+GLIBC_2.27 aio_write64 F -+GLIBC_2.27 lio_listio F -+GLIBC_2.27 lio_listio64 F -+GLIBC_2.27 mq_close F -+GLIBC_2.27 mq_getattr F -+GLIBC_2.27 mq_notify F -+GLIBC_2.27 mq_open F -+GLIBC_2.27 mq_receive F -+GLIBC_2.27 mq_send F -+GLIBC_2.27 mq_setattr F -+GLIBC_2.27 mq_timedreceive F -+GLIBC_2.27 mq_timedsend F -+GLIBC_2.27 mq_unlink F -+GLIBC_2.27 shm_open F -+GLIBC_2.27 shm_unlink F -+GLIBC_2.27 timer_create F -+GLIBC_2.27 timer_delete F -+GLIBC_2.27 timer_getoverrun F -+GLIBC_2.27 timer_gettime F -+GLIBC_2.27 timer_settime F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libthread_db.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libthread_db.abilist -new file mode 100644 -index 00000000..4122e563 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libthread_db.abilist -@@ -0,0 +1,40 @@ -+GLIBC_2.27 td_init F -+GLIBC_2.27 td_log F -+GLIBC_2.27 td_symbol_list F -+GLIBC_2.27 td_ta_clear_event F -+GLIBC_2.27 td_ta_delete F -+GLIBC_2.27 td_ta_enable_stats F -+GLIBC_2.27 td_ta_event_addr F -+GLIBC_2.27 td_ta_event_getmsg F -+GLIBC_2.27 td_ta_get_nthreads F -+GLIBC_2.27 td_ta_get_ph F -+GLIBC_2.27 td_ta_get_stats F -+GLIBC_2.27 td_ta_map_id2thr F -+GLIBC_2.27 td_ta_map_lwp2thr F -+GLIBC_2.27 td_ta_new F -+GLIBC_2.27 td_ta_reset_stats F -+GLIBC_2.27 td_ta_set_event F -+GLIBC_2.27 td_ta_setconcurrency F -+GLIBC_2.27 td_ta_thr_iter F -+GLIBC_2.27 td_ta_tsd_iter F -+GLIBC_2.27 td_thr_clear_event F -+GLIBC_2.27 td_thr_dbresume F -+GLIBC_2.27 td_thr_dbsuspend F -+GLIBC_2.27 td_thr_event_enable F -+GLIBC_2.27 td_thr_event_getmsg F -+GLIBC_2.27 td_thr_get_info F -+GLIBC_2.27 td_thr_getfpregs F -+GLIBC_2.27 td_thr_getgregs F -+GLIBC_2.27 td_thr_getxregs F -+GLIBC_2.27 td_thr_getxregsize F -+GLIBC_2.27 td_thr_set_event F -+GLIBC_2.27 td_thr_setfpregs F -+GLIBC_2.27 td_thr_setgregs F -+GLIBC_2.27 td_thr_setprio F -+GLIBC_2.27 td_thr_setsigpending F -+GLIBC_2.27 td_thr_setxregs F -+GLIBC_2.27 td_thr_sigsetmask F -+GLIBC_2.27 td_thr_tls_get_addr F -+GLIBC_2.27 td_thr_tlsbase F -+GLIBC_2.27 td_thr_tsd F -+GLIBC_2.27 td_thr_validate F -diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libutil.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libutil.abilist -new file mode 100644 -index 00000000..cbfec8d4 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libutil.abilist -@@ -0,0 +1,6 @@ -+GLIBC_2.27 forkpty F -+GLIBC_2.27 login F -+GLIBC_2.27 login_tty F -+GLIBC_2.27 logout F -+GLIBC_2.27 logwtmp F -+GLIBC_2.27 openpty F -diff --git a/sysdeps/unix/sysv/linux/loongarch/makecontext.c b/sysdeps/unix/sysv/linux/loongarch/makecontext.c -new file mode 100644 -index 
00000000..55d509ab ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/makecontext.c -@@ -0,0 +1,78 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include -+#include -+#include -+ -+void -+__makecontext (ucontext_t *ucp, void (*func) (void), int argc, -+ long int a0, long int a1, long int a2, long int a3, long int a4, -+ ...) -+{ -+ extern void __start_context (void) attribute_hidden; -+ long int i, sp; -+ -+ _Static_assert (LARCH_REG_NARGS == 8, "__makecontext assumes 8 argument registers"); -+ -+ /* Set up the stack. */ -+ sp = ((long int) ucp->uc_stack.ss_sp + ucp->uc_stack.ss_size) & ALMASK; -+ -+ /* Set up the register context. -+ ra = s0 = 0, terminating the stack for backtracing purposes. -+ s1 = the function we must call. -+ s2 = the subsequent context to run. */ -+ ucp->uc_mcontext.__gregs[LARCH_REG_RA] = 0; -+ ucp->uc_mcontext.__gregs[LARCH_REG_S0] = 0; -+ ucp->uc_mcontext.__gregs[LARCH_REG_S1] = (long int) func; -+ ucp->uc_mcontext.__gregs[LARCH_REG_S2] = (long int) ucp->uc_link; -+ ucp->uc_mcontext.__gregs[LARCH_REG_SP] = sp; -+ ucp->uc_mcontext.__pc = (long int) &__start_context; -+ -+ /* Put args in a0-a7, then put any remaining args on the stack. */ -+ ucp->uc_mcontext.__gregs[LARCH_REG_A0 + 0] = a0; -+ ucp->uc_mcontext.__gregs[LARCH_REG_A0 + 1] = a1; -+ ucp->uc_mcontext.__gregs[LARCH_REG_A0 + 2] = a2; -+ ucp->uc_mcontext.__gregs[LARCH_REG_A0 + 3] = a3; -+ ucp->uc_mcontext.__gregs[LARCH_REG_A0 + 4] = a4; -+ -+ if (__glibc_unlikely (argc > 5)) -+ { -+ va_list vl; -+ va_start (vl, a4); -+ -+ long reg_args = argc < LARCH_REG_NARGS ? argc : LARCH_REG_NARGS; -+ for (i = 5; i < reg_args; i++) -+ ucp->uc_mcontext.__gregs[LARCH_REG_A0 + i] = va_arg (vl, long); -+ -+ long int stack_args = argc - reg_args; -+ if (stack_args > 0) -+ { -+ sp = (sp - stack_args * sizeof (long int)) & ALMASK; -+ ucp->uc_mcontext.__gregs[LARCH_REG_SP] = sp; -+ for (i = 0; i < stack_args; i++) -+ ((long int *) sp)[i] = va_arg (vl, long int); -+ } -+ -+ va_end (vl); -+ } -+} -+ -+weak_alias (__makecontext, makecontext) -diff --git a/sysdeps/unix/sysv/linux/loongarch/profil-counter.h b/sysdeps/unix/sysv/linux/loongarch/profil-counter.h -new file mode 100644 -index 00000000..6a3cc201 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/profil-counter.h -@@ -0,0 +1,31 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. 
-+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+#include -+ -+static void -+__profil_counter (int signo, const SIGCONTEXT scp) -+{ -+ profil_count ((void *) GET_PC (scp)); -+ -+ /* This is a hack to prevent the compiler from implementing the -+ above function call as a sibcall. The sibcall would overwrite -+ the signal context. */ -+ asm volatile (""); -+} -diff --git a/sysdeps/unix/sysv/linux/loongarch/pt-vfork.S b/sysdeps/unix/sysv/linux/loongarch/pt-vfork.S -new file mode 100644 -index 00000000..1cc89317 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/pt-vfork.S -@@ -0,0 +1 @@ -+/* Not needed. */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/register-dump.h b/sysdeps/unix/sysv/linux/loongarch/register-dump.h -new file mode 100644 -index 00000000..5e45d5c7 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/register-dump.h -@@ -0,0 +1,63 @@ -+/* Dump registers. -+ Copyright (C) 2000-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+#include -+#include <_itoa.h> -+ -+static void -+hexvalue (unsigned long int value, char *buf, size_t len) -+{ -+ char *cp = _itoa_word (value, buf + len, 16, 0); -+ while (cp > buf) -+ *--cp = '0'; -+} -+ -+#define REGDUMP_NREGS 32 -+#define REGDUMP_PER_LINE (80 / (__WORDSIZE / 4 + 4)) -+ -+static void -+register_dump (int fd, ucontext_t *ctx) -+{ -+ int i; -+ char regvalue[__WORDSIZE / 4 + 1]; -+ char str[82 * ((REGDUMP_NREGS + REGDUMP_PER_LINE - 1) / REGDUMP_PER_LINE)]; -+ -+ static const char names[REGDUMP_NREGS][4] = { -+ "pc", "ra", "tp", "sp", "a0", "a1", "a2", "a3", -+ "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", -+ "t4", "t5", "t6", "t7", "t8", "x" , "fp", "s0", -+ "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8" -+ }; -+ -+ str[0] = 0; -+ for (i = 0; i < REGDUMP_NREGS; i++) -+ { -+ strcat (str, names[i]); -+ strcat (str, " "); -+ hexvalue (ctx->uc_mcontext.__gregs[i], regvalue, __WORDSIZE / 4); -+ strcat (str, regvalue); -+ -+ if ((i + 1) % REGDUMP_PER_LINE == 0) -+ strcat (str, "\n"); -+ } -+ -+ write (fd, str, strlen (str)); -+} -+ -+#define REGISTER_DUMP register_dump (fd, ctx) -diff --git a/sysdeps/unix/sysv/linux/loongarch/setcontext.S b/sysdeps/unix/sysv/linux/loongarch/setcontext.S -new file mode 100644 -index 00000000..c96ec43c ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/setcontext.S -@@ -0,0 +1,111 @@ -+/* Set current context. -+ Copyright (C) 2009-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+#include "sys/regdef.h" -+#include "ucontext-macros.h" -+ -+/* int __setcontext (const ucontext_t *ucp) -+ -+ Restores the machine context in UCP and thereby resumes execution -+ in that context. -+ -+ This implementation is intended to be used for *synchronous* context -+ switches only. Therefore, it does not have to restore anything -+ other than the PRESERVED state. */ -+ -+ .text -+LEAF (__setcontext) -+ -+ addi.d sp, sp, -16 -+ st.d a0, sp, 0 /* Save ucp to stack. */ -+/* rt_sigprocmask (SIG_SETMASK, &ucp->uc_sigmask, NULL, _NSIG8) */ -+ li.d a3, _NSIG8 -+ li.d a2, 0 -+ addi.d a1, a0, UCONTEXT_SIGMASK -+ li.d a0, SIG_SETMASK -+ -+ li.d a7, SYS_ify (rt_sigprocmask) -+ syscall 0 -+ -+ blt a0, $r0, 99f -+ -+ ld.d t0, sp, 0 /* Load ucp to t0. */ -+ cfi_def_cfa (12, 0) -+ -+#ifndef __loongarch_soft_float -+ ld.w t1, t0, MCONTEXT_FCSR -+ -+ RESTORE_FP_REG (fs0, 24, t0) -+ RESTORE_FP_REG (fs1, 25, t0) -+ RESTORE_FP_REG (fs2, 26, t0) -+ RESTORE_FP_REG (fs3, 27, t0) -+ RESTORE_FP_REG (fs4, 28, t0) -+ RESTORE_FP_REG (fs5, 29, t0) -+ RESTORE_FP_REG (fs6, 30, t0) -+ RESTORE_FP_REG (fs7, 31, t0) -+ -+ movgr2fcsr $r0, t1 -+#endif /* __loongarch_soft_float */ -+ -+ /* Note the contents of argument registers will be random -+ unless makecontext() has been called. */ -+ RESTORE_INT_REG (ra, 1, t0) -+ RESTORE_INT_REG (sp, 3, t0) -+ RESTORE_INT_REG (a0, 4, t0) -+ RESTORE_INT_REG (a1, 5, t0) -+ RESTORE_INT_REG (a2, 6, t0) -+ RESTORE_INT_REG (a3, 7, t0) -+ RESTORE_INT_REG (a4, 8, t0) -+ RESTORE_INT_REG (a5, 9, t0) -+ RESTORE_INT_REG (a6, 10, t0) -+ RESTORE_INT_REG (a7, 11, t0) -+ RESTORE_INT_REG (x, 21, t0) -+ RESTORE_INT_REG (fp, 22, t0) -+ RESTORE_INT_REG (s0, 23, t0) -+ RESTORE_INT_REG (s1, 24, t0) -+ RESTORE_INT_REG (s2, 25, t0) -+ RESTORE_INT_REG (s3, 26, t0) -+ RESTORE_INT_REG (s4, 27, t0) -+ RESTORE_INT_REG (s5, 28, t0) -+ RESTORE_INT_REG (s6, 29, t0) -+ RESTORE_INT_REG (s7, 30, t0) -+ RESTORE_INT_REG (s8, 31, t0) -+ ld.d t1, t0, MCONTEXT_PC -+ jirl $r0,t1,0 -+ -+99: -+ addi.d sp, sp, 16 -+ b __syscall_error -+ -+PSEUDO_END (__setcontext) -+weak_alias (__setcontext, setcontext) -+ -+LEAF (__start_context) -+ -+ /* Terminate call stack by noting ra == 0. Happily, s0 == 0 here. */ -+ cfi_register (1, 23) -+ -+ /* Call the function passed to makecontext. */ -+ jirl $r1,s1,0 -+ -+ /* Invoke subsequent context if present, else exit(0). 
*/ -+ ori a0, s2, 0 -+ beqz s2, 1f -+ bl __setcontext -+1: b exit -+ -+PSEUDO_END (__start_context) -diff --git a/sysdeps/unix/sysv/linux/loongarch/shlib-versions b/sysdeps/unix/sysv/linux/loongarch/shlib-versions -new file mode 100644 -index 00000000..2a67fe71 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/shlib-versions -@@ -0,0 +1,2 @@ -+DEFAULT GLIBC_2.27 -+libpthread=0 GLIBC_2.0 GLIBC_2.2 -diff --git a/sysdeps/unix/sysv/linux/loongarch/sigcontextinfo.h b/sysdeps/unix/sysv/linux/loongarch/sigcontextinfo.h -new file mode 100644 -index 00000000..2a864795 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/sigcontextinfo.h -@@ -0,0 +1,22 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+ -+#define SIGCONTEXT siginfo_t *_si, ucontext_t * -+#define GET_PC(ctx) ((void *) ctx->uc_mcontext.__pc) -diff --git a/sysdeps/unix/sysv/linux/loongarch/swapcontext.S b/sysdeps/unix/sysv/linux/loongarch/swapcontext.S -new file mode 100644 -index 00000000..d839dd87 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/swapcontext.S -@@ -0,0 +1,120 @@ -+/* Save and set current context. -+ Copyright (C) 2009-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include "ucontext-macros.h" -+ -+/* int swapcontext (ucontext_t *oucp, const ucontext_t *ucp) */ -+ -+LEAF (__swapcontext) -+ ori a2, sp, 0 /* Save sp to a2. */ -+ addi.d sp, sp, -16 -+ st.d a1, sp, 0 -+ ori t0, a1, 0 -+ -+ SAVE_INT_REG (ra, 1, a0) -+ SAVE_INT_REG (a2, 3, a0) /* Store sp .*/ -+ SAVE_INT_REG (zero, 4, a0) /* return 0 by overwriting a0. 
*/ -+ SAVE_INT_REG (x, 21, a0) -+ SAVE_INT_REG (fp, 22, a0) -+ SAVE_INT_REG (s0, 23, a0) -+ SAVE_INT_REG (s1, 24, a0) -+ SAVE_INT_REG (s2, 25, a0) -+ SAVE_INT_REG (s3, 26, a0) -+ SAVE_INT_REG (s4, 27, a0) -+ SAVE_INT_REG (s5, 28, a0) -+ SAVE_INT_REG (s6, 29, a0) -+ SAVE_INT_REG (s7, 30, a0) -+ SAVE_INT_REG (s8, 31, a0) -+ st.d ra, a0, MCONTEXT_PC -+#ifndef __loongarch_soft_float -+ movfcsr2gr a1, $r0 -+ -+ SAVE_FP_REG (fs0, 24, a0) -+ SAVE_FP_REG (fs1, 25, a0) -+ SAVE_FP_REG (fs2, 26, a0) -+ SAVE_FP_REG (fs3, 27, a0) -+ SAVE_FP_REG (fs4, 28, a0) -+ SAVE_FP_REG (fs5, 29, a0) -+ SAVE_FP_REG (fs6, 30, a0) -+ SAVE_FP_REG (fs7, 31, a0) -+ -+ st.w a1, a0, MCONTEXT_FCSR -+#endif /* __loongarch_soft_float */ -+ -+/* rt_sigprocmask (SIG_SETMASK, &ucp->uc_sigmask, &oucp->uc_sigmask, _NSIG8) */ -+ li.d a3, _NSIG8 -+ addi.d a2, a0, UCONTEXT_SIGMASK -+ addi.d a1, t0, UCONTEXT_SIGMASK -+ li.d a0, SIG_SETMASK -+ -+ li.d a7, SYS_ify (rt_sigprocmask) -+ syscall 0 -+ -+ blt a0, zero, 99f -+ -+#ifndef __loongarch_soft_float -+ ld.d t0, sp, 0 /* Load a1 to t0. */ -+ ld.w t1, t0, MCONTEXT_FCSR -+ -+ RESTORE_FP_REG (fs0, 24, t0) -+ RESTORE_FP_REG (fs1, 25, t0) -+ RESTORE_FP_REG (fs2, 26, t0) -+ RESTORE_FP_REG (fs3, 27, t0) -+ RESTORE_FP_REG (fs4, 28, t0) -+ RESTORE_FP_REG (fs5, 29, t0) -+ RESTORE_FP_REG (fs6, 30, t0) -+ RESTORE_FP_REG (fs7, 31, t0) -+ -+ movgr2fcsr $r0, t1 -+#endif /* __loongarch_soft_float */ -+ -+ /* Note the contents of argument registers will be random -+ unless makecontext() has been called. */ -+ RESTORE_INT_REG (ra, 1, t0) -+ RESTORE_INT_REG (sp, 3, t0) -+ RESTORE_INT_REG (a0, 4, t0) -+ RESTORE_INT_REG (a1, 5, t0) -+ RESTORE_INT_REG (a2, 6, t0) -+ RESTORE_INT_REG (a3, 7, t0) -+ RESTORE_INT_REG (a4, 8, t0) -+ RESTORE_INT_REG (a5, 9, t0) -+ RESTORE_INT_REG (a6, 10, t0) -+ RESTORE_INT_REG (a7, 11, t0) -+ RESTORE_INT_REG (x, 21, t0) -+ RESTORE_INT_REG (fp, 22, t0) -+ RESTORE_INT_REG (s0, 23, t0) -+ RESTORE_INT_REG (s1, 24, t0) -+ RESTORE_INT_REG (s2, 25, t0) -+ RESTORE_INT_REG (s3, 26, t0) -+ RESTORE_INT_REG (s4, 27, t0) -+ RESTORE_INT_REG (s5, 28, t0) -+ RESTORE_INT_REG (s6, 29, t0) -+ RESTORE_INT_REG (s7, 30, t0) -+ RESTORE_INT_REG (s8, 31, t0) -+ ld.d t1, t0, MCONTEXT_PC -+ -+ jirl $r0, t1, 0 -+ -+ -+99: -+ addi.d sp, sp, 16 -+ b __syscall_error -+ -+PSEUDO_END (__swapcontext) -+ -+weak_alias (__swapcontext, swapcontext) -diff --git a/sysdeps/unix/sysv/linux/loongarch/sys/procfs.h b/sysdeps/unix/sysv/linux/loongarch/sys/procfs.h -new file mode 100644 -index 00000000..9ae06b40 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/sys/procfs.h -@@ -0,0 +1,122 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _SYS_PROCFS_H -+#define _SYS_PROCFS_H 1 -+ -+/* This is somehow modelled after the file of the same name on SysVr4 -+ systems. 
It provides a definition of the core file format for ELF -+ used on Linux. */ -+ -+#include -+#include -+#include -+#include -+#include -+ -+__BEGIN_DECLS -+ -+/* Type for a general-purpose register. */ -+typedef uint64_t elf_greg_t; -+ -+/* And the whole bunch of them. We could have used `struct -+ pt_regs' directly in the typedef, but tradition says that -+ the register set is an array, which does have some peculiar -+ semantics, so leave it that way. */ -+#define ELF_NGREG (sizeof (struct user_regs_struct) / sizeof(elf_greg_t)) -+typedef elf_greg_t elf_gregset_t[ELF_NGREG]; -+ -+#define ELF_NFPREG 34 /* 32 FPRs + 8-byte byte-vec for fcc + 4-byte FCR */ -+typedef union { double d; float f; } elf_fpreg_t; -+typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; -+ -+typedef union { double d[2]; float f[4]; } __attribute__((__aligned__ (16))) elf_lsxregset_t[32]; -+typedef union { double d[4]; float f[8]; } __attribute__((__aligned__ (32))) elf_lasxregset_t[32]; -+ -+struct elf_siginfo -+ { -+ int si_signo; /* Signal number. */ -+ int si_code; /* Extra code. */ -+ int si_errno; /* Errno. */ -+ }; -+ -+/* Definitions to generate Intel SVR4-like core files. These mostly -+ have the same names as the SVR4 types with "elf_" tacked on the -+ front to prevent clashes with linux definitions, and the typedef -+ forms have been avoided. This is mostly like the SVR4 structure, -+ but more Linuxy, with things that Linux does not support and which -+ gdb doesn't really use excluded. Fields present but not used are -+ marked with "XXX". */ -+struct elf_prstatus -+ { -+ struct elf_siginfo pr_info; /* Info associated with signal. */ -+ short int pr_cursig; /* Current signal. */ -+ unsigned long int pr_sigpend; /* Set of pending signals. */ -+ unsigned long int pr_sighold; /* Set of held signals. */ -+ __pid_t pr_pid; -+ __pid_t pr_ppid; -+ __pid_t pr_pgrp; -+ __pid_t pr_sid; -+ struct timeval pr_utime; /* User time. */ -+ struct timeval pr_stime; /* System time. */ -+ struct timeval pr_cutime; /* Cumulative user time. */ -+ struct timeval pr_cstime; /* Cumulative system time. */ -+ elf_gregset_t pr_reg; /* GP registers. */ -+ int pr_fpvalid; /* True if math copro being used. */ -+ }; -+ -+ -+#define ELF_PRARGSZ (80) /* Number of chars for args */ -+ -+struct elf_prpsinfo -+ { -+ char pr_state; /* Numeric process state. */ -+ char pr_sname; /* Char for pr_state. */ -+ char pr_zomb; /* Zombie. */ -+ char pr_nice; /* Nice val. */ -+ unsigned long int pr_flag; /* Flags. */ -+ unsigned int pr_uid; -+ unsigned int pr_gid; -+ int pr_pid, pr_ppid, pr_pgrp, pr_sid; -+ /* Lots missing */ -+ char pr_fname[16]; /* Filename of executable. */ -+ char pr_psargs[ELF_PRARGSZ]; /* Initial part of arg list. */ -+ }; -+ -+/* The rest of this file provides the types for emulation of the -+ Solaris interfaces that should be implemented by -+ users of libthread_db. */ -+ -+/* Addresses. */ -+typedef void *psaddr_t; -+ -+/* Register sets. Linux has different names. */ -+typedef elf_gregset_t prgregset_t; -+typedef elf_fpregset_t prfpregset_t; -+ -+/* We don't have any differences between processes and threads, -+ therefore habe only ine PID type. */ -+typedef __pid_t lwpid_t; -+ -+/* Process status and info. In the end we do provide typedefs for them. 
*/ -+typedef struct elf_prstatus prstatus_t; -+typedef struct elf_prpsinfo prpsinfo_t; -+ -+__END_DECLS -+ -+#endif /* sys/procfs.h */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/sys/ucontext.h b/sysdeps/unix/sysv/linux/loongarch/sys/ucontext.h -new file mode 100644 -index 00000000..e52a46c9 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/sys/ucontext.h -@@ -0,0 +1,81 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+/* Don't rely on this, the interface is currently messed up and may need to -+ be broken to be fixed. */ -+#ifndef _SYS_UCONTEXT_H -+#define _SYS_UCONTEXT_H 1 -+ -+#include -+ -+#include -+#include -+ -+typedef unsigned long int __loongarch_mc_gp_state[32]; -+ -+#ifdef __USE_MISC -+# define LARCH_NGREG 32 -+ -+# define LARCH_REG_RA 1 -+# define LARCH_REG_SP 3 -+# define LARCH_REG_S0 23 -+# define LARCH_REG_S1 24 -+# define LARCH_REG_A0 4 -+# define LARCH_REG_S2 25 -+# define LARCH_REG_NARGS 8 -+ -+typedef unsigned long int greg_t; -+ -+/* Container for all general registers. */ -+typedef __loongarch_mc_gp_state gregset_t; -+ -+/* Container for floating-point state. */ -+typedef union __loongarch_mc_fp_state fpregset_t; -+#endif -+ -+ -+ -+union __loongarch_mc_fp_state { -+ unsigned int __val32[256 / 32]; -+ unsigned long long __val64[256 / 64]; -+}; -+ -+typedef struct mcontext_t { -+ unsigned long long __pc; -+ unsigned long long __gregs[32]; -+ unsigned int __flags; -+ -+ unsigned int __fcsr; -+ unsigned int __vcsr; -+ unsigned long long __fcc; -+ union __loongarch_mc_fp_state __fpregs[32] __attribute__((__aligned__ (32))); -+ -+ unsigned int __reserved; -+} mcontext_t; -+ -+/* Userlevel context. */ -+typedef struct ucontext_t -+ { -+ unsigned long int __uc_flags; -+ struct ucontext_t *uc_link; -+ stack_t uc_stack; -+ mcontext_t uc_mcontext; -+ sigset_t uc_sigmask; -+ } ucontext_t; -+ -+#endif /* sys/ucontext.h */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/sys/user.h b/sysdeps/unix/sysv/linux/loongarch/sys/user.h -new file mode 100644 -index 00000000..f9108350 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/sys/user.h -@@ -0,0 +1,31 @@ -+/* Copyright (C) 2001-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef _SYS_USER_H -+#define _SYS_USER_H 1 -+ -+#include -+ -+struct user_regs_struct -+{ -+ uint64_t gpr[32]; -+ uint64_t pc; -+ uint64_t badvaddr; -+ uint64_t reserved[11]; -+}; -+ -+#endif /* _SYS_USER_H */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/syscall.c b/sysdeps/unix/sysv/linux/loongarch/syscall.c -new file mode 100644 -index 00000000..b06a528e ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/syscall.c -@@ -0,0 +1,36 @@ -+/* Copyright (C) 2020-2021 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+ -+long int -+syscall (long int syscall_number, long int arg1, long int arg2, long int arg3, -+ long int arg4, long int arg5, long int arg6, long int arg7) -+{ -+ long int ret; -+ INTERNAL_SYSCALL_DECL (err); -+ -+ ret = INTERNAL_SYSCALL_NCS (syscall_number, err, 7, arg1, arg2, arg3, arg4, -+ arg5, arg6, arg7); -+ -+ if (INTERNAL_SYSCALL_ERROR_P (ret, err)) -+ return __syscall_error (ret); -+ -+ return ret; -+} -+ -diff --git a/sysdeps/unix/sysv/linux/loongarch/sysdep.S b/sysdeps/unix/sysv/linux/loongarch/sysdep.S -new file mode 100644 -index 00000000..a8094283 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/sysdep.S -@@ -0,0 +1,52 @@ -+/* syscall error handlers -+ Copyright (C) 2011-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#include -+ -+#if IS_IN (libc) -+# define errno __libc_errno -+#endif -+ -+ENTRY (__syscall_error) -+ /* Fall through to __syscall_set_errno. */ -+END (__syscall_error) -+ -+/* Non-standard calling convention: argument in a0, return address in t0, -+ and clobber only t1. */ -+ENTRY (__syscall_set_errno) -+ /* We got here because a0 < 0, but only codes in the range [-4095, -1] -+ represent errors. Otherwise, just return the result normally. 
*/ -+ -+ li.d t1, -4096 -+ bgeu t1, a0, L (out) -+ sub.w a0, zero, a0 -+ -+#if RTLD_PRIVATE_ERRNO -+ la t1, rtld_errno -+#elif defined(__PIC__) -+ la.tls.ie t1, errno -+ add.d t1, tp, t1 -+#else -+ la.tls.le t1, errno -+ add.d t1, tp, t1 -+#endif -+ st.w a0, t1, 0 -+ li.d a0, -1 -+L (out): -+ ret -+END (__syscall_set_errno) -diff --git a/sysdeps/unix/sysv/linux/loongarch/sysdep.h b/sysdeps/unix/sysv/linux/loongarch/sysdep.h -new file mode 100644 -index 00000000..f50946d4 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/sysdep.h -@@ -0,0 +1,333 @@ -+#ifndef _LINUX_LOONGARCH_SYSDEP_H -+#define _LINUX_LOONGARCH_SYSDEP_H 1 -+ -+#include -+#include -+ -+#ifdef __ASSEMBLER__ -+ -+# include -+# define ret jirl zero, ra, 0 -+# define L(label) .L ## label -+ -+/* Performs a system call, handling errors by setting errno. Linux indicates -+ errors by setting a0 to a value between -1 and -4095. */ -+# undef PSEUDO -+# define PSEUDO(name, syscall_name, args) \ -+ ENTRY (name); \ -+ li.d a7, SYS_ify (syscall_name); \ -+ syscall 0; \ -+ li.d a7, -4096; \ -+ bltu a7, a0, .Lsyscall_error ## name; -+ -+# undef PSEUDO_END -+# define PSEUDO_END(sym) \ -+ SYSCALL_ERROR_HANDLER (sym); \ -+ ret; \ -+ END (sym); -+ -+# if !IS_IN (libc) -+# if RTLD_PRIVATE_ERRNO -+ -+# define SYSCALL_ERROR_HANDLER(name) \ -+.Lsyscall_error ## name: \ -+ la t0, rtld_errno; \ -+ sub.w a0, zero, a0; \ -+ st.w a0, t0, 0; \ -+ li.d a0, -1; -+ -+# else -+ -+# define SYSCALL_ERROR_HANDLER(name) \ -+.Lsyscall_error ## name: \ -+ la.tls.ie t0, errno; \ -+ add.d t0, tp, t0; \ -+ sub.w a0, zero, a0; \ -+ st.w a0, t0, 0; \ -+ li.d a0, -1; -+ -+# endif -+# else -+ -+# define SYSCALL_ERROR_HANDLER(name) \ -+.Lsyscall_error ## name: \ -+ b __syscall_error; -+ -+# endif -+ -+/* Performs a system call, not setting errno. */ -+# undef PSEUDO_NEORRNO -+# define PSEUDO_NOERRNO(name, syscall_name, args) \ -+ ENTRY (name); \ -+ li.d a7, SYS_ify (syscall_name); \ -+ syscall 0; -+ -+# undef PSEUDO_END_NOERRNO -+# define PSEUDO_END_NOERRNO(name) \ -+ END (name); -+ -+# undef ret_NOERRNO -+# define ret_NOERRNO ret -+ -+/* Perfroms a system call, returning the error code. */ -+# undef PSEUDO_ERRVAL -+# define PSEUDO_ERRVAL(name, syscall_name, args) \ -+ PSEUDO_NOERRNO (name, syscall_name, args); \ -+ slli.d a0, a0, 32; \ -+ srai.d a0, a0, 32; /* sign_ext */ \ -+ sub.d a0, zero, a0; -+ -+# undef PSEUDO_END_ERRVAL -+# define PSEUDO_END_ERRVAL(name) \ -+ END (name); -+ -+# undef ret_ERRVAL -+# define ret_ERRVAL ret -+ -+#endif /* __ASSEMBLER__ */ -+ -+/* In order to get __set_errno() definition in INLINE_SYSCALL. */ -+#ifndef __ASSEMBLER__ -+# include -+#endif -+ -+#include -+ -+#undef SYS_ify -+#define SYS_ify(syscall_name) __NR_##syscall_name -+ -+#ifndef __ASSEMBLER__ -+ -+/* List of system calls which are supported as vsyscalls. */ -+# define HAVE_CLOCK_GETRES_VSYSCALL 1 -+# define HAVE_CLOCK_GETTIME_VSYSCALL 1 -+# define HAVE_GETTIMEOFDAY_VSYSCALL 1 -+# define HAVE_GETCPU_VSYSCALL 1 -+ -+/* Define a macro which expands into the inline wrapper code for a system -+ call. */ -+# undef INLINE_SYSCALL -+# define INLINE_SYSCALL(name, nr, args...) 
\ -+ ({ INTERNAL_SYSCALL_DECL (err); \ -+ long int __sys_result = INTERNAL_SYSCALL (name, err, nr, args); \ -+ if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (__sys_result, ))) \ -+ { \ -+ __set_errno (INTERNAL_SYSCALL_ERRNO (__sys_result, )); \ -+ __sys_result = (unsigned long) -1; \ -+ } \ -+ __sys_result; }) -+ -+ -+# define INTERNAL_SYSCALL_DECL(err) do { } while (0) -+ -+# define INTERNAL_SYSCALL_ERROR_P(val, err) \ -+ ((unsigned long int) (val) > -4096UL) -+ -+# define INTERNAL_SYSCALL_ERRNO(val, err) (-(val)) -+ -+# define INTERNAL_SYSCALL(name, err, nr, args...) \ -+ internal_syscall##nr (SYS_ify (name), err, args) -+ -+# define INTERNAL_SYSCALL_NCS(number, err, nr, args...) \ -+ internal_syscall##nr (number, err, args) -+ -+# define internal_syscall0(number, err, dummy...) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0"); \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "=r" (__a0) \ -+ : "r" (__a7) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define internal_syscall1(number, err, arg0) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ long int _arg0 = (long int) (arg0); \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0") = _arg0; \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "+r" (__a0) \ -+ : "r" (__a7) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define internal_syscall2(number, err, arg0, arg1) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ long int _arg0 = (long int) (arg0); \ -+ long int _arg1 = (long int) (arg1); \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0") = _arg0; \ -+ register long int __a1 asm ("$a1") = _arg1; \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "+r" (__a0) \ -+ : "r" (__a7), "r" (__a1) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define internal_syscall3(number, err, arg0, arg1, arg2) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ long int _arg0 = (long int) (arg0); \ -+ long int _arg1 = (long int) (arg1); \ -+ long int _arg2 = (long int) (arg2); \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0") = _arg0; \ -+ register long int __a1 asm ("$a1") = _arg1; \ -+ register long int __a2 asm ("$a2") = _arg2; \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "+r" (__a0) \ -+ : "r" (__a7), "r" (__a1), "r" (__a2) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define internal_syscall4(number, err, arg0, arg1, arg2, arg3) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ long int _arg0 = (long int) (arg0); \ -+ long int _arg1 = (long int) (arg1); \ -+ long int _arg2 = (long int) (arg2); \ -+ long int _arg3 = (long int) (arg3); \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0") = _arg0; \ -+ register long int __a1 asm ("$a1") = _arg1; \ -+ register long int __a2 asm ("$a2") = _arg2; \ -+ register long int __a3 asm ("$a3") = _arg3; \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "+r" (__a0) \ -+ : "r" (__a7), "r" (__a1), "r" (__a2), "r" (__a3) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define internal_syscall5(number, err, arg0, arg1, arg2, arg3, arg4) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ long int _arg0 = (long int) (arg0); \ -+ long int _arg1 = (long 
int) (arg1); \ -+ long int _arg2 = (long int) (arg2); \ -+ long int _arg3 = (long int) (arg3); \ -+ long int _arg4 = (long int) (arg4); \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0") = _arg0; \ -+ register long int __a1 asm ("$a1") = _arg1; \ -+ register long int __a2 asm ("$a2") = _arg2; \ -+ register long int __a3 asm ("$a3") = _arg3; \ -+ register long int __a4 asm ("$a4") = _arg4; \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "+r" (__a0) \ -+ : "r" (__a7), "r"(__a1), "r"(__a2), "r"(__a3), "r" (__a4) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define internal_syscall6(number, err, arg0, arg1, arg2, arg3, arg4, arg5) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ long int _arg0 = (long int) (arg0); \ -+ long int _arg1 = (long int) (arg1); \ -+ long int _arg2 = (long int) (arg2); \ -+ long int _arg3 = (long int) (arg3); \ -+ long int _arg4 = (long int) (arg4); \ -+ long int _arg5 = (long int) (arg5); \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0") = _arg0; \ -+ register long int __a1 asm ("$a1") = _arg1; \ -+ register long int __a2 asm ("$a2") = _arg2; \ -+ register long int __a3 asm ("$a3") = _arg3; \ -+ register long int __a4 asm ("$a4") = _arg4; \ -+ register long int __a5 asm ("$a5") = _arg5; \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "+r" (__a0) \ -+ : "r" (__a7), "r" (__a1), "r" (__a2), "r" (__a3), \ -+ "r" (__a4), "r" (__a5) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define internal_syscall7(number, err, arg0, arg1, arg2, arg3, arg4, arg5, arg6) \ -+({ \ -+ long int _sys_result; \ -+ \ -+ { \ -+ long int _arg0 = (long int) (arg0); \ -+ long int _arg1 = (long int) (arg1); \ -+ long int _arg2 = (long int) (arg2); \ -+ long int _arg3 = (long int) (arg3); \ -+ long int _arg4 = (long int) (arg4); \ -+ long int _arg5 = (long int) (arg5); \ -+ long int _arg6 = (long int) (arg6); \ -+ register long int __a7 asm ("$a7") = number; \ -+ register long int __a0 asm ("$a0") = _arg0; \ -+ register long int __a1 asm ("$a1") = _arg1; \ -+ register long int __a2 asm ("$a2") = _arg2; \ -+ register long int __a3 asm ("$a3") = _arg3; \ -+ register long int __a4 asm ("$a4") = _arg4; \ -+ register long int __a5 asm ("$a5") = _arg5; \ -+ register long int __a6 asm ("$a6") = _arg6; \ -+ __asm__ volatile ( \ -+ "syscall 0\n\t" \ -+ : "+r" (__a0) \ -+ : "r" (__a7), "r" (__a1), "r" (__a2), "r" (__a3), \ -+ "r" (__a4), "r" (__a5), "r" (__a6) \ -+ : __SYSCALL_CLOBBERS); \ -+ _sys_result = __a0; \ -+ } \ -+ _sys_result; \ -+}) -+ -+# define __SYSCALL_CLOBBERS \ -+ "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t8",\ -+ "memory" -+ -+extern long int __syscall_error (long int neg_errno); -+ -+#endif /* ! __ASSEMBLER__ */ -+ -+/* Pointer mangling is not supported. */ -+#define PTR_MANGLE(var) (void) (var) -+#define PTR_DEMANGLE(var) (void) (var) -+ -+#endif /* linux/loongarch/sysdep.h */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/ucontext-macros.h b/sysdeps/unix/sysv/linux/loongarch/ucontext-macros.h -new file mode 100644 -index 00000000..abd22247 ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/ucontext-macros.h -@@ -0,0 +1,44 @@ -+/* Macros for ucontext routines. -+ Copyright (C) 2017-2018 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library. If not, see -+ . */ -+ -+#ifndef _LINUX_LOONGARCH_UCONTEXT_MACROS_H -+#define _LINUX_LOONGARCH_UCONTEXT_MACROS_H -+ -+#include -+#include -+ -+#include "ucontext_i.h" -+ -+#define SAVE_FP_REG(name, num, base) \ -+ FREG_S name, base, ((num) * SZFREG + MCONTEXT_FPREGS) -+ -+#define RESTORE_FP_REG(name, num, base) \ -+ FREG_L name, base, ((num) * SZFREG + MCONTEXT_FPREGS) -+ -+#define SAVE_INT_REG(name, num, base) \ -+ REG_S name, base, ((num) * SZREG + MCONTEXT_GREGS) -+ -+#define RESTORE_INT_REG(name, num, base) \ -+ REG_L name, base, ((num) * SZREG + MCONTEXT_GREGS) -+ -+#define SAVE_REG(name, offset, base) \ -+ REG_S name, base, (offset) -+ -+#define RESTORE_REG(name, offset, base) \ -+ REG_L name, base, (offset) -+#endif /* _LINUX_LOONGARCH_UCONTEXT_MACROS_H */ -diff --git a/sysdeps/unix/sysv/linux/loongarch/ucontext_i.sym b/sysdeps/unix/sysv/linux/loongarch/ucontext_i.sym -new file mode 100644 -index 00000000..d7f612fe ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/ucontext_i.sym -@@ -0,0 +1,33 @@ -+#include -+#include -+#include -+#include -+ -+-- Constants used by the rt_sigprocmask call. -+ -+SIG_BLOCK -+SIG_SETMASK -+ -+_NSIG8 (_NSIG / 8) -+ -+-- Offsets of the fields in the ucontext_t structure. -+#define ucontext(member) offsetof (ucontext_t, member) -+#define stack(member) ucontext (uc_stack.member) -+#define mcontext(member) ucontext (uc_mcontext.member) -+ -+UCONTEXT_FLAGS ucontext (__uc_flags) -+UCONTEXT_LINK ucontext (uc_link) -+UCONTEXT_STACK ucontext (uc_stack) -+UCONTEXT_MCONTEXT ucontext (uc_mcontext) -+UCONTEXT_SIGMASK ucontext (uc_sigmask) -+ -+STACK_SP stack (ss_sp) -+STACK_SIZE stack (ss_size) -+STACK_FLAGS stack (ss_flags) -+ -+MCONTEXT_PC mcontext (__pc) -+MCONTEXT_FCSR mcontext (__fcsr) -+MCONTEXT_GREGS mcontext (__gregs) -+MCONTEXT_FPREGS mcontext (__fpregs) -+ -+UCONTEXT_SIZE sizeof (ucontext_t) -diff --git a/sysdeps/unix/sysv/linux/loongarch/vfork.S b/sysdeps/unix/sysv/linux/loongarch/vfork.S -new file mode 100644 -index 00000000..83cf141f ---- /dev/null -+++ b/sysdeps/unix/sysv/linux/loongarch/vfork.S -@@ -0,0 +1,49 @@ -+/* Copyright (C) 1999-2018 Free Software Foundation, Inc. -+ -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public License as -+ published by the Free Software Foundation; either version 2.1 of the -+ License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+#include -+#define _ERRNO_H 1 -+#include -+ -+/* Clone the calling process, but without copying the whole address space. -+ The calling process is suspended until the new process exits or is -+ replaced by a call to `execve'. Return -1 for errors, 0 to the new process, -+ and the process ID of the new process to the old process. */ -+ -+ENTRY (__vfork) -+ -+ -+ li.d a0, 0x4111 /* CLONE_VM | CLONE_VFORK | SIGCHLD */ -+ add.d a1, zero, sp -+ -+ /* Do the system call. */ -+ li.d a7, __NR_clone -+ syscall 0 -+ -+ blt a0, zero ,L (error) -+ -+ ret -+ -+L (error): -+ b __syscall_error -+ END (__vfork) -+ -+libc_hidden_def (__vfork) -+ -+weak_alias (__vfork, vfork) -+strong_alias (__vfork, __libc_vfork) --- -2.39.3 - diff --git a/glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch b/glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch deleted file mode 100644 index b6fbf73..0000000 --- a/glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch +++ /dev/null @@ -1,183 +0,0 @@ -From b9f145df85145506f8e61bac38b792584a38d88f Mon Sep 17 00:00:00 2001 -From: Krzysztof Koch -Date: Tue, 5 Nov 2019 17:35:18 +0000 -Subject: [PATCH 02/14] aarch64: Increase small and medium cases for - __memcpy_generic - -Increase the upper bound on medium cases from 96 to 128 bytes. -Now, up to 128 bytes are copied unrolled. - -Increase the upper bound on small cases from 16 to 32 bytes so that -copies of 17-32 bytes are not impacted by the larger medium case. - -Benchmarking: -The attached figures show relative timing difference with respect -to 'memcpy_generic', which is the existing implementation. -'memcpy_med_128' denotes the the version of memcpy_generic with -only the medium case enlarged. The 'memcpy_med_128_small_32' numbers -are for the version of memcpy_generic submitted in this patch, which -has both medium and small cases enlarged. The figures were generated -using the script from: -https://www.sourceware.org/ml/libc-alpha/2019-10/msg00563.html - -Depending on the platform, the performance improvement in the -bench-memcpy-random.c benchmark ranges from 6% to 20% between -the original and final version of memcpy.S - -Tested against GLIBC testsuite and randomized tests. ---- - sysdeps/aarch64/memcpy.S | 82 +++++++++++++++++++++++----------------- - 1 file changed, 47 insertions(+), 35 deletions(-) - -diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S -index 6e4f4a74bd..10801aa0f4 100644 ---- a/sysdeps/aarch64/memcpy.S -+++ b/sysdeps/aarch64/memcpy.S -@@ -41,17 +41,19 @@ - #define C_h x11 - #define D_l x12 - #define D_h x13 --#define E_l src --#define E_h count --#define F_l srcend --#define F_h dst -+#define E_l x14 -+#define E_h x15 -+#define F_l x16 -+#define F_h x17 - #define G_l count - #define G_h dst -+#define H_l src -+#define H_h srcend - #define tmp1 x14 - --/* Copies are split into 3 main cases: small copies of up to 16 bytes, -- medium copies of 17..96 bytes which are fully unrolled. Large copies -- of more than 96 bytes align the destination and use an unrolled loop -+/* Copies are split into 3 main cases: small copies of up to 32 bytes, -+ medium copies of 33..128 bytes which are fully unrolled. Large copies -+ of more than 128 bytes align the destination and use an unrolled loop - processing 64 bytes per iteration. - In order to share code with memmove, small and medium copies read all - data before writing, allowing any kind of overlap. 
So small, medium -@@ -73,7 +75,7 @@ ENTRY_ALIGN (MEMMOVE, 6) - DELOUSE (2) - - sub tmp1, dstin, src -- cmp count, 96 -+ cmp count, 128 - ccmp tmp1, count, 2, hi - b.lo L(move_long) - -@@ -89,31 +91,39 @@ ENTRY (MEMCPY) - prfm PLDL1KEEP, [src] - add srcend, src, count - add dstend, dstin, count -- cmp count, 16 -- b.ls L(copy16) -- cmp count, 96 -+ cmp count, 32 -+ b.ls L(copy32) -+ cmp count, 128 - b.hi L(copy_long) - -- /* Medium copies: 17..96 bytes. */ -- sub tmp1, count, 1 -+ /* Medium copies: 33..128 bytes. */ - ldp A_l, A_h, [src] -- tbnz tmp1, 6, L(copy96) -- ldp D_l, D_h, [srcend, -16] -- tbz tmp1, 5, 1f - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [srcend, -32] -+ ldp D_l, D_h, [srcend, -16] -+ cmp count, 64 -+ b.hi L(copy128) -+ stp A_l, A_h, [dstin] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] --1: -- stp A_l, A_h, [dstin] - stp D_l, D_h, [dstend, -16] - ret - - .p2align 4 -- /* Small copies: 0..16 bytes. */ --L(copy16): -- cmp count, 8 -+ /* Small copies: 0..32 bytes. */ -+L(copy32): -+ /* 16-32 bytes. */ -+ cmp count, 16 - b.lo 1f -+ ldp A_l, A_h, [src] -+ ldp B_l, B_h, [srcend, -16] -+ stp A_l, A_h, [dstin] -+ stp B_l, B_h, [dstend, -16] -+ ret -+ .p2align 4 -+1: -+ /* 8-15 bytes. */ -+ tbz count, 3, 1f - ldr A_l, [src] - ldr A_h, [srcend, -8] - str A_l, [dstin] -@@ -121,6 +131,7 @@ L(copy16): - ret - .p2align 4 - 1: -+ /* 4-7 bytes. */ - tbz count, 2, 1f - ldr A_lw, [src] - ldr A_hw, [srcend, -4] -@@ -142,24 +153,25 @@ L(copy16): - 2: ret - - .p2align 4 -- /* Copy 64..96 bytes. Copy 64 bytes from the start and -- 32 bytes from the end. */ --L(copy96): -- ldp B_l, B_h, [src, 16] -- ldp C_l, C_h, [src, 32] -- ldp D_l, D_h, [src, 48] -- ldp E_l, E_h, [srcend, -32] -- ldp F_l, F_h, [srcend, -16] -+ /* Copy 65..128 bytes. Copy 64 bytes from the start and -+ 64 bytes from the end. */ -+L(copy128): -+ ldp E_l, E_h, [src, 32] -+ ldp F_l, F_h, [src, 48] -+ ldp G_l, G_h, [srcend, -64] -+ ldp H_l, H_h, [srcend, -48] - stp A_l, A_h, [dstin] - stp B_l, B_h, [dstin, 16] -- stp C_l, C_h, [dstin, 32] -- stp D_l, D_h, [dstin, 48] -- stp E_l, E_h, [dstend, -32] -- stp F_l, F_h, [dstend, -16] -+ stp E_l, E_h, [dstin, 32] -+ stp F_l, F_h, [dstin, 48] -+ stp G_l, G_h, [dstend, -64] -+ stp H_l, H_h, [dstend, -48] -+ stp C_l, C_h, [dstend, -32] -+ stp D_l, D_h, [dstend, -16] - ret - - /* Align DST to 16 byte alignment so that we don't cross cache line -- boundaries on both loads and stores. There are at least 96 bytes -+ boundaries on both loads and stores. There are at least 128 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ - -@@ -215,7 +227,7 @@ L(move_long): - add dstend, dstin, count - - /* Align dstend to 16 byte alignment so that we don't cross cache line -- boundaries on both loads and stores. There are at least 96 bytes -+ boundaries on both loads and stores. There are at least 128 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ - --- -2.39.3 - diff --git a/glibc-elf-Align-argument-of-__munmap-to-page-size-BZ-28676-3.patch b/glibc-elf-Align-argument-of-__munmap-to-page-size-BZ-28676-3.patch deleted file mode 100644 index 11417e6..0000000 --- a/glibc-elf-Align-argument-of-__munmap-to-page-size-BZ-28676-3.patch +++ /dev/null @@ -1,36 +0,0 @@ -From bf126f79dff0370d1e52ef8193da7fd593c37833 Mon Sep 17 00:00:00 2001 -From: "H.J. 
Lu" -Date: Wed, 19 Jul 2023 23:10:48 +0800 -Subject: [PATCH 4/6] elf: Align argument of __munmap to page size [BZ #28676] - -On Linux/x86-64, for elf/tst-align3, we now get - -munmap(0x7f88f9401000, 1126424) = 0 - -instead of - -munmap(0x7f1615200018, 544768) = -1 EINVAL (Invalid argument) - -Backport from master commit: fd6062e - -Reviewed-by: Florian Weimer -Signed-off-by: Rongwei Wang ---- - elf/dl-map-segments.h | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/elf/dl-map-segments.h b/elf/dl-map-segments.h -index 61ba04cd..f1f7ad88 100644 ---- a/elf/dl-map-segments.h -+++ b/elf/dl-map-segments.h -@@ -55,6 +55,7 @@ _dl_map_segment (const struct loadcmd *c, ElfW(Addr) mappref, - if (delta) - __munmap ((void *) map_start, delta); - ElfW(Addr) map_end = map_start_aligned + maplength; -+ map_end = ALIGN_UP (map_end, GLRO(dl_pagesize)); - delta = map_start + maplen - map_end; - if (delta) - __munmap ((void *) map_end, delta); --- -2.27.0 - diff --git a/glibc-elf-Fix-tst-align3.patch b/glibc-elf-Fix-tst-align3.patch deleted file mode 100644 index 194d142..0000000 --- a/glibc-elf-Fix-tst-align3.patch +++ /dev/null @@ -1,40 +0,0 @@ -From 8b39d3b4bf2fc49ab31f31cf30aa80104afa3432 Mon Sep 17 00:00:00 2001 -From: Adhemerval Zanella -Date: Wed, 19 Jul 2023 23:14:33 +0800 -Subject: [PATCH 6/6] elf: Fix tst-align3 - -The elf/tst-align3.c declares the function using a wrong prototype. - -Checked on aarch64-linux-gnu. - -Signed-off-by: Rongwei Wang ---- - elf/tst-align3.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/elf/tst-align3.c b/elf/tst-align3.c -index 87a8ff81..731dd59f 100644 ---- a/elf/tst-align3.c -+++ b/elf/tst-align3.c -@@ -22,7 +22,7 @@ - - int bar __attribute__ ((aligned (ALIGN))) = 1; - --extern int do_load_test (void); -+extern void do_load_test (void); - - static int - do_test (void) -@@ -30,7 +30,8 @@ do_test (void) - printf ("bar: %p\n", &bar); - TEST_VERIFY (is_aligned (&bar, ALIGN) == 0); - -- return do_load_test (); -+ do_load_test (); -+ return 0; - } - - #include --- -2.27.0 - diff --git a/glibc-elf-Properly-align-PT_LOAD-segments-BZ-28676-1.patch b/glibc-elf-Properly-align-PT_LOAD-segments-BZ-28676-1.patch deleted file mode 100644 index a1b6c6b..0000000 --- a/glibc-elf-Properly-align-PT_LOAD-segments-BZ-28676-1.patch +++ /dev/null @@ -1,137 +0,0 @@ -From fe5893121176136b0ae3a5f9198536feeb6f64f8 Mon Sep 17 00:00:00 2001 -From: Rongwei Wang -Date: Wed, 19 Jul 2023 23:05:39 +0800 -Subject: [PATCH 2/6] elf: Properly align PT_LOAD segments [BZ #28676] - -When PT_LOAD segment alignment > the page size, allocate enough space to -ensure that the segment can be properly aligned. This change helps code -segments use huge pages become simple and available. - -This fixes [BZ #28676]. - -Backport from master commit: 718fdd8 - -Signed-off-by: Xu Yu -Signed-off-by: Rongwei Wang ---- - elf/dl-load.c | 2 ++ - elf/dl-load.h | 3 ++- - elf/dl-map-segments.h | 50 +++++++++++++++++++++++++++++++++++++++---- - 3 files changed, 50 insertions(+), 5 deletions(-) - -diff --git a/elf/dl-load.c b/elf/dl-load.c -index 0b45e6e3..132e4233 100644 ---- a/elf/dl-load.c -+++ b/elf/dl-load.c -@@ -1,5 +1,6 @@ - /* Map in a shared object's segments from the file. - Copyright (C) 1995-2018 Free Software Foundation, Inc. -+ Copyright The GNU Toolchain Authors. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or -@@ -1076,6 +1077,7 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd, - c->mapend = ALIGN_UP (ph->p_vaddr + ph->p_filesz, GLRO(dl_pagesize)); - c->dataend = ph->p_vaddr + ph->p_filesz; - c->allocend = ph->p_vaddr + ph->p_memsz; -+ c->mapalign = ph->p_align; - c->mapoff = ALIGN_DOWN (ph->p_offset, GLRO(dl_pagesize)); - - /* Determine whether there is a gap between the last segment -diff --git a/elf/dl-load.h b/elf/dl-load.h -index 66ea2e92..d9f648ea 100644 ---- a/elf/dl-load.h -+++ b/elf/dl-load.h -@@ -1,5 +1,6 @@ - /* Map in a shared object's segments from the file. - Copyright (C) 1995-2018 Free Software Foundation, Inc. -+ Copyright The GNU Toolchain Authors. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or -@@ -74,7 +75,7 @@ ELF_PREFERRED_ADDRESS_DATA; - Its details have been expanded out and converted. */ - struct loadcmd - { -- ElfW(Addr) mapstart, mapend, dataend, allocend; -+ ElfW(Addr) mapstart, mapend, dataend, allocend, mapalign; - ElfW(Off) mapoff; - int prot; /* PROT_* bits. */ - }; -diff --git a/elf/dl-map-segments.h b/elf/dl-map-segments.h -index 084076a2..61ba04cd 100644 ---- a/elf/dl-map-segments.h -+++ b/elf/dl-map-segments.h -@@ -1,5 +1,6 @@ - /* Map in a shared object's segments. Generic version. - Copyright (C) 1995-2018 Free Software Foundation, Inc. -+ Copyright The GNU Toolchain Authors. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or -@@ -18,6 +19,50 @@ - - #include - -+/* Map a segment and align it properly. */ -+ -+static __always_inline ElfW(Addr) -+_dl_map_segment (const struct loadcmd *c, ElfW(Addr) mappref, -+ const size_t maplength, int fd) -+{ -+ if (__glibc_likely (c->mapalign <= GLRO(dl_pagesize))) -+ return (ElfW(Addr)) __mmap ((void *) mappref, maplength, c->prot, -+ MAP_COPY|MAP_FILE, fd, c->mapoff); -+ -+ /* If the segment alignment > the page size, allocate enough space to -+ ensure that the segment can be properly aligned. */ -+ ElfW(Addr) maplen = (maplength >= c->mapalign -+ ? (maplength + c->mapalign) -+ : (2 * c->mapalign)); -+ ElfW(Addr) map_start = (ElfW(Addr)) __mmap ((void *) mappref, maplen, -+ PROT_NONE, -+ MAP_ANONYMOUS|MAP_PRIVATE, -+ -1, 0); -+ if (__glibc_unlikely ((void *) map_start == MAP_FAILED)) -+ return map_start; -+ -+ ElfW(Addr) map_start_aligned = ALIGN_UP (map_start, c->mapalign); -+ map_start_aligned = (ElfW(Addr)) __mmap ((void *) map_start_aligned, -+ maplength, c->prot, -+ MAP_COPY|MAP_FILE|MAP_FIXED, -+ fd, c->mapoff); -+ if (__glibc_unlikely ((void *) map_start_aligned == MAP_FAILED)) -+ __munmap ((void *) map_start, maplen); -+ else -+ { -+ /* Unmap the unused regions. */ -+ ElfW(Addr) delta = map_start_aligned - map_start; -+ if (delta) -+ __munmap ((void *) map_start, delta); -+ ElfW(Addr) map_end = map_start_aligned + maplength; -+ delta = map_start + maplen - map_end; -+ if (delta) -+ __munmap ((void *) map_end, delta); -+ } -+ -+ return map_start_aligned; -+} -+ - /* This implementation assumes (as does the corresponding implementation - of _dl_unmap_segments, in dl-unmap-segments.h) that shared objects - are always laid out with all segments contiguous (or with gaps -@@ -53,10 +98,7 @@ _dl_map_segments (struct link_map *l, int fd, - - MAP_BASE_ADDR (l)); - - /* Remember which part of the address space this object uses. 
*/ -- l->l_map_start = (ElfW(Addr)) __mmap ((void *) mappref, maplength, -- c->prot, -- MAP_COPY|MAP_FILE, -- fd, c->mapoff); -+ l->l_map_start = _dl_map_segment (c, mappref, maplength, fd); - if (__glibc_unlikely ((void *) l->l_map_start == MAP_FAILED)) - return DL_MAP_SEGMENTS_ERROR_MAP_SEGMENT; - --- -2.27.0 - diff --git a/glibc.spec b/glibc.spec index b5eb36a..91e70d1 100644 --- a/glibc.spec +++ b/glibc.spec @@ -1,7 +1,6 @@ -%global anolis_release .0.1 %define glibcsrcdir glibc-2.28 %define glibcversion 2.28 -%define glibcrelease 236%{anolis_release}%{?dist} +%define glibcrelease 251%{?dist} # Pre-release tarballs are pulled in from git using a command that is # effectively: # @@ -133,7 +132,7 @@ end \ Summary: The GNU libc libraries Name: glibc Version: %{glibcversion} -Release: %{glibcrelease}.13 +Release: %{glibcrelease}.2 # In general, GPLv2+ is used by programs, LGPLv2+ is used for # libraries. @@ -1054,40 +1053,142 @@ Patch865: glibc-RHEL-2435.patch Patch866: glibc-RHEL-2435-2.patch Patch867: glibc-RHEL-2423.patch Patch868: glibc-RHEL-3036.patch -Patch869: glibc-RHEL-21522-1.patch -Patch870: glibc-RHEL-21522-2.patch -Patch871: glibc-RHEL-21522-3.patch -Patch872: glibc-RHEL-21522-4.patch -Patch873: glibc-RHEL-21519.patch -Patch874: glibc-RHEL-22441.patch -Patch875: glibc-RHEL-22846.patch -Patch876: glibc-RHEL-22847.patch +Patch869: glibc-RHEL-3757.patch +Patch870: glibc-RHEL-2122.patch +Patch871: glibc-RHEL-1192.patch +Patch872: glibc-RHEL-3639.patch +Patch873: glibc-RHEL-10481.patch +Patch874: glibc-RHEL-13720-1.patch +Patch875: glibc-RHEL-13720-2.patch +Patch876: glibc-RHEL-15867.patch +Patch877: glibc-RHEL-16825-1.patch +Patch878: glibc-RHEL-16825-2.patch +Patch879: glibc-RHEL-16825-3.patch +Patch880: glibc-RHEL-16825-4.patch +Patch881: glibc-RHEL-15696-1.patch +Patch882: glibc-RHEL-15696-2.patch +Patch883: glibc-RHEL-15696-3.patch +Patch884: glibc-RHEL-15696-4.patch +Patch885: glibc-RHEL-15696-5.patch +Patch886: glibc-RHEL-15696-6.patch +Patch887: glibc-RHEL-15696-7.patch +Patch888: glibc-RHEL-15696-8.patch +Patch889: glibc-RHEL-15696-9.patch +Patch890: glibc-RHEL-15696-10.patch +Patch891: glibc-RHEL-15696-11.patch +Patch892: glibc-RHEL-15696-12.patch +Patch893: glibc-RHEL-15696-13.patch +Patch894: glibc-RHEL-15696-14.patch +Patch895: glibc-RHEL-15696-15.patch +Patch896: glibc-RHEL-15696-16.patch +Patch897: glibc-RHEL-15696-17.patch +Patch898: glibc-RHEL-15696-18.patch +Patch899: glibc-RHEL-15696-19.patch +Patch900: glibc-RHEL-15696-20.patch +Patch901: glibc-RHEL-15696-21.patch +Patch902: glibc-RHEL-15696-22.patch +Patch903: glibc-RHEL-15696-23.patch +Patch904: glibc-RHEL-15696-24.patch +Patch905: glibc-RHEL-15696-25.patch +Patch906: glibc-RHEL-15696-26.patch +Patch907: glibc-RHEL-15696-27.patch +Patch908: glibc-RHEL-15696-28.patch +Patch909: glibc-RHEL-15696-29.patch +Patch910: glibc-RHEL-15696-30.patch +Patch911: glibc-RHEL-15696-31.patch +Patch912: glibc-RHEL-15696-32.patch +Patch913: glibc-RHEL-15696-33.patch +Patch914: glibc-RHEL-15696-34.patch +Patch915: glibc-RHEL-15696-35.patch +Patch916: glibc-RHEL-15696-36.patch +Patch917: glibc-RHEL-15696-37.patch +Patch918: glibc-RHEL-15696-38.patch +Patch919: glibc-RHEL-15696-39.patch +Patch920: glibc-RHEL-15696-40.patch +Patch921: glibc-RHEL-15696-41.patch +Patch922: glibc-RHEL-15696-42.patch +Patch923: glibc-RHEL-15696-43.patch +Patch924: glibc-RHEL-15696-44.patch +Patch925: glibc-RHEL-15696-45.patch +Patch926: glibc-RHEL-15696-46.patch +Patch927: glibc-RHEL-15696-47.patch +Patch928: glibc-RHEL-15696-48.patch +Patch929: 
glibc-RHEL-15696-49.patch +Patch930: glibc-RHEL-15696-50.patch +Patch931: glibc-RHEL-15696-51.patch +Patch932: glibc-RHEL-15696-52.patch +Patch933: glibc-RHEL-15696-53.patch +Patch934: glibc-RHEL-15696-54.patch +Patch935: glibc-RHEL-15696-55.patch +Patch936: glibc-RHEL-15696-56.patch +Patch937: glibc-RHEL-15696-57.patch +Patch938: glibc-RHEL-15696-58.patch +Patch939: glibc-RHEL-15696-59.patch +Patch940: glibc-RHEL-15696-60.patch +Patch941: glibc-RHEL-15696-61.patch +Patch942: glibc-RHEL-15696-62.patch +Patch943: glibc-RHEL-15696-63.patch +Patch944: glibc-RHEL-15696-64.patch +Patch945: glibc-RHEL-15696-65.patch +Patch946: glibc-RHEL-15696-66.patch +Patch947: glibc-RHEL-15696-67.patch +Patch948: glibc-RHEL-15696-68.patch +Patch949: glibc-RHEL-15696-69.patch +Patch950: glibc-RHEL-15696-70.patch +Patch951: glibc-RHEL-15696-71.patch +Patch952: glibc-RHEL-15696-72.patch +Patch953: glibc-RHEL-15696-73.patch +Patch954: glibc-RHEL-15696-74.patch +Patch955: glibc-RHEL-15696-75.patch +Patch956: glibc-RHEL-15696-76.patch +Patch957: glibc-RHEL-15696-77.patch +Patch958: glibc-RHEL-15696-78.patch +Patch959: glibc-RHEL-15696-79.patch +Patch960: glibc-RHEL-15696-80.patch +Patch961: glibc-RHEL-15696-81.patch +Patch962: glibc-RHEL-15696-82.patch +Patch963: glibc-RHEL-15696-83.patch +Patch964: glibc-RHEL-15696-84.patch +Patch965: glibc-RHEL-15696-85.patch +Patch966: glibc-RHEL-15696-86.patch +Patch967: glibc-RHEL-15696-87.patch +Patch968: glibc-RHEL-15696-88.patch +Patch969: glibc-RHEL-15696-89.patch +Patch970: glibc-RHEL-15696-90.patch +Patch971: glibc-RHEL-15696-91.patch +Patch972: glibc-RHEL-15696-92.patch +Patch973: glibc-RHEL-15696-93.patch +Patch974: glibc-RHEL-15696-94.patch +Patch975: glibc-RHEL-15696-95.patch +Patch976: glibc-RHEL-15696-96.patch +Patch977: glibc-RHEL-15696-97.patch +Patch978: glibc-RHEL-15696-98.patch +Patch979: glibc-RHEL-15696-99.patch +Patch980: glibc-RHEL-15696-100.patch +Patch981: glibc-RHEL-15696-101.patch +Patch982: glibc-RHEL-15696-102.patch +Patch983: glibc-RHEL-15696-103.patch +Patch984: glibc-RHEL-15696-104.patch +Patch985: glibc-RHEL-15696-105.patch +Patch986: glibc-RHEL-15696-106.patch +Patch987: glibc-RHEL-15696-107.patch +Patch988: glibc-RHEL-15696-108.patch +Patch989: glibc-RHEL-15696-109.patch +Patch990: glibc-RHEL-15696-110.patch +Patch991: glibc-RHEL-17468-1.patch +Patch992: glibc-RHEL-17468-2.patch +Patch993: glibc-RHEL-19824.patch +Patch994: glibc-RHEL-3010-1.patch +Patch995: glibc-RHEL-3010-2.patch +Patch996: glibc-RHEL-3010-3.patch +Patch997: glibc-RHEL-19445.patch +Patch998: glibc-RHEL-21997.patch +Patch999: glibc-RHEL-31804.patch +Patch1000: glibc-RHEL-34264.patch +Patch1001: glibc-RHEL-34267-1.patch +Patch1002: glibc-RHEL-34267-2.patch +Patch1003: glibc-RHEL-34273.patch -Patch2000: glibc-Properly-check-stack-alignment-BZ-27901.patch -Patch2001: glibc-elf-Properly-align-PT_LOAD-segments-BZ-28676-1.patch -Patch2002: glibc-Add-a-testcase-to-check-alignment-of-PT_LOAD-segment-2.patch -Patch2003: glibc-elf-Align-argument-of-__munmap-to-page-size-BZ-28676-3.patch -Patch2004: glibc-Support-target-specific-ALIGN-for-variable-alignment-4.patch -Patch2005: glibc-elf-Fix-tst-align3.patch - -Patch2006: glibc-Sync-to-lnd-35-for-LoongArch.patch -Patch2007: Fix-tst-cancel21.c-to-suit-kernel-struct-sigcontext-.patch -Patch2008: glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch -Patch2009: glibc-Add-Hygon-Support.patch -Patch2010: glibc-2.28-Remove-unseless-ANDROID_CHANGES-and-relat.patch -Patch2011: glibc-2.28-use-new-macro-LEAF-and-ENTRY-and-modify-r.patch 
-Patch2012: glibc-2.28-Fix-ifunc-str-mem-functions-xfail-problem.patch -Patch2013: glibc-2.28-Add-run-one-test-convenience-target-and-m.patch -Patch2014: glibc-2.28-remove-ABILPX32-related-code.patch -Patch2015: glibc-2.28-Refactor-code-of-raw-mem-functions.patch -Patch2016: glibc-2.28-Refactor-code-of-st-r-p-functions.patch -Patch2017: glibc-2.28-Add-new-struct-user_fp_state-in-user.h.patch -Patch2018: glibc-2.28-Redefine-macro-LEAF-ENTRY.patch -Patch2019: glibc-2.28-config-Added-HAVE_LOONGARCH_VEC_ASM.patch -Patch2020: glibc-2.28-Add-macro-defination-of-lasx-lsx-and-fcc-.patch -Patch2021: glibc-2.28-Refactor-code-and-fix-bug-in-_dl_runtime_.patch -Patch2022: glibc-2.28-Remove-useless-IS_LA-264-364-464-and-IS_L.patch -Patch2023: glibc-2.28-Use-RTLD_SUPPORT_-LSX-LASX-to-choose-_dl_.patch ############################################################################## # Continued list of core "glibc" package information: ############################################################################## @@ -1767,11 +1868,6 @@ build() %ifarch %{ix86} --disable-multi-arch \ %endif -%ifarch loongarch64 - --enable-stackguard-randomization \ - --with-selinux \ - --enable-shared \ -%endif %if %{without werror} --disable-werror \ %endif @@ -1951,6 +2047,7 @@ gzip -9nvf %{glibc_sysroot}%{_infodir}/libc* # Copy the debugger interface documentation over to the right location mkdir -p %{glibc_sysroot}%{_docdir}/glibc cp elf/rtld-debugger-interface.txt %{glibc_sysroot}%{_docdir}/glibc +cp posix/gai.conf %{glibc_sysroot}%{_docdir}/glibc %else rm -f %{glibc_sysroot}%{_infodir}/dir rm -f %{glibc_sysroot}%{_infodir}/libc.info* @@ -2922,30 +3019,59 @@ fi %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared %changelog -* Sat May 11 2024 Peng Fan - 2.28-236.0.1.13 -- Sync loongarch64 code to lnd.36. +* Fri Apr 26 2024 Florian Weimer - 2.28-251.2 +- CVE-2024-33599: nscd: buffer overflow in netgroup cache (RHEL-34264) +- CVE-2024-33600: nscd: null pointer dereferences in netgroup cache (RHEL-34267) +- CVE-2024-33601: nscd: crash on out-of-memory condition (RHEL-34271) +- CVE-2024-33602: nscd: memory corruption with NSS netgroup modules (RHEL-34273) -* Mon May 06 2024 Rongwei Wang - 2.28-236.0.1.12 -- elf: Properly align PT_LOAD segments -- Sync loongarch64 code to lnd.35. 
(lixing@loongson.cn) -- Add patch for gb18030-2022 from upstream bug#30243 (fundawang@yeah.net) -- aarch64: Increase small and medium cases for __memcpy_generic (bug#7060) (Kaiqiang Wang) -- Add Hygon Support (Jing Li) +* Mon Apr 15 2024 Florian Weimer - 2.28-251.1 +- CVE-2024-2961: Out of bounds write in iconv conversion to ISO-2022-CN-EXT (RHEL-31804) -* Mon Jan 29 2024 Florian Weimer - 2.28-236.12 -- Re-enable output buffering for wide stdio streams (RHEL-22847) +* Thu Jan 18 2024 Florian Weimer - 2.28-251 +- Cache information in x86_64 ld.so --list-diagnostics output (RHEL-21997) -* Mon Jan 29 2024 Florian Weimer - 2.28-236.11 -- Avoid lazy binding failures during dlclose (RHEL-22846) +* Wed Jan 10 2024 Arjun Shankar - 2.28-250 +- getaddrinfo: Return correct error EAI_MEMORY when out-of-memory (RHEL-19445) -* Fri Jan 26 2024 Florian Weimer - 2.28-236.10 -- nscd: Skip unusable entries in first pass in prune_cache (RHEL-22441) +* Wed Jan 3 2024 Florian Weimer - 2.28-249 +- Updates for AMD cache size computation (RHEL-3010) -* Fri Jan 26 2024 Florian Weimer - 2.28-236.9 -- Fix force-first handling in dlclose (RHEL-21519) +* Tue Jan 2 2024 Florian Weimer - 2.28-248 +- Re-enable output buffering for wide stdio streams (RHEL-19824) -* Fri Jan 26 2024 Florian Weimer - 2.28-236.8 -- Improve compatibility between underlinking and IFUNC resolvers (RHEL-21522) +* Thu Dec 21 2023 Carlos O'Donell - 2.28-247 +- Fix TLS corruption during dlopen()/dlclose() sequences (RHEL-17468) + +* Thu Dec 14 2023 DJ Delorie - 2.28-246 +- Include CentOS Hyperscaler SIG patches backported by Intel (RHEL-15696) + +* Fri Dec 8 2023 Florian Weimer - 2.28-245 +- Improve compatibility between underlinking and IFUNC resolvers (RHEL-16825) + +* Fri Nov 24 2023 Florian Weimer - 2.28-244 +- Restore compatibility with C90 compilers (RHEL-15867) + +* Tue Nov 21 2023 Florian Weimer - 2.28-243 +- ldconfig should skip temporary files created by RPM (RHEL-13720) + +* Mon Nov 20 2023 Florian Weimer - 2.28-242 +- Fix force-first handling in dlclose (RHEL-10481) + +* Fri Nov 10 2023 Florian Weimer - 2.28-241 +- Avoid lazy binding failures during dlclose (RHEL-3639) + +* Tue Oct 24 2023 Arjun Shankar - 2.28-240 +- Add /usr/share/doc/glibc/gai.conf to glibc-doc (RHEL-12894) + +* Fri Oct 20 2023 Florian Weimer - 2.28-239 +- nscd: Skip unusable entries in first pass in prune_cache (RHEL-1192) + +* Mon Oct 16 2023 DJ Delorie - 2.28-238 +- Fix slow tls access after dlopen (RHEL-2122) + +* Mon Oct 16 2023 Arjun Shankar - 2.28-237 +- Enable running a single test from the testsuite (RHEL-3757) * Wed Sep 20 2023 Siddhesh Poyarekar - 2.28-236.7 - CVE-2023-4911 glibc: buffer overflow in ld.so leading to privilege escalation (RHEL-3036)