diff --git a/glibc-2.28-Add-macro-defination-of-lasx-lsx-and-fcc-.patch b/glibc-2.28-Add-macro-defination-of-lasx-lsx-and-fcc-.patch new file mode 100644 index 0000000..86f142d --- /dev/null +++ b/glibc-2.28-Add-macro-defination-of-lasx-lsx-and-fcc-.patch @@ -0,0 +1,3946 @@ +From d97d963796b092b9c0bd4712f992a08dd20bf5ed Mon Sep 17 00:00:00 2001 +From: caiyinyu +Date: Tue, 11 Jul 2023 15:40:15 +0800 +Subject: [PATCH 11/14] glibc-2.28: Add macro defination of lasx lsx and fcc + registers. + +Change-Id: Ic723521775a0133e25bf1d568c588f930ec5ff49 +Signed-off-by: ticat_fp +--- + sysdeps/loongarch/dl-trampoline.h | 64 +-- + .../loongarch/lp64/multiarch/memchr-lasx.S | 74 +-- + sysdeps/loongarch/lp64/multiarch/memchr-lsx.S | 48 +- + .../loongarch/lp64/multiarch/memcmp-lasx.S | 138 +++--- + sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S | 194 ++++---- + .../loongarch/lp64/multiarch/memmove-lasx.S | 160 +++---- + .../loongarch/lp64/multiarch/memmove-lsx.S | 424 +++++++++--------- + .../loongarch/lp64/multiarch/memrchr-lasx.S | 74 +-- + .../loongarch/lp64/multiarch/memrchr-lsx.S | 48 +- + .../loongarch/lp64/multiarch/memset-lasx.S | 64 +-- + sysdeps/loongarch/lp64/multiarch/memset-lsx.S | 62 +-- + .../loongarch/lp64/multiarch/rawmemchr-lasx.S | 30 +- + .../loongarch/lp64/multiarch/rawmemchr-lsx.S | 30 +- + sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S | 114 ++--- + .../loongarch/lp64/multiarch/strchr-lasx.S | 52 +-- + sysdeps/loongarch/lp64/multiarch/strchr-lsx.S | 30 +- + sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 114 ++--- + sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 112 ++--- + .../loongarch/lp64/multiarch/strlen-lasx.S | 24 +- + sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 30 +- + .../loongarch/lp64/multiarch/strncmp-lsx.S | 144 +++--- + .../loongarch/lp64/multiarch/strnlen-lasx.S | 46 +- + .../loongarch/lp64/multiarch/strnlen-lsx.S | 30 +- + .../loongarch/lp64/multiarch/strrchr-lasx.S | 88 ++-- + .../loongarch/lp64/multiarch/strrchr-lsx.S | 56 +-- + 
sysdeps/loongarch/lp64/s_cosf.S | 4 +- + sysdeps/loongarch/lp64/s_sinf.S | 4 +- + sysdeps/loongarch/sys/regdef.h | 74 +++ + 28 files changed, 1203 insertions(+), 1129 deletions(-) + +diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h +index fb15983f..96f41f1d 100644 +--- a/sysdeps/loongarch/dl-trampoline.h ++++ b/sysdeps/loongarch/dl-trampoline.h +@@ -61,23 +61,23 @@ ENTRY (_dl_runtime_resolve, 3) + FREG_S fa6, sp, 10*SZREG + 6*SZFREG + FREG_S fa7, sp, 10*SZREG + 7*SZFREG + #ifdef USE_LASX +- xvst $xr0, sp, 10*SZREG + 0*256 +- xvst $xr1, sp, 10*SZREG + 1*256 +- xvst $xr2, sp, 10*SZREG + 2*256 +- xvst $xr3, sp, 10*SZREG + 3*256 +- xvst $xr4, sp, 10*SZREG + 4*256 +- xvst $xr5, sp, 10*SZREG + 5*256 +- xvst $xr6, sp, 10*SZREG + 6*256 +- xvst $xr7, sp, 10*SZREG + 7*256 ++ xvst xr0, sp, 10*SZREG + 0*256 ++ xvst xr1, sp, 10*SZREG + 1*256 ++ xvst xr2, sp, 10*SZREG + 2*256 ++ xvst xr3, sp, 10*SZREG + 3*256 ++ xvst xr4, sp, 10*SZREG + 4*256 ++ xvst xr5, sp, 10*SZREG + 5*256 ++ xvst xr6, sp, 10*SZREG + 6*256 ++ xvst xr7, sp, 10*SZREG + 7*256 + #elif defined USE_LSX +- vst $vr0, sp, 10*SZREG + 0*128 +- vst $vr1, sp, 10*SZREG + 1*128 +- vst $vr2, sp, 10*SZREG + 2*128 +- vst $vr3, sp, 10*SZREG + 3*128 +- vst $vr4, sp, 10*SZREG + 4*128 +- vst $vr5, sp, 10*SZREG + 5*128 +- vst $vr6, sp, 10*SZREG + 6*128 +- vst $vr7, sp, 10*SZREG + 7*128 ++ vst vr0, sp, 10*SZREG + 0*128 ++ vst vr1, sp, 10*SZREG + 1*128 ++ vst vr2, sp, 10*SZREG + 2*128 ++ vst vr3, sp, 10*SZREG + 3*128 ++ vst vr4, sp, 10*SZREG + 4*128 ++ vst vr5, sp, 10*SZREG + 5*128 ++ vst vr6, sp, 10*SZREG + 6*128 ++ vst vr7, sp, 10*SZREG + 7*128 + #endif + #endif + +@@ -119,23 +119,23 @@ ENTRY (_dl_runtime_resolve, 3) + FREG_L fa6, sp, 10*SZREG + 6*SZFREG + FREG_L fa7, sp, 10*SZREG + 7*SZFREG + #ifdef USE_LASX +- xvld $xr0, sp, 10*SZREG + 0*256 +- xvld $xr1, sp, 10*SZREG + 1*256 +- xvld $xr2, sp, 10*SZREG + 2*256 +- xvld $xr3, sp, 10*SZREG + 3*256 +- xvld $xr4, sp, 10*SZREG + 4*256 +- xvld $xr5, sp, 
10*SZREG + 5*256 +- xvld $xr6, sp, 10*SZREG + 6*256 +- xvld $xr7, sp, 10*SZREG + 7*256 ++ xvld xr0, sp, 10*SZREG + 0*256 ++ xvld xr1, sp, 10*SZREG + 1*256 ++ xvld xr2, sp, 10*SZREG + 2*256 ++ xvld xr3, sp, 10*SZREG + 3*256 ++ xvld xr4, sp, 10*SZREG + 4*256 ++ xvld xr5, sp, 10*SZREG + 5*256 ++ xvld xr6, sp, 10*SZREG + 6*256 ++ xvld xr7, sp, 10*SZREG + 7*256 + #elif defined USE_LSX +- vld $vr0, sp, 10*SZREG + 0*128 +- vld $vr1, sp, 10*SZREG + 1*128 +- vld $vr2, sp, 10*SZREG + 2*128 +- vld $vr3, sp, 10*SZREG + 3*128 +- vld $vr4, sp, 10*SZREG + 4*128 +- vld $vr5, sp, 10*SZREG + 5*128 +- vld $vr6, sp, 10*SZREG + 6*128 +- vld $vr7, sp, 10*SZREG + 7*128 ++ vld vr0, sp, 10*SZREG + 0*128 ++ vld vr1, sp, 10*SZREG + 1*128 ++ vld vr2, sp, 10*SZREG + 2*128 ++ vld vr3, sp, 10*SZREG + 3*128 ++ vld vr4, sp, 10*SZREG + 4*128 ++ vld vr5, sp, 10*SZREG + 5*128 ++ vld vr6, sp, 10*SZREG + 6*128 ++ vld vr7, sp, 10*SZREG + 7*128 + #endif + #endif + +diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S +index 387a35fe..425fcede 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S +@@ -17,28 +17,28 @@ LEAF(MEMCHR, 6) + andi t0, a0, 0x3f + bstrins.d a0, zero, 5, 0 + +- xvld $xr0, a0, 0 +- xvld $xr1, a0, 32 ++ xvld xr0, a0, 0 ++ xvld xr1, a0, 32 + li.d t1, -1 + li.d t2, 64 + +- xvreplgr2vr.b $xr2, a1 ++ xvreplgr2vr.b xr2, a1 + sll.d t3, t1, t0 + sub.d t2, t2, t0 +- xvseq.b $xr0, $xr0, $xr2 ++ xvseq.b xr0, xr0, xr2 + +- xvseq.b $xr1, $xr1, $xr2 +- xvmsknz.b $xr0, $xr0 +- xvmsknz.b $xr1, $xr1 +- xvpickve.w $xr3, $xr0, 4 ++ xvseq.b xr1, xr1, xr2 ++ xvmsknz.b xr0, xr0 ++ xvmsknz.b xr1, xr1 ++ xvpickve.w xr3, xr0, 4 + + +- xvpickve.w $xr4, $xr1, 4 +- vilvl.h $vr0, $vr3, $vr0 +- vilvl.h $vr1, $vr4, $vr1 +- vilvl.w $vr0, $vr1, $vr0 ++ xvpickve.w xr4, xr1, 4 ++ vilvl.h vr0, vr3, vr0 ++ vilvl.h vr1, vr4, vr1 ++ vilvl.w vr0, vr1, vr0 + +- movfr2gr.d t0, $f0 ++ movfr2gr.d t0, fa0 + and 
t0, t0, t3 + bgeu t2, a2, L(end) + bnez t0, L(found) +@@ -46,28 +46,28 @@ LEAF(MEMCHR, 6) + addi.d a4, a3, -1 + bstrins.d a4, zero, 5, 0 + L(loop): +- xvld $xr0, a0, 64 +- xvld $xr1, a0, 96 ++ xvld xr0, a0, 64 ++ xvld xr1, a0, 96 + + addi.d a0, a0, 64 +- xvseq.b $xr0, $xr0, $xr2 +- xvseq.b $xr1, $xr1, $xr2 ++ xvseq.b xr0, xr0, xr2 ++ xvseq.b xr1, xr1, xr2 + beq a0, a4, L(out) + + +- xvmax.bu $xr3, $xr0, $xr1 +- xvseteqz.v $fcc0, $xr3 +- bcnez $fcc0, L(loop) +- xvmsknz.b $xr0, $xr0 ++ xvmax.bu xr3, xr0, xr1 ++ xvseteqz.v fcc0, xr3 ++ bcnez fcc0, L(loop) ++ xvmsknz.b xr0, xr0 + +- xvmsknz.b $xr1, $xr1 +- xvpickve.w $xr3, $xr0, 4 +- xvpickve.w $xr4, $xr1, 4 +- vilvl.h $vr0, $vr3, $vr0 ++ xvmsknz.b xr1, xr1 ++ xvpickve.w xr3, xr0, 4 ++ xvpickve.w xr4, xr1, 4 ++ vilvl.h vr0, vr3, vr0 + +- vilvl.h $vr1, $vr4, $vr1 +- vilvl.w $vr0, $vr1, $vr0 +- movfr2gr.d t0, $f0 ++ vilvl.h vr1, vr4, vr1 ++ vilvl.w vr0, vr1, vr0 ++ movfr2gr.d t0, fa0 + L(found): + ctz.d t1, t0 + +@@ -79,15 +79,15 @@ L(ret0): + + + L(out): +- xvmsknz.b $xr0, $xr0 +- xvmsknz.b $xr1, $xr1 +- xvpickve.w $xr3, $xr0, 4 +- xvpickve.w $xr4, $xr1, 4 +- +- vilvl.h $vr0, $vr3, $vr0 +- vilvl.h $vr1, $vr4, $vr1 +- vilvl.w $vr0, $vr1, $vr0 +- movfr2gr.d t0, $f0 ++ xvmsknz.b xr0, xr0 ++ xvmsknz.b xr1, xr1 ++ xvpickve.w xr3, xr0, 4 ++ xvpickve.w xr4, xr1, 4 ++ ++ vilvl.h vr0, vr3, vr0 ++ vilvl.h vr1, vr4, vr1 ++ vilvl.w vr0, vr1, vr0 ++ movfr2gr.d t0, fa0 + + L(end): + sub.d t2, zero, a3 +diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S +index c6952657..08a630d3 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S +@@ -17,23 +17,23 @@ LEAF(MEMCHR, 6) + andi t0, a0, 0x1f + bstrins.d a0, zero, 4, 0 + +- vld $vr0, a0, 0 +- vld $vr1, a0, 16 ++ vld vr0, a0, 0 ++ vld vr1, a0, 16 + li.d t1, -1 + li.d t2, 32 + +- vreplgr2vr.b $vr2, a1 ++ vreplgr2vr.b vr2, a1 + sll.d t3, t1, t0 + sub.d t2, t2, t0 +- vseq.b $vr0, 
$vr0, $vr2 ++ vseq.b vr0, vr0, vr2 + +- vseq.b $vr1, $vr1, $vr2 +- vmsknz.b $vr0, $vr0 +- vmsknz.b $vr1, $vr1 +- vilvl.h $vr0, $vr1, $vr0 ++ vseq.b vr1, vr1, vr2 ++ vmsknz.b vr0, vr0 ++ vmsknz.b vr1, vr1 ++ vilvl.h vr0, vr1, vr0 + + +- movfr2gr.s t0, $f0 ++ movfr2gr.s t0, fa0 + and t0, t0, t3 + bgeu t2, a2, L(end) + bnez t0, L(found) +@@ -41,23 +41,23 @@ LEAF(MEMCHR, 6) + addi.d a4, a3, -1 + bstrins.d a4, zero, 4, 0 + L(loop): +- vld $vr0, a0, 32 +- vld $vr1, a0, 48 ++ vld vr0, a0, 32 ++ vld vr1, a0, 48 + + addi.d a0, a0, 32 +- vseq.b $vr0, $vr0, $vr2 +- vseq.b $vr1, $vr1, $vr2 ++ vseq.b vr0, vr0, vr2 ++ vseq.b vr1, vr1, vr2 + beq a0, a4, L(out) + +- vmax.bu $vr3, $vr0, $vr1 +- vseteqz.v $fcc0, $vr3 +- bcnez $fcc0, L(loop) +- vmsknz.b $vr0, $vr0 ++ vmax.bu vr3, vr0, vr1 ++ vseteqz.v fcc0, vr3 ++ bcnez fcc0, L(loop) ++ vmsknz.b vr0, vr0 + + +- vmsknz.b $vr1, $vr1 +- vilvl.h $vr0, $vr1, $vr0 +- movfr2gr.s t0, $f0 ++ vmsknz.b vr1, vr1 ++ vilvl.h vr0, vr1, vr0 ++ movfr2gr.s t0, fa0 + L(found): + ctz.w t0, t0 + +@@ -68,10 +68,10 @@ L(ret0): + jr ra + + L(out): +- vmsknz.b $vr0, $vr0 +- vmsknz.b $vr1, $vr1 +- vilvl.h $vr0, $vr1, $vr0 +- movfr2gr.s t0, $f0 ++ vmsknz.b vr0, vr0 ++ vmsknz.b vr1, vr1 ++ vilvl.h vr0, vr1, vr0 ++ movfr2gr.s t0, fa0 + + L(end): + sub.d t2, zero, a3 +diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S +index 9151d38d..2c192954 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S +@@ -20,39 +20,39 @@ LEAF(MEMCMP, 6) + li.d t1, 160 + bgeu a2, t1, L(make_aligned) # a2 >= 160 + L(loop32): +- xvld $xr0, a0, 0 +- xvld $xr1, a1, 0 ++ xvld xr0, a0, 0 ++ xvld xr1, a1, 0 + + addi.d a0, a0, 32 + addi.d a1, a1, 32 + addi.d a2, a2, -32 +- xvseq.b $xr2, $xr0, $xr1 ++ xvseq.b xr2, xr0, xr1 + +- xvsetanyeqz.b $fcc0, $xr2 +- bcnez $fcc0, L(end) ++ xvsetanyeqz.b fcc0, xr2 ++ bcnez fcc0, L(end) + L(last_bytes): + bltu t2, a2, L(loop32) +- xvld 
$xr0, a3, -32 ++ xvld xr0, a3, -32 + + +- xvld $xr1, a4, -32 +- xvseq.b $xr2, $xr0, $xr1 ++ xvld xr1, a4, -32 ++ xvseq.b xr2, xr0, xr1 + L(end): +- xvmsknz.b $xr2, $xr2 +- xvpermi.q $xr4, $xr0, 1 ++ xvmsknz.b xr2, xr2 ++ xvpermi.q xr4, xr0, 1 + +- xvpickve.w $xr3, $xr2, 4 +- xvpermi.q $xr5, $xr1, 1 +- vilvl.h $vr2, $vr3, $vr2 +- movfr2gr.s t0, $f2 ++ xvpickve.w xr3, xr2, 4 ++ xvpermi.q xr5, xr1, 1 ++ vilvl.h vr2, vr3, vr2 ++ movfr2gr.s t0, fa2 + + cto.w t0, t0 +- vreplgr2vr.b $vr2, t0 +- vshuf.b $vr0, $vr4, $vr0, $vr2 +- vshuf.b $vr1, $vr5, $vr1, $vr2 ++ vreplgr2vr.b vr2, t0 ++ vshuf.b vr0, vr4, vr0, vr2 ++ vshuf.b vr1, vr5, vr1, vr2 + +- vpickve2gr.bu t0, $vr0, 0 +- vpickve2gr.bu t1, $vr1, 0 ++ vpickve2gr.bu t0, vr0, 0 ++ vpickve2gr.bu t1, vr1, 0 + sub.d a0, t0, t1 + jr ra + +@@ -60,59 +60,59 @@ L(end): + L(less32): + srli.d t0, a2, 4 + beqz t0, L(less16) +- vld $vr0, a0, 0 +- vld $vr1, a1, 0 ++ vld vr0, a0, 0 ++ vld vr1, a1, 0 + +- vld $vr2, a3, -16 +- vld $vr3, a4, -16 ++ vld vr2, a3, -16 ++ vld vr3, a4, -16 + L(short_ret): +- vseq.b $vr4, $vr0, $vr1 +- vseq.b $vr5, $vr2, $vr3 ++ vseq.b vr4, vr0, vr1 ++ vseq.b vr5, vr2, vr3 + +- vmsknz.b $vr4, $vr4 +- vmsknz.b $vr5, $vr5 +- vilvl.h $vr4, $vr5, $vr4 +- movfr2gr.s t0, $f4 ++ vmsknz.b vr4, vr4 ++ vmsknz.b vr5, vr5 ++ vilvl.h vr4, vr5, vr4 ++ movfr2gr.s t0, fa4 + + cto.w t0, t0 +- vreplgr2vr.b $vr4, t0 +- vshuf.b $vr0, $vr2, $vr0, $vr4 +- vshuf.b $vr1, $vr3, $vr1, $vr4 ++ vreplgr2vr.b vr4, t0 ++ vshuf.b vr0, vr2, vr0, vr4 ++ vshuf.b vr1, vr3, vr1, vr4 + + +- vpickve2gr.bu t0, $vr0, 0 +- vpickve2gr.bu t1, $vr1, 0 ++ vpickve2gr.bu t0, vr0, 0 ++ vpickve2gr.bu t1, vr1, 0 + sub.d a0, t0, t1 + jr ra + + L(less16): + srli.d t0, a2, 3 + beqz t0, L(less8) +- vldrepl.d $vr0, a0, 0 +- vldrepl.d $vr1, a1, 0 ++ vldrepl.d vr0, a0, 0 ++ vldrepl.d vr1, a1, 0 + +- vldrepl.d $vr2, a3, -8 +- vldrepl.d $vr3, a4, -8 ++ vldrepl.d vr2, a3, -8 ++ vldrepl.d vr3, a4, -8 + b L(short_ret) + L(less8): + srli.d t0, a2, 2 + + beqz t0, L(less4) +- 
vldrepl.w $vr0, a0, 0 +- vldrepl.w $vr1, a1, 0 +- vldrepl.w $vr2, a3, -4 ++ vldrepl.w vr0, a0, 0 ++ vldrepl.w vr1, a1, 0 ++ vldrepl.w vr2, a3, -4 + + +- vldrepl.w $vr3, a4, -4 ++ vldrepl.w vr3, a4, -4 + b L(short_ret) + L(less4): + srli.d t0, a2, 1 + beqz t0, L(less2) + +- vldrepl.h $vr0, a0, 0 +- vldrepl.h $vr1, a1, 0 +- vldrepl.h $vr2, a3, -2 +- vldrepl.h $vr3, a4, -2 ++ vldrepl.h vr0, a0, 0 ++ vldrepl.h vr1, a1, 0 ++ vldrepl.h vr2, a3, -2 ++ vldrepl.h vr3, a4, -2 + + b L(short_ret) + L(less2): +@@ -132,12 +132,12 @@ L(ret0): + nop + /* make src1 aligned, and adjust scr2 and length. */ + L(make_aligned): +- xvld $xr0, a0, 0 ++ xvld xr0, a0, 0 + +- xvld $xr1, a1, 0 +- xvseq.b $xr2, $xr0, $xr1 +- xvsetanyeqz.b $fcc0, $xr2 +- bcnez $fcc0, L(end) ++ xvld xr1, a1, 0 ++ xvseq.b xr2, xr0, xr1 ++ xvsetanyeqz.b fcc0, xr2 ++ bcnez fcc0, L(end) + + andi t0, a0, 0x1f + sub.d t0, t2, t0 +@@ -151,17 +151,17 @@ L(make_aligned): + + + L(loop_align): +- xvld $xr0, a0, 0 +- xvld $xr1, a1, 0 +- xvld $xr2, a0, 32 +- xvld $xr3, a1, 32 ++ xvld xr0, a0, 0 ++ xvld xr1, a1, 0 ++ xvld xr2, a0, 32 ++ xvld xr3, a1, 32 + +- xvseq.b $xr0, $xr0, $xr1 +- xvseq.b $xr1, $xr2, $xr3 +- xvmin.bu $xr2, $xr1, $xr0 +- xvsetanyeqz.b $fcc0, $xr2 ++ xvseq.b xr0, xr0, xr1 ++ xvseq.b xr1, xr2, xr3 ++ xvmin.bu xr2, xr1, xr0 ++ xvsetanyeqz.b fcc0, xr2 + +- bcnez $fcc0, L(pair_end) ++ bcnez fcc0, L(pair_end) + addi.d a0, a0, 64 + addi.d a1, a1, 64 + bne a0, a5, L(loop_align) +@@ -173,15 +173,15 @@ L(loop_align): + + + L(pair_end): +- xvmsknz.b $xr0, $xr0 +- xvmsknz.b $xr1, $xr1 +- xvpickve.w $xr2, $xr0, 4 +- xvpickve.w $xr3, $xr1, 4 +- +- vilvl.h $vr0, $vr2, $vr0 +- vilvl.h $vr1, $vr3, $vr1 +- vilvl.w $vr0, $vr1, $vr0 +- movfr2gr.d t0, $f0 ++ xvmsknz.b xr0, xr0 ++ xvmsknz.b xr1, xr1 ++ xvpickve.w xr2, xr0, 4 ++ xvpickve.w xr3, xr1, 4 ++ ++ vilvl.h vr0, vr2, vr0 ++ vilvl.h vr1, vr3, vr1 ++ vilvl.w vr0, vr1, vr0 ++ movfr2gr.d t0, fa0 + + cto.d t0, t0 + ldx.bu t1, a0, t0 +diff --git 
a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S +index 8535aa22..b407275f 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S +@@ -21,28 +21,28 @@ ENTRY_NO_ALIGN(MEMCMP) + pcaddi t0, -7 + + andi a3, a0, 0xf +- vld $vr5, t0, 0 ++ vld vr5, t0, 0 + andi a4, a1, 0xf + bne a3, a4, L(unaligned) + + bstrins.d a0, zero, 3, 0 + xor a1, a1, a4 +- vld $vr0, a0, 0 +- vld $vr1, a1, 0 ++ vld vr0, a0, 0 ++ vld vr1, a1, 0 + + + li.d t0, 16 +- vreplgr2vr.b $vr3, a3 ++ vreplgr2vr.b vr3, a3 + sub.d t1, t0, a3 +- vadd.b $vr3, $vr3, $vr5 ++ vadd.b vr3, vr3, vr5 + +- vshuf.b $vr0, $vr3, $vr0, $vr3 +- vshuf.b $vr1, $vr3, $vr1, $vr3 +- vseq.b $vr4, $vr0, $vr1 ++ vshuf.b vr0, vr3, vr0, vr3 ++ vshuf.b vr1, vr3, vr1, vr3 ++ vseq.b vr4, vr0, vr1 + bgeu t1, a2, L(al_end) + +- vsetanyeqz.b $fcc0, $vr4 +- bcnez $fcc0, L(al_found) ++ vsetanyeqz.b fcc0, vr4 ++ bcnez fcc0, L(al_found) + sub.d a2, a2, t1 + andi t1, a2, 31 + +@@ -53,70 +53,70 @@ ENTRY_NO_ALIGN(MEMCMP) + + + L(al_loop): +- vld $vr0, a0, 16 +- vld $vr1, a1, 16 +- vld $vr2, a0, 32 +- vld $vr3, a1, 32 ++ vld vr0, a0, 16 ++ vld vr1, a1, 16 ++ vld vr2, a0, 32 ++ vld vr3, a1, 32 + + addi.d a0, a0, 32 + addi.d a1, a1, 32 +- vseq.b $vr4, $vr0, $vr1 +- vseq.b $vr6, $vr2, $vr3 ++ vseq.b vr4, vr0, vr1 ++ vseq.b vr6, vr2, vr3 + +- vand.v $vr6, $vr4, $vr6 +- vsetanyeqz.b $fcc0, $vr6 +- bcnez $fcc0, L(al_pair_end) ++ vand.v vr6, vr4, vr6 ++ vsetanyeqz.b fcc0, vr6 ++ bcnez fcc0, L(al_pair_end) + bne a0, a4, L(al_loop) + + L(al_less_32bytes): + bgeu t0, a2, L(al_less_16bytes) +- vld $vr0, a0, 16 +- vld $vr1, a1, 16 +- vld $vr2, a0, 32 ++ vld vr0, a0, 16 ++ vld vr1, a1, 16 ++ vld vr2, a0, 32 + + +- vld $vr3, a1, 32 ++ vld vr3, a1, 32 + addi.d a2, a2, -16 +- vreplgr2vr.b $vr6, a2 +- vslt.b $vr5, $vr5, $vr6 ++ vreplgr2vr.b vr6, a2 ++ vslt.b vr5, vr5, vr6 + +- vseq.b $vr4, $vr0, $vr1 +- vseq.b $vr6, $vr2, $vr3 +- vorn.v $vr6, $vr6, $vr5 ++ vseq.b 
vr4, vr0, vr1 ++ vseq.b vr6, vr2, vr3 ++ vorn.v vr6, vr6, vr5 + L(al_pair_end): +- vsetanyeqz.b $fcc0, $vr4 ++ vsetanyeqz.b fcc0, vr4 + +- bcnez $fcc0, L(al_found) +- vnori.b $vr4, $vr6, 0 +- vfrstpi.b $vr4, $vr4, 0 +- vshuf.b $vr0, $vr2, $vr2, $vr4 ++ bcnez fcc0, L(al_found) ++ vnori.b vr4, vr6, 0 ++ vfrstpi.b vr4, vr4, 0 ++ vshuf.b vr0, vr2, vr2, vr4 + +- vshuf.b $vr1, $vr3, $vr3, $vr4 +- vpickve2gr.bu t0, $vr0, 0 +- vpickve2gr.bu t1, $vr1, 0 ++ vshuf.b vr1, vr3, vr3, vr4 ++ vpickve2gr.bu t0, vr0, 0 ++ vpickve2gr.bu t1, vr1, 0 + sub.d a0, t0, t1 + + + jr ra + L(al_less_16bytes): + beqz a2, L(out) +- vld $vr0, a0, 16 +- vld $vr1, a1, 16 ++ vld vr0, a0, 16 ++ vld vr1, a1, 16 + +- vseq.b $vr4, $vr0, $vr1 ++ vseq.b vr4, vr0, vr1 + L(al_end): +- vreplgr2vr.b $vr6, a2 +- vslt.b $vr5, $vr5, $vr6 +- vorn.v $vr4, $vr4, $vr5 ++ vreplgr2vr.b vr6, a2 ++ vslt.b vr5, vr5, vr6 ++ vorn.v vr4, vr4, vr5 + + L(al_found): +- vnori.b $vr4, $vr4, 0 +- vfrstpi.b $vr4, $vr4, 0 +- vshuf.b $vr0, $vr0, $vr0, $vr4 +- vshuf.b $vr1, $vr1, $vr1, $vr4 ++ vnori.b vr4, vr4, 0 ++ vfrstpi.b vr4, vr4, 0 ++ vshuf.b vr0, vr0, vr0, vr4 ++ vshuf.b vr1, vr1, vr1, vr4 + +- vpickve2gr.bu t0, $vr0, 0 +- vpickve2gr.bu t1, $vr1, 0 ++ vpickve2gr.bu t0, vr0, 0 ++ vpickve2gr.bu t1, vr1, 0 + sub.d a0, t0, t1 + jr ra + +@@ -133,28 +133,28 @@ L(unaligned): + bstrins.d a0, zero, 3, 0 + + xor a1, a1, a4 +- vld $vr4, a0, 0 +- vld $vr1, a1, 0 ++ vld vr4, a0, 0 ++ vld vr1, a1, 0 + li.d t0, 16 + +- vreplgr2vr.b $vr2, a4 ++ vreplgr2vr.b vr2, a4 + sub.d a6, a4, a3 # a6 hold the diff + sub.d t1, t0, a4 + sub.d t2, t0, a6 + + +- vadd.b $vr2, $vr2, $vr5 # [4, 5, 6, ...] +- vreplgr2vr.b $vr6, t2 +- vadd.b $vr6, $vr6, $vr5 # [14, 15, 16, ... ] +- vshuf.b $vr0, $vr4, $vr4, $vr6 # make data be in the same position ++ vadd.b vr2, vr2, vr5 # [4, 5, 6, ...] ++ vreplgr2vr.b vr6, t2 ++ vadd.b vr6, vr6, vr5 # [14, 15, 16, ... 
] ++ vshuf.b vr0, vr4, vr4, vr6 # make data be in the same position + +- vshuf.b $vr1, $vr2, $vr1, $vr2 +- vshuf.b $vr0, $vr2, $vr0, $vr2 +- vseq.b $vr7, $vr0, $vr1 ++ vshuf.b vr1, vr2, vr1, vr2 ++ vshuf.b vr0, vr2, vr0, vr2 ++ vseq.b vr7, vr0, vr1 + bgeu t1, a2, L(un_end) + +- vsetanyeqz.b $fcc0, $vr7 +- bcnez $fcc0, L(un_found) ++ vsetanyeqz.b fcc0, vr7 ++ bcnez fcc0, L(un_found) + sub.d a2, a2, t1 + andi t1, a2, 31 + +@@ -165,63 +165,63 @@ L(unaligned): + + + L(un_loop): +- vld $vr2, a0, 16 +- vld $vr1, a1, 16 +- vld $vr3, a1, 32 ++ vld vr2, a0, 16 ++ vld vr1, a1, 16 ++ vld vr3, a1, 32 + addi.d a1, a1, 32 + + addi.d a0, a0, 32 +- vshuf.b $vr0, $vr2, $vr4, $vr6 +- vld $vr4, a0, 0 +- vseq.b $vr7, $vr0, $vr1 ++ vshuf.b vr0, vr2, vr4, vr6 ++ vld vr4, a0, 0 ++ vseq.b vr7, vr0, vr1 + +- vshuf.b $vr2, $vr4, $vr2, $vr6 +- vseq.b $vr8, $vr2, $vr3 +- vand.v $vr8, $vr7, $vr8 +- vsetanyeqz.b $fcc0, $vr8 ++ vshuf.b vr2, vr4, vr2, vr6 ++ vseq.b vr8, vr2, vr3 ++ vand.v vr8, vr7, vr8 ++ vsetanyeqz.b fcc0, vr8 + +- bcnez $fcc0, L(un_pair_end) ++ bcnez fcc0, L(un_pair_end) + bne a1, a4, L(un_loop) + L(un_less_32bytes): + bltu a2, t0, L(un_less_16bytes) +- vld $vr2, a0, 16 ++ vld vr2, a0, 16 + + +- vld $vr1, a1, 16 ++ vld vr1, a1, 16 + addi.d a0, a0, 16 + addi.d a1, a1, 16 + addi.d a2, a2, -16 + +- vshuf.b $vr0, $vr2, $vr4, $vr6 +- vor.v $vr4, $vr2, $vr2 +- vseq.b $vr7, $vr0, $vr1 +- vsetanyeqz.b $fcc0, $vr7 ++ vshuf.b vr0, vr2, vr4, vr6 ++ vor.v vr4, vr2, vr2 ++ vseq.b vr7, vr0, vr1 ++ vsetanyeqz.b fcc0, vr7 + +- bcnez $fcc0, L(un_found) ++ bcnez fcc0, L(un_found) + L(un_less_16bytes): + beqz a2, L(out) +- vld $vr1, a1, 16 ++ vld vr1, a1, 16 + bgeu a6, a2, 1f + +- vld $vr2, a0, 16 ++ vld vr2, a0, 16 + 1: +- vshuf.b $vr0, $vr2, $vr4, $vr6 +- vseq.b $vr7, $vr0, $vr1 ++ vshuf.b vr0, vr2, vr4, vr6 ++ vseq.b vr7, vr0, vr1 + L(un_end): +- vreplgr2vr.b $vr3, a2 ++ vreplgr2vr.b vr3, a2 + + +- vslt.b $vr3, $vr5, $vr3 +- vorn.v $vr7, $vr7, $vr3 ++ vslt.b vr3, vr5, vr3 ++ vorn.v vr7, vr7, 
vr3 + L(un_found): +- vnori.b $vr7, $vr7, 0 +- vfrstpi.b $vr7, $vr7, 0 ++ vnori.b vr7, vr7, 0 ++ vfrstpi.b vr7, vr7, 0 + +- vshuf.b $vr0, $vr0, $vr0, $vr7 +- vshuf.b $vr1, $vr1, $vr1, $vr7 ++ vshuf.b vr0, vr0, vr0, vr7 ++ vshuf.b vr1, vr1, vr1, vr7 + L(calc_result): +- vpickve2gr.bu t0, $vr0, 0 +- vpickve2gr.bu t1, $vr1, 0 ++ vpickve2gr.bu t0, vr0, 0 ++ vpickve2gr.bu t1, vr1, 0 + + sub.d t2, t0, t1 + sub.d t3, t1, t0 +@@ -231,14 +231,14 @@ L(calc_result): + or a0, t0, t1 + jr ra + L(un_pair_end): +- vsetanyeqz.b $fcc0, $vr7 +- bcnez $fcc0, L(un_found) ++ vsetanyeqz.b fcc0, vr7 ++ bcnez fcc0, L(un_found) + + +- vnori.b $vr7, $vr8, 0 +- vfrstpi.b $vr7, $vr7, 0 +- vshuf.b $vr0, $vr2, $vr2, $vr7 +- vshuf.b $vr1, $vr3, $vr3, $vr7 ++ vnori.b vr7, vr8, 0 ++ vfrstpi.b vr7, vr7, 0 ++ vshuf.b vr0, vr2, vr2, vr7 ++ vshuf.b vr1, vr3, vr3, vr7 + + b L(calc_result) + L(out): +diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S +index e8b2c441..c317592f 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S +@@ -26,22 +26,22 @@ LEAF(MEMCPY_NAME, 6) + + li.d t1, 64 + bltu t1, a2, L(copy_long) # a2 > 64 +- xvld $xr0, a1, 0 +- xvld $xr1, a4, -32 ++ xvld xr0, a1, 0 ++ xvld xr1, a4, -32 + +- xvst $xr0, a0, 0 +- xvst $xr1, a3, -32 ++ xvst xr0, a0, 0 ++ xvst xr1, a3, -32 + jr ra + L(less_32bytes): + srli.d t0, a2, 4 + + beqz t0, L(less_16bytes) +- vld $vr0, a1, 0 +- vld $vr1, a4, -16 +- vst $vr0, a0, 0 ++ vld vr0, a1, 0 ++ vld vr1, a4, -16 ++ vst vr0, a0, 0 + + +- vst $vr1, a3, -16 ++ vst vr1, a3, -16 + jr ra + L(less_16bytes): + srli.d t0, a2, 3 +@@ -91,11 +91,11 @@ LEAF(MEMMOVE_NAME, 6) + + li.d t1, 64 + bltu t1, a2, L(move_long) # a2 > 64 +- xvld $xr0, a1, 0 +- xvld $xr1, a4, -32 ++ xvld xr0, a1, 0 ++ xvld xr1, a4, -32 + +- xvst $xr0, a0, 0 +- xvst $xr1, a3, -32 ++ xvst xr0, a0, 0 ++ xvst xr1, a3, -32 + jr ra + L(move_long): + sub.d t2, a0, a1 +@@ -107,8 +107,8 @@ 
L(copy_long): + sub.d t2, t0, t2 + + +- xvld $xr8, a1, 0 +- xvld $xr9, a4, -32 ++ xvld xr8, a1, 0 ++ xvld xr9, a4, -32 + sub.d t3, a2, t2 + add.d a5, a0, t2 + +@@ -119,69 +119,69 @@ L(copy_long): + + addi.d a6, a6, -1 + L(loop_256): +- xvld $xr0, a1, 0 +- xvld $xr1, a1, 32 +- xvld $xr2, a1, 64 ++ xvld xr0, a1, 0 ++ xvld xr1, a1, 32 ++ xvld xr2, a1, 64 + +- xvld $xr3, a1, 96 +- xvld $xr4, a1, 128 +- xvld $xr5, a1, 160 +- xvld $xr6, a1, 192 ++ xvld xr3, a1, 96 ++ xvld xr4, a1, 128 ++ xvld xr5, a1, 160 ++ xvld xr6, a1, 192 + + +- xvld $xr7, a1, 224 ++ xvld xr7, a1, 224 + addi.d a1, a1, 256 +- xvst $xr0, a5, 0 +- xvst $xr1, a5, 32 ++ xvst xr0, a5, 0 ++ xvst xr1, a5, 32 + +- xvst $xr2, a5, 64 +- xvst $xr3, a5, 96 +- xvst $xr4, a5, 128 +- xvst $xr5, a5, 160 ++ xvst xr2, a5, 64 ++ xvst xr3, a5, 96 ++ xvst xr4, a5, 128 ++ xvst xr5, a5, 160 + +- xvst $xr6, a5, 192 +- xvst $xr7, a5, 224 ++ xvst xr6, a5, 192 ++ xvst xr7, a5, 224 + addi.d a5, a5, 256 + bne a1, a6, L(loop_256) + + L(lt256): + srli.d t2, a2, 7 + beqz t2, L(lt128) +- xvld $xr0, a1, 0 +- xvld $xr1, a1, 32 ++ xvld xr0, a1, 0 ++ xvld xr1, a1, 32 + + +- xvld $xr2, a1, 64 +- xvld $xr3, a1, 96 ++ xvld xr2, a1, 64 ++ xvld xr3, a1, 96 + addi.d a1, a1, 128 + addi.d a2, a2, -128 + +- xvst $xr0, a5, 0 +- xvst $xr1, a5, 32 +- xvst $xr2, a5, 64 +- xvst $xr3, a5, 96 ++ xvst xr0, a5, 0 ++ xvst xr1, a5, 32 ++ xvst xr2, a5, 64 ++ xvst xr3, a5, 96 + + addi.d a5, a5, 128 + L(lt128): + bltu a2, t1, L(lt64) +- xvld $xr0, a1, 0 +- xvld $xr1, a1, 32 ++ xvld xr0, a1, 0 ++ xvld xr1, a1, 32 + + addi.d a1, a1, 64 + addi.d a2, a2, -64 +- xvst $xr0, a5, 0 +- xvst $xr1, a5, 32 ++ xvst xr0, a5, 0 ++ xvst xr1, a5, 32 + + + addi.d a5, a5, 64 + L(lt64): + bltu a2, t0, L(lt32) +- xvld $xr0, a1, 0 +- xvst $xr0, a5, 0 ++ xvld xr0, a1, 0 ++ xvst xr0, a5, 0 + + L(lt32): +- xvst $xr8, a0, 0 +- xvst $xr9, a3, -32 ++ xvst xr8, a0, 0 ++ xvst xr9, a3, -32 + jr ra + nop + +@@ -189,9 +189,9 @@ L(copy_back): + addi.d a3, a3, -1 + addi.d a2, a2, -2 + andi t2, 
a3, 0x1f +- xvld $xr8, a1, 0 ++ xvld xr8, a1, 0 + +- xvld $xr9, a4, -32 ++ xvld xr9, a4, -32 + sub.d t3, a2, t2 + sub.d a5, a3, t2 + sub.d a4, a4, t2 +@@ -203,69 +203,69 @@ L(copy_back): + addi.d a6, a6, 2 + + L(back_loop_256): +- xvld $xr0, a4, -33 +- xvld $xr1, a4, -65 +- xvld $xr2, a4, -97 +- xvld $xr3, a4, -129 ++ xvld xr0, a4, -33 ++ xvld xr1, a4, -65 ++ xvld xr2, a4, -97 ++ xvld xr3, a4, -129 + +- xvld $xr4, a4, -161 +- xvld $xr5, a4, -193 +- xvld $xr6, a4, -225 +- xvld $xr7, a4, -257 ++ xvld xr4, a4, -161 ++ xvld xr5, a4, -193 ++ xvld xr6, a4, -225 ++ xvld xr7, a4, -257 + + addi.d a4, a4, -256 +- xvst $xr0, a5, -32 +- xvst $xr1, a5, -64 +- xvst $xr2, a5, -96 ++ xvst xr0, a5, -32 ++ xvst xr1, a5, -64 ++ xvst xr2, a5, -96 + + +- xvst $xr3, a5, -128 +- xvst $xr4, a5, -160 +- xvst $xr5, a5, -192 +- xvst $xr6, a5, -224 ++ xvst xr3, a5, -128 ++ xvst xr4, a5, -160 ++ xvst xr5, a5, -192 ++ xvst xr6, a5, -224 + +- xvst $xr7, a5, -256 ++ xvst xr7, a5, -256 + addi.d a5, a5, -256 + bne a4, a6, L(back_loop_256) + L(back_lt256): + srli.d t2, a2, 7 + + beqz t2, L(back_lt128) +- xvld $xr0, a4, -33 +- xvld $xr1, a4, -65 +- xvld $xr2, a4, -97 ++ xvld xr0, a4, -33 ++ xvld xr1, a4, -65 ++ xvld xr2, a4, -97 + +- xvld $xr3, a4, -129 ++ xvld xr3, a4, -129 + addi.d a2, a2, -128 + addi.d a4, a4, -128 +- xvst $xr0, a5, -32 ++ xvst xr0, a5, -32 + + +- xvst $xr1, a5, -64 +- xvst $xr2, a5, -96 +- xvst $xr3, a5, -128 ++ xvst xr1, a5, -64 ++ xvst xr2, a5, -96 ++ xvst xr3, a5, -128 + addi.d a5, a5, -128 + + L(back_lt128): + blt a2, t1, L(back_lt64) +- xvld $xr0, a4, -33 +- xvld $xr1, a4, -65 ++ xvld xr0, a4, -33 ++ xvld xr1, a4, -65 + addi.d a2, a2, -64 + + addi.d a4, a4, -64 +- xvst $xr0, a5, -32 +- xvst $xr1, a5, -64 ++ xvst xr0, a5, -32 ++ xvst xr1, a5, -64 + addi.d a5, a5, -64 + + L(back_lt64): + bltu a2, t0, L(back_lt32) +- xvld $xr0, a4, -33 +- xvst $xr0, a5, -32 ++ xvld xr0, a4, -33 ++ xvst xr0, a5, -32 + L(back_lt32): +- xvst $xr8, a0, 0 ++ xvst xr8, a0, 0 + + +- xvst $xr9, a3, -31 
++ xvst xr9, a3, -31 + jr ra + END(MEMMOVE_NAME) + +diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S +index 90f89c7a..77f1b4ab 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S +@@ -23,54 +23,54 @@ LEAF(MEMCPY_NAME, 6) + bltu t8, a2, L(copy_long) # a2 > 64 + bltu t7, a2, L(more_32bytes) # a2 > 32 + +- vld $vr0, a1, 0 +- vld $vr1, a4, -16 +- vst $vr0, a0, 0 +- vst $vr1, a3, -16 ++ vld vr0, a1, 0 ++ vld vr1, a4, -16 ++ vst vr0, a0, 0 ++ vst vr1, a3, -16 + + jr ra + L(more_32bytes): +- vld $vr0, a1, 0 +- vld $vr1, a1, 16 +- vld $vr2, a4, -32 ++ vld vr0, a1, 0 ++ vld vr1, a1, 16 ++ vld vr2, a4, -32 + + +- vld $vr3, a4, -16 +- vst $vr0, a0, 0 +- vst $vr1, a0, 16 +- vst $vr2, a3, -32 ++ vld vr3, a4, -16 ++ vst vr0, a0, 0 ++ vst vr1, a0, 16 ++ vst vr2, a3, -32 + +- vst $vr3, a3, -16 ++ vst vr3, a3, -16 + jr ra + L(less_16bytes): + srli.d t0, a2, 3 + beqz t0, L(less_8bytes) + +- vldrepl.d $vr0, a1, 0 +- vldrepl.d $vr1, a4, -8 +- vstelm.d $vr0, a0, 0, 0 +- vstelm.d $vr1, a3, -8, 0 ++ vldrepl.d vr0, a1, 0 ++ vldrepl.d vr1, a4, -8 ++ vstelm.d vr0, a0, 0, 0 ++ vstelm.d vr1, a3, -8, 0 + + jr ra + L(less_8bytes): + srli.d t0, a2, 2 + beqz t0, L(less_4bytes) +- vldrepl.w $vr0, a1, 0 ++ vldrepl.w vr0, a1, 0 + + +- vldrepl.w $vr1, a4, -4 +- vstelm.w $vr0, a0, 0, 0 +- vstelm.w $vr1, a3, -4, 0 ++ vldrepl.w vr1, a4, -4 ++ vstelm.w vr0, a0, 0, 0 ++ vstelm.w vr1, a3, -4, 0 + jr ra + + L(less_4bytes): + srli.d t0, a2, 1 + beqz t0, L(less_2bytes) +- vldrepl.h $vr0, a1, 0 +- vldrepl.h $vr1, a4, -2 ++ vldrepl.h vr0, a1, 0 ++ vldrepl.h vr1, a4, -2 + +- vstelm.h $vr0, a0, 0, 0 +- vstelm.h $vr1, a3, -2, 0 ++ vstelm.h vr0, a0, 0, 0 ++ vstelm.h vr1, a3, -2, 0 + jr ra + L(less_2bytes): + beqz a2, L(less_1bytes) +@@ -93,10 +93,10 @@ LEAF(MEMMOVE_NAME, 6) + bltu t8, a2, L(move_long) # a2 > 64 + bltu t7, a2, L(more_32bytes) # a2 > 32 + +- vld $vr0, a1, 0 +- vld $vr1, a4, -16 
+- vst $vr0, a0, 0 +- vst $vr1, a3, -16 ++ vld vr0, a1, 0 ++ vld vr1, a4, -16 ++ vst vr0, a0, 0 ++ vst vr1, a3, -16 + + jr ra + nop +@@ -106,7 +106,7 @@ L(move_long): + + + L(copy_long): +- vld $vr2, a1, 0 ++ vld vr2, a1, 0 + andi t0, a0, 0xf + sub.d t0, t6, t0 + add.d a1, a1, t0 +@@ -114,10 +114,10 @@ L(copy_long): + sub.d a2, a2, t0 + andi t1, a1, 0xf + bnez t1, L(unaligned) +- vld $vr0, a1, 0 ++ vld vr0, a1, 0 + + addi.d a2, a2, -16 +- vst $vr2, a0, 0 ++ vst vr2, a0, 0 + andi t2, a2, 0x7f + add.d a5, a0, t0 + +@@ -128,69 +128,69 @@ L(copy_long): + + + L(al_loop): +- vld $vr1, a1, 16 +- vld $vr2, a1, 32 +- vld $vr3, a1, 48 +- vld $vr4, a1, 64 ++ vld vr1, a1, 16 ++ vld vr2, a1, 32 ++ vld vr3, a1, 48 ++ vld vr4, a1, 64 + +- vld $vr5, a1, 80 +- vld $vr6, a1, 96 +- vld $vr7, a1, 112 +- vst $vr0, a5, 0 ++ vld vr5, a1, 80 ++ vld vr6, a1, 96 ++ vld vr7, a1, 112 ++ vst vr0, a5, 0 + +- vld $vr0, a1, 128 ++ vld vr0, a1, 128 + addi.d a1, a1, 128 +- vst $vr1, a5, 16 +- vst $vr2, a5, 32 ++ vst vr1, a5, 16 ++ vst vr2, a5, 32 + +- vst $vr3, a5, 48 +- vst $vr4, a5, 64 +- vst $vr5, a5, 80 +- vst $vr6, a5, 96 ++ vst vr3, a5, 48 ++ vst vr4, a5, 64 ++ vst vr5, a5, 80 ++ vst vr6, a5, 96 + + +- vst $vr7, a5, 112 ++ vst vr7, a5, 112 + addi.d a5, a5, 128 + bne a1, a6, L(al_loop) + L(al_less_128): + blt a2, t8, L(al_less_64) + +- vld $vr1, a1, 16 +- vld $vr2, a1, 32 +- vld $vr3, a1, 48 ++ vld vr1, a1, 16 ++ vld vr2, a1, 32 ++ vld vr3, a1, 48 + addi.d a2, a2, -64 + +- vst $vr0, a5, 0 +- vld $vr0, a1, 64 ++ vst vr0, a5, 0 ++ vld vr0, a1, 64 + addi.d a1, a1, 64 +- vst $vr1, a5, 16 ++ vst vr1, a5, 16 + +- vst $vr2, a5, 32 +- vst $vr3, a5, 48 ++ vst vr2, a5, 32 ++ vst vr3, a5, 48 + addi.d a5, a5, 64 + L(al_less_64): + blt a2, t7, L(al_less_32) + + +- vld $vr1, a1, 16 ++ vld vr1, a1, 16 + addi.d a2, a2, -32 +- vst $vr0, a5, 0 +- vld $vr0, a1, 32 ++ vst vr0, a5, 0 ++ vld vr0, a1, 32 + + addi.d a1, a1, 32 +- vst $vr1, a5, 16 ++ vst vr1, a5, 16 + addi.d a5, a5, 32 + L(al_less_32): + blt a2, t6, 
L(al_less_16) + +- vst $vr0, a5, 0 +- vld $vr0, a1, 16 ++ vst vr0, a5, 0 ++ vld vr0, a1, 16 + addi.d a5, a5, 16 + L(al_less_16): +- vld $vr1, a4, -16 ++ vld vr1, a4, -16 + +- vst $vr0, a5, 0 +- vst $vr1, a3, -16 ++ vst vr0, a5, 0 ++ vst vr1, a3, -16 + jr ra + nop + +@@ -201,17 +201,17 @@ L(magic_num): + L(unaligned): + pcaddi t2, -4 + bstrins.d a1, zero, 3, 0 +- vld $vr8, t2, 0 +- vld $vr0, a1, 0 ++ vld vr8, t2, 0 ++ vld vr0, a1, 0 + +- vld $vr1, a1, 16 ++ vld vr1, a1, 16 + addi.d a2, a2, -16 +- vst $vr2, a0, 0 ++ vst vr2, a0, 0 + add.d a5, a0, t0 + +- vreplgr2vr.b $vr9, t1 ++ vreplgr2vr.b vr9, t1 + andi t2, a2, 0x7f +- vadd.b $vr9, $vr9, $vr8 ++ vadd.b vr9, vr9, vr8 + addi.d a1, a1, 32 + + +@@ -221,97 +221,97 @@ L(unaligned): + add.d a6, a1, t3 + + L(un_loop): +- vld $vr2, a1, 0 +- vld $vr3, a1, 16 +- vld $vr4, a1, 32 +- vld $vr5, a1, 48 ++ vld vr2, a1, 0 ++ vld vr3, a1, 16 ++ vld vr4, a1, 32 ++ vld vr5, a1, 48 + +- vld $vr6, a1, 64 +- vld $vr7, a1, 80 +- vshuf.b $vr8, $vr1, $vr0, $vr9 +- vld $vr0, a1, 96 ++ vld vr6, a1, 64 ++ vld vr7, a1, 80 ++ vshuf.b vr8, vr1, vr0, vr9 ++ vld vr0, a1, 96 + +- vst $vr8, a5, 0 +- vshuf.b $vr8, $vr2, $vr1, $vr9 +- vld $vr1, a1, 112 +- vst $vr8, a5, 16 ++ vst vr8, a5, 0 ++ vshuf.b vr8, vr2, vr1, vr9 ++ vld vr1, a1, 112 ++ vst vr8, a5, 16 + + + addi.d a1, a1, 128 +- vshuf.b $vr2, $vr3, $vr2, $vr9 +- vshuf.b $vr3, $vr4, $vr3, $vr9 +- vst $vr2, a5, 32 ++ vshuf.b vr2, vr3, vr2, vr9 ++ vshuf.b vr3, vr4, vr3, vr9 ++ vst vr2, a5, 32 + +- vshuf.b $vr4, $vr5, $vr4, $vr9 +- vst $vr3, a5, 48 +- vshuf.b $vr5, $vr6, $vr5, $vr9 +- vst $vr4, a5, 64 ++ vshuf.b vr4, vr5, vr4, vr9 ++ vst vr3, a5, 48 ++ vshuf.b vr5, vr6, vr5, vr9 ++ vst vr4, a5, 64 + +- vshuf.b $vr6, $vr7, $vr6, $vr9 +- vst $vr5, a5, 80 +- vshuf.b $vr7, $vr0, $vr7, $vr9 +- vst $vr6, a5, 96 ++ vshuf.b vr6, vr7, vr6, vr9 ++ vst vr5, a5, 80 ++ vshuf.b vr7, vr0, vr7, vr9 ++ vst vr6, a5, 96 + +- vst $vr7, a5, 112 ++ vst vr7, a5, 112 + addi.d a5, a5, 128 + bne a1, a6, L(un_loop) + 
L(un_less_128): + blt a2, t8, L(un_less_64) + + +- vld $vr2, a1, 0 +- vld $vr3, a1, 16 +- vshuf.b $vr4, $vr1, $vr0, $vr9 +- vld $vr0, a1, 32 ++ vld vr2, a1, 0 ++ vld vr3, a1, 16 ++ vshuf.b vr4, vr1, vr0, vr9 ++ vld vr0, a1, 32 + +- vst $vr4, a5, 0 ++ vst vr4, a5, 0 + addi.d a2, a2, -64 +- vshuf.b $vr4, $vr2, $vr1, $vr9 +- vld $vr1, a1, 48 ++ vshuf.b vr4, vr2, vr1, vr9 ++ vld vr1, a1, 48 + + addi.d a1, a1, 64 +- vst $vr4, a5, 16 +- vshuf.b $vr2, $vr3, $vr2, $vr9 +- vshuf.b $vr3, $vr0, $vr3, $vr9 ++ vst vr4, a5, 16 ++ vshuf.b vr2, vr3, vr2, vr9 ++ vshuf.b vr3, vr0, vr3, vr9 + +- vst $vr2, a5, 32 +- vst $vr3, a5, 48 ++ vst vr2, a5, 32 ++ vst vr3, a5, 48 + addi.d a5, a5, 64 + L(un_less_64): + blt a2, t7, L(un_less_32) + + +- vshuf.b $vr3, $vr1, $vr0, $vr9 +- vld $vr0, a1, 0 +- vst $vr3, a5, 0 ++ vshuf.b vr3, vr1, vr0, vr9 ++ vld vr0, a1, 0 ++ vst vr3, a5, 0 + addi.d a2, a2, -32 + +- vshuf.b $vr3, $vr0, $vr1, $vr9 +- vld $vr1, a1, 16 ++ vshuf.b vr3, vr0, vr1, vr9 ++ vld vr1, a1, 16 + addi.d a1, a1, 32 +- vst $vr3, a5, 16 ++ vst vr3, a5, 16 + + addi.d a5, a5, 32 + L(un_less_32): + blt a2, t6, L(un_less_16) +- vshuf.b $vr2, $vr1, $vr0, $vr9 +- vor.v $vr0, $vr1, $vr1 ++ vshuf.b vr2, vr1, vr0, vr9 ++ vor.v vr0, vr1, vr1 + +- vld $vr1, a1, 0 +- vst $vr2, a5, 0 ++ vld vr1, a1, 0 ++ vst vr2, a5, 0 + addi.d a5, a5, 16 + L(un_less_16): +- vld $vr2, a4, -16 ++ vld vr2, a4, -16 + + +- vshuf.b $vr0, $vr1, $vr0, $vr9 +- vst $vr0, a5, 0 +- vst $vr2, a3, -16 ++ vshuf.b vr0, vr1, vr0, vr9 ++ vst vr0, a5, 0 ++ vst vr2, a3, -16 + jr ra + + L(copy_back): + addi.d t0, a3, -1 +- vld $vr2, a4, -16 ++ vld vr2, a4, -16 + andi t0, t0, 0xf + addi.d t0, t0, 1 # in case a3 is already aligned, load 16bytes and store 16bytes + +@@ -320,9 +320,9 @@ L(copy_back): + andi t1, a4, 0xf + bnez t1, L(back_unaligned) + +- vld $vr0, a4, -16 ++ vld vr0, a4, -16 + addi.d a2, a2, -16 +- vst $vr2, a3, -16 ++ vst vr2, a3, -16 + andi t2, a2, 0x7f + + +@@ -333,70 +333,70 @@ L(copy_back): + + sub.d a6, a4, t3 + 
L(back_al_loop): +- vld $vr1, a4, -32 +- vld $vr2, a4, -48 +- vld $vr3, a4, -64 ++ vld vr1, a4, -32 ++ vld vr2, a4, -48 ++ vld vr3, a4, -64 + +- vld $vr4, a4, -80 +- vld $vr5, a4, -96 +- vld $vr6, a4, -112 +- vld $vr7, a4, -128 ++ vld vr4, a4, -80 ++ vld vr5, a4, -96 ++ vld vr6, a4, -112 ++ vld vr7, a4, -128 + +- vst $vr0, a3, -16 +- vld $vr0, a4, -144 ++ vst vr0, a3, -16 ++ vld vr0, a4, -144 + addi.d a4, a4, -128 +- vst $vr1, a3, -32 ++ vst vr1, a3, -32 + + +- vst $vr2, a3, -48 +- vst $vr3, a3, -64 +- vst $vr4, a3, -80 +- vst $vr5, a3, -96 ++ vst vr2, a3, -48 ++ vst vr3, a3, -64 ++ vst vr4, a3, -80 ++ vst vr5, a3, -96 + +- vst $vr6, a3, -112 +- vst $vr7, a3, -128 ++ vst vr6, a3, -112 ++ vst vr7, a3, -128 + addi.d a3, a3, -128 + bne a4, a6, L(back_al_loop) + + L(back_al_less_128): + blt a2, t8, L(back_al_less_64) +- vld $vr1, a4, -32 +- vld $vr2, a4, -48 +- vld $vr3, a4, -64 ++ vld vr1, a4, -32 ++ vld vr2, a4, -48 ++ vld vr3, a4, -64 + + addi.d a2, a2, -64 +- vst $vr0, a3, -16 +- vld $vr0, a4, -80 ++ vst vr0, a3, -16 ++ vld vr0, a4, -80 + addi.d a4, a4, -64 + + +- vst $vr1, a3, -32 +- vst $vr2, a3, -48 +- vst $vr3, a3, -64 ++ vst vr1, a3, -32 ++ vst vr2, a3, -48 ++ vst vr3, a3, -64 + addi.d a3, a3, -64 + + L(back_al_less_64): + blt a2, t7, L(back_al_less_32) +- vld $vr1, a4, -32 ++ vld vr1, a4, -32 + addi.d a2, a2, -32 +- vst $vr0, a3, -16 ++ vst vr0, a3, -16 + +- vld $vr0, a4, -48 +- vst $vr1, a3, -32 ++ vld vr0, a4, -48 ++ vst vr1, a3, -32 + addi.d a3, a3, -32 + addi.d a4, a4, -32 + + L(back_al_less_32): + blt a2, t6, L(back_al_less_16) +- vst $vr0, a3, -16 +- vld $vr0, a4, -32 ++ vst vr0, a3, -16 ++ vld vr0, a4, -32 + addi.d a3, a3, -16 + + + L(back_al_less_16): +- vld $vr1, a1, 0 +- vst $vr0, a3, -16 +- vst $vr1, a0, 0 ++ vld vr1, a1, 0 ++ vst vr0, a3, -16 ++ vst vr1, a0, 0 + jr ra + + L(magic_num_2): +@@ -405,18 +405,18 @@ L(magic_num_2): + L(back_unaligned): + pcaddi t2, -4 + bstrins.d a4, zero, 3, 0 +- vld $vr8, t2, 0 +- vld $vr0, a4, 0 ++ vld vr8, t2, 0 ++ 
vld vr0, a4, 0 + +- vld $vr1, a4, -16 ++ vld vr1, a4, -16 + addi.d a2, a2, -16 +- vst $vr2, a3, -16 ++ vst vr2, a3, -16 + sub.d a3, a3, t0 + + +- vreplgr2vr.b $vr9, t1 ++ vreplgr2vr.b vr9, t1 + andi t2, a2, 0x7f +- vadd.b $vr9, $vr9, $vr8 ++ vadd.b vr9, vr9, vr8 + addi.d a4, a4, -16 + + beq t2, a2, L(back_un_less_128) +@@ -425,92 +425,92 @@ L(back_unaligned): + sub.d a6, a4, t3 + + L(back_un_loop): +- vld $vr2, a4, -16 +- vld $vr3, a4, -32 +- vld $vr4, a4, -48 ++ vld vr2, a4, -16 ++ vld vr3, a4, -32 ++ vld vr4, a4, -48 + +- vld $vr5, a4, -64 +- vld $vr6, a4, -80 +- vld $vr7, a4, -96 +- vshuf.b $vr8, $vr0, $vr1, $vr9 ++ vld vr5, a4, -64 ++ vld vr6, a4, -80 ++ vld vr7, a4, -96 ++ vshuf.b vr8, vr0, vr1, vr9 + + +- vld $vr0, a4, -112 +- vst $vr8, a3, -16 +- vshuf.b $vr8, $vr1, $vr2, $vr9 +- vld $vr1, a4, -128 ++ vld vr0, a4, -112 ++ vst vr8, a3, -16 ++ vshuf.b vr8, vr1, vr2, vr9 ++ vld vr1, a4, -128 + +- vst $vr8, a3, -32 ++ vst vr8, a3, -32 + addi.d a4, a4, -128 +- vshuf.b $vr2, $vr2, $vr3, $vr9 +- vshuf.b $vr3, $vr3, $vr4, $vr9 ++ vshuf.b vr2, vr2, vr3, vr9 ++ vshuf.b vr3, vr3, vr4, vr9 + +- vst $vr2, a3, -48 +- vshuf.b $vr4, $vr4, $vr5, $vr9 +- vst $vr3, a3, -64 +- vshuf.b $vr5, $vr5, $vr6, $vr9 ++ vst vr2, a3, -48 ++ vshuf.b vr4, vr4, vr5, vr9 ++ vst vr3, a3, -64 ++ vshuf.b vr5, vr5, vr6, vr9 + +- vst $vr4, a3, -80 +- vshuf.b $vr6, $vr6, $vr7, $vr9 +- vst $vr5, a3, -96 +- vshuf.b $vr7, $vr7, $vr0, $vr9 ++ vst vr4, a3, -80 ++ vshuf.b vr6, vr6, vr7, vr9 ++ vst vr5, a3, -96 ++ vshuf.b vr7, vr7, vr0, vr9 + + +- vst $vr6, a3, -112 +- vst $vr7, a3, -128 ++ vst vr6, a3, -112 ++ vst vr7, a3, -128 + addi.d a3, a3, -128 + bne a4, a6, L(back_un_loop) + + L(back_un_less_128): + blt a2, t8, L(back_un_less_64) +- vld $vr2, a4, -16 +- vld $vr3, a4, -32 +- vshuf.b $vr4, $vr0, $vr1, $vr9 ++ vld vr2, a4, -16 ++ vld vr3, a4, -32 ++ vshuf.b vr4, vr0, vr1, vr9 + +- vld $vr0, a4, -48 +- vst $vr4, a3, -16 ++ vld vr0, a4, -48 ++ vst vr4, a3, -16 + addi.d a2, a2, -64 +- vshuf.b $vr4, $vr1, 
$vr2, $vr9 ++ vshuf.b vr4, vr1, vr2, vr9 + +- vld $vr1, a4, -64 ++ vld vr1, a4, -64 + addi.d a4, a4, -64 +- vst $vr4, a3, -32 +- vshuf.b $vr2, $vr2, $vr3, $vr9 ++ vst vr4, a3, -32 ++ vshuf.b vr2, vr2, vr3, vr9 + + +- vshuf.b $vr3, $vr3, $vr0, $vr9 +- vst $vr2, a3, -48 +- vst $vr3, a3, -64 ++ vshuf.b vr3, vr3, vr0, vr9 ++ vst vr2, a3, -48 ++ vst vr3, a3, -64 + addi.d a3, a3, -64 + + L(back_un_less_64): + blt a2, t7, L(back_un_less_32) +- vshuf.b $vr3, $vr0, $vr1, $vr9 +- vld $vr0, a4, -16 +- vst $vr3, a3, -16 ++ vshuf.b vr3, vr0, vr1, vr9 ++ vld vr0, a4, -16 ++ vst vr3, a3, -16 + + addi.d a2, a2, -32 +- vshuf.b $vr3, $vr1, $vr0, $vr9 +- vld $vr1, a4, -32 ++ vshuf.b vr3, vr1, vr0, vr9 ++ vld vr1, a4, -32 + addi.d a4, a4, -32 + +- vst $vr3, a3, -32 ++ vst vr3, a3, -32 + addi.d a3, a3, -32 + L(back_un_less_32): + blt a2, t6, L(back_un_less_16) +- vshuf.b $vr2, $vr0, $vr1, $vr9 ++ vshuf.b vr2, vr0, vr1, vr9 + + +- vor.v $vr0, $vr1, $vr1 +- vld $vr1, a4, -16 +- vst $vr2, a3, -16 ++ vor.v vr0, vr1, vr1 ++ vld vr1, a4, -16 ++ vst vr2, a3, -16 + addi.d a3, a3, -16 + + L(back_un_less_16): +- vld $vr2, a1, 0 +- vshuf.b $vr0, $vr0, $vr1, $vr9 +- vst $vr0, a3, -16 +- vst $vr2, a0, 0 ++ vld vr2, a1, 0 ++ vshuf.b vr0, vr0, vr1, vr9 ++ vst vr0, a3, -16 ++ vst vr2, a0, 0 + + jr ra + END(MEMMOVE_NAME) +diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S +index 9ecd0257..41554552 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S +@@ -21,56 +21,56 @@ LEAF(MEMRCHR, 6) + + bstrins.d a3, zero, 5, 0 + addi.d t1, t1, 1 # len for unaligned address +- xvld $xr0, a3, 0 +- xvld $xr1, a3, 32 ++ xvld xr0, a3, 0 ++ xvld xr1, a3, 32 + + sub.d t2, zero, t1 + li.d t3, -1 +- xvreplgr2vr.b $xr2, a1 ++ xvreplgr2vr.b xr2, a1 + andi t4, a0, 0x3f + + srl.d t2, t3, t2 +- xvseq.b $xr0, $xr0, $xr2 +- xvseq.b $xr1, $xr1, $xr2 +- xvmsknz.b $xr0, $xr0 ++ xvseq.b xr0, xr0, xr2 ++ xvseq.b 
xr1, xr1, xr2 ++ xvmsknz.b xr0, xr0 + + +- xvmsknz.b $xr1, $xr1 +- xvpickve.w $xr3, $xr0, 4 +- xvpickve.w $xr4, $xr1, 4 +- vilvl.h $vr0, $vr3, $vr0 ++ xvmsknz.b xr1, xr1 ++ xvpickve.w xr3, xr0, 4 ++ xvpickve.w xr4, xr1, 4 ++ vilvl.h vr0, vr3, vr0 + +- vilvl.h $vr1, $vr4, $vr1 +- vilvl.w $vr0, $vr1, $vr0 +- movfr2gr.d t0, $f0 ++ vilvl.h vr1, vr4, vr1 ++ vilvl.w vr0, vr1, vr0 ++ movfr2gr.d t0, fa0 + and t0, t0, t2 + + bltu a2, t1, L(end) + bnez t0, L(found) + bstrins.d a0, zero, 5, 0 + L(loop): +- xvld $xr0, a3, -64 ++ xvld xr0, a3, -64 + +- xvld $xr1, a3, -32 ++ xvld xr1, a3, -32 + addi.d a3, a3, -64 +- xvseq.b $xr0, $xr0, $xr2 +- xvseq.b $xr1, $xr1, $xr2 ++ xvseq.b xr0, xr0, xr2 ++ xvseq.b xr1, xr1, xr2 + + + beq a0, a3, L(out) +- xvmax.bu $xr3, $xr0, $xr1 +- xvseteqz.v $fcc0, $xr3 +- bcnez $fcc0, L(loop) ++ xvmax.bu xr3, xr0, xr1 ++ xvseteqz.v fcc0, xr3 ++ bcnez fcc0, L(loop) + +- xvmsknz.b $xr0, $xr0 +- xvmsknz.b $xr1, $xr1 +- xvpickve.w $xr3, $xr0, 4 +- xvpickve.w $xr4, $xr1, 4 ++ xvmsknz.b xr0, xr0 ++ xvmsknz.b xr1, xr1 ++ xvpickve.w xr3, xr0, 4 ++ xvpickve.w xr4, xr1, 4 + +- vilvl.h $vr0, $vr3, $vr0 +- vilvl.h $vr1, $vr4, $vr1 +- vilvl.w $vr0, $vr1, $vr0 +- movfr2gr.d t0, $f0 ++ vilvl.h vr0, vr3, vr0 ++ vilvl.h vr1, vr4, vr1 ++ vilvl.w vr0, vr1, vr0 ++ movfr2gr.d t0, fa0 + + L(found): + addi.d a0, a3, 63 +@@ -80,15 +80,15 @@ L(found): + + + L(out): +- xvmsknz.b $xr0, $xr0 +- xvmsknz.b $xr1, $xr1 +- xvpickve.w $xr3, $xr0, 4 +- xvpickve.w $xr4, $xr1, 4 +- +- vilvl.h $vr0, $vr3, $vr0 +- vilvl.h $vr1, $vr4, $vr1 +- vilvl.w $vr0, $vr1, $vr0 +- movfr2gr.d t0, $f0 ++ xvmsknz.b xr0, xr0 ++ xvmsknz.b xr1, xr1 ++ xvpickve.w xr3, xr0, 4 ++ xvpickve.w xr4, xr1, 4 ++ ++ vilvl.h vr0, vr3, vr0 ++ vilvl.h vr1, vr4, vr1 ++ vilvl.w vr0, vr1, vr0 ++ movfr2gr.d t0, fa0 + + L(end): + sll.d t2, t3, t4 +diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S +index 4bdc18d8..4a302cac 100644 +--- 
a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S +@@ -19,46 +19,46 @@ LEAF(MEMRCHR, 6) + + bstrins.d a3, zero, 4, 0 + addi.d t1, t1, 1 # len for unaligned address +- vld $vr0, a3, 0 +- vld $vr1, a3, 16 ++ vld vr0, a3, 0 ++ vld vr1, a3, 16 + + sub.d t2, zero, t1 + li.d t3, -1 +- vreplgr2vr.b $vr2, a1 ++ vreplgr2vr.b vr2, a1 + andi t4, a0, 0x1f + + srl.d t2, t3, t2 +- vseq.b $vr0, $vr0, $vr2 +- vseq.b $vr1, $vr1, $vr2 +- vmsknz.b $vr0, $vr0 ++ vseq.b vr0, vr0, vr2 ++ vseq.b vr1, vr1, vr2 ++ vmsknz.b vr0, vr0 + + +- vmsknz.b $vr1, $vr1 +- vilvl.h $vr0, $vr1, $vr0 +- movfr2gr.s t0, $f0 ++ vmsknz.b vr1, vr1 ++ vilvl.h vr0, vr1, vr0 ++ movfr2gr.s t0, fa0 + and t0, t0, t2 + + bltu a2, t1, L(end) + bnez t0, L(found) + bstrins.d a0, zero, 4, 0 + L(loop): +- vld $vr0, a3, -32 ++ vld vr0, a3, -32 + +- vld $vr1, a3, -16 ++ vld vr1, a3, -16 + addi.d a3, a3, -32 +- vseq.b $vr0, $vr0, $vr2 +- vseq.b $vr1, $vr1, $vr2 ++ vseq.b vr0, vr0, vr2 ++ vseq.b vr1, vr1, vr2 + + beq a0, a3, L(out) +- vmax.bu $vr3, $vr0, $vr1 +- vseteqz.v $fcc0, $vr3 +- bcnez $fcc0, L(loop) ++ vmax.bu vr3, vr0, vr1 ++ vseteqz.v fcc0, vr3 ++ bcnez fcc0, L(loop) + + +- vmsknz.b $vr0, $vr0 +- vmsknz.b $vr1, $vr1 +- vilvl.h $vr0, $vr1, $vr0 +- movfr2gr.s t0, $f0 ++ vmsknz.b vr0, vr0 ++ vmsknz.b vr1, vr1 ++ vilvl.h vr0, vr1, vr0 ++ movfr2gr.s t0, fa0 + + L(found): + addi.d a0, a3, 31 +@@ -67,10 +67,10 @@ L(found): + jr ra + + L(out): +- vmsknz.b $vr0, $vr0 +- vmsknz.b $vr1, $vr1 +- vilvl.h $vr0, $vr1, $vr0 +- movfr2gr.s t0, $f0 ++ vmsknz.b vr0, vr0 ++ vmsknz.b vr1, vr1 ++ vilvl.h vr0, vr1, vr0 ++ movfr2gr.s t0, fa0 + + L(end): + sll.d t2, t3, t4 +diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S +index b53c0b7b..5e4908dc 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S +@@ -14,7 +14,7 @@ + LEAF(MEMSET, 6) + li.d t1, 32 + move a3, a0 +- 
xvreplgr2vr.b $xr0, a1 ++ xvreplgr2vr.b xr0, a1 + add.d a4, a0, a2 + + bgeu t1, a2, L(less_32bytes) # len <= 32 +@@ -24,46 +24,46 @@ LEAF(MEMSET, 6) + + L(less_128bytes): + bgeu t2, a2, L(less_64bytes) # len <= 64 +- xvst $xr0, a3, 0 +- xvst $xr0, a3, 32 +- xvst $xr0, a4, -32 ++ xvst xr0, a3, 0 ++ xvst xr0, a3, 32 ++ xvst xr0, a4, -32 + +- xvst $xr0, a4, -64 ++ xvst xr0, a4, -64 + jr ra + L(less_64bytes): +- xvst $xr0, a3, 0 +- xvst $xr0, a4, -32 ++ xvst xr0, a3, 0 ++ xvst xr0, a4, -32 + + + jr ra + L(less_32bytes): + srli.d t0, a2, 4 + beqz t0, L(less_16bytes) +- vst $vr0, a3, 0 ++ vst vr0, a3, 0 + +- vst $vr0, a4, -16 ++ vst vr0, a4, -16 + jr ra + L(less_16bytes): + srli.d t0, a2, 3 + beqz t0, L(less_8bytes) + +- vstelm.d $vr0, a3, 0, 0 +- vstelm.d $vr0, a4, -8, 0 ++ vstelm.d vr0, a3, 0, 0 ++ vstelm.d vr0, a4, -8, 0 + jr ra + L(less_8bytes): + srli.d t0, a2, 2 + + beqz t0, L(less_4bytes) +- vstelm.w $vr0, a3, 0, 0 +- vstelm.w $vr0, a4, -4, 0 ++ vstelm.w vr0, a3, 0, 0 ++ vstelm.w vr0, a4, -4, 0 + jr ra + + + L(less_4bytes): + srli.d t0, a2, 1 + beqz t0, L(less_2bytes) +- vstelm.h $vr0, a3, 0, 0 +- vstelm.h $vr0, a4, -2, 0 ++ vstelm.h vr0, a3, 0, 0 ++ vstelm.h vr0, a4, -2, 0 + + jr ra + L(less_2bytes): +@@ -73,7 +73,7 @@ L(less_1bytes): + jr ra + + L(long_bytes): +- xvst $xr0, a3, 0 ++ xvst xr0, a3, 0 + bstrins.d a3, zero, 4, 0 + addi.d a3, a3, 32 + sub.d a2, a4, a3 +@@ -85,15 +85,15 @@ L(long_bytes): + + + L(loop_256): +- xvst $xr0, a3, 0 +- xvst $xr0, a3, 32 +- xvst $xr0, a3, 64 +- xvst $xr0, a3, 96 ++ xvst xr0, a3, 0 ++ xvst xr0, a3, 32 ++ xvst xr0, a3, 64 ++ xvst xr0, a3, 96 + +- xvst $xr0, a3, 128 +- xvst $xr0, a3, 160 +- xvst $xr0, a3, 192 +- xvst $xr0, a3, 224 ++ xvst xr0, a3, 128 ++ xvst xr0, a3, 160 ++ xvst xr0, a3, 192 ++ xvst xr0, a3, 224 + + addi.d a3, a3, 256 + bne a3, t0, L(loop_256) +@@ -101,26 +101,26 @@ L(long_end): + bltu a2, t3, L(end_less_128) + addi.d a2, a2, -128 + +- xvst $xr0, a3, 0 +- xvst $xr0, a3, 32 +- xvst $xr0, a3, 64 +- xvst $xr0, a3, 
96 ++ xvst xr0, a3, 0 ++ xvst xr0, a3, 32 ++ xvst xr0, a3, 64 ++ xvst xr0, a3, 96 + + + addi.d a3, a3, 128 + L(end_less_128): + bltu a2, t2, L(end_less_64) + addi.d a2, a2, -64 +- xvst $xr0, a3, 0 ++ xvst xr0, a3, 0 + +- xvst $xr0, a3, 32 ++ xvst xr0, a3, 32 + addi.d a3, a3, 64 + L(end_less_64): + bltu a2, t1, L(end_less_32) +- xvst $xr0, a3, 0 ++ xvst xr0, a3, 0 + + L(end_less_32): +- xvst $xr0, a4, -32 ++ xvst xr0, a4, -32 + jr ra + END(MEMSET) + +diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S +index 7ab85283..67b279c8 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S +@@ -14,7 +14,7 @@ + LEAF(MEMSET, 6) + li.d t1, 16 + move a3, a0 +- vreplgr2vr.b $vr0, a1 ++ vreplgr2vr.b vr0, a1 + add.d a4, a0, a2 + + bgeu t1, a2, L(less_16bytes) # len <= 16 +@@ -24,48 +24,48 @@ LEAF(MEMSET, 6) + + L(less_64bytes): + bgeu t2, a2, L(less_32bytes) # len <= 32 +- vst $vr0, a3, 0 +- vst $vr0, a3, 16 +- vst $vr0, a4, -32 ++ vst vr0, a3, 0 ++ vst vr0, a3, 16 ++ vst vr0, a4, -32 + +- vst $vr0, a4, -16 ++ vst vr0, a4, -16 + jr ra + L(less_32bytes): +- vst $vr0, a3, 0 +- vst $vr0, a4, -16 ++ vst vr0, a3, 0 ++ vst vr0, a4, -16 + + + jr ra + L(less_16bytes): + srli.d t0, a2, 3 + beqz t0, L(less_8bytes) +- vstelm.d $vr0, a3, 0, 0 ++ vstelm.d vr0, a3, 0, 0 + +- vstelm.d $vr0, a4, -8, 0 ++ vstelm.d vr0, a4, -8, 0 + jr ra + L(less_8bytes): + srli.d t0, a2, 2 + beqz t0, L(less_4bytes) + +- vstelm.w $vr0, a3, 0, 0 +- vstelm.w $vr0, a4, -4, 0 ++ vstelm.w vr0, a3, 0, 0 ++ vstelm.w vr0, a4, -4, 0 + jr ra + L(less_4bytes): + srli.d t0, a2, 1 + + beqz t0, L(less_2bytes) +- vstelm.h $vr0, a3, 0, 0 +- vstelm.h $vr0, a4, -2, 0 ++ vstelm.h vr0, a3, 0, 0 ++ vstelm.h vr0, a4, -2, 0 + jr ra + + + L(less_2bytes): + beqz a2, L(less_1bytes) +- vstelm.b $vr0, a3, 0, 0 ++ vstelm.b vr0, a3, 0, 0 + L(less_1bytes): + jr ra + L(long_bytes): +- vst $vr0, a3, 0 ++ vst vr0, a3, 0 + + bstrins.d a3, 
zero, 3, 0 + addi.d a3, a3, 16 +@@ -77,43 +77,43 @@ L(long_bytes): + sub.d t0, a4, t0 + + L(loop_128): +- vst $vr0, a3, 0 ++ vst vr0, a3, 0 + +- vst $vr0, a3, 16 +- vst $vr0, a3, 32 +- vst $vr0, a3, 48 +- vst $vr0, a3, 64 ++ vst vr0, a3, 16 ++ vst vr0, a3, 32 ++ vst vr0, a3, 48 ++ vst vr0, a3, 64 + + +- vst $vr0, a3, 80 +- vst $vr0, a3, 96 +- vst $vr0, a3, 112 ++ vst vr0, a3, 80 ++ vst vr0, a3, 96 ++ vst vr0, a3, 112 + addi.d a3, a3, 128 + + bne a3, t0, L(loop_128) + L(long_end): + bltu a2, t3, L(end_less_64) + addi.d a2, a2, -64 +- vst $vr0, a3, 0 ++ vst vr0, a3, 0 + +- vst $vr0, a3, 16 +- vst $vr0, a3, 32 +- vst $vr0, a3, 48 ++ vst vr0, a3, 16 ++ vst vr0, a3, 32 ++ vst vr0, a3, 48 + addi.d a3, a3, 64 + + L(end_less_64): + bltu a2, t2, L(end_less_32) + addi.d a2, a2, -32 +- vst $vr0, a3, 0 +- vst $vr0, a3, 16 ++ vst vr0, a3, 0 ++ vst vr0, a3, 16 + + addi.d a3, a3, 32 + L(end_less_32): + bltu a2, t1, L(end_less_16) +- vst $vr0, a3, 0 ++ vst vr0, a3, 0 + + L(end_less_16): +- vst $vr0, a4, -16 ++ vst vr0, a4, -16 + jr ra + END(MEMSET) + +diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S +index 1e94aa50..856f99ce 100644 +--- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S +@@ -8,15 +8,15 @@ + LEAF(RAWMEMCHR, 6) + move a2, a0 + bstrins.d a0, zero, 4, 0 +- xvld $xr0, a0, 0 +- xvreplgr2vr.b $xr1, a1 ++ xvld xr0, a0, 0 ++ xvreplgr2vr.b xr1, a1 + +- xvseq.b $xr0, $xr0, $xr1 +- xvmsknz.b $xr0, $xr0 +- xvpickve.w $xr2, $xr0, 4 +- vilvl.h $vr0, $vr2, $vr0 ++ xvseq.b xr0, xr0, xr1 ++ xvmsknz.b xr0, xr0 ++ xvpickve.w xr2, xr0, 4 ++ vilvl.h vr0, vr2, vr0 + +- movfr2gr.s t0, $f0 ++ movfr2gr.s t0, fa0 + sra.w t0, t0, a2 + beqz t0, L(loop) + ctz.w t0, t0 +@@ -27,17 +27,17 @@ LEAF(RAWMEMCHR, 6) + nop + + L(loop): +- xvld $xr0, a0, 32 ++ xvld xr0, a0, 32 + addi.d a0, a0, 32 +- xvseq.b $xr0, $xr0, $xr1 +- xvseteqz.v $fcc0, $xr0 ++ xvseq.b xr0, xr0, xr1 ++ 
xvseteqz.v fcc0, xr0 + +- bcnez $fcc0, L(loop) +- xvmsknz.b $xr0, $xr0 +- xvpickve.w $xr1, $xr0, 4 +- vilvl.h $vr0, $vr1, $vr0 ++ bcnez fcc0, L(loop) ++ xvmsknz.b xr0, xr0 ++ xvpickve.w xr1, xr0, 4 ++ vilvl.h vr0, vr1, vr0 + +- movfr2gr.s t0, $f0 ++ movfr2gr.s t0, fa0 + ctz.w t0, t0 + add.d a0, a0, t0 + jr ra +diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S +index 40bf0cda..7e864e96 100644 +--- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S +@@ -14,17 +14,17 @@ + LEAF(RAWMEMCHR, 6) + move a2, a0 + bstrins.d a0, zero, 4, 0 +- vld $vr0, a0, 0 +- vld $vr1, a0, 16 ++ vld vr0, a0, 0 ++ vld vr1, a0, 16 + +- vreplgr2vr.b $vr2, a1 +- vseq.b $vr0, $vr0, $vr2 +- vseq.b $vr1, $vr1, $vr2 +- vmsknz.b $vr0, $vr0 ++ vreplgr2vr.b vr2, a1 ++ vseq.b vr0, vr0, vr2 ++ vseq.b vr1, vr1, vr2 ++ vmsknz.b vr0, vr0 + +- vmsknz.b $vr1, $vr1 +- vilvl.h $vr0, $vr1, $vr0 +- movfr2gr.s t0, $f0 ++ vmsknz.b vr1, vr1 ++ vilvl.h vr0, vr1, vr0 ++ movfr2gr.s t0, fa0 + sra.w t0, t0, a2 + + beqz t0, L(loop) +@@ -34,15 +34,15 @@ LEAF(RAWMEMCHR, 6) + + + L(loop): +- vld $vr0, a0, 32 ++ vld vr0, a0, 32 + addi.d a0, a0, 16 +- vseq.b $vr0, $vr0, $vr2 +- vseteqz.v $fcc0, $vr0 ++ vseq.b vr0, vr0, vr2 ++ vseteqz.v fcc0, vr0 + +- bcnez $fcc0, L(loop) ++ bcnez fcc0, L(loop) + addi.d a0, a0, 16 +- vfrstpi.b $vr0, $vr0, 0 +- vpickve2gr.bu t0, $vr0, 0 ++ vfrstpi.b vr0, vr0, 0 ++ vpickve2gr.bu t0, vr0, 0 + + add.d a0, a0, t0 + jr ra +diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S +index 0836f590..53832de7 100644 +--- a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S +@@ -18,67 +18,67 @@ L(magic_num): + ENTRY_NO_ALIGN(STPCPY) + pcaddi t0, -4 + andi a4, a1, 0xf +- vld $vr1, t0, 0 ++ vld vr1, t0, 0 + beqz a4, L(load_start) + + xor t0, a1, a4 +- vld $vr0, t0, 0 +- vreplgr2vr.b $vr2, a4 +- 
vadd.b $vr2, $vr2, $vr1 ++ vld vr0, t0, 0 ++ vreplgr2vr.b vr2, a4 ++ vadd.b vr2, vr2, vr1 + +- vshuf.b $vr0, $vr2, $vr0, $vr2 +- vsetanyeqz.b $fcc0, $vr0 +- bcnez $fcc0, L(end) ++ vshuf.b vr0, vr2, vr0, vr2 ++ vsetanyeqz.b fcc0, vr0 ++ bcnez fcc0, L(end) + L(load_start): +- vld $vr0, a1, 0 ++ vld vr0, a1, 0 + + + li.d t1, 16 + andi a3, a0, 0xf +- vsetanyeqz.b $fcc0, $vr0 ++ vsetanyeqz.b fcc0, vr0 + sub.d t0, t1, a3 + +- bcnez $fcc0, L(end) ++ bcnez fcc0, L(end) + add.d a1, a1, t0 +- vst $vr0, a0, 0 ++ vst vr0, a0, 0 + add.d a0, a0, t0 + + bne a3, a4, L(unaligned) +- vld $vr0, a1, 0 +- vsetanyeqz.b $fcc0, $vr0 +- bcnez $fcc0, L(end) ++ vld vr0, a1, 0 ++ vsetanyeqz.b fcc0, vr0 ++ bcnez fcc0, L(end) + + L(loop): +- vst $vr0, a0, 0 +- vld $vr0, a1, 16 ++ vst vr0, a0, 0 ++ vld vr0, a1, 16 + addi.d a0, a0, 16 + addi.d a1, a1, 16 + + +- vsetanyeqz.b $fcc0, $vr0 +- bceqz $fcc0, L(loop) +- vmsknz.b $vr1, $vr0 +- movfr2gr.s t0, $f1 ++ vsetanyeqz.b fcc0, vr0 ++ bceqz fcc0, L(loop) ++ vmsknz.b vr1, vr0 ++ movfr2gr.s t0, fa1 + + cto.w t0, t0 + add.d a1, a1, t0 +- vld $vr0, a1, -15 ++ vld vr0, a1, -15 + add.d a0, a0, t0 + +- vst $vr0, a0, -15 ++ vst vr0, a0, -15 + jr ra + L(end): +- vseqi.b $vr1, $vr0, 0 +- vfrstpi.b $vr1, $vr1, 0 ++ vseqi.b vr1, vr0, 0 ++ vfrstpi.b vr1, vr1, 0 + +- vpickve2gr.bu t0, $vr1, 0 ++ vpickve2gr.bu t0, vr1, 0 + addi.d t0, t0, 1 + L(end_16): + andi t1, t0, 16 + beqz t1, L(end_8) + + +- vst $vr0, a0, 0 ++ vst vr0, a0, 0 + addi.d a0, a0, 15 + jr ra + L(end_8): +@@ -89,26 +89,26 @@ L(end_8): + andi t5, t0, 1 + beqz t2, L(end_4) + +- vstelm.d $vr0, a0, 0, 0 ++ vstelm.d vr0, a0, 0, 0 + addi.d a0, a0, 8 +- vbsrl.v $vr0, $vr0, 8 ++ vbsrl.v vr0, vr0, 8 + L(end_4): + beqz t3, L(end_2) + +- vstelm.w $vr0, a0, 0, 0 ++ vstelm.w vr0, a0, 0, 0 + addi.d a0, a0, 4 +- vbsrl.v $vr0, $vr0, 4 ++ vbsrl.v vr0, vr0, 4 + L(end_2): + beqz t4, L(end_1) + + +- vstelm.h $vr0, a0, 0, 0 ++ vstelm.h vr0, a0, 0, 0 + addi.d a0, a0, 2 +- vbsrl.v $vr0, $vr0, 2 ++ vbsrl.v vr0, vr0, 2 + 
L(end_1): + beqz t5, L(out) + +- vstelm.b $vr0, a0, 0, 0 ++ vstelm.b vr0, a0, 0, 0 + addi.d a0, a0, 1 + L(out): + addi.d a0, a0, -1 +@@ -120,49 +120,49 @@ L(unaligned): + andi a3, a1, 0xf + bstrins.d a1, zero, 3, 0 + +- vld $vr2, a1, 0 +- vreplgr2vr.b $vr3, a3 +- vslt.b $vr4, $vr1, $vr3 +- vor.v $vr0, $vr2, $vr4 ++ vld vr2, a1, 0 ++ vreplgr2vr.b vr3, a3 ++ vslt.b vr4, vr1, vr3 ++ vor.v vr0, vr2, vr4 + + +- vsetanyeqz.b $fcc0, $vr0 +- bcnez $fcc0, L(un_first_end) +- vld $vr0, a1, 16 +- vadd.b $vr3, $vr3, $vr1 ++ vsetanyeqz.b fcc0, vr0 ++ bcnez fcc0, L(un_first_end) ++ vld vr0, a1, 16 ++ vadd.b vr3, vr3, vr1 + + addi.d a1, a1, 16 +- vshuf.b $vr4, $vr0, $vr2, $vr3 +- vsetanyeqz.b $fcc0, $vr0 +- bcnez $fcc0, L(un_end) ++ vshuf.b vr4, vr0, vr2, vr3 ++ vsetanyeqz.b fcc0, vr0 ++ bcnez fcc0, L(un_end) + + L(un_loop): +- vor.v $vr2, $vr0, $vr0 +- vld $vr0, a1, 16 +- vst $vr4, a0, 0 ++ vor.v vr2, vr0, vr0 ++ vld vr0, a1, 16 ++ vst vr4, a0, 0 + addi.d a1, a1, 16 + + addi.d a0, a0, 16 +- vshuf.b $vr4, $vr0, $vr2, $vr3 +- vsetanyeqz.b $fcc0, $vr0 +- bceqz $fcc0, L(un_loop) ++ vshuf.b vr4, vr0, vr2, vr3 ++ vsetanyeqz.b fcc0, vr0 ++ bceqz fcc0, L(un_loop) + + + L(un_end): +- vsetanyeqz.b $fcc0, $vr4 +- bcnez $fcc0, 1f +- vst $vr4, a0, 0 ++ vsetanyeqz.b fcc0, vr4 ++ bcnez fcc0, 1f ++ vst vr4, a0, 0 + 1: +- vmsknz.b $vr1, $vr0 ++ vmsknz.b vr1, vr0 + +- movfr2gr.s t0, $f1 ++ movfr2gr.s t0, fa1 + cto.w t0, t0 + add.d a1, a1, t0 +- vld $vr0, a1, -15 ++ vld vr0, a1, -15 + + add.d a0, a0, t0 + sub.d a0, a0, a3 +- vst $vr0, a0, 1 ++ vst vr0, a0, 1 + addi.d a0, a0, 16 + + jr ra +diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S +index 3f6ad915..fab6edc7 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S +@@ -16,18 +16,18 @@ + LEAF(STRCHR, 6) + andi t1, a0, 0x1f + bstrins.d a0, zero, 4, 0 +- xvld $xr0, a0, 0 ++ xvld xr0, a0, 0 + li.d t2, -1 + +- xvreplgr2vr.b $xr1, a1 
++ xvreplgr2vr.b xr1, a1 + sll.d t1, t2, t1 +- xvxor.v $xr2, $xr0, $xr1 +- xvmin.bu $xr0, $xr0, $xr2 ++ xvxor.v xr2, xr0, xr1 ++ xvmin.bu xr0, xr0, xr2 + +- xvmsknz.b $xr0, $xr0 +- xvpickve.w $xr3, $xr0, 4 +- vilvl.h $vr0, $vr3, $vr0 +- movfr2gr.s t0, $f0 ++ xvmsknz.b xr0, xr0 ++ xvpickve.w xr3, xr0, 4 ++ vilvl.h vr0, vr3, vr0 ++ movfr2gr.s t0, fa0 + + orn t0, t0, t1 + bne t0, t2, L(end) +@@ -36,37 +36,37 @@ LEAF(STRCHR, 6) + + + L(loop): +- xvld $xr0, a0, 0 +- xvxor.v $xr2, $xr0, $xr1 +- xvmin.bu $xr0, $xr0, $xr2 +- xvsetanyeqz.b $fcc0, $xr0 ++ xvld xr0, a0, 0 ++ xvxor.v xr2, xr0, xr1 ++ xvmin.bu xr0, xr0, xr2 ++ xvsetanyeqz.b fcc0, xr0 + +- bcnez $fcc0, L(loop_end) +- xvld $xr0, a0, 32 ++ bcnez fcc0, L(loop_end) ++ xvld xr0, a0, 32 + addi.d a0, a0, 64 +- xvxor.v $xr2, $xr0, $xr1 ++ xvxor.v xr2, xr0, xr1 + +- xvmin.bu $xr0, $xr0, $xr2 +- xvsetanyeqz.b $fcc0, $xr0 +- bceqz $fcc0, L(loop) ++ xvmin.bu xr0, xr0, xr2 ++ xvsetanyeqz.b fcc0, xr0 ++ bceqz fcc0, L(loop) + addi.d a0, a0, -32 + + L(loop_end): +- xvmsknz.b $xr0, $xr0 +- xvpickve.w $xr1, $xr0, 4 +- vilvl.h $vr0, $vr1, $vr0 +- movfr2gr.s t0, $f0 ++ xvmsknz.b xr0, xr0 ++ xvpickve.w xr1, xr0, 4 ++ vilvl.h vr0, vr1, vr0 ++ movfr2gr.s t0, fa0 + + + L(end): + cto.w t0, t0 + add.d a0, a0, t0 + #ifndef AS_STRCHRNUL +- vreplgr2vr.b $vr0, t0 +- xvpermi.q $xr3, $xr2, 1 ++ vreplgr2vr.b vr0, t0 ++ xvpermi.q xr3, xr2, 1 + +- vshuf.b $vr0, $vr3, $vr2, $vr0 +- vpickve2gr.bu t0, $vr0, 0 ++ vshuf.b vr0, vr3, vr2, vr0 ++ vpickve2gr.bu t0, vr0, 0 + masknez a0, a0, t0 + #endif + jr ra +diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S +index 4ad9a4ad..ebeb332e 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S +@@ -16,16 +16,16 @@ + LEAF(STRCHR, 6) + andi t1, a0, 0xf + bstrins.d a0, zero, 3, 0 +- vld $vr0, a0, 0 ++ vld vr0, a0, 0 + li.d t2, -1 + +- vreplgr2vr.b $vr1, a1 ++ vreplgr2vr.b vr1, a1 + sll.d t3, t2, t1 
+- vxor.v $vr2, $vr0, $vr1 +- vmin.bu $vr0, $vr0, $vr2 ++ vxor.v vr2, vr0, vr1 ++ vmin.bu vr0, vr0, vr2 + +- vmsknz.b $vr0, $vr0 +- movfr2gr.s t0, $f0 ++ vmsknz.b vr0, vr0 ++ movfr2gr.s t0, fa0 + ext.w.h t0, t0 + orn t0, t0, t3 + +@@ -34,23 +34,23 @@ L(found): + cto.w t0, t0 + add.d a0, a0, t0 + #ifndef AS_STRCHRNUL +- vreplve.b $vr2, $vr2, t0 +- vpickve2gr.bu t1, $vr2, 0 ++ vreplve.b vr2, vr2, t0 ++ vpickve2gr.bu t1, vr2, 0 + masknez a0, a0, t1 + #endif + jr ra + + + L(loop): +- vld $vr0, a0, 16 ++ vld vr0, a0, 16 + addi.d a0, a0, 16 +- vxor.v $vr2, $vr0, $vr1 +- vmin.bu $vr0, $vr0, $vr2 ++ vxor.v vr2, vr0, vr1 ++ vmin.bu vr0, vr0, vr2 + +- vsetanyeqz.b $fcc0, $vr0 +- bceqz $fcc0, L(loop) +- vmsknz.b $vr0, $vr0 +- movfr2gr.s t0, $f0 ++ vsetanyeqz.b fcc0, vr0 ++ bceqz fcc0, L(loop) ++ vmsknz.b vr0, vr0 ++ movfr2gr.s t0, fa0 + + b L(found) + END(STRCHR) +diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S +index c86e3ecd..c6e1110c 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S +@@ -20,45 +20,45 @@ L(magic_num): + ENTRY_NO_ALIGN(STRCMP) + pcaddi t0, -4 + andi a2, a0, 0xf +- vld $vr2, t0, 0 ++ vld vr2, t0, 0 + andi a3, a1, 0xf + + bne a2, a3, L(unaligned) + bstrins.d a0, zero, 3, 0 + bstrins.d a1, zero, 3, 0 +- vld $vr0, a0, 0 ++ vld vr0, a0, 0 + +- vld $vr1, a1, 0 +- vreplgr2vr.b $vr3, a2 +- vslt.b $vr2, $vr2, $vr3 +- vseq.b $vr3, $vr0, $vr1 ++ vld vr1, a1, 0 ++ vreplgr2vr.b vr3, a2 ++ vslt.b vr2, vr2, vr3 ++ vseq.b vr3, vr0, vr1 + + +- vmin.bu $vr3, $vr0, $vr3 +- vor.v $vr3, $vr3, $vr2 +- vsetanyeqz.b $fcc0, $vr3 +- bcnez $fcc0, L(al_out) ++ vmin.bu vr3, vr0, vr3 ++ vor.v vr3, vr3, vr2 ++ vsetanyeqz.b fcc0, vr3 ++ bcnez fcc0, L(al_out) + + L(al_loop): +- vld $vr0, a0, 16 +- vld $vr1, a1, 16 ++ vld vr0, a0, 16 ++ vld vr1, a1, 16 + addi.d a0, a0, 16 + addi.d a1, a1, 16 + +- vseq.b $vr3, $vr0, $vr1 +- vmin.bu $vr3, $vr0, $vr3 +- vsetanyeqz.b $fcc0, 
$vr3 +- bceqz $fcc0, L(al_loop) ++ vseq.b vr3, vr0, vr1 ++ vmin.bu vr3, vr0, vr3 ++ vsetanyeqz.b fcc0, vr3 ++ bceqz fcc0, L(al_loop) + + L(al_out): +- vseqi.b $vr3, $vr3, 0 +- vfrstpi.b $vr3, $vr3, 0 +- vshuf.b $vr0, $vr0, $vr0, $vr3 +- vshuf.b $vr1, $vr1, $vr1, $vr3 ++ vseqi.b vr3, vr3, 0 ++ vfrstpi.b vr3, vr3, 0 ++ vshuf.b vr0, vr0, vr0, vr3 ++ vshuf.b vr1, vr1, vr1, vr3 + + +- vpickve2gr.bu t0, $vr0, 0 +- vpickve2gr.bu t1, $vr1, 0 ++ vpickve2gr.bu t0, vr0, 0 ++ vpickve2gr.bu t1, vr1, 0 + sub.d a0, t0, t1 + jr ra + +@@ -79,52 +79,52 @@ L(unaligned): + bstrins.d a1, zero, 3, 0 + + +- vld $vr0, a0, 0 +- vld $vr3, a1, 0 +- vreplgr2vr.b $vr4, a2 +- vreplgr2vr.b $vr5, a3 ++ vld vr0, a0, 0 ++ vld vr3, a1, 0 ++ vreplgr2vr.b vr4, a2 ++ vreplgr2vr.b vr5, a3 + +- vslt.b $vr7, $vr2, $vr4 +- vsub.b $vr4, $vr4, $vr5 +- vaddi.bu $vr6, $vr2, 16 +- vsub.b $vr6, $vr6, $vr4 ++ vslt.b vr7, vr2, vr4 ++ vsub.b vr4, vr4, vr5 ++ vaddi.bu vr6, vr2, 16 ++ vsub.b vr6, vr6, vr4 + +- vshuf.b $vr1, $vr3, $vr3, $vr6 +- vseq.b $vr4, $vr0, $vr1 +- vmin.bu $vr4, $vr0, $vr4 +- vor.v $vr4, $vr4, $vr7 ++ vshuf.b vr1, vr3, vr3, vr6 ++ vseq.b vr4, vr0, vr1 ++ vmin.bu vr4, vr0, vr4 ++ vor.v vr4, vr4, vr7 + +- vsetanyeqz.b $fcc0, $vr4 +- bcnez $fcc0, L(un_end) +- vslt.b $vr5, $vr2, $vr5 +- vor.v $vr3, $vr3, $vr5 ++ vsetanyeqz.b fcc0, vr4 ++ bcnez fcc0, L(un_end) ++ vslt.b vr5, vr2, vr5 ++ vor.v vr3, vr3, vr5 + + + L(un_loop): +- vld $vr0, a0, 16 +- vsetanyeqz.b $fcc0, $vr3 +- bcnez $fcc0, L(remaining_end) +- vor.v $vr1, $vr3, $vr3 ++ vld vr0, a0, 16 ++ vsetanyeqz.b fcc0, vr3 ++ bcnez fcc0, L(remaining_end) ++ vor.v vr1, vr3, vr3 + +- vld $vr3, a1, 16 ++ vld vr3, a1, 16 + addi.d a0, a0, 16 + addi.d a1, a1, 16 +- vshuf.b $vr1, $vr3, $vr1, $vr6 ++ vshuf.b vr1, vr3, vr1, vr6 + +- vseq.b $vr4, $vr0, $vr1 +- vmin.bu $vr4, $vr0, $vr4 +- vsetanyeqz.b $fcc0, $vr4 +- bceqz $fcc0, L(un_loop) ++ vseq.b vr4, vr0, vr1 ++ vmin.bu vr4, vr0, vr4 ++ vsetanyeqz.b fcc0, vr4 ++ bceqz fcc0, L(un_loop) + + L(un_end): +- 
vseqi.b $vr4, $vr4, 0 +- vfrstpi.b $vr4, $vr4, 0 +- vshuf.b $vr0, $vr0, $vr0, $vr4 +- vshuf.b $vr1, $vr1, $vr1, $vr4 ++ vseqi.b vr4, vr4, 0 ++ vfrstpi.b vr4, vr4, 0 ++ vshuf.b vr0, vr0, vr0, vr4 ++ vshuf.b vr1, vr1, vr1, vr4 + + +- vpickve2gr.bu t0, $vr0, 0 +- vpickve2gr.bu t1, $vr1, 0 ++ vpickve2gr.bu t0, vr0, 0 ++ vpickve2gr.bu t1, vr1, 0 + sub.d t3, t0, t1 + sub.d t4, t1, t0 + +@@ -134,9 +134,9 @@ L(un_end): + jr ra + + L(remaining_end): +- vshuf.b $vr1, $vr3, $vr3, $vr6 +- vseq.b $vr4, $vr0, $vr1 +- vmin.bu $vr4, $vr4, $vr0 ++ vshuf.b vr1, vr3, vr3, vr6 ++ vseq.b vr4, vr0, vr1 ++ vmin.bu vr4, vr4, vr0 + b L(un_end) + END(STRCMP) + +diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S +index dbc061ad..52d77fa3 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S +@@ -21,61 +21,61 @@ L(magic_num): + ENTRY_NO_ALIGN(STRCPY) + pcaddi t0, -4 + andi a4, a1, 0xf +- vld $vr1, t0, 0 ++ vld vr1, t0, 0 + move a2, a0 + + beqz a4, L(load_start) + xor t0, a1, a4 +- vld $vr0, t0, 0 +- vreplgr2vr.b $vr2, a4 ++ vld vr0, t0, 0 ++ vreplgr2vr.b vr2, a4 + +- vadd.b $vr2, $vr2, $vr1 +- vshuf.b $vr0, $vr2, $vr0, $vr2 +- vsetanyeqz.b $fcc0, $vr0 +- bcnez $fcc0, L(end) ++ vadd.b vr2, vr2, vr1 ++ vshuf.b vr0, vr2, vr0, vr2 ++ vsetanyeqz.b fcc0, vr0 ++ bcnez fcc0, L(end) + + + L(load_start): +- vld $vr0, a1, 0 ++ vld vr0, a1, 0 + li.d t1, 16 + andi a3, a2, 0xf +- vsetanyeqz.b $fcc0, $vr0 ++ vsetanyeqz.b fcc0, vr0 + + sub.d t0, t1, a3 +- bcnez $fcc0, L(end) ++ bcnez fcc0, L(end) + add.d a1, a1, t0 +- vst $vr0, a2, 0 ++ vst vr0, a2, 0 + + andi a3, a1, 0xf + add.d a2, a2, t0 + bnez a3, L(unaligned) +- vld $vr0, a1, 0 ++ vld vr0, a1, 0 + +- vsetanyeqz.b $fcc0, $vr0 +- bcnez $fcc0, L(end) ++ vsetanyeqz.b fcc0, vr0 ++ bcnez fcc0, L(end) + L(loop): +- vst $vr0, a2, 0 +- vld $vr0, a1, 16 ++ vst vr0, a2, 0 ++ vld vr0, a1, 16 + + + addi.d a2, a2, 16 + addi.d a1, a1, 16 +- vsetanyeqz.b 
$fcc0, $vr0 +- bceqz $fcc0, L(loop) ++ vsetanyeqz.b fcc0, vr0 ++ bceqz fcc0, L(loop) + +- vmsknz.b $vr1, $vr0 +- movfr2gr.s t0, $f1 ++ vmsknz.b vr1, vr0 ++ movfr2gr.s t0, fa1 + cto.w t0, t0 + add.d a1, a1, t0 + +- vld $vr0, a1, -15 ++ vld vr0, a1, -15 + add.d a2, a2, t0 +- vst $vr0, a2, -15 ++ vst vr0, a2, -15 + jr ra + + L(end): +- vmsknz.b $vr1, $vr0 +- movfr2gr.s t0, $f1 ++ vmsknz.b vr1, vr0 ++ movfr2gr.s t0, fa1 + cto.w t0, t0 + addi.d t0, t0, 1 + +@@ -83,7 +83,7 @@ L(end): + L(end_16): + andi t1, t0, 16 + beqz t1, L(end_8) +- vst $vr0, a2, 0 ++ vst vr0, a2, 0 + jr ra + + L(end_8): +@@ -93,74 +93,74 @@ L(end_8): + andi t5, t0, 1 + + beqz t2, L(end_4) +- vstelm.d $vr0, a2, 0, 0 ++ vstelm.d vr0, a2, 0, 0 + addi.d a2, a2, 8 +- vbsrl.v $vr0, $vr0, 8 ++ vbsrl.v vr0, vr0, 8 + + L(end_4): + beqz t3, L(end_2) +- vstelm.w $vr0, a2, 0, 0 ++ vstelm.w vr0, a2, 0, 0 + addi.d a2, a2, 4 +- vbsrl.v $vr0, $vr0, 4 ++ vbsrl.v vr0, vr0, 4 + + + L(end_2): + beqz t4, L(end_1) +- vstelm.h $vr0, a2, 0, 0 ++ vstelm.h vr0, a2, 0, 0 + addi.d a2, a2, 2 +- vbsrl.v $vr0, $vr0, 2 ++ vbsrl.v vr0, vr0, 2 + + L(end_1): + beqz t5, L(out) +- vstelm.b $vr0, a2, 0, 0 ++ vstelm.b vr0, a2, 0, 0 + L(out): + jr ra + L(unaligned): + bstrins.d a1, zero, 3, 0 + +- vld $vr2, a1, 0 +- vreplgr2vr.b $vr3, a3 +- vslt.b $vr4, $vr1, $vr3 +- vor.v $vr0, $vr2, $vr4 ++ vld vr2, a1, 0 ++ vreplgr2vr.b vr3, a3 ++ vslt.b vr4, vr1, vr3 ++ vor.v vr0, vr2, vr4 + +- vsetanyeqz.b $fcc0, $vr0 +- bcnez $fcc0, L(un_first_end) +- vld $vr0, a1, 16 +- vadd.b $vr3, $vr3, $vr1 ++ vsetanyeqz.b fcc0, vr0 ++ bcnez fcc0, L(un_first_end) ++ vld vr0, a1, 16 ++ vadd.b vr3, vr3, vr1 + + + addi.d a1, a1, 16 +- vshuf.b $vr4, $vr0, $vr2, $vr3 +- vsetanyeqz.b $fcc0, $vr0 +- bcnez $fcc0, L(un_end) ++ vshuf.b vr4, vr0, vr2, vr3 ++ vsetanyeqz.b fcc0, vr0 ++ bcnez fcc0, L(un_end) + + L(un_loop): +- vor.v $vr2, $vr0, $vr0 +- vld $vr0, a1, 16 +- vst $vr4, a2, 0 ++ vor.v vr2, vr0, vr0 ++ vld vr0, a1, 16 ++ vst vr4, a2, 0 + addi.d a1, a1, 16 + + 
addi.d a2, a2, 16 +- vshuf.b $vr4, $vr0, $vr2, $vr3 +- vsetanyeqz.b $fcc0, $vr0 +- bceqz $fcc0, L(un_loop) ++ vshuf.b vr4, vr0, vr2, vr3 ++ vsetanyeqz.b fcc0, vr0 ++ bceqz fcc0, L(un_loop) + + L(un_end): +- vsetanyeqz.b $fcc0, $vr4 +- bcnez $fcc0, 1f +- vst $vr4, a2, 0 ++ vsetanyeqz.b fcc0, vr4 ++ bcnez fcc0, 1f ++ vst vr4, a2, 0 + 1: +- vmsknz.b $vr1, $vr0 ++ vmsknz.b vr1, vr0 + + +- movfr2gr.s t0, $f1 ++ movfr2gr.s t0, fa1 + cto.w t0, t0 + add.d a1, a1, t0 +- vld $vr0, a1, -15 ++ vld vr0, a1, -15 + + add.d a2, a2, t0 + sub.d a2, a2, a3 +- vst $vr0, a2, 1 ++ vst vr0, a2, 1 + jr ra + + L(un_first_end): +diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S +index fd6c002d..fc25dd50 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S +@@ -17,12 +17,12 @@ LEAF(STRLEN, 6) + move a1, a0 + bstrins.d a0, zero, 4, 0 + li.d t1, -1 +- xvld $xr0, a0, 0 ++ xvld xr0, a0, 0 + +- xvmsknz.b $xr0, $xr0 +- xvpickve.w $xr1, $xr0, 4 +- vilvl.h $vr0, $vr1, $vr0 +- movfr2gr.s t0, $f0 # sign extend ++ xvmsknz.b xr0, xr0 ++ xvpickve.w xr1, xr0, 4 ++ vilvl.h vr0, vr1, vr0 ++ movfr2gr.s t0, fa0 # sign extend + + sra.w t0, t0, a1 + beq t0, t1, L(loop) +@@ -30,18 +30,18 @@ LEAF(STRLEN, 6) + jr ra + + L(loop): +- xvld $xr0, a0, 32 ++ xvld xr0, a0, 32 + addi.d a0, a0, 32 +- xvsetanyeqz.b $fcc0, $xr0 +- bceqz $fcc0, L(loop) ++ xvsetanyeqz.b fcc0, xr0 ++ bceqz fcc0, L(loop) + + +- xvmsknz.b $xr0, $xr0 ++ xvmsknz.b xr0, xr0 + sub.d a0, a0, a1 +- xvpickve.w $xr1, $xr0, 4 +- vilvl.h $vr0, $vr1, $vr0 ++ xvpickve.w xr1, xr0, 4 ++ vilvl.h vr0, vr1, vr0 + +- movfr2gr.s t0, $f0 ++ movfr2gr.s t0, fa0 + cto.w t0, t0 + add.d a0, a0, t0 + jr ra +diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S +index 6f311506..45c3db93 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S +@@ 
-16,15 +16,15 @@ + LEAF(STRLEN, 6) + move a1, a0 + bstrins.d a0, zero, 4, 0 +- vld $vr0, a0, 0 +- vld $vr1, a0, 16 ++ vld vr0, a0, 0 ++ vld vr1, a0, 16 + + li.d t1, -1 +- vmsknz.b $vr0, $vr0 +- vmsknz.b $vr1, $vr1 +- vilvl.h $vr0, $vr1, $vr0 ++ vmsknz.b vr0, vr0 ++ vmsknz.b vr1, vr1 ++ vilvl.h vr0, vr1, vr0 + +- movfr2gr.s t0, $f0 ++ movfr2gr.s t0, fa0 + sra.w t0, t0, a1 + beq t0, t1, L(loop) + cto.w a0, t0 +@@ -36,19 +36,19 @@ LEAF(STRLEN, 6) + + + L(loop): +- vld $vr0, a0, 32 +- vld $vr1, a0, 48 ++ vld vr0, a0, 32 ++ vld vr1, a0, 48 + addi.d a0, a0, 32 +- vmin.bu $vr2, $vr0, $vr1 ++ vmin.bu vr2, vr0, vr1 + +- vsetanyeqz.b $fcc0, $vr2 +- bceqz $fcc0, L(loop) +- vmsknz.b $vr0, $vr0 +- vmsknz.b $vr1, $vr1 ++ vsetanyeqz.b fcc0, vr2 ++ bceqz fcc0, L(loop) ++ vmsknz.b vr0, vr0 ++ vmsknz.b vr1, vr1 + +- vilvl.h $vr0, $vr1, $vr0 ++ vilvl.h vr0, vr1, vr0 + sub.d a0, a0, a1 +- movfr2gr.s t0, $f0 ++ movfr2gr.s t0, fa0 + cto.w t0, t0 + + add.d a0, a0, t0 +diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S +index 2c6f9614..21f3e689 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S +@@ -22,7 +22,7 @@ ENTRY_NO_ALIGN(STRNCMP) + beqz a2, L(ret0) + pcaddi t0, -5 + andi a3, a0, 0xf +- vld $vr2, t0, 0 ++ vld vr2, t0, 0 + + andi a4, a1, 0xf + li.d t2, 16 +@@ -30,57 +30,57 @@ ENTRY_NO_ALIGN(STRNCMP) + xor t0, a0, a3 + + xor t1, a1, a4 +- vld $vr0, t0, 0 +- vld $vr1, t1, 0 +- vreplgr2vr.b $vr3, a3 ++ vld vr0, t0, 0 ++ vld vr1, t1, 0 ++ vreplgr2vr.b vr3, a3 + + + sub.d t2, t2, a3 +- vadd.b $vr3, $vr3, $vr2 +- vshuf.b $vr0, $vr3, $vr0, $vr3 +- vshuf.b $vr1, $vr3, $vr1, $vr3 ++ vadd.b vr3, vr3, vr2 ++ vshuf.b vr0, vr3, vr0, vr3 ++ vshuf.b vr1, vr3, vr1, vr3 + +- vseq.b $vr3, $vr0, $vr1 +- vmin.bu $vr3, $vr0, $vr3 ++ vseq.b vr3, vr0, vr1 ++ vmin.bu vr3, vr0, vr3 + bgeu t2, a2, L(al_early_end) +- vsetanyeqz.b $fcc0, $vr3 ++ vsetanyeqz.b fcc0, vr3 + +- bcnez $fcc0, 
L(al_end) ++ bcnez fcc0, L(al_end) + add.d a3, a0, a2 + addi.d a4, a3, -1 + bstrins.d a4, zero, 3, 0 + + sub.d a2, a3, a4 + L(al_loop): +- vld $vr0, t0, 16 +- vld $vr1, t1, 16 ++ vld vr0, t0, 16 ++ vld vr1, t1, 16 + addi.d t0, t0, 16 + + + addi.d t1, t1, 16 +- vseq.b $vr3, $vr0, $vr1 +- vmin.bu $vr3, $vr0, $vr3 ++ vseq.b vr3, vr0, vr1 ++ vmin.bu vr3, vr0, vr3 + beq t0, a4, L(al_early_end) + +- vsetanyeqz.b $fcc0, $vr3 +- bceqz $fcc0, L(al_loop) ++ vsetanyeqz.b fcc0, vr3 ++ bceqz fcc0, L(al_loop) + L(al_end): +- vseqi.b $vr3, $vr3, 0 +- vfrstpi.b $vr3, $vr3, 0 ++ vseqi.b vr3, vr3, 0 ++ vfrstpi.b vr3, vr3, 0 + +- vshuf.b $vr0, $vr0, $vr0, $vr3 +- vshuf.b $vr1, $vr1, $vr1, $vr3 +- vpickve2gr.bu t0, $vr0, 0 +- vpickve2gr.bu t1, $vr1, 0 ++ vshuf.b vr0, vr0, vr0, vr3 ++ vshuf.b vr1, vr1, vr1, vr3 ++ vpickve2gr.bu t0, vr0, 0 ++ vpickve2gr.bu t1, vr1, 0 + + sub.d a0, t0, t1 + jr ra + L(al_early_end): +- vreplgr2vr.b $vr4, a2 +- vslt.b $vr4, $vr2, $vr4 ++ vreplgr2vr.b vr4, a2 ++ vslt.b vr4, vr2, vr4 + + +- vorn.v $vr3, $vr3, $vr4 ++ vorn.v vr3, vr3, vr4 + b L(al_end) + L(unaligned): + slt a5, a3, a4 +@@ -94,64 +94,64 @@ L(unaligned): + andi a4, a1, 0xf + xor t0, a0, a3 + xor t1, a1, a4 +- vld $vr0, t0, 0 ++ vld vr0, t0, 0 + +- vld $vr3, t1, 0 ++ vld vr3, t1, 0 + sub.d t2, t2, a3 +- vreplgr2vr.b $vr4, a3 +- vreplgr2vr.b $vr5, a4 ++ vreplgr2vr.b vr4, a3 ++ vreplgr2vr.b vr5, a4 + + +- vaddi.bu $vr6, $vr2, 16 +- vsub.b $vr7, $vr4, $vr5 +- vsub.b $vr6, $vr6, $vr7 +- vadd.b $vr4, $vr2, $vr4 ++ vaddi.bu vr6, vr2, 16 ++ vsub.b vr7, vr4, vr5 ++ vsub.b vr6, vr6, vr7 ++ vadd.b vr4, vr2, vr4 + +- vshuf.b $vr1, $vr3, $vr3, $vr6 +- vshuf.b $vr0, $vr7, $vr0, $vr4 +- vshuf.b $vr1, $vr7, $vr1, $vr4 +- vseq.b $vr4, $vr0, $vr1 ++ vshuf.b vr1, vr3, vr3, vr6 ++ vshuf.b vr0, vr7, vr0, vr4 ++ vshuf.b vr1, vr7, vr1, vr4 ++ vseq.b vr4, vr0, vr1 + +- vmin.bu $vr4, $vr0, $vr4 ++ vmin.bu vr4, vr0, vr4 + bgeu t2, a2, L(un_early_end) +- vsetanyeqz.b $fcc0, $vr4 +- bcnez $fcc0, L(un_end) ++ vsetanyeqz.b 
fcc0, vr4 ++ bcnez fcc0, L(un_end) + + add.d a6, a0, a2 +- vslt.b $vr5, $vr2, $vr5 ++ vslt.b vr5, vr2, vr5 + addi.d a7, a6, -1 +- vor.v $vr3, $vr3, $vr5 ++ vor.v vr3, vr3, vr5 + + + bstrins.d a7, zero, 3, 0 + sub.d a2, a6, a7 + L(un_loop): +- vld $vr0, t0, 16 ++ vld vr0, t0, 16 + addi.d t0, t0, 16 + +- vsetanyeqz.b $fcc0, $vr3 +- bcnez $fcc0, L(has_zero) ++ vsetanyeqz.b fcc0, vr3 ++ bcnez fcc0, L(has_zero) + beq t0, a7, L(end_with_len) +- vor.v $vr1, $vr3, $vr3 ++ vor.v vr1, vr3, vr3 + +- vld $vr3, t1, 16 ++ vld vr3, t1, 16 + addi.d t1, t1, 16 +- vshuf.b $vr1, $vr3, $vr1, $vr6 +- vseq.b $vr4, $vr0, $vr1 ++ vshuf.b vr1, vr3, vr1, vr6 ++ vseq.b vr4, vr0, vr1 + +- vmin.bu $vr4, $vr0, $vr4 +- vsetanyeqz.b $fcc0, $vr4 +- bceqz $fcc0, L(un_loop) ++ vmin.bu vr4, vr0, vr4 ++ vsetanyeqz.b fcc0, vr4 ++ bceqz fcc0, L(un_loop) + L(un_end): +- vseqi.b $vr4, $vr4, 0 ++ vseqi.b vr4, vr4, 0 + + +- vfrstpi.b $vr4, $vr4, 0 +- vshuf.b $vr0, $vr0, $vr0, $vr4 +- vshuf.b $vr1, $vr1, $vr1, $vr4 +- vpickve2gr.bu t0, $vr0, 0 ++ vfrstpi.b vr4, vr4, 0 ++ vshuf.b vr0, vr0, vr0, vr4 ++ vshuf.b vr1, vr1, vr1, vr4 ++ vpickve2gr.bu t0, vr0, 0 + +- vpickve2gr.bu t1, $vr1, 0 ++ vpickve2gr.bu t1, vr1, 0 + sub.d t2, t0, t1 + sub.d t3, t1, t0 + masknez t0, t2, a5 +@@ -160,30 +160,30 @@ L(un_end): + or a0, t0, t1 + jr ra + L(has_zero): +- vshuf.b $vr1, $vr3, $vr3, $vr6 ++ vshuf.b vr1, vr3, vr3, vr6 + +- vseq.b $vr4, $vr0, $vr1 +- vmin.bu $vr4, $vr0, $vr4 ++ vseq.b vr4, vr0, vr1 ++ vmin.bu vr4, vr0, vr4 + bne t0, a7, L(un_end) + L(un_early_end): +- vreplgr2vr.b $vr5, a2 ++ vreplgr2vr.b vr5, a2 + +- vslt.b $vr5, $vr2, $vr5 +- vorn.v $vr4, $vr4, $vr5 ++ vslt.b vr5, vr2, vr5 ++ vorn.v vr4, vr4, vr5 + b L(un_end) + L(end_with_len): + sub.d a6, a3, a4 + + bgeu a6, a2, 1f +- vld $vr4, t1, 16 ++ vld vr4, t1, 16 + 1: +- vshuf.b $vr1, $vr4, $vr3, $vr6 +- vseq.b $vr4, $vr0, $vr1 ++ vshuf.b vr1, vr4, vr3, vr6 ++ vseq.b vr4, vr0, vr1 + +- vmin.bu $vr4, $vr0, $vr4 +- vreplgr2vr.b $vr5, a2 +- vslt.b $vr5, $vr2, $vr5 
+- vorn.v $vr4, $vr4, $vr5 ++ vmin.bu vr4, vr0, vr4 ++ vreplgr2vr.b vr5, a2 ++ vslt.b vr5, vr2, vr5 ++ vorn.v vr4, vr4, vr5 + + b L(un_end) + L(ret0): +diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S +index 910b52fe..6410a907 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S +@@ -19,23 +19,23 @@ LEAF(STRNLEN, 6) + li.d t3, 65 + sub.d a2, a0, t1 + +- xvld $xr0, a2, 0 +- xvld $xr1, a2, 32 ++ xvld xr0, a2, 0 ++ xvld xr1, a2, 32 + sub.d t1, t3, t1 + move a3, a0 + + sltu t1, a1, t1 +- xvmsknz.b $xr0, $xr0 +- xvmsknz.b $xr1, $xr1 +- xvpickve.w $xr2, $xr0, 4 ++ xvmsknz.b xr0, xr0 ++ xvmsknz.b xr1, xr1 ++ xvpickve.w xr2, xr0, 4 + +- xvpickve.w $xr3, $xr1, 4 +- vilvl.h $vr0, $vr2, $vr0 +- vilvl.h $vr1, $vr3, $vr1 +- vilvl.w $vr0, $vr1, $vr0 ++ xvpickve.w xr3, xr1, 4 ++ vilvl.h vr0, vr2, vr0 ++ vilvl.h vr1, vr3, vr1 ++ vilvl.w vr0, vr1, vr0 + + +- movfr2gr.d t0, $f0 ++ movfr2gr.d t0, fa0 + sra.d t0, t0, a0 + orn t1, t1, t0 + bnez t1, L(end) +@@ -46,26 +46,26 @@ LEAF(STRNLEN, 6) + bstrins.d a4, zero, 5, 0 + + L(loop): +- xvld $xr0, a0, 64 +- xvld $xr1, a0, 96 ++ xvld xr0, a0, 64 ++ xvld xr1, a0, 96 + addi.d a0, a0, 64 + beq a0, a4, L(out) + +- xvmin.bu $xr2, $xr0, $xr1 +- xvsetanyeqz.b $fcc0, $xr2 +- bceqz $fcc0, L(loop) ++ xvmin.bu xr2, xr0, xr1 ++ xvsetanyeqz.b fcc0, xr2 ++ bceqz fcc0, L(loop) + L(out): +- xvmsknz.b $xr0, $xr0 ++ xvmsknz.b xr0, xr0 + + +- xvmsknz.b $xr1, $xr1 +- xvpickve.w $xr2, $xr0, 4 +- xvpickve.w $xr3, $xr1, 4 +- vilvl.h $vr0, $vr2, $vr0 ++ xvmsknz.b xr1, xr1 ++ xvpickve.w xr2, xr0, 4 ++ xvpickve.w xr3, xr1, 4 ++ vilvl.h vr0, vr2, vr0 + +- vilvl.h $vr1, $vr3, $vr1 +- vilvl.w $vr0, $vr1, $vr0 +- movfr2gr.d t0, $f0 ++ vilvl.h vr1, vr3, vr1 ++ vilvl.w vr0, vr1, vr0 ++ movfr2gr.d t0, fa0 + L(end): + sub.d a0, a0, a3 + +diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S 
+index db0e90ff..9250a0cd 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S +@@ -19,17 +19,17 @@ LEAF(STRNLEN, 6) + li.d t3, 33 + sub.d a2, a0, t1 + +- vld $vr0, a2, 0 +- vld $vr1, a2, 16 ++ vld vr0, a2, 0 ++ vld vr1, a2, 16 + sub.d t1, t3, t1 + move a3, a0 + + sltu t1, a1, t1 +- vmsknz.b $vr0, $vr0 +- vmsknz.b $vr1, $vr1 +- vilvl.h $vr0, $vr1, $vr0 ++ vmsknz.b vr0, vr0 ++ vmsknz.b vr1, vr1 ++ vilvl.h vr0, vr1, vr0 + +- movfr2gr.s t0, $f0 ++ movfr2gr.s t0, fa0 + sra.w t0, t0, a0 + orn t1, t1, t0 + bnez t1, L(end) +@@ -41,20 +41,20 @@ LEAF(STRNLEN, 6) + bstrins.d a4, zero, 4, 0 + + L(loop): +- vld $vr0, a0, 32 +- vld $vr1, a0, 48 ++ vld vr0, a0, 32 ++ vld vr1, a0, 48 + addi.d a0, a0, 32 + beq a0, a4, L(out) + +- vmin.bu $vr2, $vr0, $vr1 +- vsetanyeqz.b $fcc0, $vr2 +- bceqz $fcc0, L(loop) ++ vmin.bu vr2, vr0, vr1 ++ vsetanyeqz.b fcc0, vr2 ++ bceqz fcc0, L(loop) + L(out): +- vmsknz.b $vr0, $vr0 ++ vmsknz.b vr0, vr0 + +- vmsknz.b $vr1, $vr1 +- vilvl.h $vr0, $vr1, $vr0 +- movfr2gr.s t0, $f0 ++ vmsknz.b vr1, vr1 ++ vilvl.h vr0, vr1, vr0 ++ movfr2gr.s t0, fa0 + L(end): + sub.d a0, a0, a3 + +diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S +index 325458ff..990be973 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S +@@ -14,45 +14,45 @@ + LEAF(STRRCHR, 6) + andi t1, a0, 0x3f + bstrins.d a0, zero, 5, 0 +- xvld $xr0, a0, 0 +- xvld $xr1, a0, 32 ++ xvld xr0, a0, 0 ++ xvld xr1, a0, 32 + + li.d t2, -1 +- xvreplgr2vr.b $xr4, a1 ++ xvreplgr2vr.b xr4, a1 + move a2, zero + sll.d t3, t2, t1 + + addi.d a0, a0, 63 +- xvseq.b $xr2, $xr0, $xr4 +- xvseq.b $xr3, $xr1, $xr4 +- xvmsknz.b $xr0, $xr0 ++ xvseq.b xr2, xr0, xr4 ++ xvseq.b xr3, xr1, xr4 ++ xvmsknz.b xr0, xr0 + +- xvmsknz.b $xr1, $xr1 +- xvpickve.w $xr5, $xr0, 4 +- xvpickve.w $xr6, $xr1, 4 +- vilvl.h $vr0, $vr5, $vr0 ++ xvmsknz.b xr1, xr1 ++ 
xvpickve.w xr5, xr0, 4 ++ xvpickve.w xr6, xr1, 4 ++ vilvl.h vr0, vr5, vr0 + + +- vilvl.h $vr1, $vr6, $vr1 +- xvmsknz.b $xr2, $xr2 +- xvmsknz.b $xr3, $xr3 +- xvpickve.w $xr5, $xr2, 4 ++ vilvl.h vr1, vr6, vr1 ++ xvmsknz.b xr2, xr2 ++ xvmsknz.b xr3, xr3 ++ xvpickve.w xr5, xr2, 4 + +- xvpickve.w $xr6, $xr3, 4 +- vilvl.h $vr2, $vr5, $vr2 +- vilvl.h $vr3, $vr6, $vr3 +- vilvl.w $vr0, $vr1, $vr0 ++ xvpickve.w xr6, xr3, 4 ++ vilvl.h vr2, vr5, vr2 ++ vilvl.h vr3, vr6, vr3 ++ vilvl.w vr0, vr1, vr0 + +- vilvl.w $vr1, $vr3, $vr2 +- movfr2gr.d t0, $f0 +- movfr2gr.d t1, $f1 ++ vilvl.w vr1, vr3, vr2 ++ movfr2gr.d t0, fa0 ++ movfr2gr.d t1, fa1 + orn t0, t0, t3 + + and t1, t1, t3 + bne t0, t2, L(end) + L(loop): +- xvld $xr0, a0, 1 +- xvld $xr1, a0, 33 ++ xvld xr0, a0, 1 ++ xvld xr1, a0, 33 + + + clz.d t0, t1 +@@ -62,33 +62,33 @@ L(loop): + + masknez t1, a2, t1 + or a2, t0, t1 +- xvseq.b $xr2, $xr0, $xr4 +- xvseq.b $xr3, $xr1, $xr4 ++ xvseq.b xr2, xr0, xr4 ++ xvseq.b xr3, xr1, xr4 + +- xvmsknz.b $xr2, $xr2 +- xvmsknz.b $xr3, $xr3 +- xvpickve.w $xr5, $xr2, 4 +- xvpickve.w $xr6, $xr3, 4 ++ xvmsknz.b xr2, xr2 ++ xvmsknz.b xr3, xr3 ++ xvpickve.w xr5, xr2, 4 ++ xvpickve.w xr6, xr3, 4 + +- vilvl.h $vr2, $vr5, $vr2 +- vilvl.h $vr3, $vr6, $vr3 +- xvmin.bu $xr5, $xr0, $xr1 +- vilvl.w $vr2, $vr3, $vr2 ++ vilvl.h vr2, vr5, vr2 ++ vilvl.h vr3, vr6, vr3 ++ xvmin.bu xr5, xr0, xr1 ++ vilvl.w vr2, vr3, vr2 + + +- xvsetanyeqz.b $fcc0, $xr5 +- movfr2gr.d t1, $f2 +- bceqz $fcc0, L(loop) +- xvmsknz.b $xr0, $xr0 ++ xvsetanyeqz.b fcc0, xr5 ++ movfr2gr.d t1, fa2 ++ bceqz fcc0, L(loop) ++ xvmsknz.b xr0, xr0 + +- xvmsknz.b $xr1, $xr1 +- xvpickve.w $xr5, $xr0, 4 +- xvpickve.w $xr6, $xr1, 4 +- vilvl.h $vr0, $vr5, $vr0 ++ xvmsknz.b xr1, xr1 ++ xvpickve.w xr5, xr0, 4 ++ xvpickve.w xr6, xr1, 4 ++ vilvl.h vr0, vr5, vr0 + +- vilvl.h $vr1, $vr6, $vr1 +- vilvl.w $vr0, $vr1, $vr0 +- movfr2gr.d t0, $f0 ++ vilvl.h vr1, vr6, vr1 ++ vilvl.w vr0, vr1, vr0 ++ movfr2gr.d t0, fa0 + L(end): + slli.d t3, t2, 1 # shift one more 
for the last '\0' + +diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S +index e082eaab..6aede6ae 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S +@@ -14,35 +14,35 @@ + LEAF(STRRCHR, 6) + andi t1, a0, 0x1f + bstrins.d a0, zero, 4, 0 +- vld $vr0, a0, 0 +- vld $vr1, a0, 16 ++ vld vr0, a0, 0 ++ vld vr1, a0, 16 + +- vreplgr2vr.b $vr4, a1 ++ vreplgr2vr.b vr4, a1 + li.d t2, -1 + move a2, zero + addi.d a0, a0, 31 + +- vseq.b $vr2, $vr0, $vr4 +- vseq.b $vr3, $vr1, $vr4 +- vmsknz.b $vr0, $vr0 +- vmsknz.b $vr1, $vr1 ++ vseq.b vr2, vr0, vr4 ++ vseq.b vr3, vr1, vr4 ++ vmsknz.b vr0, vr0 ++ vmsknz.b vr1, vr1 + +- vmsknz.b $vr2, $vr2 +- vmsknz.b $vr3, $vr3 +- vilvl.h $vr0, $vr1, $vr0 +- vilvl.h $vr1, $vr3, $vr2 ++ vmsknz.b vr2, vr2 ++ vmsknz.b vr3, vr3 ++ vilvl.h vr0, vr1, vr0 ++ vilvl.h vr1, vr3, vr2 + + +- movfr2gr.s t0, $f0 ++ movfr2gr.s t0, fa0 + sll.d t3, t2, t1 +- movfr2gr.s t1, $f1 ++ movfr2gr.s t1, fa1 + orn t0, t0, t3 + + and t1, t1, t3 + bne t0, t2, L(end) + L(loop): +- vld $vr0, a0, 1 +- vld $vr1, a0, 17 ++ vld vr0, a0, 1 ++ vld vr1, a0, 17 + + clz.w t0, t1 + sub.d t0, a0, t0 +@@ -51,23 +51,23 @@ L(loop): + + masknez t1, a2, t1 + or a2, t0, t1 +- vseq.b $vr2, $vr0, $vr4 +- vseq.b $vr3, $vr1, $vr4 ++ vseq.b vr2, vr0, vr4 ++ vseq.b vr3, vr1, vr4 + + +- vmsknz.b $vr2, $vr2 +- vmsknz.b $vr3, $vr3 +- vmin.bu $vr5, $vr0, $vr1 +- vilvl.h $vr2, $vr3, $vr2 ++ vmsknz.b vr2, vr2 ++ vmsknz.b vr3, vr3 ++ vmin.bu vr5, vr0, vr1 ++ vilvl.h vr2, vr3, vr2 + +- vsetanyeqz.b $fcc0, $vr5 +- movfr2gr.s t1, $f2 +- bceqz $fcc0, L(loop) +- vmsknz.b $vr0, $vr0 ++ vsetanyeqz.b fcc0, vr5 ++ movfr2gr.s t1, fa2 ++ bceqz fcc0, L(loop) ++ vmsknz.b vr0, vr0 + +- vmsknz.b $vr1, $vr1 +- vilvl.h $vr0, $vr1, $vr0 +- movfr2gr.s t0, $f0 ++ vmsknz.b vr1, vr1 ++ vilvl.h vr0, vr1, vr0 ++ movfr2gr.s t0, fa0 + L(end): + slli.d t3, t2, 1 # shift one more for the last '\0' + +diff --git 
a/sysdeps/loongarch/lp64/s_cosf.S b/sysdeps/loongarch/lp64/s_cosf.S +index 9fcbe6ca..cb3a4faa 100644 +--- a/sysdeps/loongarch/lp64/s_cosf.S ++++ b/sysdeps/loongarch/lp64/s_cosf.S +@@ -213,9 +213,9 @@ L_even_integer: + fadd.d fa0, fa0, fa1 + fadd.d fa2, fa2, fa3 + fadd.d fa0, fa0, fa2 +- fcmp.sle.d $fcc0, fa0, fa5 ++ fcmp.sle.d fcc0, fa0, fa5 + addi.d t0, t0, 3 +- bcnez $fcc0, L_leq_one ++ bcnez fcc0, L_leq_one + /*L_gt_one:*/ + fld.d fa2, t1, 16 /* 2.0 */ + addi.d t0, t0, 1 +diff --git a/sysdeps/loongarch/lp64/s_sinf.S b/sysdeps/loongarch/lp64/s_sinf.S +index 45d1c4b5..1e77282d 100644 +--- a/sysdeps/loongarch/lp64/s_sinf.S ++++ b/sysdeps/loongarch/lp64/s_sinf.S +@@ -215,9 +215,9 @@ L_even_integer: + fadd.d fa0, fa0, fa1 + fadd.d fa2, fa2, fa3 + fadd.d fa0, fa0, fa2 +- fcmp.sle.d $fcc0, fa0, fa5 ++ fcmp.sle.d fcc0, fa0, fa5 + addi.d t0, t0, 1 +- bcnez $fcc0, L_leq_one ++ bcnez fcc0, L_leq_one + /*L_gt_one:*/ + fld.d fa2, t1, 16 /* 2.0 */ + addi.d t0, t0, 1 +diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h +index 36f00939..b5ee57cf 100644 +--- a/sysdeps/loongarch/sys/regdef.h ++++ b/sysdeps/loongarch/sys/regdef.h +@@ -71,6 +71,14 @@ + # define fs5 $f29 + # define fs6 $f30 + # define fs7 $f31 ++# define fcc0 $fcc0 ++# define fcc1 $fcc1 ++# define fcc2 $fcc2 ++# define fcc3 $fcc3 ++# define fcc4 $fcc4 ++# define fcc5 $fcc5 ++# define fcc6 $fcc6 ++# define fcc7 $fcc7 + + #elif _LOONGARCH_SIM == _ABILP32 + # error ABILP32 not support yet +@@ -78,4 +86,70 @@ + # error noABI + #endif + ++#define vr0 $vr0 ++#define vr1 $vr1 ++#define vr2 $vr2 ++#define vr3 $vr3 ++#define vr4 $vr4 ++#define vr5 $vr5 ++#define vr6 $vr6 ++#define vr7 $vr7 ++#define vr8 $vr8 ++#define vr9 $vr9 ++#define vr10 $vr10 ++#define vr11 $vr11 ++#define vr12 $vr12 ++#define vr13 $vr13 ++#define vr14 $vr14 ++#define vr15 $vr15 ++#define vr16 $vr16 ++#define vr17 $vr17 ++#define vr18 $vr18 ++#define vr19 $vr19 ++#define vr20 $vr20 ++#define vr21 $vr21 ++#define vr22 $vr22 
++#define vr23 $vr23 ++#define vr24 $vr24 ++#define vr25 $vr25 ++#define vr26 $vr26 ++#define vr27 $vr27 ++#define vr28 $vr28 ++#define vr29 $vr29 ++#define vr30 $vr30 ++#define vr31 $vr31 ++ ++#define xr0 $xr0 ++#define xr1 $xr1 ++#define xr2 $xr2 ++#define xr3 $xr3 ++#define xr4 $xr4 ++#define xr5 $xr5 ++#define xr6 $xr6 ++#define xr7 $xr7 ++#define xr8 $xr8 ++#define xr9 $xr9 ++#define xr10 $xr10 ++#define xr11 $xr11 ++#define xr12 $xr12 ++#define xr13 $xr13 ++#define xr14 $xr14 ++#define xr15 $xr15 ++#define xr16 $xr16 ++#define xr17 $xr17 ++#define xr18 $xr18 ++#define xr19 $xr19 ++#define xr20 $xr20 ++#define xr21 $xr21 ++#define xr22 $xr22 ++#define xr23 $xr23 ++#define xr24 $xr24 ++#define xr25 $xr25 ++#define xr26 $xr26 ++#define xr27 $xr27 ++#define xr28 $xr28 ++#define xr29 $xr29 ++#define xr30 $xr30 ++#define xr31 $xr31 ++ + #endif /* _SYS_REGDEF_H */ +-- +2.33.0 + diff --git a/glibc-2.28-Add-new-struct-user_fp_state-in-user.h.patch b/glibc-2.28-Add-new-struct-user_fp_state-in-user.h.patch new file mode 100644 index 0000000..b7ae1ad --- /dev/null +++ b/glibc-2.28-Add-new-struct-user_fp_state-in-user.h.patch @@ -0,0 +1,29 @@ +From dc2d26d52c129c47fa1f16bd0157cd20c6d9a958 Mon Sep 17 00:00:00 2001 +From: caiyinyu +Date: Wed, 21 Jun 2023 11:55:02 +0800 +Subject: [PATCH 08/14] glibc-2.28: Add new struct user_fp_state in user.h + +Change-Id: Idc233cc11c8f76b624dc2891b432f4d02a53cebc +Signed-off-by: ticat_fp +--- + sysdeps/unix/sysv/linux/loongarch/sys/user.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/sysdeps/unix/sysv/linux/loongarch/sys/user.h b/sysdeps/unix/sysv/linux/loongarch/sys/user.h +index f9108350..21e340f6 100644 +--- a/sysdeps/unix/sysv/linux/loongarch/sys/user.h ++++ b/sysdeps/unix/sysv/linux/loongarch/sys/user.h +@@ -28,4 +28,10 @@ struct user_regs_struct + uint64_t reserved[11]; + }; + ++struct user_fp_struct { ++ uint64_t fpr[32]; ++ uint64_t fcc; ++ uint32_t fcsr; ++}; ++ + #endif /* _SYS_USER_H */ +-- +2.33.0 + diff --git 
a/glibc-2.28-Add-run-one-test-convenience-target-and-m.patch b/glibc-2.28-Add-run-one-test-convenience-target-and-m.patch new file mode 100644 index 0000000..7cd6393 --- /dev/null +++ b/glibc-2.28-Add-run-one-test-convenience-target-and-m.patch @@ -0,0 +1,116 @@ +From bbc404e8f6e59aa808642c2a40e24a81744967e3 Mon Sep 17 00:00:00 2001 +From: caiyinyu +Date: Mon, 15 May 2023 12:00:50 +0800 +Subject: [PATCH 04/14] glibc-2.28: Add run-one-test convenience target and + makefile help text + +Reference: + + commit 2ac579f9c25388a7734948d77b03e4dd10f35334 + Author: DJ Delorie + Date: Mon Sep 30 16:04:52 2019 -0400 + + Add run-one-test convenience target and makefile help text + + Adds "make test" for re-running just one test. Also adds + "make help" for help with our Makefile targets, and adds a + mini-help when you just run "make". + + Reviewed-by: Carlos O'Donell + +Change-Id: I8c7ccf9a5ec4dc4afd4901d2f8f693677d0d94ea +Signed-off-by: ticat_fp +--- + Makefile | 22 ++++++++++++++++++++-- + Makefile.help | 42 ++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 62 insertions(+), 2 deletions(-) + create mode 100644 Makefile.help + +diff --git a/Makefile b/Makefile +index 6d73241b..6518f62e 100644 +--- a/Makefile ++++ b/Makefile +@@ -26,8 +26,17 @@ include Makeconfig + + + # This is the default target; it makes everything except the tests. 
+-.PHONY: all +-all: lib others ++.PHONY: all help minihelp ++all: minihelp lib others ++ ++help: ++ @sed '0,/^help-starts-here$$/d' Makefile.help ++ ++minihelp: ++ @echo ++ @echo type \"make help\" for help with common glibc makefile targets ++ @echo ++ + + ifneq ($(AUTOCONF),no) + +@@ -503,3 +512,12 @@ FORCE: + + iconvdata/% localedata/% po/%: FORCE + $(MAKE) $(PARALLELMFLAGS) -C $(@D) $(@F) ++ ++# Convenience target to rerun one test, from the top of the build tree ++# Example: make test t=wcsmbs/test-wcsnlen ++.PHONY: test ++test : ++ @-rm -f $(objpfx)$t.out ++ $(MAKE) subdir=$(patsubst %/,%,$(dir $t)) -C $(dir $t) ..=../ $(objpfx)$t.out ++ @cat $(objpfx)$t.test-result ++ @cat $(objpfx)$t.out +diff --git a/Makefile.help b/Makefile.help +new file mode 100644 +index 00000000..319fdaa1 +--- /dev/null ++++ b/Makefile.help +@@ -0,0 +1,42 @@ ++# Copyright (C) 2019 Free Software Foundation, Inc. ++# This file is part of the GNU C Library. ++ ++# The GNU C Library is free software; you can redistribute it and/or ++# modify it under the terms of the GNU Lesser General Public ++# License as published by the Free Software Foundation; either ++# version 2.1 of the License, or (at your option) any later version. ++ ++# The GNU C Library is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# Lesser General Public License for more details. ++ ++# You should have received a copy of the GNU Lesser General Public ++# License along with the GNU C Library; if not, see ++# <https://www.gnu.org/licenses/>. ++ ++This is the file that gets printed when the user runs "make help", ++starting just after the "help-starts-here" line. ++ ++help-starts-here ++ ++all ++ The usual default; builds everything but doesn't run the ++ tests. ++ ++check (or tests) ++ Runs the standard set of tests. ++ ++test ++ Runs one test. 
Use like this: ++ make test t=wcsmbs/test-wcsnlen ++ Note that this will rebuild the test if needed, but will not ++ rebuild what "make all" would have rebuilt. ++ ++-- ++Other useful hints: ++ ++builddir$ rm testroot.pristine/install.stamp ++ Forces the testroot to be reinstalled the next time you run ++ the testsuite (or just rm -rf testroot.pristine) ++ +-- +2.33.0 + diff --git a/glibc-2.28-Fix-ifunc-str-mem-functions-xfail-problem.patch b/glibc-2.28-Fix-ifunc-str-mem-functions-xfail-problem.patch new file mode 100644 index 0000000..ff87ba3 --- /dev/null +++ b/glibc-2.28-Fix-ifunc-str-mem-functions-xfail-problem.patch @@ -0,0 +1,162 @@ +From 647a0a28e5c9aed2f1fa59bbb7595133e7a4e62f Mon Sep 17 00:00:00 2001 +From: caiyinyu +Date: Mon, 24 Apr 2023 18:09:55 +0800 +Subject: [PATCH 03/14] glibc-2.28: Fix ifunc str/mem functions xfail problems. + +Change-Id: Ibff4229fcfef23c0b19fb94b21a4d17b49eceec6 +Signed-off-by: ticat_fp +--- + .../lp64/multiarch/ifunc-impl-list.c | 76 +++++++++---------- + 1 file changed, 38 insertions(+), 38 deletions(-) + +diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c +index c2b6bbf7..fdeae797 100644 +--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c +@@ -36,105 +36,105 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + size_t i = 0; + + IFUNC_IMPL (i, name, memcpy, +- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_lasx) +- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_lsx) ++ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx) ++ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LSX, __memcpy_lsx) ++ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_UAL, __memcpy_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_aligned) +- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_unaligned) + ) + + IFUNC_IMPL (i, name, memmove, +- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_lasx) +- 
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_lsx) ++ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_LASX, __memmove_lasx) ++ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_LSX, __memmove_lsx) ++ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_UAL, __memmove_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_aligned) +- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_unaligned) + ) + + IFUNC_IMPL (i, name, memset, +- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_lasx) +- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_lsx) ++ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LASX, __memset_lasx) ++ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LSX, __memset_lsx) ++ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_UAL, __memset_unaligned) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned) +- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_unaligned) + ) + + IFUNC_IMPL (i, name, memchr, +- IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_lasx) +- IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_lsx) ++ IFUNC_IMPL_ADD (array, i, memchr, SUPPORT_LASX, __memchr_lasx) ++ IFUNC_IMPL_ADD (array, i, memchr, SUPPORT_LSX, __memchr_lsx) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_aligned) + ) + + IFUNC_IMPL (i, name, memrchr, +- IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_lasx) +- IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_lsx) ++ IFUNC_IMPL_ADD (array, i, memrchr, SUPPORT_LASX, __memrchr_lasx) ++ IFUNC_IMPL_ADD (array, i, memrchr, SUPPORT_LSX, __memrchr_lsx) + IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_generic) + ) + + IFUNC_IMPL (i, name, memcmp, +- IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_lasx) +- IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_lsx) ++ IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LASX, __memcmp_lasx) ++ IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LSX, __memcmp_lsx) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_aligned) + ) + + IFUNC_IMPL (i, name, rawmemchr, +- IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_lasx) +- IFUNC_IMPL_ADD (array, i, rawmemchr, 1, 
__rawmemchr_lsx) ++ IFUNC_IMPL_ADD (array, i, rawmemchr, SUPPORT_LASX, __rawmemchr_lasx) ++ IFUNC_IMPL_ADD (array, i, rawmemchr, SUPPORT_LSX, __rawmemchr_lsx) + IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_aligned) + ) + + IFUNC_IMPL (i, name, strchr, +- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_lasx) +- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_lsx) ++ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LASX, __strchr_lasx) ++ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LSX, __strchr_lsx) ++ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_UAL, __strchr_unaligned) + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_aligned) +- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_unaligned) + ) + + IFUNC_IMPL (i, name, strrchr, +- IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_lasx) +- IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_lsx) ++ IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LASX, __strrchr_lasx) ++ IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LSX, __strrchr_lsx) + IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_aligned) + ) + + IFUNC_IMPL (i, name, strlen, +- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_lasx) +- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_lsx) ++ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, __strlen_lasx) ++ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx) ++ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_UAL, __strlen_unaligned) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned) +- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_unaligned) + ) + + IFUNC_IMPL (i, name, strnlen, +- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_lasx) +- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_lsx) ++ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LASX, __strnlen_lasx) ++ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LSX, __strnlen_lsx) ++ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_UAL, __strnlen_unaligned) + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_aligned) +- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_unaligned) + ) + + IFUNC_IMPL (i, name, 
strchrnul, +- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_lasx) +- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_lsx) ++ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_LASX, __strchrnul_lasx) ++ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_LSX, __strchrnul_lsx) ++ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_UAL, __strchrnul_unaligned) + IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_aligned) +- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_unaligned) + ) + + IFUNC_IMPL (i, name, strncmp, +- IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_lsx) ++ IFUNC_IMPL_ADD (array, i, strncmp, SUPPORT_LSX, __strncmp_lsx) ++ IFUNC_IMPL_ADD (array, i, strncmp, SUPPORT_UAL, __strncmp_unaligned) + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_aligned) +- IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_unaligned) + ) + + IFUNC_IMPL (i, name, strcpy, +- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_lsx) ++ IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_LSX, __strcpy_lsx) ++ IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_UAL, __strcpy_unaligned) + IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_aligned) +- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_unaligned) + ) + + IFUNC_IMPL (i, name, stpcpy, +- IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_lsx) ++ IFUNC_IMPL_ADD (array, i, stpcpy, SUPPORT_LSX, __stpcpy_lsx) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_aligned) + ) + + IFUNC_IMPL (i, name, strcmp, +- IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_lsx) ++ IFUNC_IMPL_ADD (array, i, strcmp, SUPPORT_LSX, __strcmp_lsx) ++ IFUNC_IMPL_ADD (array, i, strcmp, SUPPORT_UAL, __strcmp_unaligned) + IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_aligned) +- IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_unaligned) + ) + + return i; +-- +2.33.0 + diff --git a/glibc-2.28-Redefine-macro-LEAF-ENTRY.patch b/glibc-2.28-Redefine-macro-LEAF-ENTRY.patch new file mode 100644 index 0000000..42b4200 --- /dev/null +++ b/glibc-2.28-Redefine-macro-LEAF-ENTRY.patch @@ -0,0 +1,57 @@ +From 
00537d6945e71af8c9b0b1e7c2695f6a9a1ef1f5 Mon Sep 17 00:00:00 2001 +From: caiyinyu +Date: Sun, 25 Jun 2023 16:23:25 +0800 +Subject: [PATCH 09/14] glibc-2.28: Redefine macro LEAF/ENTRY. + + The following usage of macro LEAF/ENTRY are all feasible: + 1. LEAF(fcn) -- the align value of fcn is .align 3 (default value) + 2. LEAF(fcn, 6) -- the align value of fcn is .align 6 + +Change-Id: Ie3df4df8dba5259b665bd0e4702aaab0a09a5f65 +Signed-off-by: ticat_fp +--- + sysdeps/loongarch/sys/asm.h | 15 ++++++++++----- + 1 file changed, 10 insertions(+), 5 deletions(-) + +diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h +index 357a5ba3..734e45ae 100644 +--- a/sysdeps/loongarch/sys/asm.h ++++ b/sysdeps/loongarch/sys/asm.h +@@ -26,16 +26,21 @@ + #endif + + +-/* Declare leaf routine. */ +-#define LEAF(symbol, aln) \ ++/* Declare leaf routine. ++ The usage of macro LEAF/ENTRY is as follows: ++ 1. LEAF(fcn) -- the align value of fcn is .align 3 (default value) ++ 2. LEAF(fcn, 6) -- the align value of fcn is .align 6 ++*/ ++#define LEAF_IMPL(symbol, aln, ...) \ + .text; \ + .globl symbol; \ + .align aln; \ + .type symbol, @function; \ + symbol: \ +- cfi_startproc; \ ++ cfi_startproc; + +-# define ENTRY(symbol, aln) LEAF(symbol, aln) ++#define LEAF(...) LEAF_IMPL(__VA_ARGS__, 3) ++#define ENTRY(...) LEAF(__VA_ARGS__) + + #define LEAF_NO_ALIGN(symbol) \ + .text; \ +@@ -44,7 +49,7 @@ symbol: \ + symbol: \ + cfi_startproc; + +-# define ENTRY_NO_ALIGN(symbol) LEAF_NO_ALIGN(symbol) ++#define ENTRY_NO_ALIGN(symbol) LEAF_NO_ALIGN(symbol) + + /* Mark end of function. 
*/ + #undef END +-- +2.33.0 + diff --git a/glibc-2.28-Refactor-code-and-fix-bug-in-_dl_runtime_.patch b/glibc-2.28-Refactor-code-and-fix-bug-in-_dl_runtime_.patch new file mode 100644 index 0000000..075149b --- /dev/null +++ b/glibc-2.28-Refactor-code-and-fix-bug-in-_dl_runtime_.patch @@ -0,0 +1,306 @@ +From 27a004c9777340afd86fc0d129f6ffad508bf090 Mon Sep 17 00:00:00 2001 +From: caiyinyu +Date: Tue, 11 Jul 2023 16:09:55 +0800 +Subject: [PATCH 12/14] glibc-2.28: Refactor code and fix bug in + _dl_runtime_resolve. + +Change-Id: I4907e6643ef25b87d7862e957ce9bf6d201da816 +Signed-off-by: ticat_fp +--- + sysdeps/loongarch/dl-machine.h | 8 +- + sysdeps/loongarch/dl-trampoline.S | 7 ++ + sysdeps/loongarch/dl-trampoline.h | 159 +++++++++++++----------------- + sysdeps/loongarch/sys/asm.h | 9 ++ + 4 files changed, 90 insertions(+), 93 deletions(-) + +diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h +index 6e9c6258..ff520a07 100644 +--- a/sysdeps/loongarch/dl-machine.h ++++ b/sysdeps/loongarch/dl-machine.h +@@ -381,9 +381,13 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], + /* If using PLTs, fill in the first two entries of .got.plt. */ + if (l->l_info[DT_JMPREL]) + { +- extern void _dl_runtime_resolve (void) __attribute__ ((visibility ("hidden"))); ++ ++#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float + extern void _dl_runtime_resolve_lasx (void) __attribute__ ((visibility ("hidden"))); + extern void _dl_runtime_resolve_lsx (void) __attribute__ ((visibility ("hidden"))); ++#endif ++ extern void _dl_runtime_resolve (void) __attribute__ ((visibility ("hidden"))); ++ + ElfW(Addr) *gotplt = (ElfW(Addr) *) D_PTR (l, l_info[DT_PLTGOT]); + /* If a library is prelinked but we have to relocate anyway, + we have to be able to undo the prelinking of .got.plt. 
+@@ -391,11 +395,13 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], + if (gotplt[1]) + l->l_mach.plt = gotplt[1] + l->l_addr; + ++#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float + if (SUPPORT_LASX) + gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx; + else if (SUPPORT_LSX) + gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx; + else ++#endif + gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve; + + gotplt[1] = (ElfW(Addr)) l; +diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S +index 5f627a63..78d741f3 100644 +--- a/sysdeps/loongarch/dl-trampoline.S ++++ b/sysdeps/loongarch/dl-trampoline.S +@@ -16,16 +16,23 @@ + License along with the GNU C Library. If not, see + . */ + ++#include ++#include ++ ++#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float + #define USE_LASX + #define _dl_runtime_resolve _dl_runtime_resolve_lasx + #include "dl-trampoline.h" ++#undef FRAME_SIZE + #undef USE_LASX + #undef _dl_runtime_resolve + + #define USE_LSX + #define _dl_runtime_resolve _dl_runtime_resolve_lsx + #include "dl-trampoline.h" ++#undef FRAME_SIZE + #undef USE_LSX + #undef _dl_runtime_resolve ++#endif + + #include "dl-trampoline.h" +diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h +index 96f41f1d..9a6d9b6c 100644 +--- a/sysdeps/loongarch/dl-trampoline.h ++++ b/sysdeps/loongarch/dl-trampoline.h +@@ -17,31 +17,24 @@ + License along with the GNU C Library. If not, see + . */ + +-#include +-#include +- + /* Assembler veneer called from the PLT header code for lazy loading. + The PLT header passes its own args in t0-t2. 
*/ +- +-#ifdef __loongarch_soft_float +-# define FRAME_SIZE (-((-10 * SZREG) & ALMASK)) ++#ifdef USE_LASX ++# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG - 8 * SZXREG) & ALMASK)) ++#elif defined USE_LSX ++# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG - 8 * SZVREG) & ALMASK)) ++#elif !defined __loongarch_soft_float ++# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG) & ALMASK)) + #else +-# define FRAME_SIZE (-((-10 * SZREG - 8 * 256) & ALMASK)) ++# define FRAME_SIZE (-((-9 * SZREG) & ALMASK)) + #endif + + ENTRY (_dl_runtime_resolve, 3) +- # Save arguments to stack. +- +-#ifdef __loongarch64 +- li.d t3, -FRAME_SIZE +- add.d sp, sp, t3 +-#elif defined __loongarch32 +- li.w t3, -FRAME_SIZE +- add.w sp, sp, t3 +-#endif + ++ /* Save arguments to stack. */ ++ ADDI sp, sp, -FRAME_SIZE + +- REG_S ra, sp, 9*SZREG ++ REG_S ra, sp, 0*SZREG + REG_S a0, sp, 1*SZREG + REG_S a1, sp, 2*SZREG + REG_S a2, sp, 3*SZREG +@@ -51,55 +44,45 @@ ENTRY (_dl_runtime_resolve, 3) + REG_S a6, sp, 7*SZREG + REG_S a7, sp, 8*SZREG + +-#ifndef __loongarch_soft_float +- FREG_S fa0, sp, 10*SZREG + 0*SZFREG +- FREG_S fa1, sp, 10*SZREG + 1*SZFREG +- FREG_S fa2, sp, 10*SZREG + 2*SZFREG +- FREG_S fa3, sp, 10*SZREG + 3*SZFREG +- FREG_S fa4, sp, 10*SZREG + 4*SZFREG +- FREG_S fa5, sp, 10*SZREG + 5*SZFREG +- FREG_S fa6, sp, 10*SZREG + 6*SZFREG +- FREG_S fa7, sp, 10*SZREG + 7*SZFREG + #ifdef USE_LASX +- xvst xr0, sp, 10*SZREG + 0*256 +- xvst xr1, sp, 10*SZREG + 1*256 +- xvst xr2, sp, 10*SZREG + 2*256 +- xvst xr3, sp, 10*SZREG + 3*256 +- xvst xr4, sp, 10*SZREG + 4*256 +- xvst xr5, sp, 10*SZREG + 5*256 +- xvst xr6, sp, 10*SZREG + 6*256 +- xvst xr7, sp, 10*SZREG + 7*256 ++ xvst xr0, sp, 9*SZREG + 8*SZFREG + 0*SZXREG ++ xvst xr1, sp, 9*SZREG + 8*SZFREG + 1*SZXREG ++ xvst xr2, sp, 9*SZREG + 8*SZFREG + 2*SZXREG ++ xvst xr3, sp, 9*SZREG + 8*SZFREG + 3*SZXREG ++ xvst xr4, sp, 9*SZREG + 8*SZFREG + 4*SZXREG ++ xvst xr5, sp, 9*SZREG + 8*SZFREG + 5*SZXREG ++ xvst xr6, sp, 9*SZREG + 8*SZFREG + 6*SZXREG ++ xvst xr7, sp, 
9*SZREG + 8*SZFREG + 7*SZXREG + #elif defined USE_LSX +- vst vr0, sp, 10*SZREG + 0*128 +- vst vr1, sp, 10*SZREG + 1*128 +- vst vr2, sp, 10*SZREG + 2*128 +- vst vr3, sp, 10*SZREG + 3*128 +- vst vr4, sp, 10*SZREG + 4*128 +- vst vr5, sp, 10*SZREG + 5*128 +- vst vr6, sp, 10*SZREG + 6*128 +- vst vr7, sp, 10*SZREG + 7*128 +-#endif ++ vst vr0, sp, 9*SZREG + 8*SZFREG + 0*SZVREG ++ vst vr1, sp, 9*SZREG + 8*SZFREG + 1*SZVREG ++ vst vr2, sp, 9*SZREG + 8*SZFREG + 2*SZVREG ++ vst vr3, sp, 9*SZREG + 8*SZFREG + 3*SZVREG ++ vst vr4, sp, 9*SZREG + 8*SZFREG + 4*SZVREG ++ vst vr5, sp, 9*SZREG + 8*SZFREG + 5*SZVREG ++ vst vr6, sp, 9*SZREG + 8*SZFREG + 6*SZVREG ++ vst vr7, sp, 9*SZREG + 8*SZFREG + 7*SZVREG ++#elif !defined __loongarch_soft_float ++ FREG_S fa0, sp, 9*SZREG + 0*SZFREG ++ FREG_S fa1, sp, 9*SZREG + 1*SZFREG ++ FREG_S fa2, sp, 9*SZREG + 2*SZFREG ++ FREG_S fa3, sp, 9*SZREG + 3*SZFREG ++ FREG_S fa4, sp, 9*SZREG + 4*SZFREG ++ FREG_S fa5, sp, 9*SZREG + 5*SZFREG ++ FREG_S fa6, sp, 9*SZREG + 6*SZFREG ++ FREG_S fa7, sp, 9*SZREG + 7*SZFREG + #endif + +- # Update .got.plt and obtain runtime address of callee. +-#ifdef __loongarch64 +- slli.d a1, t1, 1 ++ /* Update .got.plt and obtain runtime address of callee */ ++ SLLI a1, t1, 1 + or a0, t0, zero +- add.d a1, a1, t1 ++ ADD a1, a1, t1 + la a2, _dl_fixup + jirl ra, a2, 0 + or t1, v0, zero +-#elif defined __loongarch32 +- slli.w a1, t1, 1 +- or a0, t0, zero +- add.w a1, a1, t1 +- la a2, _dl_fixup +- jirl ra, a2, 0 +- or t1, v0, zero +-#endif + +- # Restore arguments from stack. +- REG_L ra, sp, 9*SZREG ++ /* Restore arguments from stack. 
*/ ++ REG_L ra, sp, 0*SZREG + REG_L a0, sp, 1*SZREG + REG_L a1, sp, 2*SZREG + REG_L a2, sp, 3*SZREG +@@ -109,45 +92,37 @@ ENTRY (_dl_runtime_resolve, 3) + REG_L a6, sp, 7*SZREG + REG_L a7, sp, 8*SZREG + +-#ifndef __loongarch_soft_float +- FREG_L fa0, sp, 10*SZREG + 0*SZFREG +- FREG_L fa1, sp, 10*SZREG + 1*SZFREG +- FREG_L fa2, sp, 10*SZREG + 2*SZFREG +- FREG_L fa3, sp, 10*SZREG + 3*SZFREG +- FREG_L fa4, sp, 10*SZREG + 4*SZFREG +- FREG_L fa5, sp, 10*SZREG + 5*SZFREG +- FREG_L fa6, sp, 10*SZREG + 6*SZFREG +- FREG_L fa7, sp, 10*SZREG + 7*SZFREG + #ifdef USE_LASX +- xvld xr0, sp, 10*SZREG + 0*256 +- xvld xr1, sp, 10*SZREG + 1*256 +- xvld xr2, sp, 10*SZREG + 2*256 +- xvld xr3, sp, 10*SZREG + 3*256 +- xvld xr4, sp, 10*SZREG + 4*256 +- xvld xr5, sp, 10*SZREG + 5*256 +- xvld xr6, sp, 10*SZREG + 6*256 +- xvld xr7, sp, 10*SZREG + 7*256 ++ xvld xr0, sp, 9*SZREG + 8*SZFREG + 0*SZXREG ++ xvld xr1, sp, 9*SZREG + 8*SZFREG + 1*SZXREG ++ xvld xr2, sp, 9*SZREG + 8*SZFREG + 2*SZXREG ++ xvld xr3, sp, 9*SZREG + 8*SZFREG + 3*SZXREG ++ xvld xr4, sp, 9*SZREG + 8*SZFREG + 4*SZXREG ++ xvld xr5, sp, 9*SZREG + 8*SZFREG + 5*SZXREG ++ xvld xr6, sp, 9*SZREG + 8*SZFREG + 6*SZXREG ++ xvld xr7, sp, 9*SZREG + 8*SZFREG + 7*SZXREG + #elif defined USE_LSX +- vld vr0, sp, 10*SZREG + 0*128 +- vld vr1, sp, 10*SZREG + 1*128 +- vld vr2, sp, 10*SZREG + 2*128 +- vld vr3, sp, 10*SZREG + 3*128 +- vld vr4, sp, 10*SZREG + 4*128 +- vld vr5, sp, 10*SZREG + 5*128 +- vld vr6, sp, 10*SZREG + 6*128 +- vld vr7, sp, 10*SZREG + 7*128 +-#endif +-#endif +- +-#ifdef __loongarch64 +- li.d t3, FRAME_SIZE +- add.d sp, sp, t3 +-#elif defined __loongarch32 +- li.w t3, FRAME_SIZE +- addi.w sp, sp, FRAME_SIZE ++ vld vr0, sp, 9*SZREG + 8*SZFREG + 0*SZVREG ++ vld vr1, sp, 9*SZREG + 8*SZFREG + 1*SZVREG ++ vld vr2, sp, 9*SZREG + 8*SZFREG + 2*SZVREG ++ vld vr3, sp, 9*SZREG + 8*SZFREG + 3*SZVREG ++ vld vr4, sp, 9*SZREG + 8*SZFREG + 4*SZVREG ++ vld vr5, sp, 9*SZREG + 8*SZFREG + 5*SZVREG ++ vld vr6, sp, 9*SZREG + 8*SZFREG + 6*SZVREG ++ vld 
vr7, sp, 9*SZREG + 8*SZFREG + 7*SZVREG ++#elif !defined __loongarch_soft_float ++ FREG_L fa0, sp, 9*SZREG + 0*SZFREG ++ FREG_L fa1, sp, 9*SZREG + 1*SZFREG ++ FREG_L fa2, sp, 9*SZREG + 2*SZFREG ++ FREG_L fa3, sp, 9*SZREG + 3*SZFREG ++ FREG_L fa4, sp, 9*SZREG + 4*SZFREG ++ FREG_L fa5, sp, 9*SZREG + 5*SZFREG ++ FREG_L fa6, sp, 9*SZREG + 6*SZFREG ++ FREG_L fa7, sp, 9*SZREG + 7*SZFREG + #endif + ++ ADDI sp, sp, FRAME_SIZE + +- # Invoke the callee. ++ /* Invoke the callee. */ + jirl zero, t1, 0 + END (_dl_runtime_resolve) +diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h +index 734e45ae..e80c6245 100644 +--- a/sysdeps/loongarch/sys/asm.h ++++ b/sysdeps/loongarch/sys/asm.h +@@ -9,8 +9,17 @@ + # define PTRLOG 3 + # define SZREG 8 + # define SZFREG 8 ++# define SZVREG 16 ++# define SZXREG 32 + # define REG_L ld.d + # define REG_S st.d ++# define SRLI srli.d ++# define SLLI slli.d ++# define ADDI addi.d ++# define ADD add.d ++# define SUB sub.d ++# define BSTRINS bstrins.d ++# define LI li.d + # define FREG_L fld.d + # define FREG_S fst.d + #elif defined __loongarch32 +-- +2.33.0 + diff --git a/glibc-2.28-Refactor-code-of-raw-mem-functions.patch b/glibc-2.28-Refactor-code-of-raw-mem-functions.patch new file mode 100644 index 0000000..0db95f8 --- /dev/null +++ b/glibc-2.28-Refactor-code-of-raw-mem-functions.patch @@ -0,0 +1,3031 @@ +From 4879bd4e0aff7d884d9b026b6081a0e8cffc491c Mon Sep 17 00:00:00 2001 +From: caiyinyu +Date: Wed, 21 Jun 2023 09:30:54 +0800 +Subject: [PATCH 06/14] glibc-2.28: Refactor code of {raw,}mem* functions. 
+ +Change-Id: Icafaf6bc8216f48be64cf25a40b9fe28ce127914 +Signed-off-by: ticat_fp +--- + sysdeps/loongarch/lp64/memchr.S | 92 -- + sysdeps/loongarch/lp64/memcmp.S | 280 ------ + sysdeps/loongarch/lp64/memcpy.S | 804 ------------------ + sysdeps/loongarch/lp64/memmove.S | 2 - + sysdeps/loongarch/lp64/memset.S | 166 ---- + .../loongarch/lp64/multiarch/memchr-aligned.S | 91 +- + .../loongarch/lp64/multiarch/memcmp-aligned.S | 282 +++++- + .../loongarch/lp64/multiarch/memcpy-aligned.S | 799 ++++++++++++++++- + .../loongarch/lp64/multiarch/memset-aligned.S | 166 +++- + .../lp64/multiarch/rawmemchr-aligned.S | 110 ++- + sysdeps/loongarch/lp64/rawmemchr.S | 113 --- + 11 files changed, 1438 insertions(+), 1467 deletions(-) + delete mode 100644 sysdeps/loongarch/lp64/memchr.S + delete mode 100644 sysdeps/loongarch/lp64/memcmp.S + delete mode 100644 sysdeps/loongarch/lp64/memcpy.S + delete mode 100644 sysdeps/loongarch/lp64/memmove.S + delete mode 100644 sysdeps/loongarch/lp64/memset.S + delete mode 100644 sysdeps/loongarch/lp64/rawmemchr.S + +diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S +deleted file mode 100644 +index 23f1fd13..00000000 +--- a/sysdeps/loongarch/lp64/memchr.S ++++ /dev/null +@@ -1,92 +0,0 @@ +-#ifdef _LIBC +-#include +-#include +-#include +-#else +-#include +-#include +-#endif +- +-#ifndef MEMCHR_NAME +-#define MEMCHR_NAME memchr +-#endif +- +-LEAF(MEMCHR_NAME, 6) +- beqz a2, L(out) +- andi t1, a0, 0x7 +- lu12i.w a3, 0x01010 +- sub.d a5, a0, t1 +- +- bstrins.d a1, a1, 15, 8 +- ld.d t0, a5, 0 +- slli.d t2, t1, 3 +- ori a3, a3, 0x101 +- +- bstrins.d a1, a1, 31, 16 +- li.w t7, -1 +- li.w t8, 9 +- bstrins.d a3, a3, 63, 32 +- +- srl.d t3, t7, t2 +- bstrins.d a1, a1, 63, 32 +- sub.d t4, t8, t1 +- orn t3, a1, t3 +- +- srl.d t0, t0, t2 +- slli.d a4, a3, 7 # 0x8080808080808080 +- sltu t4, a2, t4 +- xor t2, t0, t3 +- +- sub.d a6, t2, a3 +- andn a7, a4, t2 +- and t2, a6, a7 +- or t3, t2, t4 +- +- bnez t3, L(count_pos) +- addi.d a2, a2, 
-8 +- addi.d a0, a5, 8 +- add.d a2, a2, t1 +- +-L(loop): +- ld.d t0, a0, 0 +- sltui t4, a2, 9 +- xor t2, t0, a1 +- sub.d a6, t2, a3 +- +- andn a7, a4, t2 +- and t2, a6, a7 +- or t3, t2, t4 +- bnez t3, L(count_pos) +- +- ld.d t1, a0, 8 +- addi.d a0, a0, 16 +- sltui t4, a2, 17 +- xor t2, t1, a1 +- +- sub.d a6, t2, a3 +- andn a7, a4, t2 +- and t2, a6, a7 +- addi.d a2, a2, -16 +- +- or t3, t2, t4 +- beqz t3, L(loop) +- addi.d a0, a0, -8 +- addi.d a2, a2, 8 +- +-L(count_pos): +- ctz.d t0, t2 +- srli.d t0, t0, 3 +- sltu t1, t0, a2 +- add.d a0, a0, t0 +- +- maskeqz a0, a0, t1 +- jr ra +- +-L(out): +- move a0, zero +- jr ra +-END(MEMCHR_NAME) +- +-#ifdef _LIBC +-libc_hidden_builtin_def (MEMCHR_NAME) +-#endif +diff --git a/sysdeps/loongarch/lp64/memcmp.S b/sysdeps/loongarch/lp64/memcmp.S +deleted file mode 100644 +index 457a4dc7..00000000 +--- a/sysdeps/loongarch/lp64/memcmp.S ++++ /dev/null +@@ -1,280 +0,0 @@ +-#ifdef _LIBC +-#include +-#include +-#include +-#else +-#include +-#include +-#endif +- +-#ifndef MEMCMP_NAME +-#define MEMCMP_NAME memcmp +-#endif +- +-LEAF(MEMCMP_NAME, 6) +- beqz a2, L(ret) +- andi a4, a1, 0x7 +- andi a3, a0, 0x7 +- sltu a5, a4, a3 +- +- xor t0, a0, a1 +- li.w t8, 8 +- maskeqz t0, t0, a5 +- li.w t7, -1 +- +- xor a0, a0, t0 // a0 hold smaller one +- xor a1, a1, t0 // a1 hold larger one +- andi a3, a0, 0x7 // a3 hold small offset +- andi a4, a1, 0x7 // a4 hold larger offset +- +- xor a0, a0, a3 +- xor a1, a1, a4 +- ld.d t2, a0, 0 // t2 = "fedcbaXX" +- ld.d t1, a1, 0 // t1 = "54321YYY" +- +- slli.d t3, a3, 3 +- slli.d t4, a4, 3 +- sub.d a6, t3, t4 // a6 = 0xfffffffffffffff8 +- srl.d t1, t1, t4 // t1 = "00054321" +- +- srl.d t0, t2, t3 // t0 = "00fedcba" +- srl.d t5, t7, t4 // t5 = 0x000000FFFFFFFFFF +- sub.d t6, t0, t1 // t6 hold diff +- and t6, t6, t5 // t6 = "000xxxxx" +- +- sub.d t5, t8, a4 // t5 hold margin 8 - 3 = 5 +- bnez t6, L(first_out) +- bgeu t5, a2, L(ret) +- sub.d a2, a2, t5 +- +- bnez a6, L(unaligned) +- blt a2, t8, L(al_less_8bytes) 
+- andi t1, a2, 31 +- beq t1, a2, L(al_less_32bytes) +- +- sub.d t2, a2, t1 +- add.d a4, a0, t2 +- move a2, t1 +- +-L(al_loop): +- ld.d t0, a0, 8 +- +- ld.d t1, a1, 8 +- ld.d t2, a0, 16 +- ld.d t3, a1, 16 +- ld.d t4, a0, 24 +- +- ld.d t5, a1, 24 +- ld.d t6, a0, 32 +- ld.d t7, a1, 32 +- addi.d a0, a0, 32 +- +- addi.d a1, a1, 32 +- bne t0, t1, L(out1) +- bne t2, t3, L(out2) +- bne t4, t5, L(out3) +- +- bne t6, t7, L(out4) +- bne a0, a4, L(al_loop) +- +-L(al_less_32bytes): +- srai.d a4, a2, 4 +- beqz a4, L(al_less_16bytes) +- +- ld.d t0, a0, 8 +- ld.d t1, a1, 8 +- ld.d t2, a0, 16 +- ld.d t3, a1, 16 +- +- addi.d a0, a0, 16 +- addi.d a1, a1, 16 +- addi.d a2, a2, -16 +- bne t0, t1, L(out1) +- +- bne t2, t3, L(out2) +- +-L(al_less_16bytes): +- srai.d a4, a2, 3 +- beqz a4, L(al_less_8bytes) +- ld.d t0, a0, 8 +- +- ld.d t1, a1, 8 +- addi.d a0, a0, 8 +- addi.d a1, a1, 8 +- addi.d a2, a2, -8 +- +- bne t0, t1, L(out1) +- +-L(al_less_8bytes): +- beqz a2, L(ret) +- ld.d t0, a0, 8 +- ld.d t1, a1, 8 +- +- li.d t7, -1 +- slli.d t2, a2, 3 +- sll.d t2, t7, t2 +- sub.d t3, t0, t1 +- +- andn t6, t3, t2 +- bnez t6, L(count_diff) +- +-L(ret): +- move a0, zero +- jr ra +- +-L(out4): +- move t0, t6 +- move t1, t7 +- sub.d t6, t6, t7 +- b L(count_diff) +- +-L(out3): +- move t0, t4 +- move t1, t5 +- sub.d t6, t4, t5 +- b L(count_diff) +- +-L(out2): +- move t0, t2 +- move t1, t3 +-L(out1): +- sub.d t6, t0, t1 +- b L(count_diff) +- +-L(first_out): +- slli.d t4, a2, 3 +- slt t3, a2, t5 +- sll.d t4, t7, t4 +- maskeqz t4, t4, t3 +- +- andn t6, t6, t4 +- +-L(count_diff): +- ctz.d t2, t6 +- bstrins.d t2, zero, 2, 0 +- srl.d t0, t0, t2 +- +- srl.d t1, t1, t2 +- andi t0, t0, 0xff +- andi t1, t1, 0xff +- sub.d t2, t0, t1 +- +- sub.d t3, t1, t0 +- masknez t2, t2, a5 +- maskeqz t3, t3, a5 +- or a0, t2, t3 +- +- jr ra +- +-L(unaligned): +- sub.d a7, zero, a6 +- srl.d t0, t2, a6 +- blt a2, t8, L(un_less_8bytes) +- +- andi t1, a2, 31 +- beq t1, a2, L(un_less_32bytes) +- sub.d t2, a2, t1 +- add.d a4, a0, t2 
+- +- move a2, t1 +- +-L(un_loop): +- ld.d t2, a0, 8 +- ld.d t1, a1, 8 +- ld.d t4, a0, 16 +- +- ld.d t3, a1, 16 +- ld.d t6, a0, 24 +- ld.d t5, a1, 24 +- ld.d t8, a0, 32 +- +- ld.d t7, a1, 32 +- addi.d a0, a0, 32 +- addi.d a1, a1, 32 +- sll.d a3, t2, a7 +- +- or t0, a3, t0 +- bne t0, t1, L(out1) +- srl.d t0, t2, a6 +- sll.d a3, t4, a7 +- +- or t2, a3, t0 +- bne t2, t3, L(out2) +- srl.d t0, t4, a6 +- sll.d a3, t6, a7 +- +- or t4, a3, t0 +- bne t4, t5, L(out3) +- srl.d t0, t6, a6 +- sll.d a3, t8, a7 +- +- or t6, t0, a3 +- bne t6, t7, L(out4) +- srl.d t0, t8, a6 +- bne a0, a4, L(un_loop) +- +-L(un_less_32bytes): +- srai.d a4, a2, 4 +- beqz a4, L(un_less_16bytes) +- ld.d t2, a0, 8 +- ld.d t1, a1, 8 +- +- ld.d t4, a0, 16 +- ld.d t3, a1, 16 +- addi.d a0, a0, 16 +- addi.d a1, a1, 16 +- +- addi.d a2, a2, -16 +- sll.d a3, t2, a7 +- or t0, a3, t0 +- bne t0, t1, L(out1) +- +- srl.d t0, t2, a6 +- sll.d a3, t4, a7 +- or t2, a3, t0 +- bne t2, t3, L(out2) +- +- srl.d t0, t4, a6 +- +-L(un_less_16bytes): +- srai.d a4, a2, 3 +- beqz a4, L(un_less_8bytes) +- ld.d t2, a0, 8 +- +- ld.d t1, a1, 8 +- addi.d a0, a0, 8 +- addi.d a1, a1, 8 +- addi.d a2, a2, -8 +- +- sll.d a3, t2, a7 +- or t0, a3, t0 +- bne t0, t1, L(out1) +- srl.d t0, t2, a6 +- +-L(un_less_8bytes): +- beqz a2, L(ret) +- andi a7, a7, 63 +- slli.d a4, a2, 3 +- bgeu a7, a4, L(last_cmp) +- +- ld.d t2, a0, 8 +- sll.d a3, t2, a7 +- or t0, a3, t0 +- +-L(last_cmp): +- ld.d t1, a1, 8 +- +- li.d t7, -1 +- sll.d t2, t7, a4 +- sub.d t3, t0, t1 +- andn t6, t3, t2 +- +- bnez t6, L(count_diff) +- move a0, zero +- jr ra +- +-END(MEMCMP_NAME) +- +-#ifdef _LIBC +-libc_hidden_builtin_def (MEMCMP_NAME) +-#endif +diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S +deleted file mode 100644 +index 4791e1a4..00000000 +--- a/sysdeps/loongarch/lp64/memcpy.S ++++ /dev/null +@@ -1,804 +0,0 @@ +-#ifdef _LIBC +-#include +-#include +-#include +-#else +-#include +-#include +-#endif +- +-#ifndef MEMCPY_NAME +-#define MEMCPY_NAME 
memcpy +-#endif +- +-#ifndef MEMMOVE_NAME +-#define MEMMOVE_NAME memmove +-#endif +- +-#define LD_64(reg, n) \ +- ld.d t0, reg, n; \ +- ld.d t1, reg, n+8; \ +- ld.d t2, reg, n+16; \ +- ld.d t3, reg, n+24; \ +- ld.d t4, reg, n+32; \ +- ld.d t5, reg, n+40; \ +- ld.d t6, reg, n+48; \ +- ld.d t7, reg, n+56; +- +-#define ST_64(reg, n) \ +- st.d t0, reg, n; \ +- st.d t1, reg, n+8; \ +- st.d t2, reg, n+16; \ +- st.d t3, reg, n+24; \ +- st.d t4, reg, n+32; \ +- st.d t5, reg, n+40; \ +- st.d t6, reg, n+48; \ +- st.d t7, reg, n+56; +- +-LEAF(MEMMOVE_NAME, 6) +- sub.d t0, a0, a1 +- bltu t0, a2, L(copy_back) +- +-END(MEMMOVE_NAME) +- +-#ifdef _LIBC +-libc_hidden_builtin_def (MEMMOVE_NAME) +-#endif +- +-LEAF_NO_ALIGN(MEMCPY_NAME) +- +- srai.d a3, a2, 4 +- beqz a3, L(short_data) # less than 16 bytes +- +- move a4, a0 +- andi a5, a0, 0x7 +- andi a6, a1, 0x7 +- li.d t8, 8 +- beqz a5, L(check_align) +- +- # make dest aligned 8 bytes +- sub.d t2, t8, a5 +- sub.d a2, a2, t2 +- +- pcaddi t1, 20 +- slli.d t3, t2, 3 +- add.d a1, a1, t2 +- sub.d t1, t1, t3 +- add.d a4, a4, t2 +- jr t1 +- +-L(al7): +- ld.b t0, a1, -7 +- st.b t0, a4, -7 +-L(al6): +- ld.b t0, a1, -6 +- st.b t0, a4, -6 +-L(al5): +- ld.b t0, a1, -5 +- st.b t0, a4, -5 +-L(al4): +- ld.b t0, a1, -4 +- st.b t0, a4, -4 +-L(al3): +- ld.b t0, a1, -3 +- st.b t0, a4, -3 +-L(al2): +- ld.b t0, a1, -2 +- st.b t0, a4, -2 +-L(al1): +- ld.b t0, a1, -1 +- st.b t0, a4, -1 +- +-L(check_align): +- bne a5, a6, L(unalign) +- +- srai.d a3, a2, 4 +- beqz a3, L(al_less_16bytes) +- +- andi a3, a2, 0x3f +- beq a3, a2, L(al_less_64bytes) +- +- sub.d t0, a2, a3 +- move a2, a3 +- add.d a5, a1, t0 +- +-L(loop_64bytes): +- LD_64(a1, 0) +- addi.d a1, a1, 64 +- ST_64(a4, 0) +- +- addi.d a4, a4, 64 +- bne a1, a5, L(loop_64bytes) +- +-L(al_less_64bytes): +- srai.d a3, a2, 5 +- beqz a3, L(al_less_32bytes) +- +- ld.d t0, a1, 0 +- ld.d t1, a1, 8 +- ld.d t2, a1, 16 +- ld.d t3, a1, 24 +- +- addi.d a1, a1, 32 +- addi.d a2, a2, -32 +- +- st.d t0, a4, 0 +- st.d t1, 
a4, 8 +- st.d t2, a4, 16 +- st.d t3, a4, 24 +- +- addi.d a4, a4, 32 +- +-L(al_less_32bytes): +- srai.d a3, a2, 4 +- beqz a3, L(al_less_16bytes) +- +- ld.d t0, a1, 0 +- ld.d t1, a1, 8 +- addi.d a1, a1, 16 +- addi.d a2, a2, -16 +- +- st.d t0, a4, 0 +- st.d t1, a4, 8 +- addi.d a4, a4, 16 +- +-L(al_less_16bytes): +- srai.d a3, a2, 3 +- beqz a3, L(al_less_8bytes) +- +- ld.d t0, a1, 0 +- addi.d a1, a1, 8 +- addi.d a2, a2, -8 +- +- st.d t0, a4, 0 +- addi.d a4, a4, 8 +- +-L(al_less_8bytes): +- srai.d a3, a2, 2 +- beqz a3, L(al_less_4bytes) +- +- ld.w t0, a1, 0 +- addi.d a1, a1, 4 +- addi.d a2, a2, -4 +- +- st.w t0, a4, 0 +- addi.d a4, a4, 4 +- +-L(al_less_4bytes): +- srai.d a3, a2, 1 +- beqz a3, L(al_less_2bytes) +- +- ld.h t0, a1, 0 +- addi.d a1, a1, 2 +- addi.d a2, a2, -2 +- +- st.h t0, a4, 0 +- addi.d a4, a4, 2 +- +-L(al_less_2bytes): +- beqz a2, L(al_less_1byte) +- +- ld.b t0, a1, 0 +- st.b t0, a4, 0 +- +-L(al_less_1byte): +- jr ra +- +-L(unalign): +- andi a5, a1, 0x7 +- bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned +- +- sub.d t8, t8, a5 # use t8 to save count of bytes for aligning +- slli.d a5, a5, 3 +- +- ld.d t0, a1, 0 +- addi.d a1, a1, 8 +- +- slli.d a6, t8, 3 +- srl.d a7, t0, a5 +- +- srai.d a3, a2, 4 +- beqz a3, L(un_less_16bytes) +- +- andi a3, a2, 0x3f +- beq a3, a2, L(un_less_64bytes) +- +- sub.d t0, a2, a3 +- move a2, a3 +- add.d a3, a1, t0 +- +-# a5 shift right num +-# a6 shift left num +-# a7 remaining part +-L(un_long_bytes): +- ld.d t0, a1, 0 +- ld.d t1, a1, 8 +- ld.d t2, a1, 16 +- ld.d t3, a1, 24 +- +- srl.d t4, t0, a5 +- sll.d t0, t0, a6 +- +- srl.d t5, t1, a5 +- sll.d t1, t1, a6 +- +- srl.d t6, t2, a5 +- sll.d t2, t2, a6 +- +- srl.d t7, t3, a5 +- sll.d t3, t3, a6 +- +- or t0, a7, t0 +- or t1, t4, t1 +- or t2, t5, t2 +- or t3, t6, t3 +- +- ld.d t4, a1, 32 +- ld.d t5, a1, 40 +- ld.d t6, a1, 48 +- ld.d a7, a1, 56 +- +- st.d t0, a4, 0 +- st.d t1, a4, 8 +- st.d t2, a4, 16 +- st.d t3, a4, 24 +- +- addi.d a1, a1, 64 +- +- srl.d t0, t4, a5 +- sll.d t4, 
t4, a6 +- +- srl.d t1, t5, a5 +- sll.d t5, t5, a6 +- +- srl.d t2, t6, a5 +- sll.d t6, t6, a6 +- +- sll.d t3, a7, a6 +- srl.d a7, a7, a5 +- +- or t4, t7, t4 +- or t5, t0, t5 +- or t6, t1, t6 +- or t3, t2, t3 +- +- st.d t4, a4, 32 +- st.d t5, a4, 40 +- st.d t6, a4, 48 +- st.d t3, a4, 56 +- +- addi.d a4, a4, 64 +- bne a3, a1, L(un_long_bytes) +- +-L(un_less_64bytes): +- srai.d a3, a2, 5 +- beqz a3, L(un_less_32bytes) +- +- ld.d t0, a1, 0 +- ld.d t1, a1, 8 +- ld.d t2, a1, 16 +- ld.d t3, a1, 24 +- +- addi.d a1, a1, 32 +- addi.d a2, a2, -32 +- +- srl.d t4, t0, a5 +- sll.d t0, t0, a6 +- +- srl.d t5, t1, a5 +- sll.d t1, t1, a6 +- +- srl.d t6, t2, a5 +- sll.d t2, t2, a6 +- +- or t0, a7, t0 +- +- srl.d a7, t3, a5 +- sll.d t3, t3, a6 +- +- or t1, t4, t1 +- or t2, t5, t2 +- or t3, t6, t3 +- +- st.d t0, a4, 0 +- st.d t1, a4, 8 +- st.d t2, a4, 16 +- st.d t3, a4, 24 +- +- addi.d a4, a4, 32 +- +-L(un_less_32bytes): +- srai.d a3, a2, 4 +- beqz a3, L(un_less_16bytes) +- +- ld.d t0, a1, 0 +- ld.d t1, a1, 8 +- +- addi.d a1, a1, 16 +- addi.d a2, a2, -16 +- +- srl.d t2, t0, a5 +- sll.d t3, t0, a6 +- +- sll.d t4, t1, a6 +- or t3, a7, t3 +- or t4, t2, t4 +- srl.d a7, t1, a5 +- +- st.d t3, a4, 0 +- st.d t4, a4, 8 +- +- addi.d a4, a4, 16 +- +-L(un_less_16bytes): +- srai.d a3, a2, 3 +- beqz a3, L(un_less_8bytes) +- +- ld.d t0, a1, 0 +- +- addi.d a1, a1, 8 +- addi.d a2, a2, -8 +- +- sll.d t1, t0, a6 +- or t2, a7, t1 +- srl.d a7, t0, a5 +- +- st.d t2, a4, 0 +- addi.d a4, a4, 8 +- +-L(un_less_8bytes): +- beqz a2, L(un_less_1byte) +- bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 +- +- # combine data in memory and a7(remaining part) +- ld.d t0, a1, 0 +- sll.d t0, t0, a6 +- or a7, a7, t0 +- +-1: +- srai.d a3, a2, 2 +- beqz a3, L(un_less_4bytes) +- +- addi.d a2, a2, -4 +- st.w a7, a4, 0 +- addi.d a4, a4, 4 +- srai.d a7, a7, 32 +- +-L(un_less_4bytes): +- srai.d a3, a2, 1 +- beqz a3, L(un_less_2bytes) +- +- addi.d a2, a2, -2 +- st.h a7, a4, 0 +- addi.d a4, a4, 2 +- 
srai.d a7, a7, 16 +- +-L(un_less_2bytes): +- beqz a2, L(un_less_1byte) +- st.b a7, a4, 0 +- +-L(un_less_1byte): +- jr ra +- +-# Bytes copying for data less than 16 bytes +-L(short_data): +- pcaddi t1, 36 +- slli.d t2, a2, 3 +- add.d a4, a0, a2 +- sub.d t1, t1, t2 +- add.d a1, a1, a2 +- jr t1 +- +-L(short_15_bytes): +- ld.b t0, a1, -15 +- st.b t0, a4, -15 +-L(short_14_bytes): +- ld.b t0, a1, -14 +- st.b t0, a4, -14 +-L(short_13_bytes): +- ld.b t0, a1, -13 +- st.b t0, a4, -13 +-L(short_12_bytes): +- ld.b t0, a1, -12 +- st.b t0, a4, -12 +-L(short_11_bytes): +- ld.b t0, a1, -11 +- st.b t0, a4, -11 +-L(short_10_bytes): +- ld.b t0, a1, -10 +- st.b t0, a4, -10 +-L(short_9_bytes): +- ld.b t0, a1, -9 +- st.b t0, a4, -9 +-L(short_8_bytes): +- ld.b t0, a1, -8 +- st.b t0, a4, -8 +-L(short_7_bytes): +- ld.b t0, a1, -7 +- st.b t0, a4, -7 +-L(short_6_bytes): +- ld.b t0, a1, -6 +- st.b t0, a4, -6 +-L(short_5_bytes): +- ld.b t0, a1, -5 +- st.b t0, a4, -5 +-L(short_4_bytes): +- ld.b t0, a1, -4 +- st.b t0, a4, -4 +-L(short_3_bytes): +- ld.b t0, a1, -3 +- st.b t0, a4, -3 +-L(short_2_bytes): +- ld.b t0, a1, -2 +- st.b t0, a4, -2 +-L(short_1_bytes): +- ld.b t0, a1, -1 +- st.b t0, a4, -1 +- jr ra +- +-L(copy_back): +- srai.d a3, a2, 4 +- beqz a3, L(back_short_data) # less than 16 bytes +- +- add.d a4, a0, a2 # store the tail of dest +- add.d a1, a1, a2 # store the tail of src +- +- andi a5, a4, 0x7 +- andi a6, a1, 0x7 +- beqz a5, L(back_check_align) +- +- # make dest aligned 8 bytes +- sub.d a2, a2, a5 +- sub.d a1, a1, a5 +- sub.d a4, a4, a5 +- +- pcaddi t1, 18 +- slli.d t3, a5, 3 +- sub.d t1, t1, t3 +- jr t1 +- +- ld.b t0, a1, 6 +- st.b t0, a4, 6 +- ld.b t0, a1, 5 +- st.b t0, a4, 5 +- ld.b t0, a1, 4 +- st.b t0, a4, 4 +- ld.b t0, a1, 3 +- st.b t0, a4, 3 +- ld.b t0, a1, 2 +- st.b t0, a4, 2 +- ld.b t0, a1, 1 +- st.b t0, a4, 1 +- ld.b t0, a1, 0 +- st.b t0, a4, 0 +- +-L(back_check_align): +- bne a5, a6, L(back_unalign) +- +- srai.d a3, a2, 4 +- beqz a3, L(back_less_16bytes) +- +- andi a3, 
a2, 0x3f +- beq a3, a2, L(back_less_64bytes) +- +- sub.d t0, a2, a3 +- move a2, a3 +- sub.d a5, a1, t0 +- +-L(back_loop_64bytes): +- LD_64(a1, -64) +- addi.d a1, a1, -64 +- ST_64(a4, -64) +- +- addi.d a4, a4, -64 +- bne a1, a5, L(back_loop_64bytes) +- +-L(back_less_64bytes): +- srai.d a3, a2, 5 +- beqz a3, L(back_less_32bytes) +- +- ld.d t0, a1, -32 +- ld.d t1, a1, -24 +- ld.d t2, a1, -16 +- ld.d t3, a1, -8 +- +- addi.d a1, a1, -32 +- addi.d a2, a2, -32 +- +- st.d t0, a4, -32 +- st.d t1, a4, -24 +- st.d t2, a4, -16 +- st.d t3, a4, -8 +- +- addi.d a4, a4, -32 +- +-L(back_less_32bytes): +- srai.d a3, a2, 4 +- beqz a3, L(back_less_16bytes) +- +- ld.d t0, a1, -16 +- ld.d t1, a1, -8 +- +- addi.d a2, a2, -16 +- addi.d a1, a1, -16 +- +- st.d t0, a4, -16 +- st.d t1, a4, -8 +- addi.d a4, a4, -16 +- +-L(back_less_16bytes): +- srai.d a3, a2, 3 +- beqz a3, L(back_less_8bytes) +- +- ld.d t0, a1, -8 +- addi.d a2, a2, -8 +- addi.d a1, a1, -8 +- +- st.d t0, a4, -8 +- addi.d a4, a4, -8 +- +-L(back_less_8bytes): +- srai.d a3, a2, 2 +- beqz a3, L(back_less_4bytes) +- +- ld.w t0, a1, -4 +- addi.d a2, a2, -4 +- addi.d a1, a1, -4 +- +- st.w t0, a4, -4 +- addi.d a4, a4, -4 +- +-L(back_less_4bytes): +- srai.d a3, a2, 1 +- beqz a3, L(back_less_2bytes) +- +- ld.h t0, a1, -2 +- addi.d a2, a2, -2 +- addi.d a1, a1, -2 +- +- st.h t0, a4, -2 +- addi.d a4, a4, -2 +- +-L(back_less_2bytes): +- beqz a2, L(back_less_1byte) +- +- ld.b t0, a1, -1 +- st.b t0, a4, -1 +- +-L(back_less_1byte): +- jr ra +- +-L(back_unalign): +- andi t8, a1, 0x7 +- bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned +- +- sub.d a6, zero, t8 +- +- ld.d t0, a1, 0 +- slli.d a6, a6, 3 +- slli.d a5, t8, 3 +- sll.d a7, t0, a6 +- +- srai.d a3, a2, 4 +- beqz a3, L(back_un_less_16bytes) +- +- andi a3, a2, 0x3f +- beq a3, a2, L(back_un_less_64bytes) +- +- sub.d t0, a2, a3 +- move a2, a3 +- sub.d a3, a1, t0 +- +-L(back_un_long_bytes): +- ld.d t0, a1, -8 +- ld.d t1, a1, -16 +- ld.d t2, a1, -24 +- ld.d t3, a1, -32 +- +- sll.d t4, t0, a6 
+- srl.d t0, t0, a5 +- +- sll.d t5, t1, a6 +- srl.d t1, t1, a5 +- +- sll.d t6, t2, a6 +- srl.d t2, t2, a5 +- +- sll.d t7, t3, a6 +- srl.d t3, t3, a5 +- +- or t0, t0, a7 +- or t1, t1, t4 +- or t2, t2, t5 +- or t3, t3, t6 +- +- ld.d t4, a1, -40 +- ld.d t5, a1, -48 +- ld.d t6, a1, -56 +- ld.d a7, a1, -64 +- st.d t0, a4, -8 +- st.d t1, a4, -16 +- st.d t2, a4, -24 +- st.d t3, a4, -32 +- +- addi.d a1, a1, -64 +- +- sll.d t0, t4, a6 +- srl.d t4, t4, a5 +- +- sll.d t1, t5, a6 +- srl.d t5, t5, a5 +- +- sll.d t2, t6, a6 +- srl.d t6, t6, a5 +- +- srl.d t3, a7, a5 +- sll.d a7, a7, a6 +- +- or t4, t7, t4 +- or t5, t0, t5 +- or t6, t1, t6 +- or t3, t2, t3 +- +- st.d t4, a4, -40 +- st.d t5, a4, -48 +- st.d t6, a4, -56 +- st.d t3, a4, -64 +- +- addi.d a4, a4, -64 +- bne a3, a1, L(back_un_long_bytes) +- +-L(back_un_less_64bytes): +- srai.d a3, a2, 5 +- beqz a3, L(back_un_less_32bytes) +- +- ld.d t0, a1, -8 +- ld.d t1, a1, -16 +- ld.d t2, a1, -24 +- ld.d t3, a1, -32 +- +- addi.d a1, a1, -32 +- addi.d a2, a2, -32 +- +- sll.d t4, t0, a6 +- srl.d t0, t0, a5 +- +- sll.d t5, t1, a6 +- srl.d t1, t1, a5 +- +- sll.d t6, t2, a6 +- srl.d t2, t2, a5 +- +- or t0, a7, t0 +- +- sll.d a7, t3, a6 +- srl.d t3, t3, a5 +- +- or t1, t4, t1 +- or t2, t5, t2 +- or t3, t6, t3 +- +- st.d t0, a4, -8 +- st.d t1, a4, -16 +- st.d t2, a4, -24 +- st.d t3, a4, -32 +- +- addi.d a4, a4, -32 +- +-L(back_un_less_32bytes): +- srai.d a3, a2, 4 +- beqz a3, L(back_un_less_16bytes) +- +- ld.d t0, a1, -8 +- ld.d t1, a1, -16 +- +- addi.d a1, a1, -16 +- addi.d a2, a2, -16 +- +- sll.d t2, t0, a6 +- srl.d t3, t0, a5 +- +- srl.d t4, t1, a5 +- or t3, a7, t3 +- or t4, t2, t4 +- sll.d a7, t1, a6 +- +- st.d t3, a4, -8 +- st.d t4, a4, -16 +- +- addi.d a4, a4, -16 +- +-L(back_un_less_16bytes): +- srai.d a3, a2, 3 +- beqz a3, L(back_un_less_8bytes) +- +- ld.d t0, a1, -8 +- +- addi.d a1, a1, -8 +- addi.d a2, a2, -8 +- +- srl.d t1, t0, a5 +- or t2, a7, t1 +- sll.d a7, t0, a6 +- +- st.d t2, a4, -8 +- addi.d a4, a4, -8 +- 
+-L(back_un_less_8bytes): +- beqz a2, L(back_end) +- bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 +- +- # combine data in memory and a7(remaining part) +- ld.d t0, a1, -8 +- srl.d t0, t0, a5 +- or a7, a7, t0 +- +-1: +- srai.d a3, a2, 2 +- beqz a3, L(back_un_less_4bytes) +- +- srai.d t0, a7, 32 +- addi.d a2, a2, -4 +- st.w t0, a4, -4 +- addi.d a4, a4, -4 +- slli.d a7, a7, 32 +- +-L(back_un_less_4bytes): +- srai.d a3, a2, 1 +- beqz a3, L(back_un_less_2bytes) +- srai.d t0, a7, 48 +- addi.d a2, a2, -2 +- st.h t0, a4, -2 +- addi.d a4, a4, -2 +- slli.d a7, a7, 16 +-L(back_un_less_2bytes): +- beqz a2, L(back_un_less_1byte) +- srai.d t0, a7, 56 +- st.b t0, a4, -1 +-L(back_un_less_1byte): +- jr ra +- +-L(back_short_data): +- pcaddi t1, 34 +- slli.d t2, a2, 3 +- sub.d t1, t1, t2 +- jr t1 +- +- ld.b t0, a1, 14 +- st.b t0, a0, 14 +- ld.b t0, a1, 13 +- st.b t0, a0, 13 +- ld.b t0, a1, 12 +- st.b t0, a0, 12 +- ld.b t0, a1, 11 +- st.b t0, a0, 11 +- ld.b t0, a1, 10 +- st.b t0, a0, 10 +- ld.b t0, a1, 9 +- st.b t0, a0, 9 +- ld.b t0, a1, 8 +- st.b t0, a0, 8 +- ld.b t0, a1, 7 +- st.b t0, a0, 7 +- ld.b t0, a1, 6 +- st.b t0, a0, 6 +- ld.b t0, a1, 5 +- st.b t0, a0, 5 +- ld.b t0, a1, 4 +- st.b t0, a0, 4 +- ld.b t0, a1, 3 +- st.b t0, a0, 3 +- ld.b t0, a1, 2 +- st.b t0, a0, 2 +- ld.b t0, a1, 1 +- st.b t0, a0, 1 +- ld.b t0, a1, 0 +- st.b t0, a0, 0 +-L(back_end): +- jr ra +- +-END(MEMCPY_NAME) +- +-#ifdef _LIBC +-libc_hidden_builtin_def (MEMCPY_NAME) +-#endif +diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S +deleted file mode 100644 +index 6d1922c4..00000000 +--- a/sysdeps/loongarch/lp64/memmove.S ++++ /dev/null +@@ -1,2 +0,0 @@ +-/* DONT DELETE THIS FILE, OTHERWIES MEMCPY.C WILL BE COMPILED. */ +-/* There are too many common code in memcpy and memmove. 
See memcpy.S */ +diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S +deleted file mode 100644 +index eabd7d23..00000000 +--- a/sysdeps/loongarch/lp64/memset.S ++++ /dev/null +@@ -1,166 +0,0 @@ +-#ifdef _LIBC +-#include +-#include +-#include +-#else +-#include +-#include +-#endif +- +-#ifndef MEMSET_NAME +-#define MEMSET_NAME memset +-#endif +- +-#define ST_64(n) \ +- st.d a1, a0, n; \ +- st.d a1, a0, n+8; \ +- st.d a1, a0, n+16; \ +- st.d a1, a0, n+24; \ +- st.d a1, a0, n+32; \ +- st.d a1, a0, n+40; \ +- st.d a1, a0, n+48; \ +- st.d a1, a0, n+56; +- +-LEAF(MEMSET_NAME, 6) +- move t0, a0 +- andi a3, a0, 0x7 +- li.w t6, 16 +- beqz a3, L(align) +- blt a2, t6, L(short_data) +- +-L(make_align): +- li.w t8, 8 +- sub.d t2, t8, a3 +- pcaddi t1, 11 +- slli.d t3, t2, 2 +- sub.d t1, t1, t3 +- jirl zero, t1, 0 +- +-L(al7): +- st.b a1, t0, 6 +-L(al6): +- st.b a1, t0, 5 +-L(al5): +- st.b a1, t0, 4 +-L(al4): +- st.b a1, t0, 3 +-L(al3): +- st.b a1, t0, 2 +-L(al2): +- st.b a1, t0, 1 +-L(al1): +- st.b a1, t0, 0 +-L(al0): +- add.d t0, t0, t2 +- sub.d a2, a2, t2 +- +-L(align): +- bstrins.d a1, a1, 15, 8 +- bstrins.d a1, a1, 31, 16 +- bstrins.d a1, a1, 63, 32 +- +- blt a2, t6, L(less_16bytes) +- +- andi a4, a2, 0x3f +- beq a4, a2, L(less_64bytes) +- +- sub.d t1, a2, a4 +- move a2, a4 +- add.d a5, t0, t1 +- +-L(loop_64bytes): +- addi.d t0, t0, 64 +- st.d a1, t0, -64 +- st.d a1, t0, -56 +- st.d a1, t0, -48 +- st.d a1, t0, -40 +- st.d a1, t0, -32 +- st.d a1, t0, -24 +- st.d a1, t0, -16 +- st.d a1, t0, -8 +- bne t0, a5, L(loop_64bytes) +- +-L(less_64bytes): +- srai.d a4, a2, 5 +- beqz a4, L(less_32bytes) +- addi.d a2, a2, -32 +- st.d a1, t0, 0 +- st.d a1, t0, 8 +- st.d a1, t0, 16 +- st.d a1, t0, 24 +- addi.d t0, t0, 32 +-L(less_32bytes): +- blt a2, t6, L(less_16bytes) +- addi.d a2, a2, -16 +- st.d a1, t0, 0 +- st.d a1, t0, 8 +- addi.d t0, t0, 16 +-L(less_16bytes): +- srai.d a4, a2, 3 +- beqz a4, L(less_8bytes) +- addi.d a2, a2, -8 +- st.d a1, t0, 0 +- addi.d t0, 
t0, 8 +-L(less_8bytes): +- beqz a2, L(less_1byte) +- srai.d a4, a2, 2 +- beqz a4, L(less_4bytes) +- addi.d a2, a2, -4 +- st.w a1, t0, 0 +- addi.d t0, t0, 4 +-L(less_4bytes): +- srai.d a3, a2, 1 +- beqz a3, L(less_2bytes) +- addi.d a2, a2, -2 +- st.h a1, t0, 0 +- addi.d t0, t0, 2 +-L(less_2bytes): +- beqz a2, L(less_1byte) +- st.b a1, t0, 0 +-L(less_1byte): +- jr ra +- +-L(short_data): +- pcaddi t1, 19 +- slli.d t3, a2, 2 +- sub.d t1, t1, t3 +- jirl zero, t1, 0 +-L(short_15): +- st.b a1, a0, 14 +- +-L(short_14): +- st.b a1, a0, 13 +-L(short_13): +- st.b a1, a0, 12 +-L(short_12): +- st.b a1, a0, 11 +-L(short_11): +- st.b a1, a0, 10 +-L(short_10): +- st.b a1, a0, 9 +-L(short_9): +- st.b a1, a0, 8 +-L(short_8): +- st.b a1, a0, 7 +-L(short_7): +- st.b a1, a0, 6 +-L(short_6): +- st.b a1, a0, 5 +-L(short_5): +- st.b a1, a0, 4 +-L(short_4): +- st.b a1, a0, 3 +-L(short_3): +- st.b a1, a0, 2 +-L(short_2): +- st.b a1, a0, 1 +-L(short_1): +- st.b a1, a0, 0 +-L(short_0): +- jr ra +- +-END(MEMSET_NAME) +- +-#ifdef _LIBC +-libc_hidden_builtin_def (MEMSET_NAME) +-#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S +index 4677c912..7dfa3ade 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S +@@ -1,7 +1,96 @@ + ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif ++ + #if IS_IN (libc) + #define MEMCHR_NAME __memchr_aligned ++#else ++#define MEMCHR_NAME memchr + #endif + +-#include "../memchr.S" ++LEAF(MEMCHR_NAME, 6) ++ beqz a2, L(out) ++ andi t1, a0, 0x7 ++ lu12i.w a3, 0x01010 ++ sub.d a5, a0, t1 ++ ++ bstrins.d a1, a1, 15, 8 ++ ld.d t0, a5, 0 ++ slli.d t2, t1, 3 ++ ori a3, a3, 0x101 ++ ++ bstrins.d a1, a1, 31, 16 ++ li.w t7, -1 ++ li.w t8, 9 ++ bstrins.d a3, a3, 63, 32 ++ ++ srl.d t3, t7, t2 ++ bstrins.d a1, a1, 63, 32 ++ sub.d t4, t8, t1 ++ orn t3, a1, t3 ++ ++ srl.d t0, t0, t2 ++ slli.d a4, a3, 7 # 
0x8080808080808080 ++ sltu t4, a2, t4 ++ xor t2, t0, t3 ++ ++ sub.d a6, t2, a3 ++ andn a7, a4, t2 ++ and t2, a6, a7 ++ or t3, t2, t4 ++ ++ bnez t3, L(count_pos) ++ addi.d a2, a2, -8 ++ addi.d a0, a5, 8 ++ add.d a2, a2, t1 ++ ++L(loop): ++ ld.d t0, a0, 0 ++ sltui t4, a2, 9 ++ xor t2, t0, a1 ++ sub.d a6, t2, a3 ++ ++ andn a7, a4, t2 ++ and t2, a6, a7 ++ or t3, t2, t4 ++ bnez t3, L(count_pos) ++ ++ ld.d t1, a0, 8 ++ addi.d a0, a0, 16 ++ sltui t4, a2, 17 ++ xor t2, t1, a1 ++ ++ sub.d a6, t2, a3 ++ andn a7, a4, t2 ++ and t2, a6, a7 ++ addi.d a2, a2, -16 ++ ++ or t3, t2, t4 ++ beqz t3, L(loop) ++ addi.d a0, a0, -8 ++ addi.d a2, a2, 8 ++ ++L(count_pos): ++ ctz.d t0, t2 ++ srli.d t0, t0, 3 ++ sltu t1, t0, a2 ++ add.d a0, a0, t0 ++ ++ maskeqz a0, a0, t1 ++ jr ra ++ ++L(out): ++ move a0, zero ++ jr ra ++END(MEMCHR_NAME) ++ ++#ifdef _LIBC ++libc_hidden_builtin_def (MEMCHR_NAME) ++#endif + +diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S +index 512eabca..9505dfce 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S +@@ -1,11 +1,289 @@ + +-#if IS_IN (libc) + ++ ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif ++ ++#if IS_IN (libc) + #define MEMCMP_NAME __memcmp_aligned ++#else ++#define MEMCMP_NAME memcmp ++#endif ++ ++LEAF(MEMCMP_NAME, 6) ++ beqz a2, L(ret) ++ andi a4, a1, 0x7 ++ andi a3, a0, 0x7 ++ sltu a5, a4, a3 ++ ++ xor t0, a0, a1 ++ li.w t8, 8 ++ maskeqz t0, t0, a5 ++ li.w t7, -1 ++ ++ xor a0, a0, t0 // a0 hold smaller one ++ xor a1, a1, t0 // a1 hold larger one ++ andi a3, a0, 0x7 // a3 hold small offset ++ andi a4, a1, 0x7 // a4 hold larger offset ++ ++ xor a0, a0, a3 ++ xor a1, a1, a4 ++ ld.d t2, a0, 0 // t2 = "fedcbaXX" ++ ld.d t1, a1, 0 // t1 = "54321YYY" ++ ++ slli.d t3, a3, 3 ++ slli.d t4, a4, 3 ++ sub.d a6, t3, t4 // a6 = 0xfffffffffffffff8 ++ srl.d t1, t1, t4 // t1 = "00054321" ++ ++ srl.d 
t0, t2, t3 // t0 = "00fedcba" ++ srl.d t5, t7, t4 // t5 = 0x000000FFFFFFFFFF ++ sub.d t6, t0, t1 // t6 hold diff ++ and t6, t6, t5 // t6 = "000xxxxx" ++ ++ sub.d t5, t8, a4 // t5 hold margin 8 - 3 = 5 ++ bnez t6, L(first_out) ++ bgeu t5, a2, L(ret) ++ sub.d a2, a2, t5 ++ ++ bnez a6, L(unaligned) ++ blt a2, t8, L(al_less_8bytes) ++ andi t1, a2, 31 ++ beq t1, a2, L(al_less_32bytes) ++ ++ sub.d t2, a2, t1 ++ add.d a4, a0, t2 ++ move a2, t1 ++ ++L(al_loop): ++ ld.d t0, a0, 8 ++ ++ ld.d t1, a1, 8 ++ ld.d t2, a0, 16 ++ ld.d t3, a1, 16 ++ ld.d t4, a0, 24 ++ ++ ld.d t5, a1, 24 ++ ld.d t6, a0, 32 ++ ld.d t7, a1, 32 ++ addi.d a0, a0, 32 ++ ++ addi.d a1, a1, 32 ++ bne t0, t1, L(out1) ++ bne t2, t3, L(out2) ++ bne t4, t5, L(out3) ++ ++ bne t6, t7, L(out4) ++ bne a0, a4, L(al_loop) ++ ++L(al_less_32bytes): ++ srai.d a4, a2, 4 ++ beqz a4, L(al_less_16bytes) ++ ++ ld.d t0, a0, 8 ++ ld.d t1, a1, 8 ++ ld.d t2, a0, 16 ++ ld.d t3, a1, 16 ++ ++ addi.d a0, a0, 16 ++ addi.d a1, a1, 16 ++ addi.d a2, a2, -16 ++ bne t0, t1, L(out1) ++ ++ bne t2, t3, L(out2) ++ ++L(al_less_16bytes): ++ srai.d a4, a2, 3 ++ beqz a4, L(al_less_8bytes) ++ ld.d t0, a0, 8 ++ ++ ld.d t1, a1, 8 ++ addi.d a0, a0, 8 ++ addi.d a1, a1, 8 ++ addi.d a2, a2, -8 ++ ++ bne t0, t1, L(out1) ++ ++L(al_less_8bytes): ++ beqz a2, L(ret) ++ ld.d t0, a0, 8 ++ ld.d t1, a1, 8 ++ ++ li.d t7, -1 ++ slli.d t2, a2, 3 ++ sll.d t2, t7, t2 ++ sub.d t3, t0, t1 ++ ++ andn t6, t3, t2 ++ bnez t6, L(count_diff) ++ ++L(ret): ++ move a0, zero ++ jr ra ++ ++L(out4): ++ move t0, t6 ++ move t1, t7 ++ sub.d t6, t6, t7 ++ b L(count_diff) ++ ++L(out3): ++ move t0, t4 ++ move t1, t5 ++ sub.d t6, t4, t5 ++ b L(count_diff) ++ ++L(out2): ++ move t0, t2 ++ move t1, t3 ++L(out1): ++ sub.d t6, t0, t1 ++ b L(count_diff) ++ ++L(first_out): ++ slli.d t4, a2, 3 ++ slt t3, a2, t5 ++ sll.d t4, t7, t4 ++ maskeqz t4, t4, t3 ++ ++ andn t6, t6, t4 ++ ++L(count_diff): ++ ctz.d t2, t6 ++ bstrins.d t2, zero, 2, 0 ++ srl.d t0, t0, t2 ++ ++ srl.d t1, t1, t2 ++ andi t0, t0, 
0xff ++ andi t1, t1, 0xff ++ sub.d t2, t0, t1 ++ ++ sub.d t3, t1, t0 ++ masknez t2, t2, a5 ++ maskeqz t3, t3, a5 ++ or a0, t2, t3 ++ ++ jr ra ++ ++L(unaligned): ++ sub.d a7, zero, a6 ++ srl.d t0, t2, a6 ++ blt a2, t8, L(un_less_8bytes) ++ ++ andi t1, a2, 31 ++ beq t1, a2, L(un_less_32bytes) ++ sub.d t2, a2, t1 ++ add.d a4, a0, t2 ++ ++ move a2, t1 ++ ++L(un_loop): ++ ld.d t2, a0, 8 ++ ld.d t1, a1, 8 ++ ld.d t4, a0, 16 ++ ++ ld.d t3, a1, 16 ++ ld.d t6, a0, 24 ++ ld.d t5, a1, 24 ++ ld.d t8, a0, 32 ++ ++ ld.d t7, a1, 32 ++ addi.d a0, a0, 32 ++ addi.d a1, a1, 32 ++ sll.d a3, t2, a7 ++ ++ or t0, a3, t0 ++ bne t0, t1, L(out1) ++ srl.d t0, t2, a6 ++ sll.d a3, t4, a7 ++ ++ or t2, a3, t0 ++ bne t2, t3, L(out2) ++ srl.d t0, t4, a6 ++ sll.d a3, t6, a7 ++ ++ or t4, a3, t0 ++ bne t4, t5, L(out3) ++ srl.d t0, t6, a6 ++ sll.d a3, t8, a7 ++ ++ or t6, t0, a3 ++ bne t6, t7, L(out4) ++ srl.d t0, t8, a6 ++ bne a0, a4, L(un_loop) ++ ++L(un_less_32bytes): ++ srai.d a4, a2, 4 ++ beqz a4, L(un_less_16bytes) ++ ld.d t2, a0, 8 ++ ld.d t1, a1, 8 ++ ++ ld.d t4, a0, 16 ++ ld.d t3, a1, 16 ++ addi.d a0, a0, 16 ++ addi.d a1, a1, 16 ++ ++ addi.d a2, a2, -16 ++ sll.d a3, t2, a7 ++ or t0, a3, t0 ++ bne t0, t1, L(out1) ++ ++ srl.d t0, t2, a6 ++ sll.d a3, t4, a7 ++ or t2, a3, t0 ++ bne t2, t3, L(out2) ++ ++ srl.d t0, t4, a6 ++ ++L(un_less_16bytes): ++ srai.d a4, a2, 3 ++ beqz a4, L(un_less_8bytes) ++ ld.d t2, a0, 8 ++ ++ ld.d t1, a1, 8 ++ addi.d a0, a0, 8 ++ addi.d a1, a1, 8 ++ addi.d a2, a2, -8 ++ ++ sll.d a3, t2, a7 ++ or t0, a3, t0 ++ bne t0, t1, L(out1) ++ srl.d t0, t2, a6 ++ ++L(un_less_8bytes): ++ beqz a2, L(ret) ++ andi a7, a7, 63 ++ slli.d a4, a2, 3 ++ bgeu a7, a4, L(last_cmp) ++ ++ ld.d t2, a0, 8 ++ sll.d a3, t2, a7 ++ or t0, a3, t0 ++ ++L(last_cmp): ++ ld.d t1, a1, 8 ++ ++ li.d t7, -1 ++ sll.d t2, t7, a4 ++ sub.d t3, t0, t1 ++ andn t6, t3, t2 ++ ++ bnez t6, L(count_diff) ++ move a0, zero ++ jr ra ++ ++END(MEMCMP_NAME) + ++#ifdef _LIBC ++libc_hidden_builtin_def (MEMCMP_NAME) + #endif + 
+-#include "../memcmp.S" + # undef bcmp + weak_alias (MEMCMP_NAME, bcmp) + +diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S +index 5ff8b4e6..3fc86a7f 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S +@@ -1,11 +1,804 @@ +- ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif + + #if IS_IN (libc) +- + #define MEMCPY_NAME __memcpy_aligned + #define MEMMOVE_NAME __memmove_aligned ++#else ++#define MEMCPY_NAME memcpy ++#define MEMMOVE_NAME memmove ++#endif ++ ++#define LD_64(reg, n) \ ++ ld.d t0, reg, n; \ ++ ld.d t1, reg, n+8; \ ++ ld.d t2, reg, n+16; \ ++ ld.d t3, reg, n+24; \ ++ ld.d t4, reg, n+32; \ ++ ld.d t5, reg, n+40; \ ++ ld.d t6, reg, n+48; \ ++ ld.d t7, reg, n+56; ++ ++#define ST_64(reg, n) \ ++ st.d t0, reg, n; \ ++ st.d t1, reg, n+8; \ ++ st.d t2, reg, n+16; \ ++ st.d t3, reg, n+24; \ ++ st.d t4, reg, n+32; \ ++ st.d t5, reg, n+40; \ ++ st.d t6, reg, n+48; \ ++ st.d t7, reg, n+56; + ++LEAF(MEMMOVE_NAME, 6) ++ sub.d t0, a0, a1 ++ bltu t0, a2, L(copy_back) ++ ++END(MEMMOVE_NAME) ++ ++#ifdef _LIBC ++libc_hidden_builtin_def (MEMMOVE_NAME) + #endif + +-#include "../memcpy.S" ++LEAF_NO_ALIGN(MEMCPY_NAME) ++ ++ srai.d a3, a2, 4 ++ beqz a3, L(short_data) # less than 16 bytes ++ ++ move a4, a0 ++ andi a5, a0, 0x7 ++ andi a6, a1, 0x7 ++ li.d t8, 8 ++ beqz a5, L(check_align) ++ ++ # make dest aligned 8 bytes ++ sub.d t2, t8, a5 ++ sub.d a2, a2, t2 ++ ++ pcaddi t1, 20 ++ slli.d t3, t2, 3 ++ add.d a1, a1, t2 ++ sub.d t1, t1, t3 ++ add.d a4, a4, t2 ++ jr t1 ++ ++L(al7): ++ ld.b t0, a1, -7 ++ st.b t0, a4, -7 ++L(al6): ++ ld.b t0, a1, -6 ++ st.b t0, a4, -6 ++L(al5): ++ ld.b t0, a1, -5 ++ st.b t0, a4, -5 ++L(al4): ++ ld.b t0, a1, -4 ++ st.b t0, a4, -4 ++L(al3): ++ ld.b t0, a1, -3 ++ st.b t0, a4, -3 ++L(al2): ++ ld.b t0, a1, -2 ++ st.b t0, a4, -2 ++L(al1): ++ ld.b t0, a1, -1 ++ st.b t0, a4, -1 ++ 
++L(check_align): ++ bne a5, a6, L(unalign) ++ ++ srai.d a3, a2, 4 ++ beqz a3, L(al_less_16bytes) ++ ++ andi a3, a2, 0x3f ++ beq a3, a2, L(al_less_64bytes) ++ ++ sub.d t0, a2, a3 ++ move a2, a3 ++ add.d a5, a1, t0 ++ ++L(loop_64bytes): ++ LD_64(a1, 0) ++ addi.d a1, a1, 64 ++ ST_64(a4, 0) ++ ++ addi.d a4, a4, 64 ++ bne a1, a5, L(loop_64bytes) ++ ++L(al_less_64bytes): ++ srai.d a3, a2, 5 ++ beqz a3, L(al_less_32bytes) ++ ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a1, 16 ++ ld.d t3, a1, 24 ++ ++ addi.d a1, a1, 32 ++ addi.d a2, a2, -32 ++ ++ st.d t0, a4, 0 ++ st.d t1, a4, 8 ++ st.d t2, a4, 16 ++ st.d t3, a4, 24 ++ ++ addi.d a4, a4, 32 ++ ++L(al_less_32bytes): ++ srai.d a3, a2, 4 ++ beqz a3, L(al_less_16bytes) ++ ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ addi.d a1, a1, 16 ++ addi.d a2, a2, -16 ++ ++ st.d t0, a4, 0 ++ st.d t1, a4, 8 ++ addi.d a4, a4, 16 ++ ++L(al_less_16bytes): ++ srai.d a3, a2, 3 ++ beqz a3, L(al_less_8bytes) ++ ++ ld.d t0, a1, 0 ++ addi.d a1, a1, 8 ++ addi.d a2, a2, -8 ++ ++ st.d t0, a4, 0 ++ addi.d a4, a4, 8 ++ ++L(al_less_8bytes): ++ srai.d a3, a2, 2 ++ beqz a3, L(al_less_4bytes) ++ ++ ld.w t0, a1, 0 ++ addi.d a1, a1, 4 ++ addi.d a2, a2, -4 ++ ++ st.w t0, a4, 0 ++ addi.d a4, a4, 4 ++ ++L(al_less_4bytes): ++ srai.d a3, a2, 1 ++ beqz a3, L(al_less_2bytes) ++ ++ ld.h t0, a1, 0 ++ addi.d a1, a1, 2 ++ addi.d a2, a2, -2 ++ ++ st.h t0, a4, 0 ++ addi.d a4, a4, 2 ++ ++L(al_less_2bytes): ++ beqz a2, L(al_less_1byte) ++ ++ ld.b t0, a1, 0 ++ st.b t0, a4, 0 ++ ++L(al_less_1byte): ++ jr ra ++ ++L(unalign): ++ andi a5, a1, 0x7 ++ bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned ++ ++ sub.d t8, t8, a5 # use t8 to save count of bytes for aligning ++ slli.d a5, a5, 3 ++ ++ ld.d t0, a1, 0 ++ addi.d a1, a1, 8 ++ ++ slli.d a6, t8, 3 ++ srl.d a7, t0, a5 ++ ++ srai.d a3, a2, 4 ++ beqz a3, L(un_less_16bytes) ++ ++ andi a3, a2, 0x3f ++ beq a3, a2, L(un_less_64bytes) ++ ++ sub.d t0, a2, a3 ++ move a2, a3 ++ add.d a3, a1, t0 ++ ++# a5 shift right num ++# a6 shift left num 
++# a7 remaining part ++L(un_long_bytes): ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a1, 16 ++ ld.d t3, a1, 24 ++ ++ srl.d t4, t0, a5 ++ sll.d t0, t0, a6 ++ ++ srl.d t5, t1, a5 ++ sll.d t1, t1, a6 ++ ++ srl.d t6, t2, a5 ++ sll.d t2, t2, a6 ++ ++ srl.d t7, t3, a5 ++ sll.d t3, t3, a6 ++ ++ or t0, a7, t0 ++ or t1, t4, t1 ++ or t2, t5, t2 ++ or t3, t6, t3 ++ ++ ld.d t4, a1, 32 ++ ld.d t5, a1, 40 ++ ld.d t6, a1, 48 ++ ld.d a7, a1, 56 ++ ++ st.d t0, a4, 0 ++ st.d t1, a4, 8 ++ st.d t2, a4, 16 ++ st.d t3, a4, 24 ++ ++ addi.d a1, a1, 64 ++ ++ srl.d t0, t4, a5 ++ sll.d t4, t4, a6 ++ ++ srl.d t1, t5, a5 ++ sll.d t5, t5, a6 ++ ++ srl.d t2, t6, a5 ++ sll.d t6, t6, a6 ++ ++ sll.d t3, a7, a6 ++ srl.d a7, a7, a5 ++ ++ or t4, t7, t4 ++ or t5, t0, t5 ++ or t6, t1, t6 ++ or t3, t2, t3 ++ ++ st.d t4, a4, 32 ++ st.d t5, a4, 40 ++ st.d t6, a4, 48 ++ st.d t3, a4, 56 ++ ++ addi.d a4, a4, 64 ++ bne a3, a1, L(un_long_bytes) ++ ++L(un_less_64bytes): ++ srai.d a3, a2, 5 ++ beqz a3, L(un_less_32bytes) ++ ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a1, 16 ++ ld.d t3, a1, 24 ++ ++ addi.d a1, a1, 32 ++ addi.d a2, a2, -32 ++ ++ srl.d t4, t0, a5 ++ sll.d t0, t0, a6 ++ ++ srl.d t5, t1, a5 ++ sll.d t1, t1, a6 ++ ++ srl.d t6, t2, a5 ++ sll.d t2, t2, a6 ++ ++ or t0, a7, t0 ++ ++ srl.d a7, t3, a5 ++ sll.d t3, t3, a6 ++ ++ or t1, t4, t1 ++ or t2, t5, t2 ++ or t3, t6, t3 ++ ++ st.d t0, a4, 0 ++ st.d t1, a4, 8 ++ st.d t2, a4, 16 ++ st.d t3, a4, 24 ++ ++ addi.d a4, a4, 32 ++ ++L(un_less_32bytes): ++ srai.d a3, a2, 4 ++ beqz a3, L(un_less_16bytes) ++ ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ++ addi.d a1, a1, 16 ++ addi.d a2, a2, -16 ++ ++ srl.d t2, t0, a5 ++ sll.d t3, t0, a6 ++ ++ sll.d t4, t1, a6 ++ or t3, a7, t3 ++ or t4, t2, t4 ++ srl.d a7, t1, a5 ++ ++ st.d t3, a4, 0 ++ st.d t4, a4, 8 ++ ++ addi.d a4, a4, 16 ++ ++L(un_less_16bytes): ++ srai.d a3, a2, 3 ++ beqz a3, L(un_less_8bytes) ++ ++ ld.d t0, a1, 0 ++ ++ addi.d a1, a1, 8 ++ addi.d a2, a2, -8 ++ ++ sll.d t1, t0, a6 ++ or t2, a7, t1 ++ srl.d a7, 
t0, a5 ++ ++ st.d t2, a4, 0 ++ addi.d a4, a4, 8 ++ ++L(un_less_8bytes): ++ beqz a2, L(un_less_1byte) ++ bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 ++ ++ # combine data in memory and a7(remaining part) ++ ld.d t0, a1, 0 ++ sll.d t0, t0, a6 ++ or a7, a7, t0 ++ ++1: ++ srai.d a3, a2, 2 ++ beqz a3, L(un_less_4bytes) ++ ++ addi.d a2, a2, -4 ++ st.w a7, a4, 0 ++ addi.d a4, a4, 4 ++ srai.d a7, a7, 32 ++ ++L(un_less_4bytes): ++ srai.d a3, a2, 1 ++ beqz a3, L(un_less_2bytes) ++ ++ addi.d a2, a2, -2 ++ st.h a7, a4, 0 ++ addi.d a4, a4, 2 ++ srai.d a7, a7, 16 + ++L(un_less_2bytes): ++ beqz a2, L(un_less_1byte) ++ st.b a7, a4, 0 ++ ++L(un_less_1byte): ++ jr ra ++ ++# Bytes copying for data less than 16 bytes ++L(short_data): ++ pcaddi t1, 36 ++ slli.d t2, a2, 3 ++ add.d a4, a0, a2 ++ sub.d t1, t1, t2 ++ add.d a1, a1, a2 ++ jr t1 ++ ++L(short_15_bytes): ++ ld.b t0, a1, -15 ++ st.b t0, a4, -15 ++L(short_14_bytes): ++ ld.b t0, a1, -14 ++ st.b t0, a4, -14 ++L(short_13_bytes): ++ ld.b t0, a1, -13 ++ st.b t0, a4, -13 ++L(short_12_bytes): ++ ld.b t0, a1, -12 ++ st.b t0, a4, -12 ++L(short_11_bytes): ++ ld.b t0, a1, -11 ++ st.b t0, a4, -11 ++L(short_10_bytes): ++ ld.b t0, a1, -10 ++ st.b t0, a4, -10 ++L(short_9_bytes): ++ ld.b t0, a1, -9 ++ st.b t0, a4, -9 ++L(short_8_bytes): ++ ld.b t0, a1, -8 ++ st.b t0, a4, -8 ++L(short_7_bytes): ++ ld.b t0, a1, -7 ++ st.b t0, a4, -7 ++L(short_6_bytes): ++ ld.b t0, a1, -6 ++ st.b t0, a4, -6 ++L(short_5_bytes): ++ ld.b t0, a1, -5 ++ st.b t0, a4, -5 ++L(short_4_bytes): ++ ld.b t0, a1, -4 ++ st.b t0, a4, -4 ++L(short_3_bytes): ++ ld.b t0, a1, -3 ++ st.b t0, a4, -3 ++L(short_2_bytes): ++ ld.b t0, a1, -2 ++ st.b t0, a4, -2 ++L(short_1_bytes): ++ ld.b t0, a1, -1 ++ st.b t0, a4, -1 ++ jr ra ++ ++L(copy_back): ++ srai.d a3, a2, 4 ++ beqz a3, L(back_short_data) # less than 16 bytes ++ ++ add.d a4, a0, a2 # store the tail of dest ++ add.d a1, a1, a2 # store the tail of src ++ ++ andi a5, a4, 0x7 ++ andi a6, a1, 0x7 ++ beqz a5, 
L(back_check_align) ++ ++ # make dest aligned 8 bytes ++ sub.d a2, a2, a5 ++ sub.d a1, a1, a5 ++ sub.d a4, a4, a5 ++ ++ pcaddi t1, 18 ++ slli.d t3, a5, 3 ++ sub.d t1, t1, t3 ++ jr t1 ++ ++ ld.b t0, a1, 6 ++ st.b t0, a4, 6 ++ ld.b t0, a1, 5 ++ st.b t0, a4, 5 ++ ld.b t0, a1, 4 ++ st.b t0, a4, 4 ++ ld.b t0, a1, 3 ++ st.b t0, a4, 3 ++ ld.b t0, a1, 2 ++ st.b t0, a4, 2 ++ ld.b t0, a1, 1 ++ st.b t0, a4, 1 ++ ld.b t0, a1, 0 ++ st.b t0, a4, 0 ++ ++L(back_check_align): ++ bne a5, a6, L(back_unalign) ++ ++ srai.d a3, a2, 4 ++ beqz a3, L(back_less_16bytes) ++ ++ andi a3, a2, 0x3f ++ beq a3, a2, L(back_less_64bytes) ++ ++ sub.d t0, a2, a3 ++ move a2, a3 ++ sub.d a5, a1, t0 ++ ++L(back_loop_64bytes): ++ LD_64(a1, -64) ++ addi.d a1, a1, -64 ++ ST_64(a4, -64) ++ ++ addi.d a4, a4, -64 ++ bne a1, a5, L(back_loop_64bytes) ++ ++L(back_less_64bytes): ++ srai.d a3, a2, 5 ++ beqz a3, L(back_less_32bytes) ++ ++ ld.d t0, a1, -32 ++ ld.d t1, a1, -24 ++ ld.d t2, a1, -16 ++ ld.d t3, a1, -8 ++ ++ addi.d a1, a1, -32 ++ addi.d a2, a2, -32 ++ ++ st.d t0, a4, -32 ++ st.d t1, a4, -24 ++ st.d t2, a4, -16 ++ st.d t3, a4, -8 ++ ++ addi.d a4, a4, -32 ++ ++L(back_less_32bytes): ++ srai.d a3, a2, 4 ++ beqz a3, L(back_less_16bytes) ++ ++ ld.d t0, a1, -16 ++ ld.d t1, a1, -8 ++ ++ addi.d a2, a2, -16 ++ addi.d a1, a1, -16 ++ ++ st.d t0, a4, -16 ++ st.d t1, a4, -8 ++ addi.d a4, a4, -16 ++ ++L(back_less_16bytes): ++ srai.d a3, a2, 3 ++ beqz a3, L(back_less_8bytes) ++ ++ ld.d t0, a1, -8 ++ addi.d a2, a2, -8 ++ addi.d a1, a1, -8 ++ ++ st.d t0, a4, -8 ++ addi.d a4, a4, -8 ++ ++L(back_less_8bytes): ++ srai.d a3, a2, 2 ++ beqz a3, L(back_less_4bytes) ++ ++ ld.w t0, a1, -4 ++ addi.d a2, a2, -4 ++ addi.d a1, a1, -4 ++ ++ st.w t0, a4, -4 ++ addi.d a4, a4, -4 ++ ++L(back_less_4bytes): ++ srai.d a3, a2, 1 ++ beqz a3, L(back_less_2bytes) ++ ++ ld.h t0, a1, -2 ++ addi.d a2, a2, -2 ++ addi.d a1, a1, -2 ++ ++ st.h t0, a4, -2 ++ addi.d a4, a4, -2 ++ ++L(back_less_2bytes): ++ beqz a2, L(back_less_1byte) ++ ++ ld.b t0, a1, -1 
++ st.b t0, a4, -1 ++ ++L(back_less_1byte): ++ jr ra ++ ++L(back_unalign): ++ andi t8, a1, 0x7 ++ bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned ++ ++ sub.d a6, zero, t8 ++ ++ ld.d t0, a1, 0 ++ slli.d a6, a6, 3 ++ slli.d a5, t8, 3 ++ sll.d a7, t0, a6 ++ ++ srai.d a3, a2, 4 ++ beqz a3, L(back_un_less_16bytes) ++ ++ andi a3, a2, 0x3f ++ beq a3, a2, L(back_un_less_64bytes) ++ ++ sub.d t0, a2, a3 ++ move a2, a3 ++ sub.d a3, a1, t0 ++ ++L(back_un_long_bytes): ++ ld.d t0, a1, -8 ++ ld.d t1, a1, -16 ++ ld.d t2, a1, -24 ++ ld.d t3, a1, -32 ++ ++ sll.d t4, t0, a6 ++ srl.d t0, t0, a5 ++ ++ sll.d t5, t1, a6 ++ srl.d t1, t1, a5 ++ ++ sll.d t6, t2, a6 ++ srl.d t2, t2, a5 ++ ++ sll.d t7, t3, a6 ++ srl.d t3, t3, a5 ++ ++ or t0, t0, a7 ++ or t1, t1, t4 ++ or t2, t2, t5 ++ or t3, t3, t6 ++ ++ ld.d t4, a1, -40 ++ ld.d t5, a1, -48 ++ ld.d t6, a1, -56 ++ ld.d a7, a1, -64 ++ st.d t0, a4, -8 ++ st.d t1, a4, -16 ++ st.d t2, a4, -24 ++ st.d t3, a4, -32 ++ ++ addi.d a1, a1, -64 ++ ++ sll.d t0, t4, a6 ++ srl.d t4, t4, a5 ++ ++ sll.d t1, t5, a6 ++ srl.d t5, t5, a5 ++ ++ sll.d t2, t6, a6 ++ srl.d t6, t6, a5 ++ ++ srl.d t3, a7, a5 ++ sll.d a7, a7, a6 ++ ++ or t4, t7, t4 ++ or t5, t0, t5 ++ or t6, t1, t6 ++ or t3, t2, t3 ++ ++ st.d t4, a4, -40 ++ st.d t5, a4, -48 ++ st.d t6, a4, -56 ++ st.d t3, a4, -64 ++ ++ addi.d a4, a4, -64 ++ bne a3, a1, L(back_un_long_bytes) ++ ++L(back_un_less_64bytes): ++ srai.d a3, a2, 5 ++ beqz a3, L(back_un_less_32bytes) ++ ++ ld.d t0, a1, -8 ++ ld.d t1, a1, -16 ++ ld.d t2, a1, -24 ++ ld.d t3, a1, -32 ++ ++ addi.d a1, a1, -32 ++ addi.d a2, a2, -32 ++ ++ sll.d t4, t0, a6 ++ srl.d t0, t0, a5 ++ ++ sll.d t5, t1, a6 ++ srl.d t1, t1, a5 ++ ++ sll.d t6, t2, a6 ++ srl.d t2, t2, a5 ++ ++ or t0, a7, t0 ++ ++ sll.d a7, t3, a6 ++ srl.d t3, t3, a5 ++ ++ or t1, t4, t1 ++ or t2, t5, t2 ++ or t3, t6, t3 ++ ++ st.d t0, a4, -8 ++ st.d t1, a4, -16 ++ st.d t2, a4, -24 ++ st.d t3, a4, -32 ++ ++ addi.d a4, a4, -32 ++ ++L(back_un_less_32bytes): ++ srai.d a3, a2, 4 ++ beqz a3, 
L(back_un_less_16bytes) ++ ++ ld.d t0, a1, -8 ++ ld.d t1, a1, -16 ++ ++ addi.d a1, a1, -16 ++ addi.d a2, a2, -16 ++ ++ sll.d t2, t0, a6 ++ srl.d t3, t0, a5 ++ ++ srl.d t4, t1, a5 ++ or t3, a7, t3 ++ or t4, t2, t4 ++ sll.d a7, t1, a6 ++ ++ st.d t3, a4, -8 ++ st.d t4, a4, -16 ++ ++ addi.d a4, a4, -16 ++ ++L(back_un_less_16bytes): ++ srai.d a3, a2, 3 ++ beqz a3, L(back_un_less_8bytes) ++ ++ ld.d t0, a1, -8 ++ ++ addi.d a1, a1, -8 ++ addi.d a2, a2, -8 ++ ++ srl.d t1, t0, a5 ++ or t2, a7, t1 ++ sll.d a7, t0, a6 ++ ++ st.d t2, a4, -8 ++ addi.d a4, a4, -8 ++ ++L(back_un_less_8bytes): ++ beqz a2, L(back_end) ++ bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 ++ ++ # combine data in memory and a7(remaining part) ++ ld.d t0, a1, -8 ++ srl.d t0, t0, a5 ++ or a7, a7, t0 ++ ++1: ++ srai.d a3, a2, 2 ++ beqz a3, L(back_un_less_4bytes) ++ ++ srai.d t0, a7, 32 ++ addi.d a2, a2, -4 ++ st.w t0, a4, -4 ++ addi.d a4, a4, -4 ++ slli.d a7, a7, 32 ++ ++L(back_un_less_4bytes): ++ srai.d a3, a2, 1 ++ beqz a3, L(back_un_less_2bytes) ++ srai.d t0, a7, 48 ++ addi.d a2, a2, -2 ++ st.h t0, a4, -2 ++ addi.d a4, a4, -2 ++ slli.d a7, a7, 16 ++L(back_un_less_2bytes): ++ beqz a2, L(back_un_less_1byte) ++ srai.d t0, a7, 56 ++ st.b t0, a4, -1 ++L(back_un_less_1byte): ++ jr ra ++ ++L(back_short_data): ++ pcaddi t1, 34 ++ slli.d t2, a2, 3 ++ sub.d t1, t1, t2 ++ jr t1 ++ ++ ld.b t0, a1, 14 ++ st.b t0, a0, 14 ++ ld.b t0, a1, 13 ++ st.b t0, a0, 13 ++ ld.b t0, a1, 12 ++ st.b t0, a0, 12 ++ ld.b t0, a1, 11 ++ st.b t0, a0, 11 ++ ld.b t0, a1, 10 ++ st.b t0, a0, 10 ++ ld.b t0, a1, 9 ++ st.b t0, a0, 9 ++ ld.b t0, a1, 8 ++ st.b t0, a0, 8 ++ ld.b t0, a1, 7 ++ st.b t0, a0, 7 ++ ld.b t0, a1, 6 ++ st.b t0, a0, 6 ++ ld.b t0, a1, 5 ++ st.b t0, a0, 5 ++ ld.b t0, a1, 4 ++ st.b t0, a0, 4 ++ ld.b t0, a1, 3 ++ st.b t0, a0, 3 ++ ld.b t0, a1, 2 ++ st.b t0, a0, 2 ++ ld.b t0, a1, 1 ++ st.b t0, a0, 1 ++ ld.b t0, a1, 0 ++ st.b t0, a0, 0 ++L(back_end): ++ jr ra ++ ++END(MEMCPY_NAME) ++ ++#ifdef _LIBC 
++libc_hidden_builtin_def (MEMCPY_NAME) ++#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S +index da2f5ada..412ee849 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S +@@ -1,9 +1,169 @@ ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif + + #if IS_IN (libc) +- + #define MEMSET_NAME __memset_aligned +- ++#else ++#define MEMSET_NAME memset + #endif + +-#include "../memset.S" ++#define ST_64(n) \ ++ st.d a1, a0, n; \ ++ st.d a1, a0, n+8; \ ++ st.d a1, a0, n+16; \ ++ st.d a1, a0, n+24; \ ++ st.d a1, a0, n+32; \ ++ st.d a1, a0, n+40; \ ++ st.d a1, a0, n+48; \ ++ st.d a1, a0, n+56; ++ ++LEAF(MEMSET_NAME, 6) ++ move t0, a0 ++ andi a3, a0, 0x7 ++ li.w t6, 16 ++ beqz a3, L(align) ++ blt a2, t6, L(short_data) ++ ++L(make_align): ++ li.w t8, 8 ++ sub.d t2, t8, a3 ++ pcaddi t1, 11 ++ slli.d t3, t2, 2 ++ sub.d t1, t1, t3 ++ jirl zero, t1, 0 ++ ++L(al7): ++ st.b a1, t0, 6 ++L(al6): ++ st.b a1, t0, 5 ++L(al5): ++ st.b a1, t0, 4 ++L(al4): ++ st.b a1, t0, 3 ++L(al3): ++ st.b a1, t0, 2 ++L(al2): ++ st.b a1, t0, 1 ++L(al1): ++ st.b a1, t0, 0 ++L(al0): ++ add.d t0, t0, t2 ++ sub.d a2, a2, t2 ++ ++L(align): ++ bstrins.d a1, a1, 15, 8 ++ bstrins.d a1, a1, 31, 16 ++ bstrins.d a1, a1, 63, 32 ++ ++ blt a2, t6, L(less_16bytes) ++ ++ andi a4, a2, 0x3f ++ beq a4, a2, L(less_64bytes) ++ ++ sub.d t1, a2, a4 ++ move a2, a4 ++ add.d a5, t0, t1 ++ ++L(loop_64bytes): ++ addi.d t0, t0, 64 ++ st.d a1, t0, -64 ++ st.d a1, t0, -56 ++ st.d a1, t0, -48 ++ st.d a1, t0, -40 ++ st.d a1, t0, -32 ++ st.d a1, t0, -24 ++ st.d a1, t0, -16 ++ st.d a1, t0, -8 ++ bne t0, a5, L(loop_64bytes) ++ ++L(less_64bytes): ++ srai.d a4, a2, 5 ++ beqz a4, L(less_32bytes) ++ addi.d a2, a2, -32 ++ st.d a1, t0, 0 ++ st.d a1, t0, 8 ++ st.d a1, t0, 16 ++ st.d a1, t0, 24 ++ addi.d t0, t0, 32 ++L(less_32bytes): ++ blt a2, t6, L(less_16bytes) ++ addi.d a2, 
a2, -16 ++ st.d a1, t0, 0 ++ st.d a1, t0, 8 ++ addi.d t0, t0, 16 ++L(less_16bytes): ++ srai.d a4, a2, 3 ++ beqz a4, L(less_8bytes) ++ addi.d a2, a2, -8 ++ st.d a1, t0, 0 ++ addi.d t0, t0, 8 ++L(less_8bytes): ++ beqz a2, L(less_1byte) ++ srai.d a4, a2, 2 ++ beqz a4, L(less_4bytes) ++ addi.d a2, a2, -4 ++ st.w a1, t0, 0 ++ addi.d t0, t0, 4 ++L(less_4bytes): ++ srai.d a3, a2, 1 ++ beqz a3, L(less_2bytes) ++ addi.d a2, a2, -2 ++ st.h a1, t0, 0 ++ addi.d t0, t0, 2 ++L(less_2bytes): ++ beqz a2, L(less_1byte) ++ st.b a1, t0, 0 ++L(less_1byte): ++ jr ra ++ ++L(short_data): ++ pcaddi t1, 19 ++ slli.d t3, a2, 2 ++ sub.d t1, t1, t3 ++ jirl zero, t1, 0 ++L(short_15): ++ st.b a1, a0, 14 ++ ++L(short_14): ++ st.b a1, a0, 13 ++L(short_13): ++ st.b a1, a0, 12 ++L(short_12): ++ st.b a1, a0, 11 ++L(short_11): ++ st.b a1, a0, 10 ++L(short_10): ++ st.b a1, a0, 9 ++L(short_9): ++ st.b a1, a0, 8 ++L(short_8): ++ st.b a1, a0, 7 ++L(short_7): ++ st.b a1, a0, 6 ++L(short_6): ++ st.b a1, a0, 5 ++L(short_5): ++ st.b a1, a0, 4 ++L(short_4): ++ st.b a1, a0, 3 ++L(short_3): ++ st.b a1, a0, 2 ++L(short_2): ++ st.b a1, a0, 1 ++L(short_1): ++ st.b a1, a0, 0 ++L(short_0): ++ jr ra ++ ++END(MEMSET_NAME) ++ ++#ifdef _LIBC ++libc_hidden_builtin_def (MEMSET_NAME) ++#endif + +diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S +index 0b46b4ca..a13e293f 100644 +--- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S +@@ -1,7 +1,115 @@ ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif + + #if IS_IN (libc) + #define RAWMEMCHR_NAME __rawmemchr_aligned ++#else ++#define RAWMEMCHR_NAME __rawmemchr + #endif + +-#include "../rawmemchr.S" ++LEAF(RAWMEMCHR_NAME, 6) ++ andi t1, a0, 0x7 ++ bstrins.d a0, zero, 2, 0 ++ lu12i.w a2, 0x01010 ++ bstrins.d a1, a1, 15, 8 ++ ++ ld.d t0, a0, 0 ++ slli.d t1, t1, 3 ++ ori a2, a2, 0x101 ++ bstrins.d a1, a1, 31, 
16 ++ ++ li.w t8, -1 ++ bstrins.d a1, a1, 63, 32 ++ bstrins.d a2, a2, 63, 32 ++ sll.d t2, t8, t1 ++ ++ sll.d t3, a1, t1 ++ orn t0, t0, t2 ++ slli.d a3, a2, 7 ++ beqz a1, L(find_zero) ++ ++ xor t0, t0, t3 ++ sub.d t1, t0, a2 ++ andn t2, a3, t0 ++ and t3, t1, t2 ++ ++ bnez t3, L(count_pos) ++ addi.d a0, a0, 8 ++ ++L(loop): ++ ld.d t0, a0, 0 ++ xor t0, t0, a1 ++ ++ sub.d t1, t0, a2 ++ andn t2, a3, t0 ++ and t3, t1, t2 ++ bnez t3, L(count_pos) ++ ++ ld.d t0, a0, 8 ++ addi.d a0, a0, 16 ++ xor t0, t0, a1 ++ sub.d t1, t0, a2 ++ ++ andn t2, a3, t0 ++ and t3, t1, t2 ++ beqz t3, L(loop) ++ addi.d a0, a0, -8 ++L(count_pos): ++ ctz.d t0, t3 ++ srli.d t0, t0, 3 ++ add.d a0, a0, t0 ++ jr ra ++ ++L(loop_7bit): ++ ld.d t0, a0, 0 ++L(find_zero): ++ sub.d t1, t0, a2 ++ and t2, t1, a3 ++ bnez t2, L(more_check) ++ ++ ld.d t0, a0, 8 ++ addi.d a0, a0, 16 ++ sub.d t1, t0, a2 ++ and t2, t1, a3 ++ ++ beqz t2, L(loop_7bit) ++ addi.d a0, a0, -8 ++ ++L(more_check): ++ andn t2, a3, t0 ++ and t3, t1, t2 ++ bnez t3, L(count_pos) ++ addi.d a0, a0, 8 ++ ++L(loop_8bit): ++ ld.d t0, a0, 0 ++ ++ sub.d t1, t0, a2 ++ andn t2, a3, t0 ++ and t3, t1, t2 ++ bnez t3, L(count_pos) ++ ++ ld.d t0, a0, 8 ++ addi.d a0, a0, 16 ++ sub.d t1, t0, a2 ++ ++ andn t2, a3, t0 ++ and t3, t1, t2 ++ beqz t3, L(loop_8bit) ++ ++ addi.d a0, a0, -8 ++ b L(count_pos) ++ ++END(RAWMEMCHR_NAME) ++ ++#ifdef _LIBC ++weak_alias (__rawmemchr, rawmemchr) ++libc_hidden_builtin_def (__rawmemchr) ++#endif + +diff --git a/sysdeps/loongarch/lp64/rawmemchr.S b/sysdeps/loongarch/lp64/rawmemchr.S +deleted file mode 100644 +index ef1db7ed..00000000 +--- a/sysdeps/loongarch/lp64/rawmemchr.S ++++ /dev/null +@@ -1,113 +0,0 @@ +-#ifdef _LIBC +-#include +-#include +-#include +-#else +-#include +-#include +-#endif +- +-#ifndef RAWMEMCHR_NAME +-# define RAWMEMCHR_NAME __rawmemchr +-#endif +- +- +-LEAF(RAWMEMCHR_NAME, 6) +- andi t1, a0, 0x7 +- bstrins.d a0, zero, 2, 0 +- lu12i.w a2, 0x01010 +- bstrins.d a1, a1, 15, 8 +- +- ld.d t0, a0, 0 +- slli.d t1, 
t1, 3 +- ori a2, a2, 0x101 +- bstrins.d a1, a1, 31, 16 +- +- li.w t8, -1 +- bstrins.d a1, a1, 63, 32 +- bstrins.d a2, a2, 63, 32 +- sll.d t2, t8, t1 +- +- sll.d t3, a1, t1 +- orn t0, t0, t2 +- slli.d a3, a2, 7 +- beqz a1, L(find_zero) +- +- xor t0, t0, t3 +- sub.d t1, t0, a2 +- andn t2, a3, t0 +- and t3, t1, t2 +- +- bnez t3, L(count_pos) +- addi.d a0, a0, 8 +- +-L(loop): +- ld.d t0, a0, 0 +- xor t0, t0, a1 +- +- sub.d t1, t0, a2 +- andn t2, a3, t0 +- and t3, t1, t2 +- bnez t3, L(count_pos) +- +- ld.d t0, a0, 8 +- addi.d a0, a0, 16 +- xor t0, t0, a1 +- sub.d t1, t0, a2 +- +- andn t2, a3, t0 +- and t3, t1, t2 +- beqz t3, L(loop) +- addi.d a0, a0, -8 +-L(count_pos): +- ctz.d t0, t3 +- srli.d t0, t0, 3 +- add.d a0, a0, t0 +- jr ra +- +-L(loop_7bit): +- ld.d t0, a0, 0 +-L(find_zero): +- sub.d t1, t0, a2 +- and t2, t1, a3 +- bnez t2, L(more_check) +- +- ld.d t0, a0, 8 +- addi.d a0, a0, 16 +- sub.d t1, t0, a2 +- and t2, t1, a3 +- +- beqz t2, L(loop_7bit) +- addi.d a0, a0, -8 +- +-L(more_check): +- andn t2, a3, t0 +- and t3, t1, t2 +- bnez t3, L(count_pos) +- addi.d a0, a0, 8 +- +-L(loop_8bit): +- ld.d t0, a0, 0 +- +- sub.d t1, t0, a2 +- andn t2, a3, t0 +- and t3, t1, t2 +- bnez t3, L(count_pos) +- +- ld.d t0, a0, 8 +- addi.d a0, a0, 16 +- sub.d t1, t0, a2 +- +- andn t2, a3, t0 +- and t3, t1, t2 +- beqz t3, L(loop_8bit) +- +- addi.d a0, a0, -8 +- b L(count_pos) +- +-END(RAWMEMCHR_NAME) +- +-#ifdef _LIBC +-weak_alias (__rawmemchr, rawmemchr) +-libc_hidden_builtin_def (__rawmemchr) +-#endif +-- +2.33.0 + diff --git a/glibc-2.28-Refactor-code-of-st-r-p-functions.patch b/glibc-2.28-Refactor-code-of-st-r-p-functions.patch new file mode 100644 index 0000000..7c453e7 --- /dev/null +++ b/glibc-2.28-Refactor-code-of-st-r-p-functions.patch @@ -0,0 +1,2770 @@ +From b720fd44df475685ea164491d76c42e127aab3ea Mon Sep 17 00:00:00 2001 +From: caiyinyu +Date: Wed, 21 Jun 2023 10:49:39 +0800 +Subject: [PATCH 07/14] glibc-2.28: Refactor code of st{r,p}* functions. 
+ +Change-Id: Ife977373e9ba071b284ee19ca4ba121bc27d5834 +Signed-off-by: ticat_fp +--- + .../loongarch/lp64/multiarch/stpcpy-aligned.S | 179 +++++++++++- + .../loongarch/lp64/multiarch/strchr-aligned.S | 91 ++++++- + .../lp64/multiarch/strchrnul-aligned.S | 94 ++++++- + .../loongarch/lp64/multiarch/strcmp-aligned.S | 225 ++++++++++++++- + .../loongarch/lp64/multiarch/strcpy-aligned.S | 173 +++++++++++- + .../loongarch/lp64/multiarch/strlen-aligned.S | 85 +++++- + .../lp64/multiarch/strncmp-aligned.S | 256 +++++++++++++++++- + .../lp64/multiarch/strnlen-aligned.S | 82 +++++- + .../lp64/multiarch/strrchr-aligned.S | 105 ++++++- + sysdeps/loongarch/lp64/stpcpy.S | 179 ------------ + sysdeps/loongarch/lp64/strchr.S | 89 ------ + sysdeps/loongarch/lp64/strchrnul.S | 94 ------- + sysdeps/loongarch/lp64/strcmp.S | 227 ---------------- + sysdeps/loongarch/lp64/strcpy.S | 173 ------------ + sysdeps/loongarch/lp64/strlen.S | 85 ------ + sysdeps/loongarch/lp64/strncmp.S | 256 ------------------ + sysdeps/loongarch/lp64/strnlen.S | 82 ------ + sysdeps/loongarch/lp64/strrchr.S | 105 ------- + 18 files changed, 1264 insertions(+), 1316 deletions(-) + delete mode 100644 sysdeps/loongarch/lp64/stpcpy.S + delete mode 100644 sysdeps/loongarch/lp64/strchr.S + delete mode 100644 sysdeps/loongarch/lp64/strchrnul.S + delete mode 100644 sysdeps/loongarch/lp64/strcmp.S + delete mode 100644 sysdeps/loongarch/lp64/strcpy.S + delete mode 100644 sysdeps/loongarch/lp64/strlen.S + delete mode 100644 sysdeps/loongarch/lp64/strncmp.S + delete mode 100644 sysdeps/loongarch/lp64/strnlen.S + delete mode 100644 sysdeps/loongarch/lp64/strrchr.S + +diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S +index 3d134e3f..7109b0f0 100644 +--- a/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S +@@ -1,8 +1,181 @@ ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include 
++#endif + + #if IS_IN (libc) +- + #define STPCPY_NAME __stpcpy_aligned +- ++#else ++#define STPCPY_NAME __stpcpy + #endif + +-#include "../stpcpy.S" ++LEAF(STPCPY_NAME, 6) ++ andi a3, a0, 0x7 ++ beqz a3, L(dest_align) ++ sub.d a5, a1, a3 ++ addi.d a5, a5, 8 ++ ++L(make_dest_align): ++ ld.b t0, a1, 0 ++ addi.d a1, a1, 1 ++ st.b t0, a0, 0 ++ addi.d a0, a0, 1 ++ ++ beqz t0, L(al_out) ++ bne a1, a5, L(make_dest_align) ++ ++L(dest_align): ++ andi a4, a1, 7 ++ bstrins.d a1, zero, 2, 0 ++ ++ lu12i.w t5, 0x1010 ++ ld.d t0, a1, 0 ++ ori t5, t5, 0x101 ++ bstrins.d t5, t5, 63, 32 ++ ++ slli.d t6, t5, 0x7 ++ bnez a4, L(unalign) ++ sub.d t1, t0, t5 ++ andn t2, t6, t0 ++ ++ and t3, t1, t2 ++ bnez t3, L(al_end) ++ ++L(al_loop): ++ st.d t0, a0, 0 ++ ld.d t0, a1, 8 ++ ++ addi.d a1, a1, 8 ++ addi.d a0, a0, 8 ++ sub.d t1, t0, t5 ++ andn t2, t6, t0 ++ ++ and t3, t1, t2 ++ beqz t3, L(al_loop) ++ ++L(al_end): ++ ctz.d t1, t3 ++ srli.d t1, t1, 3 ++ addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest ++ ++ andi a3, t1, 8 ++ andi a4, t1, 4 ++ andi a5, t1, 2 ++ andi a6, t1, 1 ++ ++L(al_end_8): ++ beqz a3, L(al_end_4) ++ st.d t0, a0, 0 ++ addi.d a0, a0, 7 ++ jr ra ++L(al_end_4): ++ beqz a4, L(al_end_2) ++ st.w t0, a0, 0 ++ addi.d a0, a0, 4 ++ srli.d t0, t0, 32 ++L(al_end_2): ++ beqz a5, L(al_end_1) ++ st.h t0, a0, 0 ++ addi.d a0, a0, 2 ++ srli.d t0, t0, 16 ++L(al_end_1): ++ beqz a6, L(al_out) ++ st.b t0, a0, 0 ++ addi.d a0, a0, 1 ++L(al_out): ++ addi.d a0, a0, -1 ++ jr ra ++ ++L(unalign): ++ slli.d a5, a4, 3 ++ li.d t1, -1 ++ sub.d a6, zero, a5 ++ ++ srl.d a7, t0, a5 ++ sll.d t7, t1, a6 ++ ++ or t0, a7, t7 ++ sub.d t1, t0, t5 ++ andn t2, t6, t0 ++ and t3, t1, t2 ++ ++ bnez t3, L(un_end) ++ ++ ld.d t4, a1, 8 ++ addi.d a1, a1, 8 ++ ++ sub.d t1, t4, t5 ++ andn t2, t6, t4 ++ sll.d t0, t4, a6 ++ and t3, t1, t2 ++ ++ or t0, t0, a7 ++ bnez t3, L(un_end_with_remaining) ++ ++L(un_loop): ++ srl.d a7, t4, a5 ++ ++ ld.d t4, a1, 8 ++ addi.d a1, a1, 8 ++ ++ st.d t0, a0, 0 ++ addi.d a0, a0, 8 
++ ++ sub.d t1, t4, t5 ++ andn t2, t6, t4 ++ sll.d t0, t4, a6 ++ and t3, t1, t2 ++ ++ or t0, t0, a7 ++ beqz t3, L(un_loop) ++ ++L(un_end_with_remaining): ++ ctz.d t1, t3 ++ srli.d t1, t1, 3 ++ addi.d t1, t1, 1 ++ sub.d t1, t1, a4 ++ ++ blt t1, zero, L(un_end_less_8) ++ st.d t0, a0, 0 ++ addi.d a0, a0, 8 ++ beqz t1, L(un_out) ++ srl.d t0, t4, a5 # get the remaining part ++ b L(un_end_less_8) ++ ++L(un_end): ++ ctz.d t1, t3 ++ srli.d t1, t1, 3 ++ addi.d t1, t1, 1 ++ ++L(un_end_less_8): ++ andi a4, t1, 4 ++ andi a5, t1, 2 ++ andi a6, t1, 1 ++L(un_end_4): ++ beqz a4, L(un_end_2) ++ st.w t0, a0, 0 ++ addi.d a0, a0, 4 ++ srli.d t0, t0, 32 ++L(un_end_2): ++ beqz a5, L(un_end_1) ++ st.h t0, a0, 0 ++ addi.d a0, a0, 2 ++ srli.d t0, t0, 16 ++L(un_end_1): ++ beqz a6, L(un_out) ++ st.b t0, a0, 0 ++ addi.d a0, a0, 1 ++L(un_out): ++ addi.d a0, a0, -1 ++ jr ra ++ ++END(STPCPY_NAME) ++ ++#ifdef _LIBC ++weak_alias (STPCPY_NAME, stpcpy) ++libc_hidden_builtin_def (STPCPY_NAME) ++#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S +index 92365658..d9bd4587 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S +@@ -1,10 +1,95 @@ + +-#if IS_IN (libc) + +-#define STRCHR_NAME __strchr_aligned ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif + ++#if IS_IN (libc) ++#define STRCHR_NAME __strchr_aligned ++#else ++#define STRCHR_NAME strchr + #endif + +-#include "../strchr.S" ++/* char * strchr (const char *s1, int c); */ ++ ++LEAF(STRCHR_NAME, 6) ++ slli.d t1, a0, 3 ++ bstrins.d a0, zero, 2, 0 ++ lu12i.w a2, 0x01010 ++ ld.d t2, a0, 0 ++ ++ ori a2, a2, 0x101 ++ andi a1, a1, 0xff ++ bstrins.d a2, a2, 63, 32 ++ li.w t0, -1 ++ ++ mul.d a1, a1, a2 # "cccccccc" ++ sll.d t0, t0, t1 ++ slli.d a3, a2, 7 # 0x8080808080808080 ++ orn t2, t2, t0 ++ ++ sll.d t3, a1, t1 ++ xor t4, t2, t3 ++ sub.d a7, t2, a2 ++ andn a6, a3, t2 ++ ++ ++ 
sub.d a5, t4, a2 ++ andn a4, a3, t4 ++ and a6, a7, a6 ++ and a5, a5, a4 ++ ++ or t0, a6, a5 ++ bnez t0, L(_mc8_a) ++ addi.d a0, a0, 8 ++L(_aloop): ++ ld.d t4, a0, 0 ++ ++ xor t2, t4, a1 ++ sub.d a7, t4, a2 ++ andn a6, a3, t4 ++ sub.d a5, t2, a2 ++ ++ andn a4, a3, t2 ++ and a6, a7, a6 ++ and a5, a5, a4 ++ or a7, a6, a5 ++ ++ ++ bnez a7, L(_mc8_a) ++ ld.d t4, a0, 8 ++ addi.d a0, a0, 16 ++ xor t2, t4, a1 ++ ++ sub.d a7, t4, a2 ++ andn a6, a3, t4 ++ sub.d a5, t2, a2 ++ andn a4, a3, t2 ++ ++ and a6, a7, a6 ++ and a5, a5, a4 ++ or a7, a6, a5 ++ beqz a7, L(_aloop) ++ ++ addi.d a0, a0, -8 ++ ++L(_mc8_a): ++ ctz.d t0, a5 ++ ctz.d t2, a6 ++ srli.w t0, t0, 3 ++ ++ ++ srli.w t2, t2, 3 ++ sltu t1, t2, t0 ++ add.d a0, a0, t0 ++ masknez a0, a0, t1 ++ ++ jr ra ++END(STRCHR_NAME) + + weak_alias (STRCHR_NAME, index) +diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S +index 4fa63ecc..f18b01a3 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S +@@ -1,8 +1,96 @@ ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif + + #if IS_IN (libc) +- + #define STRCHRNUL_NAME __strchrnul_aligned +- ++#else ++#define STRCHRNUL_NAME __strchrnul + #endif + +-#include "../strchrnul.S" ++/* char * strchrnul (const char *s1, int c); */ ++ ++LEAF(STRCHRNUL_NAME, 6) ++ slli.d t1, a0, 3 ++ bstrins.d a0, zero, 2, 0 ++ lu12i.w a2, 0x01010 ++ ld.d t2, a0, 0 ++ ++ ori a2, a2, 0x101 ++ andi a1, a1, 0xff ++ bstrins.d a2, a2, 63, 32 ++ li.w t0, -1 ++ ++ mul.d a1, a1, a2 # "cccccccc" ++ sll.d t0, t0, t1 ++ slli.d a3, a2, 7 # 0x8080808080808080 ++ orn t2, t2, t0 ++ ++ sll.d t3, a1, t1 ++ xor t4, t2, t3 ++ sub.d a7, t2, a2 ++ andn a6, a3, t2 ++ ++ ++ sub.d a5, t4, a2 ++ andn a4, a3, t4 ++ and a6, a7, a6 ++ and a5, a5, a4 ++ ++ or t0, a6, a5 ++ bnez t0, L(_mc8_a) ++ addi.d a0, a0, 8 ++L(_aloop): ++ ld.d t4, a0, 0 ++ ++ xor t2, t4, a1 ++ 
sub.d a7, t4, a2 ++ andn a6, a3, t4 ++ sub.d a5, t2, a2 ++ ++ andn a4, a3, t2 ++ and a6, a7, a6 ++ and a5, a5, a4 ++ or a7, a6, a5 ++ ++ ++ bnez a7, L(_mc8_a) ++ ld.d t4, a0, 8 ++ addi.d a0, a0, 16 ++ xor t2, t4, a1 ++ ++ sub.d a7, t4, a2 ++ andn a6, a3, t4 ++ sub.d a5, t2, a2 ++ andn a4, a3, t2 ++ ++ and a6, a7, a6 ++ and a5, a5, a4 ++ or a7, a6, a5 ++ beqz a7, L(_aloop) ++ ++ addi.d a0, a0, -8 ++L(_mc8_a): ++ ctz.d t0, a5 ++ ctz.d t2, a6 ++ srli.w t0, t0, 3 ++ ++ srli.w t2, t2, 3 ++ slt t1, t0, t2 ++ masknez t3, t2, t1 ++ maskeqz t4, t0, t1 ++ ++ or t0, t3, t4 ++ add.d a0, a0, t0 ++ jr ra ++END(STRCHRNUL_NAME) ++ ++#ifdef _LIBC ++weak_alias(STRCHRNUL_NAME, strchrnul) ++libc_hidden_builtin_def (STRCHRNUL_NAME) ++#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S +index f84f52b8..a9b74b0c 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S +@@ -1,8 +1,229 @@ ++/* 2022\06\15 loongarch64 author: chenxiaolong. 
*/ + +-#if IS_IN (libc) ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif + ++#if IS_IN (libc) + #define STRCMP_NAME __strcmp_aligned ++#else ++#define STRCMP_NAME strcmp ++#endif ++ ++/* int strcmp (const char *s1, const char *s2); */ ++ ++/* Parameters and Results */ ++#define src1 a0 ++#define src2 a1 ++#define result v0 ++LEAF(STRCMP_NAME, 6) ++ xor a4, src1, src2 ++ lu12i.w t5, 0x01010 ++ lu12i.w t6, 0x7f7f7 ++ andi a2, src1, 0x7 ++ ++ ori t5, t5, 0x101 ++ andi a4, a4, 0x7 ++ ori t6, t6, 0xf7f ++ bstrins.d t5, t5, 63, 32 ++ bstrins.d t6, t6, 63, 32 ++ ++ bnez a4, 3f // unaligned ++ beqz a2, 1f // loop aligned ++ ++// mutual aligned ++ bstrins.d src1, zero, 2, 0 ++ bstrins.d src2, zero, 2, 0 ++ slli.d a4, a2, 0x3 ++ ld.d t0, src1, 0 ++ ++ sub.d a4, zero, a4 ++ ld.d t1, src2, 0 ++ addi.d src1, src1, 8 ++ addi.d src2, src2, 8 ++ ++ nor a5, zero, zero ++ srl.d a5, a5, a4 ++ or t0, t0, a5 ++ ++ or t1, t1, a5 ++ b 2f //start realigned ++ ++// loop aligned ++1: ++ ld.d t0, src1, 0 ++ addi.d src1, src1, 8 ++ ld.d t1, src2, 0 ++ addi.d src2, src2, 8 ++ ++// start realigned: ++2: ++ sub.d t2, t0, t5 ++ nor t3, t0, t6 ++ and t2, t2, t3 ++ ++ xor t3, t0, t1 ++ or t2, t2, t3 ++ beqz t2, 1b ++ ++ ctz.d t7, t2 ++ bstrins.d t7, zero, 2, 0 ++ srl.d t0, t0, t7 ++ srl.d t1, t1, t7 ++ ++ andi t0, t0, 0xff ++ andi t1, t1, 0xff ++ sub.d v0, t0, t1 ++ jr ra ++ ++// unaligned ++3: ++ andi a3, src2, 0x7 ++ slt a5, a2, a3 ++ masknez t8, a2, a5 ++ xor a6, src1, src2 ++ maskeqz a6, a6, t8 ++ xor src1, src1, a6 ++ xor src2, src2, a6 ++ ++ andi a2, src1, 0x7 ++ beqz a2, 4f // src1 is aligned ++ ++//strcmp_unaligned: ++ andi a3, src2, 0x7 ++ bstrins.d src1, zero, 2, 0 ++ bstrins.d src2, zero, 2, 0 ++ nor t3, zero, zero ++ ++ ld.d t0, src1, 0 ++ ld.d t1, src2, 0 ++ sub.d a2, a3, a2 ++ addi.d t2, zero, 8 ++ ++ sub.d a5, t2, a2 ++ sub.d a6, t2, a3 ++ slli.d a5, a5, 0x3 ++ slli.d a6, a6, 0x3 ++ ++ srl.d t4, t3, a6 ++ srl.d a4, t3, a5 ++ rotr.d a7, t0, a5 ++ 
++ addi.d src2, src2, 8 ++ addi.d src1, src1, 8 ++ or t1, t1, t4 ++ or t0, a7, t4 ++ ++ sub.d t2, t0, t5 ++ nor t3, t0, t6 ++ and t2, t2, t3 ++ xor t3, t0, t1 ++ or t2, t2, t3 ++ bnez t2, 7f ++ ++ and a7, a7, a4 ++ slli.d a6, a2, 0x3 ++ nor a4, zero, a4 ++ b 5f ++ ++// src1 is aligned ++4: ++ andi a3, src2, 0x7 ++ ld.d t0, src1, 0 ++ ++ bstrins.d src2, zero, 2, 0 ++ nor t2, zero, zero ++ ld.d t1, src2, 0 ++ ++ addi.d t3, zero, 0x8 ++ sub.d a5, t3, a3 ++ slli.d a5, a5, 0x3 ++ srl.d a4, t2, a5 ++ rotr.d t4, t0, a5 ++ ++ addi.d src2, src2, 8 ++ addi.d src1, src1, 8 ++ or t1, t1, a4 ++ or t0, t4, a4 ++ ++ sub.d t2, t0, t5 ++ nor t3, t0, t6 ++ and t2, t2, t3 ++ xor t3, t0, t1 ++ or t2, t2, t3 ++ ++ bnez t2, 7f ++ ++ and a7, t4, a4 ++ slli.d a6, a3, 0x3 ++ nor a4, zero, a4 ++ ++// unaligned loop ++// a7: remaining number ++// a6: shift left number ++// a5: shift right number ++// a4: mask for checking remaining number ++5: ++ or t0, a7, a4 ++ sub.d t2, t0, t5 ++ nor t3, t0, t6 ++ and t2, t2, t3 ++ bnez t2, 6f ++ ++ ld.d t0, src1, 0 ++ addi.d src1, src1, 8 ++ ld.d t1, src2, 0 ++ addi.d src2, src2, 8 ++ ++ srl.d t7, t0, a5 ++ sll.d t0, t0, a6 ++ or t0, a7, t0 ++ ++ sub.d t2, t0, t5 ++ nor t3, t0, t6 ++ and t2, t2, t3 ++ xor t3, t0, t1 ++ or t2, t2, t3 ++ bnez t2, 7f ++ ++ or a7, t7, zero ++ b 5b ++ ++6: ++ ld.bu t1, src2, 0 ++ andi t0, a7, 0xff ++ xor t2, t0, t1 ++ srli.d a7, a7, 0x8 ++ masknez t2, t0, t2 ++ addi.d src2, src2, 1 ++ beqz t2, 8f ++ b 6b ++ ++7: ++ ctz.d t7, t2 ++ bstrins.d t7, zero, 2, 0 ++ srl.d t0, t0, t7 ++ srl.d t1, t1, t7 ++ ++ andi t0, t0, 0xff ++ andi t1, t1, 0xff ++ ++8: ++ sub.d a4, t0, t1 ++ sub.d a5, t1, t0 ++ maskeqz a6, a5, t8 ++ masknez result, a4, t8 ++ or result, result, a6 ++ jr ra ++ ++END(STRCMP_NAME) + ++#ifdef _LIBC ++libc_hidden_builtin_def (STRCMP_NAME) + #endif + +-#include "../strcmp.S" +diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S +index 4860398b..80954912 100644 
+--- a/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S +@@ -1,8 +1,175 @@ ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif + + #if IS_IN (libc) +- + #define STRCPY __strcpy_aligned +- ++#else ++#define STRCPY strcpy + #endif + +-#include "../strcpy.S" ++LEAF(STRCPY, 6) ++ andi a3, a0, 0x7 ++ move a2, a0 ++ beqz a3, L(dest_align) ++ sub.d a5, a1, a3 ++ addi.d a5, a5, 8 ++ ++L(make_dest_align): ++ ld.b t0, a1, 0 ++ addi.d a1, a1, 1 ++ st.b t0, a2, 0 ++ beqz t0, L(al_out) ++ ++ addi.d a2, a2, 1 ++ bne a1, a5, L(make_dest_align) ++ ++L(dest_align): ++ andi a4, a1, 7 ++ bstrins.d a1, zero, 2, 0 ++ ++ lu12i.w t5, 0x1010 ++ ld.d t0, a1, 0 ++ ori t5, t5, 0x101 ++ bstrins.d t5, t5, 63, 32 ++ ++ slli.d t6, t5, 0x7 ++ bnez a4, L(unalign) ++ sub.d t1, t0, t5 ++ andn t2, t6, t0 ++ ++ and t3, t1, t2 ++ bnez t3, L(al_end) ++ ++L(al_loop): ++ st.d t0, a2, 0 ++ ld.d t0, a1, 8 ++ ++ addi.d a1, a1, 8 ++ addi.d a2, a2, 8 ++ sub.d t1, t0, t5 ++ andn t2, t6, t0 ++ ++ and t3, t1, t2 ++ beqz t3, L(al_loop) ++ ++L(al_end): ++ ctz.d t1, t3 ++ srli.d t1, t1, 3 ++ addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest ++ ++ andi a3, t1, 8 ++ andi a4, t1, 4 ++ andi a5, t1, 2 ++ andi a6, t1, 1 ++ ++L(al_end_8): ++ beqz a3, L(al_end_4) ++ st.d t0, a2, 0 ++ jr ra ++L(al_end_4): ++ beqz a4, L(al_end_2) ++ st.w t0, a2, 0 ++ addi.d a2, a2, 4 ++ srli.d t0, t0, 32 ++L(al_end_2): ++ beqz a5, L(al_end_1) ++ st.h t0, a2, 0 ++ addi.d a2, a2, 2 ++ srli.d t0, t0, 16 ++L(al_end_1): ++ beqz a6, L(al_out) ++ st.b t0, a2, 0 ++L(al_out): ++ jr ra ++ ++L(unalign): ++ slli.d a5, a4, 3 ++ li.d t1, -1 ++ sub.d a6, zero, a5 ++ ++ srl.d a7, t0, a5 ++ sll.d t7, t1, a6 ++ ++ or t0, a7, t7 ++ sub.d t1, t0, t5 ++ andn t2, t6, t0 ++ and t3, t1, t2 ++ ++ bnez t3, L(un_end) ++ ++ ld.d t4, a1, 8 ++ ++ sub.d t1, t4, t5 ++ andn t2, t6, t4 ++ sll.d t0, t4, a6 ++ and t3, t1, t2 ++ ++ or t0, t0, a7 ++ bnez t3, 
L(un_end_with_remaining) ++ ++L(un_loop): ++ srl.d a7, t4, a5 ++ ++ ld.d t4, a1, 16 ++ addi.d a1, a1, 8 ++ ++ st.d t0, a2, 0 ++ addi.d a2, a2, 8 ++ ++ sub.d t1, t4, t5 ++ andn t2, t6, t4 ++ sll.d t0, t4, a6 ++ and t3, t1, t2 ++ ++ or t0, t0, a7 ++ beqz t3, L(un_loop) ++ ++L(un_end_with_remaining): ++ ctz.d t1, t3 ++ srli.d t1, t1, 3 ++ addi.d t1, t1, 1 ++ sub.d t1, t1, a4 ++ ++ blt t1, zero, L(un_end_less_8) ++ st.d t0, a2, 0 ++ addi.d a2, a2, 8 ++ beqz t1, L(un_out) ++ srl.d t0, t4, a5 # get the remaining part ++ b L(un_end_less_8) ++ ++L(un_end): ++ ctz.d t1, t3 ++ srli.d t1, t1, 3 ++ addi.d t1, t1, 1 ++ ++L(un_end_less_8): ++ andi a4, t1, 4 ++ andi a5, t1, 2 ++ andi a6, t1, 1 ++L(un_end_4): ++ beqz a4, L(un_end_2) ++ st.w t0, a2, 0 ++ addi.d a2, a2, 4 ++ srli.d t0, t0, 32 ++L(un_end_2): ++ beqz a5, L(un_end_1) ++ st.h t0, a2, 0 ++ addi.d a2, a2, 2 ++ srli.d t0, t0, 16 ++L(un_end_1): ++ beqz a6, L(un_out) ++ st.b t0, a2, 0 ++L(un_out): ++ jr ra ++ ++END(STRCPY) ++ ++#ifdef _LIBC ++libc_hidden_builtin_def (STRCPY) ++#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S +index d31875fd..fcbc4f6a 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S +@@ -1,8 +1,87 @@ ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif + + #if IS_IN (libc) +- + #define STRLEN __strlen_aligned +- ++#else ++#define STRLEN strlen + #endif + +-#include "../strlen.S" ++LEAF(STRLEN, 6) ++ move a1, a0 ++ bstrins.d a0, zero, 2, 0 ++ lu12i.w a2, 0x01010 ++ li.w t0, -1 ++ ++ ld.d t2, a0, 0 ++ andi t1, a1, 0x7 ++ ori a2, a2, 0x101 ++ slli.d t1, t1, 3 ++ ++ bstrins.d a2, a2, 63, 32 ++ sll.d t1, t0, t1 ++ slli.d t3, a2, 7 ++ nor a3, zero, t3 ++ ++ orn t2, t2, t1 ++ sub.d t0, t2, a2 ++ nor t1, t2, a3 ++ and t0, t0, t1 ++ ++ ++ bnez t0, L(count_pos) ++ addi.d a0, a0, 8 ++L(loop_16_7bit): ++ ld.d t2, a0, 0 ++ sub.d t1, t2, a2 ++ ++ 
and t0, t1, t3 ++ bnez t0, L(more_check) ++ ld.d t2, a0, 8 ++ addi.d a0, a0, 16 ++ ++ sub.d t1, t2, a2 ++ and t0, t1, t3 ++ beqz t0, L(loop_16_7bit) ++ addi.d a0, a0, -8 ++L(more_check): ++ nor t0, t2, a3 ++ ++ and t0, t1, t0 ++ bnez t0, L(count_pos) ++ addi.d a0, a0, 8 ++L(loop_16_8bit): ++ ld.d t2, a0, 0 ++ ++ sub.d t1, t2, a2 ++ nor t0, t2, a3 ++ and t0, t0, t1 ++ bnez t0, L(count_pos) ++ ++ ld.d t2, a0, 8 ++ addi.d a0, a0, 16 ++ sub.d t1, t2, a2 ++ nor t0, t2, a3 ++ ++ and t0, t0, t1 ++ beqz t0, L(loop_16_8bit) ++ addi.d a0, a0, -8 ++L(count_pos): ++ ctz.d t1, t0 ++ sub.d a0, a0, a1 ++ ++ srli.d t1, t1, 3 ++ add.d a0, a0, t1 ++ jr ra ++ ++END(STRLEN) ++ ++#ifdef _LIBC ++libc_hidden_builtin_def (STRLEN) ++#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S +index f371b19e..2cd56c44 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S +@@ -1,8 +1,258 @@ ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif + + #if IS_IN (libc) +- + #define STRNCMP __strncmp_aligned +- ++#else ++#define STRNCMP strncmp + #endif + +-#include "../strncmp.S" ++/* int strncmp (const char *s1, const char *s2, size_t n); */ ++ ++LEAF(STRNCMP, 6) ++ beqz a2, L(ret0) ++ xor a4, a0, a1 ++ lu12i.w t5, 0x01010 ++ lu12i.w t6, 0x7f7f7 ++ ++ andi a3, a0, 0x7 ++ ori t5, t5, 0x101 ++ andi a4, a4, 0x7 ++ ori t6, t6, 0xf7f ++ ++ bstrins.d t5, t5, 63, 32 ++ bstrins.d t6, t6, 63, 32 ++ ++ bnez a4, L(unalign) ++ bnez a3, L(mutual_align) ++ ++L(a_loop): ++ ld.d t0, a0, 0 ++ ld.d t1, a1, 0 ++ addi.d a0, a0, 8 ++ addi.d a1, a1, 8 ++ ++ ++ sltui t7, a2, 9 ++ ++L(start_realign): ++ sub.d t2, t0, t5 ++ nor t3, t0, t6 ++ xor t4, t0, t1 ++ ++ and t2, t2, t3 ++ addi.d a2, a2, -8 ++ ++ or t2, t2, t4 ++ or t3, t2, t7 ++ beqz t3, L(a_loop) ++ ++L(end): ++ bge zero, t7, L(out) ++ andi t4, a2, 7 ++ li.d t3, -1 ++ addi.d t4, t4, -1 ++ slli.d t4, t4, 3
++ sll.d t3, t3, t4 ++ or t2, t2, t3 ++ ++ ++L(out): ++ ctz.d t3, t2 ++ bstrins.d t3, zero, 2, 0 ++ srl.d t0, t0, t3 ++ srl.d t1, t1, t3 ++ ++ andi t0, t0, 0xff ++ andi t1, t1, 0xff ++ sub.d a0, t0, t1 ++ jr ra ++ ++L(mutual_align): ++ bstrins.d a0, zero, 2, 0 ++ bstrins.d a1, zero, 2, 0 ++ slli.d a5, a3, 0x3 ++ li.d t2, -1 ++ ++ ld.d t0, a0, 0 ++ ld.d t1, a1, 0 ++ ++ li.d t3, 9 ++ sll.d t2, t2, a5 ++ ++ sub.d t3, t3, a3 ++ addi.d a0, a0, 8 ++ ++ sltu t7, a2, t3 ++ addi.d a1, a1, 8 ++ ++ add.d a2, a2, a3 ++ orn t0, t0, t2 ++ orn t1, t1, t2 ++ b L(start_realign) ++ ++L(ret0): ++ move a0, zero ++ jr ra ++ ++L(unalign): ++ li.d t8, 8 ++ blt a2, t8, L(short_cmp) ++ ++ # swap a0 and a1 in case a3 > a4 ++ andi a4, a1, 0x7 ++ sltu t8, a4, a3 ++ xor a6, a0, a1 ++ maskeqz a6, a6, t8 ++ xor a0, a0, a6 ++ xor a1, a1, a6 ++ ++ andi a3, a0, 0x7 ++ andi a4, a1, 0x7 ++ ++ bstrins.d a0, zero, 2, 0 ++ bstrins.d a1, zero, 2, 0 ++ ++ li.d t2, -1 ++ li.d t3, 9 ++ ++ ld.d t0, a0, 0 ++ ld.d t1, a1, 0 ++ ++ sub.d t3, t3, a4 ++ sub.d a3, a4, a3 ++ ++ slli.d t4, a4, 3 ++ slli.d a6, a3, 3 ++ ++ sub.d a5, zero, a6 ++ sltu t7, a2, t3 ++ ++ rotr.d a7, t0, a5 ++ sll.d t4, t2, t4 # mask for first num ++ ++ add.d a2, a2, a4 ++ sll.d a4, t2, a6 # mask for a7 ++ ++ orn t0, a7, t4 ++ orn t1, t1, t4 ++ ++ sub.d t2, t0, t5 ++ nor t4, t0, t6 ++ and t2, t2, t4 ++ ++ xor t3, t0, t1 ++ or t2, t2, t3 ++ ++ or t3, t2, t7 ++ bnez t3, L(un_end) ++ ++ andn a7, a7, a4 ++ addi.d a3, a3, 1 ++ ++L(un_loop): ++ addi.d a2, a2, -8 ++ # in case remaining part has '\0', no more load instructions should be executed on a0 address ++ or t0, a7, a4 ++ sltu t7, a2, a3 ++ ++ sub.d t2, t0, t5 ++ nor t3, t0, t6 ++ and t2, t2, t3 ++ ++ or t3, t2, t7 ++ bnez t3, L(check_remaining) ++ ++ ld.d t7, a0, 8 ++ ld.d t1, a1, 8 ++ addi.d a0, a0, 8 ++ addi.d a1, a1, 8 ++ ++ sll.d t4, t7, a6 ++ sub.d t2, t1, t5 ++ nor t3, t1, t6 ++ ++ or t0, t4, a7 ++ srl.d a7, t7, a5 ++ ++ and t2, t2, t3 ++ xor t3, t0, t1 ++ ++ sltui t7, a2, 9 ++ or t2, 
t2, t3 ++ ++ or t3, t2, t7 ++ beqz t3, L(un_loop) ++ b L(un_end) ++ ++L(check_remaining): ++ ld.d t1, a1, 8 ++ xor t3, t1, a7 ++ or t2, t2, t3 ++ ++L(un_end): ++ bge zero, t7, L(un_out) ++ andi t4, a2, 7 ++ li.d t3, -1 ++ ++ addi.d t4, t4, -1 ++ slli.d t4, t4, 3 ++ sll.d t3, t3, t4 ++ or t2, t2, t3 ++ ++L(un_out): ++ ctz.d t3, t2 ++ bstrins.d t3, zero, 2, 0 ++ srl.d t0, t0, t3 ++ srl.d t1, t1, t3 ++ ++ andi t0, t0, 0xff ++ andi t1, t1, 0xff ++ ++ sub.d a4, t0, t1 ++ sub.d a5, t1, t0 ++ ++ maskeqz a6, a5, t8 ++ masknez a0, a4, t8 ++ ++ or a0, a0, a6 ++ jr ra ++ ++L(short_cmp): ++ ld.bu t0, a0, 0 ++ ld.bu t1, a1, 0 ++ addi.d a2, a2, -1 ++ ++ xor t2, t0, t1 ++ masknez t2, t0, t2 ++ maskeqz t2, a2, t2 ++ ++ beqz t2, L(short_out) ++ ++ ld.bu t0, a0, 1 ++ ld.bu t1, a1, 1 ++ ++ addi.d a2, a2, -1 ++ addi.d a0, a0, 2 ++ ++ addi.d a1, a1, 2 ++ xor t2, t0, t1 ++ masknez t2, t0, t2 ++ maskeqz t2, a2, t2 ++ ++ bnez t2, L(short_cmp) ++ ++L(short_out): ++ sub.d a0, t0, t1 ++ jr ra ++ ++END(STRNCMP) ++#ifdef _LIBC ++libc_hidden_builtin_def (STRNCMP) ++#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S +index 503442b3..78c8fd5d 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S +@@ -1,8 +1,84 @@ ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif + + #if IS_IN (libc) +- + #define STRNLEN __strnlen_aligned +- ++#else ++#define STRNLEN __strnlen + #endif + +-#include "../strnlen.S" ++#. before every load, a1(t5) must > 0; ++#. first load with t1 != 0, need to adjust t5; ++#. 
return the less one of both strlen(s) and a1; ++ ++LEAF(STRNLEN, 6) ++ beqz a1, L(out) ++ lu12i.w a2, 0x01010 ++ andi t1, a0, 0x7 ++ move t4, a0 ++ ++ bstrins.d a0, zero, 2, 0 ++ ori a2, a2, 0x101 ++ li.w t0, -1 ++ ld.d t2, a0, 0 ++ ++ slli.d t3, t1, 3 ++ bstrins.d a2, a2, 63, 32 ++ li.w t5, 8 ++ slli.d a3, a2, 7 ++ ++ sub.w t1, t5, t1 ++ sll.d t0, t0, t3 ++ nor a3, zero, a3 ++ orn t2, t2, t0 ++ ++ ++ sub.d t0, t2, a2 ++ nor t3, t2, a3 ++ and t0, t0, t3 ++ bnez t0, L(count_pos) ++ ++ sub.d t5, a1, t1 ++ bgeu t1, a1, L(out) ++L(loop_8bytes): ++ ld.d t2, a0, 8 ++ addi.d a0, a0, 8 ++ ++ sub.d t0, t2, a2 ++ nor t1, t2, a3 ++ sltui t6, t5, 9 ++ and t0, t0, t1 ++ ++ addi.d t5, t5, -8 ++ or t7, t0, t6 ++ beqz t7, L(loop_8bytes) ++L(count_pos): ++ ctz.d t1, t0 ++ ++ ++ sub.d a0, a0, t4 ++ srli.d t1, t1, 3 ++ add.d a0, t1, a0 ++ sltu t0, a0, a1 ++ ++ masknez t1, a1, t0 ++ maskeqz a0, a0, t0 ++ or a0, a0, t1 ++ jr ra ++ ++L(out): ++ move a0, a1 ++ jr ra ++ ++END(STRNLEN) ++ ++#ifdef _LIBC ++weak_alias (STRNLEN, strnlen) ++libc_hidden_builtin_def (STRNLEN) ++#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S +index a58ddde8..6931045b 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S +@@ -1,11 +1,110 @@ ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif + + #if IS_IN (libc) +- + #define STRRCHR_NAME __strrchr_aligned +- ++#else ++#define STRRCHR_NAME strrchr + #endif + +-#include "../strrchr.S" ++LEAF(STRRCHR_NAME, 6) ++ slli.d t1, a0, 3 ++ bstrins.d a0, zero, 2, 0 ++ lu12i.w a2, 0x01010 ++ ld.d t2, a0, 0 // t2 = "5ZZ21abc" ++ ++ ori a2, a2, 0x101 ++ andi a1, a1, 0xff // a1 = "0000000Z" ++ li.d a5, -1 ++ bstrins.d a2, a2, 63, 32 // a2 = 0x0101010101010101 ++ ++ sll.d t1, a5, t1 // t1 = 0xffffffffff000000 ++ mul.d a1, a1, a2 // a1 = "ZZZZZZZZ" ++ orn t2, t2, t1 // t2 = "5ZZ21YYY" ++ slli.d a3, a2, 
7 // a3 = 0x8080808080808080 ++ ++ sub.d a4, t2, a2 ++ andn t0, a3, t2 ++ move t3, zero ++ and t0, a4, t0 ++ ++ ++ xor a4, t2, a1 ++ move t5, zero ++ orn a4, a4, t1 ++ bnez t0, L(found_end) ++ ++ sub.d t1, a4, a2 ++ andn t0, a3, a4 ++ and t1, t1, t0 ++ ++L(loop_8bytes): ++ masknez t4, t3, t1 ++ ++ maskeqz t3, t2, t1 ++ ld.d t2, a0, 8 ++ masknez t0, t5, t1 ++ maskeqz t5, a0, t1 ++ ++ or t3, t3, t4 ++ or t5, t0, t5 ++ sub.d t0, t2, a2 ++ andn t1, a3, t2 ++ ++ ++ xor a4, t2, a1 ++ and t0, t0, t1 //t0 hold diff pattern for '\0' ++ sub.d t1, a4, a2 ++ andn t4, a3, a4 ++ ++ and t1, t1, t4 //t1 hold diff pattern for 'a1' ++ addi.d a0, a0, 8 ++ beqz t0, L(loop_8bytes) //ok, neither \0 nor found ++L(found_end): ++ ctz.d t1, t0 ++ ++ xor t3, t3, a1 ++ orn t1, zero, t1 ++ revb.d t3, t3 ++ srl.d t1, a5, t1 // mask for '\0' ++ ++ sub.d t4, t3, a2 ++ orn a4, a4, t1 ++ andn t3, a3, t3 ++ revb.d t2, a4 ++ ++ sub.d t0, t2, a2 ++ andn t1, a3, t2 ++ and t3, t3, t4 ++ and t1, t0, t1 ++ ++ li.d t7, 7 ++ masknez t4, t3, t1 ++ maskeqz t3, t1, t1 ++ masknez t5, t5, t1 ++ ++ or t3, t3, t4 ++ maskeqz t6, a0, t1 ++ ctz.d t0, t3 ++ or t5, t6, t5 ++ ++ srli.d t0, t0, 3 ++ sub.d t0, t7, t0 ++ add.d a0, t5, t0 ++ maskeqz a0, a0, t3 ++ ++ jr ra ++END(STRRCHR_NAME) ++ ++#ifdef _LIBC ++libc_hidden_builtin_def(STRRCHR_NAME) ++#endif + + #undef rindex + weak_alias(STRRCHR_NAME, rindex) +diff --git a/sysdeps/loongarch/lp64/stpcpy.S b/sysdeps/loongarch/lp64/stpcpy.S +deleted file mode 100644 +index b6a367dc..00000000 +--- a/sysdeps/loongarch/lp64/stpcpy.S ++++ /dev/null +@@ -1,179 +0,0 @@ +-#ifdef _LIBC +-#include +-#include +-#include +-#else +-#include +-#include +-#endif +- +-#ifndef STPCPY_NAME +-#define STPCPY_NAME __stpcpy +-#endif +- +-LEAF(STPCPY_NAME, 6) +- andi a3, a0, 0x7 +- beqz a3, L(dest_align) +- sub.d a5, a1, a3 +- addi.d a5, a5, 8 +- +-L(make_dest_align): +- ld.b t0, a1, 0 +- addi.d a1, a1, 1 +- st.b t0, a0, 0 +- addi.d a0, a0, 1 +- +- beqz t0, L(al_out) +- bne a1, a5, 
L(make_dest_align) +- +-L(dest_align): +- andi a4, a1, 7 +- bstrins.d a1, zero, 2, 0 +- +- lu12i.w t5, 0x1010 +- ld.d t0, a1, 0 +- ori t5, t5, 0x101 +- bstrins.d t5, t5, 63, 32 +- +- slli.d t6, t5, 0x7 +- bnez a4, L(unalign) +- sub.d t1, t0, t5 +- andn t2, t6, t0 +- +- and t3, t1, t2 +- bnez t3, L(al_end) +- +-L(al_loop): +- st.d t0, a0, 0 +- ld.d t0, a1, 8 +- +- addi.d a1, a1, 8 +- addi.d a0, a0, 8 +- sub.d t1, t0, t5 +- andn t2, t6, t0 +- +- and t3, t1, t2 +- beqz t3, L(al_loop) +- +-L(al_end): +- ctz.d t1, t3 +- srli.d t1, t1, 3 +- addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest +- +- andi a3, t1, 8 +- andi a4, t1, 4 +- andi a5, t1, 2 +- andi a6, t1, 1 +- +-L(al_end_8): +- beqz a3, L(al_end_4) +- st.d t0, a0, 0 +- addi.d a0, a0, 7 +- jr ra +-L(al_end_4): +- beqz a4, L(al_end_2) +- st.w t0, a0, 0 +- addi.d a0, a0, 4 +- srli.d t0, t0, 32 +-L(al_end_2): +- beqz a5, L(al_end_1) +- st.h t0, a0, 0 +- addi.d a0, a0, 2 +- srli.d t0, t0, 16 +-L(al_end_1): +- beqz a6, L(al_out) +- st.b t0, a0, 0 +- addi.d a0, a0, 1 +-L(al_out): +- addi.d a0, a0, -1 +- jr ra +- +-L(unalign): +- slli.d a5, a4, 3 +- li.d t1, -1 +- sub.d a6, zero, a5 +- +- srl.d a7, t0, a5 +- sll.d t7, t1, a6 +- +- or t0, a7, t7 +- sub.d t1, t0, t5 +- andn t2, t6, t0 +- and t3, t1, t2 +- +- bnez t3, L(un_end) +- +- ld.d t4, a1, 8 +- addi.d a1, a1, 8 +- +- sub.d t1, t4, t5 +- andn t2, t6, t4 +- sll.d t0, t4, a6 +- and t3, t1, t2 +- +- or t0, t0, a7 +- bnez t3, L(un_end_with_remaining) +- +-L(un_loop): +- srl.d a7, t4, a5 +- +- ld.d t4, a1, 8 +- addi.d a1, a1, 8 +- +- st.d t0, a0, 0 +- addi.d a0, a0, 8 +- +- sub.d t1, t4, t5 +- andn t2, t6, t4 +- sll.d t0, t4, a6 +- and t3, t1, t2 +- +- or t0, t0, a7 +- beqz t3, L(un_loop) +- +-L(un_end_with_remaining): +- ctz.d t1, t3 +- srli.d t1, t1, 3 +- addi.d t1, t1, 1 +- sub.d t1, t1, a4 +- +- blt t1, zero, L(un_end_less_8) +- st.d t0, a0, 0 +- addi.d a0, a0, 8 +- beqz t1, L(un_out) +- srl.d t0, t4, a5 # get the remaining part +- b L(un_end_less_8) +- 
+-L(un_end): +- ctz.d t1, t3 +- srli.d t1, t1, 3 +- addi.d t1, t1, 1 +- +-L(un_end_less_8): +- andi a4, t1, 4 +- andi a5, t1, 2 +- andi a6, t1, 1 +-L(un_end_4): +- beqz a4, L(un_end_2) +- st.w t0, a0, 0 +- addi.d a0, a0, 4 +- srli.d t0, t0, 32 +-L(un_end_2): +- beqz a5, L(un_end_1) +- st.h t0, a0, 0 +- addi.d a0, a0, 2 +- srli.d t0, t0, 16 +-L(un_end_1): +- beqz a6, L(un_out) +- st.b t0, a0, 0 +- addi.d a0, a0, 1 +-L(un_out): +- addi.d a0, a0, -1 +- jr ra +- +-END(STPCPY_NAME) +- +-#ifdef _LIBC +-weak_alias (STPCPY_NAME, stpcpy) +-libc_hidden_builtin_def (STPCPY_NAME) +-#endif +diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S +deleted file mode 100644 +index fde53a30..00000000 +--- a/sysdeps/loongarch/lp64/strchr.S ++++ /dev/null +@@ -1,89 +0,0 @@ +-#ifdef _LIBC +-#include +-#include +-#include +-#else +-#include +-#include +-#endif +- +-#ifndef STRCHR_NAME +-#define STRCHR_NAME strchr +-#endif +- +-/* char * strchr (const char *s1, int c); */ +- +-LEAF(STRCHR_NAME, 6) +- slli.d t1, a0, 3 +- bstrins.d a0, zero, 2, 0 +- lu12i.w a2, 0x01010 +- ld.d t2, a0, 0 +- +- ori a2, a2, 0x101 +- andi a1, a1, 0xff +- bstrins.d a2, a2, 63, 32 +- li.w t0, -1 +- +- mul.d a1, a1, a2 # "cccccccc" +- sll.d t0, t0, t1 +- slli.d a3, a2, 7 # 0x8080808080808080 +- orn t2, t2, t0 +- +- sll.d t3, a1, t1 +- xor t4, t2, t3 +- sub.d a7, t2, a2 +- andn a6, a3, t2 +- +- +- sub.d a5, t4, a2 +- andn a4, a3, t4 +- and a6, a7, a6 +- and a5, a5, a4 +- +- or t0, a6, a5 +- bnez t0, L(_mc8_a) +- addi.d a0, a0, 8 +-L(_aloop): +- ld.d t4, a0, 0 +- +- xor t2, t4, a1 +- sub.d a7, t4, a2 +- andn a6, a3, t4 +- sub.d a5, t2, a2 +- +- andn a4, a3, t2 +- and a6, a7, a6 +- and a5, a5, a4 +- or a7, a6, a5 +- +- +- bnez a7, L(_mc8_a) +- ld.d t4, a0, 8 +- addi.d a0, a0, 16 +- xor t2, t4, a1 +- +- sub.d a7, t4, a2 +- andn a6, a3, t4 +- sub.d a5, t2, a2 +- andn a4, a3, t2 +- +- and a6, a7, a6 +- and a5, a5, a4 +- or a7, a6, a5 +- beqz a7, L(_aloop) +- +- addi.d a0, a0, -8 +- +-L(_mc8_a): 
+- ctz.d t0, a5 +- ctz.d t2, a6 +- srli.w t0, t0, 3 +- +- +- srli.w t2, t2, 3 +- sltu t1, t2, t0 +- add.d a0, a0, t0 +- masknez a0, a0, t1 +- +- jr ra +-END(STRCHR_NAME) +diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S +deleted file mode 100644 +index a5ee09a3..00000000 +--- a/sysdeps/loongarch/lp64/strchrnul.S ++++ /dev/null +@@ -1,94 +0,0 @@ +-#ifdef _LIBC +-#include +-#include +-#include +-#else +-#include +-#include +-#endif +- +-#ifndef STRCHRNUL_NAME +-#define STRCHRNUL_NAME __strchrnul +-#endif +- +-/* char * strchrnul (const char *s1, int c); */ +- +-LEAF(STRCHRNUL_NAME, 6) +- slli.d t1, a0, 3 +- bstrins.d a0, zero, 2, 0 +- lu12i.w a2, 0x01010 +- ld.d t2, a0, 0 +- +- ori a2, a2, 0x101 +- andi a1, a1, 0xff +- bstrins.d a2, a2, 63, 32 +- li.w t0, -1 +- +- mul.d a1, a1, a2 # "cccccccc" +- sll.d t0, t0, t1 +- slli.d a3, a2, 7 # 0x8080808080808080 +- orn t2, t2, t0 +- +- sll.d t3, a1, t1 +- xor t4, t2, t3 +- sub.d a7, t2, a2 +- andn a6, a3, t2 +- +- +- sub.d a5, t4, a2 +- andn a4, a3, t4 +- and a6, a7, a6 +- and a5, a5, a4 +- +- or t0, a6, a5 +- bnez t0, L(_mc8_a) +- addi.d a0, a0, 8 +-L(_aloop): +- ld.d t4, a0, 0 +- +- xor t2, t4, a1 +- sub.d a7, t4, a2 +- andn a6, a3, t4 +- sub.d a5, t2, a2 +- +- andn a4, a3, t2 +- and a6, a7, a6 +- and a5, a5, a4 +- or a7, a6, a5 +- +- +- bnez a7, L(_mc8_a) +- ld.d t4, a0, 8 +- addi.d a0, a0, 16 +- xor t2, t4, a1 +- +- sub.d a7, t4, a2 +- andn a6, a3, t4 +- sub.d a5, t2, a2 +- andn a4, a3, t2 +- +- and a6, a7, a6 +- and a5, a5, a4 +- or a7, a6, a5 +- beqz a7, L(_aloop) +- +- addi.d a0, a0, -8 +-L(_mc8_a): +- ctz.d t0, a5 +- ctz.d t2, a6 +- srli.w t0, t0, 3 +- +- srli.w t2, t2, 3 +- slt t1, t0, t2 +- masknez t3, t2, t1 +- maskeqz t4, t0, t1 +- +- or t0, t3, t4 +- add.d a0, a0, t0 +- jr ra +-END(STRCHRNUL_NAME) +- +-#ifdef _LIBC +-weak_alias(STRCHRNUL_NAME, strchrnul) +-libc_hidden_builtin_def (STRCHRNUL_NAME) +-#endif +diff --git a/sysdeps/loongarch/lp64/strcmp.S 
b/sysdeps/loongarch/lp64/strcmp.S +deleted file mode 100644 +index 3a863992..00000000 +--- a/sysdeps/loongarch/lp64/strcmp.S ++++ /dev/null +@@ -1,227 +0,0 @@ +-/* 2022\06\15 loongarch64 author: chenxiaolong. */ +- +-#ifdef _LIBC +-#include +-#include +-#include +-#else +-#include +-#include +-#endif +- +-#ifndef STRCMP_NAME +-#define STRCMP_NAME strcmp +-#endif +- +-/* int strcmp (const char *s1, const char *s2); */ +- +-/* Parameters and Results */ +-#define src1 a0 +-#define src2 a1 +-#define result v0 +-LEAF(STRCMP_NAME, 6) +- xor a4, src1, src2 +- lu12i.w t5, 0x01010 +- lu12i.w t6, 0x7f7f7 +- andi a2, src1, 0x7 +- +- ori t5, t5, 0x101 +- andi a4, a4, 0x7 +- ori t6, t6, 0xf7f +- bstrins.d t5, t5, 63, 32 +- bstrins.d t6, t6, 63, 32 +- +- bnez a4, 3f // unaligned +- beqz a2, 1f // loop aligned +- +-// mutual aligned +- bstrins.d src1, zero, 2, 0 +- bstrins.d src2, zero, 2, 0 +- slli.d a4, a2, 0x3 +- ld.d t0, src1, 0 +- +- sub.d a4, zero, a4 +- ld.d t1, src2, 0 +- addi.d src1, src1, 8 +- addi.d src2, src2, 8 +- +- nor a5, zero, zero +- srl.d a5, a5, a4 +- or t0, t0, a5 +- +- or t1, t1, a5 +- b 2f //start realigned +- +-// loop aligned +-1: +- ld.d t0, src1, 0 +- addi.d src1, src1, 8 +- ld.d t1, src2, 0 +- addi.d src2, src2, 8 +- +-// start realigned: +-2: +- sub.d t2, t0, t5 +- nor t3, t0, t6 +- and t2, t2, t3 +- +- xor t3, t0, t1 +- or t2, t2, t3 +- beqz t2, 1b +- +- ctz.d t7, t2 +- bstrins.d t7, zero, 2, 0 +- srl.d t0, t0, t7 +- srl.d t1, t1, t7 +- +- andi t0, t0, 0xff +- andi t1, t1, 0xff +- sub.d v0, t0, t1 +- jr ra +- +-// unaligned +-3: +- andi a3, src2, 0x7 +- slt a5, a2, a3 +- masknez t8, a2, a5 +- xor a6, src1, src2 +- maskeqz a6, a6, t8 +- xor src1, src1, a6 +- xor src2, src2, a6 +- +- andi a2, src1, 0x7 +- beqz a2, 4f // src1 is aligned +- +-//strcmp_unaligned: +- andi a3, src2, 0x7 +- bstrins.d src1, zero, 2, 0 +- bstrins.d src2, zero, 2, 0 +- nor t3, zero, zero +- +- ld.d t0, src1, 0 +- ld.d t1, src2, 0 +- sub.d a2, a3, a2 +- addi.d t2, zero, 8 +- +- 
sub.d a5, t2, a2 +- sub.d a6, t2, a3 +- slli.d a5, a5, 0x3 +- slli.d a6, a6, 0x3 +- +- srl.d t4, t3, a6 +- srl.d a4, t3, a5 +- rotr.d a7, t0, a5 +- +- addi.d src2, src2, 8 +- addi.d src1, src1, 8 +- or t1, t1, t4 +- or t0, a7, t4 +- +- sub.d t2, t0, t5 +- nor t3, t0, t6 +- and t2, t2, t3 +- xor t3, t0, t1 +- or t2, t2, t3 +- bnez t2, 7f +- +- and a7, a7, a4 +- slli.d a6, a2, 0x3 +- nor a4, zero, a4 +- b 5f +- +-// src1 is aligned +-4: +- andi a3, src2, 0x7 +- ld.d t0, src1, 0 +- +- bstrins.d src2, zero, 2, 0 +- nor t2, zero, zero +- ld.d t1, src2, 0 +- +- addi.d t3, zero, 0x8 +- sub.d a5, t3, a3 +- slli.d a5, a5, 0x3 +- srl.d a4, t2, a5 +- rotr.d t4, t0, a5 +- +- addi.d src2, src2, 8 +- addi.d src1, src1, 8 +- or t1, t1, a4 +- or t0, t4, a4 +- +- sub.d t2, t0, t5 +- nor t3, t0, t6 +- and t2, t2, t3 +- xor t3, t0, t1 +- or t2, t2, t3 +- +- bnez t2, 7f +- +- and a7, t4, a4 +- slli.d a6, a3, 0x3 +- nor a4, zero, a4 +- +-// unaligned loop +-// a7: remaining number +-// a6: shift left number +-// a5: shift right number +-// a4: mask for checking remaining number +-5: +- or t0, a7, a4 +- sub.d t2, t0, t5 +- nor t3, t0, t6 +- and t2, t2, t3 +- bnez t2, 6f +- +- ld.d t0, src1, 0 +- addi.d src1, src1, 8 +- ld.d t1, src2, 0 +- addi.d src2, src2, 8 +- +- srl.d t7, t0, a5 +- sll.d t0, t0, a6 +- or t0, a7, t0 +- +- sub.d t2, t0, t5 +- nor t3, t0, t6 +- and t2, t2, t3 +- xor t3, t0, t1 +- or t2, t2, t3 +- bnez t2, 7f +- +- or a7, t7, zero +- b 5b +- +-6: +- ld.bu t1, src2, 0 +- andi t0, a7, 0xff +- xor t2, t0, t1 +- srli.d a7, a7, 0x8 +- masknez t2, t0, t2 +- addi.d src2, src2, 1 +- beqz t2, 8f +- b 6b +- +-7: +- ctz.d t7, t2 +- bstrins.d t7, zero, 2, 0 +- srl.d t0, t0, t7 +- srl.d t1, t1, t7 +- +- andi t0, t0, 0xff +- andi t1, t1, 0xff +- +-8: +- sub.d a4, t0, t1 +- sub.d a5, t1, t0 +- maskeqz a6, a5, t8 +- masknez result, a4, t8 +- or result, result, a6 +- jr ra +- +-END(STRCMP_NAME) +- +-#ifdef _LIBC +-libc_hidden_builtin_def (STRCMP_NAME) +-#endif +- +diff --git 
a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S +deleted file mode 100644 +index 08505192..00000000 +--- a/sysdeps/loongarch/lp64/strcpy.S ++++ /dev/null +@@ -1,173 +0,0 @@ +-#ifdef _LIBC +-#include +-#include +-#include +-#else +-#include +-#include +-#endif +- +-#ifndef STRCPY +-#define STRCPY strcpy +-#endif +- +-LEAF(STRCPY, 6) +- andi a3, a0, 0x7 +- move a2, a0 +- beqz a3, L(dest_align) +- sub.d a5, a1, a3 +- addi.d a5, a5, 8 +- +-L(make_dest_align): +- ld.b t0, a1, 0 +- addi.d a1, a1, 1 +- st.b t0, a2, 0 +- beqz t0, L(al_out) +- +- addi.d a2, a2, 1 +- bne a1, a5, L(make_dest_align) +- +-L(dest_align): +- andi a4, a1, 7 +- bstrins.d a1, zero, 2, 0 +- +- lu12i.w t5, 0x1010 +- ld.d t0, a1, 0 +- ori t5, t5, 0x101 +- bstrins.d t5, t5, 63, 32 +- +- slli.d t6, t5, 0x7 +- bnez a4, L(unalign) +- sub.d t1, t0, t5 +- andn t2, t6, t0 +- +- and t3, t1, t2 +- bnez t3, L(al_end) +- +-L(al_loop): +- st.d t0, a2, 0 +- ld.d t0, a1, 8 +- +- addi.d a1, a1, 8 +- addi.d a2, a2, 8 +- sub.d t1, t0, t5 +- andn t2, t6, t0 +- +- and t3, t1, t2 +- beqz t3, L(al_loop) +- +-L(al_end): +- ctz.d t1, t3 +- srli.d t1, t1, 3 +- addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest +- +- andi a3, t1, 8 +- andi a4, t1, 4 +- andi a5, t1, 2 +- andi a6, t1, 1 +- +-L(al_end_8): +- beqz a3, L(al_end_4) +- st.d t0, a2, 0 +- jr ra +-L(al_end_4): +- beqz a4, L(al_end_2) +- st.w t0, a2, 0 +- addi.d a2, a2, 4 +- srli.d t0, t0, 32 +-L(al_end_2): +- beqz a5, L(al_end_1) +- st.h t0, a2, 0 +- addi.d a2, a2, 2 +- srli.d t0, t0, 16 +-L(al_end_1): +- beqz a6, L(al_out) +- st.b t0, a2, 0 +-L(al_out): +- jr ra +- +-L(unalign): +- slli.d a5, a4, 3 +- li.d t1, -1 +- sub.d a6, zero, a5 +- +- srl.d a7, t0, a5 +- sll.d t7, t1, a6 +- +- or t0, a7, t7 +- sub.d t1, t0, t5 +- andn t2, t6, t0 +- and t3, t1, t2 +- +- bnez t3, L(un_end) +- +- ld.d t4, a1, 8 +- +- sub.d t1, t4, t5 +- andn t2, t6, t4 +- sll.d t0, t4, a6 +- and t3, t1, t2 +- +- or t0, t0, a7 +- bnez t3, L(un_end_with_remaining) +- 
+-L(un_loop): +- srl.d a7, t4, a5 +- +- ld.d t4, a1, 16 +- addi.d a1, a1, 8 +- +- st.d t0, a2, 0 +- addi.d a2, a2, 8 +- +- sub.d t1, t4, t5 +- andn t2, t6, t4 +- sll.d t0, t4, a6 +- and t3, t1, t2 +- +- or t0, t0, a7 +- beqz t3, L(un_loop) +- +-L(un_end_with_remaining): +- ctz.d t1, t3 +- srli.d t1, t1, 3 +- addi.d t1, t1, 1 +- sub.d t1, t1, a4 +- +- blt t1, zero, L(un_end_less_8) +- st.d t0, a2, 0 +- addi.d a2, a2, 8 +- beqz t1, L(un_out) +- srl.d t0, t4, a5 # get the remaining part +- b L(un_end_less_8) +- +-L(un_end): +- ctz.d t1, t3 +- srli.d t1, t1, 3 +- addi.d t1, t1, 1 +- +-L(un_end_less_8): +- andi a4, t1, 4 +- andi a5, t1, 2 +- andi a6, t1, 1 +-L(un_end_4): +- beqz a4, L(un_end_2) +- st.w t0, a2, 0 +- addi.d a2, a2, 4 +- srli.d t0, t0, 32 +-L(un_end_2): +- beqz a5, L(un_end_1) +- st.h t0, a2, 0 +- addi.d a2, a2, 2 +- srli.d t0, t0, 16 +-L(un_end_1): +- beqz a6, L(un_out) +- st.b t0, a2, 0 +-L(un_out): +- jr ra +- +-END(STRCPY) +- +-#ifdef _LIBC +-libc_hidden_builtin_def (STRCPY) +-#endif +diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S +deleted file mode 100644 +index 71431ce2..00000000 +--- a/sysdeps/loongarch/lp64/strlen.S ++++ /dev/null +@@ -1,85 +0,0 @@ +-#ifdef _LIBC +-#include +-#include +-#include +-#else +-#include +-#include +-#endif +- +-#ifndef STRLEN +-#define STRLEN strlen +-#endif +- +-LEAF(STRLEN, 6) +- move a1, a0 +- bstrins.d a0, zero, 2, 0 +- lu12i.w a2, 0x01010 +- li.w t0, -1 +- +- ld.d t2, a0, 0 +- andi t1, a1, 0x7 +- ori a2, a2, 0x101 +- slli.d t1, t1, 3 +- +- bstrins.d a2, a2, 63, 32 +- sll.d t1, t0, t1 +- slli.d t3, a2, 7 +- nor a3, zero, t3 +- +- orn t2, t2, t1 +- sub.d t0, t2, a2 +- nor t1, t2, a3 +- and t0, t0, t1 +- +- +- bnez t0, L(count_pos) +- addi.d a0, a0, 8 +-L(loop_16_7bit): +- ld.d t2, a0, 0 +- sub.d t1, t2, a2 +- +- and t0, t1, t3 +- bnez t0, L(more_check) +- ld.d t2, a0, 8 +- addi.d a0, a0, 16 +- +- sub.d t1, t2, a2 +- and t0, t1, t3 +- beqz t0, L(loop_16_7bit) +- addi.d a0, a0, -8 
+-L(more_check): +- nor t0, t2, a3 +- +- and t0, t1, t0 +- bnez t0, L(count_pos) +- addi.d a0, a0, 8 +-L(loop_16_8bit): +- ld.d t2, a0, 0 +- +- sub.d t1, t2, a2 +- nor t0, t2, a3 +- and t0, t0, t1 +- bnez t0, L(count_pos) +- +- ld.d t2, a0, 8 +- addi.d a0, a0, 16 +- sub.d t1, t2, a2 +- nor t0, t2, a3 +- +- and t0, t0, t1 +- beqz t0, L(loop_16_8bit) +- addi.d a0, a0, -8 +-L(count_pos): +- ctz.d t1, t0 +- sub.d a0, a0, a1 +- +- srli.d t1, t1, 3 +- add.d a0, a0, t1 +- jr ra +- +-END(STRLEN) +- +-#ifdef _LIBC +-libc_hidden_builtin_def (STRLEN) +-#endif +diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S +deleted file mode 100644 +index 55450e55..00000000 +--- a/sysdeps/loongarch/lp64/strncmp.S ++++ /dev/null +@@ -1,256 +0,0 @@ +-#ifdef _LIBC +-#include +-#include +-#include +-#else +-#include +-#include +-#endif +- +-#ifndef STRNCMP +-#define STRNCMP strncmp +-#endif +- +-/* int strncmp (const char *s1, const char *s2); */ +- +-LEAF(STRNCMP, 6) +- beqz a2, L(ret0) +- xor a4, a0, a1 +- lu12i.w t5, 0x01010 +- lu12i.w t6, 0x7f7f7 +- +- andi a3, a0, 0x7 +- ori t5, t5, 0x101 +- andi a4, a4, 0x7 +- ori t6, t6, 0xf7f +- +- bstrins.d t5, t5, 63, 32 +- bstrins.d t6, t6, 63, 32 +- +- bnez a4, L(unalign) +- bnez a3, L(mutual_align) +- +-L(a_loop): +- ld.d t0, a0, 0 +- ld.d t1, a1, 0 +- addi.d a0, a0, 8 +- addi.d a1, a1, 8 +- +- +- sltui t7, a2, 9 +- +-L(start_realign): +- sub.d t2, t0, t5 +- nor t3, t0, t6 +- xor t4, t0, t1 +- +- and t2, t2, t3 +- addi.d a2, a2, -8 +- +- or t2, t2, t4 +- or t3, t2, t7 +- beqz t3, L(a_loop) +- +-L(end): +- bge zero, t7, L(out) +- andi t4, a2, 7 +- li.d t3, -1 +- addi.d t4, t4, -1 +- slli.d t4, t4, 3 +- sll.d t3, t3, t4 +- or t2, t2, t3 +- +- +-L(out): +- ctz.d t3, t2 +- bstrins.d t3, zero, 2, 0 +- srl.d t0, t0, t3 +- srl.d t1, t1, t3 +- +- andi t0, t0, 0xff +- andi t1, t1, 0xff +- sub.d a0, t0, t1 +- jr ra +- +-L(mutual_align): +- bstrins.d a0, zero, 2, 0 +- bstrins.d a1, zero, 2, 0 +- slli.d a5, a3, 0x3 +- li.d t2, 
-1 +- +- ld.d t0, a0, 0 +- ld.d t1, a1, 0 +- +- li.d t3, 9 +- sll.d t2, t2, a5 +- +- sub.d t3, t3, a3 +- addi.d a0, a0, 8 +- +- sltu t7, a2, t3 +- addi.d a1, a1, 8 +- +- add.d a2, a2, a3 +- orn t0, t0, t2 +- orn t1, t1, t2 +- b L(start_realign) +- +-L(ret0): +- move a0, zero +- jr ra +- +-L(unalign): +- li.d t8, 8 +- blt a2, t8, L(short_cmp) +- +- # swap a0 and a1 in case a3 > a4 +- andi a4, a1, 0x7 +- sltu t8, a4, a3 +- xor a6, a0, a1 +- maskeqz a6, a6, t8 +- xor a0, a0, a6 +- xor a1, a1, a6 +- +- andi a3, a0, 0x7 +- andi a4, a1, 0x7 +- +- bstrins.d a0, zero, 2, 0 +- bstrins.d a1, zero, 2, 0 +- +- li.d t2, -1 +- li.d t3, 9 +- +- ld.d t0, a0, 0 +- ld.d t1, a1, 0 +- +- sub.d t3, t3, a4 +- sub.d a3, a4, a3 +- +- slli.d t4, a4, 3 +- slli.d a6, a3, 3 +- +- sub.d a5, zero, a6 +- sltu t7, a2, t3 +- +- rotr.d a7, t0, a5 +- sll.d t4, t2, t4 # mask for first num +- +- add.d a2, a2, a4 +- sll.d a4, t2, a6 # mask for a7 +- +- orn t0, a7, t4 +- orn t1, t1, t4 +- +- sub.d t2, t0, t5 +- nor t4, t0, t6 +- and t2, t2, t4 +- +- xor t3, t0, t1 +- or t2, t2, t3 +- +- or t3, t2, t7 +- bnez t3, L(un_end) +- +- andn a7, a7, a4 +- addi.d a3, a3, 1 +- +-L(un_loop): +- addi.d a2, a2, -8 +- # in case remaining part has '\0', no more load instructions should be executed on a0 address +- or t0, a7, a4 +- sltu t7, a2, a3 +- +- sub.d t2, t0, t5 +- nor t3, t0, t6 +- and t2, t2, t3 +- +- or t3, t2, t7 +- bnez t3, L(check_remaining) +- +- ld.d t7, a0, 8 +- ld.d t1, a1, 8 +- addi.d a0, a0, 8 +- addi.d a1, a1, 8 +- +- sll.d t4, t7, a6 +- sub.d t2, t1, t5 +- nor t3, t1, t6 +- +- or t0, t4, a7 +- srl.d a7, t7, a5 +- +- and t2, t2, t3 +- xor t3, t0, t1 +- +- sltui t7, a2, 9 +- or t2, t2, t3 +- +- or t3, t2, t7 +- beqz t3, L(un_loop) +- b L(un_end) +- +-L(check_remaining): +- ld.d t1, a1, 8 +- xor t3, t1, a7 +- or t2, t2, t3 +- +-L(un_end): +- bge zero, t7, L(un_out) +- andi t4, a2, 7 +- li.d t3, -1 +- +- addi.d t4, t4, -1 +- slli.d t4, t4, 3 +- sll.d t3, t3, t4 +- or t2, t2, t3 +- +-L(un_out): +- ctz.d 
t3, t2 +- bstrins.d t3, zero, 2, 0 +- srl.d t0, t0, t3 +- srl.d t1, t1, t3 +- +- andi t0, t0, 0xff +- andi t1, t1, 0xff +- +- sub.d a4, t0, t1 +- sub.d a5, t1, t0 +- +- maskeqz a6, a5, t8 +- masknez a0, a4, t8 +- +- or a0, a0, a6 +- jr ra +- +-L(short_cmp): +- ld.bu t0, a0, 0 +- ld.bu t1, a1, 0 +- addi.d a2, a2, -1 +- +- xor t2, t0, t1 +- masknez t2, t0, t2 +- maskeqz t2, a2, t2 +- +- beqz t2, L(short_out) +- +- ld.bu t0, a0, 1 +- ld.bu t1, a1, 1 +- +- addi.d a2, a2, -1 +- addi.d a0, a0, 2 +- +- addi.d a1, a1, 2 +- xor t2, t0, t1 +- masknez t2, t0, t2 +- maskeqz t2, a2, t2 +- +- bnez t2, L(short_cmp) +- +-L(short_out): +- sub.d a0, t0, t1 +- jr ra +- +-END(STRNCMP) +-#ifdef _LIBC +-libc_hidden_builtin_def (STRNCMP) +-#endif +diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S +deleted file mode 100644 +index 5b5ab585..00000000 +--- a/sysdeps/loongarch/lp64/strnlen.S ++++ /dev/null +@@ -1,82 +0,0 @@ +-#ifdef _LIBC +-#include +-#include +-#include +-#else +-#include +-#include +-#endif +- +-#ifndef STRNLEN +-#define STRNLEN __strnlen +-#endif +- +-#. before every load, a1(t5) must > 0; +-#. first load with t1 != 0, need to adjust t5; +-#. 
return the less one of both strlen(s) and a1; +- +-LEAF(STRNLEN, 6) +- beqz a1, L(out) +- lu12i.w a2, 0x01010 +- andi t1, a0, 0x7 +- move t4, a0 +- +- bstrins.d a0, zero, 2, 0 +- ori a2, a2, 0x101 +- li.w t0, -1 +- ld.d t2, a0, 0 +- +- slli.d t3, t1, 3 +- bstrins.d a2, a2, 63, 32 +- li.w t5, 8 +- slli.d a3, a2, 7 +- +- sub.w t1, t5, t1 +- sll.d t0, t0, t3 +- nor a3, zero, a3 +- orn t2, t2, t0 +- +- +- sub.d t0, t2, a2 +- nor t3, t2, a3 +- and t0, t0, t3 +- bnez t0, L(count_pos) +- +- sub.d t5, a1, t1 +- bgeu t1, a1, L(out) +-L(loop_8bytes): +- ld.d t2, a0, 8 +- addi.d a0, a0, 8 +- +- sub.d t0, t2, a2 +- nor t1, t2, a3 +- sltui t6, t5, 9 +- and t0, t0, t1 +- +- addi.d t5, t5, -8 +- or t7, t0, t6 +- beqz t7, L(loop_8bytes) +-L(count_pos): +- ctz.d t1, t0 +- +- +- sub.d a0, a0, t4 +- srli.d t1, t1, 3 +- add.d a0, t1, a0 +- sltu t0, a0, a1 +- +- masknez t1, a1, t0 +- maskeqz a0, a0, t0 +- or a0, a0, t1 +- jr ra +- +-L(out): +- move a0, a1 +- jr ra +- +-END(STRNLEN) +- +-#ifdef _LIBC +-weak_alias (STRNLEN, strnlen) +-libc_hidden_builtin_def (STRNLEN) +-#endif +diff --git a/sysdeps/loongarch/lp64/strrchr.S b/sysdeps/loongarch/lp64/strrchr.S +deleted file mode 100644 +index df7fcb6b..00000000 +--- a/sysdeps/loongarch/lp64/strrchr.S ++++ /dev/null +@@ -1,105 +0,0 @@ +-#ifdef _LIBC +-#include +-#include +-#include +-#else +-#include +-#include +-#endif +- +-#ifndef STRRCHR_NAME +-#define STRRCHR_NAME strrchr +-#endif +- +-LEAF(STRRCHR_NAME, 6) +- slli.d t1, a0, 3 +- bstrins.d a0, zero, 2, 0 +- lu12i.w a2, 0x01010 +- ld.d t2, a0, 0 // t2 = "5ZZ21abc" +- +- ori a2, a2, 0x101 +- andi a1, a1, 0xff // a1 = "0000000Z" +- li.d a5, -1 +- bstrins.d a2, a2, 63, 32 // a2 = 0x0101010101010101 +- +- sll.d t1, a5, t1 // t1 = 0xffffffffff000000 +- mul.d a1, a1, a2 // a1 = "ZZZZZZZZ" +- orn t2, t2, t1 // t2 = "5ZZ21YYY" +- slli.d a3, a2, 7 // a3 = 0x8080808080808080 +- +- sub.d a4, t2, a2 +- andn t0, a3, t2 +- move t3, zero +- and t0, a4, t0 +- +- +- xor a4, t2, a1 +- move t5, zero +- orn 
a4, a4, t1 +- bnez t0, L(found_end) +- +- sub.d t1, a4, a2 +- andn t0, a3, a4 +- and t1, t1, t0 +- +-L(loop_8bytes): +- masknez t4, t3, t1 +- +- maskeqz t3, t2, t1 +- ld.d t2, a0, 8 +- masknez t0, t5, t1 +- maskeqz t5, a0, t1 +- +- or t3, t3, t4 +- or t5, t0, t5 +- sub.d t0, t2, a2 +- andn t1, a3, t2 +- +- +- xor a4, t2, a1 +- and t0, t0, t1 //t0 hold diff pattern for '\0' +- sub.d t1, a4, a2 +- andn t4, a3, a4 +- +- and t1, t1, t4 //t1 hold diff pattern for 'a1' +- addi.d a0, a0, 8 +- beqz t0, L(loop_8bytes) //ok, neither \0 nor found +-L(found_end): +- ctz.d t1, t0 +- +- xor t3, t3, a1 +- orn t1, zero, t1 +- revb.d t3, t3 +- srl.d t1, a5, t1 // mask for '\0' +- +- sub.d t4, t3, a2 +- orn a4, a4, t1 +- andn t3, a3, t3 +- revb.d t2, a4 +- +- sub.d t0, t2, a2 +- andn t1, a3, t2 +- and t3, t3, t4 +- and t1, t0, t1 +- +- li.d t7, 7 +- masknez t4, t3, t1 +- maskeqz t3, t1, t1 +- masknez t5, t5, t1 +- +- or t3, t3, t4 +- maskeqz t6, a0, t1 +- ctz.d t0, t3 +- or t5, t6, t5 +- +- srli.d t0, t0, 3 +- sub.d t0, t7, t0 +- add.d a0, t5, t0 +- maskeqz a0, a0, t3 +- +- jr ra +-END(STRRCHR_NAME) +- +-#ifdef _LIBC +-libc_hidden_builtin_def(STRRCHR_NAME) +-#endif +-- +2.33.0 + diff --git a/glibc-2.28-Remove-unseless-ANDROID_CHANGES-and-relat.patch b/glibc-2.28-Remove-unseless-ANDROID_CHANGES-and-relat.patch new file mode 100644 index 0000000..ad4b53c --- /dev/null +++ b/glibc-2.28-Remove-unseless-ANDROID_CHANGES-and-relat.patch @@ -0,0 +1,292 @@ +From e2dd1f13592fa3b99b70eb54cc61e9f98cdcb123 Mon Sep 17 00:00:00 2001 +From: caiyinyu +Date: Mon, 17 Apr 2023 17:20:04 +0800 +Subject: [PATCH 01/14] glibc-2.28: Remove unseless ANDROID_CHANGES and related + code. 
+ +Change-Id: Ib08e92d435126c7b56096ff6f24f1c6b5ea57f46 +Signed-off-by: ticat_fp +--- + sysdeps/loongarch/lp64/memchr.S | 6 ------ + sysdeps/loongarch/lp64/memcpy.S | 13 ------------- + sysdeps/loongarch/lp64/memset.S | 6 ------ + sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S | 6 ------ + .../loongarch/lp64/multiarch/memmove-unaligned.S | 6 ------ + sysdeps/loongarch/lp64/multiarch/memset-unaligned.S | 7 ------- + sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S | 2 -- + .../loongarch/lp64/multiarch/strchrnul-unaligned.S | 2 -- + sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S | 2 -- + sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S | 2 -- + .../loongarch/lp64/multiarch/strncmp-unaligned.S | 2 -- + .../loongarch/lp64/multiarch/strnlen-unaligned.S | 2 -- + 12 files changed, 56 deletions(-) + +diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S +index ec34b1af..75c4e15c 100644 +--- a/sysdeps/loongarch/lp64/memchr.S ++++ b/sysdeps/loongarch/lp64/memchr.S +@@ -11,11 +11,7 @@ + #define MEMCHR_NAME memchr + #endif + +-#ifdef ANDROID_CHANGES +-LEAF(MEMCHR_NAME, 0) +-#else + LEAF(MEMCHR_NAME) +-#endif + .align 6 + beqz a2, L(out) + andi t1, a0, 0x7 +@@ -92,8 +88,6 @@ L(out): + jr ra + END(MEMCHR_NAME) + +-#ifndef ANDROID_CHANGES + #ifdef _LIBC + libc_hidden_builtin_def (MEMCHR_NAME) + #endif +-#endif +diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S +index 1076e678..b6ca60a1 100644 +--- a/sysdeps/loongarch/lp64/memcpy.S ++++ b/sysdeps/loongarch/lp64/memcpy.S +@@ -35,29 +35,18 @@ + st.d t6, reg, n+48; \ + st.d t7, reg, n+56; + +-#ifdef ANDROID_CHANGES +-LEAF(MEMMOVE_NAME, 0) +-#else + LEAF(MEMMOVE_NAME) +-#endif +- + .align 6 + sub.d t0, a0, a1 + bltu t0, a2, L(copy_back) + + END(MEMMOVE_NAME) + +-#ifndef ANDROID_CHANGES + #ifdef _LIBC + libc_hidden_builtin_def (MEMMOVE_NAME) + #endif +-#endif + +-#ifdef ANDROID_CHANGES +-LEAF(MEMCPY_NAME, 0) +-#else + LEAF(MEMCPY_NAME) +-#endif + + srai.d a3, a2, 4 + 
beqz a3, L(short_data) # less than 16 bytes +@@ -811,8 +800,6 @@ L(back_end): + + END(MEMCPY_NAME) + +-#ifndef ANDROID_CHANGES + #ifdef _LIBC + libc_hidden_builtin_def (MEMCPY_NAME) + #endif +-#endif +diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S +index 9fe42b24..41629e7e 100644 +--- a/sysdeps/loongarch/lp64/memset.S ++++ b/sysdeps/loongarch/lp64/memset.S +@@ -21,11 +21,7 @@ + st.d a1, a0, n+48; \ + st.d a1, a0, n+56; + +-#ifdef ANDROID_CHANGES +-LEAF(MEMSET_NAME, 0) +-#else + LEAF(MEMSET_NAME) +-#endif + .align 6 + move t0, a0 + andi a3, a0, 0x7 +@@ -166,8 +162,6 @@ L(short_0): + + END(MEMSET_NAME) + +-#ifndef ANDROID_CHANGES + #ifdef _LIBC + libc_hidden_builtin_def (MEMSET_NAME) + #endif +-#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S +index 5e38df0d..64b60244 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S +@@ -31,11 +31,7 @@ + st.d t6, reg, n+48; \ + st.d t7, reg, n+56; + +-#ifdef ANDROID_CHANGES +-LEAF(MEMCPY_NAME, 0) +-#else + LEAF(MEMCPY_NAME) +-#endif + + //1st var: dst ptr: void *a1 $r4 a0 + //2nd var: src ptr: void *a2 $r5 a1 +@@ -250,10 +246,8 @@ end_0_8_unalign: + + END(MEMCPY_NAME) + +-#ifndef ANDROID_CHANGES + #ifdef _LIBC + libc_hidden_builtin_def (MEMCPY_NAME) + #endif +-#endif + + #endif +diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S +index 27ed0c9c..42920a1a 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S +@@ -100,11 +100,7 @@ + LD_64(a4, -1024); \ + ST_64(a3, -1024); + +-#ifdef ANDROID_CHANGES +-LEAF(MEMMOVE_NAME, 0) +-#else + LEAF(MEMMOVE_NAME) +-#endif + + //1st var: dest ptr: void *str1 $r4 a0 + //2nd var: src ptr: void *str2 $r5 a1 +@@ -469,10 +465,8 @@ end_unalign_proc_back: + + END(MEMMOVE_NAME) + 
+-#ifndef ANDROID_CHANGES + #ifdef _LIBC + libc_hidden_builtin_def (MEMMOVE_NAME) + #endif +-#endif + + #endif +diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S +index 16ff2ef7..54e51546 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S +@@ -33,12 +33,7 @@ + //2nd var: int val $5 a1 + //3rd var: size_t num $6 a2 + +-#ifdef ANDROID_CHANGES +-LEAF(MEMSET_NAME, 0) +-#else + LEAF(MEMSET_NAME) +-#endif +- + .align 6 + bstrins.d a1, a1, 15, 8 + add.d t7, a0, a2 +@@ -168,10 +163,8 @@ end_0_8_unalign: + + END(MEMSET_NAME) + +-#ifndef ANDROID_CHANGES + #ifdef _LIBC + libc_hidden_builtin_def (MEMSET_NAME) + #endif +-#endif + + #endif +diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S +index 1d5e56c5..de6c7f4f 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S +@@ -123,10 +123,8 @@ L(_mc8_a): + jr ra + END(STRCHR_NAME) + +-#ifndef ANDROID_CHANGES + #ifdef _LIBC + libc_hidden_builtin_def (STRCHR_NAME) + #endif +-#endif + + #endif +diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S +index 6338d005..abc246ca 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S +@@ -136,11 +136,9 @@ L(_mc8_a): + jr ra + END(STRCHRNUL_NAME) + +-#ifndef ANDROID_CHANGES + #ifdef _LIBC + weak_alias(STRCHRNUL_NAME, strchrnul) + libc_hidden_builtin_def (STRCHRNUL_NAME) + #endif +-#endif + + #endif +diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S +index 449733cb..c77dc1a9 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S +@@ 
-190,10 +190,8 @@ strcpy_page_cross: + beqz has_nul, strcpy_page_cross_ok + b strcpy_end + END(STRCPY) +-#ifndef ANDROID_CHANGES + #ifdef _LIBC + libc_hidden_builtin_def (STRCPY) + #endif +-#endif + + #endif +diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S +index e9b7cf67..2fe0fb34 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S +@@ -107,10 +107,8 @@ strlen_loop_noascii: + jr ra + END(STRLEN) + +-#ifndef ANDROID_CHANGES + #ifdef _LIBC + libc_hidden_builtin_def (STRLEN) + #endif +-#endif + + #endif +diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S +index 558df29b..6ec107ca 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S +@@ -248,10 +248,8 @@ strncmp_ret0: + then exchange(src1,src2). */ + + END(STRNCMP) +-#ifndef ANDROID_CHANGES + #ifdef _LIBC + libc_hidden_builtin_def (STRNCMP) + #endif +-#endif + + #endif +diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S +index 60eccf00..4a195b7c 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S +@@ -136,10 +136,8 @@ L(_hit_limit): + move len, limit + jr ra + END(STRNLEN) +-#ifndef ANDROID_CHANGES + #ifdef _LIBC + libc_hidden_builtin_def (STRNLEN) + #endif +-#endif + + #endif +-- +2.33.0 + diff --git a/glibc-2.28-Remove-useless-IS_LA-264-364-464-and-IS_L.patch b/glibc-2.28-Remove-useless-IS_LA-264-364-464-and-IS_L.patch new file mode 100644 index 0000000..4880d26 --- /dev/null +++ b/glibc-2.28-Remove-useless-IS_LA-264-364-464-and-IS_L.patch @@ -0,0 +1,40 @@ +From f4041e5da609a9f5da966fa000c00b150788a948 Mon Sep 17 00:00:00 2001 +From: caiyinyu +Date: Sun, 23 Jul 2023 14:32:08 +0800 +Subject: 
[PATCH 13/14] glibc-2.28: Remove useless IS_LA{264,364,464} and + IS_LA{264, 364, 464}. + +Change-Id: Id9a573510e2a493151191372d651f381ec2aefe7 +Signed-off-by: ticat_fp +--- + sysdeps/unix/sysv/linux/loongarch/cpu-features.h | 7 ------- + 1 file changed, 7 deletions(-) + +diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h +index b46a8489..2703d4f7 100644 +--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h ++++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h +@@ -22,10 +22,6 @@ + #include + #include + +-#define LA264 0x14a000 +-#define LA364 0x14b000 +-#define LA464 0x14c011 +- + struct cpu_features + { + uint64_t cpucfg_prid; +@@ -42,9 +38,6 @@ extern const struct cpu_features *_dl_larch_get_cpu_features (void) + :"=r"(ret) \ + :"r"(index)); + +-#define IS_LA264(prid) (prid == LA264) +-#define IS_LA364(prid) (prid == LA364) +-#define IS_LA464(prid) (prid == LA464) + #define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL) + #define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) + #define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) +-- +2.33.0 + diff --git a/glibc-2.28-Use-RTLD_SUPPORT_-LSX-LASX-to-choose-_dl_.patch b/glibc-2.28-Use-RTLD_SUPPORT_-LSX-LASX-to-choose-_dl_.patch new file mode 100644 index 0000000..720cd20 --- /dev/null +++ b/glibc-2.28-Use-RTLD_SUPPORT_-LSX-LASX-to-choose-_dl_.patch @@ -0,0 +1,123 @@ +From c94d9376e241dc52eb9f2a2107313b7836e0e9ad Mon Sep 17 00:00:00 2001 +From: caiyinyu +Date: Wed, 6 Sep 2023 16:41:09 +0800 +Subject: [PATCH 14/14] glibc-2.28: Use RTLD_SUPPORT_{LSX, LASX} to choose + _dl_runtime_resolve. + +Key Points: +1. On lasx & lsx platforms, use _dl_runtime_resolve_{lsx, lasx} to save vector registers. +2. Via "tunables", users can choose str/mem functions with + `export GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX`. + Note: glibc.cpu.hwcaps doesn't affect _dl_runtime_resolve_{lsx, lasx} selection. + +Usage Notes: +1. Only valid inputs: LASX, LSX, UAL. 
Case-sensitive, comma-separated, no spaces. +2. Example: `export GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,UAL` turns on LASX & UAL. + Unmentioned features turn off. With default ifunc: lasx > lsx > unaligned > + aligned > generic, effect is: lasx > unaligned > aligned > generic; lsx off. +3. Incorrect GLIBC_TUNABLES settings will show error messages. +4. Valid input examples: + - GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX: lasx > aligned > generic. + - GLIBC_TUNABLES=glibc.cpu.hwcaps=LSX,UAL: lsx > unaligned > aligned > generic. + - GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,UAL,LASX,UAL,LSX,LASX,UAL: Repetitions + allowed but not recommended. Results in: lasx > lsx > unaligned > aligned > + generic. + +Change-Id: I555ce2039bc36bf071fc9265d7b0bb7b93b96ae7 +Signed-off-by: ticat_fp +--- + sysdeps/loongarch/cpu-tunables.c | 2 +- + sysdeps/loongarch/dl-machine.h | 11 ++++++----- + sysdeps/unix/sysv/linux/loongarch/cpu-features.c | 2 ++ + sysdeps/unix/sysv/linux/loongarch/cpu-features.h | 10 +++++++--- + 4 files changed, 16 insertions(+), 9 deletions(-) + +diff --git a/sysdeps/loongarch/cpu-tunables.c b/sysdeps/loongarch/cpu-tunables.c +index 840c1b8c..e0799ca9 100644 +--- a/sysdeps/loongarch/cpu-tunables.c ++++ b/sysdeps/loongarch/cpu-tunables.c +@@ -88,7 +88,7 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + } + while (*c != '\0'); + +- GLRO (dl_hwcap) &= hwcap; ++ GLRO (dl_larch_cpu_features).hwcap &= hwcap; + } + + #endif +diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h +index ff520a07..b5f43c84 100644 +--- a/sysdeps/loongarch/dl-machine.h ++++ b/sysdeps/loongarch/dl-machine.h +@@ -75,13 +75,14 @@ dl_platform_init (void) + GLRO(dl_platform) = NULL; + + #ifdef SHARED ++ /* init_cpu_features has been called early from __libc_start_main in ++ static executable. 
*/ ++ init_cpu_features (&GLRO(dl_larch_cpu_features)); + + #if HAVE_TUNABLES + TUNABLE_GET (hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps)); + #endif +- /* init_cpu_features has been called early from __libc_start_main in +- static executable. */ +- init_cpu_features (&GLRO(dl_larch_cpu_features)); ++ + #endif + } + +@@ -396,9 +397,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], + l->l_mach.plt = gotplt[1] + l->l_addr; + + #if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float +- if (SUPPORT_LASX) ++ if (RTLD_SUPPORT_LASX) + gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx; +- else if (SUPPORT_LSX) ++ else if (RTLD_SUPPORT_LSX) + gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx; + else + #endif +diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c +index 80870f3c..cf015011 100644 +--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c ++++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c +@@ -29,4 +29,6 @@ init_cpu_features (struct cpu_features *cpu_features) + + __cpucfg(cpucfg_word, 2); + cpu_features->cpucfg_word_idx2 = cpucfg_word; ++ ++ GLRO (dl_larch_cpu_features).hwcap = GLRO (dl_hwcap); + } +diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h +index 2703d4f7..17c9f5a7 100644 +--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h ++++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h +@@ -26,6 +26,7 @@ struct cpu_features + { + uint64_t cpucfg_prid; + uint64_t cpucfg_word_idx2; ++ uint64_t hwcap; + }; + + /* Get a pointer to the CPU features structure. 
*/ +@@ -38,9 +39,12 @@ extern const struct cpu_features *_dl_larch_get_cpu_features (void) + :"=r"(ret) \ + :"r"(index)); + +-#define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL) +-#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) +-#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) ++#define SUPPORT_UAL (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_UAL) ++#define SUPPORT_LSX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LSX) ++#define SUPPORT_LASX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LASX) ++ ++#define RTLD_SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) ++#define RTLD_SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) + + #endif /* _CPU_FEATURES_LOONGARCH64_H */ + +-- +2.33.0 + diff --git a/glibc-2.28-config-Added-HAVE_LOONGARCH_VEC_ASM.patch b/glibc-2.28-config-Added-HAVE_LOONGARCH_VEC_ASM.patch new file mode 100644 index 0000000..bfbe0e2 --- /dev/null +++ b/glibc-2.28-config-Added-HAVE_LOONGARCH_VEC_ASM.patch @@ -0,0 +1,91 @@ +From 58b1f882644f839259505dde3205e226a1c649f1 Mon Sep 17 00:00:00 2001 +From: caiyinyu +Date: Tue, 11 Jul 2023 15:42:26 +0800 +Subject: [PATCH 10/14] glibc-2.28: config: Added HAVE_LOONGARCH_VEC_ASM. + +Change-Id: Iea464ea0c975a351682a60f66251167f6c79385b +Signed-off-by: ticat_fp +--- + config.h.in | 5 +++++ + sysdeps/loongarch/configure | 28 ++++++++++++++++++++++++++++ + sysdeps/loongarch/configure.ac | 15 +++++++++++++++ + 3 files changed, 48 insertions(+) + +diff --git a/config.h.in b/config.h.in +index 94d5ea36..fa53cc2d 100644 +--- a/config.h.in ++++ b/config.h.in +@@ -123,6 +123,11 @@ + /* RISC-V floating-point ABI for ld.so. */ + #undef RISCV_ABI_FLEN + ++/* Assembler support LoongArch LASX/LSX vector instructions. ++ This macro becomes obsolete when glibc increased the minimum ++ required version of GNU 'binutils' to 2.41 or later. */ ++#define HAVE_LOONGARCH_VEC_ASM 0 ++ + /* Linux specific: minimum supported kernel version. 
*/ + #undef __LINUX_KERNEL_VERSION + +diff --git a/sysdeps/loongarch/configure b/sysdeps/loongarch/configure +index 1e5abf81..0f0dae3a 100755 +--- a/sysdeps/loongarch/configure ++++ b/sysdeps/loongarch/configure +@@ -2,3 +2,31 @@ + # Local configure fragment for sysdeps/loongarch/elf. + + #AC_DEFINE(PI_STATIC_AND_HIDDEN) ++ ++# Check if asm support vector instructions. ++{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vector support in assembler" >&5 ++$as_echo_n "checking for vector support in assembler... " >&6; } ++if ${libc_cv_loongarch_vec_asm+:} false; then : ++ $as_echo_n "(cached) " >&6 ++else ++ cat > conftest.s <<\EOF ++ vld $vr0, $sp, 0 ++EOF ++if { ac_try='${CC-cc} -c $CFLAGS conftest.s -o conftest 1>&5' ++ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 ++ (eval $ac_try) 2>&5 ++ ac_status=$? ++ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 ++ test $ac_status = 0; }; }; then ++ libc_cv_loongarch_vec_asm=yes ++else ++ libc_cv_loongarch_vec_asm=no ++fi ++rm -f conftest* ++fi ++{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_loongarch_vec_asm" >&5 ++$as_echo "$libc_cv_loongarch_vec_asm" >&6; } ++if test $libc_cv_loongarch_vec_asm = yes; then ++ $as_echo "#define HAVE_LOONGARCH_VEC_ASM 1" >>confdefs.h ++ ++fi +diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac +index 67b46ce0..aac0efa9 100644 +--- a/sysdeps/loongarch/configure.ac ++++ b/sysdeps/loongarch/configure.ac +@@ -4,3 +4,18 @@ GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. + dnl It is always possible to access static and hidden symbols in an + dnl position independent way. + #AC_DEFINE(PI_STATIC_AND_HIDDEN) ++ ++# Check if asm support vector instructions. 
++AC_CACHE_CHECK(for vector support in assembler, libc_cv_loongarch_vec_asm, [dnl ++cat > conftest.s <<\EOF ++ vld $vr0, $sp, 0 ++EOF ++if AC_TRY_COMMAND(${CC-cc} -c $CFLAGS conftest.s -o conftest 1>&AS_MESSAGE_LOG_FD); then ++ libc_cv_loongarch_vec_asm=yes ++else ++ libc_cv_loongarch_vec_asm=no ++fi ++rm -f conftest*]) ++if test $libc_cv_loongarch_vec_asm = yes; then ++ AC_DEFINE(HAVE_LOONGARCH_VEC_ASM) ++fi +-- +2.33.0 + diff --git a/glibc-2.28-remove-ABILPX32-related-code.patch b/glibc-2.28-remove-ABILPX32-related-code.patch new file mode 100644 index 0000000..d5ece82 --- /dev/null +++ b/glibc-2.28-remove-ABILPX32-related-code.patch @@ -0,0 +1,75 @@ +From 0153532f680527c4378a10673518cabda2e02584 Mon Sep 17 00:00:00 2001 +From: caiyinyu +Date: Fri, 26 May 2023 14:58:39 +0800 +Subject: [PATCH 05/14] glibc-2.28: remove ABILPX32 related code. + +Change-Id: I73eb5bc4d4ca12e4d45ed6b533fa38d60a3a633f +Signed-off-by: ticat_fp +--- + elf/elf.h | 3 +-- + sysdeps/loongarch/dl-machine.h | 2 -- + sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h | 2 +- + sysdeps/loongarch/sys/regdef.h | 4 +--- + 4 files changed, 3 insertions(+), 8 deletions(-) + +diff --git a/elf/elf.h b/elf/elf.h +index 65d1fb46..4bfbad61 100644 +--- a/elf/elf.h ++++ b/elf/elf.h +@@ -3933,10 +3933,9 @@ enum + #define R_NDS32_TLS_TPOFF 102 + #define R_NDS32_TLS_DESC 119 + +-/* LoongISA ELF Flags */ ++/* LoongArch ELF Flags */ + #define EF_LARCH_ABI 0x0003 + #define EF_LARCH_ABI_LP64 0x0003 +-#define EF_LARCH_ABI_LPX32 0x0002 + #define EF_LARCH_ABI_LP32 0x0001 + + /* Loongarch specific dynamic relocations. 
*/ +diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h +index 2d527241..6e9c6258 100644 +--- a/sysdeps/loongarch/dl-machine.h ++++ b/sysdeps/loongarch/dl-machine.h +@@ -96,8 +96,6 @@ elf_machine_matches_host (const ElfW(Ehdr) *ehdr) + + #ifdef _ABILP64 + if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LP64) +-#elif defined _ABILPX32 +- if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LPX32) + #elif defined _ABILP32 + if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LP32) + #else +diff --git a/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h b/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h +index 5a761355..aa63bce1 100644 +--- a/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h ++++ b/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h +@@ -32,7 +32,7 @@ + # define __SIZEOF_PTHREAD_BARRIER_T 32 + # define __SIZEOF_PTHREAD_BARRIERATTR_T 4 + #else +-# error "rv32i-based systems are not supported" ++# error "32-bit based systems are not supported" + #endif + + #define __PTHREAD_COMPAT_PADDING_MID +diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h +index 769784b8..36f00939 100644 +--- a/sysdeps/loongarch/sys/regdef.h ++++ b/sysdeps/loongarch/sys/regdef.h +@@ -72,10 +72,8 @@ + # define fs6 $f30 + # define fs7 $f31 + +-#elif _LOONGARCH_SIM == _ABILPX32 +-# error ABILPX32 + #elif _LOONGARCH_SIM == _ABILP32 +-# error ABILP32 ++# error ABILP32 not support yet + #else + # error noABI + #endif +-- +2.33.0 + diff --git a/glibc-2.28-use-new-macro-LEAF-and-ENTRY-and-modify-r.patch b/glibc-2.28-use-new-macro-LEAF-and-ENTRY-and-modify-r.patch new file mode 100644 index 0000000..fce80c4 --- /dev/null +++ b/glibc-2.28-use-new-macro-LEAF-and-ENTRY-and-modify-r.patch @@ -0,0 +1,1033 @@ +From 18331a16d37b191b84296d8a5e96cd069fe45664 Mon Sep 17 00:00:00 2001 +From: caiyinyu +Date: Mon, 17 Apr 2023 17:04:57 +0800 +Subject: [PATCH 02/14] glibc-2.28: use new macro LEAF and ENTRY and modify + related code. 
+ +Change-Id: Iac8a3cc0f57ba39cf364580966c8bfca1b54a7a5 +Signed-off-by: ticat_fp +--- + sysdeps/loongarch/__longjmp.S | 2 +- + sysdeps/loongarch/dl-trampoline.h | 2 +- + sysdeps/loongarch/lp64/memchr.S | 3 +-- + sysdeps/loongarch/lp64/memcmp.S | 3 +-- + sysdeps/loongarch/lp64/memcpy.S | 5 ++--- + sysdeps/loongarch/lp64/memset.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/memchr-lasx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/memchr-lsx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S | 7 +++---- + sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S | 2 +- + sysdeps/loongarch/lp64/multiarch/memmove-lasx.S | 6 ++---- + sysdeps/loongarch/lp64/multiarch/memmove-lsx.S | 5 ++--- + sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S | 2 +- + sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/memset-lasx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/memset-lsx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/memset-unaligned.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S | 6 +++--- + sysdeps/loongarch/lp64/multiarch/strchr-lasx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/strchr-lsx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 6 +++--- + sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 7 ++++--- + sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/strlen-lasx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S | 7 ++++--- + 
sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S | 3 +-- + sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S | 3 +-- + sysdeps/loongarch/lp64/rawmemchr.S | 3 +-- + sysdeps/loongarch/lp64/s_cosf.S | 4 +--- + sysdeps/loongarch/lp64/s_sinf.S | 4 +--- + sysdeps/loongarch/lp64/stpcpy.S | 3 +-- + sysdeps/loongarch/lp64/strchr.S | 3 +-- + sysdeps/loongarch/lp64/strchrnul.S | 3 +-- + sysdeps/loongarch/lp64/strcmp.S | 3 +-- + sysdeps/loongarch/lp64/strcpy.S | 3 +-- + sysdeps/loongarch/lp64/strlen.S | 3 +-- + sysdeps/loongarch/lp64/strncmp.S | 3 +-- + sysdeps/loongarch/lp64/strnlen.S | 3 +-- + sysdeps/loongarch/lp64/strrchr.S | 3 +-- + sysdeps/loongarch/setjmp.S | 6 +++--- + sysdeps/loongarch/start.S | 2 +- + sysdeps/loongarch/sys/asm.h | 6 +++--- + sysdeps/unix/sysv/linux/loongarch/clone.S | 4 ++-- + sysdeps/unix/sysv/linux/loongarch/getcontext.S | 2 +- + sysdeps/unix/sysv/linux/loongarch/setcontext.S | 4 ++-- + sysdeps/unix/sysv/linux/loongarch/swapcontext.S | 2 +- + sysdeps/unix/sysv/linux/loongarch/sysdep.S | 4 ++-- + sysdeps/unix/sysv/linux/loongarch/sysdep.h | 4 ++-- + sysdeps/unix/sysv/linux/loongarch/vfork.S | 2 +- + 62 files changed, 85 insertions(+), 130 deletions(-) + +diff --git a/sysdeps/loongarch/__longjmp.S b/sysdeps/loongarch/__longjmp.S +index 68f67639..bd06b919 100644 +--- a/sysdeps/loongarch/__longjmp.S ++++ b/sysdeps/loongarch/__longjmp.S +@@ -19,7 +19,7 @@ + #include + #include + +-ENTRY (__longjmp) ++ENTRY (__longjmp, 3) + REG_L ra, a0, 0*SZREG + REG_L sp, a0, 1*SZREG + REG_L x, a0, 2*SZREG +diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h +index 95639111..fb15983f 100644 +--- a/sysdeps/loongarch/dl-trampoline.h ++++ b/sysdeps/loongarch/dl-trampoline.h +@@ -29,7 +29,7 @@ + # define 
FRAME_SIZE (-((-10 * SZREG - 8 * 256) & ALMASK)) + #endif + +-ENTRY (_dl_runtime_resolve) ++ENTRY (_dl_runtime_resolve, 3) + # Save arguments to stack. + + #ifdef __loongarch64 +diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S +index 75c4e15c..23f1fd13 100644 +--- a/sysdeps/loongarch/lp64/memchr.S ++++ b/sysdeps/loongarch/lp64/memchr.S +@@ -11,8 +11,7 @@ + #define MEMCHR_NAME memchr + #endif + +-LEAF(MEMCHR_NAME) +- .align 6 ++LEAF(MEMCHR_NAME, 6) + beqz a2, L(out) + andi t1, a0, 0x7 + lu12i.w a3, 0x01010 +diff --git a/sysdeps/loongarch/lp64/memcmp.S b/sysdeps/loongarch/lp64/memcmp.S +index 9e57a924..457a4dc7 100644 +--- a/sysdeps/loongarch/lp64/memcmp.S ++++ b/sysdeps/loongarch/lp64/memcmp.S +@@ -11,8 +11,7 @@ + #define MEMCMP_NAME memcmp + #endif + +-LEAF(MEMCMP_NAME) +- .align 6 ++LEAF(MEMCMP_NAME, 6) + beqz a2, L(ret) + andi a4, a1, 0x7 + andi a3, a0, 0x7 +diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S +index b6ca60a1..4791e1a4 100644 +--- a/sysdeps/loongarch/lp64/memcpy.S ++++ b/sysdeps/loongarch/lp64/memcpy.S +@@ -35,8 +35,7 @@ + st.d t6, reg, n+48; \ + st.d t7, reg, n+56; + +-LEAF(MEMMOVE_NAME) +- .align 6 ++LEAF(MEMMOVE_NAME, 6) + sub.d t0, a0, a1 + bltu t0, a2, L(copy_back) + +@@ -46,7 +45,7 @@ END(MEMMOVE_NAME) + libc_hidden_builtin_def (MEMMOVE_NAME) + #endif + +-LEAF(MEMCPY_NAME) ++LEAF_NO_ALIGN(MEMCPY_NAME) + + srai.d a3, a2, 4 + beqz a3, L(short_data) # less than 16 bytes +diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S +index 41629e7e..eabd7d23 100644 +--- a/sysdeps/loongarch/lp64/memset.S ++++ b/sysdeps/loongarch/lp64/memset.S +@@ -21,8 +21,7 @@ + st.d a1, a0, n+48; \ + st.d a1, a0, n+56; + +-LEAF(MEMSET_NAME) +- .align 6 ++LEAF(MEMSET_NAME, 6) + move t0, a0 + andi a3, a0, 0x7 + li.w t6, 16 +diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S +index e63e34ae..387a35fe 100644 +--- 
a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S +@@ -11,8 +11,7 @@ + + #define MEMCHR __memchr_lasx + +-LEAF(MEMCHR) +- .align 6 ++LEAF(MEMCHR, 6) + beqz a2, L(ret0) + add.d a3, a0, a2 + andi t0, a0, 0x3f +diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S +index 441db534..c6952657 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S +@@ -11,8 +11,7 @@ + + #define MEMCHR __memchr_lsx + +-LEAF(MEMCHR) +- .align 6 ++LEAF(MEMCHR, 6) + beqz a2, L(ret0) + add.d a3, a0, a2 + andi t0, a0, 0x1f +diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S +index 30e2dbe6..9151d38d 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S +@@ -11,8 +11,7 @@ + + #define MEMCMP __memcmp_lasx + +-LEAF(MEMCMP) +- .align 6 ++LEAF(MEMCMP, 6) + li.d t2, 32 + add.d a3, a0, a2 + add.d a4, a1, a2 +diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S +index 7fd349b6..8535aa22 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S +@@ -10,11 +10,10 @@ + #if IS_IN (libc) + + #define MEMCMP __memcmp_lsx +- + L(magic_num): +- .align 6 +- .dword 0x0706050403020100 +- .dword 0x0f0e0d0c0b0a0908 ++ .align 6 ++ .dword 0x0706050403020100 ++ .dword 0x0f0e0d0c0b0a0908 + nop + nop + ENTRY_NO_ALIGN(MEMCMP) +diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S +index 64b60244..96df7c40 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S +@@ -31,7 +31,7 @@ + st.d t6, reg, n+48; \ + st.d t7, reg, n+56; + +-LEAF(MEMCPY_NAME) ++LEAF(MEMCPY_NAME, 3) + + //1st var: dst ptr: void *a1 $r4 
a0 + //2nd var: src ptr: void *a2 $r5 a1 +diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S +index 9537a35a..e8b2c441 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S +@@ -17,8 +17,7 @@ + #define MEMMOVE_NAME __memmove_lasx + #endif + +-LEAF(MEMCPY_NAME) +- .align 6 ++LEAF(MEMCPY_NAME, 6) + + li.d t0, 32 + add.d a3, a0, a2 +@@ -83,8 +82,7 @@ L(less_1bytes): + jr ra + END(MEMCPY_NAME) + +-LEAF(MEMMOVE_NAME) +- .align 6 ++LEAF(MEMMOVE_NAME, 6) + + li.d t0, 32 + add.d a3, a0, a2 +diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S +index 26babad4..90f89c7a 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S +@@ -12,8 +12,7 @@ + #define MEMCPY_NAME __memcpy_lsx + #define MEMMOVE_NAME __memmove_lsx + +-LEAF(MEMCPY_NAME) +- .align 6 ++LEAF(MEMCPY_NAME, 6) + li.d t6, 16 + add.d a3, a0, a2 + add.d a4, a1, a2 +@@ -83,7 +82,7 @@ L(less_1bytes): + nop + END(MEMCPY_NAME) + +-LEAF(MEMMOVE_NAME) ++LEAF(MEMMOVE_NAME, 6) + li.d t6, 16 + add.d a3, a0, a2 + add.d a4, a1, a2 +diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S +index 42920a1a..712b1c62 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S +@@ -100,7 +100,7 @@ + LD_64(a4, -1024); \ + ST_64(a3, -1024); + +-LEAF(MEMMOVE_NAME) ++LEAF(MEMMOVE_NAME, 3) + + //1st var: dest ptr: void *str1 $r4 a0 + //2nd var: src ptr: void *str2 $r5 a1 +diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S +index 57e1035f..9ecd0257 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S +@@ -13,8 +13,7 @@ + #define MEMRCHR __memrchr_lasx + 
#endif + +-LEAF(MEMRCHR) +- .align 6 ++LEAF(MEMRCHR, 6) + beqz a2, L(ret0) + addi.d a2, a2, -1 + add.d a3, a0, a2 +diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S +index eac2059a..4bdc18d8 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S +@@ -11,8 +11,7 @@ + + #define MEMRCHR __memrchr_lsx + +-LEAF(MEMRCHR) +- .align 6 ++LEAF(MEMRCHR, 6) + beqz a2, L(ret0) + addi.d a2, a2, -1 + add.d a3, a0, a2 +diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S +index 1bd2dda9..b53c0b7b 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S +@@ -11,8 +11,7 @@ + + #define MEMSET __memset_lasx + +-LEAF(MEMSET) +- .align 6 ++LEAF(MEMSET, 6) + li.d t1, 32 + move a3, a0 + xvreplgr2vr.b $xr0, a1 +diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S +index a3bbadb7..7ab85283 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S +@@ -11,8 +11,7 @@ + + #define MEMSET __memset_lsx + +-LEAF(MEMSET) +- .align 6 ++LEAF(MEMSET, 6) + li.d t1, 16 + move a3, a0 + vreplgr2vr.b $vr0, a1 +diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S +index 54e51546..92b0fab5 100644 +--- a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S +@@ -33,8 +33,7 @@ + //2nd var: int val $5 a1 + //3rd var: size_t num $6 a2 + +-LEAF(MEMSET_NAME) +- .align 6 ++LEAF(MEMSET_NAME, 6) + bstrins.d a1, a1, 15, 8 + add.d t7, a0, a2 + bstrins.d a1, a1, 31, 16 +diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S +index bff92969..1e94aa50 100644 +--- 
a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S +@@ -5,8 +5,7 @@ + + # define RAWMEMCHR __rawmemchr_lasx + +-LEAF(RAWMEMCHR) +- .align 6 ++LEAF(RAWMEMCHR, 6) + move a2, a0 + bstrins.d a0, zero, 4, 0 + xvld $xr0, a0, 0 +diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S +index 11a19c1d..40bf0cda 100644 +--- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S +@@ -11,8 +11,7 @@ + + # define RAWMEMCHR __rawmemchr_lsx + +-LEAF(RAWMEMCHR) +- .align 6 ++LEAF(RAWMEMCHR, 6) + move a2, a0 + bstrins.d a0, zero, 4, 0 + vld $vr0, a0, 0 +diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S +index bf0eed43..0836f590 100644 +--- a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S +@@ -12,9 +12,9 @@ + #define STPCPY __stpcpy_lsx + + L(magic_num): +- .align 6 +- .dword 0x0706050403020100 +- .dword 0x0f0e0d0c0b0a0908 ++ .align 6 ++ .dword 0x0706050403020100 ++ .dword 0x0f0e0d0c0b0a0908 + ENTRY_NO_ALIGN(STPCPY) + pcaddi t0, -4 + andi a4, a1, 0xf +diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S +index ea7eb9d2..3f6ad915 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S +@@ -13,8 +13,7 @@ + #define STRCHR __strchr_lasx + #endif + +-LEAF(STRCHR) +- .align 6 ++LEAF(STRCHR, 6) + andi t1, a0, 0x1f + bstrins.d a0, zero, 4, 0 + xvld $xr0, a0, 0 +diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S +index 64ead00b..4ad9a4ad 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S +@@ -13,8 +13,7 @@ + #define STRCHR __strchr_lsx + #endif + +-LEAF(STRCHR) +- .align 6 ++LEAF(STRCHR, 6) + 
andi t1, a0, 0xf + bstrins.d a0, zero, 3, 0 + vld $vr0, a0, 0 +diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S +index de6c7f4f..365818f9 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S +@@ -38,8 +38,7 @@ + #define STRCHR_NAME __strchr_unaligned + + /* char * strchr (const char *s1, int c); */ +-LEAF(STRCHR_NAME) +- .align 6 ++LEAF(STRCHR_NAME, 6) + + li.w t4, 0x7 + lu12i.w a2, 0x01010 +diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S +index abc246ca..7b496076 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S +@@ -46,8 +46,7 @@ + + /* char * strchrnul (const char *s1, int c); */ + +-LEAF(STRCHRNUL_NAME) +- .align 6 ++LEAF(STRCHRNUL_NAME, 6) + li.w t4, 0x7 + lu12i.w a2, 0x01010 + bstrins.d a1, a1, 15, 8 +diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S +index 226b1d63..c86e3ecd 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S +@@ -13,9 +13,9 @@ + + /* int strcmp (const char *s1, const char *s2); */ + L(magic_num): +- .align 6 +- .dword 0x0706050403020100 +- .dword 0x0f0e0d0c0b0a0908 ++ .align 6 ++ .dword 0x0706050403020100 ++ .dword 0x0f0e0d0c0b0a0908 + + ENTRY_NO_ALIGN(STRCMP) + pcaddi t0, -4 +diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S +index e29d872f..1e2e44ec 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strcmp-unaligned.S +@@ -73,8 +73,7 @@ + + /* int strcmp (const char *s1, const char *s2); */ + +-LEAF(STRCMP_NAME) +- .align 4 ++LEAF(STRCMP_NAME, 4) + + xor tmp1, src1, src2 + lu12i.w zeroones, 0x01010 +diff 
--git a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S +index 76db561a..dbc061ad 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S +@@ -14,9 +14,10 @@ + /* int strcpy (const char *s1, const char *s2); */ + + L(magic_num): +- .align 6 +- .dword 0x0706050403020100 +- .dword 0x0f0e0d0c0b0a0908 ++ .align 6 ++ .dword 0x0706050403020100 ++ .dword 0x0f0e0d0c0b0a0908 ++ + ENTRY_NO_ALIGN(STRCPY) + pcaddi t0, -4 + andi a4, a1, 0xf +diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S +index c77dc1a9..150dc802 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S +@@ -61,8 +61,7 @@ + + /* int strcpy (const char *s1, const char *s2); */ + +-LEAF(STRCPY) +- .align 4 ++LEAF(STRCPY, 4) + move dest_backup, dest + lu12i.w zeroones, 0x01010 + lu12i.w sevenf, 0x7f7f7 +diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S +index cb276aa0..fd6c002d 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S +@@ -13,8 +13,7 @@ + + /* size_t strlen(const char *s1); */ + +-LEAF(STRLEN) +- .align 6 ++LEAF(STRLEN, 6) + move a1, a0 + bstrins.d a0, zero, 4, 0 + li.d t1, -1 +diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S +index 6edcac8c..6f311506 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S +@@ -13,8 +13,7 @@ + + /* size_t strlen(const char *s1); */ + +-LEAF(STRLEN) +- .align 6 ++LEAF(STRLEN, 6) + move a1, a0 + bstrins.d a0, zero, 4, 0 + vld $vr0, a0, 0 +diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S +index 2fe0fb34..837255e3 100644 +--- 
a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S +@@ -31,8 +31,7 @@ + + /* size_t strlen (const char *s1); */ + +-LEAF(STRLEN) +- .align 5 ++LEAF(STRLEN, 5) + nor t4, zero, zero + lu12i.w a2, 0x01010 + andi t5, a0, 0x7 +diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S +index 3399bf77..2c6f9614 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S +@@ -14,9 +14,10 @@ + /* int strncmp (const char *s1, const char *s2); */ + + L(magic_num): +- .align 6 +- .dword 0x0706050403020100 +- .dword 0x0f0e0d0c0b0a0908 ++ .align 6 ++ .dword 0x0706050403020100 ++ .dword 0x0f0e0d0c0b0a0908 ++ + ENTRY_NO_ALIGN(STRNCMP) + beqz a2, L(ret0) + pcaddi t0, -5 +diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S +index 6ec107ca..88397528 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S +@@ -60,8 +60,7 @@ + + /* int strncmp (const char *s1, const char *s2); */ + +-LEAF(STRNCMP) +- .align 4 ++LEAF(STRNCMP, 4) + beqz limit, strncmp_ret0 + + xor tmp1, src1, src2 +diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S +index 8c30f10c..910b52fe 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S +@@ -13,8 +13,7 @@ + + /* size_t strnlen (const char *s1, size_t maxlen); */ + +-LEAF(STRNLEN) +- .align 6 ++LEAF(STRNLEN, 6) + beqz a1, L(ret0) + andi t1, a0, 0x3f + li.d t3, 65 +diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S +index 388c239a..db0e90ff 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S +@@ -13,8 +13,7 @@ + + /* size_t 
strnlen (const char *s1, size_t maxlen); */ + +-LEAF(STRNLEN) +- .align 6 ++LEAF(STRNLEN, 6) + beqz a1, L(ret0) + andi t1, a0, 0x1f + li.d t3, 33 +diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S +index 4a195b7c..78e7444d 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S ++++ b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S +@@ -63,9 +63,8 @@ + + /* size_t strnlen (const char *s1,size_t maxlen); */ + +-LEAF(STRNLEN) ++LEAF(STRNLEN, 4) + +- .align 4 + beqz limit, L(_hit_limit) + lu12i.w zeroones, 0x01010 + lu12i.w sevenf, 0x7f7f7 +diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S +index 6f7a5618..325458ff 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S +@@ -11,8 +11,7 @@ + + #define STRRCHR __strrchr_lasx + +-LEAF(STRRCHR) +- .align 6 ++LEAF(STRRCHR, 6) + andi t1, a0, 0x3f + bstrins.d a0, zero, 5, 0 + xvld $xr0, a0, 0 +diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S +index e9228a2e..e082eaab 100644 +--- a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S ++++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S +@@ -11,8 +11,7 @@ + + #define STRRCHR __strrchr_lsx + +-LEAF(STRRCHR) +- .align 6 ++LEAF(STRRCHR, 6) + andi t1, a0, 0x1f + bstrins.d a0, zero, 4, 0 + vld $vr0, a0, 0 +diff --git a/sysdeps/loongarch/lp64/rawmemchr.S b/sysdeps/loongarch/lp64/rawmemchr.S +index 94b70f2d..ef1db7ed 100644 +--- a/sysdeps/loongarch/lp64/rawmemchr.S ++++ b/sysdeps/loongarch/lp64/rawmemchr.S +@@ -12,8 +12,7 @@ + #endif + + +-LEAF(RAWMEMCHR_NAME) +- .align 6 ++LEAF(RAWMEMCHR_NAME, 6) + andi t1, a0, 0x7 + bstrins.d a0, zero, 2, 0 + lu12i.w a2, 0x01010 +diff --git a/sysdeps/loongarch/lp64/s_cosf.S b/sysdeps/loongarch/lp64/s_cosf.S +index 5bfabefb..9fcbe6ca 100644 +--- a/sysdeps/loongarch/lp64/s_cosf.S ++++ 
b/sysdeps/loongarch/lp64/s_cosf.S +@@ -74,9 +74,7 @@ + movgr2fr.d tmp, rs;\ + ffint.d.l rd, tmp + +-LEAF(COSF) +- .align 2 +- .align 3 ++LEAF(COSF, 3) + /* fa0 is SP x; fa1 is DP x */ + movfr2gr.s t0, fa0 /* Bits of x */ + fcvt.d.s fa1, fa0 /* DP x */ +diff --git a/sysdeps/loongarch/lp64/s_sinf.S b/sysdeps/loongarch/lp64/s_sinf.S +index 91c9db9e..45d1c4b5 100644 +--- a/sysdeps/loongarch/lp64/s_sinf.S ++++ b/sysdeps/loongarch/lp64/s_sinf.S +@@ -74,9 +74,7 @@ + movgr2fr.d tmp, rs;\ + ffint.d.l rd, tmp + +-LEAF(SINF) +- .align 2 +- .align 3 ++LEAF(SINF, 3) + /* fa0 is SP x; fa1 is DP x */ + movfr2gr.s t2, fa0 /* Bits of x */ + fcvt.d.s fa1, fa0 /* DP x */ +diff --git a/sysdeps/loongarch/lp64/stpcpy.S b/sysdeps/loongarch/lp64/stpcpy.S +index 9d4b0c8d..b6a367dc 100644 +--- a/sysdeps/loongarch/lp64/stpcpy.S ++++ b/sysdeps/loongarch/lp64/stpcpy.S +@@ -11,8 +11,7 @@ + #define STPCPY_NAME __stpcpy + #endif + +-LEAF(STPCPY_NAME) +- .align 6 ++LEAF(STPCPY_NAME, 6) + andi a3, a0, 0x7 + beqz a3, L(dest_align) + sub.d a5, a1, a3 +diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S +index 63454c17..fde53a30 100644 +--- a/sysdeps/loongarch/lp64/strchr.S ++++ b/sysdeps/loongarch/lp64/strchr.S +@@ -13,8 +13,7 @@ + + /* char * strchr (const char *s1, int c); */ + +-LEAF(STRCHR_NAME) +- .align 6 ++LEAF(STRCHR_NAME, 6) + slli.d t1, a0, 3 + bstrins.d a0, zero, 2, 0 + lu12i.w a2, 0x01010 +diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S +index c4532e11..a5ee09a3 100644 +--- a/sysdeps/loongarch/lp64/strchrnul.S ++++ b/sysdeps/loongarch/lp64/strchrnul.S +@@ -13,8 +13,7 @@ + + /* char * strchrnul (const char *s1, int c); */ + +-LEAF(STRCHRNUL_NAME) +- .align 6 ++LEAF(STRCHRNUL_NAME, 6) + slli.d t1, a0, 3 + bstrins.d a0, zero, 2, 0 + lu12i.w a2, 0x01010 +diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S +index 22c261a3..3a863992 100644 +--- a/sysdeps/loongarch/lp64/strcmp.S ++++ 
b/sysdeps/loongarch/lp64/strcmp.S +@@ -19,8 +19,7 @@ + #define src1 a0 + #define src2 a1 + #define result v0 +-LEAF(STRCMP_NAME) +- .align 6 ++LEAF(STRCMP_NAME, 6) + xor a4, src1, src2 + lu12i.w t5, 0x01010 + lu12i.w t6, 0x7f7f7 +diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S +index c6fe74cb..08505192 100644 +--- a/sysdeps/loongarch/lp64/strcpy.S ++++ b/sysdeps/loongarch/lp64/strcpy.S +@@ -11,8 +11,7 @@ + #define STRCPY strcpy + #endif + +-LEAF(STRCPY) +- .align 6 ++LEAF(STRCPY, 6) + andi a3, a0, 0x7 + move a2, a0 + beqz a3, L(dest_align) +diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S +index dd5a8da3..71431ce2 100644 +--- a/sysdeps/loongarch/lp64/strlen.S ++++ b/sysdeps/loongarch/lp64/strlen.S +@@ -11,8 +11,7 @@ + #define STRLEN strlen + #endif + +-LEAF(STRLEN) +- .align 6 ++LEAF(STRLEN, 6) + move a1, a0 + bstrins.d a0, zero, 2, 0 + lu12i.w a2, 0x01010 +diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S +index dcb15350..55450e55 100644 +--- a/sysdeps/loongarch/lp64/strncmp.S ++++ b/sysdeps/loongarch/lp64/strncmp.S +@@ -13,8 +13,7 @@ + + /* int strncmp (const char *s1, const char *s2); */ + +-LEAF(STRNCMP) +- .align 6 ++LEAF(STRNCMP, 6) + beqz a2, L(ret0) + xor a4, a0, a1 + lu12i.w t5, 0x01010 +diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S +index 0517e206..5b5ab585 100644 +--- a/sysdeps/loongarch/lp64/strnlen.S ++++ b/sysdeps/loongarch/lp64/strnlen.S +@@ -15,8 +15,7 @@ + #. first load with t1 != 0, need to adjust t5; + #. 
return the less one of both strlen(s) and a1; + +-LEAF(STRNLEN) +- .align 6 ++LEAF(STRNLEN, 6) + beqz a1, L(out) + lu12i.w a2, 0x01010 + andi t1, a0, 0x7 +diff --git a/sysdeps/loongarch/lp64/strrchr.S b/sysdeps/loongarch/lp64/strrchr.S +index 3bf92ecd..df7fcb6b 100644 +--- a/sysdeps/loongarch/lp64/strrchr.S ++++ b/sysdeps/loongarch/lp64/strrchr.S +@@ -11,8 +11,7 @@ + #define STRRCHR_NAME strrchr + #endif + +-LEAF(STRRCHR_NAME) +- .align 6 ++LEAF(STRRCHR_NAME, 6) + slli.d t1, a0, 3 + bstrins.d a0, zero, 2, 0 + lu12i.w a2, 0x01010 +diff --git a/sysdeps/loongarch/setjmp.S b/sysdeps/loongarch/setjmp.S +index da09a93c..c4e6d01c 100644 +--- a/sysdeps/loongarch/setjmp.S ++++ b/sysdeps/loongarch/setjmp.S +@@ -19,14 +19,14 @@ + #include + #include + +-ENTRY (_setjmp) ++ENTRY (_setjmp, 3) + li.w a1,0 + b __sigsetjmp + END (_setjmp) +-ENTRY (setjmp) ++ENTRY (setjmp, 3) + li.w a1,1 + END (setjmp) +-ENTRY (__sigsetjmp) ++ENTRY (__sigsetjmp, 3) + REG_S ra, a0, 0*SZREG + REG_S sp, a0, 1*SZREG + REG_S x, a0, 2*SZREG +diff --git a/sysdeps/loongarch/start.S b/sysdeps/loongarch/start.S +index cf0a14b5..b83221e4 100644 +--- a/sysdeps/loongarch/start.S ++++ b/sysdeps/loongarch/start.S +@@ -17,7 +17,7 @@ __libc_start_main (int (*main) (int, char **, char **), + void *stack_end); + */ + +-ENTRY (ENTRY_POINT) ++ENTRY (ENTRY_POINT, 3) + /* Terminate call stack by noting ra is undefined. Use a dummy + .cfi_label to force starting the FDE. */ + .cfi_label .Ldummy +diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h +index f64bfb2b..357a5ba3 100644 +--- a/sysdeps/loongarch/sys/asm.h ++++ b/sysdeps/loongarch/sys/asm.h +@@ -27,15 +27,15 @@ + + + /* Declare leaf routine. 
*/ +-#define LEAF(symbol) \ ++#define LEAF(symbol, aln) \ + .text; \ + .globl symbol; \ +- .align 3; \ ++ .align aln; \ + .type symbol, @function; \ + symbol: \ + cfi_startproc; \ + +-# define ENTRY(symbol) LEAF(symbol) ++# define ENTRY(symbol, aln) LEAF(symbol, aln) + + #define LEAF_NO_ALIGN(symbol) \ + .text; \ +diff --git a/sysdeps/unix/sysv/linux/loongarch/clone.S b/sysdeps/unix/sysv/linux/loongarch/clone.S +index f0fc566e..1180a11d 100644 +--- a/sysdeps/unix/sysv/linux/loongarch/clone.S ++++ b/sysdeps/unix/sysv/linux/loongarch/clone.S +@@ -29,7 +29,7 @@ + /* int clone(int (*fn)(void *arg), void *child_stack, int flags, void *arg, + void *parent_tidptr, void *tls, void *child_tidptr) */ + +-ENTRY (__clone) ++ENTRY (__clone, 3) + + /* Align stack to 16 or 8 bytes per the ABI. */ + #if _LOONGARCH_SIM == _ABILP64 +@@ -74,7 +74,7 @@ L (error): + its own function so that we can terminate the stack trace with our + debug info. */ + +-ENTRY (__thread_start) ++ENTRY (__thread_start, 3) + L (thread_start): + /* Terminate call stack by noting ra is undefined. Use a dummy + .cfi_label to force starting the FDE. */ +diff --git a/sysdeps/unix/sysv/linux/loongarch/getcontext.S b/sysdeps/unix/sysv/linux/loongarch/getcontext.S +index 9c28d958..6391850e 100644 +--- a/sysdeps/unix/sysv/linux/loongarch/getcontext.S ++++ b/sysdeps/unix/sysv/linux/loongarch/getcontext.S +@@ -21,7 +21,7 @@ + /* int getcontext (ucontext_t *ucp) */ + + .text +-LEAF (__getcontext) ++LEAF (__getcontext, 3) + SAVE_INT_REG (ra, 1, a0) + SAVE_INT_REG (sp, 3, a0) + SAVE_INT_REG (zero, 4, a0) /* return 0 by overwriting a0. */ +diff --git a/sysdeps/unix/sysv/linux/loongarch/setcontext.S b/sysdeps/unix/sysv/linux/loongarch/setcontext.S +index c96ec43c..3a043a63 100644 +--- a/sysdeps/unix/sysv/linux/loongarch/setcontext.S ++++ b/sysdeps/unix/sysv/linux/loongarch/setcontext.S +@@ -28,7 +28,7 @@ + other than the PRESERVED state. 
*/ + + .text +-LEAF (__setcontext) ++LEAF (__setcontext, 3) + + addi.d sp, sp, -16 + st.d a0, sp, 0 /* Save ucp to stack. */ +@@ -94,7 +94,7 @@ LEAF (__setcontext) + PSEUDO_END (__setcontext) + weak_alias (__setcontext, setcontext) + +-LEAF (__start_context) ++LEAF (__start_context, 3) + + /* Terminate call stack by noting ra == 0. Happily, s0 == 0 here. */ + cfi_register (1, 23) +diff --git a/sysdeps/unix/sysv/linux/loongarch/swapcontext.S b/sysdeps/unix/sysv/linux/loongarch/swapcontext.S +index d839dd87..c9024d5f 100644 +--- a/sysdeps/unix/sysv/linux/loongarch/swapcontext.S ++++ b/sysdeps/unix/sysv/linux/loongarch/swapcontext.S +@@ -20,7 +20,7 @@ + + /* int swapcontext (ucontext_t *oucp, const ucontext_t *ucp) */ + +-LEAF (__swapcontext) ++LEAF (__swapcontext, 3) + ori a2, sp, 0 /* Save sp to a2. */ + addi.d sp, sp, -16 + st.d a1, sp, 0 +diff --git a/sysdeps/unix/sysv/linux/loongarch/sysdep.S b/sysdeps/unix/sysv/linux/loongarch/sysdep.S +index a8094283..19c03fb4 100644 +--- a/sysdeps/unix/sysv/linux/loongarch/sysdep.S ++++ b/sysdeps/unix/sysv/linux/loongarch/sysdep.S +@@ -22,13 +22,13 @@ + # define errno __libc_errno + #endif + +-ENTRY (__syscall_error) ++ENTRY (__syscall_error, 3) + /* Fall through to __syscall_set_errno. */ + END (__syscall_error) + + /* Non-standard calling convention: argument in a0, return address in t0, + and clobber only t1. */ +-ENTRY (__syscall_set_errno) ++ENTRY (__syscall_set_errno, 3) + /* We got here because a0 < 0, but only codes in the range [-4095, -1] + represent errors. Otherwise, just return the result normally. */ + +diff --git a/sysdeps/unix/sysv/linux/loongarch/sysdep.h b/sysdeps/unix/sysv/linux/loongarch/sysdep.h +index f50946d4..7b45f609 100644 +--- a/sysdeps/unix/sysv/linux/loongarch/sysdep.h ++++ b/sysdeps/unix/sysv/linux/loongarch/sysdep.h +@@ -14,7 +14,7 @@ + errors by setting a0 to a value between -1 and -4095. 
*/ + # undef PSEUDO + # define PSEUDO(name, syscall_name, args) \ +- ENTRY (name); \ ++ ENTRY (name, 3); \ + li.d a7, SYS_ify (syscall_name); \ + syscall 0; \ + li.d a7, -4096; \ +@@ -58,7 +58,7 @@ + /* Performs a system call, not setting errno. */ + # undef PSEUDO_NEORRNO + # define PSEUDO_NOERRNO(name, syscall_name, args) \ +- ENTRY (name); \ ++ ENTRY (name, 3); \ + li.d a7, SYS_ify (syscall_name); \ + syscall 0; + +diff --git a/sysdeps/unix/sysv/linux/loongarch/vfork.S b/sysdeps/unix/sysv/linux/loongarch/vfork.S +index 83cf141f..5db6720a 100644 +--- a/sysdeps/unix/sysv/linux/loongarch/vfork.S ++++ b/sysdeps/unix/sysv/linux/loongarch/vfork.S +@@ -25,7 +25,7 @@ + replaced by a call to `execve'. Return -1 for errors, 0 to the new process, + and the process ID of the new process to the old process. */ + +-ENTRY (__vfork) ++ENTRY (__vfork, 3) + + + li.d a0, 0x4111 /* CLONE_VM | CLONE_VFORK | SIGCHLD */ +-- +2.33.0 + diff --git a/glibc.spec b/glibc.spec index 964c5f4..b5eb36a 100644 --- a/glibc.spec +++ b/glibc.spec @@ -133,7 +133,7 @@ end \ Summary: The GNU libc libraries Name: glibc Version: %{glibcversion} -Release: %{glibcrelease}.12 +Release: %{glibcrelease}.13 # In general, GPLv2+ is used by programs, LGPLv2+ is used for # libraries. 
@@ -1074,6 +1074,20 @@ Patch2006: glibc-Sync-to-lnd-35-for-LoongArch.patch Patch2007: Fix-tst-cancel21.c-to-suit-kernel-struct-sigcontext-.patch Patch2008: glibc-aarch64-Increase-small-and-medium-cases-for-__memcpy.patch Patch2009: glibc-Add-Hygon-Support.patch +Patch2010: glibc-2.28-Remove-unseless-ANDROID_CHANGES-and-relat.patch +Patch2011: glibc-2.28-use-new-macro-LEAF-and-ENTRY-and-modify-r.patch +Patch2012: glibc-2.28-Fix-ifunc-str-mem-functions-xfail-problem.patch +Patch2013: glibc-2.28-Add-run-one-test-convenience-target-and-m.patch +Patch2014: glibc-2.28-remove-ABILPX32-related-code.patch +Patch2015: glibc-2.28-Refactor-code-of-raw-mem-functions.patch +Patch2016: glibc-2.28-Refactor-code-of-st-r-p-functions.patch +Patch2017: glibc-2.28-Add-new-struct-user_fp_state-in-user.h.patch +Patch2018: glibc-2.28-Redefine-macro-LEAF-ENTRY.patch +Patch2019: glibc-2.28-config-Added-HAVE_LOONGARCH_VEC_ASM.patch +Patch2020: glibc-2.28-Add-macro-defination-of-lasx-lsx-and-fcc-.patch +Patch2021: glibc-2.28-Refactor-code-and-fix-bug-in-_dl_runtime_.patch +Patch2022: glibc-2.28-Remove-useless-IS_LA-264-364-464-and-IS_L.patch +Patch2023: glibc-2.28-Use-RTLD_SUPPORT_-LSX-LASX-to-choose-_dl_.patch ############################################################################## # Continued list of core "glibc" package information: ############################################################################## @@ -2908,6 +2922,9 @@ fi %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared %changelog +* Sat May 11 2024 Peng Fan - 2.28-236.0.1.13 +- Sync loongarch64 code to lnd.36. + * Mon May 06 2024 Rongwei Wang - 2.28-236.0.1.12 - elf: Properly align PT_LOAD segments - Sync loongarch64 code to lnd.35. (lixing@loongson.cn)