From d97d963796b092b9c0bd4712f992a08dd20bf5ed Mon Sep 17 00:00:00 2001 From: caiyinyu Date: Tue, 11 Jul 2023 15:40:15 +0800 Subject: [PATCH 11/14] glibc-2.28: Add macro defination of lasx lsx and fcc registers. Change-Id: Ic723521775a0133e25bf1d568c588f930ec5ff49 Signed-off-by: ticat_fp --- sysdeps/loongarch/dl-trampoline.h | 64 +-- .../loongarch/lp64/multiarch/memchr-lasx.S | 74 +-- sysdeps/loongarch/lp64/multiarch/memchr-lsx.S | 48 +- .../loongarch/lp64/multiarch/memcmp-lasx.S | 138 +++--- sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S | 194 ++++---- .../loongarch/lp64/multiarch/memmove-lasx.S | 160 +++---- .../loongarch/lp64/multiarch/memmove-lsx.S | 424 +++++++++--------- .../loongarch/lp64/multiarch/memrchr-lasx.S | 74 +-- .../loongarch/lp64/multiarch/memrchr-lsx.S | 48 +- .../loongarch/lp64/multiarch/memset-lasx.S | 64 +-- sysdeps/loongarch/lp64/multiarch/memset-lsx.S | 62 +-- .../loongarch/lp64/multiarch/rawmemchr-lasx.S | 30 +- .../loongarch/lp64/multiarch/rawmemchr-lsx.S | 30 +- sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S | 114 ++--- .../loongarch/lp64/multiarch/strchr-lasx.S | 52 +-- sysdeps/loongarch/lp64/multiarch/strchr-lsx.S | 30 +- sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 114 ++--- sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 112 ++--- .../loongarch/lp64/multiarch/strlen-lasx.S | 24 +- sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 30 +- .../loongarch/lp64/multiarch/strncmp-lsx.S | 144 +++--- .../loongarch/lp64/multiarch/strnlen-lasx.S | 46 +- .../loongarch/lp64/multiarch/strnlen-lsx.S | 30 +- .../loongarch/lp64/multiarch/strrchr-lasx.S | 88 ++-- .../loongarch/lp64/multiarch/strrchr-lsx.S | 56 +-- sysdeps/loongarch/lp64/s_cosf.S | 4 +- sysdeps/loongarch/lp64/s_sinf.S | 4 +- sysdeps/loongarch/sys/regdef.h | 74 +++ 28 files changed, 1203 insertions(+), 1129 deletions(-) diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h index fb15983f..96f41f1d 100644 --- a/sysdeps/loongarch/dl-trampoline.h +++ b/sysdeps/loongarch/dl-trampoline.h @@ -61,23 +61,23 @@ ENTRY (_dl_runtime_resolve, 3) FREG_S fa6, sp, 10*SZREG + 6*SZFREG FREG_S fa7, sp, 10*SZREG + 7*SZFREG #ifdef USE_LASX - xvst $xr0, sp, 10*SZREG + 0*256 - xvst $xr1, sp, 10*SZREG + 1*256 - xvst $xr2, sp, 10*SZREG + 2*256 - xvst $xr3, sp, 10*SZREG + 3*256 - xvst $xr4, sp, 10*SZREG + 4*256 - xvst $xr5, sp, 10*SZREG + 5*256 - xvst $xr6, sp, 10*SZREG + 6*256 - xvst $xr7, sp, 10*SZREG + 7*256 + xvst xr0, sp, 10*SZREG + 0*256 + xvst xr1, sp, 10*SZREG + 1*256 + xvst xr2, sp, 10*SZREG + 2*256 + xvst xr3, sp, 10*SZREG + 3*256 + xvst xr4, sp, 10*SZREG + 4*256 + xvst xr5, sp, 10*SZREG + 5*256 + xvst xr6, sp, 10*SZREG + 6*256 + xvst xr7, sp, 10*SZREG + 7*256 #elif defined USE_LSX - vst $vr0, sp, 10*SZREG + 0*128 - vst $vr1, sp, 10*SZREG + 1*128 - vst $vr2, sp, 10*SZREG + 2*128 - vst $vr3, sp, 10*SZREG + 3*128 - vst $vr4, sp, 10*SZREG + 4*128 - vst $vr5, sp, 10*SZREG + 5*128 - vst $vr6, sp, 10*SZREG + 6*128 - vst $vr7, sp, 10*SZREG + 7*128 + vst vr0, sp, 10*SZREG + 0*128 + vst vr1, sp, 10*SZREG + 1*128 + vst vr2, sp, 10*SZREG + 2*128 + vst vr3, sp, 10*SZREG + 3*128 + vst vr4, sp, 10*SZREG + 4*128 + vst vr5, sp, 10*SZREG + 5*128 + vst vr6, sp, 10*SZREG + 6*128 + vst vr7, sp, 10*SZREG + 7*128 #endif #endif @@ -119,23 +119,23 @@ ENTRY (_dl_runtime_resolve, 3) FREG_L fa6, sp, 10*SZREG + 6*SZFREG FREG_L fa7, sp, 10*SZREG + 7*SZFREG #ifdef USE_LASX - xvld $xr0, sp, 10*SZREG + 0*256 - xvld $xr1, sp, 10*SZREG + 1*256 - xvld $xr2, sp, 10*SZREG + 2*256 - xvld $xr3, sp, 10*SZREG + 3*256 - xvld $xr4, sp, 10*SZREG 
+ 4*256 - xvld $xr5, sp, 10*SZREG + 5*256 - xvld $xr6, sp, 10*SZREG + 6*256 - xvld $xr7, sp, 10*SZREG + 7*256 + xvld xr0, sp, 10*SZREG + 0*256 + xvld xr1, sp, 10*SZREG + 1*256 + xvld xr2, sp, 10*SZREG + 2*256 + xvld xr3, sp, 10*SZREG + 3*256 + xvld xr4, sp, 10*SZREG + 4*256 + xvld xr5, sp, 10*SZREG + 5*256 + xvld xr6, sp, 10*SZREG + 6*256 + xvld xr7, sp, 10*SZREG + 7*256 #elif defined USE_LSX - vld $vr0, sp, 10*SZREG + 0*128 - vld $vr1, sp, 10*SZREG + 1*128 - vld $vr2, sp, 10*SZREG + 2*128 - vld $vr3, sp, 10*SZREG + 3*128 - vld $vr4, sp, 10*SZREG + 4*128 - vld $vr5, sp, 10*SZREG + 5*128 - vld $vr6, sp, 10*SZREG + 6*128 - vld $vr7, sp, 10*SZREG + 7*128 + vld vr0, sp, 10*SZREG + 0*128 + vld vr1, sp, 10*SZREG + 1*128 + vld vr2, sp, 10*SZREG + 2*128 + vld vr3, sp, 10*SZREG + 3*128 + vld vr4, sp, 10*SZREG + 4*128 + vld vr5, sp, 10*SZREG + 5*128 + vld vr6, sp, 10*SZREG + 6*128 + vld vr7, sp, 10*SZREG + 7*128 #endif #endif diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S index 387a35fe..425fcede 100644 --- a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S +++ b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S @@ -17,28 +17,28 @@ LEAF(MEMCHR, 6) andi t0, a0, 0x3f bstrins.d a0, zero, 5, 0 - xvld $xr0, a0, 0 - xvld $xr1, a0, 32 + xvld xr0, a0, 0 + xvld xr1, a0, 32 li.d t1, -1 li.d t2, 64 - xvreplgr2vr.b $xr2, a1 + xvreplgr2vr.b xr2, a1 sll.d t3, t1, t0 sub.d t2, t2, t0 - xvseq.b $xr0, $xr0, $xr2 + xvseq.b xr0, xr0, xr2 - xvseq.b $xr1, $xr1, $xr2 - xvmsknz.b $xr0, $xr0 - xvmsknz.b $xr1, $xr1 - xvpickve.w $xr3, $xr0, 4 + xvseq.b xr1, xr1, xr2 + xvmsknz.b xr0, xr0 + xvmsknz.b xr1, xr1 + xvpickve.w xr3, xr0, 4 - xvpickve.w $xr4, $xr1, 4 - vilvl.h $vr0, $vr3, $vr0 - vilvl.h $vr1, $vr4, $vr1 - vilvl.w $vr0, $vr1, $vr0 + xvpickve.w xr4, xr1, 4 + vilvl.h vr0, vr3, vr0 + vilvl.h vr1, vr4, vr1 + vilvl.w vr0, vr1, vr0 - movfr2gr.d t0, $f0 + movfr2gr.d t0, fa0 and t0, t0, t3 bgeu t2, a2, L(end) bnez t0, L(found) @@ -46,28 +46,28 @@ LEAF(MEMCHR, 6) addi.d a4, a3, -1 bstrins.d a4, zero, 5, 0 L(loop): - xvld $xr0, a0, 64 - xvld $xr1, a0, 96 + xvld xr0, a0, 64 + xvld xr1, a0, 96 addi.d a0, a0, 64 - xvseq.b $xr0, $xr0, $xr2 - xvseq.b $xr1, $xr1, $xr2 + xvseq.b xr0, xr0, xr2 + xvseq.b xr1, xr1, xr2 beq a0, a4, L(out) - xvmax.bu $xr3, $xr0, $xr1 - xvseteqz.v $fcc0, $xr3 - bcnez $fcc0, L(loop) - xvmsknz.b $xr0, $xr0 + xvmax.bu xr3, xr0, xr1 + xvseteqz.v fcc0, xr3 + bcnez fcc0, L(loop) + xvmsknz.b xr0, xr0 - xvmsknz.b $xr1, $xr1 - xvpickve.w $xr3, $xr0, 4 - xvpickve.w $xr4, $xr1, 4 - vilvl.h $vr0, $vr3, $vr0 + xvmsknz.b xr1, xr1 + xvpickve.w xr3, xr0, 4 + xvpickve.w xr4, xr1, 4 + vilvl.h vr0, vr3, vr0 - vilvl.h $vr1, $vr4, $vr1 - vilvl.w $vr0, $vr1, $vr0 - movfr2gr.d t0, $f0 + vilvl.h vr1, vr4, vr1 + vilvl.w vr0, vr1, vr0 + movfr2gr.d t0, fa0 L(found): ctz.d t1, t0 @@ -79,15 +79,15 @@ L(ret0): L(out): - xvmsknz.b $xr0, $xr0 - xvmsknz.b $xr1, $xr1 - xvpickve.w $xr3, $xr0, 4 - xvpickve.w $xr4, $xr1, 4 - - vilvl.h $vr0, $vr3, $vr0 - vilvl.h $vr1, $vr4, $vr1 - vilvl.w $vr0, $vr1, $vr0 - movfr2gr.d t0, $f0 + xvmsknz.b xr0, xr0 + xvmsknz.b xr1, xr1 + xvpickve.w xr3, xr0, 4 + xvpickve.w xr4, xr1, 4 + + vilvl.h vr0, vr3, vr0 + vilvl.h vr1, vr4, vr1 + vilvl.w vr0, vr1, vr0 + movfr2gr.d t0, fa0 L(end): sub.d t2, zero, a3 diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S index c6952657..08a630d3 100644 --- a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S +++ b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S @@ -17,23 
+17,23 @@ LEAF(MEMCHR, 6) andi t0, a0, 0x1f bstrins.d a0, zero, 4, 0 - vld $vr0, a0, 0 - vld $vr1, a0, 16 + vld vr0, a0, 0 + vld vr1, a0, 16 li.d t1, -1 li.d t2, 32 - vreplgr2vr.b $vr2, a1 + vreplgr2vr.b vr2, a1 sll.d t3, t1, t0 sub.d t2, t2, t0 - vseq.b $vr0, $vr0, $vr2 + vseq.b vr0, vr0, vr2 - vseq.b $vr1, $vr1, $vr2 - vmsknz.b $vr0, $vr0 - vmsknz.b $vr1, $vr1 - vilvl.h $vr0, $vr1, $vr0 + vseq.b vr1, vr1, vr2 + vmsknz.b vr0, vr0 + vmsknz.b vr1, vr1 + vilvl.h vr0, vr1, vr0 - movfr2gr.s t0, $f0 + movfr2gr.s t0, fa0 and t0, t0, t3 bgeu t2, a2, L(end) bnez t0, L(found) @@ -41,23 +41,23 @@ LEAF(MEMCHR, 6) addi.d a4, a3, -1 bstrins.d a4, zero, 4, 0 L(loop): - vld $vr0, a0, 32 - vld $vr1, a0, 48 + vld vr0, a0, 32 + vld vr1, a0, 48 addi.d a0, a0, 32 - vseq.b $vr0, $vr0, $vr2 - vseq.b $vr1, $vr1, $vr2 + vseq.b vr0, vr0, vr2 + vseq.b vr1, vr1, vr2 beq a0, a4, L(out) - vmax.bu $vr3, $vr0, $vr1 - vseteqz.v $fcc0, $vr3 - bcnez $fcc0, L(loop) - vmsknz.b $vr0, $vr0 + vmax.bu vr3, vr0, vr1 + vseteqz.v fcc0, vr3 + bcnez fcc0, L(loop) + vmsknz.b vr0, vr0 - vmsknz.b $vr1, $vr1 - vilvl.h $vr0, $vr1, $vr0 - movfr2gr.s t0, $f0 + vmsknz.b vr1, vr1 + vilvl.h vr0, vr1, vr0 + movfr2gr.s t0, fa0 L(found): ctz.w t0, t0 @@ -68,10 +68,10 @@ L(ret0): jr ra L(out): - vmsknz.b $vr0, $vr0 - vmsknz.b $vr1, $vr1 - vilvl.h $vr0, $vr1, $vr0 - movfr2gr.s t0, $f0 + vmsknz.b vr0, vr0 + vmsknz.b vr1, vr1 + vilvl.h vr0, vr1, vr0 + movfr2gr.s t0, fa0 L(end): sub.d t2, zero, a3 diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S index 9151d38d..2c192954 100644 --- a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S +++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S @@ -20,39 +20,39 @@ LEAF(MEMCMP, 6) li.d t1, 160 bgeu a2, t1, L(make_aligned) # a2 >= 160 L(loop32): - xvld $xr0, a0, 0 - xvld $xr1, a1, 0 + xvld xr0, a0, 0 + xvld xr1, a1, 0 addi.d a0, a0, 32 addi.d a1, a1, 32 addi.d a2, a2, -32 - xvseq.b $xr2, $xr0, $xr1 + xvseq.b xr2, xr0, xr1 - xvsetanyeqz.b $fcc0, $xr2 - bcnez $fcc0, L(end) + xvsetanyeqz.b fcc0, xr2 + bcnez fcc0, L(end) L(last_bytes): bltu t2, a2, L(loop32) - xvld $xr0, a3, -32 + xvld xr0, a3, -32 - xvld $xr1, a4, -32 - xvseq.b $xr2, $xr0, $xr1 + xvld xr1, a4, -32 + xvseq.b xr2, xr0, xr1 L(end): - xvmsknz.b $xr2, $xr2 - xvpermi.q $xr4, $xr0, 1 + xvmsknz.b xr2, xr2 + xvpermi.q xr4, xr0, 1 - xvpickve.w $xr3, $xr2, 4 - xvpermi.q $xr5, $xr1, 1 - vilvl.h $vr2, $vr3, $vr2 - movfr2gr.s t0, $f2 + xvpickve.w xr3, xr2, 4 + xvpermi.q xr5, xr1, 1 + vilvl.h vr2, vr3, vr2 + movfr2gr.s t0, fa2 cto.w t0, t0 - vreplgr2vr.b $vr2, t0 - vshuf.b $vr0, $vr4, $vr0, $vr2 - vshuf.b $vr1, $vr5, $vr1, $vr2 + vreplgr2vr.b vr2, t0 + vshuf.b vr0, vr4, vr0, vr2 + vshuf.b vr1, vr5, vr1, vr2 - vpickve2gr.bu t0, $vr0, 0 - vpickve2gr.bu t1, $vr1, 0 + vpickve2gr.bu t0, vr0, 0 + vpickve2gr.bu t1, vr1, 0 sub.d a0, t0, t1 jr ra @@ -60,59 +60,59 @@ L(end): L(less32): srli.d t0, a2, 4 beqz t0, L(less16) - vld $vr0, a0, 0 - vld $vr1, a1, 0 + vld vr0, a0, 0 + vld vr1, a1, 0 - vld $vr2, a3, -16 - vld $vr3, a4, -16 + vld vr2, a3, -16 + vld vr3, a4, -16 L(short_ret): - vseq.b $vr4, $vr0, $vr1 - vseq.b $vr5, $vr2, $vr3 + vseq.b vr4, vr0, vr1 + vseq.b vr5, vr2, vr3 - vmsknz.b $vr4, $vr4 - vmsknz.b $vr5, $vr5 - vilvl.h $vr4, $vr5, $vr4 - movfr2gr.s t0, $f4 + vmsknz.b vr4, vr4 + vmsknz.b vr5, vr5 + vilvl.h vr4, vr5, vr4 + movfr2gr.s t0, fa4 cto.w t0, t0 - vreplgr2vr.b $vr4, t0 - vshuf.b $vr0, $vr2, $vr0, $vr4 - vshuf.b $vr1, $vr3, $vr1, $vr4 + vreplgr2vr.b vr4, t0 + vshuf.b vr0, vr2, vr0, vr4 + vshuf.b vr1, 
vr3, vr1, vr4 - vpickve2gr.bu t0, $vr0, 0 - vpickve2gr.bu t1, $vr1, 0 + vpickve2gr.bu t0, vr0, 0 + vpickve2gr.bu t1, vr1, 0 sub.d a0, t0, t1 jr ra L(less16): srli.d t0, a2, 3 beqz t0, L(less8) - vldrepl.d $vr0, a0, 0 - vldrepl.d $vr1, a1, 0 + vldrepl.d vr0, a0, 0 + vldrepl.d vr1, a1, 0 - vldrepl.d $vr2, a3, -8 - vldrepl.d $vr3, a4, -8 + vldrepl.d vr2, a3, -8 + vldrepl.d vr3, a4, -8 b L(short_ret) L(less8): srli.d t0, a2, 2 beqz t0, L(less4) - vldrepl.w $vr0, a0, 0 - vldrepl.w $vr1, a1, 0 - vldrepl.w $vr2, a3, -4 + vldrepl.w vr0, a0, 0 + vldrepl.w vr1, a1, 0 + vldrepl.w vr2, a3, -4 - vldrepl.w $vr3, a4, -4 + vldrepl.w vr3, a4, -4 b L(short_ret) L(less4): srli.d t0, a2, 1 beqz t0, L(less2) - vldrepl.h $vr0, a0, 0 - vldrepl.h $vr1, a1, 0 - vldrepl.h $vr2, a3, -2 - vldrepl.h $vr3, a4, -2 + vldrepl.h vr0, a0, 0 + vldrepl.h vr1, a1, 0 + vldrepl.h vr2, a3, -2 + vldrepl.h vr3, a4, -2 b L(short_ret) L(less2): @@ -132,12 +132,12 @@ L(ret0): nop /* make src1 aligned, and adjust scr2 and length. */ L(make_aligned): - xvld $xr0, a0, 0 + xvld xr0, a0, 0 - xvld $xr1, a1, 0 - xvseq.b $xr2, $xr0, $xr1 - xvsetanyeqz.b $fcc0, $xr2 - bcnez $fcc0, L(end) + xvld xr1, a1, 0 + xvseq.b xr2, xr0, xr1 + xvsetanyeqz.b fcc0, xr2 + bcnez fcc0, L(end) andi t0, a0, 0x1f sub.d t0, t2, t0 @@ -151,17 +151,17 @@ L(make_aligned): L(loop_align): - xvld $xr0, a0, 0 - xvld $xr1, a1, 0 - xvld $xr2, a0, 32 - xvld $xr3, a1, 32 + xvld xr0, a0, 0 + xvld xr1, a1, 0 + xvld xr2, a0, 32 + xvld xr3, a1, 32 - xvseq.b $xr0, $xr0, $xr1 - xvseq.b $xr1, $xr2, $xr3 - xvmin.bu $xr2, $xr1, $xr0 - xvsetanyeqz.b $fcc0, $xr2 + xvseq.b xr0, xr0, xr1 + xvseq.b xr1, xr2, xr3 + xvmin.bu xr2, xr1, xr0 + xvsetanyeqz.b fcc0, xr2 - bcnez $fcc0, L(pair_end) + bcnez fcc0, L(pair_end) addi.d a0, a0, 64 addi.d a1, a1, 64 bne a0, a5, L(loop_align) @@ -173,15 +173,15 @@ L(loop_align): L(pair_end): - xvmsknz.b $xr0, $xr0 - xvmsknz.b $xr1, $xr1 - xvpickve.w $xr2, $xr0, 4 - xvpickve.w $xr3, $xr1, 4 - - vilvl.h $vr0, $vr2, $vr0 - vilvl.h $vr1, $vr3, $vr1 - vilvl.w $vr0, $vr1, $vr0 - movfr2gr.d t0, $f0 + xvmsknz.b xr0, xr0 + xvmsknz.b xr1, xr1 + xvpickve.w xr2, xr0, 4 + xvpickve.w xr3, xr1, 4 + + vilvl.h vr0, vr2, vr0 + vilvl.h vr1, vr3, vr1 + vilvl.w vr0, vr1, vr0 + movfr2gr.d t0, fa0 cto.d t0, t0 ldx.bu t1, a0, t0 diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S index 8535aa22..b407275f 100644 --- a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S +++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S @@ -21,28 +21,28 @@ ENTRY_NO_ALIGN(MEMCMP) pcaddi t0, -7 andi a3, a0, 0xf - vld $vr5, t0, 0 + vld vr5, t0, 0 andi a4, a1, 0xf bne a3, a4, L(unaligned) bstrins.d a0, zero, 3, 0 xor a1, a1, a4 - vld $vr0, a0, 0 - vld $vr1, a1, 0 + vld vr0, a0, 0 + vld vr1, a1, 0 li.d t0, 16 - vreplgr2vr.b $vr3, a3 + vreplgr2vr.b vr3, a3 sub.d t1, t0, a3 - vadd.b $vr3, $vr3, $vr5 + vadd.b vr3, vr3, vr5 - vshuf.b $vr0, $vr3, $vr0, $vr3 - vshuf.b $vr1, $vr3, $vr1, $vr3 - vseq.b $vr4, $vr0, $vr1 + vshuf.b vr0, vr3, vr0, vr3 + vshuf.b vr1, vr3, vr1, vr3 + vseq.b vr4, vr0, vr1 bgeu t1, a2, L(al_end) - vsetanyeqz.b $fcc0, $vr4 - bcnez $fcc0, L(al_found) + vsetanyeqz.b fcc0, vr4 + bcnez fcc0, L(al_found) sub.d a2, a2, t1 andi t1, a2, 31 @@ -53,70 +53,70 @@ ENTRY_NO_ALIGN(MEMCMP) L(al_loop): - vld $vr0, a0, 16 - vld $vr1, a1, 16 - vld $vr2, a0, 32 - vld $vr3, a1, 32 + vld vr0, a0, 16 + vld vr1, a1, 16 + vld vr2, a0, 32 + vld vr3, a1, 32 addi.d a0, a0, 32 addi.d a1, a1, 32 - vseq.b $vr4, $vr0, $vr1 - vseq.b $vr6, $vr2, $vr3 + vseq.b vr4, vr0, vr1 
+ vseq.b vr6, vr2, vr3 - vand.v $vr6, $vr4, $vr6 - vsetanyeqz.b $fcc0, $vr6 - bcnez $fcc0, L(al_pair_end) + vand.v vr6, vr4, vr6 + vsetanyeqz.b fcc0, vr6 + bcnez fcc0, L(al_pair_end) bne a0, a4, L(al_loop) L(al_less_32bytes): bgeu t0, a2, L(al_less_16bytes) - vld $vr0, a0, 16 - vld $vr1, a1, 16 - vld $vr2, a0, 32 + vld vr0, a0, 16 + vld vr1, a1, 16 + vld vr2, a0, 32 - vld $vr3, a1, 32 + vld vr3, a1, 32 addi.d a2, a2, -16 - vreplgr2vr.b $vr6, a2 - vslt.b $vr5, $vr5, $vr6 + vreplgr2vr.b vr6, a2 + vslt.b vr5, vr5, vr6 - vseq.b $vr4, $vr0, $vr1 - vseq.b $vr6, $vr2, $vr3 - vorn.v $vr6, $vr6, $vr5 + vseq.b vr4, vr0, vr1 + vseq.b vr6, vr2, vr3 + vorn.v vr6, vr6, vr5 L(al_pair_end): - vsetanyeqz.b $fcc0, $vr4 + vsetanyeqz.b fcc0, vr4 - bcnez $fcc0, L(al_found) - vnori.b $vr4, $vr6, 0 - vfrstpi.b $vr4, $vr4, 0 - vshuf.b $vr0, $vr2, $vr2, $vr4 + bcnez fcc0, L(al_found) + vnori.b vr4, vr6, 0 + vfrstpi.b vr4, vr4, 0 + vshuf.b vr0, vr2, vr2, vr4 - vshuf.b $vr1, $vr3, $vr3, $vr4 - vpickve2gr.bu t0, $vr0, 0 - vpickve2gr.bu t1, $vr1, 0 + vshuf.b vr1, vr3, vr3, vr4 + vpickve2gr.bu t0, vr0, 0 + vpickve2gr.bu t1, vr1, 0 sub.d a0, t0, t1 jr ra L(al_less_16bytes): beqz a2, L(out) - vld $vr0, a0, 16 - vld $vr1, a1, 16 + vld vr0, a0, 16 + vld vr1, a1, 16 - vseq.b $vr4, $vr0, $vr1 + vseq.b vr4, vr0, vr1 L(al_end): - vreplgr2vr.b $vr6, a2 - vslt.b $vr5, $vr5, $vr6 - vorn.v $vr4, $vr4, $vr5 + vreplgr2vr.b vr6, a2 + vslt.b vr5, vr5, vr6 + vorn.v vr4, vr4, vr5 L(al_found): - vnori.b $vr4, $vr4, 0 - vfrstpi.b $vr4, $vr4, 0 - vshuf.b $vr0, $vr0, $vr0, $vr4 - vshuf.b $vr1, $vr1, $vr1, $vr4 + vnori.b vr4, vr4, 0 + vfrstpi.b vr4, vr4, 0 + vshuf.b vr0, vr0, vr0, vr4 + vshuf.b vr1, vr1, vr1, vr4 - vpickve2gr.bu t0, $vr0, 0 - vpickve2gr.bu t1, $vr1, 0 + vpickve2gr.bu t0, vr0, 0 + vpickve2gr.bu t1, vr1, 0 sub.d a0, t0, t1 jr ra @@ -133,28 +133,28 @@ L(unaligned): bstrins.d a0, zero, 3, 0 xor a1, a1, a4 - vld $vr4, a0, 0 - vld $vr1, a1, 0 + vld vr4, a0, 0 + vld vr1, a1, 0 li.d t0, 16 - vreplgr2vr.b $vr2, a4 + vreplgr2vr.b vr2, a4 sub.d a6, a4, a3 # a6 hold the diff sub.d t1, t0, a4 sub.d t2, t0, a6 - vadd.b $vr2, $vr2, $vr5 # [4, 5, 6, ...] - vreplgr2vr.b $vr6, t2 - vadd.b $vr6, $vr6, $vr5 # [14, 15, 16, ... ] - vshuf.b $vr0, $vr4, $vr4, $vr6 # make data be in the same position + vadd.b vr2, vr2, vr5 # [4, 5, 6, ...] + vreplgr2vr.b vr6, t2 + vadd.b vr6, vr6, vr5 # [14, 15, 16, ... 
] + vshuf.b vr0, vr4, vr4, vr6 # make data be in the same position - vshuf.b $vr1, $vr2, $vr1, $vr2 - vshuf.b $vr0, $vr2, $vr0, $vr2 - vseq.b $vr7, $vr0, $vr1 + vshuf.b vr1, vr2, vr1, vr2 + vshuf.b vr0, vr2, vr0, vr2 + vseq.b vr7, vr0, vr1 bgeu t1, a2, L(un_end) - vsetanyeqz.b $fcc0, $vr7 - bcnez $fcc0, L(un_found) + vsetanyeqz.b fcc0, vr7 + bcnez fcc0, L(un_found) sub.d a2, a2, t1 andi t1, a2, 31 @@ -165,63 +165,63 @@ L(unaligned): L(un_loop): - vld $vr2, a0, 16 - vld $vr1, a1, 16 - vld $vr3, a1, 32 + vld vr2, a0, 16 + vld vr1, a1, 16 + vld vr3, a1, 32 addi.d a1, a1, 32 addi.d a0, a0, 32 - vshuf.b $vr0, $vr2, $vr4, $vr6 - vld $vr4, a0, 0 - vseq.b $vr7, $vr0, $vr1 + vshuf.b vr0, vr2, vr4, vr6 + vld vr4, a0, 0 + vseq.b vr7, vr0, vr1 - vshuf.b $vr2, $vr4, $vr2, $vr6 - vseq.b $vr8, $vr2, $vr3 - vand.v $vr8, $vr7, $vr8 - vsetanyeqz.b $fcc0, $vr8 + vshuf.b vr2, vr4, vr2, vr6 + vseq.b vr8, vr2, vr3 + vand.v vr8, vr7, vr8 + vsetanyeqz.b fcc0, vr8 - bcnez $fcc0, L(un_pair_end) + bcnez fcc0, L(un_pair_end) bne a1, a4, L(un_loop) L(un_less_32bytes): bltu a2, t0, L(un_less_16bytes) - vld $vr2, a0, 16 + vld vr2, a0, 16 - vld $vr1, a1, 16 + vld vr1, a1, 16 addi.d a0, a0, 16 addi.d a1, a1, 16 addi.d a2, a2, -16 - vshuf.b $vr0, $vr2, $vr4, $vr6 - vor.v $vr4, $vr2, $vr2 - vseq.b $vr7, $vr0, $vr1 - vsetanyeqz.b $fcc0, $vr7 + vshuf.b vr0, vr2, vr4, vr6 + vor.v vr4, vr2, vr2 + vseq.b vr7, vr0, vr1 + vsetanyeqz.b fcc0, vr7 - bcnez $fcc0, L(un_found) + bcnez fcc0, L(un_found) L(un_less_16bytes): beqz a2, L(out) - vld $vr1, a1, 16 + vld vr1, a1, 16 bgeu a6, a2, 1f - vld $vr2, a0, 16 + vld vr2, a0, 16 1: - vshuf.b $vr0, $vr2, $vr4, $vr6 - vseq.b $vr7, $vr0, $vr1 + vshuf.b vr0, vr2, vr4, vr6 + vseq.b vr7, vr0, vr1 L(un_end): - vreplgr2vr.b $vr3, a2 + vreplgr2vr.b vr3, a2 - vslt.b $vr3, $vr5, $vr3 - vorn.v $vr7, $vr7, $vr3 + vslt.b vr3, vr5, vr3 + vorn.v vr7, vr7, vr3 L(un_found): - vnori.b $vr7, $vr7, 0 - vfrstpi.b $vr7, $vr7, 0 + vnori.b vr7, vr7, 0 + vfrstpi.b vr7, vr7, 0 - vshuf.b $vr0, $vr0, $vr0, $vr7 - vshuf.b $vr1, $vr1, $vr1, $vr7 + vshuf.b vr0, vr0, vr0, vr7 + vshuf.b vr1, vr1, vr1, vr7 L(calc_result): - vpickve2gr.bu t0, $vr0, 0 - vpickve2gr.bu t1, $vr1, 0 + vpickve2gr.bu t0, vr0, 0 + vpickve2gr.bu t1, vr1, 0 sub.d t2, t0, t1 sub.d t3, t1, t0 @@ -231,14 +231,14 @@ L(calc_result): or a0, t0, t1 jr ra L(un_pair_end): - vsetanyeqz.b $fcc0, $vr7 - bcnez $fcc0, L(un_found) + vsetanyeqz.b fcc0, vr7 + bcnez fcc0, L(un_found) - vnori.b $vr7, $vr8, 0 - vfrstpi.b $vr7, $vr7, 0 - vshuf.b $vr0, $vr2, $vr2, $vr7 - vshuf.b $vr1, $vr3, $vr3, $vr7 + vnori.b vr7, vr8, 0 + vfrstpi.b vr7, vr7, 0 + vshuf.b vr0, vr2, vr2, vr7 + vshuf.b vr1, vr3, vr3, vr7 b L(calc_result) L(out): diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S index e8b2c441..c317592f 100644 --- a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S +++ b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S @@ -26,22 +26,22 @@ LEAF(MEMCPY_NAME, 6) li.d t1, 64 bltu t1, a2, L(copy_long) # a2 > 64 - xvld $xr0, a1, 0 - xvld $xr1, a4, -32 + xvld xr0, a1, 0 + xvld xr1, a4, -32 - xvst $xr0, a0, 0 - xvst $xr1, a3, -32 + xvst xr0, a0, 0 + xvst xr1, a3, -32 jr ra L(less_32bytes): srli.d t0, a2, 4 beqz t0, L(less_16bytes) - vld $vr0, a1, 0 - vld $vr1, a4, -16 - vst $vr0, a0, 0 + vld vr0, a1, 0 + vld vr1, a4, -16 + vst vr0, a0, 0 - vst $vr1, a3, -16 + vst vr1, a3, -16 jr ra L(less_16bytes): srli.d t0, a2, 3 @@ -91,11 +91,11 @@ LEAF(MEMMOVE_NAME, 6) li.d t1, 64 bltu t1, a2, L(move_long) # a2 > 64 - xvld $xr0, a1, 0 - 
xvld $xr1, a4, -32 + xvld xr0, a1, 0 + xvld xr1, a4, -32 - xvst $xr0, a0, 0 - xvst $xr1, a3, -32 + xvst xr0, a0, 0 + xvst xr1, a3, -32 jr ra L(move_long): sub.d t2, a0, a1 @@ -107,8 +107,8 @@ L(copy_long): sub.d t2, t0, t2 - xvld $xr8, a1, 0 - xvld $xr9, a4, -32 + xvld xr8, a1, 0 + xvld xr9, a4, -32 sub.d t3, a2, t2 add.d a5, a0, t2 @@ -119,69 +119,69 @@ L(copy_long): addi.d a6, a6, -1 L(loop_256): - xvld $xr0, a1, 0 - xvld $xr1, a1, 32 - xvld $xr2, a1, 64 + xvld xr0, a1, 0 + xvld xr1, a1, 32 + xvld xr2, a1, 64 - xvld $xr3, a1, 96 - xvld $xr4, a1, 128 - xvld $xr5, a1, 160 - xvld $xr6, a1, 192 + xvld xr3, a1, 96 + xvld xr4, a1, 128 + xvld xr5, a1, 160 + xvld xr6, a1, 192 - xvld $xr7, a1, 224 + xvld xr7, a1, 224 addi.d a1, a1, 256 - xvst $xr0, a5, 0 - xvst $xr1, a5, 32 + xvst xr0, a5, 0 + xvst xr1, a5, 32 - xvst $xr2, a5, 64 - xvst $xr3, a5, 96 - xvst $xr4, a5, 128 - xvst $xr5, a5, 160 + xvst xr2, a5, 64 + xvst xr3, a5, 96 + xvst xr4, a5, 128 + xvst xr5, a5, 160 - xvst $xr6, a5, 192 - xvst $xr7, a5, 224 + xvst xr6, a5, 192 + xvst xr7, a5, 224 addi.d a5, a5, 256 bne a1, a6, L(loop_256) L(lt256): srli.d t2, a2, 7 beqz t2, L(lt128) - xvld $xr0, a1, 0 - xvld $xr1, a1, 32 + xvld xr0, a1, 0 + xvld xr1, a1, 32 - xvld $xr2, a1, 64 - xvld $xr3, a1, 96 + xvld xr2, a1, 64 + xvld xr3, a1, 96 addi.d a1, a1, 128 addi.d a2, a2, -128 - xvst $xr0, a5, 0 - xvst $xr1, a5, 32 - xvst $xr2, a5, 64 - xvst $xr3, a5, 96 + xvst xr0, a5, 0 + xvst xr1, a5, 32 + xvst xr2, a5, 64 + xvst xr3, a5, 96 addi.d a5, a5, 128 L(lt128): bltu a2, t1, L(lt64) - xvld $xr0, a1, 0 - xvld $xr1, a1, 32 + xvld xr0, a1, 0 + xvld xr1, a1, 32 addi.d a1, a1, 64 addi.d a2, a2, -64 - xvst $xr0, a5, 0 - xvst $xr1, a5, 32 + xvst xr0, a5, 0 + xvst xr1, a5, 32 addi.d a5, a5, 64 L(lt64): bltu a2, t0, L(lt32) - xvld $xr0, a1, 0 - xvst $xr0, a5, 0 + xvld xr0, a1, 0 + xvst xr0, a5, 0 L(lt32): - xvst $xr8, a0, 0 - xvst $xr9, a3, -32 + xvst xr8, a0, 0 + xvst xr9, a3, -32 jr ra nop @@ -189,9 +189,9 @@ L(copy_back): addi.d a3, a3, -1 addi.d a2, a2, -2 andi t2, a3, 0x1f - xvld $xr8, a1, 0 + xvld xr8, a1, 0 - xvld $xr9, a4, -32 + xvld xr9, a4, -32 sub.d t3, a2, t2 sub.d a5, a3, t2 sub.d a4, a4, t2 @@ -203,69 +203,69 @@ L(copy_back): addi.d a6, a6, 2 L(back_loop_256): - xvld $xr0, a4, -33 - xvld $xr1, a4, -65 - xvld $xr2, a4, -97 - xvld $xr3, a4, -129 + xvld xr0, a4, -33 + xvld xr1, a4, -65 + xvld xr2, a4, -97 + xvld xr3, a4, -129 - xvld $xr4, a4, -161 - xvld $xr5, a4, -193 - xvld $xr6, a4, -225 - xvld $xr7, a4, -257 + xvld xr4, a4, -161 + xvld xr5, a4, -193 + xvld xr6, a4, -225 + xvld xr7, a4, -257 addi.d a4, a4, -256 - xvst $xr0, a5, -32 - xvst $xr1, a5, -64 - xvst $xr2, a5, -96 + xvst xr0, a5, -32 + xvst xr1, a5, -64 + xvst xr2, a5, -96 - xvst $xr3, a5, -128 - xvst $xr4, a5, -160 - xvst $xr5, a5, -192 - xvst $xr6, a5, -224 + xvst xr3, a5, -128 + xvst xr4, a5, -160 + xvst xr5, a5, -192 + xvst xr6, a5, -224 - xvst $xr7, a5, -256 + xvst xr7, a5, -256 addi.d a5, a5, -256 bne a4, a6, L(back_loop_256) L(back_lt256): srli.d t2, a2, 7 beqz t2, L(back_lt128) - xvld $xr0, a4, -33 - xvld $xr1, a4, -65 - xvld $xr2, a4, -97 + xvld xr0, a4, -33 + xvld xr1, a4, -65 + xvld xr2, a4, -97 - xvld $xr3, a4, -129 + xvld xr3, a4, -129 addi.d a2, a2, -128 addi.d a4, a4, -128 - xvst $xr0, a5, -32 + xvst xr0, a5, -32 - xvst $xr1, a5, -64 - xvst $xr2, a5, -96 - xvst $xr3, a5, -128 + xvst xr1, a5, -64 + xvst xr2, a5, -96 + xvst xr3, a5, -128 addi.d a5, a5, -128 L(back_lt128): blt a2, t1, L(back_lt64) - xvld $xr0, a4, -33 - xvld $xr1, a4, -65 + xvld xr0, a4, -33 + xvld xr1, a4, -65 
addi.d a2, a2, -64 addi.d a4, a4, -64 - xvst $xr0, a5, -32 - xvst $xr1, a5, -64 + xvst xr0, a5, -32 + xvst xr1, a5, -64 addi.d a5, a5, -64 L(back_lt64): bltu a2, t0, L(back_lt32) - xvld $xr0, a4, -33 - xvst $xr0, a5, -32 + xvld xr0, a4, -33 + xvst xr0, a5, -32 L(back_lt32): - xvst $xr8, a0, 0 + xvst xr8, a0, 0 - xvst $xr9, a3, -31 + xvst xr9, a3, -31 jr ra END(MEMMOVE_NAME) diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S index 90f89c7a..77f1b4ab 100644 --- a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S +++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S @@ -23,54 +23,54 @@ LEAF(MEMCPY_NAME, 6) bltu t8, a2, L(copy_long) # a2 > 64 bltu t7, a2, L(more_32bytes) # a2 > 32 - vld $vr0, a1, 0 - vld $vr1, a4, -16 - vst $vr0, a0, 0 - vst $vr1, a3, -16 + vld vr0, a1, 0 + vld vr1, a4, -16 + vst vr0, a0, 0 + vst vr1, a3, -16 jr ra L(more_32bytes): - vld $vr0, a1, 0 - vld $vr1, a1, 16 - vld $vr2, a4, -32 + vld vr0, a1, 0 + vld vr1, a1, 16 + vld vr2, a4, -32 - vld $vr3, a4, -16 - vst $vr0, a0, 0 - vst $vr1, a0, 16 - vst $vr2, a3, -32 + vld vr3, a4, -16 + vst vr0, a0, 0 + vst vr1, a0, 16 + vst vr2, a3, -32 - vst $vr3, a3, -16 + vst vr3, a3, -16 jr ra L(less_16bytes): srli.d t0, a2, 3 beqz t0, L(less_8bytes) - vldrepl.d $vr0, a1, 0 - vldrepl.d $vr1, a4, -8 - vstelm.d $vr0, a0, 0, 0 - vstelm.d $vr1, a3, -8, 0 + vldrepl.d vr0, a1, 0 + vldrepl.d vr1, a4, -8 + vstelm.d vr0, a0, 0, 0 + vstelm.d vr1, a3, -8, 0 jr ra L(less_8bytes): srli.d t0, a2, 2 beqz t0, L(less_4bytes) - vldrepl.w $vr0, a1, 0 + vldrepl.w vr0, a1, 0 - vldrepl.w $vr1, a4, -4 - vstelm.w $vr0, a0, 0, 0 - vstelm.w $vr1, a3, -4, 0 + vldrepl.w vr1, a4, -4 + vstelm.w vr0, a0, 0, 0 + vstelm.w vr1, a3, -4, 0 jr ra L(less_4bytes): srli.d t0, a2, 1 beqz t0, L(less_2bytes) - vldrepl.h $vr0, a1, 0 - vldrepl.h $vr1, a4, -2 + vldrepl.h vr0, a1, 0 + vldrepl.h vr1, a4, -2 - vstelm.h $vr0, a0, 0, 0 - vstelm.h $vr1, a3, -2, 0 + vstelm.h vr0, a0, 0, 0 + vstelm.h vr1, a3, -2, 0 jr ra L(less_2bytes): beqz a2, L(less_1bytes) @@ -93,10 +93,10 @@ LEAF(MEMMOVE_NAME, 6) bltu t8, a2, L(move_long) # a2 > 64 bltu t7, a2, L(more_32bytes) # a2 > 32 - vld $vr0, a1, 0 - vld $vr1, a4, -16 - vst $vr0, a0, 0 - vst $vr1, a3, -16 + vld vr0, a1, 0 + vld vr1, a4, -16 + vst vr0, a0, 0 + vst vr1, a3, -16 jr ra nop @@ -106,7 +106,7 @@ L(move_long): L(copy_long): - vld $vr2, a1, 0 + vld vr2, a1, 0 andi t0, a0, 0xf sub.d t0, t6, t0 add.d a1, a1, t0 @@ -114,10 +114,10 @@ L(copy_long): sub.d a2, a2, t0 andi t1, a1, 0xf bnez t1, L(unaligned) - vld $vr0, a1, 0 + vld vr0, a1, 0 addi.d a2, a2, -16 - vst $vr2, a0, 0 + vst vr2, a0, 0 andi t2, a2, 0x7f add.d a5, a0, t0 @@ -128,69 +128,69 @@ L(copy_long): L(al_loop): - vld $vr1, a1, 16 - vld $vr2, a1, 32 - vld $vr3, a1, 48 - vld $vr4, a1, 64 + vld vr1, a1, 16 + vld vr2, a1, 32 + vld vr3, a1, 48 + vld vr4, a1, 64 - vld $vr5, a1, 80 - vld $vr6, a1, 96 - vld $vr7, a1, 112 - vst $vr0, a5, 0 + vld vr5, a1, 80 + vld vr6, a1, 96 + vld vr7, a1, 112 + vst vr0, a5, 0 - vld $vr0, a1, 128 + vld vr0, a1, 128 addi.d a1, a1, 128 - vst $vr1, a5, 16 - vst $vr2, a5, 32 + vst vr1, a5, 16 + vst vr2, a5, 32 - vst $vr3, a5, 48 - vst $vr4, a5, 64 - vst $vr5, a5, 80 - vst $vr6, a5, 96 + vst vr3, a5, 48 + vst vr4, a5, 64 + vst vr5, a5, 80 + vst vr6, a5, 96 - vst $vr7, a5, 112 + vst vr7, a5, 112 addi.d a5, a5, 128 bne a1, a6, L(al_loop) L(al_less_128): blt a2, t8, L(al_less_64) - vld $vr1, a1, 16 - vld $vr2, a1, 32 - vld $vr3, a1, 48 + vld vr1, a1, 16 + vld vr2, a1, 32 + vld vr3, a1, 48 addi.d a2, a2, -64 - vst 
$vr0, a5, 0 - vld $vr0, a1, 64 + vst vr0, a5, 0 + vld vr0, a1, 64 addi.d a1, a1, 64 - vst $vr1, a5, 16 + vst vr1, a5, 16 - vst $vr2, a5, 32 - vst $vr3, a5, 48 + vst vr2, a5, 32 + vst vr3, a5, 48 addi.d a5, a5, 64 L(al_less_64): blt a2, t7, L(al_less_32) - vld $vr1, a1, 16 + vld vr1, a1, 16 addi.d a2, a2, -32 - vst $vr0, a5, 0 - vld $vr0, a1, 32 + vst vr0, a5, 0 + vld vr0, a1, 32 addi.d a1, a1, 32 - vst $vr1, a5, 16 + vst vr1, a5, 16 addi.d a5, a5, 32 L(al_less_32): blt a2, t6, L(al_less_16) - vst $vr0, a5, 0 - vld $vr0, a1, 16 + vst vr0, a5, 0 + vld vr0, a1, 16 addi.d a5, a5, 16 L(al_less_16): - vld $vr1, a4, -16 + vld vr1, a4, -16 - vst $vr0, a5, 0 - vst $vr1, a3, -16 + vst vr0, a5, 0 + vst vr1, a3, -16 jr ra nop @@ -201,17 +201,17 @@ L(magic_num): L(unaligned): pcaddi t2, -4 bstrins.d a1, zero, 3, 0 - vld $vr8, t2, 0 - vld $vr0, a1, 0 + vld vr8, t2, 0 + vld vr0, a1, 0 - vld $vr1, a1, 16 + vld vr1, a1, 16 addi.d a2, a2, -16 - vst $vr2, a0, 0 + vst vr2, a0, 0 add.d a5, a0, t0 - vreplgr2vr.b $vr9, t1 + vreplgr2vr.b vr9, t1 andi t2, a2, 0x7f - vadd.b $vr9, $vr9, $vr8 + vadd.b vr9, vr9, vr8 addi.d a1, a1, 32 @@ -221,97 +221,97 @@ L(unaligned): add.d a6, a1, t3 L(un_loop): - vld $vr2, a1, 0 - vld $vr3, a1, 16 - vld $vr4, a1, 32 - vld $vr5, a1, 48 + vld vr2, a1, 0 + vld vr3, a1, 16 + vld vr4, a1, 32 + vld vr5, a1, 48 - vld $vr6, a1, 64 - vld $vr7, a1, 80 - vshuf.b $vr8, $vr1, $vr0, $vr9 - vld $vr0, a1, 96 + vld vr6, a1, 64 + vld vr7, a1, 80 + vshuf.b vr8, vr1, vr0, vr9 + vld vr0, a1, 96 - vst $vr8, a5, 0 - vshuf.b $vr8, $vr2, $vr1, $vr9 - vld $vr1, a1, 112 - vst $vr8, a5, 16 + vst vr8, a5, 0 + vshuf.b vr8, vr2, vr1, vr9 + vld vr1, a1, 112 + vst vr8, a5, 16 addi.d a1, a1, 128 - vshuf.b $vr2, $vr3, $vr2, $vr9 - vshuf.b $vr3, $vr4, $vr3, $vr9 - vst $vr2, a5, 32 + vshuf.b vr2, vr3, vr2, vr9 + vshuf.b vr3, vr4, vr3, vr9 + vst vr2, a5, 32 - vshuf.b $vr4, $vr5, $vr4, $vr9 - vst $vr3, a5, 48 - vshuf.b $vr5, $vr6, $vr5, $vr9 - vst $vr4, a5, 64 + vshuf.b vr4, vr5, vr4, vr9 + vst vr3, a5, 48 + vshuf.b vr5, vr6, vr5, vr9 + vst vr4, a5, 64 - vshuf.b $vr6, $vr7, $vr6, $vr9 - vst $vr5, a5, 80 - vshuf.b $vr7, $vr0, $vr7, $vr9 - vst $vr6, a5, 96 + vshuf.b vr6, vr7, vr6, vr9 + vst vr5, a5, 80 + vshuf.b vr7, vr0, vr7, vr9 + vst vr6, a5, 96 - vst $vr7, a5, 112 + vst vr7, a5, 112 addi.d a5, a5, 128 bne a1, a6, L(un_loop) L(un_less_128): blt a2, t8, L(un_less_64) - vld $vr2, a1, 0 - vld $vr3, a1, 16 - vshuf.b $vr4, $vr1, $vr0, $vr9 - vld $vr0, a1, 32 + vld vr2, a1, 0 + vld vr3, a1, 16 + vshuf.b vr4, vr1, vr0, vr9 + vld vr0, a1, 32 - vst $vr4, a5, 0 + vst vr4, a5, 0 addi.d a2, a2, -64 - vshuf.b $vr4, $vr2, $vr1, $vr9 - vld $vr1, a1, 48 + vshuf.b vr4, vr2, vr1, vr9 + vld vr1, a1, 48 addi.d a1, a1, 64 - vst $vr4, a5, 16 - vshuf.b $vr2, $vr3, $vr2, $vr9 - vshuf.b $vr3, $vr0, $vr3, $vr9 + vst vr4, a5, 16 + vshuf.b vr2, vr3, vr2, vr9 + vshuf.b vr3, vr0, vr3, vr9 - vst $vr2, a5, 32 - vst $vr3, a5, 48 + vst vr2, a5, 32 + vst vr3, a5, 48 addi.d a5, a5, 64 L(un_less_64): blt a2, t7, L(un_less_32) - vshuf.b $vr3, $vr1, $vr0, $vr9 - vld $vr0, a1, 0 - vst $vr3, a5, 0 + vshuf.b vr3, vr1, vr0, vr9 + vld vr0, a1, 0 + vst vr3, a5, 0 addi.d a2, a2, -32 - vshuf.b $vr3, $vr0, $vr1, $vr9 - vld $vr1, a1, 16 + vshuf.b vr3, vr0, vr1, vr9 + vld vr1, a1, 16 addi.d a1, a1, 32 - vst $vr3, a5, 16 + vst vr3, a5, 16 addi.d a5, a5, 32 L(un_less_32): blt a2, t6, L(un_less_16) - vshuf.b $vr2, $vr1, $vr0, $vr9 - vor.v $vr0, $vr1, $vr1 + vshuf.b vr2, vr1, vr0, vr9 + vor.v vr0, vr1, vr1 - vld $vr1, a1, 0 - vst $vr2, a5, 0 + vld vr1, a1, 0 + vst vr2, 
a5, 0 addi.d a5, a5, 16 L(un_less_16): - vld $vr2, a4, -16 + vld vr2, a4, -16 - vshuf.b $vr0, $vr1, $vr0, $vr9 - vst $vr0, a5, 0 - vst $vr2, a3, -16 + vshuf.b vr0, vr1, vr0, vr9 + vst vr0, a5, 0 + vst vr2, a3, -16 jr ra L(copy_back): addi.d t0, a3, -1 - vld $vr2, a4, -16 + vld vr2, a4, -16 andi t0, t0, 0xf addi.d t0, t0, 1 # in case a3 is already aligned, load 16bytes and store 16bytes @@ -320,9 +320,9 @@ L(copy_back): andi t1, a4, 0xf bnez t1, L(back_unaligned) - vld $vr0, a4, -16 + vld vr0, a4, -16 addi.d a2, a2, -16 - vst $vr2, a3, -16 + vst vr2, a3, -16 andi t2, a2, 0x7f @@ -333,70 +333,70 @@ L(copy_back): sub.d a6, a4, t3 L(back_al_loop): - vld $vr1, a4, -32 - vld $vr2, a4, -48 - vld $vr3, a4, -64 + vld vr1, a4, -32 + vld vr2, a4, -48 + vld vr3, a4, -64 - vld $vr4, a4, -80 - vld $vr5, a4, -96 - vld $vr6, a4, -112 - vld $vr7, a4, -128 + vld vr4, a4, -80 + vld vr5, a4, -96 + vld vr6, a4, -112 + vld vr7, a4, -128 - vst $vr0, a3, -16 - vld $vr0, a4, -144 + vst vr0, a3, -16 + vld vr0, a4, -144 addi.d a4, a4, -128 - vst $vr1, a3, -32 + vst vr1, a3, -32 - vst $vr2, a3, -48 - vst $vr3, a3, -64 - vst $vr4, a3, -80 - vst $vr5, a3, -96 + vst vr2, a3, -48 + vst vr3, a3, -64 + vst vr4, a3, -80 + vst vr5, a3, -96 - vst $vr6, a3, -112 - vst $vr7, a3, -128 + vst vr6, a3, -112 + vst vr7, a3, -128 addi.d a3, a3, -128 bne a4, a6, L(back_al_loop) L(back_al_less_128): blt a2, t8, L(back_al_less_64) - vld $vr1, a4, -32 - vld $vr2, a4, -48 - vld $vr3, a4, -64 + vld vr1, a4, -32 + vld vr2, a4, -48 + vld vr3, a4, -64 addi.d a2, a2, -64 - vst $vr0, a3, -16 - vld $vr0, a4, -80 + vst vr0, a3, -16 + vld vr0, a4, -80 addi.d a4, a4, -64 - vst $vr1, a3, -32 - vst $vr2, a3, -48 - vst $vr3, a3, -64 + vst vr1, a3, -32 + vst vr2, a3, -48 + vst vr3, a3, -64 addi.d a3, a3, -64 L(back_al_less_64): blt a2, t7, L(back_al_less_32) - vld $vr1, a4, -32 + vld vr1, a4, -32 addi.d a2, a2, -32 - vst $vr0, a3, -16 + vst vr0, a3, -16 - vld $vr0, a4, -48 - vst $vr1, a3, -32 + vld vr0, a4, -48 + vst vr1, a3, -32 addi.d a3, a3, -32 addi.d a4, a4, -32 L(back_al_less_32): blt a2, t6, L(back_al_less_16) - vst $vr0, a3, -16 - vld $vr0, a4, -32 + vst vr0, a3, -16 + vld vr0, a4, -32 addi.d a3, a3, -16 L(back_al_less_16): - vld $vr1, a1, 0 - vst $vr0, a3, -16 - vst $vr1, a0, 0 + vld vr1, a1, 0 + vst vr0, a3, -16 + vst vr1, a0, 0 jr ra L(magic_num_2): @@ -405,18 +405,18 @@ L(magic_num_2): L(back_unaligned): pcaddi t2, -4 bstrins.d a4, zero, 3, 0 - vld $vr8, t2, 0 - vld $vr0, a4, 0 + vld vr8, t2, 0 + vld vr0, a4, 0 - vld $vr1, a4, -16 + vld vr1, a4, -16 addi.d a2, a2, -16 - vst $vr2, a3, -16 + vst vr2, a3, -16 sub.d a3, a3, t0 - vreplgr2vr.b $vr9, t1 + vreplgr2vr.b vr9, t1 andi t2, a2, 0x7f - vadd.b $vr9, $vr9, $vr8 + vadd.b vr9, vr9, vr8 addi.d a4, a4, -16 beq t2, a2, L(back_un_less_128) @@ -425,92 +425,92 @@ L(back_unaligned): sub.d a6, a4, t3 L(back_un_loop): - vld $vr2, a4, -16 - vld $vr3, a4, -32 - vld $vr4, a4, -48 + vld vr2, a4, -16 + vld vr3, a4, -32 + vld vr4, a4, -48 - vld $vr5, a4, -64 - vld $vr6, a4, -80 - vld $vr7, a4, -96 - vshuf.b $vr8, $vr0, $vr1, $vr9 + vld vr5, a4, -64 + vld vr6, a4, -80 + vld vr7, a4, -96 + vshuf.b vr8, vr0, vr1, vr9 - vld $vr0, a4, -112 - vst $vr8, a3, -16 - vshuf.b $vr8, $vr1, $vr2, $vr9 - vld $vr1, a4, -128 + vld vr0, a4, -112 + vst vr8, a3, -16 + vshuf.b vr8, vr1, vr2, vr9 + vld vr1, a4, -128 - vst $vr8, a3, -32 + vst vr8, a3, -32 addi.d a4, a4, -128 - vshuf.b $vr2, $vr2, $vr3, $vr9 - vshuf.b $vr3, $vr3, $vr4, $vr9 + vshuf.b vr2, vr2, vr3, vr9 + vshuf.b vr3, vr3, vr4, vr9 - vst $vr2, a3, -48 - vshuf.b 
$vr4, $vr4, $vr5, $vr9 - vst $vr3, a3, -64 - vshuf.b $vr5, $vr5, $vr6, $vr9 + vst vr2, a3, -48 + vshuf.b vr4, vr4, vr5, vr9 + vst vr3, a3, -64 + vshuf.b vr5, vr5, vr6, vr9 - vst $vr4, a3, -80 - vshuf.b $vr6, $vr6, $vr7, $vr9 - vst $vr5, a3, -96 - vshuf.b $vr7, $vr7, $vr0, $vr9 + vst vr4, a3, -80 + vshuf.b vr6, vr6, vr7, vr9 + vst vr5, a3, -96 + vshuf.b vr7, vr7, vr0, vr9 - vst $vr6, a3, -112 - vst $vr7, a3, -128 + vst vr6, a3, -112 + vst vr7, a3, -128 addi.d a3, a3, -128 bne a4, a6, L(back_un_loop) L(back_un_less_128): blt a2, t8, L(back_un_less_64) - vld $vr2, a4, -16 - vld $vr3, a4, -32 - vshuf.b $vr4, $vr0, $vr1, $vr9 + vld vr2, a4, -16 + vld vr3, a4, -32 + vshuf.b vr4, vr0, vr1, vr9 - vld $vr0, a4, -48 - vst $vr4, a3, -16 + vld vr0, a4, -48 + vst vr4, a3, -16 addi.d a2, a2, -64 - vshuf.b $vr4, $vr1, $vr2, $vr9 + vshuf.b vr4, vr1, vr2, vr9 - vld $vr1, a4, -64 + vld vr1, a4, -64 addi.d a4, a4, -64 - vst $vr4, a3, -32 - vshuf.b $vr2, $vr2, $vr3, $vr9 + vst vr4, a3, -32 + vshuf.b vr2, vr2, vr3, vr9 - vshuf.b $vr3, $vr3, $vr0, $vr9 - vst $vr2, a3, -48 - vst $vr3, a3, -64 + vshuf.b vr3, vr3, vr0, vr9 + vst vr2, a3, -48 + vst vr3, a3, -64 addi.d a3, a3, -64 L(back_un_less_64): blt a2, t7, L(back_un_less_32) - vshuf.b $vr3, $vr0, $vr1, $vr9 - vld $vr0, a4, -16 - vst $vr3, a3, -16 + vshuf.b vr3, vr0, vr1, vr9 + vld vr0, a4, -16 + vst vr3, a3, -16 addi.d a2, a2, -32 - vshuf.b $vr3, $vr1, $vr0, $vr9 - vld $vr1, a4, -32 + vshuf.b vr3, vr1, vr0, vr9 + vld vr1, a4, -32 addi.d a4, a4, -32 - vst $vr3, a3, -32 + vst vr3, a3, -32 addi.d a3, a3, -32 L(back_un_less_32): blt a2, t6, L(back_un_less_16) - vshuf.b $vr2, $vr0, $vr1, $vr9 + vshuf.b vr2, vr0, vr1, vr9 - vor.v $vr0, $vr1, $vr1 - vld $vr1, a4, -16 - vst $vr2, a3, -16 + vor.v vr0, vr1, vr1 + vld vr1, a4, -16 + vst vr2, a3, -16 addi.d a3, a3, -16 L(back_un_less_16): - vld $vr2, a1, 0 - vshuf.b $vr0, $vr0, $vr1, $vr9 - vst $vr0, a3, -16 - vst $vr2, a0, 0 + vld vr2, a1, 0 + vshuf.b vr0, vr0, vr1, vr9 + vst vr0, a3, -16 + vst vr2, a0, 0 jr ra END(MEMMOVE_NAME) diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S index 9ecd0257..41554552 100644 --- a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S +++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S @@ -21,56 +21,56 @@ LEAF(MEMRCHR, 6) bstrins.d a3, zero, 5, 0 addi.d t1, t1, 1 # len for unaligned address - xvld $xr0, a3, 0 - xvld $xr1, a3, 32 + xvld xr0, a3, 0 + xvld xr1, a3, 32 sub.d t2, zero, t1 li.d t3, -1 - xvreplgr2vr.b $xr2, a1 + xvreplgr2vr.b xr2, a1 andi t4, a0, 0x3f srl.d t2, t3, t2 - xvseq.b $xr0, $xr0, $xr2 - xvseq.b $xr1, $xr1, $xr2 - xvmsknz.b $xr0, $xr0 + xvseq.b xr0, xr0, xr2 + xvseq.b xr1, xr1, xr2 + xvmsknz.b xr0, xr0 - xvmsknz.b $xr1, $xr1 - xvpickve.w $xr3, $xr0, 4 - xvpickve.w $xr4, $xr1, 4 - vilvl.h $vr0, $vr3, $vr0 + xvmsknz.b xr1, xr1 + xvpickve.w xr3, xr0, 4 + xvpickve.w xr4, xr1, 4 + vilvl.h vr0, vr3, vr0 - vilvl.h $vr1, $vr4, $vr1 - vilvl.w $vr0, $vr1, $vr0 - movfr2gr.d t0, $f0 + vilvl.h vr1, vr4, vr1 + vilvl.w vr0, vr1, vr0 + movfr2gr.d t0, fa0 and t0, t0, t2 bltu a2, t1, L(end) bnez t0, L(found) bstrins.d a0, zero, 5, 0 L(loop): - xvld $xr0, a3, -64 + xvld xr0, a3, -64 - xvld $xr1, a3, -32 + xvld xr1, a3, -32 addi.d a3, a3, -64 - xvseq.b $xr0, $xr0, $xr2 - xvseq.b $xr1, $xr1, $xr2 + xvseq.b xr0, xr0, xr2 + xvseq.b xr1, xr1, xr2 beq a0, a3, L(out) - xvmax.bu $xr3, $xr0, $xr1 - xvseteqz.v $fcc0, $xr3 - bcnez $fcc0, L(loop) + xvmax.bu xr3, xr0, xr1 + xvseteqz.v fcc0, xr3 + bcnez fcc0, L(loop) - xvmsknz.b $xr0, $xr0 - 
xvmsknz.b $xr1, $xr1 - xvpickve.w $xr3, $xr0, 4 - xvpickve.w $xr4, $xr1, 4 + xvmsknz.b xr0, xr0 + xvmsknz.b xr1, xr1 + xvpickve.w xr3, xr0, 4 + xvpickve.w xr4, xr1, 4 - vilvl.h $vr0, $vr3, $vr0 - vilvl.h $vr1, $vr4, $vr1 - vilvl.w $vr0, $vr1, $vr0 - movfr2gr.d t0, $f0 + vilvl.h vr0, vr3, vr0 + vilvl.h vr1, vr4, vr1 + vilvl.w vr0, vr1, vr0 + movfr2gr.d t0, fa0 L(found): addi.d a0, a3, 63 @@ -80,15 +80,15 @@ L(found): L(out): - xvmsknz.b $xr0, $xr0 - xvmsknz.b $xr1, $xr1 - xvpickve.w $xr3, $xr0, 4 - xvpickve.w $xr4, $xr1, 4 - - vilvl.h $vr0, $vr3, $vr0 - vilvl.h $vr1, $vr4, $vr1 - vilvl.w $vr0, $vr1, $vr0 - movfr2gr.d t0, $f0 + xvmsknz.b xr0, xr0 + xvmsknz.b xr1, xr1 + xvpickve.w xr3, xr0, 4 + xvpickve.w xr4, xr1, 4 + + vilvl.h vr0, vr3, vr0 + vilvl.h vr1, vr4, vr1 + vilvl.w vr0, vr1, vr0 + movfr2gr.d t0, fa0 L(end): sll.d t2, t3, t4 diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S index 4bdc18d8..4a302cac 100644 --- a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S +++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S @@ -19,46 +19,46 @@ LEAF(MEMRCHR, 6) bstrins.d a3, zero, 4, 0 addi.d t1, t1, 1 # len for unaligned address - vld $vr0, a3, 0 - vld $vr1, a3, 16 + vld vr0, a3, 0 + vld vr1, a3, 16 sub.d t2, zero, t1 li.d t3, -1 - vreplgr2vr.b $vr2, a1 + vreplgr2vr.b vr2, a1 andi t4, a0, 0x1f srl.d t2, t3, t2 - vseq.b $vr0, $vr0, $vr2 - vseq.b $vr1, $vr1, $vr2 - vmsknz.b $vr0, $vr0 + vseq.b vr0, vr0, vr2 + vseq.b vr1, vr1, vr2 + vmsknz.b vr0, vr0 - vmsknz.b $vr1, $vr1 - vilvl.h $vr0, $vr1, $vr0 - movfr2gr.s t0, $f0 + vmsknz.b vr1, vr1 + vilvl.h vr0, vr1, vr0 + movfr2gr.s t0, fa0 and t0, t0, t2 bltu a2, t1, L(end) bnez t0, L(found) bstrins.d a0, zero, 4, 0 L(loop): - vld $vr0, a3, -32 + vld vr0, a3, -32 - vld $vr1, a3, -16 + vld vr1, a3, -16 addi.d a3, a3, -32 - vseq.b $vr0, $vr0, $vr2 - vseq.b $vr1, $vr1, $vr2 + vseq.b vr0, vr0, vr2 + vseq.b vr1, vr1, vr2 beq a0, a3, L(out) - vmax.bu $vr3, $vr0, $vr1 - vseteqz.v $fcc0, $vr3 - bcnez $fcc0, L(loop) + vmax.bu vr3, vr0, vr1 + vseteqz.v fcc0, vr3 + bcnez fcc0, L(loop) - vmsknz.b $vr0, $vr0 - vmsknz.b $vr1, $vr1 - vilvl.h $vr0, $vr1, $vr0 - movfr2gr.s t0, $f0 + vmsknz.b vr0, vr0 + vmsknz.b vr1, vr1 + vilvl.h vr0, vr1, vr0 + movfr2gr.s t0, fa0 L(found): addi.d a0, a3, 31 @@ -67,10 +67,10 @@ L(found): jr ra L(out): - vmsknz.b $vr0, $vr0 - vmsknz.b $vr1, $vr1 - vilvl.h $vr0, $vr1, $vr0 - movfr2gr.s t0, $f0 + vmsknz.b vr0, vr0 + vmsknz.b vr1, vr1 + vilvl.h vr0, vr1, vr0 + movfr2gr.s t0, fa0 L(end): sll.d t2, t3, t4 diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S index b53c0b7b..5e4908dc 100644 --- a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S +++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S @@ -14,7 +14,7 @@ LEAF(MEMSET, 6) li.d t1, 32 move a3, a0 - xvreplgr2vr.b $xr0, a1 + xvreplgr2vr.b xr0, a1 add.d a4, a0, a2 bgeu t1, a2, L(less_32bytes) # len <= 32 @@ -24,46 +24,46 @@ LEAF(MEMSET, 6) L(less_128bytes): bgeu t2, a2, L(less_64bytes) # len <= 64 - xvst $xr0, a3, 0 - xvst $xr0, a3, 32 - xvst $xr0, a4, -32 + xvst xr0, a3, 0 + xvst xr0, a3, 32 + xvst xr0, a4, -32 - xvst $xr0, a4, -64 + xvst xr0, a4, -64 jr ra L(less_64bytes): - xvst $xr0, a3, 0 - xvst $xr0, a4, -32 + xvst xr0, a3, 0 + xvst xr0, a4, -32 jr ra L(less_32bytes): srli.d t0, a2, 4 beqz t0, L(less_16bytes) - vst $vr0, a3, 0 + vst vr0, a3, 0 - vst $vr0, a4, -16 + vst vr0, a4, -16 jr ra L(less_16bytes): srli.d t0, a2, 3 beqz t0, L(less_8bytes) - vstelm.d $vr0, a3, 0, 0 - 
vstelm.d $vr0, a4, -8, 0 + vstelm.d vr0, a3, 0, 0 + vstelm.d vr0, a4, -8, 0 jr ra L(less_8bytes): srli.d t0, a2, 2 beqz t0, L(less_4bytes) - vstelm.w $vr0, a3, 0, 0 - vstelm.w $vr0, a4, -4, 0 + vstelm.w vr0, a3, 0, 0 + vstelm.w vr0, a4, -4, 0 jr ra L(less_4bytes): srli.d t0, a2, 1 beqz t0, L(less_2bytes) - vstelm.h $vr0, a3, 0, 0 - vstelm.h $vr0, a4, -2, 0 + vstelm.h vr0, a3, 0, 0 + vstelm.h vr0, a4, -2, 0 jr ra L(less_2bytes): @@ -73,7 +73,7 @@ L(less_1bytes): jr ra L(long_bytes): - xvst $xr0, a3, 0 + xvst xr0, a3, 0 bstrins.d a3, zero, 4, 0 addi.d a3, a3, 32 sub.d a2, a4, a3 @@ -85,15 +85,15 @@ L(long_bytes): L(loop_256): - xvst $xr0, a3, 0 - xvst $xr0, a3, 32 - xvst $xr0, a3, 64 - xvst $xr0, a3, 96 + xvst xr0, a3, 0 + xvst xr0, a3, 32 + xvst xr0, a3, 64 + xvst xr0, a3, 96 - xvst $xr0, a3, 128 - xvst $xr0, a3, 160 - xvst $xr0, a3, 192 - xvst $xr0, a3, 224 + xvst xr0, a3, 128 + xvst xr0, a3, 160 + xvst xr0, a3, 192 + xvst xr0, a3, 224 addi.d a3, a3, 256 bne a3, t0, L(loop_256) @@ -101,26 +101,26 @@ L(long_end): bltu a2, t3, L(end_less_128) addi.d a2, a2, -128 - xvst $xr0, a3, 0 - xvst $xr0, a3, 32 - xvst $xr0, a3, 64 - xvst $xr0, a3, 96 + xvst xr0, a3, 0 + xvst xr0, a3, 32 + xvst xr0, a3, 64 + xvst xr0, a3, 96 addi.d a3, a3, 128 L(end_less_128): bltu a2, t2, L(end_less_64) addi.d a2, a2, -64 - xvst $xr0, a3, 0 + xvst xr0, a3, 0 - xvst $xr0, a3, 32 + xvst xr0, a3, 32 addi.d a3, a3, 64 L(end_less_64): bltu a2, t1, L(end_less_32) - xvst $xr0, a3, 0 + xvst xr0, a3, 0 L(end_less_32): - xvst $xr0, a4, -32 + xvst xr0, a4, -32 jr ra END(MEMSET) diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S index 7ab85283..67b279c8 100644 --- a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S +++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S @@ -14,7 +14,7 @@ LEAF(MEMSET, 6) li.d t1, 16 move a3, a0 - vreplgr2vr.b $vr0, a1 + vreplgr2vr.b vr0, a1 add.d a4, a0, a2 bgeu t1, a2, L(less_16bytes) # len <= 16 @@ -24,48 +24,48 @@ LEAF(MEMSET, 6) L(less_64bytes): bgeu t2, a2, L(less_32bytes) # len <= 32 - vst $vr0, a3, 0 - vst $vr0, a3, 16 - vst $vr0, a4, -32 + vst vr0, a3, 0 + vst vr0, a3, 16 + vst vr0, a4, -32 - vst $vr0, a4, -16 + vst vr0, a4, -16 jr ra L(less_32bytes): - vst $vr0, a3, 0 - vst $vr0, a4, -16 + vst vr0, a3, 0 + vst vr0, a4, -16 jr ra L(less_16bytes): srli.d t0, a2, 3 beqz t0, L(less_8bytes) - vstelm.d $vr0, a3, 0, 0 + vstelm.d vr0, a3, 0, 0 - vstelm.d $vr0, a4, -8, 0 + vstelm.d vr0, a4, -8, 0 jr ra L(less_8bytes): srli.d t0, a2, 2 beqz t0, L(less_4bytes) - vstelm.w $vr0, a3, 0, 0 - vstelm.w $vr0, a4, -4, 0 + vstelm.w vr0, a3, 0, 0 + vstelm.w vr0, a4, -4, 0 jr ra L(less_4bytes): srli.d t0, a2, 1 beqz t0, L(less_2bytes) - vstelm.h $vr0, a3, 0, 0 - vstelm.h $vr0, a4, -2, 0 + vstelm.h vr0, a3, 0, 0 + vstelm.h vr0, a4, -2, 0 jr ra L(less_2bytes): beqz a2, L(less_1bytes) - vstelm.b $vr0, a3, 0, 0 + vstelm.b vr0, a3, 0, 0 L(less_1bytes): jr ra L(long_bytes): - vst $vr0, a3, 0 + vst vr0, a3, 0 bstrins.d a3, zero, 3, 0 addi.d a3, a3, 16 @@ -77,43 +77,43 @@ L(long_bytes): sub.d t0, a4, t0 L(loop_128): - vst $vr0, a3, 0 + vst vr0, a3, 0 - vst $vr0, a3, 16 - vst $vr0, a3, 32 - vst $vr0, a3, 48 - vst $vr0, a3, 64 + vst vr0, a3, 16 + vst vr0, a3, 32 + vst vr0, a3, 48 + vst vr0, a3, 64 - vst $vr0, a3, 80 - vst $vr0, a3, 96 - vst $vr0, a3, 112 + vst vr0, a3, 80 + vst vr0, a3, 96 + vst vr0, a3, 112 addi.d a3, a3, 128 bne a3, t0, L(loop_128) L(long_end): bltu a2, t3, L(end_less_64) addi.d a2, a2, -64 - vst $vr0, a3, 0 + vst vr0, a3, 0 - vst $vr0, a3, 16 - vst $vr0, 
a3, 32 - vst $vr0, a3, 48 + vst vr0, a3, 16 + vst vr0, a3, 32 + vst vr0, a3, 48 addi.d a3, a3, 64 L(end_less_64): bltu a2, t2, L(end_less_32) addi.d a2, a2, -32 - vst $vr0, a3, 0 - vst $vr0, a3, 16 + vst vr0, a3, 0 + vst vr0, a3, 16 addi.d a3, a3, 32 L(end_less_32): bltu a2, t1, L(end_less_16) - vst $vr0, a3, 0 + vst vr0, a3, 0 L(end_less_16): - vst $vr0, a4, -16 + vst vr0, a4, -16 jr ra END(MEMSET) diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S index 1e94aa50..856f99ce 100644 --- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S +++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S @@ -8,15 +8,15 @@ LEAF(RAWMEMCHR, 6) move a2, a0 bstrins.d a0, zero, 4, 0 - xvld $xr0, a0, 0 - xvreplgr2vr.b $xr1, a1 + xvld xr0, a0, 0 + xvreplgr2vr.b xr1, a1 - xvseq.b $xr0, $xr0, $xr1 - xvmsknz.b $xr0, $xr0 - xvpickve.w $xr2, $xr0, 4 - vilvl.h $vr0, $vr2, $vr0 + xvseq.b xr0, xr0, xr1 + xvmsknz.b xr0, xr0 + xvpickve.w xr2, xr0, 4 + vilvl.h vr0, vr2, vr0 - movfr2gr.s t0, $f0 + movfr2gr.s t0, fa0 sra.w t0, t0, a2 beqz t0, L(loop) ctz.w t0, t0 @@ -27,17 +27,17 @@ LEAF(RAWMEMCHR, 6) nop L(loop): - xvld $xr0, a0, 32 + xvld xr0, a0, 32 addi.d a0, a0, 32 - xvseq.b $xr0, $xr0, $xr1 - xvseteqz.v $fcc0, $xr0 + xvseq.b xr0, xr0, xr1 + xvseteqz.v fcc0, xr0 - bcnez $fcc0, L(loop) - xvmsknz.b $xr0, $xr0 - xvpickve.w $xr1, $xr0, 4 - vilvl.h $vr0, $vr1, $vr0 + bcnez fcc0, L(loop) + xvmsknz.b xr0, xr0 + xvpickve.w xr1, xr0, 4 + vilvl.h vr0, vr1, vr0 - movfr2gr.s t0, $f0 + movfr2gr.s t0, fa0 ctz.w t0, t0 add.d a0, a0, t0 jr ra diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S index 40bf0cda..7e864e96 100644 --- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S +++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S @@ -14,17 +14,17 @@ LEAF(RAWMEMCHR, 6) move a2, a0 bstrins.d a0, zero, 4, 0 - vld $vr0, a0, 0 - vld $vr1, a0, 16 + vld vr0, a0, 0 + vld vr1, a0, 16 - vreplgr2vr.b $vr2, a1 - vseq.b $vr0, $vr0, $vr2 - vseq.b $vr1, $vr1, $vr2 - vmsknz.b $vr0, $vr0 + vreplgr2vr.b vr2, a1 + vseq.b vr0, vr0, vr2 + vseq.b vr1, vr1, vr2 + vmsknz.b vr0, vr0 - vmsknz.b $vr1, $vr1 - vilvl.h $vr0, $vr1, $vr0 - movfr2gr.s t0, $f0 + vmsknz.b vr1, vr1 + vilvl.h vr0, vr1, vr0 + movfr2gr.s t0, fa0 sra.w t0, t0, a2 beqz t0, L(loop) @@ -34,15 +34,15 @@ LEAF(RAWMEMCHR, 6) L(loop): - vld $vr0, a0, 32 + vld vr0, a0, 32 addi.d a0, a0, 16 - vseq.b $vr0, $vr0, $vr2 - vseteqz.v $fcc0, $vr0 + vseq.b vr0, vr0, vr2 + vseteqz.v fcc0, vr0 - bcnez $fcc0, L(loop) + bcnez fcc0, L(loop) addi.d a0, a0, 16 - vfrstpi.b $vr0, $vr0, 0 - vpickve2gr.bu t0, $vr0, 0 + vfrstpi.b vr0, vr0, 0 + vpickve2gr.bu t0, vr0, 0 add.d a0, a0, t0 jr ra diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S index 0836f590..53832de7 100644 --- a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S +++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S @@ -18,67 +18,67 @@ L(magic_num): ENTRY_NO_ALIGN(STPCPY) pcaddi t0, -4 andi a4, a1, 0xf - vld $vr1, t0, 0 + vld vr1, t0, 0 beqz a4, L(load_start) xor t0, a1, a4 - vld $vr0, t0, 0 - vreplgr2vr.b $vr2, a4 - vadd.b $vr2, $vr2, $vr1 + vld vr0, t0, 0 + vreplgr2vr.b vr2, a4 + vadd.b vr2, vr2, vr1 - vshuf.b $vr0, $vr2, $vr0, $vr2 - vsetanyeqz.b $fcc0, $vr0 - bcnez $fcc0, L(end) + vshuf.b vr0, vr2, vr0, vr2 + vsetanyeqz.b fcc0, vr0 + bcnez fcc0, L(end) L(load_start): - vld $vr0, a1, 0 + vld vr0, a1, 0 li.d t1, 16 andi a3, a0, 0xf - vsetanyeqz.b $fcc0, $vr0 + vsetanyeqz.b fcc0, 
vr0 sub.d t0, t1, a3 - bcnez $fcc0, L(end) + bcnez fcc0, L(end) add.d a1, a1, t0 - vst $vr0, a0, 0 + vst vr0, a0, 0 add.d a0, a0, t0 bne a3, a4, L(unaligned) - vld $vr0, a1, 0 - vsetanyeqz.b $fcc0, $vr0 - bcnez $fcc0, L(end) + vld vr0, a1, 0 + vsetanyeqz.b fcc0, vr0 + bcnez fcc0, L(end) L(loop): - vst $vr0, a0, 0 - vld $vr0, a1, 16 + vst vr0, a0, 0 + vld vr0, a1, 16 addi.d a0, a0, 16 addi.d a1, a1, 16 - vsetanyeqz.b $fcc0, $vr0 - bceqz $fcc0, L(loop) - vmsknz.b $vr1, $vr0 - movfr2gr.s t0, $f1 + vsetanyeqz.b fcc0, vr0 + bceqz fcc0, L(loop) + vmsknz.b vr1, vr0 + movfr2gr.s t0, fa1 cto.w t0, t0 add.d a1, a1, t0 - vld $vr0, a1, -15 + vld vr0, a1, -15 add.d a0, a0, t0 - vst $vr0, a0, -15 + vst vr0, a0, -15 jr ra L(end): - vseqi.b $vr1, $vr0, 0 - vfrstpi.b $vr1, $vr1, 0 + vseqi.b vr1, vr0, 0 + vfrstpi.b vr1, vr1, 0 - vpickve2gr.bu t0, $vr1, 0 + vpickve2gr.bu t0, vr1, 0 addi.d t0, t0, 1 L(end_16): andi t1, t0, 16 beqz t1, L(end_8) - vst $vr0, a0, 0 + vst vr0, a0, 0 addi.d a0, a0, 15 jr ra L(end_8): @@ -89,26 +89,26 @@ L(end_8): andi t5, t0, 1 beqz t2, L(end_4) - vstelm.d $vr0, a0, 0, 0 + vstelm.d vr0, a0, 0, 0 addi.d a0, a0, 8 - vbsrl.v $vr0, $vr0, 8 + vbsrl.v vr0, vr0, 8 L(end_4): beqz t3, L(end_2) - vstelm.w $vr0, a0, 0, 0 + vstelm.w vr0, a0, 0, 0 addi.d a0, a0, 4 - vbsrl.v $vr0, $vr0, 4 + vbsrl.v vr0, vr0, 4 L(end_2): beqz t4, L(end_1) - vstelm.h $vr0, a0, 0, 0 + vstelm.h vr0, a0, 0, 0 addi.d a0, a0, 2 - vbsrl.v $vr0, $vr0, 2 + vbsrl.v vr0, vr0, 2 L(end_1): beqz t5, L(out) - vstelm.b $vr0, a0, 0, 0 + vstelm.b vr0, a0, 0, 0 addi.d a0, a0, 1 L(out): addi.d a0, a0, -1 @@ -120,49 +120,49 @@ L(unaligned): andi a3, a1, 0xf bstrins.d a1, zero, 3, 0 - vld $vr2, a1, 0 - vreplgr2vr.b $vr3, a3 - vslt.b $vr4, $vr1, $vr3 - vor.v $vr0, $vr2, $vr4 + vld vr2, a1, 0 + vreplgr2vr.b vr3, a3 + vslt.b vr4, vr1, vr3 + vor.v vr0, vr2, vr4 - vsetanyeqz.b $fcc0, $vr0 - bcnez $fcc0, L(un_first_end) - vld $vr0, a1, 16 - vadd.b $vr3, $vr3, $vr1 + vsetanyeqz.b fcc0, vr0 + bcnez fcc0, L(un_first_end) + vld vr0, a1, 16 + vadd.b vr3, vr3, vr1 addi.d a1, a1, 16 - vshuf.b $vr4, $vr0, $vr2, $vr3 - vsetanyeqz.b $fcc0, $vr0 - bcnez $fcc0, L(un_end) + vshuf.b vr4, vr0, vr2, vr3 + vsetanyeqz.b fcc0, vr0 + bcnez fcc0, L(un_end) L(un_loop): - vor.v $vr2, $vr0, $vr0 - vld $vr0, a1, 16 - vst $vr4, a0, 0 + vor.v vr2, vr0, vr0 + vld vr0, a1, 16 + vst vr4, a0, 0 addi.d a1, a1, 16 addi.d a0, a0, 16 - vshuf.b $vr4, $vr0, $vr2, $vr3 - vsetanyeqz.b $fcc0, $vr0 - bceqz $fcc0, L(un_loop) + vshuf.b vr4, vr0, vr2, vr3 + vsetanyeqz.b fcc0, vr0 + bceqz fcc0, L(un_loop) L(un_end): - vsetanyeqz.b $fcc0, $vr4 - bcnez $fcc0, 1f - vst $vr4, a0, 0 + vsetanyeqz.b fcc0, vr4 + bcnez fcc0, 1f + vst vr4, a0, 0 1: - vmsknz.b $vr1, $vr0 + vmsknz.b vr1, vr0 - movfr2gr.s t0, $f1 + movfr2gr.s t0, fa1 cto.w t0, t0 add.d a1, a1, t0 - vld $vr0, a1, -15 + vld vr0, a1, -15 add.d a0, a0, t0 sub.d a0, a0, a3 - vst $vr0, a0, 1 + vst vr0, a0, 1 addi.d a0, a0, 16 jr ra diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S index 3f6ad915..fab6edc7 100644 --- a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S +++ b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S @@ -16,18 +16,18 @@ LEAF(STRCHR, 6) andi t1, a0, 0x1f bstrins.d a0, zero, 4, 0 - xvld $xr0, a0, 0 + xvld xr0, a0, 0 li.d t2, -1 - xvreplgr2vr.b $xr1, a1 + xvreplgr2vr.b xr1, a1 sll.d t1, t2, t1 - xvxor.v $xr2, $xr0, $xr1 - xvmin.bu $xr0, $xr0, $xr2 + xvxor.v xr2, xr0, xr1 + xvmin.bu xr0, xr0, xr2 - xvmsknz.b $xr0, $xr0 - xvpickve.w $xr3, $xr0, 4 - vilvl.h $vr0, $vr3, 
$vr0 - movfr2gr.s t0, $f0 + xvmsknz.b xr0, xr0 + xvpickve.w xr3, xr0, 4 + vilvl.h vr0, vr3, vr0 + movfr2gr.s t0, fa0 orn t0, t0, t1 bne t0, t2, L(end) @@ -36,37 +36,37 @@ LEAF(STRCHR, 6) L(loop): - xvld $xr0, a0, 0 - xvxor.v $xr2, $xr0, $xr1 - xvmin.bu $xr0, $xr0, $xr2 - xvsetanyeqz.b $fcc0, $xr0 + xvld xr0, a0, 0 + xvxor.v xr2, xr0, xr1 + xvmin.bu xr0, xr0, xr2 + xvsetanyeqz.b fcc0, xr0 - bcnez $fcc0, L(loop_end) - xvld $xr0, a0, 32 + bcnez fcc0, L(loop_end) + xvld xr0, a0, 32 addi.d a0, a0, 64 - xvxor.v $xr2, $xr0, $xr1 + xvxor.v xr2, xr0, xr1 - xvmin.bu $xr0, $xr0, $xr2 - xvsetanyeqz.b $fcc0, $xr0 - bceqz $fcc0, L(loop) + xvmin.bu xr0, xr0, xr2 + xvsetanyeqz.b fcc0, xr0 + bceqz fcc0, L(loop) addi.d a0, a0, -32 L(loop_end): - xvmsknz.b $xr0, $xr0 - xvpickve.w $xr1, $xr0, 4 - vilvl.h $vr0, $vr1, $vr0 - movfr2gr.s t0, $f0 + xvmsknz.b xr0, xr0 + xvpickve.w xr1, xr0, 4 + vilvl.h vr0, vr1, vr0 + movfr2gr.s t0, fa0 L(end): cto.w t0, t0 add.d a0, a0, t0 #ifndef AS_STRCHRNUL - vreplgr2vr.b $vr0, t0 - xvpermi.q $xr3, $xr2, 1 + vreplgr2vr.b vr0, t0 + xvpermi.q xr3, xr2, 1 - vshuf.b $vr0, $vr3, $vr2, $vr0 - vpickve2gr.bu t0, $vr0, 0 + vshuf.b vr0, vr3, vr2, vr0 + vpickve2gr.bu t0, vr0, 0 masknez a0, a0, t0 #endif jr ra diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S index 4ad9a4ad..ebeb332e 100644 --- a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S +++ b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S @@ -16,16 +16,16 @@ LEAF(STRCHR, 6) andi t1, a0, 0xf bstrins.d a0, zero, 3, 0 - vld $vr0, a0, 0 + vld vr0, a0, 0 li.d t2, -1 - vreplgr2vr.b $vr1, a1 + vreplgr2vr.b vr1, a1 sll.d t3, t2, t1 - vxor.v $vr2, $vr0, $vr1 - vmin.bu $vr0, $vr0, $vr2 + vxor.v vr2, vr0, vr1 + vmin.bu vr0, vr0, vr2 - vmsknz.b $vr0, $vr0 - movfr2gr.s t0, $f0 + vmsknz.b vr0, vr0 + movfr2gr.s t0, fa0 ext.w.h t0, t0 orn t0, t0, t3 @@ -34,23 +34,23 @@ L(found): cto.w t0, t0 add.d a0, a0, t0 #ifndef AS_STRCHRNUL - vreplve.b $vr2, $vr2, t0 - vpickve2gr.bu t1, $vr2, 0 + vreplve.b vr2, vr2, t0 + vpickve2gr.bu t1, vr2, 0 masknez a0, a0, t1 #endif jr ra L(loop): - vld $vr0, a0, 16 + vld vr0, a0, 16 addi.d a0, a0, 16 - vxor.v $vr2, $vr0, $vr1 - vmin.bu $vr0, $vr0, $vr2 + vxor.v vr2, vr0, vr1 + vmin.bu vr0, vr0, vr2 - vsetanyeqz.b $fcc0, $vr0 - bceqz $fcc0, L(loop) - vmsknz.b $vr0, $vr0 - movfr2gr.s t0, $f0 + vsetanyeqz.b fcc0, vr0 + bceqz fcc0, L(loop) + vmsknz.b vr0, vr0 + movfr2gr.s t0, fa0 b L(found) END(STRCHR) diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S index c86e3ecd..c6e1110c 100644 --- a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S +++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S @@ -20,45 +20,45 @@ L(magic_num): ENTRY_NO_ALIGN(STRCMP) pcaddi t0, -4 andi a2, a0, 0xf - vld $vr2, t0, 0 + vld vr2, t0, 0 andi a3, a1, 0xf bne a2, a3, L(unaligned) bstrins.d a0, zero, 3, 0 bstrins.d a1, zero, 3, 0 - vld $vr0, a0, 0 + vld vr0, a0, 0 - vld $vr1, a1, 0 - vreplgr2vr.b $vr3, a2 - vslt.b $vr2, $vr2, $vr3 - vseq.b $vr3, $vr0, $vr1 + vld vr1, a1, 0 + vreplgr2vr.b vr3, a2 + vslt.b vr2, vr2, vr3 + vseq.b vr3, vr0, vr1 - vmin.bu $vr3, $vr0, $vr3 - vor.v $vr3, $vr3, $vr2 - vsetanyeqz.b $fcc0, $vr3 - bcnez $fcc0, L(al_out) + vmin.bu vr3, vr0, vr3 + vor.v vr3, vr3, vr2 + vsetanyeqz.b fcc0, vr3 + bcnez fcc0, L(al_out) L(al_loop): - vld $vr0, a0, 16 - vld $vr1, a1, 16 + vld vr0, a0, 16 + vld vr1, a1, 16 addi.d a0, a0, 16 addi.d a1, a1, 16 - vseq.b $vr3, $vr0, $vr1 - vmin.bu $vr3, $vr0, $vr3 - vsetanyeqz.b $fcc0, $vr3 - bceqz $fcc0, 
L(al_loop) + vseq.b vr3, vr0, vr1 + vmin.bu vr3, vr0, vr3 + vsetanyeqz.b fcc0, vr3 + bceqz fcc0, L(al_loop) L(al_out): - vseqi.b $vr3, $vr3, 0 - vfrstpi.b $vr3, $vr3, 0 - vshuf.b $vr0, $vr0, $vr0, $vr3 - vshuf.b $vr1, $vr1, $vr1, $vr3 + vseqi.b vr3, vr3, 0 + vfrstpi.b vr3, vr3, 0 + vshuf.b vr0, vr0, vr0, vr3 + vshuf.b vr1, vr1, vr1, vr3 - vpickve2gr.bu t0, $vr0, 0 - vpickve2gr.bu t1, $vr1, 0 + vpickve2gr.bu t0, vr0, 0 + vpickve2gr.bu t1, vr1, 0 sub.d a0, t0, t1 jr ra @@ -79,52 +79,52 @@ L(unaligned): bstrins.d a1, zero, 3, 0 - vld $vr0, a0, 0 - vld $vr3, a1, 0 - vreplgr2vr.b $vr4, a2 - vreplgr2vr.b $vr5, a3 + vld vr0, a0, 0 + vld vr3, a1, 0 + vreplgr2vr.b vr4, a2 + vreplgr2vr.b vr5, a3 - vslt.b $vr7, $vr2, $vr4 - vsub.b $vr4, $vr4, $vr5 - vaddi.bu $vr6, $vr2, 16 - vsub.b $vr6, $vr6, $vr4 + vslt.b vr7, vr2, vr4 + vsub.b vr4, vr4, vr5 + vaddi.bu vr6, vr2, 16 + vsub.b vr6, vr6, vr4 - vshuf.b $vr1, $vr3, $vr3, $vr6 - vseq.b $vr4, $vr0, $vr1 - vmin.bu $vr4, $vr0, $vr4 - vor.v $vr4, $vr4, $vr7 + vshuf.b vr1, vr3, vr3, vr6 + vseq.b vr4, vr0, vr1 + vmin.bu vr4, vr0, vr4 + vor.v vr4, vr4, vr7 - vsetanyeqz.b $fcc0, $vr4 - bcnez $fcc0, L(un_end) - vslt.b $vr5, $vr2, $vr5 - vor.v $vr3, $vr3, $vr5 + vsetanyeqz.b fcc0, vr4 + bcnez fcc0, L(un_end) + vslt.b vr5, vr2, vr5 + vor.v vr3, vr3, vr5 L(un_loop): - vld $vr0, a0, 16 - vsetanyeqz.b $fcc0, $vr3 - bcnez $fcc0, L(remaining_end) - vor.v $vr1, $vr3, $vr3 + vld vr0, a0, 16 + vsetanyeqz.b fcc0, vr3 + bcnez fcc0, L(remaining_end) + vor.v vr1, vr3, vr3 - vld $vr3, a1, 16 + vld vr3, a1, 16 addi.d a0, a0, 16 addi.d a1, a1, 16 - vshuf.b $vr1, $vr3, $vr1, $vr6 + vshuf.b vr1, vr3, vr1, vr6 - vseq.b $vr4, $vr0, $vr1 - vmin.bu $vr4, $vr0, $vr4 - vsetanyeqz.b $fcc0, $vr4 - bceqz $fcc0, L(un_loop) + vseq.b vr4, vr0, vr1 + vmin.bu vr4, vr0, vr4 + vsetanyeqz.b fcc0, vr4 + bceqz fcc0, L(un_loop) L(un_end): - vseqi.b $vr4, $vr4, 0 - vfrstpi.b $vr4, $vr4, 0 - vshuf.b $vr0, $vr0, $vr0, $vr4 - vshuf.b $vr1, $vr1, $vr1, $vr4 + vseqi.b vr4, vr4, 0 + vfrstpi.b vr4, vr4, 0 + vshuf.b vr0, vr0, vr0, vr4 + vshuf.b vr1, vr1, vr1, vr4 - vpickve2gr.bu t0, $vr0, 0 - vpickve2gr.bu t1, $vr1, 0 + vpickve2gr.bu t0, vr0, 0 + vpickve2gr.bu t1, vr1, 0 sub.d t3, t0, t1 sub.d t4, t1, t0 @@ -134,9 +134,9 @@ L(un_end): jr ra L(remaining_end): - vshuf.b $vr1, $vr3, $vr3, $vr6 - vseq.b $vr4, $vr0, $vr1 - vmin.bu $vr4, $vr4, $vr0 + vshuf.b vr1, vr3, vr3, vr6 + vseq.b vr4, vr0, vr1 + vmin.bu vr4, vr4, vr0 b L(un_end) END(STRCMP) diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S index dbc061ad..52d77fa3 100644 --- a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S +++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S @@ -21,61 +21,61 @@ L(magic_num): ENTRY_NO_ALIGN(STRCPY) pcaddi t0, -4 andi a4, a1, 0xf - vld $vr1, t0, 0 + vld vr1, t0, 0 move a2, a0 beqz a4, L(load_start) xor t0, a1, a4 - vld $vr0, t0, 0 - vreplgr2vr.b $vr2, a4 + vld vr0, t0, 0 + vreplgr2vr.b vr2, a4 - vadd.b $vr2, $vr2, $vr1 - vshuf.b $vr0, $vr2, $vr0, $vr2 - vsetanyeqz.b $fcc0, $vr0 - bcnez $fcc0, L(end) + vadd.b vr2, vr2, vr1 + vshuf.b vr0, vr2, vr0, vr2 + vsetanyeqz.b fcc0, vr0 + bcnez fcc0, L(end) L(load_start): - vld $vr0, a1, 0 + vld vr0, a1, 0 li.d t1, 16 andi a3, a2, 0xf - vsetanyeqz.b $fcc0, $vr0 + vsetanyeqz.b fcc0, vr0 sub.d t0, t1, a3 - bcnez $fcc0, L(end) + bcnez fcc0, L(end) add.d a1, a1, t0 - vst $vr0, a2, 0 + vst vr0, a2, 0 andi a3, a1, 0xf add.d a2, a2, t0 bnez a3, L(unaligned) - vld $vr0, a1, 0 + vld vr0, a1, 0 - vsetanyeqz.b $fcc0, $vr0 - bcnez $fcc0, L(end) + 
vsetanyeqz.b fcc0, vr0 + bcnez fcc0, L(end) L(loop): - vst $vr0, a2, 0 - vld $vr0, a1, 16 + vst vr0, a2, 0 + vld vr0, a1, 16 addi.d a2, a2, 16 addi.d a1, a1, 16 - vsetanyeqz.b $fcc0, $vr0 - bceqz $fcc0, L(loop) + vsetanyeqz.b fcc0, vr0 + bceqz fcc0, L(loop) - vmsknz.b $vr1, $vr0 - movfr2gr.s t0, $f1 + vmsknz.b vr1, vr0 + movfr2gr.s t0, fa1 cto.w t0, t0 add.d a1, a1, t0 - vld $vr0, a1, -15 + vld vr0, a1, -15 add.d a2, a2, t0 - vst $vr0, a2, -15 + vst vr0, a2, -15 jr ra L(end): - vmsknz.b $vr1, $vr0 - movfr2gr.s t0, $f1 + vmsknz.b vr1, vr0 + movfr2gr.s t0, fa1 cto.w t0, t0 addi.d t0, t0, 1 @@ -83,7 +83,7 @@ L(end): L(end_16): andi t1, t0, 16 beqz t1, L(end_8) - vst $vr0, a2, 0 + vst vr0, a2, 0 jr ra L(end_8): @@ -93,74 +93,74 @@ L(end_8): andi t5, t0, 1 beqz t2, L(end_4) - vstelm.d $vr0, a2, 0, 0 + vstelm.d vr0, a2, 0, 0 addi.d a2, a2, 8 - vbsrl.v $vr0, $vr0, 8 + vbsrl.v vr0, vr0, 8 L(end_4): beqz t3, L(end_2) - vstelm.w $vr0, a2, 0, 0 + vstelm.w vr0, a2, 0, 0 addi.d a2, a2, 4 - vbsrl.v $vr0, $vr0, 4 + vbsrl.v vr0, vr0, 4 L(end_2): beqz t4, L(end_1) - vstelm.h $vr0, a2, 0, 0 + vstelm.h vr0, a2, 0, 0 addi.d a2, a2, 2 - vbsrl.v $vr0, $vr0, 2 + vbsrl.v vr0, vr0, 2 L(end_1): beqz t5, L(out) - vstelm.b $vr0, a2, 0, 0 + vstelm.b vr0, a2, 0, 0 L(out): jr ra L(unaligned): bstrins.d a1, zero, 3, 0 - vld $vr2, a1, 0 - vreplgr2vr.b $vr3, a3 - vslt.b $vr4, $vr1, $vr3 - vor.v $vr0, $vr2, $vr4 + vld vr2, a1, 0 + vreplgr2vr.b vr3, a3 + vslt.b vr4, vr1, vr3 + vor.v vr0, vr2, vr4 - vsetanyeqz.b $fcc0, $vr0 - bcnez $fcc0, L(un_first_end) - vld $vr0, a1, 16 - vadd.b $vr3, $vr3, $vr1 + vsetanyeqz.b fcc0, vr0 + bcnez fcc0, L(un_first_end) + vld vr0, a1, 16 + vadd.b vr3, vr3, vr1 addi.d a1, a1, 16 - vshuf.b $vr4, $vr0, $vr2, $vr3 - vsetanyeqz.b $fcc0, $vr0 - bcnez $fcc0, L(un_end) + vshuf.b vr4, vr0, vr2, vr3 + vsetanyeqz.b fcc0, vr0 + bcnez fcc0, L(un_end) L(un_loop): - vor.v $vr2, $vr0, $vr0 - vld $vr0, a1, 16 - vst $vr4, a2, 0 + vor.v vr2, vr0, vr0 + vld vr0, a1, 16 + vst vr4, a2, 0 addi.d a1, a1, 16 addi.d a2, a2, 16 - vshuf.b $vr4, $vr0, $vr2, $vr3 - vsetanyeqz.b $fcc0, $vr0 - bceqz $fcc0, L(un_loop) + vshuf.b vr4, vr0, vr2, vr3 + vsetanyeqz.b fcc0, vr0 + bceqz fcc0, L(un_loop) L(un_end): - vsetanyeqz.b $fcc0, $vr4 - bcnez $fcc0, 1f - vst $vr4, a2, 0 + vsetanyeqz.b fcc0, vr4 + bcnez fcc0, 1f + vst vr4, a2, 0 1: - vmsknz.b $vr1, $vr0 + vmsknz.b vr1, vr0 - movfr2gr.s t0, $f1 + movfr2gr.s t0, fa1 cto.w t0, t0 add.d a1, a1, t0 - vld $vr0, a1, -15 + vld vr0, a1, -15 add.d a2, a2, t0 sub.d a2, a2, a3 - vst $vr0, a2, 1 + vst vr0, a2, 1 jr ra L(un_first_end): diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S index fd6c002d..fc25dd50 100644 --- a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S @@ -17,12 +17,12 @@ LEAF(STRLEN, 6) move a1, a0 bstrins.d a0, zero, 4, 0 li.d t1, -1 - xvld $xr0, a0, 0 + xvld xr0, a0, 0 - xvmsknz.b $xr0, $xr0 - xvpickve.w $xr1, $xr0, 4 - vilvl.h $vr0, $vr1, $vr0 - movfr2gr.s t0, $f0 # sign extend + xvmsknz.b xr0, xr0 + xvpickve.w xr1, xr0, 4 + vilvl.h vr0, vr1, vr0 + movfr2gr.s t0, fa0 # sign extend sra.w t0, t0, a1 beq t0, t1, L(loop) @@ -30,18 +30,18 @@ LEAF(STRLEN, 6) jr ra L(loop): - xvld $xr0, a0, 32 + xvld xr0, a0, 32 addi.d a0, a0, 32 - xvsetanyeqz.b $fcc0, $xr0 - bceqz $fcc0, L(loop) + xvsetanyeqz.b fcc0, xr0 + bceqz fcc0, L(loop) - xvmsknz.b $xr0, $xr0 + xvmsknz.b xr0, xr0 sub.d a0, a0, a1 - xvpickve.w $xr1, $xr0, 4 - vilvl.h $vr0, $vr1, $vr0 + xvpickve.w xr1, xr0, 4 + 
vilvl.h vr0, vr1, vr0 - movfr2gr.s t0, $f0 + movfr2gr.s t0, fa0 cto.w t0, t0 add.d a0, a0, t0 jr ra diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S index 6f311506..45c3db93 100644 --- a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S @@ -16,15 +16,15 @@ LEAF(STRLEN, 6) move a1, a0 bstrins.d a0, zero, 4, 0 - vld $vr0, a0, 0 - vld $vr1, a0, 16 + vld vr0, a0, 0 + vld vr1, a0, 16 li.d t1, -1 - vmsknz.b $vr0, $vr0 - vmsknz.b $vr1, $vr1 - vilvl.h $vr0, $vr1, $vr0 + vmsknz.b vr0, vr0 + vmsknz.b vr1, vr1 + vilvl.h vr0, vr1, vr0 - movfr2gr.s t0, $f0 + movfr2gr.s t0, fa0 sra.w t0, t0, a1 beq t0, t1, L(loop) cto.w a0, t0 @@ -36,19 +36,19 @@ LEAF(STRLEN, 6) L(loop): - vld $vr0, a0, 32 - vld $vr1, a0, 48 + vld vr0, a0, 32 + vld vr1, a0, 48 addi.d a0, a0, 32 - vmin.bu $vr2, $vr0, $vr1 + vmin.bu vr2, vr0, vr1 - vsetanyeqz.b $fcc0, $vr2 - bceqz $fcc0, L(loop) - vmsknz.b $vr0, $vr0 - vmsknz.b $vr1, $vr1 + vsetanyeqz.b fcc0, vr2 + bceqz fcc0, L(loop) + vmsknz.b vr0, vr0 + vmsknz.b vr1, vr1 - vilvl.h $vr0, $vr1, $vr0 + vilvl.h vr0, vr1, vr0 sub.d a0, a0, a1 - movfr2gr.s t0, $f0 + movfr2gr.s t0, fa0 cto.w t0, t0 add.d a0, a0, t0 diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S index 2c6f9614..21f3e689 100644 --- a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S +++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S @@ -22,7 +22,7 @@ ENTRY_NO_ALIGN(STRNCMP) beqz a2, L(ret0) pcaddi t0, -5 andi a3, a0, 0xf - vld $vr2, t0, 0 + vld vr2, t0, 0 andi a4, a1, 0xf li.d t2, 16 @@ -30,57 +30,57 @@ ENTRY_NO_ALIGN(STRNCMP) xor t0, a0, a3 xor t1, a1, a4 - vld $vr0, t0, 0 - vld $vr1, t1, 0 - vreplgr2vr.b $vr3, a3 + vld vr0, t0, 0 + vld vr1, t1, 0 + vreplgr2vr.b vr3, a3 sub.d t2, t2, a3 - vadd.b $vr3, $vr3, $vr2 - vshuf.b $vr0, $vr3, $vr0, $vr3 - vshuf.b $vr1, $vr3, $vr1, $vr3 + vadd.b vr3, vr3, vr2 + vshuf.b vr0, vr3, vr0, vr3 + vshuf.b vr1, vr3, vr1, vr3 - vseq.b $vr3, $vr0, $vr1 - vmin.bu $vr3, $vr0, $vr3 + vseq.b vr3, vr0, vr1 + vmin.bu vr3, vr0, vr3 bgeu t2, a2, L(al_early_end) - vsetanyeqz.b $fcc0, $vr3 + vsetanyeqz.b fcc0, vr3 - bcnez $fcc0, L(al_end) + bcnez fcc0, L(al_end) add.d a3, a0, a2 addi.d a4, a3, -1 bstrins.d a4, zero, 3, 0 sub.d a2, a3, a4 L(al_loop): - vld $vr0, t0, 16 - vld $vr1, t1, 16 + vld vr0, t0, 16 + vld vr1, t1, 16 addi.d t0, t0, 16 addi.d t1, t1, 16 - vseq.b $vr3, $vr0, $vr1 - vmin.bu $vr3, $vr0, $vr3 + vseq.b vr3, vr0, vr1 + vmin.bu vr3, vr0, vr3 beq t0, a4, L(al_early_end) - vsetanyeqz.b $fcc0, $vr3 - bceqz $fcc0, L(al_loop) + vsetanyeqz.b fcc0, vr3 + bceqz fcc0, L(al_loop) L(al_end): - vseqi.b $vr3, $vr3, 0 - vfrstpi.b $vr3, $vr3, 0 + vseqi.b vr3, vr3, 0 + vfrstpi.b vr3, vr3, 0 - vshuf.b $vr0, $vr0, $vr0, $vr3 - vshuf.b $vr1, $vr1, $vr1, $vr3 - vpickve2gr.bu t0, $vr0, 0 - vpickve2gr.bu t1, $vr1, 0 + vshuf.b vr0, vr0, vr0, vr3 + vshuf.b vr1, vr1, vr1, vr3 + vpickve2gr.bu t0, vr0, 0 + vpickve2gr.bu t1, vr1, 0 sub.d a0, t0, t1 jr ra L(al_early_end): - vreplgr2vr.b $vr4, a2 - vslt.b $vr4, $vr2, $vr4 + vreplgr2vr.b vr4, a2 + vslt.b vr4, vr2, vr4 - vorn.v $vr3, $vr3, $vr4 + vorn.v vr3, vr3, vr4 b L(al_end) L(unaligned): slt a5, a3, a4 @@ -94,64 +94,64 @@ L(unaligned): andi a4, a1, 0xf xor t0, a0, a3 xor t1, a1, a4 - vld $vr0, t0, 0 + vld vr0, t0, 0 - vld $vr3, t1, 0 + vld vr3, t1, 0 sub.d t2, t2, a3 - vreplgr2vr.b $vr4, a3 - vreplgr2vr.b $vr5, a4 + vreplgr2vr.b vr4, a3 + vreplgr2vr.b vr5, a4 - vaddi.bu $vr6, $vr2, 16 - vsub.b $vr7, $vr4, 
$vr5 - vsub.b $vr6, $vr6, $vr7 - vadd.b $vr4, $vr2, $vr4 + vaddi.bu vr6, vr2, 16 + vsub.b vr7, vr4, vr5 + vsub.b vr6, vr6, vr7 + vadd.b vr4, vr2, vr4 - vshuf.b $vr1, $vr3, $vr3, $vr6 - vshuf.b $vr0, $vr7, $vr0, $vr4 - vshuf.b $vr1, $vr7, $vr1, $vr4 - vseq.b $vr4, $vr0, $vr1 + vshuf.b vr1, vr3, vr3, vr6 + vshuf.b vr0, vr7, vr0, vr4 + vshuf.b vr1, vr7, vr1, vr4 + vseq.b vr4, vr0, vr1 - vmin.bu $vr4, $vr0, $vr4 + vmin.bu vr4, vr0, vr4 bgeu t2, a2, L(un_early_end) - vsetanyeqz.b $fcc0, $vr4 - bcnez $fcc0, L(un_end) + vsetanyeqz.b fcc0, vr4 + bcnez fcc0, L(un_end) add.d a6, a0, a2 - vslt.b $vr5, $vr2, $vr5 + vslt.b vr5, vr2, vr5 addi.d a7, a6, -1 - vor.v $vr3, $vr3, $vr5 + vor.v vr3, vr3, vr5 bstrins.d a7, zero, 3, 0 sub.d a2, a6, a7 L(un_loop): - vld $vr0, t0, 16 + vld vr0, t0, 16 addi.d t0, t0, 16 - vsetanyeqz.b $fcc0, $vr3 - bcnez $fcc0, L(has_zero) + vsetanyeqz.b fcc0, vr3 + bcnez fcc0, L(has_zero) beq t0, a7, L(end_with_len) - vor.v $vr1, $vr3, $vr3 + vor.v vr1, vr3, vr3 - vld $vr3, t1, 16 + vld vr3, t1, 16 addi.d t1, t1, 16 - vshuf.b $vr1, $vr3, $vr1, $vr6 - vseq.b $vr4, $vr0, $vr1 + vshuf.b vr1, vr3, vr1, vr6 + vseq.b vr4, vr0, vr1 - vmin.bu $vr4, $vr0, $vr4 - vsetanyeqz.b $fcc0, $vr4 - bceqz $fcc0, L(un_loop) + vmin.bu vr4, vr0, vr4 + vsetanyeqz.b fcc0, vr4 + bceqz fcc0, L(un_loop) L(un_end): - vseqi.b $vr4, $vr4, 0 + vseqi.b vr4, vr4, 0 - vfrstpi.b $vr4, $vr4, 0 - vshuf.b $vr0, $vr0, $vr0, $vr4 - vshuf.b $vr1, $vr1, $vr1, $vr4 - vpickve2gr.bu t0, $vr0, 0 + vfrstpi.b vr4, vr4, 0 + vshuf.b vr0, vr0, vr0, vr4 + vshuf.b vr1, vr1, vr1, vr4 + vpickve2gr.bu t0, vr0, 0 - vpickve2gr.bu t1, $vr1, 0 + vpickve2gr.bu t1, vr1, 0 sub.d t2, t0, t1 sub.d t3, t1, t0 masknez t0, t2, a5 @@ -160,30 +160,30 @@ L(un_end): or a0, t0, t1 jr ra L(has_zero): - vshuf.b $vr1, $vr3, $vr3, $vr6 + vshuf.b vr1, vr3, vr3, vr6 - vseq.b $vr4, $vr0, $vr1 - vmin.bu $vr4, $vr0, $vr4 + vseq.b vr4, vr0, vr1 + vmin.bu vr4, vr0, vr4 bne t0, a7, L(un_end) L(un_early_end): - vreplgr2vr.b $vr5, a2 + vreplgr2vr.b vr5, a2 - vslt.b $vr5, $vr2, $vr5 - vorn.v $vr4, $vr4, $vr5 + vslt.b vr5, vr2, vr5 + vorn.v vr4, vr4, vr5 b L(un_end) L(end_with_len): sub.d a6, a3, a4 bgeu a6, a2, 1f - vld $vr4, t1, 16 + vld vr4, t1, 16 1: - vshuf.b $vr1, $vr4, $vr3, $vr6 - vseq.b $vr4, $vr0, $vr1 + vshuf.b vr1, vr4, vr3, vr6 + vseq.b vr4, vr0, vr1 - vmin.bu $vr4, $vr0, $vr4 - vreplgr2vr.b $vr5, a2 - vslt.b $vr5, $vr2, $vr5 - vorn.v $vr4, $vr4, $vr5 + vmin.bu vr4, vr0, vr4 + vreplgr2vr.b vr5, a2 + vslt.b vr5, vr2, vr5 + vorn.v vr4, vr4, vr5 b L(un_end) L(ret0): diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S index 910b52fe..6410a907 100644 --- a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S +++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S @@ -19,23 +19,23 @@ LEAF(STRNLEN, 6) li.d t3, 65 sub.d a2, a0, t1 - xvld $xr0, a2, 0 - xvld $xr1, a2, 32 + xvld xr0, a2, 0 + xvld xr1, a2, 32 sub.d t1, t3, t1 move a3, a0 sltu t1, a1, t1 - xvmsknz.b $xr0, $xr0 - xvmsknz.b $xr1, $xr1 - xvpickve.w $xr2, $xr0, 4 + xvmsknz.b xr0, xr0 + xvmsknz.b xr1, xr1 + xvpickve.w xr2, xr0, 4 - xvpickve.w $xr3, $xr1, 4 - vilvl.h $vr0, $vr2, $vr0 - vilvl.h $vr1, $vr3, $vr1 - vilvl.w $vr0, $vr1, $vr0 + xvpickve.w xr3, xr1, 4 + vilvl.h vr0, vr2, vr0 + vilvl.h vr1, vr3, vr1 + vilvl.w vr0, vr1, vr0 - movfr2gr.d t0, $f0 + movfr2gr.d t0, fa0 sra.d t0, t0, a0 orn t1, t1, t0 bnez t1, L(end) @@ -46,26 +46,26 @@ LEAF(STRNLEN, 6) bstrins.d a4, zero, 5, 0 L(loop): - xvld $xr0, a0, 64 - xvld $xr1, a0, 96 + xvld xr0, a0, 64 + xvld 
xr1, a0, 96 addi.d a0, a0, 64 beq a0, a4, L(out) - xvmin.bu $xr2, $xr0, $xr1 - xvsetanyeqz.b $fcc0, $xr2 - bceqz $fcc0, L(loop) + xvmin.bu xr2, xr0, xr1 + xvsetanyeqz.b fcc0, xr2 + bceqz fcc0, L(loop) L(out): - xvmsknz.b $xr0, $xr0 + xvmsknz.b xr0, xr0 - xvmsknz.b $xr1, $xr1 - xvpickve.w $xr2, $xr0, 4 - xvpickve.w $xr3, $xr1, 4 - vilvl.h $vr0, $vr2, $vr0 + xvmsknz.b xr1, xr1 + xvpickve.w xr2, xr0, 4 + xvpickve.w xr3, xr1, 4 + vilvl.h vr0, vr2, vr0 - vilvl.h $vr1, $vr3, $vr1 - vilvl.w $vr0, $vr1, $vr0 - movfr2gr.d t0, $f0 + vilvl.h vr1, vr3, vr1 + vilvl.w vr0, vr1, vr0 + movfr2gr.d t0, fa0 L(end): sub.d a0, a0, a3 diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S index db0e90ff..9250a0cd 100644 --- a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S +++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S @@ -19,17 +19,17 @@ LEAF(STRNLEN, 6) li.d t3, 33 sub.d a2, a0, t1 - vld $vr0, a2, 0 - vld $vr1, a2, 16 + vld vr0, a2, 0 + vld vr1, a2, 16 sub.d t1, t3, t1 move a3, a0 sltu t1, a1, t1 - vmsknz.b $vr0, $vr0 - vmsknz.b $vr1, $vr1 - vilvl.h $vr0, $vr1, $vr0 + vmsknz.b vr0, vr0 + vmsknz.b vr1, vr1 + vilvl.h vr0, vr1, vr0 - movfr2gr.s t0, $f0 + movfr2gr.s t0, fa0 sra.w t0, t0, a0 orn t1, t1, t0 bnez t1, L(end) @@ -41,20 +41,20 @@ LEAF(STRNLEN, 6) bstrins.d a4, zero, 4, 0 L(loop): - vld $vr0, a0, 32 - vld $vr1, a0, 48 + vld vr0, a0, 32 + vld vr1, a0, 48 addi.d a0, a0, 32 beq a0, a4, L(out) - vmin.bu $vr2, $vr0, $vr1 - vsetanyeqz.b $fcc0, $vr2 - bceqz $fcc0, L(loop) + vmin.bu vr2, vr0, vr1 + vsetanyeqz.b fcc0, vr2 + bceqz fcc0, L(loop) L(out): - vmsknz.b $vr0, $vr0 + vmsknz.b vr0, vr0 - vmsknz.b $vr1, $vr1 - vilvl.h $vr0, $vr1, $vr0 - movfr2gr.s t0, $f0 + vmsknz.b vr1, vr1 + vilvl.h vr0, vr1, vr0 + movfr2gr.s t0, fa0 L(end): sub.d a0, a0, a3 diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S index 325458ff..990be973 100644 --- a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S +++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S @@ -14,45 +14,45 @@ LEAF(STRRCHR, 6) andi t1, a0, 0x3f bstrins.d a0, zero, 5, 0 - xvld $xr0, a0, 0 - xvld $xr1, a0, 32 + xvld xr0, a0, 0 + xvld xr1, a0, 32 li.d t2, -1 - xvreplgr2vr.b $xr4, a1 + xvreplgr2vr.b xr4, a1 move a2, zero sll.d t3, t2, t1 addi.d a0, a0, 63 - xvseq.b $xr2, $xr0, $xr4 - xvseq.b $xr3, $xr1, $xr4 - xvmsknz.b $xr0, $xr0 + xvseq.b xr2, xr0, xr4 + xvseq.b xr3, xr1, xr4 + xvmsknz.b xr0, xr0 - xvmsknz.b $xr1, $xr1 - xvpickve.w $xr5, $xr0, 4 - xvpickve.w $xr6, $xr1, 4 - vilvl.h $vr0, $vr5, $vr0 + xvmsknz.b xr1, xr1 + xvpickve.w xr5, xr0, 4 + xvpickve.w xr6, xr1, 4 + vilvl.h vr0, vr5, vr0 - vilvl.h $vr1, $vr6, $vr1 - xvmsknz.b $xr2, $xr2 - xvmsknz.b $xr3, $xr3 - xvpickve.w $xr5, $xr2, 4 + vilvl.h vr1, vr6, vr1 + xvmsknz.b xr2, xr2 + xvmsknz.b xr3, xr3 + xvpickve.w xr5, xr2, 4 - xvpickve.w $xr6, $xr3, 4 - vilvl.h $vr2, $vr5, $vr2 - vilvl.h $vr3, $vr6, $vr3 - vilvl.w $vr0, $vr1, $vr0 + xvpickve.w xr6, xr3, 4 + vilvl.h vr2, vr5, vr2 + vilvl.h vr3, vr6, vr3 + vilvl.w vr0, vr1, vr0 - vilvl.w $vr1, $vr3, $vr2 - movfr2gr.d t0, $f0 - movfr2gr.d t1, $f1 + vilvl.w vr1, vr3, vr2 + movfr2gr.d t0, fa0 + movfr2gr.d t1, fa1 orn t0, t0, t3 and t1, t1, t3 bne t0, t2, L(end) L(loop): - xvld $xr0, a0, 1 - xvld $xr1, a0, 33 + xvld xr0, a0, 1 + xvld xr1, a0, 33 clz.d t0, t1 @@ -62,33 +62,33 @@ L(loop): masknez t1, a2, t1 or a2, t0, t1 - xvseq.b $xr2, $xr0, $xr4 - xvseq.b $xr3, $xr1, $xr4 + xvseq.b xr2, xr0, xr4 + xvseq.b xr3, xr1, xr4 - xvmsknz.b $xr2, $xr2 - xvmsknz.b 
$xr3, $xr3 - xvpickve.w $xr5, $xr2, 4 - xvpickve.w $xr6, $xr3, 4 + xvmsknz.b xr2, xr2 + xvmsknz.b xr3, xr3 + xvpickve.w xr5, xr2, 4 + xvpickve.w xr6, xr3, 4 - vilvl.h $vr2, $vr5, $vr2 - vilvl.h $vr3, $vr6, $vr3 - xvmin.bu $xr5, $xr0, $xr1 - vilvl.w $vr2, $vr3, $vr2 + vilvl.h vr2, vr5, vr2 + vilvl.h vr3, vr6, vr3 + xvmin.bu xr5, xr0, xr1 + vilvl.w vr2, vr3, vr2 - xvsetanyeqz.b $fcc0, $xr5 - movfr2gr.d t1, $f2 - bceqz $fcc0, L(loop) - xvmsknz.b $xr0, $xr0 + xvsetanyeqz.b fcc0, xr5 + movfr2gr.d t1, fa2 + bceqz fcc0, L(loop) + xvmsknz.b xr0, xr0 - xvmsknz.b $xr1, $xr1 - xvpickve.w $xr5, $xr0, 4 - xvpickve.w $xr6, $xr1, 4 - vilvl.h $vr0, $vr5, $vr0 + xvmsknz.b xr1, xr1 + xvpickve.w xr5, xr0, 4 + xvpickve.w xr6, xr1, 4 + vilvl.h vr0, vr5, vr0 - vilvl.h $vr1, $vr6, $vr1 - vilvl.w $vr0, $vr1, $vr0 - movfr2gr.d t0, $f0 + vilvl.h vr1, vr6, vr1 + vilvl.w vr0, vr1, vr0 + movfr2gr.d t0, fa0 L(end): slli.d t3, t2, 1 # shift one more for the last '\0' diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S index e082eaab..6aede6ae 100644 --- a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S +++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S @@ -14,35 +14,35 @@ LEAF(STRRCHR, 6) andi t1, a0, 0x1f bstrins.d a0, zero, 4, 0 - vld $vr0, a0, 0 - vld $vr1, a0, 16 + vld vr0, a0, 0 + vld vr1, a0, 16 - vreplgr2vr.b $vr4, a1 + vreplgr2vr.b vr4, a1 li.d t2, -1 move a2, zero addi.d a0, a0, 31 - vseq.b $vr2, $vr0, $vr4 - vseq.b $vr3, $vr1, $vr4 - vmsknz.b $vr0, $vr0 - vmsknz.b $vr1, $vr1 + vseq.b vr2, vr0, vr4 + vseq.b vr3, vr1, vr4 + vmsknz.b vr0, vr0 + vmsknz.b vr1, vr1 - vmsknz.b $vr2, $vr2 - vmsknz.b $vr3, $vr3 - vilvl.h $vr0, $vr1, $vr0 - vilvl.h $vr1, $vr3, $vr2 + vmsknz.b vr2, vr2 + vmsknz.b vr3, vr3 + vilvl.h vr0, vr1, vr0 + vilvl.h vr1, vr3, vr2 - movfr2gr.s t0, $f0 + movfr2gr.s t0, fa0 sll.d t3, t2, t1 - movfr2gr.s t1, $f1 + movfr2gr.s t1, fa1 orn t0, t0, t3 and t1, t1, t3 bne t0, t2, L(end) L(loop): - vld $vr0, a0, 1 - vld $vr1, a0, 17 + vld vr0, a0, 1 + vld vr1, a0, 17 clz.w t0, t1 sub.d t0, a0, t0 @@ -51,23 +51,23 @@ L(loop): masknez t1, a2, t1 or a2, t0, t1 - vseq.b $vr2, $vr0, $vr4 - vseq.b $vr3, $vr1, $vr4 + vseq.b vr2, vr0, vr4 + vseq.b vr3, vr1, vr4 - vmsknz.b $vr2, $vr2 - vmsknz.b $vr3, $vr3 - vmin.bu $vr5, $vr0, $vr1 - vilvl.h $vr2, $vr3, $vr2 + vmsknz.b vr2, vr2 + vmsknz.b vr3, vr3 + vmin.bu vr5, vr0, vr1 + vilvl.h vr2, vr3, vr2 - vsetanyeqz.b $fcc0, $vr5 - movfr2gr.s t1, $f2 - bceqz $fcc0, L(loop) - vmsknz.b $vr0, $vr0 + vsetanyeqz.b fcc0, vr5 + movfr2gr.s t1, fa2 + bceqz fcc0, L(loop) + vmsknz.b vr0, vr0 - vmsknz.b $vr1, $vr1 - vilvl.h $vr0, $vr1, $vr0 - movfr2gr.s t0, $f0 + vmsknz.b vr1, vr1 + vilvl.h vr0, vr1, vr0 + movfr2gr.s t0, fa0 L(end): slli.d t3, t2, 1 # shift one more for the last '\0' diff --git a/sysdeps/loongarch/lp64/s_cosf.S b/sysdeps/loongarch/lp64/s_cosf.S index 9fcbe6ca..cb3a4faa 100644 --- a/sysdeps/loongarch/lp64/s_cosf.S +++ b/sysdeps/loongarch/lp64/s_cosf.S @@ -213,9 +213,9 @@ L_even_integer: fadd.d fa0, fa0, fa1 fadd.d fa2, fa2, fa3 fadd.d fa0, fa0, fa2 - fcmp.sle.d $fcc0, fa0, fa5 + fcmp.sle.d fcc0, fa0, fa5 addi.d t0, t0, 3 - bcnez $fcc0, L_leq_one + bcnez fcc0, L_leq_one /*L_gt_one:*/ fld.d fa2, t1, 16 /* 2.0 */ addi.d t0, t0, 1 diff --git a/sysdeps/loongarch/lp64/s_sinf.S b/sysdeps/loongarch/lp64/s_sinf.S index 45d1c4b5..1e77282d 100644 --- a/sysdeps/loongarch/lp64/s_sinf.S +++ b/sysdeps/loongarch/lp64/s_sinf.S @@ -215,9 +215,9 @@ L_even_integer: fadd.d fa0, fa0, fa1 fadd.d fa2, fa2, fa3 fadd.d fa0, fa0, fa2 - 
fcmp.sle.d $fcc0, fa0, fa5 + fcmp.sle.d fcc0, fa0, fa5 addi.d t0, t0, 1 - bcnez $fcc0, L_leq_one + bcnez fcc0, L_leq_one /*L_gt_one:*/ fld.d fa2, t1, 16 /* 2.0 */ addi.d t0, t0, 1 diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h index 36f00939..b5ee57cf 100644 --- a/sysdeps/loongarch/sys/regdef.h +++ b/sysdeps/loongarch/sys/regdef.h @@ -71,6 +71,14 @@ # define fs5 $f29 # define fs6 $f30 # define fs7 $f31 +# define fcc0 $fcc0 +# define fcc1 $fcc1 +# define fcc2 $fcc2 +# define fcc3 $fcc3 +# define fcc4 $fcc4 +# define fcc5 $fcc5 +# define fcc6 $fcc6 +# define fcc7 $fcc7 #elif _LOONGARCH_SIM == _ABILP32 # error ABILP32 not support yet @@ -78,4 +86,70 @@ # error noABI #endif +#define vr0 $vr0 +#define vr1 $vr1 +#define vr2 $vr2 +#define vr3 $vr3 +#define vr4 $vr4 +#define vr5 $vr5 +#define vr6 $vr6 +#define vr7 $vr7 +#define vr8 $vr8 +#define vr9 $vr9 +#define vr10 $vr10 +#define vr11 $vr11 +#define vr12 $vr12 +#define vr13 $vr13 +#define vr14 $vr14 +#define vr15 $vr15 +#define vr16 $vr16 +#define vr17 $vr17 +#define vr18 $vr18 +#define vr19 $vr19 +#define vr20 $vr20 +#define vr21 $vr21 +#define vr22 $vr22 +#define vr23 $vr23 +#define vr24 $vr24 +#define vr25 $vr25 +#define vr26 $vr26 +#define vr27 $vr27 +#define vr28 $vr28 +#define vr29 $vr29 +#define vr30 $vr30 +#define vr31 $vr31 + +#define xr0 $xr0 +#define xr1 $xr1 +#define xr2 $xr2 +#define xr3 $xr3 +#define xr4 $xr4 +#define xr5 $xr5 +#define xr6 $xr6 +#define xr7 $xr7 +#define xr8 $xr8 +#define xr9 $xr9 +#define xr10 $xr10 +#define xr11 $xr11 +#define xr12 $xr12 +#define xr13 $xr13 +#define xr14 $xr14 +#define xr15 $xr15 +#define xr16 $xr16 +#define xr17 $xr17 +#define xr18 $xr18 +#define xr19 $xr19 +#define xr20 $xr20 +#define xr21 $xr21 +#define xr22 $xr22 +#define xr23 $xr23 +#define xr24 $xr24 +#define xr25 $xr25 +#define xr26 $xr26 +#define xr27 $xr27 +#define xr28 $xr28 +#define xr29 $xr29 +#define xr30 $xr30 +#define xr31 $xr31 + #endif /* _SYS_REGDEF_H */ -- 2.33.0
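Usage sketch (not part of the patch): assuming sys/regdef.h is in effect for the file (glibc .S sources normally pick it up via sysdep.h; the include lines fall outside these hunks, so that is an assumption), the bare macro names used in the hunks above expand to the $-prefixed register names the assembler expects, e.g. vr0 becomes $vr0, xr0 becomes $xr0 and fcc0 becomes $fcc0:

	vld		vr0, a0, 0	# LSX load; previously written "vld $vr0, a0, 0"
	vsetanyeqz.b	fcc0, vr0	# set fcc0 if any byte of vr0 is zero
	bcnez		fcc0, 1f	# branch on the fcc0 macro name
	xvld		xr0, a0, 0	# LASX load through the xr0 macro
1: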